aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.hgignore14
-rw-r--r--tools/Makefile1
-rw-r--r--tools/blktap2/Makefile34
-rw-r--r--tools/blktap2/README122
-rw-r--r--tools/blktap2/daemon/Makefile55
-rw-r--r--tools/blktap2/daemon/lib/Makefile69
-rw-r--r--tools/blktap2/daemon/lib/xs_api.c323
-rw-r--r--tools/blktap2/daemon/lib/xs_api.h62
-rw-r--r--tools/blktap2/daemon/tapdisk-channel.c1367
-rw-r--r--tools/blktap2/daemon/tapdisk-daemon.c599
-rw-r--r--tools/blktap2/daemon/tapdisk-dispatch-common.c94
-rw-r--r--tools/blktap2/daemon/tapdisk-dispatch.h95
-rw-r--r--tools/blktap2/drivers/Makefile105
-rw-r--r--tools/blktap2/drivers/aes.c1319
-rw-r--r--tools/blktap2/drivers/aes.h28
-rw-r--r--tools/blktap2/drivers/atomicio.c61
-rw-r--r--tools/blktap2/drivers/blk.h30
-rw-r--r--tools/blktap2/drivers/blk_linux.c43
-rw-r--r--tools/blktap2/drivers/blktap2.h66
-rw-r--r--tools/blktap2/drivers/block-aio.c272
-rw-r--r--tools/blktap2/drivers/block-cache.c787
-rw-r--r--tools/blktap2/drivers/block-log.c688
-rw-r--r--tools/blktap2/drivers/block-qcow.c1517
-rw-r--r--tools/blktap2/drivers/block-ram.c269
-rw-r--r--tools/blktap2/drivers/block-vhd.c2321
-rw-r--r--tools/blktap2/drivers/bswap.h214
-rw-r--r--tools/blktap2/drivers/check_gcrypt14
-rw-r--r--tools/blktap2/drivers/disktypes.h184
-rw-r--r--tools/blktap2/drivers/img2qcow.c318
-rw-r--r--tools/blktap2/drivers/io-optimize.c664
-rw-r--r--tools/blktap2/drivers/io-optimize.h68
-rw-r--r--tools/blktap2/drivers/lock.c1000
-rw-r--r--tools/blktap2/drivers/lock.h51
-rw-r--r--tools/blktap2/drivers/log.h123
-rw-r--r--tools/blktap2/drivers/profile.h191
-rw-r--r--tools/blktap2/drivers/qcow-create.c121
-rw-r--r--tools/blktap2/drivers/qcow.h131
-rw-r--r--tools/blktap2/drivers/qcow2raw.c449
-rw-r--r--tools/blktap2/drivers/scheduler.c265
-rw-r--r--tools/blktap2/drivers/scheduler.h65
-rw-r--r--tools/blktap2/drivers/tapdisk-client.c496
-rw-r--r--tools/blktap2/drivers/tapdisk-diff.c797
-rw-r--r--tools/blktap2/drivers/tapdisk-driver.c100
-rw-r--r--tools/blktap2/drivers/tapdisk-driver.h62
-rw-r--r--tools/blktap2/drivers/tapdisk-filter.c271
-rw-r--r--tools/blktap2/drivers/tapdisk-filter.h67
-rw-r--r--tools/blktap2/drivers/tapdisk-image.c160
-rw-r--r--tools/blktap2/drivers/tapdisk-image.h55
-rw-r--r--tools/blktap2/drivers/tapdisk-interface.c250
-rw-r--r--tools/blktap2/drivers/tapdisk-interface.h53
-rw-r--r--tools/blktap2/drivers/tapdisk-ipc.c279
-rw-r--r--tools/blktap2/drivers/tapdisk-ipc.h43
-rw-r--r--tools/blktap2/drivers/tapdisk-log.c255
-rw-r--r--tools/blktap2/drivers/tapdisk-log.h51
-rw-r--r--tools/blktap2/drivers/tapdisk-queue.c441
-rw-r--r--tools/blktap2/drivers/tapdisk-queue.h113
-rw-r--r--tools/blktap2/drivers/tapdisk-ring.c439
-rw-r--r--tools/blktap2/drivers/tapdisk-ring.h87
-rw-r--r--tools/blktap2/drivers/tapdisk-server.c415
-rw-r--r--tools/blktap2/drivers/tapdisk-server.h65
-rw-r--r--tools/blktap2/drivers/tapdisk-stream.c600
-rw-r--r--tools/blktap2/drivers/tapdisk-utils.c199
-rw-r--r--tools/blktap2/drivers/tapdisk-utils.h42
-rw-r--r--tools/blktap2/drivers/tapdisk-vbd.c1758
-rw-r--r--tools/blktap2/drivers/tapdisk-vbd.h193
-rw-r--r--tools/blktap2/drivers/tapdisk.c66
-rw-r--r--tools/blktap2/drivers/tapdisk.h158
-rw-r--r--tools/blktap2/drivers/tapdisk2.c436
-rw-r--r--tools/blktap2/drivers/td.c691
-rw-r--r--tools/blktap2/drivers/xmsnap78
-rw-r--r--tools/blktap2/include/Makefile14
-rw-r--r--tools/blktap2/include/atomicio.h33
-rw-r--r--tools/blktap2/include/blktaplib.h249
-rw-r--r--tools/blktap2/include/libvhd-journal.h68
-rw-r--r--tools/blktap2/include/libvhd.h308
-rw-r--r--tools/blktap2/include/list.h93
-rw-r--r--tools/blktap2/include/lvm-util.h71
-rw-r--r--tools/blktap2/include/relative-path.h43
-rw-r--r--tools/blktap2/include/tapdisk-message.h141
-rw-r--r--tools/blktap2/include/vhd-util.h44
-rw-r--r--tools/blktap2/include/vhd.h221
-rw-r--r--tools/blktap2/lvm/Makefile38
-rw-r--r--tools/blktap2/lvm/lvm-util.c349
-rw-r--r--tools/blktap2/vhd/Makefile55
-rw-r--r--tools/blktap2/vhd/lib/Makefile73
-rw-r--r--tools/blktap2/vhd/lib/atomicio.c61
-rw-r--r--tools/blktap2/vhd/lib/libvhd-journal.c1534
-rw-r--r--tools/blktap2/vhd/lib/libvhd.c3328
-rw-r--r--tools/blktap2/vhd/lib/relative-path.c299
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-check.c977
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-coalesce.c218
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-create.c80
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-fill.c105
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-modify.c132
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-query.c159
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-read.c742
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-repair.c84
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-resize.c1131
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-revert.c106
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-scan.c1315
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-set-field.c106
-rw-r--r--tools/blktap2/vhd/lib/vhd-util-snapshot.c216
-rw-r--r--tools/blktap2/vhd/vhd-update.c261
-rw-r--r--tools/blktap2/vhd/vhd-util.c160
-rwxr-xr-xtools/check/check_uuid_devel6
-rw-r--r--tools/python/xen/xend/XendDomainInfo.py49
-rw-r--r--tools/python/xen/xend/server/BlktapController.py54
-rw-r--r--tools/python/xen/xend/server/DevController.py32
108 files changed, 35869 insertions, 5 deletions
diff --git a/.hgignore b/.hgignore
index 1b798d15f4..966c180e38 100644
--- a/.hgignore
+++ b/.hgignore
@@ -103,7 +103,19 @@
^stubdom/lwip/
^stubdom/ioemu/
^tools/.*/build/lib.*/.*\.py$
-^tools/blktap/Makefile\.smh$
+^tools/blktap2/daemon/blktapctrl$
+^tools/blktap2/drivers/img2qcow$
+^tools/blktap2/drivers/lock-util$
+^tools/blktap2/drivers/qcow-create$
+^tools/blktap2/drivers/qcow2raw$
+^tools/blktap2/drivers/tapdisk$
+^tools/blktap2/drivers/tapdisk-client$
+^tools/blktap2/drivers/tapdisk-diff$
+^tools/blktap2/drivers/tapdisk-stream$
+^tools/blktap2/drivers/tapdisk2$
+^tools/blktap2/drivers/td-util$
+^tools/blktap2/vhd/vhd-update$
+^tools/blktap2/vhd/vhd-util$
^tools/blktap/drivers/blktapctrl$
^tools/blktap/drivers/img2qcow$
^tools/blktap/drivers/qcow-create$
diff --git a/tools/Makefile b/tools/Makefile
index 3209f2f8bd..dff96a5c76 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -22,6 +22,7 @@ SUBDIRS-$(VTPM_TOOLS) += vtpm
SUBDIRS-y += xenstat
SUBDIRS-$(CONFIG_Linux) += libaio
SUBDIRS-$(CONFIG_Linux) += blktap
+SUBDIRS-$(CONFIG_Linux) += blktap2
SUBDIRS-y += libfsimage
SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
SUBDIRS-$(CONFIG_Linux) += fs-back
diff --git a/tools/blktap2/Makefile b/tools/blktap2/Makefile
new file mode 100644
index 0000000000..20a9451fa1
--- /dev/null
+++ b/tools/blktap2/Makefile
@@ -0,0 +1,34 @@
+XEN_ROOT = ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS += $(CFLAGS_libxenctrl)
+LDFLAGS += $(LDFLAGS_libxenctrl)
+
+SUBDIRS-y :=
+SUBDIRS-y += include
+SUBDIRS-y += lvm
+SUBDIRS-y += vhd
+SUBDIRS-y += drivers
+SUBDIRS-y += daemon
+
+.PHONY: all
+all: build
+
+.PHONY: build
+build:
+ @set -e; for subdir in $(SUBDIRS-y); do \
+ $(MAKE) -C $$subdir all; \
+ done
+
+.PHONY: install
+install:
+ @set -e; for subdir in $(SUBDIRS-y); do \
+ $(MAKE) -C $$subdir install; \
+ done
+
+.PHONY: clean
+clean:
+ rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) TAGS
+ @set -e; for subdir in $(SUBDIRS-y); do \
+ $(MAKE) -C $$subdir clean; \
+ done
diff --git a/tools/blktap2/README b/tools/blktap2/README
new file mode 100644
index 0000000000..5e4108030e
--- /dev/null
+++ b/tools/blktap2/README
@@ -0,0 +1,122 @@
+Blktap Userspace Tools + Library
+================================
+
+Andrew Warfield and Julian Chesterfield
+16th June 2006
+
+{firstname.lastname}@cl.cam.ac.uk
+
+The blktap userspace toolkit provides a user-level disk I/O
+interface. The blktap mechanism involves a kernel driver that acts
+similarly to the existing Xen/Linux blkback driver, and a set of
+associated user-level libraries. Using these tools, blktap allows
+virtual block devices presented to VMs to be implemented in userspace
+and to be backed by raw partitions, files, network, etc.
+
+The key benefit of blktap is that it makes it easy and fast to write
+arbitrary block backends, and that these user-level backends actually
+perform very well. Specifically:
+
+- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse
+ formats and other compression features can be easily implemented.
+
+- Accessing file-based images from userspace avoids problems related
+ to flushing dirty pages which are present in the Linux loopback
+ driver. (Specifically, doing a large number of writes to an
+ NFS-backed image doesn't result in the OOM killer going berserk.)
+
+- Per-disk handler processes enable easier userspace policing of block
+ resources, and process-granularity QoS techniques (disk scheduling
+ and related tools) may be trivially applied to block devices.
+
+- It's very easy to take advantage of userspace facilities such as
+ networking libraries, compression utilities, peer-to-peer
+ file-sharing systems and so on to build more complex block backends.
+
+- Crashes are contained -- incremental development/debugging is very
+ fast.
+
+How it works (in one paragraph):
+
+Working in conjunction with the kernel blktap driver, all disk I/O
+requests from VMs are passed to the userspace daemon (using a shared
+memory interface) through a character device. Each active disk is
+mapped to an individual device node, allowing per-disk processes to
+implement individual block devices where desired. The userspace
+drivers are implemented using asynchronous (Linux libaio),
+O_DIRECT-based calls to preserve the unbuffered, batched and
+asynchronous request dispatch achieved with the existing blkback
+code. We provide a simple, asynchronous virtual disk interface that
+makes it quite easy to add new disk implementations.
+
+As of June 2006 the current supported disk formats are:
+
+ - Raw Images (both on partitions and in image files)
+ - File-backed Qcow disks
+ - Standalone sparse Qcow disks
+ - Fast shareable RAM disk between VMs (requires some form of cluster-based
+ filesystem support e.g. OCFS2 in the guest kernel)
+ - Some VMDK images - your mileage may vary
+
+Raw and QCow images have asynchronous backends and so should perform
+fairly well. VMDK is based directly on the qemu vmdk driver, which is
+synchronous (a.k.a. slow).
+
+Build and Installation Instructions
+===================================
+
+Make sure to configure the blktap backend driver in your dom0 kernel. It
+will cooperate fine with the existing backend driver, so you can
+experiment with tap disks without breaking existing VM configs.
+
+To build the tools separately, "make && make install" in
+tools/blktap2.
+
+
+Using the Tools
+===============
+
+Prepare the image for booting. For qcow files use the qcow utilities
+installed earlier. e.g. qcow-create generates a blank standalone image
+or a file-backed CoW image. img2qcow takes an existing image or
+partition and creates a sparse, standalone qcow-based file.
+
+The userspace disk agent is configured to start automatically via xend
+(alternatively you can start it manually => 'blktapctrl')
+
+Customise the VM config file to use the 'tap' handler, followed by the
+driver type. e.g. for a raw image such as a file or partition:
+
+disk = ['tap:aio:<FILENAME>,sda1,w']
+
+e.g. for a qcow image:
+
+disk = ['tap:qcow:<FILENAME>,sda1,w']
+
+
+Mounting images in Dom0 using the blktap driver
+===============================================
+Tap (and blkback) disks are also mountable in Dom0 without requiring an
+active VM to attach. You will need to build a xenlinux Dom0 kernel that
+includes the blkfront driver (e.g. the default 'make world' or
+'make kernels' build). Simply use the xm command-line tool to activate
+the backend disks, and blkfront will generate a virtual block device that
+can be accessed in the same way as a loop device or partition:
+
+e.g. for a raw image file <FILENAME> that would normally be mounted using
+the loopback driver (such as 'mount -o loop <FILENAME> /mnt/disk'), do the
+following:
+
+xm block-attach 0 tap:aio:<FILENAME> /dev/xvda1 w 0
+mount /dev/xvda1 /mnt/disk <--- don't use loop driver
+
+In this way, you can use any of the userspace device-type drivers built
+with the blktap userspace toolkit to open and mount disks such as qcow
+or vmdk images:
+
+xm block-attach 0 tap:qcow:<FILENAME> /dev/xvda1 w 0
+mount /dev/xvda1 /mnt/disk
+
+
+
+
diff --git a/tools/blktap2/daemon/Makefile b/tools/blktap2/daemon/Makefile
new file mode 100644
index 0000000000..a7869b61b0
--- /dev/null
+++ b/tools/blktap2/daemon/Makefile
@@ -0,0 +1,55 @@
+XEN_ROOT=../../../
+BLKTAP_ROOT := ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+IBIN = blktapctrl
+INST_DIR = $(SBINDIR)
+
+LIBDIR = lib
+
+LIBS := -lxenstore
+LIBS += -Llib
+LIBS += -lblktap
+LIBS += -lxenctrl
+
+ifneq ($(USE_SYSTEM_LIBRARIES),y)
+INCLUDES += -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
+LIBS += -L $(XEN_LIBXC) -L $(XEN_XENSTORE)
+endif
+
+OBJS := tapdisk-dispatch-common.o
+OBJS += tapdisk-channel.o
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -fno-strict-aliasing -fPIC
+CFLAGS += -Ilib -I../include -I../drivers -I../../include $(INCLUDES)
+CFLAGS += -D_GNU_SOURCE
+CFLAGS += -g
+
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+all: libblktap $(IBIN)
+
+blktapctrl: tapdisk-daemon.c $(OBJS)
+ $(CC) $(CFLAGS) -o blktapctrl tapdisk-daemon.c $(LIBS) $(OBJS)
+
+libblktap:
+ @set -e
+ $(MAKE) -C $(LIBDIR) all
+
+install: all
+ $(MAKE) -C $(LIBDIR) install
+ $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+ $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+
+clean:
+ $(MAKE) -C $(LIBDIR) clean
+ rm -rf *.o *~ $(IBIN) $(DEPS) xen TAGS
+
+.PHONY: all clean install blktapctrl libblktap
+
+-include $(DEPS)
+
diff --git a/tools/blktap2/daemon/lib/Makefile b/tools/blktap2/daemon/lib/Makefile
new file mode 100644
index 0000000000..e4e289ab51
--- /dev/null
+++ b/tools/blktap2/daemon/lib/Makefile
@@ -0,0 +1,69 @@
+XEN_ROOT=../../../../
+BLKTAP_ROOT := ../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+MAJOR = 3.1
+MINOR = 0
+SONAME = libblktap.so.$(MAJOR)
+
+BLKTAP_INSTALL_DIR = /usr/sbin
+
+LIBS := -lxenstore
+
+ifneq ($(USE_SYSTEM_LIBRARIES),y)
+INCLUDES += -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
+LIBS += -L$(XEN_XENSTORE)
+endif
+
+SRCS :=
+SRCS += xs_api.c
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -fno-strict-aliasing -fPIC
+# get asprintf():
+CFLAGS += -D _GNU_SOURCE
+CFLAGS += -g
+CFLAGS += -I../../include -I../../../include/ $(INCLUDES)
+
+
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+OBJS = $(patsubst %.c,%.o,$(SRCS))
+IBINS :=
+
+LIB = libblktap.a libblktap.so.$(MAJOR).$(MINOR)
+
+.PHONY: all
+all: build
+
+.PHONY: build
+build: libblktap.a
+
+.PHONY: libblktap
+libblktap: libblktap.a
+
+install: all
+ $(INSTALL_DIR) -p $(DESTDIR)$(LIBDIR)
+ $(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR)
+ ln -sf libblktap.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libblktap.so.$(MAJOR)
+ ln -sf libblktap.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libblktap.so
+
+clean:
+ rm -rf *.a *.so* *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS
+
+libblktap.a: $(OBJS)
+ $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,$(SONAME) $(SHLIB_CFLAGS) \
+ -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+ ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
+ ln -sf libblktap.so.$(MAJOR) libblktap.so
+ $(AR) rc $@ libblktap.so
+
+.PHONY: TAGS all build clean install libblktap
+
+TAGS:
+ etags -t $(SRCS) *.h
+
+-include $(DEPS)
+
diff --git a/tools/blktap2/daemon/lib/xs_api.c b/tools/blktap2/daemon/lib/xs_api.c
new file mode 100644
index 0000000000..2a7d6acdb8
--- /dev/null
+++ b/tools/blktap2/daemon/lib/xs_api.c
@@ -0,0 +1,323 @@
+/*
+ * xs_api.c
+ *
+ * blocktap interface functions to xenstore
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <xs.h>
+
+#include "xs_api.h"
+#include "blktaplib.h"
+
+#define DOMNAME "Domain-0"
+#define BASE_DEV_VAL 2048
+
+static LIST_HEAD(watches);
+
+/*
+ * Read a set of nodes beneath @dir within a single xenstore transaction.
+ *
+ * The variable arguments are (name, fmt, result) triples terminated by a
+ * NULL name:
+ *   name   - node name; the value is read from "<dir>/<name>"
+ *   fmt    - sscanf() format applied to the node's value, or NULL
+ *   result - destination of the sscanf() conversion; when fmt is NULL it
+ *            is treated as a char ** and receives the buffer returned by
+ *            xs_read(), which the caller then owns and must free()
+ *
+ * Returns 0 on success or a positive errno-style value on failure
+ * (ENOMEM, ENOENT, EINVAL, or errno from a failed transaction end).
+ * The whole read is retried if ending the transaction fails with EAGAIN.
+ *
+ * NOTE(review): on such a retry, buffers already handed out through
+ * fmt == NULL results are overwritten without being freed.
+ */
+int
+xs_gather(struct xs_handle *xs, const char *dir, ...)
+{
+	va_list ap;
+	const char *name;
+	char *path;
+	int ret;
+	unsigned int len;
+	xs_transaction_t xth;
+
+again:
+	ret = 0;
+
+	if ((xth = xs_transaction_start(xs)) == XBT_NULL) {
+		DPRINTF("unable to start xs trasanction\n");
+		return ENOMEM;
+	}
+
+	va_start(ap, dir);
+	while ((ret == 0) && (name = va_arg(ap, char *)) != NULL) {
+		char *p;
+		const char *fmt = va_arg(ap, char *);
+		void *result = va_arg(ap, void *);
+
+		if (asprintf(&path, "%s/%s", dir, name) == -1) {
+			EPRINTF("allocation error in xs_gather!\n");
+			ret = ENOMEM;
+			break;
+		}
+
+		p = xs_read(xs, xth, path, &len);
+		free(path);
+
+		if (!p) {
+			ret = ENOENT;
+			break;
+		}
+
+		if (fmt) {
+			/*
+			 * sscanf() returns EOF (negative) when the value is
+			 * empty, so test for "fewer than one conversion"
+			 * rather than for exactly zero.
+			 */
+			if (sscanf(p, fmt, result) < 1)
+				ret = EINVAL;
+			free(p);
+		} else
+			*(char **)result = p;
+	}
+
+	va_end(ap);
+
+	/* A non-zero ret asks xenstore to abort the transaction. */
+	if (!xs_transaction_end(xs, xth, ret)) {
+		if (ret == 0 && errno == EAGAIN)
+			goto again;
+		/* Keep the first error; only report errno if we had none. */
+		if (ret == 0)
+			ret = errno;
+	}
+
+	return ret;
+}
+
+/*
+ * Format a value printf-style and write it to the node "<dir>/<node>".
+ *
+ * Returns 0 if either string cannot be allocated; otherwise returns
+ * whatever xs_write() returned (NOTE(review): presumably non-zero on
+ * success, like the other xs_* calls used in this file -- confirm
+ * against xs.h).  The value is written including its terminating NUL.
+ */
+int
+xs_printf(struct xs_handle *h, const char *dir,
+	  const char *node, const char *fmt, ...)
+{
+	va_list args;
+	char *value = NULL, *key = NULL;
+	int rc;
+
+	/* Build the value first ... */
+	va_start(args, fmt);
+	rc = vasprintf(&value, fmt, args);
+	va_end(args);
+	if (rc == -1)
+		return 0;
+
+	/* ... then the full path of the node to write. */
+	if (asprintf(&key, "%s/%s", dir, node) == -1) {
+		free(value);
+		return 0;
+	}
+
+	rc = xs_write(h, XBT_NULL, key, value, strlen(value) + 1);
+
+	free(key);
+	free(value);
+
+	return rc;
+}
+
+/*
+ * Test whether @path can be listed as a xenstore directory.
+ *
+ * Returns 1 if xs_directory() succeeds on @path, 0 if it fails or if no
+ * transaction could be started.
+ */
+int
+xs_exists(struct xs_handle *h, const char *path)
+{
+	unsigned int nr_entries;
+	char **listing;
+	xs_transaction_t txn;
+
+	txn = xs_transaction_start(h);
+	if (txn == XBT_NULL) {
+		EPRINTF("unable to start xs trasanction\n");
+		return 0;
+	}
+
+	listing = xs_directory(h, txn, path, &nr_entries);
+	xs_transaction_end(h, txn, 0);
+
+	if (listing != NULL) {
+		/* Only existence matters; the listing itself is unused. */
+		free(listing);
+		return 1;
+	}
+
+	return 0;
+}
+
+
+
+/**
+ * Look up the domid of the domain whose name is DOMNAME ("Domain-0").
+ *
+ * Walks the entries of /local/domain, reading each entry's "name" node,
+ * and stops at the first entry whose name matches and whose "domid" node
+ * can be read.  This assumes the domain name is unique.
+ *
+ * Returns the buffer obtained from xs_read() for the domid node
+ * (presumably to be free()d by the caller, as other xs_read() results in
+ * this file are), or NULL if the transaction cannot be started, the
+ * directory cannot be listed, or no matching domain is found.
+ */
+char *
+get_dom_domid(struct xs_handle *h)
+{
+	xs_transaction_t txn;
+	unsigned int count, length, idx;
+	char **entries, *key, *name, *result = NULL;
+
+	txn = xs_transaction_start(h);
+	if (txn == XBT_NULL) {
+		EPRINTF("unable to start xs trasanction\n");
+		return NULL;
+	}
+
+	entries = xs_directory(h, txn, "/local/domain", &count);
+
+	for (idx = 0; entries != NULL && idx < count && result == NULL; idx++) {
+		int matched;
+
+		if (asprintf(&key, "/local/domain/%s/name", entries[idx]) == -1)
+			break;
+
+		name = xs_read(h, txn, key, &length);
+		free(key);
+		if (name == NULL)
+			continue;
+
+		matched = (strcmp(name, DOMNAME) == 0);
+		free(name);
+		if (!matched)
+			continue;
+
+		/* match! fetch the corresponding domid */
+		if (asprintf(&key, "/local/domain/%s/domid", entries[idx]) == -1)
+			break;
+
+		result = xs_read(h, txn, key, &length);
+		free(key);
+	}
+
+	xs_transaction_end(h, txn, 0);
+	free(entries);
+	return result;
+}
+
+/*
+ * Map a watch token of the form "<address>:<nonce>" (both hexadecimal)
+ * back to its registered watch.
+ *
+ * a little paranoia: we don't just trust token -- the decoded address is
+ * never dereferenced; it is only compared against the entries actually
+ * present on the watches list, and the nonce must match as well.
+ *
+ * Returns the matching watch, or NULL if the token is malformed or no
+ * registered watch matches it.
+ */
+static struct xenbus_watch *find_watch(const char *token)
+{
+	/* %lX converts into unsigned long, so both fields are unsigned. */
+	unsigned long addr, nonce;
+	struct xenbus_watch *i, *cmp;
+
+	if (sscanf(token, "%lX:%lX", &addr, &nonce) != 2) {
+		EPRINTF("invalid watch token %s\n", token);
+		return NULL;
+	}
+
+	cmp = (struct xenbus_watch *)addr;
+	list_for_each_entry(i, &watches, list)
+		if (i == cmp && (unsigned long)i->nonce == nonce)
+			return i;
+
+	return NULL;
+}
+
+/*
+ * Register callback to watch this node (watch->node).
+ *
+ * The token handed to xenstore is the watch's address plus a time-based
+ * nonce, both in hex, which find_watch() later decodes to locate the
+ * watch for an incoming event.
+ *
+ * Returns 0 on success and -EINVAL on failure (token collision, or
+ * xs_watch() rejecting the watch).  Note that this is the opposite of
+ * xs_watch()'s own convention, where a zero result means failure.
+ */
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+	/* Pointer in ascii is the token. */
+	char token[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+	/* 1-second granularity should suffice here */
+	watch->nonce = time(NULL);
+
+	/* The buffer is sized exactly for two hex longs; never overrun it. */
+	snprintf(token, sizeof(token), "%lX:%lX",
+		 (unsigned long)watch, (unsigned long)watch->nonce);
+	if (find_watch(token)) {
+		EPRINTF("watch collision!\n");
+		return -EINVAL;
+	}
+
+	if (!xs_watch(h, watch->node, token)) {
+		EPRINTF("unable to set watch!\n");
+		return -EINVAL;
+	}
+
+	/* Only track the watch once xenstore has accepted it. */
+	list_add(&watch->list, &watches);
+
+	return 0;
+}
+
+/*
+ * Remove a watch previously added with register_xenbus_watch().
+ *
+ * Returns -EINVAL if the watch is not on the watches list, 0 otherwise.
+ * A failing xs_unwatch() is only logged: the watch is still unlinked
+ * from the list and 0 is returned.
+ */
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+	char tok[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+	/* Rebuild the same token that registration used. */
+	sprintf(tok, "%lX:%lX", (long)watch, watch->nonce);
+
+	if (find_watch(tok)) {
+		if (!xs_unwatch(h, watch->node, tok))
+			EPRINTF("XENBUS Failed to release watch %s\n",
+				watch->node);
+
+		list_del(&watch->list);
+		return 0;
+	}
+
+	EPRINTF("no such watch!\n");
+	return -EINVAL;
+}
+
+/*
+ * re-register callbacks to all watches
+ *
+ * Every watch on the watches list is submitted to xs_watch() again with
+ * its existing token (address plus the nonce chosen at registration).
+ * A watch that cannot be re-registered is logged and skipped; the
+ * remaining watches are still processed and the list is left unchanged.
+ */
+void reregister_xenbus_watches(struct xs_handle *h)
+{
+	struct xenbus_watch *watch;
+	char token[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+	list_for_each_entry(watch, &watches, list) {
+		sprintf(token, "%lX:%lX", (long)watch, watch->nonce);
+		/* Do not fail silently: a lost watch never fires again. */
+		if (!xs_watch(h, watch->node, token))
+			EPRINTF("unable to re-register watch on %s\n",
+				watch->node);
+	}
+}
+
+/*
+ * Read one watch event from xenstore and run the matching callback
+ * (based on watch_thread()).
+ *
+ * Returns -EAGAIN if xs_read_watch() produced no event, otherwise 1 --
+ * including when the event's token matches no registered watch, in
+ * which case no callback is invoked.
+ */
+int xs_fire_next_watch(struct xs_handle *h)
+{
+	struct xenbus_watch *target;
+	unsigned int count;
+	char **event;
+
+	event = xs_read_watch(h, &count);
+	if (!event)
+		return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */
+
+	DPRINTF("got watch %s on %s\n",
+		event[XS_WATCH_TOKEN], event[XS_WATCH_PATH]);
+
+	target = find_watch(event[XS_WATCH_TOKEN]);
+	if (target != NULL)
+		target->callback(h, target, event[XS_WATCH_PATH]);
+
+	DPRINTF("handled watch %s on %s\n",
+		event[XS_WATCH_TOKEN], event[XS_WATCH_PATH]);
+
+	/* Released only after the callback has finished using the path. */
+	free(event);
+
+	return 1;
+}
diff --git a/tools/blktap2/daemon/lib/xs_api.h b/tools/blktap2/daemon/lib/xs_api.h
new file mode 100644
index 0000000000..e6f055ac0c
--- /dev/null
+++ b/tools/blktap2/daemon/lib/xs_api.h
@@ -0,0 +1,62 @@
+/*
+ * xs_api.h
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XS_API_H_
+#define _XS_API_H_
+
+#include <xs.h>
+
+#include "list.h"
+
+struct xenbus_watch /* one registered xenstore watch and its callback */
+{
+ struct list_head list; /* link on the list of registered watches */
+ char *node; /* xenstore path handed to xs_watch()/xs_unwatch() */
+ void *data; /* NOTE(review): presumably opaque owner data for the callback -- confirm */
+ long nonce; /* set from time() at registration; second half of the watch token */
+ void (*callback) (struct xs_handle *h, /* invoked when an event's token matches this watch */
+ struct xenbus_watch *,
+ const char *node); /* path reported by the watch event */
+};
+
+int xs_gather(struct xs_handle *xs, const char *dir, ...);
+int xs_printf(struct xs_handle *h, const char *dir, const char *node,
+ const char *fmt, ...) __attribute__((format(printf, 4, 5)));
+int xs_exists(struct xs_handle *h, const char *path);
+char *get_dom_domid(struct xs_handle *h);
+int convert_dev_name_to_num(char *name);
+
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
+void reregister_xenbus_watches(struct xs_handle *h);
+int xs_fire_next_watch(struct xs_handle *h);
+
+#endif
diff --git a/tools/blktap2/daemon/tapdisk-channel.c b/tools/blktap2/daemon/tapdisk-channel.c
new file mode 100644
index 0000000000..c2dac3a858
--- /dev/null
+++ b/tools/blktap2/daemon/tapdisk-channel.c
@@ -0,0 +1,1367 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <sys/wait.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+
+#include <xs.h>
+#include "disktypes.h"
+#include "tapdisk-dispatch.h"
+
+/*
+ * Channel state machine values (channel->state).
+ *
+ * tapdisk_channel_send_message() moves the channel into the WAIT_* state
+ * matching the request it sent; tapdisk_channel_validate_message() returns
+ * it to IDLE when the matching response arrives.  CLOSED is set once the
+ * close response has been received.
+ */
+#define TAPDISK_CHANNEL_IDLE 1
+#define TAPDISK_CHANNEL_WAIT_PID 2
+#define TAPDISK_CHANNEL_WAIT_OPEN 3
+#define TAPDISK_CHANNEL_WAIT_PAUSE 4
+#define TAPDISK_CHANNEL_WAIT_RESUME 5
+#define TAPDISK_CHANNEL_WAIT_CLOSE 6
+#define TAPDISK_CHANNEL_CLOSED 7
+
+/*
+ * Forward declarations for helpers that are used before their definitions.
+ * The two reporting helpers are printf-style (fmt is argument 2).
+ */
+static void tapdisk_channel_error(tapdisk_channel_t *,
+ const char *fmt, ...)
+ __attribute__((format(printf, 2, 3)));
+static void tapdisk_channel_fatal(tapdisk_channel_t *,
+ const char *fmt, ...)
+ __attribute__((format(printf, 2, 3)));
+static int tapdisk_channel_parse_params(tapdisk_channel_t *);
+static void tapdisk_channel_pause_event(struct xs_handle *,
+ struct xenbus_watch *,
+ const char *);
+
+/*
+ * Compare the decimal value stored at channel->uuid_str in xenstore with
+ * channel->cookie.
+ *
+ * Returns 0 when they match, -errno when the node cannot be read and
+ * -EINVAL when the stored value differs from the cookie.
+ */
+static int
+tapdisk_channel_check_uuid(tapdisk_channel_t *channel)
+{
+	char *text;
+	uint32_t stored;
+
+	text = xs_read(channel->xsh, XBT_NULL, channel->uuid_str, NULL);
+	if (text == NULL)
+		return -errno;
+
+	stored = strtoul(text, NULL, 10);
+	free(text);
+
+	return (stored == channel->cookie) ? 0 : -EINVAL;
+}
+
+/*
+ * Sanity-check a fired watch.
+ *
+ * Returns -EINVAL when strsep_len() rejects the path, the error from
+ * tapdisk_channel_check_uuid() when the cookie check fails, -ENOENT when
+ * the watched path does not exist in xenstore, and 0 otherwise.
+ */
+static inline int
+tapdisk_channel_validate_watch(tapdisk_channel_t *channel, const char *path)
+{
+	int rc;
+
+	if (strsep_len(path, '/', 7) < 0)
+		return -EINVAL;
+
+	rc = tapdisk_channel_check_uuid(channel);
+	if (rc != 0)
+		return rc;
+
+	return xs_exists(channel->xsh, path) ? 0 : -ENOENT;
+}
+
+/*
+ * Check that an incoming response is the one the channel is waiting for.
+ *
+ * Each *_RSP type is only valid in its matching WAIT_* state; on a match
+ * the channel returns to IDLE.  Runtime errors are accepted in any state
+ * and leave the state untouched.  Types not listed here are not checked
+ * and also reset the state to IDLE.
+ *
+ * Returns 0 when the message is acceptable, -EINVAL otherwise.
+ */
+static inline int
+tapdisk_channel_validate_message(tapdisk_channel_t *channel,
+				 tapdisk_message_t *message)
+{
+	int expected;
+
+	switch (message->type) {
+	case TAPDISK_MESSAGE_RUNTIME_ERROR:
+		/* may arrive at any time; must not disturb the state machine */
+		return 0;
+
+	case TAPDISK_MESSAGE_PID_RSP:
+		expected = TAPDISK_CHANNEL_WAIT_PID;
+		break;
+
+	case TAPDISK_MESSAGE_OPEN_RSP:
+		expected = TAPDISK_CHANNEL_WAIT_OPEN;
+		break;
+
+	case TAPDISK_MESSAGE_PAUSE_RSP:
+		expected = TAPDISK_CHANNEL_WAIT_PAUSE;
+		break;
+
+	case TAPDISK_MESSAGE_RESUME_RSP:
+		expected = TAPDISK_CHANNEL_WAIT_RESUME;
+		break;
+
+	case TAPDISK_MESSAGE_CLOSE_RSP:
+		expected = TAPDISK_CHANNEL_WAIT_CLOSE;
+		break;
+
+	default:
+		channel->state = TAPDISK_CHANNEL_IDLE;
+		return 0;
+	}
+
+	if (channel->state != expected)
+		return -EINVAL;
+
+	channel->state = TAPDISK_CHANNEL_IDLE;
+	return 0;
+}
+
+/*
+ * Write one complete tapdisk_message_t to the channel's write fd.
+ *
+ * @timeout is the select() timeout in seconds.  The timeval is not
+ * re-armed between iterations (see the comment in the loop).
+ *
+ * On success the channel state moves to the WAIT_* state matching the
+ * message type and 0 is returned.  If the message cannot be written in
+ * full (select error or timeout, write error), -EIO is returned and the
+ * state is left unchanged.
+ */
+static int
+tapdisk_channel_send_message(tapdisk_channel_t *channel,
+			     tapdisk_message_t *message, int timeout)
+{
+	fd_set writefds;
+	struct timeval tv;
+	int ret, len, offset;
+	const char *buf;
+
+	tv.tv_sec = timeout;
+	tv.tv_usec = 0;
+	offset = 0;
+	len = sizeof(tapdisk_message_t);
+	/* byte view of the message: offsets below are in bytes, not messages */
+	buf = (const char *)message;
+
+	DPRINTF("%s: sending '%s' message to %d:%d\n",
+		channel->path, tapdisk_message_name(message->type),
+		channel->channel_id, channel->cookie);
+
+	if (channel->state != TAPDISK_CHANNEL_IDLE &&
+	    message->type != TAPDISK_MESSAGE_CLOSE)
+		EPRINTF("%s: writing message to non-idle channel (%d)\n",
+			channel->path, channel->state);
+
+	while (offset < len) {
+		FD_ZERO(&writefds);
+		FD_SET(channel->write_fd, &writefds);
+
+		/* we don't bother reinitializing tv. at worst, it will wait a
+		 * bit more time than expected. */
+
+		ret = select(channel->write_fd + 1,
+			     NULL, &writefds, NULL, &tv);
+		if (ret == -1)
+			break;
+		else if (FD_ISSET(channel->write_fd, &writefds)) {
+			ret = write(channel->write_fd,
+				    buf + offset, len - offset);
+			if (ret <= 0)
+				break;
+			offset += ret;
+		} else
+			break;
+	}
+
+	if (offset != len) {
+		EPRINTF("%s: error writing '%s' message to %d:%d\n",
+			channel->path, tapdisk_message_name(message->type),
+			channel->channel_id, channel->cookie);
+		return -EIO;
+	}
+
+	switch (message->type) {
+	case TAPDISK_MESSAGE_PID:
+		channel->state = TAPDISK_CHANNEL_WAIT_PID;
+		break;
+
+	case TAPDISK_MESSAGE_OPEN:
+		channel->state = TAPDISK_CHANNEL_WAIT_OPEN;
+		break;
+
+	case TAPDISK_MESSAGE_PAUSE:
+		channel->state = TAPDISK_CHANNEL_WAIT_PAUSE;
+		break;
+
+	case TAPDISK_MESSAGE_RESUME:
+		channel->state = TAPDISK_CHANNEL_WAIT_RESUME;
+		break;
+
+	case TAPDISK_MESSAGE_CLOSE:
+		channel->state = TAPDISK_CHANNEL_WAIT_CLOSE;
+		break;
+
+	default:
+		EPRINTF("%s: unrecognized message type %d\n",
+			channel->path, message->type);
+	}
+
+	return 0;
+}
+
+/*
+ * Format an error message, log it, and publish it in xenstore under
+ * <channel->path>/tapdisk-error.
+ *
+ * If the message cannot be formatted the fixed text "tapdisk error" is
+ * used instead.  The result of the xenstore write is not checked.
+ */
+static void
+__tapdisk_channel_error(tapdisk_channel_t *channel,
+			const char *fmt, va_list ap)
+{
+	char *node = NULL;
+	char *formatted = NULL;
+	const char *text;
+
+	if (vasprintf(&formatted, fmt, ap) == -1) {
+		EPRINTF("failed to allocate error message\n");
+		formatted = NULL;
+	}
+
+	text = formatted ? formatted : "tapdisk error";
+
+	EPRINTF("%s: %s\n", channel->path, text);
+
+	if (asprintf(&node, "%s/tapdisk-error", channel->path) == -1) {
+		EPRINTF("%s: failed to write %s\n", __func__, text);
+		node = NULL;
+	} else {
+		xs_write(channel->xsh, XBT_NULL, node, text, strlen(text));
+	}
+
+	free(node);
+	free(formatted);
+}
+
+/*
+ * Report a non-fatal, printf-style error for @channel; the channel stays
+ * open.
+ */
+static void
+tapdisk_channel_error(tapdisk_channel_t *channel, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	__tapdisk_channel_error(channel, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Report a printf-style error for @channel and then close the channel.
+ *
+ * tapdisk_channel_close() frees @channel, so callers must not touch it
+ * after this returns.
+ */
+static void
+tapdisk_channel_fatal(tapdisk_channel_t *channel, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	__tapdisk_channel_error(channel, fmt, args);
+	va_end(args);
+
+	tapdisk_channel_close(channel);
+}
+
+/*
+ * Set up the backing block device for this channel.
+ *
+ * Issues BLKTAP_IOCTL_BACKDEV_SETUP for channel->minor, reads the
+ * "major:minor" pair from <channel->path>/backdev-node, creates the device
+ * node and writes its name to <channel->path>/backdev-path.
+ *
+ * Returns 0 on success or a negative errno value.  All temporary strings
+ * are released on both paths.
+ */
+static int
+tapdisk_channel_connect_backdev(tapdisk_channel_t *channel)
+{
+ int err, major, minor;
+ char *s, *path, *devname;
+
+ /* start from NULL so the shared cleanup below can free unconditionally */
+ s = NULL;
+ path = NULL;
+ devname = NULL;
+
+ err = ioctl(channel->blktap_fd,
+ BLKTAP_IOCTL_BACKDEV_SETUP, channel->minor);
+ if (err) {
+ err = -errno;
+ goto fail;
+ }
+
+ err = asprintf(&path, "%s/backdev-node", channel->path);
+ if (err == -1) {
+ path = NULL;
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ s = xs_read(channel->xsh, XBT_NULL, path, NULL);
+ if (!s) {
+ err = -errno;
+ goto fail;
+ }
+
+ /* both numbers must parse */
+ err = sscanf(s, "%d:%d", &major, &minor);
+ if (err != 2) {
+ err = -EINVAL;
+ goto fail;
+ }
+
+ err = asprintf(&devname,"%s/%s%d",
+ BLKTAP_DEV_DIR, BACKDEV_NAME, minor);
+ if (err == -1) {
+ devname = NULL;
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ err = make_blktap_device(devname, major, minor, S_IFBLK | 0600);
+ if (err)
+ goto fail;
+
+ /* reuse 'path' for the node that publishes the device name */
+ free(path);
+ err = asprintf(&path, "%s/backdev-path", channel->path);
+ if (err == -1) {
+ path = NULL;
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ /* xs_write() result is treated as a boolean: zero means failure */
+ err = xs_write(channel->xsh, XBT_NULL, path, devname, strlen(devname));
+ if (err == 0) {
+ err = -errno;
+ goto fail;
+ }
+
+ err = 0;
+ out:
+ free(devname);
+ free(path);
+ free(s);
+ return err;
+
+ fail:
+ /* log, then share the cleanup above */
+ EPRINTF("backdev setup failed [%d]\n", err);
+ goto out;
+}
+
+/*
+ * Remove <channel->path>/<name> from xenstore.
+ * Returns 0 on success, -1 if the path could not be built or removed.
+ */
+static int
+tapdisk_channel_remove_node(tapdisk_channel_t *channel, const char *name)
+{
+	char *path;
+	int removed;
+
+	if (asprintf(&path, "%s/%s", channel->path, name) == -1)
+		return -1;
+
+	removed = xs_rm(channel->xsh, XBT_NULL, path);
+	free(path);
+
+	return removed ? 0 : -1;
+}
+
+/*
+ * Publish the image geometry ("sectors", "sector-size", "info") under
+ * channel->path, connect the backing device and mark the channel
+ * connected.
+ *
+ * Returns 0 on success or a negative errno value.  If the backing device
+ * cannot be connected the three keys are removed again, in reverse order,
+ * stopping at the first removal that fails.
+ */
+static int
+tapdisk_channel_complete_connection(tapdisk_channel_t *channel)
+{
+	int err;
+
+	/* errno is captured before logging, since EPRINTF may modify it */
+	if (!xs_printf(channel->xsh, channel->path,
+		       "sectors", "%llu", channel->image.size)) {
+		err = -errno;
+		EPRINTF("ERROR: Failed writing sectors");
+		return err;
+	}
+
+	if (!xs_printf(channel->xsh, channel->path,
+		       "sector-size", "%lu", channel->image.secsize)) {
+		err = -errno;
+		EPRINTF("ERROR: Failed writing sector-size");
+		return err;
+	}
+
+	if (!xs_printf(channel->xsh, channel->path,
+		       "info", "%u", channel->image.info)) {
+		err = -errno;
+		EPRINTF("ERROR: Failed writing info");
+		return err;
+	}
+
+	err = tapdisk_channel_connect_backdev(channel);
+	if (!err) {
+		channel->connected = 1;
+		return 0;
+	}
+
+	/* undo the keys written above; give up on the first failure */
+	if (!tapdisk_channel_remove_node(channel, "info") &&
+	    !tapdisk_channel_remove_node(channel, "sector-size"))
+		tapdisk_channel_remove_node(channel, "sectors");
+
+	return err;
+}
+
+/*
+ * Build and send a TAPDISK_MESSAGE_OPEN request describing the channel's
+ * image (driver type, storage type, device number, domain id, VDI path)
+ * together with its open flags.
+ *
+ * Returns the result of tapdisk_channel_send_message() (2 second timeout).
+ */
+static int
+tapdisk_channel_send_open_request(tapdisk_channel_t *channel)
+{
+	tapdisk_message_t req;
+	int path_len;
+
+	path_len = strlen(channel->vdi_path);
+
+	/* zero-filled, so the copied path below stays NUL-terminated */
+	memset(&req, 0, sizeof(req));
+
+	req.type = TAPDISK_MESSAGE_OPEN;
+	req.cookie = channel->cookie;
+	req.drivertype = channel->drivertype;
+
+	req.u.params.storage = channel->storage;
+	req.u.params.devnum = channel->minor;
+	req.u.params.domid = channel->domid;
+	req.u.params.path_len = path_len;
+	strncpy(req.u.params.path, channel->vdi_path, path_len);
+
+	if (channel->mode == 'r')
+		req.u.params.flags |= TAPDISK_MESSAGE_FLAG_RDONLY;
+
+	if (channel->shared)
+		req.u.params.flags |= TAPDISK_MESSAGE_FLAG_SHARED;
+
+	/* TODO: clean this up */
+	if (xs_exists(channel->xsh, "/local/domain/0/tapdisk/add-cache"))
+		req.u.params.flags |= TAPDISK_MESSAGE_FLAG_ADD_CACHE;
+
+	if (xs_exists(channel->xsh, "/local/domain/0/tapdisk/log-dirty"))
+		req.u.params.flags |= TAPDISK_MESSAGE_FLAG_LOG_DIRTY;
+
+	return tapdisk_channel_send_message(channel, &req, 2);
+}
+
+/*
+ * Handle the response to an open request: record the image geometry,
+ * complete the connection, and replay a pause request that was deferred
+ * while the connection was still being set up.
+ *
+ * Returns 0 on success.  On failure the channel is closed (and freed) via
+ * tapdisk_channel_fatal() and the error is returned.
+ */
+static int
+tapdisk_channel_receive_open_response(tapdisk_channel_t *channel,
+				      tapdisk_message_t *message)
+{
+	int err;
+
+	channel->image.size = message->u.image.sectors;
+	channel->image.secsize = message->u.image.sector_size;
+	channel->image.info = message->u.image.info;
+
+	err = tapdisk_channel_complete_connection(channel);
+	if (err)
+		goto fail;
+
+	/* did we receive a pause request before the connection completed? */
+	if (channel->pause_needed) {
+		DPRINTF("%s: deferred pause request\n", channel->path);
+
+		/*
+		 * Clear the flag before replaying the event: the handler
+		 * may close the channel, which frees it, so the channel
+		 * must not be touched once the handler has returned.
+		 */
+		channel->pause_needed = 0;
+		tapdisk_channel_pause_event(channel->xsh,
+					    &channel->pause_watch,
+					    channel->pause_str);
+	}
+
+	return 0;
+
+fail:
+	tapdisk_channel_fatal(channel,
+			      "failure completing connection: %d", err);
+	return err;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_CLOSE request for this channel.
+ * Returns the result of tapdisk_channel_send_message() (2 second timeout).
+ */
+static int
+tapdisk_channel_send_shutdown_request(tapdisk_channel_t *channel)
+{
+	tapdisk_message_t req;
+
+	memset(&req, 0, sizeof(req));
+
+	req.cookie = channel->cookie;
+	req.drivertype = channel->drivertype;
+	req.type = TAPDISK_MESSAGE_CLOSE;
+
+	return tapdisk_channel_send_message(channel, &req, 2);
+}
+
+/*
+ * Handle the response to a close request.
+ *
+ * 'open' is cleared first so that tapdisk_channel_close() does not send
+ * another shutdown request; the channel is then torn down and freed.
+ * Always returns 0; @message is not inspected.
+ */
+static int
+tapdisk_channel_receive_shutdown_response(tapdisk_channel_t *channel,
+ tapdisk_message_t *message)
+{
+ channel->open = 0;
+ channel->state = TAPDISK_CHANNEL_CLOSED;
+ tapdisk_channel_close(channel);
+ return 0;
+}
+
+/*
+ * Report a runtime error sent by tapdisk.  The text carried in the
+ * message is logged and published through tapdisk_channel_error(); the
+ * channel stays open.  Always returns 0.
+ *
+ * NOTE(review): assumes message->u.string.text is NUL-terminated; confirm
+ * against the sender.
+ */
+static int
+tapdisk_channel_receive_runtime_error(tapdisk_channel_t *channel,
+ tapdisk_message_t *message)
+{
+ tapdisk_channel_error(channel,
+ "runtime error: %s", message->u.string.text);
+ return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_PID request and, if it was written successfully,
+ * mark the channel as open.
+ *
+ * Returns the result of tapdisk_channel_send_message() (2 second timeout).
+ */
+static int
+tapdisk_channel_send_pid_request(tapdisk_channel_t *channel)
+{
+	tapdisk_message_t req;
+	int rc;
+
+	memset(&req, 0, sizeof(req));
+
+	req.cookie = channel->cookie;
+	req.drivertype = channel->drivertype;
+	req.type = TAPDISK_MESSAGE_PID;
+
+	rc = tapdisk_channel_send_message(channel, &req, 2);
+	if (rc == 0)
+		channel->open = 1;
+
+	return rc;
+}
+
+/*
+ * Handle the response to a pid request: remember the tapdisk pid, set its
+ * scheduling priority to PRIO_SPECIAL_IO and send the open request.
+ *
+ * Returns 0 on success.  On failure the channel is closed (and freed) via
+ * tapdisk_channel_fatal() and a negative errno value is returned.
+ */
+static int
+tapdisk_channel_receive_pid_response(tapdisk_channel_t *channel,
+				     tapdisk_message_t *message)
+{
+	int err;
+
+	channel->tapdisk_pid = message->u.tapdisk_pid;
+
+	DPRINTF("%s: tapdisk pid: %d\n", channel->path, channel->tapdisk_pid);
+
+	err = setpriority(PRIO_PROCESS, channel->tapdisk_pid, PRIO_SPECIAL_IO);
+	if (err) {
+		/* setpriority() only returns -1; the cause is in errno */
+		err = -errno;
+		tapdisk_channel_fatal(channel,
+				      "setting tapdisk priority: %d", err);
+		return err;
+	}
+
+	err = tapdisk_channel_send_open_request(channel);
+	if (err) {
+		tapdisk_channel_fatal(channel,
+				      "sending open request: %d", err);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_PAUSE request for this channel.
+ * Returns the result of tapdisk_channel_send_message() (2 second timeout).
+ */
+static int
+tapdisk_channel_send_pause_request(tapdisk_channel_t *channel)
+{
+	tapdisk_message_t req;
+
+	memset(&req, 0, sizeof(req));
+
+	DPRINTF("pausing %s\n", channel->path);
+
+	req.cookie = channel->cookie;
+	req.drivertype = channel->drivertype;
+	req.type = TAPDISK_MESSAGE_PAUSE;
+
+	return tapdisk_channel_send_message(channel, &req, 2);
+}
+
+/*
+ * Handle the response to a pause request by writing an empty value to
+ * channel->pause_done_str in xenstore.
+ *
+ * Returns 0 on success.  If the write fails the channel is closed (and
+ * freed) via tapdisk_channel_fatal() and -errno is returned.
+ */
+static int
+tapdisk_channel_receive_pause_response(tapdisk_channel_t *channel,
+				       tapdisk_message_t *message)
+{
+	int err;
+
+	if (xs_write(channel->xsh, XBT_NULL,
+		     channel->pause_done_str, "", strlen("")))
+		return 0;
+
+	err = -errno;
+	tapdisk_channel_fatal(channel,
+			      "failure receiving pause response: %d\n", err);
+	return err;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_RESUME request carrying the channel's current
+ * VDI path.
+ * Returns the result of tapdisk_channel_send_message() (2 second timeout).
+ */
+static int
+tapdisk_channel_send_resume_request(tapdisk_channel_t *channel)
+{
+	tapdisk_message_t req;
+	int path_len;
+
+	/* zero-filled, so the copied path below stays NUL-terminated */
+	memset(&req, 0, sizeof(req));
+
+	path_len = strlen(channel->vdi_path);
+
+	DPRINTF("resuming %s\n", channel->path);
+
+	req.cookie = channel->cookie;
+	req.drivertype = channel->drivertype;
+	req.type = TAPDISK_MESSAGE_RESUME;
+	req.u.params.path_len = path_len;
+	strncpy(req.u.params.path, channel->vdi_path, path_len);
+
+	return tapdisk_channel_send_message(channel, &req, 2);
+}
+
+/*
+ * Handle the response to a resume request by removing
+ * channel->pause_done_str from xenstore.
+ *
+ * Returns 0 on success.  If the removal fails the channel is closed (and
+ * freed) via tapdisk_channel_fatal() and -errno is returned.
+ */
+static int
+tapdisk_channel_receive_resume_response(tapdisk_channel_t *channel,
+					tapdisk_message_t *message)
+{
+	int err;
+
+	if (!xs_rm(channel->xsh, XBT_NULL, channel->pause_done_str)) {
+		err = -errno;
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	/* this is the resume path; the text used to say "pause response" */
+	tapdisk_channel_fatal(channel,
+			      "failure receiving resume response: %d", err);
+	return err;
+}
+
+/*
+ * Xenstore watch callback for the shutdown node.
+ *
+ * If the channel's xenstore directory is gone the channel is closed.
+ * Otherwise the watch is validated and a shutdown request is sent to
+ * tapdisk.  A watch that fails validation with -EINVAL is fatal for the
+ * channel; any other validation error is ignored.
+ */
+static void
+tapdisk_channel_shutdown_event(struct xs_handle *xsh,
+			       struct xenbus_watch *watch, const char *path)
+{
+	tapdisk_channel_t *channel = watch->data;
+	int rc;
+
+	DPRINTF("%s: got watch on %s\n", channel->path, path);
+
+	if (!xs_exists(channel->xsh, channel->path)) {
+		tapdisk_channel_close(channel);
+		return;
+	}
+
+	rc = tapdisk_channel_validate_watch(channel, path);
+	if (rc == 0) {
+		tapdisk_channel_send_shutdown_request(channel);
+		return;
+	}
+
+	if (rc == -EINVAL)
+		tapdisk_channel_fatal(channel, "bad shutdown watch");
+}
+
+/*
+ * Xenstore watch callback for the pause node.
+ *
+ * If the pause node exists, a pause request is sent to tapdisk, unless the
+ * channel is already pausing or paused (ignored) or not yet connected
+ * (deferred via channel->pause_needed).  If the pause node is absent but
+ * the pause-done node exists, "params" is re-read and a resume request is
+ * sent.
+ *
+ * This function may close, and therefore free, the channel.
+ */
+static void
+tapdisk_channel_pause_event(struct xs_handle *xsh,
+ struct xenbus_watch *watch, const char *path)
+{
+ int err, paused;
+ tapdisk_channel_t *channel;
+
+ channel = watch->data;
+
+ DPRINTF("%s: got watch on %s\n", channel->path, path);
+
+ if (!xs_exists(channel->xsh, channel->path)) {
+ tapdisk_channel_close(channel);
+ return;
+ }
+
+ /* NB: The VBD is essentially considered ready since the
+ * backend hotplug event occurred, which is just after
+ * start-tapdisk, not after watch registration. We start
+ * testing xenstore keys with the very first shot, but defer
+ * until after connection completion. */
+
+ err = tapdisk_channel_validate_watch(channel, path);
+ if (err) {
+ if (err == -EINVAL)
+ tapdisk_channel_fatal(channel, "bad pause watch");
+
+ /* -ENOENT (watched node absent) is tolerated; anything else ends here */
+ if (err != -ENOENT)
+ return;
+
+ err = 0;
+ }
+
+ paused = xs_exists(xsh, channel->pause_done_str);
+
+ if (xs_exists(xsh, channel->pause_str)) {
+ /*
+ * Duplicate requests are a protocol violation, but
+ * impossible to identify if watch registration and an
+ * actual pause request may fire separately in close
+ * succession. Warn, but do not signal an error.
+ */
+ int pausing = channel->state == TAPDISK_CHANNEL_WAIT_PAUSE;
+ if (pausing || paused) {
+ DPRINTF("Ignoring pause event for %s vbd %s\n",
+ pausing ? "pausing" : "paused", channel->path);
+ goto out;
+ }
+
+ /* defer if tapdisk is not ready yet */
+ if (!channel->connected) {
+ DPRINTF("%s: deferring pause request\n", path);
+ channel->pause_needed = 1;
+ goto out;
+ }
+
+ err = tapdisk_channel_send_pause_request(channel);
+
+ } else if (xs_exists(xsh, channel->pause_done_str)) {
+ /* resume: drop the old params (vdi_path points into them) and re-read */
+ free(channel->params);
+ channel->params = NULL;
+ channel->vdi_path = NULL;
+
+ err = xs_gather(channel->xsh, channel->path,
+ "params", NULL, &channel->params, NULL);
+ if (err) {
+ EPRINTF("failure re-reading params: %d\n", err);
+ channel->params = NULL;
+ goto out;
+ }
+
+ err = tapdisk_channel_parse_params(channel);
+ if (err)
+ goto out;
+
+ err = tapdisk_channel_send_resume_request(channel);
+ if (err)
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ /* failures here are reported but do not close the channel */
+ if (err)
+ tapdisk_channel_error(channel, "pause event failed: %d", err);
+}
+
+/*
+ * Create (or re-create) the control FIFO @devname under BLKTAP_CTRL_DIR
+ * and open it read/write in non-blocking mode.
+ *
+ * Returns the open file descriptor, or a negative errno value on failure.
+ */
+static int
+tapdisk_channel_open_control_socket(char *devname)
+{
+	int err, fd;
+
+	/*
+	 * errno is captured before every log call below, since EPRINTF
+	 * may modify it.
+	 */
+	err = mkdir(BLKTAP_CTRL_DIR, 0755);
+	if (err == -1 && errno != EEXIST) {
+		err = -errno;
+		EPRINTF("Failure creating %s directory: %d\n",
+			BLKTAP_CTRL_DIR, -err);
+		return err;
+	}
+
+	err = mkfifo(devname, S_IRWXU | S_IRWXG | S_IRWXO);
+	if (err) {
+		if (errno == EEXIST) {
+			/*
+			 * Remove fifo since it may have data from
+			 * its previous use --- earlier invocation
+			 * of tapdisk may not have read all messages.
+			 */
+			err = unlink(devname);
+			if (err) {
+				err = -errno;
+				EPRINTF("ERROR: unlink(%s) failed (%d)\n",
+					devname, -err);
+				return err;
+			}
+
+			err = mkfifo(devname, S_IRWXU | S_IRWXG | S_IRWXO);
+		}
+
+		if (err) {
+			err = -errno;
+			EPRINTF("ERROR: pipe failed (%d)\n", -err);
+			return err;
+		}
+	}
+
+	fd = open(devname, O_RDWR | O_NONBLOCK);
+	if (fd == -1) {
+		err = -errno;
+		EPRINTF("Failed to open %s\n", devname);
+		return err;
+	}
+
+	return fd;
+}
+
+/*
+ * Ask the blktap control device for a new interface (minor number) for
+ * this channel's domid/busid pair, look up the major number, and create
+ * the character device node under BLKTAP_DEV_DIR.
+ *
+ * On success channel->major and channel->minor are set and 0 is returned.
+ * Returns -EINVAL for an out-of-range minor or a negative major, -ENOMEM
+ * on allocation failure, or the error from make_blktap_device().
+ */
+static int
+tapdisk_channel_get_device_number(tapdisk_channel_t *channel)
+{
+	domid_translate_t tr;
+	char *node;
+	int major, minor, rc;
+
+	tr.busid = channel->busid;
+	tr.domid = channel->domid;
+
+	minor = ioctl(channel->blktap_fd, BLKTAP_IOCTL_NEWINTF, tr);
+	if (!(minor > 0 && minor <= MAX_TAP_DEV)) {
+		EPRINTF("invalid dev id: %d\n", minor);
+		return -EINVAL;
+	}
+
+	major = ioctl(channel->blktap_fd, BLKTAP_IOCTL_MAJOR, minor);
+	if (major < 0) {
+		EPRINTF("invalid major id: %d\n", major);
+		return -EINVAL;
+	}
+
+	if (asprintf(&node, "%s/%s%d",
+		     BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1) {
+		EPRINTF("get_new_dev: malloc failed\n");
+		return -ENOMEM;
+	}
+
+	rc = make_blktap_device(node, major, minor, S_IFCHR | 0600);
+	free(node);
+	if (rc != 0)
+		return rc;
+
+	DPRINTF("Received device id %d and major %d, "
+		"sent domid %d and be_id %d\n",
+		minor, major, tr.domid, tr.busid);
+
+	channel->minor = minor;
+	channel->major = major;
+
+	return 0;
+}
+
+/*
+ * Fork and exec "tapdisk <write_dev> <read_dev>", then wait for the
+ * forked child to exit.
+ *
+ * The child closes every descriptor except stdin/stdout/stderr before
+ * exec and exits with status 1 if the exec fails.
+ *
+ * Returns 0 when the child exited with status 0, -errno if fork() or
+ * waitpid() failed, and -EIO if the child exited with a non-zero status
+ * or was terminated by a signal.
+ *
+ * NOTE(review): treating a non-zero exit as failure assumes tapdisk's
+ * launcher process exits 0 on a successful start; confirm against tapdisk.
+ */
+static int
+tapdisk_channel_start_process(tapdisk_channel_t *channel,
+			      char *write_dev, char *read_dev)
+{
+	pid_t child, got;
+	int status;
+	char *argv[] = { "tapdisk", write_dev, read_dev, NULL };
+
+	if ((child = fork()) == -1)
+		return -errno;
+
+	if (!child) {
+		long fd, max_fd;
+
+		/* the limit does not change while we loop; query it once */
+		max_fd = sysconf(_SC_OPEN_MAX);
+		for (fd = 0; fd < max_fd; fd++)
+			if (fd != STDIN_FILENO &&
+			    fd != STDOUT_FILENO &&
+			    fd != STDERR_FILENO)
+				close(fd);
+
+		execvp("tapdisk", argv);
+		_exit(1);
+	}
+
+	/* retry only when interrupted; any other error would never clear */
+	do {
+		got = waitpid(child, &status, 0);
+	} while (got == -1 && errno == EINTR);
+
+	if (got == -1)
+		return -errno;
+
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+		EPRINTF("tapdisk failed to start (wait status 0x%x)\n",
+			status);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Start a new tapdisk process for this channel.
+ *
+ * Allocates a device number, creates and opens the two control FIFOs
+ * (tapctrlwrite<minor> / tapctrlread<minor> under BLKTAP_CTRL_DIR),
+ * launches tapdisk and sends the initial pid request.
+ *
+ * Returns 0 on success or a negative errno value.  On failure before the
+ * pid request is sent, any FIFO descriptors opened here are closed and
+ * channel->read_fd / channel->write_fd are reset to -1.
+ */
+static int
+tapdisk_channel_launch_tapdisk(tapdisk_channel_t *channel)
+{
+	int err;
+	char *read_dev, *write_dev;
+
+	read_dev = NULL;
+	write_dev = NULL;
+	channel->read_fd = -1;
+	channel->write_fd = -1;
+
+	err = tapdisk_channel_get_device_number(channel);
+	if (err)
+		return err;
+
+	err = asprintf(&write_dev,
+		       "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, channel->minor);
+	if (err == -1) {
+		err = -ENOMEM;
+		write_dev = NULL;
+		goto fail;
+	}
+
+	err = asprintf(&read_dev,
+		       "%s/tapctrlread%d", BLKTAP_CTRL_DIR, channel->minor);
+	if (err == -1) {
+		err = -ENOMEM;
+		read_dev = NULL;
+		goto fail;
+	}
+
+	channel->write_fd = tapdisk_channel_open_control_socket(write_dev);
+	if (channel->write_fd < 0) {
+		err = channel->write_fd;
+		channel->write_fd = -1;
+		goto fail;
+	}
+
+	channel->read_fd = tapdisk_channel_open_control_socket(read_dev);
+	if (channel->read_fd < 0) {
+		err = channel->read_fd;
+		channel->read_fd = -1;
+		goto fail;
+	}
+
+	err = tapdisk_channel_start_process(channel, write_dev, read_dev);
+	if (err)
+		goto fail;
+
+	channel->open = 1;
+	channel->channel_id = channel->write_fd;
+
+	free(read_dev);
+	free(write_dev);
+
+	DPRINTF("process launched, channel = %d:%d\n",
+		channel->channel_id, channel->cookie);
+
+	return tapdisk_channel_send_pid_request(channel);
+
+fail:
+	free(read_dev);
+	free(write_dev);
+	/*
+	 * Reset the descriptors after closing them: the channel outlives
+	 * this function, and a stale fd number could later be closed or
+	 * used again by mistake.
+	 */
+	if (channel->read_fd != -1) {
+		close(channel->read_fd);
+		channel->read_fd = -1;
+	}
+	if (channel->write_fd != -1) {
+		close(channel->write_fd);
+		channel->write_fd = -1;
+	}
+	return err;
+}
+
+/*
+ * Connect the channel to a tapdisk process.
+ *
+ * tapdisk_daemon_find_channel() is called first; if channel->tapdisk_pid
+ * is still unset afterwards a new tapdisk is launched.  Otherwise only a
+ * device number is allocated and a pid request is sent over the existing
+ * channel.
+ *
+ * Returns 0 on success or a negative error value.
+ */
+static int
+tapdisk_channel_connect(tapdisk_channel_t *channel)
+{
+	int rc;
+
+	tapdisk_daemon_find_channel(channel);
+
+	if (channel->tapdisk_pid) {
+		DPRINTF("%s: process exists: %d, channel = %d:%d\n",
+			channel->path, channel->tapdisk_pid,
+			channel->channel_id, channel->cookie);
+
+		rc = tapdisk_channel_get_device_number(channel);
+		if (rc != 0)
+			return rc;
+
+		return tapdisk_channel_send_pid_request(channel);
+	}
+
+	return tapdisk_channel_launch_tapdisk(channel);
+}
+
+/*
+ * Build the xenstore node names used by this channel (tapdisk-uuid,
+ * pause, pause-done, shutdown-tapdisk) from channel->path.
+ *
+ * share_tapdisk_str points at a string literal and is never freed.
+ *
+ * Returns 0 on success.  On allocation failure every name is released
+ * and reset to NULL, and -ENOMEM is returned.
+ */
+static int
+tapdisk_channel_init(tapdisk_channel_t *channel)
+{
+	channel->uuid_str = NULL;
+	channel->pause_str = NULL;
+	channel->pause_done_str = NULL;
+	channel->shutdown_str = NULL;
+	channel->share_tapdisk_str = NULL;
+
+	if (asprintf(&channel->uuid_str,
+		     "%s/tapdisk-uuid", channel->path) == -1) {
+		channel->uuid_str = NULL;
+		goto nomem;
+	}
+
+	if (asprintf(&channel->pause_str,
+		     "%s/pause", channel->path) == -1) {
+		channel->pause_str = NULL;
+		goto nomem;
+	}
+
+	if (asprintf(&channel->pause_done_str,
+		     "%s/pause-done", channel->path) == -1) {
+		channel->pause_done_str = NULL;
+		goto nomem;
+	}
+
+	if (asprintf(&channel->shutdown_str,
+		     "%s/shutdown-tapdisk", channel->path) == -1) {
+		channel->shutdown_str = NULL;
+		goto nomem;
+	}
+
+	channel->share_tapdisk_str = "/local/domain/0/tapdisk/share-tapdisks";
+
+	return 0;
+
+nomem:
+	free(channel->uuid_str);
+	channel->uuid_str = NULL;
+	free(channel->pause_str);
+	channel->pause_str = NULL;
+	free(channel->pause_done_str);
+	channel->pause_done_str = NULL;
+	free(channel->shutdown_str);
+	channel->shutdown_str = NULL;
+	channel->share_tapdisk_str = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * Register the pause and shutdown xenstore watches for this channel.
+ *
+ * A watch's 'node' field doubles as its "registered" marker: it is reset
+ * to NULL whenever the watch is not registered.
+ *
+ * Returns 0 on success.  If either registration fails, any watch still
+ * marked as registered is unregistered and the error is returned.
+ */
+static int
+tapdisk_channel_set_watches(tapdisk_channel_t *channel)
+{
+	struct xenbus_watch *pause = &channel->pause_watch;
+	struct xenbus_watch *shutdown = &channel->shutdown_watch;
+	int rc;
+
+	/* watch for pause events */
+	pause->node = channel->pause_str;
+	pause->callback = tapdisk_channel_pause_event;
+	pause->data = channel;
+	rc = register_xenbus_watch(channel->xsh, pause);
+	if (rc != 0) {
+		pause->node = NULL;
+		goto undo;
+	}
+
+	/* watch for shutdown events */
+	shutdown->node = channel->shutdown_str;
+	shutdown->callback = tapdisk_channel_shutdown_event;
+	shutdown->data = channel;
+	rc = register_xenbus_watch(channel->xsh, shutdown);
+	if (rc != 0) {
+		shutdown->node = NULL;
+		goto undo;
+	}
+
+	return 0;
+
+undo:
+	if (pause->node != NULL) {
+		unregister_xenbus_watch(channel->xsh, pause);
+		pause->node = NULL;
+	}
+	if (shutdown->node != NULL) {
+		unregister_xenbus_watch(channel->xsh, shutdown);
+		shutdown->node = NULL;
+	}
+	return rc;
+}
+
+/*
+ * Set channel->storage from <channel->path>/sm-data/storage-type.
+ *
+ * "nfs", "ext" and "lvm" map to the corresponding TAPDISK_STORAGE_TYPE_*
+ * value; a missing node, an unknown value or an allocation failure leaves
+ * the default type in place.
+ */
+static void
+tapdisk_channel_get_storage_type(tapdisk_channel_t *channel)
+{
+	char *node, *kind;
+	unsigned int len;
+
+	channel->storage = TAPDISK_STORAGE_TYPE_DEFAULT;
+
+	if (asprintf(&node, "%s/sm-data/storage-type", channel->path) == -1)
+		return;
+
+	kind = xs_read(channel->xsh, XBT_NULL, node, &len);
+	if (kind != NULL) {
+		if (strcmp(kind, "nfs") == 0)
+			channel->storage = TAPDISK_STORAGE_TYPE_NFS;
+		else if (strcmp(kind, "ext") == 0)
+			channel->storage = TAPDISK_STORAGE_TYPE_EXT;
+		else if (strcmp(kind, "lvm") == 0)
+			channel->storage = TAPDISK_STORAGE_TYPE_LVM;
+	}
+
+	free(node);
+	free(kind);
+}
+
+/*
+ * Extract the bus id from channel->path: the decimal number that follows
+ * the separator located by strsep_len(path, '/', 6).
+ *
+ * Returns 0 and sets channel->busid on success, -EINVAL when the
+ * separator cannot be located inside the path.
+ *
+ * The number is parsed directly from the path.  The previous version
+ * copied the remainder of the path into a 10-byte stack buffer without
+ * checking its length.
+ */
+static int
+tapdisk_channel_get_busid(tapdisk_channel_t *channel)
+{
+	int len, end;
+
+	len = strsep_len(channel->path, '/', 6);
+	end = strlen(channel->path);
+
+	/* the separator index must lie inside the string */
+	if (len < 0 || len >= end) {
+		EPRINTF("invalid path: %s\n", channel->path);
+		return -EINVAL;
+	}
+
+	/* atoi() stops at the first non-digit, i.e. at the end of the path */
+	channel->busid = atoi(channel->path + len + 1);
+	return 0;
+}
+
+/*
+ * Parse channel->params, which has the form "<handle>:<vdi path>".
+ *
+ * Sets channel->vdi_path to the part after the first ':' (it points into
+ * channel->params) and channel->drivertype to the id of the first dtypes[]
+ * entry whose handle starts with <handle>.  If the xenstore node
+ * <channel->path>/sm-data/vdi-type exists, its value replaces <handle>.
+ *
+ * Returns 0 on success.  Returns -EINVAL and clears channel->vdi_path
+ * when the params are too long, have no ':', carry a handle that is empty
+ * or does not fit the local buffer, or name an unknown or disabled driver
+ * type.
+ */
+static int
+tapdisk_channel_parse_params(tapdisk_channel_t *channel)
+{
+	int i, size, err;
+	unsigned int len;
+	size_t hlen;
+	char *sep, *path, handle[10];
+	char *vdi_type;
+	char *vtype;
+
+	path = channel->params;
+	size = sizeof(dtypes) / sizeof(disk_info_t *);
+
+	if (strlen(path) + 1 >= TAPDISK_MESSAGE_MAX_PATH_LENGTH)
+		goto fail;
+
+	sep = strchr(path, ':');
+	if (!sep)
+		goto fail;
+
+	/* the handle plus its terminator must fit the local buffer */
+	hlen = sep - path;
+	if (hlen >= sizeof(handle))
+		goto fail;
+
+	channel->vdi_path = sep + 1;
+	memcpy(handle, path, hlen);
+	handle[hlen] = '\0';
+
+	err = asprintf(&vdi_type, "%s/sm-data/vdi-type", channel->path);
+	if (err == -1)
+		goto fail;
+
+	if (xs_exists(channel->xsh, vdi_type)) {
+		vtype = xs_read(channel->xsh, XBT_NULL, vdi_type, &len);
+		free(vdi_type);
+		if (!vtype)
+			goto fail;
+		if (len >= sizeof(handle) - 1) {
+			free(vtype);
+			goto fail;
+		}
+		snprintf(handle, sizeof(handle), "%s", vtype);
+		free(vtype);
+		hlen = strlen(handle);
+	} else {
+		/* the node name is no longer needed on this path either */
+		free(vdi_type);
+	}
+
+	/* an empty handle would prefix-match every driver type */
+	if (hlen == 0)
+		goto fail;
+
+	for (i = 0; i < size; i++) {
+		/* prefix match over the length of the handle in use */
+		if (strncmp(handle, dtypes[i]->handle, hlen))
+			continue;
+
+		if (dtypes[i]->idnum == -1)
+			goto fail;
+
+		channel->drivertype = dtypes[i]->idnum;
+		return 0;
+	}
+
+fail:
+	EPRINTF("%s: invalid blktap params: %s\n",
+		channel->path, channel->params);
+	channel->vdi_path = NULL;
+	return -EINVAL;
+}
+
+/*
+ * Read the device description ("frontend", "frontend-id", "params",
+ * "mode") from xenstore under channel->path, then derive the driver type,
+ * bus id and storage type from it.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+tapdisk_channel_gather_info(tapdisk_channel_t *channel)
+{
+	int rc;
+
+	rc = xs_gather(channel->xsh, channel->path,
+		       "frontend", NULL, &channel->frontpath,
+		       "frontend-id", "%li", &channel->domid,
+		       "params", NULL, &channel->params,
+		       "mode", "%c", &channel->mode, NULL);
+	if (rc != 0) {
+		EPRINTF("could not find device info: %d\n", rc);
+		return rc;
+	}
+
+	rc = tapdisk_channel_parse_params(channel);
+	if (rc == 0)
+		rc = tapdisk_channel_get_busid(channel);
+	if (rc != 0)
+		return rc;
+
+	/* best effort: falls back to the default storage type */
+	tapdisk_channel_get_storage_type(channel);
+
+	return 0;
+}
+
+/*
+ * Check that xenstore describes a valid start request for this channel:
+ * "start-tapdisk" must exist, and none of "shutdown-request",
+ * channel->shutdown_str or "shutdown-done" may exist.
+ *
+ * Returns 0 when the request is valid, -EINVAL when it is not, and
+ * -ENOMEM when a node name cannot be allocated.
+ */
+static int
+tapdisk_channel_verify_start_request(tapdisk_channel_t *channel)
+{
+	char *node;
+	int invalid;
+
+	if (asprintf(&node, "%s/start-tapdisk", channel->path) == -1)
+		goto nomem;
+	invalid = !xs_exists(channel->xsh, node);
+	free(node);
+	if (invalid)
+		goto bad;
+
+	if (asprintf(&node, "%s/shutdown-request", channel->path) == -1)
+		goto nomem;
+	invalid = xs_exists(channel->xsh, node) ||
+		  xs_exists(channel->xsh, channel->shutdown_str);
+	free(node);
+	if (invalid)
+		goto bad;
+
+	if (asprintf(&node, "%s/shutdown-done", channel->path) == -1)
+		goto nomem;
+	invalid = xs_exists(channel->xsh, node);
+	free(node);
+	if (invalid)
+		goto bad;
+
+	return 0;
+
+bad:
+	EPRINTF("%s:%s: invalid start request\n", __func__, channel->path);
+	return -EINVAL;
+
+nomem:
+	EPRINTF("%s:%s: out of memory\n", __func__, channel->path);
+	return -ENOMEM;
+}
+
+/*
+ * Tear down a channel and free it.
+ *
+ * Sends a shutdown request if the channel is still marked open,
+ * unregisters both xenstore watches, hands the channel to
+ * tapdisk_daemon_close_channel() and releases all owned strings and the
+ * channel itself.  @channel must not be used after this returns.
+ */
+void
+tapdisk_channel_close(tapdisk_channel_t *channel)
+{
+ if (channel->channel_id)
+ DPRINTF("%s: closing channel %d:%d\n",
+ channel->path, channel->channel_id, channel->cookie);
+
+ /* the result of the shutdown request is not checked */
+ if (channel->open)
+ tapdisk_channel_send_shutdown_request(channel);
+
+ /* a non-NULL node marks a watch that is still registered */
+ if (channel->pause_watch.node) {
+ unregister_xenbus_watch(channel->xsh, &channel->pause_watch);
+ channel->pause_watch.node = NULL;
+ }
+
+ if (channel->shutdown_watch.node) {
+ unregister_xenbus_watch(channel->xsh, &channel->shutdown_watch);
+ channel->shutdown_watch.node = NULL;
+ }
+
+ tapdisk_daemon_close_channel(channel);
+
+ /* share_tapdisk_str is a string literal and vdi_path points into params,
+ * so neither is freed here */
+ free(channel->params);
+ free(channel->frontpath);
+ free(channel->shutdown_str);
+ free(channel->pause_done_str);
+ free(channel->pause_str);
+ free(channel->uuid_str);
+ free(channel->path);
+ free(channel);
+}
+
+/*
+ * Create a channel for the xenstore device directory @path and connect
+ * it to a tapdisk process.
+ *
+ * @_channel  receives the new channel on success and NULL on failure.
+ * @path      xenstore directory of the device; copied.
+ * @xsh       xenstore handle stored in the channel.
+ * @blktap_fd blktap control device descriptor stored in the channel.
+ * @cookie    value checked against the device's tapdisk-uuid node.
+ *
+ * Returns 0 on success or a negative errno value.  On failure the error
+ * is reported and the partially built channel is closed and freed.
+ */
+int
+tapdisk_channel_open(tapdisk_channel_t **_channel,
+		     char *path, struct xs_handle *xsh,
+		     int blktap_fd, uint16_t cookie)
+{
+	int err;
+	char *msg;
+	tapdisk_channel_t *channel;
+
+	msg = NULL;
+	*_channel = NULL;
+
+	channel = calloc(1, sizeof(tapdisk_channel_t));
+	if (!channel)
+		return -ENOMEM;
+
+	channel->xsh = xsh;
+	channel->blktap_fd = blktap_fd;
+	channel->cookie = cookie;
+	channel->state = TAPDISK_CHANNEL_IDLE;
+
+	INIT_LIST_HEAD(&channel->list);
+
+	channel->path = strdup(path);
+	if (!channel->path) {
+		/*
+		 * The error helpers format channel->path, so they cannot
+		 * be used without it.  Nothing else has been acquired
+		 * yet; just release the channel.
+		 */
+		free(channel);
+		return -ENOMEM;
+	}
+
+	err = tapdisk_channel_init(channel);
+	if (err) {
+		msg = "allocating device";
+		goto fail;
+	}
+
+	err = tapdisk_channel_check_uuid(channel);
+	if (err) {
+		msg = "checking uuid";
+		goto fail;
+	}
+
+	err = tapdisk_channel_gather_info(channel);
+	if (err) {
+		msg = "gathering parameters";
+		goto fail;
+	}
+
+	err = tapdisk_channel_verify_start_request(channel);
+	if (err) {
+		msg = "invalid start request";
+		goto fail;
+	}
+
+	err = tapdisk_channel_set_watches(channel);
+	if (err) {
+		msg = "registering xenstore watches";
+		goto fail;
+	}
+
+	err = tapdisk_channel_connect(channel);
+	if (err) {
+		msg = "connecting to tapdisk";
+		goto fail;
+	}
+
+	*_channel = channel;
+	return 0;
+
+fail:
+	/* closes and frees the channel */
+	tapdisk_channel_fatal(channel, "%s: %d", (msg ? msg : "failure"), err);
+	return err;
+}
+
+/*
+ * Dispatch a message received from tapdisk to its handler.
+ *
+ * The message is first validated against the channel state.  A message
+ * that fails validation, or whose type has no handler, is fatal for the
+ * channel: the channel is closed and -EINVAL is returned.  Otherwise the
+ * handler's result is returned.
+ */
+int
+tapdisk_channel_receive_message(tapdisk_channel_t *c, tapdisk_message_t *m)
+{
+	if (tapdisk_channel_validate_message(c, m) == 0) {
+		switch (m->type) {
+		case TAPDISK_MESSAGE_PID_RSP:
+			return tapdisk_channel_receive_pid_response(c, m);
+
+		case TAPDISK_MESSAGE_OPEN_RSP:
+			return tapdisk_channel_receive_open_response(c, m);
+
+		case TAPDISK_MESSAGE_PAUSE_RSP:
+			return tapdisk_channel_receive_pause_response(c, m);
+
+		case TAPDISK_MESSAGE_RESUME_RSP:
+			return tapdisk_channel_receive_resume_response(c, m);
+
+		case TAPDISK_MESSAGE_CLOSE_RSP:
+			return tapdisk_channel_receive_shutdown_response(c, m);
+
+		case TAPDISK_MESSAGE_RUNTIME_ERROR:
+			return tapdisk_channel_receive_runtime_error(c, m);
+
+		default:
+			break;
+		}
+	}
+
+	tapdisk_channel_fatal(c, "received unexpected message %s in state %d",
+			      tapdisk_message_name(m->type), c->state);
+	return -EINVAL;
+}
diff --git a/tools/blktap2/daemon/tapdisk-daemon.c b/tools/blktap2/daemon/tapdisk-daemon.c
new file mode 100644
index 0000000000..ecfc0f3c5b
--- /dev/null
+++ b/tools/blktap2/daemon/tapdisk-daemon.c
@@ -0,0 +1,599 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+
+#include <xs.h>
+#include "disktypes.h"
+#include "tapdisk-dispatch.h"
+
+#define TAPDISK_DAEMON_DOMID_WATCH "domid-watch"
+#define TAPDISK_DAEMON_PIDFILE "/var/run/blktapctrl.pid"
+
+/*
+ * Process-wide state of the blktap daemon.  A single static instance,
+ * tapdisk_daemon, is declared right after the type.
+ */
+typedef struct tapdisk_daemon {
+ /* xenstore path "/local/domain/<domid>/backend/tap"; allocated by
+  * tapdisk_daemon_set_node() and used as the watch node */
+ char *node;
+ /* descriptor of the blktap control device opened in tapdisk_daemon_init() */
+ int blktap_fd;
+ /* counter incremented for every probed device; its value is written to
+  * xenstore as "tapdisk-uuid" and handed to tapdisk_channel_open() */
+ uint16_t cookie;
+
+ /* xenstore connection obtained from xs_daemon_open() */
+ struct xs_handle *xsh;
+ /* list of open tapdisk_channel_t objects, linked through their 'list' member */
+ struct list_head channels;
+ /* watch on 'node'; its callback is tapdisk_daemon_probe() */
+ struct xenbus_watch watch;
+} tapdisk_daemon_t;
+
+static tapdisk_daemon_t tapdisk_daemon;
+
+/* Iterate over every channel of the daemon; 'tmp' is the scratch cursor
+ * required by list_for_each_entry_safe(). */
+#define tapdisk_daemon_for_each_channel(c, tmp) \
+ list_for_each_entry_safe(c, tmp, &tapdisk_daemon.channels, list)
+
+/* Larger of two values; note that each argument may be evaluated twice. */
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+
+/*
+ * Log the daemon version banner followed by the name of every entry in
+ * the dtypes[] driver table.
+ */
+static void
+tapdisk_daemon_print_drivers(void)
+{
+    int idx;
+    const int count = sizeof(dtypes) / sizeof(disk_info_t *);
+
+    DPRINTF("blktap-daemon: v1.0.2\n");
+
+    idx = 0;
+    while (idx < count) {
+        DPRINTF("Found driver: [%s]\n", dtypes[idx]->name);
+        idx++;
+    }
+}
+
+/*
+ * Create and lock the daemon pid file, then record @pid in it.
+ *
+ * The exclusive lockf() lock is what keeps a second daemon from starting:
+ * when the lock cannot be taken the process exits with status 0.  On
+ * success the descriptor is intentionally left open (with FD_CLOEXEC set)
+ * so that the lock is held for the lifetime of the process.
+ *
+ * Returns 0 on success or a negative errno value on failure.  The
+ * descriptor is closed on every failure path.
+ */
+static int
+tapdisk_daemon_write_pidfile(long pid)
+{
+    char buf[100];
+    int len, fd, flags, err;
+    ssize_t written;
+
+    fd = open(TAPDISK_DAEMON_PIDFILE, O_RDWR | O_CREAT, 0600);
+    if (fd == -1) {
+        /* save errno first: the logging call may overwrite it */
+        err = -errno;
+        EPRINTF("Opening pid file failed (%d)\n", -err);
+        return err;
+    }
+
+    /* We exit silently if daemon already running */
+    if (lockf(fd, F_TLOCK, 0) == -1)
+        exit(0);
+
+    /* Set FD_CLOEXEC, so that tapdisk doesn't get this file descriptor */
+    flags = fcntl(fd, F_GETFD);
+    if (flags == -1) {
+        err = -errno;
+        EPRINTF("F_GETFD failed (%d)\n", -err);
+        goto fail;
+    }
+
+    if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1) {
+        err = -errno;
+        EPRINTF("F_SETFD failed (%d)\n", -err);
+        goto fail;
+    }
+
+    /*
+     * The file is opened without O_TRUNC (truncating before the lock is
+     * held would wipe a running daemon's pid), so discard any longer
+     * content left by a previous instance now that we own the lock.
+     */
+    if (ftruncate(fd, 0) == -1) {
+        err = -errno;
+        EPRINTF("Truncating pid file failed (%d)\n", -err);
+        goto fail;
+    }
+
+    len = snprintf(buf, sizeof(buf), "%ld\n", pid);
+    written = write(fd, buf, len);
+    if (written != len) {
+        /* a short write leaves errno untouched; report it as -EIO */
+        err = (written == -1 ? -errno : -EIO);
+        EPRINTF("Writing pid file failed (%d)\n", -err);
+        goto fail;
+    }
+
+    return 0;
+
+fail:
+    close(fd);
+    return err;
+}
+
+/*
+ * Reset the global daemon state, create and open the blktap control
+ * device node, and connect to xenstore (two attempts, sleeping two
+ * seconds after each failed one).
+ *
+ * Returns 0 on success.  On failure the control device descriptor is
+ * closed if it was opened, the global state is zeroed again and a negative
+ * error value is returned.
+ */
+static int
+tapdisk_daemon_init(void)
+{
+    char *devname = NULL;
+    int attempt, err, devno;
+
+    memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon));
+
+    if (asprintf(&devname, "%s/%s0",
+                 BLKTAP_DEV_DIR, BLKTAP_DEV_NAME) == -1) {
+        /* devname is undefined after a failed asprintf() */
+        devname = NULL;
+        err = -ENOMEM;
+        goto fail;
+    }
+
+    devno = xc_find_device_number("blktap0");
+    if (devno < 0) {
+        err = devno;
+        goto fail;
+    }
+
+    err = make_blktap_device(devname, major(devno), 0, S_IFCHR | 0600);
+    if (err)
+        goto fail;
+
+    tapdisk_daemon.blktap_fd = open(devname, O_RDWR);
+    if (tapdisk_daemon.blktap_fd == -1) {
+        err = -errno;
+        EPRINTF("blktap0 open failed\n");
+        goto fail;
+    }
+
+    attempt = 0;
+    while (attempt++ < 2) {
+        tapdisk_daemon.xsh = xs_daemon_open();
+        if (tapdisk_daemon.xsh)
+            break;
+
+        EPRINTF("xs_daemon_open failed -- is xenstore running?\n");
+        sleep(2);
+    }
+
+    if (!tapdisk_daemon.xsh) {
+        err = -ENOSYS;
+        goto fail;
+    }
+
+    INIT_LIST_HEAD(&tapdisk_daemon.channels);
+
+    free(devname);
+    return 0;
+
+fail:
+    if (tapdisk_daemon.blktap_fd > 0)
+        close(tapdisk_daemon.blktap_fd);
+    free(devname);
+    memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon));
+    EPRINTF("%s: %d\n", __func__, err);
+
+    return err;
+}
+
+/*
+ * Build the xenstore backend path "/local/domain/<domid>/backend/tap"
+ * from the id returned by get_dom_domid() and store it in
+ * tapdisk_daemon.node.
+ *
+ * Returns 0 on success, -EAGAIN when the domain id is not available, or
+ * -ENOMEM when the path cannot be allocated (node is then NULL).
+ */
+static int
+tapdisk_daemon_set_node(void)
+{
+    int ret = 0;
+    char *domid = get_dom_domid(tapdisk_daemon.xsh);
+
+    if (!domid)
+        return -EAGAIN;
+
+    if (asprintf(&tapdisk_daemon.node,
+                 "/local/domain/%s/backend/tap", domid) == -1) {
+        tapdisk_daemon.node = NULL;
+        ret = -ENOMEM;
+    }
+
+    free(domid);
+    return ret;
+}
+
+/*
+ * Consume one xenstore watch event and, if it carries the domid-watch
+ * token, retry tapdisk_daemon_set_node().
+ *
+ * Returns -EAGAIN when no event could be read, -EINVAL when the event has
+ * a different token, otherwise the result of tapdisk_daemon_set_node().
+ */
+static int
+tapdisk_daemon_get_domid(void)
+{
+    int err;
+    unsigned int num;
+    char **event;
+
+    event = xs_read_watch(tapdisk_daemon.xsh, &num);
+    if (!event)
+        return -EAGAIN;
+
+    if (strcmp(event[XS_WATCH_TOKEN], TAPDISK_DAEMON_DOMID_WATCH) == 0)
+        err = tapdisk_daemon_set_node();
+    else
+        err = -EINVAL;
+
+    free(event);
+    return err;
+}
+
+/*
+ * Block until the xenstore backend node of this domain can be determined.
+ *
+ * If tapdisk_daemon_set_node() succeeds right away nothing else is done.
+ * Otherwise a watch is placed on "/local/domain" and the function sleeps
+ * in select() on the xenstore descriptor, retrying on every watch event
+ * for as long as the attempt reports -EAGAIN.
+ *
+ * Returns 0 on success, -EINVAL if the watch cannot be set, or another
+ * negative error value.
+ */
+static int
+tapdisk_daemon_wait_for_domid(void)
+{
+    int err, fd, ready;
+    fd_set readfds;
+
+    err = tapdisk_daemon_set_node();
+    if (!err)
+        return 0;
+
+    if (!xs_watch(tapdisk_daemon.xsh, "/local/domain",
+                  TAPDISK_DAEMON_DOMID_WATCH)) {
+        EPRINTF("unable to set domain id watch\n");
+        return -EINVAL;
+    }
+
+    do {
+        fd = xs_fileno(tapdisk_daemon.xsh);
+
+        FD_ZERO(&readfds);
+        FD_SET(fd, &readfds);
+
+        ready = select(fd + 1, &readfds, NULL, NULL, NULL);
+        if (ready == -1) {
+            /*
+             * readfds says nothing about readiness after a failed
+             * select(): retry when merely interrupted, give up on
+             * any other error instead of looping on it forever.
+             */
+            err = (errno == EINTR ? -EAGAIN : -errno);
+            continue;
+        }
+
+        if (FD_ISSET(fd, &readfds))
+            err = tapdisk_daemon_get_domid();
+        else
+            err = -EAGAIN;
+    } while (err == -EAGAIN);
+
+    xs_unwatch(tapdisk_daemon.xsh,
+               "/local/domain", TAPDISK_DAEMON_DOMID_WATCH);
+    return err;
+}
+
+/*
+ * Return 1 when @node is exactly "start-tapdisk", the xenstore node name
+ * that tapdisk_daemon_probe() treats as a request for a new device, and 0
+ * otherwise.
+ */
+static inline int
+tapdisk_daemon_new_vbd_event(const char *node)
+{
+    return strcmp(node, "start-tapdisk") == 0;
+}
+
+/*
+ * Write @uuid, formatted as an unsigned decimal string, to the xenstore
+ * node "<path>/tapdisk-uuid".
+ *
+ * Returns 0 on success, -ENOMEM if the node path cannot be allocated, or
+ * a negative errno value when xs_write() fails.
+ */
+static int
+tapdisk_daemon_write_uuid(char *path, uint32_t uuid)
+{
+    int err;
+    /* a 32-bit value needs at most 10 digits plus the terminator */
+    char *cpath, uuid_str[12];
+
+    snprintf(uuid_str, sizeof(uuid_str), "%u", uuid);
+
+    if (asprintf(&cpath, "%s/tapdisk-uuid", path) == -1)
+        return -ENOMEM;
+
+    err = 0;
+    if (!xs_write(tapdisk_daemon.xsh, XBT_NULL,
+                  cpath, uuid_str, strlen(uuid_str)))
+        /*
+         * Read errno before free() can disturb it, and never report
+         * success for a failed write even if errno was left at zero.
+         */
+        err = (errno ? -errno : -EIO);
+
+    free(cpath);
+    return err;
+}
+
+/*
+ * Watch callback installed by tapdisk_daemon_start() on the backend node.
+ *
+ * Reacts only to events whose path consists of eight '/'-separated
+ * components followed by the node name "start-tapdisk", and only while
+ * that node exists.  For such an event the path is cut back to the device
+ * directory, a fresh cookie is written there as "tapdisk-uuid", and a
+ * channel is opened and added to the daemon's channel list.
+ *
+ * @xsh:   xenstore handle the event was delivered on
+ * @watch: the watch that fired (not used here)
+ * @path:  xenstore path that changed
+ */
+static void
+tapdisk_daemon_probe(struct xs_handle *xsh,
+                     struct xenbus_watch *watch, const char *path)
+{
+    char *cpath;
+    int len, err;
+    uint32_t cookie;
+    const char *node;
+    tapdisk_channel_t *channel;
+
+    /* offset of the separator in front of the node name */
+    len = strsep_len(path, '/', 7);
+    if (len < 0)
+        return;
+
+    /*
+     * With exactly seven separators strsep_len() returns strlen(path):
+     * the event is for the device directory itself and there is no node
+     * component.  path + len + 1 would point past the terminator.
+     */
+    if (path[len] == '\0')
+        return;
+
+    node = path + len + 1;
+
+    if (!tapdisk_daemon_new_vbd_event(node))
+        return;
+
+    if (!xs_exists(xsh, path))
+        return;
+
+    cpath = strdup(path);
+    if (!cpath) {
+        EPRINTF("failed to allocate control path for %s\n", path);
+        return;
+    }
+    /* strip "/start-tapdisk" to obtain the device directory */
+    cpath[len] = '\0';
+
+    cookie = tapdisk_daemon.cookie++;
+    err = tapdisk_daemon_write_uuid(cpath, cookie);
+    if (err)
+        goto out;
+
+    DPRINTF("%s: got watch on %s, uuid = %u\n", __func__, path, cookie);
+
+    err = tapdisk_channel_open(&channel, cpath,
+                               tapdisk_daemon.xsh,
+                               tapdisk_daemon.blktap_fd,
+                               cookie);
+    if (!err)
+        list_add(&channel->list, &tapdisk_daemon.channels);
+    else
+        EPRINTF("failed to open tapdisk channel for %s: %d\n",
+                path, err);
+
+out:
+    free(cpath);
+}
+
+/*
+ * Bring the daemon into service: wait for the backend node to be known,
+ * register the probe watch on it, then switch the blktap control device
+ * to interpose mode and tell it the daemon's pid.
+ *
+ * Returns 0 on success or the error from the failing step.  If the watch
+ * cannot be registered the node string is released again.
+ */
+static int
+tapdisk_daemon_start(void)
+{
+    int err = tapdisk_daemon_wait_for_domid();
+
+    if (err)
+        return err;
+
+    tapdisk_daemon.watch.node = tapdisk_daemon.node;
+    tapdisk_daemon.watch.callback = tapdisk_daemon_probe;
+
+    err = register_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch);
+    if (err) {
+        free(tapdisk_daemon.node);
+        tapdisk_daemon.node = NULL;
+        tapdisk_daemon.watch.node = NULL;
+        EPRINTF("%s: %d\n", __func__, err);
+        return err;
+    }
+
+    /* results of the two control ioctls are not checked */
+    ioctl(tapdisk_daemon.blktap_fd,
+          BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
+    ioctl(tapdisk_daemon.blktap_fd, BLKTAP_IOCTL_SENDPID, getpid());
+
+    return 0;
+}
+
+/*
+ * Undo tapdisk_daemon_start(): remove the probe watch, put the blktap
+ * control device back into passthrough mode and close it.
+ *
+ * Always returns 0.
+ */
+static int
+tapdisk_daemon_stop(void)
+{
+    int ctl_fd;
+
+    unregister_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch);
+
+    ctl_fd = tapdisk_daemon.blktap_fd;
+    ioctl(ctl_fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH);
+    close(ctl_fd);
+
+    return 0;
+}
+
+/*
+ * Release the node string, close the xenstore connection and zero the
+ * global daemon state.
+ */
+static void
+tapdisk_daemon_free(void)
+{
+    tapdisk_daemon_t *d = &tapdisk_daemon;
+
+    free(d->node);
+    xs_daemon_close(d->xsh);
+    memset(d, 0, sizeof(*d));
+}
+
+/*
+ * Read exactly one tapdisk_message_t from @fd into @message.
+ *
+ * @message is zeroed first.  The function then alternates select() and
+ * read() until the whole structure has arrived, and gives up as soon as
+ * select() fails or times out, or read() returns an error or end of file.
+ *
+ * @timeout is the select() timeout in seconds.
+ *
+ * Returns 0 when a complete message was read, -EIO otherwise.
+ */
+static int
+tapdisk_daemon_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+    fd_set readfds;
+    struct timeval tv;
+    int ret, len, offset;
+    /*
+     * Byte-wise view of the message: offsets are counted in bytes, so
+     * they must not be added to the tapdisk_message_t pointer itself
+     * (that would advance by whole structures after a partial read).
+     */
+    char *buf = (char *)message;
+
+    tv.tv_sec = timeout;
+    tv.tv_usec = 0;
+    offset = 0;
+    len = sizeof(tapdisk_message_t);
+
+    memset(message, 0, sizeof(tapdisk_message_t));
+
+    while (offset < len) {
+        FD_ZERO(&readfds);
+        FD_SET(fd, &readfds);
+
+        /* we don't bother reinitializing tv. at worst, it will wait a
+         * bit more time than expected. */
+
+        ret = select(fd + 1, &readfds, NULL, NULL, &tv);
+        if (ret == -1)
+            break;
+        else if (FD_ISSET(fd, &readfds)) {
+            ret = read(fd, buf + offset, len - offset);
+            if (ret <= 0)
+                break;
+            offset += ret;
+        } else
+            break;    /* timed out */
+    }
+
+    return (offset == len ? 0 : -EIO);
+}
+
+/*
+ * Read one message from @fd (two second timeout) and hand it to the
+ * channel that reads from @fd and whose cookie equals the message cookie.
+ *
+ * Returns the read error, the result of tapdisk_channel_receive_message(),
+ * or -EINVAL when no channel matches.
+ */
+static int
+tapdisk_daemon_receive_message(int fd)
+{
+    int err;
+    tapdisk_message_t m;
+    tapdisk_channel_t *c, *tmp;
+
+    err = tapdisk_daemon_read_message(fd, &m, 2);
+    if (err) {
+        EPRINTF("failed reading message on %d: %d\n", fd, err);
+        return err;
+    }
+
+    tapdisk_daemon_for_each_channel(c, tmp) {
+        if (c->read_fd != fd || c->cookie != m.cookie)
+            continue;
+
+        DPRINTF("got '%s' message from %d:%d\n",
+                tapdisk_message_name(m.type),
+                c->channel_id, c->cookie);
+
+        return tapdisk_channel_receive_message(c, &m);
+    }
+
+    EPRINTF("unrecognized message on %d: '%s' (uuid = %u)\n",
+            fd, tapdisk_message_name(m.type), m.cookie);
+
+    return -EINVAL;
+}
+
+/*
+ * Fill @readfds with the xenstore descriptor and the read descriptor of
+ * every channel.
+ *
+ * Returns the highest descriptor placed in the set.
+ */
+static int
+tapdisk_daemon_set_fds(fd_set *readfds)
+{
+    int highest;
+    tapdisk_channel_t *channel, *tmp;
+
+    highest = xs_fileno(tapdisk_daemon.xsh);
+
+    FD_ZERO(readfds);
+    FD_SET(highest, readfds);
+
+    tapdisk_daemon_for_each_channel(channel, tmp) {
+        FD_SET(channel->read_fd, readfds);
+        if (channel->read_fd > highest)
+            highest = channel->read_fd;
+    }
+
+    return highest;
+}
+
+/*
+ * Service the descriptors reported ready in @readfds: fire the next
+ * xenstore watch if the xenstore descriptor is set, then receive a
+ * message from the first channel whose read descriptor is set.
+ *
+ * Returns the result of that receive, or 0 if no channel was ready.
+ */
+static int
+tapdisk_daemon_check_fds(fd_set *readfds)
+{
+    tapdisk_channel_t *channel, *tmp;
+
+    if (FD_ISSET(xs_fileno(tapdisk_daemon.xsh), readfds))
+        xs_fire_next_watch(tapdisk_daemon.xsh);
+
+    tapdisk_daemon_for_each_channel(channel, tmp) {
+        if (!FD_ISSET(channel->read_fd, readfds))
+            continue;
+
+        /* only one channel is handled per call */
+        return tapdisk_daemon_receive_message(channel->read_fd);
+    }
+
+    return 0;
+}
+
+/*
+ * Main event loop: wait in select() on the xenstore and channel
+ * descriptors and service whatever becomes readable.  A failed select()
+ * is simply retried.  The loop has no exit condition, so the trailing
+ * return is never reached.
+ */
+static int
+tapdisk_daemon_run(void)
+{
+    int err, max;
+    fd_set readfds;
+
+    for (;;) {
+        max = tapdisk_daemon_set_fds(&readfds);
+
+        err = select(max + 1, &readfds, NULL, NULL, NULL);
+        if (err >= 0)
+            err = tapdisk_daemon_check_fds(&readfds);
+    }
+
+    return err;
+}
+
+/*
+ * Decide whether @channel may reuse an existing tapdisk process.
+ *
+ * The channel's descriptors and pid are cleared first.  Sharing is
+ * enabled only when the xenstore node named by share_tapdisk_str exists;
+ * in that case the descriptors, channel id and pid of the first listed
+ * channel with the same driver type are copied into @channel.  If there
+ * is no such channel the cleared values are left in place.
+ */
+void
+tapdisk_daemon_find_channel(tapdisk_channel_t *channel)
+{
+    tapdisk_channel_t *other, *next;
+
+    channel->read_fd = 0;
+    channel->write_fd = 0;
+    channel->tapdisk_pid = 0;
+
+    /* do we want multiple vbds per tapdisk? */
+    channel->shared =
+        xs_exists(tapdisk_daemon.xsh, channel->share_tapdisk_str) ? 1 : 0;
+    if (!channel->shared)
+        return;
+
+    /* check if we already have a process started */
+    tapdisk_daemon_for_each_channel(other, next) {
+        if (other->drivertype != channel->drivertype)
+            continue;
+
+        channel->write_fd = other->write_fd;
+        channel->read_fd = other->read_fd;
+        channel->channel_id = other->channel_id;
+        channel->tapdisk_pid = other->tapdisk_pid;
+        return;
+    }
+}
+
+/*
+ * Unlink @channel from the daemon's channel list and close its read and
+ * write descriptors, unless another listed channel has the same
+ * channel_id, in which case the descriptors are left open.
+ */
+void
+tapdisk_daemon_close_channel(tapdisk_channel_t *channel)
+{
+    int still_used = 0;
+    tapdisk_channel_t *other, *next;
+
+    list_del(&channel->list);
+
+    tapdisk_daemon_for_each_channel(other, next) {
+        if (other->channel_id == channel->channel_id) {
+            still_used = 1;
+            break;
+        }
+    }
+
+    if (still_used)
+        return;
+
+    close(channel->read_fd);
+    close(channel->write_fd);
+}
+
+/*
+ * Daemon entry point: detach from the terminal, raise the core-dump
+ * limit, open the system log, take the pid-file lock, then initialise,
+ * start and run the daemon.
+ *
+ * Returns 0 after a clean run or the negative error value of the step
+ * that failed.
+ */
+int
+main(int argc, char *argv[])
+{
+    int err;
+    char buf[128];
+
+    if (daemon(0, 0)) {
+        /* save errno first: the logging call may overwrite it */
+        err = -errno;
+        EPRINTF("daemon() failed (%d)\n", -err);
+        return err;
+    }
+
+    /* <sys/resource.h> is included at file scope, not inside this body */
+#define CORE_DUMP
+#if defined(CORE_DUMP)
+    {
+        /* set up core-dumps */
+        struct rlimit rlim;
+
+        rlim.rlim_cur = RLIM_INFINITY;
+        rlim.rlim_max = RLIM_INFINITY;
+        if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+            EPRINTF("setrlimit failed: %d\n", errno);
+    }
+#endif
+
+    snprintf(buf, sizeof(buf), "BLKTAP-DAEMON[%d]", getpid());
+    openlog(buf, LOG_CONS | LOG_ODELAY, LOG_DAEMON);
+
+    err = tapdisk_daemon_write_pidfile(getpid());
+    if (err)
+        goto out;
+
+    tapdisk_daemon_print_drivers();
+
+    err = tapdisk_daemon_init();
+    if (err)
+        goto out;
+
+    err = tapdisk_daemon_start();
+    if (err)
+        goto out;
+
+    tapdisk_daemon_run();
+
+    tapdisk_daemon_stop();
+    tapdisk_daemon_free();
+
+    err = 0;
+
+out:
+    if (err)
+        EPRINTF("failed to start %s: %d\n", argv[0], err);
+    closelog();
+    return err;
+}
diff --git a/tools/blktap2/daemon/tapdisk-dispatch-common.c b/tools/blktap2/daemon/tapdisk-dispatch-common.c
new file mode 100644
index 0000000000..3d72b7dc7a
--- /dev/null
+++ b/tools/blktap2/daemon/tapdisk-dispatch-common.c
@@ -0,0 +1,94 @@
+/*
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "tapdisk-dispatch.h"
+
+/*
+ * Return the offset in @str of the occurrence of @c that follows @len
+ * earlier occurrences (so @len == 0 finds the first one).
+ *
+ * If the string ends after exactly @len occurrences, the string length
+ * is returned; with fewer occurrences the result is -ERANGE.
+ */
+int
+strsep_len(const char *str, char c, unsigned int len)
+{
+    unsigned int pos = 0;
+
+    while (str[pos] != '\0') {
+        if (str[pos] == c) {
+            if (!len)
+                return pos;
+            --len;
+        }
+        pos++;
+    }
+
+    return (len == 0) ? pos : -ERANGE;
+}
+
+/*
+ * (Re)create the device node @devname with the given @major/@minor
+ * numbers and mode @perm.
+ *
+ * An existing node is unlinked first and BLKTAP_DEV_DIR is created when
+ * missing.  If mknod() fails, the current state of the path is logged
+ * for diagnosis.
+ *
+ * Returns 0 on success or a negative errno value from the failing call.
+ */
+int
+make_blktap_device(char *devname, int major, int minor, int perm)
+{
+    int err;
+    struct stat st;
+
+    if (unlink(devname) && errno != ENOENT) {
+        /* save errno first: the logging call may overwrite it */
+        err = -errno;
+        EPRINTF("unlink %s failed: %d\n", devname, -err);
+        return err;
+    }
+
+    /* Need to create device */
+    if (mkdir(BLKTAP_DEV_DIR, 0755) && errno != EEXIST) {
+        err = -errno;
+        EPRINTF("Failed to create %s directory\n", BLKTAP_DEV_DIR);
+        return err;
+    }
+
+    if (mknod(devname, perm, makedev(major, minor)) == 0) {
+        DPRINTF("Created %s device\n", devname);
+        return 0;
+    }
+
+    err = -errno;
+    EPRINTF("mknod %s failed: %d\n", devname, err);
+
+    /* diagnostics only: the value returned stays the mknod() error */
+    if (lstat(devname, &st)) {
+        DPRINTF("lstat %s failed: %d\n", devname, -errno);
+        if (access(devname, F_OK))
+            DPRINTF("access %s failed: %d\n", devname, -errno);
+        else
+            DPRINTF("access %s succeeded\n", devname);
+    } else
+        DPRINTF("lstat %s: %u:%u\n", devname,
+                (unsigned int)st.st_rdev >> 8,
+                (unsigned int)st.st_rdev & 0xff);
+
+    return err;
+}
diff --git a/tools/blktap2/daemon/tapdisk-dispatch.h b/tools/blktap2/daemon/tapdisk-dispatch.h
new file mode 100644
index 0000000000..bcd1e9dc9e
--- /dev/null
+++ b/tools/blktap2/daemon/tapdisk-dispatch.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_DISPATCH_H_
+#define _TAPDISK_DISPATCH_H_
+
+#include "xs_api.h"
+#include "blktaplib.h"
+#include "tapdisk-message.h"
+
+/*
+ * One control channel between the daemon and a tapdisk process, created
+ * by tapdisk_channel_open() for a xenstore device directory.
+ */
+struct tapdisk_channel {
+ /* channel state; reported numerically when an unexpected message arrives */
+ int state;
+
+ /* descriptors for messages from / to the tapdisk process; may be shared
+  * with other channels (see tapdisk_daemon_find_channel()) */
+ int read_fd;
+ int write_fd;
+ int blktap_fd;
+ /* channels with equal channel_id share one pair of descriptors:
+  * tapdisk_daemon_close_channel() closes them only for the last user */
+ int channel_id;
+
+ char mode;
+ /* 1 when the share_tapdisk_str node exists in xenstore, else 0 */
+ char shared;
+ char open;
+ unsigned int domid;
+ unsigned int busid;
+ unsigned int major;
+ unsigned int minor;
+ unsigned int storage;
+ /* compared between channels to find a tapdisk process to share */
+ unsigned int drivertype;
+ /* matched against the cookie of incoming messages */
+ uint16_t cookie;
+ pid_t tapdisk_pid;
+
+ /*
+ * special accounting needed to handle pause
+ * requests received before tapdisk process is ready
+ */
+ char connected;
+ char pause_needed;
+
+ /* copy of the xenstore path given to tapdisk_channel_open() */
+ char *path;
+ char *frontpath;
+ char *params;
+ char *vdi_path;
+ /* NOTE(review): the *_str members below look like xenstore node paths
+  * derived from 'path' -- confirm in tapdisk-channel.c */
+ char *uuid_str;
+ char *pause_str;
+ char *pause_done_str;
+ char *shutdown_str;
+ char *share_tapdisk_str;
+
+ image_t image;
+
+ /* linkage in the daemon's list of channels */
+ struct list_head list;
+ struct xenbus_watch pause_watch;
+ struct xenbus_watch shutdown_watch;
+
+ struct xs_handle *xsh;
+};
+
+typedef struct tapdisk_channel tapdisk_channel_t;
+
+int strsep_len(const char *str, char c, unsigned int len);
+int make_blktap_device(char *devname, int major, int minor, int perm);
+
+int tapdisk_channel_open(tapdisk_channel_t **,
+ char *node, struct xs_handle *,
+ int blktap_fd, uint16_t cookie);
+void tapdisk_channel_close(tapdisk_channel_t *);
+
+void tapdisk_daemon_find_channel(tapdisk_channel_t *);
+void tapdisk_daemon_close_channel(tapdisk_channel_t *);
+
+int tapdisk_channel_receive_message(tapdisk_channel_t *, tapdisk_message_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/Makefile b/tools/blktap2/drivers/Makefile
new file mode 100644
index 0000000000..90cd6beca9
--- /dev/null
+++ b/tools/blktap2/drivers/Makefile
@@ -0,0 +1,105 @@
+XEN_ROOT=../../../
+BLKTAP_ROOT= ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+LIBVHDDIR = $(BLKTAP_ROOT)/vhd/lib
+
+IBIN = tapdisk tapdisk2 td-util tapdisk-client tapdisk-stream tapdisk-diff
+QCOW_UTIL = img2qcow qcow-create qcow2raw
+LOCK_UTIL = lock-util
+INST_DIR = $(SBINDIR)
+
+CFLAGS += -Werror -g -O0
+CFLAGS += -Wno-unused
+CFLAGS += -fno-strict-aliasing
+CFLAGS += -I../lib -I../../libxc
+CFLAGS += -I../include -I../../include
+CFLAGS += -I $(LIBAIO_DIR)
+CFLAGS += -D_GNU_SOURCE
+CFLAGS += -DUSE_NFS_LOCKS
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS += -fPIC
+endif
+
+LIBS += -lrt -lz
+
+# Select the crypto library: when ./check_gcrypt prints "yes" for $(CC),
+# define USE_GCRYPT and link with libgcrypt; otherwise link with libcrypto
+# and emit a warning at build time.
+ifeq ($(shell . ./check_gcrypt $(CC)),yes)
+CFLAGS += -DUSE_GCRYPT
+CRYPT_LIB += -lgcrypt
+else
+CRYPT_LIB += -lcrypto
+$(warning === libgcrypt not installed: falling back to libcrypto ===)
+endif
+
+LDFLAGS_img := $(CRYPT_LIB) -lpthread -lz
+
+tapdisk tapdisk2 td-util tapdisk-stream tapdisk-diff $(QCOW_UTIL): LIBS += -L$(LIBVHDDIR) -lvhd -luuid
+
+LIBAIO_DIR = $(XEN_ROOT)/tools/libaio/src
+tapdisk tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := $(LIBAIO_DIR)/libaio.a
+tapdisk tapdisk-client tapdisk-stream tapdisk-diff $(QCOW_UTIL): CFLAGS += -I$(LIBAIO_DIR) -I$(XEN_LIBXC)
+
+ifeq ($(VHD_STATIC),y)
+td-util: CFLAGS += -static
+endif
+
+TAP-OBJS-y := scheduler.o
+TAP-OBJS-y += tapdisk-ipc.o
+TAP-OBJS-y += tapdisk-vbd.o
+TAP-OBJS-y += tapdisk-image.o
+TAP-OBJS-y += tapdisk-driver.o
+TAP-OBJS-y += tapdisk-interface.o
+TAP-OBJS-y += tapdisk-server.o
+TAP-OBJS-y += tapdisk-queue.o
+TAP-OBJS-y += tapdisk-filter.o
+TAP-OBJS-y += tapdisk-log.o
+TAP-OBJS-y += tapdisk-utils.o
+TAP-OBJS-y += io-optimize.o
+TAP-OBJS-y += lock.o
+TAP-OBJS-$(CONFIG_Linux) += blk_linux.o
+
+MISC-OBJS-y := atomicio.o
+
+BLK-OBJS-y := block-aio.o
+BLK-OBJS-y += block-ram.o
+BLK-OBJS-y += block-cache.o
+BLK-OBJS-y += block-vhd.o
+BLK-OBJS-y += block-log.o
+BLK-OBJS-y += block-qcow.o
+BLK-OBJS-y += aes.o
+
+all: $(IBIN) lock-util qcow-util
+
+tapdisk: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk.c
+ $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+tapdisk2: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk2.c
+ $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+tapdisk-client: tapdisk-client.o
+ $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(LDFLAGS_img)
+
+tapdisk-stream tapdisk-diff: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+ $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+td-util: td.o tapdisk-utils.o tapdisk-log.o
+ $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(LDFLAGS_img)
+
+lock-util: lock.c
+ $(CC) $(CFLAGS) -DUTIL -o lock-util lock.c $(LIBS)
+
+.PHONY: qcow-util
+qcow-util: img2qcow qcow2raw qcow-create
+
+img2qcow qcow2raw qcow-create: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+ $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+install: all
+ $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+ $(INSTALL_PROG) $(IBIN) $(LOCK_UTIL) $(QCOW_UTIL) $(DESTDIR)$(INST_DIR)
+
+clean:
+ rm -rf *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL) $(QCOW_UTIL)
+
+.PHONY: clean install
diff --git a/tools/blktap2/drivers/aes.c b/tools/blktap2/drivers/aes.c
new file mode 100644
index 0000000000..ea81ae53bb
--- /dev/null
+++ b/tools/blktap2/drivers/aes.c
@@ -0,0 +1,1319 @@
+/**
+ *
+ * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project.
+ */
+/*
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+//#include "vl.h"
+#include <inttypes.h>
+#include <string.h>
+#include "aes.h"
+
+//#define NDEBUG
+#include <assert.h>
+
+typedef uint32_t u32; /* short aliases for the <inttypes.h> fixed-width unsigned types */
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+#define MAXKC (256/32) /* = 8; presumably the largest key length in 32-bit words -- confirm against users */
+#define MAXKB (256/8) /* = 32; presumably the largest key length in bytes -- confirm against users */
+#define MAXNR 14 /* presumably the largest round count (AES-256 uses 14) -- confirm against users */
+
+/* This controls loop-unrolling in this file (the name aes_core.c is inherited from the OpenSSL original) */
+#undef FULL_UNROLL
+# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3])) /* combine pt[0..3] into one u32, pt[0] most significant (big-endian) */
+# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); } /* store st into ct[0..3], most significant byte first; expands to a bare { } block, not an expression; st is evaluated four times */
+
+/*
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+Te4[x] = S [x].[01, 01, 01, 01];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01, 01, 01, 01];
+*/
+
+static const u32 Te0[256] = {
+ 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+ 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+ 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+ 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+ 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+ 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+ 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+ 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+ 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+ 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+ 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+ 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+ 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+ 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+ 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+ 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+ 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+ 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+ 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+ 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+ 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+ 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+ 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+ 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+ 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+ 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+ 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+ 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+ 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+ 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+ 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+ 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+ 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+ 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+ 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+ 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+ 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+ 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+ 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+ 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+ 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+ 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+ 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+ 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+ 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+ 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+ 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+ 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+ 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+ 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+ 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+ 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+ 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+ 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+ 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+ 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+ 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+ 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+ 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+ 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+ 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+ 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+ 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+ 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
+};
+static const u32 Te1[256] = {
+ 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+ 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+ 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+ 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+ 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+ 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+ 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+ 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+ 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+ 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+ 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+ 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+ 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+ 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+ 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+ 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+ 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+ 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+ 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+ 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+ 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+ 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+ 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+ 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+ 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+ 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+ 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+ 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+ 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+ 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+ 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+ 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+ 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+ 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+ 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+ 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+ 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+ 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+ 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+ 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+ 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+ 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+ 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+ 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+ 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+ 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+ 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+ 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+ 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+ 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+ 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+ 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+ 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+ 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+ 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+ 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+ 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+ 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+ 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+ 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+ 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+ 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+ 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+ 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
+};
+static const u32 Te2[256] = {
+ 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+ 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+ 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+ 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+ 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+ 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+ 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+ 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+ 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+ 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+ 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+ 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+ 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+ 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+ 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+ 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+ 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+ 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+ 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+ 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+ 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+ 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+ 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+ 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+ 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+ 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+ 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+ 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+ 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+ 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+ 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+ 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+ 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+ 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+ 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+ 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+ 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+ 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+ 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+ 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+ 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+ 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+ 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+ 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+ 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+ 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+ 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+ 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+ 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+ 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+ 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+ 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+ 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+ 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+ 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+ 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+ 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+ 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+ 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+ 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+ 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+ 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+ 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+ 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
+};
+static const u32 Te3[256] = {
+
+ 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+ 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+ 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+ 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+ 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+ 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+ 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+ 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+ 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+ 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+ 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+ 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+ 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+ 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+ 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+ 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+ 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+ 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+ 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+ 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+ 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+ 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+ 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+ 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+ 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+ 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+ 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+ 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+ 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+ 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+ 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+ 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+ 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+ 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+ 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+ 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+ 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+ 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+ 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+ 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+ 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+ 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+ 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+ 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+ 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+ 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+ 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+ 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+ 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+ 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+ 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+ 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+ 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+ 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+ 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+ 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+ 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+ 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+ 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+ 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+ 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+ 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+ 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+ 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
+};
+static const u32 Te4[256] = {
+ 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
+ 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
+ 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
+ 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
+ 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
+ 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
+ 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
+ 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
+ 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
+ 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
+ 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
+ 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
+ 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
+ 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
+ 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
+ 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
+ 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
+ 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
+ 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
+ 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
+ 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
+ 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
+ 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
+ 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
+ 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
+ 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
+ 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
+ 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
+ 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
+ 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
+ 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
+ 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
+ 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
+ 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
+ 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
+ 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
+ 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
+ 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
+ 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
+ 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
+ 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
+ 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
+ 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
+ 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
+ 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
+ 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
+ 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
+ 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
+ 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
+ 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
+ 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
+ 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
+ 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
+ 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
+ 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
+ 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
+ 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
+ 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
+ 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
+ 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
+ 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
+ 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
+ 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
+ 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
+};
+static const u32 Td0[256] = {
+ 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+ 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+ 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+ 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+ 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+ 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+ 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+ 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+ 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+ 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+ 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+ 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+ 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+ 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+ 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+ 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+ 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+ 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+ 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+ 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+ 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+ 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+ 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+ 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+ 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+ 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+ 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+ 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+ 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+ 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+ 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+ 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+ 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+ 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+ 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+ 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+ 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+ 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+ 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+ 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+ 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+ 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+ 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+ 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+ 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+ 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+ 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+ 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+ 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+ 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+ 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+ 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+ 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+ 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+ 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+ 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+ 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+ 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+ 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+ 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+ 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+ 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+ 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+ 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
+};
+static const u32 Td1[256] = {
+ 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+ 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+ 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+ 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+ 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+ 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+ 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+ 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+ 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+ 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+ 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+ 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+ 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+ 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+ 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+ 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+ 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+ 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+ 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+ 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+ 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+ 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+ 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+ 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+ 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+ 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+ 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+ 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+ 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+ 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+ 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+ 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+ 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+ 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+ 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+ 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+ 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+ 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+ 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+ 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+ 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+ 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+ 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+ 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+ 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+ 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+ 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+ 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+ 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+ 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+ 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+ 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+ 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+ 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+ 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+ 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+ 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+ 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+ 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+ 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+ 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+ 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+ 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+ 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
+};
+static const u32 Td2[256] = {
+ 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+ 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+ 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+ 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+ 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+ 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+ 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+ 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+ 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+ 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+ 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+ 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+ 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+ 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+ 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+ 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+ 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+ 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+ 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+ 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
+
+ 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+ 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+ 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+ 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+ 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+ 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+ 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+ 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+ 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+ 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+ 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+ 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+ 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+ 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+ 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+ 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+ 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+ 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+ 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+ 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+ 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+ 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+ 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+ 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+ 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+ 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+ 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+ 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+ 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+ 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+ 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+ 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+ 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+ 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+ 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+ 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+ 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+ 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+ 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+ 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+ 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+ 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+ 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+ 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
+};
+/*
+ * Td3: 256-entry table of 32-bit words, indexed by one byte.
+ * AES_decrypt indexes it with the low byte of a state word (x & 0xff);
+ * AES_set_decrypt_key indexes it with (Te4[byte] & 0xff).  In both places
+ * the result is XORed with the Td0/Td1/Td2 lookups for the other bytes.
+ */
+static const u32 Td3[256] = {
+ 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+ 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+ 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+ 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+ 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+ 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+ 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+ 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+ 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+ 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+ 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+ 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+ 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+ 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+ 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+ 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+ 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+ 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+ 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+ 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+ 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+ 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+ 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+ 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+ 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+ 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+ 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+ 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+ 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+ 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+ 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+ 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+ 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+ 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+ 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+ 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+ 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+ 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+ 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+ 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+ 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+ 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+ 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+ 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+ 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+ 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+ 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+ 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+ 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+ 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+ 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+ 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+ 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+ 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+ 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+ 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+ 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+ 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+ 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+ 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+ 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+ 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+ 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+ 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
+};
+/*
+ * Td4: 256-entry table indexed by one byte.  Every entry holds the same
+ * byte value in all four byte positions of the word, so the final round of
+ * AES_decrypt can pick the lane it needs with a mask (0xff000000,
+ * 0x00ff0000, 0x0000ff00 or 0x000000ff) instead of shifting the result.
+ */
+static const u32 Td4[256] = {
+ 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
+ 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
+ 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
+ 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
+ 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
+ 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
+ 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
+ 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
+ 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
+ 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
+ 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
+ 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
+ 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
+ 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
+ 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
+ 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
+ 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
+ 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
+ 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
+ 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
+ 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
+ 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
+ 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
+ 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
+ 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
+ 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
+ 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
+ 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
+ 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
+ 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
+ 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
+ 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
+ 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
+ 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
+ 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
+ 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
+ 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
+ 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
+ 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
+ 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
+ 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
+ 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
+ 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
+ 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
+ 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
+ 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
+ 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
+ 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
+ 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
+ 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
+ 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
+ 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
+ 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
+ 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
+ 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
+ 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
+ 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
+ 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
+ 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
+ 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
+ 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
+ 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
+ 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
+ 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,
+};
+/*
+ * Round constants XORed into the first word of each key-expansion step in
+ * AES_set_encrypt_key (indexed by the step counter i).
+ * For 128-bit blocks, Rijndael never uses more than 10 rcon values.
+ */
+static const u32 rcon[] = {
+	0x01000000, 0x02000000,
+	0x04000000, 0x08000000,
+	0x10000000, 0x20000000,
+	0x40000000, 0x80000000,
+	0x1B000000, 0x36000000,
+};
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * userKey: raw key bytes; bits/8 bytes are read from it via GETU32.
+ * bits:    key length, must be 128, 192 or 256.
+ * key:     receives the round keys in key->rd_key and the round count
+ *          (10, 12 or 14) in key->rounds.
+ *
+ * Returns 0 on success, -1 if userKey or key is NULL, -2 if bits is not
+ * one of the supported sizes.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key) {
+
+ u32 *rk;
+ int i = 0;
+ u32 temp;
+
+ if (!userKey || !key)
+ return -1;
+ if (bits != 128 && bits != 192 && bits != 256)
+ return -2;
+
+ rk = key->rd_key;
+
+ if (bits==128)
+ key->rounds = 10;
+ else if (bits==192)
+ key->rounds = 12;
+ else
+ key->rounds = 14;
+
+ /* the first words of the schedule are the user key itself */
+ rk[0] = GETU32(userKey );
+ rk[1] = GETU32(userKey + 4);
+ rk[2] = GETU32(userKey + 8);
+ rk[3] = GETU32(userKey + 12);
+ if (bits == 128) {
+ /* each pass derives 4 new words from the previous 4; 10 passes in total */
+ while (1) {
+ temp = rk[3];
+ /* bytes of temp are substituted through Te4 and rotated by one byte position */
+ rk[4] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[5] = rk[1] ^ rk[4];
+ rk[6] = rk[2] ^ rk[5];
+ rk[7] = rk[3] ^ rk[6];
+ if (++i == 10) {
+ return 0;
+ }
+ rk += 4;
+ }
+ }
+ rk[4] = GETU32(userKey + 16);
+ rk[5] = GETU32(userKey + 20);
+ if (bits == 192) {
+ /* each pass derives up to 6 new words; 8 passes in total */
+ while (1) {
+ temp = rk[ 5];
+ rk[ 6] = rk[ 0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[ 7] = rk[ 1] ^ rk[ 6];
+ rk[ 8] = rk[ 2] ^ rk[ 7];
+ rk[ 9] = rk[ 3] ^ rk[ 8];
+ /* the last pass returns here, before rk[10] and rk[11] are written */
+ if (++i == 8) {
+ return 0;
+ }
+ rk[10] = rk[ 4] ^ rk[ 9];
+ rk[11] = rk[ 5] ^ rk[10];
+ rk += 6;
+ }
+ }
+ rk[6] = GETU32(userKey + 24);
+ rk[7] = GETU32(userKey + 28);
+ /* only 256 can reach this point, so the test below always holds */
+ if (bits == 256) {
+ /* each pass derives up to 8 new words; 7 passes in total */
+ while (1) {
+ temp = rk[ 7];
+ rk[ 8] = rk[ 0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp ) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp >> 24) ] & 0x000000ff) ^
+ rcon[i];
+ rk[ 9] = rk[ 1] ^ rk[ 8];
+ rk[10] = rk[ 2] ^ rk[ 9];
+ rk[11] = rk[ 3] ^ rk[10];
+ /* the last pass returns here, before rk[12]..rk[15] are written */
+ if (++i == 7) {
+ return 0;
+ }
+ temp = rk[11];
+ /* second substitution of the pass: Te4 only, no byte rotation and no rcon */
+ rk[12] = rk[ 4] ^
+ (Te4[(temp >> 24) ] & 0xff000000) ^
+ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(temp ) & 0xff] & 0x000000ff);
+ rk[13] = rk[ 5] ^ rk[12];
+ rk[14] = rk[ 6] ^ rk[13];
+ rk[15] = rk[ 7] ^ rk[14];
+
+ rk += 8;
+ }
+ }
+ return 0;
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * Builds the encryption schedule with AES_set_encrypt_key, then rewrites
+ * key->rd_key in place: the 4-word round keys are put in reverse order and
+ * every round key except the first and the last is transformed.
+ *
+ * Returns 0 on success, or the negative status of AES_set_encrypt_key
+ * (-1 for a NULL pointer, -2 for an unsupported key size).
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key) {
+
+ u32 *rk;
+ int i, j, status;
+ u32 temp;
+
+ /* first, start with an encryption schedule */
+ status = AES_set_encrypt_key(userKey, bits, key);
+ if (status < 0)
+ return status;
+
+ rk = key->rd_key;
+
+ /* invert the order of the round keys: */
+ /* i walks up from the first round key, j walks down from the last; swap 4 words at a time */
+ for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+ }
+ /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+ /* each byte goes through Te4 (low byte kept) and then the matching Td0..Td3 table */
+ for (i = 1; i < (key->rounds); i++) {
+ rk += 4;
+ rk[0] =
+ Td0[Te4[(rk[0] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[0] ) & 0xff] & 0xff];
+ rk[1] =
+ Td0[Te4[(rk[1] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[1] ) & 0xff] & 0xff];
+ rk[2] =
+ Td0[Te4[(rk[2] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[2] ) & 0xff] & 0xff];
+ rk[3] =
+ Td0[Te4[(rk[3] >> 24) ] & 0xff] ^
+ Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[(rk[3] ) & 0xff] & 0xff];
+ }
+ return 0;
+}
+
+#ifndef AES_ASM
+/*
+ * Encrypt a single block
+ * in and out can overlap
+ *
+ * in:  16 input bytes, read as four words with GETU32.
+ * out: 16 output bytes, written as four words with PUTU32.
+ * key: schedule whose rd_key and rounds fields are read; it is expected to
+ *      have been filled in by AES_set_encrypt_key.
+ *
+ * The whole input is loaded into s0..s3 before anything is stored to out,
+ * which is why in and out may overlap.  NULL arguments are caught only by
+ * assert().
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key) {
+
+ const u32 *rk;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ assert(in && out && key);
+ rk = key->rd_key;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(in ) ^ rk[0];
+ s1 = GETU32(in + 4) ^ rk[1];
+ s2 = GETU32(in + 8) ^ rk[2];
+ s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* rounds alternate between the s and t register sets; odd rounds leave their result in t0..t3 */
+ /* round 1: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+ if (key->rounds > 10) {
+ /* round 10: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+ if (key->rounds > 12) {
+ /* round 12: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+ }
+ }
+ /* advance rk to the last round key so the final round below can use rk[0..3] */
+ rk += key->rounds << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ /* each loop pass runs two rounds (s -> t, then t -> s) and consumes 8 round-key words */
+ r = key->rounds >> 1;
+ for (;;) {
+ t0 =
+ Te0[(s0 >> 24) ] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[(s3 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Te0[(s1 >> 24) ] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[(s0 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Te0[(s2 >> 24) ] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[(s1 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Te0[(s3 >> 24) ] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[(s2 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ /* leave after the t-producing half: the final round below reads t0..t3 and rk[0..3] */
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Te0[(t0 >> 24) ] ^
+ Te1[(t1 >> 16) & 0xff] ^
+ Te2[(t2 >> 8) & 0xff] ^
+ Te3[(t3 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Te0[(t1 >> 24) ] ^
+ Te1[(t2 >> 16) & 0xff] ^
+ Te2[(t3 >> 8) & 0xff] ^
+ Te3[(t0 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Te0[(t2 >> 24) ] ^
+ Te1[(t3 >> 16) & 0xff] ^
+ Te2[(t0 >> 8) & 0xff] ^
+ Te3[(t1 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Te0[(t3 >> 24) ] ^
+ Te1[(t0 >> 16) & 0xff] ^
+ Te2[(t1 >> 8) & 0xff] ^
+ Te3[(t2 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ /* the last round uses only Te4, masking one byte lane out of each lookup */
+ s0 =
+ (Te4[(t0 >> 24) ] & 0xff000000) ^
+ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(out , s0);
+ s1 =
+ (Te4[(t1 >> 24) ] & 0xff000000) ^
+ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(out + 4, s1);
+ s2 =
+ (Te4[(t2 >> 24) ] & 0xff000000) ^
+ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(out + 8, s2);
+ s3 =
+ (Te4[(t3 >> 24) ] & 0xff000000) ^
+ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(out + 12, s3);
+}
+
+/*
+ * Decrypt a single block
+ * in and out can overlap
+ *
+ * in:  16 input bytes, read as four words with GETU32.
+ * out: 16 output bytes, written as four words with PUTU32.
+ * key: schedule whose rd_key and rounds fields are read; it is expected to
+ *      have been filled in by AES_set_decrypt_key.
+ *
+ * Same structure as AES_encrypt, but using the Td tables and taking the
+ * bytes of each column from the state words in the opposite rotation
+ * (s0,s3,s2,s1 instead of s0,s1,s2,s3).  NULL arguments are caught only by
+ * assert().
+ */
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key) {
+
+ const u32 *rk;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ assert(in && out && key);
+ rk = key->rd_key;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(in ) ^ rk[0];
+ s1 = GETU32(in + 4) ^ rk[1];
+ s2 = GETU32(in + 8) ^ rk[2];
+ s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* rounds alternate between the s and t register sets; odd rounds leave their result in t0..t3 */
+ /* round 1: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+ if (key->rounds > 10) {
+ /* round 10: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+ if (key->rounds > 12) {
+ /* round 12: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+ }
+ }
+ /* advance rk to the last round key so the final round below can use rk[0..3] */
+ rk += key->rounds << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ /* each loop pass runs two rounds (s -> t, then t -> s) and consumes 8 round-key words */
+ r = key->rounds >> 1;
+ for (;;) {
+ t0 =
+ Td0[(s0 >> 24) ] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[(s1 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Td0[(s1 >> 24) ] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[(s2 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Td0[(s2 >> 24) ] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[(s3 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Td0[(s3 >> 24) ] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[(s0 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ /* leave after the t-producing half: the final round below reads t0..t3 and rk[0..3] */
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Td0[(t0 >> 24) ] ^
+ Td1[(t3 >> 16) & 0xff] ^
+ Td2[(t2 >> 8) & 0xff] ^
+ Td3[(t1 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Td0[(t1 >> 24) ] ^
+ Td1[(t0 >> 16) & 0xff] ^
+ Td2[(t3 >> 8) & 0xff] ^
+ Td3[(t2 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Td0[(t2 >> 24) ] ^
+ Td1[(t1 >> 16) & 0xff] ^
+ Td2[(t0 >> 8) & 0xff] ^
+ Td3[(t3 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Td0[(t3 >> 24) ] ^
+ Td1[(t2 >> 16) & 0xff] ^
+ Td2[(t1 >> 8) & 0xff] ^
+ Td3[(t0 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ /* the last round uses only Td4, masking one byte lane out of each lookup */
+ s0 =
+ (Td4[(t0 >> 24) ] & 0xff000000) ^
+ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(out , s0);
+ s1 =
+ (Td4[(t1 >> 24) ] & 0xff000000) ^
+ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(out + 4, s1);
+ s2 =
+ (Td4[(t2 >> 24) ] & 0xff000000) ^
+ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(out + 8, s2);
+ s3 =
+ (Td4[(t3 >> 24) ] & 0xff000000) ^
+ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(out + 12, s3);
+}
+
+#endif /* AES_ASM */
+
+/*
+ * Encrypt or decrypt a buffer in CBC mode.
+ *
+ * in, out: source and destination buffers.
+ * length:  number of bytes to process.
+ * key:     schedule from AES_set_encrypt_key (enc != 0) or
+ *          AES_set_decrypt_key (enc == 0).
+ * ivec:    AES_BLOCK_SIZE-byte chaining value; read on entry and replaced
+ *          with the last ciphertext block processed, so that a following
+ *          call continues the chain.
+ * enc:     non-zero to encrypt, zero to decrypt.
+ *
+ * A trailing partial block (length not a multiple of AES_BLOCK_SIZE) is
+ * zero-padded on encryption and a full AES_BLOCK_SIZE bytes are written to
+ * out.  On decryption a full block is read from in for the tail, but only
+ * the remaining `len' bytes are written to out.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+		     const unsigned long length, const AES_KEY *key,
+		     unsigned char *ivec, const int enc)
+{
+
+	unsigned long n;
+	unsigned long len = length;
+	unsigned char tmp[AES_BLOCK_SIZE];
+
+	assert(in && out && key && ivec);
+
+	if (enc) {
+		while (len >= AES_BLOCK_SIZE) {
+			for(n=0; n < AES_BLOCK_SIZE; ++n)
+				tmp[n] = in[n] ^ ivec[n];
+			AES_encrypt(tmp, out, key);
+			memcpy(ivec, out, AES_BLOCK_SIZE);
+			len -= AES_BLOCK_SIZE;
+			in += AES_BLOCK_SIZE;
+			out += AES_BLOCK_SIZE;
+		}
+		if (len) {
+			for(n=0; n < len; ++n)
+				tmp[n] = in[n] ^ ivec[n];
+			/* zero padding: 0 ^ ivec[n] == ivec[n] */
+			for(n=len; n < AES_BLOCK_SIZE; ++n)
+				tmp[n] = ivec[n];
+			AES_encrypt(tmp, tmp, key);
+			memcpy(out, tmp, AES_BLOCK_SIZE);
+			memcpy(ivec, tmp, AES_BLOCK_SIZE);
+		}
+	} else {
+		while (len >= AES_BLOCK_SIZE) {
+			/* keep the ciphertext: in and out may be the same buffer */
+			memcpy(tmp, in, AES_BLOCK_SIZE);
+			AES_decrypt(in, out, key);
+			for(n=0; n < AES_BLOCK_SIZE; ++n)
+				out[n] ^= ivec[n];
+			memcpy(ivec, tmp, AES_BLOCK_SIZE);
+			len -= AES_BLOCK_SIZE;
+			in += AES_BLOCK_SIZE;
+			out += AES_BLOCK_SIZE;
+		}
+		if (len) {
+			unsigned char plain[AES_BLOCK_SIZE];
+
+			/*
+			 * Decrypt into a separate buffer so that tmp still
+			 * holds the ciphertext block: the next chaining value
+			 * must be the ciphertext, not the decrypted data.
+			 */
+			memcpy(tmp, in, AES_BLOCK_SIZE);
+			AES_decrypt(tmp, plain, key);
+			for(n=0; n < len; ++n)
+				out[n] = plain[n] ^ ivec[n];
+			memcpy(ivec, tmp, AES_BLOCK_SIZE);
+		}
+	}
+}
diff --git a/tools/blktap2/drivers/aes.h b/tools/blktap2/drivers/aes.h
new file mode 100644
index 0000000000..9fb54a900d
--- /dev/null
+++ b/tools/blktap2/drivers/aes.h
@@ -0,0 +1,28 @@
+#ifndef QEMU_AES_H
+#define QEMU_AES_H
+
+#include <stdint.h>
+
+#define AES_MAXNR 14
+#define AES_BLOCK_SIZE 16
+
+/*
+ * Expanded AES key.
+ * rd_key holds the round keys as 32-bit words, four words per round key,
+ * with room for AES_MAXNR + 1 round keys.  rounds is the number of rounds
+ * for the key size in use; AES_set_encrypt_key sets it to 10, 12 or 14.
+ */
+struct aes_key_st {
+ uint32_t rd_key[4 *(AES_MAXNR + 1)];
+ int rounds;
+};
+typedef struct aes_key_st AES_KEY;
+
+/*
+ * Key setup.  bits must be 128, 192 or 256.  Both functions return 0 on
+ * success, -1 if userKey or key is NULL and -2 for any other key size.
+ * AES_set_decrypt_key derives its schedule from the encryption schedule.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+
+/*
+ * Single-block primitives: process exactly AES_BLOCK_SIZE bytes from in to
+ * out; in and out can overlap.
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+/*
+ * CBC mode over length bytes.  enc != 0 encrypts, enc == 0 decrypts.
+ * ivec (AES_BLOCK_SIZE bytes) is the chaining value and is updated in place.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ const unsigned long length, const AES_KEY *key,
+ unsigned char *ivec, const int enc);
+
+#endif
diff --git a/tools/blktap2/drivers/atomicio.c b/tools/blktap2/drivers/atomicio.c
new file mode 100644
index 0000000000..ae0e24b00a
--- /dev/null
+++ b/tools/blktap2/drivers/atomicio.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * ensure all of data on socket comes through. f==read || f==vwrite
+ *
+ * Calls f(fd, buf, remaining) repeatedly until n bytes have been
+ * transferred, retrying when f fails with EINTR or EAGAIN.
+ *
+ * Returns n on success.  Returns 0 if f fails with any other error (errno
+ * is left as set by f).  If f returns 0 (end of file), sets errno to EPIPE
+ * and returns the number of bytes transferred so far.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+	char *s = _s;
+	size_t pos = 0;
+	ssize_t res;
+
+	while (n > pos) {
+		res = (f) (fd, s + pos, n - pos);
+		switch (res) {
+		case -1:
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			return 0;
+		case 0:
+			errno = EPIPE;
+			return pos;
+		default:
+			pos += (size_t)res;
+		}
+	}
+	return (pos);
+}
+
diff --git a/tools/blktap2/drivers/blk.h b/tools/blktap2/drivers/blk.h
new file mode 100644
index 0000000000..73ca40c629
--- /dev/null
+++ b/tools/blktap2/drivers/blk.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef TAPDISK_BLK_H
+#define TAPDISK_BLK_H
+
+#include <stdint.h>
+
+/*
+ * Query the size of the block device open on fd and store it in *size.
+ * Returns 0 on success or a negative errno value on failure.
+ */
+int blk_getimagesize(int fd, uint64_t *size);
+
+/*
+ * Query the sector size of the block device open on fd and store it in
+ * *sector_size.  Returns 0.
+ */
+int blk_getsectorsize(int fd, uint64_t *sector_size);
+
+#endif /* TAPDISK_BLK_H */
diff --git a/tools/blktap2/drivers/blk_linux.c b/tools/blktap2/drivers/blk_linux.c
new file mode 100644
index 0000000000..75ddcc389f
--- /dev/null
+++ b/tools/blktap2/drivers/blk_linux.c
@@ -0,0 +1,43 @@
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include "tapdisk.h"
+#include "blk.h"
+
+/*
+ * Store the BLKGETSIZE result for the block device open on fd in *size.
+ *
+ * BLKGETSIZE writes an unsigned long, so the ioctl is given a local of
+ * that type and the value is widened afterwards; handing it the uint64_t
+ * directly yields a wrong value wherever unsigned long is narrower than
+ * 64 bits and the machine is big-endian.
+ *
+ * Returns 0 on success, or -EINVAL (with *size set to 0) if the ioctl
+ * fails.
+ */
+int blk_getimagesize(int fd, uint64_t *size)
+{
+	unsigned long sectors = 0;
+	int rc;
+
+	*size = 0;
+	rc = ioctl(fd, BLKGETSIZE, &sectors);
+	if (rc) {
+		DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+		return -EINVAL;
+	}
+
+	*size = sectors;
+	return 0;
+}
+
+/*
+ * Store the sector size of the block device open on fd in *sector_size.
+ *
+ * When BLKSSZGET is available it is queried; on failure, or when the ioctl
+ * is not defined at build time, DEFAULT_SECTOR_SIZE is used instead.
+ * BLKSSZGET writes an int, so the ioctl is given a local of that type and
+ * the value is widened afterwards rather than writing through the
+ * uint64_t pointer.
+ *
+ * Always returns 0.
+ */
+int blk_getsectorsize(int fd, uint64_t *sector_size)
+{
+#if defined(BLKSSZGET)
+	int rc;
+	int ssz = DEFAULT_SECTOR_SIZE;
+
+	*sector_size = DEFAULT_SECTOR_SIZE;
+	rc = ioctl(fd, BLKSSZGET, &ssz);
+	if (rc) {
+		DPRINTF("ERR: BLKSSZGET failed. Falling back to use default sector size");
+		ssz = DEFAULT_SECTOR_SIZE;
+	}
+	*sector_size = (uint64_t)ssz;
+
+	if (*sector_size != DEFAULT_SECTOR_SIZE)
+		DPRINTF("Note: sector size is %"PRIu64" (not %u)\n",
+			*sector_size, DEFAULT_SECTOR_SIZE);
+#else
+	*sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+	return 0;
+}
+
diff --git a/tools/blktap2/drivers/blktap2.h b/tools/blktap2/drivers/blktap2.h
new file mode 100644
index 0000000000..38350d2fad
--- /dev/null
+++ b/tools/blktap2/drivers/blktap2.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _BLKTAP_2_H_
+#define _BLKTAP_2_H_
+
+#define MISC_MAJOR_NUMBER 10
+
+#define BLKTAP2_MAX_MESSAGE_LEN 256
+
+#define BLKTAP2_RING_MESSAGE_PAUSE 1
+#define BLKTAP2_RING_MESSAGE_RESUME 2
+#define BLKTAP2_RING_MESSAGE_CLOSE 3
+
+#define BLKTAP2_IOCTL_KICK_FE 1
+#define BLKTAP2_IOCTL_ALLOC_TAP 200
+#define BLKTAP2_IOCTL_FREE_TAP 201
+#define BLKTAP2_IOCTL_CREATE_DEVICE 202
+#define BLKTAP2_IOCTL_SET_PARAMS 203
+#define BLKTAP2_IOCTL_PAUSE 204
+#define BLKTAP2_IOCTL_REOPEN 205
+#define BLKTAP2_IOCTL_RESUME 206
+
+#define BLKTAP2_CONTROL_NAME "blktap-control"
+#define BLKTAP2_DIRECTORY "/dev/xen/blktap-2"
+#define BLKTAP2_CONTROL_DEVICE BLKTAP2_DIRECTORY"/control"
+#define BLKTAP2_RING_DEVICE BLKTAP2_DIRECTORY"/blktap"
+#define BLKTAP2_IO_DEVICE BLKTAP2_DIRECTORY"/tapdev"
+
+struct blktap2_handle { /* NOTE(review): presumably filled by BLKTAP2_IOCTL_ALLOC_TAP - confirm */
+ unsigned int ring; /* presumably identifies the ring device - confirm */
+ unsigned int device; /* presumably identifies the io device - confirm */
+ unsigned int minor;
+};
+
+struct blktap2_params { /* NOTE(review): presumably the BLKTAP2_IOCTL_SET_PARAMS payload - confirm */
+ char name[BLKTAP2_MAX_MESSAGE_LEN]; /* fixed-size buffer of BLKTAP2_MAX_MESSAGE_LEN bytes */
+ unsigned long long capacity; /* units not visible here - confirm (sectors?) */
+ unsigned long sector_size;
+};
+
+#endif
diff --git a/tools/blktap2/drivers/block-aio.c b/tools/blktap2/drivers/block-aio.c
new file mode 100644
index 0000000000..2c5af1483c
--- /dev/null
+++ b/tools/blktap2/drivers/block-aio.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <errno.h>
+#include <libaio.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+#define MAX_AIO_REQS TAPDISK_DATA_REQUESTS
+
+struct tdaio_state;
+
+struct aio_request { /* one in-flight read or write */
+ td_request_t treq; /* copy of the request being serviced */
+ struct tiocb tiocb; /* control block filled by td_prep_read/td_prep_write */
+ struct tdaio_state *state; /* driver state whose free list this slot returns to */
+};
+
+struct tdaio_state {
+ int fd; /* descriptor of the opened image */
+ td_driver_t *driver;
+ /* free-slot stack: aio_free_list[0 .. aio_free_count-1] are available */
+ int aio_free_count;
+ struct aio_request aio_requests[MAX_AIO_REQS]; /* backing storage for all slots */
+ struct aio_request *aio_free_list[MAX_AIO_REQS];
+};
+
+/* Fill @info with the image's size in sectors and its sector size, using
+ * ioctls for block devices and fstat() otherwise.  Returns 0, or -EINVAL
+ * when the image cannot be examined. */
+static int tdaio_get_image_info(int fd, td_disk_info_t *info)
+{
+	struct stat stat;
+
+	if (fstat(fd, &stat) != 0) {
+		DPRINTF("ERROR: fstat failed, Couldn't stat image");
+		return -EINVAL;
+	}
+
+	if (S_ISBLK(stat.st_mode)) {
+		/* Accessing block device directly.  BLKGETSIZE stores an
+		 * unsigned long, so read into one rather than into the field. */
+		unsigned long sectors = 0;
+
+		info->size = 0;
+		if (ioctl(fd, BLKGETSIZE, &sectors) != 0) {
+			DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+			return -EINVAL;
+		}
+		info->size = sectors;
+
+		DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
+			"sector_shift [%llu]\n",
+			(long long unsigned)(info->size << SECTOR_SHIFT),
+			(long long unsigned)info->size);
+
+		/* Get the sector size; the default survives a failed ioctl */
+		info->sector_size = DEFAULT_SECTOR_SIZE;
+#if defined(BLKSSZGET)
+		{
+			int ssz = 0;	/* BLKSSZGET stores an int */
+
+			if (ioctl(fd, BLKSSZGET, &ssz) == 0 && ssz > 0)
+				info->sector_size = ssz;
+
+			if (info->sector_size != DEFAULT_SECTOR_SIZE)
+				DPRINTF("Note: sector size is %ld (not %d)\n",
+					info->sector_size, DEFAULT_SECTOR_SIZE);
+		}
+#endif
+	} else {
+		/* Local file? try fstat instead */
+		info->size = (stat.st_size >> SECTOR_SHIFT);
+		info->sector_size = DEFAULT_SECTOR_SIZE;
+		DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost "
+			"sector_shift [%llu]\n",
+			(long long unsigned)(info->size << SECTOR_SHIFT),
+			(long long unsigned)info->size);
+	}
+
+	if (info->size == 0) {
+		info->size =((uint64_t) 16836057);
+		info->sector_size = DEFAULT_SECTOR_SIZE;
+	}
+	info->info = 0;
+
+	return 0;
+}
+
+/* Open the disk file and initialize aio state.
+ * Returns 0 on success or a negative errno value on failure. */
+int tdaio_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	int i, fd, ret, o_flags;
+	struct tdaio_state *prv;
+
+	ret = 0;
+	prv = (struct tdaio_state *)driver->data;
+
+	DPRINTF("block-aio open('%s')", name);
+	memset(prv, 0, sizeof(struct tdaio_state));
+
+	/* every request slot starts out on the free list */
+	prv->aio_free_count = MAX_AIO_REQS;
+	for (i = 0; i < MAX_AIO_REQS; i++)
+		prv->aio_free_list[i] = &prv->aio_requests[i];
+
+	/* Open the file */
+	o_flags = O_DIRECT | O_LARGEFILE |
+		((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+	fd = open(name, o_flags);
+
+	if ( (fd == -1) && (errno == EINVAL) ) {
+		/* Maybe O_DIRECT isn't supported. */
+		o_flags &= ~O_DIRECT;
+		fd = open(name, o_flags);
+		if (fd != -1) DPRINTF("WARNING: Accessing image without "
+				      "O_DIRECT! (%s)\n", name);
+	} else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
+
+	if (fd == -1) {
+		/* capture errno before logging has a chance to overwrite it */
+		ret = 0 - errno;
+		DPRINTF("Unable to open [%s] (%d)!\n", name, ret);
+		goto done;
+	}
+
+	ret = tdaio_get_image_info(fd, &driver->info);
+	if (ret) {
+		close(fd);
+		goto done;
+	}
+
+	prv->fd = fd;
+
+done:
+	return ret;
+}
+
+void tdaio_complete(void *arg, struct tiocb *tiocb, int err)
+{
+	struct aio_request *req = arg;	/* slot handed to td_prep_read/write */
+	struct tdaio_state *owner = req->state;
+
+	td_complete_request(req->treq, err);	/* report @err to the requester */
+	owner->aio_free_list[owner->aio_free_count++] = req; /* recycle slot */
+}
+
+void tdaio_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	/* Queue an asynchronous read of treq.secs sectors starting at treq.sec
+	 * into treq.buf; completes with -EBUSY when no request slot is free. */
+	struct tdaio_state *prv = (struct tdaio_state *)driver->data;
+	struct aio_request *aio;
+	uint64_t offset;
+	int size;
+
+	if (prv->aio_free_count == 0) {
+		td_complete_request(treq, -EBUSY);
+		return;
+	}
+
+	/* sector units -> byte units */
+	size = treq.secs * driver->info.sector_size;
+	offset = treq.sec * (uint64_t)driver->info.sector_size;
+
+	/* pop a slot off the free stack and bind it to this request */
+	aio = prv->aio_free_list[--prv->aio_free_count];
+	aio->treq = treq;
+	aio->state = prv;
+
+	td_prep_read(&aio->tiocb, prv->fd, treq.buf,
+		     size, offset, tdaio_complete, aio);
+	td_queue_tiocb(driver, &aio->tiocb);
+}
+
+void tdaio_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	/* Queue an asynchronous write of treq.secs sectors from treq.buf to
+	 * sector treq.sec; completes with -EBUSY when no request slot is free. */
+	struct tdaio_state *prv = (struct tdaio_state *)driver->data;
+	struct aio_request *aio;
+	uint64_t offset;
+	int size;
+
+	if (prv->aio_free_count == 0) {
+		td_complete_request(treq, -EBUSY);
+		return;
+	}
+
+	/* sector units -> byte units */
+	size = treq.secs * driver->info.sector_size;
+	offset = treq.sec * (uint64_t)driver->info.sector_size;
+
+	/* pop a slot off the free stack and bind it to this request */
+	aio = prv->aio_free_list[--prv->aio_free_count];
+	aio->treq = treq;
+	aio->state = prv;
+
+	td_prep_write(&aio->tiocb, prv->fd, treq.buf,
+		      size, offset, tdaio_complete, aio);
+	td_queue_tiocb(driver, &aio->tiocb);
+}
+
+int tdaio_close(td_driver_t *driver)
+{
+	struct tdaio_state *prv;
+
+	prv = (struct tdaio_state *)driver->data;
+	close(prv->fd);	/* result is ignored; success is always reported */
+	return 0;
+}
+
+int tdaio_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{ /* @id is never written */
+ return TD_NO_PARENT; /* this driver never reports a parent image */
+}
+
+int tdaio_validate_parent(td_driver_t *driver,
+ td_driver_t *pdriver, td_flag_t flags)
+{ /* all arguments are ignored */
+ return -EINVAL; /* unconditionally rejects any proposed parent */
+}
+
+struct tap_disk tapdisk_aio = { /* entry points of the aio driver defined in this file */
+ .disk_type = "tapdisk_aio",
+ .flags = 0,
+ .private_data_size = sizeof(struct tdaio_state), /* size of driver->data used by the tdaio_* functions */
+ .td_open = tdaio_open,
+ .td_close = tdaio_close,
+ .td_queue_read = tdaio_queue_read,
+ .td_queue_write = tdaio_queue_write,
+ .td_get_parent_id = tdaio_get_parent_id,
+ .td_validate_parent = tdaio_validate_parent,
+ .td_debug = NULL, /* no debug hook */
+};
diff --git a/tools/blktap2/drivers/block-cache.c b/tools/blktap2/drivers/block-cache.c
new file mode 100644
index 0000000000..1d2f4eb879
--- /dev/null
+++ b/tools/blktap2/drivers/block-cache.c
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+#ifdef DEBUG
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#else
+#define DBG(_f, _a...) ((void)0)
+#endif
+
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+
+#define RADIX_TREE_PAGE_SHIFT 12 /* 4K pages */
+#define RADIX_TREE_PAGE_SIZE (1 << RADIX_TREE_PAGE_SHIFT)
+
+#define RADIX_TREE_NODE_SHIFT 9 /* 512B nodes */
+#define RADIX_TREE_NODE_SIZE (1 << RADIX_TREE_NODE_SHIFT)
+#define RADIX_TREE_NODE_MASK (RADIX_TREE_NODE_SIZE - 1)
+
+#define BLOCK_CACHE_NODES_PER_PAGE (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT))
+
+#define BLOCK_CACHE_MAX_SIZE (10 << 20) /* 10MB cache */
+#define BLOCK_CACHE_REQUESTS (TAPDISK_DATA_REQUESTS << 3)
+#define BLOCK_CACHE_PAGE_IDLETIME 60 /* compared against tv_sec deltas, i.e. seconds */
+
+typedef struct radix_tree radix_tree_t;
+typedef struct radix_tree_node radix_tree_node_t;
+typedef struct radix_tree_link radix_tree_link_t;
+typedef struct radix_tree_leaf radix_tree_leaf_t;
+typedef struct radix_tree_page radix_tree_page_t;
+
+typedef struct block_cache block_cache_t;
+typedef struct block_cache_request block_cache_request_t;
+typedef struct block_cache_stats block_cache_stats_t;
+
+struct radix_tree_page { /* one cached buffer shared by several leaves */
+ char *buf; /* data buffer; released with free() when the page is freed */
+ size_t size; /* length of buf in bytes */
+ uint64_t sec; /* first sector held in buf */
+ radix_tree_link_t *owners[BLOCK_CACHE_NODES_PER_PAGE]; /* leaf links pointing into buf */
+};
+
+struct radix_tree_leaf {
+ radix_tree_page_t *page; /* shared page that owns the memory */
+ char *buf; /* this sector's data: page->buf plus a byte offset */
+};
+
+struct radix_tree_link {
+ uint32_t time; /* tv_sec of the last lookup or insert through this link */
+ union {
+ radix_tree_node_t *next; /* child node, used in nodes with height > 0 */
+ radix_tree_leaf_t leaf; /* cached sector, used in nodes with height 0 */
+ } u;
+};
+
+struct radix_tree_node {
+ int height; /* 0 for nodes whose links are leaves; children are one lower */
+ radix_tree_link_t links[RADIX_TREE_NODE_SIZE];
+};
+
+struct radix_tree {
+ int height; /* height assigned to the root node */
+ uint64_t size; /* total bytes of cached page data */
+ uint32_t nodes; /* number of currently allocated nodes */
+ radix_tree_node_t *root;
+
+ block_cache_t *cache; /* owning cache; supplies name and stats */
+};
+
+struct block_cache_request { /* bookkeeping for one forwarded cache-miss read */
+ int err; /* first non-zero completion status, else 0 */
+ char *buf; /* aligned buffer the forwarded read fills */
+ uint64_t secs; /* sectors still outstanding */
+ td_request_t treq; /* the original request to complete */
+ block_cache_t *cache;
+};
+
+struct block_cache_stats { /* every counter is incremented in units of sectors */
+ uint64_t reads; /* sectors requested via block_cache_queue_read */
+ uint64_t hits; /* sectors served from the cache */
+ uint64_t misses; /* sectors that took the miss path */
+ uint64_t prunes; /* sectors dropped when pages were freed */
+};
+
+struct block_cache {
+ int ptype;
+ char *name; /* duplicated image name; freed on close */
+
+ uint64_t sectors; /* copied from driver->info.size at open */
+
+ block_cache_request_t requests[BLOCK_CACHE_REQUESTS];
+ block_cache_request_t *request_free_list[BLOCK_CACHE_REQUESTS];
+ int requests_free; /* number of valid entries in request_free_list */
+
+ event_id_t timeout_id; /* id of the registered prune event */
+
+ radix_tree_t tree; /* sector -> cached buffer index */
+
+ block_cache_stats_t stats;
+};
+
+static inline uint64_t
+radix_tree_calculate_size(int height)
+{
+	int shift = height * RADIX_TREE_NODE_SHIFT;	/* index bits below the root */
+	return (uint64_t)RADIX_TREE_NODE_SIZE << shift;	/* sectors addressable */
+}
+
+static inline int
+radix_tree_calculate_height(uint64_t sectors)
+{
+	/* Smallest height whose addressable range covers @sectors.
+	 * Never less than 1, matching the original "always allocate
+	 * root node" starting point. */
+	int h;
+
+	for (h = 1; radix_tree_calculate_size(h) < sectors; h++)
+		;	/* grow until the tree is large enough */
+
+	return h;
+}
+
+static inline int
+radix_tree_index(radix_tree_node_t *node, uint64_t sector)
+{ /* slot of @sector within @node: the NODE_SHIFT-bit digit selected by node->height */
+ return ((sector >> (node->height * RADIX_TREE_NODE_SHIFT)) &
+ RADIX_TREE_NODE_MASK);
+}
+
+static inline int
+radix_tree_node_contains_leaves(radix_tree_t *tree, radix_tree_node_t *node)
+{ /* non-zero when @node is at the bottom level; @tree is unused */
+ return (node->height == 0);
+}
+
+static inline int
+radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node)
+{ /* root test is by height, not by pointer identity */
+ return (node->height == tree->height);
+}
+
+static inline uint64_t
+radix_tree_size(radix_tree_t *tree)
+{ /* bytes in use: cached page data plus the node structures themselves */
+ return tree->size + tree->nodes * sizeof(radix_tree_node_t);
+}
+
+static inline void
+radix_tree_clear_link(radix_tree_link_t *link)
+{ /* zero the link's timestamp and pointers; a NULL @link is ignored */
+ if (link)
+ memset(link, 0, sizeof(radix_tree_link_t));
+}
+
+static inline radix_tree_node_t *
+radix_tree_allocate_node(radix_tree_t *tree, int height)
+{
+	/* Returns a zeroed node at @height, or NULL if allocation fails;
+	 * the tree's node count is bumped only on success. */
+	radix_tree_node_t *fresh = calloc(1, sizeof(*fresh));
+
+	if (fresh) {
+		fresh->height = height;
+		tree->nodes++;
+	}
+
+	return fresh;
+}
+
+static inline radix_tree_node_t *
+radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent)
+{ /* new node one level below @parent; NULL if allocation fails */
+ return radix_tree_allocate_node(tree, parent->height - 1);
+}
+
+void
+radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node)
+{
+	/* release @node and uncount it; a NULL @node is a no-op */
+	if (node) {
+		free(node);
+		tree->nodes--;
+	}
+}
+
+static inline radix_tree_page_t *
+radix_tree_allocate_page(radix_tree_t *tree,
+			 char *buf, uint64_t sec, size_t size)
+{
+	/* Wrap @buf (@size bytes, starting at sector @sec) in a new page
+	 * and account its bytes to the tree.  Returns NULL on allocation
+	 * failure, leaving the tree untouched. */
+	radix_tree_page_t *pg = calloc(1, sizeof(*pg));
+
+	if (pg == NULL)
+		return NULL;
+	tree->size += size;
+	pg->size = size;
+	pg->sec = sec;
+	pg->buf = buf;
+	return pg;
+}
+
+static inline void
+radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page)
+{ /* frees @page and its buffer, updating the tree's byte count and the prune counter */
+ int i;
+ /* this loop only logs; DBG() expands to nothing unless DEBUG is defined */
+ for (i = 0; i < page->size >> RADIX_TREE_NODE_SHIFT; i++)
+ DBG("%s: ejecting sector 0x%llx\n",
+ tree->cache->name, page->sec + i);
+ /* page->size >> RADIX_TREE_NODE_SHIFT converts bytes to sectors */
+ tree->cache->stats.prunes += (page->size >> RADIX_TREE_NODE_SHIFT);
+ tree->size -= page->size;
+ free(page->buf); /* callers set buf to NULL first to keep ownership of it */
+ free(page);
+}
+
+/*
+ * drop @page from the cache: every link recorded as one of its owners is
+ * zeroed, then the page and its buffer are freed.  interior nodes are left
+ * in place for the pruning pass to reap.
+ */
+static void
+radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+	int slot;
+
+	if (page) {
+		for (slot = 0; slot < BLOCK_CACHE_NODES_PER_PAGE; slot++)
+			radix_tree_clear_link(page->owners[slot]);
+
+		radix_tree_free_page(tree, page);
+	}
+}
+
+static void
+radix_tree_insert_leaf(radix_tree_t *tree, radix_tree_link_t *link,
+		       radix_tree_page_t *page, off_t off)
+{
+	/* Point @link at the sector stored @off bytes into @page and record
+	 * the link in the page's first free owner slot.  Nothing happens if
+	 * the sector would overrun the page or every slot is taken. */
+	int slot = 0;
+
+	if (off + RADIX_TREE_NODE_SIZE > page->size)
+		return;
+
+	while (slot < BLOCK_CACHE_NODES_PER_PAGE && page->owners[slot])
+		slot++;
+	if (slot == BLOCK_CACHE_NODES_PER_PAGE)
+		return;
+	page->owners[slot] = link;
+	link->u.leaf.page = page;
+	link->u.leaf.buf = page->buf + off;
+}
+
+static char *
+radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector)
+{
+	/* Walk from the root towards @sector, stamping each link on the
+	 * way with the current time.  Returns the cached buffer, or NULL
+	 * when the sector is not cached. */
+	struct timeval tv;
+	radix_tree_node_t *cur;
+	radix_tree_link_t *slot;
+
+	gettimeofday(&tv, NULL);
+
+	for (cur = tree->root; ; cur = slot->u.next) {
+		slot = &cur->links[radix_tree_index(cur, sector)];
+		slot->time = tv.tv_sec;
+
+		/* bottom level: the link holds the leaf itself */
+		if (radix_tree_node_contains_leaves(tree, cur))
+			return slot->u.leaf.buf;
+
+		/* nothing has been cached below this link */
+		if (slot->u.next == NULL)
+			return NULL;
+	}
+}
+
+static char *
+radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector,
+ radix_tree_page_t *page, off_t off)
+{ /* cache @sector as the data @off bytes into @page, creating nodes as needed */
+ int idx;
+ struct timeval now;
+ radix_tree_link_t *link;
+ radix_tree_node_t *node;
+ /* returns the leaf buffer now stored for @sector, or NULL on failure */
+ node = tree->root;
+ gettimeofday(&now, NULL);
+ /* descend one level per iteration, refreshing each link's timestamp */
+ do {
+ idx = radix_tree_index(node, sector);
+ link = node->links + idx;
+ link->time = now.tv_sec;
+
+ if (radix_tree_node_contains_leaves(tree, node)) {
+ radix_tree_remove_page(tree, link->u.leaf.page); /* evict whatever was cached here */
+ radix_tree_insert_leaf(tree, link, page, off);
+ return link->u.leaf.buf;
+ }
+ /* interior level: create the missing child on demand */
+ if (!link->u.next) {
+ link->u.next = radix_tree_allocate_child_node(tree,
+ node);
+ if (!link->u.next)
+ return NULL;
+ }
+
+ node = link->u.next;
+ } while (1);
+}
+
+static int
+radix_tree_add_leaves(radix_tree_t *tree, char *buf,
+		      uint64_t sector, uint64_t sectors)
+{
+	/* Cache @sectors sectors from @buf, starting at @sector, as a single
+	 * page.  Returns 0 once the tree owns @buf.  On -ENOMEM the page is
+	 * torn down but @buf is left for the caller to free. */
+	int n;
+	radix_tree_page_t *pg;
+	pg = radix_tree_allocate_page(tree, buf, sector,
+				      sectors << RADIX_TREE_NODE_SHIFT);
+	if (pg == NULL)
+		return -ENOMEM;
+
+	for (n = 0; n < sectors; n++) {
+		if (radix_tree_add_leaf(tree, sector + n,
+					pg, (n << RADIX_TREE_NODE_SHIFT)))
+			continue;
+		pg->buf = NULL;	/* keep the page teardown away from @buf */
+		radix_tree_remove_page(tree, pg);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void
+radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node)
+{
+	/* Recursively free @node and everything below it, including any
+	 * cached pages its leaves refer to.  A NULL @node is ignored. */
+	int slot, leaves;
+	radix_tree_link_t *cur;
+
+	if (node == NULL)
+		return;
+
+	leaves = radix_tree_node_contains_leaves(tree, node);
+	for (slot = 0; slot < RADIX_TREE_NODE_SIZE; slot++) {
+		cur = &node->links[slot];
+		if (leaves)
+			radix_tree_remove_page(tree, cur->u.leaf.page);
+		else
+			radix_tree_delete_branch(tree, cur->u.next);
+		radix_tree_clear_link(cur);
+	}
+	radix_tree_free_node(tree, node);
+}
+
+static inline void
+radix_tree_destroy(radix_tree_t *tree)
+{ /* free every node and cached page, leaving the tree without a root */
+ radix_tree_delete_branch(tree, tree->root);
+ tree->root = NULL;
+}
+
+/* expire links under @node idle for BLOCK_CACHE_PAGE_IDLETIME or longer, relative to @now.
+ * returns 1 if @node is empty after pruning, 0 otherwise
+ */
+static int
+radix_tree_prune_branch(radix_tree_t *tree,
+ radix_tree_node_t *node, uint32_t now)
+{
+ int i, empty;
+ radix_tree_link_t *link;
+ /* a NULL @node counts as empty */
+ empty = 1;
+ if (!node)
+ return empty;
+
+ for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+ link = node->links + i;
+ /* recently used link: keep it, but still prune below it */
+ if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) {
+ if (radix_tree_node_contains_leaves(tree, node)) {
+ empty = 0;
+ continue;
+ }
+ /* an emptied non-root child frees itself, so only the link is cleared here */
+ if (radix_tree_prune_branch(tree, link->u.next, now))
+ radix_tree_clear_link(link);
+ else
+ empty = 0;
+
+ continue;
+ }
+ /* idle link: drop the page or the whole subtree behind it */
+ if (radix_tree_node_contains_leaves(tree, node))
+ radix_tree_remove_page(tree, link->u.leaf.page);
+ else
+ radix_tree_delete_branch(tree, link->u.next);
+
+ radix_tree_clear_link(link);
+ }
+ /* the root node is kept even when empty */
+ if (empty && !radix_tree_node_is_root(tree, node))
+ radix_tree_free_node(tree, node);
+
+ return empty;
+}
+
+/*
+ * walk tree and free any node that has been idle for too long, logging the cached byte count before and after
+ */
+static void
+radix_tree_prune(radix_tree_t *tree)
+{
+ struct timeval now;
+ /* nothing to do for a tree without a root */
+ if (!tree->root)
+ return;
+
+ DPRINTF("tree %s has %"PRIu64" bytes\n",
+ tree->cache->name, tree->size);
+ /* idle time is measured against the current wall-clock second */
+ gettimeofday(&now, NULL);
+ radix_tree_prune_branch(tree, tree->root, now.tv_sec);
+
+ DPRINTF("tree %s now has %"PRIu64" bytes\n",
+ tree->cache->name, tree->size);
+}
+
+static inline int
+radix_tree_initialize(radix_tree_t *tree, uint64_t sectors)
+{ /* size the tree for @sectors and allocate its root; 0 or -ENOMEM */
+ tree->height = radix_tree_calculate_height(sectors);
+ tree->root = radix_tree_allocate_node(tree, tree->height);
+ if (!tree->root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static inline void
+radix_tree_free(radix_tree_t *tree)
+{ /* thin wrapper around radix_tree_destroy() */
+ radix_tree_destroy(tree);
+}
+
+static void
+block_cache_prune_event(event_id_t id, char mode, void *private)
+{
+	/* Event callback registered by block_cache_open(): @private is the
+	 * block_cache_t whose tree should be pruned; @id and @mode are
+	 * not used. */
+	block_cache_t *cache = private;
+
+	radix_tree_prune(&cache->tree);
+
+}
+
+static inline block_cache_request_t *
+block_cache_get_request(block_cache_t *cache)
+{
+	/* pop a free request slot, or return NULL when none are left */
+	return cache->requests_free
+		? cache->request_free_list[--cache->requests_free]
+		: NULL;
+}
+
+static inline void
+block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq)
+{ /* zero @breq and push it back onto the free list */
+ memset(breq, 0, sizeof(block_cache_request_t));
+ cache->request_free_list[cache->requests_free++] = breq;
+}
+
+static int
+block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	/* Set up a read-only sector cache named after @name.  Returns 0 on
+	 * success and a negative value on failure. */
+	int i, err;
+	radix_tree_t *tree;
+	block_cache_t *cache;
+
+	if (!td_flag_test(flags, TD_OPEN_RDONLY))
+		return -EINVAL;
+	if (driver->info.sector_size != RADIX_TREE_NODE_SIZE)
+		return -EINVAL;
+
+	cache = (block_cache_t *)driver->data;
+	err = tapdisk_namedup(&cache->name, (char *)name);
+	if (err)
+		return -ENOMEM;
+
+	cache->sectors = driver->info.size;
+	tree = &cache->tree;
+	err = radix_tree_initialize(tree, cache->sectors);
+	if (err)
+		goto fail;
+
+	tree->cache = cache;
+	cache->requests_free = BLOCK_CACHE_REQUESTS;
+	for (i = 0; i < BLOCK_CACHE_REQUESTS; i++)
+		cache->request_free_list[i] = cache->requests + i;
+
+	cache->timeout_id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+			-1, /* dummy fd */
+			BLOCK_CACHE_PAGE_IDLETIME << 1,
+			block_cache_prune_event,
+			cache);
+	if (cache->timeout_id < 0) {
+		err = cache->timeout_id; /* was left at 0, reporting success */
+		goto fail;
+	}
+
+	DPRINTF("opening cache for %s, sectors: %"PRIu64", "
+		"tree: %p, height: %d\n",
+		cache->name, cache->sectors, tree, tree->height);
+	if (mlockall(MCL_CURRENT | MCL_FUTURE))
+		DPRINTF("mlockall failed: %d\n", -errno);
+	return 0;
+
+fail:
+	free(cache->name);
+	radix_tree_free(&cache->tree);
+	return err;
+}
+
+static int
+block_cache_close(td_driver_t *driver)
+{
+	/* Tear the cache down in the order: prune event, tree contents,
+	 * name.  Always returns 0. */
+	block_cache_t *cache = (block_cache_t *)driver->data;
+
+	DPRINTF("closing cache for %s\n", cache->name);
+
+	/* unregister the prune event before the tree is freed */
+	tapdisk_server_unregister_event(cache->timeout_id);
+	/* frees every node and cached page */
+	radix_tree_free(&cache->tree);
+	free(cache->name);
+
+	return 0;
+}
+
+static inline uint64_t
+block_cache_hash(block_cache_t *cache, char *buf)
+{ /* debug checksum of one sector; currently disabled, always yields 0 */
+ int i, n;
+ uint64_t cksm, *data;
+ /* NOTE(review): this early return makes everything below unreachable */
+ return 0;
+ /* unreachable: complement of the sum of the sector's 64-bit words */
+ cksm = 0;
+ data = (uint64_t *)buf;
+ n = RADIX_TREE_NODE_SIZE / sizeof(uint64_t);
+
+ for (i = 0; i < n; i++)
+ cksm += data[i];
+
+ return ~cksm;
+}
+
+static void
+block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[])
+{
+	/* Serve @treq entirely from the cache: iov[n] is the cached copy
+	 * of sector treq.sec + n.  Completes the request with status 0. */
+	int n;
+
+	cache->stats.hits += treq.secs;
+
+	for (n = 0; n < treq.secs; n++) {
+		DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n",
+		    cache->name, treq.sec + n, block_cache_hash(cache, iov[n]));
+		memcpy(treq.buf + (off_t)(n << RADIX_TREE_NODE_SHIFT),
+		       iov[n], RADIX_TREE_NODE_SIZE);
+	}
+
+	td_complete_request(treq, 0);
+}
+
+static void
+block_cache_populate_cache(td_request_t clone, int err)
+{ /* completion callback installed by block_cache_miss() on the forwarded read */
+ int i;
+ radix_tree_t *tree;
+ block_cache_t *cache;
+ block_cache_request_t *breq;
+ /* account for the sectors this completion covers; keep the first error */
+ breq = (block_cache_request_t *)clone.cb_data;
+ cache = breq->cache;
+ tree = &cache->tree;
+ breq->secs -= clone.secs;
+ breq->err = (breq->err ? breq->err : err);
+ /* wait until every forwarded sector has completed */
+ if (breq->secs)
+ return;
+
+ if (breq->err) {
+ free(breq->buf);
+ goto out;
+ }
+ /* copy the data out to the original request's buffer */
+ for (i = 0; i < breq->treq.secs; i++) {
+ off_t off = i << RADIX_TREE_NODE_SHIFT;
+ DBG("%s: populating sec 0x%08llx\n",
+ cache->name, breq->treq.sec + i);
+ memcpy(breq->treq.buf + off,
+ breq->buf + off, RADIX_TREE_NODE_SIZE);
+ }
+ /* hand the buffer to the tree; free it here only if insertion fails */
+ if (radix_tree_add_leaves(tree, breq->buf,
+ breq->treq.sec, breq->treq.secs))
+ free(breq->buf);
+
+out:
+ td_complete_request(breq->treq, breq->err);
+ block_cache_put_request(cache, breq);
+}
+
+static void
+block_cache_miss(block_cache_t *cache, td_request_t treq)
+{ /* forward @treq, redirected through a cache buffer when one can be set up */
+ char *buf;
+ size_t size;
+ td_request_t clone;
+ radix_tree_t *tree;
+ block_cache_request_t *breq;
+
+ DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec);
+ /* by default the request is forwarded unchanged (clone == treq) */
+ clone = treq;
+ tree = &cache->tree;
+ size = treq.secs << RADIX_TREE_NODE_SHIFT;
+
+ cache->stats.misses += treq.secs;
+ /* cache at its size limit: forward without caching */
+ if (radix_tree_size(tree) + size >= BLOCK_CACHE_MAX_SIZE)
+ goto out;
+ /* no free request slot: forward without caching */
+ breq = block_cache_get_request(cache);
+ if (!breq)
+ goto out;
+ /* no buffer: return the slot and forward without caching */
+ if (posix_memalign((void **)&buf, RADIX_TREE_NODE_SIZE, size)) {
+ block_cache_put_request(cache, breq);
+ goto out;
+ }
+
+ breq->treq = treq;
+ breq->secs = treq.secs;
+ breq->err = 0;
+ breq->buf = buf;
+ breq->cache = cache;
+ /* read into our buffer; block_cache_populate_cache() completes treq */
+ clone.buf = buf;
+ clone.cb = block_cache_populate_cache;
+ clone.cb_data = breq;
+
+out:
+ td_forward_request(clone);
+}
+
+static void
+block_cache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	/* Serve @treq from the cache when every sector is present, take the
+	 * miss path otherwise; requests larger than one page bypass the cache. */
+	int i;
+	block_cache_t *cache = (block_cache_t *)driver->data;
+	char *iov[BLOCK_CACHE_NODES_PER_PAGE];
+
+	cache->stats.reads += treq.secs;
+	if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE) {
+		td_forward_request(treq);
+		return;
+	}
+
+	for (i = 0; i < treq.secs; i++) {
+		iov[i] = radix_tree_find_leaf(&cache->tree, treq.sec + i);
+		if (!iov[i]) {
+			block_cache_miss(cache, treq);
+			return;
+		}
+	}
+	block_cache_hit(cache, treq, iov);
+}
+
+static void
+block_cache_queue_write(td_driver_t *driver, td_request_t treq)
+{ /* every write is refused with -EPERM */
+ td_complete_request(treq, -EPERM);
+}
+
+static int
+block_cache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{ /* always fails with -EINVAL; @id is never written */
+ return -EINVAL;
+}
+
+static int
+block_cache_validate_parent(td_driver_t *driver,
+			    td_driver_t *pdriver, td_flag_t flags)
+{
+	/* Accept @pdriver only if it is flagged TD_DRIVER_RDONLY and has
+	 * the same name as @driver.  Returns 0 if acceptable, else
+	 * -EINVAL.  @flags is not consulted. */
+	if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY))
+		return -EINVAL;
+
+	if (strcmp(driver->name, pdriver->name))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void
+block_cache_debug(td_driver_t *driver)
+{
+	/* Log the cache's name and its four sector counters at warning
+	 * level. */
+	block_cache_t *cache = (block_cache_t *)driver->data;
+	block_cache_stats_t *st = &cache->stats;
+
+	WARN("BLOCK CACHE %s\n", cache->name);
+
+	WARN("reads: %"PRIu64", hits: %"PRIu64", misses: %"PRIu64", prunes: %"PRIu64"\n",
+	     st->reads, st->hits, st->misses, st->prunes);
+}
+
+struct tap_disk tapdisk_block_cache = { /* entry points of the block cache driver defined in this file */
+ .disk_type = "tapdisk_block_cache",
+ .flags = 0,
+ .private_data_size = sizeof(block_cache_t), /* size of driver->data used by the block_cache_* functions */
+ .td_open = block_cache_open,
+ .td_close = block_cache_close,
+ .td_queue_read = block_cache_queue_read,
+ .td_queue_write = block_cache_queue_write, /* always fails with -EPERM */
+ .td_get_parent_id = block_cache_get_parent_id,
+ .td_validate_parent = block_cache_validate_parent,
+ .td_debug = block_cache_debug,
+};
diff --git a/tools/blktap2/drivers/block-log.c b/tools/blktap2/drivers/block-log.c
new file mode 100644
index 0000000000..2cc051b7d2
--- /dev/null
+++ b/tools/blktap2/drivers/block-log.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver to sit on top of another disk and log writes, in order
+ * to synchronize two distinct disks
+ *
+ * On receipt of a control request it can export a list of dirty
+ * sectors in the following format:
+ * struct writerange {
+ * u64 sector;
+ * u32 count;
+ * }
+ * terminated by { 0, 0 }
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "log.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+#define MAX_CONNECTIONS 1
+
+/* A file descriptor paired with the id of the event registered for it. */
+typedef struct poll_fd {
+ /* descriptor; the close paths in this file reset it to -1 */
+ int fd;
+ /* id returned by tapdisk_server_register_event() for fd */
+ event_id_t id;
+} poll_fd_t;
+
+/* Per-device private state of the write-logging driver. */
+struct tdlog_state {
+ /* copied from driver->info.size at open; used as the number of bits
+  * (one per sector) tracked by the dirty bitmap */
+ uint64_t size;
+
+ /* dirty bitmap allocated by writelog_create() */
+ void* writelog;
+
+ /* filesystem path of the control socket, and the listening socket */
+ char* ctlpath;
+ poll_fd_t ctl;
+
+ /* number of live entries in connections[] */
+ int connected;
+ poll_fd_t connections[MAX_CONNECTIONS];
+
+ /* name and mapping of the POSIX shared memory object */
+ char* shmpath;
+ void* shm;
+
+ /* request/response ring located inside the shm mapping */
+ log_sring_t* sring;
+ log_back_ring_t bring;
+};
+
+/* Debug-level syslog helper: prefixes "log: " and appends a newline, so
+ * format strings passed in should not end in "\n" themselves. */
+#define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a)
+
+/* Warning-level variant of the same helper. */
+#define BWPRINTF(_f, _a...) syslog (LOG_WARNING, "log: " _f "\n", ## _a)
+
+static void ctl_accept(event_id_t, char, void *);
+static void ctl_request(event_id_t, char, void *);
+
+/* -- write log -- */
+
+/* large flat bitmaps don't scale particularly well either in size or scan
+ * time, but they'll do for now */
+/* number of bits in one bitmap word (unsigned long) */
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+/* number of words needed to hold `bits` bits, rounded up */
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+
+/* the word of _bmap that contains bit _nr (usable as an lvalue) */
+#define BITMAP_ENTRY(_nr, _bmap) ((unsigned long*)(_bmap))[(_nr)/BITS_PER_LONG]
+/* position of bit _nr inside its word */
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/*
+ * Return 1 if bit nr of the bitmap at bmap is set, 0 otherwise.
+ *
+ * nr is 64 bits wide because callers in this file index the bitmap with
+ * uint64_t sector numbers; an int parameter would silently truncate them.
+ */
+static inline int test_bit(uint64_t nr, void* bmap)
+{
+    return (BITMAP_ENTRY(nr, bmap) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+/*
+ * Clear bit nr of the bitmap at bmap.
+ *
+ * nr is 64 bits wide because callers in this file pass uint64_t sector
+ * numbers; an int parameter would silently truncate them.
+ */
+static inline void clear_bit(uint64_t nr, void* bmap)
+{
+    BITMAP_ENTRY(nr, bmap) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+/*
+ * Set bit nr of the bitmap at bmap.
+ *
+ * nr is 64 bits wide because callers in this file pass uint64_t sector
+ * numbers; an int parameter would silently truncate them.
+ */
+static inline void set_bit(uint64_t nr, void* bmap)
+{
+    BITMAP_ENTRY(nr, bmap) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/*
+ * Number of bytes needed for a bitmap of sz bits.
+ *
+ * The result is rounded up to a whole number of unsigned long words,
+ * because the bit helpers in this file read and write the bitmap one
+ * unsigned long at a time.  It is returned as uint64_t so that large
+ * bit counts are not truncated.
+ */
+static inline uint64_t bitmap_size(uint64_t sz)
+{
+    const uint64_t bits_per_word = sizeof(unsigned long) * 8;
+
+    return ((sz + bits_per_word - 1) / bits_per_word) * sizeof(unsigned long);
+}
+
+/*
+ * Allocate the zero-filled dirty bitmap for s->size bits and store it in
+ * s->writelog.  Returns 0 on success, -1 if the allocation fails.
+ */
+static int writelog_create(struct tdlog_state *s)
+{
+    uint64_t nbytes = bitmap_size(s->size);
+
+    BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", nbytes);
+
+    s->writelog = calloc(nbytes, 1);
+    if (s->writelog)
+        return 0;
+
+    BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, nbytes);
+    return -1;
+}
+
+/*
+ * Release the dirty bitmap.  The pointer is reset so that a second call
+ * is harmless (tdlog_open calls tdlog_close on its failure paths).
+ * Always returns 0.
+ */
+static int writelog_free(struct tdlog_state *s)
+{
+    free(s->writelog);   /* free(NULL) is a no-op */
+    s->writelog = NULL;
+
+    return 0;
+}
+
+/*
+ * Mark `count` consecutive sectors starting at `sector` as dirty.
+ * Always returns 0.
+ */
+static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
+{
+    int done = 0;
+
+    while (done < count) {
+        set_bit(sector + done, s->writelog);
+        done++;
+    }
+
+    return 0;
+}
+
+/*
+ * Clear the dirty bits in the half-open range [start, end).
+ *
+ * If end is 0, or lies beyond the end of the disk, the range is clamped
+ * to s->size.  An empty range is a no-op.  Always returns 0.
+ */
+int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
+{
+    if (!end || end > s->size)
+        end = s->size;
+    if (start >= end)
+        return 0;
+
+    /* clear bit by bit up to the next word boundary, never past end */
+    while (start < end && BITMAP_SHIFT(start))
+        clear_bit(start++, s->writelog);
+    /* same from the back; end is exclusive, so step down before clearing */
+    while (end > start && BITMAP_SHIFT(end))
+        clear_bit(--end, s->writelog);
+
+    /* start and end are now word aligned: zero the whole words between
+     * them, addressing the bitmap in bytes */
+    if (end > start)
+        memset((char*)s->writelog + (start >> 3), 0, (end - start) >> 3);
+
+    return 0;
+}
+
+/*
+ * Translate the dirty bitmap into a list of {sector, count} extents
+ * written to the start of the shared memory region.
+ *
+ * Returns the sector index at which scanning stopped: s->size after a
+ * full scan, or an earlier sector if the shm region filled up.  The list
+ * is terminated by a {0, 0} entry only after a full scan.
+ */
+static uint64_t writelog_export(struct tdlog_state* s)
+{
+    struct disk_range* range = s->shm;
+    uint64_t i = 0;
+
+    BDPRINTF("sector count: %"PRIu64, s->size);
+
+    while (i < s->size) {
+        if (!test_bit(i, s->writelog)) {
+            i++;
+            continue;
+        }
+
+        /* range start */
+        range->sector = i;
+        range->count = 1;
+        /* extend over the following dirty sectors */
+        for (i++; i < s->size && test_bit(i, s->writelog); i++)
+            range->count++;
+
+        BDPRINTF("export: dirty extent %"PRIu64":%u",
+                 range->sector, range->count);
+        range++;
+
+        /* out of space in shared memory region */
+        if ((void*)range >= bmend(s->shm)) {
+            BDPRINTF("out of space in shm region at sector %"PRIu64, i);
+            return i;
+        }
+
+        /* i already points at the first sector after the extent */
+    }
+
+    /* NULL-terminate range list */
+    range->sector = 0;
+    range->count = 0;
+
+    return i;
+}
+
+/* -- communication channel -- */
+
+/*
+ * Replace every ':' and '/' with '_' in the first len bytes of path,
+ * stopping early at the terminating NUL.
+ */
+static inline void path_escape(char* path, size_t len) {
+    size_t left;
+
+    for (left = len; left && *path; left--, path++)
+        if (*path == ':' || *path == '/')
+            *path = '_';
+}
+
+/*
+ * Build the path BLKTAP_CTRL_DIR "/log_<file>.<ext>", where <file> is
+ * the part of name from its last '/' onwards (slash included) with ':'
+ * and '/' replaced by '_'.
+ *
+ * Returns a heap string the caller must free, or NULL if name contains
+ * no '/' or the allocation fails.
+ */
+static char* ctl_makepath(const char* name, const char* ext)
+{
+    char *path;
+    char *base = strrchr(name, '/');
+
+    if (base == NULL) {
+        BWPRINTF("invalid name %s\n", name);
+        return NULL;
+    }
+
+    if (asprintf(&path, BLKTAP_CTRL_DIR "/log_%s.%s", base, ext) >= 0) {
+        /* skip the directory and the 5 characters of "/log_" */
+        path_escape(path + strlen(BLKTAP_CTRL_DIR) + 5, strlen(base));
+        return path;
+    }
+
+    BWPRINTF("could not allocate path");
+    return NULL;
+}
+
+/*
+ * Create and map the shared memory object used to export the write log.
+ *
+ * The object is named "/log_<name>.wlog" with ':' and '/' in <name>
+ * replaced by '_', sized to SHMSIZE and mapped read/write into s->shm.
+ *
+ * Returns 0 on success.  On failure returns -1 with s->shm and
+ * s->shmpath both NULL; an object created by this call is unlinked
+ * again so that it is not left behind.
+ */
+static int shmem_open(struct tdlog_state* s, const char* name)
+{
+    int fd;
+
+    /* device name -> path */
+    if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
+        BWPRINTF("could not allocate shm path");
+        /* the pointer is undefined after a failed asprintf */
+        s->shmpath = NULL;
+        return -1;
+    }
+
+    path_escape(s->shmpath + 5, strlen(name));
+
+    if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
+        BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
+                 strerror(errno));
+        goto err;
+    }
+    if (ftruncate(fd, SHMSIZE) < 0) {
+        BWPRINTF("error truncating shmem to size %u", SHMSIZE);
+        close(fd);
+        goto err_unlink;
+    }
+
+    s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+    close(fd);
+    if (s->shm == MAP_FAILED) {
+        BWPRINTF("could not mmap write log shm: %s", strerror(errno));
+        goto err_unlink;
+    }
+    return 0;
+
+ err_unlink:
+    shm_unlink(s->shmpath);
+ err:
+    s->shm = NULL;
+    free(s->shmpath);
+    s->shmpath = NULL;
+    return -1;
+}
+
+/*
+ * Unmap the shared memory region and remove the shm object.
+ * Both pointers are reset, so a repeated call is harmless.
+ * Always returns 0.
+ */
+static int shmem_close(struct tdlog_state* s)
+{
+    if (s->shm) {
+        munmap(s->shm, SHMSIZE);
+        s->shm = NULL;
+    }
+
+    if (s->shmpath) {
+        shm_unlink(s->shmpath);
+        /* the path was allocated by asprintf() in shmem_open() */
+        free(s->shmpath);
+        s->shmpath = NULL;
+    }
+
+    return 0;
+}
+
+/* control socket */
+
+/*
+ * Create the control socket for the device `name`.
+ *
+ * Builds the socket path, binds a listening AF_UNIX stream socket to it
+ * (removing a stale socket file first) and registers ctl_accept() for
+ * incoming connections.
+ *
+ * Returns 0 on success.  On failure returns -1 with s->ctl.fd == -1 and
+ * s->ctlpath == NULL.
+ */
+static int ctl_open(struct tdlog_state* s, const char* name)
+{
+    struct sockaddr_un saddr;
+    size_t pathlen;
+
+    /* mark the socket as not open; 0 would be a valid descriptor */
+    s->ctl.fd = -1;
+
+    if (!(s->ctlpath = ctl_makepath(name, "ctl")))
+        return -1;
+
+    /* sun_path is a fixed-size array: refuse paths that do not fit
+     * together with their terminating NUL */
+    pathlen = strlen(s->ctlpath);
+    if (pathlen >= sizeof(saddr.sun_path)) {
+        BWPRINTF("control socket path %s too long", s->ctlpath);
+        goto err;
+    }
+
+    if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+        BWPRINTF("error opening control socket: %s", strerror(errno));
+        goto err;
+    }
+
+    memset(&saddr, 0, sizeof(saddr));
+    saddr.sun_family = AF_UNIX;
+    memcpy(saddr.sun_path, s->ctlpath, pathlen);
+    if (unlink(s->ctlpath) && errno != ENOENT) {
+        BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
+                 strerror(errno));
+        goto err_sock;
+    }
+
+    if (bind(s->ctl.fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+        BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
+                 strerror(errno));
+        goto err_sock;
+    }
+
+    if (listen(s->ctl.fd, 1) < 0) {
+        BWPRINTF("error listening on control socket: %s", strerror(errno));
+        goto err_sock;
+    }
+
+    s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                              s->ctl.fd, 0, ctl_accept, s);
+    if (s->ctl.id < 0) {
+        /* NOTE(review): assumes a negative id is a negated errno value;
+         * confirm against tapdisk_server_register_event() */
+        BWPRINTF("error register event handler: %s", strerror(-s->ctl.id));
+        goto err_sock;
+    }
+
+    return 0;
+
+ err_sock:
+    close(s->ctl.fd);
+    s->ctl.fd = -1;
+ err:
+    free(s->ctlpath);
+    s->ctlpath = NULL;
+
+    return -1;
+}
+
+/*
+ * Tear down the control channel: close every accepted connection, then
+ * the listening socket, remove the socket file and reset the shared
+ * ring.  Always returns 0.
+ */
+static int ctl_close(struct tdlog_state* s)
+{
+    /* live connections occupy slots [0, connected); step the counter
+     * down first so the index stays inside the array */
+    while (s->connected > 0) {
+        poll_fd_t *conn = &s->connections[--s->connected];
+
+        tapdisk_server_unregister_event(conn->id);
+        close(conn->fd);
+        conn->fd = -1;
+        conn->id = 0;
+    }
+
+    if (s->ctl.fd >= 0) {
+        tapdisk_server_unregister_event(s->ctl.id);
+        close(s->ctl.fd);
+        s->ctl.fd = -1;
+        s->ctl.id = 0;
+    }
+
+    if (s->ctlpath) {
+        unlink(s->ctlpath);
+        free(s->ctlpath);
+        s->ctlpath = NULL;
+    }
+
+    /* XXX this must be fixed once requests are actually in flight */
+    /* could just drain the existing ring here first */
+    if (s->sring) {
+        SHARED_RING_INIT(s->sring);
+        BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+    }
+
+    return 0;
+}
+
+/*
+ * Close the accepted connection whose descriptor is fd.
+ *
+ * Only the live slots [0, connected) are searched.  The last live entry
+ * is moved into the freed slot so the live entries stay contiguous.
+ * Returns 0 if the connection was found and closed, -1 otherwise.
+ */
+static int ctl_close_sock(struct tdlog_state* s, int fd)
+{
+    int i;
+
+    for (i = 0; i < s->connected; i++) {
+        if (s->connections[i].fd != fd)
+            continue;
+
+        tapdisk_server_unregister_event(s->connections[i].id);
+        close(s->connections[i].fd);
+
+        /* fill the hole with the last live entry, then blank that slot */
+        s->connected--;
+        s->connections[i] = s->connections[s->connected];
+        s->connections[s->connected].fd = -1;
+        s->connections[s->connected].id = 0;
+        return 0;
+    }
+
+    BWPRINTF("requested to close unknown socket %d", fd);
+    return -1;
+}
+
+/*
+ * Event callback for the listening control socket: accept one pending
+ * connection and register ctl_request() as its read handler.
+ *
+ * The connection is closed again if all MAX_CONNECTIONS slots are in
+ * use or the handler cannot be registered.
+ */
+static void ctl_accept(event_id_t id, char mode, void *private)
+{
+    struct tdlog_state* s = (struct tdlog_state *)private;
+    int fd;
+    event_id_t cid;
+
+    if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
+        BWPRINTF("error accepting control connection: %s", strerror(errno));
+        return;
+    }
+
+    /* never index past the end of connections[] */
+    if (s->connected >= MAX_CONNECTIONS) {
+        BWPRINTF("control session in progress, closing new connection");
+        close(fd);
+        return;
+    }
+
+    cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                        fd, 0, ctl_request, s);
+    if (cid < 0) {
+        /* NOTE(review): assumes a negative id is a negated errno value;
+         * confirm against tapdisk_server_register_event() */
+        BWPRINTF("error registering connection event handler: %s",
+                 strerror(-cid));
+        close(fd);
+        return;
+    }
+
+    s->connections[s->connected].fd = fd;
+    s->connections[s->connected].id = cid;
+    s->connected++;
+}
+
+/*
+ * Answer an SHMP request.
+ *
+ * response format: 4 bytes shmsize, 0-terminated path, CTLRSPLEN_SHMP
+ * bytes in total.  The buffer is zeroed first so that the bytes after
+ * the path do not expose stack contents to the peer.
+ *
+ * Returns 0 on success, -1 if the response could not be written in full.
+ */
+static int ctl_get_shmpath(struct tdlog_state* s, int fd)
+{
+    char msg[CTLRSPLEN_SHMP + 1];
+    uint32_t sz;
+    int rc;
+
+    BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
+             SHMSIZE, s->shmpath);
+
+    /* TMP: sanity-check shm */
+    sz = 0xdeadbeef;
+    memcpy(s->shm, &sz, sizeof(sz));
+
+    memset(msg, 0, sizeof(msg));
+    sz = SHMSIZE;
+    memcpy(msg, &sz, sizeof(sz));
+    snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
+
+    rc = write(fd, msg, CTLRSPLEN_SHMP);
+    if (rc < 0) {
+        BWPRINTF("error writing shmpath: %s", strerror(errno));
+        return -1;
+    }
+    if (rc < (int)CTLRSPLEN_SHMP) {
+        BWPRINTF("short shmpath write (%d/%d)", rc, (int)CTLRSPLEN_SHMP);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Answer a PEEK request: export the dirty extents to shared memory
+ * without clearing the bitmap, then acknowledge with "done".
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_peek_writes(struct tdlog_state* s, int fd)
+{
+    BDPRINTF("ctl: peeking bitmap");
+
+    writelog_export(s);
+
+    if (write(fd, "done", CTLRSPLEN_PEEK) >= 0)
+        return 0;
+
+    BWPRINTF("error writing peek ack: %s", strerror(errno));
+    return -1;
+}
+
+/*
+ * Answer a CLEAR request: clear the whole dirty bitmap, then
+ * acknowledge with "done".
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_clear_writes(struct tdlog_state* s, int fd)
+{
+    BDPRINTF("ctl: clearing bitmap");
+
+    writelog_clear(s, 0, 0);
+
+    if (write(fd, "done", CTLRSPLEN_CLEAR) >= 0)
+        return 0;
+
+    BWPRINTF("error writing clear ack: %s", strerror(errno));
+    return -1;
+}
+
+/*
+ * Answer a GET request: export the dirty extents to shared memory and
+ * then clear the whole bitmap, with no other step in between, before
+ * acknowledging with "done".
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_get_writes(struct tdlog_state* s, int fd)
+{
+    BDPRINTF("ctl: getting bitmap");
+
+    writelog_export(s);
+    writelog_clear(s, 0, 0);
+
+    if (write(fd, "done", CTLRSPLEN_GET) >= 0)
+        return 0;
+
+    BWPRINTF("error writing get ack: %s", strerror(errno));
+    return -1;
+}
+
+/*
+ * Answer a KICK request: consume every request currently on the shared
+ * ring, echo its sector/count back as a response, push the responses
+ * and notify the peer with a KICK message on fd.
+ *
+ * Returns 0 on success, -1 if the notification is not written in full.
+ */
+static int ctl_kick(struct tdlog_state* s, int fd)
+{
+    RING_IDX reqstart, reqend;
+    log_request_t req;
+
+    /* XXX testing */
+    log_response_t rsp;
+    struct log_ctlmsg msg;
+    int rc;
+
+    reqstart = s->bring.req_cons;
+    reqend = s->sring->req_prod;
+
+    BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend);
+
+    while (reqstart != reqend) {
+        /* XXX actually submit these! */
+        memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req));
+        BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
+        s->bring.req_cons = ++reqstart;
+
+        rsp.sector = req.sector;
+        rsp.count = req.count;
+        memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
+               sizeof(rsp));
+        s->bring.rsp_prod_pvt++;
+    }
+
+    RING_PUSH_RESPONSES(&s->bring);
+    memset(&msg, 0, sizeof(msg));
+    memcpy(msg.msg, LOGCMD_KICK, 4);
+    if ((rc = write(fd, &msg, sizeof(msg))) < 0) {
+        BWPRINTF("error sending notify: %s", strerror(errno));
+        return -1;
+    } else if ((size_t)rc < sizeof(msg)) {
+        BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Dispatch one control message to its handler, matching on the first
+ * four bytes of msg->msg.  Returns the handler's result, or -1 for an
+ * unknown command.
+ */
+static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg)
+{
+    /* checked in order; the first match wins */
+    static const struct {
+        const char *cmd;
+        int (*handler)(struct tdlog_state*, int);
+    } dispatch[] = {
+        { LOGCMD_SHMP,  ctl_get_shmpath },
+        { LOGCMD_PEEK,  ctl_peek_writes },
+        { LOGCMD_CLEAR, ctl_clear_writes },
+        { LOGCMD_GET,   ctl_get_writes },
+        { LOGCMD_KICK,  ctl_kick },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(dispatch) / sizeof(dispatch[0]); i++)
+        if (!strncmp(msg->msg, dispatch[i].cmd, 4))
+            return dispatch[i].handler(s, fd);
+
+    BWPRINTF("unknown control request %.4s", msg->msg);
+    return -1;
+}
+
+/*
+ * Map an event id to the descriptor of the live connection registered
+ * under it.  Returns the descriptor, or -1 if no live connection has
+ * that id.
+ */
+static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
+{
+    poll_fd_t *conn = s->connections;
+    poll_fd_t *last = conn + s->connected;
+
+    for (; conn < last; conn++)
+        if (conn->id == id)
+            return conn->fd;
+
+    BWPRINTF("unrecognized event callback id %d", id);
+    return -1;
+}
+
+/*
+ * Event callback for an accepted control connection: read one
+ * struct log_ctlmsg and dispatch it.
+ *
+ * A read error or EOF closes the connection.  A short message is logged
+ * and ignored.
+ */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+    struct tdlog_state* s = (struct tdlog_state*)private;
+    struct log_ctlmsg msg;
+    int rc, fd;
+
+    fd = ctl_find_connection(s, id);
+    if (fd == -1)
+        return;
+
+    rc = read(fd, &msg, sizeof(msg));
+    if (rc < 0) {
+        BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
+                 strerror(errno));
+        ctl_close_sock(s, fd);
+        return;
+    }
+    if (rc == 0) {
+        BDPRINTF("ctl_request: EOF, closing socket");
+        ctl_close_sock(s, fd);
+        return;
+    }
+    /* rc is positive here, so the conversion is safe */
+    if ((size_t)rc < sizeof(msg)) {
+        BWPRINTF("short request received (%d/%zd bytes), ignoring", rc,
+                 sizeof(msg));
+        return;
+    }
+
+    ctl_do_request(s, fd, &msg);
+}
+
+/* -- interface -- */
+
+static int tdlog_close(td_driver_t*);
+
+/*
+ * Open the logging driver: allocate the dirty bitmap, create the shared
+ * memory region and the control socket, and initialise the ring inside
+ * the shared region.
+ *
+ * Returns 0 on success.  If any step fails, everything set up so far is
+ * released through tdlog_close() and that step's error is returned.
+ */
+static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
+{
+    struct tdlog_state* s = (struct tdlog_state*)driver->data;
+    int rc;
+
+    memset(s, 0, sizeof(*s));
+    /* the memset leaves ctl.fd at 0, which is a valid descriptor; mark
+     * the control socket as not open so that tdlog_close() on an early
+     * failure does not close descriptor 0 */
+    s->ctl.fd = -1;
+
+    s->size = driver->info.size;
+
+    if ((rc = writelog_create(s))) {
+        tdlog_close(driver);
+        return rc;
+    }
+    if ((rc = shmem_open(s, name))) {
+        tdlog_close(driver);
+        return rc;
+    }
+    if ((rc = ctl_open(s, name))) {
+        tdlog_close(driver);
+        return rc;
+    }
+
+    s->sring = (log_sring_t*)sringstart(s->shm);
+    SHARED_RING_INIT(s->sring);
+    BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+
+    BDPRINTF("opened ctl socket");
+
+    return 0;
+}
+
+/*
+ * Close the logging driver: tear down the control channel, the shared
+ * memory region and the dirty bitmap, in that order.  Always returns 0.
+ */
+static int tdlog_close(td_driver_t* driver)
+{
+    struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+    ctl_close(s);
+    shmem_close(s);
+    /* the ring lives inside the shm mapping that was just removed; drop
+     * the pointer so a later ctl_close() does not touch unmapped memory */
+    s->sring = NULL;
+    writelog_free(s);
+
+    return 0;
+}
+
+/* Reads are not logged: the request is forwarded unchanged. */
+static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
+{
+    (void)driver;
+
+    td_forward_request(treq);
+}
+
+/*
+ * Record the sectors touched by a write in the dirty bitmap, then
+ * forward the request unchanged.
+ */
+static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
+{
+    struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+    writelog_set(s, treq.sec, treq.secs);
+    td_forward_request(treq);
+}
+
+/* Parent-id query: unconditionally answers -EINVAL; *id is untouched. */
+static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
+{
+    (void)driver;
+    (void)id;
+
+    return -EINVAL;
+}
+
+/* Parent validation: every parent is accepted (always returns 0). */
+static int tdlog_validate_parent(td_driver_t *driver,
+                                 td_driver_t *parent, td_flag_t flags)
+{
+    (void)driver;
+    (void)parent;
+    (void)flags;
+
+    return 0;
+}
+
+/*
+ * Driver descriptor for the write-logging driver.  It names the disk
+ * type, sets no flags, and binds the tdlog_* callbacks defined above.
+ * private_data_size is sizeof(struct tdlog_state), the type those
+ * callbacks cast driver->data to.  No td_debug hook is provided.
+ */
+struct tap_disk tapdisk_log = {
+ .disk_type = "tapdisk_log",
+ .private_data_size = sizeof(struct tdlog_state),
+ .flags = 0,
+ .td_open = tdlog_open,
+ .td_close = tdlog_close,
+ .td_queue_read = tdlog_queue_read,
+ .td_queue_write = tdlog_queue_write,
+ .td_get_parent_id = tdlog_get_parent_id,
+ .td_validate_parent = tdlog_validate_parent,
+};
diff --git a/tools/blktap2/drivers/block-qcow.c b/tools/blktap2/drivers/block-qcow.c
new file mode 100644
index 0000000000..1ddd92d750
--- /dev/null
+++ b/tools/blktap2/drivers/block-qcow.c
@@ -0,0 +1,1517 @@
+/* block-qcow.c
+ *
+ * Asynchronous Qemu copy-on-write disk implementation.
+ * Code based on the Qemu implementation
+ * (see copyright notice below)
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ */
+
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files(the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+#include <openssl/md5.h>
+#include "bswap.h"
+#include "aes.h"
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "qcow.h"
+#include "blk.h"
+#include "atomicio.h"
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+/*
+ * ASSERT(_p): if _p is false, log the failed expression with its line
+ * and file, then force a crash by writing through a null pointer.
+ *
+ * The body is wrapped in do { } while (0) so the macro behaves as a
+ * single statement and cannot capture a following `else`.
+ */
+#if 1
+#define ASSERT(_p) \
+    do { \
+        if ( !(_p) ) { \
+            DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
+                    __LINE__, __FILE__); \
+            *(int*)0=0; \
+        } \
+    } while (0)
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+/*
+ * Bookkeeping record for one pending asynchronous I/O.
+ * NOTE(review): not referenced in the visible part of this file; the
+ * request path shown here uses struct qcow_request instead.  Confirm
+ * whether it is still needed.
+ */
+struct pending_aio {
+ td_callback_t cb;
+ int id;
+ void *private;
+ int nb_sectors;
+ char *buf;
+ uint64_t sector;
+};
+
+/* index of the element _io within the array (_s)->iocb_list */
+#undef IOCB_IDX
+#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
+
+/* OR-ing with 0x00 leaves the value unchanged, so this evaluates to _b */
+#define ZERO_TEST(_b) (_b | 0x00)
+
+/*
+ * One in-flight asynchronous request.  Instances are handed out from the
+ * aio_free_list stack in async_read()/async_write() and pushed back by
+ * tdqcow_complete().
+ */
+struct qcow_request {
+ /* the request to complete once the I/O finishes */
+ td_request_t treq;
+ /* I/O control block prepared by td_prep_read()/td_prep_write() */
+ struct tiocb tiocb;
+ /* owning driver state, used to reach the free list on completion */
+ struct tdqcow_state *state;
+};
+
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
+
+#ifdef USE_GCRYPT
+
+#include <gcrypt.h>
+
+/*
+ * Return the first 32 bits of the MD5 digest of the len bytes at ptr
+ * (libgcrypt variant).
+ */
+uint32_t gen_cksum(char *ptr, int len)
+{
+    uint32_t md[4];
+
+    /* Generate checksum */
+    gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
+
+    return md[0];
+}
+
+#else /* use libcrypto */
+
+#include <openssl/md5.h>
+
+/*
+ * Return the first 32 bits of the MD5 digest of the len bytes at ptr
+ * (libcrypto variant), or 0 if MD5() does not return the digest buffer.
+ *
+ * The digest is kept on the stack, so no allocation can fail and be
+ * mistaken for a checksum of 0.
+ */
+uint32_t gen_cksum(char *ptr, int len)
+{
+    unsigned char md[MD5_DIGEST_LENGTH];
+    uint32_t ret = 0;
+
+    /* Generate checksum */
+    if (MD5((unsigned char *)ptr, len, md) == md)
+        memcpy(&ret, md, sizeof(uint32_t));
+
+    return ret;
+}
+
+#endif
+
+
+/*
+ * Release the two arrays allocated by init_aio_state(): the free-slot
+ * stack and the request pool it points into.
+ */
+static void free_aio_state(struct tdqcow_state* s)
+{
+    free(s->aio_free_list);
+    free(s->aio_requests);
+}
+
+/*
+ * Allocate the pool of qcow_request structures and the stack of free
+ * slots that points into it.
+ *
+ * The pool holds ((pagesize / cluster_size) + 1) * MAX_SEGMENTS_PER_REQ
+ * * MAX_REQUESTS entries, and every entry starts out free.
+ *
+ * Returns 0 on success.  On allocation failure returns -1 with both
+ * array pointers NULL, so a later free_aio_state() is harmless.
+ */
+static int init_aio_state(td_driver_t *driver)
+{
+    int i;
+    struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+
+    // A segment (i.e. a page) can span multiple clusters
+    s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
+        MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
+
+    s->aio_free_count = s->max_aio_reqs;
+
+    /* the free list is an array of pointers into aio_requests, so it is
+     * sized by its element type rather than by the request struct */
+    if (!(s->aio_requests = calloc(s->max_aio_reqs, sizeof(*s->aio_requests))) ||
+        !(s->aio_free_list = calloc(s->max_aio_reqs, sizeof(*s->aio_free_list)))) {
+        DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
+                s->max_aio_reqs);
+        goto fail;
+    }
+
+    for (i = 0; i < s->max_aio_reqs; i++)
+        s->aio_free_list[i] = &s->aio_requests[i];
+
+    DPRINTF("AIO state initialised\n");
+
+    return 0;
+ fail:
+    /* aio_requests may have been allocated before the second calloc
+     * failed; the free list never is at this point */
+    free(s->aio_requests);
+    s->aio_requests = NULL;
+    s->aio_free_list = NULL;
+    return -1;
+}
+
+/*
+ * Determine the size, in sectors, of the image at filename.
+ *
+ * If the file starts with a QCOW header, the virtual size recorded in
+ * the header is used.  Otherwise the size comes from the block device
+ * (when st describes one) or from st->st_size.
+ *
+ * Returns 0 with *size set, or -1 if the file cannot be opened, is
+ * shorter than a QCOW header, or the block device size query fails.
+ */
+int get_filesize(char *filename, uint64_t *size, struct stat *st)
+{
+    int fd;
+    ssize_t n;
+    QCowHeader header;
+
+    /*Set to the backing file size*/
+    fd = open(filename, O_RDONLY);
+    if (fd < 0)
+        return -1;
+    n = read(fd, &header, sizeof(header));
+    close(fd);
+    /* test the sign first: comparing read()'s -1 directly against the
+     * unsigned sizeof would convert it to a huge value and hide errors */
+    if (n < 0 || (size_t)n < sizeof(header))
+        return -1;
+
+    be32_to_cpus(&header.magic);
+    be64_to_cpus(&header.size);
+    if (header.magic == QCOW_MAGIC) {
+        *size = header.size >> SECTOR_SHIFT;
+        return 0;
+    }
+
+    if(S_ISBLK(st->st_mode)) {
+        fd = open(filename, O_RDONLY);
+        if (fd < 0)
+            return -1;
+        if (blk_getimagesize(fd, size) != 0) {
+            printf("Unable to get Block device size\n");
+            close(fd);
+            return -1;
+        }
+        close(fd);
+    } else *size = (st->st_size >> SECTOR_SHIFT);
+    return 0;
+}
+
+/*
+ * Install the AES-128 encryption and decryption keys derived from the
+ * string key, and activate the crypt method recorded in the header.
+ *
+ * The key is the first 16 bytes of the string, zero padded if shorter.
+ * Returns 0 on success, -1 if either AES key setup call fails.
+ */
+static int qcow_set_key(struct tdqcow_state *s, const char *key)
+{
+    uint8_t keybuf[16];
+    int len;
+
+    len = strlen(key);
+    if (len > 16)
+        len = 16;
+    /* XXX: we could compress the chars to 7 bits to increase
+       entropy */
+    memset(keybuf, 0, sizeof(keybuf));
+    memcpy(keybuf, key, len);
+    s->crypt_method = s->crypt_method_header;
+
+    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+        return -1;
+    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+        return -1;
+#if 0
+    /* test */
+    {
+        int i;
+        uint8_t in[16];
+        uint8_t out[16];
+        uint8_t tmp[16];
+        for (i=0; i<16; i++)
+            in[i] = i;
+        AES_encrypt(in, tmp, &s->aes_encrypt_key);
+        AES_decrypt(tmp, out, &s->aes_decrypt_key);
+        for (i = 0; i < 16; i++)
+            DPRINTF(" %02x", tmp[i]);
+        DPRINTF("\n");
+        for (i = 0; i < 16; i++)
+            DPRINTF(" %02x", out[i]);
+        DPRINTF("\n");
+    }
+#endif
+    return 0;
+}
+
+/*
+ * Completion callback for async_read()/async_write(): complete the
+ * original request with err and return its slot to the free stack.
+ */
+void tdqcow_complete(void *arg, struct tiocb *tiocb, int err)
+{
+    struct qcow_request *req = (struct qcow_request *)arg;
+    struct tdqcow_state *state = req->state;
+
+    (void)tiocb;
+
+    td_complete_request(req->treq, err);
+
+    /* push the slot back on top of the free stack */
+    state->aio_free_list[state->aio_free_count] = req;
+    state->aio_free_count++;
+}
+
+/*
+ * Queue an asynchronous read of treq.secs sectors starting at sector
+ * treq.sec from the image file into treq.buf.
+ *
+ * If no request slot is free, the request is completed at once with
+ * -EBUSY.  Otherwise tdqcow_complete() finishes it later.
+ */
+static void async_read(td_driver_t *driver, td_request_t treq)
+{
+    struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+    int nbytes = treq.secs * driver->info.sector_size;
+    uint64_t pos = treq.sec * (uint64_t)driver->info.sector_size;
+    struct qcow_request *req;
+
+    if (state->aio_free_count == 0) {
+        td_complete_request(treq, -EBUSY);
+        return;
+    }
+
+    /* pop a slot off the free stack */
+    state->aio_free_count--;
+    req = state->aio_free_list[state->aio_free_count];
+    req->treq = treq;
+    req->state = state;
+
+    td_prep_read(&req->tiocb, state->fd, treq.buf,
+                 nbytes, pos, tdqcow_complete, req);
+    td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Queue an asynchronous write of treq.secs sectors from treq.buf to the
+ * image file, starting at sector treq.sec.
+ *
+ * If no request slot is free, the request is completed at once with
+ * -EBUSY.  Otherwise tdqcow_complete() finishes it later.
+ */
+static void async_write(td_driver_t *driver, td_request_t treq)
+{
+    struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+    int nbytes = treq.secs * driver->info.sector_size;
+    uint64_t pos = treq.sec * (uint64_t)driver->info.sector_size;
+    struct qcow_request *req;
+
+    if (state->aio_free_count == 0) {
+        td_complete_request(treq, -EBUSY);
+        return;
+    }
+
+    /* pop a slot off the free stack */
+    state->aio_free_count--;
+    req = state->aio_free_list[state->aio_free_count];
+    req->treq = treq;
+    req->state = state;
+
+    td_prep_write(&req->tiocb, state->fd, treq.buf,
+                  nbytes, pos, tdqcow_complete, req);
+    td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) nb_sectors 512-byte sectors
+ * from in_buf into out_buf with AES-CBC.  Each sector gets its own IV:
+ * the little-endian sector number in the low 8 bytes, zero in the rest.
+ *
+ * The crypt function is compatible with the linux cryptoloop
+ * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ * supported.
+ */
+static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+    union {
+        uint64_t ll[2];
+        uint8_t b[16];
+    } ivec;
+    int n;
+
+    for (n = 0; n < nb_sectors; n++) {
+        const size_t off = (size_t)n * 512;
+
+        ivec.ll[0] = cpu_to_le64(sector_num + n);
+        ivec.ll[1] = 0;
+        AES_cbc_encrypt(in_buf + off, out_buf + off, 512, key,
+                        ivec.b, enc);
+    }
+}
+
+/*
+ * Resize the file behind fd to `length` rounded up to a whole number of
+ * DEFAULT_SECTOR_SIZE sectors.
+ *
+ * Growing is done by synchronously appending zeroes.  Shrinking uses
+ * ftruncate() and only happens when `sparse` is non-zero.  Block devices
+ * are left untouched.
+ *
+ * Returns 0 on success, -1 on any fstat/allocation/seek/write/truncate
+ * failure.
+ */
+int qtruncate(int fd, off_t length, int sparse)
+{
+    int ret;
+    uint64_t i, sectors, current;
+    size_t rem;
+    struct stat st;
+    char *buf;
+
+    /* If length is greater than the current file len
+     * we synchronously write zeroes to the end of the
+     * file, otherwise we truncate the length down
+     */
+    ret = fstat(fd, &st);
+    if (ret == -1)
+        return -1;
+    if (S_ISBLK(st.st_mode))
+        return 0;
+
+    /* kept 64 bits wide so that very large files do not overflow */
+    sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+    current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+    rem = st.st_size % DEFAULT_SECTOR_SIZE;
+
+    /* If we are extending this file, we write zeros to the end --
+     * this tries to ensure that the extents allocated wind up being
+     * contiguous on disk.
+     */
+    if ((uint64_t)st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
+        /*We are extending the file*/
+        if ((ret = posix_memalign((void **)&buf,
+                                  512, DEFAULT_SECTOR_SIZE))) {
+            DPRINTF("posix_memalign failed: %d\n", ret);
+            return -1;
+        }
+        memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
+        if (lseek(fd, 0, SEEK_END)==-1) {
+            DPRINTF("Lseek EOF failed (%d), internal error\n",
+                    errno);
+            free(buf);
+            return -1;
+        }
+        if (rem) {
+            /* pad the trailing partial sector up to the boundary that
+             * `current` already accounts for */
+            size_t pad = DEFAULT_SECTOR_SIZE - rem;
+
+            ret = write(fd, buf, pad);
+            if (ret != (int)pad) {
+                DPRINTF("write failed: ret = %d, err = %s\n",
+                        ret, strerror(errno));
+                free(buf);
+                return -1;
+            }
+        }
+        for (i = current; i < sectors; i++ ) {
+            ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
+            if (ret != DEFAULT_SECTOR_SIZE) {
+                DPRINTF("write failed: ret = %d, err = %s\n",
+                        ret, strerror(errno));
+                free(buf);
+                return -1;
+            }
+        }
+        free(buf);
+    } else if (sparse &&
+               ((uint64_t)st.st_size > sectors * DEFAULT_SECTOR_SIZE)) {
+        if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
+            DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Translate a guest byte offset into the offset of its cluster in the
+ * image file, walking the L1 table and the (cached) L2 table and
+ * optionally allocating what is missing.
+ *
+ * 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * return 0 if not allocated.
+ *
+ * 0 is also returned when truncating the file, allocating a bounce
+ * buffer, or reading/writing the L1 or L2 table fails.
+ * NOTE(review): three write-failure paths below return -1, which
+ * becomes UINT64_MAX in this uint64_t function; confirm how callers
+ * treat that value.
+ */
+static uint64_t get_cluster_offset(struct tdqcow_state *s,
+ uint64_t offset, int allocate,
+ int compressed_size,
+ int n_start, int n_end)
+{
+ int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
+ char *tmp_ptr2, *l2_ptr, *l1_ptr;
+ uint64_t *tmp_ptr;
+ uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+ uint32_t min_count;
+ int new_l2_table;
+
+ /*Check L1 table for the extent offset*/
+ l1_index = offset >> (s->l2_bits + s->cluster_bits);
+ l2_offset = s->l1_table[l1_index];
+ new_l2_table = 0;
+ if (!l2_offset) {
+ if (!allocate)
+ return 0;
+ /*
+ * allocating a new l2 entry + extent
+ * at the end of the file, we must also
+ * update the L1 entry safely.
+ */
+ l2_offset = s->fd_end;
+
+ /* round to cluster size */
+ l2_offset = (l2_offset + s->cluster_size - 1)
+ & ~(s->cluster_size - 1);
+
+ /* update the L1 entry */
+ s->l1_table[l1_index] = l2_offset;
+
+ /*Truncate file for L2 table
+ *(initialised to zero in case we crash)*/
+ if (qtruncate(s->fd,
+ l2_offset + (s->l2_size * sizeof(uint64_t)),
+ s->sparse) != 0) {
+ DPRINTF("ERROR truncating file\n");
+ return 0;
+ }
+ s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
+
+ /*Update the L1 table entry on disk
+ * (for O_DIRECT we write 4KByte blocks)*/
+ l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
+ l1_ptr = (char *)s->l1_table + (l1_sector << 12);
+
+ if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
+ DPRINTF("ERROR allocating memory for L1 table\n");
+ /* tmp_ptr is not valid here; do not fall through to memcpy */
+ return 0;
+ }
+ memcpy(tmp_ptr, l1_ptr, 4096);
+
+ /* Convert block to write to big endian */
+ for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
+ cpu_to_be64s(&tmp_ptr[i]);
+ }
+
+ /*
+ * Issue non-asynchronous L1 write.
+ * For safety, we must ensure that
+ * entry is written before blocks.
+ */
+ lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
+ if (write(s->fd, tmp_ptr, 4096) != 4096) {
+ free(tmp_ptr);
+ return 0;
+ }
+ free(tmp_ptr);
+
+ new_l2_table = 1;
+ goto cache_miss;
+ } else if (s->min_cluster_alloc == s->l2_size) {
+ /*Fast-track the request*/
+ cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ return cluster_offset + (l2_index * s->cluster_size);
+ }
+
+ /*Check to see if L2 entry is already cached*/
+ for (i = 0; i < L2_CACHE_SIZE; i++) {
+ if (l2_offset == s->l2_cache_offsets[i]) {
+ /* increment the hit count */
+ if (++s->l2_cache_counts[i] == 0xffffffff) {
+ for (j = 0; j < L2_CACHE_SIZE; j++) {
+ s->l2_cache_counts[j] >>= 1;
+ }
+ }
+ l2_table = s->l2_cache + (i << s->l2_bits);
+ goto found;
+ }
+ }
+
+cache_miss:
+ /* not found: load a new entry in the least used one */
+ min_index = 0;
+ min_count = 0xffffffff;
+ for (i = 0; i < L2_CACHE_SIZE; i++) {
+ if (s->l2_cache_counts[i] < min_count) {
+ min_count = s->l2_cache_counts[i];
+ min_index = i;
+ }
+ }
+ l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+ /*If extent pre-allocated, read table from disk,
+ *otherwise write new table to disk*/
+ if (new_l2_table) {
+ /*Should we allocate the whole extent? Adjustable parameter.*/
+ if (s->cluster_alloc == s->l2_size) {
+ cluster_offset = l2_offset +
+ (s->l2_size * sizeof(uint64_t));
+ cluster_offset = (cluster_offset + s->cluster_size - 1)
+ & ~(s->cluster_size - 1);
+ if (qtruncate(s->fd, cluster_offset +
+ (s->cluster_size * s->l2_size),
+ s->sparse) != 0) {
+ DPRINTF("ERROR truncating file\n");
+ return 0;
+ }
+ s->fd_end = cluster_offset +
+ (s->cluster_size * s->l2_size);
+ for (i = 0; i < s->l2_size; i++) {
+ l2_table[i] = cpu_to_be64(cluster_offset +
+ (i*s->cluster_size));
+ }
+ } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+
+ lseek(s->fd, l2_offset, SEEK_SET);
+ if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+ s->l2_size * sizeof(uint64_t))
+ return 0;
+ } else {
+ lseek(s->fd, l2_offset, SEEK_SET);
+ if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+ s->l2_size * sizeof(uint64_t))
+ return 0;
+ }
+
+ /*Update the cache entries*/
+ s->l2_cache_offsets[min_index] = l2_offset;
+ s->l2_cache_counts[min_index] = 1;
+
+found:
+ /*The extent is split into 's->l2_size' blocks of
+ *size 's->cluster_size'*/
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+ if (!cluster_offset ||
+ ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
+ if (!allocate)
+ return 0;
+
+ if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+ (n_end - n_start) < s->cluster_sectors) {
+ /* cluster is already allocated but compressed, we must
+ decompress it in the case it is not completely
+ overwritten */
+ if (decompress_cluster(s, cluster_offset) < 0)
+ return 0;
+ cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
+ cluster_offset = (cluster_offset + s->cluster_size - 1)
+ & ~(s->cluster_size - 1);
+ /* write the cluster content - not asynchronous */
+ lseek(s->fd, cluster_offset, SEEK_SET);
+ if (write(s->fd, s->cluster_cache, s->cluster_size) !=
+ s->cluster_size)
+ /* NOTE(review): -1 becomes UINT64_MAX here, not 0 */
+ return -1;
+ } else {
+ /* allocate a new cluster */
+ cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
+ if (allocate == 1) {
+ /* round to cluster size */
+ cluster_offset =
+ (cluster_offset + s->cluster_size - 1)
+ & ~(s->cluster_size - 1);
+ if (qtruncate(s->fd, cluster_offset +
+ s->cluster_size, s->sparse)!=0) {
+ DPRINTF("ERROR truncating file\n");
+ return 0;
+ }
+ s->fd_end = (cluster_offset + s->cluster_size);
+ /* if encrypted, we must initialize the cluster
+ content which won't be written */
+ if (s->crypt_method &&
+ (n_end - n_start) < s->cluster_sectors) {
+ uint64_t start_sect;
+ start_sect = (offset &
+ ~(s->cluster_size - 1))
+ >> 9;
+ memset(s->cluster_data + 512,
+ 0xaa, 512);
+ for (i = 0; i < s->cluster_sectors;i++)
+ {
+ if (i < n_start || i >= n_end)
+ {
+ encrypt_sectors(s, start_sect + i,
+ s->cluster_data,
+ s->cluster_data + 512, 1, 1,
+ &s->aes_encrypt_key);
+ lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
+ if (write(s->fd, s->cluster_data, 512) != 512)
+ /* NOTE(review): -1 becomes UINT64_MAX here, not 0 */
+ return -1;
+ }
+ }
+ }
+ } else {
+ cluster_offset |= QCOW_OFLAG_COMPRESSED |
+ (uint64_t)compressed_size
+ << (63 - s->cluster_bits);
+ }
+ }
+ /* update L2 table */
+ tmp = cpu_to_be64(cluster_offset);
+ l2_table[l2_index] = tmp;
+
+ /*For IO_DIRECT we write 4KByte blocks*/
+ l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
+ l2_ptr = (char *)l2_table + (l2_sector << 12);
+
+ if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
+ DPRINTF("ERROR allocating memory for L2 table\n");
+ /* tmp_ptr2 is not valid here; do not fall through to memcpy */
+ return 0;
+ }
+ memcpy(tmp_ptr2, l2_ptr, 4096);
+ lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
+ if (write(s->fd, tmp_ptr2, 4096) != 4096) {
+ free(tmp_ptr2);
+ /* NOTE(review): -1 becomes UINT64_MAX here, not 0 */
+ return -1;
+ }
+ free(tmp_ptr2);
+ }
+ return cluster_offset;
+}
+
+/*
+ * Report whether the cluster holding @sector_num has an offset in this image.
+ *
+ * *pnum receives the number of sectors, starting at @sector_num and capped
+ * at @nb_sectors, that remain in that cluster.  Returns non-zero when the
+ * lookup produced a cluster offset and zero otherwise.
+ */
+static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
+			     int nb_sectors, int *pnum)
+{
+	/* third argument 0: plain look-up (presumably the "allocate" flag) */
+	uint64_t found = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
+	int first = sector_num & (s->cluster_sectors - 1);
+	int remain = s->cluster_sectors - first;
+
+	*pnum = (remain > nb_sectors) ? nb_sectors : remain;
+
+	return (found != 0);
+}
+
+/*
+ * Inflate @buf_size bytes from @buf into @out_buf.
+ *
+ * Succeeds only when exactly @out_buf_size bytes were produced and zlib
+ * reported Z_STREAM_END or Z_BUF_ERROR.  Returns 0 on success, -1 otherwise.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+			     const uint8_t *buf, int buf_size)
+{
+	z_stream zs;
+	int rc, produced, ok;
+
+	memset(&zs, 0, sizeof(zs));
+	zs.next_in   = (uint8_t *)buf;
+	zs.avail_in  = buf_size;
+	zs.next_out  = out_buf;
+	zs.avail_out = out_buf_size;
+
+	/* negative window bits select a raw deflate stream (no zlib header) */
+	if (inflateInit2(&zs, -12) != Z_OK)
+		return -1;
+
+	rc = inflate(&zs, Z_FINISH);
+	produced = zs.next_out - out_buf;
+	inflateEnd(&zs);
+
+	ok = (rc == Z_STREAM_END || rc == Z_BUF_ERROR) &&
+	     (produced == out_buf_size);
+
+	return ok ? 0 : -1;
+}
+
+/*
+ * Make s->cluster_cache hold the decompressed contents of the compressed
+ * cluster described by the L2 entry @cluster_offset.
+ *
+ * The entry carries the file offset in its low bits (s->cluster_offset_mask)
+ * and the compressed length above bit (63 - cluster_bits).  Nothing is read
+ * when s->cluster_cache_offset already matches.  Returns 0 on success and -1
+ * on a short read or a decompression failure.
+ */
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
+{
+	uint64_t file_off = cluster_offset & s->cluster_offset_mask;
+	int packed_len, got;
+
+	/* cache hit: the cluster was decompressed by an earlier call */
+	if (s->cluster_cache_offset == file_off)
+		return 0;
+
+	packed_len = cluster_offset >> (63 - s->cluster_bits);
+	packed_len &= (s->cluster_size - 1);
+
+	lseek(s->fd, file_off, SEEK_SET);
+	got = read(s->fd, s->cluster_data, packed_len);
+	if (got != packed_len)
+		return -1;
+
+	if (decompress_buffer(s->cluster_cache, s->cluster_size,
+			      s->cluster_data, packed_len) < 0)
+		return -1;
+
+	s->cluster_cache_offset = file_off;
+	return 0;
+}
+
+/*
+ * Read the on-disk QCOW header from @fd into @header and convert its
+ * multi-byte fields from big endian to host byte order.
+ *
+ * The read goes through a 512-byte aligned bounce buffer whose size is a
+ * multiple of 512 (tdqcow_open opens the image with O_DIRECT).  A file
+ * shorter than the buffer is accepted as long as read() returns everything
+ * the file holds; the missing tail then reads as zeroes.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+tdqcow_read_header(int fd, QCowHeader *header)
+{
+	int err;
+	char *buf;
+	ssize_t got;
+	struct stat st;
+	size_t size, expected;
+
+	memset(header, 0, sizeof(*header));
+
+	if (fstat(fd, &st))
+		return -errno;
+
+	if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+		return -errno;
+
+	size = (sizeof(*header) + 511) & ~511;
+	err = posix_memalign((void **)&buf, 512, size);
+	if (err)
+		return -err;	/* posix_memalign() returns a positive errno */
+
+	/* never hand uninitialised heap bytes to the caller on a short file */
+	memset(buf, 0, size);
+
+	expected = size;
+	if (st.st_size >= 0 && (uint64_t)st.st_size < size)
+		expected = st.st_size;
+
+	errno = 0;
+	got = read(fd, buf, size);
+	if (got < 0 || (size_t)got != expected) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	memcpy(header, buf, sizeof(*header));
+	be32_to_cpus(&header->magic);
+	be32_to_cpus(&header->version);
+	be64_to_cpus(&header->backing_file_offset);
+	be32_to_cpus(&header->backing_file_size);
+	be32_to_cpus(&header->mtime);
+	be64_to_cpus(&header->size);
+	be32_to_cpus(&header->crypt_method);
+	be64_to_cpus(&header->l1_table_offset);
+
+	err = 0;
+
+out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Allocate s->l1_table and load the L1 table described by @header.
+ *
+ * The first l1_table_block bytes of the image (header, optional Xen
+ * extended header and L1 table) are read in one go through an aligned
+ * bounce buffer.  When a Xen extended header is present, an L1 table not
+ * yet flagged EXTHDR_L1_BIG_ENDIAN is converted and written back, and the
+ * sparse / min_cluster_alloc settings are taken from the extended header if
+ * the stored checksum matches.  On return the table is in host byte order.
+ *
+ * Returns 0 on success or a negative errno value.  On failure s->l1_table
+ * is freed and reset to NULL.  The bounce buffer is always released.
+ */
+static int
+tdqcow_load_l1_table(struct tdqcow_state *s, QCowHeader *header)
+{
+	char *buf;
+	struct stat st;
+	ssize_t got;
+	size_t expected, copy_len;
+	int i, err, shift;
+	QCowHeader_ext *exthdr;
+	uint32_t l1_table_bytes, l1_table_block, l1_table_size;
+
+	buf = NULL;
+	s->l1_table = NULL;
+
+	shift = s->cluster_bits + s->l2_bits;
+
+	s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
+	s->l1_table_offset = header->l1_table_offset;
+
+	s->min_cluster_alloc = 1; /* default */
+
+	l1_table_bytes = s->l1_size * sizeof(uint64_t);
+	l1_table_size = (l1_table_bytes + 4095) & ~4095;
+	l1_table_block = (l1_table_bytes + s->l1_table_offset + 4095) & ~4095;
+
+	DPRINTF("L1 Table offset detected: %"PRIu64", size %d (%d)\n",
+		(uint64_t)s->l1_table_offset,
+		(int) (s->l1_size * sizeof(uint64_t)),
+		l1_table_size);
+
+	/*
+	 * l1_table_block is only 32 bits wide: refuse a table that does not
+	 * fit rather than index past the bounce buffer further down.
+	 */
+	if ((uint64_t)s->l1_table_offset + l1_table_bytes > l1_table_block) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (fstat(s->fd, &st)) {
+		err = -errno;
+		goto out;
+	}
+
+	if (lseek(s->fd, 0, SEEK_SET) == (off_t)-1) {
+		err = -errno;
+		goto out;
+	}
+
+	err = posix_memalign((void **)&buf, 512, l1_table_block);
+	if (err) {
+		buf = NULL;
+		err = -err;	/* posix_memalign() returns a positive errno */
+		goto out;
+	}
+
+	err = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
+	if (err) {
+		s->l1_table = NULL;
+		err = -err;
+		goto out;
+	}
+
+	memset(buf, 0, l1_table_block);
+	memset(s->l1_table, 0, l1_table_size);
+
+	expected = l1_table_block;
+	if (st.st_size >= 0 && (uint64_t)st.st_size < l1_table_block)
+		expected = st.st_size;
+
+	errno = 0;
+	got = read(s->fd, buf, l1_table_block);
+	if (got < 0 || (size_t)got != expected) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	/*
+	 * Copy the padded table, clamped to what buf really holds: with an
+	 * L1 offset that is not 4 KiB aligned, offset + l1_table_size can
+	 * reach past the end of the l1_table_block-byte buffer.
+	 */
+	copy_len = l1_table_size;
+	if (copy_len > l1_table_block - s->l1_table_offset)
+		copy_len = l1_table_block - s->l1_table_offset;
+	memcpy(s->l1_table, buf + s->l1_table_offset, copy_len);
+	exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
+
+	/* check for xen extended header */
+	if (s->l1_table_offset % 4096 == 0 &&
+	    be32_to_cpu(exthdr->xmagic) == XEN_MAGIC) {
+		uint32_t flags = be32_to_cpu(exthdr->flags);
+		uint32_t cksum = be32_to_cpu(exthdr->cksum);
+
+		/*
+		 * Images whose extended header lacks EXTHDR_L1_BIG_ENDIAN
+		 * (old tapdisk images) have their L1 table converted and
+		 * rewritten here, so that past this block the table is
+		 * always in big endian.
+		 */
+		if (!(flags & EXTHDR_L1_BIG_ENDIAN)) {
+			DPRINTF("qcow: converting to big endian L1 table\n");
+
+			/* convert to big endian */
+			for (i = 0; i < s->l1_size; i++)
+				cpu_to_be64s(&s->l1_table[i]);
+
+			flags |= EXTHDR_L1_BIG_ENDIAN;
+			exthdr->flags = cpu_to_be32(flags);
+
+			memcpy(buf + s->l1_table_offset,
+			       s->l1_table, copy_len);
+
+			if (lseek(s->fd, 0, SEEK_SET) == (off_t)-1) {
+				err = -errno;
+				goto out;
+			}
+
+			errno = 0;
+			err = atomicio(vwrite, s->fd, buf, l1_table_block);
+			if (err != l1_table_block) {
+				/* a short write must not look like success */
+				err = (errno ? -errno : -EIO);
+				goto out;
+			}
+		}
+
+		/* check the L1 table checksum */
+		if (cksum != gen_cksum((char *)s->l1_table,
+				       s->l1_size * sizeof(uint64_t)))
+			DPRINTF("qcow: bad L1 checksum\n");
+		else {
+			s->extended = 1;
+			s->sparse = (be32_to_cpu(exthdr->flags) & SPARSE_FILE);
+			s->min_cluster_alloc =
+				be32_to_cpu(exthdr->min_cluster_alloc);
+		}
+	}
+
+	/* convert L1 table to native endian for operation */
+	for (i = 0; i < s->l1_size; i++)
+		be64_to_cpus(&s->l1_table[i]);
+
+	err = 0;
+
+out:
+	/* the bounce buffer is scratch space only: release it on every path */
+	free(buf);
+	if (err) {
+		free(s->l1_table);
+		s->l1_table = NULL;
+	}
+	return err;
+}
+
+/*
+ * Open the disk file and initialize qcow state.
+ *
+ * @driver: driver instance; driver->data holds the struct tdqcow_state and
+ *          driver->info receives the disk geometry.
+ * @name:   path of the image file.
+ * @flags:  TD_OPEN_RDONLY opens read-only, anything else read-write.
+ *
+ * Only version QCOW_VERSION images are accepted; version 2 is rejected.
+ * Returns 0 on success and -1 on failure.  On failure everything allocated
+ * here is released and the matching state pointers are reset to NULL.
+ */
+int tdqcow_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	int fd, i, ret, size, o_flags;
+	td_disk_info_t *bs = &(driver->info);
+	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+	QCowHeader header;
+	uint64_t final_cluster = 0;
+
+	DPRINTF("QCOW: Opening %s\n", name);
+
+	o_flags = O_DIRECT | O_LARGEFILE |
+		((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+	fd = open(name, o_flags);
+	if (fd < 0) {
+		DPRINTF("Unable to open %s (%d)\n", name, -errno);
+		return -1;
+	}
+
+	/*
+	 * Start from a known state so the fail path below only ever frees
+	 * what this function allocated.
+	 */
+	s->fd = fd;
+	s->l1_table = NULL;
+	s->l2_cache = NULL;
+	s->cluster_cache = NULL;
+	s->cluster_data = NULL;
+
+	s->name = strdup(name);
+	if (!s->name)
+		goto fail;
+
+	if (tdqcow_read_header(fd, &header))
+		goto fail;
+
+	if (header.magic != QCOW_MAGIC)
+		goto fail;
+
+	switch (header.version) {
+	case QCOW_VERSION:
+		break;
+	case 2:
+		/* TODO: port qcow2 to the new blktap framework. */
+		goto fail;
+	default:
+		goto fail;
+	}
+
+	/* NOTE(review): cluster_bits / l2_bits have no upper bound check here,
+	 * yet both are used as shift counts below - confirm against format */
+	if (header.size <= 1 || header.cluster_bits < 9)
+		goto fail;
+	if (header.crypt_method > QCOW_CRYPT_AES)
+		goto fail;
+	s->crypt_method_header = header.crypt_method;
+	if (s->crypt_method_header)
+		s->encrypted = 1;
+	s->cluster_bits = header.cluster_bits;
+	s->cluster_size = 1 << s->cluster_bits;
+	s->cluster_sectors = 1 << (s->cluster_bits - 9);
+	s->l2_bits = header.l2_bits;
+	s->l2_size = 1 << s->l2_bits;
+	s->cluster_alloc = s->l2_size;
+	bs->size = header.size / 512;
+	s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+	s->backing_file_offset = header.backing_file_offset;
+	s->backing_file_size = header.backing_file_size;
+
+	/* allocate and load l1 table */
+	if (tdqcow_load_l1_table(s, &header))
+		goto fail;
+
+	/* alloc L2 cache */
+	size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
+	ret = posix_memalign((void **)&s->l2_cache, 4096, size);
+	if (ret != 0) {
+		s->l2_cache = NULL;
+		goto fail;
+	}
+
+	size = s->cluster_size;
+	ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
+	if (ret != 0) {
+		s->cluster_cache = NULL;
+		goto fail;
+	}
+
+	ret = posix_memalign((void **)&s->cluster_data, 4096, size);
+	if (ret != 0) {
+		s->cluster_data = NULL;
+		goto fail;
+	}
+	s->cluster_cache_offset = -1;
+
+	if (s->backing_file_offset != 0)
+		s->cluster_alloc = 1; /*Cannot use pre-alloc*/
+
+	bs->sector_size = 512;
+	bs->info = 0;
+
+	/* highest L1 entry; zero means no L2 table has been allocated yet */
+	for (i = 0; i < s->l1_size; i++)
+		if (s->l1_table[i] > final_cluster)
+			final_cluster = s->l1_table[i];
+
+	if (init_aio_state(driver) != 0) {
+		DPRINTF("Unable to initialise AIO state\n");
+		/* free_aio_state() runs once, in the fail path */
+		goto fail;
+	}
+
+	if (!final_cluster)
+		s->fd_end = s->l1_table_offset +
+			((s->l1_size * sizeof(uint64_t) + 4095) & ~4095);
+	else {
+		s->fd_end = lseek64(fd, 0, SEEK_END);
+		if (s->fd_end == (off64_t)-1)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	DPRINTF("QCOW Open failed\n");
+
+	free_aio_state(s);
+	free(s->name);
+	s->name = NULL;
+	free(s->l1_table);
+	s->l1_table = NULL;
+	free(s->l2_cache);
+	s->l2_cache = NULL;
+	free(s->cluster_cache);
+	s->cluster_cache = NULL;
+	free(s->cluster_data);
+	s->cluster_data = NULL;
+	close(fd);
+	return -1;
+}
+
+/*
+ * Queue a read request, one cluster-sized piece at a time.
+ *
+ * For each piece the cluster offset is looked up without allocating:
+ *  - no offset: the piece is handed to td_forward_request();
+ *  - compressed cluster: it is decompressed into s->cluster_cache, copied
+ *    out and completed immediately;
+ *  - otherwise an asynchronous read of the image file is queued.
+ * When no AIO slot is free the request is completed with -EBUSY.
+ */
+void tdqcow_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+	td_request_t aio_req = treq;
+	char *dst = treq.buf;
+	uint64_t cur = treq.sec;
+	uint64_t left = treq.secs;
+	uint64_t where;
+	int first, span;
+
+	for (; left > 0; left -= span, cur += span, dst += span * 512) {
+		where = get_cluster_offset(s, cur << 9, 0, 0, 0, 0);
+		first = cur & (s->cluster_sectors - 1);
+		span = s->cluster_sectors - first;
+		if (span > left)
+			span = left;
+
+		if (s->aio_free_count == 0) {
+			td_complete_request(treq, -EBUSY);
+			return;
+		}
+
+		if (!where) {
+			/* nothing stored here for this range */
+			treq.buf = dst;
+			treq.sec = cur;
+			treq.secs = span;
+			td_forward_request(treq);
+			continue;
+		}
+
+		if (where & QCOW_OFLAG_COMPRESSED) {
+			if (decompress_cluster(s, where) < 0) {
+				td_complete_request(treq, -EIO);
+				return;
+			}
+			memcpy(dst, s->cluster_cache + first * 512,
+			       512 * span);
+
+			treq.buf = dst;
+			treq.sec = cur;
+			treq.secs = span;
+			td_complete_request(treq, 0);
+			continue;
+		}
+
+		/* plain cluster: read it from its place in the image file */
+		aio_req.buf = dst;
+		aio_req.sec = (where >> 9) + first;
+		aio_req.secs = span;
+		async_read(driver, aio_req);
+	}
+}
+
+/*
+ * Queue a write request, one cluster-sized piece at a time.
+ *
+ * Each piece gets a cluster offset from get_cluster_offset() (third
+ * argument 1) and is then queued as an asynchronous write to the image
+ * file.  A request that cannot get a cluster is completed with -EIO, and
+ * one that finds no free AIO slot with -EBUSY.  After the last piece the
+ * compressed-cluster cache is invalidated.
+ */
+void tdqcow_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+	td_request_t aio_req = treq;
+	char *src = treq.buf;
+	uint64_t cur = treq.sec;
+	uint64_t left = treq.secs;
+	uint64_t where;
+	int first, span;
+
+	for (; left > 0; left -= span, cur += span, src += span * 512) {
+		first = cur & (s->cluster_sectors - 1);
+		span = s->cluster_sectors - first;
+		if (span > left)
+			span = left;
+
+		if (s->aio_free_count == 0) {
+			td_complete_request(treq, -EBUSY);
+			return;
+		}
+
+		where = get_cluster_offset(s, cur << 9, 1, 0,
+					   first, first + span);
+		if (!where) {
+			DPRINTF("Ooops, no write cluster offset!\n");
+			td_complete_request(treq, -EIO);
+			return;
+		}
+
+		/*
+		 * NOTE(review): encrypt_sectors() is handed s->cluster_data,
+		 * while the request queued below still points at the
+		 * caller's buffer - confirm this is intended.
+		 */
+		if (s->crypt_method)
+			encrypt_sectors(s, cur, s->cluster_data,
+					(unsigned char *)src, span, 1,
+					&s->aes_encrypt_key);
+
+		aio_req.buf = src;
+		aio_req.sec = (where >> 9) + first;
+		aio_req.secs = span;
+		async_write(driver, aio_req);
+	}
+
+	s->cluster_cache_offset = -1; /* disable compressed cache */
+}
+
+/*
+ * Recompute the L1 table checksum and store it in the Xen extended header.
+ *
+ * Does nothing for images without an extended header.  The image is
+ * reopened without O_DIRECT so that the 4-byte field can be written.  The
+ * checksum is taken over the big endian form of the table, which is
+ * converted back to host order afterwards.
+ *
+ * Returns 0 on success, otherwise the errno value of the failing call.
+ */
+static int
+tdqcow_update_checksum(struct tdqcow_state *s)
+{
+	int idx, fd, err = 0;
+	uint32_t pos, sum, be_sum;
+
+	if (!s->extended)
+		return 0;
+
+	pos = sizeof(QCowHeader) + offsetof(QCowHeader_ext, cksum);
+
+	fd = open(s->name, O_WRONLY | O_LARGEFILE); /* open without O_DIRECT */
+	if (fd == -1) {
+		err = errno;
+	} else if (lseek(fd, pos, SEEK_SET) == (off_t)-1) {
+		err = errno;
+	} else {
+		/* convert to big endian for checksum */
+		for (idx = 0; idx < s->l1_size; idx++)
+			cpu_to_be64s(&s->l1_table[idx]);
+
+		sum = gen_cksum((char *)s->l1_table,
+				s->l1_size * sizeof(uint64_t));
+
+		/* and back again... */
+		for (idx = 0; idx < s->l1_size; idx++)
+			be64_to_cpus(&s->l1_table[idx]);
+
+		DPRINTF("Writing cksum: %d", sum);
+
+		be_sum = cpu_to_be32(sum);
+		if (write(fd, &be_sum, sizeof(be_sum)) != sizeof(be_sum))
+			err = errno;
+	}
+
+	if (err)
+		DPRINTF("failed to update checksum: %d\n", err);
+	if (fd != -1)
+		close(fd);
+	return err;
+}
+
+/*
+ * Close a qcow image: refresh the extended-header checksum, then release
+ * the AIO state, every buffer held in the driver state and the file
+ * descriptor.  Always returns 0.
+ */
+int tdqcow_close(td_driver_t *driver)
+{
+	struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+
+	/* must run first: it still needs the name and the L1 table */
+	tdqcow_update_checksum(state);
+
+	free_aio_state(state);
+
+	free(state->name);
+	free(state->l1_table);
+	free(state->l2_cache);
+	free(state->cluster_cache);
+	free(state->cluster_data);
+
+	close(state->fd);
+
+	return 0;
+}
+
+/*
+ * Create a new qcow image file with a Xen extended header.
+ *
+ * @filename:     path of the image to create (truncated if it exists).
+ * @total_size:   requested virtual disk size; it is shifted right by
+ *                SECTOR_SHIFT to get the sector count.  Replaced by the
+ *                size of the backing file when one is used.
+ * @backing_file: optional backing file.  NULL, or the literal "fat:",
+ *                records no backing file name.
+ * @sparse:       0 pre-extends the file to its full length, anything else
+ *                marks the image SPARSE_FILE.
+ *
+ * Images created with a backing_file argument use 512-byte clusters,
+ * others use 4 KiB clusters.  The L1 table is written zero filled at the
+ * first 4 KiB boundary after the headers.
+ *
+ * Returns 0 on success and -1 on failure.  The descriptor is closed on
+ * every path.
+ */
+int qcow_create(const char *filename, uint64_t total_size,
+		const char *backing_file, int sparse)
+{
+	int fd, header_size, backing_filename_len, l1_size, i;
+	int shift, length, flags = 0;
+	QCowHeader header;
+	QCowHeader_ext exthdr;
+	char backing_filename[PATH_MAX], *ptr;
+	uint64_t tmp, size, total_length;
+	struct stat st;
+
+	DPRINTF("Qcow_create: size %"PRIu64"\n",total_size);
+
+	fd = open(filename,
+		  O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+		  0644);
+	if (fd < 0)
+		return -1;
+
+	/* both headers are written to disk verbatim: no stack garbage */
+	memset(&header, 0, sizeof(header));
+	memset(&exthdr, 0, sizeof(exthdr));
+	/* st.st_mtime is read below even when no stat() was performed */
+	memset(&st, 0, sizeof(st));
+
+	header.magic = cpu_to_be32(QCOW_MAGIC);
+	header.version = cpu_to_be32(QCOW_VERSION);
+
+	/*Create extended header fields*/
+	exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
+
+	header_size = sizeof(header) + sizeof(QCowHeader_ext);
+	backing_filename_len = 0;
+	size = (total_size >> SECTOR_SHIFT);
+	if (backing_file) {
+		if (strcmp(backing_file, "fat:")) {
+			const char *p;
+			/* XXX: this is a hack: we do not attempt to
+			 *check for URL like syntax */
+			p = strchr(backing_file, ':');
+			if (p && (p - backing_file) >= 2) {
+				/* URL like but exclude "c:" like filenames */
+				if (strlen(backing_file) >=
+				    sizeof(backing_filename))
+					goto fail;	/* would be truncated */
+				strncpy(backing_filename, backing_file,
+					sizeof(backing_filename) - 1);
+				backing_filename[sizeof(backing_filename) - 1] =
+					'\0';
+			} else {
+				if (realpath(backing_file, backing_filename) == NULL ||
+				    stat(backing_filename, &st) != 0) {
+					goto fail;
+				}
+			}
+			header.backing_file_offset = cpu_to_be64(header_size);
+			backing_filename_len = strlen(backing_filename);
+			header.backing_file_size = cpu_to_be32(
+				backing_filename_len);
+			header_size += backing_filename_len;
+
+			/*Set to the backing file size*/
+			if(get_filesize(backing_filename, &size, &st)) {
+				goto fail;
+			}
+			DPRINTF("Backing file size detected: %"PRId64" sectors"
+				"(total %"PRId64" [%"PRId64" MB])\n",
+				size,
+				(uint64_t)(size << SECTOR_SHIFT),
+				(uint64_t)(size >> 11));
+		} else {
+			backing_file = NULL;
+			DPRINTF("Setting file size: %"PRId64" (total %"PRId64")\n",
+				total_size,
+				(uint64_t) (total_size << SECTOR_SHIFT));
+		}
+		header.mtime = cpu_to_be32(st.st_mtime);
+		header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+					    unmodified sectors */
+		header.l2_bits = 12; /* 32 KB L2 tables */
+		exthdr.min_cluster_alloc = cpu_to_be32(1);
+	} else {
+		DPRINTF("Setting file size: %"PRId64" sectors"
+			"(total %"PRId64" [%"PRId64" MB])\n",
+			size,
+			(uint64_t) (size << SECTOR_SHIFT),
+			(uint64_t) (size >> 11));
+		header.cluster_bits = 12; /* 4 KB clusters */
+		header.l2_bits = 9; /* 4 KB L2 tables */
+		exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
+	}
+	/*Set the header size value*/
+	header.size = cpu_to_be64(size * 512);
+
+	/* place the L1 table on the next 4 KiB boundary */
+	header_size = (header_size + 7) & ~7;
+	if (header_size % 4096 > 0) {
+		header_size = ((header_size >> 12) + 1) << 12;
+	}
+
+	shift = header.cluster_bits + header.l2_bits;
+	l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
+
+	header.l1_table_offset = cpu_to_be64(header_size);
+	DPRINTF("L1 Table offset: %d, size %d\n",
+		header_size,
+		(int)(l1_size * sizeof(uint64_t)));
+	header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+
+	/* checksum of the (all zero) initial L1 table */
+	ptr = calloc(1, l1_size * sizeof(uint64_t));
+	if (!ptr)
+		goto fail;
+	exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
+	printf("Created cksum: %d\n",exthdr.cksum);
+	free(ptr);
+
+	/*adjust file length to system page size boundary*/
+	length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
+			 getpagesize());
+	if (qtruncate(fd, length, 0)!=0) {
+		DPRINTF("ERROR truncating file\n");
+		goto fail;
+	}
+
+	if (sparse == 0) {
+		/*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
+		total_length = length + (l1_size * (1 << 9)) + (size * 512);
+		if (qtruncate(fd, total_length, 0)!=0) {
+			DPRINTF("ERROR truncating file\n");
+			goto fail;
+		}
+		printf("File truncated to length %"PRIu64"\n",total_length);
+	} else
+		flags = SPARSE_FILE;
+
+	flags |= EXTHDR_L1_BIG_ENDIAN;
+	exthdr.flags = cpu_to_be32(flags);
+
+	/* write all the data; a short write leaves an unusable image */
+	if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+		goto fail;
+	if (write(fd, &header, sizeof(header)) != (ssize_t)sizeof(header))
+		goto fail;
+	if (write(fd, &exthdr, sizeof(exthdr)) != (ssize_t)sizeof(exthdr))
+		goto fail;
+	if (backing_file &&
+	    write(fd, backing_filename, backing_filename_len) !=
+	    backing_filename_len)
+		goto fail;
+
+	if (lseek(fd, header_size, SEEK_SET) == (off_t)-1)
+		goto fail;
+	tmp = 0;
+	for (i = 0;i < l1_size; i++) {
+		if (write(fd, &tmp, sizeof(tmp)) != (ssize_t)sizeof(tmp))
+			goto fail;
+	}
+
+	close(fd);
+
+	return 0;
+
+fail:
+	close(fd);
+	return -1;
+}
+
+/*
+ * Empty the image: zero the L1 table in memory and on disk, truncate the
+ * file right after the table and clear the L2 cache bookkeeping.
+ *
+ * Returns 0 on success and -1 when the write or the truncate fails.
+ */
+static int qcow_make_empty(struct tdqcow_state *s)
+{
+	const uint32_t table_bytes = s->l1_size * sizeof(uint64_t);
+
+	memset(s->l1_table, 0, table_bytes);
+
+	lseek(s->fd, s->l1_table_offset, SEEK_SET);
+	if (write(s->fd, s->l1_table, table_bytes) < 0)
+		return -1;
+
+	if (qtruncate(s->fd, s->l1_table_offset + table_bytes,
+		      s->sparse) != 0) {
+		DPRINTF("ERROR truncating file\n");
+		return -1;
+	}
+
+	/* no cached L2 table is valid any more */
+	memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+	memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+	memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+	return 0;
+}
+
+/* Return the cluster size recorded in the driver state. */
+static int qcow_get_cluster_size(struct tdqcow_state *s)
+{
+	int bytes = s->cluster_size;
+
+	return bytes;
+}
+
+/*
+ * Compress one cluster worth of data from @buf and store it for the
+ * cluster that holds @sector_num.
+ *
+ * XXX: put compressed sectors first, then all the cluster aligned
+ * tables to avoid losing bytes in alignment
+ *
+ * When the data does not shrink below a full cluster nothing is written
+ * (the uncompressed fallback is not implemented) and 0 is still returned.
+ * Returns -1 on allocation, zlib, cluster look-up or write failure.
+ */
+static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
+				 const uint8_t *buf)
+{
+	z_stream strm;
+	int ret, out_len;
+	uint8_t *out_buf;
+	uint64_t cluster_offset;
+
+	out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+	if (!out_buf)
+		return -1;
+
+	/* default compression level, small window, no zlib header */
+	memset(&strm, 0, sizeof(strm));
+	ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+			   Z_DEFLATED, -12,
+			   9, Z_DEFAULT_STRATEGY);
+	if (ret != 0) {
+		free(out_buf);
+		return -1;
+	}
+
+	strm.avail_in = s->cluster_size;
+	strm.next_in = (uint8_t *)buf;
+	strm.avail_out = s->cluster_size;
+	strm.next_out = out_buf;
+
+	ret = deflate(&strm, Z_FINISH);
+	if (ret != Z_STREAM_END && ret != Z_OK) {
+		free(out_buf);
+		deflateEnd(&strm);
+		return -1;
+	}
+	out_len = strm.next_out - out_buf;
+
+	deflateEnd(&strm);
+
+	if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+		/* could not compress: write normal cluster (not implemented) */
+	} else {
+		cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
+						    out_len, 0, 0);
+		/*
+		 * get_cluster_offset() signals failure with 0 or -1; going
+		 * on would write the compressed data at a bogus file offset
+		 * (offset 0 is the image header).
+		 */
+		if (!cluster_offset || cluster_offset == (uint64_t)-1) {
+			free(out_buf);
+			return -1;
+		}
+		cluster_offset &= s->cluster_offset_mask;
+		lseek(s->fd, cluster_offset, SEEK_SET);
+		if (write(s->fd, out_buf, out_len) != out_len) {
+			free(out_buf);
+			return -1;
+		}
+	}
+
+	free(out_buf);
+	return 0;
+}
+
+/*
+ * Classify @file by its first bytes: DISK_TYPE_QCOW when they carry the
+ * QCOW magic number, DISK_TYPE_AIO otherwise.
+ *
+ * Returns 0 with *type set, a negative errno value when the file cannot be
+ * opened or read, or -EIO when it is shorter than a QCOW header.
+ */
+static int
+tdqcow_get_image_type(const char *file, int *type)
+{
+	int fd, saved_errno;
+	ssize_t got;
+	QCowHeader header;
+
+	fd = open(file, O_RDONLY);
+	if (fd == -1)
+		return -errno;
+
+	errno = 0;
+	got = read(fd, &header, sizeof(header));
+	saved_errno = errno;	/* close() below may overwrite errno */
+	close(fd);
+
+	if (got < 0)
+		return (saved_errno ? -saved_errno : -EIO);
+	if ((size_t)got != sizeof(header))
+		return -EIO;	/* short read: not an error errno describes */
+
+	be32_to_cpus(&header.magic);
+	if (header.magic == QCOW_MAGIC)
+		*type = DISK_TYPE_QCOW;
+	else
+		*type = DISK_TYPE_AIO;
+
+	return 0;
+}
+
+/*
+ * Look up the parent (backing file) of a qcow image.
+ *
+ * The backing file name is read from the image in whole 512-byte sectors
+ * through an aligned buffer, and the file it names is probed to pick the
+ * driver type.
+ *
+ * Returns TD_NO_PARENT when the image records no backing file, 0 with
+ * id->name (heap allocated) and id->drivertype filled in on success, -1
+ * when the sector buffer cannot be allocated, -ENOMEM when the name cannot
+ * be duplicated, and -EINVAL on any other failure.
+ */
+int tdqcow_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+	off_t off;
+	char *buf, *filename;
+	int len, secs, type, err = -EINVAL;
+	struct tdqcow_state *child = (struct tdqcow_state *)driver->data;
+
+	if (!child->backing_file_offset)
+		return TD_NO_PARENT;
+
+	/* read the backing file name */
+	len = child->backing_file_size;
+	if (len < 0)
+		return -EINVAL;
+	off = child->backing_file_offset - (child->backing_file_offset % 512);
+	secs = (len + (child->backing_file_offset - off) + 511) >> 9;
+
+	/*
+	 * One spare sector: when the name ends exactly on a sector boundary
+	 * the terminating NUL would otherwise land one byte past the buffer.
+	 */
+	if (posix_memalign((void **)&buf, 512, (secs + 1) << 9))
+		return -1;
+
+	if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
+		goto out;
+
+	if (read(child->fd, buf, secs << 9) != secs << 9)
+		goto out;
+	filename = buf + (child->backing_file_offset - off);
+	filename[len] = '\0';
+
+	if (tdqcow_get_image_type(filename, &type))
+		goto out;
+
+	id->name = strdup(filename);
+	if (!id->name) {
+		err = -ENOMEM;
+		goto out;
+	}
+	id->drivertype = type;
+	err = 0;
+ out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Check that a parent image can back this child: both files must be
+ * stat()-able and get_filesize() must report the same size for each.
+ *
+ * Returns 0 when they match and -EINVAL otherwise.  @flags is not used.
+ */
+int tdqcow_validate_parent(td_driver_t *driver,
+			   td_driver_t *pdriver, td_flag_t flags)
+{
+	struct tdqcow_state *child = (struct tdqcow_state *)driver->data;
+	struct tdqcow_state *parent = (struct tdqcow_state *)pdriver->data;
+	uint64_t parent_sz, child_sz;
+	struct stat sb;
+
+	if (stat(parent->name, &sb) ||
+	    get_filesize(parent->name, &parent_sz, &sb))
+		return -EINVAL;
+
+	if (stat(child->name, &sb) ||
+	    get_filesize(child->name, &child_sz, &sb))
+		return -EINVAL;
+
+	return (child_sz == parent_sz) ? 0 : -EINVAL;
+}
+
+/*
+ * Driver descriptor for the qcow format: wires the tdqcow_* entry points
+ * defined in this file into a struct tap_disk.  No debug hook is provided.
+ */
+struct tap_disk tapdisk_qcow = {
+ .disk_type = "tapdisk_qcow",
+ .flags = 0,
+ .private_data_size = sizeof(struct tdqcow_state),
+ .td_open = tdqcow_open,
+ .td_close = tdqcow_close,
+ .td_queue_read = tdqcow_queue_read,
+ .td_queue_write = tdqcow_queue_write,
+ .td_get_parent_id = tdqcow_get_parent_id,
+ .td_validate_parent = tdqcow_validate_parent,
+ .td_debug = NULL,
+};
diff --git a/tools/blktap2/drivers/block-ram.c b/tools/blktap2/drivers/block-ram.c
new file mode 100644
index 0000000000..16b4ec9dc7
--- /dev/null
+++ b/tools/blktap2/drivers/block-ram.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/*
+ * State shared by every open instance of the ram driver.
+ */
+/* in-memory copy of the image; allocated and filled by tdram_open() */
+char *img;
+/* geometry cached by get_image_info() and handed to later openers */
+long int disksector_size;
+long int disksize;
+long int diskinfo;
+/* incremented by tdram_open(), decremented by tdram_close() */
+static int connections = 0;
+
+/* Per-instance private data of the ram driver. */
+struct tdram_state {
+ /* descriptor of the image file; tdram_open() stores -1 here when the
+  * image was already loaded by an earlier open */
+ int fd;
+};
+
+/*
+ * Get image size and sector size.
+ *
+ * Block devices are queried with BLKGETSIZE (and BLKSSZGET where
+ * available); regular files use the size reported by fstat().  A size of
+ * zero is replaced by MAX_RAMDISK_SIZE.  The results are stored in @info
+ * and mirrored into the file-level disksector_size / disksize / diskinfo.
+ *
+ * Returns 0 on success and -EINVAL when fstat() or BLKGETSIZE fails.
+ */
+static int get_image_info(int fd, td_disk_info_t *info)
+{
+	struct stat sb;
+
+	if (fstat(fd, &sb) != 0) {
+		DPRINTF("ERROR: fstat failed, Couldn't stat image");
+		return -EINVAL;
+	}
+
+	if (!S_ISBLK(sb.st_mode)) {
+		/*Local file? try fstat instead*/
+		info->size = (sb.st_size >> SECTOR_SHIFT);
+		info->sector_size = DEFAULT_SECTOR_SIZE;
+		DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+			"sector_shift [%llu]\n",
+			(long long unsigned)(info->size << SECTOR_SHIFT),
+			(long long unsigned)info->size);
+	} else {
+		/*Accessing block device directly*/
+		info->size = 0;
+		if (ioctl(fd,BLKGETSIZE,&info->size)!=0) {
+			DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+			return -EINVAL;
+		}
+
+		DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+			"sector_shift [%llu]\n",
+			(long long unsigned)(info->size << SECTOR_SHIFT),
+			(long long unsigned)info->size);
+
+		/*Get the sector size*/
+		info->sector_size = DEFAULT_SECTOR_SIZE;
+#if defined(BLKSSZGET)
+		ioctl(fd, BLKSSZGET, &info->sector_size);
+
+		if (info->sector_size != DEFAULT_SECTOR_SIZE)
+			DPRINTF("Note: sector size is %ld (not %d)\n",
+				info->sector_size, DEFAULT_SECTOR_SIZE);
+#endif
+	}
+
+	if (info->size == 0) {
+		info->size =((uint64_t) MAX_RAMDISK_SIZE);
+		info->sector_size = DEFAULT_SECTOR_SIZE;
+	}
+	info->info = 0;
+
+	/*Store variables locally*/
+	disksector_size = info->sector_size;
+	disksize        = info->size;
+	diskinfo        = info->info;
+	DPRINTF("Image sector_size: \n\t[%lu]\n",
+		info->sector_size);
+
+	return 0;
+}
+
+/*
+ * Open the disk file and initialize ram state.
+ *
+ * The first open reads the whole image into the file-level buffer img;
+ * later opens, while the image is still in use, only report the cached
+ * geometry and store -1 in prv->fd.
+ *
+ * Returns 0 on success.  On failure a negative value is returned (a
+ * negative errno where one is available, -1 for a short image read) and
+ * the connection count, the descriptor and the buffer are unwound so that
+ * a later open starts from scratch.
+ */
+int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	char *p;
+	uint64_t size;
+	size_t bytes, chunk, count = 0;
+	ssize_t got;
+	int fd = -1, ret = 0, o_flags;
+	struct tdram_state *prv = (struct tdram_state *)driver->data;
+
+	connections++;
+
+	if (connections > 1) {
+		driver->info.sector_size = disksector_size;
+		driver->info.size        = disksize;
+		driver->info.info        = diskinfo;
+		DPRINTF("Image already open, returning parameters:\n");
+		DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+			"sector_shift [%llu]\n",
+			(long long unsigned)(driver->info.size << SECTOR_SHIFT),
+			(long long unsigned)driver->info.size);
+		DPRINTF("Image sector_size: \n\t[%lu]\n",
+			driver->info.sector_size);
+
+		prv->fd = -1;
+		goto done;
+	}
+
+	/* first user: drop whatever an earlier session left behind */
+	free(img);
+	img = NULL;
+
+	/* Open the file */
+	o_flags = O_DIRECT | O_LARGEFILE |
+		((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+	fd = open(name, o_flags);
+
+	if ((fd == -1) && (errno == EINVAL)) {
+
+		/* Maybe O_DIRECT isn't supported. */
+		o_flags &= ~O_DIRECT;
+		fd = open(name, o_flags);
+		if (fd != -1) DPRINTF("WARNING: Accessing image without"
+				      "O_DIRECT! (%s)\n", name);
+
+	} else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
+
+	if (fd == -1) {
+		ret = 0 - errno;	/* capture before logging */
+		DPRINTF("Unable to open [%s]!\n",name);
+		goto fail;
+	}
+
+	prv->fd = fd;
+
+	ret = get_image_info(fd, &driver->info);
+	if (ret)
+		goto fail;
+	size = MAX_RAMDISK_SIZE;
+
+	if (driver->info.size > size) {
+		DPRINTF("Disk exceeds limit, must be less than [%d]MB",
+			(MAX_RAMDISK_SIZE<<SECTOR_SHIFT)>>20);
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	if (driver->info.sector_size == 0) {
+		/* a zero chunk size would never make progress below */
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	/*Read the image into memory*/
+	bytes = driver->info.size << SECTOR_SHIFT;
+	ret = posix_memalign((void **)&img, DEFAULT_SECTOR_SIZE, bytes);
+	if (ret) {
+		DPRINTF("Mem malloc failed\n");
+		img = NULL;
+		ret = -ret;	/* posix_memalign() does not set errno */
+		goto fail;
+	}
+	p = img;
+	DPRINTF("Reading %llu bytes.......",
+		(long long unsigned)driver->info.size << SECTOR_SHIFT);
+
+	/*
+	 * Read sector_size bytes at a time but never beyond the buffer,
+	 * which is sized in SECTOR_SHIFT units whatever sector_size is.
+	 */
+	while (count < bytes) {
+		chunk = driver->info.sector_size;
+		if (chunk > bytes - count)
+			chunk = bytes - count;
+		got = read(prv->fd, p, chunk);
+		if (got < 0 || (size_t)got != chunk) {
+			DPRINTF("ret = %d, errno = %d\n", (int)got, errno);
+			break;
+		}
+		count += got;
+		p = img + count;
+	}
+	DPRINTF("[%d]\n",(int)count);
+	if (count != bytes) {
+		ret = -1;
+		goto fail;
+	}
+	ret = 0;
+
+done:
+	return ret;
+
+fail:
+	/* undo everything so the next open does not see a half-open image */
+	free(img);
+	img = NULL;
+	if (fd != -1)
+		close(fd);
+	prv->fd = -1;
+	connections--;
+	return ret;
+}
+
+/*
+ * Serve a read straight from the in-memory image and complete it at once.
+ * NOTE(review): the range is not checked against the disk size here -
+ * presumably the caller guarantees it; confirm.
+ */
+void tdram_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	uint64_t start = treq.sec * (uint64_t)driver->info.sector_size;
+	int nbytes = treq.secs * driver->info.sector_size;
+
+	memcpy(treq.buf, img + start, nbytes);
+
+	td_complete_request(treq, 0);
+}
+
+/*
+ * Apply a write to the in-memory image and complete it at once.  Nothing
+ * is written to the image file here.
+ * NOTE(review): the range is not checked against the disk size here -
+ * presumably the caller guarantees it; confirm.
+ */
+void tdram_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	uint64_t start = treq.sec * (uint64_t)driver->info.sector_size;
+	int nbytes = treq.secs * driver->info.sector_size;
+
+	/* We assume that write access is controlled
+	 * at a higher level for multiple disks */
+	memcpy(img + start, treq.buf, nbytes);
+
+	td_complete_request(treq, 0);
+}
+
+/*
+ * Drop one reference to the shared in-memory image.
+ *
+ * When the last user goes away the image buffer is released; a following
+ * tdram_open() loads the image afresh.  Always returns 0.
+ * NOTE(review): the descriptor stored in the private state is not closed
+ * here; whether it holds a valid descriptor after a failed open cannot be
+ * told from this function - confirm before adding a close().
+ */
+int tdram_close(td_driver_t *driver)
+{
+	(void)driver;
+
+	/* never let the count go negative on an unbalanced close */
+	if (connections > 0)
+		connections--;
+
+	if (connections == 0) {
+		free(img);
+		img = NULL;
+	}
+
+	return 0;
+}
+
+/* The ram driver has no parent image: always reports TD_NO_PARENT. */
+int tdram_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+	(void)driver;
+	(void)id;
+
+	return TD_NO_PARENT;
+}
+
+/* No parent is ever acceptable for the ram driver: always -EINVAL. */
+int tdram_validate_parent(td_driver_t *driver,
+			  td_driver_t *pdriver, td_flag_t flags)
+{
+	(void)driver;
+	(void)pdriver;
+	(void)flags;
+
+	return -EINVAL;
+}
+
+/*
+ * Driver descriptor for the ram disk: wires the tdram_* entry points
+ * defined in this file into a struct tap_disk.  No debug hook is provided.
+ */
+struct tap_disk tapdisk_ram = {
+ .disk_type = "tapdisk_ram",
+ .flags = 0,
+ .private_data_size = sizeof(struct tdram_state),
+ .td_open = tdram_open,
+ .td_close = tdram_close,
+ .td_queue_read = tdram_queue_read,
+ .td_queue_write = tdram_queue_write,
+ .td_get_parent_id = tdram_get_parent_id,
+ .td_validate_parent = tdram_validate_parent,
+ .td_debug = NULL,
+};
diff --git a/tools/blktap2/drivers/block-vhd.c b/tools/blktap2/drivers/block-vhd.c
new file mode 100644
index 0000000000..54431c12d8
--- /dev/null
+++ b/tools/blktap2/drivers/block-vhd.c
@@ -0,0 +1,2321 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * A note on write transactions:
+ * Writes that require updating the BAT or bitmaps cannot be signaled
+ * as complete until all updates have reached disk. Transactions are
+ * used to ensure proper ordering in these cases. The two types of
+ * transactions are as follows:
+ * - Bitmap updates only: data writes that require updates to the same
+ * bitmap are grouped in a transaction. Only after all data writes
+ * in a transaction complete does the bitmap write commence. Only
+ * after the bitmap write finishes are the data writes signaled as
+ * complete.
+ * - BAT and bitmap updates: data writes are grouped in transactions
+ * as above, but a special extra write is included in the transaction,
+ * which zeros out the newly allocated bitmap on disk. When the data
+ * writes and the zero-bitmap write complete, the BAT and bitmap writes
+ * are started in parallel. The transaction is completed only after both
+ * the BAT and bitmap writes successfully return.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <uuid/uuid.h> /* For whatever reason, Linux packages this in */
+ /* e2fsprogs-devel. */
+#include <string.h> /* for memset. */
+#include <libaio.h>
+#include <sys/mman.h>
+
+#include "libvhd.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* Sectors per block of the most recently opened image (assigned in
+ * __vhd_open); referenced by debug output such as add_to_transaction. */
+unsigned int SPB;
+
+/* DEBUGGING selects the DBG/ERR/TRACE implementation below (2 = tlog);
+ * ASSERTING == 1 enables ASSERT(). */
+#define DEBUGGING 2
+#define ASSERTING 1
+#define MICROSOFT_COMPAT
+
+/* Upper bound on batmap read attempts in vhd_initialize_bat. */
+#define VHD_BATMAP_MAX_RETRIES 10
+
+/*
+ * Log a one-line snapshot of the request counters of vhd_state 's' at
+ * TLOG_DBG level: queued/completed/returned totals, the number of data
+ * request slots currently handed out, and the block number of the
+ * pending BAT write.
+ */
+#define __TRACE(s) \
+ do { \
+ DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %" \
+ PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: " \
+ "%lu, BBLK: 0x%04x\n", \
+ s->vhd.file, s->queued, s->completed, s->returned, \
+ VHD_REQS_DATA - s->vreq_free_count, \
+ s->bat.pbw_blk); \
+ } while(0)
+
+#define __ASSERT(_p) \
+ if (!(_p)) { \
+ DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n", \
+ __FILE__, __LINE__, #_p); \
+ DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n", \
+ __FILE__, __LINE__, #_p); \
+ tlog_flush(); \
+ *(int*)0 = 0; \
+ }
+
+/*
+ * Logging front-ends, selected by DEBUGGING:
+ *   1 - DBG/ERR print through DPRINTF (level ignored), TRACE is a no-op;
+ *   2 - DBG/ERR go to tlog_write/tlog_error, TRACE expands to __TRACE;
+ *   otherwise all three compile to nothing.
+ */
+#if (DEBUGGING == 1)
+ #define DBG(level, _f, _a...) DPRINTF(_f, ##_a)
+ #define ERR(err, _f, _a...) DPRINTF("ERROR: %d: " _f, err, ##_a)
+ #define TRACE(s) ((void)0)
+#elif (DEBUGGING == 2)
+ #define DBG(level, _f, _a...) tlog_write(level, _f, ##_a)
+ #define ERR(err, _f, _a...) tlog_error(err, _f, ##_a)
+ #define TRACE(s) __TRACE(s)
+#else
+ #define DBG(level, _f, _a...) ((void)0)
+ #define ERR(err, _f, _a...) ((void)0)
+ #define TRACE(s) ((void)0)
+#endif
+
+/* ASSERT() is live only when ASSERTING == 1; otherwise the predicate is
+ * not evaluated at all. */
+#if (ASSERTING == 1)
+ #define ASSERT(_p) __ASSERT(_p)
+#else
+ #define ASSERT(_p) ((void)0)
+#endif
+
+/******VHD DEFINES******/
+/* number of bitmap slots kept in the per-image bitmap cache */
+#define VHD_CACHE_SIZE 32
+
+/* VHD_REQS_DATA sizes the pool of data request descriptors
+ * (vhd_state.vreq_list) */
+#define VHD_REQS_DATA TAPDISK_DATA_REQUESTS
+#define VHD_REQS_META (VHD_CACHE_SIZE + 2)
+#define VHD_REQS_TOTAL (VHD_REQS_DATA + VHD_REQS_META)
+
+/* operation codes stored in vhd_request.op */
+#define VHD_OP_BAT_WRITE 0
+#define VHD_OP_DATA_READ 1
+#define VHD_OP_DATA_WRITE 2
+#define VHD_OP_BITMAP_READ 3
+#define VHD_OP_BITMAP_WRITE 4
+#define VHD_OP_ZERO_BM_WRITE 5
+
+/* non-negative results of read_bitmap_cache() */
+#define VHD_BM_BAT_LOCKED 0
+#define VHD_BM_BAT_CLEAR 1
+#define VHD_BM_BIT_CLEAR 2
+#define VHD_BM_BIT_SET 3
+#define VHD_BM_NOT_CACHED 4
+#define VHD_BM_READ_PENDING 5
+
+/* open-time flags kept in vhd_state.flags (built by _vhd_open) */
+#define VHD_FLAG_OPEN_RDONLY 1
+#define VHD_FLAG_OPEN_NO_CACHE 2
+#define VHD_FLAG_OPEN_QUIET 4
+#define VHD_FLAG_OPEN_STRICT 8
+#define VHD_FLAG_OPEN_QUERY 16
+#define VHD_FLAG_OPEN_PREALLOCATE 32
+
+/* bits of vhd_bat_state.status */
+#define VHD_FLAG_BAT_LOCKED 1
+#define VHD_FLAG_BAT_WRITE_STARTED 2
+
+/* bits of vhd_bitmap.status */
+#define VHD_FLAG_BM_UPDATE_BAT 1
+#define VHD_FLAG_BM_WRITE_PENDING 2
+#define VHD_FLAG_BM_READ_PENDING 4
+#define VHD_FLAG_BM_LOCKED 8
+
+/* presumably bits of vhd_request.flags -- TODO confirm against users */
+#define VHD_FLAG_REQ_UPDATE_BAT 1
+#define VHD_FLAG_REQ_UPDATE_BITMAP 2
+#define VHD_FLAG_REQ_QUEUED 4
+#define VHD_FLAG_REQ_FINISHED 8
+
+/* bits of vhd_transaction.status */
+#define VHD_FLAG_TX_LIVE 1
+#define VHD_FLAG_TX_UPDATE_BAT 2
+
+/* 8 bits is enough for every flag set defined above (largest value: 32) */
+typedef uint8_t vhd_flag_t;
+
+struct vhd_state;
+struct vhd_request;
+
+/* Singly linked FIFO of requests, chained through vhd_request.next;
+ * an empty list has head == tail == NULL (see clear_req_list). */
+struct vhd_req_list {
+ struct vhd_request *head;
+ struct vhd_request *tail;
+};
+
+/* Group of writes that must all finish before dependent metadata is
+ * written (see the note on write transactions at the top of the file). */
+struct vhd_transaction {
+ int error;
+ int closed; /* add_to_transaction asserts this is unset */
+ int started; /* requests added so far */
+ int finished; /* compared with 'started' by transaction_completed */
+ vhd_flag_t status; /* VHD_FLAG_TX_* */
+ struct vhd_req_list requests;
+};
+
+/* One in-flight (or pooled) I/O descriptor. */
+struct vhd_request {
+ int error;
+ uint8_t op; /* VHD_OP_* */
+ vhd_flag_t flags;
+ td_request_t treq; /* sector range and buffer of the I/O */
+ struct tiocb tiocb; /* prepared and queued by aio_read/aio_write */
+ struct vhd_state *state; /* owning image, set by init_vhd_request */
+ struct vhd_request *next; /* link for vhd_req_list */
+ struct vhd_transaction *tx; /* set by add_to_transaction */
+};
+
+/* In-memory BAT/batmap plus the state of the single BAT update that may
+ * be pending at any time. */
+struct vhd_bat_state {
+ vhd_bat_t bat;
+ vhd_batmap_t batmap;
+ vhd_flag_t status; /* VHD_FLAG_BAT_* */
+ uint32_t pbw_blk; /* blk num of pending write */
+ uint64_t pbw_offset; /* file offset of same */
+ struct vhd_request req; /* for writing bat table */
+ struct vhd_request zero_req; /* for initializing bitmaps */
+ char *bat_buf; /* one-sector aligned buffer used by
+ * schedule_bat_write */
+};
+
+/* One cached block bitmap together with its transaction bookkeeping. */
+struct vhd_bitmap {
+ u32 blk; /* block number this bitmap describes */
+ u64 seqno; /* lru sequence number */
+ vhd_flag_t status; /* VHD_FLAG_BM_* */
+
+ char *map; /* map should only be modified
+ * in finish_bitmap_write */
+ char *shadow; /* in-memory bitmap changes are
+ * made to shadow and copied to
+ * map only after having been
+ * flushed to disk */
+ struct vhd_transaction tx; /* transaction data structure
+ * encapsulating data, bitmap,
+ * and bat writes */
+ struct vhd_req_list queue; /* data writes waiting for next
+ * transaction */
+ struct vhd_req_list waiting; /* pending requests that cannot
+ * be serviced until this bitmap
+ * is read from disk */
+ struct vhd_request req;
+};
+
+/* Per-image driver state; lives in driver->data and is zeroed on open
+ * and on close. */
+struct vhd_state {
+ vhd_flag_t flags; /* VHD_FLAG_OPEN_* */
+
+ /* VHD stuff */
+ vhd_context_t vhd;
+ u32 spp; /* sectors per page */
+ u32 spb; /* sectors per block */
+ u64 next_db; /* pointer to the next
+ * (unallocated) datablock */
+
+ struct vhd_bat_state bat;
+
+ u64 bm_lru; /* lru sequence number */
+ u32 bm_secs; /* size of bitmap, in sectors */
+ /* bitmaps currently installed in the cache (NULL = empty slot) */
+ struct vhd_bitmap *bitmap[VHD_CACHE_SIZE];
+
+ /* stack of unused bitmaps; bm_free_count is the stack depth */
+ int bm_free_count;
+ struct vhd_bitmap *bitmap_free[VHD_CACHE_SIZE];
+ struct vhd_bitmap bitmap_list[VHD_CACHE_SIZE];
+
+ /* stack of unused data request descriptors */
+ int vreq_free_count;
+ struct vhd_request *vreq_free[VHD_REQS_DATA];
+ struct vhd_request vreq_list[VHD_REQS_DATA];
+
+ td_driver_t *driver;
+
+ /* statistics; queued/reads/writes and the *_size sector totals are
+ * bumped in aio_read/aio_write */
+ uint64_t queued;
+ uint64_t completed;
+ uint64_t returned;
+ uint64_t reads;
+ uint64_t read_size;
+ uint64_t writes;
+ uint64_t write_size;
+};
+
+/* Bit-flag helpers.  test_vhd_flag yields the masked bits (non-zero when
+ * set, not normalised to 1); set/clear modify 'word' in place. */
+#define test_vhd_flag(word, flag) ((word) & (flag))
+#define set_vhd_flag(word, flag) ((word) |= (flag))
+#define clear_vhd_flag(word, flag) ((word) &= ~(flag))
+
+/* In-memory BAT entry for block 'blk' (no bounds checking). */
+#define bat_entry(s, blk) ((s)->bat.bat.bat[(blk)])
+
+static void vhd_complete(void *, struct tiocb *, int);
+static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);
+
+/* Process-wide read-only zero buffer shared by all open images.  It is
+ * created by vhd_initialize and unmapped by vhd_free, but only when the
+ * caller is _vhd_master (the state that created it). */
+static struct vhd_state *_vhd_master;
+static unsigned long _vhd_zsize;
+static char *_vhd_zeros;
+
+/*
+ * Create the shared zero buffer if it does not exist yet.
+ *
+ * The buffer is an anonymous read-only mapping of two pages, plus
+ * VHD_BLOCK_SIZE when @s was opened with VHD_FLAG_OPEN_PREALLOCATE.  The
+ * first state to create it becomes _vhd_master; later callers find the
+ * buffer already present and return immediately.
+ *
+ * Returns 0 on success or a negative errno value if mmap fails.
+ */
+static int
+vhd_initialize(struct vhd_state *s)
+{
+	int err;
+
+	if (_vhd_zeros)
+		return 0;
+
+	_vhd_zsize = 2 * getpagesize();
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+		_vhd_zsize += VHD_BLOCK_SIZE;
+
+	_vhd_zeros = mmap(0, _vhd_zsize, PROT_READ,
+			  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (_vhd_zeros == MAP_FAILED) {
+		/* capture errno before logging: the logging call may
+		 * overwrite it, which could turn this failure into a
+		 * bogus "return 0" */
+		err = -errno;
+		EPRINTF("vhd_initialize failed: %d\n", err);
+		_vhd_zeros = NULL;
+		_vhd_zsize = 0;
+		return err;
+	}
+
+	_vhd_master = s;
+	return 0;
+}
+
+/*
+ * Tear down the shared zero buffer, but only when @s is the state that
+ * created it and the buffer actually exists; otherwise do nothing.
+ */
+static void
+vhd_free(struct vhd_state *s)
+{
+	if (_vhd_zeros && _vhd_master == s) {
+		munmap(_vhd_zeros, _vhd_zsize);
+		_vhd_master = NULL;
+		_vhd_zeros  = NULL;
+		_vhd_zsize  = 0;
+	}
+}
+
+/*
+ * Return the shared zero buffer for a caller that needs @size bytes.
+ * If the buffer is missing or shorter than @size the request is logged
+ * (tagged with the calling function's name) and ASSERT(0) fires.
+ */
+static char *
+_get_vhd_zeros(const char *func, unsigned long size)
+{
+	int usable = (_vhd_zeros && size <= _vhd_zsize);
+
+	if (!usable) {
+		EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
+			func, size, _vhd_zsize, _vhd_zeros);
+		ASSERT(0);
+	}
+
+	return _vhd_zeros;
+}
+
+/* convenience wrapper that records the calling function */
+#define vhd_zeros(size)	_get_vhd_zeros(__func__, size)
+
+/* Mark block @blk as completely full in the batmap; a no-op when no
+ * batmap is loaded. */
+static inline void
+set_batmap(struct vhd_state *s, uint32_t blk)
+{
+	if (!s->bat.batmap.map)
+		return;
+
+	vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
+	DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
+}
+
+/* Query the batmap bit for block @blk; reports 0 when no batmap is
+ * loaded. */
+static inline int
+test_batmap(struct vhd_state *s, uint32_t blk)
+{
+	return s->bat.batmap.map ?
+		vhd_batmap_test(&s->vhd, &s->bat.batmap, blk) : 0;
+}
+
+/*
+ * Overwrite the last 512 bytes of the image file (the footer) with the
+ * filler byte 0xc7.  Fixed-type images are left untouched.
+ *
+ * Returns 0 on success or a negative errno value.  The error code is
+ * captured at the point of failure: a short write reports -EIO rather
+ * than whatever stale value errno happens to hold, and errno is no
+ * longer read after free().
+ */
+static int
+vhd_kill_footer(struct vhd_state *s)
+{
+	int err;
+	off64_t end;
+	ssize_t n;
+	char *junk;
+
+	if (s->vhd.footer.type == HD_TYPE_FIXED)
+		return 0;
+
+	/* posix_memalign returns a positive error number */
+	err = posix_memalign((void **)&junk, 512, 512);
+	if (err)
+		return -err;
+
+	/* memset only uses the low byte of its fill argument */
+	memset(junk, 0xc7, 512);
+
+	err = 0;
+
+	end = lseek64(s->vhd.fd, 0, SEEK_END);
+	if (end == (off64_t)-1) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == (off64_t)-1) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	n = write(s->vhd.fd, junk, 512);
+	if (n == -1)
+		err = (errno ? -errno : -EIO);
+	else if (n != 512)
+		err = -EIO;
+
+ out:
+	free(junk);
+	return err;
+}
+
+/*
+ * Compute s->next_db, the sector at which the next data block would be
+ * placed: start at the sector-rounded end of the headers and move past
+ * every allocated block whose BAT entry lies at or beyond the current
+ * candidate.
+ *
+ * Returns 0, or the error from vhd_end_of_headers.
+ */
+static inline int
+find_next_free_block(struct vhd_state *s)
+{
+	off64_t hdr_end;
+	uint32_t blk, loc;
+	int rc;
+
+	rc = vhd_end_of_headers(&s->vhd, &hdr_end);
+	if (rc)
+		return rc;
+
+	s->next_db = secs_round_up(hdr_end);
+
+	for (blk = 0; blk < s->bat.bat.entries; blk++) {
+		loc = bat_entry(s, blk);
+		if (loc == DD_BLK_UNUSED || loc < s->next_db)
+			continue;
+
+		/* skip over this block's bitmap and data */
+		s->next_db = loc + s->spb + s->bm_secs;
+	}
+
+	return 0;
+}
+
+/*
+ * Release the in-memory BAT, the batmap and the BAT write buffer, then
+ * reset the whole BAT state.
+ *
+ * The reset must cover all of s->bat (a struct vhd_bat_state).  Sizing
+ * the memset by "struct vhd_bat" only cleared the leading vhd_bat_t
+ * member and left batmap.map and bat_buf pointing at freed memory, so a
+ * second call (as happens on __vhd_open's failure path after
+ * vhd_initialize_bat has already cleaned up) freed them twice.
+ */
+static void
+vhd_free_bat(struct vhd_state *s)
+{
+	free(s->bat.bat.bat);
+	free(s->bat.batmap.map);
+	free(s->bat.bat_buf);
+	memset(&s->bat, 0, sizeof(s->bat));
+}
+
+/*
+ * Load the BAT (and the batmap, if the image has one) and allocate the
+ * one-sector buffer used for BAT writes.
+ *
+ * For writable images the next free data block is also located, and a
+ * batmap read failure is fatal.  For read-only images the batmap read is
+ * retried up to VHD_BATMAP_MAX_RETRIES times and a persistent failure is
+ * logged and ignored.
+ *
+ * Returns 0 on success or a negative error; on failure everything
+ * allocated here is released again.
+ */
+static int
+vhd_initialize_bat(struct vhd_state *s)
+{
+	int err, batmap_required, i;
+
+	/* clear the complete BAT state, not just its first member */
+	memset(&s->bat, 0, sizeof(s->bat));
+
+	err = vhd_read_bat(&s->vhd, &s->bat.bat);
+	if (err) {
+		EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
+		return err;
+	}
+
+	batmap_required = 1;
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
+		batmap_required = 0;
+	} else {
+		err = find_next_free_block(s);
+		if (err)
+			goto fail;
+	}
+
+	if (vhd_has_batmap(&s->vhd)) {
+		for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
+			err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
+			if (err) {
+				EPRINTF("%s: reading batmap: %d\n",
+					s->vhd.file, err);
+				if (batmap_required)
+					goto fail;
+			} else {
+				break;
+			}
+		}
+		if (err)
+			EPRINTF("%s: ignoring non-critical batmap error\n",
+				s->vhd.file);
+	}
+
+	err = posix_memalign((void **)&s->bat.bat_buf,
+			     VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
+	if (err) {
+		s->bat.bat_buf = NULL;
+		/* posix_memalign reports a positive error number; the
+		 * rest of this file returns negative errno values */
+		err = -err;
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	vhd_free_bat(s);
+	return err;
+}
+
+/*
+ * Free the map and shadow buffers of every bitmap in the cache, empty
+ * the free stack and zero the bitmap descriptors.
+ */
+static void
+vhd_free_bitmap_cache(struct vhd_state *s)
+{
+	int slot;
+	struct vhd_bitmap *bm = s->bitmap_list;
+
+	for (slot = 0; slot < VHD_CACHE_SIZE; slot++, bm++) {
+		free(bm->map);
+		free(bm->shadow);
+		s->bitmap_free[slot] = NULL;
+	}
+
+	memset(s->bitmap_list, 0, sizeof(s->bitmap_list));
+}
+
+/*
+ * Allocate zeroed, 512-byte aligned map and shadow buffers (bm_secs
+ * sectors each) for all VHD_CACHE_SIZE bitmaps and push every bitmap
+ * onto the free stack.
+ *
+ * Returns 0 on success.  On allocation failure everything allocated so
+ * far is released and a negative errno value is returned
+ * (posix_memalign itself reports a positive error number, which is
+ * negated here to match the convention used elsewhere in this file).
+ */
+static int
+vhd_initialize_bitmap_cache(struct vhd_state *s)
+{
+	int i, err, map_size;
+	struct vhd_bitmap *bm;
+
+	memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
+
+	s->bm_lru        = 0;
+	map_size         = vhd_sectors_to_bytes(s->bm_secs);
+	s->bm_free_count = VHD_CACHE_SIZE;
+
+	for (i = 0; i < VHD_CACHE_SIZE; i++) {
+		bm = s->bitmap_list + i;
+
+		err = posix_memalign((void **)&bm->map, 512, map_size);
+		if (err) {
+			bm->map = NULL;
+			goto fail;
+		}
+
+		err = posix_memalign((void **)&bm->shadow, 512, map_size);
+		if (err) {
+			bm->shadow = NULL;
+			goto fail;
+		}
+
+		memset(bm->map, 0, map_size);
+		memset(bm->shadow, 0, map_size);
+		s->bitmap_free[i] = bm;
+	}
+
+	return 0;
+
+fail:
+	vhd_free_bitmap_cache(s);
+	return -err;
+}
+
+/*
+ * Set up state needed for dynamic images: read and version-check the
+ * header, derive the geometry (sectors per page, sectors per block,
+ * bitmap size in sectors) and, unless VHD_FLAG_OPEN_NO_CACHE is set,
+ * load the BAT and create the bitmap cache.
+ *
+ * Returns 0 on success or an error code; the BAT is released again if
+ * the bitmap cache cannot be created.
+ */
+static int
+vhd_initialize_dynamic_disk(struct vhd_state *s)
+{
+	int rc;
+
+	rc = vhd_get_header(&s->vhd);
+	if (rc) {
+		if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+			EPRINTF("Error reading VHD DD header.\n");
+		return rc;
+	}
+
+	if (s->vhd.header.hdr_ver != 0x00010000) {
+		EPRINTF("unsupported header version! (0x%x)\n",
+			s->vhd.header.hdr_ver);
+		return -EINVAL;
+	}
+
+	s->spp     = getpagesize() >> VHD_SECTOR_SHIFT;
+	s->spb     = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
+	s->bm_secs = secs_round_up_no_zero(s->spb >> 3);
+
+	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE)) {
+		rc = vhd_initialize_bat(s);
+		if (!rc) {
+			rc = vhd_initialize_bitmap_cache(s);
+			if (rc)
+				vhd_free_bat(s);
+		}
+	}
+
+	return rc;
+}
+
+/*
+ * Reject images whose creator application starts with "tap" and whose
+ * creator version is newer than VHD_CURRENT_VERSION.  Images from other
+ * creators are always accepted.
+ *
+ * Returns 0 if the image may be used, -EINVAL otherwise.
+ */
+static int
+vhd_check_version(struct vhd_state *s)
+{
+	int from_tap = !strncmp(s->vhd.footer.crtr_app, "tap", 3);
+
+	if (!from_tap || s->vhd.footer.crtr_ver <= VHD_CURRENT_VERSION)
+		return 0;
+
+	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+		EPRINTF("WARNING: %s vhd creator version 0x%08x, "
+			"but only versions up to 0x%08x are "
+			"supported for IO\n", s->vhd.file,
+			s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);
+
+	return -EINVAL;
+}
+
+/*
+ * Log a one-line summary of a freshly opened image unless it was opened
+ * quietly: creator and version, and for dynamic images also the number
+ * of BAT entries, allocated blocks, blocks flagged full in the batmap
+ * and the next free data block.
+ */
+static void
+vhd_log_open(struct vhd_state *s)
+{
+	char buf[5];
+	uint32_t i, allocated, full;
+
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+		return;
+
+	/* Bound the read with a precision: plain "%s" keeps scanning the
+	 * source for a terminating NUL even though only four characters
+	 * fit in buf, which over-reads the creator field when it is not
+	 * NUL-terminated.  The text produced is unchanged. */
+	snprintf(buf, sizeof(buf), "%.*s",
+		 (int)sizeof(buf) - 1, s->vhd.footer.crtr_app);
+	if (!vhd_type_dynamic(&s->vhd)) {
+		DPRINTF("%s version: %s 0x%08x\n",
+			s->vhd.file, buf, s->vhd.footer.crtr_ver);
+		return;
+	}
+
+	allocated = 0;
+	full      = 0;
+
+	for (i = 0; i < s->bat.bat.entries; i++) {
+		if (bat_entry(s, i) != DD_BLK_UNUSED)
+			allocated++;
+		if (test_batmap(s, i))
+			full++;
+	}
+
+	DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+		s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries,
+		allocated, full, s->next_db);
+}
+
+/*
+ * Open image @name into the driver's private vhd_state.
+ *
+ * Steps: zero the state, make sure the shared zero buffer exists, open
+ * the file (retrying once with libvhd logging raised), check the
+ * creator version, initialise dynamic-disk state, fill the free request
+ * pool and publish size/sector size in driver->info.  When opened
+ * strict and writable, the on-disk footer is overwritten so that an
+ * unclean shutdown is detectable; this counts as a write so that
+ * _vhd_close rewrites the footer.
+ *
+ * Returns 0 on success or an error code.  Every failure path releases
+ * what was acquired, including this state's claim on the shared zero
+ * buffer when vhd_open itself fails (previously that path returned
+ * without calling vhd_free, leaking the mapping and leaving
+ * _vhd_master pointing at a state that never finished opening).
+ */
+static int
+__vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags)
+{
+	int i, o_flags, err;
+	struct vhd_state *s;
+
+	DBG(TLOG_INFO, "vhd_open: %s\n", name);
+	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
+		libvhd_set_log_level(1);
+
+	s = (struct vhd_state *)driver->data;
+	memset(s, 0, sizeof(struct vhd_state));
+
+	s->flags  = flags;
+	s->driver = driver;
+
+	err = vhd_initialize(s);
+	if (err)
+		return err;
+
+	o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ?
+		   VHD_OPEN_RDONLY : VHD_OPEN_RDWR);
+
+	err = vhd_open(&s->vhd, name, o_flags);
+	if (err) {
+		/* retry once with libvhd logging enabled */
+		libvhd_set_log_level(1);
+		err = vhd_open(&s->vhd, name, o_flags);
+		if (err) {
+			EPRINTF("Unable to open [%s] (%d)!\n", name, err);
+			/* no-op unless this state owns the zero buffer */
+			vhd_free(s);
+			return err;
+		}
+	}
+
+	err = vhd_check_version(s);
+	if (err)
+		goto fail;
+
+	/* defaults for non-dynamic images */
+	s->spb = s->spp = 1;
+
+	if (vhd_type_dynamic(&s->vhd)) {
+		err = vhd_initialize_dynamic_disk(s);
+		if (err)
+			goto fail;
+	}
+
+	vhd_log_open(s);
+
+	SPB = s->spb;
+
+	s->vreq_free_count = VHD_REQS_DATA;
+	for (i = 0; i < VHD_REQS_DATA; i++)
+		s->vreq_free[i] = s->vreq_list + i;
+
+	driver->info.size        = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
+	driver->info.sector_size = VHD_SECTOR_SIZE;
+	driver->info.info        = 0;
+
+	DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%lu, inf:%u)\n",
+	    driver->info.size, driver->info.sector_size, driver->info.info);
+
+	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) &&
+	    !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) {
+		err = vhd_kill_footer(s);
+		if (err) {
+			DPRINTF("ERROR killing footer: %d\n", err);
+			goto fail;
+		}
+		s->writes++;
+	}
+
+	return 0;
+
+ fail:
+	vhd_free_bat(s);
+	vhd_free_bitmap_cache(s);
+	vhd_close(&s->vhd);
+	vhd_free(s);
+	return err;
+}
+
+/*
+ * tapdisk open entry point: translate TD_OPEN_* flags into
+ * VHD_FLAG_OPEN_* flags and hand over to __vhd_open.  A query open
+ * implies quiet, read-only and no-cache.  Preallocation is requested
+ * for every storage type except NFS and LVM.
+ */
+static int
+_vhd_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	vhd_flag_t vf = 0;
+	int prealloc;
+
+	if (flags & TD_OPEN_QUERY)
+		vf |= (VHD_FLAG_OPEN_QUERY  |
+		       VHD_FLAG_OPEN_QUIET  |
+		       VHD_FLAG_OPEN_RDONLY |
+		       VHD_FLAG_OPEN_NO_CACHE);
+	if (flags & TD_OPEN_STRICT)
+		vf |= VHD_FLAG_OPEN_STRICT;
+	if (flags & TD_OPEN_QUIET)
+		vf |= VHD_FLAG_OPEN_QUIET;
+	if (flags & TD_OPEN_RDONLY)
+		vf |= VHD_FLAG_OPEN_RDONLY;
+
+	prealloc = !(driver->storage == TAPDISK_STORAGE_TYPE_NFS ||
+		     driver->storage == TAPDISK_STORAGE_TYPE_LVM);
+	if (prealloc)
+		vf |= VHD_FLAG_OPEN_PREALLOCATE;
+
+	return __vhd_open(driver, name, vf);
+}
+
+/*
+ * Log BAT statistics at close time unless the image was opened quietly:
+ * total entries, allocated blocks, blocks flagged full in the batmap
+ * and the next free data block.
+ */
+static void
+vhd_log_close(struct vhd_state *s)
+{
+	uint32_t blk;
+	uint32_t n_alloc = 0, n_full = 0;
+
+	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+		return;
+
+	for (blk = 0; blk < s->bat.bat.entries; blk++) {
+		n_alloc += (bat_entry(s, blk) != DD_BLK_UNUSED);
+		n_full  += !!test_batmap(s, blk);
+	}
+
+	DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+		s->vhd.file, s->bat.bat.entries, n_alloc, n_full, s->next_db);
+}
+
+/*
+ * tapdisk close entry point.
+ *
+ * For writable images the footer is rewritten when it was killed at
+ * open time (strict open) or when anything has been written since; if
+ * the image has a batmap that is written back as well.  Write errors
+ * are logged but do not stop the teardown.  Afterwards all cached state
+ * is released and the vhd_state is zeroed.  Always returns 0.
+ */
+static int
+_vhd_close(td_driver_t *driver)
+{
+	int rc;
+	struct vhd_state *s = (struct vhd_state *)driver->data;
+
+	DBG(TLOG_WARN, "vhd_close\n");
+
+	/* never touch the footer of a read-only image */
+	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY) &&
+	    (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes)) {
+		/* lend our BAT to the context only while the footer is
+		 * being written */
+		memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t));
+		rc = vhd_write_footer(&s->vhd, &s->vhd.footer);
+		memset(&s->vhd.bat, 0, sizeof(vhd_bat_t));
+
+		if (rc)
+			EPRINTF("writing %s footer: %d\n", s->vhd.file, rc);
+
+		if (vhd_has_batmap(&s->vhd)) {
+			rc = vhd_write_batmap(&s->vhd, &s->bat.batmap);
+			if (rc)
+				EPRINTF("writing %s batmap: %d\n",
+					s->vhd.file, rc);
+		}
+	}
+
+	vhd_log_close(s);
+	vhd_free_bat(s);
+	vhd_free_bitmap_cache(s);
+	vhd_close(&s->vhd);
+	vhd_free(s);
+
+	memset(s, 0, sizeof(struct vhd_state));
+
+	return 0;
+}
+
+/*
+ * Check that @parent_driver is an acceptable parent for @child_driver.
+ *
+ * A non-VHD parent is accepted only when the child is a differencing
+ * VHD whose parent is marked raw.  A VHD parent is accepted when its
+ * footer uuid equals the parent uuid recorded in the child's header.
+ *
+ * Returns 0 if valid, -EINVAL otherwise.  @flags is unused.
+ */
+int
+vhd_validate_parent(td_driver_t *child_driver,
+		    td_driver_t *parent_driver, td_flag_t flags)
+{
+	struct vhd_state *child, *parent;
+
+	child = (struct vhd_state *)child_driver->data;
+
+	if (parent_driver->type != DISK_TYPE_VHD) {
+		if (child_driver->type != DISK_TYPE_VHD ||
+		    child->vhd.footer.type != HD_TYPE_DIFF ||
+		    !vhd_parent_raw(&child->vhd))
+			return -EINVAL;
+		return 0;
+	}
+
+	parent = (struct vhd_state *)parent_driver->data;
+
+	/*
+	 * The parent's modification time is deliberately not compared with
+	 * the timestamp stored in the child.  That check was removed
+	 * because of cases like:
+	 * - parent VHD marked as 'hidden'
+	 * - parent VHD modified during coalesce
+	 */
+
+	if (uuid_compare(child->vhd.header.prt_uuid, parent->vhd.footer.uuid)) {
+		DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since "
+			"snapshot. Child image no longer valid.\n",
+			__func__, child->vhd.file, parent->vhd.file);
+		return -EINVAL;
+	}
+
+	/* TODO: compare sizes */
+
+	return 0;
+}
+
+/*
+ * Fill @id with the parent of this image.
+ *
+ * @id is zeroed first.  Non-differencing images return TD_NO_PARENT.
+ * Otherwise the parent name comes from the parent locator and the
+ * driver type is DISK_TYPE_AIO for a raw parent, DISK_TYPE_VHD for any
+ * other.
+ *
+ * Returns 0 on success, TD_NO_PARENT, or the locator lookup error.
+ */
+int
+vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+	int rc;
+	char *parent_name;
+	struct vhd_state *s = (struct vhd_state *)driver->data;
+
+	DBG(TLOG_DBG, "\n");
+	memset(id, 0, sizeof(td_disk_id_t));
+
+	if (s->vhd.footer.type != HD_TYPE_DIFF)
+		return TD_NO_PARENT;
+
+	rc = vhd_parent_locator_get(&s->vhd, &parent_name);
+	if (rc)
+		return rc;
+
+	id->name = parent_name;
+	if (vhd_parent_raw(&s->vhd)) {
+		DPRINTF("VHD: parent is raw\n");
+		id->drivertype = DISK_TYPE_AIO;
+	} else {
+		id->drivertype = DISK_TYPE_VHD;
+	}
+
+	return 0;
+}
+
+/* Reset @list to the empty state (both ends NULL); the requests that
+ * were on it are not touched. */
+static inline void
+clear_req_list(struct vhd_req_list *list)
+{
+	list->tail = NULL;
+	list->head = NULL;
+}
+
+/* Append @e to @list.  e->next is not modified, so the caller must have
+ * cleared it already. */
+static inline void
+add_to_tail(struct vhd_req_list *list, struct vhd_request *e)
+{
+	if (list->head) {
+		list->tail->next = e;
+		list->tail       = e;
+	} else {
+		list->head = e;
+		list->tail = e;
+	}
+}
+
+/*
+ * Unlink @e from @list, fixing up head and tail as needed.  e->next is
+ * left unchanged.
+ *
+ * Returns 0 if @e was found and removed, -EINVAL if it is not on the
+ * list.  An empty list is reported as -EINVAL as well; previously that
+ * case dereferenced the NULL head pointer.
+ */
+static inline int
+remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e)
+{
+	struct vhd_request *i = list->head;
+
+	if (list->head == e) {
+		if (list->tail == e)
+			clear_req_list(list);
+		else
+			list->head = list->head->next;
+		return 0;
+	}
+
+	/* nothing to search in an empty list */
+	if (!i)
+		return -EINVAL;
+
+	while (i->next) {
+		if (i->next == e) {
+			if (list->tail == e) {
+				i->next    = NULL;
+				list->tail = i;
+			} else
+				i->next = i->next->next;
+			return 0;
+		}
+		i = i->next;
+	}
+
+	return -EINVAL;
+}
+
+/* Zero @req and bind it to its owning image state @s. */
+static inline void
+init_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+	memset(req, 0, sizeof(*req));
+	req->state = s;
+}
+
+/* Reset transaction @tx: counters, status and request list all zeroed. */
+static inline void
+init_tx(struct vhd_transaction *tx)
+{
+	memset(tx, 0, sizeof(*tx));
+}
+
+/*
+ * Attach request @r to transaction @tx: records the back pointer, bumps
+ * the started count, appends @r to the transaction's request list and
+ * marks the transaction live.  The transaction must not be closed.
+ */
+static inline void
+add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r)
+{
+ ASSERT(!tx->closed);
+
+ r->tx = tx;
+ tx->started++;
+ add_to_tail(&tx->requests, r);
+ set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE);
+
+ /* block number is derived with the global SPB, not s->spb */
+ DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, "
+ "started: %d, finished: %d, status: %u\n",
+ r->treq.sec / SPB, r->treq.sec, tx,
+ tx->started, tx->finished, tx->status);
+}
+
+/* Non-zero when every request added to @tx has finished (an empty
+ * transaction also counts as completed). */
+static inline int
+transaction_completed(struct vhd_transaction *tx)
+{
+ return (tx->started == tx->finished);
+}
+
+/*
+ * Reset the pending-BAT-write bookkeeping: no pending block or offset,
+ * status cleared (which also unlocks the BAT) and the BAT request
+ * detached from any transaction or list with its error cleared.
+ */
+static inline void
+init_bat(struct vhd_state *s)
+{
+	struct vhd_request *req = &s->bat.req;
+
+	req->error = 0;
+	req->next  = NULL;
+	req->tx    = NULL;
+
+	s->bat.status     = 0;
+	s->bat.pbw_offset = 0;
+	s->bat.pbw_blk    = 0;
+}
+
+/* Mark the BAT as locked (VHD_FLAG_BAT_LOCKED set in bat.status). */
+static inline void
+lock_bat(struct vhd_state *s)
+{
+ set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/* Clear the BAT lock flag. */
+static inline void
+unlock_bat(struct vhd_state *s)
+{
+ clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/* Non-zero while the BAT lock flag is set. */
+static inline int
+bat_locked(struct vhd_state *s)
+{
+ return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/*
+ * Return bitmap @bm to a pristine state: block number, LRU sequence and
+ * status cleared, transaction and both request lists emptied, the
+ * embedded request re-initialised and both map buffers zero-filled
+ * (bm_secs sectors each).  The buffers themselves are kept.
+ */
+static inline void
+init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	size_t map_bytes = vhd_sectors_to_bytes(s->bm_secs);
+
+	bm->status = 0;
+	bm->seqno  = 0;
+	bm->blk    = 0;
+
+	init_tx(&bm->tx);
+	clear_req_list(&bm->queue);
+	clear_req_list(&bm->waiting);
+
+	memset(bm->map, 0, map_bytes);
+	memset(bm->shadow, 0, map_bytes);
+
+	init_vhd_request(s, &bm->req);
+}
+
+/* Look up the cached bitmap for @block; returns NULL when that block's
+ * bitmap is not installed in the cache. */
+static inline struct vhd_bitmap *
+get_bitmap(struct vhd_state *s, uint32_t block)
+{
+	struct vhd_bitmap **slot = s->bitmap;
+	struct vhd_bitmap **end  = s->bitmap + VHD_CACHE_SIZE;
+
+	for (; slot != end; slot++)
+		if (*slot && (*slot)->blk == block)
+			return *slot;
+
+	return NULL;
+}
+
+/* Set VHD_FLAG_BM_LOCKED on @bm; remove_lru_bitmap skips locked
+ * bitmaps when choosing a victim. */
+static inline void
+lock_bitmap(struct vhd_bitmap *bm)
+{
+ set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/* Clear VHD_FLAG_BM_LOCKED on @bm. */
+static inline void
+unlock_bitmap(struct vhd_bitmap *bm)
+{
+ clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/* Non-zero while @bm carries VHD_FLAG_BM_LOCKED. */
+static inline int
+bitmap_locked(struct vhd_bitmap *bm)
+{
+ return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/* A bitmap's contents are usable unless a read of it is still pending. */
+static inline int
+bitmap_valid(struct vhd_bitmap *bm)
+{
+ return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+}
+
+/*
+ * Report (1/0) whether @bm still has outstanding work: a pending
+ * bitmap read or write, a transaction that must update the BAT, or any
+ * request sitting on its waiting, transaction or queue lists.
+ */
+static inline int
+bitmap_in_use(struct vhd_bitmap *bm)
+{
+	if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
+		return 1;
+	if (test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING))
+		return 1;
+	if (test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT))
+		return 1;
+	if (bm->waiting.head || bm->tx.requests.head || bm->queue.head)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Return 1 when every bit covering the block's sectors is set in
+ * bm->map (the first spb/8 bytes are all 0xFF), 0 otherwise.
+ */
+static inline int
+bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	int nbytes = s->spb >> 3;
+	int pos = 0;
+
+	while (pos < nbytes) {
+		if (bm->map[pos] != (char)0xFF)
+			return 0;
+		pos++;
+	}
+
+	DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk);
+	return 1;
+}
+
+/*
+ * Evict and return the least recently used unlocked bitmap from the
+ * cache.  Only bitmaps whose sequence number is strictly below the
+ * current s->bm_lru are candidates.  Returns NULL when there is no
+ * candidate.  The victim is asserted not to be in use.
+ */
+static struct vhd_bitmap *
+remove_lru_bitmap(struct vhd_state *s)
+{
+	int i, victim = -1;
+	u64 oldest = s->bm_lru;
+	struct vhd_bitmap *bm;
+
+	for (i = 0; i < VHD_CACHE_SIZE; i++) {
+		bm = s->bitmap[i];
+		if (!bm || bm->seqno >= oldest || bitmap_locked(bm))
+			continue;
+
+		victim = i;
+		oldest = bm->seqno;
+	}
+
+	if (victim < 0)
+		return NULL;
+
+	bm = s->bitmap[victim];
+	s->bitmap[victim] = NULL;
+	ASSERT(!bitmap_in_use(bm));
+
+	return bm;
+}
+
+/*
+ * Obtain a clean bitmap descriptor for block @blk: taken from the free
+ * stack when one is available, otherwise by evicting the LRU cache
+ * entry.  The bitmap is re-initialised but NOT installed in the cache.
+ *
+ * On success stores the bitmap in *bitmap and returns 0; returns -EBUSY
+ * (with *bitmap == NULL) if nothing can be evicted.
+ */
+static int
+alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk)
+{
+	struct vhd_bitmap *fresh;
+
+	*bitmap = NULL;
+
+	if (s->bm_free_count <= 0) {
+		fresh = remove_lru_bitmap(s);
+		if (!fresh)
+			return -EBUSY;
+	} else {
+		fresh = s->bitmap_free[--s->bm_free_count];
+	}
+
+	init_vhd_bitmap(s, fresh);
+	fresh->blk = blk;
+	*bitmap    = fresh;
+
+	return 0;
+}
+
+/*
+ * Hand out the next LRU sequence number.  When the counter reaches
+ * 0xffffffff every cached bitmap's sequence number is halved and the
+ * counter restarts from the largest halved value.
+ */
+static inline uint64_t
+__bitmap_lru_seqno(struct vhd_state *s)
+{
+	int slot;
+	struct vhd_bitmap *bm;
+
+	if (s->bm_lru != 0xffffffff)
+		return ++s->bm_lru;
+
+	s->bm_lru = 0;
+	for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+		bm = s->bitmap[slot];
+		if (!bm)
+			continue;
+
+		bm->seqno >>= 1;
+		if (bm->seqno > s->bm_lru)
+			s->bm_lru = bm->seqno;
+	}
+
+	return ++s->bm_lru;
+}
+
+/* Stamp @bm with a fresh LRU sequence number, making it the most
+ * recently used bitmap. */
+static inline void
+touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+ bm->seqno = __bitmap_lru_seqno(s);
+}
+
+/*
+ * Put @bm into the first empty cache slot and mark it most recently
+ * used.  Asserts if the cache has no empty slot.
+ */
+static inline void
+install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	int slot = 0;
+
+	while (slot < VHD_CACHE_SIZE && s->bitmap[slot])
+		slot++;
+
+	if (slot == VHD_CACHE_SIZE) {
+		ASSERT(0);
+		return;
+	}
+
+	touch_bitmap(s, bm);
+	s->bitmap[slot] = bm;
+}
+
+/*
+ * Remove @bm from the cache and push it back on the free stack.  The
+ * bitmap must be installed, unlocked and idle (all three are asserted).
+ */
+static inline void
+free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	int slot = 0;
+
+	while (slot < VHD_CACHE_SIZE && s->bitmap[slot] != bm)
+		slot++;
+
+	ASSERT(!bitmap_locked(bm));
+	ASSERT(!bitmap_in_use(bm));
+	ASSERT(slot < VHD_CACHE_SIZE);
+
+	s->bitmap[slot] = NULL;
+	s->bitmap_free[s->bm_free_count++] = bm;
+}
+
+/*
+ * Classify logical sector @sector for operation @op (a VHD_OP_* code)
+ * using only in-memory state.
+ *
+ * Returns one of:
+ *   VHD_BM_BIT_SET      - sector is present (always for fixed disks, or
+ *                         batmap/bitmap says so)
+ *   VHD_BM_BIT_CLEAR    - block allocated, sector not written
+ *   VHD_BM_BAT_CLEAR    - block not allocated
+ *   VHD_BM_BAT_LOCKED   - block not allocated and, for a data write, the
+ *                         BAT is locked on behalf of a different block
+ *   VHD_BM_NOT_CACHED   - the block's bitmap is not in the cache
+ *   VHD_BM_READ_PENDING - the block's bitmap is still being read
+ *   -EINVAL             - sector lies outside the BAT
+ *
+ * Looking up a cached bitmap also refreshes its LRU position.
+ */
+static int
+read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op)
+{
+	u32 blk, sec;
+	struct vhd_bitmap *bm;
+
+	/* in fixed disks, every block is present */
+	if (s->vhd.footer.type == HD_TYPE_FIXED)
+		return VHD_BM_BIT_SET;
+
+	blk = sector / s->spb;
+	sec = sector % s->spb;
+
+	/* Valid block indices are 0 .. max_bat_size - 1; the previous
+	 * '>' test let blk == max_bat_size through to bat_entry(), one
+	 * element past the table (assuming the BAT holds max_bat_size
+	 * entries, as the original bound implies). */
+	if (blk >= s->vhd.header.max_bat_size) {
+		DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n",
+			sector, op);
+		return -EINVAL;
+	}
+
+	if (bat_entry(s, blk) == DD_BLK_UNUSED) {
+		if (op == VHD_OP_DATA_WRITE &&
+		    s->bat.pbw_blk != blk && bat_locked(s))
+			return VHD_BM_BAT_LOCKED;
+
+		return VHD_BM_BAT_CLEAR;
+	}
+
+	if (test_batmap(s, blk)) {
+		DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk);
+		return VHD_BM_BIT_SET;
+	}
+
+	bm = get_bitmap(s, blk);
+	if (!bm)
+		return VHD_BM_NOT_CACHED;
+
+	/* bump lru count */
+	touch_bitmap(s, bm);
+
+	if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
+		return VHD_BM_READ_PENDING;
+
+	return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ?
+		VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR);
+}
+
+/*
+ * Count how many consecutive sectors, starting at @sector and limited
+ * to @nr_secs and to the end of the containing block, have bitmap state
+ * equal to @value.
+ *
+ * Fixed disks report the full @nr_secs.  A block flagged in the batmap
+ * reports the remainder of the block (capped at @nr_secs) without
+ * consulting @value.  Otherwise the block's bitmap must be cached and
+ * valid (asserted).
+ */
+static int
+read_bitmap_cache_span(struct vhd_state *s,
+		       uint64_t sector, int nr_secs, int value)
+{
+	int run;
+	u32 blk, sec;
+	struct vhd_bitmap *bm;
+
+	/* in fixed disks, every block is present */
+	if (s->vhd.footer.type == HD_TYPE_FIXED)
+		return nr_secs;
+
+	blk = sector / s->spb;
+	sec = sector % s->spb;
+
+	if (test_batmap(s, blk))
+		return MIN(nr_secs, s->spb - sec);
+
+	bm = get_bitmap(s, blk);
+
+	ASSERT(bm && bitmap_valid(bm));
+
+	run = 0;
+	while (sec < s->spb && run < nr_secs) {
+		if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value)
+			break;
+		sec++;
+		run++;
+	}
+
+	return run;
+}
+
+/*
+ * Pop a data request descriptor off the free stack and initialise it
+ * for @s.  Returns NULL when the pool is exhausted.  A pooled
+ * descriptor is expected to have treq.secs == 0 (asserted).
+ */
+static inline struct vhd_request *
+alloc_vhd_request(struct vhd_state *s)
+{
+	struct vhd_request *req;
+
+	if (s->vreq_free_count <= 0)
+		return NULL;
+
+	req = s->vreq_free[--s->vreq_free_count];
+	ASSERT(req->treq.secs == 0);
+	init_vhd_request(s, req);
+
+	return req;
+}
+
+/* Wipe @req and push it back on the free stack of @s. */
+static inline void
+free_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+	memset(req, 0, sizeof(*req));
+	s->vreq_free[s->vreq_free_count] = req;
+	s->vreq_free_count++;
+}
+
+/*
+ * Queue an asynchronous read of req->treq.secs sectors from byte
+ * @offset of the image file into req->treq.buf; vhd_complete is invoked
+ * with @req when it finishes.  Updates the queued/read statistics.
+ */
+static inline void
+aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+	td_prep_read(&req->tiocb, s->vhd.fd, req->treq.buf,
+		     vhd_sectors_to_bytes(req->treq.secs),
+		     offset, vhd_complete, req);
+	td_queue_tiocb(s->driver, &req->tiocb);
+
+	s->read_size += req->treq.secs;
+	s->reads++;
+	s->queued++;
+	TRACE(s);
+}
+
+/*
+ * Queue an asynchronous write of req->treq.secs sectors from
+ * req->treq.buf to byte @offset of the image file; vhd_complete is
+ * invoked with @req when it finishes.  Updates the queued/write
+ * statistics (s->writes is also what makes _vhd_close rewrite the
+ * footer).
+ */
+static inline void
+aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+	td_prep_write(&req->tiocb, s->vhd.fd, req->treq.buf,
+		      vhd_sectors_to_bytes(req->treq.secs),
+		      offset, vhd_complete, req);
+	td_queue_tiocb(s->driver, &req->tiocb);
+
+	s->write_size += req->treq.secs;
+	s->writes++;
+	s->queued++;
+	TRACE(s);
+}
+
+/*
+ * Record block @blk as the pending BAT write and choose its location:
+ * s->next_db, pushed forward by however many sectors are needed so that
+ * the data region following the bitmap starts on a page boundary.
+ *
+ * Must not be called while a BAT write is already started (asserted).
+ * Returns the unpadded s->next_db, i.e. the end of the last block.
+ */
+static inline uint64_t
+reserve_new_block(struct vhd_state *s, uint32_t blk)
+{
+	int pad = 0;
+	uint64_t misalign;
+
+	ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+	/* data region of segment should begin on page boundary */
+	misalign = (s->next_db + s->bm_secs) % s->spp;
+	if (misalign)
+		pad = s->spp - misalign;
+
+	s->bat.pbw_blk    = blk;
+	s->bat.pbw_offset = s->next_db + pad;
+
+	return s->next_db;
+}
+
+/*
+ * Queue the write of the BAT sector that contains the pending block
+ * (s->bat.pbw_blk).  The sector image is built in s->bat.bat_buf from
+ * the in-memory BAT with the pending entry patched in; the in-memory
+ * BAT itself is not modified here.  Requires the BAT to be locked.
+ * Always returns 0.
+ */
+static int
+schedule_bat_write(struct vhd_state *s)
+{
+ int i;
+ u32 blk;
+ char *buf;
+ u64 offset;
+ struct vhd_request *req;
+
+ ASSERT(bat_locked(s));
+
+ req = &s->bat.req;
+ buf = s->bat.bat_buf;
+ blk = s->bat.pbw_blk;
+
+ init_vhd_request(s, req);
+ /* copy the 128 four-byte entries (one 512-byte sector) that share
+ * a sector with blk */
+ memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);
+
+ /* patch in the new location of the pending block */
+ ((u32 *)buf)[blk % 128] = s->bat.pbw_offset;
+
+ /* byte-swap each entry in place via BE32_OUT before it goes to disk */
+ for (i = 0; i < 128; i++)
+ BE32_OUT(&((u32 *)buf)[i]);
+
+ /* byte offset of that sector within the on-disk table */
+ offset = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
+ req->treq.secs = 1;
+ req->treq.buf = buf;
+ req->op = VHD_OP_BAT_WRITE;
+ req->next = NULL;
+
+ aio_write(s, req, offset);
+ set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);
+
+ DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
+ "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);
+
+ return 0;
+}
+
+/*
+ * Queue a write of zeros covering the alignment gap (from sector
+ * @lb_end up to the pending block's offset) plus the new block's
+ * on-disk bitmap.  The request is added to @bm's transaction and @bm is
+ * locked.  Uses the pending block recorded in s->bat.
+ */
+static void
+schedule_zero_bm_write(struct vhd_state *s,
+ struct vhd_bitmap *bm, uint64_t lb_end)
+{
+ uint64_t offset;
+ struct vhd_request *req = &s->bat.zero_req;
+
+ init_vhd_request(s, req);
+
+ offset = vhd_sectors_to_bytes(lb_end);
+ req->op = VHD_OP_ZERO_BM_WRITE;
+ req->treq.sec = s->bat.pbw_blk * s->spb;
+ /* gap sectors + bitmap sectors */
+ req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
+ /* vhd_zeros asserts that the shared zero buffer is large enough */
+ req->treq.buf = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
+ req->next = NULL;
+
+ DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
+ s->bat.pbw_blk, offset);
+
+ lock_bitmap(bm);
+ add_to_transaction(&bm->tx, req);
+ aio_write(s, req, offset);
+}
+
+/*
+ * Begin allocating unallocated block @blk without preallocating its
+ * data: make sure an (empty) bitmap for it is cached, lock the BAT,
+ * reserve the block's location and queue the zero-bitmap write inside
+ * the bitmap's transaction, which is flagged as needing a BAT update.
+ *
+ * If the BAT is already locked it must be for this same block
+ * (asserted) and nothing further is done.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap.
+ */
+static int
+update_bat(struct vhd_state *s, uint32_t blk)
+{
+ int err;
+ uint64_t lb_end;
+ struct vhd_bitmap *bm;
+
+ ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+
+ if (bat_locked(s)) {
+ ASSERT(s->bat.pbw_blk == blk);
+ return 0;
+ }
+
+ /* empty bitmap could already be in
+ * cache if earlier bat update failed */
+ bm = get_bitmap(s, blk);
+ if (!bm) {
+ /* install empty bitmap in cache */
+ err = alloc_vhd_bitmap(s, &bm, blk);
+ if (err)
+ return err;
+
+ install_bitmap(s, bm);
+ }
+
+ lock_bat(s);
+ lb_end = reserve_new_block(s, blk);
+ schedule_zero_bm_write(s, bm, lb_end);
+ set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);
+
+ return 0;
+}
+
+/*
+ * Allocate unallocated block @blk with its data preallocated: the
+ * alignment gap, bitmap and full data area are zero-filled with a
+ * synchronous write at the end of the file, then the BAT sector write
+ * is queued as part of the block bitmap's transaction.
+ *
+ * If the BAT is already locked it must be for this same block
+ * (asserted); -EBUSY is returned if that pending BAT request has
+ * recorded an error, 0 otherwise.
+ *
+ * Returns 0 on success or a negative error.  NOTE(review): on the error
+ * returns below, s->bat.pbw_blk, s->bat.pbw_offset and (when a gap was
+ * needed) s->next_db have already been modified.
+ */
+static int
+allocate_block(struct vhd_state *s, uint32_t blk)
+{
+ char *zeros;
+ int err, gap;
+ uint64_t offset, size;
+ struct vhd_bitmap *bm;
+
+ ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+
+ if (bat_locked(s)) {
+ ASSERT(s->bat.pbw_blk == blk);
+ if (s->bat.req.error)
+ return -EBUSY;
+ return 0;
+ }
+
+ gap = 0;
+ s->bat.pbw_blk = blk;
+ /* zero-fill starts at the old next_db, i.e. before any gap */
+ offset = vhd_sectors_to_bytes(s->next_db);
+
+ /* data region of segment should begin on page boundary */
+ if ((s->next_db + s->bm_secs) % s->spp) {
+ gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
+ s->next_db += gap;
+ }
+
+ s->bat.pbw_offset = s->next_db;
+
+ DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n",
+ blk, s->bat.pbw_offset);
+
+ if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) {
+ ERR(errno, "lseek failed\n");
+ return -errno;
+ }
+
+ /* gap + bitmap + data, all zeroes */
+ size = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap);
+ err = write(s->vhd.fd, vhd_zeros(size), size);
+ if (err != size) {
+ /* a short write is reported as -EIO */
+ err = (err == -1 ? -errno : -EIO);
+ ERR(err, "write failed");
+ return err;
+ }
+
+ /* empty bitmap could already be in
+ * cache if earlier bat update failed */
+ bm = get_bitmap(s, blk);
+ if (!bm) {
+ /* install empty bitmap in cache */
+ err = alloc_vhd_bitmap(s, &bm, blk);
+ if (err)
+ return err;
+
+ install_bitmap(s, bm);
+ }
+
+ lock_bat(s);
+ lock_bitmap(bm);
+ schedule_bat_write(s);
+ add_to_transaction(&bm->tx, &s->bat.req);
+
+ return 0;
+}
+
+/*
+ * Submit an asynchronous data read for @treq.
+ *
+ * For fixed-type images the logical sector maps directly to a file
+ * offset.  Otherwise the offset is the block's BAT entry plus the bitmap
+ * sectors plus the sector's index within the block; the block must be
+ * allocated and either marked in the batmap or have a valid cached bitmap.
+ *
+ * Returns 0 on success or -EBUSY when no request slot is available.
+ */
+static int
+schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+	struct vhd_request *req;
+	struct vhd_bitmap *bm;
+	u32 blk = 0, sec = 0;
+	u64 offset;
+
+	if (s->vhd.footer.type != HD_TYPE_FIXED) {
+		blk    = treq.sec / s->spb;
+		sec    = treq.sec % s->spb;
+		bm     = get_bitmap(s, blk);
+		offset = bat_entry(s, blk);
+
+		ASSERT(offset != DD_BLK_UNUSED);
+		ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm)));
+
+		/* skip the block's bitmap, then index into its data */
+		offset += s->bm_secs + sec;
+		offset  = vhd_sectors_to_bytes(offset);
+	} else {
+		offset = vhd_sectors_to_bytes(treq.sec);
+	}
+
+	req = alloc_vhd_request(s);
+	if (req == NULL)
+		return -EBUSY;
+
+	req->next  = NULL;
+	req->op    = VHD_OP_DATA_READ;
+	req->flags = flags;
+	req->treq  = treq;
+
+	aio_read(s, req, offset);
+
+	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+	    "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n",
+	    s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags,
+	    treq.buf);
+
+	return 0;
+}
+
+/*
+ * Submit an asynchronous data write for @treq.
+ *
+ * With VHD_FLAG_REQ_UPDATE_BAT the target block is first allocated
+ * (allocate_block() when the image was opened with
+ * VHD_FLAG_OPEN_PREALLOCATE, update_bat() otherwise) and the write goes to
+ * the pending block offset.  With VHD_FLAG_REQ_UPDATE_BITMAP the request is
+ * tied to the block's bitmap: it joins the open transaction, or is parked
+ * on the bitmap's queue (and flagged QUEUED) if the transaction is closed.
+ *
+ * Returns 0 on success, the allocation error, or -EBUSY when no request
+ * slot is available.
+ */
+static int
+schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+	struct vhd_request *req;
+	struct vhd_bitmap *bm = NULL;
+	u32 blk = 0, sec = 0;
+	u64 offset;
+	int err;
+
+	if (s->vhd.footer.type != HD_TYPE_FIXED) {
+		blk    = treq.sec / s->spb;
+		sec    = treq.sec % s->spb;
+		offset = bat_entry(s, blk);
+
+		if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) {
+			err = test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)
+				? allocate_block(s, blk)
+				: update_bat(s, blk);
+			if (err)
+				return err;
+
+			/* block is not in the BAT yet; use its pending offset */
+			offset = s->bat.pbw_offset;
+		}
+
+		offset += s->bm_secs + sec;
+		offset  = vhd_sectors_to_bytes(offset);
+	} else {
+		offset = vhd_sectors_to_bytes(treq.sec);
+	}
+
+	req = alloc_vhd_request(s);
+	if (req == NULL)
+		return -EBUSY;
+
+	req->next  = NULL;
+	req->op    = VHD_OP_DATA_WRITE;
+	req->flags = flags;
+	req->treq  = treq;
+
+	if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) {
+		bm = get_bitmap(s, blk);
+		ASSERT(bm && bitmap_valid(bm));
+		lock_bitmap(bm);
+
+		if (!bm->tx.closed) {
+			add_to_transaction(&bm->tx, req);
+		} else {
+			add_to_tail(&bm->queue, req);
+			set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED);
+		}
+	}
+
+	aio_write(s, req, offset);
+
+	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+	    "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n",
+	    s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags);
+
+	return 0;
+}
+
+/*
+ * Start reading block @blk's on-disk bitmap into a freshly allocated
+ * cache entry.
+ *
+ * The block must be allocated in the BAT and must not already have a
+ * cached bitmap.  The read uses the bitmap's embedded request with
+ * bm->map as the buffer; the bitmap is then locked, installed in the
+ * cache and flagged VHD_FLAG_BM_READ_PENDING.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap().
+ */
+static int
+schedule_bitmap_read(struct vhd_state *s, uint32_t blk)
+{
+	struct vhd_request *req;
+	struct vhd_bitmap *bm;
+	u64 offset;
+	int err;
+
+	ASSERT(vhd_type_dynamic(&s->vhd));
+
+	offset = bat_entry(s, blk);
+
+	ASSERT(offset != DD_BLK_UNUSED);
+	ASSERT(!get_bitmap(s, blk));
+
+	/* the bitmap sits at the very start of the block */
+	offset = vhd_sectors_to_bytes(offset);
+
+	err = alloc_vhd_bitmap(s, &bm, blk);
+	if (err != 0)
+		return err;
+
+	req = &bm->req;
+	init_vhd_request(s, req);
+
+	req->next      = NULL;
+	req->op        = VHD_OP_BITMAP_READ;
+	req->treq.cb   = NULL;
+	req->treq.buf  = bm->map;
+	req->treq.secs = s->bm_secs;
+	req->treq.sec  = blk * s->spb;
+
+	aio_read(s, req, offset);
+	lock_bitmap(bm);
+	install_bitmap(s, bm);
+	set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
+	    "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
+	    req->treq.secs, offset);
+
+	return 0;
+}
+
+/*
+ * Write block @blk's shadow bitmap (bm->shadow) to disk.
+ *
+ * The bitmap must be cached, valid and have no write already pending.
+ * If the block has no BAT entry yet it must be the block of the pending
+ * BAT update, and that update's offset is used instead.  After submitting
+ * the write the bitmap is locked, touched in the LRU and flagged
+ * VHD_FLAG_BM_WRITE_PENDING.
+ */
+static void
+schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
+{
+	struct vhd_request *req;
+	struct vhd_bitmap *bm;
+	u64 offset;
+
+	bm     = get_bitmap(s, blk);
+	offset = bat_entry(s, blk);
+
+	ASSERT(vhd_type_dynamic(&s->vhd));
+	ASSERT(bm && bitmap_valid(bm) &&
+	       !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+	if (offset == DD_BLK_UNUSED) {
+		/* block still being allocated: BAT entry not written yet */
+		ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
+		offset = s->bat.pbw_offset;
+	}
+
+	offset = vhd_sectors_to_bytes(offset);
+
+	req = &bm->req;
+	init_vhd_request(s, req);
+
+	req->next      = NULL;
+	req->op        = VHD_OP_BITMAP_WRITE;
+	req->treq.cb   = NULL;
+	req->treq.buf  = bm->shadow;
+	req->treq.secs = s->bm_secs;
+	req->treq.sec  = blk * s->spb;
+
+	aio_write(s, req, offset);
+	lock_bitmap(bm);
+	touch_bitmap(s, bm);     /* bump lru count */
+	set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+	DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
+	    "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
+	    req->treq.secs, offset);
+}
+
+/*
+ * Park @treq on the waiting list of its block's bitmap while that
+ * bitmap's read is still pending.  finish_bitmap_read() resubmits the
+ * waiting requests once the bitmap has been read, or fails them if the
+ * read failed.
+ *
+ * Returns 0 on success or -EBUSY when no request slot is available.
+ */
+static int
+__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
+{
+	struct vhd_request *req;
+	struct vhd_bitmap *bm;
+	u32 blk;
+
+	ASSERT(vhd_type_dynamic(&s->vhd));
+
+	blk = treq.sec / s->spb;
+	bm  = get_bitmap(s, blk);
+
+	ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+	req = alloc_vhd_request(s);
+	if (req == NULL)
+		return -EBUSY;
+
+	req->next = NULL;
+	req->op   = op;
+	req->treq = treq;
+
+	add_to_tail(&bm->waiting, req);
+	lock_bitmap(bm);
+
+	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
+	    "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);
+
+	TRACE(s);
+	return 0;
+}
+
+/*
+ * td_queue_read handler: split @treq into runs that share the same
+ * bitmap-cache state and dispatch each run.
+ *
+ *  - BAT entry clear / bit clear: the data is not in this image, so the
+ *    run is passed on with td_forward_request();
+ *  - bit set: read from this image;
+ *  - bitmap not cached / read pending: (start the bitmap read and) park
+ *    the run until the bitmap arrives.
+ *
+ * On any error the remainder of the request is completed with that error.
+ */
+static void
+vhd_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	struct vhd_state *s = (struct vhd_state *)driver->data;
+
+	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
+	    s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+	while (treq.secs) {
+		int err = 0;
+		td_request_t clone = treq;
+
+		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
+		case -EINVAL:
+			err = -EINVAL;
+			goto fail;
+
+		case VHD_BM_BAT_CLEAR:
+			/* never cross a block boundary in one run */
+			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+			td_forward_request(clone);
+			break;
+
+		case VHD_BM_BIT_CLEAR:
+			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
+			td_forward_request(clone);
+			break;
+
+		case VHD_BM_BIT_SET:
+			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
+			err = schedule_data_read(s, clone, 0);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_NOT_CACHED:
+			err = schedule_bitmap_read(s, clone.sec / s->spb);
+			if (err)
+				goto fail;
+			/* fall through: wait for the read just started */
+
+		case VHD_BM_READ_PENDING:
+			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+			err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_BAT_LOCKED:
+		default:
+			ASSERT(0);
+			break;
+		}
+
+		/* advance past the run that was just dispatched */
+		treq.buf  += vhd_sectors_to_bytes(clone.secs);
+		treq.secs -= clone.secs;
+		treq.sec  += clone.secs;
+		continue;
+
+	fail:
+		/* fail everything that has not been dispatched yet */
+		clone.secs = treq.secs;
+		td_complete_request(clone, err);
+		break;
+	}
+}
+
+/*
+ * td_queue_write handler: split @treq into runs that share the same
+ * bitmap-cache state and dispatch each run.
+ *
+ *  - BAT locked: the remainder is completed with -EBUSY and
+ *    clone.blocked set;
+ *  - BAT entry clear: write with BAT and bitmap updates requested;
+ *  - bit clear: write with a bitmap update requested;
+ *  - bit set: plain write;
+ *  - bitmap not cached / read pending: (start the bitmap read and) park
+ *    the run until the bitmap arrives.
+ *
+ * On any error the remainder of the request is completed with that error.
+ */
+static void
+vhd_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	struct vhd_state *s = (struct vhd_state *)driver->data;
+
+	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n",
+	    s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+	while (treq.secs) {
+		int err;
+		/* same type schedule_data_write() takes, so no flag bit can
+		 * be lost should vhd_flag_t be wider than 8 bits */
+		vhd_flag_t flags;
+		td_request_t clone;
+
+		err   = 0;
+		flags = 0;
+		clone = treq;
+
+		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) {
+		case -EINVAL:
+			err = -EINVAL;
+			goto fail;
+
+		case VHD_BM_BAT_LOCKED:
+			err = -EBUSY;
+			clone.blocked = 1;
+			goto fail;
+
+		case VHD_BM_BAT_CLEAR:
+			flags = (VHD_FLAG_REQ_UPDATE_BAT |
+				 VHD_FLAG_REQ_UPDATE_BITMAP);
+			/* never cross a block boundary in one run */
+			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+			err = schedule_data_write(s, clone, flags);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_BIT_CLEAR:
+			flags = VHD_FLAG_REQ_UPDATE_BITMAP;
+			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
+			err = schedule_data_write(s, clone, flags);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_BIT_SET:
+			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
+			err = schedule_data_write(s, clone, 0);
+			if (err)
+				goto fail;
+			break;
+
+		case VHD_BM_NOT_CACHED:
+			err = schedule_bitmap_read(s, clone.sec / s->spb);
+			if (err)
+				goto fail;
+			/* fall through: wait for the read just started */
+
+		case VHD_BM_READ_PENDING:
+			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
+			if (err)
+				goto fail;
+			break;
+
+		default:
+			ASSERT(0);
+			break;
+		}
+
+		/* advance past the run that was just dispatched */
+		treq.sec  += clone.secs;
+		treq.secs -= clone.secs;
+		treq.buf  += vhd_sectors_to_bytes(clone.secs);
+		continue;
+
+	fail:
+		/* fail everything that has not been dispatched yet */
+		clone.secs = treq.secs;
+		td_complete_request(clone, err);
+		break;
+	}
+}
+
+/*
+ * Complete and free every request on the singly linked @list.
+ *
+ * Each request is completed with @error if that is non-zero, otherwise
+ * with its own recorded error.  s->returned is bumped once per request.
+ * A NULL @list is a no-op.
+ */
+static inline void
+signal_completion(struct vhd_request *list, int error)
+{
+	struct vhd_request *cur, *nxt;
+	struct vhd_state *s;
+
+	if (list == NULL)
+		return;
+
+	s = list->state;
+
+	for (cur = list; cur != NULL; cur = nxt) {
+		int err = error ? error : cur->error;
+
+		/* grab the link before the request is freed */
+		nxt = cur->next;
+
+		td_complete_request(cur->treq, err);
+		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
+		    "err: %d\n", cur->treq.sec, cur->treq.sec / s->spb, err);
+		free_vhd_request(s, cur);
+
+		s->returned++;
+		TRACE(s);
+	}
+}
+
+/*
+ * Move the writes queued on @bm (those that arrived while its previous
+ * transaction was closed) into the bitmap's transaction.
+ *
+ * If the block still has no BAT entry the new transaction is marked
+ * failed with -EIO.  Writes that already finished are counted as finished
+ * and, when they succeeded, their sectors are set in the shadow bitmap.
+ * If that already completes the transaction it is finished immediately.
+ */
+static void
+start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	struct vhd_transaction *tx = &bm->tx;
+	struct vhd_request *r, *next;
+	int i;
+
+	r = bm->queue.head;
+	if (r == NULL)
+		return;
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+
+	clear_req_list(&bm->queue);
+
+	if (bat_entry(s, bm->blk) == DD_BLK_UNUSED)
+		tx->error = -EIO;
+
+	for (; r != NULL; r = next) {
+		next    = r->next;
+		r->next = NULL;
+		clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED);
+
+		add_to_transaction(tx, r);
+
+		if (!test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED))
+			continue;
+
+		/* the data write completed while it was still queued */
+		tx->finished++;
+		if (!r->error) {
+			u32 sec = r->treq.sec % s->spb;
+
+			for (i = 0; i < r->treq.secs; i++)
+				vhd_bitmap_set(&s->vhd,
+					       bm->shadow, sec + i);
+		}
+	}
+
+	/* perhaps all the queued writes already completed? */
+	if (tx->started && transaction_completed(tx))
+		finish_data_transaction(s, bm);
+}
+
+/*
+ * Release the BAT lock once the pending BAT update for @bm's block is over.
+ *
+ * Nothing happens unless the BAT is locked for @bm's block.  If the BAT
+ * request failed while the bitmap's transaction is still live, the
+ * transaction is only marked closed and the lock is kept; otherwise the
+ * BAT is unlocked and its state reinitialised.
+ */
+static void
+finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	struct vhd_transaction *tx = &bm->tx;
+
+	if (!bat_locked(s) || s->bat.pbw_blk != bm->blk)
+		return;
+
+	if (s->bat.req.error && test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE)) {
+		tx->closed = 1;
+		return;
+	}
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+	unlock_bat(s);
+	init_bat(s);
+}
+
+/*
+ * Conclude @bm's transaction with result @error (an error already
+ * recorded on the transaction takes precedence).
+ *
+ * In non-preallocating mode, if the transaction is still waiting for its
+ * BAT write, it is only attached to s->bat.req and the function returns;
+ * finish_bat_write() calls back in later.  Otherwise the shadow bitmap is
+ * rolled back on error or committed to bm->map on success (setting the
+ * batmap bit when the block became full), all requests are completed, the
+ * transaction is reset, queued writes start a new transaction, and the
+ * bitmap and BAT locks are released where possible.
+ */
+static void
+finish_bitmap_transaction(struct vhd_state *s,
+			  struct vhd_bitmap *bm, int error)
+{
+	struct vhd_transaction *tx = &bm->tx;
+	int map_size;
+
+	DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error);
+
+	if (!tx->error)
+		tx->error = error;
+
+	map_size = vhd_sectors_to_bytes(s->bm_secs);
+
+	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE) &&
+	    test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) {
+		/* still waiting for bat write */
+		ASSERT(bm->blk == s->bat.pbw_blk);
+		ASSERT(test_vhd_flag(s->bat.status,
+				     VHD_FLAG_BAT_WRITE_STARTED));
+		s->bat.req.tx = tx;
+		return;
+	}
+
+	if (!tx->error) {
+		/* complete atomic write */
+		memcpy(bm->map, bm->shadow, map_size);
+		if (!test_batmap(s, bm->blk) && bitmap_full(s, bm))
+			set_batmap(s, bm->blk);
+	} else {
+		/* undo changes to shadow */
+		memcpy(bm->shadow, bm->map, map_size);
+	}
+
+	/* transaction done; signal completions */
+	signal_completion(tx->requests.head, tx->error);
+	init_tx(tx);
+	start_new_bitmap_transaction(s, bm);
+
+	if (!bitmap_in_use(bm))
+		unlock_bitmap(bm);
+
+	finish_bat_transaction(s, bm);
+}
+
+/*
+ * All data writes of @bm's transaction have finished: close the
+ * transaction and move to the bitmap phase.
+ *
+ * On success the shadow bitmap is written to disk (the transaction ends
+ * in finish_bitmap_write()); on error the transaction is concluded right
+ * away without touching the on-disk bitmap.
+ */
+static void
+finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+	struct vhd_transaction *tx = &bm->tx;
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+
+	tx->closed = 1;
+
+	/* plain calls: ISO C forbids "return expr;" in a void function */
+	if (tx->error) {
+		finish_bitmap_transaction(s, bm, 0);
+		return;
+	}
+
+	schedule_bitmap_write(s, bm->blk);
+}
+
+/*
+ * Completion handler for VHD_OP_BAT_WRITE.
+ *
+ * On success the in-memory BAT entry for the pending block is set and
+ * s->next_db advances past it; on failure the error is recorded on the
+ * bitmap's transaction.  In preallocating mode the BAT request is a
+ * member of the transaction and is retired from it; otherwise the
+ * UPDATE_BAT flag is cleared and, if the transaction was parked waiting
+ * for this write, it is concluded now.  Finally the BAT lock is released
+ * if possible.
+ */
+static void
+finish_bat_write(struct vhd_request *req)
+{
+	struct vhd_state *s = req->state;
+	struct vhd_transaction *tx;
+	struct vhd_bitmap *bm;
+
+	s->returned++;
+	TRACE(s);
+
+	bm = get_bitmap(s, s->bat.pbw_blk);
+
+	DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n",
+	    s->bat.pbw_blk, s->bat.pbw_offset, req->error);
+	ASSERT(bm && bitmap_valid(bm));
+	ASSERT(bat_locked(s) &&
+	       test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+	tx = &bm->tx;
+	ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE));
+
+	if (req->error) {
+		tx->error = req->error;
+	} else {
+		bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset;
+		s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs;
+	}
+
+	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
+		clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
+		/* set by finish_bitmap_transaction() when it had to wait */
+		if (s->bat.req.tx)
+			finish_bitmap_transaction(s, bm, req->error);
+	} else {
+		tx->finished++;
+		remove_from_req_list(&tx->requests, req);
+		if (transaction_completed(tx))
+			finish_data_transaction(s, bm);
+	}
+
+	finish_bat_transaction(s, bm);
+}
+
+/*
+ * Completion handler for VHD_OP_ZERO_BM_WRITE.
+ *
+ * The request is retired from its transaction.  On success the BAT write
+ * is scheduled; on failure the BAT is unlocked and reset, the error is
+ * recorded on the transaction and its UPDATE_BAT flag is cleared.  If the
+ * transaction is now complete it moves on to its bitmap phase.
+ */
+static void
+finish_zero_bm_write(struct vhd_request *req)
+{
+	struct vhd_state *s = req->state;
+	struct vhd_transaction *tx = req->tx;
+	struct vhd_bitmap *bm;
+	u32 blk;
+
+	s->returned++;
+	TRACE(s);
+
+	blk = req->treq.sec / s->spb;
+	bm  = get_bitmap(s, blk);
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
+	ASSERT(bat_locked(s));
+	ASSERT(s->bat.pbw_blk == blk);
+	ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
+
+	tx->finished++;
+	remove_from_req_list(&tx->requests, req);
+
+	if (!req->error) {
+		schedule_bat_write(s);
+	} else {
+		unlock_bat(s);
+		init_bat(s);
+		tx->error = req->error;
+		clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
+	}
+
+	if (transaction_completed(tx))
+		finish_data_transaction(s, bm);
+}
+
+/*
+ * Completion handler for VHD_OP_BITMAP_READ.
+ *
+ * The bitmap's waiting list is detached and READ_PENDING is cleared.
+ * If the read failed, the bitmap is unlocked and freed and every waiting
+ * request is completed with the read error.  Otherwise the freshly read
+ * map is copied to the shadow and each waiting request is resubmitted
+ * through vhd_queue_read()/vhd_queue_write(); the bitmap is unlocked
+ * afterwards if nothing uses it any more.
+ */
+static void
+finish_bitmap_read(struct vhd_request *req)
+{
+	u32 blk;
+	struct vhd_bitmap *bm;
+	struct vhd_request *r, *next;
+	struct vhd_state *s = req->state;
+
+	s->returned++;
+	TRACE(s);
+
+	blk = req->treq.sec / s->spb;
+	bm = get_bitmap(s, blk);
+
+	DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
+	ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+	r = bm->waiting.head;
+	clear_req_list(&bm->waiting);
+	clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+	if (req->error) {
+		/* copy the error before the bitmap is freed; bitmap reads
+		 * are issued on the request embedded in the bitmap */
+		int err = req->error;
+
+		unlock_bitmap(bm);
+		free_vhd_bitmap(s, bm);
+		/* plain call: ISO C forbids "return expr;" in a void function */
+		signal_completion(r, err);
+		return;
+	}
+
+	memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs));
+
+	while (r) {
+		struct vhd_request tmp;
+
+		/* work from a copy: the request is freed before it is
+		 * resubmitted */
+		tmp = *r;
+		next = r->next;
+		free_vhd_request(s, r);
+
+		ASSERT(tmp.op == VHD_OP_DATA_READ ||
+		       tmp.op == VHD_OP_DATA_WRITE);
+
+		if (tmp.op == VHD_OP_DATA_READ)
+			vhd_queue_read(s->driver, tmp.treq);
+		else if (tmp.op == VHD_OP_DATA_WRITE)
+			vhd_queue_write(s->driver, tmp.treq);
+
+		r = next;
+	}
+
+	if (!bitmap_in_use(bm))
+		unlock_bitmap(bm);
+}
+
+/*
+ * Completion handler for VHD_OP_BITMAP_WRITE: clear the bitmap's
+ * WRITE_PENDING flag and conclude its (closed) transaction with the
+ * result of the write.
+ */
+static void
+finish_bitmap_write(struct vhd_request *req)
+{
+	u32 blk;
+	struct vhd_bitmap *bm;
+	struct vhd_transaction *tx;
+	struct vhd_state *s = req->state;
+
+	s->returned++;
+	TRACE(s);
+
+	blk = req->treq.sec / s->spb;
+	bm = get_bitmap(s, blk);
+
+	/* validate the bitmap before anything is read through it */
+	ASSERT(bm && bitmap_valid(bm));
+	tx = &bm->tx;
+
+	DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n",
+	    blk, tx->started, tx->finished);
+	ASSERT(tx->closed);
+	ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+	clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+	finish_bitmap_transaction(s, bm, req->error);
+}
+
+/*
+ * Completion handler for VHD_OP_DATA_READ: report the request back with
+ * its own recorded error and free it.
+ */
+static void
+finish_data_read(struct vhd_request *req)
+{
+	struct vhd_state *s;
+
+	s = req->state;
+
+	DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
+	    req->treq.sec, req->treq.sec / s->spb);
+
+	/* 0 = no override; signal_completion() then uses req->error */
+	signal_completion(req, 0);
+}
+
+/*
+ * Completion handler for VHD_OP_DATA_WRITE.
+ *
+ * The request is flagged FINISHED.  If it belongs to a transaction, the
+ * transaction's finished count is bumped, the written sectors are set in
+ * the shadow bitmap when the write succeeded, and the transaction moves
+ * to its bitmap phase once all of its requests are done.  A request with
+ * no transaction that is not parked on a bitmap queue is completed
+ * directly; a parked one is left for start_new_bitmap_transaction().
+ */
+static void
+finish_data_write(struct vhd_request *req)
+{
+	int i;
+	struct vhd_transaction *tx = req->tx;
+	struct vhd_state *s = (struct vhd_state *)req->state;
+
+	set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED);
+
+	if (tx) {
+		u32 blk, sec;
+		struct vhd_bitmap *bm;
+
+		blk = req->treq.sec / s->spb;
+		sec = req->treq.sec % s->spb;
+		bm = get_bitmap(s, blk);
+
+		ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
+
+		tx->finished++;
+
+		/* "0x%04" was misspelt "0x04%", which printed a literal
+		 * "0x04" followed by the unpadded block number */
+		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
+		    "tx->started: %d, tx->finished: %d\n", req->treq.sec,
+		    req->treq.sec / s->spb, tx->started, tx->finished);
+
+		if (!req->error)
+			for (i = 0; i < req->treq.secs; i++)
+				vhd_bitmap_set(&s->vhd, bm->shadow, sec + i);
+
+		if (transaction_completed(tx))
+			finish_data_transaction(s, bm);
+
+	} else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) {
+		ASSERT(!req->next);
+		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
+		    req->treq.sec, req->treq.sec / s->spb);
+		signal_completion(req, 0);
+	}
+}
+
+/*
+ * AIO completion callback for every request issued by this driver.
+ *
+ * @arg is the struct vhd_request that was submitted and @err its result.
+ * The error is stored on the request, logged when non-zero, and the
+ * request is handed to the finish_*() handler matching its op.
+ */
+void
+vhd_complete(void *arg, struct tiocb *tiocb, int err)
+{
+	struct vhd_request *req = (struct vhd_request *)arg;
+	struct iocb *io = &tiocb->iocb;
+	struct vhd_state *s = req->state;
+
+	s->completed++;
+	TRACE(s);
+
+	req->error = err;
+
+	if (req->error)
+		ERR(req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, "
+		    "nbytes: %lu, blk: %"PRIu64", blk_offset: %u",
+		    s->vhd.file, req->op, req->treq.sec, req->treq.secs,
+		    io->u.c.nbytes, req->treq.sec / s->spb,
+		    bat_entry(s, req->treq.sec / s->spb));
+
+	switch (req->op) {
+	case VHD_OP_DATA_READ:     finish_data_read(req);     break;
+	case VHD_OP_DATA_WRITE:    finish_data_write(req);    break;
+	case VHD_OP_BITMAP_READ:   finish_bitmap_read(req);   break;
+	case VHD_OP_BITMAP_WRITE:  finish_bitmap_write(req);  break;
+	case VHD_OP_ZERO_BM_WRITE: finish_zero_bm_write(req); break;
+	case VHD_OP_BAT_WRITE:     finish_bat_write(req);     break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/*
+ * td_debug handler: dump the driver's counters, every request slot that
+ * has a non-zero sector count, every cached bitmap (with the lengths of
+ * its queue, waiting and transaction lists) and the BAT update state to
+ * the log at TLOG_WARN level.
+ */
+void
+vhd_debug(td_driver_t *driver)
+{
+	struct vhd_state *s = (struct vhd_state *)driver->data;
+	int i;
+
+	DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", "
+	    "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed,
+	    s->returned);
+	DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n",
+	    s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0));
+	DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n",
+	    s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0));
+
+	DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%lu total)\n", VHD_REQS_DATA);
+	for (i = 0; i < VHD_REQS_DATA; i++) {
+		struct vhd_request *r = &s->vreq_list[i];
+		td_request_t *t = &r->treq;
+
+		if (!t->secs)
+			continue;
+
+		DBG(TLOG_WARN, "%d: id: 0x%04"PRIx64", err: %d, op: %d,"
+		    " lsec: 0x%08"PRIx64", flags: %d, this: %p, "
+		    "next: %p, tx: %p\n", i, t->id, r->error, r->op,
+		    t->sec, r->flags, r, r->next, r->tx);
+	}
+
+	DBG(TLOG_WARN, "BITMAP CACHE:\n");
+	for (i = 0; i < VHD_CACHE_SIZE; i++) {
+		struct vhd_bitmap *bm = s->bitmap[i];
+		struct vhd_transaction *tx;
+		struct vhd_request *r;
+		int qnum = 0, wnum = 0, rnum = 0;
+
+		if (bm == NULL)
+			continue;
+
+		tx = &bm->tx;
+
+		/* count the entries on each of the bitmap's lists */
+		for (r = bm->queue.head; r; r = r->next)
+			qnum++;
+		for (r = bm->waiting.head; r; r = r->next)
+			wnum++;
+		for (r = tx->requests.head; r; r = r->next)
+			rnum++;
+
+		DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, "
+		    "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, "
+		    "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n",
+		    i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head,
+		    wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error,
+		    tx->started, tx->finished, tx->status, tx->requests.head, rnum);
+	}
+
+	DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, "
+	    "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk,
+	    s->bat.pbw_offset, s->bat.req.tx);
+
+/*
+	for (i = 0; i < s->hdr.max_bat_size; i++)
+		DPRINTF("%d: %u\n", i, s->bat.bat[i]);
+*/
+}
+
+struct tap_disk tapdisk_vhd = { /* driver descriptor exposing the VHD handlers of this file */
+ .disk_type = "tapdisk_vhd",
+ .flags = 0,
+ .private_data_size = sizeof(struct vhd_state), /* the handlers cast driver->data to struct vhd_state */
+ .td_open = _vhd_open,
+ .td_close = _vhd_close,
+ .td_queue_read = vhd_queue_read, /* splits reads by bitmap-cache state */
+ .td_queue_write = vhd_queue_write, /* splits writes by bitmap-cache state */
+ .td_get_parent_id = vhd_get_parent_id,
+ .td_validate_parent = vhd_validate_parent,
+ .td_debug = vhd_debug, /* dumps counters, requests, bitmap cache and BAT state */
+};
diff --git a/tools/blktap2/drivers/bswap.h b/tools/blktap2/drivers/bswap.h
new file mode 100644
index 0000000000..45016b978b
--- /dev/null
+++ b/tools/blktap2/drivers/bswap.h
@@ -0,0 +1,214 @@
+#ifndef BSWAP_H
+#define BSWAP_H
+
+//#include "config-host.h"
+
+#include <inttypes.h>
+
+#if defined(__NetBSD__)
+#include <sys/endian.h>
+#include <sys/types.h>
+#elif defined(__OpenBSD__)
+#include <machine/endian.h>
+#define bswap_16(x) swap16(x)
+#define bswap_32(x) swap32(x)
+#define bswap_64(x) swap64(x)
+#else
+
+#ifdef HAVE_BYTESWAP_H
+#include <byteswap.h>
+#else
+
+#define bswap_16(x) /* fallback 16-bit byte swap, used when HAVE_BYTESWAP_H is not defined */ \
+({ /* GNU statement expression: x is evaluated exactly once */ \
+ uint16_t __x = (x); \
+ ((uint16_t)( \
+ (((uint16_t)(__x) & (uint16_t)0x00ffU) << 8) | \
+ (((uint16_t)(__x) & (uint16_t)0xff00U) >> 8) )); \
+})
+
+#define bswap_32(x) /* fallback 32-bit byte swap, used when HAVE_BYTESWAP_H is not defined */ \
+({ /* GNU statement expression: x is evaluated exactly once */ \
+ uint32_t __x = (x); \
+ ((uint32_t)( \
+ (((uint32_t)(__x) & (uint32_t)0x000000ffUL) << 24) | \
+ (((uint32_t)(__x) & (uint32_t)0x0000ff00UL) << 8) | \
+ (((uint32_t)(__x) & (uint32_t)0x00ff0000UL) >> 8) | \
+ (((uint32_t)(__x) & (uint32_t)0xff000000UL) >> 24) )); \
+})
+
+#define bswap_64(x) /* fallback 64-bit byte swap, used when HAVE_BYTESWAP_H is not defined */ \
+({ /* GNU statement expression: x is evaluated exactly once */ \
+ uint64_t __x = (x); \
+ ((uint64_t)( \
+ (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000000000ffULL) << 56) | \
+ (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000000000ff00ULL) << 40) | \
+ (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \
+ (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000ff000000ULL) << 8) | \
+ (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000ff00000000ULL) >> 8) | \
+ (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \
+ (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \
+ (uint64_t)(((uint64_t)(__x) & (uint64_t)0xff00000000000000ULL) >> 56) )); \
+})
+
+#endif /* !HAVE_BYTESWAP_H */
+
+static inline uint16_t bswap16(uint16_t x) /* typed wrapper: returns x with its two bytes swapped */
+{
+ return bswap_16(x);
+}
+
+static inline uint32_t bswap32(uint32_t x) /* typed wrapper: returns x with its four bytes reversed */
+{
+ return bswap_32(x);
+}
+
+static inline uint64_t bswap64(uint64_t x) /* typed wrapper: returns x with its eight bytes reversed */
+{
+ return bswap_64(x);
+}
+
+static inline void bswap16s(uint16_t *s) /* in-place variant: byte-swaps the value s points to */
+{
+ *s = bswap16(*s);
+}
+
+static inline void bswap32s(uint32_t *s) /* in-place variant: byte-swaps the value s points to */
+{
+ *s = bswap32(*s);
+}
+
+static inline void bswap64s(uint64_t *s) /* in-place variant: byte-swaps the value s points to */
+{
+ *s = bswap64(*s);
+}
+
+#endif
+
+#if defined(WORDS_BIGENDIAN) /* big-endian host: big-endian data is native, little-endian data is swapped */
+#define be_bswap(v, size) (v)
+#define le_bswap(v, size) bswap ## size(v)
+#define be_bswaps(v, size) /* expands to nothing: no in-place swap needed */
+#define le_bswaps(p, size) *p = bswap ## size(*p);
+#else /* little-endian host: the roles are reversed */
+#define le_bswap(v, size) (v)
+#define be_bswap(v, size) bswap ## size(v)
+#define le_bswaps(v, size) /* expands to nothing: no in-place swap needed */
+#define be_bswaps(p, size) *p = bswap ## size(*p);
+#endif
+
+#define CPU_CONVERT(endian, size, type) /* generates the six converters between host order and <endian> order for one integer width */\
+static inline type endian ## size ## _to_cpu(type v) /* by value: <endian> -> host */\
+{\
+ return endian ## _bswap(v, size);\
+}\
+\
+static inline type cpu_to_ ## endian ## size(type v) /* by value: host -> <endian> (same operation) */\
+{\
+ return endian ## _bswap(v, size);\
+}\
+\
+static inline void endian ## size ## _to_cpus(type *p) /* in place; body is empty when no swap is needed */\
+{\
+ endian ## _bswaps(p, size)\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## s(type *p) /* in place; body is empty when no swap is needed */\
+{\
+ endian ## _bswaps(p, size)\
+}\
+\
+static inline type endian ## size ## _to_cpup(const type *p) /* load through pointer, then convert */\
+{\
+ return endian ## size ## _to_cpu(*p);\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## w(type *p, type v) /* convert, then store through pointer */\
+{\
+ *p = cpu_to_ ## endian ## size(v);\
+}
+
+CPU_CONVERT(be, 16, uint16_t) /* be16_to_cpu, cpu_to_be16, be16_to_cpus, cpu_to_be16s, be16_to_cpup, cpu_to_be16w */
+CPU_CONVERT(be, 32, uint32_t)
+CPU_CONVERT(be, 64, uint64_t)
+
+CPU_CONVERT(le, 16, uint16_t) /* le16_to_cpu, cpu_to_le16, le16_to_cpus, cpu_to_le16s, le16_to_cpup, cpu_to_le16w */
+CPU_CONVERT(le, 32, uint32_t)
+CPU_CONVERT(le, 64, uint64_t)
+
+/* unaligned versions (optimized for frequent unaligned accesses)*/
+
+#if defined(__i386__) || defined(__powerpc__) /* on these targets the unaligned accessors simply alias the plain pointer versions */
+
+#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v)
+#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v)
+#define le16_to_cpupu(p) le16_to_cpup(p)
+#define le32_to_cpupu(p) le32_to_cpup(p)
+
+#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v)
+#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v)
+
+#else
+
+/* Store v at p as two little-endian bytes, one byte at a time, so p need
+ * not be aligned. */
+static inline void cpu_to_le16wu(uint16_t *p, uint16_t v)
+{
+	uint8_t *bytes = (uint8_t *)p;
+
+	bytes[1] = (uint8_t)(v >> 8);
+	bytes[0] = (uint8_t)(v & 0xff);
+}
+
+/* Store v at p as four little-endian bytes, one byte at a time, so p need
+ * not be aligned. */
+static inline void cpu_to_le32wu(uint32_t *p, uint32_t v)
+{
+	uint8_t *bytes = (uint8_t *)p;
+	int i;
+
+	/* least significant byte first */
+	for (i = 0; i < 4; i++)
+		bytes[i] = (uint8_t)(v >> (8 * i));
+}
+
+/* Load a little-endian 16-bit value from p one byte at a time, so p need
+ * not be aligned. */
+static inline uint16_t le16_to_cpupu(const uint16_t *p)
+{
+	const uint8_t *bytes = (const uint8_t *)p;
+	int lo = bytes[0];
+	int hi = bytes[1];
+
+	return (hi << 8) | lo;
+}
+
+/* Load a little-endian 32-bit value from p one byte at a time, so p need
+ * not be aligned. */
+static inline uint32_t le32_to_cpupu(const uint32_t *p)
+{
+	const uint8_t *p1 = (const uint8_t *)p;
+
+	/* widen each byte to uint32_t before shifting: a uint8_t promotes to
+	 * (signed) int, and shifting a byte >= 0x80 left by 24 would overflow
+	 * it, which is undefined behaviour */
+	return (uint32_t)p1[0] | ((uint32_t)p1[1] << 8) |
+	       ((uint32_t)p1[2] << 16) | ((uint32_t)p1[3] << 24);
+}
+
+/* Store v at p as two big-endian bytes, one byte at a time, so p need
+ * not be aligned. */
+static inline void cpu_to_be16wu(uint16_t *p, uint16_t v)
+{
+	uint8_t *bytes = (uint8_t *)p;
+
+	bytes[1] = (uint8_t)(v & 0xff);
+	bytes[0] = (uint8_t)(v >> 8);
+}
+
+/* Store v at p as four big-endian bytes, one byte at a time, so p need
+ * not be aligned. */
+static inline void cpu_to_be32wu(uint32_t *p, uint32_t v)
+{
+	uint8_t *bytes = (uint8_t *)p;
+	int i;
+
+	/* most significant byte first */
+	for (i = 0; i < 4; i++)
+		bytes[i] = (uint8_t)(v >> (24 - 8 * i));
+}
+
+#endif
+
+#ifdef WORDS_BIGENDIAN /* cpu_to_32wu: unaligned 32-bit store in the host's own byte order */
+#define cpu_to_32wu cpu_to_be32wu
+#else
+#define cpu_to_32wu cpu_to_le32wu
+#endif
+
+#undef le_bswap /* helper macros were only needed to build the CPU_CONVERT functions; keep them out of includers */
+#undef be_bswap
+#undef le_bswaps
+#undef be_bswaps
+
+#endif /* BSWAP_H */
diff --git a/tools/blktap2/drivers/check_gcrypt b/tools/blktap2/drivers/check_gcrypt
new file mode 100644
index 0000000000..154ba2492a
--- /dev/null
+++ b/tools/blktap2/drivers/check_gcrypt
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+cat > .gcrypt.c << EOF
+#include <gcrypt.h>
+int main(void) { return 0; }
+EOF
+
+if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then
+ echo "yes"
+else
+ echo "no"
+fi
+
+rm -f .gcrypt*
diff --git a/tools/blktap2/drivers/disktypes.h b/tools/blktap2/drivers/disktypes.h
new file mode 100644
index 0000000000..d0923f18b4
--- /dev/null
+++ b/tools/blktap2/drivers/disktypes.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DISKTYPES_H__
+#define __DISKTYPES_H__
+
+typedef struct disk_info { /* static description of one disk image type */
+ int idnum; /* DISK_TYPE_* value; -1 for the null_disk placeholder */
+ char name[50]; /* e.g. "RAMDISK" */
+ char handle[10]; /* xend handle, e.g. 'ram' */
+ int single_handler; /* is there a single controller for all */
+ /* instances of disk type? */
+#ifdef TAPDISK
+ struct tap_disk *drv; /* driver descriptor for this type; 0 in null_disk */
+#endif
+} disk_info_t;
+
+extern struct tap_disk tapdisk_aio;
+/* extern struct tap_disk tapdisk_sync; */
+/* extern struct tap_disk tapdisk_vmdk; */
+/* extern struct tap_disk tapdisk_vhdsync; */
+extern struct tap_disk tapdisk_vhd;
+extern struct tap_disk tapdisk_ram;
+ extern struct tap_disk tapdisk_qcow;
+extern struct tap_disk tapdisk_block_cache;
+extern struct tap_disk tapdisk_log;
+
+#define MAX_DISK_TYPES 20 /* presumably an upper bound on type ids; dtypes[] below has 10 entries - TODO confirm against users */
+
+#define DISK_TYPE_AIO 0 /* each DISK_TYPE_* value is the type's index in dtypes[] */
+#define DISK_TYPE_SYNC 1 /* driver commented out; dtypes[1] is null_disk */
+#define DISK_TYPE_VMDK 2 /* driver commented out; dtypes[2] is null_disk */
+#define DISK_TYPE_VHDSYNC 3 /* driver commented out; dtypes[3] is null_disk */
+#define DISK_TYPE_VHD 4
+#define DISK_TYPE_RAM 5
+#define DISK_TYPE_QCOW 6
+#define DISK_TYPE_BLOCK_CACHE 7
+#define DISK_TYPE_LOG 9 /* 8 is unassigned; dtypes[8] is null_disk */
+
+/*Define Individual Disk Parameters here */
+/* Placeholder entry for dtypes[] slots that have no driver. */
+static disk_info_t null_disk = {
+	.idnum          = -1,
+	.name           = "null disk",
+	.handle         = "null",
+	.single_handler = 0,
+#ifdef TAPDISK
+	.drv            = 0,
+#endif
+};
+
+/* Raw image accessed through the aio driver. */
+static disk_info_t aio_disk = {
+	.idnum          = DISK_TYPE_AIO,
+	.name           = "raw image (aio)",
+	.handle         = "aio",
+	.single_handler = 0,
+#ifdef TAPDISK
+	.drv            = &tapdisk_aio,
+#endif
+};
+/*
+static disk_info_t sync_disk = {
+ DISK_TYPE_SYNC,
+ "raw image (sync)",
+ "sync",
+ 0,
+#ifdef TAPDISK
+ &tapdisk_sync,
+#endif
+};
+
+static disk_info_t vmdk_disk = {
+ DISK_TYPE_VMDK,
+ "vmware image (vmdk)",
+ "vmdk",
+ 1,
+#ifdef TAPDISK
+ &tapdisk_vmdk,
+#endif
+};
+
+static disk_info_t vhdsync_disk = {
+ DISK_TYPE_VHDSYNC,
+ "virtual server image (vhd) - synchronous",
+ "vhdsync",
+ 1,
+#ifdef TAPDISK
+ &tapdisk_vhdsync,
+#endif
+};
+*/
+
+/* VHD image handled by the tapdisk_vhd driver. */
+static disk_info_t vhd_disk = {
+	.idnum          = DISK_TYPE_VHD,
+	.name           = "virtual server image (vhd)",
+	.handle         = "vhd",
+	.single_handler = 0,
+#ifdef TAPDISK
+	.drv            = &tapdisk_vhd,
+#endif
+};
+
+
+/* RAM disk image; single_handler is set for this type. */
+static disk_info_t ram_disk = {
+	.idnum          = DISK_TYPE_RAM,
+	.name           = "ramdisk image (ram)",
+	.handle         = "ram",
+	.single_handler = 1,
+#ifdef TAPDISK
+	.drv            = &tapdisk_ram,
+#endif
+};
+
+
+/* QCOW image handled by the tapdisk_qcow driver. */
+static disk_info_t qcow_disk = {
+	.idnum          = DISK_TYPE_QCOW,
+	.name           = "qcow disk (qcow)",
+	.handle         = "qcow",
+	.single_handler = 0,
+#ifdef TAPDISK
+	.drv            = &tapdisk_qcow,
+#endif
+};
+
+
+/* Block cache image; single_handler is set for this type. */
+static disk_info_t block_cache_disk = {
+	.idnum          = DISK_TYPE_BLOCK_CACHE,
+	.name           = "block cache image (bc)",
+	.handle         = "bc",
+	.single_handler = 1,
+#ifdef TAPDISK
+	.drv            = &tapdisk_block_cache,
+#endif
+};
+
+/* Write logger handled by the tapdisk_log driver. */
+static disk_info_t log_disk = {
+	.idnum          = DISK_TYPE_LOG,
+	.name           = "write logger (log)",
+	.handle         = "log",
+	.single_handler = 0,
+#ifdef TAPDISK
+	.drv            = &tapdisk_log,
+#endif
+};
+
+/*
+ * Main disk info array, indexed by DISK_TYPE_* id.  Ids whose driver is
+ * not built, and the unassigned id 8, point at null_disk.
+ */
+static disk_info_t *dtypes[] = {
+	[DISK_TYPE_AIO]         = &aio_disk,
+	[DISK_TYPE_SYNC]        = &null_disk, /* &sync_disk, */
+	[DISK_TYPE_VMDK]        = &null_disk, /* &vmdk_disk, */
+	[DISK_TYPE_VHDSYNC]     = &null_disk, /* &vhdsync_disk, */
+	[DISK_TYPE_VHD]         = &vhd_disk,
+	[DISK_TYPE_RAM]         = &ram_disk,
+	[DISK_TYPE_QCOW]        = &qcow_disk,
+	[DISK_TYPE_BLOCK_CACHE] = &block_cache_disk,
+	[8]                     = &null_disk,
+	[DISK_TYPE_LOG]         = &log_disk,
+};
+
+#endif
diff --git a/tools/blktap2/drivers/img2qcow.c b/tools/blktap2/drivers/img2qcow.c
new file mode 100644
index 0000000000..b12509ddd8
--- /dev/null
+++ b/tools/blktap2/drivers/img2qcow.c
@@ -0,0 +1,318 @@
+/* img2qcow.c
+ *
+ * Generates a qcow format disk and fills it from an existing image.
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+#include <openssl/md5.h>
+
+#include "bswap.h"
+#include "aes.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "qcow.h"
+#include "blk.h"
+
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+
+#define TAPDISK 1
+#define BLOCK_PROCESSSZ 4096
+#define QCOW_VBD 0
+#define PROGRESS_QUANT 2
+
+/* Main-loop state: 'running' keeps the copy loop alive, 'complete' is set
+ * once the source image has been fully read (or a read came up short). */
+static int running = 1, complete = 0;
+/* Write requests handed to the qcow driver vs. completions seen so far. */
+static int returned_events = 0, submit_events = 0;
+static uint32_t read_idx = 0;
+/* qcow driver instance and the VBD it belongs to; both are set up in main(). */
+td_driver_t *ddqcow;
+td_vbd_t* qcow_vbd;
+/* 'prev' counts the progress-bar cells already drawn into 'output'. */
+static uint64_t prev = 0, written = 0;
+/* Progress-bar text: '[', one cell per PROGRESS_QUANT percent, ']' and NUL. */
+static char output[(100/PROGRESS_QUANT) + 5];
+
+extern tapdisk_server_t server;
+
+
+/*
+ * Hex-dump 'length' bytes starting at 'ptr' to the debug stream.
+ *
+ * Each byte is printed as two hex digits so the dump is unambiguous; a space
+ * follows every second byte and a newline follows every sixteenth byte.
+ */
+static void print_bytes(void *ptr, int length)
+{
+	const unsigned char *p = ptr;
+	int k;
+
+	DFPRINTF("Buf dump, length %d:\n", length);
+	for (k = 0; k < length; k++) {
+		DFPRINTF("%02x", p[k]);
+		/* Group on the number of bytes printed so far, not the index. */
+		if ((k + 1) % 16 == 0)
+			DFPRINTF("\n");
+		else if ((k + 1) % 2 == 0)
+			DFPRINTF(" ");
+	}
+	DFPRINTF("\n");
+}
+
+/*
+ * Advance the textual progress bar by at most one cell.
+ *
+ * progress: number of sectors copied so far.
+ * size:     total number of sectors in the source image.
+ *
+ * The bar has 100/PROGRESS_QUANT cells.  A new cell is drawn, and the line
+ * re-printed, whenever 'progress' has moved past the cells drawn so far.
+ */
+static void debug_output(uint64_t progress, uint64_t size)
+{
+	const uint64_t cells = 100 / PROGRESS_QUANT;
+	uint64_t blocks = size / cells;
+
+	/* Images smaller than 'cells' sectors would otherwise divide by zero. */
+	if (blocks == 0)
+		blocks = 1;
+
+	/*
+	 * Stop at the last cell: when size is not a multiple of 'cells' the
+	 * quotient can exceed 'cells', and drawing further would overwrite the
+	 * closing ']' and eventually run off the end of output[].
+	 */
+	if (prev < cells && progress / blocks > prev) {
+		memcpy(output + prev + 1, "=>", 2);
+		prev++;
+		DFPRINTF("\r%s %"PRIi64"%%",
+			 output, (int64_t)((prev-1)*PROGRESS_QUANT));
+	}
+}
+
+/*
+ * Fill in the size (in sectors) and sector size of the image open on 'fd'.
+ *
+ * Block devices are queried through blk_getimagesize()/blk_getsectorsize();
+ * anything else is sized from fstat() and given DEFAULT_SECTOR_SIZE.
+ *
+ * Returns 0 on success, -EINVAL if the image cannot be stat'ed or sized.
+ */
+static int get_image_info(td_disk_info_t *driver, int fd)
+{
+	struct stat st;
+	uint64_t sector_size = DEFAULT_SECTOR_SIZE;
+
+	if (fstat(fd, &st) != 0) {
+		DFPRINTF("ERROR: fstat failed, Couldn't stat image");
+		return -EINVAL;
+	}
+
+	/*
+	 * Start from the default so the field is never left uninitialised,
+	 * even when the sector-size query below fails.
+	 */
+	driver->sector_size = DEFAULT_SECTOR_SIZE;
+
+	if (S_ISBLK(st.st_mode)) {
+		/*Accessing block device directly*/
+		if (blk_getimagesize(fd, &driver->size) != 0)
+			return -EINVAL;
+
+		DFPRINTF("Image size: \n\tpre sector_shift [%"PRIu64"]\n\tpost "
+			 "sector_shift [%"PRIu64"]\n",
+			 (uint64_t)(driver->size << SECTOR_SHIFT),
+			 (uint64_t)driver->size);
+
+		/*Get the sector size*/
+		if (!blk_getsectorsize(fd, &sector_size))
+			driver->sector_size = sector_size;
+	} else {
+		/*Local file? size it from fstat instead*/
+		driver->size = (st.st_size >> SECTOR_SHIFT);
+		DFPRINTF("Image size: [%"PRIu64"]\n",
+			 (uint64_t)driver->size);
+	}
+
+	return 0;
+}
+
+/*
+ * Completion callback for the write requests queued by main().
+ *
+ * treq: the completed request; treq.buf is the buffer main() allocated for it.
+ * err:  0 on success, negative errno-style value on failure.
+ *
+ * The buffer is released in both cases.  A failed write is fatal: main()
+ * waits until returned_events catches up with submit_events, so returning
+ * without counting the event (as this used to do) left the tool waiting
+ * forever, and counting it would report a corrupt image as complete.
+ */
+void send_responses(td_request_t treq, int err)
+{
+	free(treq.buf);
+
+	if (err < 0) {
+		DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+		exit(-1);
+	}
+
+	returned_events++;
+}
+
+/*
+ * img2qcow <QCOW FILENAME> <SRC IMAGE>
+ *
+ * Creates a qcow image sized like the source image, opens it through the
+ * tapdisk qcow driver and copies the source into it in BLOCK_PROCESSSZ
+ * chunks, waiting for each write to complete before issuing the next.
+ *
+ * Returns 0 on success; exits with -1 (or returns a negative error from the
+ * tapdisk setup calls) on failure.
+ */
+int main(int argc, const char *argv[])
+{
+	int ret = -1, fd, len, err;
+	uint64_t i;
+	char *buf;
+	td_request_t treq;
+	td_disk_info_t info;
+	td_vbd_request_t* vreq;
+
+	if (argc != 3) {
+		fprintf(stderr, "Qcow-utils: v1.0.0\n");
+		fprintf(stderr, "usage: %s <QCOW FILENAME> <SRC IMAGE>\n",
+			argv[0]);
+		exit(-1);
+	}
+
+	/*Open image*/
+	fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+	if (fd == -1) {
+		DFPRINTF("Unable to open [%s], (err %d)!\n",argv[2],0 - errno);
+		exit(-1);
+	}
+
+	/* 'info' is uninitialised until this succeeds; do not go on without it. */
+	if (get_image_info(&info, fd) != 0) {
+		DFPRINTF("Unable to determine size of [%s]\n", argv[2]);
+		exit(-1);
+	}
+
+	/*Create qcow file*/
+	ret = qcow_create(argv[1],info.size<<SECTOR_SHIFT,NULL,0);
+	if (ret < 0) {
+		DFPRINTF("Unable to create QCOW file\n");
+		exit(-1);
+	} else DFPRINTF("Qcow file created: size %"PRIu64" sectors\n",
+			(uint64_t)info.size);
+
+	/* Open Qcow image*/
+	err = tapdisk_server_initialize(NULL, NULL);
+	if( err ) {
+		DFPRINTF("img2qcow Couldn't initialize server instance.\n");
+		return err;
+	}
+
+	err=tapdisk_vbd_initialize(-1,-1, QCOW_VBD);
+	if( err ) {
+		DFPRINTF("img2qcow Couldn't initialize qcow vbd.\n");
+		return err;
+	}
+
+	qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD);
+	if (!qcow_vbd) {
+		err = -ENODEV;
+		DFPRINTF("img2qcow Couldn't create qcow vbd.\n");
+		return err;
+	}
+
+	err = tapdisk_vbd_open_vdi(qcow_vbd, argv[1], DISK_TYPE_QCOW,
+				   TAPDISK_STORAGE_TYPE_DEFAULT,
+				   0);
+	if( err ) {
+		DFPRINTF("img2qcow Couldn't open qcow file.\n");
+		return err;
+	}
+
+	ddqcow=(tapdisk_vbd_first_image(qcow_vbd))->driver;
+
+	/*Initialise the output string*/
+	memset(output,0x20,(100/PROGRESS_QUANT)+5);
+	output[0] = '[';
+	output[(100/PROGRESS_QUANT)+2] = ']';
+	output[(100/PROGRESS_QUANT)+3] = '\0';
+	DFPRINTF("%s",output);
+
+	i = 0;
+	while (running) {
+
+		if (!complete) {
+			/*Read sector from image*/
+			if (lseek(fd, i*512, SEEK_SET) == (off_t)-1) {
+				DFPRINTF("Unable to access file offset %"PRIu64"\n",
+					 (uint64_t)i*512);
+				exit(-1);
+			}
+
+			if( (ret = posix_memalign((void **)&buf,
+						  BLOCK_PROCESSSZ,
+						  BLOCK_PROCESSSZ)) != 0) {
+				DFPRINTF("Unable to read memalign buf (%d)\n",ret);
+				exit(-1);
+			}
+
+			/*We attempt to read 4k sized blocks*/
+			len = read(fd, buf, BLOCK_PROCESSSZ);
+			if (len < 512) {
+				DFPRINTF("Unable to read sector %"PRIu64"\n",
+					 (uint64_t) (i));
+				/* No request will own this buffer; release it. */
+				free(buf);
+				complete = 1;
+				continue;
+			}
+
+			/* Only whole sectors are written. */
+			len = (len >> 9);
+
+			vreq = calloc(1, sizeof(td_vbd_request_t));
+			if (!vreq) {
+				DFPRINTF("Unable to allocate request\n");
+				free(buf);
+				exit(-1);
+			}
+
+			treq.op = TD_OP_WRITE;
+			treq.buf = buf;
+			treq.sec = i;
+			treq.secs = len;
+			treq.image = 0;
+			treq.cb = send_responses;
+			treq.cb_data = buf;
+			treq.id = 0;
+			treq.sidx = 0;
+			treq.private = vreq;
+
+			vreq->submitting = 1;
+			INIT_LIST_HEAD(&vreq->next);
+			tapdisk_vbd_move_request(treq.private,
+						 &qcow_vbd->pending_requests);
+
+			ddqcow->ops->td_queue_write(ddqcow,treq);
+			--vreq->submitting;
+
+			submit_events++;
+
+			i += len;
+
+			if (i == info.size)
+				complete = 1;
+
+			tapdisk_submit_all_tiocbs(&server.aio_queue);
+			debug_output(i,info.size);
+		}
+
+		/* Wait until every submitted write has been acknowledged. */
+		while(returned_events != submit_events) {
+			ret = scheduler_wait_for_events(&server.scheduler);
+			if (ret < 0) {
+				DFPRINTF("server wait returned %d\n", ret);
+				sleep(2);
+			}
+		}
+
+		if (complete && (returned_events == submit_events))
+			running = 0;
+	}
+	memcpy(output+prev+1,"=",1);
+	DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output);
+
+	ddqcow->ops->td_close(ddqcow);
+	free(ddqcow->data);
+	close(fd);
+
+	return 0;
+}
diff --git a/tools/blktap2/drivers/io-optimize.c b/tools/blktap2/drivers/io-optimize.c
new file mode 100644
index 0000000000..5d397652e5
--- /dev/null
+++ b/tools/blktap2/drivers/io-optimize.c
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "io-optimize.h"
+#include "tapdisk-log.h"
+
+#if (!defined(TEST) && defined(DEBUG))
+#define DBG(ctx, f, a...) tlog_write(TLOG_DBG, f, ##a)
+#elif defined(TEST)
+#define DBG(ctx, f, a...) printf(f, ##a)
+#else
+#define DBG(ctx, f, a...) ((void)0)
+#endif
+
+static void print_merged_iocbs(struct opioctx *ctx,
+ struct iocb **iocbs, int num_iocbs);
+
+/*
+ * Release all memory owned by 'ctx' (the context itself is not freed).
+ *
+ * The pointers and counters are cleared afterwards so that calling this
+ * again -- for example after opio_init() has already cleaned up on its own
+ * failure path -- is harmless instead of a double free.
+ */
+void
+opio_free(struct opioctx *ctx)
+{
+	free(ctx->opios);
+	free(ctx->free_opios);
+	free(ctx->iocb_queue);
+	free(ctx->event_queue);
+
+	ctx->opios         = NULL;
+	ctx->free_opios    = NULL;
+	ctx->iocb_queue    = NULL;
+	ctx->event_queue   = NULL;
+	ctx->num_opios     = 0;
+	ctx->free_opio_cnt = 0;
+}
+
+/*
+ * Prepare 'ctx' to track up to 'num_iocbs' merged iocbs.
+ *
+ * Allocates the opio pool, the free-pointer stack and the two scratch queues
+ * used by io_merge()/io_expand_iocbs() and io_split(), then marks every opio
+ * as free.
+ *
+ * Returns 0 on success or -ENOMEM if any allocation fails, in which case
+ * everything allocated so far has been released.
+ */
+int
+opio_init(struct opioctx *ctx, int num_iocbs)
+{
+	int i;
+
+	memset(ctx, 0, sizeof(struct opioctx));
+
+	ctx->num_opios     = num_iocbs;
+	ctx->free_opio_cnt = num_iocbs;
+
+	/*
+	 * Pass the element count and size separately so calloc() can reject a
+	 * product that would overflow instead of under-allocating.
+	 */
+	ctx->opios       = calloc(num_iocbs, sizeof(struct opio));
+	ctx->free_opios  = calloc(num_iocbs, sizeof(struct opio *));
+	ctx->iocb_queue  = calloc(num_iocbs, sizeof(struct iocb *));
+	ctx->event_queue = calloc(num_iocbs, sizeof(struct io_event));
+
+	if (!ctx->opios || !ctx->free_opios ||
+	    !ctx->iocb_queue || !ctx->event_queue)
+		goto fail;
+
+	for (i = 0; i < num_iocbs; i++)
+		ctx->free_opios[i] = &ctx->opios[i];
+
+	return 0;
+
+ fail:
+	opio_free(ctx);
+	return -ENOMEM;
+}
+
+/* Pop one opio off the free stack; returns NULL when none are left. */
+static inline struct opio *
+alloc_opio(struct opioctx *ctx)
+{
+	struct opio *op = NULL;
+
+	if (ctx->free_opio_cnt > 0) {
+		ctx->free_opio_cnt--;
+		op = ctx->free_opios[ctx->free_opio_cnt];
+	}
+
+	return op;
+}
+
+/* Wipe 'op' and push it back onto the free stack of 'ctx'. */
+static inline void
+free_opio(struct opioctx *ctx, struct opio *op)
+{
+	int slot = ctx->free_opio_cnt;
+
+	memset(op, 0, sizeof(*op));
+	ctx->free_opios[slot] = op;
+	ctx->free_opio_cnt = slot + 1;
+}
+
+/*
+ * Put back the caller's data pointer, buffer and byte count that were saved
+ * in 'op' when its iocb was taken over for merging.
+ */
+static inline void
+restore_iocb(struct opio *op)
+{
+	struct iocb *orig = op->iocb;
+
+	orig->u.c.nbytes = op->nbytes;
+	orig->u.c.buf    = op->buf;
+	orig->data       = op->data;
+}
+
+/*
+ * Return 1 if io->data points into the opio pool of 'ctx' (i.e. the iocb has
+ * been taken over by the merger), 0 otherwise.
+ */
+static inline int
+iocb_optimized(struct opioctx *ctx, struct iocb *io)
+{
+	unsigned long addr  = (unsigned long)io->data;
+	unsigned long first = (unsigned long)ctx->opios;
+	unsigned long limit = first + (ctx->num_opios * sizeof(struct opio));
+
+	if (addr < first)
+		return 0;
+
+	return (addr < limit);
+}
+
+/* True when 'r' starts at the file offset where 'l' ends. */
+static inline int
+contiguous_sectors(struct iocb *l, struct iocb *r)
+{
+	return (r->u.c.offset == l->u.c.offset + l->u.c.nbytes);
+}
+
+/* True when the buffer of 'r' starts at the address where that of 'l' ends. */
+static inline int
+contiguous_buffers(struct iocb *l, struct iocb *r)
+{
+	return (r->u.c.buf == l->u.c.buf + l->u.c.nbytes);
+}
+
+/*
+ * True when 'r' can be appended to 'l': same file descriptor, adjacent file
+ * offsets and adjacent memory buffers.
+ */
+static inline int
+contiguous_iocbs(struct iocb *l, struct iocb *r)
+{
+	if (l->aio_fildes != r->aio_fildes)
+		return 0;
+
+	if (!contiguous_sectors(l, r))
+		return 0;
+
+	return contiguous_buffers(l, r);
+}
+
+/* Start a one-element merge list: 'op' is both its head and its tail. */
+static inline void
+init_opio_list(struct opio *op)
+{
+	op->list.tail = op;
+	op->list.head = op;
+}
+
+/*
+ * Take over 'io' for merging: allocate an opio, save the iocb's original
+ * buffer, length, offset and data pointer in it, and point io->data at the
+ * opio.  Returns the new opio, or NULL if the pool is exhausted.
+ */
+static struct opio *
+opio_iocb_init(struct opioctx *ctx, struct iocb *io)
+{
+	struct opio *fresh = alloc_opio(ctx);
+
+	if (fresh == NULL)
+		return NULL;
+
+	fresh->iocb   = io;
+	fresh->data   = io->data;
+	fresh->offset = io->u.c.offset;
+	fresh->nbytes = io->u.c.nbytes;
+	fresh->buf    = io->u.c.buf;
+
+	/* Only now that the original value is saved may io->data be replaced. */
+	io->data = fresh;
+
+	init_opio_list(fresh);
+
+	return fresh;
+}
+
+/*
+ * Return the opio tracking 'io', creating one if the iocb has not been taken
+ * over yet.  Returns NULL if a new opio is needed but none is free.
+ */
+static inline struct opio *
+opio_get(struct opioctx *ctx, struct iocb *io)
+{
+	return iocb_optimized(ctx, io)
+		? (struct opio *)io->data
+		: opio_iocb_init(ctx, io);
+}
+
+/*
+ * Append 'io' to the merge list headed by 'head' and grow the head iocb's
+ * byte count to cover it.  Returns 0 on success, -ENOMEM if no opio is free.
+ */
+static int
+merge_tail(struct opioctx *ctx, struct iocb *head, struct iocb *io)
+{
+	struct opio *lead, *added;
+
+	lead = opio_get(ctx, head);
+	if (lead == NULL)
+		return -ENOMEM;
+
+	added = opio_get(ctx, io);
+	if (added == NULL)
+		return -ENOMEM;
+
+	added->head = lead;
+
+	/* Link behind the current tail, then make the new op the tail. */
+	lead->list.tail->next = added;
+	lead->list.tail       = added;
+
+	head->u.c.nbytes += io->u.c.nbytes;
+
+	return 0;
+}
+
+/*
+ * Try to fold 'io' into 'head'.  Returns 0 if merged, -EINVAL if the two
+ * differ in opcode or are not contiguous, -ENOMEM if no opio is free.
+ */
+static int
+merge(struct opioctx *ctx, struct iocb *head, struct iocb *io)
+{
+	if (head->aio_lio_opcode != io->aio_lio_opcode ||
+	    !contiguous_iocbs(head, io))
+		return -EINVAL;
+
+	return merge_tail(ctx, head, io);
+}
+
+/*
+ * Merge runs of contiguous iocbs in 'queue' (length 'num') in place.
+ *
+ * Each iocb is offered to the most recent entry kept in the queue; if it
+ * cannot be merged it becomes the next kept entry.  Returns the number of
+ * entries left at the front of 'queue'.
+ */
+int
+io_merge(struct opioctx *ctx, struct iocb **queue, int num)
+{
+	int pos, last;
+	struct iocb **snapshot;
+
+	if (num == 0)
+		return 0;
+
+	/* Work from a copy: 'queue' is overwritten while it is compacted. */
+	snapshot = ctx->iocb_queue;
+	memcpy(snapshot, queue, num * sizeof(struct iocb *));
+
+	last = 0;
+	pos  = 1;
+	while (pos < num) {
+		struct iocb *cand = snapshot[pos++];
+
+		if (merge(ctx, queue[last], cand) != 0) {
+			last++;
+			queue[last] = cand;
+		}
+	}
+
+#if (defined(TEST) || defined(DEBUG))
+	print_merged_iocbs(ctx, queue, last + 1);
+#endif
+
+	return last + 1;
+}
+
+/*
+ * Undo the merge headed by 'io': restore every iocb on its list, write them
+ * to 'queue' in list order and release their opios.  Returns how many iocbs
+ * were written.
+ */
+static int
+expand_iocb(struct opioctx *ctx, struct iocb **queue, struct iocb *io)
+{
+	int count = 0;
+	struct opio *cur, *following;
+
+	for (cur = (struct opio *)io->data; cur != NULL; cur = following) {
+		/* free_opio() zeroes the opio, so fetch the link first. */
+		following = cur->next;
+
+		restore_iocb(cur);
+		queue[count] = cur->iocb;
+		count++;
+		free_opio(ctx, cur);
+	}
+
+	return count;
+}
+
+/*
+ * Rebuild 'queue' from its entries idx..num-1, replacing every merged iocb
+ * with the original iocbs it stands for.  The result is written from the
+ * start of 'queue'; returns the number of entries written.
+ */
+int
+io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num)
+{
+	int pos, out;
+	struct iocb **snapshot;
+
+	if (num == 0)
+		return 0;
+
+	/* Work from a copy: 'queue' is overwritten while it is rebuilt. */
+	snapshot = ctx->iocb_queue;
+	memcpy(snapshot, queue, num * sizeof(struct iocb *));
+
+	out = 0;
+	for (pos = idx; pos < num; pos++) {
+		struct iocb *cur = snapshot[pos];
+
+		if (iocb_optimized(ctx, cur))
+			out += expand_iocb(ctx, queue + out, cur);
+		else
+			queue[out++] = cur;
+	}
+
+	return out;
+}
+
+/*
+ * Split the completion 'event' of a merged iocb into one event per original
+ * iocb, written to 'queue' starting at index 'idx'.
+ *
+ * If the merged request transferred exactly the requested byte count, each
+ * original iocb is reported with its own byte count.  Otherwise every one of
+ * them is reported with the same error: the negative result if there was
+ * one, else -EIO.  Returns the index following the last event written.
+ */
+static int
+expand_event(struct opioctx *ctx,
+	     struct io_event *event, struct io_event *queue, int idx)
+{
+	int status;
+	struct io_event *slot;
+	struct iocb *merged = event->obj;
+	struct opio *cur = (struct opio *)merged->data;
+	struct opio *following;
+
+	status = -EIO;
+	if (event->res == merged->u.c.nbytes)
+		status = 0;
+	else if ((int)event->res < 0)
+		status = (int)event->res;
+
+	for (; cur != NULL; cur = following) {
+		/* free_opio() zeroes the opio, so fetch the link first. */
+		following = cur->next;
+
+		slot = &queue[idx];
+		idx++;
+		slot->obj = cur->iocb;
+		slot->res = (status ? status : cur->nbytes);
+
+		restore_iocb(cur);
+		free_opio(ctx, cur);
+	}
+
+	return idx;
+}
+
+/*
+ * Rewrite 'events' (length 'num') in place so that every completion of a
+ * merged iocb is replaced by one completion per original iocb.  Returns the
+ * resulting number of events.
+ */
+int
+io_split(struct opioctx *ctx, struct io_event *events, int num)
+{
+	int pos, out;
+	struct io_event *snapshot;
+
+	if (num == 0)
+		return 0;
+
+	/* Work from a copy: 'events' is overwritten while it is rebuilt. */
+	snapshot = ctx->event_queue;
+	memcpy(snapshot, events, num * sizeof(struct io_event));
+
+	out = 0;
+	for (pos = 0; pos < num; pos++) {
+		struct io_event *cur = &snapshot[pos];
+		struct iocb *io = cur->obj;
+
+		if (iocb_optimized(ctx, io))
+			out = expand_event(ctx, cur, events, out);
+		else
+			events[out++] = *cur;
+	}
+
+	return out;
+}
+
+/******************************************************************************
+debug print functions
+******************************************************************************/
+/*
+ * Emit one debug line describing 'io', prefixed with 'prefix'.  Anything
+ * that is not a PREAD is labelled "write".
+ */
+static inline void
+__print_iocb(struct opioctx *ctx, struct iocb *io, char *prefix)
+{
+	char *type = "write";
+
+	if (io->aio_lio_opcode == IO_CMD_PREAD)
+		type = "read";
+
+	DBG(ctx, "%soff: %08llx, nbytes: %04lx, buf: %p, type: %s, data: %08lx,"
+	    " optimized: %d\n", prefix, io->u.c.offset, io->u.c.nbytes,
+	    io->u.c.buf, type, (unsigned long)io->data,
+	    iocb_optimized(ctx, io));
+}
+
+/* Empty prefix so print_iocb() can reuse __print_iocb() without a label. */
+static char *null_prefix = "";
+#define print_iocb(ctx, io) __print_iocb(ctx, io, null_prefix)
+
+/* Debug-print the first 'num_iocbs' entries of 'iocbs', one per line. */
+static void
+print_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
+{
+	int i = 0;
+	char pref[10];
+
+	DBG(ctx, "iocbs:\n");
+	while (i < num_iocbs) {
+		snprintf(pref, sizeof(pref), "%d: ", i);
+		__print_iocb(ctx, iocbs[i], pref);
+		i++;
+	}
+}
+
+/*
+ * Debug-print the iocbs on the list starting at 'op', numbering them from
+ * *cnt and advancing *cnt past each one printed.
+ */
+static void
+print_optimized_iocbs(struct opioctx *ctx, struct opio *op, int *cnt)
+{
+	char pref[10];
+
+	for (; op != NULL; op = op->next) {
+		snprintf(pref, sizeof(pref), " %d: ", *cnt);
+		(*cnt)++;
+		__print_iocb(ctx, op->iocb, pref);
+	}
+}
+
+/*
+ * Debug-print a merged queue: each queued iocb, followed (indented) by the
+ * iocbs that were folded into it.  Numbering runs over all of them.
+ */
+static void
+print_merged_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
+{
+	int i, cnt = 0;
+	char pref[10];
+
+	DBG(ctx, "merged iocbs:\n");
+	for (i = 0; i < num_iocbs; i++) {
+		struct iocb *io = iocbs[i];
+
+		snprintf(pref, sizeof(pref), "%d: ", cnt);
+		cnt++;
+		__print_iocb(ctx, io, pref);
+
+		if (!iocb_optimized(ctx, io))
+			continue;
+
+		/* The head itself was printed above; list what follows it. */
+		print_optimized_iocbs(ctx, ((struct opio *)io->data)->next,
+				      &cnt);
+	}
+}
+
+/* Debug-print the iocb attached to each of the first 'num_events' events. */
+static void
+print_events(struct opioctx *ctx, struct io_event *events, int num_events)
+{
+	int i = 0;
+
+	while (i < num_events) {
+		print_iocb(ctx, events[i].obj);
+		i++;
+	}
+}
+/******************************************************************************
+end debug print functions
+******************************************************************************/
+
+#if defined(TEST)
+
+/*
+ * Test-only tagging of iocb->data.  make_data() packs the iocb's index into
+ * the low bits and sets hmask when the iocb is the first of a generated run
+ * and smask when the run uses a separately allocated buffer per iocb.  The
+ * data_*() macros read those pieces back.
+ */
+#define hmask 0x80000000UL
+#define smask 0x40000000UL
+#define make_data(idx, is_head, sparse) \
+ (void *)((idx) | ((is_head) ? hmask : 0) | ((sparse) ? smask : 0))
+#define data_idx(data) (int)((unsigned long)(data) & (0x0fffffff))
+#define data_is_head(data) (((unsigned long)(data) & hmask) ? 1 : 0)
+#define data_is_sparse(data) (((unsigned long)(data) & smask) ? 1 : 0)
+
+/* Print the command-line synopsis to stderr and exit with status -1. */
+static void
+usage(void)
+{
+	fputs("usage: io_optimize [-n num_runs] "
+	      "[-i num_iocbs] [-s num_secs] [-r random_seed]\n", stderr);
+	exit(-1);
+}
+
+/* Allocation/free tallies, compared after each test run to detect leaks. */
+static int xalloc_cnt, xfree_cnt;
+
+/* malloc() wrapper that counts successes and exits with ENOMEM on failure. */
+static inline char *
+xalloc(int size)
+{
+	char *buf = malloc(size);
+
+	if (buf != NULL) {
+		xalloc_cnt++;
+		return buf;
+	}
+
+	fprintf(stderr, "xalloc failed\n");
+	exit(ENOMEM);
+}
+
+/* free() wrapper that counts every call. */
+static inline void
+xfree(void *buf)
+{
+	xfree_cnt++;
+	free(buf);
+}
+
+/*
+ * Fill 'iocbs' with random test requests.
+ *
+ * The array is populated in runs.  Every iocb in a run shares one opcode and
+ * has consecutive file offsets.  A run is either a single request of 1-7
+ * sectors or 1-10 requests of 4096 bytes each, truncated so the runs never
+ * exceed 'num_iocbs'.  In a "sparse" run each iocb gets its own buffer;
+ * otherwise the run shares one buffer and each iocb points 'nbytes' further
+ * into it.  io->data is tagged with make_data() so process_events() can
+ * check indices and free the right buffers.
+ *
+ * The order of the random() calls determines the generated sequence for a
+ * given seed, so statements here must not be reordered.
+ */
+static void
+randomize_iocbs(struct iocb **iocbs, int num_iocbs, int num_secs)
+{
+ int i, j;
+
+ i = 0;
+ while (i < num_iocbs) {
+ char *buf;
+ short type;
+ int segs, sparse_mem;
+ uint64_t offset, nbytes;
+
+ type = (random() % 10 < 5 ? IO_CMD_PREAD : IO_CMD_PWRITE);
+ /* starting byte offset of the run: a random sector times 512 */
+ offset = ((random() % num_secs) << 9);
+
+ if (random() % 10 < 4) {
+ segs = 1;
+ nbytes = (((random() % 7) + 1) << 9);
+ } else {
+ segs = (random() % 10) + 1;
+ nbytes = 4096;
+ }
+
+ /* clip the run so it does not go past the end of the array */
+ if (i + segs > num_iocbs)
+ segs = (num_iocbs - i);
+
+ sparse_mem = (random() % 10 < 2 ? 1 : 0);
+
+ if (sparse_mem)
+ buf = xalloc(nbytes);
+ else
+ buf = xalloc(segs * nbytes);
+
+ for (j = 0; j < segs; j++) {
+ struct iocb *io = iocbs[i + j];
+ io->aio_lio_opcode = type;
+ io->u.c.nbytes = nbytes;
+ io->u.c.offset = offset;
+ io->u.c.buf = buf;
+ offset += nbytes;
+
+ io->data = make_data(i + j, (j == 0), sparse_mem);
+
+ /* sparse: fresh buffer for the next iocb; else step through the shared one */
+ if (j + 1 < segs && sparse_mem)
+ buf = xalloc(nbytes);
+ else
+ buf += nbytes;
+ }
+
+ i += segs;
+ }
+}
+
+/*
+ * Pretend that a random number of the first 'num_iocbs' requests completed.
+ *
+ * For each completed request an event is filled in whose result is the full
+ * byte count in 8 cases out of 10 and 0 otherwise.  Returns the number of
+ * events produced (at least 1 when num_iocbs >= 1).
+ */
+static int
+simulate_io(struct iocb **iocbs, struct io_event *events, int num_iocbs)
+{
+	int i;
+	int done = num_iocbs;
+
+	if (num_iocbs > 1)
+		done = (random() % (num_iocbs - 1)) + 1;
+
+	for (i = 0; i < done; i++) {
+		struct iocb *io = iocbs[i];
+
+		events[i].obj = io;
+		events[i].res = (random() % 10 < 8 ? io->u.c.nbytes : 0);
+	}
+
+	return done;
+}
+
+/*
+ * Consume 'num' split events: check that each iocb's data tag was restored
+ * to the index it was given by randomize_iocbs(), free the buffers that the
+ * iocb owns (run heads and sparse iocbs), and clear the iocb.
+ *
+ * Exits with status -1 if a tag does not match the iocb's position in
+ * 'iocb_list'.
+ */
+static inline void
+process_events(struct opioctx *ctx,
+	       struct iocb *iocb_list, struct io_event *events, int num)
+{
+	int i;
+	struct iocb *io;
+
+	for (i = 0; i < num; i++) {
+		io = events[i].obj;
+		print_iocb(ctx, io);
+		if (data_idx(io->data) != (io - iocb_list)) {
+			/* The pointer difference is a ptrdiff_t; cast it for %d. */
+			printf("corrupt data! data_idx = %d, io = %d\n",
+			       data_idx(io->data), (int)(io - iocb_list));
+			exit(-1);
+		}
+		if (data_is_head(io->data) || data_is_sparse(io->data))
+			xfree(io->u.c.buf);
+		memset(io, 0, sizeof(struct iocb));
+	}
+}
+
+/*
+ * Reset the test arrays for a new run: zero the iocbs and events and make
+ * iocbs[i] point at iocb_list[i].
+ */
+static inline void
+init_optest(struct iocb *iocb_list,
+	    struct iocb **iocbs, struct io_event *events, int num)
+{
+	int i = 0;
+
+	memset(events, 0, num * sizeof(struct io_event));
+	memset(iocb_list, 0, num * sizeof(struct iocb));
+
+	while (i < num) {
+		iocbs[i] = iocb_list + i;
+		i++;
+	}
+}
+
+/*
+ * Self-test driver (built only with TEST defined).
+ *
+ * Options: -n runs, -i iocbs per run, -s disk size in sectors, -r seed.
+ * Each run generates random iocbs, merges them, simulates completions in
+ * random batches, splits the completions again and checks that every
+ * original iocb comes back intact and every buffer is freed exactly once.
+ */
+int
+main(int argc, char **argv)
+{
+	uint64_t num_secs;
+	struct opioctx ctx;
+	struct io_event *events;
+	int i, c, num_runs, num_iocbs, seed;
+	struct iocb *iocb_list, **iocbs, **ioqueue;
+
+	num_runs  = 1;
+	num_iocbs = 300;
+	seed      = time(NULL);
+	num_secs  = ((4ULL << 20) >> 9); /* 4 MiB in 512-byte sectors */
+
+	while ((c = getopt(argc, argv, "n:i:s:r:h")) != -1) {
+		switch (c) {
+		case 'n':
+			num_runs = atoi(optarg);
+			break;
+		case 'i':
+			num_iocbs = atoi(optarg);
+			break;
+		case 's':
+			num_secs = strtoull(optarg, NULL, 10);
+			break;
+		case 'r':
+			seed = atoi(optarg);
+			break;
+		case 'h':
+			usage();	/* does not return */
+			break;
+		case '?':
+			fprintf(stderr, "Unrecognized option: -%c\n", optopt);
+			usage();
+			break;
+		}
+	}
+
+	/*
+	 * The generators take the sector count as an int and use it, and the
+	 * iocb count, as divisors and sizes: reject values that cannot work.
+	 */
+	if (num_iocbs <= 0 || (int)num_secs <= 0)
+		usage();
+
+	printf("Running %d tests with %d iocbs on %" PRIu64
+	       " sectors, seed = %d\n",
+	       num_runs, num_iocbs, num_secs, seed);
+
+	/* The test data comes from random(), which is seeded by srandom(). */
+	srandom(seed);
+
+	iocb_list = malloc(num_iocbs * sizeof(struct iocb));
+	iocbs     = malloc(num_iocbs * sizeof(struct iocb *));
+	events    = malloc(num_iocbs * sizeof(struct io_event));
+
+	if (!iocb_list || !iocbs || !events || opio_init(&ctx, num_iocbs)) {
+		fprintf(stderr, "initialization failed\n");
+		exit(ENOMEM);
+	}
+
+	for (i = 0; i < num_runs; i++) {
+		int op_rem, op_done, num_split, num_events, num_done;
+
+		ioqueue = iocbs;
+		init_optest(iocb_list, ioqueue, events, num_iocbs);
+		randomize_iocbs(ioqueue, num_iocbs, num_secs);
+		print_iocbs(&ctx, ioqueue, num_iocbs);
+
+		op_done  = 0;
+		num_done = 0;
+		op_rem   = io_merge(&ctx, ioqueue, num_iocbs);
+		print_iocbs(&ctx, ioqueue, op_rem);
+		print_merged_iocbs(&ctx, ioqueue, op_rem);
+
+		/* Complete the merged queue in random batches until done. */
+		while (num_done < num_iocbs) {
+			DBG(&ctx, "optimized remaining: %d\n", op_rem);
+
+			DBG(&ctx, "simulating\n");
+			num_events = simulate_io(ioqueue + op_done, events, op_rem);
+			print_events(&ctx, events, num_events);
+
+			DBG(&ctx, "splitting %d\n", num_events);
+			num_split = io_split(&ctx, events, num_events);
+			print_events(&ctx, events, num_split);
+
+			DBG(&ctx, "processing %d\n", num_split);
+			process_events(&ctx, iocb_list, events, num_split);
+
+			op_rem   -= num_events;
+			op_done  += num_events;
+			num_done += num_split;
+		}
+
+		DBG(&ctx, "run %d: processed: %d, xallocs: %d, xfrees: %d\n",
+		    i, num_done, xalloc_cnt, xfree_cnt);
+		if (xalloc_cnt != xfree_cnt)
+			exit(-1);
+		xalloc_cnt = xfree_cnt = 0;
+	}
+
+	free(iocbs);
+	free(events);
+	free(iocb_list);
+	opio_free(&ctx);
+
+	return 0;
+}
+#endif
diff --git a/tools/blktap2/drivers/io-optimize.h b/tools/blktap2/drivers/io-optimize.h
new file mode 100644
index 0000000000..9a0d86b6a9
--- /dev/null
+++ b/tools/blktap2/drivers/io-optimize.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __IO_OPTIMIZE_H__
+#define __IO_OPTIMIZE_H__
+
+#include <libaio.h>
+
+struct opio;
+
+/* First and last element of a chain of merged requests. */
+struct opio_list {
+ struct opio *head;
+ struct opio *tail;
+};
+
+/*
+ * Bookkeeping for one iocb that takes part in a merge.  While the iocb is
+ * merged its 'data' pointer refers to this structure and the fields below
+ * hold the values needed to restore it afterwards.
+ */
+struct opio {
+ char *buf; /* original iocb buffer */
+ unsigned long nbytes; /* original iocb byte count */
+ long long offset; /* original iocb file offset */
+ void *data; /* original iocb data pointer */
+ struct iocb *iocb; /* the iocb this entry tracks */
+ struct io_event event; /* NOTE(review): not referenced in io-optimize.c */
+ struct opio *head; /* entry of the iocb this one was merged into */
+ struct opio *next; /* next entry merged into the same head */
+ struct opio_list list; /* chain bounds; the tail is kept on the head entry */
+};
+
+/* State of one merger instance; set up by opio_init(), torn down by opio_free(). */
+struct opioctx {
+ int num_opios; /* size of the opios pool */
+ int free_opio_cnt; /* number of entries currently on the free stack */
+ struct opio *opios; /* pool of num_opios entries */
+ struct opio **free_opios; /* stack of pointers to unused pool entries */
+ struct iocb **iocb_queue; /* scratch copy used by io_merge/io_expand_iocbs */
+ struct io_event *event_queue; /* scratch copy used by io_split */
+};
+
+/* Allocate state for up to num_iocbs iocbs; returns 0 or -ENOMEM. */
+int opio_init(struct opioctx *ctx, int num_iocbs);
+/* Release the memory allocated by opio_init(). */
+void opio_free(struct opioctx *ctx);
+/* Merge contiguous iocbs in queue[0..num) in place; returns the new length. */
+int io_merge(struct opioctx *ctx, struct iocb **queue, int num);
+/* Expand completions of merged iocbs in events[0..num) in place; returns the
+ * new length. */
+int io_split(struct opioctx *ctx, struct io_event *events, int num);
+/* Replace merged iocbs in queue[idx..num) by their originals, writing the
+ * result from queue[0]; returns the new length. */
+int io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num);
+
+#endif
diff --git a/tools/blktap2/drivers/lock.c b/tools/blktap2/drivers/lock.c
new file mode 100644
index 0000000000..107c4b609b
--- /dev/null
+++ b/tools/blktap2/drivers/lock.c
@@ -0,0 +1,1000 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This module implements a "dot locking" style advisory file locking algorithm.
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <dirent.h>
+#include <limits.h>
+#include "lock.h"
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+/* format: xenlk.hostname.uuid.<xf><rw>*/
+#define LF_POSTFIX ".xenlk"
+#define LFXL_FORMAT LF_POSTFIX ".%s.%s.x%s"
+#define LFFL_FORMAT LF_POSTFIX ".%s.%s.f%s"
+#define RETRY_MAX 16
+
+#if defined(LOGS)
+#define LOG(format, args...) printf("%d: ", __LINE__); printf(format, ## args)
+#else
+#define LOG(format, args...)
+#endif
+
+/* random wait - up to .5 seconds */
+#define XSLEEP usleep(random() & 0x7ffff)
+
+typedef int (*eval_func)(char *name, int readonly);
+
+/*
+ * Build the name of the exclusive lock file for fn_to_lock, i.e. the
+ * target name with LF_POSTFIX appended.
+ *
+ * Returns a malloc()ed string that the caller must free(), or 0 when the
+ * allocation fails.
+ */
+static char *create_lockfn(char *fn_to_lock)
+{
+	size_t base_len = strlen(fn_to_lock);
+	size_t postfix_len = strlen(LF_POSTFIX);
+	char *result = malloc(base_len + postfix_len + 1);
+
+	if (unlikely(!result))
+		return 0;
+
+	/* target name first, then the postfix including its terminating NUL */
+	memcpy(result, fn_to_lock, base_len);
+	memcpy(result + base_len, LF_POSTFIX, postfix_len + 1);
+
+	return result;
+}
+
+/*
+ * Build the name of a per-owner lock file for fn_to_lock.
+ *
+ * format is one of LFXL_FORMAT / LFFL_FORMAT and is expanded with the local
+ * hostname, uuid and "r" (readonly) or "w", then appended to fn_to_lock.
+ *
+ * Returns a malloc()ed string that the caller must free(), or 0 if the
+ * hostname cannot be obtained or the allocation fails.
+ */
+static char *create_lockfn_link(char *fn_to_lock, char *format,
+				char *uuid, int readonly)
+{
+	char hostname[128];
+	char *lockfn_link;
+	size_t base_len;
+	size_t size;
+
+	/* get hostname */
+	if (unlikely(gethostname(hostname, sizeof(hostname)) == -1)) {
+		return 0;
+	}
+	/* a truncated name is not guaranteed to be NUL-terminated */
+	hostname[sizeof(hostname) - 1] = '\0';
+
+	/*
+	 * allocate string to hold constructed lock file link; the extra 8
+	 * bytes cover the separators, the two-character type suffix, the
+	 * r/w flag and the terminating NUL of the formats defined above
+	 */
+	base_len = strlen(fn_to_lock);
+	size = base_len + strlen(LF_POSTFIX) +
+		strlen(hostname) + strlen(uuid) + 8;
+	lockfn_link = malloc(size);
+	if (unlikely(!lockfn_link)) {
+		return 0;
+	}
+
+	/* construct lock file link with specific format, never writing
+	   past the allocation even for an unexpected format string */
+	memcpy(lockfn_link, fn_to_lock, base_len);
+	snprintf(lockfn_link + base_len, size - base_len, format,
+		 hostname, uuid, readonly ? "r" : "w");
+
+	return lockfn_link;
+}
+
+/*
+ * Obtain the current time as seen by the file system holding fn.
+ *
+ * A temporary file "<fn>.xenNNNNNNNN.tmp" is created next to fn, lstat()ed
+ * into *statnow and removed again, so that the caller can compare
+ * statnow->st_mtime with the mtime of lock files on the same file system.
+ *
+ * Returns LOCK_OK on success, otherwise LOCK_ENOMEM, LOCK_EOPEN or
+ * LOCK_ESTAT.  *reterrno receives the errno of the failing open()/lstat(),
+ * or 0.
+ */
+static int NFSnormalizedStatTime(char *fn, struct stat *statnow, int *reterrno)
+{
+	int result = LOCK_OK;
+	int uniq;
+	char *buf;
+	int fd;
+	int pid = (int)getpid();
+	int clstat;
+
+	*reterrno = 0;
+
+	/* create file to normalize time */
+	srandom((int)time(0) ^ pid);
+	uniq = random() % 0xffffff;
+	buf = malloc(strlen(fn) + 24);
+	if (unlikely(!buf)) { result = LOCK_ENOMEM; goto finish; }
+
+	strcpy(buf, fn);
+	sprintf(buf + strlen(buf), ".xen%08d.tmp", uniq);
+
+	fd = open(buf, O_WRONLY | O_CREAT, 0644);
+	if (fd == -1) { *reterrno = errno; result = LOCK_EOPEN; goto finish; }
+	clstat = close(fd);
+	if (unlikely(clstat == -1)) {
+		LOG("fail on close\n");
+	}
+	if (lstat(buf, statnow) == -1) {
+		/* record errno before unlink() gets a chance to change it */
+		*reterrno = errno;
+		unlink(buf);
+		result = LOCK_ESTAT;
+		goto finish;
+	}
+	unlink(buf);
+
+finish:
+	/* buf is NULL on the allocation-failure path; free(NULL) is a no-op */
+	free(buf);
+	return result;
+}
+
+/*
+ * eval_func: a lock file whose name ends in 'w' is a writer lock and blocks
+ * every request.  readonly is ignored.  An empty name never matches.
+ */
+static int writer_eval(char *name, int readonly)
+{
+	size_t len = strlen(name);
+
+	(void)readonly;
+	return len > 0 && name[len - 1] == 'w';
+}
+
+/*
+ * eval_func: a lock file whose name ends in 'r' is a reader lock; it blocks
+ * only a write request (readonly == 0).  An empty name never matches.
+ */
+static int reader_eval(char *name, int readonly)
+{
+	size_t len = strlen(name);
+
+	return len > 0 && name[len - 1] == 'r' && !readonly;
+}
+
+/*
+ * Scan the directory containing lockfn for lock files of other owners on
+ * the same target and decide whether one of them blocks the request.
+ *
+ *   fn          - target file being locked
+ *   lockfn      - exclusive lock file name (see create_lockfn)
+ *   lockfn_link - the caller's own final lock file; never counted
+ *   force       - when set, every other lock file found is unlinked and
+ *                 *stole is set instead of evaluating it
+ *   readonly    - passed through to eval
+ *   eval        - predicate deciding whether a found lock file blocks us
+ *   elt         - out: lease time parsed from the first lock file read
+ *                 (reset to 0 when a lock is stolen)
+ *   ioerror     - out: errno-style code when the scan itself failed
+ *
+ * Only entries named "<fn basename>" LF_POSTFIX ".<...>" are considered, so
+ * unrelated files that merely share the target's name prefix are neither
+ * treated as locks nor removed when forcing.
+ *
+ * Returns 1 when the lock must be treated as held (a blocking lock file
+ * exists, or *ioerror is set), 0 otherwise.
+ */
+static int lock_holder(char *fn, char *lockfn, char *lockfn_link,
+		       int force, int readonly, int *stole, eval_func eval,
+		       int *elt, int *ioerror)
+{
+	int status = 0;
+	int ustat;
+	DIR *pd = 0;
+	struct dirent *dptr;
+	char *ptr;
+	/* +2: room for "." or "/" plus NUL whatever the length of lockfn */
+	char *dirname = malloc(strlen(lockfn) + 2);
+	char *uname = 0;
+	char *fn_base, *lockfn_base, *link_base;
+	size_t fn_base_len, dirlen;
+	int elt_established = 0;
+	int fd;
+	char tmpbuf[4096];
+
+	*stole = 0;
+	*ioerror = 0;
+	*elt = 0;
+
+	if (!dirname) {
+		/* fail closed: without a scan we cannot claim the lock is free */
+		*ioerror = ENOMEM;
+		goto finish;
+	}
+
+	/* get directory */
+	ptr = strrchr(lockfn, '/');
+	if (!ptr) {
+		strcpy(dirname, ".");
+	} else if (ptr == lockfn) {
+		/* file lives directly in the root directory */
+		strcpy(dirname, "/");
+	} else {
+		size_t numbytes = ptr - lockfn;
+		memcpy(dirname, lockfn, numbytes);
+		dirname[numbytes] = '\0';
+	}
+	dirlen = strlen(dirname);
+
+	/* base names never change during the scan, compute them once */
+	ptr = strrchr(fn, '/');
+	fn_base = ptr ? ptr + 1 : fn;
+	ptr = strrchr(lockfn, '/');
+	lockfn_base = ptr ? ptr + 1 : lockfn;
+	ptr = strrchr(lockfn_link, '/');
+	link_base = ptr ? ptr + 1 : lockfn_link;
+	fn_base_len = strlen(fn_base);
+
+	pd = opendir(dirname);
+	if (!pd) {
+		*ioerror = errno ? errno : EIO;
+		goto finish;
+	}
+
+	/*
+	 * scan through directory entries and use eval function
+	 * if we have a match (i.e. reader or writer lock) but
+	 * note that if we are forcing, we will remove any and
+	 * all locks that appear for target of our lock, regardless
+	 * of whether a reader or a writer owns the lock.
+	 */
+	errno = 0;
+	dptr = readdir(pd);
+	if (!dptr) {
+		*ioerror = EIO;
+	}
+	while (dptr) {
+		if (strcmp(dptr->d_name, fn_base) &&
+		    strcmp(dptr->d_name, lockfn_base) &&
+		    strcmp(dptr->d_name, link_base) &&
+		    !strncmp(dptr->d_name, fn_base, fn_base_len) &&
+		    !strncmp(dptr->d_name + fn_base_len, LF_POSTFIX ".",
+			     strlen(LF_POSTFIX ".")))  {
+			/* size the path for this entry; names of other
+			   owners may be longer than our own link name */
+			free(uname);
+			uname = malloc(dirlen + strlen(dptr->d_name) + 2);
+			if (!uname) {
+				*ioerror = ENOMEM;
+				goto finish;
+			}
+			strcpy(uname, dirname);
+			strcat(uname, "/");
+			strcat(uname, dptr->d_name);
+			if (!elt_established) {
+				/* read final lock file and extract lease time */
+				fd = open(uname, O_RDONLY, 0644);
+				if (fd == -1) {
+					*ioerror = errno ? errno : EIO;
+					status = 1;
+					goto finish;
+				}
+				memset(tmpbuf, 0, sizeof(tmpbuf));
+				/* leave the last byte zero so tmpbuf stays a string */
+				if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+					*ioerror = errno ? errno : EIO;
+					status = 1;
+					close(fd);
+					goto finish;
+				}
+				close(fd);
+				ptr = strrchr(tmpbuf, '.');
+				if (ptr) {
+					*elt = atoi(ptr+1);
+					elt_established = 1;
+				}
+			}
+			if (force) {
+				ustat = unlink(uname);
+				if (ustat == -1) {
+					LOG("failed to unlink %s\n", uname);
+				}
+				*stole = 1;
+				*elt = 0;
+			} else {
+				if ((*eval)(dptr->d_name, readonly)) {
+					status = 1;
+					goto finish;
+				}
+			}
+		}
+		/* readdir() reports errors only through errno, so clear any
+		   stale value left behind by the calls above */
+		errno = 0;
+		dptr = readdir(pd);
+		if (!dptr && errno) {
+			*ioerror = EIO;
+		}
+	}
+
+finish:
+	if (pd) {
+		closedir(pd);
+	}
+	free(dirname);
+	free(uname);
+
+	/* if IO error, force a taken status */
+	return (*ioerror) ? 1 : status;
+}
+
+/*
+ * Take, or reassert, a reader or writer lock on fn_to_lock for owner uuid.
+ *
+ * Protocol implemented below:
+ *   1. create the exclusive lock file "<fn>.xenlk" with O_EXCL (or, if it
+ *      already names us as owner, reuse it), and write our link name in it;
+ *   2. hard-link it to "<fn>.xenlk.<host>.<uuid>.x<r|w>" and compare the
+ *      inode numbers of both names to confirm that we own it;
+ *   3. while holding it, scan for final locks of other owners
+ *      (lock_holder) and, if none blocks us, create or refresh our final
+ *      lock "<fn>.xenlk.<host>.<uuid>.f<r|w>" containing its own name
+ *      followed by ".<lease_time>";
+ *   4. remove the exclusive lock file.
+ *
+ *   force       - non-zero: remove locks held by others, then wait one
+ *                 lease period before returning
+ *   readonly    - non-zero for a reader lock, zero for a writer lock
+ *   lease_time  - in: requested lease in seconds; out: established lease,
+ *                 or -1 on error
+ *   retstatus   - out: LOCK_OK (0) for a new lock, 1 when an existing lock
+ *                 of ours was reasserted, or a negative lock_error
+ *
+ * Returns 0 or an errno value describing a system-level failure.
+ */
+int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstatus)
+{
+	char *lockfn = 0;
+	char *lockfn_xlink = 0;
+	char *lockfn_flink = 0;
+	char *buf = 0;
+	int fd;
+	int status = 0;
+	struct stat stat1, stat2;
+	int retry_attempts = 0;
+	int clstat;
+	int tmpstat;
+	int stealx = 0;
+	int stealw = 0;
+	int stealr = 0;
+	int established_lease_time = 0;
+	char tmpbuf[4096];
+	int ioerr;
+
+	if (!fn_to_lock || !uuid) {
+		*retstatus = LOCK_EBADPARM;
+		return EINVAL;
+	}
+
+	*retstatus = 0;
+
+	/* seed random with time/pid combo */
+	srandom((int)time(0) ^ getpid());
+
+	/* build lock file strings */
+	lockfn = create_lockfn(fn_to_lock);
+	if (unlikely(!lockfn)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+	lockfn_xlink = create_lockfn_link(fn_to_lock, LFXL_FORMAT,
+					  uuid, readonly);
+	if (unlikely(!lockfn_xlink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+	lockfn_flink = create_lockfn_link(fn_to_lock, LFFL_FORMAT, uuid,
+					  readonly);
+	if (unlikely(!lockfn_flink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+try_again:
+	if (retry_attempts++ > RETRY_MAX) {
+		if (*retstatus == LOCK_EXLOCK_OPEN) {
+			struct stat statnow, stat_exlock;
+			int diff;
+
+			/* the exclusive lock never became free: if it is older
+			   than the default lease, assume its owner died, remove
+			   it and start over */
+			if (lstat(lockfn, &stat_exlock) == -1) {
+				goto finish;
+			}
+
+			if (NFSnormalizedStatTime(fn_to_lock, &statnow, &ioerr)) {
+				goto finish;
+			}
+
+			diff = (int)statnow.st_mtime - (int)stat_exlock.st_mtime;
+			if (diff > DEFAULT_LEASE_TIME_SECS) {
+				unlink(lockfn);
+				retry_attempts = 0;
+				goto try_again;
+			}
+		}
+		goto finish;
+	}
+
+	/* try to open exclusive lockfile */
+	fd = open(lockfn, O_WRONLY | O_CREAT | O_EXCL, 0644);
+	if (fd == -1) {
+		LOG("Initial lockfile creation failed %s force=%d, errno=%d\n",
+		    lockfn, force, errno);
+		if (errno == EIO) {
+			*retstatus = LOCK_EXLOCK_OPEN;
+			status = EIO;
+			goto finish;
+		}
+		/* already owned? (hostname & uuid match, skip time bits) */
+		errno = 0;
+		fd = open(lockfn, O_RDWR, 0644);
+		if (fd != -1) {
+			buf = malloc(strlen(lockfn_xlink)+1);
+			if (!buf) {
+				clstat = close(fd);
+				if (unlikely(clstat == -1)) {
+					LOG("fail on close\n");
+				}
+				*retstatus = LOCK_ENOMEM;
+				status = ENOMEM;
+				goto finish;
+			}
+			if (read(fd, buf, strlen(lockfn_xlink)) !=
+			    (strlen(lockfn_xlink))) {
+				clstat = close(fd);
+				if (unlikely(clstat == -1)) {
+					LOG("fail on close\n");
+				}
+				free(buf);
+				goto force_lock;
+			}
+			/* the final r/w character is deliberately not compared */
+			if (!strncmp(buf, lockfn_xlink, strlen(lockfn_xlink)-1)) {
+				LOG("lock owned by us, reasserting\n");
+				/* our lock, reassert by rewriting below */
+				if (lseek(fd, 0, SEEK_SET) == -1) {
+					clstat = close(fd);
+					if (unlikely(clstat == -1)) {
+						LOG("fail on close\n");
+					}
+					/* do not leak the comparison buffer */
+					free(buf);
+					goto force_lock;
+				}
+				free(buf);
+				goto skip;
+			}
+			free(buf);
+			clstat = close(fd);
+			if (unlikely(clstat == -1)) {
+				LOG("fail on close\n");
+			}
+		}
+force_lock:
+		if (errno == EIO) {
+			*retstatus = LOCK_EXLOCK_OPEN;
+			status = EIO;
+			goto finish;
+		}
+		if (force) {
+			/* remove lock file, we are forcing lock, try again */
+			status = unlink(lockfn);
+			if (unlikely(status == -1)) {
+				if (errno == EIO) {
+					*retstatus = LOCK_EXLOCK_OPEN;
+					status = EIO;
+					goto finish;
+				}
+				LOG("force removal of %s lockfile failed, "
+				    "errno=%d, trying again\n", lockfn, errno);
+			}
+			stealx = 1;
+		}
+		XSLEEP;
+		*retstatus = LOCK_EXLOCK_OPEN;
+		goto try_again;
+	}
+
+	LOG("lockfile created %s\n", lockfn);
+
+skip:
+	/*
+	 * write into the temporary xlock
+	 */
+	if (write(fd, lockfn_xlink, strlen(lockfn_xlink)) !=
+	    strlen(lockfn_xlink)) {
+		if (errno == EIO) {
+			*retstatus = LOCK_EXLOCK_WRITE;
+			status = EIO;
+			/* do not leak the descriptor on the way out */
+			clstat = close(fd);
+			if (unlikely(clstat == -1)) {
+				LOG("fail on close\n");
+			}
+			goto finish;
+		}
+		status = errno;
+		clstat = close(fd);
+		if (unlikely(clstat == -1)) {
+			LOG("fail on close\n");
+		}
+		XSLEEP;
+		*retstatus = LOCK_EXLOCK_WRITE;
+		if (unlink(lockfn) == -1) {
+			LOG("removal of %s lockfile failed, "
+			    "errno=%d, trying again\n", lockfn, errno);
+		}
+		goto try_again;
+	}
+	clstat = close(fd);
+	if (unlikely(clstat == -1)) {
+		LOG("fail on close\n");
+	}
+
+	while (retry_attempts++ < RETRY_MAX) {
+		tmpstat = link(lockfn, lockfn_xlink);
+		LOG("linking %s and %s\n", lockfn, lockfn_xlink);
+		if ((tmpstat == -1) && (errno != EEXIST)) {
+			LOG("link status is %d, errno=%d\n", tmpstat, errno);
+		}
+
+		if ((lstat(lockfn, &stat1) == -1) ||
+		    (lstat(lockfn_xlink, &stat2) == -1)) {
+			int stat_errno = errno;
+
+			/* give up, cleanup first */
+			tmpstat = unlink(lockfn);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing lock file %s", lockfn);
+			}
+			tmpstat = unlink(lockfn_xlink);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing linked lock file %s",
+				    lockfn_xlink);
+			}
+			XSLEEP;
+			/*
+			 * Report the failure through *retstatus: leaving it
+			 * at 0 would make the code below believe that the
+			 * exclusive lock is held.  The return value carries
+			 * the errno of the failed lstat().
+			 */
+			*retstatus = LOCK_ESTAT;
+			status = stat_errno;
+			goto finish;
+		}
+
+		/* compare inodes */
+		if (stat1.st_ino == stat2.st_ino) {
+			/* success, inodes are the same */
+			/* should we check that st_nlink's are also 2?? */
+			*retstatus = LOCK_OK;
+			status = 0;
+			tmpstat = unlink(lockfn_xlink);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing linked lock file %s",
+				    lockfn_xlink);
+			}
+			goto finish;
+		} else {
+			status = errno;
+			/* try again, cleanup first */
+			tmpstat = unlink(lockfn);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing lock file %s", lockfn);
+			}
+			tmpstat = unlink(lockfn_xlink);
+			if (unlikely(tmpstat == -1)) {
+				LOG("error removing linked lock file %s",
+				    lockfn_xlink);
+			}
+			XSLEEP;
+			*retstatus = LOCK_EINODE;
+			goto try_again;
+		}
+	}
+
+finish:
+	if (!*retstatus) {
+
+		/* we have exclusive lock */
+
+		status = 0;
+
+		/* fast check, see if we own a final lock and are reasserting */
+		if (!lstat(lockfn_flink, &stat1)) {
+			char *ptr;
+
+			/* set the return value to notice this is a reassert */
+			*retstatus = 1;
+
+			/* read existing lock file and extract
+			   established lease time */
+			fd = open(lockfn_flink, O_RDONLY, 0644);
+			memset(tmpbuf, 0, sizeof(tmpbuf));
+			if (fd != -1) {
+				/* keep the last byte zero so tmpbuf stays a
+				   string for strrchr() below */
+				if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+					if (errno == EIO) {
+						close(fd);
+						*retstatus = LOCK_EINODE;
+						status = EIO;
+						goto skip_scan;
+					}
+				}
+				close(fd);
+			}
+			ptr = strrchr(tmpbuf, '.');
+			if (ptr) {
+				*lease_time = atoi(ptr+1);
+			} else {
+				*lease_time = 10; /* wkchack */
+			}
+			goto skip_scan;
+		} else {
+			if (errno == EIO) {
+				*retstatus = LOCK_EINODE;
+				status = EIO;
+				goto skip_scan;
+			}
+		}
+
+		/* we allow exclusive writer, or multiple readers */
+		if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force,
+				readonly, &stealw, writer_eval,
+				&established_lease_time, &ioerr)) {
+			if (ioerr) {
+				*retstatus = LOCK_EREAD;
+				status = ioerr;
+				goto skip_scan;
+			}
+			*retstatus = LOCK_EHELD_WR;
+		} else if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force,
+				       readonly, &stealr, reader_eval,
+				       &established_lease_time, &ioerr)) {
+			if (ioerr) {
+				*retstatus = LOCK_EREAD;
+				status = ioerr;
+				goto skip_scan;
+			}
+			*retstatus = LOCK_EHELD_RD;
+		}
+		if (established_lease_time) *lease_time =
+						    established_lease_time;
+	}
+
+skip_scan:
+	if (*retstatus >= 0) {
+		/* update file, changes last modify time */
+		fd = open(lockfn_flink, O_WRONLY | O_CREAT, 0644);
+		if (fd == -1) {
+			*retstatus = LOCK_EOPEN;
+			status = errno;
+		} else {
+			char leasebuf[32];
+			int failed_write;
+			memset(leasebuf, 0, sizeof(leasebuf));
+			sprintf(leasebuf, ".%d", *lease_time);
+			failed_write = write(fd, lockfn_flink,
+					     strlen(lockfn_flink)) !=
+				strlen(lockfn_flink);
+			if (failed_write) status = errno;
+			failed_write |= write(fd, leasebuf, strlen(leasebuf)) !=
+				strlen(leasebuf);
+			if (failed_write) status = errno;
+			if (failed_write) {
+				clstat = close(fd);
+				if (unlikely(clstat == -1)) {
+					LOG("fail on close\n");
+				}
+				XSLEEP;
+				*retstatus = LOCK_EUPDATE;
+				goto try_again;
+			}
+			/* only close a descriptor that was really opened */
+			clstat = close(fd);
+			if (unlikely(clstat == -1)) {
+				LOG("fail on close\n");
+			}
+		}
+	}
+
+	if (!*retstatus && force && (stealx || stealw || stealr)) {
+		struct timeval timeout;
+
+		/* enforce quiet time on steal */
+		timeout.tv_sec = *lease_time;
+		timeout.tv_usec = 0;
+		select(0, 0, 0, 0, &timeout);
+	}
+
+	/*
+	 * remove exclusive lock, final read/write locks will hold
+	 * NOTE(review): this also runs when the exclusive lock was never
+	 * obtained, in which case the file removed may belong to another
+	 * owner -- confirm that this is intended.
+	 */
+	if (lockfn) {
+		tmpstat = unlink(lockfn);
+		if (unlikely(tmpstat == -1)) {
+			LOG("error removing exclusive lock file %s",
+			    lockfn);
+		}
+	}
+
+	free(lockfn);
+	free(lockfn_xlink);
+	free(lockfn_flink);
+
+	/* set lease time to -1 if error, so no one is apt to use it */
+	if (*retstatus < 0) *lease_time = -1;
+
+	LOG("returning status %d, errno=%d\n", status, errno);
+	return status;
+}
+
+
+/*
+ * Release the final reader (readonly != 0) or writer lock that owner uuid
+ * holds on fn_to_unlock by removing the corresponding lock file.
+ *
+ * *status receives LOCK_OK, LOCK_EBADPARM, LOCK_ENOMEM or LOCK_ENOLOCK
+ * (the lock file could not be removed).
+ *
+ * Returns the errno of the failed unlink(), otherwise 0.
+ */
+int unlock(char *fn_to_unlock, char *uuid, int readonly, int *status)
+{
+	char *lockfn_link = 0;
+	int reterrno = 0;
+
+	if (!fn_to_unlock || !uuid) {
+		*status = LOCK_EBADPARM;
+		return 0;
+	}
+
+	lockfn_link = create_lockfn_link(fn_to_unlock, LFFL_FORMAT, uuid,
+					 readonly);
+	if (unlikely(!lockfn_link)) { *status = LOCK_ENOMEM; goto finish; }
+
+	if (unlink(lockfn_link) == -1) {
+		/* capture errno first: LOG() may call printf() */
+		reterrno = errno;
+		LOG("error removing linked lock file %s", lockfn_link);
+		*status = LOCK_ENOLOCK;
+		goto finish;
+	}
+
+	*status = LOCK_OK;
+
+finish:
+	free(lockfn_link);
+	return reterrno;
+}
+
+/*
+ * Report how long ago the most recently refreshed lock on fn was touched.
+ *
+ * The directory of fn is scanned for entries named "<fn basename>"
+ * LF_POSTFIX "<...>"; for each one the age is computed as the difference
+ * between the file system's current time (NFSnormalizedStatTime) and the
+ * entry's mtime, clamped at 0.
+ *
+ *   ret_lease - out: smallest age in seconds, or a negative lock_error
+ *               (LOCK_ENOLOCK when no lock file exists)
+ *   max_lease - out: lease time parsed from the first lock file that
+ *               carries one, or -1 on error / when none was found
+ *
+ * Returns 0 or the errno of a failed system call.
+ */
+int lock_delta(char *fn, int *ret_lease, int *max_lease)
+{
+	int reterrno = 0;
+	DIR *pd = 0;
+	struct dirent *dptr;
+	char *ptr;
+	int result = INT_MAX;
+	struct stat statbuf, statnow;
+	char *dirname = 0;
+	char *fpath = 0;
+	size_t base_len;
+	int elt_established = 0;
+	char *dotptr;
+	char tmpbuf[4096];
+	int fd;
+
+	/* validate before touching fn */
+	if (!fn) {
+		*ret_lease = LOCK_EBADPARM;
+		*max_lease = -1;
+		return 0;
+	}
+
+	/* defined value even when no lock file carries a lease time */
+	*max_lease = -1;
+
+	/* +2: room for "." or "/" plus NUL whatever the length of fn */
+	dirname = malloc(strlen(fn) + 2);
+	if (!dirname) {
+		result = LOCK_ENOMEM;
+		goto finish;
+	}
+
+	if (NFSnormalizedStatTime(fn, &statnow, &reterrno)) {
+		result = LOCK_ESTAT;
+		goto finish;
+	}
+
+	/* get directory */
+	ptr = strrchr(fn, '/');
+	if (!ptr) {
+		strcpy(dirname, ".");
+		ptr = fn;
+	} else if (ptr == fn) {
+		/* file lives directly in the root directory */
+		strcpy(dirname, "/");
+		ptr += 1;
+	} else {
+		size_t numbytes = ptr - fn;
+		memcpy(dirname, fn, numbytes);
+		dirname[numbytes] = '\0';
+		ptr += 1;
+	}
+	base_len = strlen(ptr);
+
+	pd = opendir(dirname);
+	if (!pd) { reterrno = errno; goto finish; }
+
+	dptr = readdir(pd);
+	while (dptr) {
+		/* only lock artifacts of this target, not arbitrary files
+		   that happen to share its name prefix */
+		if (strcmp(dptr->d_name, ptr) &&
+		    !strncmp(dptr->d_name, ptr, base_len) &&
+		    !strncmp(dptr->d_name + base_len, LF_POSTFIX,
+			     strlen(LF_POSTFIX))) {
+			int diff;
+
+			free(fpath);
+			fpath = malloc(strlen(dptr->d_name) +
+				       strlen(dirname) + 2);
+			if (!fpath) {
+				result = LOCK_ENOMEM;
+				goto finish;
+			}
+			strcpy(fpath, dirname);
+			strcat(fpath, "/");
+			strcat(fpath, dptr->d_name);
+			if (lstat(fpath, &statbuf) == -1) {
+				reterrno = errno;
+				goto finish;
+			}
+
+			diff = (int)statnow.st_mtime - (int)statbuf.st_mtime;
+			/* adjust diff if someone updated the lock
+			   between now and when we created the "now"
+			   file
+			*/
+			if (diff < 0) diff = 0;
+			if (diff < result) result = diff;
+
+			if (!elt_established) {
+				/* read final lock file and extract lease time;
+				   best effort, failures leave tmpbuf empty */
+				memset(tmpbuf, 0, sizeof(tmpbuf));
+				fd = open(fpath, O_RDONLY, 0644);
+				if (fd != -1) {
+					/* keep the last byte zero so tmpbuf
+					   stays a string */
+					if (read(fd, tmpbuf,
+						 sizeof(tmpbuf) - 1) < 0) {
+						/* error on read? */
+					}
+					close(fd);
+				}
+				dotptr = strrchr(tmpbuf, '.');
+				if (dotptr) {
+					*max_lease = atoi(dotptr+1);
+					elt_established = 1;
+				}
+			}
+		}
+		dptr = readdir(pd);
+	}
+
+finish:
+	if (pd) {
+		closedir(pd);
+	}
+	free(fpath);
+	free(dirname);
+
+	/* returns smallest lock time, or error */
+	if (result == INT_MAX) result = LOCK_ENOLOCK;
+
+	/* set lease time to -1 if error, so no one is apt to use it */
+	if ((result < 0) || reterrno) *max_lease = -1;
+	*ret_lease = result;
+	return reterrno;
+}
+
+#if defined(TEST)
+/*
+ * the following is for sanity testing.
+ */
+
+/*
+ * Print the command synopsis of the TEST build to stdout.
+ * prg is the program name shown on the first line.
+ */
+static void usage(char *prg)
+{
+ printf("usage %s\n"
+ " dtr <filename>]\n"
+ " p <filename> [num iterations]\n"
+ " u <filename> [0|1] [<uniqid>]\n"
+ " l <filename> [0|1] [0|1] [<uniqid>] [<leasetime>]\n", prg);
+ printf(" p : perf test lock take and reassert\n");
+ printf(" d : delta lock time\n");
+ printf(" t : test the file (after random locks)\n");
+ printf(" r : random lock tests (must ^C)\n");
+ printf(" u : unlock, readonly? uniqID (default is PID)\n");
+ printf(" l : lock, readonly? force?, uniqID (default is PID), lease time\n");
+}
+
+/*
+ * Verify a file written by random_locks(): every line is
+ * "<count> <pid> <time>" and the counts must increase by one per line,
+ * starting at 0.  Violations are reported through LOG().
+ *
+ * Parsing stops at end of file or at the first line that does not match
+ * the expected format.
+ */
+static void test_file(char *fn)
+{
+	FILE *fptr;
+	int prev_count = 0;
+	int count, pid, stamp;
+
+	fptr = fopen(fn, "r");
+	if (!fptr) {
+		LOG("ERROR on file %s open, errno=%d\n", fn, errno);
+		return;
+	}
+
+	/* stop as soon as a record cannot be parsed completely, instead of
+	   using stale values or spinning on malformed input */
+	while (fscanf(fptr, "%d %d %d\n", &count, &pid, &stamp) == 3) {
+		if (prev_count != count) {
+			LOG("ERROR: prev_count=%d, count=%d, pid=%d, time=%d\n",
+			    prev_count, count, pid, stamp);
+		}
+		prev_count = count + 1;
+	}
+
+	fclose(fptr);
+}
+
+/*
+ * Stress test: forever take a reader or writer lock on fn at random,
+ * and, when holding a writer lock, append a line "<count> <pid> <time>"
+ * whose count is one more than the count on the file's last line.
+ * test_file() checks the result afterwards.
+ *
+ * Never returns; kill the process to stop it.
+ */
+static void random_locks(char *fn)
+{
+	int pid = getpid();
+	int status;
+	char filebuf[256];
+	int count = 0;
+	int dummy;
+	int clstat;
+	char uuid[12];
+	int readonly;
+	int lease = DEFAULT_LEASE_TIME_SECS;
+	int err;
+
+	/* this will never return, kill to exit */
+
+	srandom((int)time(0) ^ pid);
+
+	LOG("pid: %d using file %s\n", pid, fn);
+	sprintf(uuid, "%08d", pid);
+
+	while (1) {
+		XSLEEP;
+		readonly = random() & 1;
+		err = lock(fn, uuid, 0, readonly, &lease, &status);
+		if (status == LOCK_OK) {
+			/* got lock, open, read, modify write close file */
+			int fd = open(fn, O_RDWR, 0644);
+			if (fd == -1) {
+				LOG("pid: %d ERROR on file %s open, errno=%d\n",
+				    pid, fn, errno);
+			} else {
+				if (!readonly) {
+					/* ugly code to read data in test format */
+					/* format is "%d %d %d" 'count pid time' */
+					struct stat statbuf;
+					int bytes;
+					/* one byte is kept back for the NUL */
+					const int window = sizeof(filebuf) - 1;
+					status = stat(fn, &statbuf);
+					if (status != -1) {
+						if (statbuf.st_size > window) {
+							lseek(fd, -window, SEEK_END);
+						}
+						memset(filebuf, 0, sizeof(filebuf));
+						bytes = read(fd, filebuf, window);
+						if (bytes > 0) {
+							/* find start of the last line: skip
+							   its trailing newline, then search
+							   backwards for the previous one */
+							int bw = bytes-2;
+							while (bw > 0 && filebuf[bw]!='\n')
+								bw--;
+							if (bw <= 0) bw = -1;
+							sscanf(&filebuf[bw+1],
+							       "%d %d %d",
+							       &count, &dummy, &dummy);
+							count += 1;
+						}
+						lseek(fd, 0, SEEK_END);
+						sprintf(filebuf, "%d %d %d\n",
+							count, pid, (int)time(0));
+						write(fd, filebuf, strlen(filebuf));
+					} else {
+						LOG("pid: %d ERROR on file %s stat, "
+						    "errno=%d\n", pid, fn, errno);
+					}
+				}
+				clstat = close(fd);
+				if (unlikely(clstat == -1)) {
+					LOG("fail on close\n");
+				}
+			}
+			XSLEEP;
+			err = unlock(fn, uuid, readonly, &status);
+			LOG("unlock status is %d (err=%d)\n", status, err);
+		}
+	}
+}
+
+/*
+ * Performance test: take a writer lock on fn and reassert it loops times
+ * in a row, then release it.  Stops early, without unlocking, when lock()
+ * reports a negative status.
+ */
+static void perf_lock(char *fn, int loops)
+{
+	int status = 0;
+	char buf[12];
+	int start = loops;
+	int lease = DEFAULT_LEASE_TIME_SECS;
+
+	snprintf(buf, sizeof(buf), "%08d", getpid());
+
+	while (loops--) {
+		/* the errno-style return value is not needed here, the
+		   outcome is judged by the lock status alone */
+		(void)lock(fn, buf, 0, 0, &lease, &status);
+		if (status < 0) {
+			printf("failed to get lock at iteration %d errno=%d\n",
+			       start - loops, errno);
+			return;
+		}
+	}
+	unlock(fn, buf, 0, &status);
+}
+
+/*
+ * Entry point of the TEST build; see usage() for the command syntax.
+ * The owner id defaults to the zero-padded PID.
+ *
+ * Returns the errno-style result of the lock operation that was run,
+ * 0 for the commands that do not produce one.
+ */
+int main(int argc, char *argv[])
+{
+	int status = 0;
+	char *ptr;
+	char uuid[12];
+	int force;
+	int readonly;
+	int max_lease, cur_lease;
+	int intstatus;
+	int lease = DEFAULT_LEASE_TIME_SECS;
+
+	if (argc < 3) {
+		usage(argv[0]);
+		return 0;
+	}
+
+	sprintf(uuid, "%08d", getpid());
+	ptr = uuid;
+
+	if (!strcmp(argv[1],"d")) {
+		status = lock_delta(argv[2], &cur_lease, &max_lease);
+
+		printf("lock delta for %s is %d seconds, max lease is %d\n",
+		       argv[2], cur_lease, max_lease);
+	} else if (!strcmp(argv[1],"t")) {
+		test_file(argv[2]);
+	} else if (!strcmp(argv[1],"r")) {
+		random_locks(argv[2]);
+	} else if (!strcmp(argv[1],"p")) {
+		/* the iteration count is optional (argv[3]) */
+		perf_lock(argv[2], argc < 4 ? 100000 : atoi(argv[3]));
+	} else if (!strcmp(argv[1],"l")) {
+		/* l <filename> [readonly] [force] [<uniqid>] [<leasetime>] */
+		readonly = (argc < 4) ? 0 : atoi(argv[3]);
+		force = (argc < 5) ? 0 : atoi(argv[4]);
+		if (argc >= 6) ptr = argv[5];
+		if (argc == 7) lease = atoi(argv[6]);
+		status = lock(argv[2], ptr, force, readonly, &lease, &intstatus);
+		printf("lock status = %d\n", status);
+	} else if (!strcmp(argv[1],"u") ) {
+		/* u <filename> [readonly] [<uniqid>] */
+		readonly = (argc < 4) ? 0 : atoi(argv[3]);
+		if (argc == 5) ptr = argv[4];
+		status = unlock(argv[2], ptr, readonly, &intstatus);
+		printf("unlock status = %d\n", intstatus);
+	} else {
+		usage(argv[0]);
+	}
+
+	return status;
+}
+#elif defined(UTIL)
+/*
+ * the following is used to build a non-library, standalone
+ * utility program that can be driven from a shell
+ */
+
+/*
+ * Print the command synopsis of the UTIL build to stdout.
+ * prg is the program name shown on the first line.
+ */
+static void usage(char *prg)
+{
+ printf("usage %s\n"
+ " delta <filename>\n"
+ " unlock <filename> <r|w> <uniqid>\n"
+ " lock <filename> <r|w> <0|1> <uniqid> <leasetime>\n", prg);
+ printf(" delta : get time since lock last refreshed\n");
+ printf(" returns delta time and max lease time in seconds\n");
+ printf(" unlock: unlock request filename, r|w, uniqID\n");
+ printf(" returns status (success is 0)\n");
+ printf(" lock : lock request filename, r|w, force?, uniqID, lease time request\n");
+ printf(" returns status (success is 0) and established lease time in seconds\n");
+}
+
+/*
+ * Entry point of the UTIL build: a command-line front end to lock(),
+ * unlock() and lock_delta() meant to be called from scripts.
+ *
+ * The lock status (and lease values, where applicable) is printed to
+ * stdout as decimal numbers; LOCK_EUSAGE is printed when the arguments do
+ * not match one of the commands shown by usage().
+ *
+ * Returns 0 or the errno value reported by the library call.
+ */
+int main(int argc, char *argv[])
+{
+	int status = 0;
+	char *ptr;
+	int force;
+	int readonly;
+	int cur_lease, max_lease, intstatus;
+	int lease = DEFAULT_LEASE_TIME_SECS;
+
+	if (argc < 3) {
+		if (argc == 2 && !strcmp(argv[1], "-h")) {
+			usage(argv[0]);
+		} else {
+			printf("%d\n", LOCK_EUSAGE);
+		}
+		return 0;
+	}
+
+	if (!strcmp(argv[1],"delta") && (argc == 3)) {
+		status = lock_delta(argv[2], &cur_lease, &max_lease);
+		printf("%d %d\n", cur_lease, max_lease);
+	} else if (!strcmp(argv[1],"lock") && (argc == 7)) {
+		/* anything other than "r" requests a writer lock */
+		readonly = (strcmp(argv[3], "r") == 0) ? 1 : 0;
+		force = atoi(argv[4]);
+		ptr = argv[5];
+		lease = atoi(argv[6]);
+		status = lock(argv[2], ptr, force, readonly, &lease, &intstatus);
+		printf("%d %d\n", intstatus, lease);
+	} else if (!strcmp(argv[1],"unlock") && (argc == 5)) {
+		readonly = (strcmp(argv[3], "r") == 0) ? 1 : 0;
+		ptr = argv[4];
+		status = unlock(argv[2], ptr, readonly, &intstatus);
+		printf("%d\n", intstatus);
+	} else {
+		printf("%d\n", LOCK_EUSAGE);
+	}
+
+	/* this is either 0 or a system defined errno */
+	return status;
+}
+#endif
diff --git a/tools/blktap2/drivers/lock.h b/tools/blktap2/drivers/lock.h
new file mode 100644
index 0000000000..98baaaa705
--- /dev/null
+++ b/tools/blktap2/drivers/lock.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __LOCK_H__
+#define __LOCK_H__
+
+/* lease period, in seconds, used when the caller does not request one */
+#define DEFAULT_LEASE_TIME_SECS 30
+
+/*
+ * All three functions return 0 or a system errno value; the outcome of the
+ * locking operation itself is reported separately through the retstat /
+ * lease output parameters as LOCK_OK or one of the negative lock_error
+ * codes below.
+ */
+int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstat);
+int unlock(char *fn_to_unlock, char *uuid, int readonly, int *retstat);
+int lock_delta(char *fn_to_check, int *cur_lease_time, int *max_lease_time);
+
+/* status codes of the locking operations; every error is negative */
+typedef enum {
+	LOCK_OK          =   0,
+	LOCK_EBADPARM    =  -1,
+	LOCK_ENOMEM      =  -2,
+	LOCK_ESTAT       =  -3,
+	LOCK_EHELD_WR    =  -4,
+	LOCK_EHELD_RD    =  -5,
+	LOCK_EOPEN       =  -6,
+	LOCK_EXLOCK_OPEN =  -7,
+	LOCK_EXLOCK_WRITE=  -8,
+	LOCK_EINODE      =  -9,
+	LOCK_EUPDATE     = -10,
+	LOCK_EREAD       = -11,
+	LOCK_EREMOVE     = -12,
+	LOCK_ENOLOCK     = -13,
+	LOCK_EUSAGE      = -14,
+} lock_error;
+
+#endif /* __LOCK_H__ */
diff --git a/tools/blktap2/drivers/log.h b/tools/blktap2/drivers/log.h
new file mode 100644
index 0000000000..8f00df4478
--- /dev/null
+++ b/tools/blktap2/drivers/log.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* log.h: API for writelog communication */
+
+#ifndef __LOG_H__
+#define __LOG_H__ 1
+
+#include <inttypes.h>
+
+#include <xen/io/ring.h>
+/* for wmb et al */
+#include <xenctrl.h>
+
+/*
+ * Control command names; every literal is four characters long.
+ * NOTE(review): presumably these are what is carried in log_ctlmsg.msg --
+ * confirm against the log client/server code.
+ */
+#define LOGCMD_SHMP "shmp"
+#define LOGCMD_PEEK "peek"
+#define LOGCMD_CLEAR "clrw"
+#define LOGCMD_GET "getw"
+#define LOGCMD_KICK "kick"
+
+/*
+ * One length constant per command above.
+ * NOTE(review): presumably the size in bytes of the response to the
+ * matching command -- confirm against the log client/server code.
+ */
+#define CTLRSPLEN_SHMP 256
+#define CTLRSPLEN_PEEK 4
+#define CTLRSPLEN_CLEAR 4
+#define CTLRSPLEN_GET 4
+#define CTLRSPLEN_KICK 0
+
+/* shmregion is arbitrarily capped at 8 megs for a minimum of
+ * 64 MB of data per read (if there are no contiguous regions)
+ * In the off-chance that there is more dirty data, multiple
+ * reads must be done */
+#define SHMSIZE (8 * 1024 * 1024)
+#define SRINGSIZE 4096
+
+/* The shared memory region is split up into 3 subregions:
+ * The first half is reserved for the dirty bitmap log.
+ * The second half begins with 1 page for read request descriptors,
+ * followed by a big area for supplying read data.
+ *
+ * The helpers below return the boundaries of these subregions for a
+ * region starting at shm.  Offsets are computed on char pointers, since
+ * arithmetic on void pointers is a compiler extension and not standard C.
+ */
+
+/* start of the dirty bitmap: the start of the region itself */
+static inline void* bmstart(void* shm)
+{
+	return shm;
+}
+
+/* end of the dirty bitmap: SHMSIZE/2 bytes into the region */
+static inline void* bmend(void* shm)
+{
+	return (char *)shm + SHMSIZE/2;
+}
+
+/* start of the request descriptor ring: directly after the bitmap */
+static inline void* sringstart(void* shm)
+{
+	return bmend(shm);
+}
+
+/* start of the read data area: SRINGSIZE bytes after the ring start */
+static inline void* sdatastart(void* shm)
+{
+	return (char *)sringstart(shm) + SRINGSIZE;
+}
+
+/* end of the read data area: the end of the whole region */
+static inline void* sdataend(void* shm)
+{
+	return (char *)shm + SHMSIZE;
+}
+
+/* format for messages between log client and server */
+struct log_ctlmsg {
+ char msg[4]; /* four bytes, the same length as each LOGCMD_* literal */
+ char params[16]; /* NOTE(review): layout depends on the command -- confirm */
+};
+
+/* extent descriptor */
+struct disk_range {
+ uint64_t sector; /* presumably the first sector of the extent -- confirm */
+ uint32_t count; /* presumably the extent length in sectors -- confirm */
+};
+
+/* dirty write logging space. This is an extent ring at the front,
+ * full of disk_ranges plus a pointer into the data area */
+/* I think I'd rather have the header in front of each data section to
+ * avoid having two separate spaces that can run out, but then I'd either
+ * lose page alignment on the data blocks or spend an entire page on the
+ * header */
+
+struct log_extent {
+ uint64_t sector; /* same fields as struct disk_range ... */
+ uint32_t count;
+ uint32_t offset; /* offset from start of data area to start of extent */
+};
+
+/* struct above should be 16 bytes, or 256 extents/page */
+
+/* requests and responses on the ring share the log_extent layout */
+typedef struct log_extent log_request_t;
+typedef struct log_extent log_response_t;
+
+/* ring types generated by the macro from <xen/io/ring.h> for the pair above */
+DEFINE_RING_TYPES(log, log_request_t, log_response_t);
+
+#define LOG_HEADER_PAGES 4
+
+#endif
diff --git a/tools/blktap2/drivers/profile.h b/tools/blktap2/drivers/profile.h
new file mode 100644
index 0000000000..f628ba223e
--- /dev/null
+++ b/tools/blktap2/drivers/profile.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __TAP_PROFILE_H__
+#define __TAP_PROFILE_H__
+
+#ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/time.h>
+#include <time.h>
+#include <fcntl.h>
+#include <inttypes.h>
+
+//#define PROFILING
+//#define LOGGING
+
+#define TAPPROF_IN 1
+#define TAPPROF_OUT 2
+
+/* Per-function accumulator, one slot of profile_info.pt. */
+struct profile_times {
+ /* heap copy (strdup) of the profiled function's name; NULL marks an
+  * unused slot */
+ char *fn_name;
+ /* in:      timestamp taken by __tp_in(); reset to 0 by __tp_out(), so 0
+  *          means "no measurement in flight"
+  * out_sum: sum of (exit timestamp - in) over all completed measurements
+  * cnt:     number of completed measurements */
+ uint64_t in, out_sum, cnt;
+};
+
+/* Profiling context, set up by tp_open() and torn down by tp_close(). */
+struct profile_info {
+ FILE *log; /* per-event log file; only opened when LOGGING is defined */
+ int size; /* number of slots in pt */
+ char *name; /* heap copy (strdup) of the tap name given to tp_open() */
+ unsigned long long seq; /* counter handed out, then incremented, by tp_get_id() */
+ struct profile_times *pt; /* table of per-function accumulators */
+};
+
+#ifdef PROFILING
+
+/*
+ * Initialise the profiling context @prof.
+ *
+ * @prof:     context to initialise; any previous contents are discarded
+ * @tap_name: name used to prefix the summary lines; copied
+ * @log_name: path of the per-event log file, only opened (truncating)
+ *            when LOGGING is defined
+ * @size:     number of distinct functions that can be tracked
+ *
+ * Allocation failures are not reported.  If the slot table cannot be
+ * allocated the context is left with zero slots, so that code iterating
+ * over prof->size never touches a NULL table.
+ */
+static inline void
+tp_open(struct profile_info *prof, char *tap_name, char *log_name, int size)
+{
+    memset(prof, 0, sizeof(struct profile_info));
+#ifdef LOGGING
+    prof->log = fopen(log_name, "w");
+#endif
+    prof->name = strdup(tap_name);
+
+    /* calloc zeroes the table and guards the size multiplication */
+    if (size > 0)
+        prof->pt = calloc(size, sizeof(struct profile_times));
+
+    /* only advertise slots that really exist */
+    prof->size = prof->pt ? size : 0;
+}
+
+/*
+ * Report the collected statistics and release everything owned by @prof.
+ *
+ * For every used slot one LOG_DEBUG syslog line is emitted with the call
+ * count and the average elapsed time per call (0 when the function never
+ * completed a measurement).  The log file (LOGGING builds), the tap name
+ * and the slot table are then freed.  @prof itself is not freed.
+ */
+static inline void
+tp_close(struct profile_info *prof)
+{
+    int i;
+    struct profile_times *pt;
+
+    /* the slot table may be missing if its allocation failed in tp_open() */
+    for (i = 0; prof->pt && i < prof->size; i++) {
+        pt = &prof->pt[i];
+        if (!pt->fn_name)
+            continue;
+
+        /* cnt and the average are uint64_t: print them with PRIu64 */
+        syslog(LOG_DEBUG,
+               "%s: %s: cnt: %"PRIu64", avg time: %"PRIu64"\n",
+               prof->name, pt->fn_name, pt->cnt,
+               (pt->cnt ? (pt->out_sum / pt->cnt) : (uint64_t)0));
+        free(pt->fn_name);
+    }
+
+#ifdef LOGGING
+    if (prof->log)
+        fclose(prof->log);
+#endif
+    free(prof->name);
+    free(prof->pt);    /* free(NULL) is a no-op */
+}
+
+/* Hand out the current sequence number of @prof and advance the counter. */
+static inline u64
+tp_get_id(struct profile_info *prof)
+{
+    u64 id = prof->seq;
+
+    prof->seq++;
+    return id;
+}
+
+/*
+ * Map a function name to its slot index in prof->pt.
+ *
+ * Slots are scanned from the start.  The scan stops at the first unused
+ * slot (the caller may claim it) or at the slot already holding @name.
+ * When every slot is taken by another name, the index of the last slot
+ * (prof->size - 1) is returned.
+ */
+static inline int
+tp_fn_id(struct profile_info *prof, const char *name)
+{
+    int slot = 0;
+
+    while (slot < prof->size) {
+        const char *known = prof->pt[slot].fn_name;
+
+        if (known == NULL || strcmp(known, name) == 0)
+            return slot;
+        slot++;
+    }
+
+    return prof->size - 1;
+}
+
+/*
+ * Record entry into function @func: claim (or find) its slot and store the
+ * current timestamp counter in pt->in.
+ *
+ * The timestamp is read with rdtsc (opcode 0f 31).  The two result halves
+ * are bound to eax and edx explicitly: the "A" constraint only names the
+ * edx:eax pair on 32-bit x86, on x86-64 it selects a single register and
+ * would drop half of the counter.
+ */
+static inline void
+__tp_in(struct profile_info *prof, const char *func)
+{
+    uint32_t lo, hi;
+    int idx = tp_fn_id(prof, func);
+    struct profile_times *pt = &prof->pt[idx];
+
+    if (!pt->fn_name)
+        pt->fn_name = strdup(func);
+
+    asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+    pt->in = ((uint64_t)hi << 32) | lo;
+}
+
+#define tp_in(prof) __tp_in(prof, __func__)
+
+/*
+ * Record exit from function @func: add the time elapsed since the matching
+ * entry to the slot's running sum and bump its call count.
+ *
+ * Nothing is recorded when the slot has no name or no entry timestamp
+ * (pt->in == 0), i.e. when there was no preceding entry record.
+ *
+ * rdtsc (opcode 0f 31) returns its result in edx:eax.  Both halves are
+ * bound explicitly because the "A" constraint does not describe that pair
+ * on x86-64.
+ */
+static inline void
+__tp_out(struct profile_info *prof, const char *func)
+{
+    uint32_t lo, hi;
+    uint64_t now;
+    int idx = tp_fn_id(prof, func);
+    struct profile_times *pt = &prof->pt[idx];
+
+    if (!pt->fn_name || !pt->in)
+        return;
+
+    asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+    now = ((uint64_t)hi << 32) | lo;
+
+    pt->cnt++;
+    pt->out_sum += (now - pt->in);
+    pt->in = 0;    /* mark the measurement as consumed */
+}
+
+#define tp_out(prof) __tp_out(prof, __func__)
+
+/*
+ * Record an entry (@direction == TAPPROF_IN) or exit (any other value) of
+ * function @func and, in LOGGING builds with an open log file, append a
+ * line "<func>: in|out: <id>, <timestamp>" to it.
+ *
+ * The timestamp is taken with rdtsc (opcode 0f 31) before the slot update.
+ * eax and edx are bound explicitly since the "A" constraint does not name
+ * the edx:eax pair on x86-64.
+ */
+static inline void
+__tp_log(struct profile_info *prof, u64 id, const char *func, int direction)
+{
+    uint32_t lo, hi;
+    long long _time;
+
+    asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+    _time = (long long)(((uint64_t)hi << 32) | lo);
+    (void)_time;    /* only consumed in LOGGING builds */
+
+    if (direction == TAPPROF_IN)
+        __tp_in(prof, func);
+    else
+        __tp_out(prof, func);
+
+#ifdef LOGGING
+    if (prof->log)
+        fprintf(prof->log, "%s: %s: %llu, %lld\n", func,
+                ((direction == TAPPROF_IN) ? "in" : "out"),
+                (unsigned long long)id, _time);
+#endif
+}
+
+#define tp_log(prof, id, direction) __tp_log(prof, id, __func__, direction)
+
+#else
+/* Profiling disabled: every entry point collapses to a no-op. */
+#define tp_open(prof, tname, lname, size) ((void)0)
+#define tp_close(prof) ((void)0)
+/* tp_get_id() is used for its value, so its stub must be an expression */
+#define tp_get_id(prof) (0ULL)
+#define tp_in(prof) ((void)0)
+#define tp_out(prof) ((void)0)
+#define tp_log(prof, sec, direction) ((void)0)
+#endif
+
+#endif
diff --git a/tools/blktap2/drivers/qcow-create.c b/tools/blktap2/drivers/qcow-create.c
new file mode 100644
index 0000000000..6a641af95f
--- /dev/null
+++ b/tools/blktap2/drivers/qcow-create.c
@@ -0,0 +1,121 @@
+/* qcow-create.c
+ *
+ * Generates a qcow format disk.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include "tapdisk.h"
+#include "qcow.h"
+
+/* Diagnostic printf to stderr; flip the condition to 0 to compile all
+ * DFPRINTF calls out. */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* size of the buffers holding the image and backing file names,
+ * including the terminating NUL */
+#define MAX_NAME_LEN 1000
+
+/* Print the version banner and usage line on stderr, then terminate the
+ * process with exit status -1.  Does not return. */
+void help(void)
+{
+    fputs("Qcow-utils: v1.0.0\n", stderr);
+    fputs("usage: qcow-create [-h help] [-r reserve] <SIZE(MB)> <FILENAME> "
+          "[<BACKING_FILENAME>]\n",
+          stderr);
+    exit(-1);
+}
+
+/*
+ * qcow-create [-h] [-r] <SIZE(MB)> <FILENAME> [<BACKING_FILENAME>]
+ *
+ * Creates a qcow image of SIZE megabytes named FILENAME, optionally backed
+ * by BACKING_FILENAME.  -r clears the sparse flag passed to qcow_create().
+ *
+ * Returns 0 when the image was created and -1 when qcow_create() failed;
+ * usage and argument errors terminate the process with exit(-1).
+ */
+int main(int argc, char *argv[])
+{
+    int ret = -1, c, backed = 0;
+    int sparse = 1;
+    uint64_t size;
+    unsigned long long mbytes;
+    char *end;
+    char filename[MAX_NAME_LEN], bfilename[MAX_NAME_LEN];
+
+    for(;;) {
+        c = getopt(argc, argv, "hr");
+        if (c == -1)
+            break;
+        switch(c) {
+        case 'h':
+            help();    /* prints the usage and exits */
+            break;
+        case 'r':
+            sparse = 0;
+            break;
+        default:
+            fprintf(stderr, "Unknown option\n");
+            help();
+        }
+    }
+
+    printf("Optind %d, argc %d\n", optind, argc);
+    if ( !(optind == (argc - 2) || optind == (argc - 3)) )
+        help();
+
+    /* Parse the size strictly: atoi() would silently turn garbage into 0
+     * and cannot represent sizes beyond INT_MAX megabytes.  strtoull()
+     * accepts a sign, so reject '-' explicitly, and make sure the value
+     * still fits in 64 bits once converted to bytes. */
+    errno = 0;
+    mbytes = strtoull(argv[optind], &end, 10);
+    if (errno || end == argv[optind] || *end != '\0' ||
+        strchr(argv[optind], '-') != NULL ||
+        mbytes > (~(uint64_t)0 >> 20)) {
+        fprintf(stderr, "Invalid size '%s'\n", argv[optind]);
+        exit(-1);
+    }
+    optind++;
+    size = (uint64_t)mbytes << 20;
+
+    if (snprintf(filename, MAX_NAME_LEN, "%s",argv[optind++]) >=
+        MAX_NAME_LEN) {
+        fprintf(stderr,"Device name too long\n");
+        exit(-1);
+    }
+
+    if (optind != argc) {
+        /*Backing file argument*/
+        backed = 1;
+        if (snprintf(bfilename, MAX_NAME_LEN, "%s",argv[optind++]) >=
+            MAX_NAME_LEN) {
+            fprintf(stderr,"Device name too long\n");
+            exit(-1);
+        }
+    }
+
+    DFPRINTF("Creating file size %"PRIu64", name %s\n",(uint64_t)size, filename);
+    if (!backed)
+        ret = qcow_create(filename,size,NULL,sparse);
+    else
+        ret = qcow_create(filename,size,bfilename,sparse);
+
+    if (ret < 0) {
+        DPRINTF("Unable to create QCOW file\n");
+        /* report the failure to the caller instead of exiting with 0 */
+        return -1;
+    }
+
+    DPRINTF("QCOW file successfully created\n");
+    return 0;
+}
diff --git a/tools/blktap2/drivers/qcow.h b/tools/blktap2/drivers/qcow.h
new file mode 100644
index 0000000000..a88f1d5d92
--- /dev/null
+++ b/tools/blktap2/drivers/qcow.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _QCOW_H_
+#define _QCOW_H_
+
+#include "aes.h"
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+/* file magic: the bytes 'Q' 'F' 'I' 0xfb packed most significant first */
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+/* magic of the Xen extended header: 'X' 'E' 'N' 0xfb */
+#define XEN_MAGIC (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+#define QCOW_CRYPT_NONE 0x00
+#define QCOW_CRYPT_AES 0x01
+
+/* Top bit of a 64-bit offset.  Built from an unsigned constant: shifting
+ * a signed 1LL into the sign bit is undefined behaviour. */
+#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
+#define SPARSE_FILE 0x01
+#define EXTHDR_L1_BIG_ENDIAN 0x02
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+/* Round l up to the next multiple of s; the result is a uint64_t.
+ * s must be non-zero.  Both arguments are evaluated more than once, so
+ * they must be free of side effects.  Every use of an argument is
+ * parenthesized so that expressions such as ROUNDUP(n, a + b) expand
+ * correctly. */
+#define ROUNDUP(l, s) \
+    ((uint64_t)( \
+        ((l) + ((s) - 1)) - (((l) + ((s) - 1)) % (s))))
+
+/* qcow image header.
+ * NOTE(review): presumably mirrors the header stored at the start of the
+ * image file; byte order and packing are not visible here -- confirm
+ * against the code that reads and writes it. */
+typedef struct QCowHeader {
+ uint32_t magic; /* presumably QCOW_MAGIC -- confirm */
+ uint32_t version; /* presumably QCOW_VERSION -- confirm */
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t mtime;
+ uint64_t size; /* in bytes */
+ uint8_t cluster_bits;
+ uint8_t l2_bits;
+ uint32_t crypt_method; /* presumably one of QCOW_CRYPT_* -- confirm */
+ uint64_t l1_table_offset;
+} QCowHeader;
+
+/*Extended header for Xen enhancements*/
+typedef struct QCowHeader_ext {
+ uint32_t xmagic; /* presumably XEN_MAGIC -- confirm */
+ uint32_t cksum; /* NOTE(review): likely produced by gen_cksum() -- confirm */
+ uint32_t min_cluster_alloc;
+ uint32_t flags; /* presumably SPARSE_FILE / EXTHDR_L1_BIG_ENDIAN bits -- confirm */
+} QCowHeader_ext;
+
+uint32_t gen_cksum(char *ptr, int len);
+int get_filesize(char *filename, uint64_t *size, struct stat *st);
+int qtruncate(int fd, off_t length, int sparse);
+
+#define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/
+
+/* Per-image state of the qcow driver: the open file, the geometry values
+ * taken from the header, the L1 table and L2 cache, encryption keys and
+ * the pool of asynchronous I/O requests. */
+struct tdqcow_state {
+ int fd; /*Main Qcow file descriptor */
+ uint64_t fd_end; /*Store a local record of file length */
+ char *name; /*Record of the filename*/
+ uint32_t backing_file_size;
+ uint64_t backing_file_offset;
+ uint8_t extended; /*File contains extended header*/
+ int encrypted; /*File contents are encrypted or plain*/
+ int cluster_bits; /*Determines length of cluster as
+ *indicated by file hdr*/
+ int cluster_size; /*Length of cluster*/
+ int cluster_sectors; /*Number of sectors per cluster*/
+ int cluster_alloc; /*Blktap fix for allocating full
+ *extents*/
+ int min_cluster_alloc; /*Blktap historical extent alloc*/
+ int sparse; /*Indicates whether to preserve sparseness*/
+ int l2_bits; /*Size of L2 table entry*/
+ int l2_size; /*Full table size*/
+ int l1_size; /*L1 table size*/
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset; /*L1 table offset from beginning of
+ *file*/
+ uint64_t *l1_table; /*L1 table entries*/
+ uint64_t *l2_cache; /*We maintain a cache of size
+ *L2_CACHE_SIZE of most read entries*/
+ uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /*L2 cache entries*/
+ uint32_t l2_cache_counts[L2_CACHE_SIZE]; /*Cache access record*/
+ uint8_t *cluster_cache;
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset; /**/
+ uint32_t crypt_method; /*current crypt method, 0 if no
+ *key yet */
+ uint32_t crypt_method_header; /**/
+ AES_KEY aes_encrypt_key; /*AES key*/
+ AES_KEY aes_decrypt_key; /*AES key*/
+
+ /* libaio state */
+ /* NOTE(review): presumably aio_requests is a pool of max_aio_reqs
+  * entries and aio_free_list/aio_free_count track the unused ones --
+  * confirm against the qcow driver. */
+ int aio_free_count;
+ int max_aio_reqs;
+ struct qcow_request *aio_requests;
+ struct qcow_request **aio_free_list;
+
+};
+
+/* Create a qcow image.
+ * filename:     path of the image to create
+ * total_size:   image size (qcow-create passes a byte count)
+ * backing_file: path of the backing image, or NULL for none
+ * sparse:       sparse flag (qcow-create passes 0 when -r is given)
+ * Callers treat a negative return value as failure. */
+int qcow_create(const char *filename, uint64_t total_size,
+ const char *backing_file, int sparse);
+
+#endif //_QCOW_H_
diff --git a/tools/blktap2/drivers/qcow2raw.c b/tools/blktap2/drivers/qcow2raw.c
new file mode 100644
index 0000000000..689e7f5cd1
--- /dev/null
+++ b/tools/blktap2/drivers/qcow2raw.c
@@ -0,0 +1,449 @@
+/* qcow2raw.c
+ *
+ * Generates raw image data from an existing qcow image
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#include "bswap.h"
+#include "aes.h"
+#include "blk.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "qcow.h"
+
+/* Diagnostic printf to stderr; flip the condition to 0 to compile all
+ * DFPRINTF calls out. */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+#define TAPDISK 1 /* not referenced in this file */
+#define BLOCK_PROCESSSZ 4096 /* bytes copied per request (8 sectors of 512) */
+#define QCOW_VBD 0 /* id used to register/look up the source (qcow) vbd */
+#define AIO_VBD 1 /* id used to register/look up the destination (aio) vbd */
+#define WINDOW 32 /* not referenced in this file */
+#define PROGRESS_QUANT 2 /* progress bar granularity, in percent */
+
+/* main-loop control: `running` keeps the loop alive, `complete` is set once
+ * the last read has been submitted */
+static int running = 1, complete = 0;
+/* sector counts accumulated by the read/write completion callbacks */
+static int returned_read_events = 0, returned_write_events = 0;
+/* sectors submitted for reading so far */
+static int submit_events = 0;
+static uint32_t read_idx = 0; /* not referenced in this file */
+/* drivers of the first image of the source and destination vbds */
+td_driver_t *ddqcow, *ddaio;
+td_vbd_t* qcow_vbd, *aio_vbd;
+/* prev: progress bar steps drawn so far; written: sectors written so far */
+static uint64_t prev = 0, written = 0;
+/* progress bar text: '[' + bar + ']' + NUL */
+static char output[(100/PROGRESS_QUANT) + 5];
+
+extern tapdisk_server_t server;
+
+/* Bookkeeping for one BLOCK_PROCESSSZ chunk on its way from the qcow image
+ * to the raw destination; passed to the completion callbacks as cb_data. */
+struct request_info {
+ void* buf; /* data buffer, freed once the write has fully completed */
+ uint64_t logical_sec; /* first sector of the chunk; reused as the write position */
+ int pending; /* sectors of the current read or write still outstanding */
+};
+
+/*
+ * Debug helper: hex-dump @length bytes starting at @ptr to stderr.
+ *
+ * Every byte is printed as exactly two hex digits so that the dump is
+ * unambiguous; bytes are grouped in pairs, 16 bytes per line.
+ */
+static void print_bytes(void *ptr, int length)
+{
+    int k;
+    unsigned char *p = ptr;
+
+    DFPRINTF("Buf dump, length %d:\n",length);
+    for (k = 0; k < length; k++) {
+        DFPRINTF("%02x", p[k]);
+        /* separators go after a completed group, hence k + 1 */
+        if ((k + 1) % 16 == 0) DFPRINTF("\n");
+        else if ((k + 1) % 2 == 0) DFPRINTF(" ");
+    }
+    DFPRINTF("\n");
+}
+
+/* Event callback (event_cb_t signature): runs tapdisk_complete_tiocbs()
+ * on the server's aio queue.  None of the callback arguments are used. */
+void
+queue_event(event_id_t id, char mode, void *private)
+{
+    (void)id;
+    (void)mode;
+    (void)private;
+
+    tapdisk_complete_tiocbs(&server.aio_queue);
+}
+
+/*
+ * Advance the progress bar by at most one step.
+ *
+ * @progress: sectors written so far
+ * @size:     total number of sectors to write
+ *
+ * The bar has 100/PROGRESS_QUANT steps.  A step is drawn when @progress
+ * has passed the next multiple of size/steps.  Nothing is drawn when the
+ * image is too small to have a whole sector per step (this used to divide
+ * by zero) or when the bar is already full (further "=>" writes would run
+ * over the closing bracket and the terminating NUL of `output`).
+ */
+static void debug_output(uint64_t progress, uint64_t size)
+{
+    const uint64_t steps = 100/PROGRESS_QUANT;
+    //Output progress every PROGRESS_QUANT
+    uint64_t blocks = size/steps;
+
+    if (blocks == 0 || prev >= steps)
+        return;
+
+    if (progress/blocks > prev) {
+        memcpy(output+prev+1,"=>",2);
+        prev++;
+        DFPRINTF("\r%s %"PRIu64"%%",
+                 output, (uint64_t)((prev-1)*PROGRESS_QUANT));
+    }
+    return;
+}
+
+/*
+ * Completion callback for writes to the destination image.
+ *
+ * On success the completed sectors are added to the global counters and
+ * subtracted from the chunk's pending count; once the whole chunk has
+ * been written its buffer and bookkeeping are freed and the progress bar
+ * is refreshed.
+ *
+ * On failure (err < 0) only a message is printed.
+ * NOTE(review): the failed sectors are then never added to
+ * returned_write_events, so main()'s wait loop cannot reach submit_events
+ * -- confirm whether this should abort instead.
+ */
+static void send_write_responses(td_request_t treq, int err)
+{
+    struct request_info *info = (struct request_info*)treq.cb_data;
+
+    if (err < 0) {
+        DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+        return;
+    }
+
+    returned_write_events += treq.secs;
+    written += treq.secs;
+
+    //Wait for whole request to complete.
+    info->pending -= treq.secs;
+    if (info->pending == 0) {
+        //Whole request has completed, we can free buffers.
+        free(info->buf);
+        free(info);
+
+        debug_output(written, ddaio->info.size);
+    }
+}
+
+/*
+ * Completion callback for reads from the qcow image.
+ *
+ * Once every sector of the chunk has been read, the read request is
+ * completed on the qcow vbd and the same buffer is queued as a write of
+ * BLOCK_PROCESSSZ bytes at the same sector of the destination image, with
+ * send_write_responses() as its callback.
+ *
+ * On failure (err < 0) only a message is printed and the chunk is dropped.
+ * Running out of memory for the write request terminates the program, as
+ * the other allocation failures in this file do.
+ */
+static void send_read_responses(td_request_t treq, int err)
+{
+    struct request_info* req;
+    td_vbd_request_t* vreq;
+
+    if (err < 0) {
+        DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+        return;
+    }
+    returned_read_events+=treq.secs;
+
+    req= (struct request_info*)treq.cb_data;
+
+    //do nothing until all fragments complete.
+    req->pending-=treq.secs;
+
+    if(req->pending)
+        return;
+
+    //This read is done.
+    tapdisk_vbd_complete_vbd_request(qcow_vbd, treq.private);
+
+    //Turn the request around: same buffer, same sectors, other image.
+    treq.op = TD_OP_WRITE;
+    treq.buf = req->buf;
+    treq.sec = req->logical_sec;
+    treq.secs = BLOCK_PROCESSSZ>>9;
+    treq.image = tapdisk_vbd_first_image(aio_vbd);
+    treq.cb = send_write_responses;
+    treq.id = 0;
+    treq.sidx = 0;
+
+    req->pending = BLOCK_PROCESSSZ>>9;
+    treq.cb_data = req;
+
+    vreq = calloc(1, sizeof(td_vbd_request_t));
+    if (!vreq) {
+        DFPRINTF("Unable to alloc memory for write request\n");
+        exit(-1);
+    }
+    treq.private = vreq;
+
+    //Put it in the VBD's queue, so we don't lose
+    //track of it.
+    vreq->submitting = 1;
+    INIT_LIST_HEAD(&vreq->next);
+    tapdisk_vbd_move_request(treq.private,
+                             &aio_vbd->pending_requests);
+
+    ddaio->ops->td_queue_write(ddaio,treq);
+    --vreq->submitting;
+
+    tapdisk_submit_all_tiocbs(&server.aio_queue);
+
+    return;
+}
+
+/*
+ * qcow2raw <dest file or device> <qcow source image>
+ *
+ * Copies the contents of a qcow image into a raw file or block device:
+ *   1. open the source read-only as a qcow vbd;
+ *   2. create the destination, or (after confirmation on stdin) reuse it,
+ *      sizing a regular file to the image size and checking that a block
+ *      device is large enough;
+ *   3. open the destination as an aio vbd;
+ *   4. read the image in BLOCK_PROCESSSZ chunks; each completed read is
+ *      turned into a write by send_read_responses().  The loop waits for
+ *      the write of a chunk to complete before submitting the next read.
+ *
+ * Returns 0 on success.  Setup failures return the error code or call
+ * exit(-1).
+ */
+int main(int argc, const char *argv[])
+{
+    int ret = -1, fd;
+    uint64_t size;
+    uint64_t i;
+    char *buf;
+    struct stat finfo;
+    td_request_t treq;
+    td_vbd_request_t* vreq;
+    struct request_info* req;
+    int err;
+
+    if (argc != 3) {
+        fprintf(stderr, "Qcow-utils: v1.0.0\n");
+        fprintf(stderr, "usage: %s <Dest File descriptor> "
+                "<Qcow SRC IMAGE>\n",
+                argv[0]);
+        exit(-1);
+    }
+
+    err = tapdisk_server_initialize(NULL, NULL);
+    if( err ) {
+        DPRINTF("qcow2raw Couldn't initialize server instance.\n");
+        return err;
+    }
+
+    err=tapdisk_vbd_initialize(-1,-1, QCOW_VBD);
+    if( err ) {
+        DPRINTF("qcow2raw Couldn't initialize qcow vbd.\n");
+        return err;
+    }
+
+    qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD);
+    if (!qcow_vbd) {
+        err = -ENODEV;
+        DPRINTF("qcow2raw Couldn't create qcow vbd.\n");
+        return err;
+    }
+
+    err = tapdisk_vbd_open_vdi(qcow_vbd, argv[2], DISK_TYPE_QCOW,
+                               TAPDISK_STORAGE_TYPE_DEFAULT,
+                               TD_OPEN_RDONLY);
+    if( err ) {
+        DPRINTF("qcow2raw Couldn't open qcow file.\n");
+        return err;
+    }
+
+    ddqcow=(tapdisk_vbd_first_image(qcow_vbd))->driver;
+
+    /*Setup aio destination file*/
+    ret = stat(argv[1],&finfo);
+    if (ret == -1) {
+        /*Check errno*/
+        switch(errno) {
+        case ENOENT:
+            /*File doesn't exist, create*/
+            fd = open(argv[1],
+                      O_RDWR | O_LARGEFILE | O_CREAT, 0644);
+            if (fd < 0) {
+                DFPRINTF("ERROR creating file [%s] "
+                         "(errno %d)\n",
+                         argv[1], 0 - errno);
+                exit(-1);
+            }
+            if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) {
+                DFPRINTF("Unable to create file "
+                         "[%s] of size %"PRIu64" (errno %d). "
+                         "Exiting...\n",
+                         argv[1],
+                         (uint64_t)ddqcow->info.size<<9,
+                         0 - errno);
+                close(fd);
+                exit(-1);
+            }
+            close(fd);
+            break;
+        case ENXIO:
+            DFPRINTF("ERROR Device [%s] does not exist\n",argv[1]);
+            exit(-1);
+        default:
+            DFPRINTF("An error occurred opening Device [%s] "
+                     "(errno %d)\n",
+                     argv[1], 0 - errno);
+            exit(-1);
+        }
+    } else {
+        fprintf(stderr, "WARNING: All existing data in "
+                "%s will be overwritten.\nDo you wish to continue? "
+                "(y or n) ",
+                argv[1]);
+        if (getchar() != 'y') {
+            DFPRINTF("Exiting...\n");
+            exit(-1);
+        }
+
+        /*TODO - Test the existing file or device for adequate space*/
+        fd = open(argv[1], O_RDWR | O_LARGEFILE);
+        if (fd < 0) {
+            DFPRINTF("ERROR: opening file [%s] (errno %d)\n",
+                     argv[1], 0 - errno);
+            exit(-1);
+        }
+
+        if (S_ISBLK(finfo.st_mode)) {
+            if (blk_getimagesize(fd, &size) != 0) {
+                close(fd);
+                return -1;
+            }
+
+            /* compare in 64 bits, like every other use of the
+             * image size in bytes */
+            if (size < ((uint64_t)ddqcow->info.size<<9)) {
+                DFPRINTF("ERROR: Not enough space on device "
+                         "%s (%"PRIu64" bytes available, "
+                         "%"PRIu64" bytes required\n",
+                         argv[1], size,
+                         (uint64_t)ddqcow->info.size<<9);
+                close(fd);
+                exit(-1);
+            }
+        } else {
+            if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) {
+                DFPRINTF("Unable to create file "
+                         "[%s] of size %"PRIu64" (errno %d). "
+                         "Exiting...\n",
+                         argv[1],
+                         (uint64_t)ddqcow->info.size<<9,
+                         0 - errno);
+                close(fd);
+                exit(-1);
+            } else DFPRINTF("File [%s] truncated to length %"PRIu64" "
+                            "(%"PRIu64")\n",
+                            argv[1],
+                            (uint64_t)ddqcow->info.size<<9,
+                            (uint64_t)ddqcow->info.size);
+        }
+        close(fd);
+    }
+
+    //Now the output file should be there, reopen it as an aio VBD
+    err=tapdisk_vbd_initialize(-1,-1, AIO_VBD);
+    if( err ) {
+        DPRINTF("qcow2raw Couldn't initialize aio vbd.\n");
+        return err;
+    }
+
+    aio_vbd = tapdisk_server_get_vbd(AIO_VBD);
+    if (!aio_vbd) {
+        err = -ENODEV;
+        DPRINTF("qcow2raw Couldn't create aio vbd.\n");
+        return err;
+    }
+
+    err = tapdisk_vbd_open_vdi(aio_vbd, argv[1], DISK_TYPE_AIO,
+                               TAPDISK_STORAGE_TYPE_DEFAULT,
+                               0);
+    if( err ) {
+        DPRINTF("qcow2raw Couldn't open aio file.\n");
+        return err;
+    }
+
+    ddaio=(tapdisk_vbd_first_image(aio_vbd))->driver;
+
+    /*Initialise the output string*/
+    memset(output,0x20,(100/PROGRESS_QUANT)+5);
+    output[0] = '[';
+    output[(100/PROGRESS_QUANT)+2] = ']';
+    output[(100/PROGRESS_QUANT)+3] = '\0';
+    DFPRINTF("%s",output);
+
+    i = 0;
+    while (running) {
+        if (!complete) {
+            /*Read Pages from qcow image*/
+            if ( (ret = posix_memalign((void **)&buf,
+                                       BLOCK_PROCESSSZ,
+                                       BLOCK_PROCESSSZ))
+                 != 0) {
+                DFPRINTF("Unable to alloc memory (%d)\n",ret);
+                exit(-1);
+            }
+
+            /*Attempt to read 4k sized blocks*/
+            submit_events+=BLOCK_PROCESSSZ>>9;
+
+            //Set up the read request; start from a zeroed
+            //request so no field is passed on uninitialised
+            memset(&treq, 0, sizeof(treq));
+            treq.op = TD_OP_READ;
+            treq.buf = buf;
+            treq.sec = i;
+            treq.secs = BLOCK_PROCESSSZ>>9;
+            treq.image = tapdisk_vbd_first_image(qcow_vbd);
+            treq.cb = send_read_responses;
+            treq.id = 0;
+            treq.sidx = 0;
+
+            req = calloc(1, sizeof(struct request_info));
+            vreq = calloc(1, sizeof(td_vbd_request_t));
+            if (!req || !vreq) {
+                DFPRINTF("Unable to alloc memory for read request\n");
+                exit(-1);
+            }
+
+            req->buf = buf;
+            req->logical_sec = i;
+            req->pending = BLOCK_PROCESSSZ>>9;
+            treq.cb_data = req;
+            treq.private = vreq;
+
+            //Put it in the VBD's queue, so we don't lose
+            //track of it.
+            vreq->submitting = 1;
+            INIT_LIST_HEAD(&vreq->next);
+            tapdisk_vbd_move_request(treq.private,
+                                     &qcow_vbd->pending_requests);
+
+            ddqcow->ops->td_queue_read(ddqcow, treq);
+            --vreq->submitting;
+
+            i += BLOCK_PROCESSSZ>>9;
+
+            if (i >= ddqcow->info.size)
+                complete = 1;
+
+
+            tapdisk_submit_all_tiocbs(&server.aio_queue);
+        }
+
+
+        //Wait until everything submitted so far has been written
+        while(returned_write_events != submit_events) {
+            ret = scheduler_wait_for_events(&server.scheduler);
+            if (ret < 0) {
+                DFPRINTF("server wait returned %d\n", ret);
+                sleep(2);
+            }
+        }
+        if (complete && (returned_write_events == submit_events))
+            running = 0;
+    }
+    memcpy(output+prev+1,"=",1);
+    DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output);
+
+    ddqcow->ops->td_close(ddqcow);
+    ddaio->ops->td_close(ddaio);
+    free(ddqcow->data);
+    free(ddaio->data);
+
+    return 0;
+}
diff --git a/tools/blktap2/drivers/scheduler.c b/tools/blktap2/drivers/scheduler.c
new file mode 100644
index 0000000000..6b8d0093e7
--- /dev/null
+++ b/tools/blktap2/drivers/scheduler.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "scheduler.h"
+#include "tapdisk-log.h"
+
+/* debug-level message through tlog_write() */
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+
+/* upper bound, in seconds, for a single select() wait */
+#define SCHEDULER_MAX_TIMEOUT 600
+/* mask of all file-descriptor based poll modes */
+#define SCHEDULER_POLL_FD (SCHEDULER_POLL_READ_FD | \
+ SCHEDULER_POLL_WRITE_FD | \
+ SCHEDULER_POLL_EXCEPT_FD)
+
+/* NB: both arguments are evaluated twice */
+#define MIN(a, b) ((a) <= (b) ? (a) : (b))
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+
+/* iterate over the registered events of scheduler s; the "safe" list
+ * iterator keeps the next entry in tmp */
+#define scheduler_for_each_event(s, event, tmp) \
+ list_for_each_entry_safe(event, tmp, &(s)->events, next)
+
+/* One registered event: what to poll for and whom to call. */
+typedef struct event {
+ char mode; /* bitmask of SCHEDULER_POLL_* flags */
+ event_id_t id; /* handle returned by scheduler_register_event() */
+
+ int fd; /* descriptor polled for the FD modes */
+ int timeout; /* re-arm interval in seconds (TIMEOUT mode) */
+ int deadline; /* absolute time (seconds, gettimeofday) of the next expiry */
+
+ event_cb_t cb; /* invoked as cb(id, fired mode, private) */
+ void *private; /* opaque argument passed back to cb */
+
+ struct list_head next; /* link in scheduler_t.events */
+} event_t;
+
+/*
+ * Rebuild the select() arguments from the list of registered events:
+ * the three fd sets, the highest descriptor seen, and the wait time.
+ *
+ * The wait time starts at SCHEDULER_MAX_TIMEOUT, is lowered to the time
+ * left until the deadline of each timeout event (0 for an event that is
+ * already due) and is finally capped by s->max_timeout.
+ */
+static void
+scheduler_prepare_events(scheduler_t *s)
+{
+    struct timeval now;
+    event_t *event, *tmp;
+
+    FD_ZERO(&s->read_fds);
+    FD_ZERO(&s->write_fds);
+    FD_ZERO(&s->except_fds);
+
+    s->max_fd = 0;
+    s->timeout = SCHEDULER_MAX_TIMEOUT;
+
+    gettimeofday(&now, NULL);
+
+    scheduler_for_each_event(s, event, tmp) {
+        if (event->mode & SCHEDULER_POLL_READ_FD)
+            FD_SET(event->fd, &s->read_fds);
+
+        if (event->mode & SCHEDULER_POLL_WRITE_FD)
+            FD_SET(event->fd, &s->write_fds);
+
+        if (event->mode & SCHEDULER_POLL_EXCEPT_FD)
+            FD_SET(event->fd, &s->except_fds);
+
+        /* any fd mode makes the descriptor a candidate for max_fd */
+        if (event->mode & SCHEDULER_POLL_FD)
+            s->max_fd = MAX(event->fd, s->max_fd);
+
+        if (event->mode & SCHEDULER_POLL_TIMEOUT) {
+            int remaining = event->deadline - now.tv_sec;
+
+            s->timeout = (remaining > 0) ?
+                MIN(s->timeout, remaining) : 0;
+        }
+    }
+
+    s->timeout = MIN(s->timeout, s->max_timeout);
+}
+
+/*
+ * Deliver @mode to the owner of @event.  Events registered with a timeout
+ * get their deadline pushed to now + timeout before the callback runs,
+ * whichever mode fired.
+ */
+static void
+scheduler_event_callback(event_t *event, char mode)
+{
+    const int rearm = event->mode & SCHEDULER_POLL_TIMEOUT;
+
+    if (rearm) {
+        struct timeval tv;
+
+        gettimeofday(&tv, NULL);
+        event->deadline = tv.tv_sec + event->timeout;
+    }
+
+    (*event->cb)(event->id, mode, event->private);
+}
+
+/*
+ * Dispatch callbacks after select(): for each registered event, the first
+ * of read / write / except readiness that fired is delivered, otherwise a
+ * timeout is delivered if the event's deadline has passed.
+ *
+ * Callbacks may unregister events, which frees list entries (possibly the
+ * one the iterator holds as "next").  scheduler_unregister_event() flags
+ * this through s->restart, and the walk is then restarted from the head.
+ */
+static void
+scheduler_run_events(scheduler_t *s)
+{
+ struct timeval now;
+ event_t *event, *tmp;
+
+ /* sampled once: all deadline checks of this run use the same time */
+ gettimeofday(&now, NULL);
+
+ again:
+ s->restart = 0;
+
+ scheduler_for_each_event(s, event, tmp) {
+ /* the fd bit is cleared before the callback so that a restarted
+  * walk does not deliver the same readiness a second time */
+ if ((event->mode & SCHEDULER_POLL_READ_FD) &&
+ FD_ISSET(event->fd, &s->read_fds)) {
+ FD_CLR(event->fd, &s->read_fds);
+ scheduler_event_callback(event, SCHEDULER_POLL_READ_FD);
+ goto next;
+ }
+
+ if ((event->mode & SCHEDULER_POLL_WRITE_FD) &&
+ FD_ISSET(event->fd, &s->write_fds)) {
+ FD_CLR(event->fd, &s->write_fds);
+ scheduler_event_callback(event, SCHEDULER_POLL_WRITE_FD);
+ goto next;
+ }
+
+ if ((event->mode & SCHEDULER_POLL_EXCEPT_FD) &&
+ FD_ISSET(event->fd, &s->except_fds)) {
+ FD_CLR(event->fd, &s->except_fds);
+ scheduler_event_callback(event, SCHEDULER_POLL_EXCEPT_FD);
+ goto next;
+ }
+
+ /* scheduler_event_callback() moves the deadline to a fresh
+  * now + timeout before invoking the callback */
+ if ((event->mode & SCHEDULER_POLL_TIMEOUT) &&
+ (event->deadline <= now.tv_sec))
+ scheduler_event_callback(event, SCHEDULER_POLL_TIMEOUT);
+
+ next:
+ /* an event was unregistered: event/tmp may be stale, start over */
+ if (s->restart)
+ goto again;
+ }
+}
+
+/*
+ * Register a new event with scheduler @s.
+ *
+ * @mode:    bitmask of SCHEDULER_POLL_* flags; at least one must be set
+ * @fd:      descriptor to poll; must lie in [0, FD_SETSIZE) when any of
+ *           the fd modes is requested, ignored otherwise
+ * @timeout: interval in seconds for SCHEDULER_POLL_TIMEOUT events
+ * @cb:      callback invoked when the event fires; must not be NULL
+ * @private: opaque pointer handed back to @cb
+ *
+ * Returns the id of the new event, to be passed to
+ * scheduler_unregister_event(), or a negative errno value: -EINVAL for
+ * invalid arguments, -ENOMEM if the event cannot be allocated.
+ */
+int
+scheduler_register_event(scheduler_t *s, char mode, int fd,
+                         int timeout, event_cb_t cb, void *private)
+{
+    event_t *event;
+    struct timeval now;
+
+    if (!cb)
+        return -EINVAL;
+
+    if (!(mode & SCHEDULER_POLL_TIMEOUT) && !(mode & SCHEDULER_POLL_FD))
+        return -EINVAL;
+
+    /* FD_SET() on a descriptor outside the fd_set range is undefined;
+     * refuse such descriptors here instead of corrupting the sets later */
+    if ((mode & SCHEDULER_POLL_FD) && (fd < 0 || fd >= FD_SETSIZE))
+        return -EINVAL;
+
+    event = calloc(1, sizeof(event_t));
+    if (!event)
+        return -ENOMEM;
+
+    gettimeofday(&now, NULL);
+
+    INIT_LIST_HEAD(&event->next);
+
+    event->mode = mode;
+    event->fd = fd;
+    event->timeout = timeout;
+    event->deadline = now.tv_sec + timeout;
+    event->cb = cb;
+    event->private = private;
+    event->id = s->uuid++;
+
+    /* id 0 is reserved: scheduler_unregister_event() ignores it */
+    if (!s->uuid)
+        s->uuid++;
+
+    list_add_tail(&event->next, &s->events);
+
+    return event->id;
+}
+
+/*
+ * Remove and free the event with handle @id.  An @id of 0 and unknown ids
+ * are ignored.  On removal s->restart is raised so that an event walk in
+ * progress starts over instead of following stale list pointers.
+ */
+void
+scheduler_unregister_event(scheduler_t *s, event_id_t id)
+{
+    event_t *event, *tmp;
+
+    if (id == 0)
+        return;
+
+    scheduler_for_each_event(s, event, tmp) {
+        if (event->id != id)
+            continue;
+
+        list_del(&event->next);
+        free(event);
+        s->restart = 1;
+        return;
+    }
+}
+
+/*
+ * Lower the cap on the next wait to @timeout seconds.  Negative values are
+ * ignored and the cap is never raised by this call.
+ */
+void
+scheduler_set_max_timeout(scheduler_t *s, int timeout)
+{
+    if (timeout < 0)
+        return;
+
+    if (timeout < s->max_timeout)
+        s->max_timeout = timeout;
+}
+
+/*
+ * Wait in select() for the registered events, then run the callbacks of
+ * those that fired.
+ *
+ * The wait lasts at most s->timeout seconds, as computed by
+ * scheduler_prepare_events().  After select() returns, the timeout and
+ * the max_timeout cap are reset to SCHEDULER_MAX_TIMEOUT.
+ *
+ * Returns the value returned by select(); when it is negative no callback
+ * is run.
+ */
+int
+scheduler_wait_for_events(scheduler_t *s)
+{
+    int nready;
+    struct timeval wait;
+
+    scheduler_prepare_events(s);
+
+    wait.tv_usec = 0;
+    wait.tv_sec = s->timeout;
+
+    DBG("timeout: %d, max_timeout: %d\n",
+        s->timeout, s->max_timeout);
+
+    nready = select(s->max_fd + 1, &s->read_fds,
+                    &s->write_fds, &s->except_fds, &wait);
+
+    s->max_timeout = SCHEDULER_MAX_TIMEOUT;
+    s->timeout = SCHEDULER_MAX_TIMEOUT;
+    s->restart = 0;
+
+    if (nready >= 0)
+        scheduler_run_events(s);
+
+    return nready;
+}
+
+/*
+ * Reset @s to an empty scheduler: no events, empty fd sets, and 1 as the
+ * next event id.  All other fields are zeroed; max_timeout therefore
+ * starts at 0 and is only set to SCHEDULER_MAX_TIMEOUT by the first call
+ * to scheduler_wait_for_events().
+ */
+void
+scheduler_initialize(scheduler_t *s)
+{
+    memset(s, 0, sizeof(*s));
+
+    INIT_LIST_HEAD(&s->events);
+
+    FD_ZERO(&s->except_fds);
+    FD_ZERO(&s->write_fds);
+    FD_ZERO(&s->read_fds);
+
+    s->uuid = 1;
+}
diff --git a/tools/blktap2/drivers/scheduler.h b/tools/blktap2/drivers/scheduler.h
new file mode 100644
index 0000000000..ea37e8f837
--- /dev/null
+++ b/tools/blktap2/drivers/scheduler.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _SCHEDULER_H_
+#define _SCHEDULER_H_
+
+#include <sys/select.h>
+
+#include "list.h"
+
+/* Bits of the 'mode' argument of scheduler_register_event(). */
+#define SCHEDULER_POLL_READ_FD 0x1
+#define SCHEDULER_POLL_WRITE_FD 0x2
+#define SCHEDULER_POLL_EXCEPT_FD 0x4
+#define SCHEDULER_POLL_TIMEOUT 0x8
+
+/* Handle returned by scheduler_register_event(); negative values are errors. */
+typedef int event_id_t;
+/* Event handler: receives the event id, a mode value and the registered private pointer. */
+typedef void (*event_cb_t) (event_id_t id, char mode, void *private);
+
+/* State of one select()-based event loop. */
+typedef struct scheduler {
+ /* fd sets handed to select() by scheduler_wait_for_events() */
+ fd_set read_fds;
+ fd_set write_fds;
+ fd_set except_fds;
+
+ /* list of registered events */
+ struct list_head events;
+
+ /* id to assign to the next registered event */
+ int uuid;
+ /* select() is called with max_fd + 1 as its nfds argument */
+ int max_fd;
+ /* select() timeout in seconds; reset to SCHEDULER_MAX_TIMEOUT after each wait */
+ int timeout;
+ /* set when an event is unregistered; cleared after each select() */
+ int restart;
+ /* cap set via scheduler_set_max_timeout(); reset after each wait */
+ int max_timeout;
+} scheduler_t;
+
+void scheduler_initialize(scheduler_t *);
+event_id_t scheduler_register_event(scheduler_t *, char mode,
+ int fd, int timeout,
+ event_cb_t cb, void *private);
+void scheduler_unregister_event(scheduler_t *, event_id_t);
+void scheduler_set_max_timeout(scheduler_t *, int);
+int scheduler_wait_for_events(scheduler_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-client.c b/tools/blktap2/drivers/tapdisk-client.c
new file mode 100644
index 0000000000..c85b5fc530
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-client.c
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* client harness for tapdisk log */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "log.h"
+
+#define BDPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a)
+
+#define BWPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a)
+
+/* Client-side view of the write log shared with the tapdisk log driver. */
+struct writelog {
+ /* path and size of the shared memory object, as reported over the control socket */
+ char* shmpath;
+ uint32_t shmsize;
+ /* base address of the mapped shared memory */
+ void* shm;
+
+ /* next unprocessed item in the writelog */
+ void* cur;
+ /* requests placed on the ring whose responses have not been consumed yet */
+ unsigned int inflight;
+
+ /* pointer to start and end of free data space for requests */
+ void* dhd;
+ void* dtl;
+
+ /* shared ring inside shm and the front-end state initialised on top of it */
+ log_sring_t* sring;
+ log_front_ring_t fring;
+};
+
+/*
+ * Number of bytes currently free on the data ring.
+ *
+ * dhd/dtl mark the start/end of the free space.  One byte is always kept
+ * unused so that dhd == dtl can only mean "empty", never "full".
+ */
+static inline unsigned int dring_avail(struct writelog* wl)
+{
+	if (wl->dhd == wl->dtl) {
+		/* whole data area minus the reserved byte */
+		return sdataend(wl->shm) - sdatastart(wl->shm) - 1;
+	} else if (wl->dhd < wl->dtl) {
+		/* contiguous gap between dhd and dtl */
+		return wl->dtl - wl->dhd - 1;
+	} else {
+		/* gap wraps: dhd..end of data plus start of data..dtl */
+		return (sdataend(wl->shm) - wl->dhd) + (wl->dtl - sdatastart(wl->shm)) - 1;
+	}
+}
+
+/*
+ * Advance data-ring pointer @start by @len bytes, wrapping around the end
+ * of the data area.  Returns the new position, which always lies in
+ * [sdatastart, sdataend).
+ */
+static inline void* dring_advance(struct writelog* wl, void* start, size_t len)
+{
+	void* next;
+	int dsz = sdataend(wl->shm) - sdatastart(wl->shm);
+
+	next = start + (len % dsz);
+	/*
+	 * sdataend is one past the last data byte (the ring size is computed
+	 * as sdataend - sdatastart), so a pointer landing exactly on it must
+	 * wrap to the start as well.
+	 */
+	if (next >= sdataend(wl->shm))
+		next -= dsz;
+
+	return next;
+}
+
+/* Print the command line synopsis to stderr. */
+static void usage(void)
+{
+	fputs("usage: tapdisk-client <sock>\n", stderr);
+}
+
+/*
+ * Connect to the UNIX stream socket at @sockpath.
+ *
+ * Returns the connected socket file descriptor, or -1 on error (a message
+ * is printed to stderr).
+ */
+static int tdctl_open(const char* sockpath)
+{
+	struct sockaddr_un saddr;
+	size_t len;
+	int fd;
+
+	/* sun_path is a fixed-size array: reject paths that cannot fit,
+	 * leaving room for the terminating NUL */
+	len = strlen(sockpath);
+	if (len >= sizeof(saddr.sun_path)) {
+		BWPRINTF("socket path too long: %s", sockpath);
+		return -1;
+	}
+
+	if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+		BWPRINTF("error creating socket: %s", strerror(errno));
+		return -1;
+	}
+
+	memset(&saddr, 0, sizeof(saddr));
+	saddr.sun_family = AF_UNIX;
+	memcpy(saddr.sun_path, sockpath, len);
+
+	if (connect(fd, (struct sockaddr*)&saddr, sizeof(saddr)) < 0) {
+		BWPRINTF("error connecting to socket %s: %s", sockpath, strerror(errno));
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Send control message @msg on @fd and, if @rsplen is non-zero, read
+ * exactly @rsplen bytes of reply into @rsp.
+ *
+ * Short writes and short reads are treated as errors.
+ * Returns 0 on success, -1 on failure.
+ */
+static int ctl_talk(int fd, struct log_ctlmsg* msg, char* rsp, int rsplen)
+{
+	int n;
+
+	n = write(fd, msg, sizeof(*msg));
+	if (n < 0) {
+		BWPRINTF("error sending ctl request: %s", strerror(errno));
+		return -1;
+	}
+	if (n < sizeof(*msg)) {
+		BWPRINTF("short ctl write (%d/%zd bytes)", n, sizeof(*msg));
+		return -1;
+	}
+
+	/* caller does not expect a reply */
+	if (rsplen == 0)
+		return 0;
+
+	n = read(fd, rsp, rsplen);
+	if (n < 0) {
+		BWPRINTF("error reading ctl response: %s", strerror(errno));
+		return -1;
+	}
+	if (n < rsplen) {
+		BWPRINTF("short ctl read (%d/%d bytes)", n, rsplen);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Ask the log server for its shared memory parameters and store them in
+ * wl->shmsize and wl->shmpath (heap-allocated; released by writelog_free).
+ *
+ * The reply is a uint32_t size immediately followed by the path string.
+ * Returns 0 on success, -1 on failure.
+ */
+static int ctl_get_shmem(int fd, struct writelog* wl)
+{
+	struct log_ctlmsg req;
+	/* one extra zeroed byte guarantees the path is NUL-terminated */
+	char rsp[CTLRSPLEN_SHMP + 1];
+	int rc;
+
+	memset(&req, 0, sizeof(req));
+	memset(rsp, 0, sizeof(rsp));
+
+	memcpy(req.msg, LOGCMD_SHMP, 4);
+	if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_SHMP)) < 0) {
+		BWPRINTF("error getting shared memory parameters");
+		return -1;
+	}
+
+	memcpy(&wl->shmsize, rsp, sizeof(wl->shmsize));
+	wl->shmpath = strdup(rsp + sizeof(wl->shmsize));
+	if (!wl->shmpath) {
+		/* a NULL path would otherwise be printed and passed to shm_open */
+		BWPRINTF("error allocating shared memory path: %s", strerror(errno));
+		return -1;
+	}
+
+	BDPRINTF("shared memory parameters: size: %u, path: %s",
+		 wl->shmsize, wl->shmpath);
+
+	return 0;
+}
+
+/* Zero @msg and store the 4-byte command code @cmd in it. */
+static void ctlmsg_init(struct log_ctlmsg* msg, const char* cmd)
+{
+	memset(msg, 0, sizeof(struct log_ctlmsg));
+	memcpy(msg->msg, cmd, 4);
+}
+
+/*
+ * Send a LOGCMD_GET request and wait for its reply.  The reply bytes are
+ * read but not interpreted here.  Returns 0 on success, -1 on failure.
+ */
+static int ctl_get_writes(int fd)
+{
+	char reply[CTLRSPLEN_GET];
+	struct log_ctlmsg request;
+
+	ctlmsg_init(&request, LOGCMD_GET);
+
+	if (ctl_talk(fd, &request, reply, CTLRSPLEN_GET) < 0) {
+		BWPRINTF("error getting writes");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Send a LOGCMD_PEEK request and wait for its reply.  The reply bytes are
+ * read but not interpreted here.  Returns 0 on success, -1 on failure.
+ */
+static int ctl_peek_writes(int fd)
+{
+	char reply[CTLRSPLEN_PEEK];
+	struct log_ctlmsg request;
+
+	ctlmsg_init(&request, LOGCMD_PEEK);
+
+	if (ctl_talk(fd, &request, reply, CTLRSPLEN_PEEK) < 0) {
+		BWPRINTF("error peeking writes");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Submit pending requests: send LOGCMD_KICK on @fd.  No reply is read
+ * here.  Returns 0 on success, -1 on failure.
+ */
+static int ctl_kick(int fd)
+{
+	struct log_ctlmsg request;
+
+	ctlmsg_init(&request, LOGCMD_KICK);
+
+	if (ctl_talk(fd, &request, NULL, 0) < 0) {
+		BWPRINTF("error kicking ring");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Send a LOGCMD_CLEAR request and wait for its reply.  The reply bytes
+ * are read but not interpreted here.  Returns 0 on success, -1 on failure.
+ */
+static int ctl_clear_writes(int fd)
+{
+	char reply[CTLRSPLEN_CLEAR];
+	struct log_ctlmsg request;
+
+	ctlmsg_init(&request, LOGCMD_CLEAR);
+
+	if (ctl_talk(fd, &request, reply, CTLRSPLEN_CLEAR) < 0) {
+		BWPRINTF("error clearing writes");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Map the shared memory object named by wl->shmpath (wl->shmsize bytes)
+ * and initialise the client-side cursor, data-ring pointers and front ring.
+ *
+ * Returns 0 on success.  On failure returns -1 with wl->shm left NULL and
+ * errno describing the failing call.
+ */
+static int writelog_map(struct writelog* wl)
+{
+	int fd;
+	int saved_errno;
+
+	if ((fd = shm_open(wl->shmpath, O_RDWR, 0750)) < 0) {
+		BWPRINTF("could not open shared memory at %s: %s", wl->shmpath,
+			 strerror(errno));
+		return -1;
+	}
+
+	wl->shm = mmap(NULL, wl->shmsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+	/* the mapping does not need the descriptor; keep mmap's errno across close() */
+	saved_errno = errno;
+	close(fd);
+	if (wl->shm == MAP_FAILED) {
+		BWPRINTF("could not mmap write log shm: %s", strerror(saved_errno));
+		/* never leave MAP_FAILED behind: writelog_free() munmap()s any non-NULL shm */
+		wl->shm = NULL;
+		errno = saved_errno;
+		return -1;
+	}
+	wl->cur = wl->shm;
+	wl->inflight = 0;
+	wl->dhd = wl->dtl = sdatastart(wl->shm);
+
+	BDPRINTF("shm cookie: 0x%x, data size: %u", *((uint32_t*)wl->shm),
+		 dring_avail(wl));
+
+	wl->sring = sringstart(wl->shm);
+	/* need some thought about what to do on reconnect */
+	FRONT_RING_INIT(&wl->fring, wl->sring, SRINGSIZE);
+
+	return 0;
+}
+
+/*
+ * Print every dirty extent in the map at the start of shared memory,
+ * stopping at the first entry with a zero count or at bmend.
+ * Always returns 0.
+ */
+static int writelog_dump(struct writelog* wl)
+{
+	struct disk_range* extent = wl->shm;
+
+	while ((void*)extent < bmend(wl->shm) && extent->count) {
+		BDPRINTF("dirty extent: %"PRIu64":%u",
+			 extent->sector, extent->count);
+		extent++;
+	}
+
+	return 0;
+}
+
+/* walk dirty map from wl->cur and enqueue one read request per extent.
+ * wl->cur is left at the first extent that was not enqueued.
+ * returns: 0 when entire bitmap has been enqueued,
+ * 1 when the ring is full
+ * -1 on error
+ */
+static int writelog_enqueue_requests(struct writelog* wl)
+{
+	struct disk_range* range;
+	log_request_t* req;
+
+	for (range = wl->cur; (void*)range < bmend(wl->shm); range++) {
+		/* a zero count terminates the list of dirty extents */
+		if (!range->count)
+			break;
+
+		if (RING_FULL(&wl->fring))
+			break;
+
+		/* insert range into request stream */
+		/* 1. get next request slot from ring */
+		/* 2. ensure enough shm space is available */
+
+		BDPRINTF("enqueueing dirty extent: %"PRIu64":%u (ring space: %d/%d)",
+			 range->sector, range->count, RING_FREE_REQUESTS(&wl->fring),
+			 RING_SIZE(&wl->fring));
+
+		req = RING_GET_REQUEST(&wl->fring, wl->fring.req_prod_pvt);
+
+		req->sector = range->sector;
+		req->count = range->count;
+		/* ... */
+		req->offset = 0;
+
+		wl->fring.req_prod_pvt++;
+		wl->inflight++;
+	}
+
+	wl->cur = range;
+
+	/*
+	 * Work remains only if we stopped on a live entry inside the map.
+	 * When the loop ran off the end, range == bmend and must not be
+	 * dereferenced.
+	 */
+	if ((void*)range < bmend(wl->shm) && range->count)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Consume every response between the front ring's consumer index and the
+ * shared ring's producer index, logging each one and decrementing
+ * wl->inflight.  Always returns 0.
+ */
+static int writelog_dequeue_responses(struct writelog* wl)
+{
+	RING_IDX idx, prod;
+	log_response_t rsp;
+
+	idx = wl->fring.rsp_cons;
+	prod = wl->sring->rsp_prod;
+
+	BDPRINTF("ring kicked (start = %u, end = %u)", idx, prod);
+
+	for (; idx != prod; wl->inflight--) {
+		/* copy the response out of shared memory before using it */
+		memcpy(&rsp, RING_GET_RESPONSE(&wl->fring, idx), sizeof(rsp));
+		BDPRINTF("ctl: read response %"PRIu64":%u", rsp.sector, rsp.count);
+		idx++;
+		wl->fring.rsp_cons = idx;
+	}
+
+	return 0;
+}
+
+/*
+ * Release the resources held by @wl: the shm path string and the shared
+ * memory mapping.  Both pointers are reset to NULL.  Always returns 0.
+ */
+static int writelog_free(struct writelog* wl)
+{
+	if (wl->shmpath != NULL) {
+		free(wl->shmpath);
+		wl->shmpath = NULL;
+	}
+
+	if (wl->shm != NULL) {
+		munmap(wl->shm, wl->shmsize);
+		wl->shm = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Issue a PEEK (when @peek is non-zero) or GET request on @fd and, on
+ * success, rewind wl->cur to the start of shared memory.
+ *
+ * Returns 0 on success or the negative result of the control request.
+ */
+int get_writes(struct writelog* wl, int fd, int peek)
+{
+	int rc = peek ? ctl_peek_writes(fd) : ctl_get_writes(fd);
+
+	if (rc < 0)
+		return rc;
+
+	wl->cur = wl->shm;
+
+	return 0;
+}
+
+/*
+ * Block on the control socket until a KICK message arrives, then drain
+ * the response ring.
+ *
+ * Returns 0 on success, -1 on read error, EOF, short message, unknown
+ * message, or failure to dequeue.
+ */
+int await_responses(struct writelog* wl, int fd)
+{
+	struct log_ctlmsg msg;
+	int n;
+
+	/* sit on socket waiting for kick */
+	n = read(fd, &msg, sizeof(msg));
+	if (n < 0) {
+		BWPRINTF("error reading from control socket: %s", strerror(errno));
+		return -1;
+	}
+	if (n == 0) {
+		BWPRINTF("EOF on control socket");
+		return -1;
+	}
+	if (n < sizeof(msg)) {
+		BWPRINTF("short reply (%d/%d bytes)", n, (int) sizeof(msg));
+		return -1;
+	}
+
+	if (strncmp(msg.msg, LOGCMD_KICK, 4) != 0) {
+		BWPRINTF("Unknown message received: %.4s", msg.msg);
+		return -1;
+	}
+
+	return (writelog_dequeue_responses(wl) < 0) ? -1 : 0;
+}
+
+/* read_loop:
+ * 1. extract dirty bitmap
+ * 2. feed as much as possible onto ring
+ * 3. kick
+ * 4. as responses come back, feed more of the dirty bitmap
+ * into the ring
+ * 5. when entire bitmap has been queued, go to 1?
+ */
+int read_loop(struct writelog* wl, int fd)
+{
+ int rc;
+
+ /* peek (rather than get) the dirty map, then print it */
+ if (get_writes(wl, fd, 1) < 0)
+ return -1;
+ writelog_dump(wl);
+
+ do {
+ /* rc is 1 while extents remain that did not fit on the ring */
+ rc = writelog_enqueue_requests(wl);
+
+ /* only push when the ring actually holds requests */
+ if (RING_FREE_REQUESTS(&wl->fring) < RING_SIZE(&wl->fring))
+ RING_PUSH_REQUESTS(&wl->fring);
+ /* the kick is sent on every iteration, whether or not anything was pushed */
+ if (ctl_kick(fd) < 0)
+ return -1;
+
+ /* collect responses */
+ if (wl->inflight && await_responses(wl, fd) < 0)
+ return -1;
+ } while (rc > 0);
+
+ return rc;
+}
+
+/*
+ * tapdisk-client <sock> [cmd]
+ *
+ * Connects to the control socket, maps the write log and runs one command,
+ * selected by the first character of the optional second argument:
+ *   p  peek at the dirty extents and print them (default)
+ *   g  get the dirty extents and print them
+ *   c  clear the write log
+ *   r  run the read loop
+ *
+ * Returns 0 on success, 1 on any error.
+ */
+int main(int argc, char* argv[])
+{
+	int fd;
+	int rc = 1;
+	struct writelog wl;
+	char cmd;
+
+	if (argc < 2) {
+		usage();
+		return 1;
+	}
+
+	cmd = (argc < 3) ? 'p' : argv[2][0];
+
+	/* zeroed so that writelog_free() is safe on every exit path */
+	memset(&wl, 0, sizeof(wl));
+
+	fd = tdctl_open(argv[1]);
+	if (fd < 0)
+		return 1;
+
+	if (ctl_get_shmem(fd, &wl) < 0)
+		goto out;
+
+	if (writelog_map(&wl) < 0) {
+		BWPRINTF("Error mapping write log: %s", strerror(errno));
+		/* a failed mapping must not reach munmap() in writelog_free() */
+		if (wl.shm == MAP_FAILED)
+			wl.shm = NULL;
+		goto out;
+	}
+
+	switch (cmd) {
+	case 'p':
+		if (get_writes(&wl, fd, 1) < 0)
+			goto out;
+		writelog_dump(&wl);
+		break;
+	case 'c':
+		if (ctl_clear_writes(fd) < 0)
+			goto out;
+		break;
+	case 'g':
+		if (get_writes(&wl, fd, 0) < 0)
+			goto out;
+		writelog_dump(&wl);
+		break;
+	case 'r':
+		if (read_loop(&wl, fd) < 0)
+			goto out;
+		break;
+	default:
+		usage();
+		goto out;
+	}
+
+	rc = 0;
+
+out:
+	writelog_free(&wl);
+	close(fd);
+
+	return rc;
+}
diff --git a/tools/blktap2/drivers/tapdisk-diff.c b/tools/blktap2/drivers/tapdisk-diff.c
new file mode 100644
index 0000000000..0f31c57d42
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-diff.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+#include "libvhd.h"
+
+#define POLL_READ 0
+#define POLL_WRITE 1
+
+#define SPB_SHIFT (VHD_BLOCK_SHIFT - SECTOR_SHIFT)
+
+/* Pipe used to wake a stream's enqueue handler through the event loop. */
+struct tapdisk_stream_poll {
+ /* pipe[POLL_READ] is polled by the scheduler; pipe[POLL_WRITE] is written to raise a wakeup */
+ int pipe[2];
+ /* non-zero while a wakeup has been written and not yet cleared */
+ int set;
+};
+
+/* One read request issued on a stream. */
+struct tapdisk_stream_request {
+ /* first sector of the request */
+ uint64_t sec;
+ /* number of sectors covered by the request */
+ uint32_t secs;
+ /* issue order on stream1; completed requests are kept sorted by this value */
+ uint64_t seqno;
+ /* the blkif request copied into the vbd's request slot */
+ blkif_request_t blkif_req;
+ /* linkage on the stream's free, pending or completed list */
+ struct list_head next;
+};
+
+/* State of one of the two images being compared. */
+struct tapdisk_stream {
+ /* vbd the image is opened on */
+ td_vbd_t *vbd;
+
+ /* taken from tapdisk_stream_count; used to look the vbd up on the server */
+ unsigned int id;
+
+ /* first error seen on this stream; 0 if none */
+ int err;
+
+ /* next sector to issue, and the sector range [start, end) being read */
+ uint64_t cur;
+ uint64_t start;
+ uint64_t end;
+
+ /* counters: requests issued (source of seqno) and requests compared */
+ uint64_t started;
+ uint64_t completed;
+
+ /* wakeup pipe and the id of the scheduler event watching it */
+ struct tapdisk_stream_poll poll;
+ event_id_t enqueue_event_id;
+
+ /* every entry of requests[] sits on exactly one of these lists */
+ struct list_head free_list;
+ struct list_head pending_list;
+ struct list_head completed_list;
+
+ struct tapdisk_stream_request requests[MAX_REQUESTS];
+};
+
+static unsigned int tapdisk_stream_count;
+
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+static char *program;
+static struct tapdisk_stream stream1, stream2;
+static vhd_context_t vhd1;
+
+/* Print the command line synopsis to @stream. */
+static void
+usage(FILE *stream)
+{
+	/* honour the caller's stream: error paths pass stderr, -h passes stdout */
+	fprintf(stream,
+		"usage: %s <-n type:/path/to/image> <-m type:/path/to/image>\n",
+		program);
+}
+
+/*
+ * Open the VHD at @path read-only into @vhd and load its BAT.
+ * Returns 0 on success; otherwise the error from libvhd, with @vhd closed
+ * again if the BAT could not be read.
+ */
+static int
+open_vhd(const char *path, vhd_context_t *vhd)
+{
+	int rc;
+
+	rc = vhd_open(vhd, path, VHD_OPEN_RDONLY);
+	if (rc != 0) {
+		printf("error opening %s: %d\n", path, rc);
+		return rc;
+	}
+
+	rc = vhd_get_bat(vhd);
+	if (rc == 0)
+		return 0;
+
+	printf("error reading BAT for %s: %d\n", path, rc);
+	vhd_close(vhd);
+	return rc;
+}
+
+/* Mark both pipe ends as closed (-1) and clear the wakeup flag. */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+	p->pipe[POLL_WRITE] = -1;
+	p->pipe[POLL_READ]  = -1;
+	p->set = 0;
+}
+
+/*
+ * Create the wakeup pipe for @p and switch both ends to non-blocking mode.
+ *
+ * Returns 0 on success or a negative errno value.  On failure the pipe is
+ * closed and @p is reset to its initial state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+	int err;
+
+	tapdisk_stream_poll_initialize(p);
+
+	if (pipe(p->pipe))
+		return -errno;
+
+	if (fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK) ||
+	    fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK)) {
+		/* capture errno now: the close() calls below may overwrite it */
+		err = -errno;
+		close(p->pipe[POLL_READ]);
+		close(p->pipe[POLL_WRITE]);
+		tapdisk_stream_poll_initialize(p);
+		return err;
+	}
+
+	return 0;
+}
+
+/* Close whichever pipe ends are open and reset @p to its initial state. */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+	int rfd = p->pipe[POLL_READ];
+	int wfd = p->pipe[POLL_WRITE];
+
+	if (rfd != -1)
+		close(rfd);
+	if (wfd != -1)
+		close(wfd);
+
+	tapdisk_stream_poll_initialize(p);
+}
+
+/*
+ * Consume one pending wakeup from the pipe and clear the flag.  The result
+ * of the read is ignored.
+ */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+	int token;
+
+	read(p->pipe[POLL_READ], &token, sizeof(token));
+	p->set = 0;
+}
+
+/*
+ * Raise a wakeup on @p by writing one int to the pipe, unless one is
+ * already pending.  The result of the write is ignored.
+ */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+	int token = 0;
+
+	if (p->set)
+		return;
+
+	write(p->pipe[POLL_WRITE], &token, sizeof(token));
+	p->set = 1;
+}
+
+/*
+ * Return non-zero when stream @s has nothing left to do: it has reached
+ * its end or hit an error, and no requests are pending or awaiting
+ * comparison.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+	/* still has sectors to issue and no error: keep going */
+	if (s->cur != s->end && !s->err)
+		return 0;
+
+	return (list_empty(&s->pending_list) &&
+		list_empty(&s->completed_list));
+}
+
+/* Zero @req and make its list node an empty, self-linked list head. */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+	memset(req, 0, sizeof(struct tapdisk_stream_request));
+	INIT_LIST_HEAD(&req->next);
+}
+
+/* Return the index of @req within s->requests[]. */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+			   struct tapdisk_stream_request *req)
+{
+	return (req - &s->requests[0]);
+}
+
+/*
+ * Take the first request off the stream's free list and reinitialise it.
+ * Returns NULL when the free list is empty.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+	struct tapdisk_stream_request *req = NULL;
+
+	if (!list_empty(&s->free_list)) {
+		req = list_entry(s->free_list.next,
+				 struct tapdisk_stream_request, next);
+
+		list_del_init(&req->next);
+		tapdisk_stream_initialize_request(req);
+	}
+
+	return req;
+}
+
+/*
+ * Insert @sreq into the stream's completed list, keeping the list sorted
+ * by ascending seqno.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+			       struct tapdisk_stream_request *sreq)
+{
+	struct tapdisk_stream_request *itr;
+	/* node to insert in front of; the list head means "append at the end" */
+	struct list_head *before = &s->completed_list;
+
+	list_for_each_entry(itr, &s->completed_list, next) {
+		if (sreq->seqno < itr->seqno) {
+			before = &itr->next;
+			break;
+		}
+	}
+
+	list_add_tail(&sreq->next, before);
+}
+
+/*
+ * Compare the data buffers of two completed requests, one from each
+ * stream, that share a seqno and a length.
+ * Returns the memcmp() result: 0 when the data is identical.
+ */
+static int
+tapdisk_result_compare(struct tapdisk_stream_request *sreq1,
+		       struct tapdisk_stream_request *sreq2)
+{
+	unsigned long slot1, slot2;
+	char *data1, *data2;
+
+	assert(sreq1->seqno == sreq2->seqno);
+	assert(sreq1->secs == sreq2->secs);
+
+	/* a request's buffer is located by its slot index in its own stream */
+	slot1 = (unsigned long)tapdisk_stream_request_idx(&stream1, sreq1);
+	data1 = (char *)MMAP_VADDR(stream1.vbd->ring.vstart, slot1, 0);
+
+	slot2 = (unsigned long)tapdisk_stream_request_idx(&stream2, sreq2);
+	data2 = (char *)MMAP_VADDR(stream2.vbd->ring.vstart, slot2, 0);
+
+	return memcmp(data1, data2, sreq1->secs << SECTOR_SHIFT);
+}
+
+/*
+ * Walk the completed lists of both streams in lockstep and compare every
+ * pair of requests that share a seqno.  Compared requests are counted and
+ * moved back to their stream's free list.
+ *
+ * Returns 0 if every compared pair matched, otherwise the non-zero
+ * memcmp() result of the first mismatch (the walk stops there).
+ */
+static int
+tapdisk_stream_process_data(void)
+{
+ struct tapdisk_stream_request *sreq1, *sreq2, *tmp1, *tmp2;
+ int advance_both;
+ int result = 0;
+
+ /* sreqN is the current entry, tmpN its successor; the successor is
+ * fetched up front because the current entry may be unlinked below */
+ sreq1 = list_entry(stream1.completed_list.next,
+ struct tapdisk_stream_request, next);
+ sreq2 = list_entry(stream2.completed_list.next,
+ struct tapdisk_stream_request, next);
+ tmp1 = list_entry(sreq1->next.next,
+ struct tapdisk_stream_request, next);
+ tmp2 = list_entry(sreq2->next.next,
+ struct tapdisk_stream_request, next);
+ /* stop at the first mismatch or when either list is exhausted */
+ while (result == 0 &&
+ &sreq1->next != &stream1.completed_list &&
+ &sreq2->next != &stream2.completed_list) {
+ //printf("checking: %llu|%llu\n", sreq1->seqno, sreq2->seqno);
+ advance_both = 1;
+ /* stream1 is behind: step stream1 only */
+ if (sreq1->seqno < sreq2->seqno) {
+ advance_both = 0;
+ goto advance1;
+ }
+ /* stream2 is behind: step stream2 only */
+ if (sreq1->seqno > sreq2->seqno)
+ goto advance2;
+
+ /* same seqno on both sides: compare the data read */
+ result = tapdisk_result_compare(sreq1, sreq2);
+
+ stream1.completed++;
+ stream2.completed++;
+
+ /* recycle both requests */
+ list_del_init(&sreq1->next);
+ list_add_tail(&sreq1->next, &stream1.free_list);
+ list_del_init(&sreq2->next);
+ list_add_tail(&sreq2->next, &stream2.free_list);
+
+advance1:
+ sreq1 = tmp1;
+ tmp1 = list_entry(tmp1->next.next,
+ struct tapdisk_stream_request, next);
+ if (!advance_both)
+ continue;
+advance2:
+ sreq2 = tmp2;
+ tmp2 = list_entry(tmp2->next.next,
+ struct tapdisk_stream_request, next);
+ }
+
+ return result;
+}
+
+/*
+ * vbd completion callback (installed with tapdisk_vbd_set_callback).
+ * @arg is the stream the response belongs to; rsp->id is the index of the
+ * request in that stream's requests[] array, as stored in the blkif
+ * request id when it was enqueued.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+ struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+ /* NOTE(review): rsp->id is used as an index without a bounds check */
+ struct tapdisk_stream_request *sreq = s->requests + rsp->id;
+
+ /* take the request off the pending list */
+ list_del_init(&sreq->next);
+
+ if (rsp->status == BLKIF_RSP_OKAY)
+ tapdisk_stream_queue_completed(s, sreq);
+ else {
+ /* failed read: record the error and recycle the request */
+ s->err = EIO;
+ list_add_tail(&sreq->next, &s->free_list);
+ fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec);
+ }
+
+ /* compare whatever both streams have completed so far */
+ if (tapdisk_stream_process_data()) {
+ fprintf(stderr, "mismatch at sector 0x%"PRIx64"\n",
+ sreq->sec);
+ stream1.err = EINVAL;
+ stream2.err = EINVAL;
+ }
+
+ /* wake both enqueue handlers */
+ tapdisk_stream_poll_set(&stream1.poll);
+ tapdisk_stream_poll_set(&stream2.poll);
+}
+
+/*
+ * Queue on stream @s a read request covering the same sectors, segments
+ * and seqno as request @r.
+ *
+ * Returns 0 on success, 1 if @s has no free request slot.
+ */
+static inline int
+tapdisk_stream_enqueue_copy(struct tapdisk_stream *s,
+ struct tapdisk_stream_request *r)
+{
+ td_vbd_t *vbd;
+ blkif_request_t *breq;
+ td_vbd_request_t *vreq;
+ struct tapdisk_stream_request *sreq;
+ int idx;
+
+ /* NOTE(review): always stream2's vbd, not s->vbd; the only caller visible here passes &stream2 */
+ vbd = stream2.vbd;
+ sreq = tapdisk_stream_get_request(s);
+ if (!sreq)
+ return 1;
+
+ idx = tapdisk_stream_request_idx(s, sreq);
+
+ sreq->sec = r->sec;
+ sreq->secs = r->secs;
+ sreq->seqno = r->seqno;
+
+ /* the request index doubles as the blkif id, which the completion callback uses to find sreq */
+ breq = &sreq->blkif_req;
+ breq->id = idx;
+ breq->nr_segments = r->blkif_req.nr_segments;
+ breq->sector_number = r->blkif_req.sector_number;
+ breq->operation = BLKIF_OP_READ;
+
+ /* copy the sector range of each segment from the source request */
+ for (int i = 0; i < r->blkif_req.nr_segments; i++) {
+ struct blkif_request_segment *seg = breq->seg + i;
+ seg->first_sect = r->blkif_req.seg[i].first_sect;
+ seg->last_sect = r->blkif_req.seg[i].last_sect;
+ }
+ s->cur += sreq->secs;
+
+ /* hand the request to the vbd through the slot with the same index */
+ vreq = vbd->request_list + idx;
+ assert(list_empty(&vreq->next));
+ assert(vreq->secs_pending == 0);
+
+ memcpy(&vreq->req, breq, sizeof(*breq));
+ vbd->received++;
+ vreq->vbd = vbd;
+
+ tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+ list_add_tail(&sreq->next, &s->pending_list);
+
+ return 0;
+}
+
+/*
+ * Issue read requests on stream1 for every sector that lies in a block
+ * allocated in the first VHD's BAT, until the end of the image is reached,
+ * an error is recorded or no free request slot is left.  A request never
+ * crosses a VHD block boundary.
+ */
+static void
+tapdisk_stream_enqueue1(void)
+{
+	td_vbd_t *vbd;
+	int i, idx, psize, blk;
+	struct tapdisk_stream *s = &stream1;
+
+	vbd = s->vbd;
+	psize = getpagesize();
+
+	while (s->cur < s->end && !s->err) {
+		blkif_request_t *breq;
+		td_vbd_request_t *vreq;
+		struct tapdisk_stream_request *sreq;
+
+		/* skip any blocks that are not present in this image */
+		blk = s->cur >> SPB_SHIFT;
+		while (s->cur < s->end && vhd1.bat.bat[blk] == DD_BLK_UNUSED) {
+			blk++;
+			/* widen before shifting: the sector number can exceed int */
+			s->cur = (uint64_t)blk << SPB_SHIFT;
+		}
+
+		if (s->cur >= s->end)
+			break;
+
+		sreq = tapdisk_stream_get_request(s);
+		if (!sreq)
+			break;
+
+		idx = tapdisk_stream_request_idx(s, sreq);
+
+		sreq->sec = s->cur;
+		sreq->secs = 0;
+		sreq->seqno = s->started++;
+
+		/* the request index doubles as the blkif id */
+		breq = &sreq->blkif_req;
+		breq->id = idx;
+		breq->nr_segments = 0;
+		breq->sector_number = sreq->sec;
+		breq->operation = BLKIF_OP_READ;
+
+		for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+			uint32_t secs;
+			struct blkif_request_segment *seg = breq->seg + i;
+
+			/* at most one page per segment, clipped to the image end ... */
+			secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+			/* ... and to the end of the current block (64-bit math, see above) */
+			secs = MIN(((uint64_t)(blk + 1) << SPB_SHIFT) - s->cur, secs);
+			if (!secs)
+				break;
+
+			sreq->secs += secs;
+			s->cur += secs;
+
+			seg->first_sect = 0;
+			seg->last_sect = secs - 1;
+			breq->nr_segments++;
+		}
+
+		/* hand the request to the vbd through the slot with the same index */
+		vreq = vbd->request_list + idx;
+
+		assert(list_empty(&vreq->next));
+		assert(vreq->secs_pending == 0);
+
+		memcpy(&vreq->req, breq, sizeof(*breq));
+		vbd->received++;
+		vreq->vbd = vbd;
+
+		tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+		list_add_tail(&sreq->next, &s->pending_list);
+	}
+
+	tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Mirror onto stream2 the requests stream1 has issued: first those already
+ * completed on stream1, then those still pending there.  Requests starting
+ * before stream2's current position are skipped.  If stream2 runs out of
+ * free request slots the copy stops early; otherwise stream2's position is
+ * set to stream1's.
+ */
+static void
+tapdisk_stream_enqueue2(void)
+{
+	struct list_head *sources[2];
+	struct tapdisk_stream_request *itr;
+	struct tapdisk_stream *s = &stream2;
+	td_vbd_t *vbd = s->vbd;
+	int n;
+
+	/* issue the same requests that we issued on stream1 */
+	sources[0] = &stream1.completed_list;
+	sources[1] = &stream1.pending_list;
+
+	for (n = 0; n < 2; n++) {
+		list_for_each_entry(itr, sources[n], next) {
+			if (itr->sec < s->cur)
+				continue;
+			if (tapdisk_stream_enqueue_copy(s, itr))
+				goto done;
+		}
+	}
+
+	stream2.cur = stream1.cur;
+
+done:
+	tapdisk_vbd_issue_requests(vbd);
+}
+
+/* Return non-zero once both streams have nothing left to do. */
+static inline int
+tapdisk_diff_done(void)
+{
+	if (!tapdisk_stream_stop(&stream1))
+		return 0;
+
+	return tapdisk_stream_stop(&stream2);
+}
+
+/* Close the images of both streams, stream1 first. */
+static void
+tapdisk_diff_stop(void)
+{
+	struct tapdisk_stream *first = &stream1;
+	struct tapdisk_stream *second = &stream2;
+
+	tapdisk_stream_close_image(first);
+	tapdisk_stream_close_image(second);
+}
+
+/*
+ * Scheduler callback for a stream's wakeup pipe (@arg is the stream).
+ * Clears the wakeup, issues more requests on that stream, and closes both
+ * images once the whole comparison is finished.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+	struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+
+	tapdisk_stream_poll_clear(&s->poll);
+
+	if (tapdisk_diff_done())
+		goto stop;
+
+	if (s == &stream1)
+		tapdisk_stream_enqueue1();
+	else if (s == &stream2)
+		tapdisk_stream_enqueue2();
+	else
+		assert(0);
+
+	// we have to check again for the case when stream1 had no
+	// blocks at all
+	if (!tapdisk_diff_done())
+		return;
+
+stop:
+	tapdisk_diff_stop();
+}
+
+/*
+ * Create a vbd for stream @s, open the image at @path (of disk type @type)
+ * read-only on it, and set the stream's sector range from the image size.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
+{
+ int err;
+ image_t image;
+
+ /* each stream gets the next free id, used to look its vbd up later */
+ s->id = tapdisk_stream_count++;
+
+ err = tapdisk_vbd_initialize(-1, -1, s->id);
+ if (err)
+ goto out;
+
+ s->vbd = tapdisk_server_get_vbd(s->id);
+ if (!s->vbd) {
+ /* NOTE(review): positive errno here, while main() prints strerror(-err) -- confirm the intended sign */
+ err = ENODEV;
+ goto out;
+ }
+
+ /* completions for this vbd are delivered to tapdisk_stream_dequeue */
+ tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
+
+ err = tapdisk_vbd_open_vdi(s->vbd, path, type,
+ TAPDISK_STORAGE_TYPE_DEFAULT,
+ TD_OPEN_RDONLY);
+ if (err)
+ goto out;
+
+ s->vbd->reopened = 1;
+
+ err = tapdisk_vbd_get_image_info(s->vbd, &image);
+ if (err) {
+ /* this path prints its own message and bypasses the one at 'out' */
+ fprintf(stderr, "failed getting image size: %d\n", err);
+ return err;
+ }
+
+ /* read the whole image: [0, image.size) */
+ s->start = 0;
+ s->cur = s->start;
+ s->end = image.size;
+
+ err = 0;
+
+out:
+ if (err)
+ fprintf(stderr, "failed to open image %s: %d\n", path, err);
+ return err;
+}
+
+/*
+ * Close the image of stream @s and free its vbd, including the data
+ * buffers at ring.vstart.  Does nothing if the server no longer knows a
+ * vbd with the stream's id.
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+	td_vbd_t *vbd = tapdisk_server_get_vbd(s->id);
+
+	if (!vbd)
+		return;
+
+	tapdisk_vbd_close_vdi(vbd);
+	tapdisk_server_remove_vbd(vbd);
+
+	free((void *)vbd->ring.vstart);
+	free(vbd->name);
+	free(vbd);
+
+	s->vbd = NULL;
+}
+
+/*
+ * Allocate the page-aligned data buffers for stream @s and put every
+ * entry of s->requests[] on the free list.
+ *
+ * Returns 0 on success or the posix_memalign() error code.
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+	td_ring_t *ring = &s->vbd->ring;
+	int psize = getpagesize();
+	size_t size = psize * BLKTAP_MMAP_REGION_SIZE;
+	int err, n;
+
+	/* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */
+	err = posix_memalign((void **)&ring->vstart, psize, size);
+	if (err) {
+		fprintf(stderr, "failed to allocate buffers: %d\n", err);
+		ring->vstart = 0;
+		return err;
+	}
+
+	for (n = 0; n < MAX_REQUESTS; n++) {
+		tapdisk_stream_initialize_request(&s->requests[n]);
+		list_add_tail(&s->requests[n].next, &s->free_list);
+	}
+
+	return 0;
+}
+
+/*
+ * Open the wakeup pipe of stream @s and register its read end with the
+ * server's scheduler, so that tapdisk_stream_enqueue runs when the pipe
+ * becomes readable.  The event id is stored in s->enqueue_event_id.
+ *
+ * Returns 0 on success or a negative error value; on failure no pipe is
+ * left open.
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+	int err;
+	struct tapdisk_stream_poll *p = &s->poll;
+
+	err = tapdisk_stream_poll_open(p);
+	if (err)
+		goto out;
+
+	err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+					    p->pipe[POLL_READ], 0,
+					    tapdisk_stream_enqueue, s);
+	if (err < 0) {
+		/* nothing will ever poll the pipe: do not leak its descriptors */
+		tapdisk_stream_poll_close(p);
+		goto out;
+	}
+
+	/* a non-negative return value is the id of the new event */
+	s->enqueue_event_id = err;
+	err = 0;
+
+out:
+	if (err)
+		fprintf(stderr, "failed to register event: %d\n", err);
+	return err;
+}
+
+/*
+ * Unregister the stream's enqueue event, if one is registered, and close
+ * its wakeup pipe.
+ */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+	event_id_t id = s->enqueue_event_id;
+
+	if (id != 0) {
+		tapdisk_server_unregister_event(id);
+		s->enqueue_event_id = 0;
+	}
+
+	tapdisk_stream_poll_close(&s->poll);
+}
+
+/* Zero stream @s and set up its three empty request lists. */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+	memset(s, 0, sizeof(struct tapdisk_stream));
+
+	INIT_LIST_HEAD(&s->completed_list);
+	INIT_LIST_HEAD(&s->pending_list);
+	INIT_LIST_HEAD(&s->free_list);
+}
+
+/*
+ * Set up stream @s for the image named by @arg, given in the
+ * "type:/path/to/image" form shown in the usage text: open the image,
+ * allocate the request buffers, register the enqueue event and issue the
+ * first batch of requests.
+ *
+ * Returns 0 on success, non-zero on failure.
+ * NOTE(review): the failure paths return without undoing the earlier steps.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *arg)
+{
+ int err, type;
+ char *path;
+
+ err = tapdisk_parse_disk_type(arg, &path, &type);
+ if (err)
+ return err;
+
+ tapdisk_stream_initialize(s);
+
+ err = tapdisk_stream_open_image(s, path, type);
+ if (err)
+ return err;
+
+ err = tapdisk_stream_initialize_requests(s);
+ if (err)
+ return err;
+
+ err = tapdisk_stream_register_enqueue_event(s);
+ if (err)
+ return err;
+
+ /* call the handler directly once to start issuing requests */
+ tapdisk_stream_enqueue(s->enqueue_event_id,
+ SCHEDULER_POLL_READ_FD, s);
+
+ return 0;
+}
+
+/* Tear down stream @s: close its image, then unregister its enqueue event and close the wakeup pipe. */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+ tapdisk_stream_close_image(s);
+ tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Issue requests on stream @s, run the server loop, and return the
+ * stream's error code once tapdisk_server_run() returns.
+ * NOTE(review): no caller of this function is visible in this file.
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+ tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s);
+ tapdisk_server_run();
+ return s->err;
+}
+
+/*
+ * tapdisk-diff -n type:/path/to/image -m type:/path/to/image
+ *
+ * Compares the data of two images.  The -n image must be a VHD: only the
+ * blocks allocated in its BAT are read and compared.
+ *
+ * Returns 0 when no error or mismatch was recorded, non-zero otherwise.
+ */
+int
+main(int argc, char *argv[])
+{
+ int c, err, type1;
+ const char *arg1 = NULL, *arg2 = NULL;
+ char *path1;
+
+ err = 0;
+
+ program = basename(argv[0]);
+
+ while ((c = getopt(argc, argv, "n:m:h")) != -1) {
+ switch (c) {
+ case 'n':
+ arg1 = optarg;
+ break;
+ case 'm':
+ arg2 = optarg;
+ break;
+ case 'h':
+ usage(stdout);
+ return 0;
+ default:
+ goto fail_usage;
+ }
+ }
+
+ /* both images are mandatory */
+ if (!arg1 || !arg2)
+ goto fail_usage;
+
+ /* the first image is also opened directly with libvhd, for its BAT */
+ err = tapdisk_parse_disk_type(arg1, &path1, &type1);
+ if (err)
+ return err;
+ if (type1 != DISK_TYPE_VHD) {
+ printf("error: first VDI is not VHD\n");
+ return EINVAL;
+ }
+
+ err = open_vhd(path1, &vhd1);
+ if (err)
+ return err;
+
+ tapdisk_start_logging("tapdisk-diff");
+
+ err = tapdisk_server_initialize(NULL, NULL);
+ if (err)
+ goto out;
+
+ err = tapdisk_stream_open(&stream1, arg1);
+ if (err) {
+ fprintf(stderr, "Failed to open %s: %s\n",
+ arg1, strerror(-err));
+ goto out;
+ }
+
+ err = tapdisk_stream_open(&stream2, arg2);
+ if (err) {
+ fprintf(stderr, "Failed to open %s: %s\n",
+ arg2, strerror(-err));
+ goto out1;
+ }
+
+ /* the comparison assumes both images cover the same sector range */
+ if (stream1.end != stream2.end) {
+ fprintf(stderr, "Image sizes differ: %"PRIu64" != %"PRIu64"\n",
+ stream1.end, stream2.end);
+ err = EINVAL;
+ goto out2;
+ }
+
+ /* run the event loop; the comparison itself happens in the stream callbacks */
+ tapdisk_server_run();
+
+ /* cleanup labels unwind in reverse order of setup */
+out2:
+ tapdisk_stream_release(&stream2);
+out1:
+ tapdisk_stream_release(&stream1);
+out:
+ vhd_close(&vhd1);
+ tapdisk_stop_logging();
+
+ /* GNU "?:" extension: a setup error takes precedence over stream1's error */
+ return err ? : stream1.err;
+
+fail_usage:
+ usage(stderr);
+ return 1;
+}
diff --git a/tools/blktap2/drivers/tapdisk-driver.c b/tools/blktap2/drivers/tapdisk-driver.c
new file mode 100644
index 0000000000..ca5629ab73
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-driver.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdlib.h>
+
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+
+/*
+ * Create a driver handle for the given disk type.
+ *
+ * The driver interface for type is looked up first; the handle then gets
+ * a copy of name (via tapdisk_namedup), the interface pointer, the type
+ * and storage values, and a zeroed private area of
+ * ops->private_data_size bytes.  TD_DRIVER_RDONLY is set in the handle's
+ * state when flags carries TD_OPEN_RDONLY.
+ *
+ * Returns the new handle, or NULL when no interface exists for type or
+ * any allocation fails.
+ */
+td_driver_t *
+tapdisk_driver_allocate(int type, char *name, td_flag_t flags, int storage)
+{
+	td_driver_t *drv;
+	struct tap_disk *iface = tapdisk_server_find_driver_interface(type);
+
+	if (!iface)
+		return NULL;
+
+	drv = calloc(1, sizeof(*drv));
+	if (!drv)
+		return NULL;
+
+	if (tapdisk_namedup(&drv->name, name))
+		goto err_out;
+
+	drv->ops = iface;
+	drv->type = type;
+	drv->storage = storage;
+
+	drv->data = calloc(1, iface->private_data_size);
+	if (!drv->data)
+		goto err_out;
+
+	if (td_flag_test(flags, TD_OPEN_RDONLY))
+		td_flag_set(drv->state, TD_DRIVER_RDONLY);
+
+	return drv;
+
+err_out:
+	/* members that were never set are still NULL from calloc */
+	free(drv->name);
+	free(drv->data);
+	free(drv);
+	return NULL;
+}
+
+/*
+ * Release a driver handle together with its name and private data.
+ *
+ * Does nothing for a NULL handle or one whose refcnt is still non-zero.
+ * A handle that still has TD_DRIVER_OPEN set is reported through EPRINTF
+ * and freed anyway.
+ */
+void
+tapdisk_driver_free(td_driver_t *driver)
+{
+	if (!driver || driver->refcnt)
+		return;
+
+	if (td_flag_test(driver->state, TD_DRIVER_OPEN)) {
+		EPRINTF("freeing open driver %s (state 0x%08x)\n",
+			driver->name, driver->state);
+	}
+
+	free(driver->data);
+	free(driver->name);
+	free(driver);
+}
+
+/*
+ * Queue tiocb by handing it to tapdisk_server_queue_tiocb().
+ * The driver argument is not used by this function.
+ */
+void
+tapdisk_driver_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
+{
+ tapdisk_server_queue_tiocb(tiocb);
+}
+
+/*
+ * Invoke the driver's td_debug hook, if its interface provides one.
+ */
+void
+tapdisk_driver_debug(td_driver_t *driver)
+{
+	if (!driver->ops->td_debug)
+		return;
+
+	driver->ops->td_debug(driver);
+}
diff --git a/tools/blktap2/drivers/tapdisk-driver.h b/tools/blktap2/drivers/tapdisk-driver.h
new file mode 100644
index 0000000000..de0a9be233
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-driver.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_DRIVER_H_
+#define _TAPDISK_DRIVER_H_
+
+#include "tapdisk.h"
+#include "scheduler.h"
+#include "tapdisk-queue.h"
+
+#define TD_DRIVER_OPEN 0x0001
+#define TD_DRIVER_RDONLY 0x0002
+
+/*
+ * Per-driver state created by tapdisk_driver_allocate() and released by
+ * tapdisk_driver_free().
+ */
+struct td_driver_handle {
+ int type; /* disk type used to look up ops */
+ char *name; /* copy made with tapdisk_namedup(); freed with the handle */
+
+ int storage; /* storage value passed to tapdisk_driver_allocate() */
+
+ int refcnt; /* tapdisk_driver_free() is a no-op while non-zero */
+ td_flag_t state; /* TD_DRIVER_* flags */
+
+ td_disk_info_t info;
+
+ void *data; /* zeroed area of ops->private_data_size bytes */
+ struct tap_disk *ops; /* driver interface found for type */
+
+ struct list_head next; /* list linkage -- NOTE(review): users not visible here */
+};
+
+td_driver_t *tapdisk_driver_allocate(int, char *, td_flag_t, int);
+void tapdisk_driver_free(td_driver_t *);
+
+void tapdisk_driver_queue_tiocb(td_driver_t *, struct tiocb *);
+
+void tapdisk_driver_debug(td_driver_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-filter.c b/tools/blktap2/drivers/tapdisk-filter.c
new file mode 100644
index 0000000000..fc018eadbd
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-filter.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <libaio.h>
+#include <syslog.h>
+#include <sys/time.h>
+
+#include "tapdisk-log.h"
+#include "tapdisk-filter.h"
+
+#define RSEED 7
+#define PRE_CHECK 0
+#define POST_CHECK 1
+
+#define WRITE_INTEGRITY "buffer integrity failure after write"
+#define READ_INTEGRITY "disk integrity failure after read"
+
+#define DBG(f, a...) tlog_write(TLOG_WARN, f, ##a)
+
+/*
+ * Fault injection: a request is made to fail by setting its byte count
+ * to zero before submission.  The original byte count and data pointer
+ * are parked in a free fiocb slot, and io->data is pointed at that slot
+ * so the request can be recognised and restored on completion.
+ *
+ * Does nothing when no free slot is left.
+ */
+static inline void
+inject_fault(struct tfilter *filter, struct iocb *io)
+{
+	struct fiocb *slot;
+
+	if (filter->ffree == 0)
+		return;
+
+	/* pop a slot off the free stack */
+	filter->ffree--;
+	slot = filter->flist[filter->ffree];
+
+	slot->data = io->data;
+	slot->bytes = io->u.c.nbytes;
+
+	io->data = slot;
+	io->u.c.nbytes = 0;
+}
+
+/*
+ * Tell whether io->data points into the filter's fiocb array, which is
+ * how inject_fault() marks a request.
+ *
+ * Returns non-zero for a marked request, 0 otherwise.
+ */
+static inline int
+fault_injected(struct tfilter *filter, struct iocb *io)
+{
+	unsigned long lo = (unsigned long)filter->fiocbs;
+	unsigned long hi = lo + (filter->iocbs * sizeof(struct fiocb));
+	unsigned long ptr = (unsigned long)io->data;
+
+	if (ptr < lo)
+		return 0;
+
+	return ptr < hi;
+}
+
+/*
+ * Undo inject_fault(): put the saved byte count and data pointer back
+ * into the request, clear the slot and push it back onto the free stack.
+ */
+static inline void
+recover_fault(struct tfilter *filter, struct iocb *io)
+{
+	struct fiocb *slot = (struct fiocb *)io->data;
+
+	io->data = slot->data;
+	io->u.c.nbytes = slot->bytes;
+
+	memset(slot, 0, sizeof(*slot));
+
+	filter->flist[filter->ffree] = slot;
+	filter->ffree++;
+}
+
+/*
+ * Checksum one 512-byte sector: the sum of its 64 consecutive 64-bit
+ * words, with ordinary unsigned wrap-around.
+ */
+static inline uint64_t
+chksum(char *buf)
+{
+	const uint64_t *word = (const uint64_t *)buf;
+	const uint64_t *end = word + (512 >> 3);
+	uint64_t total = 0;
+
+	while (word < end)
+		total += *word++;
+
+	return total;
+}
+
+/*
+ * Compare the checksum of the 512-byte sector in buf with the value
+ * recorded for sector sec and log a warning when they differ.
+ *
+ * Entries whose timestamp has a zero tv_sec are skipped.  type is the
+ * message prefix naming the kind of check being performed.  The caller
+ * must ensure sec is within the dhash table.
+ */
+static inline void
+check_hash(struct tfilter *filter, uint64_t sec, char *buf, char *type)
+{
+	uint64_t sum;
+	struct dhash *hash;
+
+	hash = filter->dhash + sec;
+	if (!hash->time.tv_sec)
+		return;
+
+	/* checksum once: the value compared is also the value logged */
+	sum = chksum(buf);
+	if (hash->hash != sum) {
+		struct timeval now;
+		gettimeofday(&now, NULL);
+		DBG("%s: hash table: 0x%020" PRIx64 " at %012lu.%06lu, "
+		    "from disk: 0x%020" PRIx64 " at %012lu.%06lu\n",
+		    type, hash->hash, hash->time.tv_sec,
+		    hash->time.tv_usec, sum, now.tv_sec, now.tv_usec);
+	}
+}
+
+/*
+ * Record the checksum of the sector in buf, and the current time, in the
+ * table entry for sector sec.  The caller must ensure sec is in range.
+ */
+static inline void
+insert_hash(struct tfilter *filter, uint64_t sec, char *buf)
+{
+	struct dhash *entry = filter->dhash + sec;
+
+	entry->hash = chksum(buf);
+	gettimeofday(&entry->time, NULL);
+}
+
+/*
+ * Apply the integrity step for one sector of a request.
+ *
+ *   write, PRE_CHECK   record the buffer's checksum
+ *   write, otherwise   verify the buffer still matches the record
+ *   read,  POST_CHECK  verify against the record, then record the data
+ *   read,  otherwise   nothing
+ *
+ * rw is non-zero for writes.  Sectors at or beyond filter->secs are
+ * ignored.
+ */
+static void
+check_sector(struct tfilter *filter, int type, int rw, uint64_t sec, char *buf)
+{
+	if (sec >= filter->secs)
+		return;
+
+	if (rw) {
+		if (type == PRE_CHECK)
+			insert_hash(filter, sec, buf);
+		else
+			check_hash(filter, sec, buf, WRITE_INTEGRITY);
+	} else if (type == POST_CHECK) {
+		check_hash(filter, sec, buf, READ_INTEGRITY);
+		insert_hash(filter, sec, buf);
+	}
+}
+
+/*
+ * Run check_sector() over a request's buffer in 512-byte steps.
+ *
+ * The request counts as a write when its opcode is IO_CMD_PWRITE.  The
+ * sector number of each step is (offset + position) >> 9.
+ */
+static void
+check_data(struct tfilter *filter, int type, struct iocb *io)
+{
+	int rw;
+	uint64_t i;
+	/* cast once: arithmetic on the raw void pointer is a GNU extension */
+	char *base = (char *)io->u.c.buf;
+
+	rw = (io->aio_lio_opcode == IO_CMD_PWRITE);
+
+	for (i = 0; i < io->u.c.nbytes; i += 512) {
+		uint64_t sec = (io->u.c.offset + i) >> 9;
+
+		check_sector(filter, type, rw, sec, base + i);
+	}
+}
+
+/*
+ * Create a filter for the requested mode bits.
+ *
+ * mode   combination of TD_INJECT_FAULTS and TD_CHECK_INTEGRITY
+ * iocbs  number of fault slots to allocate
+ * secs   number of sectors tracked by the integrity table
+ *
+ * A feature whose tables cannot be allocated is dropped from
+ * filter->mode rather than failing the call.  The resulting mode is
+ * written to syslog.
+ *
+ * Returns NULL when mode is zero or the filter itself cannot be
+ * allocated; otherwise the new filter, to be released with
+ * tapdisk_free_tfilter().
+ */
+struct tfilter *
+tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs)
+{
+	int i;
+	struct tfilter *filter;
+
+	if (!mode)
+		return NULL;
+
+	filter = calloc(1, sizeof(struct tfilter));
+	if (!filter)
+		return NULL;
+
+	filter->mode = mode;
+	filter->secs = secs;
+	filter->iocbs = iocbs;
+
+	if (filter->mode & TD_INJECT_FAULTS) {
+		filter->fiocbs = calloc(iocbs, sizeof(struct fiocb));
+		filter->flist = calloc(iocbs, sizeof(struct fiocb *));
+		if (!filter->fiocbs || !filter->flist)
+			filter->mode &= ~TD_INJECT_FAULTS;
+		else {
+			/*
+			 * Faults are drawn with random(), whose seeding
+			 * function is srandom(), not srand().
+			 */
+			srandom(RSEED);
+			filter->ffree = iocbs;
+			for (i = 0; i < iocbs; i++)
+				filter->flist[i] = filter->fiocbs + i;
+		}
+	}
+
+	if (filter->mode & TD_CHECK_INTEGRITY) {
+		filter->dhash = calloc(secs, sizeof(struct dhash));
+		if (!filter->dhash)
+			filter->mode &= ~TD_CHECK_INTEGRITY;
+	}
+
+	syslog(LOG_WARNING, "WARNING: "
+	       "FILTERING IN MODE 0x%04x\n", filter->mode);
+
+	return filter;
+}
+
+/*
+ * Release a filter and the tables it owns.  A NULL filter is accepted.
+ */
+void
+tapdisk_free_tfilter(struct tfilter *filter)
+{
+	if (filter) {
+		free(filter->dhash);
+		free(filter->flist);
+		free(filter->fiocbs);
+		free(filter);
+	}
+}
+
+/*
+ * Filter a batch of requests before they are submitted.
+ *
+ * With TD_INJECT_FAULTS, each request is turned into a fault with a
+ * probability of TD_FAULT_RATE percent, provided a fault slot is free.
+ * Faulted requests skip the integrity step.  With TD_CHECK_INTEGRITY,
+ * every other request goes through the PRE_CHECK pass.
+ *
+ * A NULL filter makes this a no-op.
+ */
+void
+tapdisk_filter_iocbs(struct tfilter *filter, struct iocb **iocbs, int num)
+{
+	int i;
+
+	if (!filter)
+		return;
+
+	for (i = 0; i < num; i++) {
+		struct iocb *io = iocbs[i];
+
+		if (filter->mode & TD_INJECT_FAULTS) {
+			/*
+			 * "<" so that exactly TD_FAULT_RATE values out of
+			 * 100 hit.  Without a free slot inject_fault()
+			 * would do nothing, so the request must fall
+			 * through to the integrity step instead of being
+			 * skipped.
+			 */
+			if ((random() % 100) < TD_FAULT_RATE &&
+			    filter->ffree) {
+				inject_fault(filter, io);
+				continue;
+			}
+		}
+
+		if (filter->mode & TD_CHECK_INTEGRITY)
+			check_data(filter, PRE_CHECK, io);
+	}
+}
+
+/*
+ * Filter a batch of completion events.
+ *
+ * Requests marked by inject_fault() are restored and skip the integrity
+ * step; with TD_CHECK_INTEGRITY all others go through the POST_CHECK
+ * pass.  A NULL filter makes this a no-op.
+ */
+void
+tapdisk_filter_events(struct tfilter *filter, struct io_event *events, int num)
+{
+	int n;
+
+	if (!filter)
+		return;
+
+	for (n = 0; n < num; n++) {
+		struct iocb *req = events[n].obj;
+
+		if ((filter->mode & TD_INJECT_FAULTS) &&
+		    fault_injected(filter, req)) {
+			recover_fault(filter, req);
+			continue;
+		}
+
+		if (filter->mode & TD_CHECK_INTEGRITY)
+			check_data(filter, POST_CHECK, req);
+	}
+}
diff --git a/tools/blktap2/drivers/tapdisk-filter.h b/tools/blktap2/drivers/tapdisk-filter.h
new file mode 100644
index 0000000000..c4e977e4aa
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-filter.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef TAPDISK_FILTER_H
+#define TAPDISK_FILTER_H
+
+#include <libaio.h>
+#include <inttypes.h>
+#include <time.h>
+
+#define TD_INJECT_FAULTS 0x00001 /* simulate random IO failures */
+#define TD_CHECK_INTEGRITY 0x00002 /* check data integrity */
+
+#define TD_FAULT_RATE 5
+
+/* One integrity-table entry; the table holds one entry per sector. */
+struct dhash {
+ uint64_t hash; /* checksum of the sector's 512 bytes */
+ struct timeval time; /* when hash was recorded; zero tv_sec: entry is skipped by checks */
+};
+
+/* Fields of a request saved while a fault is injected into it. */
+struct fiocb {
+ size_t bytes; /* the request's original byte count */
+ void *data; /* the request's original data pointer */
+};
+
+/* State of one filter instance, created by tapdisk_init_tfilter(). */
+struct tfilter {
+ int mode; /* TD_INJECT_FAULTS and/or TD_CHECK_INTEGRITY */
+ uint64_t secs; /* number of entries in dhash */
+ int iocbs; /* number of entries in fiocbs and flist */
+
+ struct dhash *dhash; /* integrity table, indexed by sector number */
+
+ int ffree; /* number of free slots currently on flist */
+ struct fiocb *fiocbs; /* backing array of fault slots */
+ struct fiocb **flist; /* stack of pointers to the free slots */
+};
+
+struct tfilter *tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs);
+void tapdisk_free_tfilter(struct tfilter *);
+void tapdisk_filter_iocbs(struct tfilter *, struct iocb **, int);
+void tapdisk_filter_events(struct tfilter *, struct io_event *, int);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-image.c b/tools/blktap2/drivers/tapdisk-image.c
new file mode 100644
index 0000000000..6da7f48bd8
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-image.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * Create an image handle.
+ *
+ * The handle gets a copy of file (via tapdisk_namedup) as its name and
+ * stores type, storage, flags and private as given; its list node is
+ * initialised empty.
+ *
+ * Returns the new handle, or NULL if allocation or the name copy fails.
+ */
+td_image_t *
+tapdisk_image_allocate(char *file, int type, int storage,
+		       td_flag_t flags, void *private)
+{
+	td_image_t *img = calloc(1, sizeof(*img));
+
+	if (!img)
+		return NULL;
+
+	if (tapdisk_namedup(&img->name, file)) {
+		free(img);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&img->next);
+	img->private = private;
+	img->storage = storage;
+	img->flags = flags;
+	img->type = type;
+
+	return img;
+}
+
+/*
+ * Unlink an image from its list and release it, its name, and (through
+ * tapdisk_driver_free) its driver.  A NULL image is accepted.
+ */
+void
+tapdisk_image_free(td_image_t *image)
+{
+	if (image) {
+		list_del(&image->next);
+
+		free(image->name);
+		tapdisk_driver_free(image->driver);
+		free(image);
+	}
+}
+
+/*
+ * Validate a td request against an image.
+ *
+ * The request is rejected when its op is neither TD_OP_READ nor
+ * TD_OP_WRITE, when it is a write to an image opened TD_OPEN_RDONLY,
+ * when secs is not positive, or when sec + secs exceeds the driver's
+ * reported size.
+ *
+ * Returns 0 if the request is acceptable, -ENODEV if the image has no
+ * driver, -EINVAL (after logging) otherwise.
+ */
+int
+tapdisk_image_check_td_request(td_image_t *image, td_request_t treq)
+{
+	int rdonly, bad;
+	td_disk_info_t *info;
+	td_driver_t *drv = image->driver;
+
+	if (!drv)
+		return -ENODEV;
+
+	info = &drv->info;
+	rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY);
+
+	/* checks run in order and stop at the first failure */
+	bad = (treq.op != TD_OP_READ && treq.op != TD_OP_WRITE) ||
+	      (treq.op == TD_OP_WRITE && rdonly) ||
+	      (treq.secs <= 0 || treq.sec + treq.secs > info->size);
+
+	if (!bad)
+		return 0;
+
+	ERR(-EINVAL, "bad td request on %s (%s, %"PRIu64"): %d at %"PRIu64,
+	    image->name, (rdonly ? "ro" : "rw"), info->size, treq.op,
+	    treq.sec + treq.secs);
+	return -EINVAL;
+}
+
+/*
+ * Validate a blkif ring request against an image.
+ *
+ * The request is rejected when
+ *   - its operation is neither BLKIF_OP_READ nor BLKIF_OP_WRITE,
+ *   - it is a write to an image opened TD_OPEN_RDONLY,
+ *   - it has no segments or more than MAX_SEGMENTS_PER_REQ,
+ *   - a segment has first_sect > last_sect, or a last_sect that does not
+ *     fit in one page of 512-byte sectors,
+ *   - the sectors of all segments together, starting at sector_number,
+ *     extend past the driver's reported size.
+ *
+ * Returns 0 if the request is acceptable, -ENODEV if the image has no
+ * driver, -EINVAL (after logging) otherwise.
+ */
+int
+tapdisk_image_check_ring_request(td_image_t *image, blkif_request_t *req)
+{
+	td_driver_t *driver;
+	td_disk_info_t *info;
+	int i, rdonly, sects_per_page;
+	uint64_t total;
+
+	driver = image->driver;
+	if (!driver)
+		return -ENODEV;
+
+	total = 0;
+	info = &driver->info;
+
+	rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY);
+
+	if (req->operation != BLKIF_OP_READ &&
+	    req->operation != BLKIF_OP_WRITE)
+		goto fail;
+
+	if (req->operation == BLKIF_OP_WRITE && rdonly)
+		goto fail;
+
+	if (!req->nr_segments || req->nr_segments > MAX_SEGMENTS_PER_REQ)
+		goto fail;
+
+	sects_per_page = getpagesize() >> 9;
+
+	for (i = 0; i < req->nr_segments; i++) {
+		/*
+		 * Reject inverted ranges before subtracting: stored in an
+		 * unsigned count, last - first + 1 would wrap to a huge
+		 * value instead of going negative.
+		 */
+		if (req->seg[i].first_sect > req->seg[i].last_sect ||
+		    req->seg[i].last_sect >= sects_per_page)
+			goto fail;
+
+		total += req->seg[i].last_sect - req->seg[i].first_sect + 1;
+	}
+
+	/*
+	 * Bound the whole request, not just its last segment; phrased as
+	 * a subtraction so sector_number + total cannot overflow.
+	 */
+	if (total > info->size || req->sector_number > info->size - total)
+		goto fail;
+
+	return 0;
+
+fail:
+	ERR(-EINVAL, "bad request on %s (%s, %"PRIu64"): id: %"PRIu64": %d at %"PRIu64,
+	    image->name, (rdonly ? "ro" : "rw"), info->size, req->id,
+	    req->operation, req->sector_number + total);
+	return -EINVAL;
+}
diff --git a/tools/blktap2/drivers/tapdisk-image.h b/tools/blktap2/drivers/tapdisk-image.h
new file mode 100644
index 0000000000..8779dff8b7
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-image.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_IMAGE_H_
+#define _TAPDISK_IMAGE_H_
+
+#include "tapdisk.h"
+#include <xen/io/blkif.h>
+
+struct td_image_handle {
+ int type;
+ char *name;
+
+ td_flag_t flags;
+ int storage;
+
+ td_driver_t *driver;
+ td_disk_info_t info;
+
+ void *private;
+
+ struct list_head next;
+};
+
+td_image_t *tapdisk_image_allocate(char *, int, int, td_flag_t, void *);
+void tapdisk_image_free(td_image_t *);
+
+int tapdisk_image_check_td_request(td_image_t *, td_request_t);
+int tapdisk_image_check_ring_request(td_image_t *, blkif_request_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-interface.c b/tools/blktap2/drivers/tapdisk-interface.c
new file mode 100644
index 0000000000..58366d0a0b
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-interface.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+
+#include "tapdisk.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+/*
+ * Attach an image to the driver of a matching shared image.
+ *
+ * The shared image is looked up through
+ * tapdisk_server_get_shared_image(); on success the shared driver gains
+ * a reference and the image receives the driver pointer and a copy of
+ * its disk info.
+ *
+ * Returns 0 on success, -ENODEV if no shared image is found, -EBADF if
+ * the shared image has no driver.
+ */
+int
+td_load(td_image_t *image)
+{
+	td_image_t *shared;
+	td_driver_t *driver;
+
+	shared = tapdisk_server_get_shared_image(image);
+	if (!shared)
+		return -ENODEV;
+
+	driver = shared->driver;
+	if (!driver)
+		return -EBADF;
+
+	driver->refcnt++;
+	image->driver = driver;
+	image->info = driver->info;
+
+	DPRINTF("loaded shared image %s (%d users, state: 0x%08x, type: %d)\n",
+		driver->name, driver->refcnt, driver->state, driver->type);
+	return 0;
+}
+
+/*
+ * Open the driver behind an image.
+ *
+ * A driver handle is allocated when the image does not have one yet.  If
+ * the driver is not already flagged TD_DRIVER_OPEN, its td_open hook is
+ * called; a failure there frees a handle allocated by this call and
+ * returns the hook's error.  On success the image receives the driver
+ * pointer and a copy of its disk info, and the driver gains a reference.
+ *
+ * Returns 0 on success, -ENOMEM if no driver handle could be allocated,
+ * or the error returned by the driver's td_open hook.
+ */
+int
+td_open(td_image_t *image)
+{
+	int rc;
+	td_driver_t *drv = image->driver;
+
+	if (!drv) {
+		drv = tapdisk_driver_allocate(image->type, image->name,
+					      image->flags, image->storage);
+		if (!drv)
+			return -ENOMEM;
+	}
+
+	if (!td_flag_test(drv->state, TD_DRIVER_OPEN)) {
+		rc = drv->ops->td_open(drv, image->name, image->flags);
+		if (rc) {
+			/* only discard a handle this call created */
+			if (!image->driver)
+				tapdisk_driver_free(drv);
+			return rc;
+		}
+
+		td_flag_set(drv->state, TD_DRIVER_OPEN);
+		DPRINTF("opened image %s (%d users, state: 0x%08x, type: %d)\n",
+			drv->name, drv->refcnt + 1,
+			drv->state, drv->type);
+	}
+
+	drv->refcnt++;
+	image->driver = drv;
+	image->info = drv->info;
+	return 0;
+}
+
+/*
+ * Drop an image's reference on its driver.
+ *
+ * When the count reaches zero and the driver is flagged TD_DRIVER_OPEN,
+ * the driver's td_close hook is called and the flag is cleared.
+ *
+ * Returns 0, or -ENODEV if the image has no driver.
+ */
+int
+td_close(td_image_t *image)
+{
+	td_driver_t *drv = image->driver;
+
+	if (!drv)
+		return -ENODEV;
+
+	--drv->refcnt;
+	if (drv->refcnt == 0 && td_flag_test(drv->state, TD_DRIVER_OPEN)) {
+		drv->ops->td_close(drv);
+		td_flag_clear(drv->state, TD_DRIVER_OPEN);
+	}
+
+	DPRINTF("closed image %s (%d users, state: 0x%08x, type: %d)\n",
+		drv->name, drv->refcnt, drv->state, drv->type);
+
+	return 0;
+}
+
+/*
+ * Ask an image's driver for its parent id through the td_get_parent_id
+ * hook.
+ *
+ * Returns the hook's result, -ENODEV if the image has no driver, or
+ * -EBADF if the driver is not flagged TD_DRIVER_OPEN.
+ */
+int
+td_get_parent_id(td_image_t *image, td_disk_id_t *id)
+{
+	td_driver_t *drv = image->driver;
+
+	if (!drv)
+		return -ENODEV;
+
+	if (td_flag_test(drv->state, TD_DRIVER_OPEN))
+		return drv->ops->td_get_parent_id(drv, id);
+
+	return -EBADF;
+}
+
+/*
+ * Check that an image and its parent are both usable.
+ *
+ * Returns 0 when both images have a driver flagged TD_DRIVER_OPEN,
+ * -ENODEV if either has no driver, -EBADF if either driver is not open.
+ */
+int
+td_validate_parent(td_image_t *image, td_image_t *parent)
+{
+	td_driver_t *driver, *pdriver;
+
+	driver = image->driver;
+	pdriver = parent->driver;
+	if (!driver || !pdriver)
+		return -ENODEV;
+
+	if (!td_flag_test(driver->state, TD_DRIVER_OPEN) ||
+	    !td_flag_test(pdriver->state, TD_DRIVER_OPEN))
+		return -EBADF;
+
+	/*
+	 * The driver's td_validate_parent hook is not consulted: the
+	 * original code returned 0 before ever reaching that call, which
+	 * made it unreachable.
+	 * NOTE(review): confirm whether the hook should be re-enabled.
+	 */
+	return 0;
+}
+
+/*
+ * Pass a write request to an image's driver.
+ *
+ * The request is completed immediately with an error instead when the
+ * image has no driver (-ENODEV), the driver is not flagged
+ * TD_DRIVER_OPEN (-EBADF), or tapdisk_image_check_td_request() rejects
+ * it.
+ */
+void
+td_queue_write(td_image_t *image, td_request_t treq)
+{
+	int rc;
+	td_driver_t *drv = image->driver;
+
+	if (!drv)
+		rc = -ENODEV;
+	else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
+		rc = -EBADF;
+	else
+		rc = tapdisk_image_check_td_request(image, treq);
+
+	if (rc) {
+		td_complete_request(treq, rc);
+		return;
+	}
+
+	drv->ops->td_queue_write(drv, treq);
+}
+
+/*
+ * Pass a read request to an image's driver.
+ *
+ * The request is completed immediately with an error instead when the
+ * image has no driver (-ENODEV), the driver is not flagged
+ * TD_DRIVER_OPEN (-EBADF), or tapdisk_image_check_td_request() rejects
+ * it.
+ */
+void
+td_queue_read(td_image_t *image, td_request_t treq)
+{
+	int rc;
+	td_driver_t *drv = image->driver;
+
+	if (!drv)
+		rc = -ENODEV;
+	else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
+		rc = -EBADF;
+	else
+		rc = tapdisk_image_check_td_request(image, treq);
+
+	if (rc) {
+		td_complete_request(treq, rc);
+		return;
+	}
+
+	drv->ops->td_queue_read(drv, treq);
+}
+
+/*
+ * Forward a request by handing it to tapdisk_vbd_forward_request().
+ */
+void
+td_forward_request(td_request_t treq)
+{
+ tapdisk_vbd_forward_request(treq);
+}
+
+/*
+ * Complete a request by invoking its callback, treq.cb, with the request
+ * itself and the result code res.
+ */
+void
+td_complete_request(td_request_t treq, int res)
+{
+ treq.cb(treq, res);
+}
+
+/*
+ * Queue a tiocb on behalf of a driver; delegates to
+ * tapdisk_driver_queue_tiocb() with the same arguments.
+ */
+void
+td_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
+{
+ tapdisk_driver_queue_tiocb(driver, tiocb);
+}
+
+/*
+ * Prepare a tiocb for a read: forwards every argument to
+ * tapdisk_prep_tiocb() with 0 as its third argument (presumably the
+ * read/write selector; td_prep_write passes 1 in the same position).
+ */
+void
+td_prep_read(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
+ long long offset, td_queue_callback_t cb, void *arg)
+{
+ tapdisk_prep_tiocb(tiocb, fd, 0, buf, bytes, offset, cb, arg);
+}
+
+/*
+ * Prepare a tiocb for a write: forwards every argument to
+ * tapdisk_prep_tiocb() with 1 as its third argument (presumably the
+ * read/write selector; td_prep_read passes 0 in the same position).
+ */
+void
+td_prep_write(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
+ long long offset, td_queue_callback_t cb, void *arg)
+{
+ tapdisk_prep_tiocb(tiocb, fd, 1, buf, bytes, offset, cb, arg);
+}
+
+/*
+ * Run tapdisk_driver_debug() on an image's driver, provided the image
+ * has a driver and that driver is flagged TD_DRIVER_OPEN.
+ */
+void
+td_debug(td_image_t *image)
+{
+	td_driver_t *drv = image->driver;
+
+	if (drv && td_flag_test(drv->state, TD_DRIVER_OPEN))
+		tapdisk_driver_debug(drv);
+}
diff --git a/tools/blktap2/drivers/tapdisk-interface.h b/tools/blktap2/drivers/tapdisk-interface.h
new file mode 100644
index 0000000000..1e48e5811a
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-interface.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_INTERFACE_H_
+#define _TAPDISK_INTERFACE_H_
+
+#include "tapdisk.h"
+#include "tapdisk-queue.h"
+
+int td_open(td_image_t *);
+int td_load(td_image_t *);
+int td_close(td_image_t *);
+int td_get_parent_id(td_image_t *, td_disk_id_t *);
+int td_validate_parent(td_image_t *, td_image_t *);
+
+void td_queue_write(td_image_t *, td_request_t);
+void td_queue_read(td_image_t *, td_request_t);
+void td_forward_request(td_request_t);
+void td_complete_request(td_request_t, int);
+
+void td_debug(td_image_t *);
+
+void td_queue_tiocb(td_driver_t *, struct tiocb *);
+void td_prep_read(struct tiocb *, int, char *, size_t,
+ long long, td_queue_callback_t, void *);
+void td_prep_write(struct tiocb *, int, char *, size_t,
+ long long, td_queue_callback_t, void *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-ipc.c b/tools/blktap2/drivers/tapdisk-ipc.c
new file mode 100644
index 0000000000..3cfdb6c8f8
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-ipc.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tapdisk.h"
+#include "tapdisk-ipc.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+
+/*
+ * Write one complete tapdisk_message_t to @fd.
+ *
+ * @timeout is the select() timeout in seconds; 0 means wait indefinitely.
+ * Returns 0 once the whole message has been written, or -EIO if select()
+ * fails or times out, or write() fails, before that.
+ */
+static int
+tapdisk_ipc_write_message(int fd, tapdisk_message_t *message, int timeout)
+{
+	fd_set writefds;
+	const char *buf;
+	int ret, len, offset;
+	struct timeval tv, *t;
+
+	t      = NULL;
+	offset = 0;
+	len    = sizeof(tapdisk_message_t);
+
+	/* offset counts bytes, so it must be applied to a byte pointer;
+	 * 'message + offset' would step in units of whole messages. */
+	buf    = (const char *)message;
+
+	if (timeout) {
+		tv.tv_sec  = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	DPRINTF("sending '%s' message (uuid = %u)\n",
+		tapdisk_message_name(message->type), message->cookie);
+
+	while (offset < len) {
+		FD_ZERO(&writefds);
+		FD_SET(fd, &writefds);
+
+		/* we don't bother reinitializing tv. at worst, it will wait a
+		 * bit more time than expected. */
+
+		ret = select(fd + 1, NULL, &writefds, NULL, t);
+		if (ret == -1)
+			break;
+		else if (FD_ISSET(fd, &writefds)) {
+			ret = write(fd, buf + offset, len - offset);
+			if (ret <= 0)
+				break;
+			offset += ret;
+		} else
+			break; /* select() timed out */
+	}
+
+	if (offset != len) {
+		EPRINTF("failure writing message\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Send a payload-less message of the given @type, tagged with ipc->uuid.
+ * Returns 0 without sending anything when the handle has no write
+ * descriptor (wfd == -1); otherwise returns the result of the write,
+ * which is attempted with a 2 second select() timeout.
+ */
+int
+tapdisk_ipc_write(td_ipc_t *ipc, int type)
+{
+	tapdisk_message_t msg;
+
+	if (ipc->wfd != -1) {
+		/* zero the whole struct so no stack garbage goes on the wire */
+		memset(&msg, 0, sizeof(tapdisk_message_t));
+		msg.cookie = ipc->uuid;
+		msg.type   = type;
+
+		return tapdisk_ipc_write_message(ipc->wfd, &msg, 2);
+	}
+
+	return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_RUNTIME_ERROR message carrying @text (truncated
+ * to fit message.u.string.text) and tagged with ipc->uuid.
+ *
+ * Like tapdisk_ipc_write(), this is a no-op returning 0 when the handle
+ * has no write descriptor (wfd == -1); passing -1 down would hand an
+ * invalid descriptor to FD_SET()/select().
+ * Otherwise returns the result of the write (2 second select() timeout).
+ */
+int
+tapdisk_ipc_write_error(td_ipc_t *ipc, const char *text)
+{
+	tapdisk_message_t message;
+
+	if (ipc->wfd == -1)
+		return 0;
+
+	memset(&message, 0, sizeof(message));
+	message.type   = TAPDISK_MESSAGE_RUNTIME_ERROR;
+	message.cookie = ipc->uuid;
+	snprintf(message.u.string.text, sizeof(message.u.string.text), "%s", text);
+
+	return tapdisk_ipc_write_message(ipc->wfd, &message, 2);
+}
+
+/*
+ * Read one complete tapdisk_message_t from @fd into @message.
+ *
+ * @message is zeroed first.  @timeout is the select() timeout in seconds;
+ * 0 means wait indefinitely.  Returns 0 once a whole message has been
+ * read, or -EIO if select() fails or times out, or read() fails or hits
+ * end-of-file, before that.
+ */
+static int
+tapdisk_ipc_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+	char *buf;
+	fd_set readfds;
+	int ret, len, offset;
+	struct timeval tv, *t;
+
+	t      = NULL;
+	offset = 0;
+	len    = sizeof(tapdisk_message_t);
+
+	/* offset counts bytes, so it must be applied to a byte pointer;
+	 * 'message + offset' would step in units of whole messages and
+	 * write past the caller's buffer after a short read. */
+	buf    = (char *)message;
+
+	if (timeout) {
+		tv.tv_sec  = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	memset(message, 0, sizeof(tapdisk_message_t));
+
+	while (offset < len) {
+		FD_ZERO(&readfds);
+		FD_SET(fd, &readfds);
+
+		/* we don't bother reinitializing tv. at worst, it will wait a
+		 * bit more time than expected. */
+
+		ret = select(fd + 1, &readfds, NULL, NULL, t);
+		if (ret == -1)
+			break;
+		else if (FD_ISSET(fd, &readfds)) {
+			ret = read(fd, buf + offset, len - offset);
+			if (ret <= 0)
+				break;
+			offset += ret;
+		} else
+			break; /* select() timed out */
+	}
+
+	if (offset != len) {
+		EPRINTF("failure reading message\n");
+		return -EIO;
+	}
+
+	DPRINTF("received '%s' message (uuid = %u)\n",
+		tapdisk_message_name(message->type), message->cookie);
+
+	return 0;
+}
+
+/*
+ * Read one control message from ipc->rfd (2 second timeout) and act on it.
+ *
+ * The message cookie selects the vbd.  TAPDISK_MESSAGE_PID is the only
+ * request accepted without an existing vbd; it calls
+ * tapdisk_vbd_initialize() and replies with this process's pid.
+ * OPEN replies synchronously with the image geometry; PAUSE, RESUME and
+ * CLOSE only kick off the operation and return 0; EXIT returns 0.
+ *
+ * Returns the error from tapdisk_ipc_read_message() if the read fails,
+ * or the result of writing the reply for PID/OPEN.  On any other failure
+ * a TAPDISK_MESSAGE_ERROR reply is sent and -err is returned.
+ */
+int
+tapdisk_ipc_read(td_ipc_t *ipc)
+{
+	int err;
+	td_vbd_t *vbd;
+	td_uuid_t uuid;
+	tapdisk_message_t message;
+
+	err = tapdisk_ipc_read_message(ipc->rfd, &message, 2);
+	if (err) {
+		tapdisk_server_check_state();
+		return err;
+	}
+
+	uuid = message.cookie;
+	vbd  = tapdisk_server_get_vbd(uuid);
+
+	if (!vbd && message.type != TAPDISK_MESSAGE_PID) {
+		EPRINTF("received message for non-existing vbd: %u\n", uuid);
+		err = -EINVAL;
+		goto fail;
+	}
+
+	switch (message.type) {
+	case TAPDISK_MESSAGE_PID:
+		err = tapdisk_vbd_initialize(ipc->rfd, ipc->wfd, uuid);
+
+		memset(&message, 0, sizeof(tapdisk_message_t));
+		message.cookie = uuid;
+
+		if (!err) {
+			message.type          = TAPDISK_MESSAGE_PID_RSP;
+			message.u.tapdisk_pid = getpid();
+		} else
+			message.type          = TAPDISK_MESSAGE_ERROR;
+
+		return tapdisk_ipc_write_message(ipc->wfd, &message, 0);
+
+	case TAPDISK_MESSAGE_OPEN:
+	{
+		image_t image;
+		char *devname;
+		td_flag_t flags;
+
+		flags = 0;
+
+		/* translate wire flags into td_open flags */
+		if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY)
+			flags |= TD_OPEN_RDONLY;
+		if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_SHARED)
+			flags |= TD_OPEN_SHAREABLE;
+		if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_ADD_CACHE)
+			flags |= TD_OPEN_ADD_CACHE;
+		if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_VHD_INDEX)
+			flags |= TD_OPEN_VHD_INDEX;
+		if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_LOG_DIRTY)
+			flags |= TD_OPEN_LOG_DIRTY;
+
+		err = asprintf(&devname, "%s/%s%d",
+			       BLKTAP_DEV_DIR, BLKTAP_DEV_NAME,
+			       message.u.params.devnum);
+		if (err == -1) {
+			/* asprintf only reports -1; turn that into a real
+			 * error code instead of propagating a bare -1 */
+			err = -ENOMEM;
+			goto fail;
+		}
+
+		err = tapdisk_vbd_open(vbd,
+				       message.u.params.path,
+				       message.drivertype,
+				       message.u.params.storage,
+				       devname, flags);
+		free(devname);
+		if (err)
+			goto fail;
+
+		err = tapdisk_vbd_get_image_info(vbd, &image);
+		if (err)
+			goto fail;
+
+		memset(&message, 0, sizeof(tapdisk_message_t));
+		message.cookie              = uuid;
+		message.u.image.sectors     = image.size;
+		message.u.image.sector_size = image.secsize;
+		message.u.image.info        = image.info;
+		message.type                = TAPDISK_MESSAGE_OPEN_RSP;
+
+		return tapdisk_ipc_write_message(ipc->wfd, &message, 0);
+	}
+
+	case TAPDISK_MESSAGE_PAUSE:
+		tapdisk_vbd_pause(vbd);
+		return 0; /* response written asynchronously */
+
+	case TAPDISK_MESSAGE_RESUME:
+		tapdisk_vbd_resume(vbd,
+				   message.u.params.path,
+				   message.drivertype);
+		return 0; /* response written asynchronously */
+
+	case TAPDISK_MESSAGE_CLOSE:
+		tapdisk_vbd_close(vbd);
+		return 0; /* response written asynchronously */
+
+	case TAPDISK_MESSAGE_EXIT:
+		return 0;
+	}
+
+	err = -EINVAL;
+	/* uuid is printed with %u everywhere else in this file */
+	EPRINTF("received unrecognized message %s, uuid = %u\n",
+		tapdisk_message_name(message.type), uuid);
+
+fail:
+	memset(&message, 0, sizeof(tapdisk_message_t));
+	message.cookie = uuid;
+	message.type   = TAPDISK_MESSAGE_ERROR;
+	tapdisk_ipc_write_message(ipc->wfd, &message, 2);
+	tapdisk_server_check_state();
+
+	/* NOTE(review): err is negative here, so this returns a positive
+	 * value, unlike the read-failure path above which returns err
+	 * unchanged.  Left as is because callers are not visible here --
+	 * confirm which sign they expect. */
+	return -err;
+}
diff --git a/tools/blktap2/drivers/tapdisk-ipc.h b/tools/blktap2/drivers/tapdisk-ipc.h
new file mode 100644
index 0000000000..25eb48cafc
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-ipc.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_IPC_H_
+#define _TAPDISK_IPC_H_
+
+#include "tapdisk-message.h"
+
+/* Descriptor pair over which tapdisk_message_t messages are exchanged. */
+typedef struct td_ipc_handle {
+ /* descriptor tapdisk_ipc_read() reads requests from */
+ int rfd;
+ /* descriptor replies are written to; -1 makes tapdisk_ipc_write() a no-op */
+ int wfd;
+ /* copied into the cookie field of messages sent by tapdisk_ipc_write*() */
+ td_uuid_t uuid;
+} td_ipc_t;
+
+int tapdisk_ipc_read(td_ipc_t *ipc);
+int tapdisk_ipc_write(td_ipc_t *ipc, int type);
+int tapdisk_ipc_write_error(td_ipc_t *ipc, const char *message);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-log.c b/tools/blktap2/drivers/tapdisk-log.c
new file mode 100644
index 0000000000..980affa3a2
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-log.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/time.h>
+
+#include "tapdisk-log.h"
+
+#define MAX_ENTRY_LEN 512
+#define MAX_ERROR_MESSAGES 16
+
+/* One distinct error recorded by __tlog_error(), keyed by (err, func). */
+struct error {
+ /* number of times this (err, func) pair has been reported */
+ int cnt;
+ /* error code, stored as a non-negative value */
+ int err;
+ /* reporter's function name; matched by pointer, not by string contents */
+ char *func;
+ /* timestamped text formatted for the first occurrence */
+ char msg[MAX_ENTRY_LEN];
+};
+
+/* Fixed-size table of distinct errors seen so far. */
+struct ehandle {
+ /* number of entries of errors[] in use */
+ int cnt;
+ /* new distinct errors discarded because errors[] was already full */
+ int dropped;
+ struct error errors[MAX_ERROR_MESSAGES];
+};
+
+/* State of the in-memory log buffer set up by open_tlog(). */
+struct tlog {
+ /* position in buf where the next entry will be written */
+ char *p;
+ /* size of buf in bytes (requested size rounded up to a multiple of 512) */
+ int size;
+ /* running entry counter, printed at the start of every entry */
+ uint64_t cnt;
+ /* 512-byte aligned buffer; NULL while logging is not set up */
+ char *buf;
+ /* entries with a level greater than this are discarded */
+ int level;
+ /* log file path, "<file>.<pid>" */
+ char *file;
+ /* non-zero: flush buf to the file when it fills up and on close, and
+ * seek to the end of the file instead of truncating it */
+ int append;
+};
+
+static struct ehandle tapdisk_err;
+static struct tlog tapdisk_log;
+
+/*
+ * Set up the in-memory log.
+ *
+ * @file:   base name of the log file; the pid is appended ("<file>.<pid>").
+ * @bytes:  requested buffer size, rounded up to a multiple of 512 and to
+ *          at least MAX_ENTRY_LEN so that one entry always fits.
+ * @level:  highest TLOG_* level that will be recorded.
+ * @append: non-zero to flush the buffer to the file whenever it fills up
+ *          (and on close) rather than wrapping around in memory.
+ *
+ * On allocation failure logging simply stays disabled (buf == NULL).
+ */
+void
+open_tlog(char *file, size_t bytes, int level, int append)
+{
+	tapdisk_log.size = ((bytes + 511) & (~511));
+
+	/* __tlog_write() formats up to MAX_ENTRY_LEN bytes at the write
+	 * position; a smaller buffer (bytes == 0) would be overrun. */
+	if (tapdisk_log.size < MAX_ENTRY_LEN)
+		tapdisk_log.size = MAX_ENTRY_LEN;
+
+	if (asprintf(&tapdisk_log.file, "%s.%d", file, getpid()) == -1) {
+		/* contents of the pointer are undefined after a failure */
+		tapdisk_log.file = NULL;
+		return;
+	}
+
+	if (posix_memalign((void **)&tapdisk_log.buf, 512, tapdisk_log.size)) {
+		free(tapdisk_log.file);
+		tapdisk_log.file = NULL; /* don't leave a dangling pointer */
+		tapdisk_log.buf  = NULL;
+		return;
+	}
+
+	memset(tapdisk_log.buf, 0, tapdisk_log.size);
+
+	tapdisk_log.p      = tapdisk_log.buf;
+	tapdisk_log.level  = level;
+	tapdisk_log.append = append;
+}
+
+/*
+ * Tear down the log: in append mode flush what is still buffered, then
+ * release the buffer and file name and reset all log state to zero.
+ * Does nothing if the log was never set up.
+ */
+void
+close_tlog(void)
+{
+	struct tlog *log = &tapdisk_log;
+
+	if (log->buf == NULL)
+		return;
+
+	if (log->append != 0)
+		tlog_flush();
+
+	free(log->buf);
+	free(log->file);
+
+	memset(log, 0, sizeof(struct tlog));
+}
+
+/*
+ * Append one entry to the in-memory log.
+ *
+ * The entry is "<seq>:<sec>.<usec>:<func> " followed by the formatted
+ * message, truncated to fit MAX_ENTRY_LEN.  Entries whose @level is above
+ * the configured level are dropped.  When fewer than MAX_ENTRY_LEN bytes
+ * remain, the buffer is flushed (append mode only) and writing restarts
+ * at the beginning of the buffer.
+ */
+void
+__tlog_write(int level, const char *func, const char *fmt, ...)
+{
+	char *buf;
+	va_list ap;
+	struct timeval t;
+	int ret, len, avail, room;
+
+	if (!tapdisk_log.buf)
+		return;
+
+	if (level > tapdisk_log.level)
+		return;
+
+	avail = tapdisk_log.size - (tapdisk_log.p - tapdisk_log.buf);
+	if (avail < MAX_ENTRY_LEN) {
+		if (tapdisk_log.append)
+			tlog_flush();
+		tapdisk_log.p = tapdisk_log.buf;
+	}
+
+	buf = tapdisk_log.p;
+	gettimeofday(&t, NULL);
+	len = snprintf(buf, MAX_ENTRY_LEN - 1, "%08"PRIu64":%010ld.%06ld:"
+		       "%s ", tapdisk_log.cnt, (long)t.tv_sec,
+		       (long)t.tv_usec, func);
+
+	/* snprintf returns the untruncated length (or a negative value on
+	 * error); clamp it to what was really stored so the size passed
+	 * to vsnprintf below can never go negative. */
+	if (len < 0)
+		len = 0;
+	else if (len > MAX_ENTRY_LEN - 2)
+		len = MAX_ENTRY_LEN - 2;
+
+	room = MAX_ENTRY_LEN - (len + 1);
+
+	va_start(ap, fmt);
+	ret = vsnprintf(buf + len, room, fmt, ap);
+	va_end(ap);
+
+	if (ret < 0)
+		ret = 0; /* formatting error: keep just the header */
+
+	len = (ret < room ? len + ret : MAX_ENTRY_LEN - 1);
+	buf[len] = '\0';
+
+	tapdisk_log.cnt++;
+	tapdisk_log.p += len;
+}
+
+/*
+ * Record an error in the error table.
+ *
+ * Errors are keyed by (|err|, func), where func is compared by pointer.
+ * A repeat of a known pair only bumps its counter; a new pair gets a
+ * slot holding a timestamped copy of the formatted message.  Once all
+ * MAX_ERROR_MESSAGES slots are taken, further new errors are only
+ * counted as dropped.
+ */
+void
+__tlog_error(int err, const char *func, const char *fmt, ...)
+{
+	va_list ap;
+	int i, len, ret, room;
+	struct error *e;
+	struct timeval t;
+
+	err = (err > 0 ? err : -err);
+
+	for (i = 0; i < tapdisk_err.cnt; i++) {
+		e = &tapdisk_err.errors[i];
+		if (e->err == err && e->func == func) {
+			e->cnt++;
+			return;
+		}
+	}
+
+	if (tapdisk_err.cnt >= MAX_ERROR_MESSAGES) {
+		tapdisk_err.dropped++;
+		return;
+	}
+
+	gettimeofday(&t, NULL);
+	e = &tapdisk_err.errors[tapdisk_err.cnt];
+
+	len = snprintf(e->msg, MAX_ENTRY_LEN - 1, "%010ld.%06ld:%s ",
+		       (long)t.tv_sec, (long)t.tv_usec, func);
+
+	/* snprintf returns the untruncated length (or a negative value on
+	 * error); clamp it to what was really stored so the size passed
+	 * to vsnprintf below can never go negative. */
+	if (len < 0)
+		len = 0;
+	else if (len > MAX_ENTRY_LEN - 2)
+		len = MAX_ENTRY_LEN - 2;
+
+	room = MAX_ENTRY_LEN - (len + 1);
+
+	va_start(ap, fmt);
+	ret = vsnprintf(e->msg + len, room, fmt, ap);
+	va_end(ap);
+
+	if (ret < 0)
+		ret = 0; /* formatting error: keep just the header */
+
+	len = (ret < room ? len + ret : MAX_ENTRY_LEN - 1);
+	e->msg[len] = '\0';
+
+	e->cnt++;
+	e->err  = err;
+	e->func = (char *)func;
+	tapdisk_err.cnt++;
+}
+
+/*
+ * Report every recorded error, plus the number of dropped ones if any,
+ * to syslog at LOG_INFO.
+ */
+void
+tlog_print_errors(void)
+{
+	struct error *e   = tapdisk_err.errors;
+	struct error *end = tapdisk_err.errors + tapdisk_err.cnt;
+
+	while (e < end) {
+		syslog(LOG_INFO, "TAPDISK ERROR: errno %d at %s (cnt = %d): "
+		       "%s\n", e->err, e->func, e->cnt, e->msg);
+		e++;
+	}
+
+	if (tapdisk_err.dropped != 0)
+		syslog(LOG_INFO, "TAPDISK ERROR: %d other error messages "
+		       "dropped\n", tapdisk_err.dropped);
+}
+
+/*
+ * Copy every recorded error, plus the number of dropped ones if any,
+ * into the log buffer as TLOG_WARN entries.
+ */
+void
+tlog_flush_errors(void)
+{
+	struct error *e   = tapdisk_err.errors;
+	struct error *end = tapdisk_err.errors + tapdisk_err.cnt;
+
+	while (e < end) {
+		tlog_write(TLOG_WARN, "TAPDISK ERROR: errno %d at %s "
+			   "(cnt = %d): %s\n", e->err, e->func, e->cnt,
+			   e->msg);
+		e++;
+	}
+
+	if (tapdisk_err.dropped != 0)
+		tlog_write(TLOG_WARN, "TAPDISK ERROR: %d other error messages "
+			   "dropped\n", tapdisk_err.dropped);
+}
+
+/*
+ * Write the filled part of the log buffer to @fd, padded with '\n' up to
+ * the next multiple of 512 bytes, then rewind the buffer.
+ */
+static void
+tlog_write_buffer(int fd)
+{
+	ssize_t ret;
+	size_t size, wsize;
+
+	size  = tapdisk_log.p - tapdisk_log.buf;
+	wsize = ((size + 511) & (~511));
+
+	memset(tapdisk_log.buf + size, '\n', wsize - size);
+	ret = write(fd, tapdisk_log.buf, wsize);
+	(void)ret; /* best effort: the logger has nowhere to report this */
+
+	tapdisk_log.p = tapdisk_log.buf;
+}
+
+/*
+ * Write the log buffer, followed by a summary of the recorded errors,
+ * to the log file and rewind the buffer.  In append mode the data goes
+ * to the end of the file; otherwise the file is truncated first.
+ *
+ * The error summary is produced through __tlog_write(), which itself
+ * calls tlog_flush() when the buffer is nearly full.  To keep that from
+ * recursing without bound (opening a new descriptor at every level) and
+ * from overwriting data that has not been written yet, the pending
+ * buffer is written out first and re-entrant calls return immediately.
+ * If the summary is larger than the buffer it wraps, and only its tail
+ * reaches the file.
+ */
+void
+tlog_flush(void)
+{
+	static int flushing;
+	int fd, flags;
+
+	if (!tapdisk_log.buf)
+		return;
+
+	if (flushing)
+		return;
+
+	flags = O_CREAT | O_WRONLY | O_DIRECT | O_NONBLOCK;
+	if (!tapdisk_log.append)
+		flags |= O_TRUNC;
+
+	fd = open(tapdisk_log.file, flags, 0644);
+	if (fd == -1)
+		return;
+
+	if (tapdisk_log.append)
+		if (lseek64(fd, 0, SEEK_END) == (loff_t)-1)
+			goto out;
+
+	flushing = 1;
+
+	tlog_write_buffer(fd);
+
+	tlog_flush_errors();
+	if (tapdisk_log.p != tapdisk_log.buf)
+		tlog_write_buffer(fd);
+
+	flushing = 0;
+
+out:
+	close(fd);
+}
diff --git a/tools/blktap2/drivers/tapdisk-log.h b/tools/blktap2/drivers/tapdisk-log.h
new file mode 100644
index 0000000000..ae2a408dd4
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-log.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_LOG_H_
+#define _TAPDISK_LOG_H_
+
+/* Log levels. An entry is recorded only if its level is not greater than
+ * the level passed to open_tlog(), so TLOG_WARN entries are always kept. */
+#define TLOG_WARN 0
+#define TLOG_INFO 1
+#define TLOG_DBG 2
+
+void open_tlog(char *file, size_t bytes, int level, int append);
+void close_tlog(void);
+void tlog_flush(void);
+void tlog_print_errors(void);
+
+void __tlog_write(int level, const char *func, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
+void __tlog_error(int err, const char *func, const char *fmt, ...)
+ __attribute__((format(printf, 3, 4)));
+
+/* Wrappers around __tlog_write()/__tlog_error() that pass the name of the
+ * calling function (__func__) as the 'func' argument. */
+#define tlog_write(_level, _f, _a...) \
+ __tlog_write(_level, __func__, _f, ##_a)
+
+#define tlog_error(_err, _f, _a...) \
+ __tlog_error(_err, __func__, _f, ##_a)
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-queue.c b/tools/blktap2/drivers/tapdisk-queue.c
new file mode 100644
index 0000000000..5461d415e0
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-queue.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libaio.h>
+
+#include "tapdisk.h"
+#include "tapdisk-log.h"
+#include "tapdisk-queue.h"
+#include "tapdisk-filter.h"
+#include "atomicio.h"
+
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * We used a kernel patch to return an fd associated with the AIO context
+ * so that we can concurrently poll on synchronous and async descriptors.
+ * This is signalled by passing 1 as the io context to io_setup.
+ */
+#define REQUEST_ASYNC_FD 1
+
+/*
+ * Put @tiocb's iocb in the next free slot of the submission array and
+ * chain @tiocb behind the tiocb queued just before it, if there is one.
+ */
+static inline void
+queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	int slot = queue->queued;
+
+	if (slot != 0) {
+		struct iocb *last = queue->iocbs[slot - 1];
+
+		((struct tiocb *)last->data)->next = tiocb;
+	}
+
+	queue->iocbs[slot] = &tiocb->iocb;
+	queue->queued      = slot + 1;
+}
+
+/* Return 1 if at least one tiocb is waiting on the deferred list, else 0. */
+static inline int
+deferred_tiocbs(struct tqueue *queue)
+{
+	return queue->deferred.head ? 1 : 0;
+}
+
+/*
+ * Append @tiocb to the tail of the deferred list and update the
+ * deferral counters.
+ */
+static inline void
+defer_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	struct tlist *pending = &queue->deferred;
+
+	if (pending->head == NULL)
+		pending->head = tiocb;
+	else
+		pending->tail->next = tiocb;
+	pending->tail = tiocb;
+
+	queue->deferrals++;
+	queue->tiocbs_deferred++;
+}
+
+/*
+ * Move the tiocb at the head of the deferred list, if any, onto the
+ * submission queue.
+ */
+static inline void
+queue_deferred_tiocb(struct tqueue *queue)
+{
+	struct tlist *list = &queue->deferred;
+
+	if (list->head) {
+		struct tiocb *tiocb = list->head;
+
+		list->head = tiocb->next;
+		if (!list->head)
+			list->tail = NULL;
+
+		/* Detach from the deferred list.  cancel_tiocbs() walks the
+		 * submission queue through ->next until it hits NULL; a stale
+		 * link here would lead it on into tiocbs that are still
+		 * deferred.  queue_tiocb() re-links this tiocb when the next
+		 * one is queued behind it. */
+		tiocb->next = NULL;
+
+		queue_tiocb(queue, tiocb);
+		queue->tiocbs_deferred--;
+	}
+}
+
+/*
+ * Move deferred tiocbs onto the submission queue until the queue is
+ * full or the deferred list is empty.
+ */
+static inline void
+queue_deferred_tiocbs(struct tqueue *queue)
+{
+	for (;;) {
+		if (tapdisk_queue_full(queue) || !deferred_tiocbs(queue))
+			break;
+		queue_deferred_tiocb(queue);
+	}
+}
+
+/*
+ * Turn the raw result of an I/O into an error code and hand it to the
+ * tiocb's callback: 0 if exactly the requested byte count was
+ * transferred, the result itself if it is negative, -EIO otherwise.
+ * NB: td_complete may queue more tiocbs.
+ */
+static void
+complete_tiocb(struct tqueue *queue, struct tiocb *tiocb, unsigned long res)
+{
+	struct iocb *iocb = &tiocb->iocb;
+	int err = -EIO;
+
+	if (res == iocb->u.c.nbytes)
+		err = 0;
+	else if ((int)res < 0)
+		err = (int)res;
+
+	tiocb->cb(tiocb->arg, tiocb, err);
+}
+
+/*
+ * Complete every tiocb currently on the submission queue with @err and
+ * empty the queue.  Returns the number of tiocbs that were queued.
+ */
+static int
+cancel_tiocbs(struct tqueue *queue, int err)
+{
+	int queued;
+	struct tiocb *tiocb, *next;
+
+	if (!queue->queued)
+		return 0;
+
+	/* 
+	 * td_complete may queue more tiocbs, which
+	 * will overwrite the contents of queue->iocbs.
+	 * use a private linked list to keep track
+	 * of the tiocbs we're cancelling.
+	 */
+	tiocb  = (struct tiocb *)queue->iocbs[0]->data;
+	queued = queue->queued;
+	queue->queued = 0;
+
+	for (; tiocb != NULL; tiocb = next) {
+		/* fetch the link before the callback runs: the callback
+		 * owns the tiocb from then on and may re-prepare or
+		 * requeue it, which rewrites ->next */
+		next = tiocb->next;
+		complete_tiocb(queue, tiocb, err);
+	}
+
+	return queued;
+}
+
+/*
+ * Handle a partial or failed io_submit: record the error, put the iocbs
+ * that were not submitted back on the queue via io_expand_iocbs(), and
+ * complete them with @err.  Returns the result of cancel_tiocbs().
+ */
+static int
+fail_tiocbs(struct tqueue *queue, int succeeded, int total, int err)
+{
+	int requeued;
+
+	ERR(err, "io_submit error: %d of %d failed",
+	    total - succeeded, total);
+
+	/* take any non-submitted, merged iocbs
+	 * off of the queue, split them, and fail them */
+	requeued = io_expand_iocbs(&queue->opioctx,
+				   queue->iocbs, succeeded, total);
+	queue->queued = requeued;
+
+	return cancel_tiocbs(queue, err);
+}
+
+/*
+ * Carry out the transfer described by @iocb synchronously: seek to the
+ * requested offset, then move the whole buffer through atomicio().
+ * Returns the byte count on success, -errno if the seek fails or fewer
+ * bytes than requested were transferred.
+ */
+static inline ssize_t
+iocb_rw(struct iocb *iocb)
+{
+	ssize_t (*xfer)(int, void *, size_t);
+	int fd          = iocb->aio_fildes;
+	char *data      = iocb->u.c.buf;
+	long long where = iocb->u.c.offset;
+	size_t nbytes   = iocb->u.c.nbytes;
+
+	if (iocb->aio_lio_opcode == IO_CMD_PWRITE)
+		xfer = vwrite;
+	else
+		xfer = read;
+
+	if (lseek64(fd, where, SEEK_SET) == (off64_t)-1)
+		return -errno;
+
+	if (atomicio(xfer, fd, data, nbytes) != nbytes)
+		return -errno;
+
+	return nbytes;
+}
+
+/*
+ * Synchronous counterpart of the submit/complete cycle: run every queued
+ * iocb in place with iocb_rw() and complete the tiocbs before returning.
+ * Returns the number of events left after io_split(), or 0 if nothing
+ * was queued.
+ */
+static int
+io_synchronous_rw(struct tqueue *queue)
+{
+ int i, merged, split;
+ struct iocb *iocb;
+ struct tiocb *tiocb;
+ struct io_event *ep;
+
+ if (!queue->queued)
+ return 0;
+
+ tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+ /* presumably coalesces adjacent requests; 'merged' is the number of
+ * iocbs left to execute -- confirm against io-optimize.c */
+ merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+
+ /* the queue is emptied before any callback runs, since callbacks may
+ * queue new tiocbs */
+ queue->queued = 0;
+
+ /* execute each iocb and record the result in an io_event, mimicking
+ * what io_getevents would have filled in */
+ for (i = 0; i < merged; i++) {
+ ep = queue->aio_events + i;
+ iocb = queue->iocbs[i];
+ ep->obj = iocb;
+ ep->res = iocb_rw(iocb);
+ }
+
+ split = io_split(&queue->opioctx, queue->aio_events, merged);
+ tapdisk_filter_events(queue->filter, queue->aio_events, split);
+
+ /* deliver one completion per event */
+ for (i = split, ep = queue->aio_events; i-- > 0; ep++) {
+ iocb = ep->obj;
+ tiocb = (struct tiocb *)iocb->data;
+ complete_tiocb(queue, tiocb, ep->res);
+ }
+
+ /* refill the queue from the deferred list */
+ queue_deferred_tiocbs(queue);
+
+ return split;
+}
+
+/*
+ * Initialise @queue for up to @size outstanding requests.
+ *
+ * @sync:   non-zero selects the synchronous path (io_synchronous_rw);
+ *          zero sets up a libaio context.
+ * @filter: optional filter, stored in the queue and handed to the
+ *          tapdisk_filter_* calls.
+ *
+ * Returns 0 on success or a negative error code: -errno from pipe(), the
+ * negative value returned by io_setup(), -ENOMEM, or the opio_init()
+ * error.
+ * NOTE(review): the local 'i' is unused.
+ */
+int
+tapdisk_init_queue(struct tqueue *queue, int size,
+ int sync, struct tfilter *filter)
+{
+ int i, err;
+
+ memset(queue, 0, sizeof(struct tqueue));
+
+ queue->size = size;
+ queue->sync = sync;
+ queue->filter = filter;
+
+ if (sync) {
+ /* set up a pipe so we can return
+ * a poll fd that won't fire. */
+ if (pipe(queue->dummy_pipe))
+ return -errno;
+ queue->poll_fd = queue->dummy_pipe[0];
+ } else {
+ /* seeding the context with REQUEST_ASYNC_FD asks the patched
+ * kernel to return a pollable fd from io_setup (see the comment
+ * at the REQUEST_ASYNC_FD definition) */
+ queue->aio_ctx = (io_context_t)REQUEST_ASYNC_FD;
+ queue->poll_fd = io_setup(size, &queue->aio_ctx);
+
+ if (queue->poll_fd < 0) {
+ if (queue->poll_fd == -EAGAIN)
+ DPRINTF("Couldn't setup AIO context. If you "
+ "are trying to concurrently use a "
+ "large number of blktap-based disks, "
+ "you may need to increase the "
+ "system-wide aio request limit. "
+ "(e.g. 'echo 1048576 > /proc/sys/fs/"
+ "aio-max-nr')\n");
+ else
+ DPRINTF("Couldn't get fd for AIO poll "
+ "support. This is probably because "
+ "your kernel does not have the "
+ "aio-poll patch applied.\n");
+ return queue->poll_fd;
+ }
+ }
+
+ /* err is preset so that either calloc failing reports -ENOMEM */
+ err = -ENOMEM;
+ queue->iocbs = calloc(size, sizeof(struct iocb *));
+ queue->aio_events = calloc(size, sizeof(struct io_event));
+ if (!queue->iocbs || !queue->aio_events)
+ goto fail;
+
+ err = opio_init(&queue->opioctx, size);
+ if (err)
+ goto fail;
+
+ return 0;
+
+ fail:
+ /* releases the pipe or aio context and whatever was allocated */
+ tapdisk_free_queue(queue);
+ return err;
+}
+
+/*
+ * Release what tapdisk_init_queue() set up: the dummy pipe (sync mode)
+ * or the aio context (async mode), the iocb and event arrays, and the
+ * opio context.
+ */
+void
+tapdisk_free_queue(struct tqueue *queue)
+{
+	if (!queue->sync)
+		io_destroy(queue->aio_ctx);
+	else {
+		close(queue->dummy_pipe[0]);
+		close(queue->dummy_pipe[1]);
+	}
+
+	free(queue->iocbs);
+	free(queue->aio_events);
+	opio_free(&queue->opioctx);
+}
+
+/*
+ * Dump the queue counters and one line per deferred tiocb to the log
+ * at TLOG_WARN.
+ */
+void
+tapdisk_debug_queue(struct tqueue *queue)
+{
+	struct tiocb *cur = queue->deferred.head;
+
+	WARN("TAPDISK QUEUE:\n");
+	WARN("size: %d, sync: %d, queued: %d, iocbs_pending: %d, "
+	     "tiocbs_pending: %d, tiocbs_deferred: %d, deferrals: %"PRIx64"\n",
+	     queue->size, queue->sync, queue->queued, queue->iocbs_pending,
+	     queue->tiocbs_pending, queue->tiocbs_deferred, queue->deferrals);
+
+	if (cur == NULL)
+		return;
+
+	WARN("deferred:\n");
+	while (cur != NULL) {
+		struct iocb *io = &cur->iocb;
+
+		WARN("%s of %lu bytes at %lld\n",
+		     (io->aio_lio_opcode == IO_CMD_PWRITE ?
+		      "write" : "read"),
+		     io->u.c.nbytes, io->u.c.offset);
+		cur = cur->next;
+	}
+}
+
+/*
+ * Prepare @tiocb for a transfer of @size bytes between @buf and @fd at
+ * @offset -- a write if @rw is non-zero, a read otherwise -- and record
+ * the completion callback @cb with its argument @arg.
+ */
+void
+tapdisk_prep_tiocb(struct tiocb *tiocb, int fd, int rw, char *buf, size_t size,
+		   long long offset, td_queue_callback_t cb, void *arg)
+{
+	struct iocb *io = &tiocb->iocb;
+
+	tiocb->next = NULL;
+	tiocb->arg  = arg;
+	tiocb->cb   = cb;
+
+	if (!rw)
+		io_prep_pread(io, fd, buf, size, offset);
+	else
+		io_prep_pwrite(io, fd, buf, size, offset);
+
+	/* set after io_prep_*: the back-pointer lets completion code get
+	 * from the iocb to its tiocb */
+	io->data = tiocb;
+}
+
+/*
+ * Queue @tiocb for submission, or park it on the deferred list when the
+ * queue is already full.
+ */
+void
+tapdisk_queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	if (tapdisk_queue_full(queue)) {
+		defer_tiocb(queue, tiocb);
+		return;
+	}
+
+	queue_tiocb(queue, tiocb);
+}
+
+/*
+ * Submit everything currently on the queue.
+ * In sync mode the requests are executed immediately by
+ * io_synchronous_rw(); otherwise they are filtered, merged and handed
+ * to io_submit().  Returns the value from io_synchronous_rw() in sync
+ * mode, else the number of iocbs io_submit() accepted (0 if it failed).
+ * fail_tiocbs may queue more tiocbs
+ */
+int
+tapdisk_submit_tiocbs(struct tqueue *queue)
+{
+ int merged, submitted, err = 0;
+
+ if (!queue->queued)
+ return 0;
+
+ if (queue->sync)
+ return io_synchronous_rw(queue);
+
+ tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+ merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+ submitted = io_submit(queue->aio_ctx, merged, queue->iocbs);
+
+ DBG("queued: %d, merged: %d, submitted: %d\n",
+ queue->queued, merged, submitted);
+
+ /* a negative return is an error code; a short count is treated as -EIO */
+ if (submitted < 0) {
+ err = submitted;
+ submitted = 0;
+ } else if (submitted < merged)
+ err = -EIO;
+
+ /* account for everything as pending first; on error the tiocbs that
+ * fail_tiocbs() completes are subtracted again below */
+ queue->iocbs_pending += submitted;
+ queue->tiocbs_pending += queue->queued;
+ queue->queued = 0;
+
+ if (err)
+ queue->tiocbs_pending -=
+ fail_tiocbs(queue, submitted, merged, err);
+
+ return submitted;
+}
+
+/*
+ * Call tapdisk_submit_tiocbs() at least once and then again for as long
+ * as the queue is not empty.  Returns the sum of its return values.
+ */
+int
+tapdisk_submit_all_tiocbs(struct tqueue *queue)
+{
+	int total = tapdisk_submit_tiocbs(queue);
+
+	while (!tapdisk_queue_empty(queue))
+		total += tapdisk_submit_tiocbs(queue);
+
+	return total;
+}
+
+/*
+ * Reap the aio events that are ready (without blocking), run the
+ * completion callback of every tiocb they cover, and then move deferred
+ * tiocbs onto the queue.  Returns the number of tiocbs completed.
+ * Callbacks may queue more tiocbs; call submit afterwards.
+ */
+int
+tapdisk_complete_tiocbs(struct tqueue *queue)
+{
+	int i, ret, split;
+	struct iocb *iocb;
+	struct tiocb *tiocb;
+	struct io_event *ep;
+
+	ret = io_getevents(queue->aio_ctx, 0,
+			   queue->size, queue->aio_events, NULL);
+	if (ret < 0) {
+		/* A negative return is an error code, not an event count.
+		 * Treat it as "nothing completed" so it is neither passed
+		 * to io_split() nor subtracted from the pending counters. */
+		ERR(ret, "io_getevents failed: %d", ret);
+		ret = 0;
+	}
+
+	split = io_split(&queue->opioctx, queue->aio_events, ret);
+	tapdisk_filter_events(queue->filter, queue->aio_events, split);
+
+	DBG("events: %d, tiocbs: %d\n", ret, split);
+
+	queue->iocbs_pending  -= ret;
+	queue->tiocbs_pending -= split;
+
+	for (i = split, ep = queue->aio_events; i-- > 0; ep++) {
+		iocb  = ep->obj;
+		tiocb = (struct tiocb *)iocb->data;
+		complete_tiocb(queue, tiocb, ep->res);
+	}
+
+	queue_deferred_tiocbs(queue);
+
+	return split;
+}
+
+/*
+ * Fail every tiocb currently queued for submission with -EIO.
+ * Returns the number of tiocbs that were queued.
+ * cancel_tiocbs may queue more tiocbs
+ */
+int
+tapdisk_cancel_tiocbs(struct tqueue *queue)
+{
+	const int err = -EIO;
+
+	return cancel_tiocbs(queue, err);
+}
+
+/*
+ * Call tapdisk_cancel_tiocbs() at least once and then again for as long
+ * as the queue is not empty.  Returns the total number cancelled.
+ */
+int
+tapdisk_cancel_all_tiocbs(struct tqueue *queue)
+{
+	int total = tapdisk_cancel_tiocbs(queue);
+
+	while (!tapdisk_queue_empty(queue))
+		total += tapdisk_cancel_tiocbs(queue);
+
+	return total;
+}
diff --git a/tools/blktap2/drivers/tapdisk-queue.h b/tools/blktap2/drivers/tapdisk-queue.h
new file mode 100644
index 0000000000..40ff88669c
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-queue.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef TAPDISK_QUEUE_H
+#define TAPDISK_QUEUE_H
+
+#include <libaio.h>
+
+#include "io-optimize.h"
+
+struct tiocb;
+struct tfilter;
+
+typedef void (*td_queue_callback_t)(void *arg, struct tiocb *, int err);
+
+
+/* A libaio iocb together with its completion callback. */
+struct tiocb {
+ /* called on completion as cb(arg, tiocb, err) */
+ td_queue_callback_t cb;
+ void *arg;
+
+ /* the request itself; iocb.data points back at this tiocb */
+ struct iocb iocb;
+ /* link to the next tiocb on the submission queue or the deferred list */
+ struct tiocb *next;
+};
+
+/* Singly linked list of tiocbs (chained through tiocb->next); both
+ * pointers are NULL when the list is empty. */
+struct tlist {
+ struct tiocb *head;
+ struct tiocb *tail;
+};
+
+/* Request queue sitting between tapdisk and libaio (or synchronous I/O). */
+struct tqueue {
+ /* capacity given to tapdisk_init_queue; sizes iocbs[] and aio_events[] */
+ int size;
+ /* non-zero: requests are executed synchronously at submit time */
+ int sync;
+
+ /* descriptor for callers to poll: the value returned by io_setup in
+ * async mode, the read end of dummy_pipe in sync mode */
+ int poll_fd;
+ io_context_t aio_ctx;
+ /* state used by io_merge / io_split / io_expand_iocbs */
+ struct opioctx opioctx;
+ /* sync mode only: pipe providing a poll fd that never fires */
+ int dummy_pipe[2];
+
+ /* number of entries of iocbs[] waiting to be submitted */
+ int queued;
+ struct iocb **iocbs;
+ struct io_event *aio_events;
+
+ /* number of iocbs pending in the aio layer */
+ int iocbs_pending;
+
+ /* number of tiocbs pending in the queue --
+ * this is likely to be larger than iocbs_pending
+ * due to request coalescing */
+ int tiocbs_pending;
+
+ /* iocbs may be deferred if the aio ring is full.
+ * tapdisk_complete_tiocbs will ensure deferred
+ * iocbs are queued as slots become available. */
+ struct tlist deferred;
+ int tiocbs_deferred;
+
+ /* optional tapdisk filter */
+ struct tfilter *filter;
+
+ /* running total of deferred tiocbs; only ever incremented */
+ uint64_t deferrals;
+};
+
+/*
+ * Interface for request producer (i.e., tapdisk)
+ * NB: the following functions may cause additional tiocbs to be queued:
+ * - tapdisk_submit_tiocbs
+ * - tapdisk_cancel_tiocbs
+ * - tapdisk_complete_tiocbs
+ * The *_all_tiocbs variants will handle the first two cases;
+ * be sure to call submit after calling complete in the third case.
+ */
+/* number of tiocbs queued but not yet submitted */
+#define tapdisk_queue_count(q) ((q)->queued)
+/* true when nothing is waiting to be submitted */
+#define tapdisk_queue_empty(q) ((q)->queued == 0)
+/* true when submitted-but-incomplete plus queued tiocbs reach the queue size */
+#define tapdisk_queue_full(q) \
+ (((q)->tiocbs_pending + (q)->queued) >= (q)->size)
+int tapdisk_init_queue(struct tqueue *, int size, int sync, struct tfilter *);
+void tapdisk_free_queue(struct tqueue *);
+void tapdisk_debug_queue(struct tqueue *);
+void tapdisk_queue_tiocb(struct tqueue *, struct tiocb *);
+int tapdisk_submit_tiocbs(struct tqueue *);
+int tapdisk_submit_all_tiocbs(struct tqueue *);
+int tapdisk_complete_tiocbs(struct tqueue *);
+int tapdisk_cancel_tiocbs(struct tqueue *);
+int tapdisk_cancel_all_tiocbs(struct tqueue *);
+void tapdisk_prep_tiocb(struct tiocb *, int, int, char *, size_t,
+ long long, td_queue_callback_t, void *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-ring.c b/tools/blktap2/drivers/tapdisk-ring.c
new file mode 100644
index 0000000000..a5d40cb0a1
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-ring.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+
+#include "tapdisk-ring.h"
+
+/*
+ * Create the listening AF_UNIX stream socket named by ring->ctlfd_path.
+ *
+ * Any stale filesystem entry at that path is unlinked first.  On success
+ * the listening descriptor is stored in ring->ctlfd and 0 is returned;
+ * on failure a negative errno value is returned and no descriptor is kept.
+ */
+static int
+tapdisk_uring_create_ctlfd(td_uring_t *ring)
+{
+	int fd, err;
+	size_t len;
+	struct sockaddr_un saddr;
+
+	/* the path (plus its terminating NUL) must fit in sun_path */
+	len = strnlen(ring->ctlfd_path, sizeof(saddr.sun_path));
+	if (len >= sizeof(saddr.sun_path))
+		return -ENAMETOOLONG;
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1)
+		return -errno;
+
+	/* zero-fill so the copied path is always NUL-terminated */
+	memset(&saddr, 0, sizeof(saddr));
+	saddr.sun_family = AF_UNIX;
+	memcpy(saddr.sun_path, ring->ctlfd_path, len);
+
+	/* a missing file is fine; anything else is a real error */
+	err = unlink(ring->ctlfd_path);
+	if (err == -1 && errno != ENOENT) {
+		err = -errno;
+		goto fail;
+	}
+
+	err = bind(fd, (struct sockaddr *)&saddr, sizeof(saddr));
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	err = listen(fd, 1);
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	ring->ctlfd = fd;
+	return 0;
+
+fail:
+	close(fd);
+	return err;
+}
+
+/*
+ * Tear down the creator side of the control socket: close the descriptor
+ * (a value of 0 is treated as "not open"), then unlink and free the path.
+ */
+static void
+tapdisk_uring_destroy_ctlfd(td_uring_t *ring)
+{
+	char *path = ring->ctlfd_path;
+
+	if (ring->ctlfd != 0) {
+		close(ring->ctlfd);
+		ring->ctlfd = 0;
+	}
+
+	if (path == NULL)
+		return;
+
+	unlink(path);
+	free(path);
+	ring->ctlfd_path = NULL;
+}
+
+/*
+ * Connect to the AF_UNIX stream socket named by ring->ctlfd_path.
+ *
+ * On success the connected descriptor is stored in ring->ctlfd and 0 is
+ * returned; on failure a negative errno value is returned.
+ */
+static int
+tapdisk_uring_connect_ctlfd(td_uring_t *ring)
+{
+	int fd, err;
+	size_t len;
+	struct sockaddr_un saddr;
+
+	/* the path (plus its terminating NUL) must fit in sun_path */
+	len = strnlen(ring->ctlfd_path, sizeof(saddr.sun_path));
+	if (len >= sizeof(saddr.sun_path))
+		return -ENAMETOOLONG;
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1)
+		return -errno;
+
+	memset(&saddr, 0, sizeof(saddr));
+	saddr.sun_family = AF_UNIX;
+	memcpy(saddr.sun_path, ring->ctlfd_path, len);
+
+	/* connect() takes a generic sockaddr pointer */
+	err = connect(fd, (struct sockaddr *)&saddr, sizeof(saddr));
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	ring->ctlfd = fd;
+	return 0;
+
+fail:
+	close(fd);
+	return err;
+}
+
+/*
+ * Client-side teardown of the control socket: close the descriptor when
+ * it is non-zero and release the path string.  The socket file itself is
+ * not unlinked here.
+ */
+static void
+tapdisk_uring_disconnect_ctlfd(td_uring_t *ring)
+{
+	if (ring->ctlfd != 0) {
+		close(ring->ctlfd);
+	}
+
+	free(ring->ctlfd_path);
+	ring->ctlfd_path = NULL;
+}
+
+/*
+ * Create (or open) the POSIX shared-memory object ring->shmem_path, size
+ * it to ring->shmem_size and map it read/write into ring->shmem.
+ *
+ * Returns 0 on success or a negative errno value; ring->shmem is NULL
+ * when the mapping fails.  The descriptor is closed in every case.
+ */
+static int
+tapdisk_uring_create_shmem(td_uring_t *ring)
+{
+	int shm_fd, rc = 0;
+	void *map;
+
+	shm_fd = shm_open(ring->shmem_path, O_CREAT | O_RDWR, 0750);
+	if (shm_fd == -1)
+		return -errno;
+
+	if (ftruncate(shm_fd, ring->shmem_size) == -1) {
+		rc = -errno;
+		goto done;
+	}
+
+	map = mmap(NULL, ring->shmem_size,
+		   PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+	if (map == MAP_FAILED) {
+		ring->shmem = NULL;
+		rc = -errno;
+		goto done;
+	}
+
+	ring->shmem = map;
+
+done:
+	/* the mapping, if any, outlives the descriptor */
+	close(shm_fd);
+	return rc;
+}
+
+/*
+ * Creator-side teardown of the shared memory: unmap the region, then
+ * shm_unlink() the object and free its path string.
+ */
+static void
+tapdisk_uring_destroy_shmem(td_uring_t *ring)
+{
+	char *path = ring->shmem_path;
+
+	if (ring->shmem != NULL) {
+		munmap(ring->shmem, ring->shmem_size);
+		ring->shmem = NULL;
+	}
+
+	if (path == NULL)
+		return;
+
+	shm_unlink(path);
+	free(path);
+	ring->shmem_path = NULL;
+}
+
+/*
+ * Attach to an existing shared-memory ring.
+ *
+ * The header at the start of the object is mapped briefly and copied so
+ * that its cookie and version can be validated; the sizes it advertises
+ * are then stored in @ring and used to map the whole object into
+ * ring->shmem.
+ *
+ * Returns 0 on success, -EINVAL on a cookie/version mismatch, or another
+ * negative errno value.  The descriptor is closed in every case.
+ */
+static int
+tapdisk_uring_connect_shmem(td_uring_t *ring)
+{
+	int fd, err;
+	td_uring_header_t header, *p;
+
+	/* shm_open() always takes a mode; it is ignored without O_CREAT */
+	fd = shm_open(ring->shmem_path, O_RDWR, 0);
+	if (fd == -1)
+		return -errno;
+
+	p = mmap(NULL, sizeof(td_uring_header_t),
+		 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (p == MAP_FAILED) {
+		err = -errno;
+		goto out;
+	}
+
+	/* work on a private copy so the temporary mapping can go away */
+	memcpy(&header, p, sizeof(td_uring_header_t));
+	munmap(p, sizeof(td_uring_header_t));
+
+	if (memcmp(header.cookie,
+		   TAPDISK_URING_COOKIE, sizeof(header.cookie))) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (header.version != TD_URING_CURRENT_VERSION) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* NOTE(review): the advertised sizes are trusted as-is */
+	ring->ring_size = header.ring_size;
+	ring->data_size = header.data_size;
+	ring->shmem_size = header.shmem_size;
+
+	ring->shmem = mmap(NULL, ring->shmem_size,
+			   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (ring->shmem == MAP_FAILED) {
+		err = -errno;
+		ring->shmem = NULL;
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	close(fd);
+	return err;
+}
+
+/*
+ * Client-side teardown of the shared memory: unmap the region when one
+ * is mapped and free the path string.  The object is not unlinked and
+ * ring->shmem is left untouched.
+ */
+static void
+tapdisk_uring_disconnect_shmem(td_uring_t *ring)
+{
+	if (ring->shmem != NULL) {
+		munmap(ring->shmem, ring->shmem_size);
+	}
+
+	free(ring->shmem_path);
+	ring->shmem_path = NULL;
+}
+
+/*
+ * Create a new user-space ring rooted at @location.
+ *
+ * Two names are derived from @location: "<location>.shm" for the shared
+ * memory object and "<location>.cfd" for the control socket.  The shared
+ * region is laid out as header, then ring area, then data area.
+ *
+ * Returns 0 on success.  On failure everything set up so far is torn
+ * down through tapdisk_uring_destroy() and a negative errno is returned.
+ *
+ * NOTE(review): nothing here writes the cookie/version header that
+ * tapdisk_uring_connect_shmem() validates -- confirm where it is set.
+ */
+int
+tapdisk_uring_create(td_uring_t *ring, const char *location,
+		     uint32_t ring_size, uint32_t data_size)
+{
+	int err;
+
+	memset(ring, 0, sizeof(td_uring_t));
+
+	ring->ring_size = ring_size;
+	ring->data_size = data_size;
+	ring->shmem_size = ring_size + data_size + sizeof(td_uring_header_t);
+
+	/* asprintf() leaves the pointer undefined on failure and need not
+	 * set errno, so report a fixed error code */
+	err = asprintf(&ring->shmem_path, "%s.shm", location);
+	if (err == -1) {
+		ring->shmem_path = NULL;
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+	if (err == -1) {
+		ring->ctlfd_path = NULL;
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = tapdisk_uring_create_ctlfd(ring);
+	if (err)
+		goto fail;
+
+	err = tapdisk_uring_create_shmem(ring);
+	if (err)
+		goto fail;
+
+	/* byte-wise pointer arithmetic; the area fields are void pointers */
+	ring->ring_area = (char *)ring->shmem + sizeof(td_uring_header_t);
+	ring->data_area = (char *)ring->ring_area + ring->ring_size;
+
+	return 0;
+
+fail:
+	tapdisk_uring_destroy(ring);
+	return err;
+}
+
+/*
+ * Release a ring made by tapdisk_uring_create(): tear down the shared
+ * memory first, then the control socket.  Always returns 0.
+ */
+int
+tapdisk_uring_destroy(td_uring_t *ring)
+{
+ tapdisk_uring_destroy_shmem(ring);
+ tapdisk_uring_destroy_ctlfd(ring);
+ return 0;
+}
+
+/*
+ * Attach to an existing ring rooted at @location.
+ *
+ * Connects to the "<location>.cfd" control socket and then maps the
+ * "<location>.shm" shared memory object.
+ *
+ * Returns 0 on success.  On failure whatever was set up is released
+ * through tapdisk_uring_disconnect() and a negative errno is returned.
+ *
+ * NOTE(review): unlike tapdisk_uring_create(), ring_area/data_area are
+ * not computed here -- confirm whether clients are expected to do so.
+ */
+int
+tapdisk_uring_connect(td_uring_t *ring, const char *location)
+{
+	int err;
+
+	memset(ring, 0, sizeof(td_uring_t));
+
+	/* asprintf() leaves the pointer undefined on failure and need not
+	 * set errno, so report a fixed error code */
+	err = asprintf(&ring->shmem_path, "%s.shm", location);
+	if (err == -1) {
+		ring->shmem_path = NULL;
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+	if (err == -1) {
+		ring->ctlfd_path = NULL;
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = tapdisk_uring_connect_ctlfd(ring);
+	if (err)
+		goto fail;
+
+	err = tapdisk_uring_connect_shmem(ring);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	tapdisk_uring_disconnect(ring);
+	return err;
+}
+
+/*
+ * Detach from a ring joined with tapdisk_uring_connect(): release the
+ * shared memory mapping, then the control socket.  Always returns 0.
+ */
+int
+tapdisk_uring_disconnect(td_uring_t *ring)
+{
+ tapdisk_uring_disconnect_shmem(ring);
+ tapdisk_uring_disconnect_ctlfd(ring);
+ return 0;
+}
+
+/*
+ * Read exactly one td_uring_message_t from @fd into @message.
+ *
+ * @timeout is in seconds; 0 means block indefinitely in select().
+ * Returns 0 once the whole message has been read, or -EIO on a select
+ * or read failure, end of file, or timeout.
+ */
+static int
+tapdisk_ring_read_message(int fd, td_uring_message_t *message, int timeout)
+{
+	fd_set readfds;
+	int ret, len, offset;
+	struct timeval tv, *t;
+	char *buf;
+
+	t = NULL;
+	offset = 0;
+	len = sizeof(td_uring_message_t);
+	/* advance through the message byte-wise, not in struct units */
+	buf = (char *)message;
+
+	if (timeout) {
+		tv.tv_sec = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	while (offset < len) {
+		FD_ZERO(&readfds);
+		FD_SET(fd, &readfds);
+
+		/* we don't bother reinitializing tv. at worst, it will wait a
+		 * bit more time than expected. */
+
+		ret = select(fd + 1, &readfds, NULL, NULL, t);
+		if (ret == -1)
+			break;
+		else if (FD_ISSET(fd, &readfds)) {
+			ret = read(fd, buf + offset, len - offset);
+			if (ret <= 0)
+				break;
+			offset += ret;
+		} else
+			break;	/* timed out */
+	}
+
+	if (offset != len)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Write exactly one td_uring_message_t from @message to @fd.
+ *
+ * @timeout is in seconds; 0 means block indefinitely in select().
+ * Returns 0 once the whole message has been written, or -EIO on a
+ * select or write failure or timeout.
+ */
+static int
+tapdisk_ring_write_message(int fd, td_uring_message_t *message, int timeout)
+{
+	fd_set writefds;
+	int ret, len, offset;
+	struct timeval tv, *t;
+	const char *buf;
+
+	t = NULL;
+	offset = 0;
+	len = sizeof(td_uring_message_t);
+	/* advance through the message byte-wise, not in struct units */
+	buf = (const char *)message;
+
+	if (timeout) {
+		tv.tv_sec = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	while (offset < len) {
+		FD_ZERO(&writefds);
+		FD_SET(fd, &writefds);
+
+		/* we don't bother reinitializing tv. at worst, it will wait a
+		 * bit more time than expected. */
+
+		ret = select(fd + 1, NULL, &writefds, NULL, t);
+		if (ret == -1)
+			break;
+		else if (FD_ISSET(fd, &writefds)) {
+			ret = write(fd, buf + offset, len - offset);
+			if (ret <= 0)
+				break;
+			offset += ret;
+		} else
+			break;	/* timed out */
+	}
+
+	if (offset != len)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Wait (with a one second timeout) for a message on the ring's control
+ * socket.
+ *
+ * Returns 0 when a KICK message arrives, -EINVAL for any other message
+ * type, or the error from the read helper.
+ */
+int
+tapdisk_uring_poll(td_uring_t *ring)
+{
+	int err;
+	td_uring_message_t message;
+
+	/* the file-local helper is named tapdisk_ring_read_message */
+	err = tapdisk_ring_read_message(ring->ctlfd, &message, 1);
+	if (err)
+		return err;
+
+	if (message.type != TAPDISK_URING_MESSAGE_KICK)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Send a KICK message over the ring's control socket, using a one
+ * second timeout.  Returns 0 on success or the write helper's error.
+ */
+int
+tapdisk_uring_kick(td_uring_t *ring)
+{
+	td_uring_message_t message;
+
+	memset(&message, 0, sizeof(td_uring_message_t));
+	message.type = TAPDISK_URING_MESSAGE_KICK;
+
+	/* the file-local helper is named tapdisk_ring_write_message */
+	return tapdisk_ring_write_message(ring->ctlfd, &message, 1);
+}
diff --git a/tools/blktap2/drivers/tapdisk-ring.h b/tools/blktap2/drivers/tapdisk-ring.h
new file mode 100644
index 0000000000..a70ee10609
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-ring.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_RING_H_
+#define _TAPDISK_RING_H_
+
+#include <inttypes.h>
+
+#include <xenctrl.h>
+#include <xen/io/ring.h>
+
+typedef struct td_uring td_uring_t;
+typedef struct td_uring_header td_uring_header_t;
+typedef struct td_uring_request td_uring_request_t;
+typedef struct td_uring_response td_uring_response_t;
+
+/*
+ * Per-process handle for a user-space ring: a control socket plus a
+ * shared memory region.
+ */
+struct td_uring {
+	int ctlfd;		/* control socket; 0 is treated as unset */
+
+	char *shmem_path;	/* heap-allocated "<location>.shm" */
+	char *ctlfd_path;	/* heap-allocated "<location>.cfd" */
+
+	/* sizes in bytes; read and written by tapdisk-ring.c */
+	uint32_t ring_size;
+	uint32_t data_size;
+	uint32_t shmem_size;	/* length of the whole mapping */
+
+	void *shmem;		/* start of the mapping, NULL if unmapped */
+	void *ring_area;	/* shmem + sizeof(td_uring_header_t) */
+	void *data_area;	/* ring_area + ring_size */
+};
+
+/*
+ * Header stored at the start of the shared memory object.  A connecting
+ * process checks the cookie and version and takes the sizes from it.
+ */
+struct td_uring_header {
+ char cookie[8];
+ uint32_t version;
+ uint32_t shmem_size;
+ uint32_t ring_size;
+ uint32_t data_size;
+ char reserved[4064];
+};
+
+/*
+ * Request element of the td_uring ring.
+ * NOTE(review): presumably 'sec'/'secs' are a start sector and sector
+ * count and 'offset' locates the payload in the data area -- confirm.
+ */
+struct td_uring_request {
+ uint8_t op;
+ uint64_t id;
+ uint64_t sec;
+ uint32_t secs;
+ uint32_t offset;
+};
+
+/*
+ * Response element of the td_uring ring.
+ * NOTE(review): presumably 'id' echoes the request id -- confirm.
+ */
+struct td_uring_response {
+ uint8_t op;
+ uint64_t id;
+ uint8_t status;
+};
+
+DEFINE_RING_TYPES(td_uring, td_uring_request_t, td_uring_response_t);
+
+/* Creator side: set up / tear down the control socket and shared memory. */
+int tapdisk_uring_create(td_uring_t *, const char *location,
+ uint32_t ring_size, uint32_t data_size);
+int tapdisk_uring_destroy(td_uring_t *);
+
+/* Client side: attach to / detach from a ring at 'location'. */
+int tapdisk_uring_connect(td_uring_t *, const char *location);
+int tapdisk_uring_disconnect(td_uring_t *);
+
+/* Notification over the control socket: wait for, or send, a kick. */
+int tapdisk_uring_poll(td_uring_t *);
+int tapdisk_uring_kick(td_uring_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-server.c b/tools/blktap2/drivers/tapdisk-server.c
new file mode 100644
index 0000000000..c6a3de514e
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-server.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/signal.h>
+
+#define TAPDISK
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* Logging shorthands that forward to tlog_write() / tlog_error(). */
+#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/* The single server instance used by every function in this file. */
+ tapdisk_server_t server;
+
+/* Iterate over server.vbds; safe against removal of the current entry. */
+#define tapdisk_server_for_each_vbd(vbd, tmp) \
+ list_for_each_entry_safe(vbd, tmp, &server.vbds, next)
+
+/*
+ * Look up the driver interface for disk type @type in the dtypes table.
+ *
+ * Returns the table entry's 'drv' pointer, or NULL when @type is not a
+ * valid index into the table.
+ */
+struct tap_disk *
+tapdisk_server_find_driver_interface(int type)
+{
+	int n;
+
+	n = sizeof(dtypes) / sizeof(dtypes[0]);
+	/* valid indices are 0 .. n-1 */
+	if (type < 0 || type >= n)
+		return NULL;
+
+	return dtypes[type]->drv;
+}
+
+/*
+ * Search every VBD for an already-open image with the same type and
+ * name as @image.
+ *
+ * Only images flagged TD_OPEN_SHAREABLE are looked up.  Returns the
+ * first match, or NULL when there is none.
+ */
+td_image_t *
+tapdisk_server_get_shared_image(td_image_t *image)
+{
+	td_vbd_t *vbd, *next_vbd;
+	td_image_t *cand, *next_img;
+
+	if (!td_flag_test(image->flags, TD_OPEN_SHAREABLE))
+		return NULL;
+
+	tapdisk_server_for_each_vbd(vbd, next_vbd) {
+		tapdisk_vbd_for_each_image(vbd, cand, next_img) {
+			if (cand->type != image->type)
+				continue;
+			if (strcmp(cand->name, image->name) == 0)
+				return cand;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the registered VBD whose uuid equals @uuid, or NULL if none.
+ * The parameter type follows the prototype in tapdisk-server.h.
+ */
+td_vbd_t *
+tapdisk_server_get_vbd(td_uuid_t uuid)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		if (vbd->uuid == uuid)
+			return vbd;
+
+	return NULL;
+}
+
+/* Append @vbd to the tail of the server's VBD list. */
+void
+tapdisk_server_add_vbd(td_vbd_t *vbd)
+{
+ list_add_tail(&vbd->next, &server.vbds);
+}
+
+/*
+ * Unlink @vbd from the server's VBD list and reinitialise its list node,
+ * then re-check whether the server still has any VBDs to serve.
+ */
+void
+tapdisk_server_remove_vbd(td_vbd_t *vbd)
+{
+ list_del(&vbd->next);
+ INIT_LIST_HEAD(&vbd->next);
+ tapdisk_server_check_state();
+}
+
+/* Queue @tiocb on the server's AIO queue. */
+void
+tapdisk_server_queue_tiocb(struct tiocb *tiocb)
+{
+ tapdisk_queue_tiocb(&server.aio_queue, tiocb);
+}
+
+/*
+ * Emit debug output for the AIO queue and for every VBD, then flush the
+ * log.  Invoked from the SIGUSR1 handler in this file.
+ */
+void
+tapdisk_server_debug(void)
+{
+ td_vbd_t *vbd, *tmp;
+
+ tapdisk_debug_queue(&server.aio_queue);
+
+ tapdisk_server_for_each_vbd(vbd, tmp)
+ tapdisk_vbd_debug(vbd);
+
+ tlog_flush();
+}
+
+/* Stop the main loop (server.run = 0) once no VBDs remain registered. */
+void
+tapdisk_server_check_state(void)
+{
+	if (!list_empty(&server.vbds))
+		return;
+
+	server.run = 0;
+}
+
+/*
+ * Register an event with the server's scheduler; the arguments and the
+ * returned event id are passed through to/from
+ * scheduler_register_event() unchanged.
+ */
+event_id_t
+tapdisk_server_register_event(char mode, int fd,
+ int timeout, event_cb_t cb, void *data)
+{
+ return scheduler_register_event(&server.scheduler,
+ mode, fd, timeout, cb, data);
+}
+
+/* Remove @event from the server's scheduler. */
+void
+tapdisk_server_unregister_event(event_id_t event)
+{
+	/* a void function must not 'return' an expression */
+	scheduler_unregister_event(&server.scheduler, event);
+}
+
+/* Forward @seconds to scheduler_set_max_timeout() for the server's scheduler. */
+void
+tapdisk_server_set_max_timeout(int seconds)
+{
+ scheduler_set_max_timeout(&server.scheduler, seconds);
+}
+
+/* Placeholder called at the top of every main-loop pass; currently empty. */
+static void
+tapdisk_server_assert_locks(void)
+{
+
+}
+
+/*
+ * If any VBD reports that a retry is needed, cap the scheduler timeout
+ * at TD_VBD_RETRY_INTERVAL.  The first such VBD ends the scan.
+ */
+static void
+tapdisk_server_set_retry_timeout(void)
+{
+	td_vbd_t *vbd, *next;
+
+	tapdisk_server_for_each_vbd(vbd, next) {
+		if (!tapdisk_vbd_retry_needed(vbd))
+			continue;
+
+		tapdisk_server_set_max_timeout(TD_VBD_RETRY_INTERVAL);
+		break;
+	}
+}
+
+/* Ask every VBD to check its own progress. */
+static void
+tapdisk_server_check_progress(void)
+{
+	td_vbd_t *vbd, *tmp;
+
+	/* the former gettimeofday() result was never used, so it is gone */
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		tapdisk_vbd_check_progress(vbd);
+}
+
+/* Submit everything queued on the server's AIO queue; the result is ignored. */
+static void
+tapdisk_server_submit_tiocbs(void)
+{
+ tapdisk_submit_all_tiocbs(&server.aio_queue);
+}
+
+/* Call tapdisk_vbd_kick() on every registered VBD. */
+static void
+tapdisk_server_kick_responses(void)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		tapdisk_vbd_kick(vbd);
+}
+
+/* Run tapdisk_vbd_check_state() on every registered VBD. */
+static void
+tapdisk_server_check_vbds(void)
+{
+	td_vbd_t *cur, *next;
+
+	tapdisk_server_for_each_vbd(cur, next) {
+		tapdisk_vbd_check_state(cur);
+	}
+}
+
+/* Run tapdisk_vbd_kill_queue() on every registered VBD. */
+static void
+tapdisk_server_stop_vbds(void)
+{
+	td_vbd_t *cur, *next;
+
+	tapdisk_server_for_each_vbd(cur, next) {
+		tapdisk_vbd_kill_queue(cur);
+	}
+}
+
+static void
+tapdisk_server_send_error(const char *message)
+{
+ td_vbd_t *vbd, *tmp;
+
+ tapdisk_server_for_each_vbd(vbd, tmp)
+ tapdisk_ipc_write_error(&vbd->ipc, message);
+}
+
+/*
+ * Scheduler callback for the control channel: read from the server's
+ * IPC channel.  None of the callback arguments are used.
+ */
+static void
+tapdisk_server_read_ipc_message(event_id_t id, char mode, void *private)
+{
+ tapdisk_ipc_read(&server.ipc);
+}
+
+/*
+ * Scheduler callback for the AIO queue's poll fd: complete finished
+ * tiocbs.  None of the callback arguments are used.
+ */
+static void
+tapdisk_server_aio_queue_event(event_id_t id, char mode, void *private)
+{
+ tapdisk_complete_tiocbs(&server.aio_queue);
+}
+
+/* Unregister the AIO queue's scheduler event, then free the queue. */
+static void
+tapdisk_server_free_aio_queue(void)
+{
+ tapdisk_server_unregister_event(server.aio_queue_event_id);
+ tapdisk_free_queue(&server.aio_queue);
+}
+
+/*
+ * Initialise the server's AIO queue with TAPDISK_TIOCBS slots and hook
+ * its poll fd into the scheduler as a read event.
+ *
+ * Returns 0 on success.  If the queue cannot be initialised its error
+ * is returned; if event registration fails the queue is freed and the
+ * negative event id is returned.
+ */
+static int
+tapdisk_server_initialize_aio_queue(void)
+{
+	int rc;
+	event_id_t ev;
+
+	rc = tapdisk_init_queue(&server.aio_queue, TAPDISK_TIOCBS, 0, NULL);
+	if (rc)
+		return rc;
+
+	ev = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+					   server.aio_queue.poll_fd, 0,
+					   tapdisk_server_aio_queue_event,
+					   NULL);
+	if (ev >= 0) {
+		server.aio_queue_event_id = ev;
+		return 0;
+	}
+
+	tapdisk_free_queue(&server.aio_queue);
+	return ev;
+}
+
+/*
+ * Shut the server down: release the AIO queue, unregister the control
+ * event when one was registered, and close whichever IPC descriptors
+ * are open (-1 marks a closed one).
+ */
+static void
+tapdisk_server_close(void)
+{
+	int rfd = server.ipc.rfd;
+	int wfd = server.ipc.wfd;
+
+	tapdisk_server_free_aio_queue();
+
+	if (server.control_event != 0)
+		scheduler_unregister_event(&server.scheduler,
+					   server.control_event);
+
+	if (rfd != -1)
+		close(rfd);
+
+	if (wfd != -1)
+		close(wfd);
+}
+
+/*
+ * Main loop; runs until server.run is cleared.
+ *
+ * Each pass does the pre-wait housekeeping (lock assertion, retry
+ * timeout, progress check), waits for scheduler events, and then
+ * checks VBD state, submits queued tiocbs and kicks responses.
+ */
+static void
+__tapdisk_server_run(void)
+{
+	int rc;
+
+	for (; server.run; ) {
+		tapdisk_server_assert_locks();
+		tapdisk_server_set_retry_timeout();
+		tapdisk_server_check_progress();
+
+		rc = scheduler_wait_for_events(&server.scheduler);
+		if (rc < 0) {
+			/* log and carry on: a failed wait is not fatal */
+			DBG(TLOG_WARN, "server wait returned %d\n", rc);
+		}
+
+		tapdisk_server_check_vbds();
+		tapdisk_server_submit_tiocbs();
+		tapdisk_server_kick_responses();
+	}
+}
+
+/*
+ * Handler for the signals installed by tapdisk_server_run().
+ *
+ *  SIGBUS, SIGINT: close every VBD.
+ *  SIGXFSZ:        log, kill every VBD queue, and report the error over
+ *                  IPC the first time only.
+ *  SIGUSR1:        dump debug state.
+ *
+ * NOTE(review): the callees run in signal context; confirm they are
+ * safe to call from there.
+ */
+static void
+tapdisk_server_signal_handler(int signal)
+{
+	td_vbd_t *vbd, *tmp;
+	static int xfsz_error_sent = 0;
+
+	if (signal == SIGBUS || signal == SIGINT) {
+		tapdisk_server_for_each_vbd(vbd, tmp)
+			tapdisk_vbd_close(vbd);
+	} else if (signal == SIGXFSZ) {
+		ERR(EFBIG, "received SIGXFSZ");
+		tapdisk_server_stop_vbds();
+
+		/* only notify the other end once */
+		if (!xfsz_error_sent) {
+			tapdisk_server_send_error("received SIGXFSZ, closing queues");
+			xfsz_error_sent = 1;
+		}
+	} else if (signal == SIGUSR1) {
+		tapdisk_server_debug();
+	}
+}
+
+/*
+ * Initialise the global server state.
+ *
+ * @read / @write name the IPC endpoints to open; either may be NULL, in
+ * which case that descriptor stays -1.  When @read is given, its
+ * descriptor is registered with the scheduler as the control event.
+ * The AIO queue is always set up.
+ *
+ * Returns 0 on success or a negative error; on failure every descriptor
+ * and event acquired here is released again.
+ */
+int
+tapdisk_server_initialize(const char *read, const char *write)
+{
+	int err;
+	event_id_t event_id;
+
+	event_id = 0;
+	memset(&server, 0, sizeof(tapdisk_server_t));
+	server.ipc.rfd = server.ipc.wfd = -1;
+
+	INIT_LIST_HEAD(&server.vbds);
+
+	if (read) {
+		server.ipc.rfd = open(read, O_RDWR | O_NONBLOCK);
+		if (server.ipc.rfd < 0) {
+			err = -errno;
+			EPRINTF("FD open failed %s: %d\n", read, err);
+			goto fail;
+		}
+	}
+
+	if (write) {
+		server.ipc.wfd = open(write, O_RDWR | O_NONBLOCK);
+		if (server.ipc.wfd < 0) {
+			err = -errno;
+			EPRINTF("FD open failed %s, %d\n", write, err);
+			goto fail;
+		}
+	}
+
+	scheduler_initialize(&server.scheduler);
+
+	if (read) {
+		event_id = scheduler_register_event(&server.scheduler,
+						    SCHEDULER_POLL_READ_FD,
+						    server.ipc.rfd, 0,
+						    tapdisk_server_read_ipc_message,
+						    NULL);
+		if (event_id < 0) {
+			err = event_id;
+			goto fail;
+		}
+	}
+
+	err = tapdisk_server_initialize_aio_queue();
+	if (err)
+		goto fail;
+
+	server.control_event = event_id;
+	server.run = 1;
+
+	return 0;
+
+fail:
+	/* 0 is a valid descriptor; only negative values mean "not open" */
+	if (server.ipc.rfd >= 0) {
+		close(server.ipc.rfd);
+		server.ipc.rfd = -1;
+	}
+	if (server.ipc.wfd >= 0) {
+		close(server.ipc.wfd);
+		server.ipc.wfd = -1;
+	}
+	/* server.control_event is not assigned yet: use the local id */
+	if (event_id > 0)
+		scheduler_unregister_event(&server.scheduler, event_id);
+	return err;
+}
+
+/*
+ * Run the server: apply resource limits, install the signal handler for
+ * SIGBUS, SIGINT, SIGUSR1 and SIGXFSZ, run the main loop until it
+ * stops, then close the server.
+ *
+ * Returns the resource-limit error if that step fails, otherwise 0.
+ */
+int
+tapdisk_server_run(void)
+{
+	static const int handled[] = { SIGBUS, SIGINT, SIGUSR1, SIGXFSZ };
+	unsigned int i;
+	int err;
+
+	err = tapdisk_set_resource_limits();
+	if (err)
+		return err;
+
+	for (i = 0; i < sizeof(handled) / sizeof(handled[0]); i++)
+		signal(handled[i], tapdisk_server_signal_handler);
+
+	__tapdisk_server_run();
+	tapdisk_server_close();
+
+	return 0;
+}
diff --git a/tools/blktap2/drivers/tapdisk-server.h b/tools/blktap2/drivers/tapdisk-server.h
new file mode 100644
index 0000000000..09a4e13b81
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-server.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_SERVER_H_
+#define _TAPDISK_SERVER_H_
+
+#include "tapdisk-vbd.h"
+#include "tapdisk-queue.h"
+
+/* Driver lookup by disk type index. */
+struct tap_disk *tapdisk_server_find_driver_interface(int);
+
+/* Find an already-open image matching the given one, if shareable. */
+td_image_t *tapdisk_server_get_shared_image(td_image_t *);
+
+/* VBD registry: lookup by uuid, add, remove. */
+td_vbd_t *tapdisk_server_get_vbd(td_uuid_t);
+void tapdisk_server_add_vbd(td_vbd_t *);
+void tapdisk_server_remove_vbd(td_vbd_t *);
+
+/* Queue a tiocb on the server-wide AIO queue. */
+void tapdisk_server_queue_tiocb(struct tiocb *);
+
+/* Stops the main loop once no VBDs remain. */
+void tapdisk_server_check_state(void);
+
+/* Thin wrappers around the server's scheduler. */
+event_id_t tapdisk_server_register_event(char, int, int, event_cb_t, void *);
+void tapdisk_server_unregister_event(event_id_t);
+void tapdisk_server_set_max_timeout(int);
+
+/* Lifecycle: initialize with optional read/write IPC paths, then run. */
+int tapdisk_server_initialize(const char *, const char *);
+int tapdisk_server_run(void);
+
+/* Size passed to tapdisk_init_queue() for the server's AIO queue. */
+#define TAPDISK_TIOCBS (TAPDISK_DATA_REQUESTS + 50)
+
+/* State of the single tapdisk server instance. */
+typedef struct tapdisk_server {
+ int run; /* main loop keeps going while non-zero */
+ td_ipc_t ipc; /* control channel; rfd/wfd are -1 when closed */
+ struct list_head vbds; /* registered VBDs, linked through vbd->next */
+ scheduler_t scheduler;
+ event_id_t control_event; /* event for ipc.rfd, 0 when none */
+ struct tqueue aio_queue;
+ event_id_t aio_queue_event_id; /* event for aio_queue.poll_fd */
+} tapdisk_server_t;
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-stream.c b/tools/blktap2/drivers/tapdisk-stream.c
new file mode 100644
index 0000000000..8fa9d9e0bf
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-stream.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+
+#define POLL_READ 0
+#define POLL_WRITE 1
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+/* Self-pipe used to wake the enqueue event; 'set' is 1 while a wakeup is outstanding. */
+struct tapdisk_stream_poll {
+ int pipe[2]; /* indexed by POLL_READ / POLL_WRITE, -1 when closed */
+ int set;
+};
+
+/* One in-flight read; its index in tapdisk_stream.requests is the blkif request id. */
+struct tapdisk_stream_request {
+ uint64_t sec; /* first sector of the read */
+ uint32_t secs; /* number of sectors */
+ uint64_t seqno; /* issue order, used to write output in sequence */
+ blkif_request_t blkif_req;
+ struct list_head next; /* link on the free, pending or completed list */
+};
+
+/* State for streaming a range of sectors from an image to out_fd. */
+struct tapdisk_stream {
+ td_vbd_t *vbd;
+
+ unsigned int id;
+ int in_fd;
+ int out_fd;
+
+ int err; /* non-zero once a read has failed */
+
+ /* sector positions: next to issue, first, and one past the last */
+ uint64_t cur;
+ uint64_t start;
+ uint64_t end;
+
+ /* sequence counters: requests issued / requests written out */
+ uint64_t started;
+ uint64_t completed;
+
+ struct tapdisk_stream_poll poll;
+ event_id_t enqueue_event_id;
+
+ struct list_head free_list;
+ struct list_head pending_list;
+ struct list_head completed_list; /* kept sorted by seqno */
+
+ struct tapdisk_stream_request requests[MAX_REQUESTS];
+};
+
+static unsigned int tapdisk_stream_count;
+
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+/* Print the command-line synopsis for @app and exit with status @err. */
+static void
+usage(const char *app, int err)
+{
+ printf("usage: %s <-n type:/path/to/image> "
+ "[-c sector count] [-s skip sectors]\n", app);
+ exit(err);
+}
+
+/* Reset @p to the closed state: both pipe ends -1 and no wakeup pending. */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+	p->pipe[POLL_READ] = -1;
+	p->pipe[POLL_WRITE] = -1;
+	p->set = 0;
+}
+
+/*
+ * Create the wakeup pipe for @p and make both ends non-blocking.
+ *
+ * Returns 0 on success or a negative errno value; on failure @p is left
+ * in the closed state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+	int err;
+
+	tapdisk_stream_poll_initialize(p);
+
+	err = pipe(p->pipe);
+	if (err)
+		return -errno;
+
+	err = fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK);
+	if (err)
+		goto out;
+
+	err = fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK);
+	if (err)
+		goto out;
+
+	return 0;
+
+out:
+	/* capture errno before close() has a chance to overwrite it */
+	err = -errno;
+	close(p->pipe[POLL_READ]);
+	close(p->pipe[POLL_WRITE]);
+	tapdisk_stream_poll_initialize(p);
+	return err;
+}
+
+/* Close whichever pipe ends are open and reset @p to the closed state. */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+	int rd = p->pipe[POLL_READ];
+	int wr = p->pipe[POLL_WRITE];
+
+	if (rd != -1)
+		close(rd);
+	if (wr != -1)
+		close(wr);
+
+	tapdisk_stream_poll_initialize(p);
+}
+
+/*
+ * Consume one wakeup token from the pipe and mark the poll as not set.
+ * The read() result is deliberately ignored.
+ */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+ int dummy;
+
+ read(p->pipe[POLL_READ], &dummy, sizeof(dummy));
+ p->set = 0;
+}
+
+/*
+ * Post a wakeup by writing one token to the pipe, unless one is already
+ * outstanding.  The write() result is ignored.
+ */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+	int token = 0;
+
+	if (p->set)
+		return;
+
+	write(p->pipe[POLL_WRITE], &token, sizeof(token));
+	p->set = 1;
+}
+
+/*
+ * Non-zero when the stream is finished: nothing is pending and either
+ * the end position has been reached or an error was recorded.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+	if (!list_empty(&s->pending_list))
+		return 0;
+
+	return (s->cur == s->end || s->err);
+}
+
+/* Zero @req and make its list node an empty, self-linked head. */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+ memset(req, 0, sizeof(*req));
+ INIT_LIST_HEAD(&req->next);
+}
+
+/* Index of @req within s->requests, obtained by pointer subtraction. */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+ struct tapdisk_stream_request *req)
+{
+ return (req - s->requests);
+}
+
+/*
+ * Take the first request off the free list and reinitialise it.
+ * Returns NULL when the free list is empty.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+	struct tapdisk_stream_request *req = NULL;
+
+	if (!list_empty(&s->free_list)) {
+		req = list_entry(s->free_list.next,
+				 struct tapdisk_stream_request, next);
+		list_del_init(&req->next);
+		tapdisk_stream_initialize_request(req);
+	}
+
+	return req;
+}
+
+/*
+ * Write the data of a completed request to s->out_fd.  The buffer is
+ * located from the vbd ring's vstart and the request's index; the
+ * length is the request's sector count in bytes.
+ * NOTE(review): the write() result is ignored, so short writes and
+ * errors go unnoticed -- confirm this is intended.
+ */
+static void
+tapdisk_stream_print_request(struct tapdisk_stream *s,
+ struct tapdisk_stream_request *sreq)
+{
+ unsigned long idx = (unsigned long)tapdisk_stream_request_idx(s, sreq);
+ char *buf = (char *)MMAP_VADDR(s->vbd->ring.vstart, idx, 0);
+ write(s->out_fd, buf, sreq->secs << SECTOR_SHIFT);
+}
+
+/*
+ * Flush completed requests to the output in issue order.
+ *
+ * The completed list is consumed from its head for as long as the head
+ * carries the next expected sequence number; each flushed request is
+ * returned to the free list.
+ */
+static void
+tapdisk_stream_write_data(struct tapdisk_stream *s)
+{
+	struct tapdisk_stream_request *head;
+
+	while (!list_empty(&s->completed_list)) {
+		head = list_entry(s->completed_list.next,
+				  struct tapdisk_stream_request, next);
+
+		/* a gap in the sequence: wait for the missing request */
+		if (head->seqno != s->completed)
+			break;
+
+		s->completed++;
+		tapdisk_stream_print_request(s, head);
+
+		list_del_init(&head->next);
+		list_add_tail(&head->next, &s->free_list);
+	}
+}
+
+/*
+ * Insert @sreq into the completed list in front of the first entry with
+ * a larger sequence number, or at the tail when there is none.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+			       struct tapdisk_stream_request *sreq)
+{
+	struct tapdisk_stream_request *cur;
+	struct list_head *before = &s->completed_list;
+
+	list_for_each_entry(cur, &s->completed_list, next) {
+		if (sreq->seqno < cur->seqno) {
+			before = &cur->next;
+			break;
+		}
+	}
+
+	/* list_add_tail() on a node inserts immediately before it */
+	list_add_tail(&sreq->next, before);
+}
+
+/*
+ * VBD completion callback for a stream request.
+ *
+ * @arg is the owning tapdisk_stream; rsp->id indexes s->requests.
+ * A successful request is queued for ordered output; a failed one sets
+ * s->err to EIO and goes back on the free list.  Either way any
+ * in-order data is flushed and the enqueue event is woken.
+ *
+ * NOTE(review): rsp->id is used as an array index without a range check.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+	struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+	struct tapdisk_stream_request *sreq = s->requests + rsp->id;
+
+	list_del_init(&sreq->next);
+
+	if (rsp->status == BLKIF_RSP_OKAY)
+		tapdisk_stream_queue_completed(s, sreq);
+	else {
+		s->err = EIO;
+		list_add_tail(&sreq->next, &s->free_list);
+		/* the "0x" prefix calls for a hexadecimal conversion */
+		fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec);
+	}
+
+	tapdisk_stream_write_data(s);
+	tapdisk_stream_poll_set(&s->poll);
+}
+
+/*
+ * Scheduler callback that issues read requests for the stream passed in
+ * @arg.
+ *
+ * Clears the wakeup, closes the image if the stream is finished, and
+ * otherwise builds blkif read requests until the end position is
+ * reached, an error is recorded, or no free request slot is left.  The
+ * batch is then handed to the VBD.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+ td_vbd_t *vbd;
+ int i, idx, psize;
+ struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+
+ vbd = s->vbd;
+ tapdisk_stream_poll_clear(&s->poll);
+
+ if (tapdisk_stream_stop(s)) {
+ tapdisk_stream_close_image(s);
+ return;
+ }
+
+ psize = getpagesize();
+
+ while (s->cur < s->end && !s->err) {
+ blkif_request_t *breq;
+ td_vbd_request_t *vreq;
+ struct tapdisk_stream_request *sreq;
+
+ sreq = tapdisk_stream_get_request(s);
+ if (!sreq)
+ break;
+
+ /* the slot index doubles as the blkif request id */
+ idx = tapdisk_stream_request_idx(s, sreq);
+
+ sreq->sec = s->cur;
+ sreq->secs = 0;
+ sreq->seqno = s->started++;
+
+ breq = &sreq->blkif_req;
+ breq->id = idx;
+ breq->nr_segments = 0;
+ breq->sector_number = sreq->sec;
+ breq->operation = BLKIF_OP_READ;
+
+ /* one segment per page, up to the per-request segment limit */
+ for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+ uint32_t secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+ struct blkif_request_segment *seg = breq->seg + i;
+
+ if (!secs)
+ break;
+
+ sreq->secs += secs;
+ s->cur += secs;
+
+ seg->first_sect = 0;
+ seg->last_sect = secs - 1;
+ breq->nr_segments++;
+ }
+
+ /* the VBD request slot with the same index must be idle */
+ vreq = vbd->request_list + idx;
+
+ assert(list_empty(&vreq->next));
+ assert(vreq->secs_pending == 0);
+
+ memcpy(&vreq->req, breq, sizeof(*breq));
+ vbd->received++;
+ vreq->vbd = vbd;
+
+ tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+ list_add_tail(&sreq->next, &s->pending_list);
+ }
+
+ tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Assign the stream an id, bring up the tapdisk server and a vbd for that
+ * id, install tapdisk_stream_dequeue() as the completion callback and
+ * open @path read-only as an image of driver type @type.
+ *
+ * Returns 0 on success; otherwise the failing step's error code (ENODEV
+ * when the freshly initialized vbd cannot be looked up), after printing
+ * a message to stderr.
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
+{
+	int err;
+
+	s->id = tapdisk_stream_count++;
+
+	err = tapdisk_server_initialize(NULL, NULL);
+
+	if (!err)
+		err = tapdisk_vbd_initialize(-1, -1, s->id);
+
+	if (!err) {
+		s->vbd = tapdisk_server_get_vbd(s->id);
+		if (!s->vbd)
+			err = ENODEV;
+	}
+
+	if (!err) {
+		tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
+
+		err = tapdisk_vbd_open_vdi(s->vbd, path, type,
+					   TAPDISK_STORAGE_TYPE_DEFAULT,
+					   TD_OPEN_RDONLY);
+	}
+
+	if (!err) {
+		s->vbd->reopened = 1;
+		return 0;
+	}
+
+	fprintf(stderr, "failed to open %s: %d\n", path, err);
+	return err;
+}
+
+/*
+ * Close the image behind the stream and release its vbd: the vbd is
+ * looked up by s->id, closed, removed from the server, and its buffer
+ * region (ring.vstart), name and the structure itself are freed.
+ * Does nothing when no vbd is registered under s->id.
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+	td_vbd_t *doomed = tapdisk_server_get_vbd(s->id);
+
+	if (!doomed)
+		return;
+
+	tapdisk_vbd_close_vdi(doomed);
+	tapdisk_server_remove_vbd(doomed);
+
+	/* ring.vstart is the buffer from tapdisk_stream_initialize_requests() */
+	free((void *)doomed->ring.vstart);
+	free(doomed->name);
+	free(doomed);
+
+	s->vbd = NULL;
+}
+
+/*
+ * Select the range of the image to stream.
+ *
+ * @count: number of units to stream, or (uint64_t)-1 for "everything
+ *         from @skip to the end of the image"
+ * @skip:  offset of the first unit to stream
+ *         (units are those of image.size as reported by
+ *         tapdisk_vbd_get_image_info() -- presumably sectors)
+ *
+ * Returns 0 on success, the error from tapdisk_vbd_get_image_info(), or
+ * -EINVAL when the requested range does not fit inside the image.
+ *
+ * The bounds checks are written as subtractions from the image size so
+ * that neither "count + skip" nor "size - skip" can wrap around.
+ */
+static int
+tapdisk_stream_set_position(struct tapdisk_stream *s,
+			    uint64_t count, uint64_t skip)
+{
+	int err;
+	image_t image;
+	uint64_t size;
+
+	err = tapdisk_vbd_get_image_info(s->vbd, &image);
+	if (err) {
+		fprintf(stderr, "failed getting image size: %d\n", err);
+		return err;
+	}
+
+	size = (uint64_t)image.size;
+
+	/* reject a start offset beyond the image before subtracting */
+	if (skip > size) {
+		fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+			skip, size);
+		return -EINVAL;
+	}
+
+	if (count == (uint64_t)-1)
+		count = size - skip;
+
+	if (count > size - skip) {
+		fprintf(stderr, "skip 0x%"PRIx64" + count 0x%"PRIx64
+			" past end of image 0x%"PRIx64"\n",
+			skip, count, size);
+		return -EINVAL;
+	}
+
+	s->start = skip;
+	s->cur   = s->start;
+	s->end   = s->start + count;
+
+	return 0;
+}
+
+/*
+ * Allocate the page-aligned data buffer used by the vbd and put every
+ * entry of s->requests on the free list.
+ *
+ * Returns 0 on success or the posix_memalign() error code.
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+	struct tapdisk_stream_request *req;
+	td_ring_t *ring = &s->vbd->ring;
+	int err, psize;
+	size_t size;
+
+	psize = getpagesize();
+	size  = psize * BLKTAP_MMAP_REGION_SIZE;
+
+	/*
+	 * The vbd addresses request buffers relative to ring->vstart, so
+	 * pointing vstart at our own allocation makes it use these buffers.
+	 */
+	err = posix_memalign((void **)&ring->vstart, psize, size);
+	if (err) {
+		fprintf(stderr, "failed to allocate buffers: %d\n", err);
+		ring->vstart = 0;
+		return err;
+	}
+
+	for (req = s->requests; req < s->requests + MAX_REQUESTS; req++) {
+		tapdisk_stream_initialize_request(req);
+		list_add_tail(&req->next, &s->free_list);
+	}
+
+	return 0;
+}
+
+/*
+ * Open the stream's poll object and register tapdisk_stream_enqueue() as
+ * the handler for readability of its read end.  The returned event id is
+ * remembered in s->enqueue_event_id.
+ *
+ * Returns 0 on success; otherwise the error is printed and returned.
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+	int err;
+
+	err = tapdisk_stream_poll_open(&s->poll);
+	if (!err) {
+		err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+						    s->poll.pipe[POLL_READ], 0,
+						    tapdisk_stream_enqueue, s);
+		if (err >= 0) {
+			/* non-negative return value is the event id */
+			s->enqueue_event_id = err;
+			return 0;
+		}
+	}
+
+	fprintf(stderr, "failed to register event: %d\n", err);
+	return err;
+}
+
+/*
+ * Undo tapdisk_stream_register_enqueue_event(): unregister the event if
+ * one was recorded, forget its id, and close the poll object.
+ */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+	if (s->enqueue_event_id != 0) {
+		tapdisk_server_unregister_event(s->enqueue_event_id);
+		s->enqueue_event_id = 0;
+	}
+
+	/* the poll object is closed whether or not an event was registered */
+	tapdisk_stream_poll_close(&s->poll);
+}
+
+/*
+ * Put a stream into its pristine state: everything zeroed, both file
+ * descriptors marked invalid (-1) and all three request lists empty.
+ */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+	memset(s, 0, sizeof(*s));
+
+	s->out_fd = -1;
+	s->in_fd  = -1;
+
+	INIT_LIST_HEAD(&s->completed_list);
+	INIT_LIST_HEAD(&s->pending_list);
+	INIT_LIST_HEAD(&s->free_list);
+}
+
+/*
+ * Duplicate stdout into s->out_fd, the descriptor the stream writes to.
+ *
+ * Returns 0 on success or the (positive) errno value from dup().
+ */
+static int
+tapdisk_stream_open_fds(struct tapdisk_stream *s)
+{
+	s->out_fd = dup(STDOUT_FILENO);
+	if (s->out_fd == -1) {
+		/* save errno first: fprintf() is allowed to modify it */
+		int err = errno;
+
+		fprintf(stderr, "failed to open output: %d\n", err);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Fully set up a stream: reset it, open the output descriptor, open the
+ * image, select the [skip, skip + count) range, allocate the requests and
+ * register the enqueue event.  Setup stops at the first failing step.
+ *
+ * Returns 0 on success or the error code of the step that failed.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *path,
+		    int type, uint64_t count, uint64_t skip)
+{
+	int err;
+
+	tapdisk_stream_initialize(s);
+
+	err = tapdisk_stream_open_fds(s);
+	if (!err)
+		err = tapdisk_stream_open_image(s, path, type);
+	if (!err)
+		err = tapdisk_stream_set_position(s, count, skip);
+	if (!err)
+		err = tapdisk_stream_initialize_requests(s);
+	if (!err)
+		err = tapdisk_stream_register_enqueue_event(s);
+
+	return err;
+}
+
+/*
+ * Tear down a stream: close the output descriptor, close the image and
+ * free its vbd, then unregister the enqueue event (which also closes the
+ * stream's poll object).
+ */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+ close(s->out_fd);
+ tapdisk_stream_close_image(s);
+ tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Run the stream to completion: issue the first batch of requests by
+ * calling the enqueue handler directly, then hand control to the tapdisk
+ * server loop.  Returns the error recorded in s->err (0 if none).
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+ tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s);
+ tapdisk_server_run();
+ return s->err;
+}
+
+/*
+ * Entry point.  Options:
+ *   -n <type:path>  image to stream (mandatory)
+ *   -c <count>      amount to stream (default: up to the end of the image)
+ *   -s <skip>       offset to start at (default: 0)
+ *   -h              show usage
+ *
+ * Returns 0 on success or the error code of the step that failed.
+ */
+int
+main(int argc, char *argv[])
+{
+ int c, err, type;
+ char *params, *path;
+ uint64_t count, skip;
+ struct tapdisk_stream stream;
+
+ err = 0;
+ skip = 0;
+ count = (uint64_t)-1; /* sentinel: stream to the end of the image */
+ params = NULL;
+
+ while ((c = getopt(argc, argv, "n:c:s:h")) != -1) {
+ switch (c) {
+ case 'n':
+ params = optarg;
+ break;
+ case 'c':
+ /* NOTE(review): conversion errors are not detected */
+ count = strtoull(optarg, NULL, 10);
+ break;
+ case 's':
+ skip = strtoull(optarg, NULL, 10);
+ break;
+ default:
+ err = EINVAL;
+ /* deliberate fall through: unknown options also print usage */
+ case 'h':
+ /* presumably usage() does not return -- TODO confirm */
+ usage(argv[0], err);
+ }
+ }
+
+ if (!params)
+ usage(argv[0], EINVAL);
+
+ /* split "type:path"; path points into params (no copy is made) */
+ err = tapdisk_parse_disk_type(params, &path, &type);
+ if (err) {
+ fprintf(stderr, "invalid argument %s: %d\n", params, err);
+ return err;
+ }
+
+ tapdisk_start_logging("tapdisk-stream");
+
+ err = tapdisk_stream_open(&stream, path, type, count, skip);
+ if (err)
+ goto out;
+
+ err = tapdisk_stream_run(&stream);
+ if (err)
+ goto out;
+
+ err = 0;
+
+out:
+ /* also reached after a partial open; release copes with that state */
+ tapdisk_stream_release(&stream);
+ tapdisk_stop_logging();
+ return err;
+}
diff --git a/tools/blktap2/drivers/tapdisk-utils.c b/tools/blktap2/drivers/tapdisk-utils.c
new file mode 100644
index 0000000000..560f3bf6cc
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-utils.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/fs.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+
+#include "tapdisk.h"
+#include "disktypes.h"
+#include "blktaplib.h"
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+/*
+ * Start logging for this process: syslog under the identity
+ * "<name>[<pid>]" (facility LOG_DAEMON) plus the tapdisk log at
+ * /tmp/tapdisk.log.
+ */
+void
+tapdisk_start_logging(const char *name)
+{
+	/* static: the identity string is handed to openlog() by pointer */
+	static char ident[128];
+
+	snprintf(ident, sizeof(ident), "%s[%d]", name, getpid());
+
+	openlog(ident, LOG_CONS | LOG_ODELAY, LOG_DAEMON);
+	open_tlog("/tmp/tapdisk.log", (64 << 10), TLOG_WARN, 0);
+}
+
+/*
+ * Counterpart of tapdisk_start_logging(): closes syslog and the tapdisk
+ * log.
+ */
+void
+tapdisk_stop_logging(void)
+{
+ closelog();
+ close_tlog();
+}
+
+/*
+ * Lift the locked-memory limit, lock all current and future pages into
+ * RAM, and (when CORE_DUMP is defined) lift the core-file size limit.
+ *
+ * Returns 0 on success or -errno when RLIMIT_MEMLOCK or mlockall()
+ * fails.  Failure to raise RLIMIT_CORE is only logged.
+ */
+int
+tapdisk_set_resource_limits(void)
+{
+	int err;
+	struct rlimit rlim;
+
+	rlim.rlim_cur = RLIM_INFINITY;
+	rlim.rlim_max = RLIM_INFINITY;
+
+	if (setrlimit(RLIMIT_MEMLOCK, &rlim) == -1) {
+		/* capture errno before logging, which may overwrite it */
+		err = errno;
+		EPRINTF("RLIMIT_MEMLOCK failed: %d\n", err);
+		return -err;
+	}
+
+	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1) {
+		err = errno;
+		EPRINTF("mlockall failed: %d\n", err);
+		return -err;
+	}
+
+#define CORE_DUMP
+#if defined(CORE_DUMP)
+	if (setrlimit(RLIMIT_CORE, &rlim) == -1)
+		EPRINTF("RLIMIT_CORE failed: %d\n", errno);
+#endif
+
+	return 0;
+}
+
+/*
+ * Duplicate @name into a freshly allocated string stored in *@dup.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when @name is MAX_NAME_LEN
+ * characters or longer, or -ENOMEM when the copy cannot be allocated.
+ * *@dup is NULL on every failure.
+ */
+int
+tapdisk_namedup(char **dup, const char *name)
+{
+	char *copy;
+
+	*dup = NULL;
+
+	/* strnlen() bounds the scan in case name is not terminated early */
+	if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	copy = strdup(name);
+	*dup = copy;
+
+	return copy ? 0 : -ENOMEM;
+}
+
+/*
+ * Split a "<handle>:<path>" parameter string and resolve the handle
+ * against the dtypes table.
+ *
+ * @params: input string; must contain a ':' separator
+ * @_path:  on success, points at the character after the ':' inside
+ *          @params (no copy is made)
+ * @_type:  on success, receives the idnum of the matching disk type
+ *
+ * Returns 0 on success, -ENAMETOOLONG when @params or its handle is too
+ * long, -EINVAL when there is no ':' and -ENODEV when the handle is
+ * unknown or its disk type has idnum -1.
+ */
+int
+tapdisk_parse_disk_type(const char *params, char **_path, int *_type)
+{
+	int i, size;
+	size_t handle_len;
+	char *ptr, handle[10];
+
+	if (strlen(params) + 1 >= MAX_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	ptr = strchr(params, ':');
+	if (!ptr)
+		return -EINVAL;
+
+	/* ">=": one byte of handle[] is needed for the terminator */
+	handle_len = ptr - params;
+	if (handle_len >= sizeof(handle))
+		return -ENAMETOOLONG;
+
+	memcpy(handle, params, handle_len);
+	handle[handle_len] = '\0';
+
+	size = sizeof(dtypes) / sizeof(disk_info_t *);
+	for (i = 0; i < size; i++) {
+		/*
+		 * Whole-string comparison: comparing only handle_len bytes
+		 * would accept any prefix of a known handle, and an empty
+		 * handle would match the first table entry.
+		 */
+		if (strcmp(handle, dtypes[i]->handle))
+			continue;
+
+		if (dtypes[i]->idnum == -1)
+			return -ENODEV;
+
+		*_type = dtypes[i]->idnum;
+		*_path = ptr + 1;
+
+		return 0;
+	}
+
+	return -ENODEV;
+}
+
+/*
+ * Determine the size of the image behind @fd.
+ *
+ * @fd:           open descriptor of a block device or regular file
+ * @_sectors:     receives the image size in sectors
+ * @_sector_size: receives the sector size in bytes
+ *
+ * Block devices are queried with the BLKGETSIZE / BLKSSZGET ioctls; for
+ * anything else the size comes from fstat() and the sector size is
+ * DEFAULT_SECTOR_SIZE.  A size of zero is replaced by a fixed fallback.
+ *
+ * Returns 0 on success or -EINVAL when fstat() or BLKGETSIZE fails; both
+ * outputs are left at 0 in that case.
+ */
+int
+tapdisk_get_image_size(int fd, uint64_t *_sectors, uint32_t *_sector_size)
+{
+	struct stat stat;
+	uint64_t sectors;
+	uint32_t sector_size;
+
+	sectors       = 0;
+	sector_size   = 0;
+	*_sectors     = 0;
+	*_sector_size = 0;
+
+	if (fstat(fd, &stat)) {
+		DPRINTF("ERROR: fstat failed, Couldn't stat image");
+		return -EINVAL;
+	}
+
+	if (S_ISBLK(stat.st_mode)) {
+		/* BLKGETSIZE stores an unsigned long, not a uint64_t */
+		unsigned long blocks = 0;
+
+		/*Accessing block device directly*/
+		if (ioctl(fd, BLKGETSIZE, &blocks)) {
+			DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+			return -EINVAL;
+		}
+		sectors = blocks;
+
+		/*Get the sector size*/
+#if defined(BLKSSZGET)
+		{
+			/* BLKSSZGET stores an int; keep default on failure */
+			int ssz = DEFAULT_SECTOR_SIZE;
+
+			ioctl(fd, BLKSSZGET, &ssz);
+			sector_size = ssz;
+
+			if (sector_size != DEFAULT_SECTOR_SIZE)
+				DPRINTF("Note: sector size is %u (not %d)\n",
+					sector_size, DEFAULT_SECTOR_SIZE);
+		}
+#else
+		sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+	} else {
+		/*Local file? try fstat instead*/
+		sectors     = (stat.st_size >> SECTOR_SHIFT);
+		sector_size = DEFAULT_SECTOR_SIZE;
+	}
+
+	if (sectors == 0) {
+		/* fallback size used when nothing could be determined */
+		sectors     = 16836057ULL;
+		sector_size = DEFAULT_SECTOR_SIZE;
+	}
+
+	/* hand the results back; previously they were computed but dropped */
+	*_sectors     = sectors;
+	*_sector_size = sector_size;
+
+	return 0;
+}
diff --git a/tools/blktap2/drivers/tapdisk-utils.h b/tools/blktap2/drivers/tapdisk-utils.h
new file mode 100644
index 0000000000..216c902377
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-utils.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_UTILS_H_
+#define _TAPDISK_UTILS_H_
+
+#include <inttypes.h>
+
+/* upper bound (exclusive) on the length of names accepted by the helpers */
+#define MAX_NAME_LEN 1000
+
+/* open syslog as "<name>[<pid>]" and the tapdisk log file */
+void tapdisk_start_logging(const char *);
+/* close syslog and the tapdisk log file */
+void tapdisk_stop_logging(void);
+/* raise RLIMIT_MEMLOCK / RLIMIT_CORE and mlockall(); 0 or -errno */
+int tapdisk_set_resource_limits(void);
+/* strdup() with a MAX_NAME_LEN check; 0, -ENAMETOOLONG or -ENOMEM */
+int tapdisk_namedup(char **, const char *);
+/* split "<handle>:<path>" into a path pointer and a disk type id */
+int tapdisk_parse_disk_type(const char *, char **, int *);
+/* query size in sectors and sector size of an open image descriptor */
+int tapdisk_get_image_size(int, uint64_t *, uint32_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap2/drivers/tapdisk-vbd.c
new file mode 100644
index 0000000000..1eaaee9634
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-vbd.c
@@ -0,0 +1,1758 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <regex.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+#include "libvhd.h"
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-vbd.h"
+#include "blktap2.h"
+
+/* thin wrappers around the tapdisk log: leveled message / error message */
+#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * ASSERT(p): when p is false, log the failed expression with its file and
+ * line, then force a crash by writing through a null pointer.  Switch the
+ * "#if 1" to 0 to compile assertions out.
+ */
+#if 1
+#define ASSERT(p) \
+ do { \
+ if (!(p)) { \
+ DPRINTF("Assertion '%s' failed, line %d, " \
+ "file %s", #p, __LINE__, __FILE__); \
+ *(int*)0 = 0; \
+ } \
+ } while (0)
+#else
+#define ASSERT(p) ((void)0)
+#endif
+
+
+/* number of attempts for operations that are retried (e.g. on -EIO) */
+#define TD_VBD_EIO_RETRIES 10
+/* argument to sleep() between two such attempts */
+#define TD_VBD_EIO_SLEEP 1
+/* NOTE(review): presumably seconds; not used in this part of the file */
+#define TD_VBD_WATCHDOG_TIMEOUT 10
+
+static void tapdisk_vbd_ring_event(event_id_t, char, void *);
+static void tapdisk_vbd_callback(void *, blkif_response_t *);
+
+/*
+ * initialization
+ */
+
+/* Reset a request slot: zero every field and make its list link empty. */
+static inline void
+tapdisk_vbd_initialize_vreq(td_vbd_request_t *vreq)
+{
+	memset(vreq, 0, sizeof(*vreq));
+	INIT_LIST_HEAD(&vreq->next);
+}
+
+/*
+ * Allocate a vbd for @uuid, initialize it and register it with the
+ * server.
+ *
+ * @rfd, @wfd: descriptors stored in the vbd's ipc state
+ * @uuid:      identifier of the new vbd; must not be in use yet
+ *
+ * Returns 0 on success, -EEXIST when a vbd with this uuid is already
+ * registered, or -ENOMEM when allocation fails.
+ */
+int
+tapdisk_vbd_initialize(int rfd, int wfd, uint16_t uuid)
+{
+	td_vbd_t *vbd;
+	int slot;
+
+	if (tapdisk_server_get_vbd(uuid)) {
+		EPRINTF("duplicate vbds! %u\n", uuid);
+		return -EEXIST;
+	}
+
+	vbd = calloc(1, sizeof(td_vbd_t));
+	if (!vbd) {
+		EPRINTF("failed to allocate tapdisk state\n");
+		return -ENOMEM;
+	}
+
+	vbd->uuid     = uuid;
+	vbd->ring.fd  = -1;
+	vbd->ipc.uuid = uuid;
+	vbd->ipc.rfd  = rfd;
+	vbd->ipc.wfd  = wfd;
+
+	/* completions go to the blktap ring unless a caller overrides this */
+	vbd->callback = tapdisk_vbd_callback;
+	vbd->argument = vbd;
+
+	INIT_LIST_HEAD(&vbd->next);
+	INIT_LIST_HEAD(&vbd->images);
+	INIT_LIST_HEAD(&vbd->new_requests);
+	INIT_LIST_HEAD(&vbd->pending_requests);
+	INIT_LIST_HEAD(&vbd->failed_requests);
+	INIT_LIST_HEAD(&vbd->completed_requests);
+	gettimeofday(&vbd->ts, NULL);
+
+	for (slot = 0; slot < MAX_REQUESTS; slot++)
+		tapdisk_vbd_initialize_vreq(&vbd->request_list[slot]);
+
+	tapdisk_server_add_vbd(vbd);
+
+	return 0;
+}
+
+/*
+ * Replace the vbd's completion callback and the opaque argument that is
+ * passed to it.
+ */
+void
+tapdisk_vbd_set_callback(td_vbd_t *vbd, td_vbd_cb_t callback, void *argument)
+{
+	vbd->argument = argument;
+	vbd->callback = callback;
+}
+
+/*
+ * Log the vbd's image chain and check every image against the one that
+ * follows it with td_validate_parent().
+ *
+ * Returns 0 when the whole chain validates, otherwise the first error
+ * reported by td_validate_parent().
+ */
+static int
+tapdisk_vbd_validate_chain(td_vbd_t *vbd)
+{
+	td_image_t *child, *parent, *next;
+	int err;
+
+	DPRINTF("VBD CHAIN:\n");
+
+	tapdisk_vbd_for_each_image(vbd, child, next) {
+		DPRINTF("%s: %d\n", child->name, child->type);
+
+		/* the last image has no successor to validate against */
+		if (tapdisk_vbd_is_last_image(vbd, child))
+			break;
+
+		parent = tapdisk_vbd_next_image(child);
+		err = td_validate_parent(child, parent);
+		if (err != 0)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Close and free every image of the vbd, leave the image list empty and
+ * mark the vbd TD_VBD_CLOSED.
+ */
+void
+tapdisk_vbd_close_vdi(td_vbd_t *vbd)
+{
+ td_image_t *image, *tmp;
+
+ tapdisk_vbd_for_each_image(vbd, image, tmp) {
+ td_close(image);
+ tapdisk_image_free(image);
+ }
+
+ /* every entry has been freed above, so just reset the list head */
+ INIT_LIST_HEAD(&vbd->images);
+ td_flag_set(vbd->state, TD_VBD_CLOSED);
+}
+
+/*
+ * Put a block-cache image in front of the first image in the chain that
+ * is opened both read-only and shareable.
+ *
+ * Returns 0 when the cache was inserted or no such image exists,
+ * -ENOMEM on allocation failure, -ENODEV when the target image has no
+ * driver, or the error from td_open() on the new cache.
+ */
+static int
+tapdisk_vbd_add_block_cache(td_vbd_t *vbd)
+{
+	int err;
+	td_image_t *cache, *image, *target, *tmp;
+
+	target = NULL;
+
+	tapdisk_vbd_for_each_image(vbd, image, tmp)
+		if (td_flag_test(image->flags, TD_OPEN_RDONLY) &&
+		    td_flag_test(image->flags, TD_OPEN_SHAREABLE)) {
+			target = image;
+			break;
+		}
+
+	if (!target)
+		return 0;
+
+	cache = tapdisk_image_allocate(target->name,
+				       DISK_TYPE_BLOCK_CACHE,
+				       target->storage,
+				       target->flags,
+				       target->private);
+	if (!cache)
+		return -ENOMEM;
+
+	/* try to load existing cache */
+	err = td_load(cache);
+	if (!err)
+		goto done;
+
+	/* hack driver to send open() correct image size */
+	if (!target->driver) {
+		err = -ENODEV;
+		goto fail;
+	}
+
+	cache->driver = tapdisk_driver_allocate(cache->type,
+						cache->name,
+						cache->flags,
+						cache->storage);
+	if (!cache->driver) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	cache->driver->info = target->driver->info;
+
+	/* try to open new cache */
+	err = td_open(cache);
+	if (!err)
+		goto done;
+
+fail:
+	/*
+	 * Give up: release the cache image created here.  The target must
+	 * stay alive -- it is still linked into vbd->images.
+	 */
+	tapdisk_image_free(cache);
+	return err;
+
+done:
+	/* insert cache before image */
+	list_add(&cache->next, target->next.prev);
+	return 0;
+}
+
+/*
+ * Create a DISK_TYPE_LOG image modelled on the first image of the vbd
+ * (same name, storage and flags, and a copy of its driver info), open it
+ * and put it at the head of the image list.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * td_open().
+ */
+static int
+tapdisk_vbd_add_dirty_log(td_vbd_t *vbd)
+{
+	td_image_t *first, *log;
+	td_driver_t *drv;
+	int err;
+
+	first = tapdisk_vbd_first_image(vbd);
+
+	log = tapdisk_image_allocate(first->name,
+				     DISK_TYPE_LOG,
+				     first->storage,
+				     first->flags,
+				     vbd);
+	if (!log)
+		return -ENOMEM;
+
+	drv = tapdisk_driver_allocate(log->type,
+				      log->name,
+				      log->flags,
+				      log->storage);
+	if (drv) {
+		/* the log reports the same image geometry as the first image */
+		drv->info   = first->driver->info;
+		log->driver = drv;
+
+		err = td_open(log);
+		if (!err) {
+			list_add(&log->next, &vbd->images);
+			return 0;
+		}
+	} else {
+		err = -ENOMEM;
+	}
+
+	tapdisk_image_free(log);
+	return err;
+}
+
+/*
+ * LVHD hack: have to rescan LVM metadata on pool
+ * slaves to register lvchanges made on master. FIXME.
+ *
+ * Runs "lvchange -an <name>" followed by "lvchange -ay --refresh <name>"
+ * through system().
+ *
+ * Returns 0 on success, -ENOMEM when a command line cannot be built, or
+ * the (non-zero) status returned by system() for the activation command.
+ *
+ * NOTE(review): @name is pasted into a shell command line unquoted; it
+ * must never be attacker-controlled.
+ */
+static int
+tapdisk_vbd_reactivate_volume(const char *name)
+{
+	int err;
+	char *cmd;
+
+	DPRINTF("reactivating %s\n", name);
+
+	/*
+	 * asprintf() is not documented to set errno on failure, so returning
+	 * -errno could report success; use -ENOMEM explicitly.
+	 */
+	err = asprintf(&cmd, "lvchange -an %s", name);
+	if (err == -1) {
+		EPRINTF("failed to deactivate %s\n", name);
+		return -ENOMEM;
+	}
+
+	/*
+	 * The result of the deactivation is deliberately ignored: assume it
+	 * failed because the LV is open, in which case the LVM information
+	 * should be up-to-date and this step is not needed anyway.  If the
+	 * failure is due to a non-existent LV, the next command
+	 * (lvchange -ay) will catch it.
+	 * If we want to be more prudent/paranoid, we can instead check
+	 * whether the LV is currently open (a bit more work).
+	 */
+	(void)system(cmd);
+	free(cmd);
+
+	err = asprintf(&cmd, "lvchange -ay --refresh %s", name);
+	if (err == -1) {
+		EPRINTF("failed to activate %s\n", name);
+		return -ENOMEM;
+	}
+
+	err = system(cmd);
+	if (err)
+		EPRINTF("%s failed: %d\n", cmd, err);
+	free(cmd);
+	return err;
+}
+
+/*
+ * Reactivate the LVM volume behind the vbd and, by following VHD parent
+ * locators, the volumes of its ancestors.
+ *
+ * @resume: when non-zero, reactivation is done even if it already
+ *          happened once, but only for the first two volumes of the chain
+ *
+ * Only acts on TAPDISK_STORAGE_TYPE_LVM storage.  Returns 0 on success
+ * (and sets vbd->reactivated) or the error of the step that failed.
+ */
+static int
+tapdisk_vbd_reactivate_volumes(td_vbd_t *vbd, int resume)
+{
+ int i, cnt, err;
+ char *name, *new;
+ vhd_context_t vhd;
+ vhd_parent_locator_t *loc;
+
+ new = NULL;
+ name = NULL;
+
+ if (vbd->storage != TAPDISK_STORAGE_TYPE_LVM)
+ return 0;
+
+ if (!resume && vbd->reactivated)
+ return 0;
+
+ /* name always owns the path of the volume being processed */
+ name = strdup(vbd->name);
+ if (!name) {
+ EPRINTF("%s: nomem\n", vbd->name);
+ return -ENOMEM;
+ }
+
+ for (cnt = 0; 1; cnt++) {
+
+ /* only need to reactivate child and parent during resume */
+ if (resume && cnt == 2)
+ break;
+
+ err = tapdisk_vbd_reactivate_volume(name);
+ if (err)
+ goto fail;
+
+ /* volumes without "VHD" in their name end the walk */
+ if (!strstr(name, "VHD"))
+ break;
+
+ /* retry the open; libvhd logging is raised after a failure */
+ for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+ err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+ if (!err)
+ break;
+
+ libvhd_set_log_level(1);
+ sleep(TD_VBD_EIO_SLEEP);
+ }
+ libvhd_set_log_level(0);
+ if (err)
+ goto fail;
+
+ /* only differencing disks have a parent to follow */
+ if (vhd.footer.type != HD_TYPE_DIFF) {
+ vhd_close(&vhd);
+ break;
+ }
+
+ /* pick the first parent locator with platform code MACX */
+ loc = NULL;
+ for (i = 0; i < 8; i++)
+ if (vhd.header.loc[i].code == PLAT_CODE_MACX) {
+ loc = vhd.header.loc + i;
+ break;
+ }
+
+ if (!loc) {
+ vhd_close(&vhd);
+ err = -EINVAL;
+ goto fail;
+ }
+
+ free(name);
+ err = vhd_parent_locator_read(&vhd, loc, &name);
+ vhd_close(&vhd);
+
+ if (err) {
+ /* old name was freed above; avoid a double free at out */
+ name = NULL;
+ goto fail;
+ }
+
+ /*
+ * vhd_parent_locator_read returns path relative to child:
+ * ./VG_XenStorage--<sr-uuid>-VHD--<vdi-uuid>
+ * we have to convert this to absolute path for lvm
+ */
+ /* name + 2 skips the leading "./" -- assumes it is always there */
+ err = asprintf(&new, "/dev/mapper/%s", name + 2);
+ if (err == -1) {
+ err = -errno;
+ goto fail;
+ }
+
+ free(name);
+ name = new;
+ }
+
+ err = 0;
+ vbd->reactivated = 1;
+
+out:
+ free(name);
+ return err;
+
+fail:
+ EPRINTF("failed to reactivate %s: %d\n", vbd->name, err);
+ goto out;
+}
+
+/*
+ * LVHD hack:
+ * raw volumes are named /dev/<sr-vg-name>-<sr-uuid>/LV-<sr-uuid>
+ * vhd volumes are named /dev/<sr-vg-name>-<sr-uuid>/VHD-<sr-uuid>
+ *
+ * a live snapshot of a raw volume will result in the writeable volume's
+ * name changing from the raw to vhd format, but this change will not be
+ * reflected by xenstore. hence this mess.
+ *
+ * If the volume named by vbd->name cannot be reactivated, rewrite the
+ * "LV" part of the name to "VHD" and try that volume instead.  On
+ * success vbd->name is replaced by the new name and vbd->type becomes
+ * DISK_TYPE_VHD.
+ *
+ * Returns 0 when nothing had to be done or the renamed volume is
+ * accessible, -EINVAL when the name does not have the expected format,
+ * -ENOMEM on allocation failure, or -errno from access().
+ */
+static int
+tapdisk_vbd_check_file(td_vbd_t *vbd)
+{
+	int i, err;
+	regex_t re;
+	size_t len, max;
+	regmatch_t matches[4];
+	char *new, *wr, *tok, error[256];
+
+	if (vbd->storage != TAPDISK_STORAGE_TYPE_LVM)
+		return 0;
+
+	err = tapdisk_vbd_reactivate_volume(vbd->name);
+	if (!err)
+		return 0;
+
+	DPRINTF("reactivating %s failed\n", vbd->name);
+
+#define HEX   "[A-Za-z0-9]"
+#define UUID  HEX"\\{8\\}-"HEX"\\{4\\}-"HEX"\\{4\\}-"HEX"\\{4\\}-"HEX"\\{12\\}"
+#define VG    "VG_"HEX"\\+"
+#define TYPE  "\\(LV\\|VHD\\)"
+#define RE    "\\(/dev/"VG"-"UUID"/\\)"TYPE"\\(-"UUID"\\)"
+
+	err = regcomp(&re, RE, 0);
+
+#undef HEX
+#undef UUID
+#undef VG
+#undef TYPE
+#undef RE
+
+	if (err) {
+		/*
+		 * Compilation failed, so there is no compiled pattern to
+		 * release: return directly instead of going through
+		 * regfree().
+		 */
+		regerror(err, &re, error, sizeof(error));
+		EPRINTF("%s: regex failed: %s\n", vbd->name, error);
+		return -EINVAL;
+	}
+
+	err = regexec(&re, vbd->name, 4, matches, 0);
+	if (err) {
+		regerror(err, &re, error, sizeof(error));
+		EPRINTF("%s: regex failed: %s\n", vbd->name, error);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* room for the three groups plus "VHD" and the terminator */
+	max = strlen("VHD") + 1;
+	for (i = 1; i < 4; i++) {
+		if (matches[i].rm_so == -1 || matches[i].rm_eo == -1) {
+			EPRINTF("%s: failed to tokenize name\n", vbd->name);
+			err = -EINVAL;
+			goto out;
+		}
+
+		max += matches[i].rm_eo - matches[i].rm_so;
+	}
+
+	new = malloc(max);
+	if (!new) {
+		EPRINTF("%s: failed to allocate new name\n", vbd->name);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* rebuild the name: group 1, then "VHD" in place of "LV", group 3 */
+	wr = new;
+	for (i = 1; i < 4; i++) {
+		tok = vbd->name + matches[i].rm_so;
+		len = matches[i].rm_eo - matches[i].rm_so;
+
+		if (i == 2) {
+			/* only a raw ("LV") name can be converted */
+			if (len != strlen("LV") || memcmp(tok, "LV", len)) {
+				EPRINTF("%s: bad name format\n", vbd->name);
+				free(new);
+				err = -EINVAL;
+				goto out;
+			}
+
+			wr += sprintf(wr, "VHD");
+			continue;
+		}
+
+		memcpy(wr, tok, len);
+		wr += len;
+	}
+
+	*wr = '\0';
+
+	err = tapdisk_vbd_reactivate_volume(new);
+	if (err)
+		DPRINTF("reactivating %s failed\n", new);
+
+	/* what finally counts is whether the renamed volume exists */
+	err = access(new, F_OK);
+	if (err == -1) {
+		EPRINTF("neither %s nor %s accessible\n",
+			vbd->name, new);
+		err = -errno;
+		free(new);
+		goto out;
+	}
+
+	DPRINTF("couldn't find %s, trying %s\n", vbd->name, new);
+
+	err = 0;
+	free(vbd->name);
+	vbd->name = new;
+	vbd->type = DISK_TYPE_VHD;
+
+out:
+	regfree(&re);
+	return err;
+}
+
+/*
+ * Open the image named by vbd->name and, by following parent ids, every
+ * ancestor, adding each image to the vbd.  Optionally adds a dirty log
+ * and a block cache, then validates the resulting chain.
+ *
+ * @extra_flags: flags OR-ed into vbd->flags (minus TD_OPEN_SHAREABLE)
+ *               for the first image
+ *
+ * Returns 0 on success.  On failure the whole chain is closed again and
+ * the error is returned.
+ */
+static int
+__tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags)
+{
+ char *file;
+ int err, type;
+ td_flag_t flags;
+ td_disk_id_t id;
+ td_image_t *image, *tmp;
+ struct tfilter *filter = NULL;
+
+ err = tapdisk_vbd_reactivate_volumes(vbd, 0);
+ if (err)
+ return err;
+
+ flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags;
+ file = vbd->name;
+ type = vbd->type;
+
+ for (;;) {
+ err = -ENOMEM;
+ image = tapdisk_image_allocate(file, type,
+ vbd->storage, flags, vbd);
+
+ /* parent names come from td_get_parent_id() and are freed here */
+ if (file != vbd->name) {
+ free(file);
+ file = NULL;
+ }
+
+ if (!image)
+ goto fail;
+
+ /* -ENODEV from td_load() means "not loaded yet": open it instead */
+ err = td_load(image);
+ if (err) {
+ if (err != -ENODEV)
+ goto fail;
+
+ err = td_open(image);
+ if (err)
+ goto fail;
+ }
+
+ err = td_get_parent_id(image, &id);
+ if (err && err != TD_NO_PARENT) {
+ td_close(image);
+ goto fail;
+ }
+
+ if (!image->storage)
+ image->storage = vbd->storage;
+
+ /* the vbd owns the image from here on; fail: must not free it */
+ tapdisk_vbd_add_image(vbd, image);
+ image = NULL;
+
+ if (err == TD_NO_PARENT)
+ break;
+
+ /* continue with the parent, which is opened read-only + shareable */
+ file = id.name;
+ type = id.drivertype;
+ flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
+ }
+
+ if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) {
+ err = tapdisk_vbd_add_dirty_log(vbd);
+ if (err)
+ goto fail;
+ }
+
+ if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) {
+ err = tapdisk_vbd_add_block_cache(vbd);
+ if (err)
+ goto fail;
+ }
+
+ err = tapdisk_vbd_validate_chain(vbd);
+ if (err)
+ goto fail;
+
+ td_flag_clear(vbd->state, TD_VBD_CLOSED);
+
+ return 0;
+
+fail:
+ /* image is non-NULL only if it was not yet added to the vbd */
+ if (image)
+ tapdisk_image_free(image);
+
+ tapdisk_vbd_close_vdi(vbd);
+
+ return err;
+}
+
+/*
+ * Record name, driver type, storage type and flags in the vbd and open
+ * its image chain.  The open is attempted up to TD_VBD_EIO_RETRIES times
+ * as long as it fails with -EIO, sleeping TD_VBD_EIO_SLEEP after each
+ * such failure.
+ *
+ * Returns 0 on success, -EINVAL for an unknown driver type, the error
+ * from tapdisk_namedup(), or the error of the last open attempt (in
+ * which case vbd->name is released again).
+ */
+int
+tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path,
+		     uint16_t drivertype, uint16_t storage, td_flag_t flags)
+{
+	struct tap_disk *ops;
+	int attempt, err;
+
+	ops = tapdisk_server_find_driver_interface(drivertype);
+	if (!ops)
+		return -EINVAL;
+	DPRINTF("Loaded %s driver for vbd %u %s 0x%08x\n",
+		ops->disk_type, vbd->uuid, path, flags);
+
+	err = tapdisk_namedup(&vbd->name, path);
+	if (err)
+		return err;
+
+	vbd->type    = drivertype;
+	vbd->storage = storage;
+	vbd->flags   = flags;
+
+	attempt = 0;
+	do {
+		err = __tapdisk_vbd_open_vdi(vbd, 0);
+		if (err != -EIO)
+			break;
+
+		sleep(TD_VBD_EIO_SLEEP);
+	} while (++attempt < TD_VBD_EIO_RETRIES);
+
+	if (!err)
+		return 0;
+
+	free(vbd->name);
+	vbd->name = NULL;
+	return err;
+}
+
+/*
+ * Register tapdisk_vbd_ring_event() for readability of the ring
+ * descriptor and remember the event id in vbd->ring_event_id.
+ *
+ * Returns 0 on success or the negative value returned by the server.
+ */
+static int
+tapdisk_vbd_register_event_watches(td_vbd_t *vbd)
+{
+	event_id_t id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+						      vbd->ring.fd, 0,
+						      tapdisk_vbd_ring_event,
+						      vbd);
+
+	if (id >= 0) {
+		vbd->ring_event_id = id;
+		return 0;
+	}
+
+	return id;
+}
+
+/*
+ * Unregister the ring event, if one is registered.  The stored id is
+ * cleared afterwards so that calling this again (it is reached from both
+ * the tapdisk_vbd_open() error path and tapdisk_vbd_shutdown()) does not
+ * unregister a stale id.
+ */
+static void
+tapdisk_vbd_unregister_events(td_vbd_t *vbd)
+{
+	if (vbd->ring_event_id) {
+		tapdisk_server_unregister_event(vbd->ring_event_id);
+		vbd->ring_event_id = 0;
+	}
+}
+
+/*
+ * Open the blktap device @devname, map its shared region and set up the
+ * vbd's ring on top of it.
+ *
+ * Returns 0 on success or -errno from open()/mmap(); on failure the ring
+ * is left with fd == -1 and mem == NULL.
+ */
+static int
+tapdisk_vbd_map_device(td_vbd_t *vbd, const char *devname)
+{
+
+ int err, psize;
+ td_ring_t *ring;
+
+ ring = &vbd->ring;
+ psize = getpagesize();
+
+ ring->fd = open(devname, O_RDWR);
+ if (ring->fd == -1) {
+ err = -errno;
+ EPRINTF("failed to open %s: %d\n", devname, err);
+ goto fail;
+ }
+
+ ring->mem = mmap(0, psize * BLKTAP_MMAP_REGION_SIZE,
+ PROT_READ | PROT_WRITE, MAP_SHARED, ring->fd, 0);
+ if (ring->mem == MAP_FAILED) {
+ err = -errno;
+ EPRINTF("failed to mmap %s: %d\n", devname, err);
+ goto fail;
+ }
+
+ /* the shared ring sits at the start of the mapping */
+ ring->sring = (blkif_sring_t *)((unsigned long)ring->mem);
+ BACK_RING_INIT(&ring->fe_ring, ring->sring, psize);
+
+ /* request data pages follow the BLKTAP_RING_PAGES ring pages */
+ ring->vstart =
+ (unsigned long)ring->mem + (BLKTAP_RING_PAGES * psize);
+
+ /* NOTE(review): the result of this ioctl is not checked */
+ ioctl(ring->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
+
+ return 0;
+
+fail:
+ if (ring->mem && ring->mem != MAP_FAILED)
+ munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE);
+ if (ring->fd != -1)
+ close(ring->fd);
+ ring->fd = -1;
+ ring->mem = NULL;
+ return err;
+}
+
+/*
+ * Undo tapdisk_vbd_map_device(): close the ring descriptor and unmap the
+ * shared region.  Both fields are reset afterwards, so the function is
+ * safe to call more than once (it is reached from the tapdisk_vbd_open()
+ * error path as well as from tapdisk_vbd_shutdown()).
+ *
+ * Always returns 0.
+ */
+static int
+tapdisk_vbd_unmap_device(td_vbd_t *vbd)
+{
+	int psize;
+
+	psize = getpagesize();
+
+	if (vbd->ring.fd != -1) {
+		close(vbd->ring.fd);
+		vbd->ring.fd = -1;
+	}
+
+	/* explicit pointer tests instead of comparing a pointer with "> 0" */
+	if (vbd->ring.mem && vbd->ring.mem != MAP_FAILED) {
+		munmap(vbd->ring.mem, psize * BLKTAP_MMAP_REGION_SIZE);
+		vbd->ring.mem = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Open a vbd completely: open the image chain for @name, map the blktap
+ * device @ring and register the ring event.
+ *
+ * Returns 0 on success.  If any step fails, everything is torn down
+ * again, vbd->name is released, and that step's error is returned.
+ */
+int
+tapdisk_vbd_open(td_vbd_t *vbd, const char *name, uint16_t type,
+		 uint16_t storage, const char *ring, td_flag_t flags)
+{
+	int err;
+
+	err = tapdisk_vbd_open_vdi(vbd, name, type, storage, flags);
+	if (!err)
+		err = tapdisk_vbd_map_device(vbd, ring);
+	if (!err)
+		err = tapdisk_vbd_register_event_watches(vbd);
+	if (!err)
+		return 0;
+
+	/* roll back whatever part of the setup succeeded */
+	tapdisk_vbd_close_vdi(vbd);
+	tapdisk_vbd_unmap_device(vbd);
+	tapdisk_vbd_unregister_events(vbd);
+	free(vbd->name);
+	vbd->name = NULL;
+	return err;
+}
+
+/*
+ * Count the entries on each of the vbd's four request lists and store
+ * the totals through the corresponding output pointers.
+ */
+static void
+tapdisk_vbd_queue_count(td_vbd_t *vbd, int *new,
+			int *pending, int *failed, int *completed)
+{
+	td_vbd_request_t *vreq, *next;
+	int nr_new = 0, nr_pending = 0, nr_failed = 0, nr_completed = 0;
+
+	tapdisk_vbd_for_each_request(vreq, next, &vbd->new_requests)
+		nr_new++;
+
+	tapdisk_vbd_for_each_request(vreq, next, &vbd->pending_requests)
+		nr_pending++;
+
+	tapdisk_vbd_for_each_request(vreq, next, &vbd->failed_requests)
+		nr_failed++;
+
+	tapdisk_vbd_for_each_request(vreq, next, &vbd->completed_requests)
+		nr_completed++;
+
+	*new       = nr_new;
+	*pending   = nr_pending;
+	*failed    = nr_failed;
+	*completed = nr_completed;
+}
+
+/*
+ * Shut the vbd down and free it, unless requests are still pending.
+ *
+ * Returns -EAGAIN (and changes nothing) while pending_requests is not
+ * empty; otherwise logs the final statistics, closes the images, sends
+ * TAPDISK_MESSAGE_CLOSE_RSP, releases the ring and frees the vbd, then
+ * returns 0.  The vbd pointer is invalid after a 0 return.
+ */
+static int
+tapdisk_vbd_shutdown(td_vbd_t *vbd)
+{
+ int new, pending, failed, completed;
+
+ if (!list_empty(&vbd->pending_requests))
+ return -EAGAIN;
+
+ tapdisk_vbd_kick(vbd);
+ tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed);
+
+ DPRINTF("%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
+ "failed: 0x%02x, completed: 0x%02x\n",
+ vbd->name, vbd->state, new, pending, failed, completed);
+ DPRINTF("last activity: %010ld.%06ld, errors: 0x%04"PRIx64", "
+ "retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
+ "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
+ vbd->ts.tv_sec, vbd->ts.tv_usec,
+ vbd->errors, vbd->retries, vbd->received, vbd->returned,
+ vbd->kicked);
+
+ tapdisk_vbd_close_vdi(vbd);
+ tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_CLOSE_RSP);
+ tapdisk_vbd_unregister_events(vbd);
+ tapdisk_vbd_unmap_device(vbd);
+ tapdisk_server_remove_vbd(vbd);
+ free(vbd->name);
+ free(vbd);
+
+ tlog_print_errors();
+
+ return 0;
+}
+
+/*
+ * Close the vbd if it is idle; otherwise flag that a shutdown was
+ * requested.
+ *
+ * Returns the result of tapdisk_vbd_shutdown() when the vbd can be
+ * closed now, or -EAGAIN after setting TD_VBD_SHUTDOWN_REQUESTED.
+ */
+int
+tapdisk_vbd_close(td_vbd_t *vbd)
+{
+	int busy;
+
+	/* never close while requests are pending in the aio layer */
+	busy = !list_empty(&vbd->pending_requests);
+
+	/*
+	 * With an active queue, requests that are still new, failed or
+	 * completed should be finished before closing.
+	 */
+	if (!busy && tapdisk_vbd_queue_ready(vbd))
+		busy = (!list_empty(&vbd->new_requests) ||
+			!list_empty(&vbd->failed_requests) ||
+			!list_empty(&vbd->completed_requests));
+
+	if (!busy)
+		return tapdisk_vbd_shutdown(vbd);
+
+	td_flag_set(vbd->state, TD_VBD_SHUTDOWN_REQUESTED);
+	DBG(TLOG_WARN, "%s: requests pending\n", vbd->name);
+	return -EAGAIN;
+}
+
+/*
+ * control operations
+ */
+
+/*
+ * Write the vbd's state, queue lengths and counters to the tapdisk log
+ * at TLOG_WARN level, then call td_debug() on every image in the chain.
+ */
+void
+tapdisk_vbd_debug(td_vbd_t *vbd)
+{
+ td_image_t *image, *tmp;
+ int new, pending, failed, completed;
+
+ tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed);
+
+ DBG(TLOG_WARN, "%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
+ "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06ld, "
+ "errors: 0x%04"PRIx64", retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
+ "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
+ vbd->name, vbd->state, new, pending, failed, completed,
+ vbd->ts.tv_sec, vbd->ts.tv_usec, vbd->errors, vbd->retries,
+ vbd->received, vbd->returned, vbd->kicked);
+
+ tapdisk_vbd_for_each_image(vbd, image, tmp)
+ td_debug(image);
+}
+
+/*
+ * Dump debug state and flush the log, at most once per vbd:
+ * TD_VBD_LOG_DROPPED is set afterwards and suppresses further dumps.
+ */
+static void
+tapdisk_vbd_drop_log(td_vbd_t *vbd)
+{
+	if (!td_flag_test(vbd->state, TD_VBD_LOG_DROPPED)) {
+		tapdisk_vbd_debug(vbd);
+		tlog_flush();
+		td_flag_set(vbd->state, TD_VBD_LOG_DROPPED);
+	}
+}
+
+/*
+ * Fill @img with size, sector size and info word of the vbd's first
+ * image.  @img is always zeroed first.  Returns -EINVAL if the vbd has
+ * no images, 0 otherwise.
+ */
+int
+tapdisk_vbd_get_image_info(td_vbd_t *vbd, image_t *img)
+{
+	td_image_t *first;
+
+	memset(img, 0, sizeof(*img));
+
+	if (list_empty(&vbd->images))
+		return -EINVAL;
+
+	first        = tapdisk_vbd_first_image(vbd);
+	img->info    = first->info.info;
+	img->secsize = first->info.sector_size;
+	img->size    = first->info.size;
+
+	return 0;
+}
+
+/*
+ * Return 1 if the queue may process requests, i.e. none of the DEAD,
+ * CLOSED, QUIESCED or QUIESCE_REQUESTED state bits is set; 0 otherwise.
+ */
+int
+tapdisk_vbd_queue_ready(td_vbd_t *vbd)
+{
+	const td_flag_t blocking = TD_VBD_DEAD |
+				   TD_VBD_CLOSED |
+				   TD_VBD_QUIESCED |
+				   TD_VBD_QUIESCE_REQUESTED;
+
+	return !td_flag_test(vbd->state, blocking);
+}
+
+/*
+ * Return non-zero (the raw TD_VBD_RETRY_NEEDED bit) if the vbd has
+ * flagged that failed requests still await a retry, 0 otherwise.
+ */
+int
+tapdisk_vbd_retry_needed(td_vbd_t *vbd)
+{
+	td_flag_t retry_bit;
+
+	retry_bit = td_flag_test(vbd->state, TD_VBD_RETRY_NEEDED);
+	return retry_bit;
+}
+
+/*
+ * Locking hook called from tapdisk_vbd_check_queue() when
+ * TD_VBD_LOCKING is set.  Currently a stub: does nothing and always
+ * returns 0 (success).
+ */
+int
+tapdisk_vbd_lock(td_vbd_t *vbd)
+{
+ return 0;
+}
+
+/*
+ * Try to quiesce the queue.
+ *
+ * With no pending requests the queue is marked QUIESCED (clearing any
+ * outstanding QUIESCE_REQUESTED) and 0 is returned.  Otherwise
+ * QUIESCE_REQUESTED is set so the attempt is repeated later, and
+ * -EAGAIN is returned.
+ */
+int
+tapdisk_vbd_quiesce_queue(td_vbd_t *vbd)
+{
+	if (list_empty(&vbd->pending_requests)) {
+		td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+		td_flag_set(vbd->state, TD_VBD_QUIESCED);
+		return 0;
+	}
+
+	td_flag_set(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+	return -EAGAIN;
+}
+
+/*
+ * Re-enable the queue by clearing both the QUIESCED and the
+ * QUIESCE_REQUESTED state bits.  Always returns 0.
+ */
+int
+tapdisk_vbd_start_queue(td_vbd_t *vbd)
+{
+	td_flag_clear(vbd->state,
+		      TD_VBD_QUIESCED | TD_VBD_QUIESCE_REQUESTED);
+	return 0;
+}
+
+/*
+ * Mark the vbd DEAD after attempting to quiesce it.  The quiesce result
+ * is deliberately ignored; the function always returns 0.
+ */
+int
+tapdisk_vbd_kill_queue(td_vbd_t *vbd)
+{
+	(void)tapdisk_vbd_quiesce_queue(vbd);
+	td_flag_set(vbd->state, TD_VBD_DEAD);
+
+	return 0;
+}
+
+/*
+ * Open @image and, unless it is the last image of the vbd's chain,
+ * validate it against the next image in the list.  On a validation
+ * failure the image is closed again.  Returns 0 or the error reported
+ * by td_open() / td_validate_parent().
+ */
+static int
+tapdisk_vbd_open_image(td_vbd_t *vbd, td_image_t *image)
+{
+	int rc;
+
+	rc = td_open(image);
+	if (rc)
+		return rc;
+
+	/* the last image has nothing after it to validate against */
+	if (tapdisk_vbd_is_last_image(vbd, image))
+		return 0;
+
+	rc = td_validate_parent(image, tapdisk_vbd_next_image(image));
+	if (rc)
+		td_close(image);
+
+	return rc;
+}
+
+/*
+ * Close @image and open it again, retrying up to TD_VBD_EIO_RETRIES
+ * times (sleeping TD_VBD_EIO_SLEEP between attempts) as long as the
+ * open fails with -EIO.  If the final result is an error the vbd is
+ * marked TD_VBD_CLOSED.  Returns the result of the last open attempt.
+ */
+static int
+tapdisk_vbd_close_and_reopen_image(td_vbd_t *vbd, td_image_t *image)
+{
+	int attempt, rc;
+
+	td_close(image);
+
+	for (attempt = 0; attempt < TD_VBD_EIO_RETRIES; attempt++) {
+		rc = tapdisk_vbd_open_image(vbd, image);
+		if (rc == -EIO) {
+			/* only -EIO is retried; back off first */
+			sleep(TD_VBD_EIO_SLEEP);
+			continue;
+		}
+		break;
+	}
+
+	if (rc)
+		td_flag_set(vbd->state, TD_VBD_CLOSED);
+
+	return rc;
+}
+
+/*
+ * Pause the vbd on behalf of an IPC request.
+ *
+ * PAUSE_REQUESTED is set first so the pause can be retried from
+ * tapdisk_vbd_check_state() if the queue cannot be quiesced yet (the
+ * quiesce error is returned in that case).  Once quiesced, the VDI is
+ * closed, the vbd is marked PAUSED, TAPDISK_MESSAGE_PAUSE_RSP is sent
+ * and 0 is returned.
+ */
+int
+tapdisk_vbd_pause(td_vbd_t *vbd)
+{
+	int rc;
+
+	td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
+
+	rc = tapdisk_vbd_quiesce_queue(vbd);
+	if (!rc) {
+		tapdisk_vbd_close_vdi(vbd);
+
+		td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+		td_flag_set(vbd->state, TD_VBD_PAUSED);
+		tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_PAUSE_RSP);
+	}
+
+	return rc;
+}
+
+/*
+ * Resume a paused vbd on a (possibly different) image.
+ *
+ * @path:       name of the image to reopen; copied into vbd->name
+ * @drivertype: driver type stored in vbd->type
+ *
+ * Returns -EINVAL if the vbd is not paused or the name cannot be
+ * copied, the last check/reactivate/open error if every one of the
+ * TD_VBD_EIO_RETRIES attempts fails, and 0 on success.  Every failure
+ * sends TAPDISK_MESSAGE_ERROR over the vbd's IPC channel; success sends
+ * TAPDISK_MESSAGE_RESUME_RSP after restarting the queue and clearing
+ * the pause flags.
+ */
+int
+tapdisk_vbd_resume(td_vbd_t *vbd, const char *path, uint16_t drivertype)
+{
+	int i, err;
+	char *name;
+
+	if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+		EPRINTF("resume request for unpaused vbd %s\n", vbd->name);
+		tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR);
+		return -EINVAL;
+	}
+
+	/*
+	 * Duplicate the new name before releasing the old one: this keeps
+	 * vbd->name valid if the allocation fails and stays correct even
+	 * if @path aliases the current name.
+	 */
+	name = strdup(path);
+	if (!name) {
+		EPRINTF("copying new vbd %s name failed\n", path);
+		tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR);
+		return -EINVAL;
+	}
+	free(vbd->name);
+	vbd->name = name;
+	vbd->type = drivertype;
+
+	/* defined result even if the retry count is configured as zero */
+	err = -EIO;
+
+	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+		err = tapdisk_vbd_check_file(vbd);
+		if (err)
+			goto sleep;
+
+		err = tapdisk_vbd_reactivate_volumes(vbd, 1);
+		if (err) {
+			EPRINTF("failed to reactivate %s: %d\n",
+				vbd->name, err);
+			goto sleep;
+		}
+
+		err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
+		if (!err)
+			break;
+
+	sleep:
+		sleep(TD_VBD_EIO_SLEEP);
+	}
+
+	if (err) {
+		tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR);
+		return err;
+	}
+
+	tapdisk_vbd_start_queue(vbd);
+	td_flag_clear(vbd->state, TD_VBD_PAUSED);
+	td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+	tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_RESUME_RSP);
+
+	return 0;
+}
+
+/*
+ * Publish privately produced responses on the ring and notify the
+ * other end via BLKTAP_IOCTL_KICK_FE.
+ *
+ * Returns the number of responses pushed; 0 when the ring is not
+ * mapped or nothing is outstanding.
+ */
+int
+tapdisk_vbd_kick(td_vbd_t *vbd)
+{
+	td_ring_t *ring = &vbd->ring;
+	int unsent;
+
+	if (!ring->sring)
+		return 0;
+
+	/* responses written privately but not yet visible on the ring */
+	unsent = (ring->fe_ring.rsp_prod_pvt - ring->fe_ring.sring->rsp_prod);
+	if (unsent) {
+		vbd->kicked += unsent;
+		RING_PUSH_RESPONSES(&ring->fe_ring);
+		ioctl(ring->fd, BLKTAP_IOCTL_KICK_FE, 0);
+
+		DBG(TLOG_INFO, "kicking %d: rec: 0x%08"PRIx64", ret: 0x%08"PRIx64", kicked: "
+		    "0x%08"PRIx64"\n", unsent, vbd->received, vbd->returned, vbd->kicked);
+	}
+
+	return unsent;
+}
+
+/*
+ * Copy @rsp into the next private response slot of the vbd's ring and
+ * advance the private producer index.  The response only becomes
+ * visible to the other end once tapdisk_vbd_kick() pushes it.
+ */
+static inline void
+tapdisk_vbd_write_response_to_ring(td_vbd_t *vbd, blkif_response_t *rsp)
+{
+	td_ring_t *ring = &vbd->ring;
+	blkif_response_t *slot;
+
+	slot = RING_GET_RESPONSE(&ring->fe_ring, ring->fe_ring.rsp_prod_pvt);
+	memcpy(slot, rsp, sizeof(*slot));
+	ring->fe_ring.rsp_prod_pvt++;
+}
+
+/*
+ * Response callback (td_vbd_cb_t signature): @arg is the vbd, and the
+ * response is written to that vbd's ring.
+ */
+static void
+tapdisk_vbd_callback(void *arg, blkif_response_t *rsp)
+{
+	tapdisk_vbd_write_response_to_ring((td_vbd_t *)arg, rsp);
+}
+
+/*
+ * Build the blkif response for a finished request and hand it to the
+ * vbd's response callback.
+ *
+ * The response is constructed in place over vreq->req (rsp aliases the
+ * request's storage), so the request is copied to a local first and
+ * vreq->req no longer holds a valid request once this returns.
+ */
+static void
+tapdisk_vbd_make_response(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+ blkif_request_t tmp;
+ blkif_response_t *rsp;
+
+ /* snapshot the request before its storage is reused for the response */
+ tmp = vreq->req;
+ rsp = (blkif_response_t *)&vreq->req;
+
+ rsp->id = tmp.id;
+ rsp->operation = tmp.operation;
+ rsp->status = vreq->status;
+
+ DBG(TLOG_DBG, "writing req %d, sec 0x%08"PRIx64", res %d to ring\n",
+ (int)tmp.id, tmp.sector_number, vreq->status);
+
+ if (rsp->status != BLKIF_RSP_OKAY)
+ ERR(EIO, "returning BLKIF_RSP %d", rsp->status);
+
+ /* count the response and deliver it through the registered callback */
+ vbd->returned++;
+ vbd->callback(vbd->argument, rsp);
+}
+
+/*
+ * Periodic state machine step for a vbd:
+ *   1. complete failed requests that have exhausted their retries,
+ *   2. (re)issue new and failed requests,
+ *   3. turn every completed request into a response and recycle it,
+ *   4. retry any deferred quiesce, pause or shutdown.
+ *
+ * The deferred shutdown is attempted last because tapdisk_vbd_close()
+ * may free the vbd; callers must not touch @vbd after this returns.
+ */
+void
+tapdisk_vbd_check_state(td_vbd_t *vbd)
+{
+ td_vbd_request_t *vreq, *tmp;
+
+ /* give up on requests that reached TD_VBD_MAX_RETRIES */
+ tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests)
+ if (vreq->num_retries >= TD_VBD_MAX_RETRIES)
+ tapdisk_vbd_complete_vbd_request(vbd, vreq);
+
+ if (!list_empty(&vbd->new_requests) ||
+ !list_empty(&vbd->failed_requests))
+ tapdisk_vbd_issue_requests(vbd);
+
+ /* respond to finished requests and reset their slots for reuse */
+ tapdisk_vbd_for_each_request(vreq, tmp, &vbd->completed_requests) {
+ tapdisk_vbd_make_response(vbd, vreq);
+ list_del(&vreq->next);
+ tapdisk_vbd_initialize_vreq(vreq);
+ }
+
+ if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED))
+ tapdisk_vbd_quiesce_queue(vbd);
+
+ if (td_flag_test(vbd->state, TD_VBD_PAUSE_REQUESTED))
+ tapdisk_vbd_pause(vbd);
+
+ /* must stay last: a successful close frees vbd */
+ if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
+ tapdisk_vbd_close(vbd);
+}
+
+/*
+ * Watchdog for stalled I/O.
+ *
+ * Does nothing while no requests are pending.  If the last recorded
+ * activity (vbd->ts) is at least TD_VBD_WATCHDOG_TIMEOUT seconds old, a
+ * warning is logged and the debug log is dropped; otherwise the
+ * server's maximum timeout is set to the time remaining until the
+ * watchdog would fire.
+ */
+void
+tapdisk_vbd_check_progress(td_vbd_t *vbd)
+{
+	struct timeval now;
+	int idle;
+
+	if (list_empty(&vbd->pending_requests))
+		return;
+
+	gettimeofday(&now, NULL);
+	idle = now.tv_sec - vbd->ts.tv_sec;
+
+	if (idle < TD_VBD_WATCHDOG_TIMEOUT) {
+		tapdisk_server_set_max_timeout(TD_VBD_WATCHDOG_TIMEOUT - idle);
+		return;
+	}
+
+	DBG(TLOG_WARN, "%s: watchdog timeout: pending requests "
+	    "idle for %d seconds\n", vbd->name, idle);
+	tapdisk_vbd_drop_log(vbd);
+}
+
+/*
+ * request submission
+ */
+
+/*
+ * Verify that the vbd can accept a request right now.
+ *
+ * Returns -ENOSYS if no image is attached and -EAGAIN if the queue is
+ * not ready.  On the first call that gets this far (vbd->reopened
+ * unset) it takes the lock if TD_VBD_LOCKING is set (returning any lock
+ * error), then closes and reopens the first image with TD_OPEN_STRICT;
+ * a failed reopen is logged but does not fail the check.  Returns 0
+ * otherwise.
+ */
+static int
+tapdisk_vbd_check_queue(td_vbd_t *vbd)
+{
+	td_image_t *first;
+	int rc;
+
+	if (list_empty(&vbd->images))
+		return -ENOSYS;
+
+	if (!tapdisk_vbd_queue_ready(vbd))
+		return -EAGAIN;
+
+	/* the strict reopen only has to succeed once */
+	if (vbd->reopened)
+		return 0;
+
+	if (td_flag_test(vbd->state, TD_VBD_LOCKING)) {
+		rc = tapdisk_vbd_lock(vbd);
+		if (rc)
+			return rc;
+	}
+
+	first = tapdisk_vbd_first_image(vbd);
+	td_flag_set(first->flags, TD_OPEN_STRICT);
+
+	if (!tapdisk_vbd_close_and_reopen_image(vbd, first)) {
+		DPRINTF("reopening disks succeeded\n");
+		vbd->reopened = 1;
+	} else {
+		EPRINTF("reopening disks failed\n");
+	}
+
+	return 0;
+}
+
+/*
+ * File a request whose I/O has fully drained.
+ *
+ * Nothing happens while the request is still being submitted or has
+ * sectors outstanding.  Otherwise it moves to the failed list if it
+ * ended in BLKIF_RSP_ERROR, still has retries left and the vbd is
+ * neither DEAD nor shutting down; in all other cases it moves to the
+ * completed list.
+ */
+void
+tapdisk_vbd_complete_vbd_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+	int retry;
+
+	if (vreq->submitting || vreq->secs_pending)
+		return;
+
+	retry = (vreq->status == BLKIF_RSP_ERROR &&
+		 vreq->num_retries < TD_VBD_MAX_RETRIES &&
+		 !td_flag_test(vbd->state, TD_VBD_DEAD) &&
+		 !td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED));
+
+	tapdisk_vbd_move_request(vreq, retry ? &vbd->failed_requests :
+					       &vbd->completed_requests);
+}
+
+/*
+ * Account for the completion of one td request belonging to @vreq.
+ *
+ * @res is normalised to a non-positive errno.  The sector counts of
+ * the vbd and of the vbd request are reduced by treq.secs and the
+ * blocked hint is copied over.  On error the request is marked
+ * BLKIF_RSP_ERROR, the first error seen is retained in vreq->error and,
+ * unless the error is -EBUSY, the error counter is bumped and the
+ * failure logged.  Finally the vbd request is completed if nothing
+ * remains outstanding.
+ */
+static void
+__tapdisk_vbd_complete_td_request(td_vbd_t *vbd, td_vbd_request_t *vreq,
+				  td_request_t treq, int res)
+{
+	int err = (res > 0 ? -res : res);
+
+	vreq->secs_pending -= treq.secs;
+	vbd->secs_pending  -= treq.secs;
+	vreq->blocked       = treq.blocked;
+
+	if (err) {
+		vreq->status = BLKIF_RSP_ERROR;
+
+		/* keep the first error recorded for this request */
+		if (!vreq->error)
+			vreq->error = err;
+
+		if (err != -EBUSY) {
+			vbd->errors++;
+			ERR(err, "req %"PRIu64": %s 0x%04x secs to "
+			    "0x%08"PRIx64, vreq->req.id,
+			    (treq.op == TD_OP_WRITE ? "write" : "read"),
+			    treq.secs, treq.sec);
+		}
+	}
+
+	tapdisk_vbd_complete_vbd_request(vbd, vreq);
+}
+
+/*
+ * Forward a td request from @image to the next image in the vbd's
+ * chain.
+ *
+ * If @image is the last image, the buffer is zero-filled and the
+ * request completed successfully.  If the request extends beyond the
+ * end of the next image, the out-of-range part is split off, zero-filled
+ * and completed immediately; any in-range remainder is queued to that
+ * image as a read or write.  vreq->submitting is held across the call
+ * so that completions triggered from inside it cannot finish the vbd
+ * request early.
+ */
+static void
+__tapdisk_vbd_reissue_td_request(td_vbd_t *vbd,
+ td_image_t *image, td_request_t treq)
+{
+ td_image_t *parent;
+ td_vbd_request_t *vreq;
+
+ vreq = (td_vbd_request_t *)treq.private;
+ gettimeofday(&vreq->last_try, NULL);
+
+ /* block completion of vreq while we are still submitting */
+ vreq->submitting++;
+
+ /* nothing further down the chain: return zeros */
+ if (tapdisk_vbd_is_last_image(vbd, image)) {
+ memset(treq.buf, 0, treq.secs << SECTOR_SHIFT);
+ td_complete_request(treq, 0);
+ goto done;
+ }
+
+ parent = tapdisk_vbd_next_image(image);
+ treq.image = parent;
+
+ /* return zeros for requests that extend beyond end of parent image */
+ if (treq.sec + treq.secs > parent->info.size) {
+ td_request_t clone = treq;
+
+ if (parent->info.size > treq.sec) {
+ /* request straddles the end: clone covers the tail, treq the head */
+ int secs = parent->info.size - treq.sec;
+ clone.sec += secs;
+ clone.secs -= secs;
+ clone.buf += (secs << SECTOR_SHIFT);
+ treq.secs = secs;
+ } else
+ /* request lies entirely beyond the end: clone covers all of it */
+ treq.secs = 0;
+
+ memset(clone.buf, 0, clone.secs << SECTOR_SHIFT);
+ td_complete_request(clone, 0);
+
+ if (!treq.secs)
+ goto done;
+ }
+
+ switch (treq.op) {
+ case TD_OP_WRITE:
+ td_queue_write(parent, treq);
+ break;
+
+ case TD_OP_READ:
+ td_queue_read(parent, treq);
+ break;
+ }
+
+done:
+ vreq->submitting--;
+ if (!vreq->secs_pending)
+ tapdisk_vbd_complete_vbd_request(vbd, vreq);
+}
+
+/*
+ * Pass a td request on to the next image in the chain.  The vbd is
+ * taken from the request's image and its activity timestamp refreshed.
+ * If the queue is no longer ready the request is completed with -EIO
+ * instead of being reissued.
+ */
+void
+tapdisk_vbd_forward_request(td_request_t treq)
+{
+	td_image_t *img = treq.image;
+	td_vbd_t *owner = (td_vbd_t *)img->private;
+	td_vbd_request_t *parent_req = (td_vbd_request_t *)treq.private;
+
+	gettimeofday(&owner->ts, NULL);
+
+	if (!tapdisk_vbd_queue_ready(owner)) {
+		__tapdisk_vbd_complete_td_request(owner, parent_req,
+						  treq, -EIO);
+		return;
+	}
+
+	__tapdisk_vbd_reissue_td_request(owner, img, treq);
+}
+
+/*
+ * Completion callback installed on every td request issued by the vbd.
+ * Refreshes the vbd's activity timestamp, logs the completion and
+ * forwards to __tapdisk_vbd_complete_td_request() for accounting.
+ */
+static void
+tapdisk_vbd_complete_td_request(td_request_t treq, int res)
+{
+	td_image_t *img = treq.image;
+	td_vbd_t *owner = (td_vbd_t *)img->private;
+	td_vbd_request_t *parent_req = (td_vbd_request_t *)treq.private;
+
+	gettimeofday(&owner->ts, NULL);
+	DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" "
+	    "secs 0x%04x buf %p op %d res %d\n", img->name,
+	    (int)treq.id, treq.sidx, treq.sec, treq.secs,
+	    treq.buf, (int)parent_req->req.operation, res);
+
+	__tapdisk_vbd_complete_td_request(owner, parent_req, treq, res);
+}
+
+/*
+ * Submit one ring request to the first image of the vbd.
+ *
+ * The request is moved to the pending list and split into one td
+ * request per segment; each segment's sector count is added to the
+ * pending counters of both the vbd and the request before it is queued.
+ * vreq->submitting is held for the duration so that completions fired
+ * from within td_queue_*() cannot finish the vbd request prematurely.
+ *
+ * Returns 0 if everything was queued, otherwise the error from the
+ * queue or ring-request check (the request is then marked
+ * BLKIF_RSP_ERROR).  If nothing is left pending on return, a zero
+ * result is replaced by vreq->error.
+ */
+static int
+tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+ char *page;
+ td_ring_t *ring;
+ td_image_t *image;
+ td_request_t treq;
+ uint64_t sector_nr;
+ blkif_request_t *req;
+ int i, err, id, nsects;
+
+ req = &vreq->req;
+ id = req->id;
+ ring = &vbd->ring;
+ sector_nr = req->sector_number;
+ image = tapdisk_vbd_first_image(vbd);
+
+ /* hold a submission reference and record the attempt time */
+ vreq->submitting = 1;
+ gettimeofday(&vbd->ts, NULL);
+ gettimeofday(&vreq->last_try, NULL);
+ tapdisk_vbd_move_request(vreq, &vbd->pending_requests);
+
+ err = tapdisk_vbd_check_queue(vbd);
+ if (err)
+ goto fail;
+
+ err = tapdisk_image_check_ring_request(image, req);
+ if (err)
+ goto fail;
+
+ for (i = 0; i < req->nr_segments; i++) {
+ /* sector range within the segment's page is inclusive */
+ nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1;
+ page = (char *)MMAP_VADDR(ring->vstart,
+ (unsigned long)req->id, i);
+ page += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+ treq.id = id;
+ treq.sidx = i;
+ treq.blocked = 0;
+ treq.buf = page;
+ treq.sec = sector_nr;
+ treq.secs = nsects;
+ treq.image = image;
+ treq.cb = tapdisk_vbd_complete_td_request;
+ treq.cb_data = NULL;
+ treq.private = vreq;
+
+ DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" secs 0x%04x "
+ "buf %p op %d\n", image->name, id, i, treq.sec, treq.secs,
+ treq.buf, (int)req->operation);
+
+ /* account for the sectors before queueing: the callback may run at once */
+ vreq->secs_pending += nsects;
+ vbd->secs_pending += nsects;
+
+ /*
+ * NOTE(review): any other operation queues nothing although the
+ * sectors were already counted as pending -- presumably
+ * tapdisk_image_check_ring_request() rejects such requests; confirm.
+ */
+ switch (req->operation) {
+ case BLKIF_OP_WRITE:
+ treq.op = TD_OP_WRITE;
+ td_queue_write(image, treq);
+ break;
+
+ case BLKIF_OP_READ:
+ treq.op = TD_OP_READ;
+ td_queue_read(image, treq);
+ break;
+ }
+
+ sector_nr += nsects;
+ }
+
+ err = 0;
+
+out:
+ /* drop the submission reference; complete now if nothing is in flight */
+ vreq->submitting--;
+ if (!vreq->secs_pending) {
+ err = (err ? : vreq->error);
+ tapdisk_vbd_complete_vbd_request(vbd, vreq);
+ }
+
+ return err;
+
+fail:
+ vreq->status = BLKIF_RSP_ERROR;
+ goto out;
+}
+
+/*
+ * Retry the requests on the failed list.
+ *
+ * For each failed request with no sectors outstanding:
+ *  - if a shutdown was requested, or the request has used up
+ *    TD_VBD_MAX_RETRIES, it is completed instead of retried;
+ *  - unless its last error was -EBUSY, it is skipped until
+ *    TD_VBD_RETRY_INTERVAL seconds have passed since the last try;
+ *  - otherwise its error state is reset and it is reissued.  A request
+ *    that was blocked on a dependency is not charged a retry.
+ *
+ * Iteration stops at the first issue error, which is returned (0
+ * otherwise).  TD_VBD_RETRY_NEEDED is updated to reflect whether the
+ * failed list is still non-empty.
+ */
+static int
+tapdisk_vbd_reissue_failed_requests(td_vbd_t *vbd)
+{
+	int err, give_up;
+	struct timeval now;
+	td_vbd_request_t *vreq, *tmp;
+
+	err = 0;
+	gettimeofday(&now, NULL);
+
+	tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) {
+		if (vreq->secs_pending)
+			continue;
+
+		/* a pending shutdown overrides the retry back-off */
+		give_up = td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED);
+		if (!give_up) {
+			if (vreq->error != -EBUSY &&
+			    now.tv_sec - vreq->last_try.tv_sec < TD_VBD_RETRY_INTERVAL)
+				continue;
+
+			give_up = (vreq->num_retries >= TD_VBD_MAX_RETRIES);
+		}
+
+		if (give_up) {
+			DBG(TLOG_INFO, "req %"PRIu64" retried %d times\n",
+			    vreq->req.id, vreq->num_retries);
+			tapdisk_vbd_complete_vbd_request(vbd, vreq);
+			continue;
+		}
+
+		/*
+		 * never fail due to too many retries if we are blocked on a
+		 * dependency
+		 */
+		if (vreq->blocked) {
+			vreq->blocked = 0;
+		} else {
+			vbd->retries++;
+			vreq->num_retries++;
+		}
+		vreq->error = 0;
+		vreq->status = BLKIF_RSP_OKAY;
+		DBG(TLOG_DBG, "retry #%d of req %"PRIu64", "
+		    "sec 0x%08"PRIx64", nr_segs: %d\n", vreq->num_retries,
+		    vreq->req.id, vreq->req.sector_number,
+		    vreq->req.nr_segments);
+
+		err = tapdisk_vbd_issue_request(vbd, vreq);
+		if (err)
+			break;
+	}
+
+	if (list_empty(&vbd->failed_requests))
+		td_flag_clear(vbd->state, TD_VBD_RETRY_NEEDED);
+	else
+		td_flag_set(vbd->state, TD_VBD_RETRY_NEEDED);
+
+	return err;
+}
+
+/*
+ * Issue every request on the new list in order, stopping at the first
+ * error.  Returns that error, or 0 if all requests were issued.
+ */
+static int
+tapdisk_vbd_issue_new_requests(td_vbd_t *vbd)
+{
+	td_vbd_request_t *cur, *next;
+	int rc = 0;
+
+	tapdisk_vbd_for_each_request(cur, next, &vbd->new_requests) {
+		rc = tapdisk_vbd_issue_request(vbd, cur);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
+/*
+ * Fail everything that has not been submitted: all requests on the new
+ * list, then all on the failed list, are marked BLKIF_RSP_ERROR and
+ * moved to the completed list.  Always returns 0.
+ */
+static int
+tapdisk_vbd_kill_requests(td_vbd_t *vbd)
+{
+	td_vbd_request_t *cur, *next;
+	struct list_head *done = &vbd->completed_requests;
+
+	tapdisk_vbd_for_each_request(cur, next, &vbd->new_requests) {
+		cur->status = BLKIF_RSP_ERROR;
+		tapdisk_vbd_move_request(cur, done);
+	}
+
+	tapdisk_vbd_for_each_request(cur, next, &vbd->failed_requests) {
+		cur->status = BLKIF_RSP_ERROR;
+		tapdisk_vbd_move_request(cur, done);
+	}
+
+	return 0;
+}
+
+/*
+ * Drive the request queues of a vbd.
+ *
+ * A DEAD vbd has its new and failed requests killed.  A queue that is
+ * not ready yields -EAGAIN.  Otherwise failed requests are retried
+ * first and, if that succeeds, new requests are issued; the first
+ * error encountered is returned.
+ */
+int
+tapdisk_vbd_issue_requests(td_vbd_t *vbd)
+{
+	int rc;
+
+	if (td_flag_test(vbd->state, TD_VBD_DEAD))
+		return tapdisk_vbd_kill_requests(vbd);
+
+	if (!tapdisk_vbd_queue_ready(vbd))
+		return -EAGAIN;
+
+	rc = tapdisk_vbd_reissue_failed_requests(vbd);
+
+	return rc ? rc : tapdisk_vbd_issue_new_requests(vbd);
+}
+
+/*
+ * Consume all requests currently published on the shared ring and
+ * place them on the vbd's new list.
+ *
+ * Each request's id selects its slot in vbd->request_list.  The id
+ * comes from shared memory, so it is range-checked before being used
+ * as an index; a request with an out-of-range id is logged and dropped
+ * rather than being allowed to corrupt memory.
+ */
+static void
+tapdisk_vbd_pull_ring_requests(td_vbd_t *vbd)
+{
+	int idx;
+	RING_IDX rp, rc;
+	td_ring_t *ring;
+	blkif_request_t *req;
+	td_vbd_request_t *vreq;
+
+	ring = &vbd->ring;
+	if (!ring->sring)
+		return;
+
+	/* read the producer index before reading any request it covers */
+	rp = ring->fe_ring.sring->req_prod;
+	xen_rmb();
+
+	for (rc = ring->fe_ring.req_cons; rc != rp; rc++) {
+		req = RING_GET_REQUEST(&ring->fe_ring, rc);
+		++ring->fe_ring.req_cons;
+
+		if (req->id >= MAX_REQUESTS) {
+			EPRINTF("%s: invalid request id %"PRIu64"\n",
+				vbd->name, (uint64_t)req->id);
+			continue;
+		}
+
+		idx = req->id;
+		vreq = &vbd->request_list[idx];
+
+		/* the slot must be idle: not on any list, no I/O outstanding */
+		ASSERT(list_empty(&vreq->next));
+		ASSERT(vreq->secs_pending == 0);
+
+		memcpy(&vreq->req, req, sizeof(blkif_request_t));
+		vbd->received++;
+		vreq->vbd = vbd;
+
+		tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+
+		DBG(TLOG_DBG, "%s: request %d \n", vbd->name, idx);
+	}
+}
+
+/*
+ * Handle a pause message received through the ring.
+ *
+ * Already-paused vbds return 0 immediately.  Otherwise PAUSE_REQUESTED
+ * is set and the queue quiesced; if that fails the error is logged and
+ * returned.  Once quiesced the VDI is closed and BLKTAP2_IOCTL_PAUSE is
+ * issued: on success the vbd is marked PAUSED, on failure the error is
+ * logged.  Returns the ioctl result.
+ */
+static int
+tapdisk_vbd_pause_ring(td_vbd_t *vbd)
+{
+	int rc;
+
+	if (td_flag_test(vbd->state, TD_VBD_PAUSED))
+		return 0;
+
+	td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
+
+	rc = tapdisk_vbd_quiesce_queue(vbd);
+	if (rc) {
+		EPRINTF("%s: ring pause request on active queue\n", vbd->name);
+		return rc;
+	}
+
+	tapdisk_vbd_close_vdi(vbd);
+
+	rc = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_PAUSE, 0);
+	if (!rc) {
+		td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+		td_flag_set(vbd->state, TD_VBD_PAUSED);
+	} else {
+		EPRINTF("%s: pause ioctl failed: %d\n", vbd->name, errno);
+	}
+
+	return rc;
+}
+
+/*
+ * Handle a resume message received through the ring.
+ *
+ * Fetches the new disk description with BLKTAP2_IOCTL_REOPEN, parses it
+ * into a path and type, renames the vbd, restarts the queue,
+ * reactivates the volumes and reopens the VDI (retrying on -EIO).  On
+ * success the device parameters are pushed with
+ * BLKTAP2_IOCTL_SET_PARAMS and the PAUSED flag is cleared.  Except for
+ * the two early returns (vbd not paused, REOPEN ioctl failure), the
+ * outcome is reported through BLKTAP2_IOCTL_RESUME.
+ *
+ * Returns 0 on success, -EINVAL if the vbd is not paused, -ENOMEM if
+ * the name cannot be copied, or the error of the failing step.
+ */
+static int
+tapdisk_vbd_resume_ring(td_vbd_t *vbd)
+{
+	int i, err, type;
+	char *path, *name, message[BLKTAP2_MAX_MESSAGE_LEN];
+
+	memset(message, 0, sizeof(message));
+
+	if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+		EPRINTF("%s: resume message for unpaused vbd\n", vbd->name);
+		return -EINVAL;
+	}
+
+	err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_REOPEN, &message);
+	if (err) {
+		EPRINTF("%s: resume ioctl failed: %d\n", vbd->name, errno);
+		return err;
+	}
+
+	err = tapdisk_parse_disk_type(message, &path, &type);
+	if (err) {
+		EPRINTF("%s: invalid resume string %s\n", vbd->name, message);
+		goto out;
+	}
+
+	/*
+	 * Copy the new name before freeing the old one so that vbd->name
+	 * is never left NULL (it is passed to "%s" by later logging) when
+	 * the allocation fails.
+	 */
+	name = strdup(path);
+	if (!name) {
+		EPRINTF("resume malloc failed\n");
+		err = -ENOMEM;
+		goto out;
+	}
+	free(vbd->name);
+	vbd->name = name;
+	vbd->type = type;
+
+	tapdisk_vbd_start_queue(vbd);
+
+	err = tapdisk_vbd_reactivate_volumes(vbd, 1);
+	if (err) {
+		EPRINTF("failed to reactivate %s, %d\n", vbd->name, err);
+		goto out;
+	}
+
+	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+		err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
+		if (err != -EIO)
+			break;
+
+		sleep(TD_VBD_EIO_SLEEP);
+	}
+
+out:
+	if (!err) {
+		image_t image;
+		struct blktap2_params params;
+
+		memset(&params, 0, sizeof(params));
+		tapdisk_vbd_get_image_info(vbd, &image);
+
+		params.sector_size = image.secsize;
+		params.capacity    = image.size;
+		snprintf(params.name, sizeof(params.name) - 1, "%s", message);
+
+		ioctl(vbd->ring.fd, BLKTAP2_IOCTL_SET_PARAMS, &params);
+		td_flag_clear(vbd->state, TD_VBD_PAUSED);
+	}
+
+	/* always tell the driver how the resume went */
+	ioctl(vbd->ring.fd, BLKTAP2_IOCTL_RESUME, err);
+	return err;
+}
+
+/*
+ * Dispatch the control message found in pad[0] of the shared ring.
+ *
+ * Returns -EINVAL if the ring is not mapped or the message is unknown,
+ * 0 if there is no message, and otherwise the result of the pause,
+ * resume or close handler.  The vbd may have been freed if the close
+ * handler ran.
+ */
+static int
+tapdisk_vbd_check_ring_message(td_vbd_t *vbd)
+{
+	int msg;
+
+	if (!vbd->ring.sring)
+		return -EINVAL;
+
+	msg = vbd->ring.sring->pad[0];
+
+	if (msg == 0)
+		return 0;
+	if (msg == BLKTAP2_RING_MESSAGE_PAUSE)
+		return tapdisk_vbd_pause_ring(vbd);
+	if (msg == BLKTAP2_RING_MESSAGE_RESUME)
+		return tapdisk_vbd_resume_ring(vbd);
+	if (msg == BLKTAP2_RING_MESSAGE_CLOSE)
+		return tapdisk_vbd_close(vbd);
+
+	return -EINVAL;
+}
+
+/*
+ * Ring event handler; @private is the vbd.  Pulls newly published
+ * requests, issues them, and finally processes any control message on
+ * the ring.  @id and @mode are unused.
+ */
+static void
+tapdisk_vbd_ring_event(event_id_t id, char mode, void *private)
+{
+	td_vbd_t *vbd = (td_vbd_t *)private;
+
+	tapdisk_vbd_pull_ring_requests(vbd);
+	tapdisk_vbd_issue_requests(vbd);
+
+	/* keep this last: a close message may destroy the vbd */
+	tapdisk_vbd_check_ring_message(vbd);
+}
+
+/*
+ * Return the image at the head of the vbd's image list.  The list must
+ * not be empty; callers check list_empty(&vbd->images) first.
+ */
+td_image_t *
+tapdisk_vbd_first_image(td_vbd_t *vbd)
+{
+	struct list_head *head_entry = vbd->images.next;
+
+	return list_entry(head_entry, td_image_t, next);
+}
diff --git a/tools/blktap2/drivers/tapdisk-vbd.h b/tools/blktap2/drivers/tapdisk-vbd.h
new file mode 100644
index 0000000000..ecb22a0762
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-vbd.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_VBD_H_
+#define _TAPDISK_VBD_H_
+
+#include <sys/time.h>
+#include <xenctrl.h>
+#include <xen/io/blkif.h>
+
+#include "tapdisk.h"
+#include "scheduler.h"
+#include "tapdisk-ipc.h"
+#include "tapdisk-image.h"
+
+/* retry policy for failed requests: attempt limit and the minimum */
+/* number of seconds between attempts */
+#define TD_VBD_MAX_RETRIES 100
+#define TD_VBD_RETRY_INTERVAL 1
+
+/* bits of td_vbd_handle.state; use with td_flag_set/clear/test */
+#define TD_VBD_DEAD 0x0001
+#define TD_VBD_CLOSED 0x0002
+#define TD_VBD_QUIESCE_REQUESTED 0x0004
+#define TD_VBD_QUIESCED 0x0008
+#define TD_VBD_PAUSE_REQUESTED 0x0010
+#define TD_VBD_PAUSED 0x0020
+#define TD_VBD_SHUTDOWN_REQUESTED 0x0040
+#define TD_VBD_LOCKING 0x0080
+#define TD_VBD_RETRY_NEEDED 0x0100
+#define TD_VBD_LOG_DROPPED 0x0200
+
+typedef struct td_ring td_ring_t;
+typedef struct td_vbd_request td_vbd_request_t;
+typedef struct td_vbd_handle td_vbd_t;
+/* response callback: receives the registered argument and the response */
+typedef void (*td_vbd_cb_t) (void *, blkif_response_t *);
+
+/* Per-vbd view of the shared request/response ring. */
+struct td_ring {
+ int fd; /* descriptor the ring ioctls are issued on */
+ char *mem; /* presumably the mapped ring memory -- TODO confirm */
+ blkif_sring_t *sring; /* shared ring; NULL while unmapped */
+ blkif_back_ring_t fe_ring; /* private producer/consumer indices */
+ unsigned long vstart; /* base address handed to MMAP_VADDR() */
+};
+
+/*
+ * One slot of td_vbd_handle.request_list: a ring request plus the
+ * bookkeeping needed to track it across the vbd's request lists.
+ */
+struct td_vbd_request {
+ blkif_request_t req; /* copy of the ring request */
+ int16_t status; /* BLKIF_RSP_* to report */
+
+ int error; /* first error recorded for this attempt */
+ int blocked; /* blocked on a dependency */
+ int submitting; /* non-zero while segments are being queued */
+ int secs_pending; /* sectors queued but not yet completed */
+ int num_retries; /* retries charged so far */
+ struct timeval last_try; /* time of the most recent submission */
+
+ td_vbd_t *vbd; /* owning vbd */
+ struct list_head next; /* linkage on one of the vbd's request lists */
+};
+
+/* State of one virtual block device served by tapdisk. */
+struct td_vbd_handle {
+ char *name; /* heap-allocated; replaced on resume */
+
+ td_uuid_t uuid;
+ int type; /* driver type of the image */
+
+ int storage;
+
+ uint8_t reopened; /* set once the strict reopen has succeeded */
+ uint8_t reactivated;
+ td_flag_t flags;
+ td_flag_t state; /* TD_VBD_* bits */
+
+ td_ipc_t ipc; /* channel used for control responses */
+
+ struct list_head images; /* chain of td_image_t */
+
+ /* a request sits on at most one of these lists at a time */
+ struct list_head new_requests;
+ struct list_head pending_requests;
+ struct list_head failed_requests;
+ struct list_head completed_requests;
+
+ /* request slots, indexed by the ring request id */
+ td_vbd_request_t request_list[MAX_REQUESTS];
+
+ td_ring_t ring;
+ event_id_t ring_event_id;
+
+ /* invoked with each finished response */
+ td_vbd_cb_t callback;
+ void *argument;
+
+ struct list_head next;
+
+ struct timeval ts; /* time of last request activity */
+
+ /* statistics reported by tapdisk_vbd_debug() */
+ uint64_t received;
+ uint64_t returned;
+ uint64_t kicked;
+ uint64_t secs_pending;
+ uint64_t retries;
+ uint64_t errors;
+};
+
+/*
+ * Iterate over a request list; safe against removal of the current
+ * entry (tmp holds the next one).
+ */
+#define tapdisk_vbd_for_each_request(vreq, tmp, list) \
+ list_for_each_entry_safe((vreq), (tmp), (list), next)
+
+/* Iterate over the images of a vbd; safe against removal of the current entry. */
+#define tapdisk_vbd_for_each_image(vbd, image, tmp) \
+ list_for_each_entry_safe((image), (tmp), &(vbd)->images, next)
+
+/*
+ * Unlink a request from whatever list it is on and append it to the
+ * tail of @dest.
+ */
+static inline void
+tapdisk_vbd_move_request(td_vbd_request_t *vreq, struct list_head *dest)
+{
+	struct list_head *node = &vreq->next;
+
+	list_del(node);
+	INIT_LIST_HEAD(node);
+	list_add_tail(node, dest);
+}
+
+/* Append @image to the tail of the vbd's image list. */
+static inline void
+tapdisk_vbd_add_image(td_vbd_t *vbd, td_image_t *image)
+{
+	struct list_head *chain = &vbd->images;
+
+	list_add_tail(&image->next, chain);
+}
+
+/* Return non-zero if @image is the final entry of the vbd's image list. */
+static inline int
+tapdisk_vbd_is_last_image(td_vbd_t *vbd, td_image_t *image)
+{
+	struct list_head *chain = &vbd->images;
+
+	return list_is_last(&image->next, chain);
+}
+
+/* Return the head of the vbd's image list; the list must not be empty. */
+td_image_t *
+tapdisk_vbd_first_image(td_vbd_t *vbd);
+
+/* Return the tail of the vbd's image list; the list must not be empty. */
+static inline td_image_t *
+tapdisk_vbd_last_image(td_vbd_t *vbd)
+{
+	struct list_head *tail_entry = vbd->images.prev;
+
+	return list_entry(tail_entry, td_image_t, next);
+}
+
+/*
+ * Return the image following @image in its list.  Only meaningful when
+ * @image is not the last one (see tapdisk_vbd_is_last_image()).
+ */
+static inline td_image_t *
+tapdisk_vbd_next_image(td_image_t *image)
+{
+	struct list_head *succ = image->next.next;
+
+	return list_entry(succ, td_image_t, next);
+}
+
+/* lifecycle */
+int tapdisk_vbd_initialize(int, int, td_uuid_t);
+void tapdisk_vbd_set_callback(td_vbd_t *, td_vbd_cb_t, void *);
+int tapdisk_vbd_open(td_vbd_t *, const char *, uint16_t,
+ uint16_t, const char *, td_flag_t);
+/* returns -EAGAIN and defers the shutdown while requests are outstanding */
+int tapdisk_vbd_close(td_vbd_t *);
+
+int tapdisk_vbd_open_vdi(td_vbd_t *, const char *,
+ uint16_t, uint16_t, td_flag_t);
+void tapdisk_vbd_close_vdi(td_vbd_t *);
+
+/* pass a request on to the next image in the chain */
+void tapdisk_vbd_forward_request(td_request_t);
+
+/* queue state and control */
+int tapdisk_vbd_get_image_info(td_vbd_t *, image_t *);
+int tapdisk_vbd_queue_ready(td_vbd_t *);
+int tapdisk_vbd_retry_needed(td_vbd_t *);
+int tapdisk_vbd_quiesce_queue(td_vbd_t *);
+int tapdisk_vbd_start_queue(td_vbd_t *);
+int tapdisk_vbd_issue_requests(td_vbd_t *);
+int tapdisk_vbd_kill_queue(td_vbd_t *);
+int tapdisk_vbd_pause(td_vbd_t *);
+int tapdisk_vbd_resume(td_vbd_t *, const char *, uint16_t);
+/* returns the number of responses pushed to the ring */
+int tapdisk_vbd_kick(td_vbd_t *);
+/* may free the vbd if a deferred shutdown completes */
+void tapdisk_vbd_check_state(td_vbd_t *);
+void tapdisk_vbd_check_progress(td_vbd_t *);
+void tapdisk_vbd_debug(td_vbd_t *);
+
+void tapdisk_vbd_complete_vbd_request(td_vbd_t *, td_vbd_request_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk.c b/tools/blktap2/drivers/tapdisk.c
new file mode 100644
index 0000000000..db1366afa4
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+
+/* Print the version banner and usage line to stderr, then exit(EINVAL). */
+static void
+usage(void)
+{
+	fputs("blktap-utils: v2.0.0\n", stderr);
+	fputs("usage: tapdisk <READ fifo> <WRITE fifo>\n", stderr);
+	exit(EINVAL);
+}
+
+/*
+ * tapdisk entry point: tapdisk <READ fifo> <WRITE fifo>
+ *
+ * Detaches from the terminal, starts logging, initialises the server
+ * on the two fifos and runs it.  Returns the errno of a failed
+ * daemon() call, or otherwise the initialisation error or the result
+ * of tapdisk_server_run().
+ */
+int
+main(int argc, char *argv[])
+{
+	int err;
+
+	if (argc != 3)
+		usage();
+
+	/*
+	 * Do not carry on in the foreground if detaching failed: whoever
+	 * spawned us expects the call to return.  stderr is still open at
+	 * this point, so the failure can be reported there.
+	 */
+	if (daemon(0, 0)) {
+		err = errno;
+		fprintf(stderr, "failed to daemonize: %d\n", err);
+		return err;
+	}
+
+	tapdisk_start_logging("TAPDISK");
+
+	err = tapdisk_server_initialize(argv[1], argv[2]);
+	if (err) {
+		EPRINTF("failed to initialize tapdisk server: %d\n", err);
+		goto out;
+	}
+
+	err = tapdisk_server_run();
+
+out:
+	tapdisk_stop_logging();
+	return err;
+}
diff --git a/tools/blktap2/drivers/tapdisk.h b/tools/blktap2/drivers/tapdisk.h
new file mode 100644
index 0000000000..487c50fbf6
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Some notes on the tap_disk interface:
+ *
+ * tap_disk aims to provide a generic interface to easily implement new
+ * types of image accessors. The structure-of-function-calls is similar
+ * to disk interfaces used in qemu/denali/etc, with the significant
+ * difference being the expectation of asynchronous rather than synchronous
+ * I/O. The asynchronous interface is intended to allow lots of requests to
+ * be pipelined through a disk, without the disk requiring any of its own
+ * threads of control. As such, a batch of requests is delivered to the disk
+ * using:
+ *
+ * td_queue_[read,write]()
+ *
+ * and passing in a completion callback, which the disk is responsible for
+ * tracking. Disks should transform these requests as necessary and return
+ * the resulting iocbs to tapdisk using td_prep_[read,write]() and
+ * td_queue_tiocb().
+ *
+ * NOTE: tapdisk uses the number of sectors submitted per request as a
+ * ref count. Plugins must use the callback function to communicate the
+ * completion -- or error -- of every sector submitted to them.
+ *
+ * td_get_parent_id returns:
+ * 0 if parent id successfully retrieved
+ * TD_NO_PARENT if no parent exists
+ * -errno on error
+ */
+
+#ifndef _TAPDISK_H_
+#define _TAPDISK_H_
+
+#include <time.h>
+#include <stdint.h>
+
+#include "list.h"
+#include "blktaplib.h"
+#include "disktypes.h"
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+#define MAX_SEGMENTS_PER_REQ 11
+/* sectors are 1 << SECTOR_SHIFT bytes; used to convert counts to bytes */
+#define SECTOR_SHIFT 9
+#define DEFAULT_SECTOR_SIZE 512
+
+/* upper bound on simultaneously outstanding segment requests */
+#define TAPDISK_DATA_REQUESTS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
+
+//#define BLK_NOT_ALLOCATED (-99)
+/* td_get_parent_id() result meaning "no parent exists" */
+#define TD_NO_PARENT 1
+
+/* presumably counted in 512-byte sectors (1024000 * 512 = 500MB) -- TODO confirm */
+#define MAX_RAMDISK_SIZE 1024000 /*500MB disk limit*/
+
+/* values of td_request.op */
+#define TD_OP_READ 0
+#define TD_OP_WRITE 1
+
+/* image open flags (bitmask) */
+#define TD_OPEN_QUIET 0x00001
+#define TD_OPEN_QUERY 0x00002
+#define TD_OPEN_RDONLY 0x00004
+#define TD_OPEN_STRICT 0x00008
+#define TD_OPEN_SHAREABLE 0x00010
+#define TD_OPEN_ADD_CACHE 0x00020
+#define TD_OPEN_VHD_INDEX 0x00040
+#define TD_OPEN_LOG_DIRTY 0x00080
+
+/* image creation flags (bitmask) */
+#define TD_CREATE_SPARSE 0x00001
+#define TD_CREATE_MULTITYPE 0x00002
+
+/*
+ * Bit-flag helpers.  td_flag_set/clear modify 'word' in place;
+ * td_flag_test yields the masked bits (non-zero if any is set), not a
+ * normalised 0/1.
+ */
+#define td_flag_set(word, flag) ((word) |= (flag))
+#define td_flag_clear(word, flag) ((word) &= ~(flag))
+#define td_flag_test(word, flag) ((word) & (flag))
+
+typedef uint16_t td_uuid_t;
+typedef uint32_t td_flag_t;
+typedef uint64_t td_sector_t;
+typedef struct td_disk_id td_disk_id_t;
+typedef struct td_disk_info td_disk_info_t;
+typedef struct td_request td_request_t;
+typedef struct td_driver_handle td_driver_t;
+typedef struct td_image_handle td_image_t;
+
+/*
+ * Prototype of the callback to activate as requests complete.
+ * The int argument is the result: 0 on success, an errno value on
+ * failure (the vbd layer accepts either sign).
+ */
+typedef void (*td_callback_t)(td_request_t, int);
+
+/* Identifies a disk image: its name and the driver type that handles it. */
+struct td_disk_id {
+ char *name;
+ int drivertype;
+};
+
+/* Geometry and attributes reported for an open image. */
+struct td_disk_info {
+ td_sector_t size; /* image size in sectors */
+ long sector_size; /* presumably bytes per sector -- TODO confirm */
+ uint32_t info;
+};
+
+/*
+ * A single I/O request handed to an image driver.  Passed by value; the
+ * driver reports completion of every sector through cb.
+ */
+struct td_request {
+ int op; /* TD_OP_READ or TD_OP_WRITE */
+ char *buf; /* data buffer, secs << SECTOR_SHIFT bytes */
+ td_sector_t sec; /* first sector */
+ int secs; /* number of sectors */
+
+ uint8_t blocked; /* blocked on a dependency */
+
+ td_image_t *image; /* image the request is addressed to */
+
+ td_callback_t cb; /* completion callback */
+ void *cb_data;
+
+ uint64_t id; /* id of the originating ring request */
+ int sidx; /* segment index within that request */
+ void *private; /* owner's context (the vbd layer stores its td_vbd_request_t here) */
+};
+
+/*
+ * Structure describing the interface to a virtual disk implementation.
+ * See note at the top of this file describing this interface.
+ *
+ * Each driver provides one instance: a type name, flags, the size of
+ * its private state and the operations below.
+ */
+struct tap_disk {
+ const char *disk_type;
+ td_flag_t flags;
+ int private_data_size;
+ int (*td_open) (td_driver_t *, const char *, td_flag_t);
+ int (*td_close) (td_driver_t *);
+ /* 0 on success, TD_NO_PARENT if there is none, -errno on error */
+ int (*td_get_parent_id) (td_driver_t *, td_disk_id_t *);
+ int (*td_validate_parent) (td_driver_t *, td_driver_t *, td_flag_t);
+ /* asynchronous: completion is signalled through the request's callback */
+ void (*td_queue_read) (td_driver_t *, td_request_t);
+ void (*td_queue_write) (td_driver_t *, td_request_t);
+ void (*td_debug) (td_driver_t *);
+};
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk2.c b/tools/blktap2/drivers/tapdisk2.c
new file mode 100644
index 0000000000..45b27ecc19
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk2.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+
+#include "tapdisk.h"
+#include "blktap2.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+
+#define TAPDISK2_VBD 0
+
+/*
+ * Write a status line of the form "<err>: <message>" to child_out and
+ * flush it immediately.  Does nothing while child_out is unset.
+ */
+#define cprintf(_err, _f, _a...) \
+ do { \
+ if (child_out) { \
+ fprintf(child_out, "%d: " _f, _err, ##_a); \
+ fflush(child_out); \
+ } \
+ } while (0)
+
+/*
+ * Log an error through EPRINTF and forward the same message over
+ * child_out via cprintf().
+ */
+#define CHILD_ERR(_err, _f, _a...) \
+ do { \
+ EPRINTF(_f, ##_a); \
+ cprintf(_err, _f, ##_a); \
+ } while (0)
+
+static int channel[2];
+static FILE *child_out;
+static struct blktap2_handle handle;
+
+/*
+ * Ensure BLKTAP2_DIRECTORY exists, creating each missing path component
+ * in turn (the equivalent of "mkdir -p").
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+tapdisk2_prepare_directory(void)
+{
+	int err;
+	char *ptr, *name, *start;
+
+	err = access(BLKTAP2_DIRECTORY, W_OK | R_OK);
+	if (!err)
+		return 0;
+
+	name = strdup(BLKTAP2_DIRECTORY);
+	if (!name)
+		return -ENOMEM;
+
+	start = name;
+
+	for (;;) {
+		/* temporarily cut the path after the next component */
+		ptr = strchr(start + 1, '/');
+		if (ptr)
+			*ptr = '\0';
+
+		if (mkdir(name, 0755) && errno != EEXIST) {
+			err = -errno;
+			CHILD_ERR(err, "failed to create directory %s: %d\n",
+				  name, err);
+			break;
+		}
+
+		/*
+		 * Created, or already present: neither is an error.  Without
+		 * this reset a stale -1 from access()/mkdir() would be
+		 * returned to the caller.
+		 */
+		err = 0;
+
+		if (!ptr)
+			break;
+
+		*ptr = '/';
+		/*
+		 * Resume from the separator itself so that 'start + 1' can
+		 * never step past the terminating NUL (e.g. trailing '/').
+		 */
+		start = ptr;
+	}
+
+	free(name);
+	return err;
+}
+
+/*
+ * (Re)create the device node 'devname' with the given major/minor
+ * numbers.  'perm' is passed straight to mknod(2) as its mode argument.
+ * Any existing entry at that path is removed first.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+tapdisk2_make_device(char *devname, int major, int minor, int perm)
+{
+	int err;
+
+	err = tapdisk2_prepare_directory();
+	if (err)
+		return err;
+
+	/*
+	 * Unlink unconditionally and tolerate ENOENT: this avoids the
+	 * access()/unlink() race and also removes dangling symlinks, which
+	 * access() would report as missing.
+	 */
+	if (unlink(devname) && errno != ENOENT) {
+		err = errno;	/* save: logging may clobber errno */
+		CHILD_ERR(err, "error unlinking %s: %d\n", devname, err);
+		return -err;
+	}
+
+	if (mknod(devname, perm, makedev(major, minor))) {
+		err = errno;	/* save: logging may clobber errno */
+		CHILD_ERR(err, "mknod %s failed: %d\n", devname, -err);
+		return -err;
+	}
+
+	DPRINTF("Created %s device\n", devname);
+	return 0;
+}
+
+/*
+ * Make sure the blktap2 control device node is usable.  If it is not
+ * accessible, look up the control device's minor number in /proc/misc
+ * and create the node.
+ *
+ * Returns 0 on success or a negative errno value on failure (-ENOSYS
+ * when the control device is not listed in /proc/misc).
+ */
+static int
+tapdisk2_check_environment(void)
+{
+	FILE *f;
+	int err, minor;
+	char name[256];
+
+	if (!access(BLKTAP2_CONTROL_DEVICE, R_OK | W_OK))
+		return 0;
+
+	memset(name, 0, sizeof(name));
+
+	f = fopen("/proc/misc", "r");
+	if (!f) {
+		err = errno;	/* save: logging may clobber errno */
+		CHILD_ERR(err, "failed to open /proc/misc: %d\n", err);
+		return -err;
+	}
+
+	/* %255s leaves room for the terminating NUL in name[256] */
+	while (fscanf(f, "%d %255s", &minor, name) == 2)
+		if (!strcmp(name, BLKTAP2_CONTROL_NAME)) {
+			err = tapdisk2_make_device(BLKTAP2_CONTROL_DEVICE,
+						   MISC_MAJOR_NUMBER,
+						   minor, S_IFCHR | 0600);
+			goto out;
+		}
+
+	err = -ENOSYS;
+	CHILD_ERR(err, "didn't find %s in /proc/misc\n", BLKTAP2_CONTROL_NAME);
+
+out:
+	fclose(f);
+	return err;
+}
+
+/*
+ * Ask the control device to release the tap device identified by
+ * handle.minor.  Failures are reported but otherwise ignored, since this
+ * runs on cleanup paths.
+ */
+static void
+tapdisk2_free_device(void)
+{
+	int fd, err;
+
+	fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY);
+	if (fd == -1) {
+		err = errno;	/* save: logging may clobber errno */
+		CHILD_ERR(err, "failed to open control device: %d\n", err);
+		return;
+	}
+
+	/* report a failed release instead of dropping it silently */
+	if (ioctl(fd, BLKTAP2_IOCTL_FREE_TAP, handle.minor) == -1) {
+		err = errno;
+		CHILD_ERR(err, "failed to free device %d: %d\n",
+			  handle.minor, err);
+	}
+
+	close(fd);
+}
+
+/*
+ * Allocate a new tap device through the control device (filling in the
+ * file-level 'handle') and create its ring (character) and IO (block)
+ * device nodes.  The tap device is released again if node creation fails.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+tapdisk2_prepare_device(void)
+{
+	char *name;
+	int fd, err;
+
+	fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY);
+	if (fd == -1) {
+		err = errno;	/* save: logging may clobber errno */
+		CHILD_ERR(err, "failed to open control device: %d\n", err);
+		return -err;
+	}
+
+	/* capture errno before close(), which may overwrite it */
+	err = ioctl(fd, BLKTAP2_IOCTL_ALLOC_TAP, &handle) == -1 ? errno : 0;
+	close(fd);
+	if (err) {
+		CHILD_ERR(err, "failed to allocate new device: %d\n", err);
+		return -err;
+	}
+
+	err = asprintf(&name, "%s%d", BLKTAP2_RING_DEVICE, handle.minor);
+	if (err == -1) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = tapdisk2_make_device(name, handle.ring,
+				   handle.minor, S_IFCHR | 0600);
+	free(name);
+	if (err) {
+		CHILD_ERR(err, "creating ring device for %d failed: %d\n",
+			  handle.minor, err);
+		goto fail;
+	}
+
+	err = asprintf(&name, "%s%d", BLKTAP2_IO_DEVICE, handle.minor);
+	if (err == -1) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	err = tapdisk2_make_device(name, handle.device,
+				   handle.minor, S_IFBLK | 0600);
+	free(name);
+	if (err) {
+		CHILD_ERR(err, "creating IO device for %d failed: %d\n",
+			  handle.minor, err);
+		goto fail;
+	}
+
+	DPRINTF("new interface: ring: %u, device: %u, minor: %u\n",
+		handle.ring, handle.device, handle.minor);
+
+	return 0;
+
+fail:
+	tapdisk2_free_device();
+	return err;
+}
+
+/*
+ * Initialise the TAPDISK2_VBD vbd, open the image at 'path' (driver
+ * 'type') on this process's ring device, and issue
+ * BLKTAP2_IOCTL_CREATE_DEVICE on the ring fd with the image's size,
+ * sector size and 'name'.
+ *
+ * Returns 0 on success or a negative error value on failure.
+ */
+static int
+tapdisk2_open_device(int type, const char *path, const char *name)
+{
+	int rc;
+	td_vbd_t *vbd;
+	image_t info;
+	char *ring_path;
+	struct blktap2_params params;
+
+	rc = tapdisk_vbd_initialize(-1, -1, TAPDISK2_VBD);
+	if (rc)
+		return rc;
+
+	vbd = tapdisk_server_get_vbd(TAPDISK2_VBD);
+	if (!vbd) {
+		rc = -ENODEV;
+		CHILD_ERR(rc, "couldn't find vbd\n");
+		return rc;
+	}
+
+	if (asprintf(&ring_path, "%s%d",
+		     BLKTAP2_RING_DEVICE, handle.minor) == -1) {
+		rc = -ENOMEM;
+		CHILD_ERR(rc, "couldn't allocate ring\n");
+		return rc;
+	}
+
+	rc = tapdisk_vbd_open(vbd, path, type,
+			      TAPDISK_STORAGE_TYPE_DEFAULT,
+			      ring_path, 0);
+	free(ring_path);
+	if (rc) {
+		CHILD_ERR(rc, "vbd open failed: %d\n", rc);
+		return rc;
+	}
+
+	/* params is zeroed first, so the truncated name stays terminated */
+	memset(&params, 0, sizeof(params));
+	tapdisk_vbd_get_image_info(vbd, &info);
+
+	params.capacity = info.size;
+	params.sector_size = info.secsize;
+	snprintf(params.name, sizeof(params.name) - 1, "%s", name);
+
+	if (ioctl(vbd->ring.fd, BLKTAP2_IOCTL_CREATE_DEVICE, &params)) {
+		rc = -errno;
+		CHILD_ERR(rc, "create device failed: %d\n", rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Child-side descriptor setup: make the write end of the pipe the
+ * process's stdout, wrap it in child_out, and close every other
+ * descriptor.
+ *
+ * Returns 0 on success or a positive errno value on failure.
+ */
+static int
+tapdisk2_set_child_fds(void)
+{
+	int err;
+	long i, max_fd;
+
+	if (dup2(channel[1], STDOUT_FILENO) == -1) {
+		err = errno;	/* save: logging may clobber errno */
+		CHILD_ERR(err, "failed duping pipe: %d\n", err);
+		return err;
+	}
+
+	child_out = fdopen(STDOUT_FILENO, "w");
+	if (!child_out) {
+		err = errno;
+		CHILD_ERR(err, "failed setting child_out: %d\n", err);
+		return err;
+	}
+
+	/* query the limit once rather than on every iteration */
+	max_fd = sysconf(_SC_OPEN_MAX);
+	for (i = 0; i < max_fd; i++)
+		if (i != STDOUT_FILENO)
+			close((int)i);
+
+	return 0;
+}
+
+/*
+ * Child-side entry point: set up descriptors and logging, verify the
+ * control device, allocate and open the tap device for the image
+ * described by 'params', report the resulting IO device name to the
+ * parent, then run the tapdisk server loop.
+ *
+ * Returns 0 on success or a non-zero error value on failure.
+ */
+static int
+tapdisk2_create_device(const char *params)
+{
+	char *path;
+	int err, type;
+
+	chdir("/");
+	tapdisk_start_logging("tapdisk2");
+
+	err = tapdisk2_set_child_fds();
+	if (err)
+		goto out;
+
+	err = tapdisk2_check_environment();
+	if (err)
+		goto out;
+
+	err = tapdisk_parse_disk_type(params, &path, &type);
+	if (err)
+		goto out;
+
+	err = tapdisk2_prepare_device();
+	if (err)
+		goto out;
+
+	err = tapdisk_server_initialize(NULL, NULL);
+	if (err)
+		goto fail;
+
+	err = tapdisk2_open_device(type, path, params);
+	if (err)
+		goto fail;
+
+	cprintf(0, "%s%d\n", BLKTAP2_IO_DEVICE, handle.minor);
+
+	/*
+	 * Close the stream (and with it stdout) and clear child_out, so
+	 * that later CHILD_ERR calls cannot write to a closed or reused
+	 * descriptor.
+	 */
+	if (child_out) {
+		fclose(child_out);
+		child_out = NULL;
+	}
+
+	err = tapdisk_server_run();
+	if (err)
+		goto fail;
+
+	err = 0;
+
+out:
+	tapdisk_stop_logging();
+	return err;
+
+fail:
+	tapdisk2_free_device();
+	goto out;
+}
+
+/*
+ * Parent-side counterpart of the child's status report: read one
+ * "<err>: <message>" line from the pipe, print the message, and return
+ * the absolute value of the reported error code (0 on success).
+ *
+ * Returns EINVAL if the child's response cannot be parsed.
+ */
+static int
+tapdisk2_wait_for_device(void)
+{
+	int err, matched;
+	char msg[1024];
+	FILE *parent_in;
+
+	close(channel[1]);
+	parent_in = fdopen(channel[0], "r");
+	if (!parent_in) {
+		err = errno;	/* save: printf may clobber errno */
+		printf("failed to connect to child: %d\n", err);
+		return err;
+	}
+
+	memset(msg, 0, sizeof(msg));
+	matched = fscanf(parent_in, "%d: %1023[^\n]", &err, msg);
+	fclose(parent_in);
+
+	if (matched != 2) {
+		printf("unrecognized child response\n");
+		return EINVAL;
+	}
+
+	printf("%s\n", msg);
+	return (err >= 0 ? err : -err);
+}
+
+/*
+ * Print the command-line synopsis for 'app' to stderr and terminate the
+ * process with exit status 'err'.  Does not return.
+ */
+static void
+usage(const char *app, int err)
+{
+ fprintf(stderr, "usage: %s <-n file>\n", app);
+ exit(err);
+}
+
+/*
+ * tapdisk2 entry point.  Parses "-n <params>", creates the parent/child
+ * pipe and forks: the child creates and serves the device, the parent
+ * waits for the child's status line and exits with its result.
+ */
+int
+main(int argc, char *argv[])
+{
+	int c, err;
+	char *params;
+
+	params = NULL;
+
+	while ((c = getopt(argc, argv, "n:h")) != -1) {
+		switch (c) {
+		case 'n':
+			params = optarg;
+			break;
+		case 'h':
+			usage(argv[0], 0);
+			/* usage() does not return */
+		default:
+			usage(argv[0], EINVAL);
+		}
+	}
+
+	if (!params || optind != argc)
+		usage(argv[0], EINVAL);
+
+	if (pipe(channel) == -1) {
+		err = errno;	/* save: printf may clobber errno */
+		printf("pipe failed: %d\n", err);
+		return err;
+	}
+
+	switch (fork()) {
+	case -1:
+		err = errno;
+		printf("fork failed: %d\n", err);
+		return err;
+	case 0:
+		return tapdisk2_create_device(params);
+	default:
+		return tapdisk2_wait_for_device();
+	}
+}
diff --git a/tools/blktap2/drivers/td.c b/tools/blktap2/drivers/td.c
new file mode 100644
index 0000000000..f920acd294
--- /dev/null
+++ b/tools/blktap2/drivers/td.c
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+#include "tapdisk-utils.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stdout, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+typedef enum {
+ TD_FIELD_HIDDEN = 0,
+ TD_FIELD_INVALID = 1
+} td_field_t;
+
+struct vdi_field {
+ char *name;
+ td_field_t id;
+};
+
+static struct vdi_field td_vdi_fields[TD_FIELD_INVALID] = {
+ { .id = TD_FIELD_HIDDEN, .name = "hidden" }
+};
+
+typedef enum {
+ TD_CMD_CREATE = 0,
+ TD_CMD_SNAPSHOT,
+/* TD_CMD_COALESCE, */
+ TD_CMD_QUERY,
+/* TD_CMD_RESIZE, */
+ TD_CMD_SET,
+/* TD_CMD_REPAIR, */
+/* TD_CMD_FILL, */
+/* TD_CMD_READ, */
+ TD_CMD_INVALID,
+} td_command_t;
+
+struct command {
+ td_command_t id;
+ char *name;
+ int needs_type;
+};
+
+struct command commands[TD_CMD_INVALID] = {
+ { .id = TD_CMD_CREATE, .name = "create", .needs_type = 1 },
+ { .id = TD_CMD_SNAPSHOT, .name = "snapshot", .needs_type = 1 },
+/* { .id = TD_CMD_COALESCE, .name = "coalesce", .needs_type = 1 }, */
+ { .id = TD_CMD_QUERY, .name = "query", .needs_type = 1 },
+/* { .id = TD_CMD_RESIZE, .name = "resize", .needs_type = 1 }, */
+ { .id = TD_CMD_SET, .name = "set", .needs_type = 1 },
+/* { .id = TD_CMD_REPAIR, .name = "repair", .needs_type = 1 }, */
+/* { .id = TD_CMD_FILL, .name = "fill", .needs_type = 1 }, */
+/* { .id = TD_CMD_READ, .name = "read", .needs_type = 1 }, */
+};
+
+typedef enum {
+ TD_TYPE_VHD = 0,
+ TD_TYPE_AIO,
+ TD_TYPE_INVALID,
+} td_disk_t;
+
+const char *td_disk_types[TD_TYPE_INVALID] = {
+ "vhd",
+ "aio",
+};
+
+#define print_commands() \
+ do { \
+ int i; \
+ fprintf(stderr, "COMMAND := { "); \
+ fprintf(stderr, "%s", commands[0].name); \
+ for (i = 1; i < TD_CMD_INVALID; i++) \
+ fprintf(stderr, " | %s", commands[i].name); \
+ fprintf(stderr, " }\n"); \
+ } while (0)
+
+/*
+ * Print the list of supported disk types to stderr.  No trailing ';'
+ * after "while (0)": the caller supplies it, which keeps the macro safe
+ * inside an unbraced if/else.
+ */
+#define print_disk_types()						\
+	do {								\
+		int i;							\
+		fprintf(stderr, "TYPE := { ");				\
+		fprintf(stderr, "%s", td_disk_types[0]);		\
+		for (i = 1; i < TD_TYPE_INVALID; i++)			\
+			fprintf(stderr, " | %s", td_disk_types[i]);	\
+		fprintf(stderr, " }\n");				\
+	} while (0)
+
+#define print_field_names() \
+ do { \
+ int i; \
+ fprintf(stderr, "FIELD := { "); \
+ fprintf(stderr, "%s", td_vdi_fields[0].name); \
+ for (i = 1; i < TD_FIELD_INVALID; i++) \
+ fprintf(stderr, " | %s", td_vdi_fields[i].name); \
+ fprintf(stderr, " }\n"); \
+ } while (0)
+
+/*
+ * Print the version banner, the general usage line and the lists of
+ * commands and disk types to stderr, then exit with status -1.
+ */
+void
+help(void)
+{
+	fputs("Tapdisk Utilities: v1.0.0\n", stderr);
+	fputs("usage: td-util COMMAND [TYPE] [OPTIONS]\n", stderr);
+	print_commands();
+	print_disk_types();
+	exit(-1);
+}
+
+/*
+ * Look up 'command' by name in the commands[] table.
+ * Returns the matching entry, or NULL if there is none.
+ */
+struct command *
+get_command(char *command)
+{
+	struct command *cmd;
+
+	for (cmd = commands; cmd < commands + TD_CMD_INVALID; cmd++) {
+		if (strcmp(command, cmd->name) == 0)
+			return cmd;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up 'field' by name in the td_vdi_fields[] table.
+ * Returns the matching entry, or NULL if there is none.
+ */
+struct vdi_field *
+get_field(char *field)
+{
+	struct vdi_field *entry;
+
+	for (entry = td_vdi_fields;
+	     entry < td_vdi_fields + TD_FIELD_INVALID; entry++) {
+		if (strcmp(field, entry->name) == 0)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Map a disk type name to its index in td_disk_types[].
+ *
+ * Returns the index (>= 0) on a match, -ENAMETOOLONG if 'type' is 25
+ * characters or longer, or -TD_TYPE_INVALID if the name is unknown.
+ */
+int
+get_driver_type(char *type)
+{
+	int idx = 0;
+
+	if (strnlen(type, 25) >= 25)
+		return -ENAMETOOLONG;
+
+	while (idx < TD_TYPE_INVALID) {
+		if (strcmp(type, td_disk_types[idx]) == 0)
+			return idx;
+		idx++;
+	}
+
+	return -TD_TYPE_INVALID;
+}
+
+/*
+ * "td-util create": create a new image of SIZE megabytes at FILENAME.
+ *
+ * VHD images are delegated to vhd_util_create().  Other types are
+ * created as fully allocated, zero-filled files and therefore require
+ * -r (sparse generic images are rejected).
+ *
+ * Options: -r reserve (non-sparse), -b fixed-size file, -h help.
+ * Returns 0 on success or a positive errno-style value on failure.
+ */
+int
+td_create(int type, int argc, char *argv[])
+{
+	long pgsz;
+	ssize_t mb;
+	uint64_t size, i;
+	char *name, *end;
+	void *buf;
+	int c, fd, err, sparse = 1, fixedsize = 0;
+
+	while ((c = getopt(argc, argv, "hrb")) != -1) {
+		switch(c) {
+		case 'r':
+			sparse = 0;
+			break;
+		case 'b':
+			fixedsize = 1;
+			break;
+		default:
+			/* getopt() returns '?'; the offending char is optopt */
+			fprintf(stderr, "Unknown option %c\n", (char)optopt);
+		case 'h':
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 2))
+		goto usage;
+
+	mb = 1 << 20;
+
+	/* SIZE must be a plain, non-negative decimal number of megabytes */
+	errno = 0;
+	size = strtoull(argv[optind], &end, 10);
+	if (errno || argv[optind][0] < '0' || argv[optind][0] > '9' ||
+	    *end != '\0') {
+		fprintf(stderr, "Invalid size %s\n", argv[optind]);
+		goto usage;
+	}
+	name = argv[++optind];
+
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	if (type == TD_TYPE_VHD) {
+		int cargc = 0;
+		char sbuf[32], *cargv[10];
+
+		memset(cargv, 0, sizeof(cargv));
+		snprintf(sbuf, sizeof(sbuf) - 1, "%"PRIu64, size);
+		cargv[cargc++] = "create";
+		cargv[cargc++] = "-n";
+		cargv[cargc++] = name;
+		cargv[cargc++] = "-s";
+		cargv[cargc++] = sbuf;
+		if (!sparse)
+			cargv[cargc++] = "-r";
+		if (fixedsize)
+			cargv[cargc++] = "-b";
+
+		return vhd_util_create(cargc, cargv);
+	}
+
+	/* generic create */
+	if (sparse) {
+		fprintf(stderr, "Cannot create sparse %s image\n",
+			td_disk_types[type]);
+		return EINVAL;
+	}
+
+	/*
+	 * The file is opened with O_DIRECT, which requires a suitably
+	 * aligned buffer; a plain calloc() buffer can make write() fail
+	 * with EINVAL.
+	 */
+	pgsz = sysconf(_SC_PAGESIZE);
+	if (pgsz <= 0)
+		pgsz = 4096;
+	if (posix_memalign(&buf, pgsz, mb))
+		return ENOMEM;
+	memset(buf, 0, mb);
+
+	fd = open(name, O_WRONLY | O_DIRECT | O_CREAT | O_TRUNC, 0644);
+	if (fd == -1) {
+		err = errno;	/* save: free() may clobber errno */
+		free(buf);
+		return err;
+	}
+
+	for (i = 0; i < size; i++)
+		if (write(fd, buf, mb) != mb) {
+			close(fd);
+			unlink(name);
+			free(buf);
+			return EIO;
+		}
+
+	close(fd);
+	free(buf);
+	return 0;
+
+ usage:
+	fprintf(stderr, "usage: td-util create %s [-h help] [-r reserve] "
+		"[-b file_is_fixed_size] <SIZE(MB)> <FILENAME>\n",
+		td_disk_types[type]);
+	return EINVAL;
+}
+
+/*
+ * "td-util snapshot": create a VHD snapshot FILENAME on top of
+ * BACKING_FILENAME by delegating to vhd_util_snapshot().  Only VHD
+ * images are supported.
+ *
+ * Options: -b fixed-size file, -m raw parent, -l <depth limit>, -h help.
+ * Returns 0 on success or a non-zero error value on failure.
+ */
+int
+td_snapshot(int type, int argc, char *argv[])
+{
+	char *cargv[10];
+	int c, err, cargc;
+	struct stat stats;
+	char *name, *backing, *limit = NULL;
+	int fixedsize = 0, rawparent = 0;
+
+	if (type != TD_TYPE_VHD) {
+		fprintf(stderr, "Cannot create snapshot of %s image type\n",
+			td_disk_types[type]);
+		return EINVAL;
+	}
+
+	while ((c = getopt(argc, argv, "hbml:")) != -1) {
+		switch(c) {
+		case 'b':
+			fixedsize = 1;
+			break;
+		case 'm':
+			rawparent = 1;
+			break;
+		case 'l':
+			limit = optarg;
+			break;
+		case 'h':
+			err = 0;
+			goto usage;
+		default:
+			err = EINVAL;
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 2)) {
+		err = EINVAL;
+		goto usage;
+	}
+
+	name = argv[optind++];
+	backing = argv[optind++];
+
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN ||
+	    strnlen(backing, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	if (stat(backing, &stats) == -1) {
+		err = errno;	/* save: fprintf may clobber errno */
+		fprintf(stderr, "File %s not found\n", backing);
+		return err;
+	}
+
+	/* at most 9 arguments are added, so cargv[10] stays NULL-terminated */
+	cargc = 0;
+	memset(cargv, 0, sizeof(cargv));
+	cargv[cargc++] = "snapshot";
+	cargv[cargc++] = "-n";
+	cargv[cargc++] = name;
+	cargv[cargc++] = "-p";
+	cargv[cargc++] = backing;
+	if (fixedsize)
+		cargv[cargc++] = "-b";
+	if (rawparent)
+		cargv[cargc++] = "-m";
+	if (limit) {
+		cargv[cargc++] = "-l";
+		cargv[cargc++] = limit;
+	}
+	return vhd_util_snapshot(cargc, cargv);
+
+ usage:
+	fprintf(stderr, "usage: td-util snapshot %s [-h help] [-m parent_raw] "
+		"[-b file_is_fixed_size] [-l snapshot depth limit] "
+		"<FILENAME> <BACKING_FILENAME>\n", td_disk_types[type]);
+	return err;
+}
+
+/*
+ * "td-util coalesce": coalesce the VHD image FILENAME by delegating to
+ * vhd_util_coalesce().  Only VHD images are supported.
+ *
+ * Returns 0 on success or a non-zero error value on failure.
+ */
+int
+td_coalesce(int type, int argc, char *argv[])
+{
+	int c, ret, cargc;
+	char *name, *cargv[3];
+
+	if (type != TD_TYPE_VHD) {
+		/* the original text was copy-pasted from td_snapshot() */
+		fprintf(stderr, "Cannot coalesce %s image type\n",
+			td_disk_types[type]);
+		return EINVAL;
+	}
+
+	while ((c = getopt(argc, argv, "h")) != -1) {
+		switch(c) {
+		default:
+			/* getopt() returns '?'; the offending char is optopt */
+			fprintf(stderr, "Unknown option %c\n", (char)optopt);
+		case 'h':
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 1))
+		goto usage;
+
+	name = argv[optind++];
+
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	/* exactly three arguments: cargv is full and not NULL-terminated */
+	cargc = 0;
+	memset(cargv, 0, sizeof(cargv));
+	cargv[cargc++] = "coalesce";
+	cargv[cargc++] = "-n";
+	cargv[cargc++] = name;
+	ret = vhd_util_coalesce(cargc, cargv);
+	if (ret)
+		printf("coalesce failed: %d\n", ret);
+
+	return ret;
+
+ usage:
+	fprintf(stderr, "usage: td-util coalesce %s [-h help] "
+		"<FILENAME>\n", td_disk_types[type]);
+	return EINVAL;
+}
+
+/*
+ * "td-util query": print information about the image FILENAME.
+ *
+ * Options: -v virtual size in MB, -p parent name, -f field values,
+ * -d chain depth (VHD only), -h help.
+ *
+ * Returns 0 on success or a non-zero error value on failure; when
+ * several VHD queries fail, the first error is the one returned.
+ */
+int
+td_query(int type, int argc, char *argv[])
+{
+	char *name;
+	int c, size = 0, parent = 0, fields = 0, depth = 0, err = 0;
+
+	while ((c = getopt(argc, argv, "hvpfd")) != -1) {
+		switch(c) {
+		case 'v':
+			size = 1;
+			break;
+		case 'p':
+			parent = 1;
+			break;
+		case 'f':
+			fields = 1;
+			break;
+		case 'd':
+			depth = 1;
+			break;
+		case 'h':
+			err = 0;
+			goto usage;
+		default:
+			err = EINVAL;
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 1)) {
+		err = EINVAL;
+		goto usage;
+	}
+
+	name = argv[optind++];
+
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	if (type == TD_TYPE_VHD) {
+		vhd_context_t vhd;
+
+		err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+		if (err) {
+			printf("failed opening %s: %d\n", name, err);
+			return err;
+		}
+
+		if (size)
+			printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+		if (parent) {
+			if (vhd.footer.type != HD_TYPE_DIFF)
+				printf("%s has no parent\n", name);
+			else {
+				char *pname;
+
+				err = vhd_parent_locator_get(&vhd, &pname);
+				if (err)
+					printf("failed getting parent: %d\n",
+					       err);
+				else {
+					printf("%s\n", pname);
+					free(pname);
+				}
+			}
+		}
+
+		if (fields) {
+			int ret, hidden;
+
+			ret = vhd_hidden(&vhd, &hidden);
+			if (ret) {
+				printf("failed checking 'hidden' field: %d\n",
+				       ret);
+				/* keep the first error seen */
+				err = (err ? : ret);
+			} else
+				printf("%s: %d\n",
+				       td_vdi_fields[TD_FIELD_HIDDEN].name,
+				       hidden);
+		}
+
+		if (depth) {
+			int ret, length;
+
+			ret = vhd_chain_depth(&vhd, &length);
+			if (ret)
+				printf("error checking chain depth: %d\n", ret);
+			else
+				printf("chain depth: %d\n", length);
+
+			/* keep the first error seen */
+			err = (err ? : ret);
+		}
+
+		vhd_close(&vhd);
+
+	} else if (type == TD_TYPE_AIO) {
+		if (size) {
+			int fd;
+			uint64_t secs;
+			uint32_t ssize;
+
+			fd = open(name, O_RDONLY | O_LARGEFILE);
+			if (fd == -1) {
+				err = errno;	/* save: printf may clobber */
+				printf("failed opening %s: %d\n", name, err);
+				return -err;
+			}
+
+			err = tapdisk_get_image_size(fd, &secs, &ssize);
+			close(fd);
+
+			if (err) {
+				printf("failed getting size for %s: %d\n",
+				       name, err);
+				return err;
+			}
+
+			/*
+			 * NOTE(review): ">> 11" converts sectors to MB only
+			 * for 512-byte sectors; 'ssize' is not consulted.
+			 */
+			printf("%"PRIu64"\n", secs >> 11);
+		}
+
+		if (parent)
+			printf("%s has no parent\n", name);
+
+		if (fields) {
+			int i;
+
+			for (i = 0; i < TD_FIELD_INVALID; i++)
+				printf("%s: 0\n", td_vdi_fields[i].name);
+		}
+	}
+
+	return err;
+
+ usage:
+	fprintf(stderr, "usage: td-util query %s [-h help] [-v virtsize] "
+		"[-p parent] [-f fields] [-d depth] <FILENAME>\n",
+		td_disk_types[type]);
+	return err;
+}
+
+/*
+ * "td-util set": set FIELD to VALUE on the VHD image FILENAME by
+ * delegating to vhd_util_set_field().  Only VHD images and the "hidden"
+ * field are supported.
+ *
+ * Returns 0 on success or a non-zero error value on failure.
+ */
+int
+td_set_field(int type, int argc, char *argv[])
+{
+	int c, cargc;
+	struct vdi_field *field;
+	char *name, *value, *cargv[7];
+
+	if (type != TD_TYPE_VHD) {
+		fprintf(stderr, "Cannot set fields of %s images\n",
+			td_disk_types[type]);
+		return EINVAL;
+	}
+
+	while ((c = getopt(argc, argv, "h")) != -1) {
+		switch(c) {
+		default:
+			/* getopt() returns '?'; the offending char is optopt */
+			fprintf(stderr, "Unknown option %c\n", (char)optopt);
+		case 'h':
+			goto usage;
+		}
+	}
+
+	if (optind != (argc - 3))
+		goto usage;
+
+	name = argv[optind++];
+
+	/* same name length limit as the other td-util commands */
+	if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+		fprintf(stderr, "Device name too long\n");
+		return ENAMETOOLONG;
+	}
+
+	field = get_field(argv[optind]);
+	if (!field || field->id != TD_FIELD_HIDDEN) {
+		fprintf(stderr, "Invalid field %s\n", argv[optind]);
+		goto usage;
+	}
+
+	value = argv[++optind];
+
+	/* exactly seven arguments: cargv is full and not NULL-terminated */
+	cargc = 0;
+	memset(cargv, 0, sizeof(cargv));
+	cargv[cargc++] = "set";
+	cargv[cargc++] = "-n";
+	cargv[cargc++] = name;
+	cargv[cargc++] = "-f";
+	cargv[cargc++] = field->name;
+	cargv[cargc++] = "-v";
+	cargv[cargc++] = value;
+	return vhd_util_set_field(cargc, cargv);
+
+ usage:
+	fprintf(stderr, "usage: td-util set %s [-h help] "
+		"<FILENAME> <FIELD> <VALUE>\n", td_disk_types[type]);
+	print_field_names();
+	return EINVAL;
+}
+
+/*
+ * td-util entry point: "td-util COMMAND [TYPE] [OPTIONS]".
+ *
+ * Resolves COMMAND and (when the command needs one) TYPE, builds a
+ * private argument vector whose element 0 is the command name followed
+ * by the remaining options, and dispatches to the matching td_*()
+ * handler.  The exit status is the absolute value of the handler's
+ * return code.
+ */
+int
+main(int argc, char *argv[])
+{
+	char **cargv;
+	struct command *cmd;
+	int cargc, i, type = -1, ret = 0;
+
+#ifdef CORE_DUMP
+	struct rlimit rlim;
+	rlim.rlim_cur = RLIM_INFINITY;
+	rlim.rlim_max = RLIM_INFINITY;
+	if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+		fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+
+	if (argc < 2)
+		help();
+
+	cargc = argc - 1;
+	cmd = get_command(argv[1]);
+	if (!cmd) {
+		fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+		help();		/* does not return */
+	}
+
+	if (cmd->needs_type) {
+		if (argc < 3) {
+			fprintf(stderr, "td-util %s requires a TYPE\n",
+				cmd->name);
+			print_disk_types();
+			exit(-1);
+		}
+
+		type = get_driver_type(argv[2]);
+		if (type < 0) {
+			fprintf(stderr, "invalid TYPE '%s'.\n", argv[2]);
+			print_disk_types();
+			exit(-1);
+		}
+		--cargc;
+	}
+
+	/* one extra slot so the vector is NULL-terminated like a real argv */
+	cargv = malloc(sizeof(char *) * (cargc + 1));
+	if (!cargv)
+		exit(ENOMEM);
+
+	cargv[0] = cmd->name;
+	for (i = 1; i < cargc; i++)
+		cargv[i] = argv[i + (argc - cargc)];
+	cargv[cargc] = NULL;
+
+	switch(cmd->id) {
+	case TD_CMD_CREATE:
+		ret = td_create(type, cargc, cargv);
+		break;
+	case TD_CMD_SNAPSHOT:
+		ret = td_snapshot(type, cargc, cargv);
+		break;
+/*
+	case TD_CMD_COALESCE:
+		ret = td_coalesce(type, cargc, cargv);
+		break;
+*/
+	case TD_CMD_QUERY:
+		ret = td_query(type, cargc, cargv);
+		break;
+/*
+	case TD_CMD_RESIZE:
+		ret = td_resize(type, cargc, cargv);
+		break;
+*/
+	case TD_CMD_SET:
+		ret = td_set_field(type, cargc, cargv);
+		break;
+/*
+	case TD_CMD_REPAIR:
+		ret = td_repair(type, cargc, cargv);
+		break;
+	case TD_CMD_FILL:
+		ret = td_fill(type, cargc, cargv);
+		break;
+	case TD_CMD_READ:
+		ret = td_read(type, cargc, cargv);
+		break;
+*/
+	default:
+	case TD_CMD_INVALID:
+		ret = EINVAL;
+		break;
+	}
+
+	free(cargv);
+
+	return (ret >= 0 ? ret : -ret);
+}
diff --git a/tools/blktap2/drivers/xmsnap b/tools/blktap2/drivers/xmsnap
new file mode 100644
index 0000000000..f14351ba56
--- /dev/null
+++ b/tools/blktap2/drivers/xmsnap
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+usage () { echo "USAGE: xmsnap <VM ID> <Backing File>"; }
+
+#
+# Check Usage
+#
+if [ -n "$1" ]
+then
+ vmid=$1
+else
+ usage
+ exit 1
+fi
+
+if [ -n "$2" ]
+then
+ target=$2
+else
+ usage
+ exit 1
+fi
+
+if [ -e "$target" ]
+then
+ echo "Creating snapshot of file $target for VM $vmid."
+else
+ usage
+ echo "File $target not found."
+ exit 1
+fi
+
+#
+# Find the snapshot name
+#
+directory=`dirname "$target"`
+target=`basename "$target"`
+
+let maxidx=0
+if [ -e $directory/${target}.snap1 ]
+then
+ for idx in $(ls $directory/${target}.snap*)
+ do
+ let idx=${idx#$directory/${target}.snap}
+ if [ "$idx" -gt "$maxidx" ]
+ then
+ let maxidx=$idx
+ fi
+ done
+fi
+
+snap=${target}.snap`expr $maxidx + 1`
+
+#
+# Pause VM
+#
+xm pause $vmid
+if [ "$?" -ne "0" ]; then
+ exit 1
+fi
+
+
+#
+# Snap and reposition the files
+#
+mv "$directory/$target" "$directory/$snap"
+if [ "$?" -ne "0" ]; then
+    # Never leave the VM paused when the snapshot could not be taken.
+    xm unpause "$vmid"
+    exit 1
+fi
+
+qcow-create 0 "$directory/$target" "$directory/$snap"
+if [ "$?" -ne "0" ]; then
+    # Put the original image back so the VM still has its disk.
+    mv "$directory/$snap" "$directory/$target"
+    xm unpause "$vmid"
+    exit 1
+fi
+
+#
+# Unpause
+#
+xm unpause "$vmid"
+
+exit \ No newline at end of file
diff --git a/tools/blktap2/include/Makefile b/tools/blktap2/include/Makefile
new file mode 100644
index 0000000000..7267eac53a
--- /dev/null
+++ b/tools/blktap2/include/Makefile
@@ -0,0 +1,14 @@
+XEN_ROOT := ../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+.PHONY: all
+all:
+
+.PHONY: install
+install:
+ $(INSTALL_DIR) -p $(DESTDIR)$(INCLUDEDIR)
+
+
+.PHONY: clean
+clean:
+ @:
diff --git a/tools/blktap2/include/atomicio.h b/tools/blktap2/include/atomicio.h
new file mode 100644
index 0000000000..7eccf206b3
--- /dev/null
+++ b/tools/blktap2/include/atomicio.h
@@ -0,0 +1,33 @@
+/* $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $ */
+
+/*
+ * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Ensure all of data on socket comes through. f==read || f==vwrite
+ */
+size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t);
+
+#define vwrite (ssize_t (*)(int, void *, size_t))write
diff --git a/tools/blktap2/include/blktaplib.h b/tools/blktap2/include/blktaplib.h
new file mode 100644
index 0000000000..1824afa943
--- /dev/null
+++ b/tools/blktap2/include/blktaplib.h
@@ -0,0 +1,249 @@
+/* blktaplib.h
+ *
+ * Blktap library userspace code.
+ *
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __BLKTAPLIB_H__
+#define __BLKTAPLIB_H__
+
+#include <syslog.h>
+#include <xenctrl.h>
+#include <xen/io/blkif.h>
+
+#if 1
+#define DPRINTF(_f, _a...) syslog(LOG_INFO, _f, ##_a)
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+
+#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, XC_PAGE_SIZE)
+
+/* size of the extra VMA area to map in attached pages. */
+#define BLKTAP_VMA_PAGES BLK_RING_SIZE
+
+/* blktap IOCTLs: These must correspond with the blktap driver ioctls */
+#define BLKTAP_IOCTL_KICK_FE 1
+#define BLKTAP_IOCTL_KICK_BE 2
+#define BLKTAP_IOCTL_SETMODE 3
+#define BLKTAP_IOCTL_SENDPID 4
+#define BLKTAP_IOCTL_NEWINTF 5
+#define BLKTAP_IOCTL_MINOR 6
+#define BLKTAP_IOCTL_MAJOR 7
+#define BLKTAP_QUERY_ALLOC_REQS 8
+#define BLKTAP_IOCTL_FREEINTF 9
+#define BLKTAP_IOCTL_PRINT_IDXS 100
+#define BLKTAP_IOCTL_BACKDEV_SETUP 200
+
+#define PRIO_SPECIAL_IO -9999
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
+#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
+#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE 0x00000002
+
+#define BLKTAP_MODE_INTERPOSE \
+ (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+/*
+ * Return non-zero when 'arg' is one of the accepted switching modes:
+ * BLKTAP_MODE_PASSTHROUGH, BLKTAP_MODE_INTERCEPT_FE or
+ * BLKTAP_MODE_INTERPOSE.  BLKTAP_MODE_INTERCEPT_BE on its own is not
+ * accepted.
+ */
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+ return (
+ ( arg == BLKTAP_MODE_PASSTHROUGH ) ||
+ ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
+ ( arg == BLKTAP_MODE_INTERPOSE ) );
+}
+
+#define MAX_REQUESTS BLK_RING_SIZE
+
+#define BLKTAP_IOCTL_KICK 1
+#define MAX_PENDING_REQS BLK_RING_SIZE
+#define BLKTAP_DEV_DIR "/dev/xen"
+#define BLKTAP_DEV_NAME "blktap"
+#define BACKDEV_NAME "backdev"
+#define BLKTAP_DEV_MINOR 0
+#define BLKTAP_CTRL_DIR "/var/run/tap"
+
+extern int blktap_major;
+
+#define BLKTAP_RING_PAGES 1 /* Front */
+#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES)
+
+struct blkif;
+struct blkif_info;
+
+typedef struct {
+ blkif_request_t req;
+ int submitting;
+ int secs_pending;
+ int16_t status;
+ int num_retries;
+ struct timeval last_try;
+} pending_req_t;
+
+typedef struct blkif {
+ domid_t domid;
+ long int handle;
+
+ long int pdev;
+ long int readonly;
+
+ enum { DISCONNECTED, DISCONNECTING, CONNECTED } state;
+
+ struct blkif_ops *ops;
+ struct blkif *hash_next;
+
+ void *prv; /* device-specific data */
+ struct blkif_info *info; /*Image parameter passing */
+ pending_req_t pending_list[MAX_REQUESTS];
+ int devnum;
+ int fds[2];
+ int be_id;
+ char *backend_path;
+ int major;
+ int minor;
+ pid_t tappid;
+ int drivertype;
+ uint16_t cookie;
+ int err;
+} blkif_t;
+
+typedef struct blkif_info {
+ char *params;
+ int readonly;
+ int storage;
+} blkif_info_t;
+
+typedef struct tapdev_info {
+ int fd;
+ char *mem;
+ blkif_sring_t *sring;
+ blkif_back_ring_t fe_ring;
+ unsigned long vstart;
+ blkif_t *blkif;
+} tapdev_info_t;
+
+typedef struct domid_translate {
+ unsigned short domid;
+ unsigned short busid;
+} domid_translate_t ;
+
+typedef struct image {
+ unsigned long long size;
+ unsigned long secsize;
+ unsigned int info;
+} image_t;
+
+typedef struct msg_hdr {
+ uint16_t type;
+ uint16_t len;
+ uint16_t drivertype;
+ uint16_t cookie;
+} msg_hdr_t;
+
+typedef struct msg_params {
+ uint8_t readonly;
+ int path_off;
+ int path_len;
+ int storage;
+} msg_params_t;
+
+typedef struct msg_newdev {
+ uint8_t devnum;
+ uint16_t domid;
+} msg_newdev_t;
+
+typedef struct msg_pid {
+ pid_t pid;
+} msg_pid_t;
+
+typedef struct msg_cp {
+ int cp_uuid_off;
+ int cp_uuid_len;
+ int cp_drivertype;
+} msg_cp_t;
+
+typedef struct msg_lock {
+ int ro;
+ int enforce;
+ int uuid_off;
+ int uuid_len;
+} msg_lock_t;
+
+#define READ 0
+#define WRITE 1
+
+/*Control Messages between manager and tapdev*/
+#define CTLMSG_PARAMS 1
+#define CTLMSG_IMG 2
+#define CTLMSG_IMG_FAIL 3
+#define CTLMSG_NEWDEV 4
+#define CTLMSG_NEWDEV_RSP 5
+#define CTLMSG_NEWDEV_FAIL 6
+#define CTLMSG_CLOSE 7
+#define CTLMSG_CLOSE_RSP 8
+#define CTLMSG_PID 9
+#define CTLMSG_PID_RSP 10
+#define CTLMSG_CHECKPOINT 11
+#define CTLMSG_CHECKPOINT_RSP 12
+#define CTLMSG_LOCK 13
+#define CTLMSG_LOCK_RSP 14
+#define CTLMSG_PAUSE 15
+#define CTLMSG_PAUSE_RSP 16
+#define CTLMSG_RESUME 17
+#define CTLMSG_RESUME_RSP 18
+
+#define TAPDISK_STORAGE_TYPE_NFS 1
+#define TAPDISK_STORAGE_TYPE_EXT 2
+#define TAPDISK_STORAGE_TYPE_LVM 3
+#define TAPDISK_STORAGE_TYPE_DEFAULT TAPDISK_STORAGE_TYPE_EXT
+
+/* Arbitrary values, must match the underlying driver... */
+#define MAX_TAP_DEV 256
+
+/* Accessing attached data page mappings */
+#define MMAP_PAGES \
+ (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_vstart,_req,_seg) \
+ ((_vstart) + \
+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) + \
+ ((_seg) * getpagesize()))
+
+/* Defines that are only used by library clients */
+
+#ifndef __COMPILING_BLKTAP_LIB
+
+static char *blkif_op_name[] = {
+ [BLKIF_OP_READ] = "READ",
+ [BLKIF_OP_WRITE] = "WRITE",
+};
+
+#endif /* __COMPILING_BLKTAP_LIB */
+
+#endif /* __BLKTAPLIB_H__ */
diff --git a/tools/blktap2/include/libvhd-journal.h b/tools/blktap2/include/libvhd-journal.h
new file mode 100644
index 0000000000..2f32ff02ca
--- /dev/null
+++ b/tools/blktap2/include/libvhd-journal.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_JOURNAL_H_
+#define _VHD_JOURNAL_H_
+
+#include <inttypes.h>
+
+#include "libvhd.h"
+
+#define VHD_JOURNAL_METADATA 0x01
+#define VHD_JOURNAL_DATA 0x02
+
+#define VHD_JOURNAL_HEADER_COOKIE "vjournal"
+#define VHD_JOURNAL_ENTRY_COOKIE 0xaaaa12344321aaaa
+
+typedef struct vhd_journal_header { /* header record of a VHD journal */
+ char cookie[8]; /* presumably holds VHD_JOURNAL_HEADER_COOKIE ("vjournal", 8 chars) -- confirm */
+ uuid_t uuid;
+ uint64_t vhd_footer_offset;
+ uint32_t journal_data_entries;
+ uint32_t journal_metadata_entries;
+ uint64_t journal_data_offset;
+ uint64_t journal_metadata_offset;
+ uint64_t journal_eof;
+ char pad[448]; /* NOTE(review): 8+16+8+4+4+8+8+8+448 = 512 bytes if uuid_t is 16 bytes -- confirm */
+} vhd_journal_header_t;
+
+typedef struct vhd_journal {
+ char *jname;
+ int jfd;
+ int is_block; /* is jfd a block device */
+ vhd_journal_header_t header;
+ vhd_context_t vhd;
+} vhd_journal_t;
+
+int vhd_journal_create(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_open(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_add_block(vhd_journal_t *, uint32_t block, char mode);
+int vhd_journal_commit(vhd_journal_t *);
+int vhd_journal_revert(vhd_journal_t *);
+int vhd_journal_close(vhd_journal_t *);
+int vhd_journal_remove(vhd_journal_t *);
+
+#endif
diff --git a/tools/blktap2/include/libvhd.h b/tools/blktap2/include/libvhd.h
new file mode 100644
index 0000000000..b128ebaf38
--- /dev/null
+++ b/tools/blktap2/include/libvhd.h
@@ -0,0 +1,308 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_LIB_H_
+#define _VHD_LIB_H_
+
+#include <string.h>
+#include <endian.h>
+#include <byteswap.h>
+#include <uuid/uuid.h>
+
+#include "vhd.h"
+
+#if BYTE_ORDER == LITTLE_ENDIAN /* BEnn_IN/OUT(ptr): byte-swap *ptr in place between big-endian and host order */
+ #define BE16_IN(foo) (*(foo)) = bswap_16(*(foo))
+ #define BE32_IN(foo) (*(foo)) = bswap_32(*(foo))
+ #define BE64_IN(foo) (*(foo)) = bswap_64(*(foo))
+ #define BE16_OUT(foo) (*(foo)) = bswap_16(*(foo))
+ #define BE32_OUT(foo) (*(foo)) = bswap_32(*(foo))
+ #define BE64_OUT(foo) (*(foo)) = bswap_64(*(foo))
+#else /* big-endian host: values are already in host order, so every macro expands to nothing */
+ #define BE16_IN(foo)
+ #define BE32_IN(foo)
+ #define BE64_IN(foo)
+ #define BE16_OUT(foo)
+ #define BE32_OUT(foo)
+ #define BE64_OUT(foo)
+#endif
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+#define VHD_MAX_NAME_LEN 1024
+
+#define VHD_BLOCK_SHIFT 21
+#define VHD_BLOCK_SIZE (1ULL << VHD_BLOCK_SHIFT)
+
+#define UTF_16 "UTF-16"
+#define UTF_16LE "UTF-16LE"
+#define UTF_16BE "UTF-16BE"
+
+#define VHD_OPEN_RDONLY 0x00001
+#define VHD_OPEN_RDWR 0x00002
+#define VHD_OPEN_FAST 0x00004
+#define VHD_OPEN_STRICT 0x00008
+#define VHD_OPEN_IGNORE_DISABLED 0x00010
+
+#define VHD_FLAG_CREAT_PARENT_RAW 0x00001
+
+#define vhd_flag_set(word, flag) ((word) |= (flag))
+#define vhd_flag_clear(word, flag) ((word) &= ~(flag))
+#define vhd_flag_test(word, flag) ((word) & (flag))
+
+
+#define ENABLE_FAILURE_TESTING
+#define FAIL_REPARENT_BEGIN 0
+#define FAIL_REPARENT_LOCATOR 1
+#define FAIL_REPARENT_END 2
+#define FAIL_RESIZE_BEGIN 3
+#define FAIL_RESIZE_DATA_MOVED 4
+#define FAIL_RESIZE_METADATA_MOVED 5
+#define FAIL_RESIZE_END 6
+#define NUM_FAIL_TESTS 7
+
+#ifdef ENABLE_FAILURE_TESTING /* fault injection: TEST_FAIL_AT(p) prints and exit(EINVAL)s when TEST_FAIL[p] is nonzero */
+#define TEST_FAIL_AT(point) \
+ if (TEST_FAIL[point]) { \
+ printf("Failing at %s\n", ENV_VAR_FAIL[point]); exit(EINVAL); }
+#define TEST_FAIL_EXTERN_VARS \
+ extern const char* ENV_VAR_FAIL[]; \
+ extern int TEST_FAIL[];
+#else /* NOTE(review): the enabled TEST_FAIL_AT is a bare `if` (no do/while(0)); unsafe directly before an `else` */
+#define TEST_FAIL_AT(point)
+#define TEST_FAIL_EXTERN_VARS
+#endif // ENABLE_FAILURE_TESTING
+
+
+static const char VHD_POISON_COOKIE[] = "v_poison";
+
+typedef struct hd_ftr vhd_footer_t;
+typedef struct dd_hdr vhd_header_t;
+typedef struct vhd_bat vhd_bat_t;
+typedef struct vhd_batmap vhd_batmap_t;
+typedef struct dd_batmap_hdr vhd_batmap_header_t;
+typedef struct prt_loc vhd_parent_locator_t;
+typedef struct vhd_context vhd_context_t;
+typedef uint32_t vhd_flag_creat_t;
+
+struct vhd_bat {
+ uint32_t spb;
+ uint32_t entries;
+ uint32_t *bat;
+};
+
+struct vhd_batmap {
+ vhd_batmap_header_t header;
+ char *map;
+};
+
+struct vhd_context {
+ int fd;
+ char *file;
+ int oflags;
+ int is_block;
+
+ uint32_t spb;
+ uint32_t bm_secs;
+
+ vhd_header_t header;
+ vhd_footer_t footer;
+ vhd_bat_t bat;
+ vhd_batmap_t batmap;
+};
+
+static inline uint32_t secs_round_up(uint64_t bytes) /* sectors needed to hold @bytes */
+{
+ uint64_t padded = bytes + VHD_SECTOR_SIZE - 1; /* push up to the next sector boundary */
+ return (uint32_t)(padded >> VHD_SECTOR_SHIFT); /* whole sectors, narrowed to 32 bits */
+}
+
+static inline uint32_t secs_round_up_no_zero(uint64_t bytes) /* like secs_round_up(), but never 0 */
+{
+ uint32_t secs = secs_round_up(bytes);
+ return secs ? secs : 1; /* a zero count is reported as one sector */
+}
+
+static inline uint64_t vhd_sectors_to_bytes(uint64_t sectors) /* sector count -> byte count */
+{
+ uint64_t bytes = sectors << VHD_SECTOR_SHIFT; /* bits shifted past 64 are dropped */
+ return bytes;
+}
+
+static inline uint64_t vhd_bytes_padded(uint64_t bytes) /* @bytes rounded up to whole sectors, at least one */
+{
+ uint32_t secs = secs_round_up_no_zero(bytes);
+ return vhd_sectors_to_bytes(secs);
+}
+
+/* Nonzero when the footer type is HD_TYPE_DYNAMIC or HD_TYPE_DIFF. */
+static inline int vhd_type_dynamic(vhd_context_t *ctx)
+{
+ uint32_t type = ctx->footer.type;
+ return (type == HD_TYPE_DYNAMIC || type == HD_TYPE_DIFF);
+}
+
+static inline int vhd_creator_tapdisk(vhd_context_t *ctx) /* does footer.crtr_app start with "tap"? */
+{
+ const char *app = ctx->footer.crtr_app;
+ return (strncmp(app, "tap", 3) == 0);
+}
+
+/* Nonzero when the footer cookie matches VHD_POISON_COOKIE over the cookie's full width. */
+static inline int vhd_disabled(vhd_context_t *ctx)
+{
+ const char *cookie = ctx->footer.cookie;
+ return (memcmp(cookie, VHD_POISON_COOKIE, sizeof(ctx->footer.cookie)) == 0);
+}
+
+/*
+ * Size in bytes of the space reserved for a parent locator.
+ * MICROSOFT_COMPAT: data_space *should* be a sector count, but it is
+ * sometimes found as a byte count.  Values below 512 are treated as
+ * sectors, multiples of 512 as bytes; anything else yields 0.
+ */
+static inline size_t vhd_parent_locator_size(vhd_parent_locator_t *loc)
+{
+ uint32_t space = loc->data_space;
+
+ if (space < 512)
+ return vhd_sectors_to_bytes(space);
+
+ return (space % 512 == 0) ? space : 0;
+}
+
+static inline int vhd_parent_raw(vhd_context_t *ctx) /* is the header's parent UUID null? */
+{
+ int null_uuid = uuid_is_null(ctx->header.prt_uuid);
+ return null_uuid; /* a null parent UUID is what this helper reports as a raw parent */
+}
+
+void libvhd_set_log_level(int);
+
+int vhd_test_file_fixed(const char *, int *);
+
+uint32_t vhd_time(time_t time);
+size_t vhd_time_to_string(uint32_t timestamp, char *target);
+uint32_t vhd_chs(uint64_t size);
+
+uint32_t vhd_checksum_footer(vhd_footer_t *);
+uint32_t vhd_checksum_header(vhd_header_t *);
+uint32_t vhd_checksum_batmap(vhd_batmap_t *);
+
+void vhd_footer_in(vhd_footer_t *);
+void vhd_footer_out(vhd_footer_t *);
+void vhd_header_in(vhd_header_t *);
+void vhd_header_out(vhd_header_t *);
+void vhd_bat_in(vhd_bat_t *);
+void vhd_bat_out(vhd_bat_t *);
+void vhd_batmap_header_in(vhd_batmap_t *);
+void vhd_batmap_header_out(vhd_batmap_t *);
+
+int vhd_validate_footer(vhd_footer_t *footer);
+int vhd_validate_header(vhd_header_t *header);
+int vhd_validate_batmap_header(vhd_batmap_t *batmap);
+int vhd_validate_batmap(vhd_batmap_t *batmap);
+int vhd_validate_platform_code(uint32_t code);
+
+int vhd_open(vhd_context_t *, const char *file, int flags);
+void vhd_close(vhd_context_t *);
+int vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t);
+/* vhd_snapshot: the bytes parameter is optional and can be 0 if the snapshot
+ * is to have the same size as the (first non-empty) parent */
+int vhd_snapshot(const char *snapshot, uint64_t bytes, const char *parent,
+ vhd_flag_creat_t);
+
+int vhd_hidden(vhd_context_t *, int *);
+int vhd_chain_depth(vhd_context_t *, int *);
+
+off64_t vhd_position(vhd_context_t *);
+int vhd_seek(vhd_context_t *, off64_t, int);
+int vhd_read(vhd_context_t *, void *, size_t);
+int vhd_write(vhd_context_t *, void *, size_t);
+
+int vhd_offset(vhd_context_t *, uint32_t, uint32_t *);
+
+int vhd_end_of_headers(vhd_context_t *ctx, off64_t *off);
+int vhd_end_of_data(vhd_context_t *ctx, off64_t *off);
+int vhd_batmap_header_offset(vhd_context_t *ctx, off64_t *off);
+
+int vhd_get_header(vhd_context_t *);
+int vhd_get_footer(vhd_context_t *);
+int vhd_get_bat(vhd_context_t *);
+int vhd_get_batmap(vhd_context_t *);
+
+void vhd_put_header(vhd_context_t *);
+void vhd_put_footer(vhd_context_t *);
+void vhd_put_bat(vhd_context_t *);
+void vhd_put_batmap(vhd_context_t *);
+
+int vhd_has_batmap(vhd_context_t *);
+int vhd_batmap_test(vhd_context_t *, vhd_batmap_t *, uint32_t);
+void vhd_batmap_set(vhd_context_t *, vhd_batmap_t *, uint32_t);
+void vhd_batmap_clear(vhd_context_t *, vhd_batmap_t *, uint32_t);
+
+int vhd_get_phys_size(vhd_context_t *, off64_t *);
+int vhd_set_phys_size(vhd_context_t *, off64_t);
+
+int vhd_bitmap_test(vhd_context_t *, char *, uint32_t);
+void vhd_bitmap_set(vhd_context_t *, char *, uint32_t);
+void vhd_bitmap_clear(vhd_context_t *, char *, uint32_t);
+
+int vhd_parent_locator_count(vhd_context_t *);
+int vhd_parent_locator_get(vhd_context_t *, char **);
+int vhd_parent_locator_read(vhd_context_t *, vhd_parent_locator_t *, char **);
+int vhd_find_parent(vhd_context_t *, const char *, char **);
+int vhd_parent_locator_write_at(vhd_context_t *, const char *,
+ off64_t, uint32_t, size_t,
+ vhd_parent_locator_t *);
+
+int vhd_header_decode_parent(vhd_context_t *, vhd_header_t *, char **);
+int vhd_change_parent(vhd_context_t *, char *parent_path, int raw);
+
+int vhd_read_footer(vhd_context_t *, vhd_footer_t *);
+int vhd_read_footer_at(vhd_context_t *, vhd_footer_t *, off64_t);
+int vhd_read_footer_strict(vhd_context_t *, vhd_footer_t *);
+int vhd_read_header(vhd_context_t *, vhd_header_t *);
+int vhd_read_header_at(vhd_context_t *, vhd_header_t *, off64_t);
+int vhd_read_bat(vhd_context_t *, vhd_bat_t *);
+int vhd_read_batmap(vhd_context_t *, vhd_batmap_t *);
+int vhd_read_bitmap(vhd_context_t *, uint32_t block, char **bufp);
+int vhd_read_block(vhd_context_t *, uint32_t block, char **bufp);
+
+int vhd_write_footer(vhd_context_t *, vhd_footer_t *);
+int vhd_write_footer_at(vhd_context_t *, vhd_footer_t *, off64_t);
+int vhd_write_header(vhd_context_t *, vhd_header_t *);
+int vhd_write_header_at(vhd_context_t *, vhd_header_t *, off64_t);
+int vhd_write_bat(vhd_context_t *, vhd_bat_t *);
+int vhd_write_batmap(vhd_context_t *, vhd_batmap_t *);
+int vhd_write_bitmap(vhd_context_t *, uint32_t block, char *bitmap);
+int vhd_write_block(vhd_context_t *, uint32_t block, char *data);
+
+int vhd_io_read(vhd_context_t *, char *, uint64_t, uint32_t);
+int vhd_io_write(vhd_context_t *, char *, uint64_t, uint32_t);
+
+#endif
diff --git a/tools/blktap2/include/list.h b/tools/blktap2/include/list.h
new file mode 100644
index 0000000000..03a524be01
--- /dev/null
+++ b/tools/blktap2/include/list.h
@@ -0,0 +1,93 @@
+/*
+ * list.h
+ *
+ * This is a subset of linux's list.h intended to be used in user-space.
+ *
+ */
+
+#ifndef __LIST_H__
+#define __LIST_H__
+
+#define LIST_POISON1 ((void *) 0x00100100)
+#define LIST_POISON2 ((void *) 0x00200200)
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+ struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list) /* reset @list to the empty state */
+{
+ list->next = list; /* an empty list links to itself in both directions */
+ list->prev = list;
+}
+
+static inline void __list_add(struct list_head *new, /* entry being linked in */
+ struct list_head *prev, /* entry that will precede @new */
+ struct list_head *next) /* entry that will follow @new */
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+static inline void list_add(struct list_head *new, struct list_head *head) /* insert @new right after @head */
+{
+ __list_add(new, head, head->next); /* @new lands between @head and its old first entry */
+}
+
+static inline void list_add_tail(struct list_head *new, struct list_head *head) /* insert @new right before @head */
+{
+ __list_add(new, head->prev, head); /* @new lands between the old last entry and @head */
+}
+
+static inline void __list_del(struct list_head * prev, struct list_head * next) /* drop whatever sat between @prev and @next */
+{
+ next->prev = prev; /* the two neighbours now point straight at each other */
+ prev->next = next;
+}
+
+static inline void list_del(struct list_head *entry) /* unlink @entry from the list it is on */
+{
+ __list_del(entry->prev, entry->next);
+ entry->next = LIST_POISON1; /* overwrite the stale links with the LIST_POISON markers */
+ entry->prev = LIST_POISON2;
+}
+
+static inline void list_del_init(struct list_head *entry) /* unlink @entry and leave it as an empty list */
+{
+ __list_del(entry->prev, entry->next);
+ INIT_LIST_HEAD(entry); /* self-linked again instead of poisoned */
+}
+
+static inline int list_empty(const struct list_head *head) /* nonzero when @head has no entries */
+{
+ return head->next == head; /* the head points back at itself */
+}
+
+static inline int list_is_last(const struct list_head *list, /* entry to test */
+ const struct list_head *head) /* head of the list being checked */
+{
+ return list->next == head; /* nonzero when @list's successor is the head */
+}
+
+#define list_entry(ptr, type, member) \
+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) /* @ptr minus the offset of @member inside @type */
+
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member)) /* pos->member.next is read after the body, so the body must not unlink @pos */
+
+#define list_for_each_entry_safe(pos, n, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member), \
+ n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member)) /* @n is fetched before the body runs, so the body may unlink @pos */
+
+#endif /* __LIST_H__ */
diff --git a/tools/blktap2/include/lvm-util.h b/tools/blktap2/include/lvm-util.h
new file mode 100644
index 0000000000..95f3320334
--- /dev/null
+++ b/tools/blktap2/include/lvm-util.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LVM_UTIL_H_
+#define _LVM_UTIL_H_
+
+#include <inttypes.h>
+
+#define MAX_NAME_SIZE 256
+
+#define LVM_SEG_TYPE_LINEAR 1
+#define LVM_SEG_TYPE_UNKNOWN 2
+
+struct lv_segment {
+ uint8_t type;
+ char device[MAX_NAME_SIZE];
+ uint64_t pe_start;
+ uint64_t pe_size;
+};
+
+struct lv {
+ char name[MAX_NAME_SIZE];
+ uint64_t size;
+ uint32_t segments;
+ struct lv_segment first_segment;
+};
+
+struct pv {
+ char name[MAX_NAME_SIZE];
+ uint64_t start;
+};
+
+struct vg {
+ char name[MAX_NAME_SIZE];
+ uint64_t extent_size;
+
+ int pv_cnt;
+ struct pv *pvs;
+
+ int lv_cnt;
+ struct lv *lvs;
+};
+
+int lvm_scan_vg(const char *vg_name, struct vg *vg);
+void lvm_free_vg(struct vg *vg);
+
+#endif
diff --git a/tools/blktap2/include/relative-path.h b/tools/blktap2/include/relative-path.h
new file mode 100644
index 0000000000..d78f94d023
--- /dev/null
+++ b/tools/blktap2/include/relative-path.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _RELATIVE_PATH_H_
+#define _RELATIVE_PATH_H_
+
+#include <syslog.h>
+
+#define DELIMITER '/'
+#define MAX_NAME_LEN 1000
+
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+
+/*
+ * returns a relative path from @src to @dest
+ * result should be freed
+ */
+char *relative_path_to(char *src, char *dest, int *err);
+
+#endif
diff --git a/tools/blktap2/include/tapdisk-message.h b/tools/blktap2/include/tapdisk-message.h
new file mode 100644
index 0000000000..1a86dcb6a3
--- /dev/null
+++ b/tools/blktap2/include/tapdisk-message.h
@@ -0,0 +1,141 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_MESSAGE_H_
+#define _TAPDISK_MESSAGE_H_
+
+#include <inttypes.h>
+
+#define TAPDISK_MESSAGE_MAX_PATH_LENGTH 256
+#define TAPDISK_MESSAGE_STRING_LENGTH 256
+
+#define TAPDISK_MESSAGE_FLAG_SHARED 0x01
+#define TAPDISK_MESSAGE_FLAG_RDONLY 0x02
+#define TAPDISK_MESSAGE_FLAG_ADD_CACHE 0x04
+#define TAPDISK_MESSAGE_FLAG_VHD_INDEX 0x08
+#define TAPDISK_MESSAGE_FLAG_LOG_DIRTY 0x10
+
+typedef struct tapdisk_message tapdisk_message_t;
+typedef uint8_t tapdisk_message_flag_t;
+typedef struct tapdisk_message_image tapdisk_message_image_t;
+typedef struct tapdisk_message_params tapdisk_message_params_t;
+typedef struct tapdisk_message_string tapdisk_message_string_t;
+
+struct tapdisk_message_params {
+ tapdisk_message_flag_t flags;
+
+ uint8_t storage;
+ uint32_t devnum;
+ uint32_t domid;
+ uint16_t path_len;
+ char path[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+};
+
+struct tapdisk_message_image {
+ uint64_t sectors;
+ uint32_t sector_size;
+ uint32_t info;
+};
+
+struct tapdisk_message_string {
+ char text[TAPDISK_MESSAGE_STRING_LENGTH];
+};
+
+struct tapdisk_message {
+ uint16_t type;
+ uint16_t cookie;
+ uint16_t drivertype;
+
+ union {
+ pid_t tapdisk_pid;
+ tapdisk_message_image_t image;
+ tapdisk_message_params_t params;
+ tapdisk_message_string_t string;
+ } u;
+};
+
+enum tapdisk_message_id {
+ TAPDISK_MESSAGE_ERROR = 1,
+ TAPDISK_MESSAGE_RUNTIME_ERROR,
+ TAPDISK_MESSAGE_PID,
+ TAPDISK_MESSAGE_PID_RSP,
+ TAPDISK_MESSAGE_OPEN,
+ TAPDISK_MESSAGE_OPEN_RSP,
+ TAPDISK_MESSAGE_PAUSE,
+ TAPDISK_MESSAGE_PAUSE_RSP,
+ TAPDISK_MESSAGE_RESUME,
+ TAPDISK_MESSAGE_RESUME_RSP,
+ TAPDISK_MESSAGE_CLOSE,
+ TAPDISK_MESSAGE_CLOSE_RSP,
+ TAPDISK_MESSAGE_EXIT,
+};
+
+/* Return a short human-readable name for message type @id;
+ * ids with no entry below map to "unknown". */
+static inline char *
+tapdisk_message_name(enum tapdisk_message_id id)
+{
+ switch (id) {
+ case TAPDISK_MESSAGE_ERROR:
+ return "error";
+
+ case TAPDISK_MESSAGE_RUNTIME_ERROR:
+ return "runtime error";
+
+ case TAPDISK_MESSAGE_PID:
+ return "pid";
+ case TAPDISK_MESSAGE_PID_RSP:
+ return "pid response";
+
+ case TAPDISK_MESSAGE_OPEN:
+ return "open";
+ case TAPDISK_MESSAGE_OPEN_RSP:
+ return "open response";
+
+ case TAPDISK_MESSAGE_PAUSE:
+ return "pause";
+ case TAPDISK_MESSAGE_PAUSE_RSP:
+ return "pause response";
+
+ case TAPDISK_MESSAGE_RESUME:
+ return "resume";
+ case TAPDISK_MESSAGE_RESUME_RSP:
+ return "resume response";
+
+ case TAPDISK_MESSAGE_CLOSE:
+ return "close";
+ case TAPDISK_MESSAGE_CLOSE_RSP:
+ return "close response";
+
+ case TAPDISK_MESSAGE_EXIT:
+ return "exit";
+
+ default:
+ return "unknown";
+ }
+}
+
+#endif
diff --git a/tools/blktap2/include/vhd-util.h b/tools/blktap2/include/vhd-util.h
new file mode 100644
index 0000000000..11f077e2bf
--- /dev/null
+++ b/tools/blktap2/include/vhd-util.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_UTIL_H_
+#define _VHD_UTIL_H_
+
+int vhd_util_create(int argc, char **argv);
+int vhd_util_snapshot(int argc, char **argv);
+int vhd_util_query(int argc, char **argv);
+int vhd_util_read(int argc, char **argv);
+int vhd_util_set_field(int argc, char **argv);
+int vhd_util_repair(int argc, char **argv);
+int vhd_util_fill(int argc, char **argv);
+int vhd_util_resize(int argc, char **argv);
+int vhd_util_coalesce(int argc, char **argv);
+int vhd_util_modify(int argc, char **argv);
+int vhd_util_scan(int argc, char **argv);
+int vhd_util_check(int argc, char **argv);
+int vhd_util_revert(int argc, char **argv);
+
+#endif
diff --git a/tools/blktap2/include/vhd.h b/tools/blktap2/include/vhd.h
new file mode 100644
index 0000000000..4da5f86668
--- /dev/null
+++ b/tools/blktap2/include/vhd.h
@@ -0,0 +1,221 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __VHD_H__
+#define __VHD_H__
+
+#include <asm/types.h>
+#include <uuid/uuid.h>
+#include <inttypes.h>
+
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#define DEBUG 1
+
+/* ---------------------------------------------------------------------- */
+/* General definitions. */
+/* ---------------------------------------------------------------------- */
+
+#define VHD_SECTOR_SIZE 512
+#define VHD_SECTOR_SHIFT 9
+
+/* ---------------------------------------------------------------------- */
+/* This is the generic disk footer, used by all disks. */
+/* ---------------------------------------------------------------------- */
+
+struct hd_ftr {
+ char cookie[8]; /* Identifies original creator of the disk */
+ u32 features; /* Feature Support -- see below */
+ u32 ff_version; /* (major,minor) version of disk file */
+ u64 data_offset; /* Abs. offset from SOF to next structure */
+ u32 timestamp; /* Creation time. secs since 1/1/2000GMT */
+ char crtr_app[4]; /* Creator application */
+ u32 crtr_ver; /* Creator version (major,minor) */
+ u32 crtr_os; /* Creator host OS */
+ u64 orig_size; /* Size at creation (bytes) */
+ u64 curr_size; /* Current size of disk (bytes) */
+ u32 geometry; /* Disk geometry */
+ u32 type; /* Disk type */
+ u32 checksum; /* 1's comp sum of this struct. */
+ uuid_t uuid; /* Unique disk ID, used for naming parents */
+ char saved; /* one-bit -- is this disk/VM in a saved state? */
+ char hidden; /* tapdisk-specific field: is this vdi hidden? */
+ char reserved[426]; /* padding */
+};
+
+/* VHD cookie string. */
+static const char HD_COOKIE[9] = "conectix";
+
+/* Feature fields in hd_ftr */
+#define HD_NO_FEATURES 0x00000000
+#define HD_TEMPORARY 0x00000001 /* disk can be deleted on shutdown */
+#define HD_RESERVED 0x00000002 /* NOTE: must always be set */
+
+/* Version field in hd_ftr */
+#define HD_FF_VERSION 0x00010000
+
+/* Known creator OS type fields in hd_ftr.crtr_os */
+#define HD_CR_OS_WINDOWS 0x5769326B /* (Wi2k) */
+#define HD_CR_OS_MACINTOSH 0x4D616320 /* (Mac ) */
+
+/*
+ * version 0.1: little endian bitmaps
+ * version 1.1: big endian bitmaps; batmap
+ * version 1.2: libvhd
+ * version 1.3: batmap version bump to 1.2
+ */
+#define VHD_VERSION(major, minor) (((major) << 16) | ((minor) & 0x0000FFFF))
+#define VHD_CURRENT_VERSION VHD_VERSION(1, 3)
+
+/* Disk geometry accessor macros. */
+/* Geometry is a triple of (cylinders (2 bytes), tracks (1 byte), and
+ * sectors-per-track (1 byte))
+ */
+#define GEOM_GET_CYLS(_g) (((_g) >> 16) & 0xffff)
+#define GEOM_GET_HEADS(_g) (((_g) >> 8) & 0xff)
+#define GEOM_GET_SPT(_g) ((_g) & 0xff)
+
+#define GEOM_ENCODE(_c, _h, _s) (((_c) << 16) | ((_h) << 8) | (_s))
+
+/* type field in hd_ftr */
+#define HD_TYPE_NONE 0
+#define HD_TYPE_FIXED 2 /* fixed-allocation disk */
+#define HD_TYPE_DYNAMIC 3 /* dynamic disk */
+#define HD_TYPE_DIFF 4 /* differencing disk */
+
+/* String table for hd.type */
+static const char *HD_TYPE_STR[7] = {
+ "None", /* 0 */
+ "Reserved (deprecated)", /* 1 */
+ "Fixed hard disk", /* 2 */
+ "Dynamic hard disk", /* 3 */
+ "Differencing hard disk", /* 4 */
+ "Reserved (deprecated)", /* 5 */
+ "Reserved (deprecated)" /* 6 */
+};
+
+#define HD_TYPE_MAX 6
+
+struct prt_loc {
+ u32 code; /* Platform code -- see defines below. */
+ u32 data_space; /* Number of 512-byte sectors to store locator */
+ u32 data_len; /* Actual length of parent locator in bytes */
+ u32 res; /* Must be zero */
+ u64 data_offset; /* Absolute offset of locator data (bytes) */
+};
+
+/* Platform Codes */
+#define PLAT_CODE_NONE 0x0
+#define PLAT_CODE_WI2R 0x57693272 /* deprecated */
+#define PLAT_CODE_WI2K 0x5769326B /* deprecated */
+#define PLAT_CODE_W2RU 0x57327275 /* Windows relative path (UTF-16) */
+#define PLAT_CODE_W2KU 0x57326B75 /* Windows absolute path (UTF-16) */
+#define PLAT_CODE_MAC 0x4D616320 /* MacOS alias stored as a blob. */
+#define PLAT_CODE_MACX 0x4D616358 /* File URL (UTF-8), see RFC 2396. */
+
+/* ---------------------------------------------------------------------- */
+/* This is the dynamic disk header. */
+/* ---------------------------------------------------------------------- */
+
+struct dd_hdr {
+ char cookie[8]; /* Should contain "cxsparse" */
+ u64 data_offset; /* Byte offset of next record. (Unused) 0xffs */
+ u64 table_offset; /* Absolute offset to the BAT. */
+ u32 hdr_ver; /* Version of the dd_hdr (major,minor) */
+ u32 max_bat_size; /* Maximum number of entries in the BAT */
+ u32 block_size; /* Block size in bytes. Must be power of 2. */
+ u32 checksum; /* Header checksum. 1's comp of all fields. */
+ uuid_t prt_uuid; /* ID of the parent disk. */
+ u32 prt_ts; /* Modification time of the parent disk */
+ u32 res1; /* Reserved. */
+ char prt_name[512]; /* Parent unicode name. */
+ struct prt_loc loc[8]; /* Parent locator entries. */
+ char res2[256]; /* Reserved. */
+};
+
+/* VHD cookie string. */
+static const char DD_COOKIE[9] = "cxsparse";
+
+/* Version field in dd_hdr (hdr_ver) */
+#define DD_VERSION 0x00010000
+
+/* Default blocksize is 2 meg. */
+#define DD_BLOCKSIZE_DEFAULT 0x00200000
+
+#define DD_BLK_UNUSED 0xFFFFFFFF
+
+struct dd_batmap_hdr {
+ char cookie[8]; /* should contain "tdbatmap" */
+ u64 batmap_offset; /* byte offset to batmap */
+ u32 batmap_size; /* batmap size in sectors */
+ u32 batmap_version; /* version of batmap */
+ u32 checksum; /* batmap checksum -- 1's complement of batmap */
+};
+
+static const char VHD_BATMAP_COOKIE[9] = "tdbatmap";
+
+/*
+ * version 1.1: signed char checksum
+ */
+#define VHD_BATMAP_VERSION(major, minor) (((major) << 16) | ((minor) & 0x0000FFFF))
+#define VHD_BATMAP_CURRENT_VERSION VHD_BATMAP_VERSION(1, 2)
+
+/* Layout of a dynamic disk:
+ *
+ * +-------------------------------------------------+
+ * | Mirror image of HD footer (hd_ftr) (512 bytes) |
+ * +-------------------------------------------------+
+ * | Sparse drive header (dd_hdr) (1024 bytes) |
+ * +-------------------------------------------------+
+ * | BAT (Block allocation table) |
+ * | - Array of absolute sector offsets into the |
+ * | file (u32). |
+ * | - Rounded up to a sector boundary. |
+ * | - Unused entries are marked as 0xFFFFFFFF |
+ * | - max entries in dd_hdr->max_bat_size |
+ * +-------------------------------------------------+
+ * | Data Block 0 |
+ * | Bitmap (padded to 512 byte sector boundary) |
+ * | - each bit indicates whether the associated |
+ * | sector within this block is used. |
+ * | Data |
+ * | - power-of-two multiple of sectors. |
+ * | - default 2MB (4096 * 512) |
+ * | - Any entries with zero in bitmap should be |
+ * | zero on disk |
+ * +-------------------------------------------------+
+ * | Data Block 1 |
+ * +-------------------------------------------------+
+ * | ... |
+ * +-------------------------------------------------+
+ * | Data Block n |
+ * +-------------------------------------------------+
+ * | HD Footer (512 bytes) |
+ * +-------------------------------------------------+
+ */
+
+#endif
diff --git a/tools/blktap2/lvm/Makefile b/tools/blktap2/lvm/Makefile
new file mode 100644
index 0000000000..3a726d7c8b
--- /dev/null
+++ b/tools/blktap2/lvm/Makefile
@@ -0,0 +1,38 @@
+XEN_ROOT = ../../../
+BLKTAP_ROOT := ../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# The standalone lvm-util test program is only built when LVM_UTIL_TEST=y.
+ifeq ($(LVM_UTIL_TEST),y)
+TEST := lvm-util
+endif
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -I../include
+CFLAGS += -D_GNU_SOURCE
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS += -fPIC
+endif
+
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+LVM-OBJS := lvm-util.o
+
+all: build
+
+build: $(TEST) $(LVM-OBJS)
+
+install: all
+
+# Compiled straight from the source with -DLVM_UTIL so main() is included.
+lvm-util: lvm-util.o
+	$(CC) $(CFLAGS) -DLVM_UTIL -o lvm-util lvm-util.c
+
+# $(IBIN) is not set in this Makefile, so the test binary is named
+# explicitly to make sure it is removed as well.
+clean:
+	rm -rf *.o *~ $(DEPS) $(IBIN) lvm-util
+
+.PHONY: all build clean install lvm-util
+
+-include $(DEPS)
diff --git a/tools/blktap2/lvm/lvm-util.c b/tools/blktap2/lvm/lvm-util.c
new file mode 100644
index 0000000000..b456e0438b
--- /dev/null
+++ b/tools/blktap2/lvm/lvm-util.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lvm-util.h"
+
+/* scanf conversion for a whitespace-delimited name of at most 255 characters */
+#define _NAME "%255s"
+/* Line buffer shared by lvm_read_line()/lvm_next_line() and the parsers below. */
+static char line[1024];
+
+/*
+ * Clear the shared line buffer and read the rest of the current line
+ * from @scan into it (at most 1023 characters, newline not consumed).
+ *
+ * Returns 0 if at least one character was stored, 1 otherwise.
+ */
+static inline int
+lvm_read_line(FILE *scan)
+{
+	int matched;
+
+	memset(line, 0, sizeof(line));
+	matched = fscanf(scan, "%1023[^\n]", line);
+
+	return (matched == 1) ? 0 : 1;
+}
+
+/*
+ * Consume the run of newline characters that follows a line read by
+ * lvm_read_line().  The consumed characters are stored in the shared
+ * line buffer, overwriting its previous contents.
+ *
+ * Returns 0 if at least one newline was consumed, 1 otherwise.
+ */
+static inline int
+lvm_next_line(FILE *scan)
+{
+	int matched = fscanf(scan, "%1023[\n]", line);
+
+	return (matched == 1) ? 0 : 1;
+}
+
+/*
+ * Copy the NUL-terminated string @src into @dst.
+ *
+ * @size is the maximum accepted length: a source of @size or more
+ * characters is rejected with -ENAMETOOLONG and @dst is left untouched.
+ * Returns 0 on success.
+ */
+static int
+lvm_copy_name(char *dst, const char *src, size_t size)
+{
+	size_t len = strnlen(src, size);
+
+	/* strnlen() never returns more than size, so this means "too long" */
+	if (len >= size)
+		return -ENAMETOOLONG;
+
+	/* len + 1 also copies the terminating NUL */
+	memcpy(dst, src, len + 1);
+	return 0;
+}
+
+/*
+ * Record physical volume @name with data start @start in @vg.
+ *
+ * The pv table is allocated on first use with room for @pvs entries;
+ * the new entry goes into the first slot whose name is still empty.
+ *
+ * Returns 0 on success, -EEXIST if @name is already present, -ENOENT
+ * if no slot was examined (@pvs <= 0), -ENOMEM if allocation fails or
+ * every slot is taken, or the error from lvm_copy_name().
+ */
+static int
+lvm_parse_pv(struct vg *vg, const char *name, int pvs, uint64_t start)
+{
+	int idx, err;
+	struct pv *cur, *slot;
+
+	cur = NULL;
+	slot = NULL;
+
+	if (!vg->pvs) {
+		vg->pvs = calloc(pvs, sizeof(struct pv));
+		if (!vg->pvs)
+			return -ENOMEM;
+	}
+
+	for (idx = 0; idx < pvs && !slot; idx++) {
+		cur = &vg->pvs[idx];
+
+		if (cur->name[0] == '\0')
+			slot = cur;
+		else if (strcmp(cur->name, name) == 0)
+			return -EEXIST;
+	}
+
+	/* the loop never ran */
+	if (!cur)
+		return -ENOENT;
+
+	/* every slot already holds a name */
+	if (!slot)
+		return -ENOMEM;
+
+	err = lvm_copy_name(slot->name, name, sizeof(slot->name) - 1);
+	if (err)
+		return err;
+
+	slot->start = start;
+	return 0;
+}
+
+/*
+ * Fill @vg with the name, extent size, LV/PV counts and physical
+ * volumes that "vgs" reports for volume group @vgname.  The lv table is
+ * allocated (zeroed) but not populated here.
+ *
+ * Returns 0 on success or a negative errno value.  On failure @vg is
+ * released with lvm_free_vg().
+ *
+ * NOTE(review): @vgname is pasted into a shell command line unquoted;
+ * callers presumably pass trusted names only -- confirm.
+ */
+static int
+lvm_open_vg(const char *vgname, struct vg *vg)
+{
+	FILE *scan;
+	int i, err, pvs, lvs;
+	char *cmd, pvname[256];
+	uint64_t size, pv_start;
+
+	/* keep the checks after the loop well-defined if vgs prints nothing */
+	pvs = 0;
+	lvs = 0;
+	size = 0;
+
+	memset(vg, 0, sizeof(*vg));
+
+	err = asprintf(&cmd, "/usr/sbin/vgs %s --noheadings --nosuffix --units=b "
+		       "--options=vg_name,vg_extent_size,lv_count,pv_count,"
+		       "pv_name,pe_start --unbuffered 2> /dev/null", vgname);
+	if (err == -1)
+		return -ENOMEM;
+
+	errno = 0;
+	scan = popen(cmd, "r");
+	if (!scan) {
+		/* negative errno, like every other error path */
+		err = (errno ? -errno : -ENOMEM);
+		goto out;
+	}
+
+	/* one output line per physical volume */
+	for (;;) {
+		if (lvm_read_line(scan))
+			break;
+
+		err = -EINVAL;
+		if (sscanf(line, _NAME" %"SCNu64" %d %d "_NAME" %"SCNu64,
+			   vg->name, &size, &lvs, &pvs, pvname, &pv_start) != 6)
+			goto out;
+
+		if (strcmp(vg->name, vgname))
+			goto out;
+
+		err = lvm_parse_pv(vg, pvname, pvs, pv_start);
+		if (err)
+			goto out;
+
+		if (lvm_next_line(scan))
+			break;
+	}
+
+	err = -EINVAL;
+	if (strcmp(vg->name, vgname))
+		goto out;
+
+	/* every announced pv must have been seen */
+	for (i = 0; i < pvs; i++)
+		if (!vg->pvs[i].name[0])
+			goto out;
+
+	err = -ENOMEM;
+	vg->lvs = calloc(lvs, sizeof(struct lv));
+	if (!vg->lvs)
+		goto out;
+
+	err = 0;
+	vg->lv_cnt = lvs;
+	vg->pv_cnt = pvs;
+	vg->extent_size = size;
+
+out:
+	if (scan)
+		pclose(scan);
+	if (err)
+		lvm_free_vg(vg);
+	free(cmd);
+	return err;
+}
+
+/*
+ * Parse the "devices" column of an lvs line (of the form
+ * "<device>(<start extent>)") into @seg.
+ *
+ * @devices is modified in place: ',', '(' and ')' become blanks.  The
+ * device must match one of the pvs recorded in @vg; seg->pe_start is
+ * set to start * extent_size plus that pv's start.
+ *
+ * Returns 0 on success, -EINVAL on a malformed field or unknown device.
+ */
+static int
+lvm_parse_lv_devices(struct vg *vg, struct lv_segment *seg, char *devices)
+{
+	int idx;
+	char *p;
+	uint64_t first_extent, base;
+
+	for (p = devices; *p != '\0'; p++)
+		if (strchr(",()", *p))
+			*p = ' ';
+
+	if (sscanf(devices, _NAME" %"SCNu64, seg->device, &first_extent) != 2)
+		return -EINVAL;
+
+	/* all-ones marks "no matching pv found" */
+	base = -1;
+	for (idx = 0; idx < vg->pv_cnt && base == -1; idx++)
+		if (strcmp(vg->pvs[idx].name, seg->device) == 0)
+			base = vg->pvs[idx].start;
+
+	if (base == -1)
+		return -EINVAL;
+
+	seg->pe_start = (first_extent * vg->extent_size) + base;
+	return 0;
+}
+
+/*
+ * Populate vg->lvs from the output of "lvs" for vg->name.
+ *
+ * Only lines whose seg_start is zero (the first segment of a logical
+ * volume) create an entry; other lines are skipped.  At most vg->lv_cnt
+ * entries are filled, and vg->lv_cnt is lowered if the output ends
+ * before that many were found.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+lvm_scan_lvs(struct vg *vg)
+{
+	char *cmd;
+	FILE *scan;
+	int i, err;
+
+	err = asprintf(&cmd, "/usr/sbin/lvs %s --noheadings --nosuffix --units=b "
+		       "--options=lv_name,lv_size,segtype,seg_count,seg_start,"
+		       "seg_size,devices --unbuffered 2> /dev/null", vg->name);
+	if (err == -1)
+		return -ENOMEM;
+
+	errno = 0;
+	scan = popen(cmd, "r");
+	if (!scan) {
+		err = (errno ? -errno : -ENOMEM);
+		goto out;
+	}
+
+	for (i = 0;;) {
+		unsigned int segs; /* matches the %u conversion below */
+		struct lv *lv;
+		struct lv_segment seg;
+		uint64_t size, seg_start;
+		char type[32], name[256], devices[1024];
+
+		if (i >= vg->lv_cnt)
+			break;
+
+		if (lvm_read_line(scan)) {
+			/* fewer lvs than announced */
+			vg->lv_cnt = i;
+			break;
+		}
+
+		err = -EINVAL;
+		lv = vg->lvs + i;
+
+		if (sscanf(line, _NAME" %"SCNu64" %31s %u %"SCNu64" %"SCNu64" %1023s",
+			   name, &size, type, &segs, &seg_start,
+			   &seg.pe_size, devices) != 7)
+			goto out;
+
+		/* only the first segment of each lv is recorded */
+		if (seg_start)
+			goto next;
+
+		if (!strcmp(type, "linear"))
+			seg.type = LVM_SEG_TYPE_LINEAR;
+		else
+			seg.type = LVM_SEG_TYPE_UNKNOWN;
+
+		if (lvm_parse_lv_devices(vg, &seg, devices))
+			goto out;
+
+		i++;
+		lv->size = size;
+		lv->segments = segs;
+		lv->first_segment = seg;
+
+		err = lvm_copy_name(lv->name, name, sizeof(lv->name) - 1);
+		if (err)
+			goto out;
+		err = -EINVAL;
+
+	next:
+		if (lvm_next_line(scan))
+			goto out;
+	}
+
+	err = 0;
+
+out:
+	if (scan)
+		pclose(scan);
+	free(cmd);
+	return err;
+}
+
+/*
+ * Free the pv and lv tables owned by @vg and reset the whole
+ * structure to zero.
+ */
+void
+lvm_free_vg(struct vg *vg)
+{
+	free(vg->pvs);
+	free(vg->lvs);
+
+	memset(vg, 0, sizeof(*vg));
+}
+
+/*
+ * Scan volume group @vg_name into @vg: first the group and its
+ * physical volumes, then its logical volumes.
+ *
+ * Returns 0 on success or the error of the failing step.  If the
+ * logical-volume scan fails, @vg is released with lvm_free_vg().
+ */
+int
+lvm_scan_vg(const char *vg_name, struct vg *vg)
+{
+	int err;
+
+	memset(vg, 0, sizeof(*vg));
+
+	err = lvm_open_vg(vg_name, vg);
+	if (!err) {
+		err = lvm_scan_lvs(vg);
+		if (err)
+			lvm_free_vg(vg);
+	}
+
+	return err;
+}
+
+#ifdef LVM_UTIL
+/* Print the command-line synopsis and exit with status EINVAL. */
+static int
+usage(void)
+{
+	fputs("usage: lvm-util <vgname>\n", stdout);
+	exit(EINVAL);
+}
+
+/*
+ * Test driver: scan the volume group named on the command line and
+ * print its physical and logical volumes.
+ */
+int
+main(int argc, char **argv)
+{
+	int idx, err;
+	struct vg vg;
+
+	if (argc != 2)
+		usage();
+
+	err = lvm_scan_vg(argv[1], &vg);
+	if (err) {
+		printf("scan failed: %d\n", err);
+		/* exit status is the positive error number */
+		return (err < 0 ? -err : err);
+	}
+
+	printf("vg %s: extent_size: %"PRIu64", pvs: %d, lvs: %d\n",
+	       vg.name, vg.extent_size, vg.pv_cnt, vg.lv_cnt);
+
+	for (idx = 0; idx < vg.pv_cnt; idx++) {
+		struct pv *pv = &vg.pvs[idx];
+
+		printf("pv %s: start %"PRIu64"\n", pv->name, pv->start);
+	}
+
+	for (idx = 0; idx < vg.lv_cnt; idx++) {
+		struct lv *lv = &vg.lvs[idx];
+		struct lv_segment *seg = &lv->first_segment;
+
+		printf("lv %s: size: %"PRIu64", segments: %u, type: %u, "
+		       "dev: %s, pe_start: %"PRIu64", pe_size: %"PRIu64"\n",
+		       lv->name, lv->size, lv->segments, seg->type,
+		       seg->device, seg->pe_start, seg->pe_size);
+	}
+
+	lvm_free_vg(&vg);
+	return 0;
+}
+#endif
diff --git a/tools/blktap2/vhd/Makefile b/tools/blktap2/vhd/Makefile
new file mode 100644
index 0000000000..099a0baca7
--- /dev/null
+++ b/tools/blktap2/vhd/Makefile
@@ -0,0 +1,55 @@
+XEN_ROOT=../../../
+BLKTAP_ROOT := ../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Programs built in this directory and installed into $(INST_DIR).
+IBIN = vhd-util vhd-update
+INST_DIR = $(SBINDIR)
+
+# Sub-directory that builds libvhd; also used as the -L search path below.
+LIBDIR = lib
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -I../include
+CFLAGS += -D_GNU_SOURCE
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS += -fPIC
+endif
+
+# VHD_STATIC=y adds -static to the compile/link flags.
+ifeq ($(VHD_STATIC),y)
+CFLAGS += -static
+endif
+
+LIBS := -L$(LIBDIR) -lvhd
+LIBS += -luuid
+
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+all: build
+
+build: libvhd $(IBIN)
+
+# NOTE(review): make normally runs each recipe line in its own shell, so
+# the "set -e" line presumably does not affect the $(MAKE) line -- confirm.
+libvhd:
+ @set -e
+ $(MAKE) -C $(LIBDIR) all
+
+vhd-util: vhd-util.o
+ $(CC) $(CFLAGS) -o vhd-util vhd-util.o $(LIBS)
+
+vhd-update: vhd-update.o
+ $(CC) $(CFLAGS) -o vhd-update vhd-update.o $(LIBS)
+
+# Installs the library first, then the tools.
+install: all
+ $(MAKE) -C $(LIBDIR) install
+ $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+ $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+
+clean:
+ $(MAKE) -C $(LIBDIR) clean
+ rm -rf *.o *~ $(DEPS) $(IBIN)
+
+# vhd-util and vhd-update are listed as phony, so they are relinked on every build.
+.PHONY: all build clean install libvhd vhd-util vhd-update
+
+-include $(DEPS)
diff --git a/tools/blktap2/vhd/lib/Makefile b/tools/blktap2/vhd/lib/Makefile
new file mode 100644
index 0000000000..e26ef86403
--- /dev/null
+++ b/tools/blktap2/vhd/lib/Makefile
@@ -0,0 +1,73 @@
+XEN_ROOT=../../../../
+BLKTAP_ROOT := ../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Shared object is named libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR);
+# its soname is libvhd.so.$(LIBVHD-MAJOR).
+LIBVHD-MAJOR = 1.0
+LIBVHD-MINOR = 0
+LIBVHD-SONAME = libvhd.so.$(LIBVHD-MAJOR)
+
+# Object from the sibling lvm directory that is linked into libvhd.
+LVM-UTIL-OBJ := $(BLKTAP_ROOT)lvm/lvm-util.o
+
+LIBVHD-BUILD := libvhd.a
+
+INST-DIR = $(LIBDIR)
+
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+CFLAGS += -I../../include
+CFLAGS += -D_GNU_SOURCE
+CFLAGS += -fPIC
+CFLAGS += -g
+
+LIBS := -luuid
+
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+LIB-SRCS := libvhd.c
+LIB-SRCS += libvhd-journal.c
+LIB-SRCS += vhd-util-coalesce.c
+LIB-SRCS += vhd-util-create.c
+LIB-SRCS += vhd-util-fill.c
+LIB-SRCS += vhd-util-modify.c
+LIB-SRCS += vhd-util-query.c
+LIB-SRCS += vhd-util-read.c
+LIB-SRCS += vhd-util-repair.c
+LIB-SRCS += vhd-util-resize.c
+LIB-SRCS += vhd-util-revert.c
+LIB-SRCS += vhd-util-set-field.c
+LIB-SRCS += vhd-util-snapshot.c
+LIB-SRCS += vhd-util-scan.c
+LIB-SRCS += vhd-util-check.c
+LIB-SRCS += relative-path.c
+LIB-SRCS += atomicio.c
+
+LIB-OBJS = $(patsubst %.c,%.o,$(LIB-SRCS))
+LIB-OBJS += $(LVM-UTIL-OBJ)
+
+# Files copied by "install": the static archive and the versioned shared object.
+LIBVHD = libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+
+all: build
+
+build: $(LIBVHD-BUILD)
+
+# One rule produces both flavours: it links the shared object, creates
+# its two symlinks, and then archives the same objects into libvhd.a.
+libvhd.a: $(LIB-OBJS)
+ $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG),$(LIBVHD-SONAME) $(SHLIB_CFLAGS) \
+ -o libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(LIBS) $^
+ ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) libvhd.so.$(LIBVHD-MAJOR)
+ ln -sf libvhd.so.$(LIBVHD-MAJOR) libvhd.so
+ $(AR) rc $@ $^
+
+install: all
+ $(INSTALL_DIR) -p $(DESTDIR)$(INST-DIR)
+ $(INSTALL_DATA) $(LIBVHD) $(DESTDIR)$(INST-DIR)
+ ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)/libvhd.so.$(LIBVHD-MAJOR)
+ ln -sf libvhd.so.$(LIBVHD-MAJOR) $(DESTDIR)$(INST-DIR)/libvhd.so
+
+clean:
+ rm -rf *.a *.so* *.o *~ $(DEPS) $(LIBVHD)
+
+.PHONY: all build clean install libvhd
+
+-include $(DEPS)
diff --git a/tools/blktap2/vhd/lib/atomicio.c b/tools/blktap2/vhd/lib/atomicio.c
new file mode 100644
index 0000000000..ae0e24b00a
--- /dev/null
+++ b/tools/blktap2/vhd/lib/atomicio.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * Keep calling f (read, or write cast as vwrite) on fd until all n
+ * bytes of _s have been transferred.
+ *
+ * EINTR and EAGAIN are retried.  Any other error returns 0.  If f
+ * reports 0 bytes, errno is set to EPIPE and the number of bytes
+ * transferred so far is returned.  Otherwise returns n.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+	char *base = _s;
+	size_t done = 0;
+	ssize_t rc;
+
+	while (done < n) {
+		rc = f(fd, base + done, n - done);
+
+		if (rc == -1) {
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			return 0;
+		}
+
+		if (rc == 0) {
+			errno = EPIPE;
+			return done;
+		}
+
+		done += (size_t)rc;
+	}
+
+	return done;
+}
+
+
diff --git a/tools/blktap2/vhd/lib/libvhd-journal.c b/tools/blktap2/vhd/lib/libvhd-journal.c
new file mode 100644
index 0000000000..c52affea1a
--- /dev/null
+++ b/tools/blktap2/vhd/lib/libvhd-journal.c
@@ -0,0 +1,1534 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "atomicio.h"
+#include "libvhd-journal.h"
+
+/*
+ * Journal entry types.  Every type except DATA is accounted as
+ * metadata by vhd_journal_update().
+ */
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_P 1 /* footer at the end of the file */
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_C 2 /* footer copy at offset 0 */
+#define VHD_JOURNAL_ENTRY_TYPE_HEADER 3 /* vhd header */
+#define VHD_JOURNAL_ENTRY_TYPE_LOCATOR 4 /* parent locator data */
+#define VHD_JOURNAL_ENTRY_TYPE_BAT 5 /* block allocation table */
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_H 6 /* batmap header */
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_M 7 /* batmap bitmap */
+#define VHD_JOURNAL_ENTRY_TYPE_DATA 8 /* data entry */
+
+/*
+ * Descriptor written in front of each journalled payload.  The fields
+ * are converted with the BE*_IN/BE*_OUT macros when read/written.
+ */
+typedef struct vhd_journal_entry {
+ uint64_t cookie; /* must equal VHD_JOURNAL_ENTRY_COOKIE */
+ uint32_t type; /* one of VHD_JOURNAL_ENTRY_TYPE_* */
+ uint32_t size; /* payload bytes; non-zero multiple of VHD_SECTOR_SIZE */
+ uint64_t offset; /* offset in the vhd file the payload belongs to */
+ uint32_t checksum; /* see vhd_journal_checksum_entry() */
+} vhd_journal_entry_t;
+
+/*
+ * Reposition the journal file descriptor.
+ * Returns 0 on success or -errno if lseek64() fails.
+ */
+static inline int
+vhd_journal_seek(vhd_journal_t *j, off64_t offset, int whence)
+{
+	if (lseek64(j->jfd, offset, whence) == (off64_t)-1)
+		return -errno;
+
+	return 0;
+}
+
+/*
+ * Return the current offset of the journal file descriptor, or
+ * (off64_t)-1 with errno set if lseek64() fails.
+ */
+static inline off64_t
+vhd_journal_position(vhd_journal_t *j)
+{
+	off64_t cur;
+
+	cur = lseek64(j->jfd, 0, SEEK_CUR);
+	return cur;
+}
+
+/*
+ * Read exactly @size bytes from the journal into @buf.
+ * Returns 0 on success; on a short read returns -errno, or -EIO if
+ * errno was left at zero.
+ */
+static inline int
+vhd_journal_read(vhd_journal_t *j, void *buf, size_t size)
+{
+	size_t done;
+
+	errno = 0;
+	done = atomicio(read, j->jfd, buf, size);
+
+	if (done == size)
+		return 0;
+
+	return (errno ? -errno : -EIO);
+}
+
+/*
+ * Write exactly @size bytes from @buf to the journal.
+ * Returns 0 on success; on a short write returns -errno, or -EIO if
+ * errno was left at zero.
+ */
+static inline int
+vhd_journal_write(vhd_journal_t *j, void *buf, size_t size)
+{
+	size_t done;
+
+	errno = 0;
+	done = atomicio(vwrite, j->jfd, buf, size);
+
+	if (done == size)
+		return 0;
+
+	return (errno ? -errno : -EIO);
+}
+
+/*
+ * Truncate the journal file to @length bytes.
+ * Returns 0 on success or -errno if ftruncate() fails.
+ */
+static inline int
+vhd_journal_truncate(vhd_journal_t *j, off64_t length)
+{
+	if (ftruncate(j->jfd, length) == -1)
+		return -errno;
+
+	return 0;
+}
+
+/*
+ * Flush journal data to stable storage with fdatasync().
+ * Returns 0 on success or -errno on failure.
+ */
+static inline int
+vhd_journal_sync(vhd_journal_t *j)
+{
+	return fdatasync(j->jfd) ? -errno : 0;
+}
+
+/*
+ * Apply the BE*_IN conversions to the offset and entry-count fields
+ * of a journal header that was just read.  journal_eof is not converted.
+ */
+static inline void
+vhd_journal_header_in(vhd_journal_header_t *header)
+{
+	/* 64-bit offsets */
+	BE64_IN(&header->vhd_footer_offset);
+	BE64_IN(&header->journal_data_offset);
+	BE64_IN(&header->journal_metadata_offset);
+
+	/* 32-bit entry counts */
+	BE32_IN(&header->journal_data_entries);
+	BE32_IN(&header->journal_metadata_entries);
+}
+
+/*
+ * Apply the BE*_OUT conversions to the offset and entry-count fields
+ * of a journal header about to be written.  journal_eof is not converted.
+ */
+static inline void
+vhd_journal_header_out(vhd_journal_header_t *header)
+{
+	/* 64-bit offsets */
+	BE64_OUT(&header->vhd_footer_offset);
+	BE64_OUT(&header->journal_data_offset);
+	BE64_OUT(&header->journal_metadata_offset);
+
+	/* 32-bit entry counts */
+	BE32_OUT(&header->journal_data_entries);
+	BE32_OUT(&header->journal_metadata_entries);
+}
+
+/*
+ * Sanity-check a journal header (host byte order).
+ *
+ * The cookie of @header must match VHD_JOURNAL_HEADER_COOKIE, the
+ * journal must be seekable to journal_eof, and neither the data nor
+ * the metadata offset may lie beyond journal_eof.  As a side effect
+ * the journal file position is left at journal_eof.
+ *
+ * NOTE(review): the offset checks read j->header rather than @header;
+ * presumably callers always pass a header equal to j->header -- confirm.
+ *
+ * Returns 0 if valid, -EINVAL or a negative errno otherwise.
+ */
+static int
+vhd_journal_validate_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+	int err;
+	off64_t pos;
+
+	if (memcmp(header->cookie,
+		   VHD_JOURNAL_HEADER_COOKIE, sizeof(header->cookie)) != 0)
+		return -EINVAL;
+
+	err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET);
+	if (err)
+		return err;
+
+	pos = vhd_journal_position(j);
+	if (pos == (off64_t)-1)
+		return -errno;
+
+	if (j->header.journal_data_offset > j->header.journal_eof ||
+	    j->header.journal_metadata_offset > j->header.journal_eof)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Read the journal header from offset 0 into @header, convert it with
+ * vhd_journal_header_in() and validate it.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_journal_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+	int err;
+
+	err = vhd_journal_seek(j, 0, SEEK_SET);
+	if (err)
+		return err;
+
+	err = vhd_journal_read(j, header, sizeof(vhd_journal_header_t));
+	if (err)
+		return err;
+
+	vhd_journal_header_in(header);
+
+	return vhd_journal_validate_header(j, header);
+}
+
+/*
+ * Validate @header and write a converted copy of it at offset 0 of
+ * the journal.  @header itself is left unmodified.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_write_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+	int err;
+	vhd_journal_header_t copy;
+
+	memcpy(&copy, header, sizeof(vhd_journal_header_t));
+
+	err = vhd_journal_validate_header(j, &copy);
+	if (err)
+		return err;
+
+	/* convert the private copy, never the caller's header */
+	vhd_journal_header_out(&copy);
+
+	err = vhd_journal_seek(j, 0, SEEK_SET);
+	if (err)
+		return err;
+
+	return vhd_journal_write(j, &copy, sizeof(vhd_journal_header_t));
+}
+
+/*
+ * Initialise j->header for a new journal and write it out.
+ *
+ * The header takes the uuid of the vhd footer, records the offset of
+ * the footer (end of file minus sizeof(vhd_footer_t)) and starts with
+ * journal_eof just past the journal header.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_journal_header(vhd_journal_t *j)
+{
+	int err;
+	off64_t end;
+	vhd_context_t *vhd = &j->vhd;
+
+	memset(&j->header, 0, sizeof(vhd_journal_header_t));
+
+	err = vhd_seek(vhd, 0, SEEK_END);
+	if (err)
+		return err;
+
+	end = vhd_position(vhd);
+	if (end == (off64_t)-1)
+		return -errno;
+
+	err = vhd_get_footer(vhd);
+	if (err)
+		return err;
+
+	uuid_copy(j->header.uuid, vhd->footer.uuid);
+	memcpy(j->header.cookie,
+	       VHD_JOURNAL_HEADER_COOKIE, sizeof(j->header.cookie));
+
+	j->header.vhd_footer_offset = end - sizeof(vhd_footer_t);
+	j->header.journal_eof = sizeof(vhd_journal_header_t);
+
+	return vhd_journal_write_header(j, &j->header);
+}
+
+/* Apply the BE*_IN conversions to every field of an entry just read. */
+static void
+vhd_journal_entry_in(vhd_journal_entry_t *entry)
+{
+	BE64_IN(&entry->cookie);
+	BE32_IN(&entry->type);
+	BE32_IN(&entry->size);
+	BE64_IN(&entry->offset);
+	BE32_IN(&entry->checksum);
+}
+
+/* Apply the BE*_OUT conversions to every field of an entry about to be written. */
+static void
+vhd_journal_entry_out(vhd_journal_entry_t *entry)
+{
+	BE64_OUT(&entry->cookie);
+	BE32_OUT(&entry->type);
+	BE32_OUT(&entry->size);
+	BE64_OUT(&entry->offset);
+	BE32_OUT(&entry->checksum);
+}
+
+/*
+ * Compute the checksum of a journal entry and its payload: the
+ * one's complement of the byte sum over the whole entry structure
+ * (with its checksum field treated as zero) plus @size bytes of @buf.
+ *
+ * entry->checksum is zeroed temporarily and restored before returning.
+ */
+static uint32_t
+vhd_journal_checksum_entry(vhd_journal_entry_t *entry, char *buf, size_t size)
+{
+	const unsigned char *p, *end;
+	uint32_t sum, saved;
+
+	saved = entry->checksum;
+	entry->checksum = 0;
+	sum = 0;
+
+	p = (const unsigned char *)entry;
+	for (end = p + sizeof(vhd_journal_entry_t); p < end; p++)
+		sum += *p;
+
+	p = (const unsigned char *)buf;
+	for (end = p + size; p < end; p++)
+		sum += *p;
+
+	entry->checksum = saved;
+	return ~sum;
+}
+
+/*
+ * Check an entry descriptor (host byte order): the size must be a
+ * non-zero multiple of VHD_SECTOR_SIZE and the cookie must match.
+ * Returns 0 if valid, -EINVAL otherwise.
+ */
+static int
+vhd_journal_validate_entry(vhd_journal_entry_t *entry)
+{
+	if (entry->size == 0 ||
+	    (entry->size & (VHD_SECTOR_SIZE - 1)) ||
+	    entry->cookie != VHD_JOURNAL_ENTRY_COOKIE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Read one entry descriptor at the current journal position, convert
+ * it with vhd_journal_entry_in() and validate it.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+	int err = vhd_journal_read(j, entry, sizeof(vhd_journal_entry_t));
+
+	if (err)
+		return err;
+
+	vhd_journal_entry_in(entry);
+
+	return vhd_journal_validate_entry(entry);
+}
+
+/*
+ * Validate @entry and write a converted copy of it at the current
+ * journal position.  @entry itself is left unmodified.
+ *
+ * Returns 0 on success, -EINVAL if the entry is invalid, or the error
+ * returned by vhd_journal_write().
+ */
+static int
+vhd_journal_write_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+	int err;
+	vhd_journal_entry_t e;
+
+	err = vhd_journal_validate_entry(entry);
+	if (err)
+		return err;
+
+	memcpy(&e, entry, sizeof(vhd_journal_entry_t));
+	vhd_journal_entry_out(&e);
+
+	/* propagate write failures instead of reporting success */
+	err = vhd_journal_write(j, &e, sizeof(vhd_journal_entry_t));
+	if (err)
+		return err;
+
+	return 0;
+}
+
+/*
+ * Verify that the checksum stored in @entry matches the one computed
+ * over the entry and its entry->size byte payload @buf.
+ * Returns 0 on a match, -EINVAL otherwise.
+ */
+static int
+vhd_journal_validate_entry_data(vhd_journal_entry_t *entry, char *buf)
+{
+	uint32_t expected;
+
+	expected = vhd_journal_checksum_entry(entry, buf, entry->size);
+
+	return (expected == entry->checksum) ? 0 : -EINVAL;
+}
+
+/*
+ * Append one entry (descriptor plus @size bytes of @buf) at
+ * journal_eof and update the journal header accordingly.
+ *
+ * @offset is recorded in the entry as the location in the vhd file
+ * the payload belongs to.  DATA entries are accounted in the data
+ * counters of the header, every other @type in the metadata counters;
+ * the first entry of each kind also fixes the matching start offset.
+ *
+ * Returns 0 on success or a negative errno value.  On failure the
+ * in-memory header is left as it was and, unless the journal is a
+ * block device, the journal file is truncated back to journal_eof.
+ */
+static int
+vhd_journal_update(vhd_journal_t *j, off64_t offset,
+		   char *buf, size_t size, uint32_t type)
+{
+	int err;
+	uint64_t *off, off_bak;
+	uint32_t *entries;
+	vhd_journal_entry_t entry;
+
+	/*
+	 * The checksum covers every byte of the structure, padding
+	 * included, and the structure is written out verbatim: zero it
+	 * so no uninitialized stack bytes are summed or hit the disk.
+	 */
+	memset(&entry, 0, sizeof(entry));
+
+	entry.type = type;
+	entry.size = size;
+	entry.offset = offset;
+	entry.cookie = VHD_JOURNAL_ENTRY_COOKIE;
+	entry.checksum = vhd_journal_checksum_entry(&entry, buf, size);
+
+	err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET);
+	if (err)
+		return err;
+
+	err = vhd_journal_write_entry(j, &entry);
+	if (err)
+		goto fail;
+
+	err = vhd_journal_write(j, buf, size);
+	if (err)
+		goto fail;
+
+	if (type == VHD_JOURNAL_ENTRY_TYPE_DATA) {
+		off = &j->header.journal_data_offset;
+		entries = &j->header.journal_data_entries;
+	} else {
+		off = &j->header.journal_metadata_offset;
+		entries = &j->header.journal_metadata_entries;
+	}
+
+	/* the first entry of a kind defines where that kind starts */
+	off_bak = *off;
+	if (!(*entries)++)
+		*off = j->header.journal_eof;
+	j->header.journal_eof += (size + sizeof(vhd_journal_entry_t));
+
+	err = vhd_journal_write_header(j, &j->header);
+	if (err) {
+		/* roll the in-memory header back */
+		if (!--(*entries))
+			*off = off_bak;
+		j->header.journal_eof -= (size + sizeof(vhd_journal_entry_t));
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	if (!j->is_block)
+		vhd_journal_truncate(j, j->header.journal_eof);
+	return err;
+}
+
+/*
+ * Journal the vhd footer found at the end of the file and, for
+ * dynamic vhds, also the footer copy at offset 0.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_footer(vhd_journal_t *j)
+{
+	int err;
+	off64_t end, footer_off;
+	vhd_footer_t footer;
+	vhd_context_t *vhd = &j->vhd;
+
+	err = vhd_seek(vhd, 0, SEEK_END);
+	if (err)
+		return err;
+
+	end = vhd_position(vhd);
+	if (end == (off64_t)-1)
+		return -errno;
+
+	footer_off = end - sizeof(vhd_footer_t);
+
+	/* primary footer */
+	err = vhd_read_footer_at(vhd, &footer, footer_off);
+	if (err)
+		return err;
+
+	vhd_footer_out(&footer);
+	err = vhd_journal_update(j, footer_off, (char *)&footer,
+				 sizeof(vhd_footer_t),
+				 VHD_JOURNAL_ENTRY_TYPE_FOOTER_P);
+	if (err)
+		return err;
+
+	if (!vhd_type_dynamic(vhd))
+		return 0;
+
+	/* footer copy at the start of the file */
+	err = vhd_read_footer_at(vhd, &footer, 0);
+	if (err)
+		return err;
+
+	vhd_footer_out(&footer);
+	return vhd_journal_update(j, 0, (char *)&footer,
+				  sizeof(vhd_footer_t),
+				  VHD_JOURNAL_ENTRY_TYPE_FOOTER_C);
+}
+
+/*
+ * Journal the vhd header, recorded at the offset given by
+ * footer.data_offset.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_header(vhd_journal_t *j)
+{
+	int err;
+	vhd_header_t header;
+	vhd_context_t *vhd = &j->vhd;
+
+	err = vhd_read_header(vhd, &header);
+	if (err)
+		return err;
+
+	vhd_header_out(&header);
+
+	return vhd_journal_update(j, vhd->footer.data_offset,
+				  (char *)&header, sizeof(vhd_header_t),
+				  VHD_JOURNAL_ENTRY_TYPE_HEADER);
+}
+
+/*
+ * Journal the data of every parent locator in the vhd header whose
+ * platform code is not PLAT_CODE_NONE.
+ * Returns 0 on success or a negative errno value; stops at the first
+ * locator that fails.
+ */
+static int
+vhd_journal_add_locators(vhd_journal_t *j)
+{
+	int idx, count, err;
+	vhd_context_t *vhd = &j->vhd;
+
+	err = vhd_get_header(vhd);
+	if (err)
+		return err;
+
+	count = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+	for (idx = 0; idx < count; idx++) {
+		char *data;
+		off64_t where;
+		size_t len;
+		vhd_parent_locator_t *loc = &vhd->header.loc[idx];
+
+		err = vhd_validate_platform_code(loc->code);
+		if (err)
+			return err;
+
+		if (loc->code == PLAT_CODE_NONE)
+			continue;
+
+		where = loc->data_offset;
+		len = vhd_parent_locator_size(loc);
+
+		/* posix_memalign() reports errors as positive numbers */
+		err = posix_memalign((void **)&data, VHD_SECTOR_SIZE, len);
+		if (err)
+			return -err;
+
+		err = vhd_seek(vhd, where, SEEK_SET);
+		if (!err)
+			err = vhd_read(vhd, data, len);
+		if (!err)
+			err = vhd_journal_update(j, where, data, len,
+						 VHD_JOURNAL_ENTRY_TYPE_LOCATOR);
+
+		free(data);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Journal the block allocation table, recorded at
+ * header.table_offset.  The table buffer returned by vhd_read_bat()
+ * is freed before returning.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_bat(vhd_journal_t *j)
+{
+	int err;
+	size_t bytes;
+	vhd_bat_t bat;
+	vhd_context_t *vhd = &j->vhd;
+
+	err = vhd_get_header(vhd);
+	if (err)
+		return err;
+
+	err = vhd_read_bat(vhd, &bat);
+	if (err)
+		return err;
+
+	bytes = vhd_bytes_padded(bat.entries * sizeof(uint32_t));
+
+	vhd_bat_out(&bat);
+	err = vhd_journal_update(j, vhd->header.table_offset,
+				 (char *)bat.bat, bytes,
+				 VHD_JOURNAL_ENTRY_TYPE_BAT);
+
+	free(bat.bat);
+	return err;
+}
+
+/*
+ * Journal the batmap: first its header, then the map itself.
+ *
+ * The header entry is vhd_bytes_padded(sizeof(struct dd_batmap_hdr))
+ * bytes long, which is larger than the header structure.  The
+ * structure is therefore copied into a zero-filled buffer of the full
+ * entry size instead of journalling straight from &batmap.header,
+ * which would read past the end of the stack object.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_batmap(vhd_journal_t *j)
+{
+	int err;
+	char *hdr;
+	off64_t off;
+	size_t size, len;
+	vhd_context_t *vhd;
+	vhd_batmap_t batmap;
+
+	vhd = &j->vhd;
+	hdr = NULL;
+
+	err = vhd_batmap_header_offset(vhd, &off);
+	if (err)
+		return err;
+
+	err = vhd_read_batmap(vhd, &batmap);
+	if (err)
+		return err;
+
+	size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+	/* posix_memalign() reports errors as positive numbers */
+	err = posix_memalign((void **)&hdr, VHD_SECTOR_SIZE, size);
+	if (err) {
+		hdr = NULL;
+		err = -err;
+		goto out;
+	}
+
+	len = sizeof(batmap.header);
+	if (len > size)
+		len = size;
+
+	/* snapshot the header in its on-disk form, then switch back */
+	vhd_batmap_header_out(&batmap);
+	memset(hdr, 0, size);
+	memcpy(hdr, &batmap.header, len);
+	vhd_batmap_header_in(&batmap);
+
+	err = vhd_journal_update(j, off, hdr, size,
+				 VHD_JOURNAL_ENTRY_TYPE_BATMAP_H);
+	if (err)
+		goto out;
+
+	off = batmap.header.batmap_offset;
+	size = vhd_sectors_to_bytes(batmap.header.batmap_size);
+
+	err = vhd_journal_update(j, off, batmap.map, size,
+				 VHD_JOURNAL_ENTRY_TYPE_BATMAP_M);
+
+out:
+	free(hdr);
+	free(batmap.map);
+	return err;
+}
+
+/*
+ * Journal all vhd metadata: the footer(s) and, for dynamic vhds, the
+ * header, parent locators, BAT and (if present) batmap.  For dynamic
+ * vhds journal_data_offset is then set to journal_eof and the journal
+ * header is rewritten.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_metadata(vhd_journal_t *j)
+{
+	int err;
+	vhd_context_t *vhd = &j->vhd;
+
+	err = vhd_journal_add_footer(j);
+	if (err)
+		return err;
+
+	/* non-dynamic vhds have nothing but the footer */
+	if (!vhd_type_dynamic(vhd))
+		return 0;
+
+	err = vhd_journal_add_header(j);
+	if (!err)
+		err = vhd_journal_add_locators(j);
+	if (!err)
+		err = vhd_journal_add_bat(j);
+	if (err)
+		return err;
+
+	if (vhd_has_batmap(vhd)) {
+		err = vhd_journal_add_batmap(j);
+		if (err)
+			return err;
+	}
+
+	j->header.journal_data_offset = j->header.journal_eof;
+	return vhd_journal_write_header(j, &j->header);
+}
+
+/*
+ * Read the next journal entry, which must be of @type and carry
+ * exactly sizeof(vhd_footer_t) bytes, into @footer; then convert and
+ * validate the footer.
+ * Returns 0 on success, -EINVAL on a type/size mismatch, or another
+ * negative errno value.
+ */
+static int
+__vhd_journal_read_footer(vhd_journal_t *j,
+			  vhd_footer_t *footer, uint32_t type)
+{
+	int err;
+	vhd_journal_entry_t entry;
+
+	err = vhd_journal_read_entry(j, &entry);
+	if (err)
+		return err;
+
+	if (entry.type != type || entry.size != sizeof(vhd_footer_t))
+		return -EINVAL;
+
+	err = vhd_journal_read(j, footer, entry.size);
+	if (err)
+		return err;
+
+	vhd_footer_in(footer);
+
+	return vhd_validate_footer(footer);
+}
+
+/* Read the journalled primary footer (entry type FOOTER_P) into @footer. */
+static int
+vhd_journal_read_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+	const uint32_t type = VHD_JOURNAL_ENTRY_TYPE_FOOTER_P;
+
+	return __vhd_journal_read_footer(j, footer, type);
+}
+
+/* Read the journalled footer copy (entry type FOOTER_C) into @footer. */
+static int
+vhd_journal_read_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+	const uint32_t type = VHD_JOURNAL_ENTRY_TYPE_FOOTER_C;
+
+	return __vhd_journal_read_footer(j, footer, type);
+}
+
+/*
+ * Read the next journal entry, which must be a HEADER entry of
+ * exactly sizeof(vhd_header_t) bytes, into @header; then convert and
+ * validate the header.
+ * Returns 0 on success, -EINVAL on a type/size mismatch, or another
+ * negative errno value.
+ */
+static int
+vhd_journal_read_header(vhd_journal_t *j, vhd_header_t *header)
+{
+	int err;
+	vhd_journal_entry_t entry;
+
+	err = vhd_journal_read_entry(j, &entry);
+	if (err)
+		return err;
+
+	if (entry.type != VHD_JOURNAL_ENTRY_TYPE_HEADER ||
+	    entry.size != sizeof(vhd_header_t))
+		return -EINVAL;
+
+	err = vhd_journal_read(j, header, entry.size);
+	if (err)
+		return err;
+
+	vhd_header_in(header);
+
+	return vhd_validate_header(header);
+}
+
+/*
+ * Read the run of consecutive LOCATOR entries at the current journal
+ * position.
+ *
+ * On success *locators points to a calloc()ed array with one slot per
+ * parent locator of the vhd header, whose first *locs slots hold
+ * sector-aligned buffers with the locator payloads; the caller owns
+ * the array and the buffers.  The journal is left positioned at the
+ * first entry that is not a LOCATOR.
+ *
+ * Returns 0 on success or a negative errno value, in which case
+ * everything allocated here is freed and *locators is NULL.
+ */
+static int
+vhd_journal_read_locators(vhd_journal_t *j, char ***locators, int *locs)
+{
+	int err, n, _locs;
+	char **_locators, *buf;
+	off64_t pos;
+	vhd_journal_entry_t entry;
+
+	_locs = 0;
+	*locs = 0;
+	*locators = NULL;
+	buf = NULL;
+
+	n = sizeof(j->vhd.header.loc) / sizeof(vhd_parent_locator_t);
+	_locators = calloc(n, sizeof(char *));
+	if (!_locators)
+		return -ENOMEM;
+
+	for (;;) {
+		buf = NULL;
+
+		/* remembered so a non-locator entry can be un-read */
+		pos = vhd_journal_position(j);
+		if (pos == (off64_t)-1) {
+			err = -errno;
+			goto fail;
+		}
+
+		err = vhd_journal_read_entry(j, &entry);
+		if (err)
+			goto fail;
+
+		if (entry.type != VHD_JOURNAL_ENTRY_TYPE_LOCATOR) {
+			err = vhd_journal_seek(j, pos, SEEK_SET);
+			if (err)
+				goto fail;
+			break;
+		}
+
+		/* more locator entries than the header has slots */
+		if (_locs >= n) {
+			err = -EINVAL;
+			goto fail;
+		}
+
+		err = posix_memalign((void **)&buf,
+				     VHD_SECTOR_SIZE, entry.size);
+		if (err) {
+			err = -err;
+			buf = NULL;
+			goto fail;
+		}
+
+		err = vhd_journal_read(j, buf, entry.size);
+		if (err)
+			goto fail;
+
+		_locators[_locs++] = buf;
+		err = 0;
+	}
+
+	*locs = _locs;
+	*locators = _locators;
+
+	return 0;
+
+fail:
+	/* buf is only non-NULL while it is not yet stored in _locators */
+	free(buf);
+	for (n = 0; n < _locs; n++)
+		free(_locators[n]);
+	free(_locators);
+	return err;
+}
+
+/*
+ * Read the journalled BAT into @bat.
+ *
+ * The next entry must be a BAT entry whose size equals the padded
+ * size of header.max_bat_size 32-bit slots and whose offset equals
+ * header.table_offset.  On success bat->bat is a newly allocated
+ * sector-aligned buffer owned by the caller; on failure it is NULL
+ * if an allocation had been made.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+	int err;
+	size_t bytes;
+	vhd_journal_entry_t entry;
+	vhd_context_t *vhd = &j->vhd;
+
+	bytes = vhd_bytes_padded(vhd->header.max_bat_size * sizeof(uint32_t));
+
+	err = vhd_journal_read_entry(j, &entry);
+	if (err)
+		return err;
+
+	if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BAT ||
+	    entry.size != bytes ||
+	    entry.offset != vhd->header.table_offset)
+		return -EINVAL;
+
+	/* posix_memalign() reports errors as positive numbers */
+	err = posix_memalign((void **)&bat->bat, VHD_SECTOR_SIZE, bytes);
+	if (err)
+		return -err;
+
+	err = vhd_journal_read(j, bat->bat, entry.size);
+	if (err) {
+		free(bat->bat);
+		bat->bat = NULL;
+		return err;
+	}
+
+	bat->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT;
+	bat->entries = vhd->header.max_bat_size;
+	vhd_bat_in(bat);
+
+	return 0;
+}
+
+/*
+ * Read the journalled batmap header into batmap->header.
+ *
+ * The next entry must be a BATMAP_H entry of exactly
+ * vhd_bytes_padded(sizeof(struct dd_batmap_hdr)) bytes.  The payload
+ * is read into a temporary sector-aligned buffer, of which only the
+ * leading sizeof(batmap->header) bytes are kept.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_batmap_header(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+	int err;
+	char *buf;
+	size_t size;
+	vhd_journal_entry_t entry;
+
+	size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+	err = vhd_journal_read_entry(j, &entry);
+	if (err)
+		return err;
+
+	if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_H)
+		return -EINVAL;
+
+	if (entry.size != size)
+		return -EINVAL;
+
+	/* posix_memalign() returns a positive errno; callers expect negative */
+	err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+	if (err)
+		return -err;
+
+	err = vhd_journal_read(j, buf, entry.size);
+	if (err) {
+		free(buf);
+		return err;
+	}
+
+	memcpy(&batmap->header, buf, sizeof(batmap->header));
+
+	/* the bounce buffer is not needed once the header is copied */
+	free(buf);
+
+	vhd_batmap_header_in(batmap);
+	return vhd_validate_batmap_header(batmap);
+}
+
+/*
+ * Read the journalled batmap bitmap into batmap->map.
+ *
+ * The next entry must be a BATMAP_M entry whose size and offset agree
+ * with batmap->header (which must already be filled in).  On success
+ * batmap->map is a newly allocated sector-aligned buffer owned by the
+ * caller; if the payload read fails it is freed and set to NULL.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_batmap_map(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+	int err;
+	vhd_journal_entry_t entry;
+
+	err = vhd_journal_read_entry(j, &entry);
+	if (err)
+		return err;
+
+	if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_M ||
+	    entry.size != vhd_sectors_to_bytes(batmap->header.batmap_size) ||
+	    entry.offset != batmap->header.batmap_offset)
+		return -EINVAL;
+
+	/* posix_memalign() reports errors as positive numbers */
+	err = posix_memalign((void **)&batmap->map,
+			     VHD_SECTOR_SIZE, entry.size);
+	if (err)
+		return -err;
+
+	err = vhd_journal_read(j, batmap->map, entry.size);
+	if (err) {
+		free(batmap->map);
+		batmap->map = NULL;
+	}
+
+	return err;
+}
+
+/*
+ * Read the journaled batmap (header entry followed by map entry) into
+ * @batmap and validate it.  If validation fails the map buffer is freed
+ * and @batmap->map is set to NULL.
+ */
+static int
+vhd_journal_read_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+	int rc;
+
+	rc = vhd_journal_read_batmap_header(j, batmap);
+	if (!rc)
+		rc = vhd_journal_read_batmap_map(j, batmap);
+	if (rc)
+		return rc;
+
+	rc = vhd_validate_batmap(batmap);
+	if (rc) {
+		free(batmap->map);
+		batmap->map = NULL;
+	}
+
+	return rc;
+}
+
+/*
+ * Write @footer into the vhd at the footer offset recorded in the
+ * journal header.
+ */
+static int
+vhd_journal_restore_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+	vhd_context_t *vhd = &j->vhd;
+
+	return vhd_write_footer_at(vhd, footer, j->header.vhd_footer_offset);
+}
+
+/*
+ * Write @footer at offset 0 of the vhd.
+ */
+static int
+vhd_journal_restore_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+	vhd_context_t *vhd = &j->vhd;
+
+	return vhd_write_footer_at(vhd, footer, 0);
+}
+
+/*
+ * Write @header into the vhd at the data_offset recorded in the
+ * context's footer.
+ */
+static int
+vhd_journal_restore_header(vhd_journal_t *j, vhd_header_t *header)
+{
+	vhd_context_t *vhd = &j->vhd;
+	off64_t where = vhd->footer.data_offset;
+
+	return vhd_write_header_at(vhd, header, where);
+}
+
+/*
+ * Write the journaled parent locator payloads back into the vhd.
+ *
+ * Header locator slots are walked in order; each slot whose code is not
+ * PLAT_CODE_NONE consumes the next buffer from @locators and is written
+ * at that slot's data_offset.  Stops after @locs buffers or at the
+ * first seek/write error, which is returned.
+ */
+static int
+vhd_journal_restore_locators(vhd_journal_t *j, char **locators, int locs)
+{
+	int slot, used, total, rc;
+	vhd_context_t *vhd = &j->vhd;
+	vhd_parent_locator_t *loc;
+
+	used  = 0;
+	total = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+	for (slot = 0; slot < total && used < locs; slot++) {
+		loc = &vhd->header.loc[slot];
+		if (loc->code == PLAT_CODE_NONE)
+			continue;
+
+		rc = vhd_seek(vhd, loc->data_offset, SEEK_SET);
+		if (rc)
+			return rc;
+
+		rc = vhd_write(vhd, locators[used++],
+			       vhd_parent_locator_size(loc));
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Write @bat back to the journal's vhd via vhd_write_bat().
+ */
+static int
+vhd_journal_restore_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+	vhd_context_t *vhd = &j->vhd;
+
+	return vhd_write_bat(vhd, bat);
+}
+
+/*
+ * Write @batmap back to the journal's vhd via vhd_write_batmap().
+ */
+static int
+vhd_journal_restore_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+	vhd_context_t *vhd = &j->vhd;
+
+	return vhd_write_batmap(vhd, batmap);
+}
+
+/*
+ * Re-read the vhd metadata saved in the journal and write it back to
+ * the vhd file.
+ *
+ * Read phase (starting right after the journal header): footer, and for
+ * dynamic vhds the footer copy, header, parent locators, BAT and, when
+ * present, the batmap.  The journal position must then equal
+ * header.journal_data_offset.  Write phase restores the same items in
+ * the same order.  On success, for non-block-device vhds, the file is
+ * truncated to end right after the restored footer.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_restore_metadata(vhd_journal_t *j)
+{
+	off64_t off;
+	char **locators;
+	vhd_footer_t copy;
+	vhd_context_t *vhd;
+	int i, locs, hlocs, err;
+
+	vhd      = &j->vhd;
+	locs     = 0;
+	hlocs    = 0;
+	locators = NULL;
+
+	err = vhd_journal_seek(j, sizeof(vhd_journal_header_t), SEEK_SET);
+	if (err)
+		return err;
+
+	err = vhd_journal_read_footer(j, &vhd->footer);
+	if (err)
+		return err;
+
+	if (!vhd_type_dynamic(vhd))
+		goto restore;
+
+	err = vhd_journal_read_footer_copy(j, &copy);
+	if (err)
+		return err;
+
+	err = vhd_journal_read_header(j, &vhd->header);
+	if (err)
+		return err;
+
+	for (hlocs = 0, i = 0; i < vhd_parent_locator_count(vhd); i++) {
+		/* err is 0 at this point, so it must not be returned here:
+		 * an invalid code has to be reported as a failure */
+		if (vhd_validate_platform_code(vhd->header.loc[i].code))
+			return -EINVAL;
+
+		if (vhd->header.loc[i].code != PLAT_CODE_NONE)
+			hlocs++;
+	}
+
+	if (hlocs) {
+		err = vhd_journal_read_locators(j, &locators, &locs);
+		if (err)
+			return err;
+
+		if (hlocs != locs) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	err = vhd_journal_read_bat(j, &vhd->bat);
+	if (err)
+		goto out;
+
+	if (vhd_has_batmap(vhd)) {
+		err = vhd_journal_read_batmap(j, &vhd->batmap);
+		if (err)
+			goto out;
+	}
+
+restore:
+	/* from here on every failure goes through 'out' so that the
+	 * locator buffers read above are released */
+	off = vhd_journal_position(j);
+	if (off == (off64_t)-1) {
+		err = -errno;
+		goto out;
+	}
+
+	if (j->header.journal_data_offset != off) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = vhd_journal_restore_footer(j, &vhd->footer);
+	if (err)
+		goto out;
+
+	if (!vhd_type_dynamic(vhd))
+		goto out;
+
+	err = vhd_journal_restore_footer_copy(j, &copy);
+	if (err)
+		goto out;
+
+	err = vhd_journal_restore_header(j, &vhd->header);
+	if (err)
+		goto out;
+
+	if (locs) {
+		err = vhd_journal_restore_locators(j, locators, locs);
+		if (err)
+			goto out;
+	}
+
+	err = vhd_journal_restore_bat(j, &vhd->bat);
+	if (err)
+		goto out;
+
+	if (vhd_has_batmap(vhd)) {
+		err = vhd_journal_restore_batmap(j, &vhd->batmap);
+		if (err)
+			goto out;
+	}
+
+	err = 0;
+
+out:
+	if (locators) {
+		for (i = 0; i < locs; i++)
+			free(locators[i]);
+		free(locators);
+	}
+
+	/* ftruncate() returns -1 and sets errno; translate it so callers
+	 * see a real error code rather than a bare -1 */
+	if (!err && !vhd->is_block) {
+		if (ftruncate(vhd->fd,
+			      j->header.vhd_footer_offset +
+			      sizeof(vhd_footer_t)))
+			err = -errno;
+	}
+
+	return err;
+}
+
+/*
+ * Overwrite the vhd footer cookie with VHD_POISON_COOKIE, recompute the
+ * footer checksum and write the footer back.
+ */
+static int
+vhd_journal_disable_vhd(vhd_journal_t *j)
+{
+	int rc;
+	vhd_context_t *vhd = &j->vhd;
+	vhd_footer_t *footer = &vhd->footer;
+
+	rc = vhd_get_footer(vhd);
+	if (rc)
+		return rc;
+
+	memcpy(&footer->cookie, VHD_POISON_COOKIE, sizeof(footer->cookie));
+	footer->checksum = vhd_checksum_footer(footer);
+
+	return vhd_write_footer(vhd, footer);
+}
+
+/*
+ * If the vhd is currently disabled (per vhd_disabled()), put HD_COOKIE
+ * back into the footer, recompute the checksum and write the footer.
+ * Does nothing and returns 0 when the vhd is not disabled.
+ */
+static int
+vhd_journal_enable_vhd(vhd_journal_t *j)
+{
+	int rc;
+	vhd_context_t *vhd = &j->vhd;
+	vhd_footer_t *footer = &vhd->footer;
+
+	rc = vhd_get_footer(vhd);
+	if (rc)
+		return rc;
+
+	if (vhd_disabled(vhd)) {
+		memcpy(&footer->cookie, HD_COOKIE, sizeof(footer->cookie));
+		footer->checksum = vhd_checksum_footer(footer);
+		rc = vhd_write_footer(vhd, footer);
+	}
+
+	return rc;
+}
+
+/*
+ * Release a journal handle: close the journal descriptor, close the
+ * embedded vhd context and free the journal file name.
+ * Always returns 0.
+ */
+int
+vhd_journal_close(vhd_journal_t *j)
+{
+ /*
+  * NOTE(review): this tests for non-zero, not for != -1, so the -1
+  * value stored by vhd_journal_open() before the journal is opened
+  * also reaches close(), and a descriptor equal to 0 would be
+  * skipped -- confirm this is intended.
+  */
+ if (j->jfd)
+ close(j->jfd);
+
+ vhd_close(&j->vhd);
+ free(j->jname);
+
+ return 0;
+}
+
+/*
+ * Re-enable the vhd, then tear the journal down: close the journal
+ * descriptor, unlink the journal file unless it is a block device,
+ * close the vhd context and free the journal name.
+ *
+ * If re-enabling the vhd fails, that error is returned and nothing is
+ * closed, unlinked or freed.
+ */
+int
+vhd_journal_remove(vhd_journal_t *j)
+{
+ int err;
+
+ err = vhd_journal_enable_vhd(j);
+ if (err)
+ return err;
+
+ /* same non-zero test as vhd_journal_close(); see the note there */
+ if (j->jfd) {
+ close(j->jfd);
+ if (!j->is_block)
+ unlink(j->jname);
+ }
+
+ vhd_close(&j->vhd);
+ free(j->jname);
+
+ return 0;
+}
+
+/*
+ * Open an existing journal @jfile for the vhd @file.
+ *
+ * The journal header is read and the metadata it holds is written back
+ * to the vhd through a raw O_DIRECT descriptor.  The vhd is then
+ * re-opened with vhd_open(), its BAT (and batmap, if any) are loaded,
+ * and the vhd is marked disabled.
+ *
+ * Returns 0 on success.  On any failure after the name is duplicated,
+ * vhd_journal_close() is called on @j and the error is returned.
+ */
+int
+vhd_journal_open(vhd_journal_t *j, const char *file, const char *jfile)
+{
+ int err;
+ vhd_context_t *vhd;
+
+ memset(j, 0, sizeof(vhd_journal_t));
+
+ j->jfd = -1;
+ vhd = &j->vhd;
+
+ j->jname = strdup(jfile);
+ if (j->jname == NULL)
+ return -ENOMEM;
+
+ j->jfd = open(j->jname, O_LARGEFILE | O_RDWR);
+ if (j->jfd == -1) {
+ err = -errno;
+ goto fail;
+ }
+
+ err = vhd_test_file_fixed(j->jname, &j->is_block);
+ if (err)
+ goto fail;
+
+ /* raw descriptor, used only for the metadata restore below */
+ vhd->fd = open(file, O_LARGEFILE | O_RDWR | O_DIRECT);
+ if (vhd->fd == -1) {
+ err = -errno;
+ goto fail;
+ }
+
+ err = vhd_test_file_fixed(file, &vhd->is_block);
+ if (err)
+ goto fail;
+
+ err = vhd_journal_read_journal_header(j, &j->header);
+ if (err)
+ goto fail;
+
+ err = vhd_journal_restore_metadata(j);
+ if (err)
+ goto fail;
+
+ /* drop the raw descriptor and the buffers filled by the restore
+  * before re-opening the vhd properly */
+ close(vhd->fd);
+ free(vhd->bat.bat);
+ free(vhd->batmap.map);
+
+ /*
+  * NOTE(review): the pointers freed above are not reset here; this
+  * presumably relies on vhd_open() reinitialising the context before
+  * the 'fail' path can reach vhd_close() -- confirm.
+  */
+ err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+ if (err)
+ goto fail;
+
+ err = vhd_get_bat(vhd);
+ if (err)
+ goto fail;
+
+ if (vhd_has_batmap(vhd)) {
+ err = vhd_get_batmap(vhd);
+ if (err)
+ goto fail;
+ }
+
+ err = vhd_journal_disable_vhd(j);
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ vhd_journal_close(j);
+ return err;
+}
+
+/*
+ * Create a new journal @jfile for the vhd @file.
+ *
+ * An existing @jfile is only accepted when it is a block device;
+ * otherwise -EEXIST is returned.  Regular journal files are created
+ * (and truncated) with mode 0644.  The vhd is opened strictly, its BAT
+ * and optional batmap are loaded, the journal header and the vhd
+ * metadata are added to the journal, the vhd is marked disabled and
+ * the journal is synced.
+ *
+ * Returns 0 on success or a negative errno value.  Failures before the
+ * vhd is open undo the journal file and zero @j; later failures go
+ * through vhd_journal_remove().
+ */
+int
+vhd_journal_create(vhd_journal_t *j, const char *file, const char *jfile)
+{
+	int err;
+
+	memset(j, 0, sizeof(vhd_journal_t));
+	j->jfd = -1;
+
+	j->jname = strdup(jfile);
+	if (j->jname == NULL) {
+		err = -ENOMEM;
+		goto fail1;
+	}
+
+	if (access(j->jname, F_OK) == 0) {
+		err = vhd_test_file_fixed(j->jname, &j->is_block);
+		if (err)
+			goto fail1;
+
+		if (!j->is_block) {
+			err = -EEXIST;
+			goto fail1;
+		}
+	}
+
+	if (j->is_block)
+		j->jfd = open(j->jname, O_LARGEFILE | O_RDWR, 0644);
+	else
+		j->jfd = open(j->jname,
+			      O_CREAT | O_TRUNC | O_LARGEFILE | O_RDWR, 0644);
+	if (j->jfd == -1) {
+		err = -errno;
+		goto fail1;
+	}
+
+	err = vhd_open(&j->vhd, file, VHD_OPEN_RDWR | VHD_OPEN_STRICT);
+	if (err)
+		goto fail1;
+
+	err = vhd_get_bat(&j->vhd);
+	if (err)
+		goto fail2;
+
+	if (vhd_has_batmap(&j->vhd)) {
+		err = vhd_get_batmap(&j->vhd);
+		if (err)
+			goto fail2;
+	}
+
+	err = vhd_journal_add_journal_header(j);
+	if (err)
+		goto fail2;
+
+	err = vhd_journal_add_metadata(j);
+	if (err)
+		goto fail2;
+
+	err = vhd_journal_disable_vhd(j);
+	if (err)
+		goto fail2;
+
+	err = vhd_journal_sync(j);
+	if (err)
+		goto fail2;
+
+	return 0;
+
+fail1:
+	/* vhd not open yet: undo only the journal file itself */
+	if (j->jfd != -1) {
+		close(j->jfd);
+		if (!j->is_block)
+			unlink(j->jname);
+	}
+	free(j->jname);
+	memset(j, 0, sizeof(vhd_journal_t));
+
+	return err;
+
+fail2:
+	vhd_journal_remove(j);
+	return err;
+}
+
+/*
+ * Append the current on-disk contents of @block to the journal.
+ *
+ * @mode is a bit mask: VHD_JOURNAL_METADATA journals the block's
+ * bitmap, VHD_JOURNAL_DATA journals the block's data, which is located
+ * bm_secs sectors past the start of the block.  Blocks whose BAT entry
+ * is DD_BLK_UNUSED are skipped and 0 is returned.
+ *
+ * Returns -EINVAL for non-dynamic vhds, -ERANGE when @block is outside
+ * the BAT, another error from the read/update helpers, or the result
+ * of vhd_journal_sync().
+ */
+int
+vhd_journal_add_block(vhd_journal_t *j, uint32_t block, char mode)
+{
+	int rc;
+	size_t len;
+	off64_t pos;
+	uint64_t sector;
+	char *data = NULL;
+	vhd_context_t *vhd = &j->vhd;
+
+	if (!vhd_type_dynamic(vhd))
+		return -EINVAL;
+
+	rc = vhd_get_bat(vhd);
+	if (rc)
+		return rc;
+
+	if (block >= vhd->bat.entries)
+		return -ERANGE;
+
+	sector = vhd->bat.bat[block];
+	if (sector == DD_BLK_UNUSED)
+		return 0;
+
+	pos = vhd_sectors_to_bytes(sector);
+
+	if (mode & VHD_JOURNAL_METADATA) {
+		len = vhd_sectors_to_bytes(vhd->bm_secs);
+
+		rc = vhd_read_bitmap(vhd, block, &data);
+		if (rc)
+			return rc;
+
+		rc = vhd_journal_update(j, pos, data, len,
+					VHD_JOURNAL_ENTRY_TYPE_DATA);
+		free(data);
+		if (rc)
+			return rc;
+	}
+
+	if (mode & VHD_JOURNAL_DATA) {
+		/* block data follows the bitmap sectors */
+		pos += vhd_sectors_to_bytes(vhd->bm_secs);
+		len  = vhd_sectors_to_bytes(vhd->spb);
+
+		rc = vhd_read_block(vhd, block, &data);
+		if (rc)
+			return rc;
+
+		rc = vhd_journal_update(j, pos, data, len,
+					VHD_JOURNAL_ENTRY_TYPE_DATA);
+		free(data);
+		if (rc)
+			return rc;
+	}
+
+	return vhd_journal_sync(j);
+}
+
+/*
+ * commit: the transaction completed successfully, so the undo log is
+ * no longer needed.  The entry counts and offsets in the journal
+ * header are zeroed and the header is written out; a journal that is
+ * not a block device is then truncated down to just its header.
+ */
+int
+vhd_journal_commit(vhd_journal_t *j)
+{
+	int rc;
+
+	j->header.journal_metadata_offset  = 0;
+	j->header.journal_metadata_entries = 0;
+	j->header.journal_data_offset      = 0;
+	j->header.journal_data_entries     = 0;
+
+	rc = vhd_journal_write_header(j, &j->header);
+	if (rc)
+		return rc;
+
+	/* block devices cannot be truncated */
+	if (j->is_block)
+		return 0;
+
+	if (vhd_journal_truncate(j, sizeof(vhd_journal_header_t)))
+		return -errno;
+
+	return 0;
+}
+
+/*
+ * revert indicates the transaction failed
+ * and we should revert any changes via the undo log
+ *
+ * The vhd is closed and re-opened through a raw O_DIRECT descriptor,
+ * the journaled metadata is written back, and the vhd is opened again
+ * with vhd_open().  Every journaled data entry is then read, validated
+ * and written back at its recorded offset.  A vhd that is not a block
+ * device is finally truncated to end right after its footer, and the
+ * journal is synced.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_journal_revert(vhd_journal_t *j)
+{
+	int i, err;
+	char *buf, *file;
+	vhd_context_t *vhd;
+	vhd_journal_entry_t entry;
+
+	err = 0;
+	vhd = &j->vhd;
+	buf = NULL;
+
+	/* vhd_close() below may release vhd->file, so keep our own copy */
+	file = strdup(vhd->file);
+	if (!file)
+		return -ENOMEM;
+
+	vhd_close(&j->vhd);
+	j->vhd.fd = open(file, O_RDWR | O_DIRECT | O_LARGEFILE);
+	if (j->vhd.fd == -1) {
+		/* capture errno first: free() is allowed to modify it */
+		err = -errno;
+		free(file);
+		return err;
+	}
+
+	err = vhd_test_file_fixed(file, &vhd->is_block);
+	if (err) {
+		free(file);
+		return err;
+	}
+
+	err = vhd_journal_restore_metadata(j);
+	if (err) {
+		free(file);
+		return err;
+	}
+
+	close(vhd->fd);
+	free(vhd->bat.bat);
+	free(vhd->batmap.map);
+
+	err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+	free(file);
+	if (err)
+		return err;
+
+	err = vhd_journal_seek(j, j->header.journal_data_offset, SEEK_SET);
+	if (err)
+		return err;
+
+	for (i = 0; i < j->header.journal_data_entries; i++) {
+		err = vhd_journal_read_entry(j, &entry);
+		if (err)
+			goto end;
+
+		err = posix_memalign((void **)&buf,
+				     VHD_SECTOR_SIZE, entry.size);
+		if (err) {
+			err = -err;
+			buf = NULL;
+			goto end;
+		}
+
+		err = vhd_journal_read(j, buf, entry.size);
+		if (err)
+			goto end;
+
+		err = vhd_journal_validate_entry_data(&entry, buf);
+		if (err)
+			goto end;
+
+		err = vhd_seek(vhd, entry.offset, SEEK_SET);
+		if (err)
+			goto end;
+
+		err = vhd_write(vhd, buf, entry.size);
+		if (err)
+			goto end;
+
+		err = 0;
+
+	end:
+		/* per-entry cleanup, reached on success and on failure */
+		free(buf);
+		buf = NULL;
+		if (err)
+			break;
+	}
+
+	if (err)
+		return err;
+
+	if (!vhd->is_block) {
+		err = ftruncate(vhd->fd, j->header.vhd_footer_offset +
+				sizeof(vhd_footer_t));
+		if (err)
+			return -errno;
+	}
+
+	return vhd_journal_sync(j);
+}
diff --git a/tools/blktap2/vhd/lib/libvhd.c b/tools/blktap2/vhd/lib/libvhd.c
new file mode 100644
index 0000000000..1af30ad1f6
--- /dev/null
+++ b/tools/blktap2/vhd/lib/libvhd.c
@@ -0,0 +1,3328 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <libgen.h>
+#include <iconv.h>
+#include <sys/mman.h>
+
+#include "libvhd.h"
+#include "relative-path.h"
+
+static int libvhd_dbg = 0;
+
+/*
+ * Turn libvhd debug logging on when @level is non-zero.
+ * A zero @level leaves the current setting untouched.
+ */
+void
+libvhd_set_log_level(int level)
+{
+	if (!level)
+		return;
+
+	libvhd_dbg = 1;
+}
+
+/*
+ * printf-style debug logging: when libvhd_dbg is non-zero the message
+ * is sent to syslog at LOG_INFO, prefixed with "libvhd::<function>: ".
+ */
+#define VHDLOG(_f, _a...) \
+ do { \
+ if (libvhd_dbg) \
+ syslog(LOG_INFO, "libvhd::%s: "_f, \
+ __func__, ##_a); \
+ } while (0)
+
+#define BIT_MASK 0x80
+
+#ifdef ENABLE_FAILURE_TESTING
+/*
+ * One name per failure-injection point, NUM_FAIL_TESTS in total
+ * (presumably environment variable names -- confirm against the code
+ * that consumes this table).
+ */
+const char* ENV_VAR_FAIL[NUM_FAIL_TESTS] = {
+ "VHD_UTIL_TEST_FAIL_REPARENT_BEGIN",
+ "VHD_UTIL_TEST_FAIL_REPARENT_LOCATOR",
+ "VHD_UTIL_TEST_FAIL_REPARENT_END",
+ "VHD_UTIL_TEST_FAIL_RESIZE_BEGIN",
+ "VHD_UTIL_TEST_FAIL_RESIZE_DATA_MOVED",
+ "VHD_UTIL_TEST_FAIL_RESIZE_METADATA_MOVED",
+ "VHD_UTIL_TEST_FAIL_RESIZE_END"
+};
+/* one int per entry of ENV_VAR_FAIL */
+int TEST_FAIL[NUM_FAIL_TESTS];
+#endif // ENABLE_FAILURE_TESTING
+
+/*
+ * Return 1 if bit @nr is set in the byte array @addr, else 0.
+ * Bits are numbered from the most significant bit of each byte.
+ */
+static inline int
+test_bit (volatile char *addr, int nr)
+{
+	int shifted = addr[nr >> 3] << (nr & 7);
+
+	return (shifted & BIT_MASK) != 0;
+}
+
+/*
+ * Set bit @nr in the byte array @addr (most significant bit first
+ * within each byte).
+ */
+static inline void
+set_bit (volatile char *addr, int nr)
+{
+	volatile char *byte = addr + (nr >> 3);
+
+	*byte |= (BIT_MASK >> (nr & 7));
+}
+
+/*
+ * Clear bit @nr in the byte array @addr (most significant bit first
+ * within each byte).
+ */
+static inline void
+clear_bit (volatile char *addr, int nr)
+{
+	volatile char *byte = addr + (nr >> 3);
+
+	*byte &= ~(BIT_MASK >> (nr & 7));
+}
+
+/*
+ * Legacy bit layout: @addr is treated as an array of 32-bit words and
+ * bit @nr is counted from the least significant bit of each word.
+ * Returns the bit's value (0 or 1).
+ */
+static inline int
+old_test_bit(volatile char *addr, int nr)
+{
+	uint32_t *words = (uint32_t *)addr;
+	uint32_t word = words[nr >> 5];
+
+	return (word >> (nr & 31)) & 1;
+}
+
+/*
+ * Legacy bit layout: set bit @nr where @addr is treated as an array of
+ * 32-bit words and bits are counted from the least significant bit.
+ *
+ * The mask is built from an unsigned one: shifting a signed 1 into bit
+ * 31 is undefined behaviour.
+ */
+static inline void
+old_set_bit(volatile char *addr, int nr)
+{
+	((uint32_t *)addr)[nr >> 5] |= ((uint32_t)1 << (nr & 31));
+}
+
+/*
+ * Legacy bit layout: clear bit @nr where @addr is treated as an array
+ * of 32-bit words and bits are counted from the least significant bit.
+ *
+ * The mask is built from an unsigned one: shifting a signed 1 into bit
+ * 31 is undefined behaviour.
+ */
+static inline void
+old_clear_bit(volatile char *addr, int nr)
+{
+	((uint32_t *)addr)[nr >> 5] &= ~((uint32_t)1 << (nr & 31));
+}
+
+/*
+ * Apply the BE32_IN/BE64_IN conversions to every multi-byte integer
+ * field of @footer, in place.
+ */
+void
+vhd_footer_in(vhd_footer_t *footer)
+{
+	/* 64-bit fields */
+	BE64_IN(&footer->data_offset);
+	BE64_IN(&footer->orig_size);
+	BE64_IN(&footer->curr_size);
+
+	/* 32-bit fields */
+	BE32_IN(&footer->features);
+	BE32_IN(&footer->ff_version);
+	BE32_IN(&footer->timestamp);
+	BE32_IN(&footer->crtr_ver);
+	BE32_IN(&footer->crtr_os);
+	BE32_IN(&footer->geometry);
+	BE32_IN(&footer->type);
+	BE32_IN(&footer->checksum);
+}
+
+/*
+ * Apply the BE32_OUT/BE64_OUT conversions to every multi-byte integer
+ * field of @footer, in place.
+ */
+void
+vhd_footer_out(vhd_footer_t *footer)
+{
+	/* 64-bit fields */
+	BE64_OUT(&footer->data_offset);
+	BE64_OUT(&footer->orig_size);
+	BE64_OUT(&footer->curr_size);
+
+	/* 32-bit fields */
+	BE32_OUT(&footer->features);
+	BE32_OUT(&footer->ff_version);
+	BE32_OUT(&footer->timestamp);
+	BE32_OUT(&footer->crtr_ver);
+	BE32_OUT(&footer->crtr_os);
+	BE32_OUT(&footer->geometry);
+	BE32_OUT(&footer->type);
+	BE32_OUT(&footer->checksum);
+}
+
+/*
+ * Apply the BE32_IN/BE64_IN conversions to the integer fields of
+ * @header and of each of its parent locator slots, in place.
+ */
+void
+vhd_header_in(vhd_header_t *header)
+{
+	vhd_parent_locator_t *loc, *end;
+
+	BE64_IN(&header->data_offset);
+	BE64_IN(&header->table_offset);
+	BE32_IN(&header->hdr_ver);
+	BE32_IN(&header->max_bat_size);
+	BE32_IN(&header->block_size);
+	BE32_IN(&header->checksum);
+	BE32_IN(&header->prt_ts);
+
+	end = header->loc + sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+
+	for (loc = header->loc; loc < end; loc++) {
+		BE32_IN(&loc->code);
+		BE32_IN(&loc->data_space);
+		BE32_IN(&loc->data_len);
+		BE64_IN(&loc->data_offset);
+	}
+}
+
+/*
+ * Apply the BE32_OUT/BE64_OUT conversions to the integer fields of
+ * @header and of each of its parent locator slots, in place.
+ */
+void
+vhd_header_out(vhd_header_t *header)
+{
+	vhd_parent_locator_t *loc, *end;
+
+	BE64_OUT(&header->data_offset);
+	BE64_OUT(&header->table_offset);
+	BE32_OUT(&header->hdr_ver);
+	BE32_OUT(&header->max_bat_size);
+	BE32_OUT(&header->block_size);
+	BE32_OUT(&header->checksum);
+	BE32_OUT(&header->prt_ts);
+
+	end = header->loc + sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+
+	for (loc = header->loc; loc < end; loc++) {
+		BE32_OUT(&loc->code);
+		BE32_OUT(&loc->data_space);
+		BE32_OUT(&loc->data_len);
+		BE64_OUT(&loc->data_offset);
+	}
+}
+
+/*
+ * Apply the BE32_IN/BE64_IN conversions to the integer fields of
+ * @batmap->header, in place.
+ */
+void
+vhd_batmap_header_in(vhd_batmap_t *batmap)
+{
+	/* 32-bit fields */
+	BE32_IN(&batmap->header.batmap_size);
+	BE32_IN(&batmap->header.batmap_version);
+	BE32_IN(&batmap->header.checksum);
+
+	/* 64-bit field */
+	BE64_IN(&batmap->header.batmap_offset);
+}
+
+/*
+ * Apply the BE32_OUT/BE64_OUT conversions to the integer fields of
+ * @batmap->header, in place.
+ */
+void
+vhd_batmap_header_out(vhd_batmap_t *batmap)
+{
+	/* 32-bit fields */
+	BE32_OUT(&batmap->header.batmap_size);
+	BE32_OUT(&batmap->header.batmap_version);
+	BE32_OUT(&batmap->header.checksum);
+
+	/* 64-bit field */
+	BE64_OUT(&batmap->header.batmap_offset);
+}
+
+/*
+ * Apply BE32_IN to each of the @bat->entries entries of @bat->bat.
+ */
+void
+vhd_bat_in(vhd_bat_t *bat)
+{
+	int idx = 0;
+
+	while (idx < bat->entries) {
+		BE32_IN(&bat->bat[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Apply BE32_OUT to each of the @bat->entries entries of @bat->bat.
+ */
+void
+vhd_bat_out(vhd_bat_t *bat)
+{
+	int idx = 0;
+
+	while (idx < bat->entries) {
+		BE32_OUT(&bat->bat[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Compute the footer checksum: the one's complement of the sum of all
+ * bytes of @footer, taken with the checksum field itself zeroed.  The
+ * stored checksum is put back before returning.
+ */
+uint32_t
+vhd_checksum_footer(vhd_footer_t *footer)
+{
+	uint32_t sum, saved;
+	const unsigned char *p, *end;
+
+	saved = footer->checksum;
+	footer->checksum = 0;
+
+	sum = 0;
+	p   = (const unsigned char *)footer;
+	end = p + sizeof(vhd_footer_t);
+	while (p < end)
+		sum += (uint32_t)*p++;
+
+	footer->checksum = saved;
+	return ~sum;
+}
+
+/*
+ * Validate @footer: the cookie must be HD_COOKIE or VHD_POISON_COOKIE
+ * and the stored checksum must match the computed one.
+ *
+ * Returns 0 when valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_footer(vhd_footer_t *footer)
+{
+	int len;
+	uint32_t sum;
+
+	len = sizeof(footer->cookie);
+	if (memcmp(footer->cookie, HD_COOKIE, len) &&
+	    memcmp(footer->cookie, VHD_POISON_COOKIE, len)) {
+		char cookie[9];
+
+		memcpy(cookie, footer->cookie, 8);
+		cookie[8] = '\0';
+		VHDLOG("invalid footer cookie: %s\n", cookie);
+		return -EINVAL;
+	}
+
+	sum = vhd_checksum_footer(footer);
+	if (sum == footer->checksum)
+		return 0;
+
+	/*
+	 * early td-util did not re-calculate
+	 * checksum when marking vhds 'hidden',
+	 * so retry with the hidden flag cleared
+	 */
+	if (footer->hidden &&
+	    !strncmp(footer->crtr_app, "tap", 3) &&
+	    (footer->crtr_ver == VHD_VERSION(0, 1) ||
+	     footer->crtr_ver == VHD_VERSION(1, 1))) {
+		char saved = footer->hidden;
+
+		footer->hidden = 0;
+		sum = vhd_checksum_footer(footer);
+		footer->hidden = saved;
+
+		if (sum == footer->checksum)
+			return 0;
+	}
+
+	VHDLOG("invalid footer checksum: "
+	       "footer = 0x%08x, calculated = 0x%08x\n",
+	       footer->checksum, sum);
+	return -EINVAL;
+}
+
+/*
+ * Compute the header checksum: the one's complement of the sum of all
+ * bytes of @header, taken with the checksum field itself zeroed.  The
+ * stored checksum is put back before returning.
+ */
+uint32_t
+vhd_checksum_header(vhd_header_t *header)
+{
+	uint32_t sum, saved;
+	const unsigned char *p, *end;
+
+	saved = header->checksum;
+	header->checksum = 0;
+
+	sum = 0;
+	p   = (const unsigned char *)header;
+	end = p + sizeof(vhd_header_t);
+	while (p < end)
+		sum += (uint32_t)*p++;
+
+	header->checksum = saved;
+	return ~sum;
+}
+
+/*
+ * Validate @header: DD_COOKIE cookie, header version 0x00010000,
+ * data_offset of all ones, a known platform code in every parent
+ * locator slot, and a matching checksum.
+ *
+ * Returns 0 when valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_header(vhd_header_t *header)
+{
+	int slot, nslots;
+	uint32_t sum;
+
+	if (memcmp(header->cookie, DD_COOKIE, 8) != 0) {
+		char cookie[9];
+
+		memcpy(cookie, header->cookie, 8);
+		cookie[8] = '\0';
+		VHDLOG("invalid header cookie: %s\n", cookie);
+		return -EINVAL;
+	}
+
+	if (header->hdr_ver != 0x00010000) {
+		VHDLOG("invalid header version 0x%08x\n", header->hdr_ver);
+		return -EINVAL;
+	}
+
+	if (header->data_offset != 0xFFFFFFFFFFFFFFFF) {
+		VHDLOG("invalid header data_offset 0x%016"PRIx64"\n",
+		       header->data_offset);
+		return -EINVAL;
+	}
+
+	nslots = sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+	for (slot = 0; slot < nslots; slot++) {
+		if (vhd_validate_platform_code(header->loc[slot].code))
+			return -EINVAL;
+	}
+
+	sum = vhd_checksum_header(header);
+	if (sum == header->checksum)
+		return 0;
+
+	VHDLOG("invalid header checksum: "
+	       "header = 0x%08x, calculated = 0x%08x\n",
+	       header->checksum, sum);
+	return -EINVAL;
+}
+
+/*
+ * A BAT is considered valid as soon as its table pointer is non-NULL.
+ * Returns 0 when valid, -EINVAL otherwise.
+ */
+static inline int
+vhd_validate_bat(vhd_bat_t *bat)
+{
+	return bat->bat ? 0 : -EINVAL;
+}
+
+/*
+ * Compute the batmap checksum: the one's complement of the sum of the
+ * map bytes.  For batmap version 1.1 each byte is added as a plain
+ * (possibly signed) char; for every other version it is added as an
+ * unsigned char.
+ */
+uint32_t
+vhd_checksum_batmap(vhd_batmap_t *batmap)
+{
+	int idx, len, plain_char;
+	char *map;
+	uint32_t sum;
+
+	map = batmap->map;
+	sum = 0;
+	len = vhd_sectors_to_bytes(batmap->header.batmap_size);
+
+	/* the version does not change while summing */
+	plain_char = (batmap->header.batmap_version ==
+		      VHD_BATMAP_VERSION(1, 1));
+
+	for (idx = 0; idx < len; idx++)
+		sum += plain_char ? (uint32_t)map[idx]
+				  : (uint32_t)(unsigned char)map[idx];
+
+	return ~sum;
+}
+
+/*
+ * Validate the batmap header: the cookie must be VHD_BATMAP_COOKIE and
+ * the version must not exceed VHD_BATMAP_CURRENT_VERSION.
+ * Returns 0 when valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_batmap_header(vhd_batmap_t *batmap)
+{
+	if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, 8) ||
+	    batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Validate a loaded batmap: the map must be present and its computed
+ * checksum must equal the one stored in the batmap header.
+ * Returns 0 when valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_batmap(vhd_batmap_t *batmap)
+{
+	if (!batmap->map)
+		return -EINVAL;
+
+	if (vhd_checksum_batmap(batmap) != batmap->header.checksum)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Store in *@_off the offset of the batmap header: the BAT's
+ * table_offset plus the sector-padded size of the BAT
+ * (max_bat_size 32-bit entries).  Always returns 0.
+ */
+int
+vhd_batmap_header_offset(vhd_context_t *ctx, off64_t *_off)
+{
+	size_t bat_bytes;
+	off64_t pos;
+
+	*_off = 0;
+
+	bat_bytes = ctx->header.max_bat_size * sizeof(uint32_t);
+
+	pos  = ctx->header.table_offset;
+	pos += vhd_bytes_padded(bat_bytes);
+
+	*_off = pos;
+	return 0;
+}
+
+/*
+ * Check that @code is one of the known parent locator platform codes.
+ * Returns 0 when it is; otherwise logs the code and returns -EINVAL.
+ */
+int
+vhd_validate_platform_code(uint32_t code)
+{
+	if (code == PLAT_CODE_NONE ||
+	    code == PLAT_CODE_WI2R ||
+	    code == PLAT_CODE_WI2K ||
+	    code == PLAT_CODE_W2RU ||
+	    code == PLAT_CODE_W2KU ||
+	    code == PLAT_CODE_MAC  ||
+	    code == PLAT_CODE_MACX)
+		return 0;
+
+	VHDLOG("invalid parent locator code %u\n", code);
+	return -EINVAL;
+}
+
+/*
+ * Return the number of parent locator slots in the vhd header.
+ */
+int
+vhd_parent_locator_count(vhd_context_t *ctx)
+{
+	return (sizeof(ctx->header.loc) / sizeof(ctx->header.loc[0]));
+}
+
+/*
+ * Report the 'hidden' flag of the vhd in *@hidden.
+ *
+ * For dynamic vhds created by tapdisk with creator version 0.1 or 1.1
+ * the flag is read from the footer stored at offset 0; in every other
+ * case the in-memory footer of @ctx is used.
+ *
+ * Returns 0 on success or the error from reading the footer at
+ * offset 0, in which case *@hidden is 0.
+ */
+int
+vhd_hidden(vhd_context_t *ctx, int *hidden)
+{
+	int rc;
+	vhd_footer_t backup;
+
+	*hidden = 0;
+
+	if (!vhd_type_dynamic(ctx) || !vhd_creator_tapdisk(ctx) ||
+	    (ctx->footer.crtr_ver != VHD_VERSION(0, 1) &&
+	     ctx->footer.crtr_ver != VHD_VERSION(1, 1))) {
+		*hidden = ctx->footer.hidden;
+		return 0;
+	}
+
+	rc = vhd_read_footer_at(ctx, &backup, 0);
+	if (rc) {
+		VHDLOG("error reading backup footer of %s: %d\n",
+		       ctx->file, rc);
+		return rc;
+	}
+
+	*hidden = backup.hidden;
+	return 0;
+}
+
+/*
+ * Count the images in the chain that starts at @ctx and store the
+ * result in *@depth.
+ *
+ * Each image visited counts as one; a raw parent adds one more and
+ * ends the walk.  The walk also ends at the first image whose type is
+ * not HD_TYPE_DIFF.  Parents are opened read-only into a local context
+ * that is closed again before returning; @ctx itself is never closed.
+ *
+ * Returns 0 on success.  On error the code is returned and *@depth is
+ * left at 0.
+ */
+int
+vhd_chain_depth(vhd_context_t *ctx, int *depth)
+{
+ char *file;
+ int err, cnt;
+ vhd_context_t vhd, *cur;
+
+ err = 0;
+ cnt = 0;
+ *depth = 0;
+ file = NULL;
+ cur = ctx;
+
+ for (;;) {
+ cnt++;
+
+ if (cur->footer.type != HD_TYPE_DIFF)
+ break;
+
+ if (vhd_parent_raw(cur)) {
+ cnt++;
+ break;
+ }
+
+ err = vhd_parent_locator_get(cur, &file);
+ if (err) {
+ file = NULL;
+ break;
+ }
+
+ /* 'vhd' is reused for every parent: close the previous one
+  * first, but never the caller's context */
+ if (cur != ctx) {
+ vhd_close(cur);
+ cur = NULL;
+ }
+
+ err = vhd_open(&vhd, file, VHD_OPEN_RDONLY);
+ if (err)
+ break;
+
+ cur = &vhd;
+ free(file);
+ file = NULL;
+ }
+
+ /* file is NULL unless the loop ended on a failed vhd_open() */
+ free(file);
+ if (cur && cur != ctx)
+ vhd_close(cur);
+
+ if (!err)
+ *depth = cnt;
+
+ return err;
+}
+
+/*
+ * Return the batmap bit for @block, or 0 when the vhd has no batmap,
+ * the map is not loaded, or @block lies beyond the bits the map holds.
+ */
+int
+vhd_batmap_test(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+	if (!vhd_has_batmap(ctx) || !batmap->map ||
+	    block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+		return 0;
+
+	return test_bit(batmap->map, block);
+}
+
+/*
+ * Set the batmap bit for @block.  Does nothing when the vhd has no
+ * batmap, the map is not loaded, or @block lies beyond the bits the
+ * map holds.
+ */
+void
+vhd_batmap_set(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+	if (!vhd_has_batmap(ctx) || !batmap->map ||
+	    block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+		return;
+
+	set_bit(batmap->map, block);
+}
+
+/*
+ * Clear the batmap bit for @block.  Does nothing when the vhd has no
+ * batmap, the map is not loaded, or @block lies beyond the bits the
+ * map holds.
+ */
+void
+vhd_batmap_clear(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+	if (!vhd_has_batmap(ctx) || !batmap->map ||
+	    block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+		return;
+
+	clear_bit(batmap->map, block);
+}
+
+/*
+ * Test bit @block of the bitmap @map.  Images created by tapdisk with
+ * creator version 0x00000001 use the legacy bit layout
+ * (old_test_bit()); all others use test_bit().
+ */
+int
+vhd_bitmap_test(vhd_context_t *ctx, char *map, uint32_t block)
+{
+	int legacy;
+
+	legacy = vhd_creator_tapdisk(ctx) &&
+		 ctx->footer.crtr_ver == 0x00000001;
+
+	return legacy ? old_test_bit(map, block) : test_bit(map, block);
+}
+
+/*
+ * Set bit @block of the bitmap @map.  Images created by tapdisk with
+ * creator version 0x00000001 use the legacy bit layout
+ * (old_set_bit()); all others use set_bit().
+ */
+void
+vhd_bitmap_set(vhd_context_t *ctx, char *map, uint32_t block)
+{
+	/* ISO C forbids 'return <expression>;' in a function returning
+	 * void, so call the helper and return separately */
+	if (vhd_creator_tapdisk(ctx) &&
+	    ctx->footer.crtr_ver == 0x00000001) {
+		old_set_bit(map, block);
+		return;
+	}
+
+	set_bit(map, block);
+}
+
+/*
+ * Clear bit @block of the bitmap @map.  Images created by tapdisk with
+ * creator version 0x00000001 use the legacy bit layout
+ * (old_clear_bit()); all others use clear_bit().
+ */
+void
+vhd_bitmap_clear(vhd_context_t *ctx, char *map, uint32_t block)
+{
+	/* ISO C forbids 'return <expression>;' in a function returning
+	 * void, so call the helper and return separately */
+	if (vhd_creator_tapdisk(ctx) &&
+	    ctx->footer.crtr_ver == 0x00000001) {
+		old_clear_bit(map, block);
+		return;
+	}
+
+	clear_bit(map, block);
+}
+
+/*
+ * returns absolute offset of the first
+ * byte of the file which is not vhd metadata
+ *
+ * The result, stored in *end, is the largest end offset among the
+ * header, the sector-padded BAT, the batmap header and map (when the
+ * vhd has a batmap) and every used parent locator.  For non-dynamic
+ * vhds *end is 0.  Returns 0, or the error from loading the batmap.
+ */
+int
+vhd_end_of_headers(vhd_context_t *ctx, off64_t *end)
+{
+ int err, i, n;
+ uint32_t bat_bytes;
+ off64_t eom, bat_end;
+ vhd_parent_locator_t *loc;
+
+ *end = 0;
+
+ if (!vhd_type_dynamic(ctx))
+ return 0;
+
+ /* end of the header itself */
+ eom = ctx->footer.data_offset + sizeof(vhd_header_t);
+
+ bat_bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+ bat_end = ctx->header.table_offset + bat_bytes;
+
+ eom = MAX(eom, bat_end);
+
+ if (vhd_has_batmap(ctx)) {
+ off64_t hdr_end, hdr_secs, map_end, map_secs;
+
+ err = vhd_get_batmap(ctx);
+ if (err)
+ return err;
+
+ hdr_secs = secs_round_up_no_zero(sizeof(vhd_batmap_header_t));
+ /* hdr_end first receives the batmap header's start offset */
+ err = vhd_batmap_header_offset(ctx, &hdr_end);
+ if (err)
+ return err;
+
+ hdr_end += vhd_sectors_to_bytes(hdr_secs);
+ eom = MAX(eom, hdr_end);
+
+ map_secs = ctx->batmap.header.batmap_size;
+ map_end = (ctx->batmap.header.batmap_offset +
+ vhd_sectors_to_bytes(map_secs));
+ eom = MAX(eom, map_end);
+ }
+
+ /* parent locators */
+ n = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
+
+ for (i = 0; i < n; i++) {
+ off64_t loc_end;
+
+ loc = &ctx->header.loc[i];
+ if (loc->code == PLAT_CODE_NONE)
+ continue;
+
+ loc_end = loc->data_offset + vhd_parent_locator_size(loc);
+ eom = MAX(eom, loc_end);
+ }
+
+ *end = eom;
+ return 0;
+}
+
+/*
+ * Store in *end the byte offset just past the last data in the vhd.
+ *
+ * For non-dynamic vhds this is the file size minus the footer size.
+ * For dynamic vhds it is the larger of the end of the metadata and the
+ * end of the highest allocated block (its BAT sector plus ctx->spb and
+ * ctx->bm_secs sectors).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int
+vhd_end_of_data(vhd_context_t *ctx, off64_t *end)
+{
+ int i, err;
+ off64_t max;
+ uint64_t blk;
+
+ if (!vhd_type_dynamic(ctx)) {
+ err = vhd_seek(ctx, 0, SEEK_END);
+ if (err)
+ return err;
+
+ max = vhd_position(ctx);
+ if (max == (off64_t)-1)
+ return -errno;
+
+ *end = max - sizeof(vhd_footer_t);
+ return 0;
+ }
+
+ err = vhd_end_of_headers(ctx, &max);
+ if (err)
+ return err;
+
+ err = vhd_get_bat(ctx);
+ if (err)
+ return err;
+
+ /* from here on 'max' is counted in sectors, like the BAT entries */
+ max >>= VHD_SECTOR_SHIFT;
+
+ for (i = 0; i < ctx->bat.entries; i++) {
+ blk = ctx->bat.bat[i];
+
+ if (blk != DD_BLK_UNUSED) {
+ blk += ctx->spb + ctx->bm_secs;
+ max = MAX(blk, max);
+ }
+ }
+
+ *end = vhd_sectors_to_bytes(max);
+ return 0;
+}
+
+/*
+ * Convert a time_t into a VHD timestamp: the number of seconds since
+ * 12:00AM, Jan 1, 2000, where that epoch is obtained through mktime().
+ */
+uint32_t
+vhd_time(time_t time)
+{
+	struct tm epoch;
+
+	memset(&epoch, 0, sizeof(epoch));
+	epoch.tm_mday = 1;
+	epoch.tm_mon  = 0;
+	epoch.tm_year = 100;
+
+	return (uint32_t)(time - mktime(&epoch));
+}
+
+/*
+ * Stringify the VHD timestamp for printing.
+ * As with ctime_r, target must be >=26 bytes.
+ * The newline that ctime_r appends is removed; the length of the
+ * resulting string is returned.
+ */
+size_t
+vhd_time_to_string(uint32_t timestamp, char *target)
+{
+	char *nl;
+	time_t when;
+	struct tm epoch;
+
+	/* VHD uses an epoch of 12:00AM, Jan 1, 2000. */
+	/* Need to adjust this to the expected epoch of 1970. */
+	memset(&epoch, 0, sizeof(epoch));
+	epoch.tm_mday = 1;
+	epoch.tm_mon  = 0;
+	epoch.tm_year = 100;
+
+	when = mktime(&epoch) + (time_t)timestamp;
+	ctime_r(&when, target);
+
+	/* strip the newline that ctime_r appends */
+	nl = strchr(target, '\n');
+	if (nl != NULL)
+		*nl = '\0';
+
+	return (strlen(target));
+}
+
+/*
+ * nabbed from vhd specs.
+ *
+ * Compute the CHS geometry for a disk of @size bytes and return it
+ * packed by GEOM_ENCODE(cylinders, heads, sectors-per-track).
+ *
+ * The sector count is kept in 64 bits until it has been clamped to the
+ * 65535 * 16 * 255 maximum, so that a very large @size cannot wrap
+ * when narrowed to 32 bits (assumes secs_round_up_no_zero() yields a
+ * value as wide as its argument -- TODO confirm).
+ */
+uint32_t
+vhd_chs(uint64_t size)
+{
+	uint64_t total;
+	uint32_t secs, cylinders, heads, spt, cth;
+
+	total = secs_round_up_no_zero(size);
+
+	if (total > 65535 * 16 * 255)
+		total = 65535 * 16 * 255;
+
+	/* safe to narrow: total is now at most 65535 * 16 * 255 */
+	secs = (uint32_t)total;
+
+	if (secs >= 65535 * 16 * 63) {
+		spt   = 255;
+		cth   = secs / spt;
+		heads = 16;
+	} else {
+		spt   = 17;
+		cth   = secs / spt;
+		heads = (cth + 1023) / 1024;
+
+		if (heads < 4)
+			heads = 4;
+
+		if (cth >= (heads * 1024) || heads > 16) {
+			spt   = 31;
+			cth   = secs / spt;
+			heads = 16;
+		}
+
+		if (cth >= heads * 1024) {
+			spt   = 63;
+			cth   = secs / spt;
+			heads = 16;
+		}
+	}
+
+	cylinders = cth / heads;
+
+	return GEOM_ENCODE(cylinders, heads, spt);
+}
+
+/*
+ * Make sure @ctx holds a valid footer: if the in-memory footer does
+ * not validate, it is read from the file.
+ */
+int
+vhd_get_footer(vhd_context_t *ctx)
+{
+	if (vhd_validate_footer(&ctx->footer) != 0)
+		return vhd_read_footer(ctx, &ctx->footer);
+
+	return 0;
+}
+
+/*
+ * Make sure @ctx holds a valid header: if the in-memory header does
+ * not validate, it is read from the file.
+ * Returns -EINVAL for non-dynamic vhds.
+ */
+int
+vhd_get_header(vhd_context_t *ctx)
+{
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	if (vhd_validate_header(&ctx->header) != 0)
+		return vhd_read_header(ctx, &ctx->header);
+
+	return 0;
+}
+
+/*
+ * Make sure @ctx holds a BAT: if none is loaded, any stale state is
+ * dropped and the BAT is read from the file.
+ * Returns -EINVAL for non-dynamic vhds.
+ */
+int
+vhd_get_bat(vhd_context_t *ctx)
+{
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	if (vhd_validate_bat(&ctx->bat) != 0) {
+		vhd_put_bat(ctx);
+		return vhd_read_bat(ctx, &ctx->bat);
+	}
+
+	return 0;
+}
+
+/*
+ * Make sure @ctx holds a valid batmap: if the in-memory one does not
+ * validate, it is dropped and read again from the file.
+ * Returns -EINVAL when the vhd has no batmap.
+ */
+int
+vhd_get_batmap(vhd_context_t *ctx)
+{
+	if (!vhd_has_batmap(ctx))
+		return -EINVAL;
+
+	if (vhd_validate_batmap(&ctx->batmap) != 0) {
+		vhd_put_batmap(ctx);
+		return vhd_read_batmap(ctx, &ctx->batmap);
+	}
+
+	return 0;
+}
+
+/*
+ * Zero the in-memory footer of @ctx.
+ */
+void
+vhd_put_footer(vhd_context_t *ctx)
+{
+	memset(&ctx->footer, 0, sizeof(ctx->footer));
+}
+
+/*
+ * Zero the in-memory header of @ctx.
+ */
+void
+vhd_put_header(vhd_context_t *ctx)
+{
+	memset(&ctx->header, 0, sizeof(ctx->header));
+}
+
+/*
+ * Free the BAT table of a dynamic vhd and zero the BAT state.
+ * Does nothing for non-dynamic vhds.
+ */
+void
+vhd_put_bat(vhd_context_t *ctx)
+{
+	if (vhd_type_dynamic(ctx)) {
+		free(ctx->bat.bat);
+		memset(&ctx->bat, 0, sizeof(ctx->bat));
+	}
+}
+
+/*
+ * Free the batmap of a dynamic vhd that has one and zero the batmap
+ * state.  Does nothing otherwise.
+ */
+void
+vhd_put_batmap(vhd_context_t *ctx)
+{
+	if (!vhd_type_dynamic(ctx) || !vhd_has_batmap(ctx))
+		return;
+
+	free(ctx->batmap.map);
+	memset(&ctx->batmap, 0, sizeof(ctx->batmap));
+}
+
+/*
+ * look for 511 byte footer at end of file
+ *
+ * Seeks to 511 bytes before the end of the file, reads into a zeroed
+ * sector-aligned buffer, then converts and validates the result in
+ * @footer.  Returns 0 when a valid footer was found, otherwise a
+ * negative error code (also logged).
+ */
+int
+vhd_read_short_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+ int err;
+ char *buf;
+ off64_t eof;
+
+ buf = NULL;
+
+ err = vhd_seek(ctx, 0, SEEK_END);
+ if (err)
+ goto out;
+
+ eof = vhd_position(ctx);
+ if (eof == (off64_t)-1) {
+ err = -errno;
+ goto out;
+ }
+
+ err = vhd_seek(ctx, eof - 511, SEEK_SET);
+ if (err)
+ goto out;
+
+ err = posix_memalign((void **)&buf,
+ VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
+ if (err) {
+ buf = NULL;
+ err = -err;
+ goto out;
+ }
+
+ /* bytes the short read does not fill stay zero */
+ memset(buf, 0, sizeof(vhd_footer_t));
+
+ /*
+ * expecting short read here
+ * (the result of vhd_read() is deliberately ignored)
+ */
+ vhd_read(ctx, buf, sizeof(vhd_footer_t));
+
+ memcpy(footer, buf, sizeof(vhd_footer_t));
+
+ vhd_footer_in(footer);
+ err = vhd_validate_footer(footer);
+
+out:
+ if (err)
+ VHDLOG("%s: failed reading short footer: %d\n",
+ ctx->file, err);
+ free(buf);
+ return err;
+}
+
+/*
+ * Read a footer from byte offset @off of the vhd into @footer, convert
+ * it with vhd_footer_in() and validate it.
+ *
+ * Returns 0 on success or a negative error code, which is also logged.
+ */
+int
+vhd_read_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off)
+{
+	int rc;
+	char *sector = NULL;
+
+	rc = vhd_seek(ctx, off, SEEK_SET);
+	if (rc)
+		goto out;
+
+	/* I/O goes through a sector-aligned bounce buffer */
+	rc = posix_memalign((void **)&sector, VHD_SECTOR_SIZE,
+			    sizeof(*footer));
+	if (rc) {
+		sector = NULL;
+		rc = -rc;
+		goto out;
+	}
+
+	rc = vhd_read(ctx, sector, sizeof(*footer));
+	if (rc)
+		goto out;
+
+	memcpy(footer, sector, sizeof(*footer));
+	vhd_footer_in(footer);
+
+	rc = vhd_validate_footer(footer);
+
+out:
+	if (rc)
+		VHDLOG("%s: reading footer at 0x%08"PRIx64" failed: %d\n",
+		       ctx->file, off, rc);
+	free(sector);
+	return rc;
+}
+
+/*
+ * Locate and read the vhd footer into @footer.
+ *
+ * Tried in order, moving on only when the previous attempt returned
+ * -EINVAL: the last 512 bytes of the file, the 511-byte short footer,
+ * and finally the footer at offset 0.  The last step is skipped, and
+ * -EINVAL returned, when the context was opened with VHD_OPEN_STRICT.
+ */
+int
+vhd_read_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+	int rc;
+	off64_t eof;
+
+	rc = vhd_seek(ctx, 0, SEEK_END);
+	if (rc)
+		return rc;
+
+	eof = vhd_position(ctx);
+	if (eof == (off64_t)-1)
+		return -errno;
+
+	rc = vhd_read_footer_at(ctx, footer, eof - 512);
+	if (rc == -EINVAL)
+		rc = vhd_read_short_footer(ctx, footer);
+	if (rc != -EINVAL)
+		return rc;
+
+	if (ctx->oflags & VHD_OPEN_STRICT)
+		return -EINVAL;
+
+	return vhd_read_footer_at(ctx, footer, 0);
+}
+
+/*
+ * Read a header from byte offset @off of a dynamic vhd into @header,
+ * convert it with vhd_header_in() and validate it.
+ *
+ * Returns 0 on success, -EINVAL for non-dynamic vhds, or another
+ * negative error code.  Failures are logged.
+ */
+int
+vhd_read_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off)
+{
+	int rc;
+	char *sector = NULL;
+
+	if (!vhd_type_dynamic(ctx)) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = vhd_seek(ctx, off, SEEK_SET);
+	if (rc)
+		goto out;
+
+	/* I/O goes through a sector-aligned bounce buffer */
+	rc = posix_memalign((void **)&sector, VHD_SECTOR_SIZE,
+			    sizeof(*header));
+	if (rc) {
+		sector = NULL;
+		rc = -rc;
+		goto out;
+	}
+
+	rc = vhd_read(ctx, sector, sizeof(*header));
+	if (rc)
+		goto out;
+
+	memcpy(header, sector, sizeof(*header));
+	vhd_header_in(header);
+
+	rc = vhd_validate_header(header);
+
+out:
+	if (rc)
+		VHDLOG("%s: reading header at 0x%08"PRIx64" failed: %d\n",
+		       ctx->file, off, rc);
+	free(sector);
+	return rc;
+}
+
+/*
+ * Read the header of a dynamic vhd into @header from the offset given
+ * by the footer's data_offset.
+ * Returns -EINVAL (after logging) for non-dynamic vhds, otherwise the
+ * result of vhd_read_header_at().
+ */
+int
+vhd_read_header(vhd_context_t *ctx, vhd_header_t *header)
+{
+	off64_t off;
+
+	if (!vhd_type_dynamic(ctx)) {
+		VHDLOG("%s is not dynamic!\n", ctx->file);
+		return -EINVAL;
+	}
+
+	off = ctx->footer.data_offset;
+	return vhd_read_header_at(ctx, header, off);
+}
+
+/*
+ * Read the block allocation table of a dynamic image into @bat.
+ *
+ * On success @bat owns a freshly allocated, sector-aligned table that has
+ * been passed through vhd_bat_in().  On failure @bat is zeroed, the error
+ * is logged and a negative errno value is returned.
+ */
+int
+vhd_read_bat(vhd_context_t *ctx, vhd_bat_t *bat)
+{
+	int rc;
+	size_t bytes;
+	char *table = NULL;
+
+	if (!vhd_type_dynamic(ctx)) {
+		rc = -EINVAL;
+		goto fail;
+	}
+
+	/* table size on disk: one 32-bit entry per block, sector padded */
+	bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+
+	rc = posix_memalign((void **)&table, VHD_SECTOR_SIZE, bytes);
+	if (rc) {
+		table = NULL;
+		rc = -rc;
+		goto fail;
+	}
+
+	rc = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+	if (rc)
+		goto fail;
+
+	rc = vhd_read(ctx, table, bytes);
+	if (rc)
+		goto fail;
+
+	bat->bat     = (uint32_t *)table;
+	bat->entries = ctx->header.max_bat_size;
+	bat->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+	vhd_bat_in(bat);
+
+	return 0;
+
+fail:
+	free(table);
+	memset(bat, 0, sizeof(vhd_bat_t));
+	VHDLOG("%s: failed to read bat: %d\n", ctx->file, rc);
+	return rc;
+}
+
+/*
+ * Read the on-disk batmap header into batmap->header and convert it with
+ * vhd_batmap_header_in().  On any failure the header is zeroed, the error
+ * is logged and a negative errno value is returned.
+ */
+static int
+vhd_read_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+	int rc;
+	off64_t pos;
+	size_t bytes;
+	char *sector = NULL;
+
+	rc = vhd_batmap_header_offset(ctx, &pos);
+	if (rc)
+		goto fail;
+
+	rc = vhd_seek(ctx, pos, SEEK_SET);
+	if (rc)
+		goto fail;
+
+	bytes = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+	rc = posix_memalign((void **)&sector, VHD_SECTOR_SIZE, bytes);
+	if (rc) {
+		sector = NULL;
+		rc = -rc;
+		goto fail;
+	}
+
+	rc = vhd_read(ctx, sector, bytes);
+	if (rc)
+		goto fail;
+
+	/* only the header itself is kept; the padding is discarded */
+	memcpy(&batmap->header, sector, sizeof(vhd_batmap_header_t));
+	free(sector);
+
+	vhd_batmap_header_in(batmap);
+	return 0;
+
+fail:
+	free(sector);
+	memset(&batmap->header, 0, sizeof(vhd_batmap_header_t));
+	VHDLOG("%s: failed to read batmap header: %d\n", ctx->file, rc);
+	return rc;
+}
+
+/*
+ * Read the batmap bitmap described by batmap->header (batmap_offset,
+ * batmap_size in sectors) into a newly allocated, sector-aligned buffer
+ * stored in batmap->map.  On failure batmap->map is set to NULL, the error
+ * is logged and a negative errno value is returned.
+ */
+static int
+vhd_read_batmap_map(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+	int rc;
+	char *map;
+	size_t bytes;
+
+	bytes = vhd_sectors_to_bytes(batmap->header.batmap_size);
+
+	rc = posix_memalign((void **)&map, VHD_SECTOR_SIZE, bytes);
+	if (rc) {
+		map = NULL;
+		rc = -rc;
+		goto fail;
+	}
+
+	rc = vhd_seek(ctx, batmap->header.batmap_offset, SEEK_SET);
+	if (!rc)
+		rc = vhd_read(ctx, map, bytes);
+	if (rc)
+		goto fail;
+
+	batmap->map = map;
+	return 0;
+
+fail:
+	free(map);
+	batmap->map = NULL;
+	VHDLOG("%s: failed to read batmap: %d\n", ctx->file, rc);
+	return rc;
+}
+
+/*
+ * Read and validate the complete batmap (header plus bitmap) into @batmap.
+ * Returns -EINVAL when vhd_has_batmap() reports that the image has none.
+ * If the final validation fails the bitmap is freed and @batmap is zeroed.
+ */
+int
+vhd_read_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+	int rc;
+
+	if (!vhd_has_batmap(ctx))
+		return -EINVAL;
+
+	memset(batmap, 0, sizeof(vhd_batmap_t));
+
+	/* header first, then the map it describes */
+	rc = vhd_read_batmap_header(ctx, batmap);
+	if (!rc)
+		rc = vhd_validate_batmap_header(batmap);
+	if (!rc)
+		rc = vhd_read_batmap_map(ctx, batmap);
+	if (rc)
+		return rc;
+
+	rc = vhd_validate_batmap(batmap);
+	if (!rc)
+		return 0;
+
+	free(batmap->map);
+	memset(batmap, 0, sizeof(vhd_batmap_t));
+	return rc;
+}
+
+/*
+ * Report whether the image carries a batmap (1) or not (0).
+ *
+ * Only dynamic images created by tapdisk can have one.  Creator version
+ * 0.1 and below never does, 1.2 and above always does; for versions in
+ * between the batmap header itself is probed.
+ */
+int
+vhd_has_batmap(vhd_context_t *ctx)
+{
+	if (!vhd_type_dynamic(ctx) || !vhd_creator_tapdisk(ctx))
+		return 0;
+
+	if (ctx->footer.crtr_ver <= VHD_VERSION(0, 1))
+		return 0;
+	if (ctx->footer.crtr_ver >= VHD_VERSION(1, 2))
+		return 1;
+
+	/*
+	 * VHDs of version 1.1 probably have a batmap, but may not
+	 * if they were updated from version 0.1 via vhd-update.
+	 * Use the cached header if it is already valid, otherwise
+	 * try to load it from disk.
+	 */
+	if (vhd_validate_batmap_header(&ctx->batmap) == 0)
+		return 1;
+
+	if (vhd_read_batmap_header(ctx, &ctx->batmap) != 0)
+		return 0;
+
+	return vhd_validate_batmap_header(&ctx->batmap) == 0;
+}
+
+/*
+ * Determine whether @file is a block device, i.e. has a fixed size.  This
+ * affects whether the file can be truncated and where the footer is
+ * written for VHDs.
+ *
+ * Stores 1 in *is_block for a block device and 0 otherwise.  Returns 0 on
+ * success or -errno if stat() fails (in which case *is_block is untouched).
+ */
+int
+vhd_test_file_fixed(const char *file, int *is_block)
+{
+	struct stat st;
+
+	if (stat(file, &st) == -1)
+		return -errno;
+
+	*is_block = S_ISBLK(st.st_mode) ? 1 : 0;
+	return 0;
+}
+
+/*
+ * Resolve the parent path @parent of the image @ctx to a readable file.
+ *
+ * An absolute @parent that is readable is returned as-is (duplicated).
+ * Otherwise @parent is interpreted relative to the directory containing
+ * ctx->file and, if readable, canonicalised with realpath().
+ *
+ * On success *_location receives a malloc()ed string owned by the caller
+ * and 0 is returned.  On failure *_location is NULL and a negative errno
+ * value is returned.
+ */
+int
+vhd_find_parent(vhd_context_t *ctx, const char *parent, char **_location)
+{
+	int err;
+	char *location, *cpath, *cdir, *path;
+
+	err        = 0;
+	cpath      = NULL;
+	location   = NULL;
+	*_location = NULL;
+
+	if (!parent)
+		return -EINVAL;
+
+	if (parent[0] == '/' && !access(parent, R_OK)) {
+		path = strdup(parent);
+		if (!path)
+			return -ENOMEM;
+		*_location = path;
+		return 0;
+	}
+
+	/* check parent path relative to child's directory */
+	cpath = realpath(ctx->file, NULL);
+	if (!cpath) {
+		err = -errno;
+		goto out;
+	}
+
+	cdir = dirname(cpath);
+	if (asprintf(&location, "%s/%s", cdir, parent) == -1) {
+		/* asprintf() is not guaranteed to set errno */
+		err      = (errno ? -errno : -ENOMEM);
+		location = NULL;
+		goto out;
+	}
+
+	if (access(location, R_OK)) {
+		err = -errno;
+		goto out;
+	}
+
+	path = realpath(location, NULL);
+	if (!path) {
+		err = -errno;
+		goto out;
+	}
+
+	*_location = path;
+	err = 0;
+
+out:
+	/*
+	 * The temporaries are released on every path; the original code
+	 * returned early on success and leaked both of them.
+	 */
+	free(location);
+	free(cpath);
+	return err;
+}
+
+/*
+ * Encode @name as a MACX parent locator: the string "file://<name>"
+ * converted from ASCII to UTF-8, without a terminating NUL.
+ *
+ * On success *out receives a malloc()ed buffer of *outlen bytes owned by
+ * the caller and 0 is returned.  On failure *out is NULL, *outlen is 0 and
+ * a negative errno value is returned.
+ */
+static int
+vhd_macx_encode_location(char *name, char **out, int *outlen)
+{
+	iconv_t cd;
+	int len, err;
+	size_t ibl, obl;
+	char *uri, *urip, *uri_utf8, *uri_utf8p, *ret;
+
+	err     = 0;
+	ret     = NULL;
+	*out    = NULL;
+	*outlen = 0;
+	cd      = (iconv_t)-1;
+	len     = strlen(name) + strlen("file://");
+
+	ibl     = len;
+	obl     = len;
+
+	uri      = urip      = malloc(ibl + 1);
+	uri_utf8 = uri_utf8p = malloc(obl);
+
+	if (!uri || !uri_utf8) {
+		/* release whichever of the two allocations succeeded */
+		err = -ENOMEM;
+		goto out;
+	}
+
+	cd = iconv_open("UTF-8", "ASCII");
+	if (cd == (iconv_t)-1) {
+		err = -errno;
+		goto out;
+	}
+
+	sprintf(uri, "file://%s", name);
+
+	/* ASCII -> UTF-8 is 1:1, so both byte counts must reach zero */
+	if (iconv(cd, &urip, &ibl, &uri_utf8p, &obl) == (size_t)-1 ||
+	    ibl || obl) {
+		err = (errno ? -errno : -EIO);
+		goto out;
+	}
+
+	ret = malloc(len);
+	if (!ret) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(ret, uri_utf8, len);
+	*outlen = len;
+	*out    = ret;
+
+ out:
+	free(uri);
+	free(uri_utf8);
+	if (cd != (iconv_t)-1)
+		iconv_close(cd);
+
+	return err;
+}
+
+/*
+ * Encode @name as a Windows-style (W2KU/W2RU) parent locator: path
+ * separators become backslashes, relative paths are forced to start with
+ * ".\", and the result is converted from ASCII to UTF-16LE without a
+ * terminator.
+ *
+ * On success *out receives a malloc()ed buffer of *outlen bytes owned by
+ * the caller and 0 is returned; otherwise a negative errno value.
+ */
+static int
+vhd_w2u_encode_location(char *name, char **out, int *outlen)
+{
+	int nchars, rc;
+	size_t in_left, out_left;
+	char *path, *src, *wide, *dst, *p, *copy;
+	iconv_t conv = (iconv_t)-1;
+
+	*out    = NULL;
+	*outlen = 0;
+
+	/*
+	 * MICROSOFT_COMPAT
+	 * relative paths must start with ".\"
+	 */
+	if (name[0] == '/')
+		rc = asprintf(&path, "%s", name);
+	else {
+		/* drop an existing "./" prefix before adding ".\" */
+		p  = (strstr(name, "./") == name) ? name + strlen("./") : name;
+		rc = asprintf(&path, ".\\%s", p);
+	}
+
+	if (rc == -1)
+		return -ENOMEM;
+
+	for (p = path; *p != '\0'; p++)
+		if (*p == '/')
+			*p = '\\';
+
+	nchars   = strlen(path);
+	in_left  = nchars;
+	out_left = nchars * 2;
+	src      = path;
+
+	wide = dst = malloc(out_left);
+	if (!wide) {
+		rc = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * MICROSOFT_COMPAT
+	 * little endian unicode here
+	 */
+	conv = iconv_open("UTF-16LE", "ASCII");
+	if (conv == (iconv_t)-1) {
+		rc = -errno;
+		goto done;
+	}
+
+	/* ASCII -> UTF-16 is exactly 1:2, so nothing may be left over */
+	if (iconv(conv, &src, &in_left, &dst, &out_left) == (size_t)-1 ||
+	    in_left || out_left) {
+		rc = (errno ? -errno : -EIO);
+		goto done;
+	}
+
+	copy = malloc(nchars * 2);
+	if (!copy) {
+		rc = -ENOMEM;
+		goto done;
+	}
+
+	memcpy(copy, wide, nchars * 2);
+	*outlen = nchars * 2;
+	*out    = copy;
+	rc      = 0;
+
+ done:
+	free(path);
+	free(wide);
+	if (conv != (iconv_t)-1)
+		iconv_close(conv);
+
+	return rc;
+}
+
+/*
+ * Decode a MACX parent locator: convert @len bytes of UTF-8 at @in to
+ * ASCII in the scratch buffer @out (which must hold at least @len + 1
+ * bytes), require a leading "file://", and return a strdup()ed copy of
+ * the remainder.  Returns NULL on any failure.
+ */
+static char *
+vhd_macx_decode_location(char *in, char *out, int len)
+{
+	iconv_t cd;
+	char *name;
+	size_t ibl, obl, res;
+
+	name = out;
+	ibl  = obl = len;
+
+	cd = iconv_open("ASCII", "UTF-8");
+	if (cd == (iconv_t)-1)
+		return NULL;
+
+	/* close the descriptor before inspecting the result so that the
+	 * failure path does not leak it */
+	res = iconv(cd, &in, &ibl, &out, &obl);
+	iconv_close(cd);
+
+	if (res == (size_t)-1 || ibl)
+		return NULL;
+
+	*out = '\0';
+
+	if (strncmp(name, "file://", strlen("file://")) != 0)
+		return NULL;
+
+	name += strlen("file://");
+
+	return strdup(name);
+}
+
+/*
+ * Decode a Windows-style parent locator: convert @len bytes at @in from
+ * the encoding @utf_type to ASCII in the scratch buffer @out, turn
+ * backslashes into forward slashes, strip a leading "C:"/"c:" drive
+ * prefix, and return a strdup()ed copy.  Returns NULL on any failure.
+ */
+static char *
+vhd_w2u_decode_location(char *in, char *out, int len, char *utf_type)
+{
+	iconv_t cd;
+	char *name, *tmp;
+	size_t ibl, obl, res;
+
+	tmp = name = out;
+	ibl = obl  = len;
+
+	cd = iconv_open("ASCII", utf_type);
+	if (cd == (iconv_t)-1)
+		return NULL;
+
+	/* close the descriptor before inspecting the result so that the
+	 * failure path does not leak it */
+	res = iconv(cd, &in, &ibl, &out, &obl);
+	iconv_close(cd);
+
+	if (res == (size_t)-1 || ibl)
+		return NULL;
+
+	*out = '\0';
+
+	/* TODO: spaces */
+	while (tmp != out) {
+		if (*tmp == '\\')
+			*tmp = '/';
+		tmp++;
+	}
+
+	if (strstr(name, "C:") == name || strstr(name, "c:") == name)
+		name += strlen("c:");
+
+	return strdup(name);
+}
+
+/*
+ * Decode the parent name stored in header->prt_name into a newly
+ * allocated ASCII string returned through *buf.
+ *
+ * Images created by tapdisk version 0.1 are decoded as UTF_16; all others
+ * as UTF_16BE.  Returns 0 on success, -EINVAL if decoding fails.
+ */
+int
+vhd_header_decode_parent(vhd_context_t *ctx, vhd_header_t *header, char **buf)
+{
+	char scratch[512];
+	char *encoding = UTF_16BE;
+
+	if (vhd_creator_tapdisk(ctx) &&
+	    ctx->footer.crtr_ver == VHD_VERSION(0, 1))
+		encoding = UTF_16;
+
+	*buf = vhd_w2u_decode_location(header->prt_name, scratch, 512, encoding);
+	if (*buf == NULL)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Read and decode the parent locator @loc of a differencing image.
+ *
+ * Only MACX, W2KU and W2RU locators are supported.  On success *parent
+ * receives a malloc()ed path string owned by the caller and 0 is
+ * returned.  On failure *parent is NULL, the error and the locator fields
+ * are logged, and a negative errno value is returned.
+ */
+int
+vhd_parent_locator_read(vhd_context_t *ctx,
+			vhd_parent_locator_t *loc, char **parent)
+{
+	int rc, bytes;
+	char *encoded = NULL, *scratch = NULL, *decoded = NULL;
+
+	*parent = NULL;
+
+	if (ctx->footer.type != HD_TYPE_DIFF) {
+		rc = -EINVAL;
+		goto done;
+	}
+
+	if (loc->code != PLAT_CODE_MACX &&
+	    loc->code != PLAT_CODE_W2KU &&
+	    loc->code != PLAT_CODE_W2RU) {
+		rc = -EINVAL;
+		goto done;
+	}
+
+	rc = vhd_seek(ctx, loc->data_offset, SEEK_SET);
+	if (rc)
+		goto done;
+
+	bytes = vhd_parent_locator_size(loc);
+	if (bytes <= 0) {
+		rc = -EINVAL;
+		goto done;
+	}
+
+	rc = posix_memalign((void **)&encoded, VHD_SECTOR_SIZE, bytes);
+	if (rc) {
+		encoded = NULL;
+		rc = -rc;
+		goto done;
+	}
+
+	rc = vhd_read(ctx, encoded, bytes);
+	if (rc)
+		goto done;
+
+	/* scratch space for the decoders, plus room for a terminator */
+	scratch = malloc(loc->data_len + 1);
+	if (!scratch) {
+		rc = -ENOMEM;
+		goto done;
+	}
+
+	if (loc->code == PLAT_CODE_MACX)
+		decoded = vhd_macx_decode_location(encoded, scratch,
+						   loc->data_len);
+	else
+		decoded = vhd_w2u_decode_location(encoded, scratch,
+						  loc->data_len, UTF_16LE);
+
+	if (!decoded) {
+		rc = -EINVAL;
+		goto done;
+	}
+
+	rc      = 0;
+	*parent = decoded;
+
+done:
+	free(encoded);
+	free(scratch);
+
+	if (rc) {
+		VHDLOG("%s: error reading parent locator: %d\n",
+		       ctx->file, rc);
+		VHDLOG("%s: locator: code %u, space 0x%x, len 0x%x, "
+		       "off 0x%"PRIx64"\n", ctx->file, loc->code, loc->data_space,
+		       loc->data_len, loc->data_offset);
+	}
+
+	return rc;
+}
+
+/*
+ * Find the parent of a differencing image by trying each parent locator
+ * in the header in turn.
+ *
+ * The first locator that can be read and whose path resolves via
+ * vhd_find_parent() wins: *parent receives the malloc()ed location and 0
+ * is returned.  Otherwise *parent stays NULL and the status of the last
+ * attempt is returned.
+ */
+int
+vhd_parent_locator_get(vhd_context_t *ctx, char **parent)
+{
+	int idx, count, rc = 0;
+	char *stored, *resolved;
+
+	*parent = NULL;
+
+	if (ctx->footer.type != HD_TYPE_DIFF)
+		return -EINVAL;
+
+	count = vhd_parent_locator_count(ctx);
+	for (idx = 0; idx < count; idx++) {
+		rc = vhd_parent_locator_read(ctx, &ctx->header.loc[idx],
+					     &stored);
+		if (rc)
+			continue;
+
+		rc = vhd_find_parent(ctx, stored, &resolved);
+		if (rc) {
+			VHDLOG("%s: couldn't find parent %s (%d)\n",
+			       ctx->file, stored, rc);
+			free(stored);
+			continue;
+		}
+
+		free(stored);
+		*parent = resolved;
+		return 0;
+	}
+
+	return rc;
+}
+
+/*
+ * Encode the path of @parent (relative to ctx->file) as a parent locator
+ * of platform @code, write it sector-padded at byte offset @off, and
+ * describe it in @loc.
+ *
+ * @max_bytes, when non-zero, is the largest padded size allowed; a longer
+ * encoding fails with -ENAMETOOLONG.  Only differencing images and the
+ * MACX / W2KU / W2RU codes are accepted (-EINVAL otherwise).  @parent must
+ * resolve to a regular file or block device.
+ *
+ * @loc is zeroed on entry and filled in only on success.  Returns 0 or a
+ * negative errno value.
+ */
+int
+vhd_parent_locator_write_at(vhd_context_t *ctx,
+ const char *parent, off64_t off, uint32_t code,
+ size_t max_bytes, vhd_parent_locator_t *loc)
+{
+ struct stat stats;
+ int err, len, size;
+ char *absolute_path, *relative_path, *encoded, *block;
+
+ memset(loc, 0, sizeof(vhd_parent_locator_t));
+
+ if (ctx->footer.type != HD_TYPE_DIFF)
+ return -EINVAL;
+
+ absolute_path = NULL;
+ relative_path = NULL;
+ encoded = NULL;
+ block = NULL;
+ size = 0;
+ len = 0;
+
+ /* reject unsupported platform codes before doing any work */
+ switch (code) {
+ case PLAT_CODE_MACX:
+ case PLAT_CODE_W2KU:
+ case PLAT_CODE_W2RU:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ absolute_path = realpath(parent, NULL);
+ if (!absolute_path) {
+ err = -errno;
+ goto out;
+ }
+
+ err = stat(absolute_path, &stats);
+ if (err) {
+ err = -errno;
+ goto out;
+ }
+
+ if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* the locator stores the parent path relative to this image */
+ relative_path = relative_path_to(ctx->file, absolute_path, &err);
+ if (!relative_path || err) {
+ err = (err ? err : -EINVAL);
+ goto out;
+ }
+
+ switch (code) {
+ case PLAT_CODE_MACX:
+ err = vhd_macx_encode_location(relative_path, &encoded, &len);
+ break;
+ case PLAT_CODE_W2KU:
+ case PLAT_CODE_W2RU:
+ err = vhd_w2u_encode_location(relative_path, &encoded, &len);
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+ if (err)
+ goto out;
+
+ err = vhd_seek(ctx, off, SEEK_SET);
+ if (err)
+ goto out;
+
+ size = vhd_bytes_padded(len);
+
+ if (max_bytes && size > max_bytes) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+
+ err = posix_memalign((void **)&block, VHD_SECTOR_SIZE, size);
+ if (err) {
+ block = NULL;
+ err = -err;
+ goto out;
+ }
+
+ /* zero-fill so the padding after the encoded path is deterministic */
+ memset(block, 0, size);
+ memcpy(block, encoded, len);
+
+ err = vhd_write(ctx, block, size);
+ if (err)
+ goto out;
+
+ err = 0;
+
+out:
+ free(absolute_path);
+ free(relative_path);
+ free(encoded);
+ free(block);
+
+ /* on failure @loc keeps the all-zero contents set at entry */
+ if (!err) {
+ loc->res = 0;
+ loc->code = code;
+ loc->data_len = len;
+ /*
+ * write number of bytes ('size') instead of number of sectors
+ * into loc->data_space to be compatible with MSFT, even though
+ * this goes against the specs
+ */
+ loc->data_space = size;
+ loc->data_offset = off;
+ }
+
+ return err;
+}
+
+/*
+ * Compute the byte offset at which a footer occupying the very end of the
+ * file starts (file size minus sizeof(vhd_footer_t)) and store it in *off.
+ *
+ * Returns 0 on success or a negative errno value, consistent with the
+ * rest of this file.  (The original returned a positive errno on seek
+ * failure and never checked vhd_position() for failure.)
+ */
+static int
+vhd_footer_offset_at_eof(vhd_context_t *ctx, off64_t *off)
+{
+	int err;
+	off64_t end;
+
+	err = vhd_seek(ctx, 0, SEEK_END);
+	if (err)
+		return err;
+
+	end = vhd_position(ctx);
+	if (end == (off64_t)-1)
+		return -errno;
+
+	*off = end - (off64_t)sizeof(vhd_footer_t);
+	return 0;
+}
+
+/*
+ * Read the sector bitmap of allocated block @block of a dynamic image.
+ *
+ * On success *bufp receives a newly allocated, sector-aligned buffer owned
+ * by the caller.  Returns -EINVAL for non-dynamic images or unallocated
+ * blocks, -ERANGE for an out-of-range block index, or another negative
+ * errno value on I/O or allocation failure.
+ */
+int
+vhd_read_bitmap(vhd_context_t *ctx, uint32_t block, char **bufp)
+{
+	int rc;
+	size_t bytes;
+	uint64_t sector;
+	char *bitmap = NULL;
+
+	*bufp = NULL;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	rc = vhd_get_bat(ctx);
+	if (rc)
+		return rc;
+
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	sector = ctx->bat.bat[block];
+	if (sector == DD_BLK_UNUSED)
+		return -EINVAL;
+
+	/* one bit per sector of the block, padded to whole sectors */
+	bytes = vhd_bytes_padded(ctx->spb >> 3);
+
+	rc = vhd_seek(ctx, vhd_sectors_to_bytes(sector), SEEK_SET);
+	if (rc)
+		return rc;
+
+	rc = posix_memalign((void **)&bitmap, VHD_SECTOR_SIZE, bytes);
+	if (rc)
+		return -rc;
+
+	rc = vhd_read(ctx, bitmap, bytes);
+	if (rc) {
+		free(bitmap);
+		return rc;
+	}
+
+	*bufp = bitmap;
+	return 0;
+}
+
+/*
+ * Read the data of allocated block @block of a dynamic image.
+ *
+ * On success *bufp receives a newly allocated, sector-aligned buffer of
+ * one full block, owned by the caller.  If the block's data runs past the
+ * footer at the end of the file, only the bytes in front of the footer are
+ * read and the rest of the buffer is zero-filled.
+ *
+ * Returns -EINVAL for non-dynamic images, unallocated blocks, or a block
+ * whose data would start beyond the footer; -ERANGE for an out-of-range
+ * block index; or another negative errno value on I/O/allocation failure.
+ */
+int
+vhd_read_block(vhd_context_t *ctx, uint32_t block, char **bufp)
+{
+	int err;
+	char *buf;
+	size_t size;
+	uint64_t blk;
+	off64_t end, off;
+
+	buf   = NULL;
+	*bufp = NULL;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	err = vhd_get_bat(ctx);
+	if (err)
+		return err;
+
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	blk = ctx->bat.bat[block];
+	if (blk == DD_BLK_UNUSED)
+		return -EINVAL;
+
+	/* block data follows the block's sector bitmap */
+	off  = vhd_sectors_to_bytes(blk + ctx->bm_secs);
+	size = vhd_sectors_to_bytes(ctx->spb);
+
+	err = vhd_footer_offset_at_eof(ctx, &end);
+	if (err)
+		return err;
+
+	/*
+	 * A BAT entry pointing past the footer would make 'end - off'
+	 * negative; converted to size_t that turns the memset() below
+	 * into an out-of-bounds write.  Treat it as a corrupt image.
+	 */
+	if (end < off)
+		return -EINVAL;
+
+	err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+	if (err)
+		return -err; /* buf is not valid here; nothing to free */
+
+	if (end < off + ctx->header.block_size) {
+		/* truncated final block: read what exists, zero the rest */
+		size = end - off;
+		memset(buf + size, 0, ctx->header.block_size - size);
+	}
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto fail;
+
+	err = vhd_read(ctx, buf, size);
+	if (err)
+		goto fail;
+
+	*bufp = buf;
+	return 0;
+
+fail:
+	free(buf);
+	return err;
+}
+
+/*
+ * Write a copy of @footer at byte offset @off.
+ *
+ * The copy gets a freshly computed checksum and must pass
+ * vhd_validate_footer() before anything is written; it is then converted
+ * with vhd_footer_out() and written from a sector-aligned buffer.  The
+ * caller's @footer is not modified.  Returns 0 or a negative errno value;
+ * failures are logged.
+ */
+int
+vhd_write_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off)
+{
+	int rc;
+	vhd_footer_t *copy = NULL;
+
+	rc = posix_memalign((void **)&copy,
+			    VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
+	if (rc) {
+		copy = NULL;
+		rc = -rc;
+		goto done;
+	}
+
+	memcpy(copy, footer, sizeof(vhd_footer_t));
+	copy->checksum = vhd_checksum_footer(copy);
+
+	rc = vhd_validate_footer(copy);
+	if (rc)
+		goto done;
+
+	rc = vhd_seek(ctx, off, SEEK_SET);
+	if (rc)
+		goto done;
+
+	vhd_footer_out(copy);
+	rc = vhd_write(ctx, copy, sizeof(vhd_footer_t));
+
+done:
+	if (rc)
+		VHDLOG("%s: failed writing footer at 0x%08"PRIx64": %d\n",
+		       ctx->file, off, rc);
+	free(copy);
+	return rc;
+}
+
+/*
+ * Write @footer to its primary location and, for dynamic images, also to
+ * the backup copy at offset 0.
+ *
+ * The primary location is the end of the device for block devices and the
+ * end of the image data (vhd_end_of_data()) for regular files.
+ */
+int
+vhd_write_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+	int rc;
+	off64_t pos;
+
+	rc = ctx->is_block ? vhd_footer_offset_at_eof(ctx, &pos)
+			   : vhd_end_of_data(ctx, &pos);
+	if (rc)
+		return rc;
+
+	rc = vhd_write_footer_at(ctx, footer, pos);
+	if (rc || !vhd_type_dynamic(ctx))
+		return rc;
+
+	return vhd_write_footer_at(ctx, footer, 0);
+}
+
+/*
+ * Write a copy of the dynamic-disk @header at byte offset @off.
+ *
+ * The copy gets a freshly computed checksum and must pass
+ * vhd_validate_header() before it is converted with vhd_header_out() and
+ * written.  Only valid for dynamic images (-EINVAL otherwise).  Returns 0
+ * or a negative errno value; failures are logged.
+ */
+int
+vhd_write_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off)
+{
+	int rc;
+	vhd_header_t *copy = NULL;
+
+	if (!vhd_type_dynamic(ctx)) {
+		rc = -EINVAL;
+		goto done;
+	}
+
+	rc = posix_memalign((void **)&copy,
+			    VHD_SECTOR_SIZE, sizeof(vhd_header_t));
+	if (rc) {
+		copy = NULL;
+		rc = -rc;
+		goto done;
+	}
+
+	memcpy(copy, header, sizeof(vhd_header_t));
+	copy->checksum = vhd_checksum_header(copy);
+
+	rc = vhd_validate_header(copy);
+	if (rc)
+		goto done;
+
+	vhd_header_out(copy);
+
+	rc = vhd_seek(ctx, off, SEEK_SET);
+	if (rc)
+		goto done;
+
+	rc = vhd_write(ctx, copy, sizeof(vhd_header_t));
+
+done:
+	if (rc)
+		VHDLOG("%s: failed writing header at 0x%08"PRIx64": %d\n",
+		       ctx->file, off, rc);
+	free(copy);
+	return rc;
+}
+
+/*
+ * Write @header at the location recorded in the footer
+ * (ctx->footer.data_offset).  Returns -EINVAL for non-dynamic images.
+ */
+int
+vhd_write_header(vhd_context_t *ctx, vhd_header_t *header)
+{
+	if (vhd_type_dynamic(ctx))
+		return vhd_write_header_at(ctx, header,
+					   ctx->footer.data_offset);
+
+	return -EINVAL;
+}
+
+/*
+ * Write the block allocation table @bat at ctx->header.table_offset.
+ *
+ * Both the context's own BAT and @bat must pass vhd_validate_bat().  The
+ * table is copied into a sector-aligned scratch buffer and converted with
+ * vhd_bat_out() there, so @bat itself is left untouched.
+ */
+int
+vhd_write_bat(vhd_context_t *ctx, vhd_bat_t *bat)
+{
+	int rc;
+	size_t bytes;
+	vhd_bat_t disk;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	rc = vhd_validate_bat(&ctx->bat);
+	if (!rc)
+		rc = vhd_validate_bat(bat);
+	if (rc)
+		return rc;
+
+	memset(&disk, 0, sizeof(vhd_bat_t));
+	bytes = vhd_bytes_padded(bat->entries * sizeof(uint32_t));
+
+	rc = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+	if (rc)
+		return rc;
+
+	rc = posix_memalign((void **)&disk.bat, VHD_SECTOR_SIZE, bytes);
+	if (rc)
+		return -rc;
+
+	memcpy(disk.bat, bat->bat, bytes);
+	disk.entries = bat->entries;
+	disk.spb     = bat->spb;
+	vhd_bat_out(&disk);
+
+	rc = vhd_write(ctx, disk.bat, bytes);
+	free(disk.bat);
+
+	return rc;
+}
+
+/*
+ * Write @batmap (bitmap first, then its header) to the image.
+ *
+ * A private copy of the header gets a freshly computed checksum and must
+ * pass vhd_validate_batmap(); @batmap itself is not modified.  Both parts
+ * are written from sector-aligned bounce buffers.
+ *
+ * Returns 0 on success or a negative errno value.  (The original always
+ * returned 0, hiding every failure from callers.)
+ */
+int
+vhd_write_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+	int err;
+	off64_t off;
+	vhd_batmap_t b;
+	char *buf, *map;
+	size_t size, map_size;
+
+	buf = NULL;
+	map = NULL;
+
+	if (!vhd_has_batmap(ctx)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	b.header = batmap->header;
+	b.map    = batmap->map;
+
+	b.header.checksum = vhd_checksum_batmap(&b);
+	err = vhd_validate_batmap(&b);
+	if (err)
+		goto out;
+
+	off      = b.header.batmap_offset;
+	map_size = vhd_sectors_to_bytes(b.header.batmap_size);
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto out;
+
+	err = posix_memalign((void **)&map, VHD_SECTOR_SIZE, map_size);
+	if (err) {
+		map = NULL;
+		err = -err;
+		goto out;
+	}
+
+	memcpy(map, b.map, map_size);
+
+	err = vhd_write(ctx, map, map_size);
+	if (err)
+		goto out;
+
+	err = vhd_batmap_header_offset(ctx, &off);
+	if (err)
+		goto out;
+
+	size = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		goto out;
+
+	err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+	if (err) {
+		err = -err;
+		buf = NULL;
+		goto out;
+	}
+
+	/* convert the private header copy and zero-pad it to a sector */
+	vhd_batmap_header_out(&b);
+	memset(buf, 0, size);
+	memcpy(buf, &b.header, sizeof(vhd_batmap_header_t));
+
+	err = vhd_write(ctx, buf, size);
+
+out:
+	if (err)
+		VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err);
+	free(buf);
+	free(map);
+	return err;
+}
+
+/*
+ * Write the sector bitmap @bitmap of allocated block @block.
+ *
+ * @bitmap must be VHD_SECTOR_SIZE aligned and hold ctx->bm_secs sectors.
+ * Returns -EINVAL for non-dynamic images, a misaligned buffer or an
+ * unallocated block; -ERANGE for an out-of-range block index; or another
+ * negative errno value on validation or I/O failure.
+ */
+int
+vhd_write_bitmap(vhd_context_t *ctx, uint32_t block, char *bitmap)
+{
+	int rc;
+	uint64_t sector;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	rc = vhd_validate_bat(&ctx->bat);
+	if (rc)
+		return rc;
+
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	/* the buffer is handed straight to vhd_write(), so it must be aligned */
+	if ((unsigned long)bitmap & (VHD_SECTOR_SIZE - 1))
+		return -EINVAL;
+
+	sector = ctx->bat.bat[block];
+	if (sector == DD_BLK_UNUSED)
+		return -EINVAL;
+
+	rc = vhd_seek(ctx, vhd_sectors_to_bytes(sector), SEEK_SET);
+	if (rc)
+		return rc;
+
+	return vhd_write(ctx, bitmap, vhd_sectors_to_bytes(ctx->bm_secs));
+}
+
+/*
+ * Write the data of allocated block @block from @data.
+ *
+ * @data must be VHD_SECTOR_SIZE aligned and hold one full block
+ * (ctx->spb sectors).  Returns -EINVAL for non-dynamic images, a
+ * misaligned buffer or an unallocated block; -ERANGE for an out-of-range
+ * block index; or another negative errno value on validation or I/O
+ * failure.
+ */
+int
+vhd_write_block(vhd_context_t *ctx, uint32_t block, char *data)
+{
+	int err;
+	off64_t off;
+	size_t size;
+	uint64_t blk;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	err = vhd_validate_bat(&ctx->bat);
+	if (err)
+		return err;
+
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	/*
+	 * Alignment check: test the low bits of the address, as
+	 * vhd_write_bitmap() does.  The original masked with
+	 * ~(VHD_SECTOR_SIZE - 1), which tests the high bits and so
+	 * rejected virtually every real buffer.
+	 */
+	if ((unsigned long)data & (VHD_SECTOR_SIZE - 1))
+		return -EINVAL;
+
+	blk = ctx->bat.bat[block];
+	if (blk == DD_BLK_UNUSED)
+		return -EINVAL;
+
+	/* block data follows the block's sector bitmap */
+	off  = vhd_sectors_to_bytes(blk + ctx->bm_secs);
+	size = vhd_sectors_to_bytes(ctx->spb);
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		return err;
+
+	err = vhd_write(ctx, data, size);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+/*
+ * Duplicate @name into *dup, refusing names of MAX_NAME_LEN characters or
+ * more.  Returns 0 on success, -ENAMETOOLONG or -ENOMEM on failure (in
+ * which case *dup is NULL).
+ */
+static inline int
+namedup(char **dup, const char *name)
+{
+	char *copy;
+
+	*dup = NULL;
+
+	if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	copy = strdup(name);
+	*dup = copy;
+
+	return copy ? 0 : -ENOMEM;
+}
+
+/*
+ * Reposition the image's file offset (lseek64 semantics for @offset and
+ * @whence).  Returns 0 on success or -errno on failure; failures are
+ * logged.
+ */
+int
+vhd_seek(vhd_context_t *ctx, off64_t offset, int whence)
+{
+	int err;
+	off64_t off;
+
+	off = lseek64(ctx->fd, offset, whence);
+	if (off == (off64_t)-1) {
+		/* capture errno first: logging may overwrite it */
+		err = -errno;
+		VHDLOG("%s: seek(0x%08"PRIx64", %d) failed: %d\n",
+		       ctx->file, offset, whence, err);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Return the current file offset of the image, or (off64_t)-1 with errno
+ * set if it cannot be determined.
+ */
+off64_t
+vhd_position(vhd_context_t *ctx)
+{
+	off64_t here;
+
+	here = lseek64(ctx->fd, 0, SEEK_CUR);
+	return here;
+}
+
+/*
+ * Read exactly @size bytes from the current file offset into @buf.
+ *
+ * Returns 0 only if the full amount was read.  A failed read returns
+ * -errno; a short read without an errno returns -EIO.  Failures are
+ * logged.
+ */
+int
+vhd_read(vhd_context_t *ctx, void *buf, size_t size)
+{
+	int saved;
+	ssize_t ret;	/* read() returns ssize_t; matches the %zd below */
+
+	errno = 0;
+
+	ret = read(ctx->fd, buf, size);
+	if (ret >= 0 && (size_t)ret == size)
+		return 0;
+
+	/* capture errno first: logging may overwrite it */
+	saved = errno;
+	VHDLOG("%s: read of %zu returned %zd, errno: %d\n",
+	       ctx->file, size, ret, -saved);
+
+	return (saved ? -saved : -EIO);
+}
+
+/*
+ * Write exactly @size bytes from @buf at the current file offset.
+ *
+ * Returns 0 only if the full amount was written.  A failed write returns
+ * -errno; a short write without an errno returns -EIO.  Failures are
+ * logged.
+ */
+int
+vhd_write(vhd_context_t *ctx, void *buf, size_t size)
+{
+	int saved;
+	ssize_t ret;	/* write() returns ssize_t; matches the %zd below */
+
+	errno = 0;
+
+	ret = write(ctx->fd, buf, size);
+	if (ret >= 0 && (size_t)ret == size)
+		return 0;
+
+	/* capture errno first: logging may overwrite it */
+	saved = errno;
+	VHDLOG("%s: write of %zu returned %zd, errno: %d\n",
+	       ctx->file, size, ret, -saved);
+
+	return (saved ? -saved : -EIO);
+}
+
+/*
+ * Translate virtual sector @sector into the physical sector that holds
+ * it and store the result in *offset.
+ *
+ * For non-dynamic images the mapping is the identity.  For dynamic images
+ * the BAT is consulted: an unallocated block yields DD_BLK_UNUSED,
+ * otherwise the result is the block's start plus its bitmap sectors plus
+ * the sector's index within the block.
+ *
+ * Returns 0 on success, -ERANGE if @sector lies beyond the BAT, or the
+ * error from vhd_get_bat().  (The original returned @sector itself as the
+ * status for non-dynamic images and never set *offset in that case.)
+ */
+int
+vhd_offset(vhd_context_t *ctx, uint32_t sector, uint32_t *offset)
+{
+	int err;
+	uint32_t block;
+
+	if (!vhd_type_dynamic(ctx)) {
+		*offset = sector;
+		return 0;
+	}
+
+	err = vhd_get_bat(ctx);
+	if (err)
+		return err;
+
+	block = sector / ctx->spb;
+	if (block >= ctx->bat.entries)
+		return -ERANGE;
+
+	if (ctx->bat.bat[block] == DD_BLK_UNUSED)
+		*offset = DD_BLK_UNUSED;
+	else
+		*offset = ctx->bat.bat[block] +
+			ctx->bm_secs + (sector % ctx->spb);
+
+	return 0;
+}
+
+/*
+ * Fast-path open: read footer and header with a single read from the
+ * current file position, on the expectation that the header directly
+ * follows the footer copy there.
+ *
+ * If the footer says the header lives elsewhere (data_offset differs from
+ * sizeof(vhd_footer_t)), the header is fetched with vhd_read_header()
+ * instead.  For dynamic images ctx->spb and ctx->bm_secs are derived from
+ * the header.  Returns 0 or a negative errno value.
+ */
+int
+vhd_open_fast(vhd_context_t *ctx)
+{
+	int rc;
+	char *image;
+	size_t bytes = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+
+	rc = posix_memalign((void **)&image, VHD_SECTOR_SIZE, bytes);
+	if (rc) {
+		VHDLOG("failed allocating %s: %d\n", ctx->file, -rc);
+		return -rc;
+	}
+
+	rc = vhd_read(ctx, image, bytes);
+	if (rc) {
+		VHDLOG("failed reading %s: %d\n", ctx->file, rc);
+		goto done;
+	}
+
+	memcpy(&ctx->footer, image, sizeof(vhd_footer_t));
+	vhd_footer_in(&ctx->footer);
+	rc = vhd_validate_footer(&ctx->footer);
+	if (rc || !vhd_type_dynamic(ctx))
+		goto done;
+
+	if (ctx->footer.data_offset == sizeof(vhd_footer_t)) {
+		/* header was already fetched by the combined read above */
+		memcpy(&ctx->header,
+		       image + sizeof(vhd_footer_t),
+		       sizeof(vhd_header_t));
+		vhd_header_in(&ctx->header);
+		rc = vhd_validate_header(&ctx->header);
+	} else
+		rc = vhd_read_header(ctx, &ctx->header);
+
+	if (rc)
+		goto done;
+
+	ctx->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+	ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
+
+done:
+	free(image);
+	return rc;
+}
+
+/*
+ * Open the VHD image @file and initialise @ctx from it.
+ *
+ * @flags is a combination of VHD_OPEN_* values.  VHD_OPEN_STRICT clears
+ * VHD_OPEN_FAST.  With VHD_OPEN_FAST the footer/header are loaded via
+ * vhd_open_fast(); otherwise the footer is located with vhd_read_footer()
+ * and, for dynamic images, the header is read and ctx->spb / ctx->bm_secs
+ * are derived from it.  A disabled image is rejected with -EINVAL unless
+ * VHD_OPEN_IGNORE_DISABLED is given.
+ *
+ * Returns 0 on success.  On failure the descriptor is closed, the name is
+ * freed, @ctx is zeroed and a negative errno value is returned.
+ */
+int
+vhd_open(vhd_context_t *ctx, const char *file, int flags)
+{
+ int err, oflags;
+
+ if (flags & VHD_OPEN_STRICT)
+ vhd_flag_clear(flags, VHD_OPEN_FAST);
+
+ memset(ctx, 0, sizeof(vhd_context_t));
+ /* mark the descriptor invalid so the failure path knows not to close it */
+ ctx->fd = -1;
+ ctx->oflags = flags;
+
+ err = namedup(&ctx->file, file);
+ if (err)
+ return err;
+
+ /* the image is always opened with O_DIRECT */
+ oflags = O_DIRECT | O_LARGEFILE;
+ if (flags & VHD_OPEN_RDONLY)
+ oflags |= O_RDONLY;
+ if (flags & VHD_OPEN_RDWR)
+ oflags |= O_RDWR;
+
+ ctx->fd = open(ctx->file, oflags, 0644);
+ if (ctx->fd == -1) {
+ err = -errno;
+ VHDLOG("failed to open %s: %d\n", ctx->file, err);
+ goto fail;
+ }
+
+ err = vhd_test_file_fixed(ctx->file, &ctx->is_block);
+ if (err)
+ goto fail;
+
+ if (flags & VHD_OPEN_FAST) {
+ err = vhd_open_fast(ctx);
+ if (err)
+ goto fail;
+
+ return 0;
+ }
+
+ err = vhd_read_footer(ctx, &ctx->footer);
+ if (err)
+ goto fail;
+
+ if (!(flags & VHD_OPEN_IGNORE_DISABLED) && vhd_disabled(ctx)) {
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (vhd_type_dynamic(ctx)) {
+ err = vhd_read_header(ctx, &ctx->header);
+ if (err)
+ goto fail;
+
+ ctx->spb = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+ ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
+ }
+
+ return 0;
+
+fail:
+ if (ctx->fd != -1)
+ close(ctx->fd);
+ free(ctx->file);
+ memset(ctx, 0, sizeof(vhd_context_t));
+ return err;
+}
+
+/*
+ * Release everything held by @ctx and zero it.
+ *
+ * The descriptor is closed only when ctx->file is set, so a zeroed
+ * context (whose fd field is 0) can be passed safely.
+ */
+void
+vhd_close(vhd_context_t *ctx)
+{
+	if (ctx->file != NULL)
+		close(ctx->fd);
+
+	free(ctx->file);
+	free(ctx->bat.bat);
+	free(ctx->batmap.map);
+
+	memset(ctx, 0, sizeof(*ctx));
+}
+
+/*
+ * Fill ctx->footer with the values for a newly created image of the given
+ * @type and virtual @size (bytes): cookie, versions, current timestamp,
+ * geometry, creator application "tap" and a freshly generated UUID.  The
+ * data offset is set to all ones.
+ */
+static inline void
+vhd_initialize_footer(vhd_context_t *ctx, int type, uint64_t size)
+{
+	vhd_footer_t *f = &ctx->footer;
+
+	memset(f, 0, sizeof(vhd_footer_t));
+	memcpy(f->cookie, HD_COOKIE, sizeof(f->cookie));
+
+	f->features    = HD_RESERVED;
+	f->ff_version  = HD_FF_VERSION;
+	f->timestamp   = vhd_time(time(NULL));
+	f->crtr_ver    = VHD_CURRENT_VERSION;
+	f->crtr_os     = 0x00000000;
+	f->orig_size   = size;
+	f->curr_size   = size;
+	f->geometry    = vhd_chs(size);
+	f->type        = type;
+	f->saved       = 0;
+	f->data_offset = 0xFFFFFFFFFFFFFFFF;
+
+	strcpy(f->crtr_app, "tap");
+	uuid_generate(f->uuid);
+}
+
+/*
+ * Store the base name of @parent_path in ctx->header.prt_name, converted
+ * from ASCII to big-endian UTF-16 and zero padded.
+ *
+ * Returns 0 on success or a negative errno value (-EINVAL if the path has
+ * an empty base name or the name cannot be converted completely).
+ */
+static int
+vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path)
+{
+	int err;
+	iconv_t cd;
+	size_t ibl, obl;
+	char *pname, *ppath, *dst;
+
+	err   = 0;
+	ppath = NULL;
+
+	/*
+	 * MICROSOFT_COMPAT
+	 * big endian unicode here
+	 */
+	cd = iconv_open(UTF_16BE, "ASCII");
+	if (cd == (iconv_t)-1)
+		/*
+		 * Nothing has been acquired yet.  The original jumped to
+		 * the cleanup label and called iconv_close() on the
+		 * invalid descriptor.
+		 */
+		return -errno;
+
+	/* basename() may modify its argument, so work on a copy */
+	ppath = strdup(parent_path);
+	if (!ppath) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	pname = basename(ppath);
+	if (!strcmp(pname, "")) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	ibl = strlen(pname);
+	obl = sizeof(ctx->header.prt_name);
+	dst = ctx->header.prt_name;
+
+	memset(dst, 0, obl);
+
+	if (iconv(cd, &pname, &ibl, &dst, &obl) == (size_t)-1 || ibl)
+		err = (errno ? -errno : -EINVAL);
+
+out:
+	iconv_close(cd);
+	free(ppath);
+	return err;
+}
+
+/*
+ * Return the size of @name in bytes, obtained by seeking to the end of
+ * the file.  If the file cannot be opened the failure is logged and
+ * -errno is returned; the result of lseek64() is returned unchecked.
+ */
+static off64_t
+get_file_size(const char *name)
+{
+	off64_t size;
+	int fd = open(name, O_LARGEFILE | O_RDONLY);
+
+	if (fd == -1) {
+		VHDLOG("unable to open '%s': %d\n", name, errno);
+		return -errno;
+	}
+
+	size = lseek64(fd, 0, SEEK_END);
+	close(fd);
+
+	return size;
+}
+
+/*
+ * Initialise ctx->header for a newly created dynamic or differencing
+ * image and point ctx->footer.data_offset at it.
+ *
+ * For a plain dynamic image the BAT size is derived from the footer's
+ * current size and nothing else is needed.  For a differencing image the
+ * parent's modification time is recorded and, when @size is 0, the size
+ * is inherited from the parent: from the file size if @raw is set,
+ * otherwise from the parent VHD's footer (whose UUID is also copied).
+ * The footer sizes, geometry and BAT size are then recomputed and the
+ * parent name is stored in the header.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_initialize_header(vhd_context_t *ctx, const char *parent_path,
+ uint64_t size, int raw)
+{
+ int err;
+ struct stat stats;
+ vhd_context_t parent;
+
+ if (!vhd_type_dynamic(ctx))
+ return -EINVAL;
+
+ memset(&ctx->header, 0, sizeof(vhd_header_t));
+ memcpy(ctx->header.cookie, DD_COOKIE, sizeof(ctx->header.cookie));
+ ctx->header.data_offset = (uint64_t)-1;
+ ctx->header.table_offset = VHD_SECTOR_SIZE * 3; /* 1 ftr + 2 hdr */
+ ctx->header.hdr_ver = DD_VERSION;
+ ctx->header.block_size = VHD_BLOCK_SIZE;
+ ctx->header.prt_ts = 0;
+ ctx->header.res1 = 0;
+ /* number of BAT entries: virtual size rounded up to whole blocks */
+ ctx->header.max_bat_size = (ctx->footer.curr_size +
+ VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+
+ /* the header follows the one-sector footer copy at the file start */
+ ctx->footer.data_offset = VHD_SECTOR_SIZE;
+
+ /* everything below concerns the parent, which only diffs have */
+ if (ctx->footer.type == HD_TYPE_DYNAMIC)
+ return 0;
+
+ err = stat(parent_path, &stats);
+ if (err == -1)
+ return -errno;
+
+ if (raw) {
+ ctx->header.prt_ts = vhd_time(stats.st_mtime);
+ /* NOTE(review): get_file_size() can return a negative value on
+ * failure, which is not checked here -- confirm intended */
+ if (!size)
+ size = get_file_size(parent_path);
+ }
+ else {
+ err = vhd_open(&parent, parent_path, VHD_OPEN_RDONLY);
+ if (err)
+ return err;
+
+ ctx->header.prt_ts = vhd_time(stats.st_mtime);
+ uuid_copy(ctx->header.prt_uuid, parent.footer.uuid);
+ if (!size)
+ size = parent.footer.curr_size;
+ vhd_close(&parent);
+ }
+ /* the final size may differ from the one used above; recompute */
+ ctx->footer.orig_size = size;
+ ctx->footer.curr_size = size;
+ ctx->footer.geometry = vhd_chs(size);
+ ctx->header.max_bat_size =
+ (size + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+
+ return vhd_initialize_header_parent_name(ctx, parent_path);
+}
+
+/*
+ * Write the three parent locators (MACX, W2KU, W2RU, in that order) of a
+ * differencing image into header slots 0..2.
+ *
+ * The first locator is placed at the first sector boundary after the
+ * batmap; each following one directly after its predecessor.  Returns 0
+ * on success or the first error encountered.
+ */
+static int
+vhd_write_parent_locators(vhd_context_t *ctx, const char *parent)
+{
+	const uint32_t codes[3] = {
+		PLAT_CODE_MACX, PLAT_CODE_W2KU, PLAT_CODE_W2RU
+	};
+	int idx, rc;
+	off64_t pos;
+
+	if (ctx->footer.type != HD_TYPE_DIFF)
+		return -EINVAL;
+
+	pos = ctx->batmap.header.batmap_offset +
+		vhd_sectors_to_bytes(ctx->batmap.header.batmap_size);
+	if (pos & (VHD_SECTOR_SIZE - 1))
+		pos = vhd_bytes_padded(pos);
+
+	for (idx = 0; idx < 3; idx++) {
+		vhd_parent_locator_t *loc = ctx->header.loc + idx;
+
+		rc = vhd_parent_locator_write_at(ctx, parent, pos,
+						 codes[idx], 0, loc);
+		if (rc)
+			return rc;
+
+		pos += vhd_parent_locator_size(loc);
+	}
+
+	return 0;
+}
+
+/*
+ * Re-point the image @child at a new parent, @parent_path.
+ *
+ * The parent must resolve to a regular file or block device.  The header's
+ * parent UUID is cleared when @raw is set, otherwise copied from the
+ * parent VHD's footer.  The parent name and timestamp in the header are
+ * updated, every existing MACX/W2KU/W2RU parent locator is rewritten in
+ * place (limited to the space it already occupies), and finally the
+ * header is written back.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_change_parent(vhd_context_t *child, char *parent_path, int raw)
+{
+ int i, err;
+ char *ppath;
+ struct stat stats;
+ vhd_context_t parent;
+
+ ppath = realpath(parent_path, NULL);
+ if (!ppath) {
+ VHDLOG("error resolving parent path %s for %s: %d\n",
+ parent_path, child->file, errno);
+ return -errno;
+ }
+
+ err = stat(ppath, &stats);
+ if (err == -1) {
+ err = -errno;
+ goto out;
+ }
+
+ if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (raw) {
+ uuid_clear(child->header.prt_uuid);
+ } else {
+ err = vhd_open(&parent, ppath, VHD_OPEN_RDONLY);
+ if (err) {
+ VHDLOG("error opening parent %s for %s: %d\n",
+ ppath, child->file, err);
+ goto out;
+ }
+ uuid_copy(child->header.prt_uuid, parent.footer.uuid);
+ vhd_close(&parent);
+ }
+
+ /* NOTE(review): the return value is ignored here -- confirm that a
+ * failure to encode the parent name is meant to be non-fatal */
+ vhd_initialize_header_parent_name(child, ppath);
+ child->header.prt_ts = vhd_time(stats.st_mtime);
+
+ for (i = 0; i < vhd_parent_locator_count(child); i++) {
+ vhd_parent_locator_t *loc = child->header.loc + i;
+ /* the rewritten locator must fit in the space already reserved */
+ size_t max = vhd_parent_locator_size(loc);
+
+ switch (loc->code) {
+ case PLAT_CODE_MACX:
+ case PLAT_CODE_W2KU:
+ case PLAT_CODE_W2RU:
+ break;
+ default:
+ continue;
+ }
+
+ err = vhd_parent_locator_write_at(child, ppath,
+ loc->data_offset,
+ loc->code, max, loc);
+ if (err) {
+ VHDLOG("error writing parent locator %d for %s: %d\n",
+ i, child->file, err);
+ goto out;
+ }
+ }
+
+ /* fault-injection hook between the locator and header writes */
+ TEST_FAIL_AT(FAIL_REPARENT_LOCATOR);
+
+ err = vhd_write_header(child, &child->header);
+ if (err) {
+ VHDLOG("error writing header for %s: %d\n", child->file, err);
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ free(ppath);
+ return err;
+}
+
+/*
+ * Create an empty batmap for a dynamic image: initialise the batmap
+ * header in @ctx, allocate an all-zero bitmap with one bit per BAT entry
+ * (rounded up to whole sectors) and write both to disk.
+ *
+ * Returns the result of vhd_write_batmap(), or a negative errno value if
+ * setup fails.
+ */
+static int
+vhd_create_batmap(vhd_context_t *ctx)
+{
+	off64_t hdr_off;
+	int rc, bytes;
+	vhd_batmap_header_t *hdr;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	/* one bit per BAT entry, rounded up to whole bytes */
+	bytes = (ctx->header.max_bat_size + 7) >> 3;
+	hdr   = &ctx->batmap.header;
+
+	memset(hdr, 0, sizeof(vhd_batmap_header_t));
+	memcpy(hdr->cookie, VHD_BATMAP_COOKIE, sizeof(hdr->cookie));
+
+	rc = vhd_batmap_header_offset(ctx, &hdr_off);
+	if (rc)
+		return rc;
+
+	/* the bitmap starts right after the sector-padded batmap header */
+	hdr->batmap_offset  = hdr_off +
+		vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+	hdr->batmap_size    = secs_round_up_no_zero(bytes);
+	hdr->batmap_version = VHD_BATMAP_CURRENT_VERSION;
+
+	bytes = vhd_sectors_to_bytes(hdr->batmap_size);
+
+	rc = posix_memalign((void **)&ctx->batmap.map,
+			    VHD_SECTOR_SIZE, bytes);
+	if (rc) {
+		ctx->batmap.map = NULL;
+		return -rc;
+	}
+
+	memset(ctx->batmap.map, 0, bytes);
+
+	return vhd_write_batmap(ctx, &ctx->batmap);
+}
+
+/*
+ * Create the block allocation table of a new dynamic image: allocate a
+ * sector-padded table in ctx->bat with every entry set to DD_BLK_UNUSED
+ * and write it at ctx->header.table_offset.
+ *
+ * Returns 0 on success or a negative errno value.  (The original returned
+ * a positive errno when the allocation failed.)
+ */
+static int
+vhd_create_bat(vhd_context_t *ctx)
+{
+	int err;
+	uint32_t i;
+	size_t size;
+
+	if (!vhd_type_dynamic(ctx))
+		return -EINVAL;
+
+	size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+	err  = posix_memalign((void **)&ctx->bat.bat, VHD_SECTOR_SIZE, size);
+	if (err) {
+		ctx->bat.bat = NULL;
+		/* posix_memalign() returns a positive errno value */
+		return -err;
+	}
+
+	/* zero first so the padding beyond the last entry is deterministic */
+	memset(ctx->bat.bat, 0, size);
+	for (i = 0; i < ctx->header.max_bat_size; i++)
+		ctx->bat.bat[i] = DD_BLK_UNUSED;
+
+	err = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+	if (err)
+		return err;
+
+	ctx->bat.entries = ctx->header.max_bat_size;
+	ctx->bat.spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+
+	return vhd_write_bat(ctx, &ctx->bat);
+}
+
+/*
+ * Fill the data area of a new fixed-size image by writing
+ * curr_size >> VHD_BLOCK_SHIFT blocks from the start of the file.  The
+ * source is a read-only anonymous mapping of VHD_BLOCK_SIZE bytes.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_initialize_fixed_disk(vhd_context_t *ctx)
+{
+	char *zeroes;
+	int blk, rc;
+
+	if (ctx->footer.type != HD_TYPE_FIXED)
+		return -EINVAL;
+
+	rc = vhd_seek(ctx, 0, SEEK_SET);
+	if (rc)
+		return rc;
+
+	zeroes = mmap(0, VHD_BLOCK_SIZE, PROT_READ,
+		      MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (zeroes == MAP_FAILED)
+		return -errno;
+
+	rc = 0;
+	for (blk = 0; blk < ctx->footer.curr_size >> VHD_BLOCK_SHIFT; blk++) {
+		rc = vhd_write(ctx, zeroes, VHD_BLOCK_SIZE);
+		if (rc)
+			break;
+	}
+
+	munmap(zeroes, VHD_BLOCK_SIZE);
+	return rc;
+}
+
+/*
+ * Report the physical size of the image in @size: the offset returned by
+ * vhd_end_of_data() plus the size of one footer.
+ *
+ * Returns 0 on success or the error from vhd_end_of_data().
+ */
+int
+vhd_get_phys_size(vhd_context_t *ctx, off64_t *size)
+{
+	int rc = vhd_end_of_data(ctx, size);
+
+	if (rc == 0)
+		*size += sizeof(vhd_footer_t);
+
+	return rc;
+}
+
+/*
+ * Set the physical size of the image to @size by writing the footer so
+ * that it ends at that offset.  Sizes below the current physical size are
+ * rejected.
+ *
+ * Returns 0 on success, -EINVAL if @size is too small, or a negative
+ * error code from the helpers.
+ */
+int
+vhd_set_phys_size(vhd_context_t *ctx, off64_t size)
+{
+	int rc;
+	off64_t min_size;
+
+	rc = vhd_get_phys_size(ctx, &min_size);
+	if (rc)
+		return rc;
+
+	if (size >= min_size)
+		return vhd_write_footer_at(ctx, &ctx->footer,
+					   size - sizeof(vhd_footer_t));
+
+	/* shrinking below the current data would result in data loss */
+	VHDLOG("ERROR: new size (%"PRIu64") < phys size (%"PRIu64")\n",
+	       size, min_size);
+	return -EINVAL;
+}
+
+/*
+ * Create a new VHD image file @name of type @type.
+ *
+ * @name:   path of the file to create (truncated if it already exists)
+ * @parent: parent image path; must be non-NULL for HD_TYPE_DIFF
+ * @bytes:  requested virtual size, rounded up below to a whole number of
+ *          VHD_BLOCK_SIZE blocks
+ * @type:   HD_TYPE_FIXED, HD_TYPE_DYNAMIC or HD_TYPE_DIFF
+ * @flags:  creation flags; only VHD_FLAG_CREAT_PARENT_RAW is tested here
+ *
+ * Returns 0 on success or a negative error code.  On failure the file is
+ * unlinked unless ctx.is_block was set by vhd_test_file_fixed().
+ */
+static int
+__vhd_create(const char *name, const char *parent, uint64_t bytes, int type,
+ vhd_flag_creat_t flags)
+{
+ int err;
+ off64_t off;
+ vhd_context_t ctx;
+ vhd_footer_t *footer;
+ vhd_header_t *header;
+ uint64_t size, blks;
+
+ switch (type) {
+ case HD_TYPE_DIFF:
+ if (!parent)
+ return -EINVAL;
+ /* a differencing image with a parent falls through to the valid types */
+ case HD_TYPE_FIXED:
+ case HD_TYPE_DYNAMIC:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* reject names of VHD_MAX_NAME_LEN - 1 characters or more */
+ if (strnlen(name, VHD_MAX_NAME_LEN - 1) == VHD_MAX_NAME_LEN - 1)
+ return -ENAMETOOLONG;
+
+ memset(&ctx, 0, sizeof(vhd_context_t));
+ footer = &ctx.footer;
+ header = &ctx.header;
+ /* round the requested size up to a multiple of VHD_BLOCK_SIZE */
+ blks = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+ size = blks << VHD_BLOCK_SHIFT;
+
+ ctx.fd = open(name, O_WRONLY | O_CREAT |
+ O_TRUNC | O_LARGEFILE | O_DIRECT, 0644);
+ if (ctx.fd == -1)
+ return -errno;
+
+ ctx.file = strdup(name);
+ if (!ctx.file) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* NOTE(review): presumably detects whether @name is a block device -- confirm */
+ err = vhd_test_file_fixed(ctx.file, &ctx.is_block);
+ if (err)
+ goto out;
+
+ vhd_initialize_footer(&ctx, type, size);
+
+ if (type == HD_TYPE_FIXED) {
+ err = vhd_initialize_fixed_disk(&ctx);
+ if (err)
+ goto out;
+ } else {
+ int raw = vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW);
+ err = vhd_initialize_header(&ctx, parent, size, raw);
+ if (err)
+ goto out;
+
+ /* copy of the footer at offset 0 of the file */
+ err = vhd_write_footer_at(&ctx, &ctx.footer, 0);
+ if (err)
+ goto out;
+
+ /* the header goes one sector in, right after that footer copy */
+ err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+ if (err)
+ goto out;
+
+ err = vhd_create_batmap(&ctx);
+ if (err)
+ goto out;
+
+ err = vhd_create_bat(&ctx);
+ if (err)
+ goto out;
+
+ if (type == HD_TYPE_DIFF) {
+ err = vhd_write_parent_locators(&ctx, parent);
+ if (err)
+ goto out;
+ }
+
+ /* write header again since it may have changed */
+ err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+ if (err)
+ goto out;
+ }
+
+ /* the final footer goes at the current end of the file */
+ err = vhd_seek(&ctx, 0, SEEK_END);
+ if (err)
+ goto out;
+
+ off = vhd_position(&ctx);
+ if (off == (off64_t)-1) {
+ err = -errno;
+ goto out;
+ }
+
+ /* when is_block is set the footer must end at, not start at, that offset */
+ if (ctx.is_block)
+ off -= sizeof(vhd_footer_t);
+
+ err = vhd_write_footer_at(&ctx, &ctx.footer, off);
+ if (err)
+ goto out;
+
+ err = 0;
+
+out:
+ vhd_close(&ctx);
+ /* do not leave a partially written image behind */
+ if (err && !ctx.is_block)
+ unlink(name);
+ return err;
+}
+
+/*
+ * Create a VHD image @name of @bytes bytes and the given @type without a
+ * parent.  Thin wrapper around __vhd_create(); returns its result.
+ */
+int
+vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t flags)
+{
+	const char *no_parent = NULL;
+
+	return __vhd_create(name, no_parent, bytes, type, flags);
+}
+
+/*
+ * Create a differencing (HD_TYPE_DIFF) image @name on top of @parent.
+ * Thin wrapper around __vhd_create(); returns its result.
+ */
+int
+vhd_snapshot(const char *name, uint64_t bytes, const char *parent,
+	     vhd_flag_creat_t flags)
+{
+	const int type = HD_TYPE_DIFF;
+
+	return __vhd_create(name, parent, bytes, type, flags);
+}
+
+/*
+ * Read @secs sectors starting at sector @sec into @buf by seeking to the
+ * matching byte offset of the file and reading from there.
+ *
+ * Returns 0 on success or the error from vhd_seek() / vhd_read().
+ */
+static int
+__vhd_io_fixed_read(vhd_context_t *ctx,
+		    char *buf, uint64_t sec, uint32_t secs)
+{
+	int rc = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
+
+	if (rc)
+		return rc;
+
+	return vhd_read(ctx, buf, vhd_sectors_to_bytes(secs));
+}
+
+/*
+ * Copy up to @secs sectors from @src to @dst, one sector at a time.
+ *
+ * A sector is copied only if its bit in @map (at @map_off + n) is still
+ * clear and, when @ctx is non-NULL, its bit in @bitmap (at @bitmap_off + n)
+ * is set.  Every copied sector gets its @map bit set.  With a NULL @ctx the
+ * @bitmap is not consulted at all.
+ */
+static void
+__vhd_io_dynamic_copy_data(vhd_context_t *ctx,
+			   char *map, int map_off,
+			   char *bitmap, int bitmap_off,
+			   char *dst, char *src, int secs)
+{
+	int n;
+
+	for (n = 0; n < secs;
+	     n++, src += VHD_SECTOR_SIZE, dst += VHD_SECTOR_SIZE) {
+		/* this sector has already been filled in */
+		if (test_bit(map, map_off + n))
+			continue;
+
+		/* this image holds no data for the sector */
+		if (ctx && !vhd_bitmap_test(ctx, bitmap, bitmap_off + n))
+			continue;
+
+		memcpy(dst, src, VHD_SECTOR_SIZE);
+		set_bit(map, map_off + n);
+	}
+}
+
+/*
+ * Read @secs sectors starting at @sector from one image of a chain into
+ * @buf, walking the request block by block.
+ *
+ * Only sectors whose bit in @map is still clear and which are present in
+ * this image's block bitmap are copied (see __vhd_io_dynamic_copy_data());
+ * blocks whose BAT entry is DD_BLK_UNUSED are skipped.  ctx->bat must
+ * already be loaded.
+ *
+ * Returns 0 on success or the error from vhd_read_bitmap() /
+ * vhd_read_block().
+ */
+static int
+__vhd_io_dynamic_read_link(vhd_context_t *ctx, char *map,
+			   char *buf, uint64_t sector, uint32_t secs)
+{
+	off64_t off;
+	uint32_t blk, sec;
+	int err, cnt, map_off;
+	char *bitmap, *data, *src;
+
+	map_off = 0;
+
+	do {
+		blk = sector / ctx->spb;
+		sec = sector % ctx->spb;
+		off = ctx->bat.bat[blk];
+		data = NULL;
+		bitmap = NULL;
+
+		/*
+		 * Never advance past the end of the current block.  This
+		 * also applies to unallocated blocks: using the full block
+		 * size there would, for a request starting mid-block, skip
+		 * over sectors that belong to the next (possibly allocated)
+		 * block.
+		 */
+		cnt = MIN(secs, ctx->spb - sec);
+
+		if (off == DD_BLK_UNUSED)
+			goto next;
+
+		err = vhd_read_bitmap(ctx, blk, &bitmap);
+		if (err)
+			return err;
+
+		err = vhd_read_block(ctx, blk, &data);
+		if (err) {
+			free(bitmap);
+			return err;
+		}
+
+		src = data + vhd_sectors_to_bytes(sec);
+
+		__vhd_io_dynamic_copy_data(ctx,
+					   map, map_off,
+					   bitmap, sec,
+					   buf, src, cnt);
+
+	next:
+		/* both are NULL when the block was skipped */
+		free(data);
+		free(bitmap);
+
+		secs -= cnt;
+		sector += cnt;
+		map_off += cnt;
+		buf += vhd_sectors_to_bytes(cnt);
+
+	} while (secs);
+
+	return 0;
+}
+
+/*
+ * Read @secs sectors starting at sector @sec from the raw file @filename
+ * and copy into @buf every sector whose bit in @map is still clear.
+ *
+ * The file is opened with O_DIRECT, so the data is first read into a
+ * sector-aligned bounce buffer.  A short read is treated as an error.
+ *
+ * Returns 0 on success or a negative errno value (-EIO for a short read
+ * that did not set errno).
+ */
+static int
+__raw_read_link(char *filename,
+		char *map, char *buf, uint64_t sec, uint32_t secs)
+{
+	int fd, err, saved;
+	off64_t off;
+	uint64_t size;
+	ssize_t ret;
+	char *data;
+
+	fd = open(filename, O_RDONLY | O_DIRECT | O_LARGEFILE);
+	if (fd == -1) {
+		/* capture errno before logging can overwrite it */
+		err = -errno;
+		VHDLOG("%s: failed to open: %d\n", filename, err);
+		return err;
+	}
+
+	off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+	if (off == (off64_t)-1) {
+		err = -errno;
+		VHDLOG("%s: seek(0x%08"PRIx64") failed: %d\n",
+		       filename, vhd_sectors_to_bytes(sec), err);
+		goto close;
+	}
+
+	size = vhd_sectors_to_bytes(secs);
+	err = posix_memalign((void **)&data, VHD_SECTOR_SIZE, size);
+	if (err) {
+		/* posix_memalign() returns a positive errno value */
+		err = -err;
+		goto close;
+	}
+
+	/* so that a short read without an error can be told apart below */
+	errno = 0;
+	ret = read(fd, data, size);
+	if (ret < 0 || (uint64_t)ret != size) {
+		saved = errno;
+		VHDLOG("%s: reading of %"PRIu64" returned %d, errno: %d\n",
+		       filename, size, (int)ret, -saved);
+		free(data);
+		err = saved ? -saved : -EIO;
+		goto close;
+	}
+
+	/* no context: every sector not yet marked in @map is taken */
+	__vhd_io_dynamic_copy_data(NULL, map, 0, NULL, 0, buf, data, secs);
+	free(data);
+	err = 0;
+
+close:
+	close(fd);
+	return err;
+}
+
+/*
+ * Read @secs sectors starting at sector @sec into @buf, resolving each
+ * sector through the chain of parent images.
+ *
+ * @buf is zeroed first.  Starting with @ctx, each image in turn supplies
+ * the sectors it holds that have not been supplied yet (tracked in a
+ * local bitmap); the walk stops once every sector is accounted for, once
+ * an image that is not HD_TYPE_DIFF has been consulted, or after a raw
+ * parent has been read.  Sectors no image supplies stay zero.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+__vhd_io_dynamic_read(vhd_context_t *ctx,
+ char *buf, uint64_t sec, uint32_t secs)
+{
+ int err;
+ uint32_t i, done;
+ char *map, *next;
+ vhd_context_t parent, *vhd;
+
+ err = vhd_get_bat(ctx);
+ if (err)
+ return err;
+
+ vhd = ctx;
+ next = NULL;
+ /* bitmap of sectors already copied into @buf */
+ map = calloc(1, secs << (VHD_SECTOR_SHIFT - 3));
+ if (!map)
+ return -ENOMEM;
+
+ memset(buf, 0, vhd_sectors_to_bytes(secs));
+
+ for (;;) {
+ err = __vhd_io_dynamic_read_link(vhd, map, buf, sec, secs);
+ if (err)
+ goto close;
+
+ /* count how many of the requested sectors have been filled so far */
+ for (done = 0, i = 0; i < secs; i++)
+ if (test_bit(map, i))
+ done++;
+
+ if (done == secs) {
+ err = 0;
+ goto close;
+ }
+
+ if (vhd->footer.type == HD_TYPE_DIFF) {
+ err = vhd_parent_locator_get(vhd, &next);
+ if (err)
+ goto close;
+ /* a raw parent ends the chain: read it directly and finish */
+ if (vhd_parent_raw(vhd)) {
+ err = __raw_read_link(next, map, buf, sec,
+ secs);
+ goto close;
+ }
+ } else {
+ /* no parent to consult: the remaining sectors stay zero */
+ err = 0;
+ goto close;
+ }
+
+ /* @ctx belongs to the caller; only parents opened here are closed */
+ if (vhd != ctx)
+ vhd_close(vhd);
+ vhd = &parent;
+
+ err = vhd_open(vhd, next, VHD_OPEN_RDONLY);
+ if (err)
+ goto out;
+
+ err = vhd_get_bat(vhd);
+ if (err)
+ goto close;
+
+ free(next);
+ next = NULL;
+ }
+
+close:
+ if (vhd != ctx)
+ vhd_close(vhd);
+out:
+ /* reached directly when vhd_open() failed, so there is nothing to close */
+ free(map);
+ free(next);
+ return err;
+}
+
+/*
+ * Read @secs sectors starting at sector @sec from the image into @buf.
+ *
+ * Returns -ERANGE if the request extends past footer.curr_size, otherwise
+ * the result of the fixed or dynamic read path.
+ */
+int
+vhd_io_read(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
+{
+	if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
+		return -ERANGE;
+
+	if (vhd_type_dynamic(ctx))
+		return __vhd_io_dynamic_read(ctx, buf, sec, secs);
+
+	return __vhd_io_fixed_read(ctx, buf, sec, secs);
+}
+
+/*
+ * Write @secs sectors from @buf starting at sector @sec by seeking to the
+ * matching byte offset of the file and writing there.
+ *
+ * Returns 0 on success or the error from vhd_seek() / vhd_write().
+ */
+static int
+__vhd_io_fixed_write(vhd_context_t *ctx,
+		     char *buf, uint64_t sec, uint32_t secs)
+{
+	int rc = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
+
+	if (rc)
+		return rc;
+
+	return vhd_write(ctx, buf, vhd_sectors_to_bytes(secs));
+}
+
+/*
+ * Allocate backing store for BAT entry @block at the current end of data.
+ *
+ * Writes zeros for the block bitmap and the block data (plus any padding
+ * needed so that the data region begins on a page boundary), records the
+ * new sector offset in ctx->bat.bat[@block] and writes the BAT out.
+ * If the BAT cannot be written the in-memory entry is reset to
+ * DD_BLK_UNUSED so that memory and disk do not disagree.
+ *
+ * The caller is expected to pass a block that is currently unallocated.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+__vhd_io_allocate_block(vhd_context_t *ctx, uint32_t block)
+{
+	char *buf;
+	size_t size;
+	off64_t off, max;
+	int err, gap, spp;
+
+	/* sectors per page */
+	spp = getpagesize() >> VHD_SECTOR_SHIFT;
+
+	err = vhd_end_of_data(ctx, &max);
+	if (err)
+		return err;
+
+	gap = 0;
+	off = max;			/* byte offset where writing starts */
+	max >>= VHD_SECTOR_SHIFT;	/* sector offset of the new block */
+
+	/* data region of segment should begin on page boundary */
+	if ((max + ctx->bm_secs) % spp) {
+		gap = (spp - ((max + ctx->bm_secs) % spp));
+		max += gap;
+	}
+
+	err = vhd_seek(ctx, off, SEEK_SET);
+	if (err)
+		return err;
+
+	/* padding + bitmap + data, all zero-filled */
+	size = vhd_sectors_to_bytes(ctx->spb + ctx->bm_secs + gap);
+	buf = mmap(0, size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (buf == MAP_FAILED)
+		return -errno;
+
+	err = vhd_write(ctx, buf, size);
+	if (err)
+		goto out;
+
+	ctx->bat.bat[block] = max;
+	err = vhd_write_bat(ctx, &ctx->bat);
+	if (err)
+		/* keep the in-memory BAT in step with what is on disk */
+		ctx->bat.bat[block] = DD_BLK_UNUSED;
+
+out:
+	munmap(buf, size);
+	return err;
+}
+
+/*
+ * Write @secs sectors from @buf starting at @sector to an image with a
+ * block allocation table, block by block.
+ *
+ * Unallocated blocks are allocated on demand.  After the data is written
+ * the block's sector bitmap is updated, and when the image has a batmap
+ * and every sector of the block is now present the block is also marked
+ * in the batmap.  Blocks already marked in the batmap skip the bitmap
+ * update.
+ *
+ * Allocating a block writes over the position at which the footer
+ * previously sat, so once the block loop has been entered the footer is
+ * always rewritten before returning, on error paths as well.
+ *
+ * Returns 0 on success, -ERANGE if the request extends past
+ * footer.curr_size, or a negative error code.  When both the write and
+ * the footer update fail, the write error is returned.
+ */
+static int
+__vhd_io_dynamic_write(vhd_context_t *ctx,
+		       char *buf, uint64_t sector, uint32_t secs)
+{
+	char *map;
+	off64_t off;
+	uint32_t blk, sec;
+	int i, err, cnt, ret;
+
+	if (vhd_sectors_to_bytes(sector + secs) > ctx->footer.curr_size)
+		return -ERANGE;
+
+	err = vhd_get_bat(ctx);
+	if (err)
+		return err;
+
+	if (vhd_has_batmap(ctx)) {
+		err = vhd_get_batmap(ctx);
+		if (err)
+			return err;
+	}
+
+	do {
+		blk = sector / ctx->spb;
+		sec = sector % ctx->spb;
+
+		off = ctx->bat.bat[blk];
+		if (off == DD_BLK_UNUSED) {
+			err = __vhd_io_allocate_block(ctx, blk);
+			if (err)
+				goto out;
+
+			off = ctx->bat.bat[blk];
+		}
+
+		/* skip the block's bitmap sectors to reach the data */
+		off += ctx->bm_secs + sec;
+		err = vhd_seek(ctx, vhd_sectors_to_bytes(off), SEEK_SET);
+		if (err)
+			goto out;
+
+		/* never write across a block boundary */
+		cnt = MIN(secs, ctx->spb - sec);
+		err = vhd_write(ctx, buf, vhd_sectors_to_bytes(cnt));
+		if (err)
+			goto out;
+
+		/* block already marked in the batmap: no bitmap update */
+		if (vhd_has_batmap(ctx) &&
+		    vhd_batmap_test(ctx, &ctx->batmap, blk))
+			goto next;
+
+		err = vhd_read_bitmap(ctx, blk, &map);
+		if (err)
+			goto out;
+
+		for (i = 0; i < cnt; i++)
+			vhd_bitmap_set(ctx, map, sec + i);
+
+		err = vhd_write_bitmap(ctx, blk, map);
+		if (err)
+			goto fail;
+
+		if (vhd_has_batmap(ctx)) {
+			/* only a completely populated block goes in the batmap */
+			for (i = 0; i < ctx->spb; i++)
+				if (!vhd_bitmap_test(ctx, map, i)) {
+					free(map);
+					goto next;
+				}
+
+			vhd_batmap_set(ctx, &ctx->batmap, blk);
+			err = vhd_write_batmap(ctx, &ctx->batmap);
+			if (err)
+				goto fail;
+		}
+
+		free(map);
+		map = NULL;
+
+	next:
+		secs -= cnt;
+		sector += cnt;
+		buf += vhd_sectors_to_bytes(cnt);
+	} while (secs);
+
+	err = 0;
+
+out:
+	ret = vhd_write_footer(ctx, &ctx->footer);
+	return (err ? err : ret);
+
+fail:
+	free(map);
+	goto out;
+}
+
+/*
+ * Write @secs sectors from @buf to the image starting at sector @sec.
+ *
+ * Returns -ERANGE if the request extends past footer.curr_size, otherwise
+ * the result of the fixed or dynamic write path.
+ */
+int
+vhd_io_write(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
+{
+	if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
+		return -ERANGE;
+
+	if (vhd_type_dynamic(ctx))
+		return __vhd_io_dynamic_write(ctx, buf, sec, secs);
+
+	return __vhd_io_fixed_write(ctx, buf, sec, secs);
+}
diff --git a/tools/blktap2/vhd/lib/relative-path.c b/tools/blktap2/vhd/lib/relative-path.c
new file mode 100644
index 0000000000..8b7cb71fc9
--- /dev/null
+++ b/tools/blktap2/vhd/lib/relative-path.c
@@ -0,0 +1,299 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "relative-path.h"
+
+/*
+ * free() @ptr and reset it to NULL so a later sfree()/free() of the same
+ * variable is harmless.  @ptr must be an lvalue; it is evaluated twice.
+ * The argument is parenthesised so expressions such as *pp expand safely.
+ */
+#define sfree(ptr)		\
+do {				\
+	free(ptr);		\
+	(ptr) = NULL;		\
+} while (0)
+
+/*
+ * count the DELIMITER characters in @path
+ * returns 0 for a NULL @path
+ */
+static int
+count_nodes(char *path)
+{
+	int count = 0;
+
+	if (path) {
+		while (*path != '\0') {
+			if (*path == DELIMITER)
+				count++;
+			path++;
+		}
+	}
+
+	return count;
+}
+
+/*
+ * return a copy of the text between *@path and the next DELIMITER, or
+ * NULL if there is no further DELIMITER (in which case @err is 0)
+ * on success *@path is advanced to that DELIMITER
+ * @err is set to a negative errno value on failure
+ * copy should be freed
+ */
+static char *
+next_node(char **path, int *err)
+{
+	int len;
+	char *end, *copy;
+
+	if (!path || !*path) {
+		*err = -EINVAL;
+		return NULL;
+	}
+
+	*err = 0;
+
+	/* find the delimiter that terminates this node */
+	for (end = *path; *end != '\0' && *end != DELIMITER; end++)
+		;
+
+	if (*end == '\0')
+		return NULL;
+
+	/* node text plus terminating NUL */
+	len = end - *path + 1;
+	copy = malloc(len);
+	if (!copy) {
+		*err = -ENOMEM;
+		return NULL;
+	}
+
+	/* snprintf() truncates the copy right before the delimiter */
+	if (snprintf(copy, len, "%s", *path) < 0) {
+		free(copy);
+		*err = -EINVAL;
+		return NULL;
+	}
+
+	*path = end;
+	return copy;
+}
+
+/*
+ * count the number of leading nodes @to and @from have in common
+ * only DELIMITER-terminated nodes are compared
+ * returns number of common nodes, or -errno on failure
+ */
+static int
+count_common_nodes(char *to, char *from)
+{
+	int rc = 0, matched = 0;
+	char *a = NULL, *b = NULL;
+
+	if (!to || !from)
+		return -EINVAL;
+
+	for (;;) {
+		a = next_node(&to, &rc);
+		if (rc || !a)
+			break;
+
+		b = next_node(&from, &rc);
+		if (rc || !b)
+			break;
+
+		if (strncmp(a, b, MAX_NAME_LEN) != 0)
+			break;
+
+		/* next_node() left both paths on the delimiter: step over it */
+		to++;
+		from++;
+		matched++;
+
+		sfree(a);
+		sfree(b);
+	}
+
+	/* release whatever the loop was holding when it stopped */
+	sfree(a);
+	sfree(b);
+
+	return rc ? rc : matched;
+}
+
+/*
+ * build a string of @count repetitions of "../", or "./" when @count is
+ * zero; returns NULL on error (including a result of MAX_NAME_LEN
+ * characters or more)
+ * result should be freed
+ */
+static char *
+up_nodes(int count)
+{
+	int n, wrote, step, total;
+	char *buf, *cur;
+
+	if (count == 0)
+		return strdup("./");
+
+	step = strlen("../");
+	total = step * count;
+	if (total >= MAX_NAME_LEN)
+		return NULL;
+
+	buf = malloc(total + 1);
+	if (!buf)
+		return NULL;
+
+	for (n = 0, cur = buf; n < count; n++, cur += wrote) {
+		wrote = sprintf(cur, "../");
+		/* also catches a negative return, since step is positive */
+		if (wrote != step) {
+			free(buf);
+			return NULL;
+		}
+	}
+
+	return buf;
+}
+
+/*
+ * return a pointer to the character following the @offset'th DELIMITER
+ * in @from, or NULL if @from is NULL, @offset is zero, or @from holds
+ * fewer delimiters than that
+ */
+static char *
+node_offset(char *from, int offset)
+{
+	char *cur;
+
+	if (!from || !offset)
+		return NULL;
+
+	for (cur = from; *cur != '\0'; cur++) {
+		if (*cur != DELIMITER)
+			continue;
+
+		if (--offset == 0)
+			return cur + 1;
+	}
+
+	return NULL;
+}
+
+/*
+ * return a relative path from @from to @to
+ *
+ * both paths are resolved with realpath() first; a path that cannot be
+ * resolved is reported as an error
+ * the result is as many "../" as are needed to get from @from's location
+ * to the last node shared with @to, followed by the rest of @to
+ *
+ * returns NULL on failure, with @err set to a negative errno value
+ * (@err is 0 on success)
+ * result should be freed
+ */
+char *
+relative_path_to(char *from, char *to, int *err)
+{
+ int from_nodes, common;
+ char *to_absolute, *from_absolute;
+ char *up, *common_target_path, *relative_path;
+
+ *err = 0;
+ up = NULL;
+ to_absolute = NULL;
+ from_absolute = NULL;
+ relative_path = NULL;
+
+ /* nothing has been allocated yet, so a plain return is fine here */
+ if (strnlen(to, MAX_NAME_LEN) == MAX_NAME_LEN ||
+ strnlen(from, MAX_NAME_LEN) == MAX_NAME_LEN) {
+ EPRINTF("invalid input; max path length is %d\n",
+ MAX_NAME_LEN);
+ *err = -ENAMETOOLONG;
+ return NULL;
+ }
+
+ /* realpath() with a NULL buffer allocates the result; freed at out: */
+ to_absolute = realpath(to, NULL);
+ if (!to_absolute) {
+ EPRINTF("failed to get absolute path of %s\n", to);
+ *err = -errno;
+ goto out;
+ }
+
+ from_absolute = realpath(from, NULL);
+ if (!from_absolute) {
+ EPRINTF("failed to get absolute path of %s\n", from);
+ *err = -errno;
+ goto out;
+ }
+
+ /* the resolved paths may be longer than the inputs were */
+ if (strnlen(to_absolute, MAX_NAME_LEN) == MAX_NAME_LEN ||
+ strnlen(from_absolute, MAX_NAME_LEN) == MAX_NAME_LEN) {
+ EPRINTF("invalid input; max path length is %d\n",
+ MAX_NAME_LEN);
+ *err = -ENAMETOOLONG;
+ goto out;
+ }
+
+ /* count nodes in source path */
+ from_nodes = count_nodes(from_absolute);
+
+ /* count nodes in common */
+ /* the first character of each path is skipped -- presumably the leading DELIMITER of an absolute path */
+ common = count_common_nodes(to_absolute + 1, from_absolute + 1);
+ if (common < 0) {
+ EPRINTF("failed to count common nodes of %s and %s: %d\n",
+ to_absolute, from_absolute, common);
+ *err = common;
+ goto out;
+ }
+
+ /* move up to common node */
+ up = up_nodes(from_nodes - common - 1);
+ if (!up) {
+ EPRINTF("failed to allocate relative path for %s: %d\n",
+ from_absolute, -ENOMEM);
+ *err = -ENOMEM;
+ goto out;
+ }
+
+ /* get path from common node to target */
+ /* this points into to_absolute and must not be freed separately */
+ common_target_path = node_offset(to_absolute, common + 1);
+ if (!common_target_path) {
+ EPRINTF("failed to find common target path to %s: %d\n",
+ to_absolute, -EINVAL);
+ *err = -EINVAL;
+ goto out;
+ }
+
+ /* get relative path */
+ if (asprintf(&relative_path, "%s%s", up, common_target_path) == -1) {
+ EPRINTF("failed to construct final path %s%s: %d\n",
+ up, common_target_path, -ENOMEM);
+ relative_path = NULL;
+ *err = -ENOMEM;
+ goto out;
+ }
+
+out:
+ /* the intermediate strings are released on success and failure alike */
+ sfree(up);
+ sfree(to_absolute);
+ sfree(from_absolute);
+
+ return relative_path;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-check.c b/tools/blktap2/vhd/lib/vhd-util-check.c
new file mode 100644
index 0000000000..d7d588088a
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-check.c
@@ -0,0 +1,977 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+// allow the VHD timestamp to be at most this many seconds into the future to
+// account for time skew with NFS servers
+#define TIMESTAMP_MAX_SLACK 1800
+
+/*
+ * Check whether the @size bytes at @buf are all zero.
+ *
+ * Returns 0 if every byte is zero (or @size is 0) and 1 otherwise.
+ * Earlier code returned the index of the first non-zero byte, which made
+ * a non-zero byte at index 0 indistinguishable from an all-zero buffer.
+ */
+static int
+vhd_util_check_zeros(void *buf, size_t size)
+{
+	size_t i;
+	const char *p = buf;
+
+	for (i = 0; i < size; i++)
+		if (p[i])
+			return 1;
+
+	return 0;
+}
+
+/*
+ * Return 1 if every 32-bit word of @footer equals 0xc7c7c7c7, else 0.
+ * NOTE(review): judging by the name this pattern marks a footer of an
+ * image that is currently open -- confirm against the writer side.
+ */
+static int
+vhd_util_check_footer_opened(vhd_footer_t *footer)
+{
+	int idx;
+	const int words = sizeof(*footer) / sizeof(uint32_t);
+	uint32_t *word = (uint32_t *)footer;
+
+	for (idx = 0; idx < words; idx++, word++) {
+		if (*word != 0xc7c7c7c7)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Validate the fields of an already byte-swapped @footer.
+ *
+ * Returns NULL if the footer is acceptable, otherwise a static string
+ * naming the first check that failed.  The checks run in the order
+ * written below, so that order determines which problem is reported.
+ */
+static char *
+vhd_util_check_validate_footer(vhd_footer_t *footer)
+{
+ int size;
+ uint32_t checksum, now;
+
+ size = sizeof(footer->cookie);
+ if (memcmp(footer->cookie, HD_COOKIE, size))
+ return "invalid cookie";
+
+ checksum = vhd_checksum_footer(footer);
+ if (checksum != footer->checksum) {
+ /*
+ * For footers created by "tap" versions 0.1 and 1.1 with the
+ * hidden field set, also accept a checksum that matches the
+ * footer with hidden cleared.
+ */
+ if (footer->hidden &&
+ !strncmp(footer->crtr_app, "tap", 3) &&
+ (footer->crtr_ver == VHD_VERSION(0, 1) ||
+ footer->crtr_ver == VHD_VERSION(1, 1))) {
+ char tmp = footer->hidden;
+ footer->hidden = 0;
+ checksum = vhd_checksum_footer(footer);
+ footer->hidden = tmp;
+
+ if (checksum == footer->checksum)
+ goto ok;
+ }
+
+ return "invalid checksum";
+ }
+
+ok:
+ /* HD_RESERVED must be set; only HD_TEMPORARY may accompany it */
+ if (!(footer->features & HD_RESERVED))
+ return "invalid 'reserved' feature";
+
+ if (footer->features & ~(HD_TEMPORARY | HD_RESERVED))
+ return "invalid extra features";
+
+ if (footer->ff_version != HD_FF_VERSION)
+ return "invalid file format version";
+
+ /* types other than dynamic/differencing must have an all-ones data offset */
+ if (footer->type != HD_TYPE_DYNAMIC &&
+ footer->type != HD_TYPE_DIFF &&
+ footer->data_offset != ~(0ULL))
+ return "invalid data offset";
+
+ /* tolerate timestamps up to TIMESTAMP_MAX_SLACK seconds ahead of now */
+ now = vhd_time(time(NULL));
+ if (footer->timestamp > now + TIMESTAMP_MAX_SLACK)
+ return "creation time in future";
+
+ if (!strncmp(footer->crtr_app, "tap", 3) &&
+ footer->crtr_ver > VHD_CURRENT_VERSION)
+ return "unsupported tap creator version";
+
+ if (vhd_chs(footer->curr_size) < footer->geometry)
+ return "geometry too large";
+
+ if (footer->type != HD_TYPE_FIXED &&
+ footer->type != HD_TYPE_DYNAMIC &&
+ footer->type != HD_TYPE_DIFF)
+ return "invalid type";
+
+ /* saved and hidden may only be 0 or 1 */
+ if (footer->saved && footer->saved != 1)
+ return "invalid 'saved' state";
+
+ if (footer->hidden && footer->hidden != 1)
+ return "invalid 'hidden' state";
+
+ if (vhd_util_check_zeros(footer->reserved,
+ sizeof(footer->reserved)))
+ return "invalid 'reserved' bits";
+
+ return NULL;
+}
+
+/*
+ * Validate the fields of an already byte-swapped @header read from @fd.
+ * @fd is only used to find the end of the file, which bounds the BAT.
+ *
+ * Returns NULL if the header is acceptable, otherwise a static string
+ * naming the first check that failed.
+ */
+static char *
+vhd_util_check_validate_header(int fd, vhd_header_t *header)
+{
+	off64_t end;
+	int bit, set_bits;
+
+	if (memcmp(header->cookie, DD_COOKIE, sizeof(header->cookie)))
+		return "invalid cookie";
+
+	if (vhd_checksum_header(header) != header->checksum)
+		return "invalid checksum";
+
+	if (header->hdr_ver != 0x00010000)
+		return "invalid header version";
+
+	if (header->data_offset != ~(0ULL))
+		return "invalid data offset";
+
+	end = lseek64(fd, 0, SEEK_END);
+	if (end == (off64_t)-1)
+		return "error finding eof";
+
+	/* the BAT must be sector aligned and must end before the footer */
+	if (header->table_offset <= 0)
+		return "invalid table offset";
+
+	if (header->table_offset % 512)
+		return "invalid table offset";
+
+	if (header->table_offset +
+	    (header->max_bat_size * sizeof(uint32_t)) >
+	    end - sizeof(vhd_footer_t))
+		return "invalid table offset";
+
+	/* exactly one bit set: the block size must be a power of two */
+	set_bits = 0;
+	for (bit = 0; bit < sizeof(header->block_size) * 8; bit++) {
+		if ((header->block_size >> bit) & 1)
+			set_bits++;
+	}
+
+	if (set_bits != 1)
+		return "invalid block size";
+
+	if (header->res1)
+		return "invalid reserved bits";
+
+	if (vhd_util_check_zeros(header->res2, sizeof(header->res2)))
+		return "invalid reserved bits";
+
+	return NULL;
+}
+
+/*
+ * Validate the parent-related header fields of @vhd.
+ *
+ * For a differencing image the parent timestamp must not lie more than
+ * TIMESTAMP_MAX_SLACK seconds in the future and the parent name must be
+ * decodable.  For any other type all parent fields must be empty.
+ *
+ * Returns NULL if the fields are acceptable, otherwise a static string
+ * naming the first check that failed.
+ */
+static char *
+vhd_util_check_validate_differencing_header(vhd_context_t *vhd)
+{
+	char *pname;
+	uint32_t current;
+	vhd_header_t *hdr = &vhd->header;
+
+	if (vhd->footer.type != HD_TYPE_DIFF) {
+		/* no parent: none of the parent fields may be populated */
+		if (vhd_util_check_zeros(hdr->prt_name,
+					 sizeof(hdr->prt_name)))
+			return "invalid non-null parent name";
+
+		if (vhd_util_check_zeros(hdr->loc, sizeof(hdr->loc)))
+			return "invalid non-null parent locators";
+
+		if (!uuid_is_null(hdr->prt_uuid))
+			return "invalid non-null parent uuid";
+
+		if (hdr->prt_ts)
+			return "invalid non-zero parent timestamp";
+
+		return NULL;
+	}
+
+	current = vhd_time(time(NULL));
+	if (hdr->prt_ts > current + TIMESTAMP_MAX_SLACK)
+		return "parent creation time in future";
+
+	if (vhd_header_decode_parent(vhd, hdr, &pname))
+		return "invalid parent name";
+
+	/* only decodability matters here, not the decoded name */
+	free(pname);
+
+	return NULL;
+}
+
+/*
+ * Validate the header of @batmap, which belongs to the open image @vhd.
+ *
+ * Returns NULL if the batmap header is acceptable, otherwise a static
+ * string naming the first check that failed.
+ */
+static char *
+vhd_util_check_validate_batmap(vhd_context_t *vhd, vhd_batmap_t *batmap)
+{
+	off64_t end;
+	vhd_batmap_header_t *hdr = &batmap->header;
+
+	if (memcmp(hdr->cookie, VHD_BATMAP_COOKIE, sizeof(hdr->cookie)))
+		return "invalid cookie";
+
+	if (hdr->batmap_version > VHD_BATMAP_CURRENT_VERSION)
+		return "unsupported batmap version";
+
+	if (vhd_checksum_batmap(batmap) != hdr->checksum)
+		return "invalid checksum";
+
+	if (!hdr->batmap_size)
+		return "invalid size zero";
+
+	end = lseek64(vhd->fd, 0, SEEK_END);
+	if (end == (off64_t)-1)
+		return "error finding eof";
+
+	/* the map must start on a non-zero sector boundary ... */
+	if (!hdr->batmap_offset || hdr->batmap_offset % 512)
+		return "invalid batmap offset";
+
+	/* ... and must end before the footer at the end of the file */
+	if ((hdr->batmap_offset +
+	     vhd_sectors_to_bytes(hdr->batmap_size)) >
+	    end - sizeof(vhd_footer_t))
+		return "invalid batmap size";
+
+	return NULL;
+}
+
+/*
+ * Validate one parent locator entry @loc of the open image @vhd.
+ *
+ * An entry with code PLAT_CODE_NONE must be entirely zero.  Any other
+ * entry needs a non-zero offset, space and length, must lie before the
+ * footer at the end of the file, and must have its reserved field clear.
+ *
+ * Returns NULL if the entry is acceptable, otherwise a static string
+ * naming the first check that failed.
+ */
+static char *
+vhd_util_check_validate_parent_locator(vhd_context_t *vhd,
+				       vhd_parent_locator_t *loc)
+{
+	off64_t end;
+
+	if (vhd_validate_platform_code(loc->code))
+		return "invalid platform code";
+
+	if (loc->code == PLAT_CODE_NONE)
+		return vhd_util_check_zeros(loc, sizeof(*loc)) ?
+			"non-zero locator" : NULL;
+
+	if (!loc->data_offset)
+		return "invalid data offset";
+
+	if (!loc->data_space)
+		return "invalid data space";
+
+	if (!loc->data_len)
+		return "invalid data length";
+
+	end = lseek64(vhd->fd, 0, SEEK_END);
+	if (end == (off64_t)-1)
+		return "error finding eof";
+
+	if (loc->data_offset + vhd_parent_locator_size(loc) >
+	    end - sizeof(vhd_footer_t))
+		return "invalid size";
+
+	if (loc->res)
+		return "invalid reserved bits";
+
+	return NULL;
+}
+
+/*
+ * Check that the image at @ppath really is the parent of @vhd by
+ * comparing the parent uuid stored in @vhd's header with the uuid in the
+ * parent's footer.  When vhd_parent_raw() holds nothing is checked.
+ *
+ * Returns NULL if the parent is acceptable, otherwise a static string
+ * describing the problem.
+ */
+static char *
+vhd_util_check_validate_parent(vhd_context_t *vhd, const char *ppath)
+{
+	char *result = NULL;
+	vhd_context_t parent;
+
+	if (vhd_parent_raw(vhd))
+		return NULL;
+
+	if (vhd_open(&parent, ppath,
+		     VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED))
+		return "error opening parent";
+
+	if (uuid_compare(vhd->header.prt_uuid, parent.footer.uuid))
+		result = "invalid parent uuid";
+
+	vhd_close(&parent);
+	return result;
+}
+
+/*
+ * Read and validate the footer(s) of the image open on @fd.
+ *
+ * The primary footer is read from the end of the file (511 bytes when the
+ * file size is not a multiple of 512, otherwise 512).  Unless the image
+ * is of type HD_TYPE_FIXED, the backup footer at offset 0 is read and
+ * validated too, and both must match.
+ *
+ * @ignore: when non-zero, and the primary footer consists of the pattern
+ *          tested by vhd_util_check_footer_opened(), an invalid or
+ *          mismatching primary footer is tolerated and the backup footer
+ *          is used instead.
+ *
+ * On success the validated footer is copied to @footer for every image
+ * type and 0 is returned; otherwise a negative error code is returned and
+ * a message is printed.
+ */
+static int
+vhd_util_check_footer(int fd, vhd_footer_t *footer, int ignore)
+{
+	size_t size;
+	int err, opened;
+	char *msg, *buf;
+	off64_t eof, off;
+	vhd_footer_t primary, backup;
+
+	memset(&primary, 0, sizeof(primary));
+	memset(&backup, 0, sizeof(backup));
+
+	err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(primary));
+	if (err) {
+		printf("error allocating buffer: %d\n", err);
+		return -err;
+	}
+
+	memset(buf, 0, sizeof(primary));
+
+	eof = lseek64(fd, 0, SEEK_END);
+	if (eof == (off64_t)-1) {
+		err = -errno;
+		printf("error calculating end of file: %d\n", err);
+		goto out;
+	}
+
+	/* a file size that is not sector aligned means a 511-byte footer */
+	size = ((eof % 512) ? 511 : 512);
+	eof = lseek64(fd, eof - size, SEEK_SET);
+	if (eof == (off64_t)-1) {
+		err = -errno;
+		printf("error calculating end of file: %d\n", err);
+		goto out;
+	}
+
+	err = read(fd, buf, 512);
+	if (err != size) {
+		err = (errno ? -errno : -EIO);
+		printf("error reading primary footer: %d\n", err);
+		goto out;
+	}
+
+	memcpy(&primary, buf, sizeof(primary));
+	/* must be tested on the raw bytes, before byte swapping */
+	opened = vhd_util_check_footer_opened(&primary);
+	vhd_footer_in(&primary);
+
+	msg = vhd_util_check_validate_footer(&primary);
+	if (msg) {
+		if (opened && ignore)
+			goto check_backup;
+
+		err = -EINVAL;
+		printf("primary footer invalid: %s\n", msg);
+		goto out;
+	}
+
+	/*
+	 * For a fixed image the backup footer is not examined, but the
+	 * caller still needs the validated primary footer, so finish via
+	 * ok: rather than leaving *footer untouched.
+	 */
+	if (primary.type == HD_TYPE_FIXED)
+		goto ok;
+
+check_backup:
+	off = lseek64(fd, 0, SEEK_SET);
+	if (off == (off64_t)-1) {
+		err = -errno;
+		printf("error seeking to backup footer: %d\n", err);
+		goto out;
+	}
+
+	size = 512;
+	memset(buf, 0, sizeof(primary));
+
+	err = read(fd, buf, size);
+	if (err != size) {
+		err = (errno ? -errno : -EIO);
+		printf("error reading backup footer: %d\n", err);
+		goto out;
+	}
+
+	memcpy(&backup, buf, sizeof(backup));
+	vhd_footer_in(&backup);
+
+	msg = vhd_util_check_validate_footer(&backup);
+	if (msg) {
+		err = -EINVAL;
+		printf("backup footer invalid: %s\n", msg);
+		goto out;
+	}
+
+	if (memcmp(&primary, &backup, sizeof(primary))) {
+		/* tolerated "opened" primary: trust the backup instead */
+		if (opened && ignore) {
+			memcpy(&primary, &backup, sizeof(primary));
+			goto ok;
+		}
+
+		/*
+		 * Same allowance as in vhd_util_check_validate_footer():
+		 * for "tap" 0.1 / 1.1 footers, accept a backup that differs
+		 * from the primary only in its hidden field.
+		 */
+		if (backup.hidden &&
+		    !strncmp(backup.crtr_app, "tap", 3) &&
+		    (backup.crtr_ver == VHD_VERSION(0, 1) ||
+		     backup.crtr_ver == VHD_VERSION(1, 1))) {
+			char cmp, tmp = backup.hidden;
+			backup.hidden = 0;
+			cmp = memcmp(&primary, &backup, sizeof(primary));
+			backup.hidden = tmp;
+			if (!cmp)
+				goto ok;
+		}
+
+		err = -EINVAL;
+		printf("primary and backup footers do not match\n");
+		goto out;
+	}
+
+ok:
+	err = 0;
+	memcpy(footer, &primary, sizeof(primary));
+
+out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Read the header located at @footer->data_offset from @fd and validate
+ * it with vhd_util_check_validate_header().
+ *
+ * Returns 0 if the header is valid, otherwise a negative error code; a
+ * message describing the problem is printed.
+ */
+static int
+vhd_util_check_header(int fd, vhd_footer_t *footer)
+{
+	int err;
+	off64_t off;
+	char *msg, *buf;
+	vhd_header_t header;
+
+	err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(header));
+	if (err) {
+		/*
+		 * posix_memalign() returns a positive errno value; negate it
+		 * to match every other error return of the checks here.
+		 */
+		printf("error allocating header: %d\n", err);
+		return -err;
+	}
+
+	off = footer->data_offset;
+	off = lseek64(fd, off, SEEK_SET);
+	if (off == (off64_t)-1) {
+		err = -errno;
+		printf("error seeking to header: %d\n", err);
+		goto out;
+	}
+
+	err = read(fd, buf, sizeof(header));
+	if (err != sizeof(header)) {
+		err = (errno ? -errno : -EIO);
+		printf("error reading header: %d\n", err);
+		goto out;
+	}
+
+	memcpy(&header, buf, sizeof(header));
+	vhd_header_in(&header);
+
+	msg = vhd_util_check_validate_header(fd, &header);
+	if (msg) {
+		err = -EINVAL;
+		printf("header is invalid: %s\n", msg);
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Run vhd_util_check_validate_differencing_header() on @vhd and report
+ * any problem it finds.
+ *
+ * Returns 0 if the header is valid, -EINVAL otherwise.
+ */
+static int
+vhd_util_check_differencing_header(vhd_context_t *vhd)
+{
+	char *problem = vhd_util_check_validate_differencing_header(vhd);
+
+	if (!problem)
+		return 0;
+
+	printf("differencing header is invalid: %s\n", problem);
+	return -EINVAL;
+}
+
+/*
+ * Check the block allocation table of @vhd for consistency.
+ *
+ * Every allocated block (bitmap sectors plus data sectors) must start
+ * after the metadata reported by vhd_end_of_headers(), must end before
+ * the footer at the end of the file, and must not overlap any other
+ * allocated block.
+ *
+ * Block extents are computed in 64 bits so that a large offset in a
+ * corrupt BAT cannot wrap around and slip past the checks.
+ *
+ * Returns 0 if the BAT is consistent, otherwise a negative error code; a
+ * message describing the problem is printed.
+ */
+static int
+vhd_util_check_bat(vhd_context_t *vhd)
+{
+	off64_t eof, eoh;
+	int64_t block_size;
+	int i, j, err;
+
+	err = vhd_seek(vhd, 0, SEEK_END);
+	if (err) {
+		printf("error calculating eof: %d\n", err);
+		return err;
+	}
+
+	eof = vhd_position(vhd);
+	if (eof == (off64_t)-1) {
+		/* capture errno before printf() can overwrite it */
+		err = -errno;
+		printf("error calculating eof: %d\n", err);
+		return err;
+	}
+
+	/* adjust eof for vhds with short footers */
+	if (eof % 512) {
+		if (eof % 512 != 511) {
+			printf("invalid file size: 0x%"PRIx64"\n", eof);
+			return -EINVAL;
+		}
+
+		eof++;
+	}
+
+	err = vhd_get_bat(vhd);
+	if (err) {
+		printf("error reading bat: %d\n", err);
+		return err;
+	}
+
+	err = vhd_end_of_headers(vhd, &eoh);
+	if (err) {
+		printf("error calculating end of metadata: %d\n", err);
+		return err;
+	}
+
+	/* from here on everything is measured in sectors */
+	eof -= sizeof(vhd_footer_t);
+	eof >>= VHD_SECTOR_SHIFT;
+	eoh >>= VHD_SECTOR_SHIFT;
+	block_size = (int64_t)vhd->spb + vhd->bm_secs;
+
+	for (i = 0; i < vhd->header.max_bat_size; i++) {
+		uint32_t off = vhd->bat.bat[i];
+		int64_t end;
+
+		if (off == DD_BLK_UNUSED)
+			continue;
+
+		end = (int64_t)off + block_size;
+
+		if (off < eoh) {
+			printf("block %d (offset 0x%x) clobbers headers\n",
+			       i, off);
+			return -EINVAL;
+		}
+
+		if (end > eof) {
+			printf("block %d (offset 0x%x) clobbers footer\n",
+			       i, off);
+			return -EINVAL;
+		}
+
+		for (j = 0; j < vhd->header.max_bat_size; j++) {
+			uint32_t joff = vhd->bat.bat[j];
+			int64_t jend;
+
+			if (i == j)
+				continue;
+
+			if (joff == DD_BLK_UNUSED)
+				continue;
+
+			jend = (int64_t)joff + block_size;
+
+			/* same start, or the two extents intersect */
+			if (off == joff || (off < jend && joff < end)) {
+				printf("block %d (offset 0x%x) clobbers "
+				       "block %d (offset 0x%x)\n",
+				       i, off, j, joff);
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Load and validate the batmap of @vhd and cross-check it against the
+ * BAT: a block flagged in the batmap must be allocated in the BAT.
+ *
+ * Returns 0 if the batmap is consistent, otherwise a negative error code;
+ * a message describing the problem is printed.
+ */
+static int
+vhd_util_check_batmap(vhd_context_t *vhd)
+{
+	int blk, rc;
+	char *problem;
+
+	rc = vhd_get_bat(vhd);
+	if (rc) {
+		printf("error reading bat: %d\n", rc);
+		return rc;
+	}
+
+	rc = vhd_get_batmap(vhd);
+	if (rc) {
+		printf("error reading batmap: %d\n", rc);
+		return rc;
+	}
+
+	problem = vhd_util_check_validate_batmap(vhd, &vhd->batmap);
+	if (problem) {
+		printf("batmap is invalid: %s\n", problem);
+		return -EINVAL;
+	}
+
+	for (blk = 0; blk < vhd->header.max_bat_size; blk++) {
+		/* only blocks flagged in the batmap need a BAT entry */
+		if (vhd_batmap_test(vhd, &vhd->batmap, blk) &&
+		    vhd->bat.bat[blk] == DD_BLK_UNUSED) {
+			printf("batmap shows unallocated block %d full\n", blk);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Validate the parent locators stored in the header of 'vhd'.
+ *
+ * Decodes the parent name from the header, then walks every locator slot:
+ * each must pass vhd_util_check_validate_parent_locator(), carry one of the
+ * platform codes handled in the switch below, and no platform code may
+ * appear twice.  For MACX, W2RU and W2KU locators the stored path is read,
+ * its basename is compared with the header's parent name, the path is
+ * resolved with vhd_find_parent() and the result is validated with
+ * vhd_util_check_validate_parent().
+ *
+ * Returns 0 if at least one such locator resolved to a valid parent,
+ * otherwise -EINVAL or the error from the failing step.
+ */
+static int
+vhd_util_check_parent_locators(vhd_context_t *vhd)
+{
+ int i, n, err;
+ vhd_parent_locator_t *loc;
+ char *msg, *file, *ppath, *location, *pname;
+ int mac, macx, w2ku, w2ru, wi2r, wi2k, found;
+
+ /* per-platform-code occurrence counters, used for duplicate detection */
+ mac = 0;
+ macx = 0;
+ w2ku = 0;
+ w2ru = 0;
+ wi2r = 0;
+ wi2k = 0;
+ found = 0;
+ pname = NULL;
+ ppath = NULL;
+ location = NULL;
+
+ err = vhd_header_decode_parent(vhd, &vhd->header, &pname);
+ if (err) {
+ printf("error decoding parent name: %d\n", err);
+ return err;
+ }
+
+ n = sizeof(vhd->header.loc) / sizeof(vhd->header.loc[0]);
+ for (i = 0; i < n; i++) {
+ ppath = NULL;
+ location = NULL;
+ loc = vhd->header.loc + i;
+
+ msg = vhd_util_check_validate_parent_locator(vhd, loc);
+ if (msg) {
+ err = -EINVAL;
+ printf("invalid parent locator %d: %s\n", i, msg);
+ goto out;
+ }
+
+ /* PLAT_CODE_NONE entries are skipped without further checks */
+ if (loc->code == PLAT_CODE_NONE)
+ continue;
+
+ /* count each platform code; a second occurrence jumps to 'dup' */
+ switch (loc->code) {
+ case PLAT_CODE_MACX:
+ if (macx++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_MAC:
+ if (mac++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_W2KU:
+ if (w2ku++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_W2RU:
+ if (w2ru++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_WI2R:
+ if (wi2r++)
+ goto dup;
+ break;
+
+ case PLAT_CODE_WI2K:
+ if (wi2k++)
+ goto dup;
+ break;
+
+ default:
+ err = -EINVAL;
+ printf("invalid platform code for locator %d\n", i);
+ goto out;
+ }
+
+ /* only MACX, W2RU and W2KU locators are read and resolved below */
+ if (loc->code != PLAT_CODE_MACX &&
+ loc->code != PLAT_CODE_W2RU &&
+ loc->code != PLAT_CODE_W2KU)
+ continue;
+
+ err = vhd_parent_locator_read(vhd, loc, &ppath);
+ if (err) {
+ printf("error reading parent locator %d: %d\n", i, err);
+ goto out;
+ }
+
+ /* the locator's file name must match the parent name in the header */
+ file = basename(ppath);
+ if (strcmp(pname, file)) {
+ err = -EINVAL;
+ printf("parent locator %d name (%s) does not match "
+ "header name (%s)\n", i, file, pname);
+ goto out;
+ }
+
+ err = vhd_find_parent(vhd, ppath, &location);
+ if (err) {
+ printf("error resolving %s: %d\n", ppath, err);
+ goto out;
+ }
+
+ /* an unreadable target is only treated as fatal for MACX locators */
+ err = access(location, R_OK);
+ if (err && loc->code == PLAT_CODE_MACX) {
+ err = -errno;
+ printf("parent locator %d points to missing file %s "
+ "(resolved to %s)\n", i, ppath, location);
+ goto out;
+ }
+
+ msg = vhd_util_check_validate_parent(vhd, location);
+ if (msg) {
+ err = -EINVAL;
+ printf("invalid parent %s: %s\n", location, msg);
+ goto out;
+ }
+
+ /*
+ * Release this iteration's strings and reset the pointers so the
+ * frees at 'out' do not release them a second time.
+ */
+ found++;
+ free(ppath);
+ free(location);
+ ppath = NULL;
+ location = NULL;
+
+ continue;
+
+ /* reached only via goto from the switch above */
+ dup:
+ printf("duplicate platform code in locator %d: 0x%x\n",
+ i, loc->code);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* no MACX/W2RU/W2KU locator made it through all the checks */
+ if (!found) {
+ err = -EINVAL;
+ printf("could not find parent %s\n", pname);
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ free(pname);
+ free(ppath);
+ free(location);
+ return err;
+}
+
+/*
+ * Report that 'name' appears invalid and dump its metadata by calling
+ * vhd_util_read() with the argument vector "read -p -n <name>".
+ */
+static void
+vhd_util_dump_headers(const char *name)
+{
+    char *args[4];
+    int nargs;
+
+    args[0] = "read";
+    args[1] = "-p";
+    args[2] = "-n";
+    args[3] = (char *)name;
+    nargs = sizeof(args) / sizeof(args[0]);
+
+    printf("%s appears invalid; dumping metadata\n", name);
+    vhd_util_read(nargs, args);
+}
+
+/*
+ * Run the consistency checks on a single VHD image.
+ *
+ * 'name' must refer to a regular file or block device.  The footer is
+ * checked first ('ignore' is forwarded to vhd_util_check_footer()).  Images
+ * whose footer type is neither dynamic nor differencing return right after
+ * the footer check.  Otherwise the header, differencing header, BAT, batmap
+ * (when present) and, for differencing images, the parent locators are
+ * checked in that order; the first failure stops the run.
+ *
+ * On any failure after the file has been opened, the image metadata is
+ * dumped via vhd_util_dump_headers().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_util_check_vhd(const char *name, int ignore)
+{
+    int fd, err;
+    vhd_context_t vhd;
+    struct stat stats;
+    vhd_footer_t footer;
+
+    fd = -1;
+    /* zeroed so the vhd_close() at 'out' is reached with a known state */
+    memset(&vhd, 0, sizeof(vhd));
+
+    err = stat(name, &stats);
+    if (err == -1) {
+        /* capture errno before printf(), which is allowed to modify it */
+        err = -errno;
+        printf("cannot stat %s: %d\n", name, -err);
+        return err;
+    }
+
+    if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+        printf("%s is not a regular file or block device\n", name);
+        return -EINVAL;
+    }
+
+    fd = open(name, O_RDONLY | O_DIRECT | O_LARGEFILE);
+    if (fd == -1) {
+        /* capture errno before printf(), which is allowed to modify it */
+        err = -errno;
+        printf("error opening %s\n", name);
+        return err;
+    }
+
+    err = vhd_util_check_footer(fd, &footer, ignore);
+    if (err)
+        goto out;
+
+    /* nothing beyond the footer is checked for other image types */
+    if (footer.type != HD_TYPE_DYNAMIC && footer.type != HD_TYPE_DIFF)
+        goto out;
+
+    err = vhd_util_check_header(fd, &footer);
+    if (err)
+        goto out;
+
+    err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+    if (err)
+        goto out;
+
+    err = vhd_util_check_differencing_header(&vhd);
+    if (err)
+        goto out;
+
+    err = vhd_util_check_bat(&vhd);
+    if (err)
+        goto out;
+
+    if (vhd_has_batmap(&vhd)) {
+        err = vhd_util_check_batmap(&vhd);
+        if (err)
+            goto out;
+    }
+
+    if (vhd.footer.type == HD_TYPE_DIFF) {
+        err = vhd_util_check_parent_locators(&vhd);
+        if (err)
+            goto out;
+    }
+
+    err = 0;
+    printf("%s is valid\n", name);
+
+out:
+    if (err)
+        vhd_util_dump_headers(name);
+    if (fd != -1)
+        close(fd);
+    vhd_close(&vhd);
+    return err;
+}
+
+/*
+ * Walk the parent chain of 'name' and run vhd_util_check_vhd() on every
+ * ancestor.
+ *
+ * The walk ends when an image cannot be opened, is not a differencing
+ * image, has a raw parent, its parent cannot be located, or a parent fails
+ * the check.  'ignore' is forwarded to vhd_util_check_vhd().
+ *
+ * Returns 0 if the walk finished without error, otherwise the first error.
+ */
+static int
+vhd_util_check_parents(const char *name, int ignore)
+{
+    int rc;
+    vhd_context_t ctx;
+    char *current, *next;
+
+    /* 'current' aliases the caller's string until the first parent is found */
+    current = (char *)name;
+
+    while (1) {
+        rc = vhd_open(&ctx, current,
+                      VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+        if (rc)
+            break;
+
+        if (vhd_ctx_is_chain_end:
+            0, ctx.footer.type != HD_TYPE_DIFF || vhd_parent_raw(&ctx)) {
+            vhd_close(&ctx);
+            break;
+        }
+
+        rc = vhd_parent_locator_get(&ctx, &next);
+        vhd_close(&ctx);
+
+        if (rc) {
+            printf("error getting parent: %d\n", rc);
+            break;
+        }
+
+        /* only strings returned by vhd_parent_locator_get() are freed */
+        if (current != name)
+            free(current);
+        current = next;
+
+        rc = vhd_util_check_vhd(current, ignore);
+        if (rc)
+            break;
+    }
+
+    if (rc)
+        printf("error checking parents: %d\n", rc);
+    if (current != name)
+        free(current);
+    return rc;
+}
+
+/*
+ * Entry point of the "check" command.
+ *
+ * Options:
+ *   -n <file>  image to check (required)
+ *   -i         ignore missing primary footers
+ *   -p         also check the parents of the image
+ *   -h         print usage
+ *
+ * Returns 0 on success (and for -h), -EINVAL for bad arguments, or the
+ * error reported by the checks.
+ */
+int
+vhd_util_check(int argc, char **argv)
+{
+    char *name;
+    int c, err, ignore, parents;
+
+    if (!argc || !argv) {
+        err = -EINVAL;
+        goto usage;
+    }
+
+    ignore = 0;
+    parents = 0;
+    name = NULL;
+
+    optind = 0;
+    while ((c = getopt(argc, argv, "n:iph")) != -1) {
+        switch (c) {
+        case 'n':
+            name = optarg;
+            break;
+        case 'i':
+            ignore = 1;
+            break;
+        case 'p':
+            parents = 1;
+            break;
+        case 'h':
+            err = 0;
+            goto usage;
+        default:
+            err = -EINVAL;
+            goto usage;
+        }
+    }
+
+    /* a name is mandatory and no positional arguments are accepted */
+    if (!name || optind != argc) {
+        err = -EINVAL;
+        goto usage;
+    }
+
+    err = vhd_util_check_vhd(name, ignore);
+    if (err)
+        goto out;
+
+    if (parents)
+        err = vhd_util_check_parents(name, ignore);
+
+out:
+    return err;
+
+usage:
+    printf("options: -n <file> [-i ignore missing primary footers] "
+           "[-p check parents] [-h help]\n");
+    return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-coalesce.c b/tools/blktap2/vhd/lib/vhd-util-coalesce.c
new file mode 100644
index 0000000000..f6461fc687
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-coalesce.c
@@ -0,0 +1,218 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Write 'secs' sectors from 'buf' to the raw file 'fd', starting at sector
+ * 'sec'.
+ *
+ * Returns 0 when the whole range was written, -errno when the seek or the
+ * write failed, and -EIO when the write came up short without setting
+ * errno.
+ */
+static int
+__raw_io_write(int fd, char* buf, uint64_t sec, uint32_t secs)
+{
+    off64_t off;
+    ssize_t ret;    /* write() returns a signed count; -1 signals failure */
+    int saved;
+
+    /* cleared so a short write can be told apart from a failed one below */
+    errno = 0;
+    off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+    if (off == (off64_t)-1) {
+        /* capture errno before printf(), which is allowed to modify it */
+        saved = errno;
+        printf("raw parent: seek(0x%08"PRIx64") failed: %d\n",
+               vhd_sectors_to_bytes(sec), -saved);
+        return -saved;
+    }
+
+    ret = write(fd, buf, vhd_sectors_to_bytes(secs));
+    if (ret >= 0 && (uint64_t)ret == vhd_sectors_to_bytes(secs))
+        return 0;
+
+    saved = errno;
+    printf("raw parent: write of 0x%"PRIx64" returned %zd, errno: %d\n",
+           vhd_sectors_to_bytes(secs), ret, -saved);
+    return (saved ? -saved : -EIO);
+}
+
+/*
+ * Copy the data of one block of 'vhd' into its parent.
+ *
+ * Use 'parent' if the parent is VHD, and 'parent_fd' if the parent is raw
+ * (the choice is made on whether parent->file is set).
+ *
+ * Blocks not present in the BAT are skipped.  When the batmap marks the
+ * block, the whole block is written in one go; otherwise the block bitmap
+ * is read and only runs of sectors whose bits are set are written.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_util_coalesce_block(vhd_context_t *vhd, vhd_context_t *parent,
+                        int parent_fd, uint64_t block)
+{
+    int idx, rc;
+    char *data, *bitmap;
+    uint64_t first, run;
+
+    data = NULL;
+    bitmap = NULL;
+    first = block * vhd->spb;
+
+    if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+        return 0;
+
+    /* posix_memalign() reports failure as a positive errno value */
+    rc = posix_memalign((void **)&data, 4096, vhd->header.block_size);
+    if (rc)
+        return -rc;
+
+    rc = vhd_io_read(vhd, data, first, vhd->spb);
+    if (rc)
+        goto out;
+
+    if (vhd_has_batmap(vhd) && vhd_batmap_test(vhd, &vhd->batmap, block)) {
+        rc = parent->file ?
+             vhd_io_write(parent, data, first, vhd->spb) :
+             __raw_io_write(parent_fd, data, first, vhd->spb);
+        goto out;
+    }
+
+    rc = vhd_read_bitmap(vhd, block, &bitmap);
+    if (rc)
+        goto out;
+
+    idx = 0;
+    while (idx < vhd->spb) {
+        if (!vhd_bitmap_test(vhd, bitmap, idx)) {
+            idx++;
+            continue;
+        }
+
+        /* measure the run of consecutive set bits starting at idx */
+        run = 0;
+        while (idx + run < vhd->spb &&
+               vhd_bitmap_test(vhd, bitmap, idx + run))
+            run++;
+
+        rc = parent->file ?
+             vhd_io_write(parent, data + vhd_sectors_to_bytes(idx),
+                          first + idx, run) :
+             __raw_io_write(parent_fd, data + vhd_sectors_to_bytes(idx),
+                            first + idx, run);
+        if (rc)
+            goto out;
+
+        /*
+         * Step past the run and the sector that ended it: that sector is
+         * either clear or beyond the end of the block.
+         */
+        idx += run;
+        idx++;
+    }
+
+    rc = 0;
+
+out:
+    free(data);
+    free(bitmap);
+    return rc;
+}
+
+/*
+ * Entry point of the "coalesce" command: copy every allocated block of the
+ * image given with -n into its parent.
+ *
+ * The parent path comes from vhd_parent_locator_get().  A raw parent is
+ * opened as a plain file descriptor, any other parent is opened read-write
+ * as a VHD.
+ *
+ * Returns 0 on success, -EINVAL for bad arguments (including -h), or the
+ * first error encountered.
+ */
+int
+vhd_util_coalesce(int argc, char **argv)
+{
+    int err, c;
+    uint64_t i;
+    char *name, *pname;
+    vhd_context_t vhd, parent;
+    int parent_fd = -1;
+
+    name = NULL;
+    pname = NULL;
+    /* parent.file doubles as the "parent is a VHD" flag, here and at 'done' */
+    parent.file = NULL;
+
+    if (!argc || !argv)
+        goto usage;
+
+    optind = 0;
+    while ((c = getopt(argc, argv, "n:h")) != -1) {
+        switch (c) {
+        case 'n':
+            name = optarg;
+            break;
+        case 'h':
+        default:
+            goto usage;
+        }
+    }
+
+    if (!name || optind != argc)
+        goto usage;
+
+    err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+    if (err) {
+        printf("error opening %s: %d\n", name, err);
+        return err;
+    }
+
+    err = vhd_parent_locator_get(&vhd, &pname);
+    if (err) {
+        printf("error finding %s parent: %d\n", name, err);
+        vhd_close(&vhd);
+        return err;
+    }
+
+    if (vhd_parent_raw(&vhd)) {
+        parent_fd = open(pname, O_RDWR | O_DIRECT | O_LARGEFILE, 0644);
+        if (parent_fd == -1) {
+            err = -errno;
+            printf("failed to open parent %s: %d\n", pname, err);
+            /* pname was allocated above and must not leak on this path */
+            free(pname);
+            vhd_close(&vhd);
+            return err;
+        }
+    } else {
+        err = vhd_open(&parent, pname, VHD_OPEN_RDWR);
+        if (err) {
+            printf("error opening %s: %d\n", pname, err);
+            free(pname);
+            vhd_close(&vhd);
+            return err;
+        }
+    }
+
+    err = vhd_get_bat(&vhd);
+    if (err)
+        goto done;
+
+    if (vhd_has_batmap(&vhd)) {
+        err = vhd_get_batmap(&vhd);
+        if (err)
+            goto done;
+    }
+
+    for (i = 0; i < vhd.bat.entries; i++) {
+        err = vhd_util_coalesce_block(&vhd, &parent, parent_fd, i);
+        if (err)
+            goto done;
+    }
+
+    err = 0;
+
+ done:
+    free(pname);
+    vhd_close(&vhd);
+    if (parent.file)
+        vhd_close(&parent);
+    else
+        close(parent_fd);
+    return err;
+
+usage:
+    printf("options: <-n name> [-h help]\n");
+    return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-create.c b/tools/blktap2/vhd/lib/vhd-util-create.c
new file mode 100644
index 0000000000..a9bdf05fee
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-create.c
@@ -0,0 +1,80 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "create" command.
+ *
+ * Options:
+ *   -n <name>  file to create (required)
+ *   -s <size>  size in MB, as a decimal number (required)
+ *   -r         reserve: create a fixed image instead of a dynamic one
+ *   -h         print usage
+ *
+ * Returns the result of vhd_create(), or -EINVAL for bad arguments
+ * (including -h).
+ */
+int
+vhd_util_create(int argc, char **argv)
+{
+    char *name, *end;
+    uint64_t size;
+    int c, sparse, err;
+    vhd_flag_creat_t flags;
+
+    /* err stays -EINVAL until a valid -s has been seen */
+    err = -EINVAL;
+    size = 0;
+    sparse = 1;
+    name = NULL;
+    flags = 0;
+
+    if (!argc || !argv)
+        goto usage;
+
+    optind = 0;
+    while ((c = getopt(argc, argv, "n:s:rh")) != -1) {
+        switch (c) {
+        case 'n':
+            name = optarg;
+            break;
+        case 's':
+            /*
+             * Reject input strtoull() would silently accept: no digits,
+             * trailing characters, out-of-range values, and sizes that no
+             * longer fit once converted from MB to bytes (this also
+             * catches negative input, which strtoull() wraps around).
+             */
+            errno = 0;
+            size = strtoull(optarg, &end, 10);
+            if (errno || end == optarg || *end != '\0' ||
+                ((size << 20) >> 20) != size) {
+                printf("invalid size '%s'\n", optarg);
+                goto usage;
+            }
+            err = 0;
+            break;
+        case 'r':
+            sparse = 0;
+            break;
+        case 'h':
+        default:
+            goto usage;
+        }
+    }
+
+    if (err || !name || optind != argc)
+        goto usage;
+
+    return vhd_create(name, size << 20,
+                      (sparse ? HD_TYPE_DYNAMIC : HD_TYPE_FIXED),
+                      flags);
+
+usage:
+    printf("options: <-n name> <-s size (MB)> [-r reserve] [-h help]\n");
+    return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-fill.c b/tools/blktap2/vhd/lib/vhd-util-fill.c
new file mode 100644
index 0000000000..afbfccee48
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-fill.c
@@ -0,0 +1,105 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "fill" command: for every BAT slot of the image given
+ * with -n, read one block's worth of sectors and write the same data back
+ * through vhd_io_write().
+ *
+ * Returns 0 on success, -EINVAL for bad arguments (including -h), or the
+ * first error encountered.
+ */
+int
+vhd_util_fill(int argc, char **argv)
+{
+    int err, c;
+    char *buf, *name;
+    vhd_context_t vhd;
+    uint64_t i, sec, secs;
+
+    buf = NULL;
+    name = NULL;
+
+    if (!argc || !argv)
+        goto usage;
+
+    optind = 0;
+    for (;;) {
+        c = getopt(argc, argv, "n:h");
+        if (c == -1)
+            break;
+
+        if (c != 'n')
+            goto usage;     /* -h and unknown options */
+        name = optarg;
+    }
+
+    if (!name || optind != argc)
+        goto usage;
+
+    err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+    if (err) {
+        printf("error opening %s: %d\n", name, err);
+        return err;
+    }
+
+    err = vhd_get_bat(&vhd);
+    if (err)
+        goto done;
+
+    /* posix_memalign() reports failure as a positive errno value */
+    err = posix_memalign((void **)&buf, 4096, vhd.header.block_size);
+    if (err) {
+        err = -err;
+        goto done;
+    }
+
+    secs = vhd.header.block_size >> VHD_SECTOR_SHIFT;
+
+    for (i = 0, sec = 0; i < vhd.header.max_bat_size; i++, sec += secs) {
+        err = vhd_io_read(&vhd, buf, sec, secs);
+        if (err)
+            goto done;
+
+        err = vhd_io_write(&vhd, buf, sec, secs);
+        if (err)
+            goto done;
+    }
+
+    err = 0;
+
+ done:
+    free(buf);
+    vhd_close(&vhd);
+    return err;
+
+usage:
+    printf("options: <-n name> [-h help]\n");
+    return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-modify.c b/tools/blktap2/vhd/lib/vhd-util-modify.c
new file mode 100644
index 0000000000..3b07e31b25
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-modify.c
@@ -0,0 +1,132 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT
+ * affect the VHD disk capacity, only the physical size of the file containing
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the
+ * footer in the right location such that resizing the file (manually, as a
+ * separate step) will produce the correct results. If the new file size is
+ * greater than the current file size, the file must first be expanded and then
+ * altered with this operation. If the new size is smaller than the current
+ * size, the VHD must first be altered with this operation and then the file
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Entry point of the "modify" command.
+ *
+ * Options:
+ *   -n <name>        image to modify (required)
+ *   -s <NEW_SIZE>    set the physical size via vhd_set_phys_size()
+ *   -p <NEW_PARENT>  set the parent via vhd_change_parent()
+ *   -m               the new parent is raw (used together with -p)
+ *   -h               print usage
+ *
+ * Both operations are attempted when both are requested.  Returns 0 on
+ * success, -EINVAL for bad arguments (including -h), otherwise the error
+ * of the failed operation (the reparent error wins if both fail).
+ */
+int
+vhd_util_modify(int argc, char **argv)
+{
+    char *name, *end;
+    vhd_context_t vhd;
+    int err, ret, c, size, parent, parent_raw;
+    off64_t newsize = 0;
+    char *newparent = NULL;
+
+    name = NULL;
+    size = 0;
+    parent = 0;
+    parent_raw = 0;
+
+    /* same guard as the other vhd-util entry points */
+    if (!argc || !argv)
+        goto usage;
+
+    optind = 0;
+    while ((c = getopt(argc, argv, "n:s:p:mh")) != -1) {
+        switch (c) {
+        case 'n':
+            name = optarg;
+            break;
+        case 's':
+            size = 1;
+            /*
+             * errno alone does not catch non-numeric input: strtoll()
+             * returns 0 for it without setting errno.
+             */
+            errno = 0;
+            newsize = strtoll(optarg, &end, 10);
+            if (errno || end == optarg || *end != '\0') {
+                fprintf(stderr, "Invalid size '%s'\n", optarg);
+                goto usage;
+            }
+            break;
+        case 'p':
+            parent = 1;
+            newparent = optarg;
+            break;
+        case 'm':
+            parent_raw = 1;
+            break;
+
+        case 'h':
+        default:
+            goto usage;
+        }
+    }
+
+    if (!name || optind != argc)
+        goto usage;
+
+    err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+    if (err) {
+        printf("error opening %s: %d\n", name, err);
+        return err;
+    }
+
+    if (size) {
+        err = vhd_set_phys_size(&vhd, newsize);
+        if (err)
+            /* newsize is signed, so print it with a signed conversion */
+            printf("failed to set physical size to %"PRId64":"
+                   " %d\n", (int64_t)newsize, err);
+    }
+
+    if (parent) {
+        TEST_FAIL_AT(FAIL_REPARENT_BEGIN);
+        /*
+         * Keep the result separate so a successful reparent does not
+         * overwrite an earlier size failure held in err.
+         */
+        ret = vhd_change_parent(&vhd, newparent, parent_raw);
+        if (ret) {
+            printf("failed to set parent to '%s': %d\n",
+                   newparent, ret);
+            err = ret;
+            goto done;
+        }
+        TEST_FAIL_AT(FAIL_REPARENT_END);
+    }
+
+done:
+    vhd_close(&vhd);
+    return err;
+
+usage:
+    printf("*** Dangerous operations, use with care ***\n");
+    printf("options: <-n name> [-p NEW_PARENT set parent [-m raw]] "
+           "[-s NEW_SIZE set size] [-h help]\n");
+    return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-query.c b/tools/blktap2/vhd/lib/vhd-util-query.c
new file mode 100644
index 0000000000..3477a17f27
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-query.c
@@ -0,0 +1,159 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "query" command: print selected properties of the
+ * image given with -n.
+ *
+ * Options:
+ *   -v  print virtual size (in MB)
+ *   -s  print physical utilization (bytes)
+ *   -p  print parent
+ *   -f  print fields
+ *   -d  print chain depth
+ *   -h  print usage
+ *
+ * Every requested query is attempted; the first error seen is returned.
+ * Returns 0 on success (and for -h) and -EINVAL for bad arguments.
+ */
+int
+vhd_util_query(int argc, char **argv)
+{
+    char *name;
+    vhd_context_t vhd;
+    off64_t currsize;
+    int ret, err, c, size, physize, parent, fields, depth;
+
+    name = NULL;
+    size = physize = parent = fields = depth = 0;
+
+    if (!argc || !argv) {
+        err = -EINVAL;
+        goto usage;
+    }
+
+    optind = 0;
+    while ((c = getopt(argc, argv, "n:vspfdh")) != -1) {
+        if (c == 'n')
+            name = optarg;
+        else if (c == 'v')
+            size = 1;
+        else if (c == 's')
+            physize = 1;
+        else if (c == 'p')
+            parent = 1;
+        else if (c == 'f')
+            fields = 1;
+        else if (c == 'd')
+            depth = 1;
+        else {
+            /* -h is a successful usage request, anything else is an error */
+            err = (c == 'h') ? 0 : -EINVAL;
+            goto usage;
+        }
+    }
+
+    if (!name || optind != argc) {
+        err = -EINVAL;
+        goto usage;
+    }
+
+    err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+    if (err) {
+        printf("error opening %s: %d\n", name, err);
+        return err;
+    }
+
+    if (size)
+        printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+    if (physize) {
+        err = vhd_get_phys_size(&vhd, &currsize);
+        if (!err)
+            printf("%"PRIu64"\n", currsize);
+        else
+            printf("failed to get physical size: %d\n", err);
+    }
+
+    if (parent) {
+        char *pname;
+
+        ret = 0;
+        if (vhd.footer.type == HD_TYPE_DIFF) {
+            ret = vhd_parent_locator_get(&vhd, &pname);
+            if (!ret) {
+                printf("%s\n", pname);
+                free(pname);
+            } else
+                printf("query failed\n");
+        } else
+            printf("%s has no parent\n", name);
+
+        /* keep the first error */
+        if (!err)
+            err = ret;
+    }
+
+    if (fields) {
+        int hidden;
+
+        ret = vhd_hidden(&vhd, &hidden);
+        if (!ret)
+            printf("hidden: %d\n", hidden);
+        else
+            printf("error checking 'hidden' field: %d\n", ret);
+
+        if (!err)
+            err = ret;
+    }
+
+    if (depth) {
+        int length;
+
+        ret = vhd_chain_depth(&vhd, &length);
+        if (!ret)
+            printf("chain depth: %d\n", length);
+        else
+            printf("error checking chain depth: %d\n", ret);
+
+        if (!err)
+            err = ret;
+    }
+
+    vhd_close(&vhd);
+    return err;
+
+usage:
+    printf("options: <-n name> [-v print virtual size (in MB)] "
+           "[-s print physical utilization (bytes)] [-p print parent] "
+           "[-f print fields] [-d print chain depth] [-h help]\n");
+    return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-read.c b/tools/blktap2/vhd/lib/vhd-util-read.c
new file mode 100644
index 0000000000..7b5246c5f7
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-read.c
@@ -0,0 +1,742 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Scratch buffer shared by __xconv()/__dconv().  It must hold the longest
+ * rendering of a 64-bit value: 20 decimal digits, or "0x" plus 16 hex
+ * digits, plus the terminating NUL.  The previous size of 15 silently
+ * truncated large values.
+ */
+#define nsize 24
+static char nbuf[nsize];
+
+/* Format @num as hexadecimal (with "0x" prefix) into the shared buffer. */
+static inline char *
+__xconv(uint64_t num)
+{
+	snprintf(nbuf, nsize, "%#" PRIx64 , num);
+	return nbuf;
+}
+
+/* Format @num as unsigned decimal into the shared buffer. */
+static inline char *
+__dconv(uint64_t num)
+{
+	snprintf(nbuf, nsize, "%" PRIu64, num);
+	return nbuf;
+}
+
+/*
+ * Render @num in hex or decimal depending on @hex.  The result points at a
+ * single static buffer, so use at most one conv() per printf() call.
+ */
+#define conv(hex, num) \
+	((hex) ? __xconv((uint64_t)(num)) : __dconv((uint64_t)(num)))
+
+/*
+ * Print a human-readable summary of the dynamic-disk header @h.
+ *
+ * @vhd: context the header belongs to (used to decode the parent name)
+ * @h:   header to print
+ * @hex: non-zero to print numeric fields in hexadecimal
+ */
+static void
+vhd_print_header(vhd_context_t *vhd, vhd_header_t *h, int hex)
+{
+	int err;
+	uint32_t cksm;
+	char uuid[37], time_str[26], cookie[9], *name;
+
+	/* free() below runs even when decoding fails, so start from NULL */
+	name = NULL;
+
+	printf("VHD Header Summary:\n-------------------\n");
+
+	/* the cookie field need not be NUL-terminated: bound the read */
+	snprintf(cookie, sizeof(cookie), "%.8s", h->cookie);
+	printf("Cookie : %s\n", cookie);
+
+	printf("Data offset (unusd) : %s\n", conv(hex, h->data_offset));
+	printf("Table offset : %s\n", conv(hex, h->table_offset));
+	printf("Header version : 0x%08x\n", h->hdr_ver);
+	printf("Max BAT size : %s\n", conv(hex, h->max_bat_size));
+	printf("Block size : %s ", conv(hex, h->block_size));
+	printf("(%s MB)\n", conv(hex, h->block_size >> 20));
+
+	err = vhd_header_decode_parent(vhd, h, &name);
+	printf("Parent name : %s\n",
+	       (err ? "failed to read name" : name));
+	free(name);
+
+	uuid_unparse(h->prt_uuid, uuid);
+	printf("Parent UUID : %s\n", uuid);
+
+	vhd_time_to_string(h->prt_ts, time_str);
+	printf("Parent timestamp : %s\n", time_str);
+
+	cksm = vhd_checksum_header(h);
+	printf("Checksum : 0x%x|0x%x (%s)\n", h->checksum, cksm,
+	       h->checksum == cksm ? "Good!" : "Bad!");
+	printf("\n");
+}
+
+/*
+ * Print a human-readable summary of the VHD footer @f.
+ *
+ * @f:   footer to print
+ * @hex: non-zero to print numeric fields in hexadecimal
+ */
+static void
+vhd_print_footer(vhd_footer_t *f, int hex)
+{
+	uint64_t c, h, s;
+	uint32_t ff_maj, ff_min, cr_maj, cr_min, cksm;
+	char time_str[26], creator[5], uuid[37], cookie[9];
+
+	printf("VHD Footer Summary:\n-------------------\n");
+
+	/* the cookie field need not be NUL-terminated: bound the read */
+	snprintf(cookie, sizeof(cookie), "%.8s", f->cookie);
+	printf("Cookie : %s\n", cookie);
+
+	printf("Features : (0x%08x) %s%s\n", f->features,
+	       (f->features & HD_TEMPORARY) ? "<TEMP>" : "",
+	       (f->features & HD_RESERVED) ? "<RESV>" : "");
+
+	/* versions pack major in the high 16 bits, minor in the low 16 */
+	ff_maj = f->ff_version >> 16;
+	ff_min = f->ff_version & 0xffff;
+	printf("File format version : Major: %d, Minor: %d\n",
+	       ff_maj, ff_min);
+
+	printf("Data offset : %s\n", conv(hex, f->data_offset));
+
+	vhd_time_to_string(f->timestamp, time_str);
+	printf("Timestamp : %s\n", time_str);
+
+	memcpy(creator, f->crtr_app, 4);
+	creator[4] = '\0';
+	printf("Creator Application : '%s'\n", creator);
+
+	cr_maj = f->crtr_ver >> 16;
+	cr_min = f->crtr_ver & 0xffff;
+	printf("Creator version : Major: %d, Minor: %d\n",
+	       cr_maj, cr_min);
+
+	printf("Creator OS : %s\n",
+	       ((f->crtr_os == HD_CR_OS_WINDOWS) ? "Windows" :
+		((f->crtr_os == HD_CR_OS_MACINTOSH) ? "Macintosh" :
+		 "Unknown!")));
+
+	/* conv() returns a shared static buffer: one conv() per printf() */
+	printf("Original disk size : %s MB ", conv(hex, f->orig_size >> 20));
+	printf("(%s Bytes)\n", conv(hex, f->orig_size));
+
+	printf("Current disk size : %s MB ", conv(hex, f->curr_size >> 20));
+	printf("(%s Bytes)\n", conv(hex, f->curr_size));
+
+	/* geometry: cylinders in bits 31..16, heads in 15..8, sectors in 7..0 */
+	c = f->geometry >> 16;
+	h = (f->geometry & 0x0000FF00) >> 8;
+	s = f->geometry & 0x000000FF;
+	printf("Geometry : Cyl: %s, ", conv(hex, c));
+	printf("Hds: %s, ", conv(hex, h));
+	printf("Sctrs: %s\n", conv(hex, s));
+	printf(" : = %s MB ", conv(hex, (c * h * s) >> 11));
+	printf("(%s Bytes)\n", conv(hex, c * h * s << 9));
+
+	/* the format string already ends the line; no newline in the text */
+	printf("Disk type : %s\n",
+	       f->type <= HD_TYPE_MAX ?
+	       HD_TYPE_STR[f->type] : "Unknown type!");
+
+	cksm = vhd_checksum_footer(f);
+	printf("Checksum : 0x%x|0x%x (%s)\n", f->checksum, cksm,
+	       f->checksum == cksm ? "Good!" : "Bad!");
+
+	uuid_unparse(f->uuid, uuid);
+	printf("UUID : %s\n", uuid);
+
+	printf("Saved state : %s\n", f->saved == 0 ? "No" : "Yes");
+	printf("Hidden : %d\n", f->hidden);
+	printf("\n");
+}
+
+/*
+ * Map a parent-locator platform code to its symbolic name.
+ * Returns a string literal; the caller must not modify or free it.
+ */
+static inline char *
+code_name(uint32_t code)
+{
+	switch(code) {
+	case PLAT_CODE_NONE:
+		return "PLAT_CODE_NONE";
+	case PLAT_CODE_WI2R:
+		return "PLAT_CODE_WI2R";
+	case PLAT_CODE_WI2K:
+		return "PLAT_CODE_WI2K";
+	case PLAT_CODE_W2RU:
+		return "PLAT_CODE_W2RU";
+	case PLAT_CODE_W2KU:
+		return "PLAT_CODE_W2KU";
+	case PLAT_CODE_MAC:
+		return "PLAT_CODE_MAC";
+	case PLAT_CODE_MACX:
+		return "PLAT_CODE_MACX";
+	default:
+		/* was misspelled "UNKOWN" */
+		return "UNKNOWN";
+	}
+}
+
+/*
+ * Read the parent name stored in locator @loc and print it.
+ * Prints an error line instead if the locator cannot be read.
+ */
+static void
+vhd_print_parent(vhd_context_t *vhd, vhd_parent_locator_t *loc)
+{
+	int err;
+	char *buf;
+
+	buf = NULL;
+
+	err = vhd_parent_locator_read(vhd, loc, &buf);
+	if (err) {
+		printf("failed to read parent name\n");
+		return;
+	}
+
+	printf(" decoded name : %s\n", buf);
+
+	/* the name is allocated for us by vhd_parent_locator_read() */
+	free(buf);
+}
+
+/*
+ * Walk the header's parent-locator table and print every entry whose
+ * platform code is not PLAT_CODE_NONE, including the decoded parent name.
+ */
+static void
+vhd_print_parent_locators(vhd_context_t *vhd, int hex)
+{
+	int idx, total;
+	vhd_parent_locator_t *entry;
+
+	printf("VHD Parent Locators:\n--------------------\n");
+
+	total = sizeof(vhd->header.loc) / sizeof(struct prt_loc);
+
+	for (idx = 0; idx < total; idx++) {
+		entry = vhd->header.loc + idx;
+
+		/* only populated slots are reported */
+		if (entry->code != PLAT_CODE_NONE) {
+			printf("locator: : %d\n", idx);
+			printf(" code : %s\n", code_name(entry->code));
+			printf(" data_space : %s\n",
+			       conv(hex, entry->data_space));
+			printf(" data_length : %s\n",
+			       conv(hex, entry->data_len));
+			printf(" data_offset : %s\n",
+			       conv(hex, entry->data_offset));
+			vhd_print_parent(vhd, entry);
+			printf("\n");
+		}
+	}
+}
+
+/*
+ * Print the batmap header fields and compare the stored checksum with
+ * the one computed from @batmap.
+ */
+static void
+vhd_print_batmap_header(vhd_batmap_t *batmap, int hex)
+{
+	uint32_t computed;
+	const char *verdict;
+
+	printf("VHD Batmap Summary:\n-------------------\n");
+	printf("Batmap offset : %s\n",
+	       conv(hex, batmap->header.batmap_offset));
+	printf("Batmap size (secs) : %s\n",
+	       conv(hex, batmap->header.batmap_size));
+	printf("Batmap version : 0x%08x\n",
+	       batmap->header.batmap_version);
+
+	computed = vhd_checksum_batmap(batmap);
+	if (batmap->header.checksum == computed)
+		verdict = "Good!";
+	else
+		verdict = "Bad!";
+
+	printf("Checksum : 0x%x|0x%x (%s)\n",
+	       batmap->header.checksum, computed, verdict);
+	printf("\n");
+}
+
+/*
+ * Validate @block against the header's max_bat_size.
+ * Returns 0 when @block <= max_bat_size; otherwise reports the problem on
+ * stderr and returns -ERANGE.
+ */
+static inline int
+check_block_range(vhd_context_t *vhd, uint64_t block, int hex)
+{
+	if (block <= vhd->header.max_bat_size)
+		return 0;
+
+	fprintf(stderr, "block %s past end of file\n", conv(hex, block));
+	return -ERANGE;
+}
+
+/*
+ * Print the footer and, for dynamic disks, the header, the parent
+ * locators (differencing disks only) and the batmap header (if present).
+ * Returns 0, or the error from vhd_get_batmap().
+ */
+static int
+vhd_print_headers(vhd_context_t *vhd, int hex)
+{
+	int rc;
+
+	vhd_print_footer(&vhd->footer, hex);
+
+	/* nothing beyond the footer to show for non-dynamic images */
+	if (!vhd_type_dynamic(vhd))
+		return 0;
+
+	vhd_print_header(vhd, &vhd->header, hex);
+
+	if (vhd->footer.type == HD_TYPE_DIFF)
+		vhd_print_parent_locators(vhd, hex);
+
+	if (!vhd_has_batmap(vhd))
+		return 0;
+
+	rc = vhd_get_batmap(vhd);
+	if (rc) {
+		printf("failed to get batmap header\n");
+		return rc;
+	}
+
+	vhd_print_batmap_header(&vhd->batmap, hex);
+	return 0;
+}
+
+/*
+ * Best-effort dump of the footer and header of @name, bypassing
+ * vhd_open().  The file is opened directly and whatever could be read
+ * into the zeroed context is printed.
+ *
+ * Returns 0, or -errno if the file cannot be opened.
+ */
+static int
+vhd_dump_headers(const char *name, int hex)
+{
+ vhd_context_t vhd;
+
+ libvhd_set_log_level(1);
+ memset(&vhd, 0, sizeof(vhd));
+
+ printf("\n%s appears invalid; dumping headers\n\n", name);
+
+ vhd.fd = open(name, O_DIRECT | O_LARGEFILE | O_RDONLY);
+ if (vhd.fd == -1)
+ return -errno;
+
+ /* NOTE(review): strdup() result is not checked for NULL */
+ vhd.file = strdup(name);
+
+ /* read errors are ignored here: the structures were zeroed above,
+ * so they are printed regardless of the outcome */
+ vhd_read_footer(&vhd, &vhd.footer);
+ vhd_read_header(&vhd, &vhd.header);
+
+ vhd_print_footer(&vhd.footer, hex);
+ vhd_print_header(&vhd, &vhd.header, hex);
+
+ close(vhd.fd);
+ free(vhd.file);
+
+ return 0;
+}
+
+/*
+ * For each of @count logical sectors starting at @sector, print the block
+ * number, the sector offset within that block and the byte offset in the
+ * VHD file where the sector's data lives.
+ *
+ * Returns 0, or -ERANGE if the range extends past the virtual disk size.
+ */
+static int
+vhd_print_logical_to_physical(vhd_context_t *vhd,
+			      uint64_t sector, int count, int hex)
+{
+	int i, allocated;
+	uint32_t blk, lsec;
+	uint64_t cur, offset;
+
+	if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+		fprintf(stderr, "sector %s past end of file\n",
+			conv(hex, sector + count));
+		return -ERANGE;
+	}
+
+	for (i = 0; i < count; i++) {
+		cur       = sector + i;
+		blk       = cur / vhd->spb;
+		lsec      = cur % vhd->spb;
+		offset    = vhd->bat.bat[blk];
+		allocated = (offset != DD_BLK_UNUSED);
+
+		if (allocated) {
+			/* a block starts with its bitmap (bm_secs sectors);
+			 * the data sectors follow.  The old code assumed a
+			 * one-sector bitmap. */
+			offset += vhd->bm_secs + lsec;
+			offset  = vhd_sectors_to_bytes(offset);
+		}
+
+		/* conv() shares one static buffer: one call per printf() */
+		printf("logical sector %s: ", conv(hex, cur));
+		printf("block number: %s, ", conv(hex, blk));
+		printf("sector offset: %s, ", conv(hex, lsec));
+		printf("file offset: %s\n", (allocated ?
+		       conv(hex, offset) : "not allocated"));
+	}
+
+	return 0;
+}
+
+/*
+ * Print the BAT entry (as a byte offset) for each of @count blocks
+ * starting at @block.  Returns 0, or -ERANGE if the range is invalid.
+ */
+static int
+vhd_print_bat(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+	int left;
+	uint64_t blk, entry;
+
+	if (check_block_range(vhd, block + count, hex) != 0)
+		return -ERANGE;
+
+	for (blk = block, left = count; left > 0; blk++, left--) {
+		entry = vhd->bat.bat[blk];
+
+		printf("block: %s: ", conv(hex, blk));
+
+		if (entry == DD_BLK_UNUSED)
+			printf("offset: %s\n", "not allocated");
+		else
+			printf("offset: %s\n",
+			       conv(hex, vhd_sectors_to_bytes(entry)));
+	}
+
+	return 0;
+}
+
+/*
+ * Write all @count bytes of @buf to @fd, retrying after short writes and
+ * EINTR.  Gives up silently on any other write error; does nothing when
+ * @buf is NULL.
+ */
+static inline void
+write_full(int fd, void* buf, size_t count)
+{
+	/* advance through a char pointer: arithmetic on void * is a GNU
+	 * extension, not standard C */
+	char *pos = buf;
+	ssize_t num_written;
+
+	if (!buf)
+		return;
+
+	while (count > 0) {
+		num_written = write(fd, pos, count);
+		if (num_written == -1) {
+			if (errno == EINTR)
+				continue;
+			return;
+		}
+
+		count -= num_written;
+		pos   += num_written;
+	}
+}
+
+/*
+ * Dump the raw sector bitmap of each of @count blocks starting at @block
+ * to stdout.  Unallocated blocks are reported with a text line instead.
+ * Returns 0, -ERANGE for an invalid range, or the vhd_read_bitmap() error.
+ */
+static int
+vhd_print_bitmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+	int n, rc;
+	char *bitmap;
+	uint64_t blk;
+
+	if (check_block_range(vhd, block + count, hex) != 0)
+		return -ERANGE;
+
+	for (n = 0, blk = block; n < count; n++, blk++) {
+		if (vhd->bat.bat[blk] == DD_BLK_UNUSED) {
+			printf("block %s not allocated\n", conv(hex, blk));
+			continue;
+		}
+
+		rc = vhd_read_bitmap(vhd, blk, &bitmap);
+		if (rc)
+			return rc;
+
+		write_full(STDOUT_FILENO, bitmap,
+			   vhd_sectors_to_bytes(vhd->bm_secs));
+		free(bitmap);
+	}
+
+	return 0;
+}
+
+/*
+ * For each of @count logical sectors starting at @sector, print whether
+ * the sector's bit is set in the bitmap of the block that contains it.
+ * Sectors in unallocated blocks are reported as 0.
+ *
+ * Returns 0, -ERANGE if the range extends past the virtual disk size, or
+ * the error from vhd_read_bitmap().
+ */
+static int
+vhd_test_bitmap(vhd_context_t *vhd, uint64_t sector, int count, int hex)
+{
+	char *buf;
+	uint64_t cur;
+	int i, err, bit;
+	uint32_t blk, bm_blk, sec;
+
+	if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+		printf("sector %s past end of file\n", conv(hex, sector));
+		return -ERANGE;
+	}
+
+	bm_blk = -1;
+	buf    = NULL;
+
+	for (i = 0; i < count; i++) {
+		cur = sector + i;
+		blk = cur / vhd->spb;
+		sec = cur % vhd->spb;
+
+		/* (re)load the bitmap only when crossing into a new block */
+		if (blk != bm_blk) {
+			bm_blk = blk;
+			free(buf);
+			buf = NULL;
+
+			if (vhd->bat.bat[blk] != DD_BLK_UNUSED) {
+				err = vhd_read_bitmap(vhd, blk, &buf);
+				if (err)
+					goto out;
+			}
+		}
+
+		if (vhd->bat.bat[blk] == DD_BLK_UNUSED)
+			bit = 0;
+		else
+			/* the bitmap is indexed by the sector within the
+			 * block, not by the block number */
+			bit = vhd_bitmap_test(vhd, buf, sec);
+
+		printf("block %s: ", conv(hex, blk));
+		printf("sec: %s: %d\n", conv(hex, sec), bit);
+	}
+
+	err = 0;
+ out:
+	free(buf);
+	return err;
+}
+
+/*
+ * Load the batmap and dump its raw contents to stdout.
+ * Returns 0, or the error from vhd_get_batmap().
+ */
+static int
+vhd_print_batmap(vhd_context_t *vhd)
+{
+	int rc;
+	size_t nbytes;
+
+	rc = vhd_get_batmap(vhd);
+	if (rc != 0) {
+		printf("failed to read batmap: %d\n", rc);
+		return rc;
+	}
+
+	/* the batmap header records its size in sectors */
+	nbytes = vhd_sectors_to_bytes(vhd->batmap.header.batmap_size);
+	write_full(STDOUT_FILENO, vhd->batmap.map, nbytes);
+
+	return 0;
+}
+
+/*
+ * Report (on stderr) the batmap bit of each of @count blocks starting at
+ * @block.  Returns 0, -ERANGE for an invalid range, or the error from
+ * vhd_get_batmap().
+ */
+static int
+vhd_test_batmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+	int n, rc;
+	uint64_t blk;
+
+	if (check_block_range(vhd, block + count, hex) != 0)
+		return -ERANGE;
+
+	rc = vhd_get_batmap(vhd);
+	if (rc != 0) {
+		fprintf(stderr, "failed to get batmap\n");
+		return rc;
+	}
+
+	for (n = 0, blk = block; n < count; n++, blk++)
+		fprintf(stderr, "batmap for block %s: %d\n", conv(hex, blk),
+			vhd_batmap_test(vhd, &vhd->batmap, blk));
+
+	return 0;
+}
+
+/*
+ * Dump the raw data of each of @count blocks starting at @block to
+ * stdout.  Unallocated blocks are reported with a text line instead.
+ * Returns 0, -ERANGE for an invalid range, or the vhd_read_block() error.
+ */
+static int
+vhd_print_data(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+	int n, rc;
+	char *data;
+	uint64_t blk;
+
+	if (check_block_range(vhd, block + count, hex) != 0)
+		return -ERANGE;
+
+	rc = 0;
+
+	for (n = 0, blk = block; n < count; n++, blk++) {
+		if (vhd->bat.bat[blk] == DD_BLK_UNUSED) {
+			printf("block %s not allocated\n", conv(hex, blk));
+			continue;
+		}
+
+		rc = vhd_read_block(vhd, blk, &data);
+		if (rc)
+			return rc;
+
+		write_full(STDOUT_FILENO, data, vhd->header.block_size);
+		free(data);
+	}
+
+	return rc;
+}
+
+/*
+ * Read @count logical sectors starting at @sec through vhd_io_read() and
+ * write them to stdout, at most VHD_BLOCK_SIZE bytes per read.
+ * Returns 0, -ERANGE if the range extends past the virtual disk size, a
+ * negated posix_memalign() error, or the vhd_io_read() error.
+ */
+static int
+vhd_read_data(vhd_context_t *vhd, uint64_t sec, int count, int hex)
+{
+	char *chunk;
+	uint64_t pos;
+	int rc, chunk_bytes, nsecs;
+
+	if (vhd_sectors_to_bytes(sec + count) > vhd->footer.curr_size)
+		return -ERANGE;
+
+	/* sector-aligned bounce buffer, capped at VHD_BLOCK_SIZE */
+	chunk_bytes = MIN(vhd_sectors_to_bytes(count), VHD_BLOCK_SIZE);
+	rc = posix_memalign((void **)&chunk, VHD_SECTOR_SIZE, chunk_bytes);
+	if (rc)
+		return -rc;
+
+	for (pos = sec; count; pos += nsecs, count -= nsecs) {
+		nsecs = MIN((chunk_bytes >> VHD_SECTOR_SHIFT), count);
+
+		rc = vhd_io_read(vhd, chunk, pos, nsecs);
+		if (rc)
+			break;
+
+		write_full(STDOUT_FILENO, chunk, vhd_sectors_to_bytes(nsecs));
+	}
+
+	free(chunk);
+	return rc;
+}
+
+/*
+ * "vhd-util read": inspect a VHD image.
+ *
+ * Parses the options listed in the usage text below, opens the image
+ * read-only, loads its BAT and then performs every requested operation in
+ * a fixed order, stopping at the first one that fails.  If the image
+ * cannot be opened, its raw footer/header are dumped instead.
+ *
+ * Returns 0 on success, the error of the failing operation, or EINVAL
+ * (positive, after printing usage) for bad or missing arguments and -h.
+ */
+int
+vhd_util_read(int argc, char **argv)
+{
+	char *name;
+	vhd_context_t vhd;
+	int c, err, headers, hex;
+	uint64_t bat, bitmap, tbitmap, batmap, tbatmap, data, lsec, count, read;
+
+	/* (uint64_t)-1 marks "option not given" for the selectors below */
+	err     = 0;
+	hex     = 0;
+	headers = 0;
+	count   = 1;
+	bat     = -1;
+	bitmap  = -1;
+	tbitmap = -1;
+	batmap  = -1;
+	tbatmap = -1;
+	data    = -1;
+	lsec    = -1;
+	read    = -1;
+	name    = NULL;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:pt:b:m:i:aj:d:c:r:xh")) != -1) {
+		switch(c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 'p':
+			headers = 1;
+			break;
+		case 't':
+			/* sector numbers are 64-bit: strtoul() would
+			 * truncate them where long is 32 bits */
+			lsec = strtoull(optarg, NULL, 10);
+			break;
+		case 'b':
+			bat = strtoull(optarg, NULL, 10);
+			break;
+		case 'm':
+			bitmap = strtoull(optarg, NULL, 10);
+			break;
+		case 'i':
+			tbitmap = strtoull(optarg, NULL, 10);
+			break;
+		case 'a':
+			batmap = 1;
+			break;
+		case 'j':
+			tbatmap = strtoull(optarg, NULL, 10);
+			break;
+		case 'd':
+			data = strtoull(optarg, NULL, 10);
+			break;
+		case 'r':
+			read = strtoull(optarg, NULL, 10);
+			break;
+		case 'c':
+			count = strtoul(optarg, NULL, 10);
+			break;
+		case 'x':
+			hex = 1;
+			break;
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (!name || optind != argc)
+		goto usage;
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+	if (err) {
+		printf("Failed to open %s: %d\n", name, err);
+		vhd_dump_headers(name, hex);
+		return err;
+	}
+
+	err = vhd_get_bat(&vhd);
+	if (err) {
+		printf("Failed to get bat for %s: %d\n", name, err);
+		goto out;
+	}
+
+	/* header printing is best-effort: its result is not propagated */
+	if (headers)
+		vhd_print_headers(&vhd, hex);
+
+	if (lsec != -1) {
+		err = vhd_print_logical_to_physical(&vhd, lsec, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (bat != -1) {
+		err = vhd_print_bat(&vhd, bat, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (bitmap != -1) {
+		err = vhd_print_bitmap(&vhd, bitmap, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (tbitmap != -1) {
+		err = vhd_test_bitmap(&vhd, tbitmap, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (batmap != -1) {
+		err = vhd_print_batmap(&vhd);
+		if (err)
+			goto out;
+	}
+
+	if (tbatmap != -1) {
+		err = vhd_test_batmap(&vhd, tbatmap, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (data != -1) {
+		err = vhd_print_data(&vhd, data, count, hex);
+		if (err)
+			goto out;
+	}
+
+	if (read != -1) {
+		err = vhd_read_data(&vhd, read, count, hex);
+		if (err)
+			goto out;
+	}
+
+	err = 0;
+
+ out:
+	vhd_close(&vhd);
+	return err;
+
+ usage:
+	printf("options:\n"
+	       "-h help\n"
+	       "-n name\n"
+	       "-p print VHD headers\n"
+	       "-t sec translate logical sector to VHD location\n"
+	       "-b blk print bat entry\n"
+	       "-m blk print bitmap\n"
+	       "-i sec test bitmap for logical sector\n"
+	       "-a print batmap\n"
+	       "-j blk test batmap for block\n"
+	       "-d blk print data\n"
+	       "-c num num units\n"
+	       "-r sec read num sectors at sec\n"
+	       "-x print in hex\n");
+	return EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-repair.c b/tools/blktap2/vhd/lib/vhd-util-repair.c
new file mode 100644
index 0000000000..a1d2c45c12
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-repair.c
@@ -0,0 +1,84 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * "vhd-util repair": rewrite the footer of a VHD image.
+ *
+ * Opens the image named by -n read-write, asks libvhd for the end of its
+ * data and writes the in-memory footer at that offset.
+ *
+ * Returns 0 on success, the error from the failing libvhd call, or
+ * -EINVAL (after printing usage) for bad or missing arguments and -h.
+ */
+int
+vhd_util_repair(int argc, char **argv)
+{
+ char *name;
+ int err, c;
+ off64_t eof;
+ vhd_context_t vhd;
+
+ name = NULL;
+
+ if (!argc || !argv)
+ goto usage;
+
+ /* reset getopt state before parsing this argv */
+ optind = 0;
+ while ((c = getopt(argc, argv, "n:h")) != -1) {
+ switch (c) {
+ case 'n':
+ name = optarg;
+ break;
+ case 'h':
+ default:
+ goto usage;
+ }
+ }
+
+ /* -n is mandatory and no positional arguments are accepted */
+ if (!name || optind != argc)
+ goto usage;
+
+ err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+ if (err) {
+ printf("error opening %s: %d\n", name, err);
+ return err;
+ }
+
+ err = vhd_end_of_data(&vhd, &eof);
+ if (err) {
+ printf("error finding end of data: %d\n", err);
+ goto done;
+ }
+
+ /* place the footer directly after the last data */
+ err = vhd_write_footer_at(&vhd, &vhd.footer, eof);
+
+ done:
+ vhd_close(&vhd);
+ return err;
+
+usage:
+ printf("options: <-n name> [-h help]\n");
+ return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-resize.c b/tools/blktap2/vhd/lib/vhd-util-resize.c
new file mode 100644
index 0000000000..0143d7a0d3
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-resize.c
@@ -0,0 +1,1131 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+
+#include "libvhd-journal.h"
+
+/* DFPRINTF: debug print to stdout; flip the #if to compile it out. */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f, ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* EPRINTF: log to syslog (LOG_INFO, prefixed with the calling function's
+ * name) and echo the same message through DFPRINTF. */
+#define EPRINTF(_f, _a...) \
+ do { \
+ syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \
+ DFPRINTF(_f, _a); \
+ } while (0)
+
+/* A (block index, BAT entry) pair, as filled in by the shrink and
+ * first-data-block helpers in this file. */
+typedef struct vhd_block {
+ uint32_t block;
+ uint32_t offset;
+} vhd_block_t;
+
+TEST_FAIL_EXTERN_VARS;
+
+/* Number of whole blocks contained in @secs sectors (rounds down). */
+static inline uint32_t
+secs_to_blocks_down(vhd_context_t *vhd, uint64_t secs)
+{
+	uint32_t whole;
+
+	whole = secs / vhd->spb;
+	return whole;
+}
+
+/* Number of blocks needed to hold @secs sectors (rounds up). */
+static uint32_t
+secs_to_blocks_up(vhd_context_t *vhd, uint64_t secs)
+{
+	uint32_t whole = secs / vhd->spb;
+
+	/* a partial trailing block still needs a full block */
+	return (secs % vhd->spb) ? whole + 1 : whole;
+}
+
+/*
+ * Shrink a fixed-size VHD by @secs sectors: truncate the file to the new
+ * data size, record the new size in the footer and rewrite the footer.
+ *
+ * Returns 0 on success, -EINVAL if the result would not be larger than a
+ * footer, -errno if ftruncate() fails, or the vhd_write_footer() error.
+ */
+static int
+vhd_fixed_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+	int err;
+	uint64_t new_eof;
+	vhd_context_t *vhd;
+
+	vhd = &journal->vhd;
+
+	new_eof = vhd->footer.curr_size - vhd_sectors_to_bytes(secs);
+	if (new_eof <= sizeof(vhd_footer_t))
+		return -EINVAL;
+
+	err = ftruncate(vhd->fd, new_eof);
+	if (err)
+		/* negative errno, like every other error path in this file */
+		return -errno;
+
+	vhd->footer.curr_size = new_eof;
+	return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * Write @size zero bytes to the VHD file starting at offset @off.
+ * The zeros come from a read-only anonymous mapping of at most
+ * VHD_BLOCK_SIZE bytes that is written repeatedly.
+ *
+ * Returns 0, the vhd_seek()/vhd_write() error, or -errno if mmap() fails.
+ */
+static int
+vhd_write_zeros(vhd_journal_t *journal, off64_t off, uint64_t size)
+{
+	int rc;
+	char *zeros;
+	uint64_t chunk, maplen;
+	vhd_context_t *ctx;
+
+	ctx    = &journal->vhd;
+	maplen = MIN(size, VHD_BLOCK_SIZE);
+
+	rc = vhd_seek(ctx, off, SEEK_SET);
+	if (rc)
+		return rc;
+
+	/* fresh anonymous pages read back as zeros */
+	zeros = mmap(0, maplen, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (zeros == MAP_FAILED)
+		return -errno;
+
+	for (;;) {
+		chunk = MIN(size, maplen);
+
+		rc = vhd_write(ctx, zeros, chunk);
+		if (rc)
+			break;
+
+		size -= chunk;
+		if (!size)
+			break;
+	}
+
+	munmap(zeros, maplen);
+
+	return rc;
+}
+
+/*
+ * Grow a fixed-size VHD by @secs sectors: zero-fill the new space,
+ * bump the footer's current size and rewrite the footer.
+ *
+ * Returns 0 on success, -errno if the file position cannot be obtained,
+ * or the error from the failing seek/write helper.
+ */
+static int
+vhd_fixed_grow(vhd_journal_t *journal, uint64_t secs)
+{
+ int err;
+ vhd_context_t *vhd;
+ uint64_t size, eof, new_eof;
+
+ size = vhd_sectors_to_bytes(secs);
+ vhd = &journal->vhd;
+
+ /* find the current end of file */
+ err = vhd_seek(vhd, 0, SEEK_END);
+ if (err)
+ goto out;
+
+ eof = vhd_position(vhd);
+ if (eof == (off64_t)-1) {
+ err = -errno;
+ goto out;
+ }
+
+ /* the zeros start one footer-size before the old end of file, i.e.
+ * they overwrite the old trailing footer */
+ err = vhd_write_zeros(journal, eof - sizeof(vhd_footer_t), size);
+ if (err)
+ goto out;
+
+ /* NOTE(review): presumably positions the file for the footer write
+ * below; confirm where vhd_write_footer() actually writes */
+ new_eof = eof + size;
+ err = vhd_seek(vhd, new_eof, SEEK_SET);
+ if (err)
+ goto out;
+
+ vhd->footer.curr_size += size;
+ err = vhd_write_footer(vhd, &vhd->footer);
+ if (err)
+ goto out;
+
+ err = 0;
+
+out:
+ return err;
+}
+
+/*
+ * Resize a fixed-size VHD to @size megabytes, growing or shrinking as
+ * needed.  Returns 0 if the size is unchanged, otherwise the result of
+ * vhd_fixed_grow()/vhd_fixed_shrink().
+ */
+static int
+vhd_fixed_resize(vhd_journal_t *journal, uint64_t size)
+{
+	uint64_t have, want;
+	vhd_context_t *ctx = &journal->vhd;
+
+	have = ctx->footer.curr_size >> VHD_SECTOR_SHIFT;
+	/* @size is in MB (2^20 bytes): convert to sectors */
+	want = size << (20 - VHD_SECTOR_SHIFT);
+
+	if (have > want)
+		return vhd_fixed_shrink(journal, have - want);
+
+	if (have < want)
+		return vhd_fixed_grow(journal, want - have);
+
+	return 0;
+}
+
+/* Exchange elements @a and @b of @list. */
+static inline void
+swap(vhd_block_t *list, int a, int b)
+{
+	vhd_block_t *first = list + a;
+	vhd_block_t *second = list + b;
+	vhd_block_t saved = *first;
+
+	*first = *second;
+	*second = saved;
+}
+
+/*
+ * Quicksort partition step for a descending sort by offset: entries of
+ * list[left..right] whose offset is >= the pivot's are moved in front of
+ * the pivot.  Returns the pivot's final index.
+ */
+static int
+partition(vhd_block_t *list, int left, int right, int pidx)
+{
+	int scan, store;
+	long long pivot;
+
+	pivot = list[pidx].offset;
+
+	/* park the pivot at the right edge while scanning */
+	swap(list, pidx, right);
+
+	store = left;
+	scan  = left;
+	while (scan < right) {
+		if (list[scan].offset >= pivot) {
+			swap(list, store, scan);
+			store++;
+		}
+		scan++;
+	}
+
+	swap(list, right, store);
+	return store;
+}
+
+/*
+ * Sort list[left..right] in place, descending by offset, using the
+ * leftmost element as the pivot.
+ */
+static void
+quicksort(vhd_block_t *list, int left, int right)
+{
+	int split;
+
+	if (left > right)
+		return;
+
+	split = partition(list, left, right, left);
+
+	quicksort(list, left, split - 1);
+	quicksort(list, split + 1, right);
+}
+
+/*
+ * Relocate block @src (bitmap followed by data) to byte offset @offset in
+ * the file, point its BAT entry at the new location and zero the space it
+ * used to occupy.
+ *
+ * Returns 0 on success, -EINVAL if @src is not allocated, or the error
+ * from the failing journal/read/seek/write helper.
+ */
+static int
+vhd_move_block(vhd_journal_t *journal, uint32_t src, off64_t offset)
+{
+ int err;
+ char *buf;
+ size_t size;
+ vhd_context_t *vhd;
+ off64_t off, src_off;
+
+ buf = NULL;
+ vhd = &journal->vhd;
+ off = offset;
+ size = vhd_sectors_to_bytes(vhd->bm_secs);
+ src_off = vhd->bat.bat[src];
+
+ if (src_off == DD_BLK_UNUSED)
+ return -EINVAL;
+ src_off = vhd_sectors_to_bytes(src_off);
+
+ /* journal the source block before anything is changed */
+ err = vhd_journal_add_block(journal, src,
+ VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+ if (err)
+ goto out;
+
+ /* step 1: copy the block's bitmap to the destination */
+ err = vhd_read_bitmap(vhd, src, &buf);
+ if (err)
+ goto out;
+
+ err = vhd_seek(vhd, off, SEEK_SET);
+ if (err)
+ goto out;
+
+ err = vhd_write(vhd, buf, size);
+ if (err)
+ goto out;
+
+ /* step 2: copy the block's data right after the bitmap */
+ free(buf);
+ buf = NULL;
+ off += size;
+ size = vhd_sectors_to_bytes(vhd->spb);
+
+ err = vhd_read_block(vhd, src, &buf);
+ if (err)
+ goto out;
+
+ err = vhd_seek(vhd, off, SEEK_SET);
+ if (err)
+ goto out;
+
+ err = vhd_write(vhd, buf, size);
+ if (err)
+ goto out;
+
+ /* only the in-memory BAT is updated here; BAT entries are in sectors */
+ vhd->bat.bat[src] = offset >> VHD_SECTOR_SHIFT;
+
+ /* wipe the old bitmap + data location */
+ err = vhd_write_zeros(journal, src_off,
+ vhd_sectors_to_bytes(vhd->bm_secs + vhd->spb));
+
+out:
+ free(buf);
+ return err;
+}
+
+/*
+ * Overwrite block @dest with block @src: @src is moved into the file
+ * space currently used by @dest, and @dest's BAT entry is then marked
+ * unused.  Returns 0, or the error from journaling or moving.
+ */
+static int
+vhd_clobber_block(vhd_journal_t *journal, uint32_t src, uint32_t dest)
+{
+	int rc;
+	off64_t target;
+	vhd_context_t *ctx = &journal->vhd;
+
+	/* remember where @dest lives before its entry is cleared */
+	target = vhd_sectors_to_bytes(ctx->bat.bat[dest]);
+
+	rc = vhd_journal_add_block(journal, dest,
+				   VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+	if (rc == 0)
+		rc = vhd_move_block(journal, src, target);
+	if (rc != 0)
+		return rc;
+
+	ctx->bat.bat[dest] = DD_BLK_UNUSED;
+
+	return 0;
+}
+
+/*
+ * remove a list of blocks from the vhd file
+ * if a block to be removed:
+ *   - resides at the end of the file: simply clear its bat entry
+ *   - resides elsewhere: move the last block in the file into its position
+ *     and update the bat to reflect this
+ *
+ * @original_free_list holds @free_cnt blocks to remove; it is copied, not
+ * modified.  Returns 0, -ENOMEM, or the vhd_clobber_block() error.
+ *
+ * NOTE(review): the only caller visible here (vhd_dynamic_shrink) returns
+ * -ENOSYS before reaching this function, so this path looks unexercised;
+ * re-verify the algorithm before enabling it.
+ */
+static int
+vhd_defrag_shrink(vhd_journal_t *journal,
+		  vhd_block_t *original_free_list, int free_cnt)
+{
+	vhd_context_t *vhd;
+	int i, j, free_idx, err, matched;
+	vhd_block_t *blocks, *free_list;
+
+	err       = 0;
+	blocks    = NULL;
+	free_list = NULL;
+	vhd       = &journal->vhd;
+
+	blocks = malloc(vhd->bat.entries * sizeof(vhd_block_t));
+	if (!blocks) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	free_list = malloc(free_cnt * sizeof(vhd_block_t));
+	if (!free_list) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < vhd->bat.entries; i++) {
+		blocks[i].block  = i;
+		blocks[i].offset = vhd->bat.bat[i];
+	}
+
+	memcpy(free_list, original_free_list,
+	       free_cnt * sizeof(vhd_block_t));
+
+	/* sort both the to-free list and the bat list
+	 * in order of descending file offset */
+	quicksort(free_list, 0, free_cnt - 1);
+	quicksort(blocks, 0, vhd->bat.entries - 1);
+
+	for (i = 0, free_idx = 0;
+	     i < vhd->bat.entries && free_idx < free_cnt; i++) {
+		vhd_block_t *b = blocks + i;
+
+		if (b->offset == DD_BLK_UNUSED)
+			continue;
+
+		matched = 0;
+		for (j = free_idx; j < free_cnt; j++)
+			if (b->block == free_list[j].block) {
+				/* the last block in the file is in the list of
+				 * blocks to remove; no need to shuffle the
+				 * data -- just clear the bat entry */
+				vhd->bat.bat[free_list[j].block] = DD_BLK_UNUSED;
+				free_idx++;
+				matched = 1;
+				break;
+			}
+
+		/* the old inner-loop 'continue' never skipped the move
+		 * below, so a block slated for removal was still clobbered
+		 * and free_list[free_idx] could be read past the end */
+		if (matched)
+			continue;
+
+		err = vhd_clobber_block(journal, b->block,
+					free_list[free_idx++].block);
+		if (err)
+			goto out;
+	}
+
+	/* clear any bat entries for blocks we did not shuffle */
+	for (i = free_idx; i < free_cnt; i++)
+		vhd->bat.bat[free_list[i].block] = DD_BLK_UNUSED;
+
+out:
+	free(blocks);
+	free(free_list);
+
+	return err;
+}
+
+/*
+ * Drop the last @entries entries from the BAT: shrink max_bat_size in the
+ * header, recompute the footer's size and geometry, zero the dropped BAT
+ * slots, and (if the image has a batmap) clear the matching batmap bits.
+ * No file space is reclaimed here.
+ *
+ * Returns 0 on success or the error from the failing libvhd call.
+ */
+static int
+vhd_clear_bat_entries(vhd_journal_t *journal, uint32_t entries)
+{
+ int i, err;
+ vhd_context_t *vhd;
+ off64_t orig_map_off, new_map_off;
+ uint32_t orig_entries, new_entries;
+
+ vhd = &journal->vhd;
+ orig_entries = vhd->header.max_bat_size;
+ /* NOTE(review): wraps around if @entries > max_bat_size */
+ new_entries = orig_entries - entries;
+
+ /* remember where the batmap header is before the BAT size changes;
+ * orig_map_off is only set, and only used, when a batmap exists */
+ if (vhd_has_batmap(vhd)) {
+ err = vhd_batmap_header_offset(vhd, &orig_map_off);
+ if (err)
+ return err;
+ }
+
+ /* update header */
+ vhd->header.max_bat_size = new_entries;
+ err = vhd_write_header(vhd, &vhd->header);
+ if (err)
+ return err;
+
+ /* update footer */
+ vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size;
+ vhd->footer.geometry = vhd_chs(vhd->footer.curr_size);
+ err = vhd_write_footer(vhd, &vhd->footer);
+ if (err)
+ return err;
+
+ /* update bat -- we don't reclaim space, just clear entries */
+ for (i = new_entries; i < orig_entries; i++)
+ vhd->bat.bat[i] = 0;
+
+ err = vhd_write_bat(vhd, &vhd->bat);
+ if (err)
+ return err;
+
+ /* update this after write_bat so the end of the bat is zeroed */
+ vhd->bat.entries = new_entries;
+
+ if (!vhd_has_batmap(vhd))
+ return 0;
+
+ /* zero out old batmap header if new header has moved */
+ err = vhd_batmap_header_offset(vhd, &new_map_off);
+ if (err)
+ return err;
+
+ if (orig_map_off != new_map_off) {
+ size_t size;
+
+ size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+ err = vhd_write_zeros(journal, orig_map_off, size);
+ if (err)
+ return err;
+ }
+
+ /* update batmap -- clear entries for freed blocks */
+ for (i = new_entries; i < orig_entries; i++)
+ vhd_batmap_clear(vhd, &vhd->batmap, i);
+
+ err = vhd_write_batmap(vhd, &vhd->batmap);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/*
+ * Shrink a dynamic VHD by @secs sectors.
+ *
+ * Currently disabled: the function prints a notice and returns -ENOSYS
+ * straight away, so everything after that return is unreachable.  The
+ * unreachable part collects the allocated blocks among the trailing BAT
+ * entries, removes them via vhd_defrag_shrink(), clears the BAT entries
+ * and truncates the file after the end of data plus one footer.
+ */
+static int
+vhd_dynamic_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+ off64_t eof;
+ uint32_t blocks;
+ vhd_context_t *vhd;
+ int i, j, err, free_cnt;
+ struct vhd_block *free_list;
+
+ printf("dynamic shrink not fully implemented\n");
+ return -ENOSYS;
+
+ /* ---- unreachable from here on (see above) ---- */
+ eof = 0;
+ free_cnt = 0;
+ free_list = NULL;
+ vhd = &journal->vhd;
+
+ /* only whole blocks can be removed */
+ blocks = secs_to_blocks_down(vhd, secs);
+ if (blocks == 0)
+ return 0;
+
+ if (vhd_has_batmap(vhd)) {
+ err = vhd_get_batmap(vhd);
+ if (err)
+ return err;
+ }
+
+ free_list = malloc(blocks * sizeof(struct vhd_block));
+ if (!free_list)
+ return -ENOMEM;
+
+ /* walk the last @blocks BAT entries, collecting the allocated ones */
+ for (i = vhd->bat.entries - 1, j = 0; i >= 0 && j < blocks; i--, j++) {
+ uint32_t blk = vhd->bat.bat[i];
+
+ if (blk != DD_BLK_UNUSED) {
+ free_list[free_cnt].block = i;
+ free_list[free_cnt].offset = blk;
+ free_cnt++;
+ }
+ }
+
+ if (free_cnt) {
+ err = vhd_defrag_shrink(journal, free_list, free_cnt);
+ if (err)
+ goto out;
+ }
+
+ err = vhd_clear_bat_entries(journal, blocks);
+ if (err)
+ goto out;
+
+ /* remove data beyond footer */
+ err = vhd_end_of_data(vhd, &eof);
+ if (err)
+ goto out;
+
+ err = ftruncate(vhd->fd, eof + sizeof(vhd_footer_t));
+ if (err) {
+ err = -errno;
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ free(free_list);
+ return err;
+}
+
+/*
+ * Find the allocated block with the lowest BAT entry and store its index
+ * and entry in @block.  @block is left zeroed if nothing qualifies.
+ */
+static inline void
+vhd_first_data_block(vhd_context_t *vhd, vhd_block_t *block)
+{
+	int idx;
+	uint32_t entry;
+
+	memset(block, 0, sizeof(vhd_block_t));
+
+	for (idx = 0; idx < vhd->bat.entries; idx++) {
+		entry = vhd->bat.bat[idx];
+
+		if (entry == DD_BLK_UNUSED)
+			continue;
+
+		/* keep the current candidate unless this one is lower */
+		if (block->offset && entry >= block->offset)
+			continue;
+
+		block->block  = idx;
+		block->offset = entry;
+	}
+}
+
+/*
+ * Return the sector just past the end of the last allocated block
+ * (bitmap plus data), or 0 if no block is allocated.
+ */
+static inline uint32_t
+vhd_next_block_offset(vhd_context_t *vhd)
+{
+	int i;
+	uint32_t blk, end, next;
+
+	next = 0;
+
+	for (i = 0; i < vhd->bat.entries; i++) {
+		blk = vhd->bat.bat[i];
+
+		if (blk != DD_BLK_UNUSED) {
+			/* a block occupies bm_secs bitmap sectors followed
+			 * by spb data sectors */
+			end  = blk + vhd->spb + vhd->bm_secs;
+			next = MAX(next, end);
+		}
+	}
+
+	return next;
+}
+
+/*
+ * Return 1 when @off lies strictly after @start and strictly before
+ * @start + @size, 0 otherwise.  Note that @off == @start is NOT in range.
+ */
+static inline int
+in_range(off64_t off, off64_t start, off64_t size)
+{
+	if (off <= start)
+		return 0;
+
+	return (off < start + size);
+}
+
+/*
+ * bits for the @mode argument of vhd_check_for_clobber(): when a bit is
+ * set, the corresponding region is not checked
+ */
+#define SKIP_HEADER 0x01
+#define SKIP_BAT 0x02
+#define SKIP_BATMAP 0x04
+#define SKIP_PLOC 0x08
+#define SKIP_DATA 0x10
+
+/* non-zero when the SKIP_* bit(s) in @type are set in @mode */
+static inline int
+skip_check(int mode, int type)
+{
+	int masked = mode & type;
+
+	return masked;
+}
+
+/*
+ * Check whether byte offset @off falls inside one of the vhd's metadata
+ * regions (backup footer, header, bat, batmap, parent locators) or inside
+ * the first data block.
+ *
+ * @mode is a mask of SKIP_* flags; a region whose flag is set is not
+ * checked.  The backup footer (first sector) is always checked.
+ *
+ * Returns 0 if nothing would be hit or the vhd is not dynamic; otherwise
+ * logs the offending region and returns -EINVAL.
+ *
+ * The region tests use in_range(), which treats @off == region start as
+ * outside the region.
+ */
+static int
+vhd_check_for_clobber(vhd_context_t *vhd, off64_t off, int mode)
+{
+ int i, n;
+ char *msg;
+ size_t size;
+ vhd_block_t fb;
+ vhd_parent_locator_t *loc;
+
+ msg = NULL;
+
+ if (!vhd_type_dynamic(vhd))
+ return 0;
+
+ /* the first sector holds the backup footer */
+ if (off < VHD_SECTOR_SIZE) {
+ msg = "backup footer";
+ goto fail;
+ }
+
+ if (!skip_check(mode, SKIP_HEADER))
+ if (in_range(off,
+ vhd->footer.data_offset, sizeof(vhd_header_t))) {
+ msg = "header";
+ goto fail;
+ }
+
+ if (!skip_check(mode, SKIP_BAT))
+ if (in_range(off, vhd->header.table_offset,
+ vhd_bytes_padded(vhd->header.max_bat_size *
+ sizeof(uint32_t)))) {
+ msg = "bat";
+ goto fail;
+ }
+
+ if (!skip_check(mode, SKIP_BATMAP))
+ if (vhd_has_batmap(vhd) &&
+ in_range(off, vhd->batmap.header.batmap_offset,
+ vhd_bytes_padded(vhd->batmap.header.batmap_size))) {
+ msg = "batmap";
+ goto fail;
+ }
+
+ if (!skip_check(mode, SKIP_PLOC)) {
+ /* number of parent locator slots in the header */
+ n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+ for (i = 0; i < n; i++) {
+ loc = vhd->header.loc + i;
+ if (loc->code == PLAT_CODE_NONE)
+ continue;
+
+ size = vhd_parent_locator_size(loc);
+ if (in_range(off, loc->data_offset, size)) {
+ msg = "parent locator";
+ goto fail;
+ }
+ }
+ }
+
+ if (!skip_check(mode, SKIP_DATA)) {
+ /* only the lowest-offset data block is tested */
+ vhd_first_data_block(vhd, &fb);
+ if (fb.offset && in_range(off,
+ vhd_sectors_to_bytes(fb.offset),
+ VHD_BLOCK_SIZE)) {
+ msg = "data block";
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ EPRINTF("write to 0x%08"PRIx64" would clobber %s\n", off, msg);
+ return -EINVAL;
+}
+
+/*
+ * take any metadata after the bat (@eob) and shift it
+ *
+ * Every parent locator whose data lives at or beyond @eob is read into
+ * memory and rewritten @bat_needed + @map_needed bytes further into the
+ * file; the in-memory locator offsets are updated to match.  If the batmap
+ * lives beyond @eob, its in-memory offset is bumped by @bat_needed.
+ *
+ * Nothing is written for the header or the batmap here: the caller is
+ * expected to write them after writing the new bat.
+ *
+ * Returns 0 on success or a negative errno value.  All temporary buffers
+ * are released on every exit path.
+ */
+static int
+vhd_shift_metadata(vhd_journal_t *journal, off64_t eob,
+		   size_t bat_needed, size_t map_needed)
+{
+	int i, n, err;
+	vhd_context_t *vhd;
+	size_t size_needed;
+	char *buf, **locators;
+	vhd_parent_locator_t *loc;
+
+	vhd         = &journal->vhd;
+	size_needed = bat_needed + map_needed;
+
+	n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+	locators = calloc(n, sizeof(char *));
+	if (!locators)
+		return -ENOMEM;
+
+	/* pass 1: read every locator that has to move */
+	for (i = 0; i < n; i++) {
+		size_t size;
+
+		loc = vhd->header.loc + i;
+		if (loc->code == PLAT_CODE_NONE)
+			continue;
+
+		if (loc->data_offset < eob)
+			continue;
+
+		size = vhd_parent_locator_size(loc);
+		err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+		if (err) {
+			/* posix_memalign returns a positive error number */
+			err = -err;
+			goto out;
+		}
+
+		/*
+		 * record the buffer before doing any i/o so that the
+		 * cleanup at 'out' frees it if the seek or read fails
+		 */
+		locators[i] = buf;
+
+		err = vhd_seek(vhd, loc->data_offset, SEEK_SET);
+		if (err)
+			goto out;
+
+		err = vhd_read(vhd, buf, size);
+		if (err)
+			goto out;
+	}
+
+	/* pass 2: write each locator back at its shifted position */
+	for (i = 0; i < n; i++) {
+		off64_t off;
+		size_t size;
+
+		if (!locators[i])
+			continue;
+
+		loc  = vhd->header.loc + i;
+		off  = loc->data_offset + size_needed;
+		size = vhd_parent_locator_size(loc);
+
+		if (vhd_check_for_clobber(vhd, off + size, SKIP_PLOC)) {
+			EPRINTF("%s: shifting locator %d would clobber data\n",
+				vhd->file, i);
+			/* go through 'out' so the buffers are not leaked */
+			err = -EINVAL;
+			goto out;
+		}
+
+		err = vhd_seek(vhd, off, SEEK_SET);
+		if (err)
+			goto out;
+
+		err = vhd_write(vhd, locators[i], size);
+		if (err)
+			goto out;
+
+		free(locators[i]);
+		locators[i]      = NULL;
+		loc->data_offset = off;
+
+		/* write the new header after writing the new bat */
+	}
+
+	if (vhd_has_batmap(vhd) && vhd->batmap.header.batmap_offset > eob) {
+		vhd->batmap.header.batmap_offset += bat_needed;
+
+		/* write the new batmap after writing the new bat */
+	}
+
+	err = 0;
+
+out:
+	for (i = 0; i < n; i++)
+		free(locators[i]);
+	free(locators);
+
+	return err;
+}
+
+/*
+ * Extend the bat (and the batmap, if present) by @entries entries.
+ *
+ * The caller must already have made room in the file for the larger bat
+ * and batmap; this function only verifies that writing them would not
+ * clobber other regions, then:
+ *   - bumps header.max_bat_size and rewrites the header,
+ *   - recomputes footer size/geometry/checksum and rewrites the footer,
+ *   - writes a new bat whose added entries are DD_BLK_UNUSED,
+ *   - writes a new batmap whose added bytes are zero.
+ * The in-memory bat and batmap are replaced on success.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_add_bat_entries(vhd_journal_t *journal, int entries)
+{
+	int i, err;
+	off64_t off;
+	vhd_bat_t new_bat;
+	vhd_context_t *vhd;
+	uint32_t new_entries;
+	vhd_batmap_t new_batmap;
+	uint64_t bat_size, new_bat_size, map_size, new_map_size;
+
+	vhd         = &journal->vhd;
+	new_entries = vhd->header.max_bat_size + entries;
+
+	bat_size = vhd_bytes_padded(vhd->header.max_bat_size *
+				    sizeof(uint32_t));
+	new_bat_size = vhd_bytes_padded(new_entries * sizeof(uint32_t));
+
+	/* the batmap holds one bit per bat entry */
+	map_size = vhd_bytes_padded((vhd->header.max_bat_size + 7) >> 3);
+	new_map_size = vhd_bytes_padded((new_entries + 7) >> 3);
+
+	off = vhd->header.table_offset + new_bat_size;
+	if (vhd_check_for_clobber(vhd, off, SKIP_BAT | SKIP_BATMAP)) {
+		EPRINTF("%s: writing new bat of 0x%"PRIx64" bytes "
+			"at 0x%08"PRIx64" would clobber data\n",
+			vhd->file, new_bat_size, vhd->header.table_offset);
+		return -EINVAL;
+	}
+
+	if (vhd_has_batmap(vhd)) {
+		off = vhd->batmap.header.batmap_offset + new_map_size;
+		if (vhd_check_for_clobber(vhd, off, 0)) {
+			EPRINTF("%s: writing new batmap of 0x%"PRIx64" bytes"
+				" at 0x%08"PRIx64" would clobber data\n", vhd->file,
+				new_map_size, vhd->batmap.header.batmap_offset);
+			return -EINVAL;
+		}
+	}
+
+	/* update header */
+	vhd->header.max_bat_size = new_entries;
+	err = vhd_write_header(vhd, &vhd->header);
+	if (err)
+		return err;
+
+	/* update footer */
+	vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size;
+	vhd->footer.geometry  = vhd_chs(vhd->footer.curr_size);
+	vhd->footer.checksum  = vhd_checksum_footer(&vhd->footer);
+	err = vhd_write_footer(vhd, &vhd->footer);
+	if (err)
+		return err;
+
+	/* allocate new bat */
+	err = posix_memalign((void **)&new_bat.bat, VHD_SECTOR_SIZE, new_bat_size);
+	if (err)
+		return -err;
+
+	new_bat.spb     = vhd->bat.spb;
+	new_bat.entries = new_entries;
+	memcpy(new_bat.bat, vhd->bat.bat, bat_size);
+	for (i = vhd->bat.entries; i < new_entries; i++)
+		new_bat.bat[i] = DD_BLK_UNUSED;
+
+	/* write new bat */
+	err = vhd_write_bat(vhd, &new_bat);
+	if (err) {
+		free(new_bat.bat);
+		return err;
+	}
+
+	/* update in-memory bat */
+	free(vhd->bat.bat);
+	vhd->bat = new_bat;
+
+	if (!vhd_has_batmap(vhd))
+		return 0;
+
+	/* allocate new batmap */
+	err = posix_memalign((void **)&new_batmap.map,
+			     VHD_SECTOR_SIZE, new_map_size);
+	if (err)
+		/*
+		 * posix_memalign returns a positive error number; negate
+		 * it like every other error returned from this function
+		 */
+		return -err;
+
+	new_batmap.header = vhd->batmap.header;
+	new_batmap.header.batmap_size = secs_round_up_no_zero(new_map_size);
+	memcpy(new_batmap.map, vhd->batmap.map, map_size);
+	memset(new_batmap.map + map_size, 0, new_map_size - map_size);
+
+	/* write new batmap */
+	err = vhd_write_batmap(vhd, &new_batmap);
+	if (err) {
+		free(new_batmap.map);
+		return err;
+	}
+
+	/* update in-memory batmap */
+	free(vhd->batmap.map);
+	vhd->batmap = new_batmap;
+
+	return 0;
+}
+
+/*
+ * Grow a dynamic vhd by @secs sectors (converted to a block count with
+ * secs_to_blocks_up()).
+ *
+ * Three cases, from cheapest to most expensive:
+ *   1. the padding at the end of the current bat and batmap sectors is
+ *      large enough for the new entries: just add the entries;
+ *   2. more sectors are needed but no data block is allocated: shift the
+ *      metadata that follows the bat, then add the entries;
+ *   3. data blocks are in the way: move the lowest data block to the end
+ *      of the file, repeatedly, until enough room is free, then proceed
+ *      as in case 2.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_dynamic_grow(vhd_journal_t *journal, uint64_t secs)
+{
+ int i, err;
+ off64_t eob, eom;
+ vhd_context_t *vhd;
+ vhd_block_t first_block;
+ uint64_t blocks, size_needed;
+ uint64_t bat_needed, bat_size, bat_avail, bat_bytes, bat_secs;
+ uint64_t map_needed, map_size, map_avail, map_bytes, map_secs;
+
+ vhd = &journal->vhd;
+
+ size_needed = 0;
+ bat_needed = 0;
+ map_needed = 0;
+
+ /* number of vhd blocks to add */
+ blocks = secs_to_blocks_up(vhd, secs);
+
+ /* size in bytes needed for new bat entries */
+ bat_needed = blocks * sizeof(uint32_t);
+ /* one bit per block, plus one spare byte */
+ map_needed = (blocks >> 3) + 1;
+
+ /* available bytes in current bat */
+ bat_bytes = vhd->header.max_bat_size * sizeof(uint32_t);
+ bat_secs = secs_round_up_no_zero(bat_bytes);
+ bat_size = vhd_sectors_to_bytes(bat_secs);
+ bat_avail = bat_size - bat_bytes;
+
+ if (vhd_has_batmap(vhd)) {
+ /* available bytes in current batmap */
+ map_bytes = (vhd->header.max_bat_size + 7) >> 3;
+ map_secs = vhd->batmap.header.batmap_size;
+ map_size = vhd_sectors_to_bytes(map_secs);
+ map_avail = map_size - map_bytes;
+ } else {
+ map_needed = 0;
+ map_avail = 0;
+ }
+
+ /* we have enough space already; just extend the bat */
+ if (bat_needed <= bat_avail && map_needed <= map_avail)
+ goto add_entries;
+
+ /* we need to add new sectors to the bat */
+ if (bat_needed > bat_avail) {
+ bat_needed -= bat_avail;
+ bat_needed = vhd_bytes_padded(bat_needed);
+ } else
+ bat_needed = 0;
+
+ /* we need to add new sectors to the batmap */
+ if (map_needed > map_avail) {
+ map_needed -= map_avail;
+ map_needed = vhd_bytes_padded(map_needed);
+ } else
+ map_needed = 0;
+
+ /* how many additional bytes do we need? */
+ size_needed = bat_needed + map_needed;
+
+ /* calculate space between end of headers and beginning of data */
+ err = vhd_end_of_headers(vhd, &eom);
+ if (err)
+ return err;
+
+ /* end of the bat as currently laid out on disk */
+ eob = vhd->header.table_offset + vhd_sectors_to_bytes(bat_secs);
+ vhd_first_data_block(vhd, &first_block);
+
+ /* no blocks allocated; just shift post-bat metadata */
+ if (!first_block.offset)
+ goto shift_metadata;
+
+ /*
+ * not enough space --
+ * move vhd data blocks to the end of the file to make room
+ */
+ do {
+ off64_t new_off, bm_size, gap_size;
+
+ /* first byte past the furthest allocated block */
+ new_off = vhd_sectors_to_bytes(vhd_next_block_offset(vhd));
+
+ /* data region of segment should begin on page boundary */
+ bm_size = vhd_sectors_to_bytes(vhd->bm_secs);
+ if ((new_off + bm_size) % 4096) {
+ gap_size = 4096 - ((new_off + bm_size) % 4096);
+
+ err = vhd_write_zeros(journal, new_off, gap_size);
+ if (err)
+ return err;
+
+ new_off += gap_size;
+ }
+
+ err = vhd_move_block(journal, first_block.block, new_off);
+ if (err)
+ return err;
+
+ /* the lowest block has moved; find the new lowest one */
+ vhd_first_data_block(vhd, &first_block);
+
+ } while (eom + size_needed >= vhd_sectors_to_bytes(first_block.offset));
+
+ TEST_FAIL_AT(FAIL_RESIZE_DATA_MOVED);
+
+shift_metadata:
+ /* shift any metadata after the bat to make room for new bat sectors */
+ err = vhd_shift_metadata(journal, eob, bat_needed, map_needed);
+ if (err)
+ return err;
+
+ TEST_FAIL_AT(FAIL_RESIZE_METADATA_MOVED);
+
+add_entries:
+ return vhd_add_bat_entries(journal, blocks);
+}
+
+/*
+ * Resize a dynamic vhd to @size megabytes.
+ *
+ * Loads the header, the bat and (when present) the batmap, then delegates
+ * to vhd_dynamic_shrink() or vhd_dynamic_grow() with the difference in
+ * sectors.  Returns 0 right away if the size is unchanged.
+ */
+static int
+vhd_dynamic_resize(vhd_journal_t *journal, uint64_t size)
+{
+	int err;
+	vhd_context_t *vhd = &journal->vhd;
+	uint64_t old_secs  = vhd->footer.curr_size >> VHD_SECTOR_SHIFT;
+	/* @size is in MB (2^20 bytes); convert to sectors */
+	uint64_t want_secs = size << (20 - VHD_SECTOR_SHIFT);
+
+	if (old_secs == want_secs)
+		return 0;
+
+	err = vhd_get_header(vhd);
+	if (err)
+		return err;
+
+	err = vhd_get_bat(vhd);
+	if (err)
+		return err;
+
+	if (vhd_has_batmap(vhd)) {
+		err = vhd_get_batmap(vhd);
+		if (err)
+			return err;
+	}
+
+	if (old_secs > want_secs)
+		return vhd_dynamic_shrink(journal, old_secs - want_secs);
+
+	return vhd_dynamic_grow(journal, want_secs - old_secs);
+}
+
+/*
+ * Open @name read-only (strict mode) and verify that it was created by
+ * tapdisk.  Returns 0 if so, -EINVAL if not, or the vhd_open() error.
+ */
+static int
+vhd_util_resize_check_creator(const char *name)
+{
+	vhd_context_t vhd;
+	int err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_STRICT);
+
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	err = (vhd_creator_tapdisk(&vhd) ? 0 : -EINVAL);
+	if (err)
+		printf("%s not created by xen; resize not supported\n", name);
+
+	vhd_close(&vhd);
+	return err;
+}
+
+/*
+ * vhd-util "resize" entry point.
+ *
+ * Options (all three required):
+ *   -n <name>     vhd to resize
+ *   -j <journal>  journal file recording the changes so they can be undone
+ *   -s <size>     new size in MB
+ *
+ * The resize runs under a journal: on failure the journal is reverted, on
+ * success it is committed and removed.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, otherwise the resize error
+ * or, if the resize succeeded, the journal commit error.
+ */
+int
+vhd_util_resize(int argc, char **argv)
+{
+ char *name, *jname;
+ uint64_t size;
+ int c, err, jerr;
+ vhd_journal_t journal;
+ vhd_context_t *vhd;
+
+ /* err stays -EINVAL until a -s option has been seen */
+ err = -EINVAL;
+ size = 0;
+ name = NULL;
+ jname = NULL;
+
+ optind = 0;
+ while ((c = getopt(argc, argv, "n:j:s:h")) != -1) {
+ switch (c) {
+ case 'n':
+ name = optarg;
+ break;
+ case 'j':
+ jname = optarg;
+ break;
+ case 's':
+ err = 0;
+ /* NOTE(review): no validation of the numeric string */
+ size = strtoull(optarg, NULL, 10);
+ break;
+ case 'h':
+ default:
+ goto usage;
+ }
+ }
+
+ if (err || !name || !jname || argc != optind)
+ goto usage;
+
+ err = vhd_util_resize_check_creator(name);
+ if (err)
+ return err;
+
+ libvhd_set_log_level(1);
+ err = vhd_journal_create(&journal, name, jname);
+ if (err) {
+ printf("creating journal failed: %d\n", err);
+ return err;
+ }
+
+ vhd = &journal.vhd;
+
+ err = vhd_get_footer(vhd);
+ if (err)
+ goto out;
+
+ TEST_FAIL_AT(FAIL_RESIZE_BEGIN);
+
+ if (vhd_type_dynamic(vhd))
+ err = vhd_dynamic_resize(&journal, size);
+ else
+ err = vhd_fixed_resize(&journal, size);
+
+ TEST_FAIL_AT(FAIL_RESIZE_END);
+
+out:
+ /* undo everything on failure, make it permanent on success */
+ if (err) {
+ printf("resize failed: %d\n", err);
+ jerr = vhd_journal_revert(&journal);
+ } else
+ jerr = vhd_journal_commit(&journal);
+
+ /* keep the journal file around if revert/commit itself failed */
+ if (jerr) {
+ printf("closing journal failed: %d\n", jerr);
+ vhd_journal_close(&journal);
+ } else
+ vhd_journal_remove(&journal);
+
+ /* GNU "?:" extension: err if non-zero, else jerr */
+ return (err ? : jerr);
+
+usage:
+ printf("options: <-n name> <-j journal> <-s size (in MB)> [-h help]\n");
+ return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-revert.c b/tools/blktap2/vhd/lib/vhd-util-revert.c
new file mode 100644
index 0000000000..dab6e8b950
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-revert.c
@@ -0,0 +1,106 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT
+ * affect the VHD disk capacity, only the physical size of the file containing
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the
+ * footer in the right location such that resizing the file (manually, as a
+ * separate step) will produce the correct results. If the new file size is
+ * greater than the current file size, the file must first be expanded and then
+ * altered with this operation. If the new size is smaller than the current
+ * size, the VHD must first be altered with this operation and then the file
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+*/
+
+#include <errno.h>
+//#include <fcntl.h>
+#include <stdio.h>
+//#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * vhd-util "revert" entry point: undo the changes recorded in a journal.
+ *
+ * Options (both required):
+ *   -n <name>     vhd the journal belongs to
+ *   -j <journal>  journal file to revert
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error of the journal
+ * operation that failed.
+ */
+int
+vhd_util_revert(int argc, char **argv)
+{
+	int opt, err;
+	vhd_journal_t journal;
+	char *name = NULL, *jname = NULL;
+
+	optind = 0;
+	for (;;) {
+		opt = getopt(argc, argv, "n:j:h");
+		if (opt == -1)
+			break;
+
+		if (opt == 'n')
+			name = optarg;
+		else if (opt == 'j')
+			jname = optarg;
+		else
+			goto usage;	/* 'h' and anything unrecognized */
+	}
+
+	if (argc != optind || !jname || !name)
+		goto usage;
+
+	libvhd_set_log_level(1);
+	err = vhd_journal_open(&journal, name, jname);
+	if (err) {
+		printf("opening journal failed: %d\n", err);
+		return err;
+	}
+
+	err = vhd_journal_revert(&journal);
+	if (err) {
+		printf("reverting journal failed: %d\n", err);
+		goto fail;
+	}
+
+	err = vhd_journal_remove(&journal);
+	if (err) {
+		printf("removing journal failed: %d\n", err);
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	/* the journal was opened; close it but leave the file in place */
+	vhd_journal_close(&journal);
+	return err;
+
+usage:
+	printf("options: <-n name> <-j journal> [-h help]\n");
+	return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-scan.c b/tools/blktap2/vhd/lib/vhd-util-scan.c
new file mode 100644
index 0000000000..4ecfb52e7d
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-scan.c
@@ -0,0 +1,1315 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <glob.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fnmatch.h>
+
+#include "list.h"
+#include "libvhd.h"
+#include "lvm-util.h"
+
+/* option bits kept in the file-scope 'flags' variable */
+#define VHD_SCAN_FAST 0x01
+#define VHD_SCAN_PRETTY 0x02
+#define VHD_SCAN_VOLUME 0x04
+#define VHD_SCAN_NOFAIL 0x08
+#define VHD_SCAN_VERBOSE 0x10
+#define VHD_SCAN_PARENTS 0x20
+
+/* values for struct target.type: raw or vhd content, file or lvm volume */
+#define VHD_TYPE_RAW_FILE 0x01
+#define VHD_TYPE_VHD_FILE 0x02
+#define VHD_TYPE_RAW_VOLUME 0x04
+#define VHD_TYPE_VHD_VOLUME 0x08
+
+/* non-zero when @type denotes a volume (raw or vhd) rather than a file */
+static inline int
+target_volume(uint8_t type)
+{
+	if (type == VHD_TYPE_RAW_VOLUME)
+		return 1;
+
+	return (type == VHD_TYPE_VHD_VOLUME);
+}
+
+/* non-zero when @type denotes vhd content (in a file or on a volume) */
+static inline int
+target_vhd(uint8_t type)
+{
+	if (type == VHD_TYPE_VHD_FILE)
+		return 1;
+
+	return (type == VHD_TYPE_VHD_VOLUME);
+}
+
+/* one thing to scan: a file, or a logical volume on a device */
+struct target {
+ char name[VHD_MAX_NAME_LEN]; /* file path or lv name */
+ char device[VHD_MAX_NAME_LEN]; /* path actually opened; same as name for files */
+ uint64_t size; /* st_size for files, lv->size for volumes */
+ uint64_t start; /* 0 for files, first segment pe_start for volumes */
+ uint64_t end; /* st_size for files, start + pe_size for volumes */
+ uint8_t type; /* one of VHD_TYPE_* */
+};
+
+/* cursor over an owned array of scan targets */
+struct iterator {
+ int cur; /* index of the next target iterator_next() returns */
+ int cur_size; /* number of valid entries in targets */
+ int max_size; /* presumably the allocated capacity of targets -- TODO confirm */
+ struct target *targets;
+};
+
+/* result of scanning one target, plus links used for the pretty tree */
+struct vhd_image {
+ char *name; /* image name; set to NULL once printed in pretty mode */
+ char *parent; /* parent name, or NULL if there is none */
+ uint64_t capacity; /* footer curr_size for vhds, else same as size */
+ off64_t size; /* size of the underlying target */
+ uint8_t hidden;
+ int error; /* non-zero if scanning this image failed */
+ char *message; /* short description of what failed */
+
+ struct target *target;
+
+ struct list_head sibling; /* entry in the parent's children list */
+ struct list_head children; /* images whose parent is this image */
+ struct vhd_image *parent_image; /* NULL if parent is absent or not in scan */
+};
+
+/* growable table of images collected for pretty printing */
+struct vhd_scan {
+ int cur; /* number of entries of images in use */
+ int size; /* capacity of images */
+
+ int lists_cur; /* number of entries of lists in use */
+ int lists_size; /* capacity of lists */
+
+ struct vhd_image **images; /* pointers into the pools held in lists */
+ struct vhd_image **lists; /* the allocated pools of struct vhd_image */
+};
+
+/* file-scope state shared by the scan helpers */
+static int flags; /* mask of VHD_SCAN_* */
+static struct vg vg;
+static struct vhd_scan scan; /* images gathered for pretty printing */
+
+/*
+ * Reset the global scan table and give it room for @cnt images: one pool of
+ * @cnt images, an index of @cnt pointers into it, and a 10-slot array of
+ * pools.  Returns 0 on success; on allocation failure the table is left
+ * zeroed and -ENOMEM is returned.
+ */
+static int
+vhd_util_scan_pretty_allocate_list(int cnt)
+{
+	int i;
+	struct vhd_image *pool, **index, **lists;
+
+	pool  = NULL;
+	index = NULL;
+
+	memset(&scan, 0, sizeof(scan));
+
+	lists = calloc(10, sizeof(struct vhd_image *));
+	if (lists)
+		pool = calloc(cnt, sizeof(struct vhd_image));
+	if (pool)
+		index = calloc(cnt, sizeof(struct vhd_image *));
+
+	if (!index) {
+		/* one of the three allocations failed; scan is already zeroed */
+		free(pool);
+		free(lists);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < cnt; i++)
+		index[i] = pool + i;
+
+	lists[0] = pool;
+
+	scan.lists      = lists;
+	scan.lists_cur  = 1;
+	scan.lists_size = 10;
+	scan.images     = index;
+	scan.cur        = 0;
+	scan.size       = cnt;
+
+	return 0;
+}
+
+/* release every pool and the index of the global scan table, then zero it */
+static void
+vhd_util_scan_pretty_free_list(void)
+{
+	int pool;
+
+	for (pool = 0; scan.lists && pool < scan.lists_cur; pool++)
+		free(scan.lists[pool]);
+
+	/* free(NULL) is a no-op, so no guards are needed here */
+	free(scan.lists);
+	free(scan.images);
+
+	memset(&scan, 0, sizeof(scan));
+}
+
+/*
+ * Copy @image into the global scan table unless an image with the same
+ * name is already there.
+ *
+ * The name and parent strings are duplicated; the message pointer is
+ * shared with @image.  When the table is full it is doubled: a new pool is
+ * allocated and the index is extended to point into it.
+ *
+ * Returns 0 on success (including the duplicate case) or -ENOMEM.  On
+ * failure the table is left consistent and can still be used and freed.
+ */
+static int
+vhd_util_scan_pretty_add_image(struct vhd_image *image)
+{
+	int i;
+	struct vhd_image *img;
+
+	for (i = 0; i < scan.cur; i++) {
+		img = scan.images[i];
+		if (!strcmp(img->name, image->name))
+			return 0;
+	}
+
+	if (scan.cur >= scan.size) {
+		int grow, new_size;
+		struct vhd_image *new, **list;
+
+		if (scan.lists_cur >= scan.lists_size) {
+			list = realloc(scan.lists, scan.lists_size * 2 *
+				       sizeof(struct vhd_image *));
+			if (!list)
+				return -ENOMEM;
+
+			scan.lists_size *= 2;
+			scan.lists       = list;
+		}
+
+		/* double the table; an empty table grows to one entry */
+		grow     = (scan.size > 0 ? scan.size : 1);
+		new_size = scan.size + grow;
+
+		/*
+		 * grow the index before committing anything else: if this
+		 * fails, scan.size still matches the old index and nothing
+		 * can run off its end later
+		 */
+		list = realloc(scan.images,
+			       new_size * sizeof(struct vhd_image *));
+		if (!list)
+			return -ENOMEM;
+
+		scan.images = list;
+
+		/* a larger-than-needed index is harmless if this fails */
+		new = calloc(grow, sizeof(struct vhd_image));
+		if (!new)
+			return -ENOMEM;
+
+		scan.lists[scan.lists_cur++] = new;
+
+		for (i = 0; i < grow; i++)
+			scan.images[scan.size + i] = new + i;
+
+		scan.size = new_size;
+	}
+
+	img = scan.images[scan.cur];
+	INIT_LIST_HEAD(&img->sibling);
+	INIT_LIST_HEAD(&img->children);
+
+	img->capacity = image->capacity;
+	img->size     = image->size;
+	img->hidden   = image->hidden;
+	img->error    = image->error;
+	img->message  = image->message;
+
+	img->name = strdup(image->name);
+	if (!img->name)
+		goto fail;
+
+	if (image->parent) {
+		img->parent = strdup(image->parent);
+		if (!img->parent)
+			goto fail;
+	}
+
+	/* the slot only counts as used once it is fully populated */
+	scan.cur++;
+	return 0;
+
+fail:
+	free(img->name);
+	free(img->parent);
+	memset(img, 0, sizeof(*img));
+	return -ENOMEM;
+}
+
+/*
+ * qsort()/bsearch() comparator for arrays of struct vhd_image pointers:
+ * orders the images by name.
+ */
+static int
+vhd_util_scan_pretty_image_compare(const void *lhs, const void *rhs)
+{
+	struct vhd_image * const *left  = lhs;
+	struct vhd_image * const *right = rhs;
+
+	return strcmp((*left)->name, (*right)->name);
+}
+
+/*
+ * Print one line describing @image, right-aligned in a field of @tab
+ * columns (no indentation when @tab is 0).
+ *
+ * Without VHD_SCAN_VERBOSE, name and parent are reduced to their
+ * basename.  Images with a scan error print the error and message instead
+ * of the size details; that branch always prints the full image->name.
+ */
+static void
+vhd_util_scan_print_image_indent(struct vhd_image *image, int tab)
+{
+ char *pad, *name, *pmsg, *parent;
+
+ /* "%*s" with a one-space string yields @tab columns of padding */
+ pad = (tab ? " " : "");
+ name = image->name;
+ parent = (image->parent ? : "none");
+
+ /* in pretty mode, flag parents that were named but never scanned */
+ if ((flags & VHD_SCAN_PRETTY) && image->parent && !image->parent_image)
+ pmsg = " (not found in scan)";
+ else
+ pmsg = "";
+
+ if (!(flags & VHD_SCAN_VERBOSE)) {
+ name = basename(image->name);
+ if (image->parent)
+ parent = basename(image->parent);
+ }
+
+ if (image->error)
+ printf("%*svhd=%s scan-error=%d error-message='%s'\n",
+ tab, pad, image->name, image->error, image->message);
+ else
+ printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u "
+ "parent=%s%s\n", tab, pad, name, image->capacity,
+ image->size, image->hidden, parent, pmsg);
+}
+
+/*
+ * Print @image and, recursively, its children, indenting three columns per
+ * level.  Non-hidden children are printed before hidden ones.
+ *
+ * After printing, the image's name and parent strings are freed and set to
+ * NULL; a NULL name is how callers recognize an already-printed image.
+ */
+static void
+vhd_util_scan_pretty_print_tree(struct vhd_image *image, int depth)
+{
+ struct vhd_image *img, *tmp;
+
+ vhd_util_scan_print_image_indent(image, depth * 3);
+
+ /* first pass: visible children */
+ list_for_each_entry_safe(img, tmp, &image->children, sibling)
+ if (!img->hidden)
+ vhd_util_scan_pretty_print_tree(img, depth + 1);
+
+ /* second pass: hidden children */
+ list_for_each_entry_safe(img, tmp, &image->children, sibling)
+ if (img->hidden)
+ vhd_util_scan_pretty_print_tree(img, depth + 1);
+
+ free(image->name);
+ free(image->parent);
+
+ /* mark as printed */
+ image->name = NULL;
+ image->parent = NULL;
+}
+
+/*
+ * Print every collected image as a forest of parent/child trees.
+ *
+ * The index is sorted by name so parents can be located with bsearch();
+ * each image with a parent found in the scan is linked into that parent's
+ * children list.  Trees are then printed in three passes; printing frees
+ * and NULLs an image's name, which later passes use to skip it.
+ */
+static void
+vhd_util_scan_pretty_print_images(void)
+{
+ int i;
+ struct vhd_image *image, **parentp, *parent, *keyp, key;
+
+ qsort(scan.images, scan.cur, sizeof(scan.images[0]),
+ vhd_util_scan_pretty_image_compare);
+
+ /* link each image to its parent, if the parent was scanned too */
+ for (i = 0; i < scan.cur; i++) {
+ image = scan.images[i];
+
+ if (!image->parent) {
+ image->parent_image = NULL;
+ continue;
+ }
+
+ /* the comparator takes pointers to image pointers, hence keyp */
+ memset(&key, 0, sizeof(key));
+ key.name = image->parent;
+ keyp = &key;
+
+ parentp = bsearch(&keyp, scan.images, scan.cur,
+ sizeof(scan.images[0]),
+ vhd_util_scan_pretty_image_compare);
+ if (!parentp) {
+ image->parent_image = NULL;
+ continue;
+ }
+
+ parent = *parentp;
+ image->parent_image = parent;
+ list_add_tail(&image->sibling, &parent->children);
+ }
+
+ /* pass 1: trees rooted at hidden images without a scanned parent */
+ for (i = 0; i < scan.cur; i++) {
+ image = scan.images[i];
+
+ if (image->parent_image || !image->hidden)
+ continue;
+
+ vhd_util_scan_pretty_print_tree(image, 0);
+ }
+
+ /* pass 2: remaining unprinted images without a scanned parent */
+ for (i = 0; i < scan.cur; i++) {
+ image = scan.images[i];
+
+ if (!image->name || image->parent_image)
+ continue;
+
+ vhd_util_scan_pretty_print_tree(image, 0);
+ }
+
+ /*
+ * pass 3: anything still unprinted -- presumably images that are
+ * only reachable through a parent loop; TODO confirm
+ */
+ for (i = 0; i < scan.cur; i++) {
+ image = scan.images[i];
+
+ if (!image->name)
+ continue;
+
+ vhd_util_scan_pretty_print_tree(image, 0);
+ }
+}
+
+/*
+ * Report one scanned image.  In pretty mode an error-free image is queued
+ * in the scan table for later tree printing; in every other case, and when
+ * queueing fails, the image is printed immediately.
+ */
+static void
+vhd_util_scan_print_image(struct vhd_image *image)
+{
+	int err;
+
+	if (image->error || !(flags & VHD_SCAN_PRETTY))
+		goto print;
+
+	err = vhd_util_scan_pretty_add_image(image);
+	if (!err)
+		return;
+
+	/* queueing failed: turn it into a scan error and print it now */
+	if (!image->error) {
+		image->error   = err;
+		image->message = "allocating memory";
+	}
+
+print:
+	vhd_util_scan_print_image_indent(image, 0);
+}
+
+/*
+ * Report that scanning @file failed with @err by printing a placeholder
+ * image carrying the error.  Always returns @err; the VHD_SCAN_NOFAIL
+ * handling below is disabled.
+ */
+static int
+vhd_util_scan_error(const char *file, int err)
+{
+ struct vhd_image image;
+
+ memset(&image, 0, sizeof(image));
+ image.name = (char *)file;
+ image.error = err;
+ image.message = "failure scanning target";
+
+ vhd_util_scan_print_image(&image);
+
+ /*
+ if (flags & VHD_SCAN_NOFAIL)
+ return 0;
+ */
+
+ return err;
+}
+
+/*
+ * Pick the parent locator to read the parent name from.
+ *
+ * Preference order: a PLAT_CODE_MACX locator wins outright; otherwise the
+ * last PLAT_CODE_W2RU locator; otherwise the first locator with any code
+ * other than PLAT_CODE_NONE.  Returns NULL if every slot is empty.
+ */
+static vhd_parent_locator_t *
+vhd_util_scan_get_parent_locator(vhd_context_t *vhd)
+{
+	int i, n;
+	vhd_parent_locator_t *loc;
+
+	loc = NULL;
+
+	/* derive the slot count from the header instead of hard-coding it */
+	n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+	for (i = 0; i < n; i++) {
+		if (vhd->header.loc[i].code == PLAT_CODE_MACX) {
+			loc = vhd->header.loc + i;
+			break;
+		}
+
+		if (vhd->header.loc[i].code == PLAT_CODE_W2RU)
+			loc = vhd->header.loc + i;
+
+		if (!loc && vhd->header.loc[i].code != PLAT_CODE_NONE)
+			loc = vhd->header.loc + i;
+	}
+
+	return loc;
+}
+
+/*
+ * Copy @src into @dst, which must hold VHD_MAX_NAME_LEN bytes.  Returns 0
+ * if the whole string fit, -ENAMETOOLONG if it had to be truncated.
+ */
+static inline int
+copy_name(char *dst, const char *src)
+{
+	int len = snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+
+	if (len >= VHD_MAX_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	return 0;
+}
+
+/*
+ * LVHD stores realpath(parent) in parent locators, so
+ * /dev/<vol-group>/<lv-name> becomes /dev/mapper/<vol--group>-<lv--name>
+ *
+ * Recover <lv-name> from such a path.  The last path component of @src is
+ * decoded (single dash -> '/', double dash -> '-') and the part after the
+ * resulting separator is stored in @dst, which must hold VHD_MAX_NAME_LEN
+ * bytes.
+ *
+ * Returns 0 on success.  If the component contains no separator dash the
+ * format is unrecognized: @src is copied to @dst (truncated if needed) and
+ * -EINVAL is returned.  If the component does not fit the work buffer,
+ * @src is likewise copied (truncated) and -ENAMETOOLONG is returned.
+ */
+static int
+vhd_util_scan_extract_volume_name(char *dst, const char *src)
+{
+	char copy[VHD_MAX_NAME_LEN], *c;
+	const char *name, *s;
+
+	name = strrchr(src, '/');
+	if (!name)
+		name = src;
+
+	/* decoding never lengthens the string, so this bounds 'copy' */
+	if (strlen(name) >= sizeof(copy)) {
+		snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+		return -ENAMETOOLONG;
+	}
+
+	/* convert single dashes to slashes, double dashes to single dashes */
+	for (c = copy, s = name; *s != '\0'; s++, c++) {
+		if (*s == '-') {
+			if (s[1] != '-')
+				*c = '/';
+			else {
+				s++;
+				*c = '-';
+			}
+		} else
+			*c = *s;
+	}
+
+	*c = '\0';
+	c = strrchr(copy, '/');
+
+	/*
+	 * no slash at all, or only the leading one carried over from @src,
+	 * means no separator dash was found
+	 */
+	if (!c || c == copy) {
+		/* unrecognized format */
+		snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+		return -EINVAL;
+	}
+
+	/* the tail of 'copy' is shorter than VHD_MAX_NAME_LEN */
+	strcpy(dst, c + 1);
+	return 0;
+}
+
+/*
+ * Determine the parent name of a vhd that lives on a volume and store it
+ * in image->parent.
+ *
+ * In fast mode the name is first decoded from the header; if that fails,
+ * or in normal mode, it is read from a parent locator whose data offset is
+ * adjusted by the volume's start offset on the device.  The result is then
+ * reduced to the bare lv name when it has the expected mapper format;
+ * otherwise it is left as read.
+ */
+static int
+vhd_util_scan_get_volume_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+ int err;
+ char name[VHD_MAX_NAME_LEN];
+ vhd_parent_locator_t *loc, copy;
+
+ if (flags & VHD_SCAN_FAST) {
+ err = vhd_header_decode_parent(vhd,
+ &vhd->header, &image->parent);
+ if (!err)
+ goto found;
+ }
+
+ loc = vhd_util_scan_get_parent_locator(vhd);
+ if (!loc)
+ return -EINVAL;
+
+ /* work on a copy so the header's locator is not modified */
+ copy = *loc;
+ copy.data_offset += image->target->start;
+ err = vhd_parent_locator_read(vhd, &copy, &image->parent);
+ if (err)
+ return err;
+
+found:
+ err = vhd_util_scan_extract_volume_name(name, image->parent);
+ if (!err)
+ /* overwrites the parent string in place with the lv name */
+ return copy_name(image->parent, name);
+
+ /* unrecognized format: keep the parent string as it was read */
+ return 0;
+}
+
+/*
+ * Fill in image->parent for @image.
+ *
+ * Raw targets have no parent.  Vhds on volumes are handled by
+ * vhd_util_scan_get_volume_parent().  For vhd files the name comes from
+ * the header (fast mode) or from vhd_parent_locator_get(); if that fails,
+ * the raw string of the preferred parent locator is read instead.
+ */
+static int
+vhd_util_scan_get_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+	int err;
+	vhd_parent_locator_t *loc;
+	uint8_t type = image->target->type;
+
+	if (!target_vhd(type)) {
+		image->parent = NULL;
+		return 0;
+	}
+
+	if (target_volume(type))
+		return vhd_util_scan_get_volume_parent(vhd, image);
+
+	/*
+	 * vhd_parent_locator_get checks for the existence of the parent
+	 * file.  if the lookup succeeds, all is well; if not, we'll try
+	 * to return whatever string we have before failing outright.
+	 */
+	if (flags & VHD_SCAN_FAST)
+		err = vhd_header_decode_parent(vhd,
+					       &vhd->header, &image->parent);
+	else
+		err = vhd_parent_locator_get(vhd, &image->parent);
+
+	if (!err)
+		return 0;
+
+	loc = vhd_util_scan_get_parent_locator(vhd);
+	if (!loc)
+		return -EINVAL;
+
+	return vhd_parent_locator_read(vhd, loc, &image->parent);
+}
+
+/*
+ * Set image->hidden: taken from the vhd for vhd targets, always 1 for raw
+ * targets.  Returns the vhd_hidden() error, if any.
+ */
+static int
+vhd_util_scan_get_hidden(vhd_context_t *vhd, struct vhd_image *image)
+{
+	int hidden = 1;
+
+	if (target_vhd(image->target->type)) {
+		int err;
+
+		hidden = 0;
+		err = vhd_hidden(vhd, &hidden);
+		if (err)
+			return err;
+	}
+
+	image->hidden = hidden;
+	return 0;
+}
+
+/*
+ * Set image->size to the target's size and image->capacity to the vhd's
+ * current size, or to the same target size for raw targets.  Always
+ * returns 0.
+ */
+static int
+vhd_util_scan_get_size(vhd_context_t *vhd, struct vhd_image *image)
+{
+	image->size = image->target->size;
+
+	if (!target_vhd(image->target->type))
+		image->capacity = image->size;
+	else
+		image->capacity = vhd->footer.curr_size;
+
+	return 0;
+}
+
+/*
+ * Open the vhd file named by image->name read-only, ignoring the disabled
+ * flag and, in fast mode, with VHD_OPEN_FAST.  Raw targets are not opened.
+ * On failure the error is recorded in @image and returned.
+ */
+static int
+vhd_util_scan_open_file(vhd_context_t *vhd, struct vhd_image *image)
+{
+	int err;
+	int oflags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED;
+
+	if (!target_vhd(image->target->type))
+		return 0;
+
+	if (flags & VHD_SCAN_FAST)
+		oflags |= VHD_OPEN_FAST;
+
+	err = vhd_open(vhd, image->name, oflags);
+	if (!err)
+		return 0;
+
+	vhd->file      = NULL;
+	image->message = "opening file";
+	image->error   = err;
+
+	return err;
+}
+
+/*
+ * Read and validate the vhd footer (and, for dynamic vhds, the header) of
+ * a vhd stored on a volume, starting at target->start on the device.
+ *
+ * Footer and header are fetched with a single read.  If the footer says
+ * the header does not directly follow it, the header is read separately
+ * from footer.data_offset + target->start.  For dynamic vhds, vhd->spb and
+ * vhd->bm_secs are derived from the header's block size.
+ *
+ * Any failure is recorded in image->message / image->error.  Returns
+ * image->error.
+ * NOTE(review): the success path relies on image->error already being 0 on
+ * entry -- confirm against the caller.
+ */
+static int
+vhd_util_scan_read_volume_headers(vhd_context_t *vhd, struct vhd_image *image)
+{
+ int err;
+ char *buf;
+ size_t size;
+ struct target *target;
+
+ buf = NULL;
+ target = image->target;
+ size = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+
+ /* sector-aligned buffer for the read */
+ err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+ if (err) {
+ buf = NULL;
+ image->message = "allocating image";
+ image->error = -err;
+ goto out;
+ }
+
+ err = vhd_seek(vhd, target->start, SEEK_SET);
+ if (err) {
+ image->message = "seeking to headers";
+ image->error = err;
+ goto out;
+ }
+
+ err = vhd_read(vhd, buf, size);
+ if (err) {
+ image->message = "reading headers";
+ image->error = err;
+ goto out;
+ }
+
+ memcpy(&vhd->footer, buf, sizeof(vhd_footer_t));
+ vhd_footer_in(&vhd->footer);
+ err = vhd_validate_footer(&vhd->footer);
+ if (err) {
+ image->message = "invalid footer";
+ image->error = err;
+ goto out;
+ }
+
+ /* lvhd vhds should always be dynamic */
+ if (vhd_type_dynamic(vhd)) {
+ /* header elsewhere: read it from its volume-relative offset */
+ if (vhd->footer.data_offset != sizeof(vhd_footer_t))
+ err = vhd_read_header_at(vhd, &vhd->header,
+ vhd->footer.data_offset +
+ target->start);
+ else {
+ /* header directly follows the footer: already in buf */
+ memcpy(&vhd->header,
+ buf + sizeof(vhd_footer_t),
+ sizeof(vhd_header_t));
+ vhd_header_in(&vhd->header);
+ err = vhd_validate_header(&vhd->header);
+ }
+
+ if (err) {
+ image->message = "reading header";
+ image->error = err;
+ goto out;
+ }
+
+ vhd->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT;
+ vhd->bm_secs = secs_round_up_no_zero(vhd->spb >> 3);
+ }
+
+out:
+ free(buf);
+ return image->error;
+}
+
+/*
+ * Set up @vhd for a target that lives on a volume: zero the context, open
+ * the target's device read-only with O_DIRECT, and for vhd targets read
+ * the vhd headers.  Targets spanning less than 4096 bytes are rejected.
+ *
+ * Failures are recorded in image->message / image->error and returned.
+ * NOTE(review): if reading the headers fails, vhd->fd and vhd->file are
+ * left set -- presumably the caller cleans up; confirm.
+ */
+static int
+vhd_util_scan_open_volume(vhd_context_t *vhd, struct vhd_image *image)
+{
+ int err;
+ struct target *target;
+
+ target = image->target;
+ memset(vhd, 0, sizeof(*vhd));
+ vhd->oflags = VHD_OPEN_RDONLY | VHD_OPEN_FAST;
+
+ if (target->end - target->start < 4096) {
+ image->message = "device too small";
+ image->error = -EINVAL;
+ return image->error;
+ }
+
+ vhd->file = strdup(image->name);
+ if (!vhd->file) {
+ image->message = "allocating device";
+ image->error = -ENOMEM;
+ return image->error;
+ }
+
+ /* the device is opened, not the lv name */
+ vhd->fd = open(target->device, O_RDONLY | O_DIRECT | O_LARGEFILE);
+ if (vhd->fd == -1) {
+ free(vhd->file);
+ vhd->file = NULL;
+
+ image->message = "opening device";
+ image->error = -errno;
+ return image->error;
+ }
+
+ if (target_vhd(target->type))
+ return vhd_util_scan_read_volume_headers(vhd, image);
+
+ return 0;
+}
+
+/*
+ * Choose the name to report for @image and open its target.
+ *
+ * Volumes, and files outside pretty mode, are reported under the target's
+ * own name.  Files in pretty mode are reported under their resolved
+ * realpath(), which is heap-allocated (presumably released by the caller
+ * -- TODO confirm).  The target is then opened as a volume or as a file.
+ */
+static int
+vhd_util_scan_open(vhd_context_t *vhd, struct vhd_image *image)
+{
+ struct target *target;
+
+ target = image->target;
+
+ if (target_volume(image->target->type) || !(flags & VHD_SCAN_PRETTY))
+ image->name = target->name;
+ else {
+ image->name = realpath(target->name, NULL);
+ if (!image->name) {
+ /* fall back to the unresolved name for the error report */
+ image->name = target->name;
+ image->message = "resolving name";
+ image->error = -errno;
+ return image->error;
+ }
+ }
+
+ if (target_volume(target->type))
+ return vhd_util_scan_open_volume(vhd, image);
+ else
+ return vhd_util_scan_open_file(vhd, image);
+}
+
+/*
+ * Fill in @target for a plain file.
+ *
+ * The file is stat()ed; its path becomes both the target name and the
+ * target device, and the extent covers [0, st_size).
+ *
+ * Returns 0 on success, -errno if stat() fails, or the copy_name() error.
+ */
+static int
+vhd_util_scan_init_file_target(struct target *target,
+			       const char *file, uint8_t type)
+{
+	struct stat st;
+	int rc;
+
+	if (stat(file, &st) == -1)
+		return -errno;
+
+	rc = copy_name(target->name, file);
+	if (!rc)
+		rc = copy_name(target->device, file);
+	if (rc)
+		return rc;
+
+	target->start = 0;
+	target->end   = st.st_size;
+	target->size  = st.st_size;
+	target->type  = type;
+
+	return 0;
+}
+
+/*
+ * Fill in @target from a logical volume description.
+ *
+ * Only volumes whose first segment is linear are supported (-ENOSYS
+ * otherwise).  The target is named after the volume, uses the first
+ * segment's device, and spans pe_start .. pe_start + pe_size.
+ *
+ * Returns 0 on success or the copy_name() error.
+ */
+static int
+vhd_util_scan_init_volume_target(struct target *target,
+				 struct lv *lv, uint8_t type)
+{
+	int rc;
+
+	if (lv->first_segment.type != LVM_SEG_TYPE_LINEAR)
+		return -ENOSYS;
+
+	rc = copy_name(target->name, lv->name);
+	if (!rc)
+		rc = copy_name(target->device, lv->first_segment.device);
+	if (rc)
+		return rc;
+
+	target->type  = type;
+	target->size  = lv->size;
+	target->start = lv->first_segment.pe_start;
+	target->end   = lv->first_segment.pe_start + lv->first_segment.pe_size;
+
+	return 0;
+}
+
+/*
+ * Initialise @itr with a private copy of the @cnt entries in @targets.
+ *
+ * Returns 0 on success or -ENOMEM, in which case @itr is left zeroed.
+ */
+static int
+iterator_init(struct iterator *itr, int cnt, struct target *targets)
+{
+	struct target *copy;
+
+	memset(itr, 0, sizeof(*itr));
+
+	copy = malloc(sizeof(struct target) * cnt);
+	if (!copy)
+		return -ENOMEM;
+
+	memcpy(copy, targets, sizeof(struct target) * cnt);
+
+	itr->targets  = copy;
+	itr->cur      = 0;
+	itr->max_size = cnt;
+	itr->cur_size = cnt;
+
+	return 0;
+}
+
+/*
+ * Return the next unvisited target and advance the cursor, or NULL once
+ * every queued target has been handed out.  The pointer refers to the
+ * iterator's own array.
+ */
+static struct target *
+iterator_next(struct iterator *itr)
+{
+	struct target *next;
+
+	if (itr->cur == itr->cur_size)
+		return NULL;
+
+	next = &itr->targets[itr->cur];
+	itr->cur++;
+
+	return next;
+}
+
+/*
+ * Initialise @target as a file target for @parent unless a target with
+ * the same basename is already queued in @itr.
+ *
+ * Returns -EEXIST for a duplicate, otherwise the result of
+ * vhd_util_scan_init_file_target().
+ */
+static int
+iterator_add_file(struct iterator *itr,
+ struct target *target, const char *parent, uint8_t type)
+{
+ int i;
+ struct target *t;
+ char *lname, *rname;
+
+ for (i = 0; i < itr->cur_size; i++) {
+ t = itr->targets + i;
+ /* NOTE(review): the casts drop const; POSIX basename() is allowed to
+ * modify its argument -- confirm which basename() this file builds with */
+ lname = basename((char *)t->name);
+ rname = basename((char *)parent);
+
+ /* duplicates are detected by final path component only */
+ if (!strcmp(lname, rname))
+ return -EEXIST;
+ }
+
+ return vhd_util_scan_init_file_target(target, parent, type);
+}
+
+/*
+ * Initialise @target as a volume target for @parent unless a target of
+ * that name is already queued in @itr.
+ *
+ * @parent is used as an fnmatch() pattern against the logical volumes of
+ * the scanned volume group; the first matching volume is used.
+ *
+ * Returns -EEXIST for a duplicate, -ENOENT if no volume matches, the
+ * fnmatch() error code if matching itself fails, otherwise the result of
+ * vhd_util_scan_init_volume_target().
+ */
+static int
+iterator_add_volume(struct iterator *itr,
+		    struct target *target, const char *parent, uint8_t type)
+{
+	int i, err;
+	struct lv *lv;
+
+	lv = NULL;
+
+	for (i = 0; i < itr->cur_size; i++)
+		if (!strcmp(parent, itr->targets[i].name))
+			return -EEXIST;
+
+	for (i = 0; i < vg.lv_cnt; i++) {
+		err = fnmatch(parent, vg.lvs[i].name, FNM_PATHNAME);
+		if (!err) {
+			lv = vg.lvs + i;
+			break;
+		}
+
+		/*
+		 * Compare against the FNM_NOMATCH return code; the original
+		 * tested the FNM_PATHNAME flag, which only works where the
+		 * two constants happen to share a value.
+		 */
+		if (err != FNM_NOMATCH)
+			return err;
+	}
+
+	if (!lv)
+		return -ENOENT;
+
+	return vhd_util_scan_init_volume_target(target, lv, type);
+}
+
+/*
+ * Append a target for @parent to @itr, doubling the array when full.
+ *
+ * A parent that is already queued (-EEXIST from the helpers) is not an
+ * error: nothing is added and 0 is returned.  On any failure the unused
+ * slot is zeroed again.
+ *
+ * Returns 0, -ENOMEM, or the error from the add helper.
+ */
+static int
+iterator_add(struct iterator *itr, const char *parent, uint8_t type)
+{
+	struct target *slot;
+	int rc;
+
+	if (itr->cur_size == itr->max_size) {
+		struct target *bigger;
+
+		bigger = realloc(itr->targets,
+				 sizeof(struct target) *
+				 itr->max_size * 2);
+		if (!bigger)
+			return -ENOMEM;
+
+		itr->targets   = bigger;
+		itr->max_size *= 2;
+	}
+
+	slot = &itr->targets[itr->cur_size];
+
+	if (!target_volume(type))
+		rc = iterator_add_file(itr, slot, parent, type);
+	else
+		rc = iterator_add_volume(itr, slot, parent, type);
+
+	if (!rc) {
+		itr->cur_size++;
+		return 0;
+	}
+
+	memset(slot, 0, sizeof(*slot));
+
+	if (rc == -EEXIST)
+		return 0;
+
+	return rc;
+}
+
+/*
+ * Release the iterator's target array and reset @itr to all zeroes.
+ */
+static void
+iterator_free(struct iterator *itr)
+{
+	struct target *owned = itr->targets;
+
+	memset(itr, 0, sizeof(*itr));
+	free(owned);
+}
+
+/*
+ * Queue image->parent for scanning.
+ *
+ * The parent is given the same storage kind (volume or file) as the
+ * child's target and is typed raw or VHD according to
+ * vhd_parent_raw().  Failures are reported through
+ * vhd_util_scan_error() and otherwise ignored.
+ */
+static void
+vhd_util_scan_add_parent(struct iterator *itr,
+			 vhd_context_t *vhd, struct vhd_image *image)
+{
+	uint8_t type;
+	int rc;
+
+	if (!vhd_parent_raw(vhd)) {
+		if (target_volume(image->target->type))
+			type = VHD_TYPE_VHD_VOLUME;
+		else
+			type = VHD_TYPE_VHD_FILE;
+	} else {
+		if (target_volume(image->target->type))
+			type = VHD_TYPE_RAW_VOLUME;
+		else
+			type = VHD_TYPE_RAW_FILE;
+	}
+
+	rc = iterator_add(itr, image->parent, type);
+	if (!rc)
+		return;
+
+	vhd_util_scan_error(image->parent, rc);
+}
+
+/*
+ * Scan @cnt targets and print one record per image.
+ *
+ * The targets are copied into a private iterator so that, when
+ * VHD_SCAN_PARENTS is set, parents discovered along the way can be
+ * appended and scanned in turn.
+ *
+ * Without VHD_SCAN_NOFAIL the scan stops at the first failing target and
+ * that error is returned.  With VHD_SCAN_NOFAIL every target is visited
+ * and the result is -EAGAIN if any of them failed, 0 otherwise.
+ */
+static int
+vhd_util_scan_targets(int cnt, struct target *targets)
+{
+	int ret, err;
+	vhd_context_t vhd;
+	struct iterator itr;
+	struct target *target;
+	struct vhd_image image;
+
+	ret = 0;
+	err = 0;
+
+	err = iterator_init(&itr, cnt, targets);
+	if (err)
+		return err;
+
+	while ((target = iterator_next(&itr))) {
+		memset(&vhd, 0, sizeof(vhd));
+		memset(&image, 0, sizeof(image));
+
+		image.target = target;
+
+		err = vhd_util_scan_open(&vhd, &image);
+		if (err) {
+			ret = -EAGAIN;
+			goto end;
+		}
+
+		err = vhd_util_scan_get_size(&vhd, &image);
+		if (err) {
+			ret           = -EAGAIN;
+			image.message = "getting physical size";
+			image.error   = err;
+			goto end;
+		}
+
+		err = vhd_util_scan_get_hidden(&vhd, &image);
+		if (err) {
+			ret           = -EAGAIN;
+			image.message = "checking 'hidden' field";
+			image.error   = err;
+			goto end;
+		}
+
+		if (vhd.footer.type == HD_TYPE_DIFF) {
+			err = vhd_util_scan_get_parent(&vhd, &image);
+			if (err) {
+				ret           = -EAGAIN;
+				image.message = "getting parent";
+				image.error   = err;
+				goto end;
+			}
+		}
+
+	end:
+		vhd_util_scan_print_image(&image);
+
+		/*
+		 * Release a realpath()-allocated name while @target is still
+		 * valid: queueing the parent below may realloc() the
+		 * iterator's array and leave @target dangling.
+		 */
+		if (image.name != target->name)
+			free(image.name);
+		image.name = NULL;
+
+		if (flags & VHD_SCAN_PARENTS && image.parent)
+			vhd_util_scan_add_parent(&itr, &vhd, &image);
+
+		/* @target must not be dereferenced past this point */
+
+		if (vhd.file)
+			vhd_close(&vhd);
+		free(image.parent);
+
+		if (err && !(flags & VHD_SCAN_NOFAIL))
+			break;
+	}
+
+	iterator_free(&itr);
+
+	if (flags & VHD_SCAN_NOFAIL)
+		return ret;
+
+	return err;
+}
+
+/*
+ * Pretty-printing front end for vhd_util_scan_targets(): allocate the
+ * pretty-print list, run the scan, then print and free the list.
+ *
+ * Returns -ENOMEM if the list cannot be allocated; otherwise 0 when
+ * VHD_SCAN_NOFAIL is set, else the scan result.
+ */
+static int
+vhd_util_scan_targets_pretty(int cnt, struct target *targets)
+{
+	int rc;
+
+	if (vhd_util_scan_pretty_allocate_list(cnt)) {
+		printf("scan failed: no memory\n");
+		return -ENOMEM;
+	}
+
+	rc = vhd_util_scan_targets(cnt, targets);
+
+	vhd_util_scan_pretty_print_images();
+	vhd_util_scan_pretty_free_list();
+
+	if (flags & VHD_SCAN_NOFAIL)
+		return 0;
+
+	return rc;
+}
+
+/*
+ * Build the list of file targets to scan.
+ *
+ * The list holds the files matching the glob pattern @filter (if any)
+ * followed by the @cnt explicitly named files in @names.  GLOB_NOSORT is
+ * used when VHD_SCAN_FAST is set.
+ *
+ * On success *_targets receives a calloc()ed array owned by the caller
+ * and *_total its length.  With VHD_SCAN_NOFAIL set, a file that cannot
+ * be initialised is reported and its slot left zeroed.
+ *
+ * Returns 0 on success, or an error with *_targets NULL and *_total 0.
+ */
+static int
+vhd_util_scan_find_file_targets(int cnt, char **names,
+				const char *filter,
+				struct target **_targets, int *_total)
+{
+	glob_t g;
+	struct target *targets;
+	int i, globs, err, total;
+
+	total     = cnt;
+	globs     = 0;
+	targets   = NULL;
+	*_total   = 0;
+	*_targets = NULL;
+
+	memset(&g, 0, sizeof(g));
+
+	if (filter) {
+		int gflags = ((flags & VHD_SCAN_FAST) ? GLOB_NOSORT : 0);
+
+		errno = 0;
+		err   = glob(filter, gflags, vhd_util_scan_error, &g);
+
+		switch (err) {
+		case GLOB_NOSPACE:
+			err = -ENOMEM;
+			break;
+		case GLOB_ABORTED:
+			err = -EIO;
+			break;
+		case GLOB_NOMATCH:
+			/* no match is only an error if errno says so */
+			err = -errno;
+			break;
+		}
+
+		if (err) {
+			vhd_util_scan_error(filter, err);
+			/* glob() may hold partial results: release them */
+			goto out;
+		}
+
+		globs  = g.gl_pathc;
+		total += globs;
+	}
+
+	targets = calloc(total, sizeof(struct target));
+	if (!targets) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < globs; i++) {
+		err = vhd_util_scan_init_file_target(targets + i,
+						     g.gl_pathv[i],
+						     VHD_TYPE_VHD_FILE);
+		if (err) {
+			vhd_util_scan_error(g.gl_pathv[i], err);
+			if (!(flags & VHD_SCAN_NOFAIL))
+				goto out;
+		}
+	}
+
+	for (i = 0; i + globs < total; i++) {
+		err = vhd_util_scan_init_file_target(targets + i + globs,
+						     names[i],
+						     VHD_TYPE_VHD_FILE);
+		if (err) {
+			vhd_util_scan_error(names[i], err);
+			if (!(flags & VHD_SCAN_NOFAIL))
+				goto out;
+		}
+	}
+
+	err       = 0;
+	*_total   = total;
+	*_targets = targets;
+
+out:
+	if (err)
+		free(targets);
+	if (filter)
+		globfree(&g);
+
+	return err;
+}
+
+/*
+ * Exchange the entries at indices @dst and @src of @lvs; a no-op when
+ * the two indices are equal.
+ */
+static inline void
+swap_volume(struct lv *lvs, int dst, int src)
+{
+	struct lv tmp;
+
+	if (dst == src)
+		return;
+
+	tmp      = lvs[dst];
+	lvs[dst] = lvs[src];
+	lvs[src] = tmp;
+}
+
+/*
+ * Move every volume whose name matches @filter to the front of @lvs.
+ *
+ * *_matches receives the number of matching volumes (0 when @filter is
+ * NULL).  An fnmatch() failure is reported; it aborts the pass and is
+ * returned unless VHD_SCAN_NOFAIL is set.
+ */
+static int
+vhd_util_scan_sort_volumes(struct lv *lvs, int cnt,
+			   const char *filter, int *_matches)
+{
+	int idx, rc, found;
+
+	found     = 0;
+	*_matches = 0;
+
+	if (!filter)
+		return 0;
+
+	for (idx = 0; idx < cnt; idx++) {
+		rc = fnmatch(filter, lvs[idx].name, FNM_PATHNAME);
+
+		if (!rc) {
+			swap_volume(lvs, found, idx);
+			found++;
+			continue;
+		}
+
+		if (rc == FNM_NOMATCH)
+			continue;
+
+		vhd_util_scan_error(lvs[idx].name, rc);
+		if (!(flags & VHD_SCAN_NOFAIL))
+			return rc;
+	}
+
+	*_matches = found;
+	return 0;
+}
+
+/*
+ * Build the list of volume targets to scan from volume group @volume.
+ *
+ * The group is scanned into the file-level vg, then its volumes are
+ * partitioned so that those matching @filter come first, followed by
+ * those matching each pattern in @names.  One target is created per
+ * selected volume.
+ *
+ * On success *_targets receives a calloc()ed array owned by the caller,
+ * *_total its length, and vg stays populated for the scan that follows.
+ * On failure the volume group is released again and *_targets is NULL.
+ */
+static int
+vhd_util_scan_find_volume_targets(int cnt, char **names,
+				  const char *volume, const char *filter,
+				  struct target **_targets, int *_total)
+{
+	struct target *targets;
+	int i, err, total, matches;
+
+	*_total   = 0;
+	*_targets = NULL;
+	targets   = NULL;
+
+	err = lvm_scan_vg(volume, &vg);
+	if (err)
+		return err;
+
+	err = vhd_util_scan_sort_volumes(vg.lvs, vg.lv_cnt,
+					 filter, &matches);
+	if (err)
+		goto out;
+
+	total = matches;
+	for (i = 0; i < cnt; i++) {
+		/* only the not-yet-selected tail is searched again */
+		err = vhd_util_scan_sort_volumes(vg.lvs + total,
+						 vg.lv_cnt - total,
+						 names[i], &matches);
+		if (err)
+			goto out;
+
+		total += matches;
+	}
+
+	targets = calloc(total, sizeof(struct target));
+	if (!targets) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < total; i++) {
+		err = vhd_util_scan_init_volume_target(targets + i,
+						       vg.lvs + i,
+						       VHD_TYPE_VHD_VOLUME);
+		if (err) {
+			vhd_util_scan_error(vg.lvs[i].name, err);
+			if (!(flags & VHD_SCAN_NOFAIL))
+				goto out;
+		}
+	}
+
+	err       = 0;
+	*_total   = total;
+	*_targets = targets;
+
+out:
+	if (err) {
+		free(targets);
+		/* the caller bails out on error without freeing vg */
+		lvm_free_vg(&vg);
+	}
+	return err;
+}
+
+/*
+ * Dispatch target discovery: volume targets when VHD_SCAN_VOLUME is
+ * set, file targets otherwise.
+ */
+static int
+vhd_util_scan_find_targets(int cnt, char **names,
+			   const char *volume, const char *filter,
+			   struct target **targets, int *total)
+{
+	if (!(flags & VHD_SCAN_VOLUME))
+		return vhd_util_scan_find_file_targets(cnt, names,
+						       filter, targets, total);
+
+	return vhd_util_scan_find_volume_targets(cnt, names,
+						 volume, filter,
+						 targets, total);
+}
+
+/*
+ * Entry point of the "scan" command.
+ *
+ * Options:
+ *   -m FILTER  match filter      -f  fast (unsorted glob)
+ *   -c         continue on failure
+ *   -l VOLUME  scan an LVM volume group
+ *   -p         pretty print (disables -f)
+ *   -a         also scan parents -v  verbose
+ *   -h         help
+ * Remaining arguments name the files/volumes to scan; at least one of
+ * them or a filter is required.
+ *
+ * Returns 0 on success (always 0 after target discovery when -c is
+ * given), -EINVAL on bad usage, otherwise the first error encountered.
+ */
+int
+vhd_util_scan(int argc, char **argv)
+{
+	int c, ret, err, cnt;
+	char *filter, *volume;
+	struct target *targets;
+
+	cnt     = 0;
+	ret     = 0;
+	err     = 0;
+	flags   = 0;
+	filter  = NULL;
+	volume  = NULL;
+	targets = NULL;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "m:fcl:pavh")) != -1) {
+		switch (c) {
+		case 'm':
+			filter = optarg;
+			break;
+		case 'f':
+			flags |= VHD_SCAN_FAST;
+			break;
+		case 'c':
+			flags |= VHD_SCAN_NOFAIL;
+			break;
+		case 'l':
+			volume = optarg;
+			flags |= VHD_SCAN_VOLUME;
+			break;
+		case 'p':
+			flags |= VHD_SCAN_PRETTY;
+			break;
+		case 'a':
+			flags |= VHD_SCAN_PARENTS;
+			break;
+		case 'v':
+			flags |= VHD_SCAN_VERBOSE;
+			break;
+		case 'h':
+			goto usage;
+		default:
+			err = -EINVAL;
+			goto usage;
+		}
+	}
+
+	if (!filter && argc - optind == 0) {
+		err = -EINVAL;
+		goto usage;
+	}
+
+	if (flags & VHD_SCAN_PRETTY)
+		flags &= ~VHD_SCAN_FAST;
+
+	err = vhd_util_scan_find_targets(argc - optind, argv + optind,
+					 volume, filter, &targets, &cnt);
+	if (err) {
+		printf("scan failed: %d\n", err);
+		return err;
+	}
+
+	/*
+	 * Nothing to scan is not an error, but the (possibly empty) target
+	 * array and the volume group must still be released below.
+	 */
+	if (cnt) {
+		if (flags & VHD_SCAN_PRETTY)
+			err = vhd_util_scan_targets_pretty(cnt, targets);
+		else
+			err = vhd_util_scan_targets(cnt, targets);
+	}
+
+	free(targets);
+	lvm_free_vg(&vg);
+
+	return ((flags & VHD_SCAN_NOFAIL) ? 0 : err);
+
+usage:
+	printf("usage: [OPTIONS] FILES\n"
+	       "options: [-m match filter] [-f fast] [-c continue on failure] "
+	       "[-l LVM volume] [-p pretty print] [-a scan parents] "
+	       "[-v verbose] [-h help]\n");
+	return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-set-field.c b/tools/blktap2/vhd/lib/vhd-util-set-field.c
new file mode 100644
index 0000000000..ac185735d9
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-set-field.c
@@ -0,0 +1,106 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "set" command: set a footer field of a VHD.
+ *
+ * Options: -n NAME (VHD to modify), -f FIELD (only "hidden" is
+ * accepted), -v VALUE (decimal integer, 0..255), -h help.
+ * All of -n, -f and -v are required.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error from opening
+ * the VHD or writing its footer.
+ */
+int
+vhd_util_set_field(int argc, char **argv)
+{
+	long value;
+	int err, c;
+	vhd_context_t vhd;
+	char *name, *field, *end;
+
+	err   = -EINVAL;	/* cleared only once a valid -v was seen */
+	value = 0;
+	name  = NULL;
+	field = NULL;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:f:v:h")) != -1) {
+		switch (c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 'f':
+			field = optarg;
+			break;
+		case 'v':
+			/* reject empty, non-numeric and trailing-garbage input */
+			errno = 0;
+			value = strtol(optarg, &end, 10);
+			if (errno || end == optarg || *end != '\0') {
+				printf("invalid value %s\n", optarg);
+				goto usage;
+			}
+			err = 0;
+			break;
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (!name || !field || optind != argc || err)
+		goto usage;
+
+	if (strnlen(field, 25) >= 25) {
+		printf("invalid field\n");
+		goto usage;
+	}
+
+	if (strcmp(field, "hidden")) {
+		printf("invalid field %s\n", field);
+		goto usage;
+	}
+
+	if (value < 0 || value > 255) {
+		printf("invalid value %ld\n", value);
+		goto usage;
+	}
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	vhd.footer.hidden = (char)value;
+
+	err = vhd_write_footer(&vhd, &vhd.footer);
+
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("options: <-n name> <-f field> <-v value> [-h help]\n");
+	return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-snapshot.c b/tools/blktap2/vhd/lib/vhd-util-snapshot.c
new file mode 100644
index 0000000000..75960f96ea
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-snapshot.c
@@ -0,0 +1,216 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Find the image a new snapshot of @name should actually be based on.
+ *
+ * Starting at @name, differencing images whose BAT has no allocated
+ * block are skipped in favour of their parent.  The walk stops at the
+ * first image that is not a differencing disk or has an allocated
+ * block, or at a raw parent.
+ *
+ * On success *result receives a heap-allocated path owned by the caller
+ * and *parent_raw is 1 if that path is a raw parent, 0 otherwise.
+ *
+ * Returns 0 on success, -ENOMEM, or the libvhd error; *result is NULL on
+ * failure.
+ */
+static int
+vhd_util_find_snapshot_target(const char *name, char **result, int *parent_raw)
+{
+	int i, err;
+	char *target;
+	vhd_context_t vhd;
+
+	*parent_raw = 0;
+	*result     = NULL;
+
+	target = strdup(name);
+	if (!target)
+		return -ENOMEM;
+
+	for (;;) {
+		err = vhd_open(&vhd, target, VHD_OPEN_RDONLY);
+		if (err) {
+			/* nothing is open yet: just drop the candidate name */
+			free(target);
+			return err;
+		}
+
+		if (vhd.footer.type != HD_TYPE_DIFF)
+			goto out;
+
+		err = vhd_get_bat(&vhd);
+		if (err)
+			goto out;
+
+		for (i = 0; i < vhd.bat.entries; i++)
+			if (vhd.bat.bat[i] != DD_BLK_UNUSED)
+				goto out;
+
+		/*
+		 * Clear the pointer so the error path below cannot free it a
+		 * second time if the locator lookup fails without storing a
+		 * new name.
+		 */
+		free(target);
+		target = NULL;
+
+		err = vhd_parent_locator_get(&vhd, &target);
+		if (err)
+			goto out;
+
+		if (vhd_parent_raw(&vhd)) {
+			*parent_raw = 1;
+			goto out;
+		}
+
+		vhd_close(&vhd);
+	}
+
+out:
+	vhd_close(&vhd);
+	if (err)
+		free(target);
+	else
+		*result = target;
+
+	return err;
+}
+
+/*
+ * Open @name read-only and store its chain depth, as computed by
+ * vhd_chain_depth(), in *depth.  Returns 0 or the libvhd error.
+ */
+static int
+vhd_util_check_depth(const char *name, int *depth)
+{
+	vhd_context_t ctx;
+	int rc;
+
+	rc = vhd_open(&ctx, name, VHD_OPEN_RDONLY);
+	if (!rc) {
+		rc = vhd_chain_depth(&ctx, depth);
+		vhd_close(&ctx);
+	}
+
+	return rc;
+}
+
+/*
+ * Entry point of the "snapshot" command: create VHD <name> as a child of
+ * <parent name>.
+ *
+ * Options: -n name, -p parent name, -l snapshot depth limit,
+ * -m parent is raw, -h help.  -n and -p are required.
+ *
+ * Returns 0 on success (and for -h), -EINVAL on bad usage, -ENOSPC when
+ * the depth limit would be exceeded, or another negative error.
+ */
+int
+vhd_util_snapshot(int argc, char **argv)
+{
+ vhd_flag_creat_t flags;
+ int c, err, prt_raw, limit;
+ char *name, *pname, *ppath, *backing;
+ uint64_t size;
+ vhd_context_t vhd;
+
+ name = NULL;
+ pname = NULL;
+ ppath = NULL;
+ backing = NULL;
+ size = 0;
+ flags = 0;
+ limit = 0;
+
+ if (!argc || !argv) {
+ err = -EINVAL;
+ goto usage;
+ }
+
+ optind = 0;
+ while ((c = getopt(argc, argv, "n:p:l:mh")) != -1) {
+ switch (c) {
+ case 'n':
+ name = optarg;
+ break;
+ case 'p':
+ pname = optarg;
+ break;
+ case 'l':
+ /* NOTE(review): no validation; non-numeric input yields 0, i.e. no limit */
+ limit = strtol(optarg, NULL, 10);
+ break;
+ case 'm':
+ vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+ break;
+ case 'h':
+ err = 0;
+ goto usage;
+ default:
+ err = -EINVAL;
+ goto usage;
+ }
+ }
+
+ if (!name || !pname || optind != argc) {
+ err = -EINVAL;
+ goto usage;
+ }
+
+ /* ppath is heap-allocated by realpath() and freed at out: */
+ ppath = realpath(pname, NULL);
+ if (!ppath)
+ return -errno;
+
+ if (vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+ /* raw parent given explicitly: it is the backing file as-is */
+ backing = strdup(ppath);
+ if (!backing) {
+ err = -ENOMEM;
+ goto out;
+ }
+ } else {
+ /* the backing image may be an ancestor of ppath rather than ppath itself */
+ err = vhd_util_find_snapshot_target(ppath, &backing, &prt_raw);
+ if (err) {
+ backing = NULL;
+ goto out;
+ }
+
+ /*
+ * if the sizes of the parent chain are non-uniform, we need to
+ * pick the right size: that of the supplied parent
+ */
+ if (strcmp(ppath, backing)) {
+ err = vhd_open(&vhd, ppath, VHD_OPEN_RDONLY);
+ if (err)
+ goto out;
+ size = vhd.footer.curr_size;
+ vhd_close(&vhd);
+ }
+
+ if (prt_raw)
+ vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+ }
+
+ /* the depth limit is only checked when the backing image is not raw */
+ if (limit && !vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+ int depth;
+
+ err = vhd_util_check_depth(backing, &depth);
+ if (err)
+ printf("error checking snapshot depth: %d\n", err);
+ else if (depth + 1 > limit) {
+ err = -ENOSPC;
+ printf("snapshot depth exceeded: "
+ "current depth: %d, limit: %d\n", depth, limit);
+ }
+
+ if (err)
+ goto out;
+ }
+
+ /* size stays 0 unless it was read from the supplied parent above */
+ err = vhd_snapshot(name, size, backing, flags);
+
+out:
+ free(ppath);
+ free(backing);
+
+ return err;
+
+usage:
+ printf("options: <-n name> <-p parent name> [-l snapshot depth limit]"
+ " [-m parent_is_raw] [-h help]\n");
+ return err;
+}
diff --git a/tools/blktap2/vhd/vhd-update.c b/tools/blktap2/vhd/vhd-update.c
new file mode 100644
index 0000000000..fbc23cc7ae
--- /dev/null
+++ b/tools/blktap2/vhd/vhd-update.c
@@ -0,0 +1,261 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Before updating a VHD file, we create a journal consisting of:
+ * - all data at the beginning of the file, up to and including the BAT
+ * - each allocated bitmap (existing at the same offset in the journal as
+ * its corresponding bitmap in the original file)
+ * Updates are performed in place by writing appropriately
+ * transformed versions of journaled bitmaps to the original file.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <endian.h>
+#include <byteswap.h>
+
+#include "atomicio.h"
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * Print the vhd-update command line synopsis and exit with status
+ * EINVAL.  Does not return.
+ */
+static void
+usage(void)
+{
+	fputs("usage: vhd-update <-n name> [-j existing journal] [-h]\n",
+	      stdout);
+	exit(EINVAL);
+}
+
+/*
+ * update vhd creator version to reflect its new bitmap ordering
+ */
+static inline int
+update_creator_version(vhd_journal_t *journal)
+{
+	vhd_context_t *vhd = &journal->vhd;
+
+	/* stamp the footer with version 1.1 and write it back */
+	vhd->footer.crtr_ver = VHD_VERSION(1, 1);
+
+	return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * Add the metadata of every BAT entry to the journal, stopping at the
+ * first failure.  Returns 0 or the vhd_journal_add_block() error.
+ */
+static int
+journal_bitmaps(vhd_journal_t *journal)
+{
+	int blk, rc;
+
+	rc  = 0;
+	blk = 0;
+
+	while (!rc && blk < journal->vhd.bat.entries) {
+		rc = vhd_journal_add_block(journal, blk, VHD_JOURNAL_METADATA);
+		blk++;
+	}
+
+	return rc;
+}
+
+/*
+ * older VHD bitmaps were little endian
+ * and bits within a word were set from right to left
+ */
+static inline int
+old_test_bit(int nr, volatile void * addr)
+{
+	/* bits are numbered from the least significant bit of each word */
+	const size_t word_bits = sizeof(unsigned long) * 8;
+	unsigned long *words   = (unsigned long *)addr;
+	unsigned long word     = words[nr / word_bits];
+
+	return (word >> (nr % word_bits)) & 1;
+}
+
+/*
+ * new VHD bitmaps are big endian
+ * and bits within a word are set from left to right
+ */
+#define BIT_MASK 0x80
+static inline void
+new_set_bit (int nr, volatile char *addr)
+{
+	/* bit 0 is the most significant bit of byte 0 */
+	int byte  = nr >> 3;
+	int shift = nr & 7;
+
+	addr[byte] = addr[byte] | (BIT_MASK >> shift);
+}
+
+/*
+ * Convert a @bytes-long bitmap from the old bit layout (@in) to the new
+ * one (@out): @out is cleared, then every bit set in @in according to
+ * old_test_bit() is set in @out with new_set_bit().
+ */
+static void
+convert_bitmap(char *in, char *out, int bytes)
+{
+	int bit;
+	int nbits = bytes << 3;
+
+	memset(out, 0, bytes);
+
+	for (bit = 0; bit < nbits; bit++) {
+		if (!old_test_bit(bit, (void *)in))
+			continue;
+
+		new_set_bit(bit, out);
+	}
+}
+
+/*
+ * Rewrite the bitmap of every allocated block in the journalled VHD.
+ *
+ * Each bitmap is read, converted with convert_bitmap() (or copied
+ * unchanged when @rollback is set) and written back in place.
+ *
+ * Returns 0 on success, otherwise the first error from
+ * posix_memalign(), vhd_read_bitmap() or vhd_write_bitmap().
+ */
+static int
+update_vhd(vhd_journal_t *journal, int rollback)
+{
+ int i, err;
+ size_t size;
+ char *buf, *converted;
+
+ buf = NULL;
+ converted = NULL;
+
+ /* one bit per sector in a block, padded by vhd_bytes_padded() */
+ size = vhd_bytes_padded(journal->vhd.spb / 8);
+ /* NOTE: posix_memalign() reports failure as a positive errno value */
+ err = posix_memalign((void **)&converted, 512, size);
+ if (err) {
+ converted = NULL;
+ goto out;
+ }
+
+ for (i = 0; i < journal->vhd.bat.entries; i++) {
+ /* unallocated blocks have no bitmap on disk */
+ if (journal->vhd.bat.bat[i] == DD_BLK_UNUSED)
+ continue;
+
+ /* buf is freed below, once per iteration */
+ err = vhd_read_bitmap(&journal->vhd, i, &buf);
+ if (err)
+ goto out;
+
+ if (rollback)
+ memcpy(converted, buf, size);
+ else
+ convert_bitmap(buf, converted, size);
+
+ /* buf is released before the write so no exit path has to free it */
+ free(buf);
+
+ err = vhd_write_bitmap(&journal->vhd, i, converted);
+ if (err)
+ goto out;
+ }
+
+ err = 0;
+ out:
+ free(converted);
+ return err;
+}
+
+/*
+ * Create the journal for @file via vhd_journal_create(), passing @jfile
+ * through, and report a failure on stdout.  Returns 0 or the creation
+ * error.
+ */
+static int
+open_journal(vhd_journal_t *journal, const char *file, const char *jfile)
+{
+	int rc = vhd_journal_create(journal, file, jfile);
+
+	if (rc)
+		printf("error creating journal for %s: %d\n", file, rc);
+
+	return rc;
+}
+
+/*
+ * Finish a journalled update.
+ *
+ * A non-zero @err reverts the journal, zero commits it.  If that step
+ * succeeds the journal is removed; if it fails the journal is only
+ * closed.  Returns the result of the final close/remove call.
+ */
+static int
+close_journal(vhd_journal_t *journal, int err)
+{
+	int failed;
+
+	failed = err ? vhd_journal_revert(journal)
+		     : vhd_journal_commit(journal);
+
+	return failed ? vhd_journal_close(journal)
+		      : vhd_journal_remove(journal);
+}
+
+/*
+ * vhd-update: convert the block bitmaps of a VHD to the new bit layout
+ * under journal protection.
+ *
+ * Options: -n name (required), -j existing journal, -r rollback
+ * (debugging; requires -j), -h help.
+ *
+ * Nothing is changed unless the VHD was created by tapdisk with creator
+ * version 0.1 and is not a fixed disk.  If the update fails the journal
+ * is reverted and the update error is returned, so a failure is never
+ * reported as success.
+ */
+int
+main(int argc, char **argv)
+{
+	char *file, *jfile;
+	int c, err, cerr, rollback;
+	vhd_journal_t journal;
+
+	file     = NULL;
+	jfile    = NULL;
+	rollback = 0;
+
+	while ((c = getopt(argc, argv, "n:j:rh")) != -1) {
+		switch(c) {
+		case 'n':
+			file = optarg;
+			break;
+		case 'j':
+			jfile = optarg;
+			err = access(jfile, R_OK);
+			if (err == -1) {
+				printf("invalid journal arg %s\n", jfile);
+				return -errno;
+			}
+			break;
+		case 'r':
+			/* add a rollback option for debugging which
+			 * pushes journalled bitmaps to original file
+			 * without transforming them */
+			rollback = 1;
+			break;
+		default:
+			usage();
+		}
+	}
+
+	if (!file)
+		usage();
+
+	if (rollback && !jfile) {
+		printf("rollback requires a journal argument\n");
+		usage();
+	}
+
+	err = open_journal(&journal, file, jfile);
+	if (err)
+		return err;
+
+	if (!vhd_creator_tapdisk(&journal.vhd) ||
+	    journal.vhd.footer.crtr_ver != VHD_VERSION(0, 1) ||
+	    journal.vhd.footer.type == HD_TYPE_FIXED) {
+		/* nothing to convert */
+		err = 0;
+		goto out;
+	}
+
+	err = journal_bitmaps(&journal);
+	if (err) {
+		/* no changes to vhd file yet,
+		 * so close the journal and bail */
+		vhd_journal_close(&journal);
+		return err;
+	}
+
+	err = update_vhd(&journal, rollback);
+	if (err) {
+		printf("update failed: %d; saving journal\n", err);
+		goto out;
+	}
+
+	err = update_creator_version(&journal);
+	if (err) {
+		printf("failed to udpate creator version: %d\n", err);
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	/*
+	 * Keep the update error: close_journal() returns 0 after a clean
+	 * revert, which must not turn a failed update into success.
+	 */
+	cerr = close_journal(&journal, err);
+	return (err ? err : cerr);
+}
diff --git a/tools/blktap2/vhd/vhd-util.c b/tools/blktap2/vhd/vhd-util.c
new file mode 100644
index 0000000000..944a59e395
--- /dev/null
+++ b/tools/blktap2/vhd/vhd-util.c
@@ -0,0 +1,160 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/* Debug print helper: prints to stdout; change "#if 1" to "#if 0" to compile it out. */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f , ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* Signature shared by all sub-command entry points: (argc, argv) -> status. */
+typedef int (*vhd_util_func_t) (int, char **);
+
+/* One vhd-util sub-command: the name typed on the command line and its handler. */
+struct command {
+ char *name;
+ vhd_util_func_t func;
+};
+
+/* Table of supported sub-commands; looked up by name in get_command(). */
+struct command commands[] = {
+ { .name = "create", .func = vhd_util_create },
+ { .name = "snapshot", .func = vhd_util_snapshot },
+ { .name = "query", .func = vhd_util_query },
+ { .name = "read", .func = vhd_util_read },
+ { .name = "set", .func = vhd_util_set_field },
+ { .name = "repair", .func = vhd_util_repair },
+ { .name = "resize", .func = vhd_util_resize },
+ { .name = "fill", .func = vhd_util_fill },
+ { .name = "coalesce", .func = vhd_util_coalesce },
+ { .name = "modify", .func = vhd_util_modify },
+ { .name = "scan", .func = vhd_util_scan },
+ { .name = "check", .func = vhd_util_check },
+ { .name = "revert", .func = vhd_util_revert },
+};
+
+/* Print the names in commands[] as "COMMAND := { a | b | ... }" on one line. */
+#define print_commands() \
+ do { \
+ int i, n; \
+ n = sizeof(commands) / sizeof(struct command); \
+ printf("COMMAND := { "); \
+ printf("%s", commands[0].name); \
+ for (i = 1; i < n; i++) \
+ printf(" | %s", commands[i].name); \
+ printf(" }\n"); \
+ } while (0)
+
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Print the vhd-util synopsis followed by the list of commands, then
+ * exit with status 0.  Does not return.
+ */
+void
+help(void)
+{
+	fputs("usage: vhd-util COMMAND [OPTIONS]\n", stdout);
+	print_commands();
+	exit(0);
+}
+
+/*
+ * Look up @command in the commands[] table by exact name.
+ *
+ * Names of 25 characters or more are rejected outright.  Returns the
+ * matching table entry, or NULL if there is none.
+ */
+struct command *
+get_command(char *command)
+{
+	struct command *cur, *end;
+
+	if (strnlen(command, 25) >= 25)
+		return NULL;
+
+	end = commands + sizeof(commands) / sizeof (struct command);
+
+	for (cur = commands; cur < end; cur++) {
+		if (strcmp(command, cur->name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * vhd-util: dispatch to the sub-command named by argv[1].
+ *
+ * A fresh argument vector is built for the sub-command: element 0 is the
+ * command name, followed by the remaining arguments with every
+ * "--debug" removed ("--debug" raises the libvhd log level instead).
+ *
+ * The exit status is the absolute value of the sub-command's result.
+ */
+int
+main(int argc, char *argv[])
+{
+	char **cargv;
+	struct command *cmd;
+	int cargc, i, cnt, ret;
+
+#ifdef CORE_DUMP
+	#include <sys/resource.h>
+	struct rlimit rlim;
+	rlim.rlim_cur = RLIM_INFINITY;
+	rlim.rlim_max = RLIM_INFINITY;
+	if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+		fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+
+	ret = 0;
+
+	if (argc < 2)
+		help();
+
+	cargc = argc - 1;
+	cmd   = get_command(argv[1]);
+	if (!cmd) {
+		fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+		help();
+	}
+
+	/* one extra slot so the vector can be NULL-terminated like argv */
+	cargv = malloc(sizeof(char *) * (cargc + 1));
+	if (!cargv)
+		exit(ENOMEM);
+
+	cnt      = 1;
+	cargv[0] = cmd->name;
+	for (i = 1; i < cargc; i++) {
+		char *arg = argv[i + (argc - cargc)];
+
+		if (!strcmp(arg, "--debug")) {
+			libvhd_set_log_level(1);
+			continue;
+		}
+
+		cargv[cnt++] = arg;
+	}
+	cargv[cnt] = NULL;
+
+#ifdef ENABLE_FAILURE_TESTING
+	for (i = 0; i < NUM_FAIL_TESTS; i++) {
+		TEST_FAIL[i] = 0;
+		if (getenv(ENV_VAR_FAIL[i]))
+			TEST_FAIL[i] = 1;
+	}
+#endif // ENABLE_FAILURE_TESTING
+
+	ret = cmd->func(cnt, cargv);
+
+	free(cargv);
+
+	return (ret >= 0 ? ret : -ret);
+}
diff --git a/tools/check/check_uuid_devel b/tools/check/check_uuid_devel
new file mode 100755
index 0000000000..0a90b15eea
--- /dev/null
+++ b/tools/check/check_uuid_devel
@@ -0,0 +1,6 @@
+#!/bin/sh
+# CHECK-BUILD
+
+. ./funcs.sh
+
+has_header uuid/uuid.h || fail "missing uuid headers (package uuid-dev)"
diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py
index bd499a728a..88a0cbe259 100644
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -27,6 +27,7 @@ Author: Mike Wray <mike.wray@hp.com>
import logging
import time
import threading
+import thread
import re
import copy
import os
@@ -535,6 +536,25 @@ class XendDomainInfo:
@raise XendError: Failed pausing a domain
"""
try:
+ bepath="/local/domain/0/backend/"
+ if(self.domid):
+
+ dev = xstransact.List(bepath + 'vbd' + "/%d" % (self.domid,))
+ for x in dev:
+ path = self.getDeviceController('vbd').readBackend(x, 'params')
+ if path and path.startswith('/dev/xen/blktap-2'):
+ #Figure out the sysfs path.
+ pattern = re.compile('/dev/xen/blktap-2/tapdev(\d+)$')
+ ctrlid = pattern.search(path)
+ ctrl = '/sys/class/blktap2/blktap' + ctrlid.group(1)
+ #pause the disk
+ f = open(ctrl + '/pause', 'w')
+ f.write('pause');
+ f.close()
+ except Exception, ex:
+ log.warn('Could not pause blktap disk.');
+
+ try:
xc.domain_pause(self.domid)
self._stateSet(DOM_STATE_PAUSED)
except Exception, ex:
@@ -547,6 +567,26 @@ class XendDomainInfo:
@raise XendError: Failed unpausing a domain
"""
try:
+ bepath="/local/domain/0/backend/"
+ if(self.domid):
+ dev = xstransact.List(bepath + "vbd" + "/%d" % (self.domid,))
+ for x in dev:
+ path = self.getDeviceController('vbd').readBackend(x, 'params')
+ if path and path.startswith('/dev/xen/blktap-2'):
+ #Figure out the sysfs path.
+ pattern = re.compile('/dev/xen/blktap-2/tapdev(\d+)$')
+ ctrlid = pattern.search(path)
+ ctrl = '/sys/class/blktap2/blktap' + ctrlid.group(1)
+ #unpause the disk
+ if(os.path.exists(ctrl + '/resume')):
+ f = open(ctrl + '/resume', 'w');
+ f.write('resume');
+ f.close();
+
+ except Exception, ex:
+ log.warn('Could not unpause blktap disk: %s' % str(ex));
+
+ try:
xc.domain_unpause(self.domid)
self._stateSet(DOM_STATE_RUNNING)
except Exception, ex:
@@ -1171,6 +1211,15 @@ class XendDomainInfo:
rc = None
if self.domid is not None:
+
+ #new blktap implementation may need a sysfs write after everything is torn down.
+ dev = self.getDeviceController(deviceClass).convertToDeviceNumber(devid)
+ path = self.getDeviceController(deviceClass).readBackend(dev, 'params')
+ if path and path.startswith('/dev/xen/blktap-2'):
+ frontpath = self.getDeviceController(deviceClass).frontendPath(dev)
+ backpath = xstransact.Read(frontpath, "backend")
+ thread.start_new_thread(self.getDeviceController(deviceClass).finishDeviceCleanup, (backpath, path))
+
rc = self.getDeviceController(deviceClass).destroyDevice(devid, force)
if not force and rm_cfg:
# The backend path, other than the device itself,
diff --git a/tools/python/xen/xend/server/BlktapController.py b/tools/python/xen/xend/server/BlktapController.py
index 36c1d0688e..4c7f334968 100644
--- a/tools/python/xen/xend/server/BlktapController.py
+++ b/tools/python/xen/xend/server/BlktapController.py
@@ -1,5 +1,6 @@
# Copyright (c) 2005, XenSource Ltd.
-
+import string, re
+import subprocess
from xen.xend.server.blkif import BlkifController
from xen.xend.XendLogging import log
@@ -7,6 +8,11 @@ from xen.xend.XendLogging import log
phantomDev = 0;
phantomId = 0;
+TAPDISK_SYSFS = '/sys/class/blktap2'
+TAPDISK_BINARY = '/usr/sbin/tapdisk2'
+TAPDISK_DEVICE = '/dev/xen/blktap-2/tapdev'
+TAPDISK_CONTROL = TAPDISK_SYSFS + '/blktap'
+
blktap_disk_types = [
'aio',
'sync',
@@ -14,10 +20,33 @@ blktap_disk_types = [
'ram',
'qcow',
'qcow2',
-
+ 'vhd',
'ioemu',
'tapdisk',
]
+
+def doexec(args, inputtext=None):
+    """Run args as a child process and collect its results.
+
+    inputtext, when given, is written to the child's stdin.  Returns the
+    tuple (returncode, stdout, stderr) once the child has exited.
+    """
+    pipe = subprocess.PIPE
+    child = subprocess.Popen(args, stdin=pipe, stdout=pipe, stderr=pipe,
+                             close_fds=True)
+    out, err = child.communicate(inputtext)
+    return (child.returncode, out, err)
+
+def parseDeviceString(device):
+    """Break a tap device string into (minor, device, control).
+
+    device must contain '/dev' and end in TAPDISK_DEVICE followed by the
+    minor number.  minor is returned as the matched digit string, device
+    is handed back unchanged, and control is TAPDISK_CONTROL with the
+    minor appended.  Raises Exception if either check fails.
+    """
+    if device.find('/dev') < 0:
+        raise Exception('invalid tap device: ' + device)
+
+    found = re.search(TAPDISK_DEVICE + '(\d+)$', device)
+    if found is None:
+        raise Exception('malformed tap device: ' + device)
+
+    minor = found.group(1)
+    return minor, device, TAPDISK_CONTROL + minor
+
+
class BlktapController(BlkifController):
def __init__(self, vm):
@@ -86,3 +115,24 @@ class BlktapController(BlkifController):
return (devid, back, front)
+ def createDevice(self, config):
+     """Create a tap block device, launching tapdisk2 when needed.
+
+     The 'uname' entry of config is split on ':' into
+     typ:subtyp:params:file.  A three-field uname (typ:params:file) is
+     accepted as well and treated as subtyp 'tapdisk'.  Drivers in the
+     deprecated list are passed straight to BlkifController.createDevice.
+     Otherwise TAPDISK_BINARY is run with '-n params:file', the device
+     node it prints replaces config['uname'] as 'phy:<device>', and
+     self.deviceClass is switched to 'vbd' before delegating to
+     BlkifController.createDevice.
+
+     @param config device configuration; its 'uname' entry is rewritten
+                   when a tapdisk2 device is created
+     @return whatever BlkifController.createDevice returns
+     @raise ValueError: uname has fewer than three ':'-separated fields
+     @raise Exception: tapdisk2 exited non-zero or printed no tap device
+     """
+     uname = config.get('uname', '')
+     fields = string.split(uname, ':', 3)
+     if len(fields) == 4:
+         (typ, subtyp, params, image) = fields
+     elif len(fields) == 3:
+         # Short form without a subtype; previously this died on the
+         # four-way unpack.
+         (typ, params, image) = fields
+         subtyp = 'tapdisk'
+     else:
+         raise ValueError('malformed tap uname: %r' % uname)
+
+     # Compare with ==: "x in ('tap')" is a substring test against the
+     # string 'tap', not membership in a one-element tuple.
+     if (typ == 'tap' and subtyp == 'tapdisk'
+             and params in ('ioemu', 'qcow2', 'vmdk', 'sync')):
+         log.warn('WARNING: using deprecated blktap module')
+         return BlkifController.createDevice(self, config)
+
+     cmd = [TAPDISK_BINARY, '-n', '%s:%s' % (params, image)]
+     (rc, stdout, stderr) = doexec(cmd)
+     if rc != 0:
+         # Report the real failure instead of letting the empty stdout
+         # surface later as an "invalid tap device" error.
+         raise Exception('%s failed (rc %d) for %s:%s: %s'
+                         % (TAPDISK_BINARY, rc, params, image,
+                            (stderr or stdout).strip()))
+
+     device = parseDeviceString(stdout)[1]
+
+     # Modify the configuration to attach as a vbd, now that the
+     # device is configured.  Then continue to create the device.
+     config.update({'uname': 'phy:' + device.rstrip()})
+     self.deviceClass = 'vbd'
+
+     return BlkifController.createDevice(self, config)
diff --git a/tools/python/xen/xend/server/DevController.py b/tools/python/xen/xend/server/DevController.py
index 6c2bb09ca6..ed46dd4803 100644
--- a/tools/python/xen/xend/server/DevController.py
+++ b/tools/python/xen/xend/server/DevController.py
@@ -27,8 +27,8 @@ from xen.xend.server.DevConstants import *
from xen.xend.xenstore.xstransact import xstransact, complete
from xen.xend.xenstore.xswatch import xswatch
-
-import os
+import xen.xend.server.DevConstants
+import os, re
xoptions = XendOptions.instance()
@@ -238,6 +238,34 @@ class DevController:
# xstransact.Remove(self.devicePath()) ?? Below is the same ?
self.vm._removeVm("device/%s/%d" % (self.deviceClass, dev))
+ # The new blktap implementation requires a sysfs write to close
+ # out disks. This function is called from a thread when the
+ # disk is detached from the domain.
+ def finishDeviceCleanup(self, backpath, path):
+     """Perform any device specific cleanup
+
+     Does nothing unless path starts with '/dev/xen/blktap-2'.  For a
+     tap device it waits for the backend at backpath to be destroyed and
+     then writes 'remove' to the device's sysfs control file.
+
+     @param backpath backend xenstore path to wait on
+     @param path     tap device node, e.g. /dev/xen/blktap-2/tapdev0
+     @raise ValueError: path is under /dev/xen/blktap-2 but does not end
+                        in tapdev<minor>
+     """
+     if not (path and path.startswith('/dev/xen/blktap-2')):
+         return
+
+     # Wait until the backend is gone before touching sysfs.
+     self.waitForBackend_destroy(backpath)
+
+     # Figure out the sysfs path from the minor number.
+     found = re.search(r'/dev/xen/blktap-2/tapdev(\d+)$', path)
+     if found is None:
+         # Previously an AttributeError on None.group() in the thread.
+         raise ValueError('malformed tap device: ' + path)
+     ctrl = '/sys/class/blktap2/blktap' + found.group(1)
+
+     # Close out the disk; release the file even if the write fails.
+     f = open(ctrl + '/remove', 'w')
+     try:
+         f.write('remove')
+     finally:
+         f.close()
+
+
def configurations(self, transaction = None):
return map(lambda x: self.configuration(x, transaction), self.deviceIDs(transaction))