diff options
108 files changed, 35869 insertions, 5 deletions
@@ -103,7 +103,19 @@ ^stubdom/lwip/ ^stubdom/ioemu/ ^tools/.*/build/lib.*/.*\.py$ -^tools/blktap/Makefile\.smh$ +^tools/blktap2/daemon/blktapctrl$ +^tools/blktap2/drivers/img2qcow$ +^tools/blktap2/drivers/lock-util$ +^tools/blktap2/drivers/qcow-create$ +^tools/blktap2/drivers/qcow2raw$ +^tools/blktap2/drivers/tapdisk$ +^tools/blktap2/drivers/tapdisk-client$ +^tools/blktap2/drivers/tapdisk-diff$ +^tools/blktap2/drivers/tapdisk-stream$ +^tools/blktap2/drivers/tapdisk2$ +^tools/blktap2/drivers/td-util$ +^tools/blktap2/vhd/vhd-update$ +^tools/blktap2/vhd/vhd-util$ ^tools/blktap/drivers/blktapctrl$ ^tools/blktap/drivers/img2qcow$ ^tools/blktap/drivers/qcow-create$ diff --git a/tools/Makefile b/tools/Makefile index 3209f2f8bd..dff96a5c76 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -22,6 +22,7 @@ SUBDIRS-$(VTPM_TOOLS) += vtpm SUBDIRS-y += xenstat SUBDIRS-$(CONFIG_Linux) += libaio SUBDIRS-$(CONFIG_Linux) += blktap +SUBDIRS-$(CONFIG_Linux) += blktap2 SUBDIRS-y += libfsimage SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen SUBDIRS-$(CONFIG_Linux) += fs-back diff --git a/tools/blktap2/Makefile b/tools/blktap2/Makefile new file mode 100644 index 0000000000..20a9451fa1 --- /dev/null +++ b/tools/blktap2/Makefile @@ -0,0 +1,34 @@ +XEN_ROOT = ../.. 
+include $(XEN_ROOT)/tools/Rules.mk + +CFLAGS += $(CFLAGS_libxenctrl) +LDFLAGS += $(LDFLAGS_libxenctrl) + +SUBDIRS-y := +SUBDIRS-y += include +SUBDIRS-y += lvm +SUBDIRS-y += vhd +SUBDIRS-y += drivers +SUBDIRS-y += daemon + +.PHONY: all +all: build + +.PHONY: build +build: + @set -e; for subdir in $(SUBDIRS-y); do \ + $(MAKE) -C $$subdir all; \ + done + +.PHONY: install +install: + @set -e; for subdir in $(SUBDIRS-y); do \ + $(MAKE) -C $$subdir install; \ + done + +.PHONY: clean +clean: + rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) TAGS + @set -e; for subdir in $(SUBDIRS-y); do \ + $(MAKE) -C $$subdir clean; \ + done diff --git a/tools/blktap2/README b/tools/blktap2/README new file mode 100644 index 0000000000..5e4108030e --- /dev/null +++ b/tools/blktap2/README @@ -0,0 +1,122 @@ +Blktap Userspace Tools + Library +================================ + +Andrew Warfield and Julian Chesterfield +16th June 2006 + +{firstname.lastname}@cl.cam.ac.uk + +The blktap userspace toolkit provides a user-level disk I/O +interface. The blktap mechanism involves a kernel driver that acts +similarly to the existing Xen/Linux blkback driver, and a set of +associated user-level libraries. Using these tools, blktap allows +virtual block devices presented to VMs to be implemented in userspace +and to be backed by raw partitions, files, network, etc. + +The key benefit of blktap is that it makes it easy and fast to write +arbitrary block backends, and that these user-level backends actually +perform very well. Specifically: + +- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse + formats and other compression features can be easily implemented. + +- Accessing file-based images from userspace avoids problems related + to flushing dirty pages which are present in the Linux loopback + driver. (Specifically, doing a large number of writes to an + NFS-backed image don't result in the OOM killer going berserk.) 
+ +- Per-disk handler processes enable easier userspace policing of block + resources, and process-granularity QoS techniques (disk scheduling + and related tools) may be trivially applied to block devices. + +- It's very easy to take advantage of userspace facilities such as + networking libraries, compression utilities, peer-to-peer + file-sharing systems and so on to build more complex block backends. + +- Crashes are contained -- incremental development/debugging is very + fast. + +How it works (in one paragraph): + +Working in conjunction with the kernel blktap driver, all disk I/O +requests from VMs are passed to the userspace daemon (using a shared +memory interface) through a character device. Each active disk is +mapped to an individual device node, allowing per-disk processes to +implement individual block devices where desired. The userspace +drivers are implemented using asynchronous (Linux libaio), +O_DIRECT-based calls to preserve the unbuffered, batched and +asynchronous request dispatch achieved with the existing blkback +code. We provide a simple, asynchronous virtual disk interface that +makes it quite easy to add new disk implementations. + +As of June 2006 the currently supported disk formats are: + + - Raw Images (both on partitions and in image files) + - File-backed Qcow disks + - Standalone sparse Qcow disks + - Fast shareable RAM disk between VMs (requires some form of cluster-based + filesystem support e.g. OCFS2 in the guest kernel) + - Some VMDK images - your mileage may vary + +Raw and QCow images have asynchronous backends and so should perform +fairly well. VMDK is based directly on the qemu vmdk driver, which is +synchronous (a.k.a. slow). + +Build and Installation Instructions +=================================== + +Make sure to configure the blktap backend driver in your dom0 kernel. It +will cooperate fine with the existing backend driver, so you can +experiment with tap disks without breaking existing VM configs. 
+ +To build the tools separately, run "make && make install" in +tools/blktap2. + + +Using the Tools +=============== + +Prepare the image for booting. For qcow files use the qcow utilities +installed earlier. e.g. qcow-create generates a blank standalone image +or a file-backed CoW image. img2qcow takes an existing image or +partition and creates a sparse, standalone qcow-based file. + +The userspace disk agent is configured to start automatically via xend +(alternatively you can start it manually => 'blktapctrl') + +Customise the VM config file to use the 'tap' handler, followed by the +driver type. e.g. for a raw image such as a file or partition: + +disk = ['tap:aio:<FILENAME>,sda1,w'] + +e.g. for a qcow image: + +disk = ['tap:qcow:<FILENAME>,sda1,w'] + + +Mounting images in Dom0 using the blktap driver +=============================================== +Tap (and blkback) disks are also mountable in Dom0 without requiring an +active VM to attach. You will need to build a xenlinux Dom0 kernel that +includes the blkfront driver (e.g. the default 'make world' or +'make kernels' build). Simply use the xm command-line tool to activate +the backend disks, and blkfront will generate a virtual block device that +can be accessed in the same way as a loop device or partition: + +e.g. 
for a raw image file <FILENAME> that would normally be mounted using +the loopback driver (such as 'mount -o loop <FILENAME> /mnt/disk'), do the +following: + +xm block-attach 0 tap:aio:<FILENAME> /dev/xvda1 w 0 +mount /dev/xvda1 /mnt/disk <--- don't use loop driver + +In this way, you can use any of the userspace device-type drivers built +with the blktap userspace toolkit to open and mount disks such as qcow +or vmdk images: + +xm block-attach 0 tap:qcow:<FILENAME> /dev/xvda1 w 0 +mount /dev/xvda1 /mnt/disk + + + + diff --git a/tools/blktap2/daemon/Makefile b/tools/blktap2/daemon/Makefile new file mode 100644 index 0000000000..a7869b61b0 --- /dev/null +++ b/tools/blktap2/daemon/Makefile @@ -0,0 +1,55 @@ +XEN_ROOT=../../../ +BLKTAP_ROOT := .. +include $(XEN_ROOT)/tools/Rules.mk + +IBIN = blktapctrl +INST_DIR = $(SBINDIR) + +LIBDIR = lib + +LIBS := -lxenstore +LIBS += -Llib +LIBS += -lblktap +LIBS += -lxenctrl + +ifneq ($(USE_SYSTEM_LIBRARIES),y) +INCLUDES += -I $(XEN_LIBXC) -I $(XEN_XENSTORE) +LIBS += -L $(XEN_LIBXC) -L $(XEN_XENSTORE) +endif + +OBJS := tapdisk-dispatch-common.o +OBJS += tapdisk-channel.o + +CFLAGS += -Werror +CFLAGS += -Wno-unused +CFLAGS += -fno-strict-aliasing -fPIC +CFLAGS += -Ilib -I../include -I../drivers -I../../include $(INCLUDES) +CFLAGS += -D_GNU_SOURCE +CFLAGS += -g + +# Get gcc to generate the dependencies for us. 
+CFLAGS += -Wp,-MD,.$(@F).d +DEPS = .*.d + +all: libblktap $(IBIN) + +blktapctrl: tapdisk-daemon.c $(OBJS) + $(CC) $(CFLAGS) -o blktapctrl tapdisk-daemon.c $(LIBS) $(OBJS) + +libblktap: + @set -e + $(MAKE) -C $(LIBDIR) all + +install: all + $(MAKE) -C $(LIBDIR) install + $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR) + $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR) + +clean: + $(MAKE) -C $(LIBDIR) clean + rm -rf *.o *~ $(IBIN) $(DEPS) xen TAGS + +.PHONY: all clean install blktapctrl libblktap + +-include $(DEPS) + diff --git a/tools/blktap2/daemon/lib/Makefile b/tools/blktap2/daemon/lib/Makefile new file mode 100644 index 0000000000..e4e289ab51 --- /dev/null +++ b/tools/blktap2/daemon/lib/Makefile @@ -0,0 +1,69 @@ +XEN_ROOT=../../../../ +BLKTAP_ROOT := ../../ +include $(XEN_ROOT)/tools/Rules.mk + +MAJOR = 3.1 +MINOR = 0 +SONAME = libblktap.so.$(MAJOR) + +BLKTAP_INSTALL_DIR = /usr/sbin + +LIBS := -lxenstore + +ifneq ($(USE_SYSTEM_LIBRARIES),y) +INCLUDES += -I $(XEN_LIBXC) -I $(XEN_XENSTORE) +LIBS += -L$(XEN_XENSTORE) +endif + +SRCS := +SRCS += xs_api.c +CFLAGS += -Werror +CFLAGS += -Wno-unused +CFLAGS += -fno-strict-aliasing -fPIC +# get asprintf(): +CFLAGS += -D _GNU_SOURCE +CFLAGS += -g +CFLAGS += -I../../include -I../../../include/ $(INCLUDES) + + +# Get gcc to generate the dependencies for us. 
+CFLAGS += -Wp,-MD,.$(@F).d +DEPS = .*.d + +OBJS = $(patsubst %.c,%.o,$(SRCS)) +IBINS := + +LIB = libblktap.a libblktap.so.$(MAJOR).$(MINOR) + +.PHONY: all +all: build + +.PHONY: build +build: libblktap.a + +.PHONY: libblktap +libblktap: libblktap.a + +install: all + $(INSTALL_DIR) -p $(DESTDIR)$(LIBDIR) + $(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR) + ln -sf libblktap.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libblktap.so.$(MAJOR) + ln -sf libblktap.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libblktap.so + +clean: + rm -rf *.a *.so* *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS + +libblktap.a: $(OBJS) + $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,$(SONAME) $(SHLIB_CFLAGS) \ + -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS) + ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR) + ln -sf libblktap.so.$(MAJOR) libblktap.so + $(AR) rc $@ libblktap.so + +.PHONY: TAGS all build clean install libblktap + +TAGS: + etags -t $(SRCS) *.h + +-include $(DEPS) + diff --git a/tools/blktap2/daemon/lib/xs_api.c b/tools/blktap2/daemon/lib/xs_api.c new file mode 100644 index 0000000000..2a7d6acdb8 --- /dev/null +++ b/tools/blktap2/daemon/lib/xs_api.c @@ -0,0 +1,323 @@ +/* + * xs_api.c + * + * blocktap interface functions to xenstore + * + * (c) 2005 Andrew Warfield and Julian Chesterfield + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, 
subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <time.h> +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <xs.h> + +#include "xs_api.h" +#include "blktaplib.h" + +#define DOMNAME "Domain-0" +#define BASE_DEV_VAL 2048 + +static LIST_HEAD(watches); + +int +xs_gather(struct xs_handle *xs, const char *dir, ...) 
+{ + va_list ap; + const char *name; + char *path, **e; + int ret = 0, num,i; + unsigned int len; + xs_transaction_t xth; + +again: + if ((xth = xs_transaction_start(xs)) == XBT_NULL) { + DPRINTF("unable to start xs trasanction\n"); + ret = ENOMEM; + return ret; + } + + va_start(ap, dir); + while ((ret == 0) && (name = va_arg(ap, char *)) != NULL) { + char *p; + const char *fmt = va_arg(ap, char *); + void *result = va_arg(ap, void *); + + if (asprintf(&path, "%s/%s", dir, name) == -1) { + EPRINTF("allocation error in xs_gather!\n"); + ret = ENOMEM; + break; + } + + p = xs_read(xs, xth, path, &len); + free(path); + + if (!p) { + ret = ENOENT; + break; + } + + if (fmt) { + if (sscanf(p, fmt, result) == 0) + ret = EINVAL; + free(p); + } else + *(char **)result = p; + } + + va_end(ap); + + if (!xs_transaction_end(xs, xth, ret)) { + if (ret == 0 && errno == EAGAIN) + goto again; + else + ret = errno; + } + + return ret; +} + +/* Single printf and write: returns -errno or 0. */ +int +xs_printf(struct xs_handle *h, const char *dir, + const char *node, const char *fmt, ...) +{ + int ret; + va_list ap; + char *buf, *path; + + va_start(ap, fmt); + ret = vasprintf(&buf, fmt, ap); + va_end(ap); + + if (ret == -1) + return 0; + + ret = asprintf(&path, "%s/%s", dir, node); + if (ret == -1) { + free(buf); + return 0; + } + + ret = xs_write(h, XBT_NULL, path, buf, strlen(buf)+1); + + free(buf); + free(path); + + return ret; +} + +int +xs_exists(struct xs_handle *h, const char *path) +{ + char **d; + unsigned int num; + xs_transaction_t xth; + + if ((xth = xs_transaction_start(h)) == XBT_NULL) { + EPRINTF("unable to start xs trasanction\n"); + return 0; + } + + d = xs_directory(h, xth, path, &num); + xs_transaction_end(h, xth, 0); + if (!d) + return 0; + + free(d); + return 1; +} + + + +/** + * This assumes that the domain name we are looking for is unique. 
+ * Name parameter Domain-0 + */ +char * +get_dom_domid(struct xs_handle *h) +{ + int i; + xs_transaction_t xth; + unsigned int num, len; + char *val, *path, *domid, **e; + + e = NULL; + domid = NULL; + + if ((xth = xs_transaction_start(h)) == XBT_NULL) { + EPRINTF("unable to start xs trasanction\n"); + return NULL; + } + + e = xs_directory(h, xth, "/local/domain", &num); + if (e == NULL) + goto done; + + for (i = 0; (i < num) && (domid == NULL); i++) { + if (asprintf(&path, "/local/domain/%s/name", e[i]) == -1) + break; + + val = xs_read(h, xth, path, &len); + free(path); + if (val == NULL) + continue; + + if (strcmp(val, DOMNAME) == 0) { + /* match! */ + if (asprintf(&path, + "/local/domain/%s/domid", e[i]) == -1) { + free(val); + break; + } + domid = xs_read(h, xth, path, &len); + free(path); + } + free(val); + } + + done: + xs_transaction_end(h, xth, 0); + free(e); + return domid; +} + +/* + * a little paranoia: we don't just trust token + */ +static struct xenbus_watch *find_watch(const char *token) +{ + int ret; + long nonce; + unsigned long addr; + struct xenbus_watch *i, *cmp; + + ret = sscanf(token, "%lX:%lX", &addr, &nonce); + if (ret != 2) { + EPRINTF("invalid watch token %s\n", token); + return NULL; + } + + cmp = (struct xenbus_watch *)addr; + list_for_each_entry(i, &watches, list) + if (i == cmp && i->nonce == nonce) + return i; + + return NULL; +} + +/* + * Register callback to watch this node; + * like xs_watch, return 0 on failure + */ +int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch) +{ + /* Pointer in ascii is the token. 
*/ + char token[(sizeof(watch) + sizeof(long)) * 2 + 2]; + + /* 1-second granularity should suffice here */ + watch->nonce = time(NULL); + + sprintf(token, "%lX:%lX", (long)watch, watch->nonce); + if (find_watch(token)) { + EPRINTF("watch collision!\n"); + return -EINVAL; + } + + if (!xs_watch(h, watch->node, token)) { + EPRINTF("unable to set watch!\n"); + return -EINVAL; + } + + list_add(&watch->list, &watches); + + return 0; +} + +int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch) +{ + char token[(sizeof(watch) + sizeof(long)) * 2 + 2]; + + sprintf(token, "%lX:%lX", (long)watch, watch->nonce); + if (!find_watch(token)) { + EPRINTF("no such watch!\n"); + return -EINVAL; + } + + if (!xs_unwatch(h, watch->node, token)) + EPRINTF("XENBUS Failed to release watch %s\n", watch->node); + + list_del(&watch->list); + + return 0; +} + +/* + * re-register callbacks to all watches + */ +void reregister_xenbus_watches(struct xs_handle *h) +{ + struct xenbus_watch *watch; + char token[(sizeof(watch) + sizeof(long)) * 2 + 2]; + + list_for_each_entry(watch, &watches, list) { + sprintf(token, "%lX:%lX", (long)watch, watch->nonce); + xs_watch(h, watch->node, token); + } +} + +/* + * based on watch_thread() + */ +int xs_fire_next_watch(struct xs_handle *h) +{ + unsigned int num; + struct xenbus_watch *w; + char **res, *token, *node = NULL; + + res = xs_read_watch(h, &num); + if (res == NULL) + return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... 
*/ + + node = res[XS_WATCH_PATH]; + token = res[XS_WATCH_TOKEN]; + DPRINTF("got watch %s on %s\n", token, node); + + w = find_watch(token); + if (w) + w->callback(h, w, node); + + DPRINTF("handled watch %s on %s\n", token, node); + + free(res); + + return 1; +} diff --git a/tools/blktap2/daemon/lib/xs_api.h b/tools/blktap2/daemon/lib/xs_api.h new file mode 100644 index 0000000000..e6f055ac0c --- /dev/null +++ b/tools/blktap2/daemon/lib/xs_api.h @@ -0,0 +1,62 @@ +/* + * xs_api.h + * + * (c) 2005 Andrew Warfield and Julian Chesterfield + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef _XS_API_H_ +#define _XS_API_H_ + +#include <xs.h> + +#include "list.h" + +struct xenbus_watch +{ + struct list_head list; + char *node; + void *data; + long nonce; + void (*callback) (struct xs_handle *h, + struct xenbus_watch *, + const char *node); +}; + +int xs_gather(struct xs_handle *xs, const char *dir, ...); +int xs_printf(struct xs_handle *h, const char *dir, const char *node, + const char *fmt, ...) __attribute__((format(printf, 4, 5))); +int xs_exists(struct xs_handle *h, const char *path); +char *get_dom_domid(struct xs_handle *h); +int convert_dev_name_to_num(char *name); + +int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch); +int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch); +void reregister_xenbus_watches(struct xs_handle *h); +int xs_fire_next_watch(struct xs_handle *h); + +#endif diff --git a/tools/blktap2/daemon/tapdisk-channel.c b/tools/blktap2/daemon/tapdisk-channel.c new file mode 100644 index 0000000000..c2dac3a858 --- /dev/null +++ b/tools/blktap2/daemon/tapdisk-channel.c @@ -0,0 +1,1367 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <stdarg.h> +#include <sys/wait.h> +#include <sys/ioctl.h> +#include <sys/resource.h> + +#include <xs.h> +#include "disktypes.h" +#include "tapdisk-dispatch.h" + +#define TAPDISK_CHANNEL_IDLE 1 +#define TAPDISK_CHANNEL_WAIT_PID 2 +#define TAPDISK_CHANNEL_WAIT_OPEN 3 +#define TAPDISK_CHANNEL_WAIT_PAUSE 4 +#define TAPDISK_CHANNEL_WAIT_RESUME 5 +#define TAPDISK_CHANNEL_WAIT_CLOSE 6 +#define TAPDISK_CHANNEL_CLOSED 7 + +static void tapdisk_channel_error(tapdisk_channel_t *, + const char *fmt, ...) + __attribute__((format(printf, 2, 3))); +static void tapdisk_channel_fatal(tapdisk_channel_t *, + const char *fmt, ...) 
+ __attribute__((format(printf, 2, 3))); +static int tapdisk_channel_parse_params(tapdisk_channel_t *); +static void tapdisk_channel_pause_event(struct xs_handle *, + struct xenbus_watch *, + const char *); + +static int +tapdisk_channel_check_uuid(tapdisk_channel_t *channel) +{ + uint32_t uuid; + char *uuid_str; + + uuid_str = xs_read(channel->xsh, XBT_NULL, channel->uuid_str, NULL); + if (!uuid_str) + return -errno; + + uuid = strtoul(uuid_str, NULL, 10); + free(uuid_str); + + if (uuid != channel->cookie) + return -EINVAL; + + return 0; +} + +static inline int +tapdisk_channel_validate_watch(tapdisk_channel_t *channel, const char *path) +{ + int err, len; + + len = strsep_len(path, '/', 7); + if (len < 0) + return -EINVAL; + + err = tapdisk_channel_check_uuid(channel); + if (err) + return err; + + if (!xs_exists(channel->xsh, path)) + return -ENOENT; + + return 0; +} + +static inline int +tapdisk_channel_validate_message(tapdisk_channel_t *channel, + tapdisk_message_t *message) +{ + switch (message->type) { + case TAPDISK_MESSAGE_PID_RSP: + if (channel->state != TAPDISK_CHANNEL_WAIT_PID) + return -EINVAL; + break; + + case TAPDISK_MESSAGE_OPEN_RSP: + if (channel->state != TAPDISK_CHANNEL_WAIT_OPEN) + return -EINVAL; + break; + + case TAPDISK_MESSAGE_PAUSE_RSP: + if (channel->state != TAPDISK_CHANNEL_WAIT_PAUSE) + return -EINVAL; + break; + + case TAPDISK_MESSAGE_RESUME_RSP: + if (channel->state != TAPDISK_CHANNEL_WAIT_RESUME) + return -EINVAL; + break; + + case TAPDISK_MESSAGE_CLOSE_RSP: + if (channel->state != TAPDISK_CHANNEL_WAIT_CLOSE) + return -EINVAL; + break; + + case TAPDISK_MESSAGE_RUNTIME_ERROR: + /* + * runtime errors can be received at any time + * and should not affect the state machine + */ + return 0; + } + + channel->state = TAPDISK_CHANNEL_IDLE; + return 0; +} + +static int +tapdisk_channel_send_message(tapdisk_channel_t *channel, + tapdisk_message_t *message, int timeout) +{ + fd_set writefds; + struct timeval tv; + int ret, len, offset; + + 
tv.tv_sec = timeout; + tv.tv_usec = 0; + offset = 0; + len = sizeof(tapdisk_message_t); + + DPRINTF("%s: sending '%s' message to %d:%d\n", + channel->path, tapdisk_message_name(message->type), + channel->channel_id, channel->cookie); + + if (channel->state != TAPDISK_CHANNEL_IDLE && + message->type != TAPDISK_MESSAGE_CLOSE) + EPRINTF("%s: writing message to non-idle channel (%d)\n", + channel->path, channel->state); + + while (offset < len) { + FD_ZERO(&writefds); + FD_SET(channel->write_fd, &writefds); + + /* we don't bother reinitializing tv. at worst, it will wait a + * bit more time than expected. */ + + ret = select(channel->write_fd + 1, + NULL, &writefds, NULL, &tv); + if (ret == -1) + break; + else if (FD_ISSET(channel->write_fd, &writefds)) { + ret = write(channel->write_fd, + message + offset, len - offset); + if (ret <= 0) + break; + offset += ret; + } else + break; + } + + if (offset != len) { + EPRINTF("%s: error writing '%s' message to %d:%d\n", + channel->path, tapdisk_message_name(message->type), + channel->channel_id, channel->cookie); + return -EIO; + } + + switch (message->type) { + case TAPDISK_MESSAGE_PID: + channel->state = TAPDISK_CHANNEL_WAIT_PID; + break; + + case TAPDISK_MESSAGE_OPEN: + channel->state = TAPDISK_CHANNEL_WAIT_OPEN; + break; + + case TAPDISK_MESSAGE_PAUSE: + channel->state = TAPDISK_CHANNEL_WAIT_PAUSE; + break; + + case TAPDISK_MESSAGE_RESUME: + channel->state = TAPDISK_CHANNEL_WAIT_RESUME; + break; + + case TAPDISK_MESSAGE_CLOSE: + channel->state = TAPDISK_CHANNEL_WAIT_CLOSE; + break; + + default: + EPRINTF("%s: unrecognized message type %d\n", + channel->path, message->type); + } + + return 0; +} + +static void +__tapdisk_channel_error(tapdisk_channel_t *channel, + const char *fmt, va_list ap) +{ + int err; + char *dir, *buf, *message; + + err = vasprintf(&buf, fmt, ap); + if (err == -1) { + EPRINTF("failed to allocate error message\n"); + buf = NULL; + } + + if (buf) + message = buf; + else + message = "tapdisk error"; + + 
EPRINTF("%s: %s\n", channel->path, message); + + err = asprintf(&dir, "%s/tapdisk-error", channel->path); + if (err == -1) { + EPRINTF("%s: failed to write %s\n", __func__, message); + dir = NULL; + goto out; + } + + xs_write(channel->xsh, XBT_NULL, dir, message, strlen(message)); + +out: + free(dir); + free(buf); +} + +static void +tapdisk_channel_error(tapdisk_channel_t *channel, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + __tapdisk_channel_error(channel, fmt, ap); + va_end(ap); +} + +static void +tapdisk_channel_fatal(tapdisk_channel_t *channel, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + __tapdisk_channel_error(channel, fmt, ap); + va_end(ap); + + tapdisk_channel_close(channel); +} + +static int +tapdisk_channel_connect_backdev(tapdisk_channel_t *channel) +{ + int err, major, minor; + char *s, *path, *devname; + + s = NULL; + path = NULL; + devname = NULL; + + err = ioctl(channel->blktap_fd, + BLKTAP_IOCTL_BACKDEV_SETUP, channel->minor); + if (err) { + err = -errno; + goto fail; + } + + err = asprintf(&path, "%s/backdev-node", channel->path); + if (err == -1) { + path = NULL; + err = -ENOMEM; + goto fail; + } + + s = xs_read(channel->xsh, XBT_NULL, path, NULL); + if (!s) { + err = -errno; + goto fail; + } + + err = sscanf(s, "%d:%d", &major, &minor); + if (err != 2) { + err = -EINVAL; + goto fail; + } + + err = asprintf(&devname,"%s/%s%d", + BLKTAP_DEV_DIR, BACKDEV_NAME, minor); + if (err == -1) { + devname = NULL; + err = -ENOMEM; + goto fail; + } + + err = make_blktap_device(devname, major, minor, S_IFBLK | 0600); + if (err) + goto fail; + + free(path); + err = asprintf(&path, "%s/backdev-path", channel->path); + if (err == -1) { + path = NULL; + err = -ENOMEM; + goto fail; + } + + err = xs_write(channel->xsh, XBT_NULL, path, devname, strlen(devname)); + if (err == 0) { + err = -errno; + goto fail; + } + + err = 0; + out: + free(devname); + free(path); + free(s); + return err; + + fail: + EPRINTF("backdev setup failed 
[%d]\n", err); + goto out; +} + +static int +tapdisk_channel_complete_connection(tapdisk_channel_t *channel) +{ + int err; + char *path; + + if (!xs_printf(channel->xsh, channel->path, + "sectors", "%llu", channel->image.size)) { + EPRINTF("ERROR: Failed writing sectors"); + return -errno; + } + + if (!xs_printf(channel->xsh, channel->path, + "sector-size", "%lu", channel->image.secsize)) { + EPRINTF("ERROR: Failed writing sector-size"); + return -errno; + } + + if (!xs_printf(channel->xsh, channel->path, + "info", "%u", channel->image.info)) { + EPRINTF("ERROR: Failed writing info"); + return -errno; + } + + err = tapdisk_channel_connect_backdev(channel); + if (err) + goto clean; + + channel->connected = 1; + return 0; + + clean: + if (asprintf(&path, "%s/info", channel->path) == -1) + return err; + + if (!xs_rm(channel->xsh, XBT_NULL, path)) + goto clean_out; + + free(path); + if (asprintf(&path, "%s/sector-size", channel->path) == -1) + return err; + + if (!xs_rm(channel->xsh, XBT_NULL, path)) + goto clean_out; + + free(path); + if (asprintf(&path, "%s/sectors", channel->path) == -1) + return err; + + xs_rm(channel->xsh, XBT_NULL, path); + + clean_out: + free(path); + return err; +} + +static int +tapdisk_channel_send_open_request(tapdisk_channel_t *channel) +{ + int len; + tapdisk_message_t message; + + memset(&message, 0, sizeof(tapdisk_message_t)); + + len = strlen(channel->vdi_path); + + message.type = TAPDISK_MESSAGE_OPEN; + message.cookie = channel->cookie; + message.drivertype = channel->drivertype; + message.u.params.storage = channel->storage; + message.u.params.devnum = channel->minor; + message.u.params.domid = channel->domid; + message.u.params.path_len = len; + strncpy(message.u.params.path, channel->vdi_path, len); + + if (channel->mode == 'r') + message.u.params.flags |= TAPDISK_MESSAGE_FLAG_RDONLY; + if (channel->shared) + message.u.params.flags |= TAPDISK_MESSAGE_FLAG_SHARED; + + /* TODO: clean this up */ + if (xs_exists(channel->xsh, 
"/local/domain/0/tapdisk/add-cache")) + message.u.params.flags |= TAPDISK_MESSAGE_FLAG_ADD_CACHE; + if (xs_exists(channel->xsh, "/local/domain/0/tapdisk/log-dirty")) + message.u.params.flags |= TAPDISK_MESSAGE_FLAG_LOG_DIRTY; + + return tapdisk_channel_send_message(channel, &message, 2); +} + +static int +tapdisk_channel_receive_open_response(tapdisk_channel_t *channel, + tapdisk_message_t *message) +{ + int err; + + channel->image.size = message->u.image.sectors; + channel->image.secsize = message->u.image.sector_size; + channel->image.info = message->u.image.info; + + err = tapdisk_channel_complete_connection(channel); + if (err) + goto fail; + + /* did we receive a pause request before the connection completed? */ + if (channel->pause_needed) { + DPRINTF("%s: deferred pause request\n", channel->path); + tapdisk_channel_pause_event(channel->xsh, + &channel->pause_watch, + channel->pause_str); + channel->pause_needed = 0; + } + + return 0; + +fail: + tapdisk_channel_fatal(channel, + "failure completing connection: %d", err); + return err; +} + +static int +tapdisk_channel_send_shutdown_request(tapdisk_channel_t *channel) +{ + tapdisk_message_t message; + + memset(&message, 0, sizeof(tapdisk_message_t)); + + message.type = TAPDISK_MESSAGE_CLOSE; + message.drivertype = channel->drivertype; + message.cookie = channel->cookie; + + return tapdisk_channel_send_message(channel, &message, 2); +} + +static int +tapdisk_channel_receive_shutdown_response(tapdisk_channel_t *channel, + tapdisk_message_t *message) +{ + channel->open = 0; + channel->state = TAPDISK_CHANNEL_CLOSED; + tapdisk_channel_close(channel); + return 0; +} + +static int +tapdisk_channel_receive_runtime_error(tapdisk_channel_t *channel, + tapdisk_message_t *message) +{ + tapdisk_channel_error(channel, + "runtime error: %s", message->u.string.text); + return 0; +} + +static int +tapdisk_channel_send_pid_request(tapdisk_channel_t *channel) +{ + int err; + tapdisk_message_t message; + + memset(&message, 0, 
sizeof(tapdisk_message_t)); + + message.type = TAPDISK_MESSAGE_PID; + message.drivertype = channel->drivertype; + message.cookie = channel->cookie; + + err = tapdisk_channel_send_message(channel, &message, 2); + + if (!err) + channel->open = 1; + + return err; +} + +static int +tapdisk_channel_receive_pid_response(tapdisk_channel_t *channel, + tapdisk_message_t *message) +{ + int err; + + channel->tapdisk_pid = message->u.tapdisk_pid; + + DPRINTF("%s: tapdisk pid: %d\n", channel->path, channel->tapdisk_pid); + + err = setpriority(PRIO_PROCESS, channel->tapdisk_pid, PRIO_SPECIAL_IO); + if (err) { + tapdisk_channel_fatal(channel, + "setting tapdisk priority: %d", err); + return err; + } + + err = tapdisk_channel_send_open_request(channel); + if (err) { + tapdisk_channel_fatal(channel, + "sending open request: %d", err); + return err; + } + + return 0; +} + +static int +tapdisk_channel_send_pause_request(tapdisk_channel_t *channel) +{ + tapdisk_message_t message; + + memset(&message, 0, sizeof(tapdisk_message_t)); + + DPRINTF("pausing %s\n", channel->path); + + message.type = TAPDISK_MESSAGE_PAUSE; + message.drivertype = channel->drivertype; + message.cookie = channel->cookie; + + return tapdisk_channel_send_message(channel, &message, 2); +} + +static int +tapdisk_channel_receive_pause_response(tapdisk_channel_t *channel, + tapdisk_message_t *message) +{ + int err; + + if (!xs_write(channel->xsh, XBT_NULL, + channel->pause_done_str, "", strlen(""))) { + err = -errno; + goto fail; + } + + return 0; + +fail: + tapdisk_channel_fatal(channel, + "failure receiving pause response: %d\n", err); + return err; +} + +static int +tapdisk_channel_send_resume_request(tapdisk_channel_t *channel) +{ + int len; + tapdisk_message_t message; + + memset(&message, 0, sizeof(tapdisk_message_t)); + + len = strlen(channel->vdi_path); + + DPRINTF("resuming %s\n", channel->path); + + message.type = TAPDISK_MESSAGE_RESUME; + message.drivertype = channel->drivertype; + message.cookie = 
channel->cookie; + message.u.params.path_len = len; + strncpy(message.u.params.path, channel->vdi_path, len); + + return tapdisk_channel_send_message(channel, &message, 2); +} + +static int +tapdisk_channel_receive_resume_response(tapdisk_channel_t *channel, + tapdisk_message_t *message) +{ + int err; + + if (!xs_rm(channel->xsh, XBT_NULL, channel->pause_done_str)) { + err = -errno; + goto fail; + } + + return 0; + +fail: + tapdisk_channel_fatal(channel, + "failure receiving pause response: %d", err); + return err; +} + +static void +tapdisk_channel_shutdown_event(struct xs_handle *xsh, + struct xenbus_watch *watch, const char *path) +{ + int err; + tapdisk_channel_t *channel; + + channel = watch->data; + + DPRINTF("%s: got watch on %s\n", channel->path, path); + + if (!xs_exists(channel->xsh, channel->path)) { + tapdisk_channel_close(channel); + return; + } + + err = tapdisk_channel_validate_watch(channel, path); + if (err) { + if (err == -EINVAL) + tapdisk_channel_fatal(channel, "bad shutdown watch"); + return; + } + + tapdisk_channel_send_shutdown_request(channel); +} + +static void +tapdisk_channel_pause_event(struct xs_handle *xsh, + struct xenbus_watch *watch, const char *path) +{ + int err, paused; + tapdisk_channel_t *channel; + + channel = watch->data; + + DPRINTF("%s: got watch on %s\n", channel->path, path); + + if (!xs_exists(channel->xsh, channel->path)) { + tapdisk_channel_close(channel); + return; + } + + /* NB: The VBD is essentially considered ready since the + * backend hotplug event ocurred, which is just after + * start-tapdisk, not after watch registration. We start + * testing xenstore keys with the very first shot, but defer + * until after connection completion. 
*/ + + err = tapdisk_channel_validate_watch(channel, path); + if (err) { + if (err == -EINVAL) + tapdisk_channel_fatal(channel, "bad pause watch"); + + if (err != -ENOENT) + return; + + err = 0; + } + + paused = xs_exists(xsh, channel->pause_done_str); + + if (xs_exists(xsh, channel->pause_str)) { + /* + * Duplicate requests are a protocol validation, but + * impossible to identify if watch registration and an + * actual pause request may fire separately in close + * succession. Warn, but do not signal an error. + */ + int pausing = channel->state == TAPDISK_CHANNEL_WAIT_PAUSE; + if (pausing || paused) { + DPRINTF("Ignoring pause event for %s vbd %s\n", + pausing ? "pausing" : "paused", channel->path); + goto out; + } + + /* defer if tapdisk is not ready yet */ + if (!channel->connected) { + DPRINTF("%s: deferring pause request\n", path); + channel->pause_needed = 1; + goto out; + } + + err = tapdisk_channel_send_pause_request(channel); + + } else if (xs_exists(xsh, channel->pause_done_str)) { + free(channel->params); + channel->params = NULL; + channel->vdi_path = NULL; + + err = xs_gather(channel->xsh, channel->path, + "params", NULL, &channel->params, NULL); + if (err) { + EPRINTF("failure re-reading params: %d\n", err); + channel->params = NULL; + goto out; + } + + err = tapdisk_channel_parse_params(channel); + if (err) + goto out; + + err = tapdisk_channel_send_resume_request(channel); + if (err) + goto out; + } + + err = 0; + +out: + if (err) + tapdisk_channel_error(channel, "pause event failed: %d", err); +} + +static int +tapdisk_channel_open_control_socket(char *devname) +{ + int err, fd; + fd_set socks; + struct timeval timeout; + + err = mkdir(BLKTAP_CTRL_DIR, 0755); + if (err == -1 && errno != EEXIST) { + EPRINTF("Failure creating %s directory: %d\n", + BLKTAP_CTRL_DIR, errno); + return -errno; + } + + err = mkfifo(devname, S_IRWXU | S_IRWXG | S_IRWXO); + if (err) { + if (errno == EEXIST) { + /* + * Remove fifo since it may have data from + * it's 
previous use --- earlier invocation + * of tapdisk may not have read all messages. + */ + err = unlink(devname); + if (err) { + EPRINTF("ERROR: unlink(%s) failed (%d)\n", + devname, errno); + return -errno; + } + + err = mkfifo(devname, S_IRWXU | S_IRWXG | S_IRWXO); + } + + if (err) { + EPRINTF("ERROR: pipe failed (%d)\n", errno); + return -errno; + } + } + + fd = open(devname, O_RDWR | O_NONBLOCK); + if (fd == -1) { + EPRINTF("Failed to open %s\n", devname); + return -errno; + } + + return fd; +} + +static int +tapdisk_channel_get_device_number(tapdisk_channel_t *channel) +{ + char *devname; + domid_translate_t tr; + int major, minor, err; + + tr.domid = channel->domid; + tr.busid = channel->busid; + + minor = ioctl(channel->blktap_fd, BLKTAP_IOCTL_NEWINTF, tr); + if (minor <= 0 || minor > MAX_TAP_DEV) { + EPRINTF("invalid dev id: %d\n", minor); + return -EINVAL; + } + + major = ioctl(channel->blktap_fd, BLKTAP_IOCTL_MAJOR, minor); + if (major < 0) { + EPRINTF("invalid major id: %d\n", major); + return -EINVAL; + } + + err = asprintf(&devname, "%s/%s%d", + BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor); + if (err == -1) { + EPRINTF("get_new_dev: malloc failed\n"); + return -ENOMEM; + } + + err = make_blktap_device(devname, major, minor, S_IFCHR | 0600); + free(devname); + + if (err) + return err; + + DPRINTF("Received device id %d and major %d, " + "sent domid %d and be_id %d\n", + minor, major, tr.domid, tr.busid); + + channel->major = major; + channel->minor = minor; + + return 0; +} + +static int +tapdisk_channel_start_process(tapdisk_channel_t *channel, + char *write_dev, char *read_dev) +{ + pid_t child; + char *argv[] = { "tapdisk", write_dev, read_dev, NULL }; + + if ((child = fork()) == -1) + return -errno; + + if (!child) { + int i; + for (i = 0 ; i < sysconf(_SC_OPEN_MAX) ; i++) + if (i != STDIN_FILENO && + i != STDOUT_FILENO && + i != STDERR_FILENO) + close(i); + + execvp("tapdisk", argv); + _exit(1); + } else { + pid_t got; + do { + got = waitpid(child, NULL, 
0); + } while (got != child); + } + return 0; +} + +static int +tapdisk_channel_launch_tapdisk(tapdisk_channel_t *channel) +{ + int err; + char *read_dev, *write_dev; + + read_dev = NULL; + write_dev = NULL; + channel->read_fd = -1; + channel->write_fd = -1; + + err = tapdisk_channel_get_device_number(channel); + if (err) + return err; + + err = asprintf(&write_dev, + "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, channel->minor); + if (err == -1) { + err = -ENOMEM; + write_dev = NULL; + goto fail; + } + + err = asprintf(&read_dev, + "%s/tapctrlread%d", BLKTAP_CTRL_DIR, channel->minor); + if (err == -1) { + err = -ENOMEM; + read_dev = NULL; + goto fail; + } + + channel->write_fd = tapdisk_channel_open_control_socket(write_dev); + if (channel->write_fd < 0) { + err = channel->write_fd; + channel->write_fd = -1; + goto fail; + } + + channel->read_fd = tapdisk_channel_open_control_socket(read_dev); + if (channel->read_fd < 0) { + err = channel->read_fd; + channel->read_fd = -1; + goto fail; + } + + err = tapdisk_channel_start_process(channel, write_dev, read_dev); + if (err) + goto fail; + + channel->open = 1; + channel->channel_id = channel->write_fd; + + free(read_dev); + free(write_dev); + + DPRINTF("process launched, channel = %d:%d\n", + channel->channel_id, channel->cookie); + + return tapdisk_channel_send_pid_request(channel); + +fail: + free(read_dev); + free(write_dev); + if (channel->read_fd != -1) + close(channel->read_fd); + if (channel->write_fd != -1) + close(channel->write_fd); + return err; +} + +static int +tapdisk_channel_connect(tapdisk_channel_t *channel) +{ + int err; + + tapdisk_daemon_find_channel(channel); + + if (!channel->tapdisk_pid) + return tapdisk_channel_launch_tapdisk(channel); + + DPRINTF("%s: process exists: %d, channel = %d:%d\n", + channel->path, channel->tapdisk_pid, + channel->channel_id, channel->cookie); + + err = tapdisk_channel_get_device_number(channel); + if (err) + return err; + + return tapdisk_channel_send_pid_request(channel); +} 
+ +static int +tapdisk_channel_init(tapdisk_channel_t *channel) +{ + int err; + + channel->uuid_str = NULL; + channel->pause_str = NULL; + channel->pause_done_str = NULL; + channel->shutdown_str = NULL; + channel->share_tapdisk_str = NULL; + + err = asprintf(&channel->uuid_str, + "%s/tapdisk-uuid", channel->path); + if (err == -1) { + channel->uuid_str = NULL; + goto fail; + } + + err = asprintf(&channel->pause_str, "%s/pause", channel->path); + if (err == -1) { + channel->pause_str = NULL; + goto fail; + } + + err = asprintf(&channel->pause_done_str, + "%s/pause-done", channel->path); + if (err == -1) { + channel->pause_done_str = NULL; + goto fail; + } + + err = asprintf(&channel->shutdown_str, + "%s/shutdown-tapdisk", channel->path); + if (err == -1) { + channel->shutdown_str = NULL; + goto fail; + } + + channel->share_tapdisk_str = "/local/domain/0/tapdisk/share-tapdisks"; + + return 0; + +fail: + free(channel->uuid_str); + free(channel->pause_str); + free(channel->pause_done_str); + free(channel->shutdown_str); + channel->uuid_str = NULL; + channel->pause_str = NULL; + channel->pause_done_str = NULL; + channel->shutdown_str = NULL; + channel->share_tapdisk_str = NULL; + return -ENOMEM; +} + +static int +tapdisk_channel_set_watches(tapdisk_channel_t *channel) +{ + int err; + + /* watch for pause events */ + channel->pause_watch.node = channel->pause_str; + channel->pause_watch.callback = tapdisk_channel_pause_event; + channel->pause_watch.data = channel; + err = register_xenbus_watch(channel->xsh, &channel->pause_watch); + if (err) { + channel->pause_watch.node = NULL; + goto fail; + } + + /* watch for shutdown events */ + channel->shutdown_watch.node = channel->shutdown_str; + channel->shutdown_watch.callback = tapdisk_channel_shutdown_event; + channel->shutdown_watch.data = channel; + err = register_xenbus_watch(channel->xsh, &channel->shutdown_watch); + if (err) { + channel->shutdown_watch.node = NULL; + goto fail; + } + + return 0; + +fail: + if 
(channel->pause_watch.node) { + unregister_xenbus_watch(channel->xsh, &channel->pause_watch); + channel->pause_watch.node = NULL; + } + if (channel->shutdown_watch.node) { + unregister_xenbus_watch(channel->xsh, &channel->shutdown_watch); + channel->shutdown_watch.node = NULL; + } + return err; +} + +static void +tapdisk_channel_get_storage_type(tapdisk_channel_t *channel) +{ + int err, type; + unsigned int len; + char *path, *stype; + + channel->storage = TAPDISK_STORAGE_TYPE_DEFAULT; + + err = asprintf(&path, "%s/sm-data/storage-type", channel->path); + if (err == -1) + return; + + stype = xs_read(channel->xsh, XBT_NULL, path, &len); + if (!stype) + goto out; + else if (!strcmp(stype, "nfs")) + channel->storage = TAPDISK_STORAGE_TYPE_NFS; + else if (!strcmp(stype, "ext")) + channel->storage = TAPDISK_STORAGE_TYPE_EXT; + else if (!strcmp(stype, "lvm")) + channel->storage = TAPDISK_STORAGE_TYPE_LVM; + +out: + free(path); + free(stype); +} + +static int +tapdisk_channel_get_busid(tapdisk_channel_t *channel) +{ + int len, end; + const char *ptr; + char *tptr, num[10]; + + len = strsep_len(channel->path, '/', 6); + end = strlen(channel->path); + if(len < 0 || end < 0) { + EPRINTF("invalid path: %s\n", channel->path); + return -EINVAL; + } + + ptr = channel->path + len + 1; + strncpy(num, ptr, end - len); + tptr = num + (end - (len + 1)); + *tptr = '\0'; + + channel->busid = atoi(num); + return 0; +} + +static int +tapdisk_channel_parse_params(tapdisk_channel_t *channel) +{ + int i, size, err; + unsigned int len; + char *ptr, *path, handle[10]; + char *vdi_type; + char *vtype; + + path = channel->params; + size = sizeof(dtypes) / sizeof(disk_info_t *); + + if (strlen(path) + 1 >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) + goto fail; + + ptr = strchr(path, ':'); + if (!ptr) + goto fail; + + channel->vdi_path = ptr + 1; + memcpy(handle, path, (ptr - path)); + ptr = handle + (ptr - path); + *ptr = '\0'; + + err = asprintf(&vdi_type, "%s/sm-data/vdi-type", channel->path); + if 
(err == -1) + goto fail; + + if (xs_exists(channel->xsh, vdi_type)) { + vtype = xs_read(channel->xsh, XBT_NULL, vdi_type, &len); + free(vdi_type); + if (!vtype) + goto fail; + if (len >= sizeof(handle) - 1) { + free(vtype); + goto fail; + } + sprintf(handle, "%s", vtype); + free(vtype); + } + + for (i = 0; i < size; i++) { + if (strncmp(handle, dtypes[i]->handle, (ptr - path))) + continue; + + if (dtypes[i]->idnum == -1) + goto fail; + + channel->drivertype = dtypes[i]->idnum; + return 0; + } + +fail: + EPRINTF("%s: invalid blktap params: %s\n", + channel->path, channel->params); + channel->vdi_path = NULL; + return -EINVAL; +} + +static int +tapdisk_channel_gather_info(tapdisk_channel_t *channel) +{ + int err; + + err = xs_gather(channel->xsh, channel->path, + "frontend", NULL, &channel->frontpath, + "frontend-id", "%li", &channel->domid, + "params", NULL, &channel->params, + "mode", "%c", &channel->mode, NULL); + if (err) { + EPRINTF("could not find device info: %d\n", err); + return err; + } + + err = tapdisk_channel_parse_params(channel); + if (err) + return err; + + err = tapdisk_channel_get_busid(channel); + if (err) + return err; + + tapdisk_channel_get_storage_type(channel); + + return 0; +} + +static int +tapdisk_channel_verify_start_request(tapdisk_channel_t *channel) +{ + char *path; + unsigned int err; + + err = asprintf(&path, "%s/start-tapdisk", channel->path); + if (err == -1) + goto mem_fail; + + if (!xs_exists(channel->xsh, path)) + goto fail; + + free(path); + err = asprintf(&path, "%s/shutdown-request", channel->path); + if (err == -1) + goto mem_fail; + + if (xs_exists(channel->xsh, path)) + goto fail; + + if (xs_exists(channel->xsh, channel->shutdown_str)) + goto fail; + + free(path); + err = asprintf(&path, "%s/shutdown-done", channel->path); + if (err == -1) + goto mem_fail; + + if (xs_exists(channel->xsh, path)) + goto fail; + + free(path); + + return 0; + +fail: + free(path); + EPRINTF("%s:%s: invalid start request\n", __func__, 
channel->path); + return -EINVAL; + +mem_fail: + EPRINTF("%s:%s: out of memory\n", __func__, channel->path); + return -ENOMEM; +} + +void +tapdisk_channel_close(tapdisk_channel_t *channel) +{ + if (channel->channel_id) + DPRINTF("%s: closing channel %d:%d\n", + channel->path, channel->channel_id, channel->cookie); + + if (channel->open) + tapdisk_channel_send_shutdown_request(channel); + + if (channel->pause_watch.node) { + unregister_xenbus_watch(channel->xsh, &channel->pause_watch); + channel->pause_watch.node = NULL; + } + + if (channel->shutdown_watch.node) { + unregister_xenbus_watch(channel->xsh, &channel->shutdown_watch); + channel->shutdown_watch.node = NULL; + } + + tapdisk_daemon_close_channel(channel); + + free(channel->params); + free(channel->frontpath); + free(channel->shutdown_str); + free(channel->pause_done_str); + free(channel->pause_str); + free(channel->uuid_str); + free(channel->path); + free(channel); +} + +int +tapdisk_channel_open(tapdisk_channel_t **_channel, + char *path, struct xs_handle *xsh, + int blktap_fd, uint16_t cookie) +{ + int err; + char *msg; + tapdisk_channel_t *channel; + + msg = NULL; + *_channel = NULL; + + channel = calloc(1, sizeof(tapdisk_channel_t)); + if (!channel) + return -ENOMEM; + + channel->xsh = xsh; + channel->blktap_fd = blktap_fd; + channel->cookie = cookie; + channel->state = TAPDISK_CHANNEL_IDLE; + + INIT_LIST_HEAD(&channel->list); + + channel->path = strdup(path); + if (!channel->path) { + err = -ENOMEM; + goto fail; + } + + err = tapdisk_channel_init(channel); + if (err) { + msg = "allocating device"; + goto fail; + } + + err = tapdisk_channel_check_uuid(channel); + if (err) { + msg = "checking uuid"; + goto fail; + } + + err = tapdisk_channel_gather_info(channel); + if (err) { + msg = "gathering parameters"; + goto fail; + } + + err = tapdisk_channel_verify_start_request(channel); + if (err) { + msg = "invalid start request"; + goto fail; + } + + err = tapdisk_channel_set_watches(channel); + if (err) { + 
msg = "registering xenstore watches"; + goto fail; + } + + err = tapdisk_channel_connect(channel); + if (err) { + msg = "connecting to tapdisk"; + goto fail; + } + + *_channel = channel; + return 0; + +fail: + tapdisk_channel_fatal(channel, "%s: %d", (msg ? : "failure"), err); + return err; +} + +int +tapdisk_channel_receive_message(tapdisk_channel_t *c, tapdisk_message_t *m) +{ + int err; + + err = tapdisk_channel_validate_message(c, m); + if (err) + goto fail; + + switch (m->type) { + case TAPDISK_MESSAGE_PID_RSP: + return tapdisk_channel_receive_pid_response(c, m); + + case TAPDISK_MESSAGE_OPEN_RSP: + return tapdisk_channel_receive_open_response(c, m); + + case TAPDISK_MESSAGE_PAUSE_RSP: + return tapdisk_channel_receive_pause_response(c, m); + + case TAPDISK_MESSAGE_RESUME_RSP: + return tapdisk_channel_receive_resume_response(c, m); + + case TAPDISK_MESSAGE_CLOSE_RSP: + return tapdisk_channel_receive_shutdown_response(c, m); + + case TAPDISK_MESSAGE_RUNTIME_ERROR: + return tapdisk_channel_receive_runtime_error(c, m); + } + +fail: + tapdisk_channel_fatal(c, "received unexpected message %s in state %d", + tapdisk_message_name(m->type), c->state); + return -EINVAL; +} diff --git a/tools/blktap2/daemon/tapdisk-daemon.c b/tools/blktap2/daemon/tapdisk-daemon.c new file mode 100644 index 0000000000..ecfc0f3c5b --- /dev/null +++ b/tools/blktap2/daemon/tapdisk-daemon.c @@ -0,0 +1,599 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include <xs.h> +#include "disktypes.h" +#include "tapdisk-dispatch.h" + +#define TAPDISK_DAEMON_DOMID_WATCH "domid-watch" +#define TAPDISK_DAEMON_PIDFILE "/var/run/blktapctrl.pid" + +typedef struct tapdisk_daemon { + char *node; + int blktap_fd; + uint16_t cookie; + + struct xs_handle *xsh; + struct list_head channels; + struct xenbus_watch watch; +} tapdisk_daemon_t; + +static tapdisk_daemon_t tapdisk_daemon; + +#define tapdisk_daemon_for_each_channel(c, tmp) \ + list_for_each_entry_safe(c, tmp, &tapdisk_daemon.channels, list) + +#define MAX(a, b) ((a) >= (b) ? 
(a) : (b)) + +static void +tapdisk_daemon_print_drivers(void) +{ + int i, size; + + DPRINTF("blktap-daemon: v1.0.2\n"); + + size = sizeof(dtypes) / sizeof(disk_info_t *); + for (i = 0; i < size; i++) + DPRINTF("Found driver: [%s]\n", dtypes[i]->name); +} + +static int +tapdisk_daemon_write_pidfile(long pid) +{ + char buf[100]; + int len, fd, flags, err; + + fd = open(TAPDISK_DAEMON_PIDFILE, O_RDWR | O_CREAT, 0600); + if (fd == -1) { + EPRINTF("Opening pid file failed (%d)\n", errno); + return -errno; + } + + /* We exit silently if daemon already running */ + err = lockf(fd, F_TLOCK, 0); + if (err == -1) + exit(0); + + /* Set FD_CLOEXEC, so that tapdisk doesn't get this file descriptor */ + flags = fcntl(fd, F_GETFD); + if (flags == -1) { + EPRINTF("F_GETFD failed (%d)\n", errno); + return -errno; + } + + flags |= FD_CLOEXEC; + err = fcntl(fd, F_SETFD, flags); + if (err == -1) { + EPRINTF("F_SETFD failed (%d)\n", errno); + return -errno; + } + + len = sprintf(buf, "%ld\n", pid); + err = write(fd, buf, len); + if (err != len) { + EPRINTF("Writing pid file failed (%d)\n", errno); + return -errno; + } + + return 0; +} + +static int +tapdisk_daemon_init(void) +{ + char *devname; + int i, err, blktap_major; + + memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t)); + + err = asprintf(&devname, "%s/%s0", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME); + if (err == -1) { + devname = NULL; + err = -ENOMEM; + goto fail; + } + + err = xc_find_device_number("blktap0"); + if (err < 0) + goto fail; + + blktap_major = major(err); + err = make_blktap_device(devname, blktap_major, 0, S_IFCHR | 0600); + if (err) + goto fail; + + tapdisk_daemon.blktap_fd = open(devname, O_RDWR); + if (tapdisk_daemon.blktap_fd == -1) { + err = -errno; + EPRINTF("blktap0 open failed\n"); + goto fail; + } + + for (i = 0; i < 2; i++) { + tapdisk_daemon.xsh = xs_daemon_open(); + if (!tapdisk_daemon.xsh) { + EPRINTF("xs_daemon_open failed -- is xenstore running?\n"); + sleep(2); + } else + break; + } + + if 
(!tapdisk_daemon.xsh) { + err = -ENOSYS; + goto fail; + } + + INIT_LIST_HEAD(&tapdisk_daemon.channels); + + free(devname); + return 0; + +fail: + if (tapdisk_daemon.blktap_fd > 0) + close(tapdisk_daemon.blktap_fd); + free(devname); + memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t)); + EPRINTF("%s: %d\n", __func__, err); + + return err; +} + +static int +tapdisk_daemon_set_node(void) +{ + int err; + char *domid; + + domid = get_dom_domid(tapdisk_daemon.xsh); + if (!domid) + return -EAGAIN; + + err = asprintf(&tapdisk_daemon.node, + "/local/domain/%s/backend/tap", domid); + if (err == -1) { + tapdisk_daemon.node = NULL; + err = -ENOMEM; + goto out; + } + + err = 0; + +out: + free(domid); + return err; +} + +static int +tapdisk_daemon_get_domid(void) +{ + int err; + unsigned int num; + char **res, *node, *token, *domid; + + res = xs_read_watch(tapdisk_daemon.xsh, &num); + if (!res) + return -EAGAIN; + + err = 0; + node = res[XS_WATCH_PATH]; + token = res[XS_WATCH_TOKEN]; + + if (strcmp(token, TAPDISK_DAEMON_DOMID_WATCH)) { + err = -EINVAL; + goto out; + } + + err = tapdisk_daemon_set_node(); + +out: + free(res); + return err; +} + +static int +tapdisk_daemon_wait_for_domid(void) +{ + int err; + char *domid; + fd_set readfds; + + err = tapdisk_daemon_set_node(); + if (!err) + return 0; + + if (!xs_watch(tapdisk_daemon.xsh, "/local/domain", + TAPDISK_DAEMON_DOMID_WATCH)) { + EPRINTF("unable to set domain id watch\n"); + return -EINVAL; + } + + do { + FD_ZERO(&readfds); + FD_SET(xs_fileno(tapdisk_daemon.xsh), &readfds); + + select(xs_fileno(tapdisk_daemon.xsh) + 1, + &readfds, NULL, NULL, NULL); + + if (FD_ISSET(xs_fileno(tapdisk_daemon.xsh), &readfds)) + err = tapdisk_daemon_get_domid(); + else + err = -EAGAIN; + } while (err == -EAGAIN); + + xs_unwatch(tapdisk_daemon.xsh, + "/local/domain", TAPDISK_DAEMON_DOMID_WATCH); + return err; +} + +static inline int +tapdisk_daemon_new_vbd_event(const char *node) +{ + return (!strcmp(node, "start-tapdisk")); +} + +static int 
+tapdisk_daemon_write_uuid(char *path, uint32_t uuid) +{ + int err; + char *cpath, uuid_str[12]; + + snprintf(uuid_str, sizeof(uuid_str), "%u", uuid); + + err = asprintf(&cpath, "%s/tapdisk-uuid", path); + if (err == -1) + return -ENOMEM; + + err = xs_write(tapdisk_daemon.xsh, XBT_NULL, + cpath, uuid_str, strlen(uuid_str)); + free(cpath); + + return (err ? 0 : -errno); +} + +static void +tapdisk_daemon_probe(struct xs_handle *xsh, + struct xenbus_watch *watch, const char *path) +{ + char *cpath; + int len, err; + uint32_t cookie; + const char *node; + tapdisk_channel_t *channel; + + len = strsep_len(path, '/', 7); + if (len < 0) + return; + + node = path + len + 1; + + if (!tapdisk_daemon_new_vbd_event(node)) + return; + + if (!xs_exists(xsh, path)) + return; + + cpath = strdup(path); + if (!cpath) { + EPRINTF("failed to allocate control path for %s\n", path); + return; + } + cpath[len] = '\0'; + + cookie = tapdisk_daemon.cookie++; + err = tapdisk_daemon_write_uuid(cpath, cookie); + if (err) + goto out; + + DPRINTF("%s: got watch on %s, uuid = %u\n", __func__, path, cookie); + + err = tapdisk_channel_open(&channel, cpath, + tapdisk_daemon.xsh, + tapdisk_daemon.blktap_fd, + cookie); + if (!err) + list_add(&channel->list, &tapdisk_daemon.channels); + else + EPRINTF("failed to open tapdisk channel for %s: %d\n", + path, err); + +out: + free(cpath); +} + +static int +tapdisk_daemon_start(void) +{ + int err; + + err = tapdisk_daemon_wait_for_domid(); + if (err) + return err; + + tapdisk_daemon.watch.node = tapdisk_daemon.node; + tapdisk_daemon.watch.callback = tapdisk_daemon_probe; + + err = register_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch); + if (err) + goto fail; + + ioctl(tapdisk_daemon.blktap_fd, + BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE); + ioctl(tapdisk_daemon.blktap_fd, BLKTAP_IOCTL_SENDPID, getpid()); + + return 0; + +fail: + free(tapdisk_daemon.node); + tapdisk_daemon.node = NULL; + tapdisk_daemon.watch.node = NULL; + EPRINTF("%s: %d\n", 
__func__, err); + return err; +} + +static int +tapdisk_daemon_stop(void) +{ + unregister_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch); + + ioctl(tapdisk_daemon.blktap_fd, + BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH); + close(tapdisk_daemon.blktap_fd); + + return 0; +} + +static void +tapdisk_daemon_free(void) +{ + free(tapdisk_daemon.node); + xs_daemon_close(tapdisk_daemon.xsh); + memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t)); +} + +static int +tapdisk_daemon_read_message(int fd, tapdisk_message_t *message, int timeout) +{ + fd_set readfds; + struct timeval tv; + int ret, len, offset; + + tv.tv_sec = timeout; + tv.tv_usec = 0; + offset = 0; + len = sizeof(tapdisk_message_t); + + memset(message, 0, sizeof(tapdisk_message_t)); + + while (offset < len) { + FD_ZERO(&readfds); + FD_SET(fd, &readfds); + + /* we don't bother reinitializing tv. at worst, it will wait a + * bit more time than expected. */ + + ret = select(fd + 1, &readfds, NULL, NULL, &tv); + if (ret == -1) + break; + else if (FD_ISSET(fd, &readfds)) { + ret = read(fd, message + offset, len - offset); + if (ret <= 0) + break; + offset += ret; + } else + break; + } + + return (offset == len ? 
0 : -EIO); +} + +static int +tapdisk_daemon_receive_message(int fd) +{ + int err; + tapdisk_message_t m; + tapdisk_channel_t *c, *tmp; + + err = tapdisk_daemon_read_message(fd, &m, 2); + if (err) { + EPRINTF("failed reading message on %d: %d\n", fd, err); + return err; + } + + tapdisk_daemon_for_each_channel(c, tmp) + if (c->cookie == m.cookie && c->read_fd == fd) { + DPRINTF("got '%s' message from %d:%d\n", + tapdisk_message_name(m.type), + c->channel_id, c->cookie); + + return tapdisk_channel_receive_message(c, &m); + } + + EPRINTF("unrecognized message on %d: '%s' (uuid = %u)\n", + fd, tapdisk_message_name(m.type), m.cookie); + + return -EINVAL; +} + +static int +tapdisk_daemon_set_fds(fd_set *readfds) +{ + int max, fd; + tapdisk_channel_t *channel, *tmp; + + max = xs_fileno(tapdisk_daemon.xsh); + + FD_ZERO(readfds); + FD_SET(max, readfds); + + tapdisk_daemon_for_each_channel(channel, tmp) { + fd = channel->read_fd; + max = MAX(fd, max); + FD_SET(fd, readfds); + } + + return max; +} + +static int +tapdisk_daemon_check_fds(fd_set *readfds) +{ + int err; + tapdisk_channel_t *channel, *tmp; + + if (FD_ISSET(xs_fileno(tapdisk_daemon.xsh), readfds)) + xs_fire_next_watch(tapdisk_daemon.xsh); + + tapdisk_daemon_for_each_channel(channel, tmp) + if (FD_ISSET(channel->read_fd, readfds)) + return tapdisk_daemon_receive_message(channel->read_fd); + + return 0; +} + +static int +tapdisk_daemon_run(void) +{ + int err, max; + fd_set readfds; + + while (1) { + max = tapdisk_daemon_set_fds(&readfds); + + err = select(max + 1, &readfds, NULL, NULL, NULL); + if (err < 0) + continue; + + err = tapdisk_daemon_check_fds(&readfds); + } + + return err; +} + +void +tapdisk_daemon_find_channel(tapdisk_channel_t *channel) +{ + tapdisk_channel_t *c, *tmp; + + channel->read_fd = 0; + channel->write_fd = 0; + channel->tapdisk_pid = 0; + + /* do we want multiple vbds per tapdisk? 
*/ + if (!xs_exists(tapdisk_daemon.xsh, channel->share_tapdisk_str)) { + channel->shared = 0; + return; + } + + channel->shared = 1; + + /* check if we already have a process started */ + tapdisk_daemon_for_each_channel(c, tmp) + if (c->drivertype == channel->drivertype) { + channel->write_fd = c->write_fd; + channel->read_fd = c->read_fd; + channel->channel_id = c->channel_id; + channel->tapdisk_pid = c->tapdisk_pid; + return; + } +} + +void +tapdisk_daemon_close_channel(tapdisk_channel_t *channel) +{ + tapdisk_channel_t *c, *tmp; + + list_del(&channel->list); + + tapdisk_daemon_for_each_channel(c, tmp) + if (c->channel_id == channel->channel_id) + return; + + close(channel->read_fd); + close(channel->write_fd); +} + +int +main(int argc, char *argv[]) +{ + int err; + char buf[128]; + + if (daemon(0, 0)) { + EPRINTF("daemon() failed (%d)\n", errno); + return -errno; + } + +#define CORE_DUMP +#if defined(CORE_DUMP) +#include <sys/resource.h> + { + /* set up core-dumps*/ + struct rlimit rlim; + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_CORE, &rlim) < 0) + EPRINTF("setrlimit failed: %d\n", errno); + } +#endif + + snprintf(buf, sizeof(buf), "BLKTAP-DAEMON[%d]", getpid()); + openlog(buf, LOG_CONS | LOG_ODELAY, LOG_DAEMON); + + err = tapdisk_daemon_write_pidfile(getpid()); + if (err) + goto out; + + tapdisk_daemon_print_drivers(); + + err = tapdisk_daemon_init(); + if (err) + goto out; + + err = tapdisk_daemon_start(); + if (err) + goto out; + + tapdisk_daemon_run(); + + tapdisk_daemon_stop(); + tapdisk_daemon_free(); + + err = 0; + +out: + if (err) + EPRINTF("failed to start %s: %d\n", argv[0], err); + closelog(); + return err; +} diff --git a/tools/blktap2/daemon/tapdisk-dispatch-common.c b/tools/blktap2/daemon/tapdisk-dispatch-common.c new file mode 100644 index 0000000000..3d72b7dc7a --- /dev/null +++ b/tools/blktap2/daemon/tapdisk-dispatch-common.c @@ -0,0 +1,94 @@ +/* + * (c) 2005 Andrew Warfield and Julian Chesterfield 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include "tapdisk-dispatch.h" + +int +strsep_len(const char *str, char c, unsigned int len) +{ + unsigned int i; + + for (i = 0; str[i]; i++) + if (str[i] == c) { + if (len == 0) + return i; + len--; + } + + return (len == 0) ? 
i : -ERANGE; +} + +int +make_blktap_device(char *devname, int major, int minor, int perm) +{ + int err; + + err = unlink(devname); + if (err && errno != ENOENT) { + EPRINTF("unlink %s failed: %d\n", devname, errno); + return -errno; + } + + /* Need to create device */ + err = mkdir(BLKTAP_DEV_DIR, 0755); + if (err && errno != EEXIST) { + EPRINTF("Failed to create %s directory\n", BLKTAP_DEV_DIR); + return -errno; + } + + err = mknod(devname, perm, makedev(major, minor)); + if (err) { + int ret = -errno; + struct stat st; + + EPRINTF("mknod %s failed: %d\n", devname, -errno); + + err = lstat(devname, &st); + if (err) { + DPRINTF("lstat %s failed: %d\n", devname, -errno); + err = access(devname, F_OK); + if (err) + DPRINTF("access %s failed: %d\n", devname, -errno); + else + DPRINTF("access %s succeeded\n", devname); + } else + DPRINTF("lstat %s: %u:%u\n", devname, + (unsigned int)st.st_rdev >> 8, + (unsigned int)st.st_rdev & 0xff); + + return ret; + } + + DPRINTF("Created %s device\n", devname); + return 0; +} diff --git a/tools/blktap2/daemon/tapdisk-dispatch.h b/tools/blktap2/daemon/tapdisk-dispatch.h new file mode 100644 index 0000000000..bcd1e9dc9e --- /dev/null +++ b/tools/blktap2/daemon/tapdisk-dispatch.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_DISPATCH_H_
+#define _TAPDISK_DISPATCH_H_
+
+#include "xs_api.h"
+#include "blktaplib.h"
+#include "tapdisk-message.h"
+
+/*
+ * State kept for one tapdisk channel.  Channels are chained through
+ * ->list; several channels may carry the same channel_id, in which case
+ * they also carry the same read_fd/write_fd/tapdisk_pid values.
+ */
+struct tapdisk_channel {
+	int state;
+
+	/*
+	 * read_fd/write_fd are closed by tapdisk_daemon_close_channel()
+	 * once no other listed channel has the same channel_id.
+	 */
+	int read_fd;
+	int write_fd;
+	int blktap_fd;
+	int channel_id;
+
+	char mode;
+	char shared;		/* 0 or 1, from xs_exists() on share_tapdisk_str */
+	char open;
+	unsigned int domid;
+	unsigned int busid;
+	unsigned int major;
+	unsigned int minor;
+	unsigned int storage;
+	unsigned int drivertype;	/* channels are matched on this when sharing */
+	uint16_t cookie;
+	pid_t tapdisk_pid;
+
+	/*
+	 * special accounting needed to handle pause
+	 * requests received before tapdisk process is ready
+	 */
+	char connected;
+	char pause_needed;
+
+	/* presumably xenstore paths/values for this device -- TODO confirm */
+	char *path;
+	char *frontpath;
+	char *params;
+	char *vdi_path;
+	char *uuid_str;
+	char *pause_str;
+	char *pause_done_str;
+	char *shutdown_str;
+	char *share_tapdisk_str;	/* node tested with xs_exists() to decide ->shared */
+
+	image_t image;
+
+	struct list_head list;	/* channel-list linkage; removed with list_del() on close */
+	struct xenbus_watch pause_watch;
+	struct xenbus_watch shutdown_watch;
+
+	struct xs_handle *xsh;
+};
+
+typedef struct tapdisk_channel tapdisk_channel_t;
+
+int strsep_len(const char *str,
char c, unsigned int len);
+int make_blktap_device(char *devname, int major, int minor, int perm);
+
+int tapdisk_channel_open(tapdisk_channel_t **,
+			 char *node, struct xs_handle *,
+			 int blktap_fd, uint16_t cookie);
+void tapdisk_channel_close(tapdisk_channel_t *);
+
+void tapdisk_daemon_find_channel(tapdisk_channel_t *);
+void tapdisk_daemon_close_channel(tapdisk_channel_t *);
+
+int tapdisk_channel_receive_message(tapdisk_channel_t *, tapdisk_message_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/Makefile b/tools/blktap2/drivers/Makefile
new file mode 100644
index 0000000000..90cd6beca9
--- /dev/null
+++ b/tools/blktap2/drivers/Makefile
@@ -0,0 +1,105 @@
+XEN_ROOT=../../../
+BLKTAP_ROOT= ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+LIBVHDDIR = $(BLKTAP_ROOT)/vhd/lib
+# Defined before its first use in CFLAGS so the include path is correct
+# even if CFLAGS is a simply-expanded variable.
+LIBAIO_DIR = $(XEN_ROOT)/tools/libaio/src
+
+IBIN = tapdisk tapdisk2 td-util tapdisk-client tapdisk-stream tapdisk-diff
+QCOW_UTIL = img2qcow qcow-create qcow2raw
+LOCK_UTIL = lock-util
+INST_DIR = $(SBINDIR)
+
+CFLAGS += -Werror -g -O0
+CFLAGS += -Wno-unused
+CFLAGS += -fno-strict-aliasing
+CFLAGS += -I../lib -I../../libxc
+CFLAGS += -I../include -I../../include
+CFLAGS += -I $(LIBAIO_DIR)
+CFLAGS += -D_GNU_SOURCE
+CFLAGS += -DUSE_NFS_LOCKS
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS += -fPIC
+endif
+
+LIBS += -lrt -lz
+
+# Use libgcrypt when the check_gcrypt probe reports it; otherwise link libcrypto.
+ifeq ($(shell . ./check_gcrypt $(CC)),yes)
+CFLAGS += -DUSE_GCRYPT
+CRYPT_LIB += -lgcrypt
+else
+CRYPT_LIB += -lcrypto
+$(warning === libgcrypt not installed: falling back to libcrypto ===)
+endif
+
+LDFLAGS_img := $(CRYPT_LIB) -lpthread -lz
+
+tapdisk tapdisk2 td-util tapdisk-stream tapdisk-diff $(QCOW_UTIL): LIBS += -L$(LIBVHDDIR) -lvhd -luuid
+
+tapdisk tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := $(LIBAIO_DIR)/libaio.a
+tapdisk tapdisk-client tapdisk-stream tapdisk-diff $(QCOW_UTIL): CFLAGS += -I$(LIBAIO_DIR) -I$(XEN_LIBXC)
+
+ifeq ($(VHD_STATIC),y)
+td-util: CFLAGS += -static
+endif
+
+TAP-OBJS-y := scheduler.o
+TAP-OBJS-y += tapdisk-ipc.o
+TAP-OBJS-y += tapdisk-vbd.o
+TAP-OBJS-y += tapdisk-image.o
+TAP-OBJS-y += tapdisk-driver.o
+TAP-OBJS-y += tapdisk-interface.o
+TAP-OBJS-y += tapdisk-server.o
+TAP-OBJS-y += tapdisk-queue.o
+TAP-OBJS-y += tapdisk-filter.o
+TAP-OBJS-y += tapdisk-log.o
+TAP-OBJS-y += tapdisk-utils.o
+TAP-OBJS-y += io-optimize.o
+TAP-OBJS-y += lock.o
+TAP-OBJS-$(CONFIG_Linux) += blk_linux.o
+
+MISC-OBJS-y := atomicio.o
+
+BLK-OBJS-y := block-aio.o
+BLK-OBJS-y += block-ram.o
+BLK-OBJS-y += block-cache.o
+BLK-OBJS-y += block-vhd.o
+BLK-OBJS-y += block-log.o
+BLK-OBJS-y += block-qcow.o
+BLK-OBJS-y += aes.o
+
+all: $(IBIN) lock-util qcow-util
+
+tapdisk: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+tapdisk2: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk2.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+tapdisk-client: tapdisk-client.o
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(LDFLAGS_img)
+
+tapdisk-stream tapdisk-diff: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+td-util: td.o tapdisk-utils.o tapdisk-log.o
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(LDFLAGS_img)
+
+lock-util: lock.c
+	$(CC) $(CFLAGS) -DUTIL -o lock-util lock.c $(LIBS)
+
+.PHONY: qcow-util
+qcow-util: img2qcow qcow2raw qcow-create
+
+img2qcow qcow2raw qcow-create: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+install: all
+	$(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+	$(INSTALL_PROG) $(IBIN) $(LOCK_UTIL) $(QCOW_UTIL) $(DESTDIR)$(INST_DIR)
+
+clean:
+	rm -rf *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL) $(QCOW_UTIL)
+
+# "all" is declared phony as well, so a stray file named "all" cannot mask the default target.
+.PHONY: all clean install
diff --git a/tools/blktap2/drivers/aes.c b/tools/blktap2/drivers/aes.c
new file mode 100644
index 0000000000..ea81ae53bb
--- /dev/null
+++ b/tools/blktap2/drivers/aes.c
@@ -0,0 +1,1319 @@
+/**
+ *
+ * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project.
+ */
+/*
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+//#include "vl.h"
+#include <inttypes.h>
+#include <string.h>
+#include "aes.h"
+
+//#define NDEBUG
+#include <assert.h>
+
+typedef uint32_t u32; /* unsigned 32-bit word; element type of the lookup tables below */
+typedef uint16_t u16; /* unsigned 16-bit value */
+typedef uint8_t u8; /* unsigned 8-bit byte */
+
+#define MAXKC (256/32) /* = 8; presumably the largest key size counted in 32-bit words -- TODO confirm */
+#define MAXKB (256/8) /* = 32; presumably the largest key size counted in bytes -- TODO confirm */
+#define MAXNR 14 /* presumably the largest number of cipher rounds -- TODO confirm */
+
+/* FULL_UNROLL is forced off here; per the original note it controls loop-unrolling in aes_core.c (presumably the OpenSSL file this code was taken from) */
+#undef FULL_UNROLL
+# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3])) /* combine pt[0..3] into one u32, pt[0] in the top 8 bits */
+# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); } /* store st into ct[0..3], most significant byte first */
+
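+/*
+ * Layout of the lookup tables that follow.  S is the AES S-box and Si
+ * its inverse.  S[x].[a, b, c, d] denotes the 32-bit word whose bytes,
+ * from most to least significant, are S[x] multiplied in GF(2^8) by
+ * a, b, c and d; e.g. Te0[0] = 0xc66363a5 for S[0] = 0x63.
+ *
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+Te4[x] = S [x].[01, 01, 01, 01];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01, 01, 01, 01];
+*/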
+
+static const u32 Te0[256] = {
+ 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+ 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+ 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+ 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+ 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+ 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+ 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+ 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+ 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+ 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+ 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+ 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+ 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+ 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+ 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+ 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+ 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+ 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+ 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+ 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+ 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+ 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+ 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+ 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+ 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+ 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+ 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+ 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+ 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+ 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+ 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+ 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+ 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+ 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+ 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+ 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+ 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+ 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+ 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+ 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+ 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+ 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+ 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+ 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+ 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+ 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+ 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+ 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+ 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+ 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+ 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+ 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+ 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+ 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+ 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+ 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+ 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+ 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+ 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+ 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+ 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+ 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+ 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+ 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
+};
+static const u32 Te1[256] = {
+ 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+ 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+ 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+ 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+ 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+ 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+ 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+ 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+ 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+ 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+ 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+ 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+ 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+ 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+ 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+ 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+ 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+ 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+ 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+ 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+ 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+ 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+ 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+ 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+ 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+ 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+ 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+ 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+ 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+ 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+ 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+ 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+ 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+ 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+ 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+ 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+ 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+ 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+ 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+ 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+ 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+ 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+ 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+ 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+ 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+ 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+ 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+ 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+ 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+ 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+ 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+ 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+ 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+ 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+ 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+ 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+ 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+ 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+ 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+ 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+ 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+ 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+ 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+ 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
+};
+static const u32 Te2[256] = {
+ 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+ 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+ 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+ 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+ 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+ 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+ 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+ 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+ 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+ 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+ 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+ 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+ 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+ 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+ 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+ 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+ 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+ 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+ 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+ 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+ 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+ 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+ 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+ 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+ 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+ 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+ 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+ 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+ 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+ 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+ 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+ 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+ 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+ 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+ 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+ 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+ 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+ 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+ 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+ 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+ 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+ 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+ 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+ 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+ 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+ 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+ 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+ 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+ 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+ 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+ 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+ 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+ 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+ 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+ 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+ 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+ 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+ 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+ 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+ 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+ 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+ 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+ 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+ 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
+};
+static const u32 Te3[256] = {
+
+ 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+ 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+ 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+ 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+ 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+ 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+ 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+ 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+ 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+ 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+ 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+ 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+ 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+ 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+ 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+ 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+ 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+ 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+ 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+ 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+ 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+ 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+ 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+ 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+ 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+ 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+ 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+ 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+ 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+ 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+ 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+ 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+ 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+ 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+ 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+ 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+ 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+ 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+ 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+ 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+ 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+ 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+ 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+ 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+ 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+ 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+ 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+ 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+ 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+ 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+ 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+ 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+ 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+ 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+ 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+ 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+ 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+ 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+ 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+ 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+ 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+ 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+ 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+ 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
+};
+static const u32 Te4[256] = {
+ 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
+ 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
+ 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
+ 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
+ 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
+ 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
+ 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
+ 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
+ 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
+ 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
+ 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
+ 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
+ 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
+ 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
+ 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
+ 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
+ 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
+ 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
+ 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
+ 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
+ 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
+ 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
+ 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
+ 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
+ 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
+ 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
+ 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
+ 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
+ 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
+ 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
+ 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
+ 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
+ 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
+ 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
+ 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
+ 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
+ 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
+ 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
+ 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
+ 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
+ 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
+ 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
+ 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
+ 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
+ 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
+ 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
+ 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
+ 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
+ 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
+ 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
+ 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
+ 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
+ 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
+ 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
+ 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
+ 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
+ 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
+ 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
+ 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
+ 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
+ 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
+ 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
+ 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
+ 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
+};
+static const u32 Td0[256] = {
+ 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+ 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+ 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+ 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+ 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+ 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+ 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+ 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+ 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+ 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+ 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+ 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+ 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+ 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+ 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+ 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+ 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+ 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+ 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+ 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+ 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+ 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+ 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+ 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+ 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+ 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+ 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+ 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+ 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+ 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+ 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+ 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+ 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+ 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+ 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+ 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+ 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+ 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+ 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+ 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+ 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+ 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+ 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+ 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+ 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+ 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+ 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+ 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+ 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+ 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+ 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+ 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+ 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+ 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+ 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+ 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+ 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+ 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+ 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+ 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+ 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+ 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+ 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+ 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
+};
+static const u32 Td1[256] = { /* decryption lookup table, indexed with (word >> 16) & 0xff; NOTE(review): entries look like the matching Td0 words rotated right by 8 bits (spot-checked only) */
+ 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+ 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+ 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+ 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+ 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+ 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+ 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+ 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+ 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+ 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+ 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+ 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+ 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+ 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+ 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+ 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+ 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+ 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+ 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+ 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+ 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+ 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+ 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+ 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+ 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+ 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+ 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+ 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+ 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+ 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+ 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+ 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+ 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+ 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+ 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+ 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+ 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+ 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+ 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+ 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+ 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+ 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+ 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+ 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+ 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+ 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+ 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+ 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+ 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+ 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+ 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+ 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+ 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+ 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+ 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+ 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+ 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+ 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+ 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+ 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+ 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+ 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+ 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+ 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
+};
+static const u32 Td2[256] = { /* decryption lookup table, indexed with (word >> 8) & 0xff; NOTE(review): entries look like the matching Td1 words rotated right by 8 bits (spot-checked only) */
+ 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+ 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+ 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+ 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+ 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+ 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+ 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+ 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+ 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+ 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+ 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+ 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+ 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+ 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+ 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+ 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+ 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+ 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+ 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+ 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
+
+ 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+ 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+ 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+ 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+ 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+ 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+ 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+ 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+ 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+ 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+ 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+ 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+ 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+ 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+ 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+ 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+ 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+ 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+ 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+ 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+ 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+ 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+ 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+ 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+ 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+ 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+ 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+ 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+ 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+ 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+ 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+ 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+ 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+ 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+ 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+ 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+ 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+ 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+ 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+ 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+ 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+ 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+ 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+ 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
+};
+static const u32 Td3[256] = { /* decryption lookup table, indexed with word & 0xff; NOTE(review): entries look like the matching Td2 words rotated right by 8 bits (spot-checked only) */
+ 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+ 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+ 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+ 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+ 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+ 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+ 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+ 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+ 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+ 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+ 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+ 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+ 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+ 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+ 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+ 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+ 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+ 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+ 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+ 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+ 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+ 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+ 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+ 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+ 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+ 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+ 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+ 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+ 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+ 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+ 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+ 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+ 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+ 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+ 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+ 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+ 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+ 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+ 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+ 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+ 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+ 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+ 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+ 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+ 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+ 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+ 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+ 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+ 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+ 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+ 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+ 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+ 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+ 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+ 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+ 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+ 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+ 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+ 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+ 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+ 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+ 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+ 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+ 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
+};
+static const u32 Td4[256] = { /* every entry repeats one byte in all four byte lanes; the last round of AES_decrypt masks out a single lane (0xff000000 .. 0x000000ff) per lookup */
+ 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
+ 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
+ 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
+ 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
+ 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
+ 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
+ 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
+ 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
+ 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
+ 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
+ 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
+ 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
+ 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
+ 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
+ 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
+ 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
+ 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
+ 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
+ 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
+ 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
+ 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
+ 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
+ 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
+ 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
+ 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
+ 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
+ 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
+ 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
+ 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
+ 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
+ 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
+ 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
+ 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
+ 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
+ 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
+ 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
+ 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
+ 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
+ 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
+ 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
+ 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
+ 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
+ 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
+ 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
+ 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
+ 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
+ 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
+ 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
+ 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
+ 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
+ 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
+ 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
+ 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
+ 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
+ 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
+ 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
+ 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
+ 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
+ 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
+ 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
+ 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
+ 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
+ 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
+ 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,
+};
+static const u32 rcon[] = { /* key-schedule round constants, carried in the top byte of each word; XORed in as rcon[i] by AES_set_encrypt_key */
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000,
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000,
+ 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/*
+ * Key-schedule helper: rotate the word left by one byte and map every
+ * byte through Te4, keeping one byte lane per lookup.
+ */
+static u32 aes_ks_rot_sub(u32 w)
+{
+    return (Te4[(w >> 16) & 0xff] & 0xff000000) ^
+           (Te4[(w >>  8) & 0xff] & 0x00ff0000) ^
+           (Te4[(w      ) & 0xff] & 0x0000ff00) ^
+           (Te4[(w >> 24)       ] & 0x000000ff);
+}
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * userKey: key material; bits / 32 words are loaded from it with GETU32
+ * bits:    key length, which must be 128, 192 or 256
+ * key:     receives the round count (10, 12 or 14) and the round keys
+ *
+ * Returns 0 on success, -1 when userKey or key is NULL, and -2 for any
+ * other key length (key is left untouched in both failure cases).
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key) {
+
+    u32 *w;
+    u32 last;
+    int word, step;
+
+    if (!userKey || !key)
+        return -1;
+
+    switch (bits) {
+    case 128:
+        key->rounds = 10;
+        break;
+    case 192:
+        key->rounds = 12;
+        break;
+    case 256:
+        key->rounds = 14;
+        break;
+    default:
+        return -2;
+    }
+
+    /* the schedule starts with the user key itself */
+    w = key->rd_key;
+    for (word = 0; word < bits / 32; word++)
+        w[word] = GETU32(userKey + 4 * word);
+
+    if (bits == 128) {
+        /* ten steps, each deriving four new words from the previous four */
+        for (step = 0; step < 10; step++, w += 4) {
+            w[4] = w[0] ^ aes_ks_rot_sub(w[3]) ^ rcon[step];
+            w[5] = w[1] ^ w[4];
+            w[6] = w[2] ^ w[5];
+            w[7] = w[3] ^ w[6];
+        }
+        return 0;
+    }
+
+    if (bits == 192) {
+        /* six-word stride; the eighth step stops after four words */
+        for (step = 0; ; w += 6) {
+            w[6] = w[0] ^ aes_ks_rot_sub(w[5]) ^ rcon[step];
+            w[7] = w[1] ^ w[6];
+            w[8] = w[2] ^ w[7];
+            w[9] = w[3] ^ w[8];
+            if (++step == 8)
+                break;
+            w[10] = w[4] ^ w[9];
+            w[11] = w[5] ^ w[10];
+        }
+        return 0;
+    }
+
+    /* 256-bit key: eight-word stride; the seventh step stops after four words */
+    for (step = 0; ; w += 8) {
+        w[ 8] = w[0] ^ aes_ks_rot_sub(w[7]) ^ rcon[step];
+        w[ 9] = w[1] ^ w[ 8];
+        w[10] = w[2] ^ w[ 9];
+        w[11] = w[3] ^ w[10];
+        if (++step == 7)
+            break;
+        /* second half: Te4 substitution without the byte rotation or rcon */
+        last = w[11];
+        w[12] = w[4] ^
+            (Te4[(last >> 24)       ] & 0xff000000) ^
+            (Te4[(last >> 16) & 0xff] & 0x00ff0000) ^
+            (Te4[(last >>  8) & 0xff] & 0x0000ff00) ^
+            (Te4[(last      ) & 0xff] & 0x000000ff);
+        w[13] = w[5] ^ w[12];
+        w[14] = w[6] ^ w[13];
+        w[15] = w[7] ^ w[14];
+    }
+    return 0;
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * Builds the encryption schedule with AES_set_encrypt_key(), reverses the
+ * order of the round keys, then passes every round key except the first
+ * and the last through the Td0..Td3 / Te4 lookups.
+ *
+ * Returns 0 on success, or the negative status reported by
+ * AES_set_encrypt_key().
+ */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key) {
+
+    u32 *rk;
+    u32 word;
+    int front, back, col, round, rc;
+
+    /* first, start with an encryption schedule */
+    rc = AES_set_encrypt_key(userKey, bits, key);
+    if (rc < 0)
+        return rc;
+
+    rk = key->rd_key;
+
+    /* invert the order of the round keys, swapping four words at a time */
+    front = 0;
+    back = 4 * (key->rounds);
+    while (front < back) {
+        for (col = 0; col < 4; col++) {
+            word = rk[front + col];
+            rk[front + col] = rk[back + col];
+            rk[back + col] = word;
+        }
+        front += 4;
+        back -= 4;
+    }
+
+    /* apply the inverse MixColumn transform to all round keys but the first and the last */
+    for (round = 1; round < (key->rounds); round++) {
+        rk += 4;
+        for (col = 0; col < 4; col++) {
+            word = rk[col];
+            rk[col] =
+                Td0[Te4[(word >> 24)       ] & 0xff] ^
+                Td1[Te4[(word >> 16) & 0xff] & 0xff] ^
+                Td2[Te4[(word >>  8) & 0xff] & 0xff] ^
+                Td3[Te4[(word      ) & 0xff] & 0xff];
+        }
+    }
+    return 0;
+}
+
+#ifndef AES_ASM
+/*
+ * Encrypt a single block
+ * in and out can overlap
+ *
+ * The block is loaded as four words with GETU32 (in, in + 4, in + 8,
+ * in + 12) and XORed with rk[0..3]; every load happens before the first
+ * PUTU32 store to out. key->rounds rounds are then applied using the
+ * round keys in key->rd_key: all but the last through the Te0..Te3
+ * tables, the last through Te4 with one byte lane masked per lookup.
+ * key is presumably prepared by AES_set_encrypt_key() - confirm at callers.
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key) {
+
+ const u32 *rk;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ assert(in && out && key); /* NULL arguments are only caught while assert() is enabled */
+ rk = key->rd_key;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(in ) ^ rk[0];
+ s1 = GETU32(in + 4) ^ rk[1];
+ s2 = GETU32(in + 8) ^ rk[2];
+ s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* round 1: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+ if (key->rounds > 10) {
+ /* round 10: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+ if (key->rounds > 12) {
+ /* round 12: */
+ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+ s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+ s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+ s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+ t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+ t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+ }
+ }
+ rk += key->rounds << 2; /* 4 words per round: rk[0..3] is now the last round key */
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ r = key->rounds >> 1; /* each pass of the loop below covers two rounds (s -> t, then t -> s) */
+ for (;;) {
+ t0 =
+ Te0[(s0 >> 24) ] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[(s3 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Te0[(s1 >> 24) ] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[(s0 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Te0[(s2 >> 24) ] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[(s1 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Te0[(s3 >> 24) ] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[(s2 ) & 0xff] ^
+ rk[7];
+
+ rk += 8; /* skip two round keys; rk[0..3] is the key for the next round (the last one when the loop exits) */
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Te0[(t0 >> 24) ] ^
+ Te1[(t1 >> 16) & 0xff] ^
+ Te2[(t2 >> 8) & 0xff] ^
+ Te3[(t3 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Te0[(t1 >> 24) ] ^
+ Te1[(t2 >> 16) & 0xff] ^
+ Te2[(t3 >> 8) & 0xff] ^
+ Te3[(t0 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Te0[(t2 >> 24) ] ^
+ Te1[(t3 >> 16) & 0xff] ^
+ Te2[(t0 >> 8) & 0xff] ^
+ Te3[(t1 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Te0[(t3 >> 24) ] ^
+ Te1[(t0 >> 16) & 0xff] ^
+ Te2[(t1 >> 8) & 0xff] ^
+ Te3[(t2 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ s0 =
+ (Te4[(t0 >> 24) ] & 0xff000000) ^
+ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(out , s0);
+ s1 =
+ (Te4[(t1 >> 24) ] & 0xff000000) ^
+ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(out + 4, s1);
+ s2 =
+ (Te4[(t2 >> 24) ] & 0xff000000) ^
+ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(out + 8, s2);
+ s3 =
+ (Te4[(t3 >> 24) ] & 0xff000000) ^
+ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(out + 12, s3);
+}
+
+/*
+ * Decrypt a single block
+ * in and out can overlap
+ */
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key) {
+
+ const u32 *rk;
+ u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+ int r;
+#endif /* ?FULL_UNROLL */
+
+ assert(in && out && key);
+ rk = key->rd_key;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = GETU32(in ) ^ rk[0];
+ s1 = GETU32(in + 4) ^ rk[1];
+ s2 = GETU32(in + 8) ^ rk[2];
+ s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+ /* round 1: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+ /* round 2: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+ /* round 3: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+ /* round 4: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+ /* round 5: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+ /* round 6: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+ /* round 7: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+ /* round 8: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+ /* round 9: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+ if (key->rounds > 10) {
+ /* round 10: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+ /* round 11: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+ if (key->rounds > 12) {
+ /* round 12: */
+ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+ s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+ s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+ s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+ /* round 13: */
+ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+ t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+ t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+ t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+ }
+ }
+ rk += key->rounds << 2;
+#else /* !FULL_UNROLL */
+ /*
+ * Nr - 1 full rounds:
+ */
+ r = key->rounds >> 1;
+ for (;;) {
+ t0 =
+ Td0[(s0 >> 24) ] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[(s1 ) & 0xff] ^
+ rk[4];
+ t1 =
+ Td0[(s1 >> 24) ] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[(s2 ) & 0xff] ^
+ rk[5];
+ t2 =
+ Td0[(s2 >> 24) ] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[(s3 ) & 0xff] ^
+ rk[6];
+ t3 =
+ Td0[(s3 >> 24) ] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[(s0 ) & 0xff] ^
+ rk[7];
+
+ rk += 8;
+ if (--r == 0) {
+ break;
+ }
+
+ s0 =
+ Td0[(t0 >> 24) ] ^
+ Td1[(t3 >> 16) & 0xff] ^
+ Td2[(t2 >> 8) & 0xff] ^
+ Td3[(t1 ) & 0xff] ^
+ rk[0];
+ s1 =
+ Td0[(t1 >> 24) ] ^
+ Td1[(t0 >> 16) & 0xff] ^
+ Td2[(t3 >> 8) & 0xff] ^
+ Td3[(t2 ) & 0xff] ^
+ rk[1];
+ s2 =
+ Td0[(t2 >> 24) ] ^
+ Td1[(t1 >> 16) & 0xff] ^
+ Td2[(t0 >> 8) & 0xff] ^
+ Td3[(t3 ) & 0xff] ^
+ rk[2];
+ s3 =
+ Td0[(t3 >> 24) ] ^
+ Td1[(t2 >> 16) & 0xff] ^
+ Td2[(t1 >> 8) & 0xff] ^
+ Td3[(t0 ) & 0xff] ^
+ rk[3];
+ }
+#endif /* ?FULL_UNROLL */
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+ s0 =
+ (Td4[(t0 >> 24) ] & 0xff000000) ^
+ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t1 ) & 0xff] & 0x000000ff) ^
+ rk[0];
+ PUTU32(out , s0);
+ s1 =
+ (Td4[(t1 >> 24) ] & 0xff000000) ^
+ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t2 ) & 0xff] & 0x000000ff) ^
+ rk[1];
+ PUTU32(out + 4, s1);
+ s2 =
+ (Td4[(t2 >> 24) ] & 0xff000000) ^
+ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t3 ) & 0xff] & 0x000000ff) ^
+ rk[2];
+ PUTU32(out + 8, s2);
+ s3 =
+ (Td4[(t3 >> 24) ] & 0xff000000) ^
+ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[(t0 ) & 0xff] & 0x000000ff) ^
+ rk[3];
+ PUTU32(out + 12, s3);
+}
+
+#endif /* AES_ASM */
+
+/*
+ * AES in CBC mode.
+ *
+ * Encrypts (enc != 0) or decrypts (enc == 0) `length` bytes from `in` into
+ * `out` under `key`, chaining through `ivec`.  `ivec` is updated in place and
+ * on return holds the last ciphertext block processed, so a later call can
+ * continue the same chain.
+ *
+ * When `length` is not a multiple of AES_BLOCK_SIZE:
+ *  - encrypt: the trailing bytes are zero-padded to a full block and a full
+ *    AES_BLOCK_SIZE bytes of ciphertext are written to `out`;
+ *  - decrypt: a full block is read from `in`, but only the remaining
+ *    plaintext bytes are stored to `out`.
+ *
+ * Every block is copied aside (or combined into `tmp`) before the cipher
+ * routine is called, so passing the same buffer for `in` and `out` is
+ * intended to work.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                     const unsigned long length, const AES_KEY *key,
+                     unsigned char *ivec, const int enc)
+{
+
+    unsigned long n;
+    unsigned long len = length;
+    unsigned char tmp[AES_BLOCK_SIZE];
+    unsigned char dec[AES_BLOCK_SIZE];
+
+    assert(in && out && key && ivec);
+
+    if (enc) {
+        while (len >= AES_BLOCK_SIZE) {
+            for (n = 0; n < AES_BLOCK_SIZE; ++n)
+                tmp[n] = in[n] ^ ivec[n];
+            AES_encrypt(tmp, out, key);
+            /* the ciphertext just produced chains into the next block */
+            memcpy(ivec, out, AES_BLOCK_SIZE);
+            len -= AES_BLOCK_SIZE;
+            in += AES_BLOCK_SIZE;
+            out += AES_BLOCK_SIZE;
+        }
+        if (len) {
+            for (n = 0; n < len; ++n)
+                tmp[n] = in[n] ^ ivec[n];
+            /* implicit zero padding: 0 ^ ivec[n] == ivec[n] */
+            for (n = len; n < AES_BLOCK_SIZE; ++n)
+                tmp[n] = ivec[n];
+            AES_encrypt(tmp, tmp, key);
+            memcpy(out, tmp, AES_BLOCK_SIZE);
+            memcpy(ivec, tmp, AES_BLOCK_SIZE);
+        }
+    } else {
+        while (len >= AES_BLOCK_SIZE) {
+            /* save the ciphertext first: `out` may alias `in` */
+            memcpy(tmp, in, AES_BLOCK_SIZE);
+            AES_decrypt(in, out, key);
+            for (n = 0; n < AES_BLOCK_SIZE; ++n)
+                out[n] ^= ivec[n];
+            memcpy(ivec, tmp, AES_BLOCK_SIZE);
+            len -= AES_BLOCK_SIZE;
+            in += AES_BLOCK_SIZE;
+            out += AES_BLOCK_SIZE;
+        }
+        if (len) {
+            /*
+             * Decrypt into a separate buffer so that `tmp` still holds
+             * the ciphertext block afterwards: CBC chains on ciphertext,
+             * so that (not the decrypted bytes) must become the next IV.
+             */
+            memcpy(tmp, in, AES_BLOCK_SIZE);
+            AES_decrypt(tmp, dec, key);
+            for (n = 0; n < len; ++n)
+                out[n] = dec[n] ^ ivec[n];
+            memcpy(ivec, tmp, AES_BLOCK_SIZE);
+        }
+    }
+}
diff --git a/tools/blktap2/drivers/aes.h b/tools/blktap2/drivers/aes.h new file mode 100644 index 0000000000..9fb54a900d --- /dev/null +++ b/tools/blktap2/drivers/aes.h @@ -0,0 +1,28 @@ +#ifndef QEMU_AES_H +#define QEMU_AES_H + +#include <stdint.h> + /* AES_MAXNR sizes the rd_key array below; AES_BLOCK_SIZE is the block length in bytes used by the CBC routine. */ +#define AES_MAXNR 14 +#define AES_BLOCK_SIZE 16 + /* Key schedule: rd_key holds 4 * (AES_MAXNR + 1) 32-bit words; rounds is the round count read by the cipher routines. */ +struct aes_key_st { + uint32_t rd_key[4 *(AES_MAXNR + 1)]; + int rounds; +}; +typedef struct aes_key_st AES_KEY; + /* Key setup: fills *key from userKey; presumably bits is the key length in bits and the int return is a status code -- TODO confirm against aes.c. */ +int AES_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); +int AES_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); + /* Block primitives (in -> out under key) and the CBC wrapper: enc != 0 encrypts, enc == 0 decrypts, ivec is updated in place. */ +void AES_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void AES_decrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, + const unsigned long length, const AES_KEY *key, + unsigned char *ivec, const int enc); + +#endif diff --git a/tools/blktap2/drivers/atomicio.c b/tools/blktap2/drivers/atomicio.c new file mode 100644 index 0000000000..ae0e24b00a --- /dev/null +++ b/tools/blktap2/drivers/atomicio.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved. + * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <errno.h> +#include "atomicio.h" + +/* + * ensure all of data on socket comes through. f==read || f==vwrite + */ +size_t +atomicio(f, fd, _s, n) + ssize_t (*f) (int, void *, size_t); + int fd; + void *_s; + size_t n; +{ + char *s = _s; + size_t pos = 0; + ssize_t res; + + while (n > pos) { + res = (f) (fd, s + pos, n - pos); + switch (res) { + case -1: + if (errno == EINTR || errno == EAGAIN) + continue; + return 0; + case 0: + errno = EPIPE; + return pos; + default: + pos += (size_t)res; + } + } + return (pos); +} + diff --git a/tools/blktap2/drivers/blk.h b/tools/blktap2/drivers/blk.h new file mode 100644 index 0000000000..73ca40c629 --- /dev/null +++ b/tools/blktap2/drivers/blk.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +int blk_getimagesize(int fd, uint64_t *size); +int blk_getsectorsize(int fd, uint64_t *sector_size); diff --git a/tools/blktap2/drivers/blk_linux.c b/tools/blktap2/drivers/blk_linux.c new file mode 100644 index 0000000000..75ddcc389f --- /dev/null +++ b/tools/blktap2/drivers/blk_linux.c @@ -0,0 +1,43 @@ +#include <inttypes.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <linux/errno.h> +#include "tapdisk.h" +#include "blk.h" + +int blk_getimagesize(int fd, uint64_t *size) +{ + int rc; + + *size = 0; + rc = ioctl(fd, BLKGETSIZE, size); + if (rc) { + DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image"); + return -EINVAL; + } + + return 0; +} + +int blk_getsectorsize(int fd, uint64_t *sector_size) +{ +#if defined(BLKSSZGET) + int rc; + + *sector_size = DEFAULT_SECTOR_SIZE; + rc = ioctl(fd, BLKSSZGET, sector_size); + if (rc) { + DPRINTF("ERR: BLKSSZGET failed. Falling back to use default sector size"); + *sector_size = DEFAULT_SECTOR_SIZE; + } + + if (*sector_size != DEFAULT_SECTOR_SIZE) + DPRINTF("Note: sector size is %"PRIu64" (not %u)\n", + *sector_size, DEFAULT_SECTOR_SIZE); +#else + *sector_size = DEFAULT_SECTOR_SIZE; +#endif + + return 0; +} + diff --git a/tools/blktap2/drivers/blktap2.h b/tools/blktap2/drivers/blktap2.h new file mode 100644 index 0000000000..38350d2fad --- /dev/null +++ b/tools/blktap2/drivers/blktap2.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _BLKTAP_2_H_ +#define _BLKTAP_2_H_ + /* NOTE(review): presumably the major number of the misc character-device class -- confirm. */ +#define MISC_MAJOR_NUMBER 10 + /* Size in bytes of the name buffer in struct blktap2_params. */ +#define BLKTAP2_MAX_MESSAGE_LEN 256 + /* Ring message codes; by their names pause/resume/close requests -- confirm semantics against the kernel driver. */ +#define BLKTAP2_RING_MESSAGE_PAUSE 1 +#define BLKTAP2_RING_MESSAGE_RESUME 2 +#define BLKTAP2_RING_MESSAGE_CLOSE 3 + /* ioctl request numbers; which device node accepts which request is not visible here. */ +#define BLKTAP2_IOCTL_KICK_FE 1 +#define BLKTAP2_IOCTL_ALLOC_TAP 200 +#define BLKTAP2_IOCTL_FREE_TAP 201 +#define BLKTAP2_IOCTL_CREATE_DEVICE 202 +#define BLKTAP2_IOCTL_SET_PARAMS 203 +#define BLKTAP2_IOCTL_PAUSE 204 +#define BLKTAP2_IOCTL_REOPEN 205 +#define BLKTAP2_IOCTL_RESUME 206 + /* Control name and device paths; each *_DEVICE path is BLKTAP2_DIRECTORY with a suffix appended by string-literal concatenation. */ +#define BLKTAP2_CONTROL_NAME "blktap-control" +#define BLKTAP2_DIRECTORY "/dev/xen/blktap-2" +#define BLKTAP2_CONTROL_DEVICE BLKTAP2_DIRECTORY"/control" +#define BLKTAP2_RING_DEVICE BLKTAP2_DIRECTORY"/blktap" +#define BLKTAP2_IO_DEVICE BLKTAP2_DIRECTORY"/tapdev" + /* Three unsigned identifiers (ring, device, minor); presumably filled in when a tap is allocated -- TODO confirm. */ +struct blktap2_handle { + unsigned int ring; + unsigned int device; + unsigned int minor; +}; + /* Device parameters: a name of at most BLKTAP2_MAX_MESSAGE_LEN bytes, a capacity and a sector size; units of capacity are not stated here. */ +struct blktap2_params { + char name[BLKTAP2_MAX_MESSAGE_LEN]; + unsigned long long capacity; + unsigned long sector_size; +}; + +#endif diff --git
a/tools/blktap2/drivers/block-aio.c b/tools/blktap2/drivers/block-aio.c new file mode 100644 index 0000000000..2c5af1483c --- /dev/null +++ b/tools/blktap2/drivers/block-aio.c @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include <errno.h> +#include <libaio.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> + +#include "tapdisk.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" + +#define MAX_AIO_REQS TAPDISK_DATA_REQUESTS + +struct tdaio_state; + +struct aio_request { + td_request_t treq; + struct tiocb tiocb; + struct tdaio_state *state; +}; + +struct tdaio_state { + int fd; + td_driver_t *driver; + + int aio_free_count; + struct aio_request aio_requests[MAX_AIO_REQS]; + struct aio_request *aio_free_list[MAX_AIO_REQS]; +}; + +/*Get Image size, secsize*/ +static int tdaio_get_image_info(int fd, td_disk_info_t *info) +{ + int ret; + long size; + unsigned long total_size; + struct statvfs statBuf; + struct stat stat; + + ret = fstat(fd, &stat); + if (ret != 0) { + DPRINTF("ERROR: fstat failed, Couldn't stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + info->size = 0; + if (ioctl(fd,BLKGETSIZE,&info->size)!=0) { + DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image"); + return -EINVAL; + } + + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(info->size << SECTOR_SHIFT), + (long long unsigned)info->size); + + /*Get the sector size*/ +#if defined(BLKSSZGET) + { + int arg; + info->sector_size = DEFAULT_SECTOR_SIZE; + ioctl(fd, BLKSSZGET, &info->sector_size); + + if (info->sector_size != DEFAULT_SECTOR_SIZE) + DPRINTF("Note: sector size is %ld (not %d)\n", + info->sector_size, DEFAULT_SECTOR_SIZE); + } +#else + info->sector_size = DEFAULT_SECTOR_SIZE; +#endif + + } else { + /*Local file? 
try fstat instead*/ + info->size = (stat.st_size >> SECTOR_SHIFT); + info->sector_size = DEFAULT_SECTOR_SIZE; + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(info->size << SECTOR_SHIFT), + (long long unsigned)info->size); + } + + if (info->size == 0) { + info->size =((uint64_t) 16836057); + info->sector_size = DEFAULT_SECTOR_SIZE; + } + info->info = 0; + + return 0; +} + +/* Open the disk file and initialize aio state. */ +int tdaio_open(td_driver_t *driver, const char *name, td_flag_t flags) +{ + int i, fd, ret, o_flags; + struct tdaio_state *prv; + + ret = 0; + prv = (struct tdaio_state *)driver->data; + + DPRINTF("block-aio open('%s')", name); + + memset(prv, 0, sizeof(struct tdaio_state)); + + prv->aio_free_count = MAX_AIO_REQS; + for (i = 0; i < MAX_AIO_REQS; i++) + prv->aio_free_list[i] = &prv->aio_requests[i]; + + /* Open the file */ + o_flags = O_DIRECT | O_LARGEFILE | + ((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR); + fd = open(name, o_flags); + + if ( (fd == -1) && (errno == EINVAL) ) { + + /* Maybe O_DIRECT isn't supported. */ + o_flags &= ~O_DIRECT; + fd = open(name, o_flags); + if (fd != -1) DPRINTF("WARNING: Accessing image without" + "O_DIRECT! 
(%s)\n", name); + + } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); + + if (fd == -1) { + DPRINTF("Unable to open [%s] (%d)!\n", name, 0 - errno); + ret = 0 - errno; + goto done; + } + + ret = tdaio_get_image_info(fd, &driver->info); + if (ret) { + close(fd); + goto done; + } + + prv->fd = fd; + +done: + return ret; +} + +void tdaio_complete(void *arg, struct tiocb *tiocb, int err) +{ + struct aio_request *aio = (struct aio_request *)arg; + struct tdaio_state *prv = aio->state; + + td_complete_request(aio->treq, err); + prv->aio_free_list[prv->aio_free_count++] = aio; +} + +void tdaio_queue_read(td_driver_t *driver, td_request_t treq) +{ + int size; + uint64_t offset; + struct aio_request *aio; + struct tdaio_state *prv; + + prv = (struct tdaio_state *)driver->data; + size = treq.secs * driver->info.sector_size; + offset = treq.sec * (uint64_t)driver->info.sector_size; + + if (prv->aio_free_count == 0) + goto fail; + + aio = prv->aio_free_list[--prv->aio_free_count]; + aio->treq = treq; + aio->state = prv; + + td_prep_read(&aio->tiocb, prv->fd, treq.buf, + size, offset, tdaio_complete, aio); + td_queue_tiocb(driver, &aio->tiocb); + + return; + +fail: + td_complete_request(treq, -EBUSY); +} + +void tdaio_queue_write(td_driver_t *driver, td_request_t treq) +{ + int size; + uint64_t offset; + struct aio_request *aio; + struct tdaio_state *prv; + + prv = (struct tdaio_state *)driver->data; + size = treq.secs * driver->info.sector_size; + offset = treq.sec * (uint64_t)driver->info.sector_size; + + if (prv->aio_free_count == 0) + goto fail; + + aio = prv->aio_free_list[--prv->aio_free_count]; + aio->treq = treq; + aio->state = prv; + + td_prep_write(&aio->tiocb, prv->fd, treq.buf, + size, offset, tdaio_complete, aio); + td_queue_tiocb(driver, &aio->tiocb); + + return; + +fail: + td_complete_request(treq, -EBUSY); +} + +int tdaio_close(td_driver_t *driver) +{ + struct tdaio_state *prv = (struct tdaio_state *)driver->data; + + close(prv->fd); + + return 0; 
+} + +int tdaio_get_parent_id(td_driver_t *driver, td_disk_id_t *id) +{ + return TD_NO_PARENT; +} + +int tdaio_validate_parent(td_driver_t *driver, + td_driver_t *pdriver, td_flag_t flags) +{ + return -EINVAL; +} + +struct tap_disk tapdisk_aio = { + .disk_type = "tapdisk_aio", + .flags = 0, + .private_data_size = sizeof(struct tdaio_state), + .td_open = tdaio_open, + .td_close = tdaio_close, + .td_queue_read = tdaio_queue_read, + .td_queue_write = tdaio_queue_write, + .td_get_parent_id = tdaio_get_parent_id, + .td_validate_parent = tdaio_validate_parent, + .td_debug = NULL, +}; diff --git a/tools/blktap2/drivers/block-cache.c b/tools/blktap2/drivers/block-cache.c new file mode 100644 index 0000000000..1d2f4eb879 --- /dev/null +++ b/tools/blktap2/drivers/block-cache.c @@ -0,0 +1,787 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/mman.h> + +#include "tapdisk.h" +#include "tapdisk-utils.h" +#include "tapdisk-driver.h" +#include "tapdisk-server.h" +#include "tapdisk-interface.h" + +#ifdef DEBUG +#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a) +#else +#define DBG(_f, _a...) ((void)0) +#endif + +#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a) + +#define RADIX_TREE_PAGE_SHIFT 12 /* 4K pages */ +#define RADIX_TREE_PAGE_SIZE (1 << RADIX_TREE_PAGE_SHIFT) + +#define RADIX_TREE_NODE_SHIFT 9 /* 512B nodes */ +#define RADIX_TREE_NODE_SIZE (1 << RADIX_TREE_NODE_SHIFT) +#define RADIX_TREE_NODE_MASK (RADIX_TREE_NODE_SIZE - 1) + +#define BLOCK_CACHE_NODES_PER_PAGE (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT)) + +#define BLOCK_CACHE_MAX_SIZE (10 << 20) /* 100MB cache */ +#define BLOCK_CACHE_REQUESTS (TAPDISK_DATA_REQUESTS << 3) +#define BLOCK_CACHE_PAGE_IDLETIME 60 + +typedef struct radix_tree radix_tree_t; +typedef struct radix_tree_node radix_tree_node_t; +typedef struct radix_tree_link radix_tree_link_t; +typedef struct radix_tree_leaf radix_tree_leaf_t; +typedef struct radix_tree_page radix_tree_page_t; + +typedef struct block_cache block_cache_t; +typedef struct block_cache_request block_cache_request_t; +typedef struct block_cache_stats block_cache_stats_t; + +struct radix_tree_page { + char *buf; + size_t size; + uint64_t sec; + radix_tree_link_t 
*owners[BLOCK_CACHE_NODES_PER_PAGE]; +}; + +struct radix_tree_leaf { + radix_tree_page_t *page; + char *buf; +}; + +struct radix_tree_link { + uint32_t time; + union { + radix_tree_node_t *next; + radix_tree_leaf_t leaf; + } u; +}; + +struct radix_tree_node { + int height; + radix_tree_link_t links[RADIX_TREE_NODE_SIZE]; +}; + +struct radix_tree { + int height; + uint64_t size; + uint32_t nodes; + radix_tree_node_t *root; + + block_cache_t *cache; +}; + +struct block_cache_request { + int err; + char *buf; + uint64_t secs; + td_request_t treq; + block_cache_t *cache; +}; + +struct block_cache_stats { + uint64_t reads; + uint64_t hits; + uint64_t misses; + uint64_t prunes; +}; + +struct block_cache { + int ptype; + char *name; + + uint64_t sectors; + + block_cache_request_t requests[BLOCK_CACHE_REQUESTS]; + block_cache_request_t *request_free_list[BLOCK_CACHE_REQUESTS]; + int requests_free; + + event_id_t timeout_id; + + radix_tree_t tree; + + block_cache_stats_t stats; +}; + +static inline uint64_t +radix_tree_calculate_size(int height) +{ + return (uint64_t)RADIX_TREE_NODE_SIZE << + (height * RADIX_TREE_NODE_SHIFT); +} + +static inline int +radix_tree_calculate_height(uint64_t sectors) +{ + int height; + uint64_t tree_size; + + height = 1; /* always allocate root node */ + tree_size = radix_tree_calculate_size(height); + while (sectors > tree_size) + tree_size = radix_tree_calculate_size(++height); + + return height; +} + +static inline int +radix_tree_index(radix_tree_node_t *node, uint64_t sector) +{ + return ((sector >> (node->height * RADIX_TREE_NODE_SHIFT)) & + RADIX_TREE_NODE_MASK); +} + +static inline int +radix_tree_node_contains_leaves(radix_tree_t *tree, radix_tree_node_t *node) +{ + return (node->height == 0); +} + +static inline int +radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node) +{ + return (node->height == tree->height); +} + +static inline uint64_t +radix_tree_size(radix_tree_t *tree) +{ + return tree->size + tree->nodes * 
sizeof(radix_tree_node_t); +} + +static inline void +radix_tree_clear_link(radix_tree_link_t *link) +{ + if (link) + memset(link, 0, sizeof(radix_tree_link_t)); +} + +static inline radix_tree_node_t * +radix_tree_allocate_node(radix_tree_t *tree, int height) +{ + radix_tree_node_t *node; + + node = calloc(1, sizeof(radix_tree_node_t)); + if (!node) + return NULL; + + node->height = height; + tree->nodes++; + + return node; +} + +static inline radix_tree_node_t * +radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent) +{ + return radix_tree_allocate_node(tree, parent->height - 1); +} + +void +radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node) +{ + if (!node) + return; + + free(node); + tree->nodes--; +} + +static inline radix_tree_page_t * +radix_tree_allocate_page(radix_tree_t *tree, + char *buf, uint64_t sec, size_t size) +{ + radix_tree_page_t *page; + + page = calloc(1, sizeof(radix_tree_page_t)); + if (!page) + return NULL; + + page->buf = buf; + page->sec = sec; + page->size = size; + tree->size += size; + + return page; +} + +static inline void +radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page) +{ + int i; + + for (i = 0; i < page->size >> RADIX_TREE_NODE_SHIFT; i++) + DBG("%s: ejecting sector 0x%llx\n", + tree->cache->name, page->sec + i); + + tree->cache->stats.prunes += (page->size >> RADIX_TREE_NODE_SHIFT); + tree->size -= page->size; + free(page->buf); + free(page); +} + +/* + * remove a leaf and the shared radix_tree_page_t containing its buffer. + * leaves are deleted, nodes are not; gc will reap the nodes later. 
+ */ +static void +radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page) +{ + int i; + + if (!page) + return; + + for (i = 0; i < BLOCK_CACHE_NODES_PER_PAGE; i++) + radix_tree_clear_link(page->owners[i]); + + radix_tree_free_page(tree, page); +} + +static void +radix_tree_insert_leaf(radix_tree_t *tree, radix_tree_link_t *link, + radix_tree_page_t *page, off_t off) +{ + int i; + + if (off + RADIX_TREE_NODE_SIZE > page->size) + return; + + for (i = 0; i < BLOCK_CACHE_NODES_PER_PAGE; i++) { + if (page->owners[i]) + continue; + + page->owners[i] = link; + link->u.leaf.page = page; + link->u.leaf.buf = page->buf + off; + + break; + } +} + +static char * +radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector) +{ + int idx; + struct timeval now; + radix_tree_link_t *link; + radix_tree_node_t *node; + + node = tree->root; + gettimeofday(&now, NULL); + + do { + idx = radix_tree_index(node, sector); + link = node->links + idx; + link->time = now.tv_sec; + + if (radix_tree_node_contains_leaves(tree, node)) + return link->u.leaf.buf; + + if (!link->u.next) + return NULL; + + node = link->u.next; + } while (1); +} + +static char * +radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector, + radix_tree_page_t *page, off_t off) +{ + int idx; + struct timeval now; + radix_tree_link_t *link; + radix_tree_node_t *node; + + node = tree->root; + gettimeofday(&now, NULL); + + do { + idx = radix_tree_index(node, sector); + link = node->links + idx; + link->time = now.tv_sec; + + if (radix_tree_node_contains_leaves(tree, node)) { + radix_tree_remove_page(tree, link->u.leaf.page); + radix_tree_insert_leaf(tree, link, page, off); + return link->u.leaf.buf; + } + + if (!link->u.next) { + link->u.next = radix_tree_allocate_child_node(tree, + node); + if (!link->u.next) + return NULL; + } + + node = link->u.next; + } while (1); +} + +static int +radix_tree_add_leaves(radix_tree_t *tree, char *buf, + uint64_t sector, uint64_t sectors) +{ + int i; + radix_tree_page_t *page; + + 
page = radix_tree_allocate_page(tree, buf, sector, + sectors << RADIX_TREE_NODE_SHIFT); + if (!page) + return -ENOMEM; + + for (i = 0; i < sectors; i++) + if (!radix_tree_add_leaf(tree, sector + i, + page, (i << RADIX_TREE_NODE_SHIFT))) + goto fail; + + return 0; + +fail: + page->buf = NULL; + radix_tree_remove_page(tree, page); + return -ENOMEM; +} + +static void +radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node) +{ + int i; + radix_tree_link_t *link; + + if (!node) + return; + + for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) { + link = node->links + i; + + if (radix_tree_node_contains_leaves(tree, node)) + radix_tree_remove_page(tree, link->u.leaf.page); + else + radix_tree_delete_branch(tree, link->u.next); + + radix_tree_clear_link(link); + } + + radix_tree_free_node(tree, node); +} + +static inline void +radix_tree_destroy(radix_tree_t *tree) +{ + radix_tree_delete_branch(tree, tree->root); + tree->root = NULL; +} + +/* + * returns 1 if @node is empty after pruning, 0 otherwise + */ +static int +radix_tree_prune_branch(radix_tree_t *tree, + radix_tree_node_t *node, uint32_t now) +{ + int i, empty; + radix_tree_link_t *link; + + empty = 1; + if (!node) + return empty; + + for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) { + link = node->links + i; + + if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) { + if (radix_tree_node_contains_leaves(tree, node)) { + empty = 0; + continue; + } + + if (radix_tree_prune_branch(tree, link->u.next, now)) + radix_tree_clear_link(link); + else + empty = 0; + + continue; + } + + if (radix_tree_node_contains_leaves(tree, node)) + radix_tree_remove_page(tree, link->u.leaf.page); + else + radix_tree_delete_branch(tree, link->u.next); + + radix_tree_clear_link(link); + } + + if (empty && !radix_tree_node_is_root(tree, node)) + radix_tree_free_node(tree, node); + + return empty; +} + +/* + * walk tree and free any node that has been idle for too long + */ +static void +radix_tree_prune(radix_tree_t *tree) +{ + struct 
timeval now; + + if (!tree->root) + return; + + DPRINTF("tree %s has %"PRIu64" bytes\n", + tree->cache->name, tree->size); + + gettimeofday(&now, NULL); + radix_tree_prune_branch(tree, tree->root, now.tv_sec); + + DPRINTF("tree %s now has %"PRIu64" bytes\n", + tree->cache->name, tree->size); +} + +static inline int +radix_tree_initialize(radix_tree_t *tree, uint64_t sectors) +{ + tree->height = radix_tree_calculate_height(sectors); + tree->root = radix_tree_allocate_node(tree, tree->height); + if (!tree->root) + return -ENOMEM; + + return 0; +} + +static inline void +radix_tree_free(radix_tree_t *tree) +{ + radix_tree_destroy(tree); +} + +static void +block_cache_prune_event(event_id_t id, char mode, void *private) +{ + radix_tree_t *tree; + block_cache_t *cache; + + cache = (block_cache_t *)private; + tree = &cache->tree; + + radix_tree_prune(tree); +} + +static inline block_cache_request_t * +block_cache_get_request(block_cache_t *cache) +{ + if (!cache->requests_free) + return NULL; + + return cache->request_free_list[--cache->requests_free]; +} + +static inline void +block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq) +{ + memset(breq, 0, sizeof(block_cache_request_t)); + cache->request_free_list[cache->requests_free++] = breq; +} + +static int +block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags) +{ + int i, err; + radix_tree_t *tree; + block_cache_t *cache; + + if (!td_flag_test(flags, TD_OPEN_RDONLY)) + return -EINVAL; + + if (driver->info.sector_size != RADIX_TREE_NODE_SIZE) + return -EINVAL; + + cache = (block_cache_t *)driver->data; + err = tapdisk_namedup(&cache->name, (char *)name); + if (err) + return -ENOMEM; + + cache->sectors = driver->info.size; + + tree = &cache->tree; + err = radix_tree_initialize(tree, cache->sectors); + if (err) + goto fail; + + tree->cache = cache; + cache->requests_free = BLOCK_CACHE_REQUESTS; + for (i = 0; i < BLOCK_CACHE_REQUESTS; i++) + cache->request_free_list[i] = 
cache->requests + i; + + cache->timeout_id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, + -1, /* dummy fd */ + BLOCK_CACHE_PAGE_IDLETIME << 1, + block_cache_prune_event, + cache); + if (cache->timeout_id < 0) + goto fail; + + DPRINTF("opening cache for %s, sectors: %"PRIu64", " + "tree: %p, height: %d\n", + cache->name, cache->sectors, tree, tree->height); + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + DPRINTF("mlockall failed: %d\n", -errno); + + return 0; + +fail: + free(cache->name); + radix_tree_free(&cache->tree); + return err; +} + +static int +block_cache_close(td_driver_t *driver) +{ + radix_tree_t *tree; + block_cache_t *cache; + + cache = (block_cache_t *)driver->data; + tree = &cache->tree; + + DPRINTF("closing cache for %s\n", cache->name); + + tapdisk_server_unregister_event(cache->timeout_id); + radix_tree_free(tree); + free(cache->name); + + return 0; +} + +static inline uint64_t +block_cache_hash(block_cache_t *cache, char *buf) +{ + int i, n; + uint64_t cksm, *data; + + return 0; + + cksm = 0; + data = (uint64_t *)buf; + n = RADIX_TREE_NODE_SIZE / sizeof(uint64_t); + + for (i = 0; i < n; i++) + cksm += data[i]; + + return ~cksm; +} + +static void +block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[]) +{ + int i; + off_t off; + + cache->stats.hits += treq.secs; + + for (i = 0; i < treq.secs; i++) { + DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n", + cache->name, treq.sec + i, block_cache_hash(cache, iov[i])); + + off = i << RADIX_TREE_NODE_SHIFT; + memcpy(treq.buf + off, iov[i], RADIX_TREE_NODE_SIZE); + } + + td_complete_request(treq, 0); +} + +static void +block_cache_populate_cache(td_request_t clone, int err) +{ + int i; + radix_tree_t *tree; + block_cache_t *cache; + block_cache_request_t *breq; + + breq = (block_cache_request_t *)clone.cb_data; + cache = breq->cache; + tree = &cache->tree; + breq->secs -= clone.secs; + breq->err = (breq->err ? 
breq->err : err); + + if (breq->secs) + return; + + if (breq->err) { + free(breq->buf); + goto out; + } + + for (i = 0; i < breq->treq.secs; i++) { + off_t off = i << RADIX_TREE_NODE_SHIFT; + DBG("%s: populating sec 0x%08llx\n", + cache->name, breq->treq.sec + i); + memcpy(breq->treq.buf + off, + breq->buf + off, RADIX_TREE_NODE_SIZE); + } + + if (radix_tree_add_leaves(tree, breq->buf, + breq->treq.sec, breq->treq.secs)) + free(breq->buf); + +out: + td_complete_request(breq->treq, breq->err); + block_cache_put_request(cache, breq); +} + +static void +block_cache_miss(block_cache_t *cache, td_request_t treq) +{ + char *buf; + size_t size; + td_request_t clone; + radix_tree_t *tree; + block_cache_request_t *breq; + + DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec); + + clone = treq; + tree = &cache->tree; + size = treq.secs << RADIX_TREE_NODE_SHIFT; + + cache->stats.misses += treq.secs; + + if (radix_tree_size(tree) + size >= BLOCK_CACHE_MAX_SIZE) + goto out; + + breq = block_cache_get_request(cache); + if (!breq) + goto out; + + if (posix_memalign((void **)&buf, RADIX_TREE_NODE_SIZE, size)) { + block_cache_put_request(cache, breq); + goto out; + } + + breq->treq = treq; + breq->secs = treq.secs; + breq->err = 0; + breq->buf = buf; + breq->cache = cache; + + clone.buf = buf; + clone.cb = block_cache_populate_cache; + clone.cb_data = breq; + +out: + td_forward_request(clone); +} + +static void +block_cache_queue_read(td_driver_t *driver, td_request_t treq) +{ + int i; + radix_tree_t *tree; + block_cache_t *cache; + char *iov[BLOCK_CACHE_NODES_PER_PAGE]; + + cache = (block_cache_t *)driver->data; + tree = &cache->tree; + + cache->stats.reads += treq.secs; + + if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE) + return td_forward_request(treq); + + for (i = 0; i < treq.secs; i++) { + iov[i] = radix_tree_find_leaf(tree, treq.sec + i); + if (!iov[i]) + return block_cache_miss(cache, treq); + } + + return block_cache_hit(cache, treq, iov); +} + +static void 
+block_cache_queue_write(td_driver_t *driver, td_request_t treq) +{ + td_complete_request(treq, -EPERM); +} + +static int +block_cache_get_parent_id(td_driver_t *driver, td_disk_id_t *id) +{ + return -EINVAL; +} + +static int +block_cache_validate_parent(td_driver_t *driver, + td_driver_t *pdriver, td_flag_t flags) +{ + block_cache_t *cache; + + if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY)) + return -EINVAL; + + cache = (block_cache_t *)driver->data; + if (strcmp(driver->name, pdriver->name)) + return -EINVAL; + + return 0; +} + +static void +block_cache_debug(td_driver_t *driver) +{ + block_cache_t *cache; + block_cache_stats_t *stats; + + cache = (block_cache_t *)driver->data; + stats = &cache->stats; + + WARN("BLOCK CACHE %s\n", cache->name); + WARN("reads: %"PRIu64", hits: %"PRIu64", misses: %"PRIu64", prunes: %"PRIu64"\n", + stats->reads, stats->hits, stats->misses, stats->prunes); +} + +struct tap_disk tapdisk_block_cache = { + .disk_type = "tapdisk_block_cache", + .flags = 0, + .private_data_size = sizeof(block_cache_t), + .td_open = block_cache_open, + .td_close = block_cache_close, + .td_queue_read = block_cache_queue_read, + .td_queue_write = block_cache_queue_write, + .td_get_parent_id = block_cache_get_parent_id, + .td_validate_parent = block_cache_validate_parent, + .td_debug = block_cache_debug, +}; diff --git a/tools/blktap2/drivers/block-log.c b/tools/blktap2/drivers/block-log.c new file mode 100644 index 0000000000..2cc051b7d2 --- /dev/null +++ b/tools/blktap2/drivers/block-log.c @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* Driver to sit on top of another disk and log writes, in order + * to synchronize two distinct disks + * + * On receipt of a control request it can export a list of dirty + * sectors in the following format: + * struct writerange { + * u64 sector; + * u32 count; + * } + * terminated by { 0, 0 } + */ + +#include <errno.h> +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "log.h" +#include "tapdisk.h" +#include "tapdisk-server.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" + +#define MAX_CONNECTIONS 1 + +typedef struct poll_fd { + int fd; + event_id_t id; +} poll_fd_t; + +struct tdlog_state { + uint64_t size; + + void* writelog; + + char* ctlpath; + poll_fd_t ctl; + + int connected; + poll_fd_t connections[MAX_CONNECTIONS]; + + char* shmpath; + void* shm; + + log_sring_t* sring; + log_back_ring_t bring; +}; + +#define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a) + +#define BWPRINTF(_f, _a...) 
syslog (LOG_WARNING, "log: " _f "\n", ## _a) + +static void ctl_accept(event_id_t, char, void *); +static void ctl_request(event_id_t, char, void *); + +/* -- write log -- */ + +/* large flat bitmaps don't scale particularly well either in size or scan + * time, but they'll do for now */ +#define BITS_PER_LONG (sizeof(unsigned long) * 8) +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) + +#define BITMAP_ENTRY(_nr, _bmap) ((unsigned long*)(_bmap))[(_nr)/BITS_PER_LONG] +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG) + +static inline int test_bit(int nr, void* bmap) +{ + return (BITMAP_ENTRY(nr, bmap) >> BITMAP_SHIFT(nr)) & 1; +} + +static inline void clear_bit(int nr, void* bmap) +{ + BITMAP_ENTRY(nr, bmap) &= ~(1UL << BITMAP_SHIFT(nr)); +} + +static inline void set_bit(int nr, void* bmap) +{ + BITMAP_ENTRY(nr, bmap) |= (1UL << BITMAP_SHIFT(nr)); +} + +static inline int bitmap_size(uint64_t sz) +{ + return sz >> 3; +} + +static int writelog_create(struct tdlog_state *s) +{ + uint64_t bmsize; + + bmsize = bitmap_size(s->size); + + BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", bmsize); + + if (!(s->writelog = calloc(bmsize, 1))) { + BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, bmsize); + return -1; + } + + return 0; +} + +static int writelog_free(struct tdlog_state *s) +{ + if (s->writelog) + free(s->writelog); + + return 0; +} + +static int writelog_set(struct tdlog_state* s, uint64_t sector, int count) +{ + int i; + + for (i = 0; i < count; i++) + set_bit(sector + i, s->writelog); + + return 0; +} + +/* if end is 0, clear to end of disk */ +int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end) +{ + if (!end) + end = s->size; + + /* clear to word boundaries */ + while (BITMAP_SHIFT(start)) + clear_bit(start++, s->writelog); + while (BITMAP_SHIFT(end)) + clear_bit(end--, s->writelog); + + memset(s->writelog + start / BITS_PER_LONG, 0, (end - start) >> 3); + + return 0; +} + +/* returns last block 
exported (may not be end of disk if shm region + * overflows) */ +static uint64_t writelog_export(struct tdlog_state* s) +{ + struct disk_range* range = s->shm; + uint64_t i = 0; + + BDPRINTF("sector count: %"PRIu64, s->size); + + for (i = 0; i < s->size; i++) { + if (test_bit(i, s->writelog)) { + /* range start */ + range->sector = i; + range->count = 1; + /* find end */ + for (i++; i < s->size && test_bit(i, s->writelog); i++) + range->count++; + + BDPRINTF("export: dirty extent %"PRIu64":%u", + range->sector, range->count); + range++; + + /* out of space in shared memory region */ + if ((void*)range >= bmend(s->shm)) { + BDPRINTF("out of space in shm region at sector %"PRIu64, i); + return i; + } + + /* undo forloop increment */ + i--; + } + } + + /* NULL-terminate range list */ + range->sector = 0; + range->count = 0; + + return i; +} + +/* -- communication channel -- */ + +/* remove FS special characters in up to len bytes of path */ +static inline void path_escape(char* path, size_t len) { + int i; + + for (i = 0; i < len && path[i]; i++) + if (strchr(":/", path[i])) + path[i] = '_'; +} + +static char* ctl_makepath(const char* name, const char* ext) +{ + char* res; + char *file; + + file = strrchr(name, '/'); + if (!file) { + BWPRINTF("invalid name %s\n", name); + return NULL; + } + + if (asprintf(&res, BLKTAP_CTRL_DIR "/log_%s.%s", file, ext) < 0) { + BWPRINTF("could not allocate path"); + return NULL; + } + + path_escape(res + strlen(BLKTAP_CTRL_DIR) + 5, strlen(file)); + + return res; +} + +static int shmem_open(struct tdlog_state* s, const char* name) +{ + int i, l, fd; + + /* device name -> path */ + if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) { + BWPRINTF("could not allocate shm path"); + return -1; + } + + path_escape(s->shmpath + 5, strlen(name)); + + if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) { + BWPRINTF("could not open shared memory file %s: %s", s->shmpath, + strerror(errno)); + goto err; + } + if (ftruncate(fd, SHMSIZE) < 
0) { + BWPRINTF("error truncating shmem to size %u", SHMSIZE); + close(fd); + goto err; + } + + s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (s->shm == MAP_FAILED) { + BWPRINTF("could not mmap write log shm: %s", strerror(errno)); + goto err; + } + return 0; + + err: + s->shm = NULL; + free(s->shmpath); + s->shmpath = NULL; + return -1; +} + +static int shmem_close(struct tdlog_state* s) +{ + if (s->shm) { + munmap(s->shm, SHMSIZE); + s->shm = NULL; + } + + if (s->shmpath) { + shm_unlink(s->shmpath); + s->shmpath = NULL; + } + + return 0; +} + +/* control socket */ + +static int ctl_open(struct tdlog_state* s, const char* name) +{ + struct sockaddr_un saddr; + + if (!(s->ctlpath = ctl_makepath(name, "ctl"))) + return -1; + + if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + BWPRINTF("error opening control socket: %s", strerror(errno)); + goto err; + } + + memset(&saddr, 0, sizeof(saddr)); + saddr.sun_family = AF_UNIX; + memcpy(saddr.sun_path, s->ctlpath, strlen(s->ctlpath)); + if (unlink(s->ctlpath) && errno != ENOENT) { + BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath, + strerror(errno)); + goto err_sock; + } + + if (bind(s->ctl.fd, &saddr, sizeof(saddr)) < 0) { + BWPRINTF("error binding control socket to %s: %s", s->ctlpath, + strerror(errno)); + goto err_sock; + } + + if (listen(s->ctl.fd, 1) < 0) { + BWPRINTF("error listening on control socket: %s", strerror(errno)); + goto err_sock; + } + + s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + s->ctl.fd, 0, ctl_accept, s); + if (s->ctl.id < 0) { + BWPRINTF("error register event handler: %s", strerror(s->ctl.id)); + goto err_sock; + } + + return 0; + + err_sock: + close(s->ctl.fd); + s->ctl.fd = -1; + err: + free(s->ctlpath); + s->ctlpath = NULL; + + return -1; +} + +static int ctl_close(struct tdlog_state* s) +{ + while (s->connected) { + tapdisk_server_unregister_event(s->connections[s->connected].id); + 
close(s->connections[s->connected].fd); + s->connections[s->connected].fd = -1; + s->connections[s->connected].id = 0; + s->connected--; + } + + if (s->ctl.fd >= 0) { + tapdisk_server_unregister_event(s->ctl.id); + close(s->ctl.fd); + s->ctl.fd = -1; + s->ctl.id = 0; + } + + if (s->ctlpath) { + unlink(s->ctlpath); + free(s->ctlpath); + s->ctlpath = NULL; + } + + /* XXX this must be fixed once requests are actually in flight */ + /* could just drain the existing ring here first */ + if (s->sring) { + SHARED_RING_INIT(s->sring); + BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE); + } + + return 0; +} + +/* walk list of open sockets, close matching fd */ +static int ctl_close_sock(struct tdlog_state* s, int fd) +{ + int i; + + for (i = 0; i <= s->connected; i++) { + if (s->connections[i].fd == fd) { + tapdisk_server_unregister_event(s->connections[i].id); + close(s->connections[i].fd); + s->connections[i].fd = -1; + s->connections[i].id = 0; + s->connected--; + return 0; + } + } + + BWPRINTF("requested to close unknown socket %d", fd); + return -1; +} + +static void ctl_accept(event_id_t id, char mode, void *private) +{ + struct tdlog_state* s = (struct tdlog_state *)private; + int fd; + event_id_t cid; + + if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) { + BWPRINTF("error accepting control connection: %s", strerror(errno)); + return; + } + + if (s->connected) { + BWPRINTF("control session in progress, closing new connection"); + close(fd); + return; + } + + cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + fd, 0, ctl_request, s); + if (cid < 0) { + BWPRINTF("error registering connection event handler: %s", strerror(cid)); + close(fd); + return; + } + + s->connections[s->connected].fd = fd; + s->connections[s->connected].id = cid; + s->connected++; +} + +/* response format: 4 bytes shmsize, 0-terminated path */ +static int ctl_get_shmpath(struct tdlog_state* s, int fd) +{ + char msg[CTLRSPLEN_SHMP + 1]; + uint32_t sz; + int rc; + + BDPRINTF("ctl: sending 
shared memory parameters (size: %u, path: %s)", + SHMSIZE, s->shmpath); + + /* TMP: sanity-check shm */ + sz = 0xdeadbeef; + memcpy(s->shm, &sz, sizeof(sz)); + + sz = SHMSIZE; + memcpy(msg, &sz, sizeof(sz)); + snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath); + if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) { + BWPRINTF("error writing shmpath: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int ctl_peek_writes(struct tdlog_state* s, int fd) +{ + int rc; + + BDPRINTF("ctl: peeking bitmap"); + + writelog_export(s); + + if ((rc = write(fd, "done", CTLRSPLEN_PEEK)) < 0) { + BWPRINTF("error writing peek ack: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int ctl_clear_writes(struct tdlog_state* s, int fd) +{ + int rc; + + BDPRINTF("ctl: clearing bitmap"); + + writelog_clear(s, 0, 0); + + if ((rc = write(fd, "done", CTLRSPLEN_CLEAR)) < 0) { + BWPRINTF("error writing clear ack: %s", strerror(errno)); + return -1; + } + + return 0; +} + +/* get dirty bitmap and clear it atomically */ +static int ctl_get_writes(struct tdlog_state* s, int fd) +{ + int rc; + + BDPRINTF("ctl: getting bitmap"); + + writelog_export(s); + writelog_clear(s, 0, 0); + + if ((rc = write(fd, "done", CTLRSPLEN_GET)) < 0) { + BWPRINTF("error writing get ack: %s", strerror(errno)); + return -1; + } + + return 0; +} + +/* get requests from ring */ +static int ctl_kick(struct tdlog_state* s, int fd) +{ + RING_IDX reqstart, reqend; + log_request_t req; + + /* XXX testing */ + RING_IDX rspstart, rspend; + log_response_t rsp; + struct log_ctlmsg msg; + int rc; + + reqstart = s->bring.req_cons; + reqend = s->sring->req_prod; + + BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend); + + while (reqstart != reqend) { + /* XXX actually submit these! 
*/ + memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req)); + BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count); + s->bring.req_cons = ++reqstart; + + rsp.sector = req.sector; + rsp.count = req.count; + memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp, + sizeof(rsp)); + s->bring.rsp_prod_pvt++; + } + + RING_PUSH_RESPONSES(&s->bring); + memset(&msg, 0, sizeof(msg)); + memcpy(msg.msg, LOGCMD_KICK, 4); + if ((rc = write(fd, &msg, sizeof(msg))) < 0) { + BWPRINTF("error sending notify: %s", strerror(errno)); + return -1; + } else if (rc < sizeof(msg)) { + BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg)); + return -1; + } + + return 0; +} + +static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg) +{ + if (!strncmp(msg->msg, LOGCMD_SHMP, 4)) { + return ctl_get_shmpath(s, fd); + } else if (!strncmp(msg->msg, LOGCMD_PEEK, 4)) { + return ctl_peek_writes(s, fd); + } else if (!strncmp(msg->msg, LOGCMD_CLEAR, 4)) { + return ctl_clear_writes(s, fd); + } else if (!strncmp(msg->msg, LOGCMD_GET, 4)) { + return ctl_get_writes(s, fd); + } else if (!strncmp(msg->msg, LOGCMD_KICK, 4)) { + return ctl_kick(s, fd); + } + + BWPRINTF("unknown control request %.4s", msg->msg); + return -1; +} + +static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id) +{ + int i; + + for (i = 0; i < s->connected; i++) + if (s->connections[i].id == id) + return s->connections[i].fd; + + BWPRINTF("unrecognized event callback id %d", id); + return -1; +} + +static void ctl_request(event_id_t id, char mode, void *private) +{ + struct tdlog_state* s = (struct tdlog_state*)private; + struct log_ctlmsg msg; + int rc, i, fd = -1; + + fd = ctl_find_connection(s, id); + if (fd == -1) + return; + + if ((rc = read(fd, &msg, sizeof(msg))) < 0) { + BWPRINTF("error reading from ctl socket %d, closing: %s", fd, + strerror(errno)); + ctl_close_sock(s, fd); + return; + } else if (rc == 0) { + BDPRINTF("ctl_request: EOF, closing 
socket"); + ctl_close_sock(s, fd); + return; + } else if (rc < sizeof(msg)) { + BWPRINTF("short request received (%d/%zd bytes), ignoring", rc, + sizeof(msg)); + return; + } + + ctl_do_request(s, fd, &msg); +} + +/* -- interface -- */ + +static int tdlog_close(td_driver_t*); + +static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags) +{ + struct tdlog_state* s = (struct tdlog_state*)driver->data; + int rc; + + memset(s, 0, sizeof(*s)); + + s->size = driver->info.size; + + if ((rc = writelog_create(s))) { + tdlog_close(driver); + return rc; + } + if ((rc = shmem_open(s, name))) { + tdlog_close(driver); + return rc; + } + if ((rc = ctl_open(s, name))) { + tdlog_close(driver); + return rc; + } + + s->sring = (log_sring_t*)sringstart(s->shm); + SHARED_RING_INIT(s->sring); + BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE); + + BDPRINTF("opened ctl socket"); + + return 0; +} + +static int tdlog_close(td_driver_t* driver) +{ + struct tdlog_state* s = (struct tdlog_state*)driver->data; + + ctl_close(s); + shmem_close(s); + writelog_free(s); + + return 0; +} + +static void tdlog_queue_read(td_driver_t* driver, td_request_t treq) +{ + td_forward_request(treq); +} + +static void tdlog_queue_write(td_driver_t* driver, td_request_t treq) +{ + struct tdlog_state* s = (struct tdlog_state*)driver->data; + int rc; + + writelog_set(s, treq.sec, treq.secs); + td_forward_request(treq); +} + +static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id) +{ + return -EINVAL; +} + +static int tdlog_validate_parent(td_driver_t *driver, + td_driver_t *parent, td_flag_t flags) +{ + return 0; +} + +struct tap_disk tapdisk_log = { + .disk_type = "tapdisk_log", + .private_data_size = sizeof(struct tdlog_state), + .flags = 0, + .td_open = tdlog_open, + .td_close = tdlog_close, + .td_queue_read = tdlog_queue_read, + .td_queue_write = tdlog_queue_write, + .td_get_parent_id = tdlog_get_parent_id, + .td_validate_parent = tdlog_validate_parent, +}; diff --git 
a/tools/blktap2/drivers/block-qcow.c b/tools/blktap2/drivers/block-qcow.c new file mode 100644 index 0000000000..1ddd92d750 --- /dev/null +++ b/tools/blktap2/drivers/block-qcow.c @@ -0,0 +1,1517 @@ +/* block-qcow.c + * + * Asynchronous Qemu copy-on-write disk implementation. + * Code based on the Qemu implementation + * (see copyright notice below) + * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + */ + +/* + * Block driver for the QCOW format + * + * Copyright (c) 2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files(the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> +#include <zlib.h> +#include <inttypes.h> +#include <libaio.h> +#include <openssl/md5.h> +#include "bswap.h" +#include "aes.h" + +#include "tapdisk.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" +#include "qcow.h" +#include "blk.h" +#include "atomicio.h" + +/* *BSD has no O_LARGEFILE */ +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif + +#if 1 +#define ASSERT(_p) \ + if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#else +#define ASSERT(_p) ((void)0) +#endif + +struct pending_aio { + td_callback_t cb; + int id; + void *private; + int nb_sectors; + char *buf; + uint64_t sector; +}; + +#undef IOCB_IDX +#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list) + +#define ZERO_TEST(_b) (_b | 0x00) + +struct qcow_request { + 
td_request_t treq; + struct tiocb tiocb; + struct tdqcow_state *state; +}; + +static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset); + +#ifdef USE_GCRYPT + +#include <gcrypt.h> + +uint32_t gen_cksum(char *ptr, int len) +{ + int i; + uint32_t md[4]; + + /* Generate checksum */ + gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len); + + return md[0]; +} + +#else /* use libcrypto */ + +#include <openssl/md5.h> + +uint32_t gen_cksum(char *ptr, int len) +{ + int i; + unsigned char *md; + uint32_t ret; + + md = malloc(MD5_DIGEST_LENGTH); + if(!md) return 0; + + /* Generate checksum */ + if (MD5((unsigned char *)ptr, len, md) != md) + ret = 0; + else + memcpy(&ret, md, sizeof(uint32_t)); + + free(md); + return ret; +} + +#endif + + +static void free_aio_state(struct tdqcow_state* s) +{ + free(s->aio_requests); + free(s->aio_free_list); +} + +static int init_aio_state(td_driver_t *driver) +{ + int i, ret; + td_disk_info_t *bs = &(driver->info); + struct tdqcow_state *s = (struct tdqcow_state *)driver->data; + + // A segment (i.e. 
a page) can span multiple clusters + s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) * + MAX_SEGMENTS_PER_REQ * MAX_REQUESTS; + + s->aio_free_count = s->max_aio_reqs; + + if (!(s->aio_requests = calloc(s->max_aio_reqs, sizeof(struct qcow_request))) || + !(s->aio_free_list = calloc(s->max_aio_reqs, sizeof(struct qcow_request)))) { + DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n", + s->max_aio_reqs); + goto fail; + } + + for (i = 0; i < s->max_aio_reqs; i++) + s->aio_free_list[i] = &s->aio_requests[i]; + + DPRINTF("AIO state initialised\n"); + + return 0; + fail: + return -1; +} + +int get_filesize(char *filename, uint64_t *size, struct stat *st) +{ + int fd; + QCowHeader header; + + /*Set to the backing file size*/ + fd = open(filename, O_RDONLY); + if (fd < 0) + return -1; + if (read(fd, &header, sizeof(header)) < sizeof(header)) { + close(fd); + return -1; + } + close(fd); + + be32_to_cpus(&header.magic); + be64_to_cpus(&header.size); + if (header.magic == QCOW_MAGIC) { + *size = header.size >> SECTOR_SHIFT; + return 0; + } + + if(S_ISBLK(st->st_mode)) { + fd = open(filename, O_RDONLY); + if (fd < 0) + return -1; + if (blk_getimagesize(fd, size) != 0) { + printf("Unable to get Block device size\n"); + close(fd); + return -1; + } + close(fd); + } else *size = (st->st_size >> SECTOR_SHIFT); + return 0; +} + +static int qcow_set_key(struct tdqcow_state *s, const char *key) +{ + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for (i = 0; i < len; i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; +#if 0 + /* test */ + { + uint8_t in[16]; + uint8_t out[16]; + uint8_t tmp[16]; + for (i=0; i<16; i++) + in[i] = i; + 
AES_encrypt(in, tmp, &s->aes_encrypt_key); + AES_decrypt(tmp, out, &s->aes_decrypt_key); + for (i = 0; i < 16; i++) + DPRINTF(" %02x", tmp[i]); + DPRINTF("\n"); + for (i = 0; i < 16; i++) + DPRINTF(" %02x", out[i]); + DPRINTF("\n"); + } +#endif + return 0; +} + +void tdqcow_complete(void *arg, struct tiocb *tiocb, int err) +{ + struct qcow_request *aio = (struct qcow_request *)arg; + struct tdqcow_state *s = aio->state; + + td_complete_request(aio->treq, err); + + s->aio_free_list[s->aio_free_count++] = aio; +} + +static void async_read(td_driver_t *driver, td_request_t treq) +{ + int size; + uint64_t offset; + struct qcow_request *aio; + struct tdqcow_state *prv; + + prv = (struct tdqcow_state *)driver->data; + size = treq.secs * driver->info.sector_size; + offset = treq.sec * (uint64_t)driver->info.sector_size; + + if (prv->aio_free_count == 0) + goto fail; + + aio = prv->aio_free_list[--prv->aio_free_count]; + aio->treq = treq; + aio->state = prv; + + td_prep_read(&aio->tiocb, prv->fd, treq.buf, + size, offset, tdqcow_complete, aio); + td_queue_tiocb(driver, &aio->tiocb); + + return; + +fail: + td_complete_request(treq, -EBUSY); +} + +static void async_write(td_driver_t *driver, td_request_t treq) +{ + int size; + uint64_t offset; + struct qcow_request *aio; + struct tdqcow_state *prv; + + prv = (struct tdqcow_state *)driver->data; + size = treq.secs * driver->info.sector_size; + offset = treq.sec * (uint64_t)driver->info.sector_size; + + if (prv->aio_free_count == 0) + goto fail; + + aio = prv->aio_free_list[--prv->aio_free_count]; + aio->treq = treq; + aio->state = prv; + + td_prep_write(&aio->tiocb, prv->fd, treq.buf, + size, offset, tdqcow_complete, aio); + td_queue_tiocb(driver, &aio->tiocb); + + return; + +fail: + td_complete_request(treq, -EBUSY); +} + +/* + * The crypt function is compatible with the linux cryptoloop + * algorithm for < 4 GB images. NOTE: out_buf == in_buf is + * supported . 
+ */ +static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for (i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +int qtruncate(int fd, off_t length, int sparse) +{ + int ret, i; + int current = 0, rem = 0; + uint64_t sectors; + struct stat st; + char *buf; + + /* If length is greater than the current file len + * we synchronously write zeroes to the end of the + * file, otherwise we truncate the length down + */ + ret = fstat(fd, &st); + if (ret == -1) + return -1; + if (S_ISBLK(st.st_mode)) + return 0; + + sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE; + current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE; + rem = st.st_size % DEFAULT_SECTOR_SIZE; + + /* If we are extending this file, we write zeros to the end -- + * this tries to ensure that the extents allocated wind up being + * contiguous on disk. 
+ */ + if(st.st_size < sectors * DEFAULT_SECTOR_SIZE) { + /*We are extending the file*/ + if ((ret = posix_memalign((void **)&buf, + 512, DEFAULT_SECTOR_SIZE))) { + DPRINTF("posix_memalign failed: %d\n", ret); + return -1; + } + memset(buf, 0x00, DEFAULT_SECTOR_SIZE); + if (lseek(fd, 0, SEEK_END)==-1) { + DPRINTF("Lseek EOF failed (%d), internal error\n", + errno); + free(buf); + return -1; + } + if (rem) { + ret = write(fd, buf, rem); + if (ret != rem) { + DPRINTF("write failed: ret = %d, err = %s\n", + ret, strerror(errno)); + free(buf); + return -1; + } + } + for (i = current; i < sectors; i++ ) { + ret = write(fd, buf, DEFAULT_SECTOR_SIZE); + if (ret != DEFAULT_SECTOR_SIZE) { + DPRINTF("write failed: ret = %d, err = %s\n", + ret, strerror(errno)); + free(buf); + return -1; + } + } + free(buf); + } else if(sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE)) + if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) { + DPRINTF("Ftruncate failed (%s)\n", strerror(errno)); + return -1; + } + return 0; +} + +/* 'allocate' is: + * + * 0 to not allocate. + * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. 
+ */ +static uint64_t get_cluster_offset(struct tdqcow_state *s, + uint64_t offset, int allocate, + int compressed_size, + int n_start, int n_end) +{ + int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector; + char *tmp_ptr2, *l2_ptr, *l1_ptr; + uint64_t *tmp_ptr; + uint64_t l2_offset, *l2_table, cluster_offset, tmp; + uint32_t min_count; + int new_l2_table; + + /*Check L1 table for the extent offset*/ + l1_index = offset >> (s->l2_bits + s->cluster_bits); + l2_offset = s->l1_table[l1_index]; + new_l2_table = 0; + if (!l2_offset) { + if (!allocate) + return 0; + /* + * allocating a new l2 entry + extent + * at the end of the file, we must also + * update the L1 entry safely. + */ + l2_offset = s->fd_end; + + /* round to cluster size */ + l2_offset = (l2_offset + s->cluster_size - 1) + & ~(s->cluster_size - 1); + + /* update the L1 entry */ + s->l1_table[l1_index] = l2_offset; + + /*Truncate file for L2 table + *(initialised to zero in case we crash)*/ + if (qtruncate(s->fd, + l2_offset + (s->l2_size * sizeof(uint64_t)), + s->sparse) != 0) { + DPRINTF("ERROR truncating file\n"); + return 0; + } + s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t)); + + /*Update the L1 table entry on disk + * (for O_DIRECT we write 4KByte blocks)*/ + l1_sector = (l1_index * sizeof(uint64_t)) >> 12; + l1_ptr = (char *)s->l1_table + (l1_sector << 12); + + if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) { + DPRINTF("ERROR allocating memory for L1 table\n"); + } + memcpy(tmp_ptr, l1_ptr, 4096); + + /* Convert block to write to big endian */ + for(i = 0; i < 4096 / sizeof(uint64_t); i++) { + cpu_to_be64s(&tmp_ptr[i]); + } + + /* + * Issue non-asynchronous L1 write. + * For safety, we must ensure that + * entry is written before blocks. 
+ */ + lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET); + if (write(s->fd, tmp_ptr, 4096) != 4096) { + free(tmp_ptr); + return 0; + } + free(tmp_ptr); + + new_l2_table = 1; + goto cache_miss; + } else if (s->min_cluster_alloc == s->l2_size) { + /*Fast-track the request*/ + cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t)); + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + return cluster_offset + (l2_index * s->cluster_size); + } + + /*Check to see if L2 entry is already cached*/ + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for (j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + l2_table = s->l2_cache + (i << s->l2_bits); + goto found; + } + } + +cache_miss: + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = s->l2_cache + (min_index << s->l2_bits); + + /*If extent pre-allocated, read table from disk, + *otherwise write new table to disk*/ + if (new_l2_table) { + /*Should we allocate the whole extent? 
Adjustable parameter.*/ + if (s->cluster_alloc == s->l2_size) { + cluster_offset = l2_offset + + (s->l2_size * sizeof(uint64_t)); + cluster_offset = (cluster_offset + s->cluster_size - 1) + & ~(s->cluster_size - 1); + if (qtruncate(s->fd, cluster_offset + + (s->cluster_size * s->l2_size), + s->sparse) != 0) { + DPRINTF("ERROR truncating file\n"); + return 0; + } + s->fd_end = cluster_offset + + (s->cluster_size * s->l2_size); + for (i = 0; i < s->l2_size; i++) { + l2_table[i] = cpu_to_be64(cluster_offset + + (i*s->cluster_size)); + } + } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + + lseek(s->fd, l2_offset, SEEK_SET); + if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } else { + lseek(s->fd, l2_offset, SEEK_SET); + if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } + + /*Update the cache entries*/ + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + +found: + /*The extent is split into 's->l2_size' blocks of + *size 's->cluster_size'*/ + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + cluster_offset = be64_to_cpu(l2_table[l2_index]); + + if (!cluster_offset || + ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) { + if (!allocate) + return 0; + + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && + (n_end - n_start) < s->cluster_sectors) { + /* cluster is already allocated but compressed, we must + decompress it in the case it is not completely + overwritten */ + if (decompress_cluster(s, cluster_offset) < 0) + return 0; + cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET); + cluster_offset = (cluster_offset + s->cluster_size - 1) + & ~(s->cluster_size - 1); + /* write the cluster content - not asynchronous */ + lseek(s->fd, cluster_offset, SEEK_SET); + if (write(s->fd, s->cluster_cache, s->cluster_size) != + s->cluster_size) + return -1; + } else { + /* allocate a new cluster */ + 
cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET); + if (allocate == 1) { + /* round to cluster size */ + cluster_offset = + (cluster_offset + s->cluster_size - 1) + & ~(s->cluster_size - 1); + if (qtruncate(s->fd, cluster_offset + + s->cluster_size, s->sparse)!=0) { + DPRINTF("ERROR truncating file\n"); + return 0; + } + s->fd_end = (cluster_offset + s->cluster_size); + /* if encrypted, we must initialize the cluster + content which won't be written */ + if (s->crypt_method && + (n_end - n_start) < s->cluster_sectors) { + uint64_t start_sect; + start_sect = (offset & + ~(s->cluster_size - 1)) + >> 9; + memset(s->cluster_data + 512, + 0xaa, 512); + for (i = 0; i < s->cluster_sectors;i++) + { + if (i < n_start || i >= n_end) + { + encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, 1, + &s->aes_encrypt_key); + lseek(s->fd, cluster_offset + i * 512, SEEK_SET); + if (write(s->fd, s->cluster_data, 512) != 512) + return -1; + } + } + } + } else { + cluster_offset |= QCOW_OFLAG_COMPRESSED | + (uint64_t)compressed_size + << (63 - s->cluster_bits); + } + } + /* update L2 table */ + tmp = cpu_to_be64(cluster_offset); + l2_table[l2_index] = tmp; + + /*For IO_DIRECT we write 4KByte blocks*/ + l2_sector = (l2_index * sizeof(uint64_t)) >> 12; + l2_ptr = (char *)l2_table + (l2_sector << 12); + + if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) { + DPRINTF("ERROR allocating memory for L1 table\n"); + } + memcpy(tmp_ptr2, l2_ptr, 4096); + lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET); + if (write(s->fd, tmp_ptr2, 4096) != 4096) { + free(tmp_ptr2); + return -1; + } + free(tmp_ptr2); + } + return cluster_offset; +} + +static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int index_in_cluster, n; + uint64_t cluster_offset; + + cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - 
index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + *pnum = n; + return (cluster_offset != 0); +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + (out_len != out_buf_size) ) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset) +{ + int ret, csize; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + csize = cluster_offset >> (63 - s->cluster_bits); + csize &= (s->cluster_size - 1); + lseek(s->fd, coffset, SEEK_SET); + ret = read(s->fd, s->cluster_data, csize); + if (ret != csize) + return -1; + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data, csize) < 0) { + return -1; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +static int +tdqcow_read_header(int fd, QCowHeader *header) +{ + int err; + char *buf; + struct stat st; + size_t size, expected; + + memset(header, 0, sizeof(*header)); + + err = fstat(fd, &st); + if (err) + return -errno; + + err = lseek(fd, 0, SEEK_SET); + if (err == (off_t)-1) + return -errno; + + size = (sizeof(*header) + 511) & ~511; + err = posix_memalign((void **)&buf, 512, size); + if (err) + return err; + + expected = size; + if (st.st_size < size) + expected = st.st_size; + + errno = 0; + err = read(fd, buf, size); + if (err != expected) { + err = (errno ? 
-errno : -EIO); + goto out; + } + + memcpy(header, buf, sizeof(*header)); + be32_to_cpus(&header->magic); + be32_to_cpus(&header->version); + be64_to_cpus(&header->backing_file_offset); + be32_to_cpus(&header->backing_file_size); + be32_to_cpus(&header->mtime); + be64_to_cpus(&header->size); + be32_to_cpus(&header->crypt_method); + be64_to_cpus(&header->l1_table_offset); + + err = 0; + +out: + free(buf); + return err; +} + +static int +tdqcow_load_l1_table(struct tdqcow_state *s, QCowHeader *header) +{ + char *buf; + struct stat st; + size_t expected; + int i, err, shift; + QCowHeader_ext *exthdr; + uint32_t l1_table_bytes, l1_table_block, l1_table_size; + + buf = NULL; + s->l1_table = NULL; + + shift = s->cluster_bits + s->l2_bits; + + s->l1_size = (header->size + (1LL << shift) - 1) >> shift; + s->l1_table_offset = header->l1_table_offset; + + s->min_cluster_alloc = 1; /* default */ + + l1_table_bytes = s->l1_size * sizeof(uint64_t); + l1_table_size = (l1_table_bytes + 4095) & ~4095; + l1_table_block = (l1_table_bytes + s->l1_table_offset + 4095) & ~4095; + + DPRINTF("L1 Table offset detected: %"PRIu64", size %d (%d)\n", + (uint64_t)s->l1_table_offset, + (int) (s->l1_size * sizeof(uint64_t)), + l1_table_size); + + err = fstat(s->fd, &st); + if (err) { + err = -errno; + goto out; + } + + err = lseek(s->fd, 0, SEEK_SET); + if (err == (off_t)-1) { + err = -errno; + goto out; + } + + err = posix_memalign((void **)&buf, 512, l1_table_block); + if (err) { + buf = NULL; + goto out; + } + + err = posix_memalign((void **)&s->l1_table, 4096, l1_table_size); + if (err) { + s->l1_table = NULL; + goto out; + } + + memset(buf, 0, l1_table_block); + memset(s->l1_table, 0, l1_table_size); + + expected = l1_table_block; + if (st.st_size < l1_table_block) + expected = st.st_size; + + errno = 0; + err = read(s->fd, buf, l1_table_block); + if (err != expected) { + err = (errno ? 
-errno : -EIO); + goto out; + } + + memcpy(s->l1_table, buf + s->l1_table_offset, l1_table_size); + exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader)); + + /* check for xen extended header */ + if (s->l1_table_offset % 4096 == 0 && + be32_to_cpu(exthdr->xmagic) == XEN_MAGIC) { + uint32_t flags = be32_to_cpu(exthdr->flags); + uint32_t cksum = be32_to_cpu(exthdr->cksum); + + /* + * Try to detect old tapdisk images. They have to be fixed + * because they use big endian rather than native endian for + * the L1 table. After this block, the l1 table will + * definitely be in BIG endian. + */ + if (!(flags & EXTHDR_L1_BIG_ENDIAN)) { + DPRINTF("qcow: converting to big endian L1 table\n"); + + /* convert to big endian */ + for (i = 0; i < s->l1_size; i++) + cpu_to_be64s(&s->l1_table[i]); + + flags |= EXTHDR_L1_BIG_ENDIAN; + exthdr->flags = cpu_to_be32(flags); + + memcpy(buf + s->l1_table_offset, + s->l1_table, l1_table_size); + + err = lseek(s->fd, 0, SEEK_SET); + if (err == (off_t)-1) { + err = -errno; + goto out; + } + + err = atomicio(vwrite, s->fd, buf, l1_table_block); + if (err != l1_table_block) { + err = -errno; + goto out; + } + } + + /* check the L1 table checksum */ + if (cksum != gen_cksum((char *)s->l1_table, + s->l1_size * sizeof(uint64_t))) + DPRINTF("qcow: bad L1 checksum\n"); + else { + s->extended = 1; + s->sparse = (be32_to_cpu(exthdr->flags) & SPARSE_FILE); + s->min_cluster_alloc = + be32_to_cpu(exthdr->min_cluster_alloc); + } + } + + /* convert L1 table to native endian for operation */ + for (i = 0; i < s->l1_size; i++) + be64_to_cpus(&s->l1_table[i]); + + err = 0; + +out: + if (err) { + free(buf); + free(s->l1_table); + s->l1_table = NULL; + } + return err; +} + +/* Open the disk file and initialize qcow state. 
*/ +int tdqcow_open (td_driver_t *driver, const char *name, td_flag_t flags) +{ + int fd, len, i, ret, size, o_flags; + td_disk_info_t *bs = &(driver->info); + struct tdqcow_state *s = (struct tdqcow_state *)driver->data; + QCowHeader header; + uint64_t final_cluster = 0; + + DPRINTF("QCOW: Opening %s\n", name); + + o_flags = O_DIRECT | O_LARGEFILE | + ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR); + fd = open(name, o_flags); + if (fd < 0) { + DPRINTF("Unable to open %s (%d)\n", name, -errno); + return -1; + } + + s->fd = fd; + s->name = strdup(name); + if (!s->name) + goto fail; + + if (tdqcow_read_header(fd, &header)) + goto fail; + + if (header.magic != QCOW_MAGIC) + goto fail; + + switch (header.version) { + case QCOW_VERSION: + break; + case 2: + //TODO: Port qcow2 to new blktap framework. + // close(fd); + // dd->drv = &tapdisk_qcow2; + // return dd->drv->td_open(dd, name, flags); + goto fail; + default: + goto fail; + } + + if (header.size <= 1 || header.cluster_bits < 9) + goto fail; + if (header.crypt_method > QCOW_CRYPT_AES) + goto fail; + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) + s->encrypted = 1; + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = header.l2_bits; + s->l2_size = 1 << s->l2_bits; + s->cluster_alloc = s->l2_size; + bs->size = header.size / 512; + s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + s->backing_file_offset = header.backing_file_offset; + s->backing_file_size = header.backing_file_size; + + /* allocate and load l1 table */ + if (tdqcow_load_l1_table(s, &header)) + goto fail; + + /* alloc L2 cache */ + size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t); + ret = posix_memalign((void **)&s->l2_cache, 4096, size); + if(ret != 0) goto fail; + + size = s->cluster_size; + ret = posix_memalign((void **)&s->cluster_cache, 4096, size); + if(ret != 0) goto fail; + + ret = 
posix_memalign((void **)&s->cluster_data, 4096, size); + if(ret != 0) goto fail; + s->cluster_cache_offset = -1; + + if (s->backing_file_offset != 0) + s->cluster_alloc = 1; /*Cannot use pre-alloc*/ + + bs->sector_size = 512; + bs->info = 0; + + for(i = 0; i < s->l1_size; i++) + if (s->l1_table[i] > final_cluster) + final_cluster = s->l1_table[i]; + + if (init_aio_state(driver)!=0) { + DPRINTF("Unable to initialise AIO state\n"); + free_aio_state(s); + goto fail; + } + + if (!final_cluster) + s->fd_end = s->l1_table_offset + + ((s->l1_size * sizeof(uint64_t) + 4095) & ~4095); + else { + s->fd_end = lseek64(fd, 0, SEEK_END); + if (s->fd_end == (off64_t)-1) + goto fail; + } + + return 0; + +fail: + DPRINTF("QCOW Open failed\n"); + + free_aio_state(s); + free(s->l1_table); + free(s->l2_cache); + free(s->cluster_cache); + free(s->cluster_data); + close(fd); + return -1; +} + +void tdqcow_queue_read(td_driver_t *driver, td_request_t treq) +{ + struct tdqcow_state *s = (struct tdqcow_state *)driver->data; + int ret = 0, index_in_cluster, n, i; + uint64_t cluster_offset, sector, nb_sectors; + struct qcow_prv* prv; + td_request_t clone = treq; + char* buf = treq.buf; + + sector = treq.sec; + nb_sectors = treq.secs; + + /*We store a local record of the request*/ + while (nb_sectors > 0) { + cluster_offset = + get_cluster_offset(s, sector << 9, 0, 0, 0, 0); + index_in_cluster = sector & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + + if (s->aio_free_count == 0) { + td_complete_request(treq, -EBUSY); + return; + } + + if(!cluster_offset) { + treq.buf = buf; + treq.sec = sector; + treq.secs = n; + td_forward_request(treq); + + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + if (decompress_cluster(s, cluster_offset) < 0) { + td_complete_request(treq, -EIO); + goto done; + } + memcpy(buf, s->cluster_cache + index_in_cluster * 512, + 512 * n); + + treq.buf = buf; + treq.sec = sector; + treq.secs = n; + 
td_complete_request(treq, 0); + } else { + clone.buf = buf; + clone.sec = (cluster_offset>>9)+index_in_cluster; + clone.secs = n; + async_read(driver, clone); + } + nb_sectors -= n; + sector += n; + buf += n * 512; + } +done: + return; +} + +void tdqcow_queue_write(td_driver_t *driver, td_request_t treq) +{ + struct tdqcow_state *s = (struct tdqcow_state *)driver->data; + int ret = 0, index_in_cluster, n, i; + uint64_t cluster_offset, sector, nb_sectors; + td_callback_t cb; + struct qcow_prv* prv; + char* buf = treq.buf; + td_request_t clone=treq; + + sector = treq.sec; + nb_sectors = treq.secs; + + /*We store a local record of the request*/ + while (nb_sectors > 0) { + index_in_cluster = sector & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + + if (s->aio_free_count == 0) { + td_complete_request(treq, -EBUSY); + return; + } + + cluster_offset = get_cluster_offset(s, sector << 9, 1, 0, + index_in_cluster, + index_in_cluster+n); + if (!cluster_offset) { + DPRINTF("Ooops, no write cluster offset!\n"); + td_complete_request(treq, -EIO); + return; + } + + if (s->crypt_method) { + encrypt_sectors(s, sector, s->cluster_data, + (unsigned char *)buf, n, 1, + &s->aes_encrypt_key); + + clone.buf = buf; + clone.sec = (cluster_offset>>9) + index_in_cluster; + clone.secs = n; + async_write(driver, clone); + } else { + clone.buf = buf; + clone.sec = (cluster_offset>>9) + index_in_cluster; + clone.secs = n; + + async_write(driver, clone); + } + + nb_sectors -= n; + sector += n; + buf += n * 512; + } + s->cluster_cache_offset = -1; /* disable compressed cache */ + + return; +} + +static int +tdqcow_update_checksum(struct tdqcow_state *s) +{ + int i, fd, err; + uint32_t offset, cksum, out; + + if (!s->extended) + return 0; + + fd = open(s->name, O_WRONLY | O_LARGEFILE); /* open without O_DIRECT */ + if (fd == -1) { + err = errno; + goto out; + } + + offset = sizeof(QCowHeader) + offsetof(QCowHeader_ext, cksum); + if 
(lseek(fd, offset, SEEK_SET) == (off_t)-1) { + err = errno; + goto out; + } + + /* convert to big endian for checksum */ + for (i = 0; i < s->l1_size; i++) + cpu_to_be64s(&s->l1_table[i]); + + cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t)); + + /* and back again... */ + for (i = 0; i < s->l1_size; i++) + be64_to_cpus(&s->l1_table[i]); + + DPRINTF("Writing cksum: %d", cksum); + + out = cpu_to_be32(cksum); + if (write(fd, &out, sizeof(out)) != sizeof(out)) { + err = errno; + goto out; + } + + err = 0; + +out: + if (err) + DPRINTF("failed to update checksum: %d\n", err); + if (fd != -1) + close(fd); + return err; +} + +int tdqcow_close(td_driver_t *driver) +{ + struct tdqcow_state *s = (struct tdqcow_state *)driver->data; + + /*Update the hdr cksum*/ + tdqcow_update_checksum(s); + + free_aio_state(s); + free(s->name); + free(s->l1_table); + free(s->l2_cache); + free(s->cluster_cache); + free(s->cluster_data); + close(s->fd); + return 0; +} + +int qcow_create(const char *filename, uint64_t total_size, + const char *backing_file, int sparse) +{ + int fd, header_size, backing_filename_len, l1_size, i; + int shift, length, adjust, flags = 0, ret = 0; + QCowHeader header; + QCowHeader_ext exthdr; + char backing_filename[PATH_MAX], *ptr; + uint64_t tmp, size, total_length; + struct stat st; + + DPRINTF("Qcow_create: size %"PRIu64"\n",total_size); + + fd = open(filename, + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, + 0644); + if (fd < 0) + return -1; + + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(QCOW_VERSION); + + /*Create extended header fields*/ + exthdr.xmagic = cpu_to_be32(XEN_MAGIC); + + header_size = sizeof(header) + sizeof(QCowHeader_ext); + backing_filename_len = 0; + size = (total_size >> SECTOR_SHIFT); + if (backing_file) { + if (strcmp(backing_file, "fat:")) { + const char *p; + /* XXX: this is a hack: we do not attempt to + *check for URL like syntax */ + p = 
strchr(backing_file, ':'); + if (p && (p - backing_file) >= 2) { + /* URL like but exclude "c:" like filenames */ + strncpy(backing_filename, backing_file, + sizeof(backing_filename)); + } else { + if (realpath(backing_file, backing_filename) == NULL || + stat(backing_filename, &st) != 0) { + return -1; + } + } + header.backing_file_offset = cpu_to_be64(header_size); + backing_filename_len = strlen(backing_filename); + header.backing_file_size = cpu_to_be32( + backing_filename_len); + header_size += backing_filename_len; + + /*Set to the backing file size*/ + if(get_filesize(backing_filename, &size, &st)) { + return -1; + } + DPRINTF("Backing file size detected: %"PRId64" sectors" + "(total %"PRId64" [%"PRId64" MB])\n", + size, + (uint64_t)(size << SECTOR_SHIFT), + (uint64_t)(size >> 11)); + } else { + backing_file = NULL; + DPRINTF("Setting file size: %"PRId64" (total %"PRId64")\n", + total_size, + (uint64_t) (total_size << SECTOR_SHIFT)); + } + header.mtime = cpu_to_be32(st.st_mtime); + header.cluster_bits = 9; /* 512 byte cluster to avoid copying + unmodifyed sectors */ + header.l2_bits = 12; /* 32 KB L2 tables */ + exthdr.min_cluster_alloc = cpu_to_be32(1); + } else { + DPRINTF("Setting file size: %"PRId64" sectors" + "(total %"PRId64" [%"PRId64" MB])\n", + size, + (uint64_t) (size << SECTOR_SHIFT), + (uint64_t) (size >> 11)); + header.cluster_bits = 12; /* 4 KB clusters */ + header.l2_bits = 9; /* 4 KB L2 tables */ + exthdr.min_cluster_alloc = cpu_to_be32(1 << 9); + } + /*Set the header size value*/ + header.size = cpu_to_be64(size * 512); + + header_size = (header_size + 7) & ~7; + if (header_size % 4096 > 0) { + header_size = ((header_size >> 12) + 1) << 12; + } + + shift = header.cluster_bits + header.l2_bits; + l1_size = ((size * 512) + (1LL << shift) - 1) >> shift; + + header.l1_table_offset = cpu_to_be64(header_size); + DPRINTF("L1 Table offset: %d, size %d\n", + header_size, + (int)(l1_size * sizeof(uint64_t))); + header.crypt_method = 
cpu_to_be32(QCOW_CRYPT_NONE); + + ptr = calloc(1, l1_size * sizeof(uint64_t)); + exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t))); + printf("Created cksum: %d\n",exthdr.cksum); + free(ptr); + + /*adjust file length to system page size boundary*/ + length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)), + getpagesize()); + if (qtruncate(fd, length, 0)!=0) { + DPRINTF("ERROR truncating file\n"); + return -1; + } + + if (sparse == 0) { + /*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/ + total_length = length + (l1_size * (1 << 9)) + (size * 512); + if (qtruncate(fd, total_length, 0)!=0) { + DPRINTF("ERROR truncating file\n"); + return -1; + } + printf("File truncated to length %"PRIu64"\n",total_length); + } else + flags = SPARSE_FILE; + + flags |= EXTHDR_L1_BIG_ENDIAN; + exthdr.flags = cpu_to_be32(flags); + + /* write all the data */ + lseek(fd, 0, SEEK_SET); + ret += write(fd, &header, sizeof(header)); + ret += write(fd, &exthdr, sizeof(exthdr)); + if (backing_file) + ret += write(fd, backing_filename, backing_filename_len); + + lseek(fd, header_size, SEEK_SET); + tmp = 0; + for (i = 0;i < l1_size; i++) { + ret += write(fd, &tmp, sizeof(tmp)); + } + + close(fd); + + return 0; +} + +static int qcow_make_empty(struct tdqcow_state *s) +{ + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + + memset(s->l1_table, 0, l1_length); + lseek(s->fd, s->l1_table_offset, SEEK_SET); + if (write(s->fd, s->l1_table, l1_length) < 0) + return -1; + if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) { + DPRINTF("ERROR truncating file\n"); + return -1; + } + + memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + + return 0; +} + +static int qcow_get_cluster_size(struct tdqcow_state *s) +{ + return s->cluster_size; +} + +/* XXX: put compressed sectors first, then all the 
cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num, + const uint8_t *buf) +{ + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + if (!out_buf) + return -1; + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + free(out_buf); + return -1; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + free(out_buf); + deflateEnd(&strm); + return -1; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + //tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors); + } else { + cluster_offset = get_cluster_offset(s, sector_num << 9, 2, + out_len, 0, 0); + cluster_offset &= s->cluster_offset_mask; + lseek(s->fd, cluster_offset, SEEK_SET); + if (write(s->fd, out_buf, out_len) != out_len) { + free(out_buf); + return -1; + } + } + + free(out_buf); + return 0; +} + +static int +tdqcow_get_image_type(const char *file, int *type) +{ + int fd; + size_t size; + QCowHeader header; + + fd = open(file, O_RDONLY); + if (fd == -1) + return -errno; + + size = read(fd, &header, sizeof(header)); + close(fd); + if (size != sizeof(header)) + return (errno ? 
-errno : -EIO); + + be32_to_cpus(&header.magic); + if (header.magic == QCOW_MAGIC) + *type = DISK_TYPE_QCOW; + else + *type = DISK_TYPE_AIO; + + return 0; +} + +int tdqcow_get_parent_id(td_driver_t *driver, td_disk_id_t *id) +{ + off_t off; + char *buf, *filename; + int len, secs, type, err = -EINVAL; + struct tdqcow_state *child = (struct tdqcow_state *)driver->data; + + if (!child->backing_file_offset) + return TD_NO_PARENT; + + /* read the backing file name */ + len = child->backing_file_size; + off = child->backing_file_offset - (child->backing_file_offset % 512); + secs = (len + (child->backing_file_offset - off) + 511) >> 9; + + if (posix_memalign((void **)&buf, 512, secs << 9)) + return -1; + + if (lseek(child->fd, off, SEEK_SET) == (off_t)-1) + goto out; + + if (read(child->fd, buf, secs << 9) != secs << 9) + goto out; + filename = buf + (child->backing_file_offset - off); + filename[len] = '\0'; + + if (tdqcow_get_image_type(filename, &type)) + goto out; + + id->name = strdup(filename); + id->drivertype = type; + err = 0; + out: + free(buf); + return err; +} + +int tdqcow_validate_parent(td_driver_t *driver, + td_driver_t *pdriver, td_flag_t flags) +{ + struct stat stats; + uint64_t psize, csize; + struct tdqcow_state *c = (struct tdqcow_state *)driver->data; + struct tdqcow_state *p = (struct tdqcow_state *)pdriver->data; + + if (stat(p->name, &stats)) + return -EINVAL; + if (get_filesize(p->name, &psize, &stats)) + return -EINVAL; + + if (stat(c->name, &stats)) + return -EINVAL; + if (get_filesize(c->name, &csize, &stats)) + return -EINVAL; + + if (csize != psize) + return -EINVAL; + + return 0; +} + +struct tap_disk tapdisk_qcow = { + .disk_type = "tapdisk_qcow", + .flags = 0, + .private_data_size = sizeof(struct tdqcow_state), + .td_open = tdqcow_open, + .td_close = tdqcow_close, + .td_queue_read = tdqcow_queue_read, + .td_queue_write = tdqcow_queue_write, + .td_get_parent_id = tdqcow_get_parent_id, + .td_validate_parent = tdqcow_validate_parent, + 
.td_debug = NULL, +}; diff --git a/tools/blktap2/drivers/block-ram.c b/tools/blktap2/drivers/block-ram.c new file mode 100644 index 0000000000..16b4ec9dc7 --- /dev/null +++ b/tools/blktap2/drivers/block-ram.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> + +#include "tapdisk.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" + +char *img; +long int disksector_size; +long int disksize; +long int diskinfo; +static int connections = 0; + +struct tdram_state { + int fd; +}; + +/*Get Image size, secsize*/ +static int get_image_info(int fd, td_disk_info_t *info) +{ + int ret; + long size; + unsigned long total_size; + struct statvfs statBuf; + struct stat stat; + + ret = fstat(fd, &stat); + if (ret != 0) { + DPRINTF("ERROR: fstat failed, Couldn't stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + info->size = 0; + if (ioctl(fd,BLKGETSIZE,&info->size)!=0) { + DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image"); + return -EINVAL; + } + + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(info->size << SECTOR_SHIFT), + (long long unsigned)info->size); + + /*Get the sector size*/ +#if defined(BLKSSZGET) + { + int arg; + info->sector_size = DEFAULT_SECTOR_SIZE; + ioctl(fd, BLKSSZGET, &info->sector_size); + + if (info->sector_size != DEFAULT_SECTOR_SIZE) + DPRINTF("Note: sector size is %ld (not %d)\n", + info->sector_size, DEFAULT_SECTOR_SIZE); + } +#else + info->sector_size = DEFAULT_SECTOR_SIZE; +#endif + + } else { + /*Local file? 
try fstat instead*/ + info->size = (stat.st_size >> SECTOR_SHIFT); + info->sector_size = DEFAULT_SECTOR_SIZE; + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(info->size << SECTOR_SHIFT), + (long long unsigned)info->size); + } + + if (info->size == 0) { + info->size =((uint64_t) MAX_RAMDISK_SIZE); + info->sector_size = DEFAULT_SECTOR_SIZE; + } + info->info = 0; + + /*Store variables locally*/ + disksector_size = info->sector_size; + disksize = info->size; + diskinfo = info->info; + DPRINTF("Image sector_size: \n\t[%lu]\n", + info->sector_size); + + return 0; +} + +/* Open the disk file and initialize ram state. */ +int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags) +{ + char *p; + uint64_t size; + int i, fd, ret = 0, count = 0, o_flags; + struct tdram_state *prv = (struct tdram_state *)driver->data; + + connections++; + + if (connections > 1) { + driver->info.sector_size = disksector_size; + driver->info.size = disksize; + driver->info.info = diskinfo; + DPRINTF("Image already open, returning parameters:\n"); + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(driver->info.size << SECTOR_SHIFT), + (long long unsigned)driver->info.size); + DPRINTF("Image sector_size: \n\t[%lu]\n", + driver->info.sector_size); + + prv->fd = -1; + goto done; + } + + /* Open the file */ + o_flags = O_DIRECT | O_LARGEFILE | + ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR); + fd = open(name, o_flags); + + if ((fd == -1) && (errno == EINVAL)) { + + /* Maybe O_DIRECT isn't supported. */ + o_flags &= ~O_DIRECT; + fd = open(name, o_flags); + if (fd != -1) DPRINTF("WARNING: Accessing image without" + "O_DIRECT! 
(%s)\n", name); + + } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); + + if (fd == -1) { + DPRINTF("Unable to open [%s]!\n",name); + ret = 0 - errno; + goto done; + } + + prv->fd = fd; + + ret = get_image_info(fd, &driver->info); + size = MAX_RAMDISK_SIZE; + + if (driver->info.size > size) { + DPRINTF("Disk exceeds limit, must be less than [%d]MB", + (MAX_RAMDISK_SIZE<<SECTOR_SHIFT)>>20); + return -ENOMEM; + } + + /*Read the image into memory*/ + if (posix_memalign((void **)&img, + DEFAULT_SECTOR_SIZE, + driver->info.size << SECTOR_SHIFT)) { + DPRINTF("Mem malloc failed\n"); + return -errno; + } + p = img; + DPRINTF("Reading %llu bytes.......", + (long long unsigned)driver->info.size << SECTOR_SHIFT); + + for (i = 0; i < driver->info.size; i++) { + ret = read(prv->fd, p, driver->info.sector_size); + if (ret != driver->info.sector_size) { + DPRINTF("ret = %d, errno = %d\n", ret, errno); + ret = 0 - errno; + break; + } else { + count += ret; + p = img + count; + } + } + DPRINTF("[%d]\n",count); + if (count != driver->info.size << SECTOR_SHIFT) { + ret = -1; + } else { + ret = 0; + } + +done: + return ret; +} + +void tdram_queue_read(td_driver_t *driver, td_request_t treq) +{ + struct tdram_state *prv = (struct tdram_state *)driver->data; + int size = treq.secs * driver->info.sector_size; + uint64_t offset = treq.sec * (uint64_t)driver->info.sector_size; + + memcpy(treq.buf, img + offset, size); + + td_complete_request(treq, 0); +} + +void tdram_queue_write(td_driver_t *driver, td_request_t treq) +{ + struct tdram_state *prv = (struct tdram_state *)driver->data; + int size = treq.secs * driver->info.sector_size; + uint64_t offset = treq.sec * (uint64_t)driver->info.sector_size; + + /* We assume that write access is controlled + * at a higher level for multiple disks */ + memcpy(img + offset, treq.buf, size); + + td_complete_request(treq, 0); +} + +int tdram_close(td_driver_t *driver) +{ + struct tdram_state *prv = (struct tdram_state *)driver->data; + + 
connections--; + + return 0; +} + +int tdram_get_parent_id(td_driver_t *driver, td_disk_id_t *id) +{ + return TD_NO_PARENT; +} + +int tdram_validate_parent(td_driver_t *driver, + td_driver_t *pdriver, td_flag_t flags) +{ + return -EINVAL; +} + +struct tap_disk tapdisk_ram = { + .disk_type = "tapdisk_ram", + .flags = 0, + .private_data_size = sizeof(struct tdram_state), + .td_open = tdram_open, + .td_close = tdram_close, + .td_queue_read = tdram_queue_read, + .td_queue_write = tdram_queue_write, + .td_get_parent_id = tdram_get_parent_id, + .td_validate_parent = tdram_validate_parent, + .td_debug = NULL, +}; diff --git a/tools/blktap2/drivers/block-vhd.c b/tools/blktap2/drivers/block-vhd.c new file mode 100644 index 0000000000..54431c12d8 --- /dev/null +++ b/tools/blktap2/drivers/block-vhd.c @@ -0,0 +1,2321 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * A note on write transactions: + * Writes that require updating the BAT or bitmaps cannot be signaled + * as complete until all updates have reached disk. Transactions are + * used to ensure proper ordering in these cases. The two types of + * transactions are as follows: + * - Bitmap updates only: data writes that require updates to the same + * bitmap are grouped in a transaction. Only after all data writes + * in a transaction complete does the bitmap write commence. Only + * after the bitmap write finishes are the data writes signalled as + * complete. + * - BAT and bitmap updates: data writes are grouped in transactions + * as above, but a special extra write is included in the transaction, + * which zeros out the newly allocated bitmap on disk. When the data + * writes and the zero-bitmap write complete, the BAT and bitmap writes + * are started in parallel. The transaction is completed only after both + * the BAT and bitmap writes successfully return. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <uuid/uuid.h> /* For whatever reason, Linux packages this in */ + /* e2fsprogs-devel. */ +#include <string.h> /* for memset. 
*/ +#include <libaio.h> +#include <sys/mman.h> + +#include "libvhd.h" +#include "tapdisk.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" + +unsigned int SPB; + +#define DEBUGGING 2 +#define ASSERTING 1 +#define MICROSOFT_COMPAT + +#define VHD_BATMAP_MAX_RETRIES 10 + +#define __TRACE(s) \ + do { \ + DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %" \ + PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: " \ + "%lu, BBLK: 0x%04x\n", \ + s->vhd.file, s->queued, s->completed, s->returned, \ + VHD_REQS_DATA - s->vreq_free_count, \ + s->bat.pbw_blk); \ + } while(0) + +#define __ASSERT(_p) \ + if (!(_p)) { \ + DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n", \ + __FILE__, __LINE__, #_p); \ + DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n", \ + __FILE__, __LINE__, #_p); \ + tlog_flush(); \ + *(int*)0 = 0; \ + } + +#if (DEBUGGING == 1) + #define DBG(level, _f, _a...) DPRINTF(_f, ##_a) + #define ERR(err, _f, _a...) DPRINTF("ERROR: %d: " _f, err, ##_a) + #define TRACE(s) ((void)0) +#elif (DEBUGGING == 2) + #define DBG(level, _f, _a...) tlog_write(level, _f, ##_a) + #define ERR(err, _f, _a...) tlog_error(err, _f, ##_a) + #define TRACE(s) __TRACE(s) +#else + #define DBG(level, _f, _a...) ((void)0) + #define ERR(err, _f, _a...) 
((void)0) + #define TRACE(s) ((void)0) +#endif + +#if (ASSERTING == 1) + #define ASSERT(_p) __ASSERT(_p) +#else + #define ASSERT(_p) ((void)0) +#endif + +/******VHD DEFINES******/ +#define VHD_CACHE_SIZE 32 + +#define VHD_REQS_DATA TAPDISK_DATA_REQUESTS +#define VHD_REQS_META (VHD_CACHE_SIZE + 2) +#define VHD_REQS_TOTAL (VHD_REQS_DATA + VHD_REQS_META) + +#define VHD_OP_BAT_WRITE 0 +#define VHD_OP_DATA_READ 1 +#define VHD_OP_DATA_WRITE 2 +#define VHD_OP_BITMAP_READ 3 +#define VHD_OP_BITMAP_WRITE 4 +#define VHD_OP_ZERO_BM_WRITE 5 + +#define VHD_BM_BAT_LOCKED 0 +#define VHD_BM_BAT_CLEAR 1 +#define VHD_BM_BIT_CLEAR 2 +#define VHD_BM_BIT_SET 3 +#define VHD_BM_NOT_CACHED 4 +#define VHD_BM_READ_PENDING 5 + +#define VHD_FLAG_OPEN_RDONLY 1 +#define VHD_FLAG_OPEN_NO_CACHE 2 +#define VHD_FLAG_OPEN_QUIET 4 +#define VHD_FLAG_OPEN_STRICT 8 +#define VHD_FLAG_OPEN_QUERY 16 +#define VHD_FLAG_OPEN_PREALLOCATE 32 + +#define VHD_FLAG_BAT_LOCKED 1 +#define VHD_FLAG_BAT_WRITE_STARTED 2 + +#define VHD_FLAG_BM_UPDATE_BAT 1 +#define VHD_FLAG_BM_WRITE_PENDING 2 +#define VHD_FLAG_BM_READ_PENDING 4 +#define VHD_FLAG_BM_LOCKED 8 + +#define VHD_FLAG_REQ_UPDATE_BAT 1 +#define VHD_FLAG_REQ_UPDATE_BITMAP 2 +#define VHD_FLAG_REQ_QUEUED 4 +#define VHD_FLAG_REQ_FINISHED 8 + +#define VHD_FLAG_TX_LIVE 1 +#define VHD_FLAG_TX_UPDATE_BAT 2 + +typedef uint8_t vhd_flag_t; + +struct vhd_state; +struct vhd_request; + +struct vhd_req_list { + struct vhd_request *head; + struct vhd_request *tail; +}; + +struct vhd_transaction { + int error; + int closed; + int started; + int finished; + vhd_flag_t status; + struct vhd_req_list requests; +}; + +struct vhd_request { + int error; + uint8_t op; + vhd_flag_t flags; + td_request_t treq; + struct tiocb tiocb; + struct vhd_state *state; + struct vhd_request *next; + struct vhd_transaction *tx; +}; + +struct vhd_bat_state { + vhd_bat_t bat; + vhd_batmap_t batmap; + vhd_flag_t status; + uint32_t pbw_blk; /* blk num of pending write */ + uint64_t pbw_offset; /* file 
offset of same */ + struct vhd_request req; /* for writing bat table */ + struct vhd_request zero_req; /* for initializing bitmaps */ + char *bat_buf; +}; + +struct vhd_bitmap { + u32 blk; + u64 seqno; /* lru sequence number */ + vhd_flag_t status; + + char *map; /* map should only be modified + * in finish_bitmap_write */ + char *shadow; /* in-memory bitmap changes are + * made to shadow and copied to + * map only after having been + * flushed to disk */ + struct vhd_transaction tx; /* transaction data structure + * encapsulating data, bitmap, + * and bat writes */ + struct vhd_req_list queue; /* data writes waiting for next + * transaction */ + struct vhd_req_list waiting; /* pending requests that cannot + * be serviced until this bitmap + * is read from disk */ + struct vhd_request req; +}; + +struct vhd_state { + vhd_flag_t flags; + + /* VHD stuff */ + vhd_context_t vhd; + u32 spp; /* sectors per page */ + u32 spb; /* sectors per block */ + u64 next_db; /* pointer to the next + * (unallocated) datablock */ + + struct vhd_bat_state bat; + + u64 bm_lru; /* lru sequence number */ + u32 bm_secs; /* size of bitmap, in sectors */ + struct vhd_bitmap *bitmap[VHD_CACHE_SIZE]; + + int bm_free_count; + struct vhd_bitmap *bitmap_free[VHD_CACHE_SIZE]; + struct vhd_bitmap bitmap_list[VHD_CACHE_SIZE]; + + int vreq_free_count; + struct vhd_request *vreq_free[VHD_REQS_DATA]; + struct vhd_request vreq_list[VHD_REQS_DATA]; + + td_driver_t *driver; + + uint64_t queued; + uint64_t completed; + uint64_t returned; + uint64_t reads; + uint64_t read_size; + uint64_t writes; + uint64_t write_size; +}; + +#define test_vhd_flag(word, flag) ((word) & (flag)) +#define set_vhd_flag(word, flag) ((word) |= (flag)) +#define clear_vhd_flag(word, flag) ((word) &= ~(flag)) + +#define bat_entry(s, blk) ((s)->bat.bat.bat[(blk)]) + +static void vhd_complete(void *, struct tiocb *, int); +static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *); + +static struct vhd_state 
*_vhd_master; +static unsigned long _vhd_zsize; +static char *_vhd_zeros; + +/* + * Map the shared, read-only zero buffer handed out by vhd_zeros(). + * The first state to get here sizes the mapping (two pages, plus + * VHD_BLOCK_SIZE when VHD_FLAG_OPEN_PREALLOCATE is set) and is recorded + * in _vhd_master; later callers find the buffer mapped and return 0. + * Returns 0 on success or -errno from mmap() on failure. + */ +static int +vhd_initialize(struct vhd_state *s) +{ + if (_vhd_zeros) + return 0; + + _vhd_zsize = 2 * getpagesize(); + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) + _vhd_zsize += VHD_BLOCK_SIZE; + + _vhd_zeros = mmap(0, _vhd_zsize, PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (_vhd_zeros == MAP_FAILED) { + /* save errno first: logging may overwrite it, and a + * clobbered value of 0 would be returned as success */ + int err = errno; + + EPRINTF("vhd_initialize failed: %d\n", -err); + _vhd_zeros = NULL; + _vhd_zsize = 0; + return -err; + } + + _vhd_master = s; + return 0; +} + +static void +vhd_free(struct vhd_state *s) +{ + if (_vhd_master != s || !_vhd_zeros) + return; + + munmap(_vhd_zeros, _vhd_zsize); + _vhd_zsize = 0; + _vhd_zeros = NULL; + _vhd_master = NULL; +} + +static char * +_get_vhd_zeros(const char *func, unsigned long size) +{ + if (!_vhd_zeros || _vhd_zsize < size) { + EPRINTF("invalid zero request from %s: %lu, %lu, %p\n", + func, size, _vhd_zsize, _vhd_zeros); + ASSERT(0); + } + + return _vhd_zeros; +} + +#define vhd_zeros(size) _get_vhd_zeros(__func__, size) + +static inline void +set_batmap(struct vhd_state *s, uint32_t blk) +{ + if (s->bat.batmap.map) { + vhd_batmap_set(&s->vhd, &s->bat.batmap, blk); + DBG(TLOG_DBG, "block 0x%x completely full\n", blk); + } +} + +static inline int +test_batmap(struct vhd_state *s, uint32_t blk) +{ + if (!s->bat.batmap.map) + return 0; + return vhd_batmap_test(&s->vhd, &s->bat.batmap, blk); +} + +static int +vhd_kill_footer(struct vhd_state *s) +{ + int err; + off64_t end; + char *zeros; + + if (s->vhd.footer.type == HD_TYPE_FIXED) + return 0; + + err = posix_memalign((void **)&zeros, 512, 512); + if (err) + return -err; + + err = 1; + memset(zeros, 0xc7c7c7c7, 512); + + if ((end = lseek64(s->vhd.fd, 0, SEEK_END)) == -1) + goto fail; + + if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == -1) + goto fail; + + if (write(s->vhd.fd, zeros, 512) != 512) + goto fail; + + err = 0; + + fail: + free(zeros); + if (err) + return (errno ? 
-errno : -EIO); + return 0; +} + +static inline int +find_next_free_block(struct vhd_state *s) +{ + int err; + off64_t eom; + uint32_t i, entry; + + err = vhd_end_of_headers(&s->vhd, &eom); + if (err) + return err; + + s->next_db = secs_round_up(eom); + + for (i = 0; i < s->bat.bat.entries; i++) { + entry = bat_entry(s, i); + if (entry != DD_BLK_UNUSED && entry >= s->next_db) + s->next_db = entry + s->spb + s->bm_secs; + } + + return 0; +} + +static void +vhd_free_bat(struct vhd_state *s) +{ + free(s->bat.bat.bat); + free(s->bat.batmap.map); + free(s->bat.bat_buf); + memset(&s->bat, 0, sizeof(struct vhd_bat)); +} + +static int +vhd_initialize_bat(struct vhd_state *s) +{ + int err, psize, batmap_required, i; + + memset(&s->bat, 0, sizeof(struct vhd_bat)); + + psize = getpagesize(); + + err = vhd_read_bat(&s->vhd, &s->bat.bat); + if (err) { + EPRINTF("%s: reading bat: %d\n", s->vhd.file, err); + return err; + } + + batmap_required = 1; + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) { + batmap_required = 0; + } else { + err = find_next_free_block(s); + if (err) + goto fail; + } + + if (vhd_has_batmap(&s->vhd)) { + for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) { + err = vhd_read_batmap(&s->vhd, &s->bat.batmap); + if (err) { + EPRINTF("%s: reading batmap: %d\n", + s->vhd.file, err); + if (batmap_required) + goto fail; + } else { + break; + } + } + if (err) + EPRINTF("%s: ignoring non-critical batmap error\n", + s->vhd.file); + } + + err = posix_memalign((void **)&s->bat.bat_buf, + VHD_SECTOR_SIZE, VHD_SECTOR_SIZE); + if (err) { + s->bat.bat_buf = NULL; + goto fail; + } + + return 0; + +fail: + vhd_free_bat(s); + return err; +} + +static void +vhd_free_bitmap_cache(struct vhd_state *s) +{ + int i; + struct vhd_bitmap *bm; + + for (i = 0; i < VHD_CACHE_SIZE; i++) { + bm = s->bitmap_list + i; + free(bm->map); + free(bm->shadow); + s->bitmap_free[i] = NULL; + } + + memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE); +} + +static int 
+vhd_initialize_bitmap_cache(struct vhd_state *s) +{ + int i, err, map_size; + struct vhd_bitmap *bm; + + memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE); + + s->bm_lru = 0; + map_size = vhd_sectors_to_bytes(s->bm_secs); + s->bm_free_count = VHD_CACHE_SIZE; + + for (i = 0; i < VHD_CACHE_SIZE; i++) { + bm = s->bitmap_list + i; + + err = posix_memalign((void **)&bm->map, 512, map_size); + if (err) { + bm->map = NULL; + goto fail; + } + + err = posix_memalign((void **)&bm->shadow, 512, map_size); + if (err) { + bm->shadow = NULL; + goto fail; + } + + memset(bm->map, 0, map_size); + memset(bm->shadow, 0, map_size); + s->bitmap_free[i] = bm; + } + + return 0; + +fail: + vhd_free_bitmap_cache(s); + return err; +} + +static int +vhd_initialize_dynamic_disk(struct vhd_state *s) +{ + int err; + + err = vhd_get_header(&s->vhd); + if (err) { + if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) + EPRINTF("Error reading VHD DD header.\n"); + return err; + } + + if (s->vhd.header.hdr_ver != 0x00010000) { + EPRINTF("unsupported header version! 
(0x%x)\n", + s->vhd.header.hdr_ver); + return -EINVAL; + } + + s->spp = getpagesize() >> VHD_SECTOR_SHIFT; + s->spb = s->vhd.header.block_size >> VHD_SECTOR_SHIFT; + s->bm_secs = secs_round_up_no_zero(s->spb >> 3); + + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE)) + return 0; + + err = vhd_initialize_bat(s); + if (err) + return err; + + err = vhd_initialize_bitmap_cache(s); + if (err) { + vhd_free_bat(s); + return err; + } + + return 0; +} + +static int +vhd_check_version(struct vhd_state *s) +{ + if (strncmp(s->vhd.footer.crtr_app, "tap", 3)) + return 0; + + if (s->vhd.footer.crtr_ver > VHD_CURRENT_VERSION) { + if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) + EPRINTF("WARNING: %s vhd creator version 0x%08x, " + "but only versions up to 0x%08x are " + "supported for IO\n", s->vhd.file, + s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION); + + return -EINVAL; + } + + return 0; +} + +static void +vhd_log_open(struct vhd_state *s) +{ + char buf[5]; + uint32_t i, allocated, full; + + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) + return; + + snprintf(buf, sizeof(buf), "%s", s->vhd.footer.crtr_app); + if (!vhd_type_dynamic(&s->vhd)) { + DPRINTF("%s version: %s 0x%08x\n", + s->vhd.file, buf, s->vhd.footer.crtr_ver); + return; + } + + allocated = 0; + full = 0; + + for (i = 0; i < s->bat.bat.entries; i++) { + if (bat_entry(s, i) != DD_BLK_UNUSED) + allocated++; + if (test_batmap(s, i)) + full++; + } + + DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n", + s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries, + allocated, full, s->next_db); +} + +static int +__vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags) +{ + int i, o_flags, err; + struct vhd_state *s; + + DBG(TLOG_INFO, "vhd_open: %s\n", name); + if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT)) + libvhd_set_log_level(1); + + s = (struct vhd_state *)driver->data; + memset(s, 0, sizeof(struct vhd_state)); + + s->flags = flags; + s->driver = driver; + + err = 
vhd_initialize(s); + if (err) + return err; + + o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ? + VHD_OPEN_RDONLY : VHD_OPEN_RDWR); + + err = vhd_open(&s->vhd, name, o_flags); + if (err) { + libvhd_set_log_level(1); + err = vhd_open(&s->vhd, name, o_flags); + if (err) { + EPRINTF("Unable to open [%s] (%d)!\n", name, err); + return err; + } + } + + err = vhd_check_version(s); + if (err) + goto fail; + + s->spb = s->spp = 1; + + if (vhd_type_dynamic(&s->vhd)) { + err = vhd_initialize_dynamic_disk(s); + if (err) + goto fail; + } + + vhd_log_open(s); + + SPB = s->spb; + + s->vreq_free_count = VHD_REQS_DATA; + for (i = 0; i < VHD_REQS_DATA; i++) + s->vreq_free[i] = s->vreq_list + i; + + driver->info.size = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT; + driver->info.sector_size = VHD_SECTOR_SIZE; + driver->info.info = 0; + + DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%lu, inf:%u)\n", + driver->info.size, driver->info.sector_size, driver->info.info); + + if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) && + !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) { + err = vhd_kill_footer(s); + if (err) { + DPRINTF("ERROR killing footer: %d\n", err); + goto fail; + } + s->writes++; + } + + return 0; + + fail: + vhd_free_bat(s); + vhd_free_bitmap_cache(s); + vhd_close(&s->vhd); + vhd_free(s); + return err; +} + +static int +_vhd_open(td_driver_t *driver, const char *name, td_flag_t flags) +{ + vhd_flag_t vhd_flags = 0; + + if (flags & TD_OPEN_RDONLY) + vhd_flags |= VHD_FLAG_OPEN_RDONLY; + if (flags & TD_OPEN_QUIET) + vhd_flags |= VHD_FLAG_OPEN_QUIET; + if (flags & TD_OPEN_STRICT) + vhd_flags |= VHD_FLAG_OPEN_STRICT; + if (flags & TD_OPEN_QUERY) + vhd_flags |= (VHD_FLAG_OPEN_QUERY | + VHD_FLAG_OPEN_QUIET | + VHD_FLAG_OPEN_RDONLY | + VHD_FLAG_OPEN_NO_CACHE); + + /* pre-allocate for all but NFS and LVM storage */ + if (driver->storage != TAPDISK_STORAGE_TYPE_NFS && + driver->storage != TAPDISK_STORAGE_TYPE_LVM) + vhd_flags |= VHD_FLAG_OPEN_PREALLOCATE; + + return 
__vhd_open(driver, name, vhd_flags); +} + +static void +vhd_log_close(struct vhd_state *s) +{ + uint32_t i, allocated, full; + + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) + return; + + allocated = 0; + full = 0; + + for (i = 0; i < s->bat.bat.entries; i++) { + if (bat_entry(s, i) != DD_BLK_UNUSED) + allocated++; + if (test_batmap(s, i)) + full++; + } + + DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n", + s->vhd.file, s->bat.bat.entries, allocated, full, s->next_db); +} + +static int +_vhd_close(td_driver_t *driver) +{ + int err; + struct vhd_state *s; + struct vhd_bitmap *bm; + + DBG(TLOG_WARN, "vhd_close\n"); + s = (struct vhd_state *)driver->data; + + /* don't write footer if tapdisk is read-only */ + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) + goto free; + + /* + * write footer if: + * - we killed it on open (opened with strict) + * - we've written data since opening + */ + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes) { + memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t)); + err = vhd_write_footer(&s->vhd, &s->vhd.footer); + memset(&s->vhd.bat, 0, sizeof(vhd_bat_t)); + + if (err) + EPRINTF("writing %s footer: %d\n", s->vhd.file, err); + + if (!vhd_has_batmap(&s->vhd)) + goto free; + + err = vhd_write_batmap(&s->vhd, &s->bat.batmap); + if (err) + EPRINTF("writing %s batmap: %d\n", s->vhd.file, err); + } + + free: + vhd_log_close(s); + vhd_free_bat(s); + vhd_free_bitmap_cache(s); + vhd_close(&s->vhd); + vhd_free(s); + + memset(s, 0, sizeof(struct vhd_state)); + + return 0; +} + +int +vhd_validate_parent(td_driver_t *child_driver, + td_driver_t *parent_driver, td_flag_t flags) +{ + struct stat stats; + struct vhd_state *child = (struct vhd_state *)child_driver->data; + struct vhd_state *parent; + + if (parent_driver->type != DISK_TYPE_VHD) { + if (child_driver->type != DISK_TYPE_VHD) + return -EINVAL; + if (child->vhd.footer.type != HD_TYPE_DIFF) + return -EINVAL; + if (!vhd_parent_raw(&child->vhd)) + return -EINVAL; + return 
0; + } + + parent = (struct vhd_state *)parent_driver->data; + + /* + * This check removed because of cases like: + * - parent VHD marked as 'hidden' + * - parent VHD modified during coalesce + */ + /* + if (stat(parent->vhd.file, &stats)) { + DPRINTF("ERROR stating parent file %s\n", parent->vhd.file); + return -errno; + } + + if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) { + DPRINTF("ERROR: parent file has been modified since " + "snapshot. Child image no longer valid.\n"); + return -EINVAL; + } + */ + + if (uuid_compare(child->vhd.header.prt_uuid, parent->vhd.footer.uuid)) { + DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since " + "snapshot. Child image no longer valid.\n", + __func__, child->vhd.file, parent->vhd.file); + return -EINVAL; + } + + /* TODO: compare sizes */ + + return 0; +} + +int +vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id) +{ + int err; + char *parent; + struct vhd_state *s; + + DBG(TLOG_DBG, "\n"); + memset(id, 0, sizeof(td_disk_id_t)); + + s = (struct vhd_state *)driver->data; + + if (s->vhd.footer.type != HD_TYPE_DIFF) + return TD_NO_PARENT; + + err = vhd_parent_locator_get(&s->vhd, &parent); + if (err) + return err; + + id->name = parent; + id->drivertype = DISK_TYPE_VHD; + if (vhd_parent_raw(&s->vhd)) { + DPRINTF("VHD: parent is raw\n"); + id->drivertype = DISK_TYPE_AIO; + } + return 0; +} + +static inline void +clear_req_list(struct vhd_req_list *list) +{ + list->head = list->tail = NULL; +} + +static inline void +add_to_tail(struct vhd_req_list *list, struct vhd_request *e) +{ + if (!list->head) + list->head = list->tail = e; + else + list->tail = list->tail->next = e; +} + +static inline int +remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e) +{ + struct vhd_request *i = list->head; + + if (list->head == e) { + if (list->tail == e) + clear_req_list(list); + else + list->head = list->head->next; + return 0; + } + + while (i->next) { + if (i->next == e) { + if (list->tail == e) { + i->next = 
NULL; + list->tail = i; + } else + i->next = i->next->next; + return 0; + } + i = i->next; + } + + return -EINVAL; +} + +static inline void +init_vhd_request(struct vhd_state *s, struct vhd_request *req) +{ + memset(req, 0, sizeof(struct vhd_request)); + req->state = s; +} + +static inline void +init_tx(struct vhd_transaction *tx) +{ + memset(tx, 0, sizeof(struct vhd_transaction)); +} + +static inline void +add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r) +{ + ASSERT(!tx->closed); + + r->tx = tx; + tx->started++; + add_to_tail(&tx->requests, r); + set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE); + + DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, " + "started: %d, finished: %d, status: %u\n", + r->treq.sec / SPB, r->treq.sec, tx, + tx->started, tx->finished, tx->status); +} + +static inline int +transaction_completed(struct vhd_transaction *tx) +{ + return (tx->started == tx->finished); +} + +static inline void +init_bat(struct vhd_state *s) +{ + s->bat.req.tx = NULL; + s->bat.req.next = NULL; + s->bat.req.error = 0; + s->bat.pbw_blk = 0; + s->bat.pbw_offset = 0; + s->bat.status = 0; +} + +static inline void +lock_bat(struct vhd_state *s) +{ + set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); +} + +static inline void +unlock_bat(struct vhd_state *s) +{ + clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); +} + +static inline int +bat_locked(struct vhd_state *s) +{ + return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); +} + +static inline void +init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) +{ + bm->blk = 0; + bm->seqno = 0; + bm->status = 0; + init_tx(&bm->tx); + clear_req_list(&bm->queue); + clear_req_list(&bm->waiting); + memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs)); + memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs)); + init_vhd_request(s, &bm->req); +} + +static inline struct vhd_bitmap * +get_bitmap(struct vhd_state *s, uint32_t block) +{ + int i; + struct vhd_bitmap *bm; + + for (i = 0; i < 
VHD_CACHE_SIZE; i++) { + bm = s->bitmap[i]; + if (bm && bm->blk == block) + return bm; + } + + return NULL; +} + +static inline void +lock_bitmap(struct vhd_bitmap *bm) +{ + set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); +} + +static inline void +unlock_bitmap(struct vhd_bitmap *bm) +{ + clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); +} + +static inline int +bitmap_locked(struct vhd_bitmap *bm) +{ + return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); +} + +static inline int +bitmap_valid(struct vhd_bitmap *bm) +{ + return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING); +} + +static inline int +bitmap_in_use(struct vhd_bitmap *bm) +{ + return (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING) || + test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING) || + test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT) || + bm->waiting.head || bm->tx.requests.head || bm->queue.head); +} + +static inline int +bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm) +{ + int i, n; + + n = s->spb >> 3; + for (i = 0; i < n; i++) + if (bm->map[i] != (char)0xFF) + return 0; + + DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk); + return 1; +} + +static struct vhd_bitmap * +remove_lru_bitmap(struct vhd_state *s) +{ + int i, idx = 0; + u64 seq = s->bm_lru; + struct vhd_bitmap *bm, *lru = NULL; + + for (i = 0; i < VHD_CACHE_SIZE; i++) { + bm = s->bitmap[i]; + if (bm && bm->seqno < seq && !bitmap_locked(bm)) { + idx = i; + lru = bm; + seq = lru->seqno; + } + } + + if (lru) { + s->bitmap[idx] = NULL; + ASSERT(!bitmap_in_use(lru)); + } + + return lru; +} + +static int +alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk) +{ + struct vhd_bitmap *bm; + + *bitmap = NULL; + + if (s->bm_free_count > 0) { + bm = s->bitmap_free[--s->bm_free_count]; + } else { + bm = remove_lru_bitmap(s); + if (!bm) + return -EBUSY; + } + + init_vhd_bitmap(s, bm); + bm->blk = blk; + *bitmap = bm; + + return 0; +} + +static inline uint64_t +__bitmap_lru_seqno(struct vhd_state *s) +{ 
+ int i; + struct vhd_bitmap *bm; + + if (s->bm_lru == 0xffffffff) { + s->bm_lru = 0; + for (i = 0; i < VHD_CACHE_SIZE; i++) { + bm = s->bitmap[i]; + if (bm) { + bm->seqno >>= 1; + if (bm->seqno > s->bm_lru) + s->bm_lru = bm->seqno; + } + } + } + + return ++s->bm_lru; +} + +static inline void +touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) +{ + bm->seqno = __bitmap_lru_seqno(s); +} + +static inline void +install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) +{ + int i; + for (i = 0; i < VHD_CACHE_SIZE; i++) { + if (!s->bitmap[i]) { + touch_bitmap(s, bm); + s->bitmap[i] = bm; + return; + } + } + + ASSERT(0); +} + +static inline void +free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) +{ + int i; + + for (i = 0; i < VHD_CACHE_SIZE; i++) + if (s->bitmap[i] == bm) + break; + + ASSERT(!bitmap_locked(bm)); + ASSERT(!bitmap_in_use(bm)); + ASSERT(i < VHD_CACHE_SIZE); + + s->bitmap[i] = NULL; + s->bitmap_free[s->bm_free_count++] = bm; +} + +static int +read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op) +{ + u32 blk, sec; + struct vhd_bitmap *bm; + + /* in fixed disks, every block is present */ + if (s->vhd.footer.type == HD_TYPE_FIXED) + return VHD_BM_BIT_SET; + + blk = sector / s->spb; + sec = sector % s->spb; + + if (blk > s->vhd.header.max_bat_size) { + DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n", + sector, op); + return -EINVAL; + } + + if (bat_entry(s, blk) == DD_BLK_UNUSED) { + if (op == VHD_OP_DATA_WRITE && + s->bat.pbw_blk != blk && bat_locked(s)) + return VHD_BM_BAT_LOCKED; + + return VHD_BM_BAT_CLEAR; + } + + if (test_batmap(s, blk)) { + DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk); + return VHD_BM_BIT_SET; + } + + bm = get_bitmap(s, blk); + if (!bm) + return VHD_BM_NOT_CACHED; + + /* bump lru count */ + touch_bitmap(s, bm); + + if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)) + return VHD_BM_READ_PENDING; + + return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ? 
+ VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR); +} + +static int +read_bitmap_cache_span(struct vhd_state *s, + uint64_t sector, int nr_secs, int value) +{ + int ret; + u32 blk, sec; + struct vhd_bitmap *bm; + + /* in fixed disks, every block is present */ + if (s->vhd.footer.type == HD_TYPE_FIXED) + return nr_secs; + + sec = sector % s->spb; + blk = sector / s->spb; + + if (test_batmap(s, blk)) + return MIN(nr_secs, s->spb - sec); + + bm = get_bitmap(s, blk); + + ASSERT(bm && bitmap_valid(bm)); + + for (ret = 0; sec < s->spb && ret < nr_secs; sec++, ret++) + if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value) + break; + + return ret; +} + +static inline struct vhd_request * +alloc_vhd_request(struct vhd_state *s) +{ + struct vhd_request *req = NULL; + + if (s->vreq_free_count > 0) { + req = s->vreq_free[--s->vreq_free_count]; + ASSERT(req->treq.secs == 0); + init_vhd_request(s, req); + return req; + } + + return NULL; +} + +static inline void +free_vhd_request(struct vhd_state *s, struct vhd_request *req) +{ + memset(req, 0, sizeof(struct vhd_request)); + s->vreq_free[s->vreq_free_count++] = req; +} + +static inline void +aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset) +{ + struct tiocb *tiocb = &req->tiocb; + + td_prep_read(tiocb, s->vhd.fd, req->treq.buf, + vhd_sectors_to_bytes(req->treq.secs), + offset, vhd_complete, req); + td_queue_tiocb(s->driver, tiocb); + + s->queued++; + s->reads++; + s->read_size += req->treq.secs; + TRACE(s); +} + +static inline void +aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset) +{ + struct tiocb *tiocb = &req->tiocb; + + td_prep_write(tiocb, s->vhd.fd, req->treq.buf, + vhd_sectors_to_bytes(req->treq.secs), + offset, vhd_complete, req); + td_queue_tiocb(s->driver, tiocb); + + s->queued++; + s->writes++; + s->write_size += req->treq.secs; + TRACE(s); +} + +static inline uint64_t +reserve_new_block(struct vhd_state *s, uint32_t blk) +{ + int gap = 0; + + ASSERT(!test_vhd_flag(s->bat.status, 
VHD_FLAG_BAT_WRITE_STARTED)); + + /* data region of segment should begin on page boundary */ + if ((s->next_db + s->bm_secs) % s->spp) + gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp)); + + s->bat.pbw_blk = blk; + s->bat.pbw_offset = s->next_db + gap; + + return s->next_db; +} + +static int +schedule_bat_write(struct vhd_state *s) +{ + int i; + u32 blk; + char *buf; + u64 offset; + struct vhd_request *req; + + ASSERT(bat_locked(s)); + + req = &s->bat.req; + buf = s->bat.bat_buf; + blk = s->bat.pbw_blk; + + init_vhd_request(s, req); + memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512); + + ((u32 *)buf)[blk % 128] = s->bat.pbw_offset; + + for (i = 0; i < 128; i++) + BE32_OUT(&((u32 *)buf)[i]); + + offset = s->vhd.header.table_offset + (blk - (blk % 128)) * 4; + req->treq.secs = 1; + req->treq.buf = buf; + req->op = VHD_OP_BAT_WRITE; + req->next = NULL; + + aio_write(s, req, offset); + set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED); + + DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", " + "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset); + + return 0; +} + +static void +schedule_zero_bm_write(struct vhd_state *s, + struct vhd_bitmap *bm, uint64_t lb_end) +{ + uint64_t offset; + struct vhd_request *req = &s->bat.zero_req; + + init_vhd_request(s, req); + + offset = vhd_sectors_to_bytes(lb_end); + req->op = VHD_OP_ZERO_BM_WRITE; + req->treq.sec = s->bat.pbw_blk * s->spb; + req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs; + req->treq.buf = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs)); + req->next = NULL; + + DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n", + s->bat.pbw_blk, offset); + + lock_bitmap(bm); + add_to_transaction(&bm->tx, req); + aio_write(s, req, offset); +} + +static int +update_bat(struct vhd_state *s, uint32_t blk) +{ + int err; + uint64_t lb_end; + struct vhd_bitmap *bm; + + ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED); + + if (bat_locked(s)) { + ASSERT(s->bat.pbw_blk == blk); + return 0; + } + 
+ /* empty bitmap could already be in + * cache if earlier bat update failed */ + bm = get_bitmap(s, blk); + if (!bm) { + /* install empty bitmap in cache */ + err = alloc_vhd_bitmap(s, &bm, blk); + if (err) + return err; + + install_bitmap(s, bm); + } + + lock_bat(s); + lb_end = reserve_new_block(s, blk); + schedule_zero_bm_write(s, bm, lb_end); + set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT); + + return 0; +} + +static int +allocate_block(struct vhd_state *s, uint32_t blk) +{ + char *zeros; + int err, gap; + uint64_t offset, size; + struct vhd_bitmap *bm; + + ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED); + + if (bat_locked(s)) { + ASSERT(s->bat.pbw_blk == blk); + if (s->bat.req.error) + return -EBUSY; + return 0; + } + + gap = 0; + s->bat.pbw_blk = blk; + offset = vhd_sectors_to_bytes(s->next_db); + + /* data region of segment should begin on page boundary */ + if ((s->next_db + s->bm_secs) % s->spp) { + gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp)); + s->next_db += gap; + } + + s->bat.pbw_offset = s->next_db; + + DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n", + blk, s->bat.pbw_offset); + + if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) { + ERR(errno, "lseek failed\n"); + return -errno; + } + + size = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap); + err = write(s->vhd.fd, vhd_zeros(size), size); + if (err != size) { + err = (err == -1 ? 
-errno : -EIO); + ERR(err, "write failed"); + return err; + } + + /* empty bitmap could already be in + * cache if earlier bat update failed */ + bm = get_bitmap(s, blk); + if (!bm) { + /* install empty bitmap in cache */ + err = alloc_vhd_bitmap(s, &bm, blk); + if (err) + return err; + + install_bitmap(s, bm); + } + + lock_bat(s); + lock_bitmap(bm); + schedule_bat_write(s); + add_to_transaction(&bm->tx, &s->bat.req); + + return 0; +} + +static int +schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags) +{ + u64 offset; + u32 blk = 0, sec = 0; + struct vhd_bitmap *bm; + struct vhd_request *req; + + if (s->vhd.footer.type == HD_TYPE_FIXED) { + offset = vhd_sectors_to_bytes(treq.sec); + goto make_request; + } + + blk = treq.sec / s->spb; + sec = treq.sec % s->spb; + bm = get_bitmap(s, blk); + offset = bat_entry(s, blk); + + ASSERT(offset != DD_BLK_UNUSED); + ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm))); + + offset += s->bm_secs + sec; + offset = vhd_sectors_to_bytes(offset); + + make_request: + req = alloc_vhd_request(s); + if (!req) + return -EBUSY; + + req->treq = treq; + req->flags = flags; + req->op = VHD_OP_DATA_READ; + req->next = NULL; + + aio_read(s, req, offset); + + DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, " + "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n", + s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags, + treq.buf); + + return 0; +} + +static int +schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags) +{ + int err; + u64 offset; + u32 blk = 0, sec = 0; + struct vhd_bitmap *bm = NULL; + struct vhd_request *req; + + if (s->vhd.footer.type == HD_TYPE_FIXED) { + offset = vhd_sectors_to_bytes(treq.sec); + goto make_request; + } + + blk = treq.sec / s->spb; + sec = treq.sec % s->spb; + offset = bat_entry(s, blk); + + if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) { + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) + err = 
allocate_block(s, blk); + else + err = update_bat(s, blk); + + if (err) + return err; + + offset = s->bat.pbw_offset; + } + + offset += s->bm_secs + sec; + offset = vhd_sectors_to_bytes(offset); + + make_request: + req = alloc_vhd_request(s); + if (!req) + return -EBUSY; + + req->treq = treq; + req->flags = flags; + req->op = VHD_OP_DATA_WRITE; + req->next = NULL; + + if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) { + bm = get_bitmap(s, blk); + ASSERT(bm && bitmap_valid(bm)); + lock_bitmap(bm); + + if (bm->tx.closed) { + add_to_tail(&bm->queue, req); + set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED); + } else + add_to_transaction(&bm->tx, req); + } + + aio_write(s, req, offset); + + DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, " + "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n", + s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags); + + return 0; +} + +static int +schedule_bitmap_read(struct vhd_state *s, uint32_t blk) +{ + int err; + u64 offset; + struct vhd_bitmap *bm; + struct vhd_request *req = NULL; + + ASSERT(vhd_type_dynamic(&s->vhd)); + + offset = bat_entry(s, blk); + + ASSERT(offset != DD_BLK_UNUSED); + ASSERT(!get_bitmap(s, blk)); + + offset = vhd_sectors_to_bytes(offset); + + err = alloc_vhd_bitmap(s, &bm, blk); + if (err) + return err; + + req = &bm->req; + init_vhd_request(s, req); + + req->treq.sec = blk * s->spb; + req->treq.secs = s->bm_secs; + req->treq.buf = bm->map; + req->treq.cb = NULL; + req->op = VHD_OP_BITMAP_READ; + req->next = NULL; + + aio_read(s, req, offset); + lock_bitmap(bm); + install_bitmap(s, bm); + set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING); + + DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, " + "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk, + req->treq.secs, offset); + + return 0; +} + +static void +schedule_bitmap_write(struct vhd_state *s, uint32_t blk) +{ + u64 offset; + struct vhd_bitmap *bm; + struct vhd_request *req; + + bm = 
get_bitmap(s, blk); + offset = bat_entry(s, blk); + + ASSERT(vhd_type_dynamic(&s->vhd)); + ASSERT(bm && bitmap_valid(bm) && + !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING)); + + if (offset == DD_BLK_UNUSED) { + ASSERT(bat_locked(s) && s->bat.pbw_blk == blk); + offset = s->bat.pbw_offset; + } + + offset = vhd_sectors_to_bytes(offset); + + req = &bm->req; + init_vhd_request(s, req); + + req->treq.sec = blk * s->spb; + req->treq.secs = s->bm_secs; + req->treq.buf = bm->shadow; + req->treq.cb = NULL; + req->op = VHD_OP_BITMAP_WRITE; + req->next = NULL; + + aio_write(s, req, offset); + lock_bitmap(bm); + touch_bitmap(s, bm); /* bump lru count */ + set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING); + + DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, " + "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec, + req->treq.secs, offset); +} + +/* + * queued requests will be submitted once the bitmap + * describing them is read and the requests are validated. + */ +static int +__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq) +{ + u32 blk; + struct vhd_bitmap *bm; + struct vhd_request *req; + + ASSERT(vhd_type_dynamic(&s->vhd)); + + blk = treq.sec / s->spb; + bm = get_bitmap(s, blk); + + ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)); + + req = alloc_vhd_request(s); + if (!req) + return -EBUSY; + + req->treq = treq; + req->op = op; + req->next = NULL; + + add_to_tail(&bm->waiting, req); + lock_bitmap(bm); + + DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, " + "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op); + + TRACE(s); + return 0; +} + +static void +vhd_queue_read(td_driver_t *driver, td_request_t treq) +{ + struct vhd_state *s = (struct vhd_state *)driver->data; + + DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n", + s->vhd.file, treq.sec, treq.secs, treq.sidx); + + while (treq.secs) { + int err; + td_request_t clone; + + err = 0; + clone = treq; + + 
switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) { + case -EINVAL: + err = -EINVAL; + goto fail; + + case VHD_BM_BAT_CLEAR: + clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); + td_forward_request(clone); + break; + + case VHD_BM_BIT_CLEAR: + clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0); + td_forward_request(clone); + break; + + case VHD_BM_BIT_SET: + clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1); + err = schedule_data_read(s, clone, 0); + if (err) + goto fail; + break; + + case VHD_BM_NOT_CACHED: + err = schedule_bitmap_read(s, clone.sec / s->spb); + if (err) + goto fail; + + clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); + err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone); + if (err) + goto fail; + break; + + case VHD_BM_READ_PENDING: + clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); + err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone); + if (err) + goto fail; + break; + + case VHD_BM_BAT_LOCKED: + default: + ASSERT(0); + break; + } + + treq.sec += clone.secs; + treq.secs -= clone.secs; + treq.buf += vhd_sectors_to_bytes(clone.secs); + continue; + + fail: + clone.secs = treq.secs; + td_complete_request(clone, err); + break; + } +} + +static void +vhd_queue_write(td_driver_t *driver, td_request_t treq) +{ + struct vhd_state *s = (struct vhd_state *)driver->data; + + DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n", + s->vhd.file, treq.sec, treq.secs, treq.sidx); + + while (treq.secs) { + int err; + uint8_t flags; + td_request_t clone; + + err = 0; + flags = 0; + clone = treq; + + switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) { + case -EINVAL: + err = -EINVAL; + goto fail; + + case VHD_BM_BAT_LOCKED: + err = -EBUSY; + clone.blocked = 1; + goto fail; + + case VHD_BM_BAT_CLEAR: + flags = (VHD_FLAG_REQ_UPDATE_BAT | + VHD_FLAG_REQ_UPDATE_BITMAP); + clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); + err = schedule_data_write(s, clone, 
flags); + if (err) + goto fail; + break; + + case VHD_BM_BIT_CLEAR: + flags = VHD_FLAG_REQ_UPDATE_BITMAP; + clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0); + err = schedule_data_write(s, clone, flags); + if (err) + goto fail; + break; + + case VHD_BM_BIT_SET: + clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1); + err = schedule_data_write(s, clone, 0); + if (err) + goto fail; + break; + + case VHD_BM_NOT_CACHED: + clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); + err = schedule_bitmap_read(s, clone.sec / s->spb); + if (err) + goto fail; + + err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone); + if (err) + goto fail; + break; + + case VHD_BM_READ_PENDING: + clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb)); + err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone); + if (err) + goto fail; + break; + + default: + ASSERT(0); + break; + } + + treq.sec += clone.secs; + treq.secs -= clone.secs; + treq.buf += vhd_sectors_to_bytes(clone.secs); + continue; + + fail: + clone.secs = treq.secs; + td_complete_request(clone, err); + break; + } +} + +static inline void +signal_completion(struct vhd_request *list, int error) +{ + struct vhd_state *s; + struct vhd_request *r, *next; + + if (!list) + return; + + r = list; + s = list->state; + + while (r) { + int err; + + err = (error ? 
error : r->error); + next = r->next; + td_complete_request(r->treq, err); + DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", " + "err: %d\n", r->treq.sec, r->treq.sec / s->spb, err); + free_vhd_request(s, r); + r = next; + + s->returned++; + TRACE(s); + } +} + +static void +start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm) +{ + int i, error = 0; + struct vhd_transaction *tx; + struct vhd_request *r, *next; + + if (!bm->queue.head) + return; + + DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk); + + r = bm->queue.head; + tx = &bm->tx; + clear_req_list(&bm->queue); + + if (r && bat_entry(s, bm->blk) == DD_BLK_UNUSED) + tx->error = -EIO; + + while (r) { + next = r->next; + r->next = NULL; + clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED); + + add_to_transaction(tx, r); + if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) { + tx->finished++; + if (!r->error) { + u32 sec = r->treq.sec % s->spb; + for (i = 0; i < r->treq.secs; i++) + vhd_bitmap_set(&s->vhd, + bm->shadow, sec + i); + } + } + r = next; + } + + /* perhaps all the queued writes already completed? */ + if (tx->started && transaction_completed(tx)) + finish_data_transaction(s, bm); +} + +static void +finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm) +{ + struct vhd_transaction *tx = &bm->tx; + + if (!bat_locked(s)) + return; + + if (s->bat.pbw_blk != bm->blk) + return; + + if (!s->bat.req.error) + goto release; + + if (!test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE)) + goto release; + + tx->closed = 1; + return; + + release: + DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk); + unlock_bat(s); + init_bat(s); +} + +static void +finish_bitmap_transaction(struct vhd_state *s, + struct vhd_bitmap *bm, int error) +{ + int map_size; + struct vhd_transaction *tx = &bm->tx; + + DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error); + tx->error = (tx->error ? 
tx->error : error); + map_size = vhd_sectors_to_bytes(s->bm_secs); + + if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) { + if (test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) { + /* still waiting for bat write */ + ASSERT(bm->blk == s->bat.pbw_blk); + ASSERT(test_vhd_flag(s->bat.status, + VHD_FLAG_BAT_WRITE_STARTED)); + s->bat.req.tx = tx; + return; + } + } + + if (tx->error) { + /* undo changes to shadow */ + memcpy(bm->shadow, bm->map, map_size); + } else { + /* complete atomic write */ + memcpy(bm->map, bm->shadow, map_size); + if (!test_batmap(s, bm->blk) && bitmap_full(s, bm)) + set_batmap(s, bm->blk); + } + + /* transaction done; signal completions */ + signal_completion(tx->requests.head, tx->error); + init_tx(tx); + start_new_bitmap_transaction(s, bm); + + if (!bitmap_in_use(bm)) + unlock_bitmap(bm); + + finish_bat_transaction(s, bm); +} + +static void +finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm) +{ + struct vhd_transaction *tx = &bm->tx; + + DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk); + + tx->closed = 1; + + if (!tx->error) + return schedule_bitmap_write(s, bm->blk); + + return finish_bitmap_transaction(s, bm, 0); +} + +static void +finish_bat_write(struct vhd_request *req) +{ + struct vhd_bitmap *bm; + struct vhd_transaction *tx; + struct vhd_state *s = req->state; + + s->returned++; + TRACE(s); + + bm = get_bitmap(s, s->bat.pbw_blk); + + DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n", + s->bat.pbw_blk, s->bat.pbw_offset, req->error); + ASSERT(bm && bitmap_valid(bm)); + ASSERT(bat_locked(s) && + test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED)); + + tx = &bm->tx; + ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE)); + + if (!req->error) { + bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset; + s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs; + } else + tx->error = req->error; + + if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) { + tx->finished++; + remove_from_req_list(&tx->requests, req); + 
if (transaction_completed(tx)) + finish_data_transaction(s, bm); + } else { + clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT); + if (s->bat.req.tx) + finish_bitmap_transaction(s, bm, req->error); + } + + finish_bat_transaction(s, bm); +} + +static void +finish_zero_bm_write(struct vhd_request *req) +{ + u32 blk; + struct vhd_bitmap *bm; + struct vhd_transaction *tx = req->tx; + struct vhd_state *s = req->state; + + s->returned++; + TRACE(s); + + blk = req->treq.sec / s->spb; + bm = get_bitmap(s, blk); + + DBG(TLOG_DBG, "blk: 0x%04x\n", blk); + ASSERT(bat_locked(s)); + ASSERT(s->bat.pbw_blk == blk); + ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm)); + + tx->finished++; + remove_from_req_list(&tx->requests, req); + + if (req->error) { + unlock_bat(s); + init_bat(s); + tx->error = req->error; + clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT); + } else + schedule_bat_write(s); + + if (transaction_completed(tx)) + finish_data_transaction(s, bm); +} + +static void +finish_bitmap_read(struct vhd_request *req) +{ + u32 blk; + struct vhd_bitmap *bm; + struct vhd_request *r, *next; + struct vhd_state *s = req->state; + + s->returned++; + TRACE(s); + + blk = req->treq.sec / s->spb; + bm = get_bitmap(s, blk); + + DBG(TLOG_DBG, "blk: 0x%04x\n", blk); + ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)); + + r = bm->waiting.head; + clear_req_list(&bm->waiting); + clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING); + + if (!req->error) { + memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs)); + + while (r) { + struct vhd_request tmp; + + tmp = *r; + next = r->next; + free_vhd_request(s, r); + + ASSERT(tmp.op == VHD_OP_DATA_READ || + tmp.op == VHD_OP_DATA_WRITE); + + if (tmp.op == VHD_OP_DATA_READ) + vhd_queue_read(s->driver, tmp.treq); + else if (tmp.op == VHD_OP_DATA_WRITE) + vhd_queue_write(s->driver, tmp.treq); + + r = next; + } + } else { + int err = req->error; + unlock_bitmap(bm); + free_vhd_bitmap(s, bm); + return signal_completion(r, 
err); + } + + if (!bitmap_in_use(bm)) + unlock_bitmap(bm); +} + +static void +finish_bitmap_write(struct vhd_request *req) +{ + u32 blk; + struct vhd_bitmap *bm; + struct vhd_transaction *tx; + struct vhd_state *s = req->state; + + s->returned++; + TRACE(s); + + blk = req->treq.sec / s->spb; + bm = get_bitmap(s, blk); + tx = &bm->tx; + + DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n", + blk, tx->started, tx->finished); + ASSERT(tx->closed); + ASSERT(bm && bitmap_valid(bm)); + ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING)); + + clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING); + + finish_bitmap_transaction(s, bm, req->error); +} + +static void +finish_data_read(struct vhd_request *req) +{ + struct vhd_state *s = req->state; + + DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", + req->treq.sec, req->treq.sec / s->spb); + signal_completion(req, 0); +} + +static void +finish_data_write(struct vhd_request *req) +{ + int i; + struct vhd_transaction *tx = req->tx; + struct vhd_state *s = (struct vhd_state *)req->state; + + set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED); + + if (tx) { + u32 blk, sec; + struct vhd_bitmap *bm; + + blk = req->treq.sec / s->spb; + sec = req->treq.sec % s->spb; + bm = get_bitmap(s, blk); + + ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm)); + + tx->finished++; + + DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x04%"PRIx64", " + "tx->started: %d, tx->finished: %d\n", req->treq.sec, + req->treq.sec / s->spb, tx->started, tx->finished); + + if (!req->error) + for (i = 0; i < req->treq.secs; i++) + vhd_bitmap_set(&s->vhd, bm->shadow, sec + i); + + if (transaction_completed(tx)) + finish_data_transaction(s, bm); + + } else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) { + ASSERT(!req->next); + DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", + req->treq.sec, req->treq.sec / s->spb); + signal_completion(req, 0); + } +} + +void +vhd_complete(void *arg, struct tiocb *tiocb, int err) +{ + struct 
vhd_request *req = (struct vhd_request *)arg; + struct vhd_state *s = req->state; + struct iocb *io = &tiocb->iocb; + + s->completed++; + TRACE(s); + + req->error = err; + + if (req->error) + ERR(req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, " + "nbytes: %lu, blk: %"PRIu64", blk_offset: %u", + s->vhd.file, req->op, req->treq.sec, req->treq.secs, + io->u.c.nbytes, req->treq.sec / s->spb, + bat_entry(s, req->treq.sec / s->spb)); + + switch (req->op) { + case VHD_OP_DATA_READ: + finish_data_read(req); + break; + + case VHD_OP_DATA_WRITE: + finish_data_write(req); + break; + + case VHD_OP_BITMAP_READ: + finish_bitmap_read(req); + break; + + case VHD_OP_BITMAP_WRITE: + finish_bitmap_write(req); + break; + + case VHD_OP_ZERO_BM_WRITE: + finish_zero_bm_write(req); + break; + + case VHD_OP_BAT_WRITE: + finish_bat_write(req); + break; + + default: + ASSERT(0); + break; + } +} + +void +vhd_debug(td_driver_t *driver) +{ + int i; + struct vhd_state *s = (struct vhd_state *)driver->data; + + DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", " + "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed, + s->returned); + DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n", + s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0)); + DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n", + s->reads, (s->reads ? 
((float)s->read_size / s->reads) : 0.0)); + + DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%lu total)\n", VHD_REQS_DATA); + for (i = 0; i < VHD_REQS_DATA; i++) { + struct vhd_request *r = &s->vreq_list[i]; + td_request_t *t = &r->treq; + if (t->secs) + DBG(TLOG_WARN, "%d: id: 0x%04"PRIx64", err: %d, op: %d," + " lsec: 0x%08"PRIx64", flags: %d, this: %p, " + "next: %p, tx: %p\n", i, t->id, r->error, r->op, + t->sec, r->flags, r, r->next, r->tx); + } + + DBG(TLOG_WARN, "BITMAP CACHE:\n"); + for (i = 0; i < VHD_CACHE_SIZE; i++) { + int qnum = 0, wnum = 0, rnum = 0; + struct vhd_bitmap *bm = s->bitmap[i]; + struct vhd_transaction *tx; + struct vhd_request *r; + + if (!bm) + continue; + + tx = &bm->tx; + r = bm->queue.head; + while (r) { + qnum++; + r = r->next; + } + + r = bm->waiting.head; + while (r) { + wnum++; + r = r->next; + } + + r = tx->requests.head; + while (r) { + rnum++; + r = r->next; + } + + DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, " + "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, " + "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n", + i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head, + wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error, + tx->started, tx->finished, tx->status, tx->requests.head, rnum); + } + + DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, " + "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk, + s->bat.pbw_offset, s->bat.req.tx); + +/* + for (i = 0; i < s->hdr.max_bat_size; i++) + DPRINTF("%d: %u\n", i, s->bat.bat[i]); +*/ +} + +struct tap_disk tapdisk_vhd = { + .disk_type = "tapdisk_vhd", + .flags = 0, + .private_data_size = sizeof(struct vhd_state), + .td_open = _vhd_open, + .td_close = _vhd_close, + .td_queue_read = vhd_queue_read, + .td_queue_write = vhd_queue_write, + .td_get_parent_id = vhd_get_parent_id, + .td_validate_parent = vhd_validate_parent, + .td_debug = vhd_debug, +}; diff --git a/tools/blktap2/drivers/bswap.h 
b/tools/blktap2/drivers/bswap.h new file mode 100644 index 0000000000..45016b978b --- /dev/null +++ b/tools/blktap2/drivers/bswap.h @@ -0,0 +1,214 @@ +#ifndef BSWAP_H +#define BSWAP_H + +//#include "config-host.h" + +#include <inttypes.h> + +#if defined(__NetBSD__) +#include <sys/endian.h> +#include <sys/types.h> +#elif defined(__OpenBSD__) +#include <machine/endian.h> +#define bswap_16(x) swap16(x) +#define bswap_32(x) swap32(x) +#define bswap_64(x) swap64(x) +#else + +#ifdef HAVE_BYTESWAP_H +#include <byteswap.h> +#else + +#define bswap_16(x) \ +({ \ + uint16_t __x = (x); \ + ((uint16_t)( \ + (((uint16_t)(__x) & (uint16_t)0x00ffU) << 8) | \ + (((uint16_t)(__x) & (uint16_t)0xff00U) >> 8) )); \ +}) + +#define bswap_32(x) \ +({ \ + uint32_t __x = (x); \ + ((uint32_t)( \ + (((uint32_t)(__x) & (uint32_t)0x000000ffUL) << 24) | \ + (((uint32_t)(__x) & (uint32_t)0x0000ff00UL) << 8) | \ + (((uint32_t)(__x) & (uint32_t)0x00ff0000UL) >> 8) | \ + (((uint32_t)(__x) & (uint32_t)0xff000000UL) >> 24) )); \ +}) + +#define bswap_64(x) \ +({ \ + uint64_t __x = (x); \ + ((uint64_t)( \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000000000ffULL) << 56) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000000000ff00ULL) << 40) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000ff000000ULL) << 8) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000ff00000000ULL) >> 8) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0xff00000000000000ULL) >> 56) )); \ +}) + +#endif /* !HAVE_BYTESWAP_H */ + +static inline uint16_t bswap16(uint16_t x) +{ + return bswap_16(x); +} + +static inline uint32_t bswap32(uint32_t x) +{ + return bswap_32(x); +} + +static inline uint64_t bswap64(uint64_t x) +{ + return bswap_64(x); +} + +static inline void bswap16s(uint16_t *s) +{ + 
*s = bswap16(*s); +} + +static inline void bswap32s(uint32_t *s) +{ + *s = bswap32(*s); +} + +static inline void bswap64s(uint64_t *s) +{ + *s = bswap64(*s); +} + +#endif + +#if defined(WORDS_BIGENDIAN) +#define be_bswap(v, size) (v) +#define le_bswap(v, size) bswap ## size(v) +#define be_bswaps(v, size) +#define le_bswaps(p, size) *p = bswap ## size(*p); +#else +#define le_bswap(v, size) (v) +#define be_bswap(v, size) bswap ## size(v) +#define le_bswaps(v, size) +#define be_bswaps(p, size) *p = bswap ## size(*p); +#endif + +#define CPU_CONVERT(endian, size, type)\ +static inline type endian ## size ## _to_cpu(type v)\ +{\ + return endian ## _bswap(v, size);\ +}\ +\ +static inline type cpu_to_ ## endian ## size(type v)\ +{\ + return endian ## _bswap(v, size);\ +}\ +\ +static inline void endian ## size ## _to_cpus(type *p)\ +{\ + endian ## _bswaps(p, size)\ +}\ +\ +static inline void cpu_to_ ## endian ## size ## s(type *p)\ +{\ + endian ## _bswaps(p, size)\ +}\ +\ +static inline type endian ## size ## _to_cpup(const type *p)\ +{\ + return endian ## size ## _to_cpu(*p);\ +}\ +\ +static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\ +{\ + *p = cpu_to_ ## endian ## size(v);\ +} + +CPU_CONVERT(be, 16, uint16_t) +CPU_CONVERT(be, 32, uint32_t) +CPU_CONVERT(be, 64, uint64_t) + +CPU_CONVERT(le, 16, uint16_t) +CPU_CONVERT(le, 32, uint32_t) +CPU_CONVERT(le, 64, uint64_t) + +/* unaligned versions (optimized for frequent unaligned accesses)*/ + +#if defined(__i386__) || defined(__powerpc__) + +#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v) +#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v) +#define le16_to_cpupu(p) le16_to_cpup(p) +#define le32_to_cpupu(p) le32_to_cpup(p) + +#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v) +#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v) + +#else + +static inline void cpu_to_le16wu(uint16_t *p, uint16_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v; + p1[1] = v >> 8; +} + +static inline void cpu_to_le32wu(uint32_t *p, uint32_t v) +{ 
+ uint8_t *p1 = (uint8_t *)p; + + p1[0] = v; + p1[1] = v >> 8; + p1[2] = v >> 16; + p1[3] = v >> 24; +} + +static inline uint16_t le16_to_cpupu(const uint16_t *p) +{ + const uint8_t *p1 = (const uint8_t *)p; + return p1[0] | (p1[1] << 8); +} + +static inline uint32_t le32_to_cpupu(const uint32_t *p) +{ + const uint8_t *p1 = (const uint8_t *)p; + return p1[0] | (p1[1] << 8) | (p1[2] << 16) | (p1[3] << 24); +} + +static inline void cpu_to_be16wu(uint16_t *p, uint16_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v >> 8; + p1[1] = v; +} + +static inline void cpu_to_be32wu(uint32_t *p, uint32_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v >> 24; + p1[1] = v >> 16; + p1[2] = v >> 8; + p1[3] = v; +} + +#endif + +#ifdef WORDS_BIGENDIAN +#define cpu_to_32wu cpu_to_be32wu +#else +#define cpu_to_32wu cpu_to_le32wu +#endif + +#undef le_bswap +#undef be_bswap +#undef le_bswaps +#undef be_bswaps + +#endif /* BSWAP_H */ diff --git a/tools/blktap2/drivers/check_gcrypt b/tools/blktap2/drivers/check_gcrypt new file mode 100644 index 0000000000..154ba2492a --- /dev/null +++ b/tools/blktap2/drivers/check_gcrypt @@ -0,0 +1,14 @@ +#!/bin/sh + +cat > .gcrypt.c << EOF +#include <gcrypt.h> +int main(void) { return 0; } +EOF + +if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then + echo "yes" +else + echo "no" +fi + +rm -f .gcrypt* diff --git a/tools/blktap2/drivers/disktypes.h b/tools/blktap2/drivers/disktypes.h new file mode 100644 index 0000000000..d0923f18b4 --- /dev/null +++ b/tools/blktap2/drivers/disktypes.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __DISKTYPES_H__ +#define __DISKTYPES_H__ + +typedef struct disk_info { + int idnum; + char name[50]; /* e.g. "RAMDISK" */ + char handle[10]; /* xend handle, e.g. 'ram' */ + int single_handler; /* is there a single controller for all */ + /* instances of disk type? 
*/ +#ifdef TAPDISK + struct tap_disk *drv; +#endif +} disk_info_t; + +extern struct tap_disk tapdisk_aio; +/* extern struct tap_disk tapdisk_sync; */ +/* extern struct tap_disk tapdisk_vmdk; */ +/* extern struct tap_disk tapdisk_vhdsync; */ +extern struct tap_disk tapdisk_vhd; +extern struct tap_disk tapdisk_ram; + extern struct tap_disk tapdisk_qcow; +extern struct tap_disk tapdisk_block_cache; +extern struct tap_disk tapdisk_log; + +#define MAX_DISK_TYPES 20 + +#define DISK_TYPE_AIO 0 +#define DISK_TYPE_SYNC 1 +#define DISK_TYPE_VMDK 2 +#define DISK_TYPE_VHDSYNC 3 +#define DISK_TYPE_VHD 4 +#define DISK_TYPE_RAM 5 +#define DISK_TYPE_QCOW 6 +#define DISK_TYPE_BLOCK_CACHE 7 +#define DISK_TYPE_LOG 9 + +/*Define Individual Disk Parameters here */ +static disk_info_t null_disk = { + -1, + "null disk", + "null", + 0, +#ifdef TAPDISK + 0, +#endif +}; + +static disk_info_t aio_disk = { + DISK_TYPE_AIO, + "raw image (aio)", + "aio", + 0, +#ifdef TAPDISK + &tapdisk_aio, +#endif +}; +/* +static disk_info_t sync_disk = { + DISK_TYPE_SYNC, + "raw image (sync)", + "sync", + 0, +#ifdef TAPDISK + &tapdisk_sync, +#endif +}; + +static disk_info_t vmdk_disk = { + DISK_TYPE_VMDK, + "vmware image (vmdk)", + "vmdk", + 1, +#ifdef TAPDISK + &tapdisk_vmdk, +#endif +}; + +static disk_info_t vhdsync_disk = { + DISK_TYPE_VHDSYNC, + "virtual server image (vhd) - synchronous", + "vhdsync", + 1, +#ifdef TAPDISK + &tapdisk_vhdsync, +#endif +}; +*/ + +static disk_info_t vhd_disk = { + DISK_TYPE_VHD, + "virtual server image (vhd)", + "vhd", + 0, +#ifdef TAPDISK + &tapdisk_vhd, +#endif +}; + + +static disk_info_t ram_disk = { + DISK_TYPE_RAM, + "ramdisk image (ram)", + "ram", + 1, +#ifdef TAPDISK + &tapdisk_ram, +#endif +}; + + +static disk_info_t qcow_disk = { + DISK_TYPE_QCOW, + "qcow disk (qcow)", + "qcow", + 0, +#ifdef TAPDISK + &tapdisk_qcow, +#endif +}; + + +static disk_info_t block_cache_disk = { + DISK_TYPE_BLOCK_CACHE, + "block cache image (bc)", + "bc", + 1, +#ifdef TAPDISK + 
&tapdisk_block_cache, +#endif +}; + +static disk_info_t log_disk = { + DISK_TYPE_LOG, + "write logger (log)", + "log", + 0, +#ifdef TAPDISK + &tapdisk_log, +#endif +}; + +/*Main disk info array */ +static disk_info_t *dtypes[] = { + &aio_disk, + &null_disk, /* &sync_disk, */ + &null_disk, /* &vmdk_disk, */ + &null_disk, /* &vhdsync_disk, */ + &vhd_disk, + &ram_disk, + &qcow_disk, + &block_cache_disk, + &null_disk, + &log_disk, +}; + +#endif diff --git a/tools/blktap2/drivers/img2qcow.c b/tools/blktap2/drivers/img2qcow.c new file mode 100644 index 0000000000..b12509ddd8 --- /dev/null +++ b/tools/blktap2/drivers/img2qcow.c @@ -0,0 +1,318 @@ +/* img2qcow.c + * + * Generates a qcow format disk and fills it from an existing image. + * + * (c) 2006 Julian Chesterfield and Andrew Warfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <string.h> +#include <zlib.h> +#include <inttypes.h> +#include <libaio.h> +#include <openssl/md5.h> + +#include "bswap.h" +#include "aes.h" +#include "tapdisk.h" +#include "tapdisk-server.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" +#include "qcow.h" +#include "blk.h" + + +#if 1 +#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ) +#else +#define DFPRINTF(_f, _a...) ((void)0) +#endif + +/* *BSD has no O_LARGEFILE */ +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif + + +#define TAPDISK 1 +#define BLOCK_PROCESSSZ 4096 +#define QCOW_VBD 0 +#define PROGRESS_QUANT 2 + +static int running = 1, complete = 0; +static int returned_events = 0, submit_events = 0; +static uint32_t read_idx = 0; +td_driver_t *ddqcow; +td_vbd_t* qcow_vbd; +static uint64_t prev = 0, written = 0; +static char output[(100/PROGRESS_QUANT) + 5]; + +extern tapdisk_server_t server; + + +static void print_bytes(void *ptr, int length) +{ + int i,k; + unsigned char *p = ptr; + + DFPRINTF("Buf dump, length %d:\n",length); + for (k = 0; k < length; k++) { + DFPRINTF("%x",*p); + *p++; + if(k % 16 == 0) DFPRINTF("\n"); + else if(k % 2 == 0) DFPRINTF(" "); + } + DFPRINTF("\n"); + return; +} + +static void debug_output(uint64_t progress, uint64_t size) +{ + //Output progress every PROGRESS_QUANT + uint64_t blocks = size/(100/PROGRESS_QUANT); + + if (progress/blocks > prev) { + memcpy(output+prev+1,"=>",2); + prev++; + DFPRINTF("\r%s %"PRIi64"%%", + output, (int64_t)((prev-1)*PROGRESS_QUANT)); + } + return; +} + +static int 
get_image_info(td_disk_info_t *driver, int fd) +{ + int ret; + long size; + unsigned long total_size; + struct statvfs statBuf; + struct stat stat; + uint64_t sector_size=DEFAULT_SECTOR_SIZE; + + ret = fstat(fd, &stat); + if (ret != 0) { + DFPRINTF("ERROR: fstat failed, Couldn't stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + if (blk_getimagesize(fd, &driver->size) != 0) + return -EINVAL; + + DFPRINTF("Image size: \n\tpre sector_shift [%"PRIu64"]\n\tpost " + "sector_shift [%"PRIu64"]\n", + (uint64_t)(driver->size << SECTOR_SHIFT), + (uint64_t)driver->size); + + /*Get the sector size*/ + if (!blk_getsectorsize(fd, §or_size)) + driver->sector_size = sector_size; + + } else { + /*Local file? try fstat instead*/ + driver->size = (stat.st_size >> SECTOR_SHIFT); + driver->sector_size = DEFAULT_SECTOR_SIZE; + DFPRINTF("Image size: [%"PRIu64"]\n", + (uint64_t)driver->size); + } + + return 0; +} + +void send_responses(td_request_t treq, int err) +{ + if (err < 0) { + DFPRINTF("AIO FAILURE: res [%d]!\n",err); + return; + } + + returned_events++; + + free(treq.buf); +} + +int main(int argc, const char *argv[]) +{ + int ret = -1, fd, len, err; + struct timeval timeout; + uint64_t i; + char *buf; + td_request_t treq; + td_disk_info_t info; + td_vbd_request_t* vreq; + + if (argc != 3) { + fprintf(stderr, "Qcow-utils: v1.0.0\n"); + fprintf(stderr, "usage: %s <QCOW FILENAME> <SRC IMAGE>\n", + argv[0]); + exit(-1); + } + + + /*Open image*/ + fd = open(argv[2], O_RDONLY | O_LARGEFILE); + + if (fd == -1) { + DFPRINTF("Unable to open [%s], (err %d)!\n",argv[2],0 - errno); + exit(-1); + } + + get_image_info(&info, fd); + + /*Create qcow file*/ + ret = qcow_create(argv[1],info.size<<SECTOR_SHIFT,NULL,0); + + if (ret < 0) { + DFPRINTF("Unable to create QCOW file\n"); + exit(-1); + } else DFPRINTF("Qcow file created: size %"PRIu64" sectors\n", + (uint64_t)info.size); + + /* Open Qcow image*/ + err = 
tapdisk_server_initialize(NULL, NULL); + if( err ) { + DPRINTF("qcow2raw Couldn't initialize server instance.\n"); + return err; + } + + err=tapdisk_vbd_initialize(-1,-1, QCOW_VBD); + if( err ) { + DPRINTF("qcow2raw Couldn't initialize qcow vbd.\n"); + return err; + } + + qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD); + if (!qcow_vbd) { + err = -ENODEV; + DPRINTF("qcow2raw Couldn't create qcow vbd.\n"); + return err; + } + + err = tapdisk_vbd_open_vdi(qcow_vbd, argv[1], DISK_TYPE_QCOW, + TAPDISK_STORAGE_TYPE_DEFAULT, + 0); + if( err ) { + DPRINTF("qcow2raw Couldn't open qcow file.\n"); + return err; + } + + ddqcow=(tapdisk_vbd_first_image(qcow_vbd))->driver; + + /*Initialise the output string*/ + memset(output,0x20,(100/PROGRESS_QUANT)+5); + output[0] = '['; + output[(100/PROGRESS_QUANT)+2] = ']'; + output[(100/PROGRESS_QUANT)+3] = '\0'; + DFPRINTF("%s",output); + + i = 0; + while (running) { + + if (!complete) { + /*Read sector from image*/ + if (lseek(fd, i*512, SEEK_SET) == (off_t)-1) { + DFPRINTF("Unable to access file offset %"PRIu64"\n", + (uint64_t)i*512); + exit(-1); + } + + if( (ret = posix_memalign((void **)&buf, + BLOCK_PROCESSSZ, + BLOCK_PROCESSSZ)) != 0) { + DFPRINTF("Unable to read memalign buf (%d)\n",ret); + exit(-1); + } + + /*We attempt to read 4k sized blocks*/ + len = read(fd, buf, BLOCK_PROCESSSZ); + if (len < 512) { + DFPRINTF("Unable to read sector %"PRIu64"\n", + (uint64_t) (i)); + complete = 1; + continue; + } + + len = (len >> 9); + + treq.op = TD_OP_WRITE; + treq.buf = buf; + treq.sec = i; + treq.secs = len; + treq.image = 0; + treq.cb = send_responses; + treq.cb_data = buf; + treq.id = 0; + treq.sidx = 0; + vreq = calloc(1, sizeof(td_vbd_request_t)); + treq.private = vreq; + + vreq->submitting = 1; + INIT_LIST_HEAD(&vreq->next); + tapdisk_vbd_move_request(treq.private, + &qcow_vbd->pending_requests); + + ddqcow->ops->td_queue_write(ddqcow,treq); + --vreq->submitting; + + submit_events++; + + i += len; + + if (i == info.size) + complete = 
1; + + tapdisk_submit_all_tiocbs(&server.aio_queue); + debug_output(i,info.size); + } + + while(returned_events != submit_events) { + ret = scheduler_wait_for_events(&server.scheduler); + if (ret < 0) { + DFPRINTF("server wait returned %d\n", ret); + sleep(2); + } + } + + if (complete && (returned_events == submit_events)) + running = 0; + } + memcpy(output+prev+1,"=",1); + DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output); + + ddqcow->ops->td_close(ddqcow); + free(ddqcow->data); + + return 0; +} diff --git a/tools/blktap2/drivers/io-optimize.c b/tools/blktap2/drivers/io-optimize.c new file mode 100644 index 0000000000..5d397652e5 --- /dev/null +++ b/tools/blktap2/drivers/io-optimize.c @@ -0,0 +1,664 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include <time.h> +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <inttypes.h> + +#include "io-optimize.h" +#include "tapdisk-log.h" + +#if (!defined(TEST) && defined(DEBUG)) +#define DBG(ctx, f, a...) tlog_write(TLOG_DBG, f, ##a) +#elif defined(TEST) +#define DBG(ctx, f, a...) printf(f, ##a) +#else +#define DBG(ctx, f, a...) ((void)0) +#endif + +static void print_merged_iocbs(struct opioctx *ctx, + struct iocb **iocbs, int num_iocbs); + +void +opio_free(struct opioctx *ctx) +{ + free(ctx->opios); + free(ctx->free_opios); + free(ctx->iocb_queue); + free(ctx->event_queue); +} + +int +opio_init(struct opioctx *ctx, int num_iocbs) +{ + int i; + + memset(ctx, 0, sizeof(struct opioctx)); + + ctx->num_opios = num_iocbs; + ctx->free_opio_cnt = num_iocbs; + ctx->opios = calloc(1, sizeof(struct opio) * num_iocbs); + ctx->free_opios = calloc(1, sizeof(struct opio *) * num_iocbs); + ctx->iocb_queue = calloc(1, sizeof(struct iocb *) * num_iocbs); + ctx->event_queue = calloc(1, sizeof(struct io_event) * num_iocbs); + + if (!ctx->opios || !ctx->free_opios || + !ctx->iocb_queue || !ctx->event_queue) + goto fail; + + for (i = 0; i < num_iocbs; i++) + ctx->free_opios[i] = &ctx->opios[i]; + + return 0; + + fail: + opio_free(ctx); + return -ENOMEM; +} + +static inline struct opio * +alloc_opio(struct opioctx *ctx) +{ + if (ctx->free_opio_cnt <= 0) + return NULL; + return 
ctx->free_opios[--ctx->free_opio_cnt]; +} + +static inline void +free_opio(struct opioctx *ctx, struct opio *op) +{ + memset(op, 0, sizeof(struct opio)); + ctx->free_opios[ctx->free_opio_cnt++] = op; +} + +static inline void +restore_iocb(struct opio *op) +{ + struct iocb *io = op->iocb; + + io->data = op->data; + io->u.c.buf = op->buf; + io->u.c.nbytes = op->nbytes; +} + +static inline int +iocb_optimized(struct opioctx *ctx, struct iocb *io) +{ + unsigned long iop = (unsigned long)io->data; + unsigned long start = (unsigned long)ctx->opios; + unsigned long end = start + (ctx->num_opios * sizeof(struct opio)); + + return (iop >= start && iop < end); +} + +static inline int +contiguous_sectors(struct iocb *l, struct iocb *r) +{ + return (l->u.c.offset + l->u.c.nbytes == r->u.c.offset); +} + +static inline int +contiguous_buffers(struct iocb *l, struct iocb *r) +{ + return (l->u.c.buf + l->u.c.nbytes == r->u.c.buf); +} + +static inline int +contiguous_iocbs(struct iocb *l, struct iocb *r) +{ + return ((l->aio_fildes == r->aio_fildes) && + contiguous_sectors(l, r) && + contiguous_buffers(l, r)); +} + +static inline void +init_opio_list(struct opio *op) +{ + op->list.head = op->list.tail = op; +} + +static struct opio * +opio_iocb_init(struct opioctx *ctx, struct iocb *io) +{ + struct opio *op; + + op = alloc_opio(ctx); + if (!op) + return NULL; + + op->buf = io->u.c.buf; + op->nbytes = io->u.c.nbytes; + op->offset = io->u.c.offset; + op->data = io->data; + op->iocb = io; + io->data = op; + + init_opio_list(op); + + return op; +} + +static inline struct opio * +opio_get(struct opioctx *ctx, struct iocb *io) +{ + if (iocb_optimized(ctx, io)) + return (struct opio *)io->data; + else + return opio_iocb_init(ctx, io); +} + +static int +merge_tail(struct opioctx *ctx, struct iocb *head, struct iocb *io) +{ + struct opio *ophead, *opio; + + ophead = opio_get(ctx, head); + if (!ophead) + return -ENOMEM; + + opio = opio_get(ctx, io); + if (!opio) + return -ENOMEM; + + 
opio->head = ophead; + head->u.c.nbytes += io->u.c.nbytes; + ophead->list.tail = ophead->list.tail->next = opio; + + return 0; +} + +static int +merge(struct opioctx *ctx, struct iocb *head, struct iocb *io) +{ + if (head->aio_lio_opcode != io->aio_lio_opcode) + return -EINVAL; + + if (!contiguous_iocbs(head, io)) + return -EINVAL; + + return merge_tail(ctx, head, io); +} + +int +io_merge(struct opioctx *ctx, struct iocb **queue, int num) +{ + int i, on_queue; + struct iocb *io, **q; + struct opio *ophead; + + if (!num) + return 0; + + on_queue = 0; + q = ctx->iocb_queue; + memcpy(q, queue, num * sizeof(struct iocb *)); + + for (i = 1; i < num; i++) { + io = q[i]; + if (merge(ctx, queue[on_queue], io) != 0) + queue[++on_queue] = io; + } + +#if (defined(TEST) || defined(DEBUG)) + print_merged_iocbs(ctx, queue, on_queue + 1); +#endif + + return ++on_queue; +} + +static int +expand_iocb(struct opioctx *ctx, struct iocb **queue, struct iocb *io) +{ + int idx; + struct opio *op, *next; + + idx = 0; + op = (struct opio *)io->data; + while (op) { + next = op->next; + restore_iocb(op); + queue[idx++] = op->iocb; + free_opio(ctx, op); + op = next; + } + + return idx; +} + +int +io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num) +{ + int i, on_queue; + struct iocb *io, **q; + + if (!num) + return 0; + + on_queue = 0; + q = ctx->iocb_queue; + memcpy(q, queue, num * sizeof(struct iocb *)); + + for (i = idx; i < num; i++) { + io = q[i]; + if (!iocb_optimized(ctx, io)) + queue[on_queue++] = io; + else + on_queue += expand_iocb(ctx, queue + on_queue, io); + } + + return on_queue; +} + +static int +expand_event(struct opioctx *ctx, + struct io_event *event, struct io_event *queue, int idx) +{ + int err; + struct iocb *io; + struct io_event *ep; + struct opio *ophead, *op, *next; + + io = event->obj; + ophead = (struct opio *)io->data; + op = ophead; + + if (event->res == io->u.c.nbytes) + err = 0; + else if ((int)event->res < 0) + err = (int)event->res; + 
else + err = -EIO; + + while (op) { + next = op->next; + ep = &queue[idx++]; + ep->obj = op->iocb; + ep->res = (err ? err : op->nbytes); + restore_iocb(op); + free_opio(ctx, op); + op = next; + } + + return idx; +} + +int +io_split(struct opioctx *ctx, struct io_event *events, int num) +{ + int on_queue; + struct iocb *io; + struct io_event *ep, *q; + + if (!num) + return 0; + + on_queue = 0; + q = ctx->event_queue; + memcpy(q, events, num * sizeof(struct io_event)); + + for (ep = q; num-- > 0; ep++) { + io = ep->obj; + if (!iocb_optimized(ctx, io)) + events[on_queue++] = *ep; + else + on_queue = expand_event(ctx, ep, events, on_queue); + } + + return on_queue; +} + +/****************************************************************************** +debug print functions +******************************************************************************/ +static inline void +__print_iocb(struct opioctx *ctx, struct iocb *io, char *prefix) +{ + char *type; + + type = (io->aio_lio_opcode == IO_CMD_PREAD ? 
/*
 * Debug helper: dump the first num_iocbs entries of iocbs, each line
 * prefixed with the entry's index.
 */
static void
print_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
{
	char label[10];
	int n = 0;

	DBG(ctx, "iocbs:\n");
	while (n < num_iocbs) {
		snprintf(label, sizeof(label), "%d: ", n);
		__print_iocb(ctx, iocbs[n], label);
		n++;
	}
}
smask : 0)) +#define data_idx(data) (int)((unsigned long)(data) & (0x0fffffff)) +#define data_is_head(data) (((unsigned long)(data) & hmask) ? 1 : 0) +#define data_is_sparse(data) (((unsigned long)(data) & smask) ? 1 : 0) + +static void +usage(void) +{ + fprintf(stderr, "usage: io_optimize [-n num_runs] " + "[-i num_iocbs] [-s num_secs] [-r random_seed]\n"); + exit(-1); +} + +static int xalloc_cnt, xfree_cnt; +static inline char * +xalloc(int size) +{ + char *buf = malloc(size); + if (!buf) { + fprintf(stderr, "xalloc failed\n"); + exit(ENOMEM); + } + xalloc_cnt++; + return buf; +} + +static inline void +xfree(void *buf) +{ + free(buf); + xfree_cnt++; +} + +static void +randomize_iocbs(struct iocb **iocbs, int num_iocbs, int num_secs) +{ + int i, j; + + i = 0; + while (i < num_iocbs) { + char *buf; + short type; + int segs, sparse_mem; + uint64_t offset, nbytes; + + type = (random() % 10 < 5 ? IO_CMD_PREAD : IO_CMD_PWRITE); + offset = ((random() % num_secs) << 9); + + if (random() % 10 < 4) { + segs = 1; + nbytes = (((random() % 7) + 1) << 9); + } else { + segs = (random() % 10) + 1; + nbytes = 4096; + } + + if (i + segs > num_iocbs) + segs = (num_iocbs - i); + + sparse_mem = (random() % 10 < 2 ? 
1 : 0); + + if (sparse_mem) + buf = xalloc(nbytes); + else + buf = xalloc(segs * nbytes); + + for (j = 0; j < segs; j++) { + struct iocb *io = iocbs[i + j]; + io->aio_lio_opcode = type; + io->u.c.nbytes = nbytes; + io->u.c.offset = offset; + io->u.c.buf = buf; + offset += nbytes; + + io->data = make_data(i + j, (j == 0), sparse_mem); + + if (j + 1 < segs && sparse_mem) + buf = xalloc(nbytes); + else + buf += nbytes; + } + + i += segs; + } +} + +static int +simulate_io(struct iocb **iocbs, struct io_event *events, int num_iocbs) +{ + int i, done; + struct iocb *io; + struct io_event *ep; + + if (num_iocbs > 1) + done = (random() % (num_iocbs - 1)) + 1; + else + done = num_iocbs; + + for (i = 0; i < done; i++) { + io = iocbs[i]; + ep = &events[i]; + ep->obj = io; + ep->res = (random() % 10 < 8 ? io->u.c.nbytes : 0); + } + + return done; +} + +static inline void +process_events(struct opioctx *ctx, + struct iocb *iocb_list, struct io_event *events, int num) +{ + int i; + struct iocb *io; + + for (i = 0; i < num; i++) { + io = events[i].obj; + print_iocb(ctx, io); + if (data_idx(io->data) != (io - iocb_list)) { + printf("corrupt data! 
data_idx = %d, io = %d\n", + data_idx(io->data), (io - iocb_list)); + exit(-1); + } + if (data_is_head(io->data) || data_is_sparse(io->data)) + xfree(io->u.c.buf); + memset(io, 0, sizeof(struct iocb)); + } +} + +static inline void +init_optest(struct iocb *iocb_list, + struct iocb **iocbs, struct io_event *events, int num) +{ + int i; + + memset(iocb_list, 0, num * sizeof(struct iocb)); + memset(events, 0, num * sizeof(struct io_event)); + + for (i = 0; i < num; i++) + iocbs[i] = &iocb_list[i]; +} + +int +main(int argc, char **argv) +{ + uint64_t num_secs; + struct opioctx ctx; + struct io_event *events; + int i, c, num_runs, num_iocbs, seed; + struct iocb *iocb_list, **iocbs, **ioqueue; + + num_runs = 1; + num_iocbs = 300; + seed = time(NULL); + num_secs = ((4ULL << 20) >> 9); /* 4GB disk */ + + while ((c = getopt(argc, argv, "n:i:s:r:h")) != -1) { + switch (c) { + case 'n': + num_runs = atoi(optarg); + break; + case 'i': + num_iocbs = atoi(optarg); + break; + case 's': + num_secs = strtoull(optarg, NULL, 10); + break; + case 'r': + seed = atoi(optarg); + break; + case 'h': + usage(); + case '?': + fprintf(stderr, "Unrecognized option: -%c\n", optopt); + usage(); + } + } + + printf("Running %d tests with %d iocbs on %llu sectors, seed = %d\n", + num_runs, num_iocbs, num_secs, seed); + + srand(seed); + + iocb_list = malloc(num_iocbs * sizeof(struct iocb)); + iocbs = malloc(num_iocbs * sizeof(struct iocb *)); + events = malloc(num_iocbs * sizeof(struct io_event)); + + if (!iocb_list || !iocbs || !events || opio_init(&ctx, num_iocbs)) { + fprintf(stderr, "initialization failed\n"); + exit(ENOMEM); + } + + for (i = 0; i < num_runs; i++) { + int op_rem, op_done, num_split, num_events, num_done; + + ioqueue = iocbs; + init_optest(iocb_list, ioqueue, events, num_iocbs); + randomize_iocbs(ioqueue, num_iocbs, num_secs); + print_iocbs(&ctx, ioqueue, num_iocbs); + + op_done = 0; + num_done = 0; + op_rem = io_merge(&ctx, ioqueue, num_iocbs); + print_iocbs(&ctx, ioqueue, 
op_rem); + print_merged_iocbs(&ctx, ioqueue, op_rem); + + while (num_done < num_iocbs) { + DBG(&ctx, "optimized remaining: %d\n", op_rem); + + DBG(&ctx, "simulating\n"); + num_events = simulate_io(ioqueue + op_done, events, op_rem); + print_events(&ctx, events, num_events); + + DBG(&ctx, "splitting %d\n", num_events); + num_split = io_split(&ctx, events, num_events); + print_events(&ctx, events, num_split); + + DBG(&ctx, "processing %d\n", num_split); + process_events(&ctx, iocb_list, events, num_split); + + op_rem -= num_events; + op_done += num_events; + num_done += num_split; + } + + DBG(&ctx, "run %d: processed: %d, xallocs: %d, xfrees: %d\n", + i, num_done, xalloc_cnt, xfree_cnt); + if (xalloc_cnt != xfree_cnt) + exit(-1); + xalloc_cnt = xfree_cnt = 0; + } + + free(iocbs); + free(events); + free(iocb_list); + opio_free(&ctx); + + return 0; +} +#endif diff --git a/tools/blktap2/drivers/io-optimize.h b/tools/blktap2/drivers/io-optimize.h new file mode 100644 index 0000000000..9a0d86b6a9 --- /dev/null +++ b/tools/blktap2/drivers/io-optimize.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
#ifndef __IO_OPTIMIZE_H__
#define __IO_OPTIMIZE_H__

#include <libaio.h>

struct opio;

/* Ends of a chain of merged opios; maintained on the chain's head opio. */
struct opio_list {
	struct opio        *head;
	struct opio        *tail;
};

/*
 * Bookkeeping for one iocb that takes part in merging.  While an opio is
 * attached, the iocb's data field points at the opio and the iocb's
 * original values are kept here so they can be put back afterwards.
 */
struct opio {
	char               *buf;    /* original iocb u.c.buf */
	unsigned long       nbytes; /* original iocb u.c.nbytes */
	long long           offset; /* original iocb u.c.offset */
	void               *data;   /* original iocb data pointer */
	struct iocb        *iocb;   /* the iocb this opio describes */
	struct io_event     event;  /* NOTE(review): not referenced by
				     * io-optimize.c as far as visible */
	struct opio        *head;   /* head opio of the merged chain */
	struct opio        *next;   /* next opio in the merged chain */
	struct opio_list    list;   /* chain ends; used on the head opio */
};

/* State shared by all merge/split operations on one queue. */
struct opioctx {
	int                 num_opios;     /* capacity passed to opio_init() */
	int                 free_opio_cnt; /* entries on the free_opios stack */
	struct opio        *opios;         /* pool of num_opios opios */
	struct opio       **free_opios;    /* stack of unused pool entries */
	struct iocb       **iocb_queue;    /* scratch copy of an iocb queue */
	struct io_event    *event_queue;   /* scratch copy of an event array */
};

/* Allocate the pool and scratch queues; returns 0 or -ENOMEM. */
int opio_init(struct opioctx *ctx, int num_iocbs);
/* Release everything allocated by opio_init(). */
void opio_free(struct opioctx *ctx);
/* Merge contiguous iocbs in place; returns the number left on queue. */
int io_merge(struct opioctx *ctx, struct iocb **queue, int num);
/* Expand events of merged iocbs in place; returns the new event count. */
int io_split(struct opioctx *ctx, struct io_event *events, int num);
/* Undo merging for queue[idx..num-1]; returns the number of iocbs written. */
int io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num);

#endif
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This module implements a "dot locking" style advisory file locking algorithm. 
+ */ + +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <dirent.h> +#include <limits.h> +#include "lock.h" + +#define unlikely(x) __builtin_expect(!!(x), 0) + +/* format: xenlk.hostname.uuid.<xf><rw>*/ +#define LF_POSTFIX ".xenlk" +#define LFXL_FORMAT LF_POSTFIX ".%s.%s.x%s" +#define LFFL_FORMAT LF_POSTFIX ".%s.%s.f%s" +#define RETRY_MAX 16 + +#if defined(LOGS) +#define LOG(format, args...) printf("%d: ", __LINE__); printf(format, ## args) +#else +#define LOG(format, args...) +#endif + +/* random wait - up to .5 seconds */ +#define XSLEEP usleep(random() & 0x7ffff) + +typedef int (*eval_func)(char *name, int readonly); + +static char *create_lockfn(char *fn_to_lock) +{ + char *lockfn; + + /* allocate string to hold constructed lock file */ + lockfn = malloc(strlen(fn_to_lock) + strlen(LF_POSTFIX) + 1); + if (unlikely(!lockfn)) { + return 0; + } + + /* append postfix to file to lock */ + strcpy(lockfn, fn_to_lock); + strcat(lockfn, LF_POSTFIX); + + return lockfn; +} + +static char *create_lockfn_link(char *fn_to_lock, char *format, + char *uuid, int readonly) +{ + char hostname[128]; + char *lockfn_link; + char *ptr; + + /* get hostname */ + if (unlikely(gethostname(hostname, sizeof(hostname)) == -1)) { + return 0; + } + + /* allocate string to hold constructed lock file link */ + lockfn_link = malloc(strlen(fn_to_lock) + strlen(LF_POSTFIX) + + strlen(hostname) + strlen(uuid) + 8); + if (unlikely(!lockfn_link)) { + return 0; + } + + /* construct lock file link with specific format */ + strcpy(lockfn_link, fn_to_lock); + ptr = lockfn_link + strlen(lockfn_link); + sprintf(ptr, format, hostname, uuid, readonly ? 
"r" : "w"); + + return lockfn_link; +} + +static int NFSnormalizedStatTime(char *fn, struct stat *statnow, int *reterrno) +{ + int result = LOCK_OK; + int uniq; + char *buf; + int fd; + int pid = (int)getpid(); + int clstat; + + *reterrno = 0; + + /* create file to normalize time */ + srandom((int)time(0) ^ pid); + uniq = random() % 0xffffff; + buf = malloc(strlen(fn) + 24); + if (unlikely(!buf)) { result = LOCK_ENOMEM; goto finish; } + + strcpy(buf, fn); + sprintf(buf + strlen(buf), ".xen%08d.tmp", uniq); + + fd = open(buf, O_WRONLY | O_CREAT, 0644); + if (fd == -1) { *reterrno = errno; result = LOCK_EOPEN; goto finish; } + clstat = close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + if (lstat(buf, statnow) == -1) { + unlink(buf); + *reterrno = errno; + result = LOCK_ESTAT; + goto finish; + } + unlink(buf); + +finish: + return result; +} + +static int writer_eval(char *name, int readonly) +{ + return name[strlen(name)-1] == 'w'; +} + +static int reader_eval(char *name, int readonly) +{ + return name[strlen(name)-1] == 'r' && !readonly; +} + +static int lock_holder(char *fn, char *lockfn, char *lockfn_link, + int force, int readonly, int *stole, eval_func eval, + int *elt, int *ioerror) +{ + int status = 0; + int ustat; + DIR *pd = 0; + struct dirent *dptr; + char *ptr; + char *dirname = malloc(strlen(lockfn)); + char *uname = malloc(strlen(lockfn_link) + 8); + int elt_established = 0; + int fd; + char tmpbuf[4096]; + + *stole = 0; + *ioerror = 0; + *elt = 0; + + if (!dirname) goto finish; + if (!uname) goto finish; + + /* get directory */ + ptr = strrchr(lockfn, '/'); + if (!ptr) { + strcpy(dirname, "."); + } else { + int numbytes = ptr - lockfn; + strncpy(dirname, lockfn, numbytes); + dirname[numbytes] = '\0'; + } + pd = opendir(dirname); + if (!pd) { + *ioerror = errno ? errno : EIO; + goto finish; + } + + /* + * scan through directory entries and use eval function + * if we have a match (i.e. 
reader or writer lock) but + * note that if we are forcing, we will remove any and + * all locks that appear for target of our lock, regardless + * if it a reader/writer owns the lock. + */ + errno = 0; + dptr = readdir(pd); + if (!dptr) { + *ioerror = EIO; + } + while (dptr) { + char *p1 = strrchr(fn, '/'); + char *p2 = strrchr(lockfn, '/'); + char *p3 = strrchr(lockfn_link, '/'); + if (p1) p1+=1; + if (p2) p2+=1; + if (p3) p3+=1; + if (strcmp(dptr->d_name, p1 ? p1 : fn) && + strcmp(dptr->d_name, p2 ? p2 : lockfn) && + strcmp(dptr->d_name, p3 ? p3 : lockfn_link) && + !strncmp(dptr->d_name, p1 ? p1 : fn, strlen(p1?p1:fn))) { + strcpy(uname, dirname); + strcat(uname, "/"); + strcat(uname, dptr->d_name); + if (!elt_established) { + /* read final lock file and extract lease time */ + fd = open(uname, O_RDONLY, 0644); + memset(tmpbuf, 0, sizeof(tmpbuf)); + if (read(fd, tmpbuf, sizeof(tmpbuf)) < 0) { + *ioerror = errno; + status = 1; + close(fd); + goto finish; + } + close(fd); + ptr = strrchr(tmpbuf, '.'); + if (ptr) { + *elt = atoi(ptr+1); + elt_established = 1; + } + } + if (force) { + ustat = unlink(uname); + if (ustat == -1) { + LOG("failed to unlink %s\n", uname); + } + *stole = 1; + *elt = 0; + } else { + if ((*eval)(dptr->d_name, readonly)) { + closedir(pd); + status = 1; + goto finish; + } + } + } + dptr = readdir(pd); + if (!dptr & errno) { + *ioerror = EIO; + } + } + + closedir(pd); + +finish: + free(dirname); + free(uname); + + /* if IO error, force a taken status */ + return (*ioerror) ? 
1 : status; +} + +int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstatus) +{ + char *lockfn = 0; + char *lockfn_xlink = 0; + char *lockfn_flink = 0; + char *buf = 0; + int fd; + int status = 0; + struct stat stat1, stat2; + int retry_attempts = 0; + int clstat; + int tmpstat; + int stealx = 0; + int stealw = 0; + int stealr = 0; + int established_lease_time = 0; + char tmpbuf[4096]; + int ioerr; + + if (!fn_to_lock || !uuid) { + *retstatus = LOCK_EBADPARM; + return EINVAL; + } + + *retstatus = 0; + + /* seed random with time/pid combo */ + srandom((int)time(0) ^ getpid()); + + /* build lock file strings */ + lockfn = create_lockfn(fn_to_lock); + if (unlikely(!lockfn)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; } + + lockfn_xlink = create_lockfn_link(fn_to_lock, LFXL_FORMAT, + uuid, readonly); + if (unlikely(!lockfn_xlink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; } + + lockfn_flink = create_lockfn_link(fn_to_lock, LFFL_FORMAT, uuid, + readonly); + if (unlikely(!lockfn_flink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; } + +try_again: + if (retry_attempts++ > RETRY_MAX) { + if (*retstatus == LOCK_EXLOCK_OPEN) { + struct stat statnow, stat_exlock; + int diff; + + if (lstat(lockfn, &stat_exlock) == -1) { + goto finish; + } + + if (NFSnormalizedStatTime(fn_to_lock, &statnow, &ioerr)) { + goto finish; + } + + diff = (int)statnow.st_mtime - (int)stat_exlock.st_mtime; + if (diff > DEFAULT_LEASE_TIME_SECS) { + unlink(lockfn); + retry_attempts = 0; + goto try_again; + } + } + goto finish; + } + + /* try to open exlusive lockfile */ + fd = open(lockfn, O_WRONLY | O_CREAT | O_EXCL, 0644); + if (fd == -1) { + LOG("Initial lockfile creation failed %s force=%d, errno=%d\n", + lockfn, force, errno); + if (errno == EIO) { + *retstatus = LOCK_EXLOCK_OPEN; + status = EIO; + goto finish; + } + /* already owned? 
(hostname & uuid match, skip time bits) */ + errno = 0; + fd = open(lockfn, O_RDWR, 0644); + if (fd != -1) { + buf = malloc(strlen(lockfn_xlink)+1); + if (!buf) { + clstat = close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + *retstatus = LOCK_ENOMEM; + status = ENOMEM; + goto finish; + } + if (read(fd, buf, strlen(lockfn_xlink)) != + (strlen(lockfn_xlink))) { + clstat = close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + free(buf); + goto force_lock; + } + if (!strncmp(buf, lockfn_xlink, strlen(lockfn_xlink)-1)) { + LOG("lock owned by us, reasserting\n"); + /* our lock, reassert by rewriting below */ + if (lseek(fd, 0, SEEK_SET) == -1) { + clstat = close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + goto force_lock; + } + free(buf); + goto skip; + } + free(buf); + clstat = close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + } +force_lock: + if (errno == EIO) { + *retstatus = LOCK_EXLOCK_OPEN; + status = EIO; + goto finish; + } + if (force) { + /* remove lock file, we are forcing lock, try again */ + status = unlink(lockfn); + if (unlikely(status == -1)) { + if (errno == EIO) { + *retstatus = LOCK_EXLOCK_OPEN; + status = EIO; + goto finish; + } + LOG("force removal of %s lockfile failed, " + "errno=%d, trying again\n", lockfn, errno); + } + stealx = 1; + } + XSLEEP; + *retstatus = LOCK_EXLOCK_OPEN; + goto try_again; + } + + LOG("lockfile created %s\n", lockfn); + +skip: + /* + * write into the temporary xlock + */ + if (write(fd, lockfn_xlink, strlen(lockfn_xlink)) != + strlen(lockfn_xlink)) { + if (errno == EIO) { + *retstatus = LOCK_EXLOCK_WRITE; + status = EIO; + goto finish; + } + status = errno; + clstat = close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + XSLEEP; + *retstatus = LOCK_EXLOCK_WRITE; + if (unlink(lockfn) == -1) { + LOG("removal of %s lockfile failed, " + "errno=%d, trying again\n", lockfn, errno); + } + goto try_again; + } + clstat 
= close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + + while (retry_attempts++ < RETRY_MAX) { + tmpstat = link(lockfn, lockfn_xlink); + LOG("linking %s and %s\n", lockfn, lockfn_xlink); + if ((tmpstat == -1) && (errno != EEXIST)) { + LOG("link status is %d, errno=%d\n", tmpstat, errno); + } + + if ((lstat(lockfn, &stat1) == -1) || + (lstat(lockfn_xlink, &stat2) == -1)) { + /* try again, cleanup first */ + tmpstat = unlink(lockfn); + if (unlikely(tmpstat == -1)) { + LOG("error removing lock file %s", lockfn); + } + tmpstat = unlink(lockfn_xlink); + if (unlikely(tmpstat == -1)) { + LOG("error removing linked lock file %s", + lockfn_xlink); + } + XSLEEP; + status = LOCK_ESTAT; + goto finish; + } + + /* compare inodes */ + if (stat1.st_ino == stat2.st_ino) { + /* success, inodes are the same */ + /* should we check that st_nlink's are also 2?? */ + *retstatus = LOCK_OK; + status = 0; + tmpstat = unlink(lockfn_xlink); + if (unlikely(tmpstat == -1)) { + LOG("error removing linked lock file %s", + lockfn_xlink); + } + goto finish; + } else { + status = errno; + /* try again, cleanup first */ + tmpstat = unlink(lockfn); + if (unlikely(tmpstat == -1)) { + LOG("error removing lock file %s", lockfn); + } + tmpstat = unlink(lockfn_xlink); + if (unlikely(tmpstat == -1)) { + LOG("error removing linked lock file %s", + lockfn_xlink); + } + XSLEEP; + *retstatus = LOCK_EINODE; + goto try_again; + } + } + +finish: + if (!*retstatus) { + + /* we have exclusive lock */ + + status = 0; + + /* fast check, see if we own a final lock and are reasserting */ + if (!lstat(lockfn_flink, &stat1)) { + char *ptr; + + /* set the return value to notice this is a reassert */ + *retstatus = 1; + + /* read existing lock file and extract + established lease time */ + fd = open(lockfn_flink, O_RDONLY, 0644); + memset(tmpbuf, 0, sizeof(tmpbuf)); + if (read(fd, tmpbuf, sizeof(tmpbuf)) < 0) { + if (errno == EIO) { + close(fd); + *retstatus = LOCK_EINODE; + status = EIO; + goto 
skip_scan; + } + } + close(fd); + ptr = strrchr(tmpbuf, '.'); + if (ptr) { + *lease_time = atoi(ptr+1); + } else { + *lease_time = 10; /* wkchack */ + } + goto skip_scan; + } else { + if (errno == EIO) { + *retstatus = LOCK_EINODE; + status = EIO; + goto skip_scan; + } + } + + /* we allow exclusive writer, or multiple readers */ + if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force, + readonly, &stealw, writer_eval, + &established_lease_time, &ioerr)) { + if (ioerr) { + *retstatus = LOCK_EREAD; + status = ioerr; + goto skip_scan; + } + *retstatus = LOCK_EHELD_WR; + } else if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force, + readonly, &stealr, reader_eval, + &established_lease_time, &ioerr)) { + if (ioerr) { + *retstatus = LOCK_EREAD; + status = ioerr; + goto skip_scan; + } + *retstatus = LOCK_EHELD_RD; + } + if (established_lease_time) *lease_time = + established_lease_time; + } + +skip_scan: + if (*retstatus >= 0) { + /* update file, changes last modify time */ + fd = open(lockfn_flink, O_WRONLY | O_CREAT, 0644); + if (fd == -1) { + *retstatus = LOCK_EOPEN; + status = errno; + } else { + char tmpbuf[32]; + int failed_write; + memset(tmpbuf, 0, sizeof(tmpbuf)); + sprintf(tmpbuf, ".%d", *lease_time); + failed_write = write(fd, lockfn_flink, + strlen(lockfn_flink)) != + strlen(lockfn_flink); + if (failed_write) status = errno; + failed_write |= write(fd, tmpbuf, strlen(tmpbuf)) != + strlen(tmpbuf); + if (failed_write) status = errno; + if (failed_write) { + clstat = close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + XSLEEP; + *retstatus = LOCK_EUPDATE; + goto try_again; + } + } + clstat = close(fd); + if (unlikely(clstat == -1)) { + LOG("fail on close\n"); + } + } + + if (!*retstatus && force && (stealx || stealw || stealr)) { + struct timeval timeout; + + /* enforce quiet time on steal */ + timeout.tv_sec = *lease_time; + timeout.tv_usec = 0; + select(0, 0, 0, 0, &timeout); + } + + /* remove exclusive lock, final read/write locks 
will hold */ + tmpstat = unlink(lockfn); + if (unlikely(tmpstat == -1)) { + LOG("error removing exclusive lock file %s", + lockfn); + } + + free(lockfn); + free(lockfn_xlink); + free(lockfn_flink); + + /* set lease time to -1 if error, so no one is apt to use it */ + if (*retstatus < 0) *lease_time = -1; + + LOG("returning status %d, errno=%d\n", status, errno); + return status; +} + + +int unlock(char *fn_to_unlock, char *uuid, int readonly, int *status) +{ + char *lockfn_link = 0; + int reterrno = 0; + + if (!fn_to_unlock || !uuid) { + *status = LOCK_EBADPARM; + return 0; + } + + lockfn_link = create_lockfn_link(fn_to_unlock, LFFL_FORMAT, uuid, + readonly); + if (unlikely(!lockfn_link)) { *status = LOCK_ENOMEM; goto finish; } + + if (unlink(lockfn_link) == -1) { + LOG("error removing linked lock file %s", lockfn_link); + reterrno = errno; + *status = LOCK_ENOLOCK; + goto finish; + } + + *status = LOCK_OK; + +finish: + free(lockfn_link); + return reterrno; +} + +int lock_delta(char *fn, int *ret_lease, int *max_lease) +{ + int reterrno = 0; + DIR *pd = 0; + struct dirent *dptr; + char *ptr; + int result = INT_MAX; + struct stat statbuf, statnow; + char *dirname = malloc(strlen(fn)); + char *uname = malloc(strlen(fn) + 8); + int elt_established = 0; + char *dotptr; + char tmpbuf[4096]; + int fd; + + if (!fn || !dirname || !uname) { + *ret_lease = LOCK_EBADPARM; + *max_lease = -1; + return 0; + } + + if (NFSnormalizedStatTime(fn, &statnow, &reterrno)) { + result = LOCK_ESTAT; + goto finish; + } + + /* get directory */ + ptr = strrchr(fn, '/'); + if (!ptr) { + strcpy(dirname, "."); + ptr = fn; + } else { + int numbytes = ptr - fn; + strncpy(dirname, fn, numbytes); + ptr += 1; + } + pd = opendir(dirname); + if (!pd) { reterrno = errno; goto finish; } + + dptr = readdir(pd); + while (dptr) { + if (strcmp(dptr->d_name, ptr) && + !strncmp(dptr->d_name, ptr, strlen(ptr))) { + char *fpath = malloc(strlen(dptr->d_name) + + strlen(dirname) + 2); + if (!fpath) { + 
closedir(pd); + result = LOCK_ENOMEM; + goto finish; + } + strcpy(fpath, dirname); + strcat(fpath, "/"); + strcat(fpath, dptr->d_name); + if (lstat(fpath, &statbuf) != -1) { + int diff = (int)statnow.st_mtime - + (int)statbuf.st_mtime; + /* adjust diff if someone updated the lock + between now and when we created the "now" + file + */ + diff = (diff < 0) ? 0 : diff; + result = diff < result ? diff : result; + } else { + closedir(pd); + reterrno = errno; + goto finish; + } + + if (!elt_established) { + /* read final lock file and extract lease time */ + fd = open(fpath, O_RDONLY, 0644); + memset(tmpbuf, 0, sizeof(tmpbuf)); + if (read(fd, tmpbuf, sizeof(tmpbuf)) < 0) { + /* error on read? */ + } + close(fd); + dotptr = strrchr(tmpbuf, '.'); + if (dotptr) { + *max_lease = atoi(dotptr+1); + elt_established = 1; + } + } + + free(fpath); + } + dptr = readdir(pd); + } + + closedir(pd); + +finish: + free(dirname); + free(uname); + + /* returns smallest lock time, or error */ + if (result == INT_MAX) result = LOCK_ENOLOCK; + + /* set lease time to -1 if error, so no one is apt to use it */ + if ((result < 0) || reterrno) *max_lease = -1; + *ret_lease = result; + return reterrno; +} + +#if defined(TEST) +/* + * the following is for sanity testing. + */ + +static void usage(char *prg) +{ + printf("usage %s\n" + " dtr <filename>]\n" + " p <filename> [num iterations]\n" + " u <filename> [0|1] [<uniqid>]\n" + " l <filename> [0|1] [0|1] [<uniqid>] [<leasetime>]\n", prg); + printf(" p : perf test lock take and reassert\n"); + printf(" d : delta lock time\n"); + printf(" t : test the file (after random locks)\n"); + printf(" r : random lock tests (must ^C)\n"); + printf(" u : unlock, readonly? uniqID (default is PID)\n"); + printf(" l : lock, readonly? 
/*
 * test_file - sanity-check a file produced by the random lock test.
 *
 * Each record has the form "count pid time".  The counts are expected to
 * start at 0 and increase by exactly one per record; every record that
 * breaks the sequence is logged.
 *
 * @fn: path of the file to verify.
 */
static void test_file(char *fn)
{
	FILE *fptr;
	int prev_count = 0;
	int count, pid, stamp;

	fptr = fopen(fn, "r");
	if (!fptr) {
		LOG("ERROR on file %s open, errno=%d\n", fn, errno);
		return;
	}

	/*
	 * Loop on the conversion count rather than feof(): a record that
	 * fails to parse consumes no input, so feof() would never become
	 * true and stale values would be compared forever.
	 */
	while (fscanf(fptr, "%d %d %d\n", &count, &pid, &stamp) == 3) {
		if (prev_count != count) {
			LOG("ERROR: prev_count=%d, count=%d, pid=%d, time=%d\n",
			    prev_count, count, pid, stamp);
		}
		prev_count = count + 1;
	}

	/* stopped before end of file: the next record could not be parsed */
	if (!feof(fptr)) {
		LOG("ERROR: unparsable record in %s, expected count=%d\n",
		    fn, prev_count);
	}

	fclose(fptr);
}
LOG("fail on close\n"); + } + } + XSLEEP; + err = unlock(fn, uuid, readonly, &status); + LOG("unlock status is %d (err=%d)\n", status, err); + } + } +} + +static void perf_lock(char *fn, int loops) +{ + int sysstatus; + char buf[9]; + int start = loops; + int lease = DEFAULT_LEASE_TIME_SECS; + + sprintf(buf, "%08d", getpid()); + + while (loops--) { + sysstatus = lock(fn, buf, 0, 0, &lease, &status); + if (status < 0) { + printf("failed to get lock at iteration %d errno=%d\n", + start - loops, errno); + return; + } + } + unlock(fn, buf, 0, &status); +} + +int main(int argc, char *argv[]) +{ + int status; + char *ptr; + char uuid[12]; + int force; + int readonly; + int max_lease, cur_lease; + int intstatus; + int lease = DEFAULT_LEASE_TIME_SECS; + + if (argc < 3) { + usage(argv[0]); + return 0; + } + + sprintf(uuid, "%08d", getpid()); + ptr = uuid; + + if (!strcmp(argv[1],"d")) { + status = lock_delta(argv[2], &cur_lease, &max_lease); + + printf("lock delta for %s is %d seconds, max lease is %d\n", + argv[2], cur_lease, max_lease); + } else if (!strcmp(argv[1],"t")) { + test_file(argv[2]); + } else if (!strcmp(argv[1],"r")) { + random_locks(argv[2]); + } else if (!strcmp(argv[1],"p")) { + perf_lock(argv[2], argc < 3 ? 
100000 : atoi(argv[3])); + } else if (!strcmp(argv[1],"l")) { + if (argc < 4) force = 0; else force = atoi(argv[3]); + if (argc < 5) readonly = 0; else readonly = atoi(argv[4]); + if (argc >= 6) ptr = argv[5]; + if (argc == 7) lease = atoi(argv[6]); + status = lock(argv[2], ptr, readonly, force, &lease, &intstatus); + printf("lock status = %d\n", status); + } else if (!strcmp(argv[1],"u") ) { + if (argc < 5) readonly = 0; else readonly = atoi(argv[3]); + if (argc == 5) ptr = argv[4]; + status = unlock(argv[2], ptr, readonly, &intstatus); + printf("unlock status = %d\n", intstatus); + } else { + usage(argv[0]); + } + + return status; +} +#elif defined(UTIL) +/* + * the following is used for non-libary, standalone + * program utility as a shell program + */ + +static void usage(char *prg) +{ + printf("usage %s\n" + " delta <filename>\n" + " unlock <filename> <r|w> <uniqid>\n" + " lock <filename> <r|w> <0|1> <uniqid> <leasetime>\n", prg); + printf(" delta : get time since lock last refreshed\n"); + printf(" returns delta time and max lease time in seconds\n"); + printf(" unlock: unlock request filename, r|w, uniqID\n"); + printf(" returns status (success is 0)\n"); + printf(" lock : lock request filename, r|w, force?, uniqID, lease time request\n"); + printf(" returns status (success is 0) and established lease time in seconds\n"); +} + +int main(int argc, char *argv[]) +{ + int status = 0; + int dlock; + char *ptr; + int force; + int readonly; + int cur_lease, max_lease, intstatus; + int lease = DEFAULT_LEASE_TIME_SECS; + + if (argc < 3) { + if (argc == 2 && !strcmp(argv[1], "-h")) { + usage(argv[0]); + } else { + printf("%d\n", LOCK_EUSAGE); + } + return 0; + } + + if (!strcmp(argv[1],"delta") && (argc == 3)) { + status = lock_delta(argv[2], &cur_lease, &max_lease); + printf("%d %d\n", cur_lease, max_lease); + } else if (!strcmp(argv[1],"lock") && (argc == 7)) { + readonly = (strcmp(argv[3], "r") == 0) ? 
1 : 0; + force = atoi(argv[4]); + ptr = argv[5]; + lease = atoi(argv[6]); + status = lock(argv[2], ptr, force, readonly, &lease, &intstatus); + printf("%d %d\n", intstatus, lease); + } else if (!strcmp(argv[1],"unlock") && (argc == 5)) { + readonly = (strcmp(argv[3], "r") == 0) ? 1 : 0; + ptr = argv[4]; + status = unlock(argv[2], ptr, readonly, &intstatus); + printf("%d\n", intstatus); + } else { + printf("%d\n", LOCK_EUSAGE); + } + + /* this is either 0 or a system defined errno */ + return status; +} +#endif diff --git a/tools/blktap2/drivers/lock.h b/tools/blktap2/drivers/lock.h new file mode 100644 index 0000000000..98baaaa705 --- /dev/null +++ b/tools/blktap2/drivers/lock.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define DEFAULT_LEASE_TIME_SECS 30 + +int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstat); +int unlock(char *fn_to_unlock, char *uuid, int readonly, int *retstat); +int lock_delta(char *fn_to_check, int *cur_lease_time, int *max_lease_time); + +typedef enum { + LOCK_OK = 0, + LOCK_EBADPARM = -1, + LOCK_ENOMEM = -2, + LOCK_ESTAT = -3, + LOCK_EHELD_WR = -4, + LOCK_EHELD_RD = -5, + LOCK_EOPEN = -6, + LOCK_EXLOCK_OPEN = -7, + LOCK_EXLOCK_WRITE= -8, + LOCK_EINODE = -9, + LOCK_EUPDATE = -10, + LOCK_EREAD = -11, + LOCK_EREMOVE = -12, + LOCK_ENOLOCK = -13, + LOCK_EUSAGE = -14, +} lock_error; diff --git a/tools/blktap2/drivers/log.h b/tools/blktap2/drivers/log.h new file mode 100644 index 0000000000..8f00df4478 --- /dev/null +++ b/tools/blktap2/drivers/log.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* log.h: API for writelog communication */ + +#ifndef __LOG_H__ +#define __LOG_H__ 1 + +#include <inttypes.h> + +#include <xen/io/ring.h> +/* for wmb et al */ +#include <xenctrl.h> + +#define LOGCMD_SHMP "shmp" +#define LOGCMD_PEEK "peek" +#define LOGCMD_CLEAR "clrw" +#define LOGCMD_GET "getw" +#define LOGCMD_KICK "kick" + +#define CTLRSPLEN_SHMP 256 +#define CTLRSPLEN_PEEK 4 +#define CTLRSPLEN_CLEAR 4 +#define CTLRSPLEN_GET 4 +#define CTLRSPLEN_KICK 0 + +/* shmregion is arbitrarily capped at 8 megs for a minimum of + * 64 MB of data per read (if there are no contiguous regions) + * In the off-chance that there is more dirty data, multiple + * reads must be done */ +#define SHMSIZE (8 * 1024 * 1024) +#define SRINGSIZE 4096 + +/* The shared memory region is split up into 3 subregions: + * The first half is reserved for the dirty bitmap log. + * The second half begins with 1 page for read request descriptors, + * followed by a big area for supplying read data. 
+ */ +static inline void* bmstart(void* shm) +{ + return shm; +} + +static inline void* bmend(void* shm) +{ + return shm + SHMSIZE/2; +} + +static inline void* sringstart(void* shm) +{ + return bmend(shm); +} + +static inline void* sdatastart(void* shm) +{ + return sringstart(shm) + SRINGSIZE; +} + +static inline void* sdataend(void* shm) +{ + return shm + SHMSIZE; +} + +/* format for messages between log client and server */ +struct log_ctlmsg { + char msg[4]; + char params[16]; +}; + +/* extent descriptor */ +struct disk_range { + uint64_t sector; + uint32_t count; +}; + +/* dirty write logging space. This is an extent ring at the front, + * full of disk_ranges plus a pointer into the data area */ +/* I think I'd rather have the header in front of each data section to + * avoid having two separate spaces that can run out, but then I'd either + * lose page alignment on the data blocks or spend an entire page on the + * header */ + +struct log_extent { + uint64_t sector; + uint32_t count; + uint32_t offset; /* offset from start of data area to start of extent */ +}; + +/* struct above should be 16 bytes, or 256 extents/page */ + +typedef struct log_extent log_request_t; +typedef struct log_extent log_response_t; + +DEFINE_RING_TYPES(log, log_request_t, log_response_t); + +#define LOG_HEADER_PAGES 4 + +#endif diff --git a/tools/blktap2/drivers/profile.h b/tools/blktap2/drivers/profile.h new file mode 100644 index 0000000000..f628ba223e --- /dev/null +++ b/tools/blktap2/drivers/profile.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef __TAP_PROFILE_H__ +#define __TAP_PROFILE_H__ + +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <syslog.h> +#include <sys/time.h> +#include <time.h> +#include <fcntl.h> +#include <inttypes.h> + +//#define PROFILING +//#define LOGGING + +#define TAPPROF_IN 1 +#define TAPPROF_OUT 2 + +struct profile_times { + char *fn_name; + uint64_t in, out_sum, cnt; +}; + +struct profile_info { + FILE *log; + int size; + char *name; + unsigned long long seq; + struct profile_times *pt; +}; + +#ifdef PROFILING + +static inline void +tp_open(struct profile_info *prof, char *tap_name, char *log_name, int size) +{ + memset(prof, 0, sizeof(struct profile_info)); +#ifdef LOGGING + prof->log = fopen(log_name, "w"); +#endif + prof->size = size; + prof->name = strdup(tap_name); + prof->pt = malloc(sizeof(struct profile_times) * prof->size); + if (prof->pt) + memset(prof->pt, 0, sizeof(struct profile_times) * prof->size); +} + +static inline void +tp_close(struct profile_info *prof) +{ + int i; + struct profile_times *pt; + + for (i = 0; i < prof->size; i++) { + pt = &prof->pt[i]; + if (pt->fn_name) { + syslog(LOG_DEBUG, "%s: %s: cnt: %llu, avg time: %llu\n", + prof->name, pt->fn_name, pt->cnt, + ((pt->cnt) ? 
(pt->out_sum / pt->cnt) : 0)); + free(pt->fn_name); + } + } + +#ifdef LOGGING + if (prof->log) + fclose(prof->log); +#endif + free(prof->name); + if (prof->pt) + free(prof->pt); +} + +static inline u64 +tp_get_id(struct profile_info *prof) +{ + return prof->seq++; +} + +static inline int +tp_fn_id(struct profile_info *prof, const char *name) +{ + int i; + struct profile_times *pt; + + for (i = 0; i < prof->size; i++) { + pt = &prof->pt[i]; + if (!pt->fn_name) + return i; + if (!strcmp(pt->fn_name, name)) + return i; + } + + return prof->size - 1; +} + +static inline void +__tp_in(struct profile_info *prof, const char *func) +{ + long long _time; + int idx = tp_fn_id(prof, func); + struct profile_times *pt = &prof->pt[idx]; + + if (!pt->fn_name) + pt->fn_name = strdup(func); + + asm volatile(".byte 0x0f, 0x31" : "=A" (_time)); + pt->in = _time; +} + +#define tp_in(prof) __tp_in(prof, __func__) + +static inline void +__tp_out(struct profile_info *prof, const char *func) +{ + long long _time; + int idx = tp_fn_id(prof, func); + struct profile_times *pt = &prof->pt[idx]; + + if (!pt->fn_name || !pt->in) + return; + + asm volatile(".byte 0x0f, 0x31" : "=A" (_time)); + pt->cnt++; + pt->out_sum += (_time - pt->in); + pt->in = 0; +} + +#define tp_out(prof) __tp_out(prof, __func__) + +static inline void +__tp_log(struct profile_info *prof, u64 id, const char *func, int direction) +{ + long long _time; + asm volatile(".byte 0x0f, 0x31" : "=A" (_time)); + + if (direction == TAPPROF_IN) + __tp_in(prof, func); + else + __tp_out(prof, func); + +#ifdef LOGGING + if (prof->log) + fprintf(prof->log, "%s: %s: %llu, %lld\n", func, + ((direction == TAPPROF_IN) ? 
"in" : "out"), id, _time); +#endif +} + +#define tp_log(prof, id, direction) __tp_log(prof, id, __func__, direction) + +#else +#define tp_open(prof, tname, lname, size) ((void)0) +#define tp_close(prof) ((void)0) +#define tp_in(prof) ((void)0) +#define tp_out(prof) ((void)0) +#define tp_log(prof, sec, direction) ((void)0) +#endif + +#endif diff --git a/tools/blktap2/drivers/qcow-create.c b/tools/blktap2/drivers/qcow-create.c new file mode 100644 index 0000000000..6a641af95f --- /dev/null +++ b/tools/blktap2/drivers/qcow-create.c @@ -0,0 +1,121 @@ +/* qcow-create.c + * + * Generates a qcow format disk. + * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <string.h> +#include "tapdisk.h" +#include "qcow.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ) +#else +#define DFPRINTF(_f, _a...) ((void)0) +#endif + +#define MAX_NAME_LEN 1000 + +void help(void) +{ + fprintf(stderr, "Qcow-utils: v1.0.0\n"); + fprintf(stderr, + "usage: qcow-create [-h help] [-r reserve] <SIZE(MB)> <FILENAME> " + "[<BACKING_FILENAME>]\n"); + exit(-1); +} + +int main(int argc, char *argv[]) +{ + int ret = -1, c, backed = 0; + int sparse = 1; + uint64_t size; + char filename[MAX_NAME_LEN], bfilename[MAX_NAME_LEN]; + + for(;;) { + c = getopt(argc, argv, "hr"); + if (c == -1) + break; + switch(c) { + case 'h': + help(); + exit(0); + break; + case 'r': + sparse = 0; + break; + default: + fprintf(stderr, "Unknown option\n"); + help(); + } + } + + printf("Optind %d, argc %d\n", optind, argc); + if ( !(optind == (argc - 2) || optind == (argc - 3)) ) + help(); + + size = atoi(argv[optind++]); + size = size << 20; + + if (snprintf(filename, MAX_NAME_LEN, "%s",argv[optind++]) >= + MAX_NAME_LEN) { + fprintf(stderr,"Device name too long\n"); + exit(-1); + } + + if (optind != argc) { + /*Backing file argument*/ + backed = 1; + if (snprintf(bfilename, MAX_NAME_LEN, "%s",argv[optind++]) >= + MAX_NAME_LEN) { + fprintf(stderr,"Device name too long\n"); + exit(-1); + } + } + + DFPRINTF("Creating file size %"PRIu64", name %s\n",(uint64_t)size, filename); + if (!backed) + ret = qcow_create(filename,size,NULL,sparse); + else + ret = qcow_create(filename,size,bfilename,sparse); + + if 
(ret < 0) + DPRINTF("Unable to create QCOW file\n"); + else + DPRINTF("QCOW file successfully created\n"); + + return 0; +} diff --git a/tools/blktap2/drivers/qcow.h b/tools/blktap2/drivers/qcow.h new file mode 100644 index 0000000000..a88f1d5d92 --- /dev/null +++ b/tools/blktap2/drivers/qcow.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*
 * ROUNDUP(l, s) - round l up to the next multiple of s, as a uint64_t.
 *
 * Both parameters are fully parenthesised so that expression arguments
 * such as ROUNDUP(a + b, 1 << bits) expand correctly.  Each argument is
 * evaluated more than once: do not pass expressions with side effects.
 * s must be non-zero.
 */
#define ROUNDUP(l, s) \
	((uint64_t)( \
		((l) + ((s) - 1)) - (((l) + ((s) - 1)) % (s))))
alloc*/ + int sparse; /*Indicates whether to preserve sparseness*/ + int l2_bits; /*Size of L2 table entry*/ + int l2_size; /*Full table size*/ + int l1_size; /*L1 table size*/ + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; /*L1 table offset from beginning of + *file*/ + uint64_t *l1_table; /*L1 table entries*/ + uint64_t *l2_cache; /*We maintain a cache of size + *L2_CACHE_SIZE of most read entries*/ + uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /*L2 cache entries*/ + uint32_t l2_cache_counts[L2_CACHE_SIZE]; /*Cache access record*/ + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; /**/ + uint32_t crypt_method; /*current crypt method, 0 if no + *key yet */ + uint32_t crypt_method_header; /**/ + AES_KEY aes_encrypt_key; /*AES key*/ + AES_KEY aes_decrypt_key; /*AES key*/ + + /* libaio state */ + int aio_free_count; + int max_aio_reqs; + struct qcow_request *aio_requests; + struct qcow_request **aio_free_list; + +}; + +int qcow_create(const char *filename, uint64_t total_size, + const char *backing_file, int sparse); + +#endif //_QCOW_H_ diff --git a/tools/blktap2/drivers/qcow2raw.c b/tools/blktap2/drivers/qcow2raw.c new file mode 100644 index 0000000000..689e7f5cd1 --- /dev/null +++ b/tools/blktap2/drivers/qcow2raw.c @@ -0,0 +1,449 @@ +/* qcow2raw.c + * + * Generates raw image data from an existing qcow image + * + * (c) 2006 Julian Chesterfield and Andrew Warfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, 
modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <inttypes.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <string.h> + +#include "bswap.h" +#include "aes.h" +#include "blk.h" +#include "tapdisk.h" +#include "tapdisk-server.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" +#include "qcow.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ) +#else +#define DFPRINTF(_f, _a...) 
((void)0) +#endif + + +/* *BSD has no O_LARGEFILE */ +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif + +#define TAPDISK 1 +#define BLOCK_PROCESSSZ 4096 +#define QCOW_VBD 0 +#define AIO_VBD 1 +#define WINDOW 32 +#define PROGRESS_QUANT 2 + +static int running = 1, complete = 0; +static int returned_read_events = 0, returned_write_events = 0; +static int submit_events = 0; +static uint32_t read_idx = 0; +td_driver_t *ddqcow, *ddaio; +td_vbd_t* qcow_vbd, *aio_vbd; +static uint64_t prev = 0, written = 0; +static char output[(100/PROGRESS_QUANT) + 5]; + +extern tapdisk_server_t server; + +struct request_info { + void* buf; + uint64_t logical_sec; + int pending; +}; + +static void print_bytes(void *ptr, int length) +{ + int i,k; + unsigned char *p = ptr; + + DFPRINTF("Buf dump, length %d:\n",length); + for (k = 0; k < length; k++) { + DFPRINTF("%x",*p); + *p++; + if (k % 16 == 0) DFPRINTF("\n"); + else if (k % 2 == 0) DFPRINTF(" "); + } + DFPRINTF("\n"); + return; +} + +void +queue_event(event_id_t id, char mode, void *private) +{ + tapdisk_complete_tiocbs(&server.aio_queue); +} + +static void debug_output(uint64_t progress, uint64_t size) +{ + //Output progress every PROGRESS_QUANT + uint64_t blocks = size/(100/PROGRESS_QUANT); + + if (progress/blocks > prev) { + memcpy(output+prev+1,"=>",2); + prev++; + DFPRINTF("\r%s %"PRIu64"%%", + output, (uint64_t)((prev-1)*PROGRESS_QUANT)); + } + return; +} + +static void send_write_responses(td_request_t treq, int err) +{ + struct request_info* req; + + if (err < 0) { + DFPRINTF("AIO FAILURE: res [%d]!\n",err); + return; + } + returned_write_events+=treq.secs; + written += treq.secs; + + req= (struct request_info*)treq.cb_data; + + //Wait for whole request to complete. + req->pending-=treq.secs; + if(req->pending) + return; + + //Whole request has completed, we can free buffers. 
+ free(req->buf); + free(req); + + debug_output(written, ddaio->info.size); + + return; +} + +static void send_read_responses(td_request_t treq, int err) +{ + int ret; + struct request_info* req; + td_vbd_request_t* vreq; + + if (err < 0) { + DFPRINTF("AIO FAILURE: res [%d]!\n",err); + return; + } + returned_read_events+=treq.secs; + + req= (struct request_info*)treq.cb_data; + + //do nothing until all fragments complete. + req->pending-=treq.secs; + + if(req->pending) + return; + + //This read is done. + tapdisk_vbd_complete_vbd_request(qcow_vbd, treq.private); + + + treq.op = TD_OP_WRITE; + treq.buf = req->buf; + treq.sec = req->logical_sec; + treq.secs = BLOCK_PROCESSSZ>>9; + treq.image = tapdisk_vbd_first_image(aio_vbd); + treq.cb = send_write_responses; + treq.id = 0; + treq.sidx = 0; + + req->pending = BLOCK_PROCESSSZ>>9; + treq.cb_data = req; + + vreq = calloc(1, sizeof(td_vbd_request_t)); + treq.private = vreq; + + //Put it in the VBD's queue, so we don't lose + //track of it. + vreq->submitting = 1; + INIT_LIST_HEAD(&vreq->next); + tapdisk_vbd_move_request(treq.private, + &aio_vbd->pending_requests); + + ddaio->ops->td_queue_write(ddaio,treq); + --vreq->submitting; + + tapdisk_submit_all_tiocbs(&server.aio_queue); + + return; +} + +int main(int argc, const char *argv[]) +{ + int ret = -1, fd, len,input; + uint64_t size; + struct timeval timeout; + uint64_t i; + char *buf; + struct stat finfo; + td_request_t treq; + td_vbd_request_t* vreq; + struct request_info* req; + int err; + + if (argc != 3) { + fprintf(stderr, "Qcow-utils: v1.0.0\n"); + fprintf(stderr, "usage: %s <Dest File descriptor> " + "<Qcow SRC IMAGE>\n", + argv[0]); + exit(-1); + } + + err = tapdisk_server_initialize(NULL, NULL); + if( err ) { + DPRINTF("qcow2raw Couldn't initialize server instance.\n"); + return err; + } + + err=tapdisk_vbd_initialize(-1,-1, QCOW_VBD); + if( err ) { + DPRINTF("qcow2raw Couldn't initialize qcow vbd.\n"); + return err; + } + + qcow_vbd = 
tapdisk_server_get_vbd(QCOW_VBD); + if (!qcow_vbd) { + err = -ENODEV; + DPRINTF("qcow2raw Couldn't create qcow vbd.\n"); + return err; + } + + err = tapdisk_vbd_open_vdi(qcow_vbd, argv[2], DISK_TYPE_QCOW, + TAPDISK_STORAGE_TYPE_DEFAULT, + TD_OPEN_RDONLY); + if( err ) { + DPRINTF("qcow2raw Couldn't open qcow file.\n"); + return err; + } + + ddqcow=(tapdisk_vbd_first_image(qcow_vbd))->driver; + + /*Setup aio destination file*/ + ret = stat(argv[1],&finfo); + if (ret == -1) { + /*Check errno*/ + switch(errno) { + case ENOENT: + /*File doesn't exist, create*/ + fd = open(argv[1], + O_RDWR | O_LARGEFILE | O_CREAT, 0644); + if (fd < 0) { + DFPRINTF("ERROR creating file [%s] " + "(errno %d)\n", + argv[1], 0 - errno); + exit(-1); + } + if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) { + DFPRINTF("Unable to create file " + "[%s] of size %"PRIu64" (errno %d). " + "Exiting...\n", + argv[1], + (uint64_t)ddqcow->info.size<<9, + 0 - errno); + close(fd); + exit(-1); + } + close(fd); + break; + case ENXIO: + DFPRINTF("ERROR Device [%s] does not exist\n",argv[1]); + exit(-1); + default: + DFPRINTF("An error occurred opening Device [%s] " + "(errno %d)\n", + argv[1], 0 - errno); + exit(-1); + } + } else { + fprintf(stderr, "WARNING: All existing data in " + "%s will be overwritten.\nDo you wish to continue? 
" + "(y or n) ", + argv[1]); + if (getchar() != 'y') { + DFPRINTF("Exiting...\n"); + exit(-1); + } + + /*TODO - Test the existing file or device for adequate space*/ + fd = open(argv[1], O_RDWR | O_LARGEFILE); + if (fd < 0) { + DFPRINTF("ERROR: opening file [%s] (errno %d)\n", + argv[1], 0 - errno); + exit(-1); + } + + if (S_ISBLK(finfo.st_mode)) { + if (blk_getimagesize(fd, &size) != 0) { + close(fd); + return -1; + } + + if (size < ddqcow->info.size<<9) { + DFPRINTF("ERROR: Not enough space on device " + "%s (%"PRIu64" bytes available, " + "%"PRIu64" bytes required\n", + argv[1], size, + (uint64_t)ddqcow->info.size<<9); + close(fd); + exit(-1); + } + } else { + if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) { + DFPRINTF("Unable to create file " + "[%s] of size %"PRIu64" (errno %d). " + "Exiting...\n", + argv[1], + (uint64_t)ddqcow->info.size<<9, + 0 - errno); + close(fd); + exit(-1); + } else DFPRINTF("File [%s] truncated to length %"PRIu64" " + "(%"PRIu64")\n", + argv[1], + (uint64_t)ddqcow->info.size<<9, + (uint64_t)ddqcow->info.size); + } + close(fd); + } + + //Now the output file should be there, reopen it as an aio VBD + err=tapdisk_vbd_initialize(-1,-1, AIO_VBD); + if( err ) { + DPRINTF("qcow2raw Couldn't initialize aio vbd.\n"); + return err; + } + + aio_vbd = tapdisk_server_get_vbd(AIO_VBD); + if (!aio_vbd) { + err = -ENODEV; + DPRINTF("qcow2raw Couldn't create aio vbd.\n"); + return err; + } + + err = tapdisk_vbd_open_vdi(aio_vbd, argv[1], DISK_TYPE_AIO, + TAPDISK_STORAGE_TYPE_DEFAULT, + 0); + if( err ) { + DPRINTF("qcow2raw Couldn't open aio file.\n"); + return err; + } + + ddaio=(tapdisk_vbd_first_image(aio_vbd))->driver; + + /*Initialise the output string*/ + memset(output,0x20,(100/PROGRESS_QUANT)+5); + output[0] = '['; + output[(100/PROGRESS_QUANT)+2] = ']'; + output[(100/PROGRESS_QUANT)+3] = '\0'; + DFPRINTF("%s",output); + + i = 0; + while (running) { + timeout.tv_sec = 0; + + if (!complete) { + /*Read Pages from qcow image*/ + if ( (ret = 
posix_memalign((void **)&buf, + BLOCK_PROCESSSZ, + BLOCK_PROCESSSZ)) + != 0) { + DFPRINTF("Unable to alloc memory (%d)\n",ret); + exit(-1); + } + + /*Attempt to read 4k sized blocks*/ + submit_events+=BLOCK_PROCESSSZ>>9; + + //Set up the read request + treq.op = TD_OP_READ; + treq.buf = buf; + treq.sec = i; + treq.secs = BLOCK_PROCESSSZ>>9; + treq.image = tapdisk_vbd_first_image(qcow_vbd); + treq.cb = send_read_responses; + treq.id = 0; + treq.sidx = 0; + + req = calloc(1, sizeof(struct request_info)); + req->buf = buf; + req->logical_sec = i; + req->pending = BLOCK_PROCESSSZ>>9; + treq.cb_data = req; + + vreq = calloc(1, sizeof(td_vbd_request_t)); + treq.private = vreq; + + //Put it in the VBD's queue, so we don't lose + //track of it. + vreq->submitting = 1; + INIT_LIST_HEAD(&vreq->next); + tapdisk_vbd_move_request(treq.private, + &qcow_vbd->pending_requests); + + ddqcow->ops->td_queue_read(ddqcow, treq); + --vreq->submitting; + + i += BLOCK_PROCESSSZ>>9; + + if (i >= ddqcow->info.size) + complete = 1; + + + tapdisk_submit_all_tiocbs(&server.aio_queue); + } + + + while(returned_write_events != submit_events) { + ret = scheduler_wait_for_events(&server.scheduler); + if (ret < 0) { + DFPRINTF("server wait returned %d\n", ret); + sleep(2); + } + } + if (complete && (returned_write_events == submit_events)) + running = 0; + } + memcpy(output+prev+1,"=",1); + DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output); + + ddqcow->ops->td_close(ddqcow); + ddaio->ops->td_close(ddaio); + free(ddqcow->data); + free(ddaio->data); + + return 0; +} diff --git a/tools/blktap2/drivers/scheduler.c b/tools/blktap2/drivers/scheduler.c new file mode 100644 index 0000000000..6b8d0093e7 --- /dev/null +++ b/tools/blktap2/drivers/scheduler.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/time.h> + +#include "scheduler.h" +#include "tapdisk-log.h" + +#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a) + +#define SCHEDULER_MAX_TIMEOUT 600 +#define SCHEDULER_POLL_FD (SCHEDULER_POLL_READ_FD | \ + SCHEDULER_POLL_WRITE_FD | \ + SCHEDULER_POLL_EXCEPT_FD) + +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) +#define MAX(a, b) ((a) >= (b) ? 
(a) : (b)) + +#define scheduler_for_each_event(s, event, tmp) \ + list_for_each_entry_safe(event, tmp, &(s)->events, next) + +typedef struct event { + char mode; + event_id_t id; + + int fd; + int timeout; + int deadline; + + event_cb_t cb; + void *private; + + struct list_head next; +} event_t; + +static void +scheduler_prepare_events(scheduler_t *s) +{ + int diff; + struct timeval now; + event_t *event, *tmp; + + FD_ZERO(&s->read_fds); + FD_ZERO(&s->write_fds); + FD_ZERO(&s->except_fds); + + s->max_fd = 0; + s->timeout = SCHEDULER_MAX_TIMEOUT; + + gettimeofday(&now, NULL); + + scheduler_for_each_event(s, event, tmp) { + if (event->mode & SCHEDULER_POLL_READ_FD) { + FD_SET(event->fd, &s->read_fds); + s->max_fd = MAX(event->fd, s->max_fd); + } + + if (event->mode & SCHEDULER_POLL_WRITE_FD) { + FD_SET(event->fd, &s->write_fds); + s->max_fd = MAX(event->fd, s->max_fd); + } + + if (event->mode & SCHEDULER_POLL_EXCEPT_FD) { + FD_SET(event->fd, &s->except_fds); + s->max_fd = MAX(event->fd, s->max_fd); + } + + if (event->mode & SCHEDULER_POLL_TIMEOUT) { + diff = event->deadline - now.tv_sec; + if (diff > 0) + s->timeout = MIN(s->timeout, diff); + else + s->timeout = 0; + } + } + + s->timeout = MIN(s->timeout, s->max_timeout); +} + +static void +scheduler_event_callback(event_t *event, char mode) +{ + if (event->mode & SCHEDULER_POLL_TIMEOUT) { + struct timeval now; + gettimeofday(&now, NULL); + event->deadline = now.tv_sec + event->timeout; + } + + event->cb(event->id, mode, event->private); +} + +static void +scheduler_run_events(scheduler_t *s) +{ + struct timeval now; + event_t *event, *tmp; + + gettimeofday(&now, NULL); + + again: + s->restart = 0; + + scheduler_for_each_event(s, event, tmp) { + if ((event->mode & SCHEDULER_POLL_READ_FD) && + FD_ISSET(event->fd, &s->read_fds)) { + FD_CLR(event->fd, &s->read_fds); + scheduler_event_callback(event, SCHEDULER_POLL_READ_FD); + goto next; + } + + if ((event->mode & SCHEDULER_POLL_WRITE_FD) && + FD_ISSET(event->fd, 
&s->write_fds)) { + FD_CLR(event->fd, &s->write_fds); + scheduler_event_callback(event, SCHEDULER_POLL_WRITE_FD); + goto next; + } + + if ((event->mode & SCHEDULER_POLL_EXCEPT_FD) && + FD_ISSET(event->fd, &s->except_fds)) { + FD_CLR(event->fd, &s->except_fds); + scheduler_event_callback(event, SCHEDULER_POLL_EXCEPT_FD); + goto next; + } + + if ((event->mode & SCHEDULER_POLL_TIMEOUT) && + (event->deadline <= now.tv_sec)) + scheduler_event_callback(event, SCHEDULER_POLL_TIMEOUT); + + next: + if (s->restart) + goto again; + } +} + +int +scheduler_register_event(scheduler_t *s, char mode, int fd, + int timeout, event_cb_t cb, void *private) +{ + event_t *event; + struct timeval now; + + if (!cb) + return -EINVAL; + + if (!(mode & SCHEDULER_POLL_TIMEOUT) && !(mode & SCHEDULER_POLL_FD)) + return -EINVAL; + + event = calloc(1, sizeof(event_t)); + if (!event) + return -ENOMEM; + + gettimeofday(&now, NULL); + + INIT_LIST_HEAD(&event->next); + + event->mode = mode; + event->fd = fd; + event->timeout = timeout; + event->deadline = now.tv_sec + timeout; + event->cb = cb; + event->private = private; + event->id = s->uuid++; + + if (!s->uuid) + s->uuid++; + + list_add_tail(&event->next, &s->events); + + return event->id; +} + +void +scheduler_unregister_event(scheduler_t *s, event_id_t id) +{ + event_t *event, *tmp; + + if (!id) + return; + + scheduler_for_each_event(s, event, tmp) + if (event->id == id) { + list_del(&event->next); + free(event); + s->restart = 1; + break; + } +} + +void +scheduler_set_max_timeout(scheduler_t *s, int timeout) +{ + if (timeout >= 0) + s->max_timeout = MIN(s->max_timeout, timeout); +} + +int +scheduler_wait_for_events(scheduler_t *s) +{ + int ret; + struct timeval tv; + + scheduler_prepare_events(s); + + tv.tv_sec = s->timeout; + tv.tv_usec = 0; + + DBG("timeout: %d, max_timeout: %d\n", + s->timeout, s->max_timeout); + + ret = select(s->max_fd + 1, &s->read_fds, + &s->write_fds, &s->except_fds, &tv); + + s->restart = 0; + s->timeout = 
SCHEDULER_MAX_TIMEOUT; + s->max_timeout = SCHEDULER_MAX_TIMEOUT; + + if (ret < 0) + return ret; + + scheduler_run_events(s); + + return ret; +} + +void +scheduler_initialize(scheduler_t *s) +{ + memset(s, 0, sizeof(scheduler_t)); + + s->uuid = 1; + + FD_ZERO(&s->read_fds); + FD_ZERO(&s->write_fds); + FD_ZERO(&s->except_fds); + + INIT_LIST_HEAD(&s->events); +} diff --git a/tools/blktap2/drivers/scheduler.h b/tools/blktap2/drivers/scheduler.h new file mode 100644 index 0000000000..ea37e8f837 --- /dev/null +++ b/tools/blktap2/drivers/scheduler.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _SCHEDULER_H_ +#define _SCHEDULER_H_ + +#include <sys/select.h> + +#include "list.h" + +#define SCHEDULER_POLL_READ_FD 0x1 +#define SCHEDULER_POLL_WRITE_FD 0x2 +#define SCHEDULER_POLL_EXCEPT_FD 0x4 +#define SCHEDULER_POLL_TIMEOUT 0x8 + +typedef int event_id_t; +typedef void (*event_cb_t) (event_id_t id, char mode, void *private); + +typedef struct scheduler { + fd_set read_fds; + fd_set write_fds; + fd_set except_fds; + + struct list_head events; + + int uuid; + int max_fd; + int timeout; + int restart; + int max_timeout; +} scheduler_t; + +void scheduler_initialize(scheduler_t *); +event_id_t scheduler_register_event(scheduler_t *, char mode, + int fd, int timeout, + event_cb_t cb, void *private); +void scheduler_unregister_event(scheduler_t *, event_id_t); +void scheduler_set_max_timeout(scheduler_t *, int); +int scheduler_wait_for_events(scheduler_t *); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-client.c b/tools/blktap2/drivers/tapdisk-client.c new file mode 100644 index 0000000000..c85b5fc530 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-client.c @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* client harness for tapdisk log */ + +#include <errno.h> +#include <fcntl.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> + +#include "log.h" + +#define BDPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a) + +#define BWPRINTF(_f, _a...) 
fprintf (stderr, "log: " _f "\n", ## _a) + +struct writelog { + char* shmpath; + uint32_t shmsize; + void* shm; + + /* next unprocessed item in the writelog */ + void* cur; + unsigned int inflight; + + /* pointer to start and end of free data space for requests */ + void* dhd; + void* dtl; + + log_sring_t* sring; + log_front_ring_t fring; +}; + +/* bytes free on the data ring */ +static inline unsigned int dring_avail(struct writelog* wl) +{ + /* one byte reserved to distinguish empty from full */ + if (wl->dhd == wl->dtl) + return sdataend(wl->shm) - sdatastart(wl->shm) - 1; + + if (wl->dhd < wl->dtl) + return wl->dtl - wl->dhd - 1; + + return (sdataend(wl->shm) - wl->dhd) + (wl->dtl - sdatastart(wl->shm)) - 1; +} + +/* advance ring pointer by len bytes */ +static inline void* dring_advance(struct writelog* wl, void* start, size_t len) +{ + void* next; + int dsz = sdataend(wl->shm) - sdatastart(wl->shm); + + next = start + (len % dsz); + if (next > sdataend(wl->shm)) + next -= dsz; + + return next; +} + +static void usage(void) +{ + fprintf(stderr, "usage: tapdisk-client <sock>\n"); +} + +/* returns socket file descriptor */ +static int tdctl_open(const char* sockpath) +{ + struct sockaddr_un saddr; + int fd; + + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + BWPRINTF("error creating socket: %s", strerror(errno)); + return -1; + } + + memset(&saddr, 0, sizeof(saddr)); + saddr.sun_family = AF_UNIX; + memcpy(saddr.sun_path, sockpath, strlen(sockpath)); + + if (connect(fd, &saddr, sizeof(saddr)) < 0) { + BWPRINTF("error connecting to socket %s: %s", sockpath, strerror(errno)); + close(fd); + return -1; + } + + return fd; +} + +static int ctl_talk(int fd, struct log_ctlmsg* msg, char* rsp, int rsplen) +{ + int rc; + + if ((rc = write(fd, msg, sizeof(*msg))) < 0) { + BWPRINTF("error sending ctl request: %s", strerror(errno)); + return -1; + } else if (rc < sizeof(*msg)) { + BWPRINTF("short ctl write (%d/%zd bytes)", rc, sizeof(*msg)); + return -1; + } + + if 
(!rsplen) + return 0; + + if ((rc = read(fd, rsp, rsplen)) < 0) { + BWPRINTF("error reading ctl response: %s", strerror(errno)); + return -1; + } else if (rc < rsplen) { + BWPRINTF("short ctl read (%d/%d bytes)", rc, rsplen); + return -1; + } + + return 0; +} + +static int ctl_get_shmem(int fd, struct writelog* wl) +{ + struct log_ctlmsg req; + char rsp[CTLRSPLEN_SHMP + 1]; + int rc; + + memset(&req, 0, sizeof(req)); + memset(rsp, 0, sizeof(rsp)); + + memcpy(req.msg, LOGCMD_SHMP, 4); + if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_SHMP)) < 0) { + BWPRINTF("error getting shared memory parameters"); + return -1; + } + + memcpy(&wl->shmsize, rsp, sizeof(wl->shmsize)); + wl->shmpath = strdup(rsp + sizeof(wl->shmsize)); + + BDPRINTF("shared memory parameters: size: %u, path: %s", + wl->shmsize, wl->shmpath); + + return 0; +} + +static void ctlmsg_init(struct log_ctlmsg* msg, const char* cmd) +{ + memset(msg, 0, sizeof(*msg)); + memcpy(msg->msg, cmd, 4); +} + +static int ctl_get_writes(int fd) +{ + struct log_ctlmsg req; + char rsp[CTLRSPLEN_GET]; + int rc; + + ctlmsg_init(&req, LOGCMD_GET); + + if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_GET)) < 0) { + BWPRINTF("error getting writes"); + return -1; + } + + return 0; +} + +static int ctl_peek_writes(int fd) +{ + struct log_ctlmsg req; + char rsp[CTLRSPLEN_PEEK]; + int rc; + + ctlmsg_init(&req, LOGCMD_PEEK); + + if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_PEEK)) < 0) { + BWPRINTF("error peeking writes"); + return -1; + } + + return 0; +} + +/* submit pending requests */ +static int ctl_kick(int fd) +{ + struct log_ctlmsg req; + int rc; + + ctlmsg_init(&req, LOGCMD_KICK); + + if ((rc = ctl_talk(fd, &req, NULL, 0)) < 0) { + BWPRINTF("error kicking ring"); + return -1; + } + + return 0; +} + +static int ctl_clear_writes(int fd) +{ + struct log_ctlmsg req; + char rsp[CTLRSPLEN_CLEAR]; + int rc; + + ctlmsg_init(&req, LOGCMD_CLEAR); + + if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_CLEAR)) < 0) { + BWPRINTF("error clearing 
writes"); + return -1; + } + + return 0; +} + +static int writelog_map(struct writelog* wl) +{ + int fd; + void* shm; + + if ((fd = shm_open(wl->shmpath, O_RDWR, 0750)) < 0) { + BWPRINTF("could not open shared memory at %s: %s", wl->shmpath, + strerror(errno)); + return -1; + } + + wl->shm = mmap(NULL, wl->shmsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (wl->shm == MAP_FAILED) { + BWPRINTF("could not mmap write log shm: %s", strerror(errno)); + return -1; + } + wl->cur = wl->shm; + wl->inflight = 0; + wl->dhd = wl->dtl = sdatastart(wl->shm); + + BDPRINTF("shm cookie: 0x%x, data size: %u", *((uint32_t*)wl->shm), + dring_avail(wl)); + + wl->sring = sringstart(wl->shm); + /* need some thought about what to do on reconnect */ + FRONT_RING_INIT(&wl->fring, wl->sring, SRINGSIZE); + + return 0; +} + +static int writelog_dump(struct writelog* wl) +{ + struct disk_range* range = wl->shm; + + for (range = wl->shm; (void*)range < bmend(wl->shm); range++) { + if (!range->count) + break; + + BDPRINTF("dirty extent: %"PRIu64":%u", + range->sector, range->count); + } + + return 0; +} + +/* walk dirty map and enqueue read requests. + * returns: 0 when entire bitmap has been enqueued, + * 1 when the ring is full + * -1 on error + */ +static int writelog_enqueue_requests(struct writelog* wl) +{ + struct disk_range* range = wl->shm; + log_request_t* req; + + for (range = wl->cur; (void*)range < bmend(wl->shm); range++) { + if (!range->count) + break; + + if (RING_FULL(&wl->fring)) + break; + + /* insert range into request stream */ + /* 1. get next request slot from ring */ + /* 2. ensure enough shm space is available */ + + BDPRINTF("enqueueing dirty extent: %"PRIu64":%u (ring space: %d/%d)", + range->sector, range->count, RING_FREE_REQUESTS(&wl->fring), + RING_SIZE(&wl->fring)); + + req = RING_GET_REQUEST(&wl->fring, wl->fring.req_prod_pvt); + + req->sector = range->sector; + req->count = range->count; + /* ... 
*/ + req->offset = 0; + + wl->fring.req_prod_pvt++; + wl->inflight++; + } + + wl->cur = range; + + if (range->count) + return 1; + + return 0; +} + +static int writelog_dequeue_responses(struct writelog* wl) +{ + RING_IDX rstart, rend; + log_response_t rsp; + + rstart = wl->fring.rsp_cons; + rend = wl->sring->rsp_prod; + + BDPRINTF("ring kicked (start = %u, end = %u)", rstart, rend); + + while (rstart != rend) { + memcpy(&rsp, RING_GET_RESPONSE(&wl->fring, rstart), sizeof(rsp)); + BDPRINTF("ctl: read response %"PRIu64":%u", rsp.sector, rsp.count); + wl->fring.rsp_cons = ++rstart; + wl->inflight--; + } + + return 0; +} + +static int writelog_free(struct writelog* wl) +{ + if (wl->shmpath) { + free(wl->shmpath); + wl->shmpath = NULL; + } + if (wl->shm) { + munmap(wl->shm, wl->shmsize); + wl->shm = NULL; + } + + return 0; +} + +int get_writes(struct writelog* wl, int fd, int peek) +{ + int rc; + + if (peek) + rc = ctl_peek_writes(fd); + else + rc = ctl_get_writes(fd); + + if (rc < 0) + return rc; + + wl->cur = wl->shm; + + return 0; +} + +int await_responses(struct writelog* wl, int fd) +{ + struct log_ctlmsg msg; + int rc; + + /* sit on socket waiting for kick */ + if ((rc = read(fd, &msg, sizeof(msg))) < 0) { + BWPRINTF("error reading from control socket: %s", strerror(errno)); + return -1; + } else if (!rc) { + BWPRINTF("EOF on control socket"); + return -1; + } else if (rc < sizeof(msg)) { + BWPRINTF("short reply (%d/%d bytes)", rc, (int) sizeof(msg)); + return -1; + } + + if (strncmp(msg.msg, LOGCMD_KICK, 4)) { + BWPRINTF("Unknown message received: %.4s", msg.msg); + return -1; + } + + if (writelog_dequeue_responses(wl) < 0) + return -1; + + return 0; +} + +/* read_loop: + * 1. extract dirty bitmap + * 2. feed as much as possible onto ring + * 3. kick + * 4. as responses come back, feed more of the dirty bitmap + * into the ring + * 5. when entire bitmap has been queued, go to 1? 
+ */ +int read_loop(struct writelog* wl, int fd) +{ + int rc; + + if (get_writes(wl, fd, 1) < 0) + return -1; + writelog_dump(wl); + + do { + rc = writelog_enqueue_requests(wl); + + if (RING_FREE_REQUESTS(&wl->fring) < RING_SIZE(&wl->fring)) + RING_PUSH_REQUESTS(&wl->fring); + if (ctl_kick(fd) < 0) + return -1; + + /* collect responses */ + if (wl->inflight && await_responses(wl, fd) < 0) + return -1; + } while (rc > 0); + + return rc; +} + +int main(int argc, char* argv[]) +{ + int fd; + struct writelog wl; + char cmd; + + if (argc < 2) { + usage(); + return 1; + } + + if (argc < 3) + cmd = 'p'; + else + cmd = argv[2][0]; + + fd = tdctl_open(argv[1]); + + if (ctl_get_shmem(fd, &wl) < 0) + return 1; + + if (writelog_map(&wl) < 0) { + BWPRINTF("Error mapping write log: %s", strerror(errno)); + return 1; + } + + switch (cmd) { + case 'p': + if (get_writes(&wl, fd, 1) < 0) + return 1; + writelog_dump(&wl); + break; + case 'c': + if (ctl_clear_writes(fd) < 0) + return 1; + break; + case 'g': + if (get_writes(&wl, fd, 0) < 0) + return 1; + writelog_dump(&wl); + break; + case 'r': + if (read_loop(&wl, fd) < 0) + return 1; + break; + default: + usage(); + return 1; + } + + writelog_free(&wl); + close(fd); + + return 0; +} diff --git a/tools/blktap2/drivers/tapdisk-diff.c b/tools/blktap2/drivers/tapdisk-diff.c new file mode 100644 index 0000000000..0f31c57d42 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-diff.c @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <unistd.h> + +#include "list.h" +#include "scheduler.h" +#include "tapdisk-vbd.h" +#include "tapdisk-server.h" +#include "libvhd.h" + +#define POLL_READ 0 +#define POLL_WRITE 1 + +#define SPB_SHIFT (VHD_BLOCK_SHIFT - SECTOR_SHIFT) + +struct tapdisk_stream_poll { + int pipe[2]; + int set; +}; + +struct tapdisk_stream_request { + uint64_t sec; + uint32_t secs; + uint64_t seqno; + blkif_request_t blkif_req; + struct list_head next; +}; + +struct tapdisk_stream { + td_vbd_t *vbd; + + unsigned int id; + + int err; + + uint64_t cur; + uint64_t start; + uint64_t end; + + uint64_t started; + uint64_t completed; + + struct tapdisk_stream_poll poll; + event_id_t enqueue_event_id; + + struct list_head free_list; + struct list_head pending_list; + struct list_head completed_list; + + struct tapdisk_stream_request requests[MAX_REQUESTS]; +}; + +static unsigned int tapdisk_stream_count; + +static void tapdisk_stream_close_image(struct tapdisk_stream *); + +static char *program; +static struct tapdisk_stream stream1, stream2; +static vhd_context_t vhd1; + +static void +usage(FILE *stream) +{ + printf("usage: %s <-n type:/path/to/image> <-m type:/path/to/image>\n", + program); +} + +static int +open_vhd(const char *path, vhd_context_t *vhd) +{ + int err; + + err = vhd_open(vhd, path, VHD_OPEN_RDONLY); + if (err) { + printf("error opening %s: %d\n", path, err); + return err; + } + + err = vhd_get_bat(vhd); + if (err) + { + printf("error reading BAT for %s: %d\n", path, err); + vhd_close(vhd); + return err; + } + + return 0; +} + +static inline void +tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p) +{ + p->set = 0; + p->pipe[POLL_READ] = p->pipe[POLL_WRITE] = -1; +} + +static int +tapdisk_stream_poll_open(struct tapdisk_stream_poll *p) +{ + int err; + + tapdisk_stream_poll_initialize(p); + + err = pipe(p->pipe); + if (err) + return 
-errno; + + err = fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK); + if (err) + goto out; + + err = fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK); + if (err) + goto out; + + return 0; + +out: + close(p->pipe[POLL_READ]); + close(p->pipe[POLL_WRITE]); + tapdisk_stream_poll_initialize(p); + return -errno; +} + +static void +tapdisk_stream_poll_close(struct tapdisk_stream_poll *p) +{ + if (p->pipe[POLL_READ] != -1) + close(p->pipe[POLL_READ]); + if (p->pipe[POLL_WRITE] != -1) + close(p->pipe[POLL_WRITE]); + tapdisk_stream_poll_initialize(p); +} + +static inline void +tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p) +{ + int dummy; + + read(p->pipe[POLL_READ], &dummy, sizeof(dummy)); + p->set = 0; +} + +static inline void +tapdisk_stream_poll_set(struct tapdisk_stream_poll *p) +{ + int dummy = 0; + + if (!p->set) { + write(p->pipe[POLL_WRITE], &dummy, sizeof(dummy)); + p->set = 1; + } +} + +static inline int +tapdisk_stream_stop(struct tapdisk_stream *s) +{ + return ((s->cur == s->end || s->err) && + list_empty(&s->pending_list) && + list_empty(&s->completed_list)); +} + +static inline void +tapdisk_stream_initialize_request(struct tapdisk_stream_request *req) +{ + memset(req, 0, sizeof(*req)); + INIT_LIST_HEAD(&req->next); +} + +static inline int +tapdisk_stream_request_idx(struct tapdisk_stream *s, + struct tapdisk_stream_request *req) +{ + return (req - s->requests); +} + +static inline struct tapdisk_stream_request * +tapdisk_stream_get_request(struct tapdisk_stream *s) +{ + struct tapdisk_stream_request *req; + + if (list_empty(&s->free_list)) + return NULL; + + req = list_entry(s->free_list.next, + struct tapdisk_stream_request, next); + + list_del_init(&req->next); + tapdisk_stream_initialize_request(req); + + return req; +} + +static inline void +tapdisk_stream_queue_completed(struct tapdisk_stream *s, + struct tapdisk_stream_request *sreq) +{ + struct tapdisk_stream_request *itr; + + list_for_each_entry(itr, &s->completed_list, next) + if (sreq->seqno 
< itr->seqno) { + list_add_tail(&sreq->next, &itr->next); + return; + } + + list_add_tail(&sreq->next, &s->completed_list); +} + +static int +tapdisk_result_compare(struct tapdisk_stream_request *sreq1, + struct tapdisk_stream_request *sreq2) +{ + unsigned long idx1, idx2; + char *buf1, *buf2; + int result; + + assert(sreq1->seqno == sreq2->seqno); + assert(sreq1->secs == sreq2->secs); + idx1 = (unsigned long)tapdisk_stream_request_idx(&stream1, + sreq1); + idx2 = (unsigned long)tapdisk_stream_request_idx(&stream2, + sreq2); + buf1 = (char *)MMAP_VADDR(stream1.vbd->ring.vstart, idx1, 0); + buf2 = (char *)MMAP_VADDR(stream2.vbd->ring.vstart, idx2, 0); + + result = memcmp(buf1, buf2, sreq1->secs << SECTOR_SHIFT); + return result; +} + +static int +tapdisk_stream_process_data(void) +{ + struct tapdisk_stream_request *sreq1, *sreq2, *tmp1, *tmp2; + int advance_both; + int result = 0; + + sreq1 = list_entry(stream1.completed_list.next, + struct tapdisk_stream_request, next); + sreq2 = list_entry(stream2.completed_list.next, + struct tapdisk_stream_request, next); + tmp1 = list_entry(sreq1->next.next, + struct tapdisk_stream_request, next); + tmp2 = list_entry(sreq2->next.next, + struct tapdisk_stream_request, next); + while (result == 0 && + &sreq1->next != &stream1.completed_list && + &sreq2->next != &stream2.completed_list) { + //printf("checking: %llu|%llu\n", sreq1->seqno, sreq2->seqno); + advance_both = 1; + if (sreq1->seqno < sreq2->seqno) { + advance_both = 0; + goto advance1; + } + if (sreq1->seqno > sreq2->seqno) + goto advance2; + + result = tapdisk_result_compare(sreq1, sreq2); + + stream1.completed++; + stream2.completed++; + + list_del_init(&sreq1->next); + list_add_tail(&sreq1->next, &stream1.free_list); + list_del_init(&sreq2->next); + list_add_tail(&sreq2->next, &stream2.free_list); + +advance1: + sreq1 = tmp1; + tmp1 = list_entry(tmp1->next.next, + struct tapdisk_stream_request, next); + if (!advance_both) + continue; +advance2: + sreq2 = tmp2; + tmp2 = 
list_entry(tmp2->next.next, + struct tapdisk_stream_request, next); + } + + return result; +} + +static void +tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp) +{ + struct tapdisk_stream *s = (struct tapdisk_stream *)arg; + struct tapdisk_stream_request *sreq = s->requests + rsp->id; + + list_del_init(&sreq->next); + + if (rsp->status == BLKIF_RSP_OKAY) + tapdisk_stream_queue_completed(s, sreq); + else { + s->err = EIO; + list_add_tail(&sreq->next, &s->free_list); + fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec); + } + + if (tapdisk_stream_process_data()) { + fprintf(stderr, "mismatch at sector 0x%"PRIx64"\n", + sreq->sec); + stream1.err = EINVAL; + stream2.err = EINVAL; + } + + tapdisk_stream_poll_set(&stream1.poll); + tapdisk_stream_poll_set(&stream2.poll); +} + +static inline int +tapdisk_stream_enqueue_copy(struct tapdisk_stream *s, + struct tapdisk_stream_request *r) +{ + td_vbd_t *vbd; + blkif_request_t *breq; + td_vbd_request_t *vreq; + struct tapdisk_stream_request *sreq; + int idx; + + vbd = stream2.vbd; + sreq = tapdisk_stream_get_request(s); + if (!sreq) + return 1; + + idx = tapdisk_stream_request_idx(s, sreq); + + sreq->sec = r->sec; + sreq->secs = r->secs; + sreq->seqno = r->seqno; + + breq = &sreq->blkif_req; + breq->id = idx; + breq->nr_segments = r->blkif_req.nr_segments; + breq->sector_number = r->blkif_req.sector_number; + breq->operation = BLKIF_OP_READ; + + for (int i = 0; i < r->blkif_req.nr_segments; i++) { + struct blkif_request_segment *seg = breq->seg + i; + seg->first_sect = r->blkif_req.seg[i].first_sect; + seg->last_sect = r->blkif_req.seg[i].last_sect; + } + s->cur += sreq->secs; + + vreq = vbd->request_list + idx; + assert(list_empty(&vreq->next)); + assert(vreq->secs_pending == 0); + + memcpy(&vreq->req, breq, sizeof(*breq)); + vbd->received++; + vreq->vbd = vbd; + + tapdisk_vbd_move_request(vreq, &vbd->new_requests); + list_add_tail(&sreq->next, &s->pending_list); + + return 0; +} + +static void 
+tapdisk_stream_enqueue1(void) +{ + td_vbd_t *vbd; + int i, idx, psize, blk; + struct tapdisk_stream *s = &stream1; + + vbd = s->vbd; + psize = getpagesize(); + + while (s->cur < s->end && !s->err) { + blkif_request_t *breq; + td_vbd_request_t *vreq; + struct tapdisk_stream_request *sreq; + + /* skip any blocks that are not present in this image */ + blk = s->cur >> SPB_SHIFT; + while (s->cur < s->end && vhd1.bat.bat[blk] == DD_BLK_UNUSED) { + //printf("skipping block %d\n", blk); + blk++; + s->cur = blk << SPB_SHIFT; + } + + if (s->cur >= s->end) + break; + + sreq = tapdisk_stream_get_request(s); + if (!sreq) + break; + + idx = tapdisk_stream_request_idx(s, sreq); + + sreq->sec = s->cur; + sreq->secs = 0; + sreq->seqno = s->started++; + + breq = &sreq->blkif_req; + breq->id = idx; + breq->nr_segments = 0; + breq->sector_number = sreq->sec; + breq->operation = BLKIF_OP_READ; + + for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) { + uint32_t secs; + struct blkif_request_segment *seg = breq->seg + i; + + secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT); + secs = MIN(((blk + 1) << SPB_SHIFT) - s->cur, secs); + if (!secs) + break; + + sreq->secs += secs; + s->cur += secs; + + seg->first_sect = 0; + seg->last_sect = secs - 1; + breq->nr_segments++; + } + + vreq = vbd->request_list + idx; + + assert(list_empty(&vreq->next)); + assert(vreq->secs_pending == 0); + + memcpy(&vreq->req, breq, sizeof(*breq)); + vbd->received++; + vreq->vbd = vbd; + + tapdisk_vbd_move_request(vreq, &vbd->new_requests); + list_add_tail(&sreq->next, &s->pending_list); + } + + tapdisk_vbd_issue_requests(vbd); +} + +static void +tapdisk_stream_enqueue2(void) +{ + td_vbd_t *vbd; + int i, blk; + struct tapdisk_stream_request *itr; + struct tapdisk_stream *s = &stream2; + + vbd = s->vbd; + + /* issue the same requests that we issued on stream1 */ + list_for_each_entry(itr, &stream1.completed_list, next) { + if (itr->sec < s->cur) + continue; + if (tapdisk_stream_enqueue_copy(s, itr)) + goto done; 
+ } + + list_for_each_entry(itr, &stream1.pending_list, next) { + if (itr->sec < s->cur) + continue; + if (tapdisk_stream_enqueue_copy(s, itr)) + goto done; + } + + stream2.cur = stream1.cur; + +done: + tapdisk_vbd_issue_requests(vbd); +} + +static inline int +tapdisk_diff_done(void) +{ + return (tapdisk_stream_stop(&stream1) && tapdisk_stream_stop(&stream2)); +} + +static void +tapdisk_diff_stop(void) +{ + tapdisk_stream_close_image(&stream1); + tapdisk_stream_close_image(&stream2); +} + +static void +tapdisk_stream_enqueue(event_id_t id, char mode, void *arg) +{ + struct tapdisk_stream *s = (struct tapdisk_stream *)arg; + + tapdisk_stream_poll_clear(&s->poll); + + if (tapdisk_diff_done()) { + tapdisk_diff_stop(); + return; + } + + if (s == &stream1) + tapdisk_stream_enqueue1(); + else if (s == &stream2) + tapdisk_stream_enqueue2(); + else + assert(0); + + if (tapdisk_diff_done()) { + // we have to check again for the case when stream1 had no + // blocks at all + tapdisk_diff_stop(); + return; + } +} + +static int +tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type) +{ + int err; + image_t image; + + s->id = tapdisk_stream_count++; + + err = tapdisk_vbd_initialize(-1, -1, s->id); + if (err) + goto out; + + s->vbd = tapdisk_server_get_vbd(s->id); + if (!s->vbd) { + err = ENODEV; + goto out; + } + + tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s); + + err = tapdisk_vbd_open_vdi(s->vbd, path, type, + TAPDISK_STORAGE_TYPE_DEFAULT, + TD_OPEN_RDONLY); + if (err) + goto out; + + s->vbd->reopened = 1; + + err = tapdisk_vbd_get_image_info(s->vbd, &image); + if (err) { + fprintf(stderr, "failed getting image size: %d\n", err); + return err; + } + + s->start = 0; + s->cur = s->start; + s->end = image.size; + + err = 0; + +out: + if (err) + fprintf(stderr, "failed to open image %s: %d\n", path, err); + return err; +} + +static void +tapdisk_stream_close_image(struct tapdisk_stream *s) +{ + td_vbd_t *vbd; + + vbd = 
tapdisk_server_get_vbd(s->id); + if (vbd) { + tapdisk_vbd_close_vdi(vbd); + tapdisk_server_remove_vbd(vbd); + free((void *)vbd->ring.vstart); + free(vbd->name); + free(vbd); + s->vbd = NULL; + } +} + +static int +tapdisk_stream_initialize_requests(struct tapdisk_stream *s) +{ + size_t size; + td_ring_t *ring; + int err, i, psize; + + ring = &s->vbd->ring; + psize = getpagesize(); + size = psize * BLKTAP_MMAP_REGION_SIZE; + + /* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */ + err = posix_memalign((void **)&ring->vstart, psize, size); + if (err) { + fprintf(stderr, "failed to allocate buffers: %d\n", err); + ring->vstart = 0; + return err; + } + + for (i = 0; i < MAX_REQUESTS; i++) { + struct tapdisk_stream_request *req = s->requests + i; + tapdisk_stream_initialize_request(req); + list_add_tail(&req->next, &s->free_list); + } + + return 0; +} + +static int +tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s) +{ + int err; + struct tapdisk_stream_poll *p = &s->poll; + + err = tapdisk_stream_poll_open(p); + if (err) + goto out; + + err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + p->pipe[POLL_READ], 0, + tapdisk_stream_enqueue, s); + if (err < 0) + goto out; + + s->enqueue_event_id = err; + err = 0; + +out: + if (err) + fprintf(stderr, "failed to register event: %d\n", err); + return err; +} + +static void +tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s) +{ + if (s->enqueue_event_id) { + tapdisk_server_unregister_event(s->enqueue_event_id); + s->enqueue_event_id = 0; + } + tapdisk_stream_poll_close(&s->poll); +} + +static inline void +tapdisk_stream_initialize(struct tapdisk_stream *s) +{ + memset(s, 0, sizeof(*s)); + INIT_LIST_HEAD(&s->free_list); + INIT_LIST_HEAD(&s->pending_list); + INIT_LIST_HEAD(&s->completed_list); +} + +static int +tapdisk_stream_open(struct tapdisk_stream *s, const char *arg) +{ + int err, type; + char *path; + + err = tapdisk_parse_disk_type(arg, &path, &type); + if (err) + 
return err; + + tapdisk_stream_initialize(s); + + err = tapdisk_stream_open_image(s, path, type); + if (err) + return err; + + err = tapdisk_stream_initialize_requests(s); + if (err) + return err; + + err = tapdisk_stream_register_enqueue_event(s); + if (err) + return err; + + tapdisk_stream_enqueue(s->enqueue_event_id, + SCHEDULER_POLL_READ_FD, s); + + return 0; +} + +static void +tapdisk_stream_release(struct tapdisk_stream *s) +{ + tapdisk_stream_close_image(s); + tapdisk_stream_unregister_enqueue_event(s); +} + +static int +tapdisk_stream_run(struct tapdisk_stream *s) +{ + tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s); + tapdisk_server_run(); + return s->err; +} + +int +main(int argc, char *argv[]) +{ + int c, err, type1; + const char *arg1 = NULL, *arg2 = NULL; + char *path1; + + err = 0; + + program = basename(argv[0]); + + while ((c = getopt(argc, argv, "n:m:h")) != -1) { + switch (c) { + case 'n': + arg1 = optarg; + break; + case 'm': + arg2 = optarg; + break; + case 'h': + usage(stdout); + return 0; + default: + goto fail_usage; + } + } + + if (!arg1 || !arg2) + goto fail_usage; + + err = tapdisk_parse_disk_type(arg1, &path1, &type1); + if (err) + return err; + if (type1 != DISK_TYPE_VHD) { + printf("error: first VDI is not VHD\n"); + return EINVAL; + } + + err = open_vhd(path1, &vhd1); + if (err) + return err; + + tapdisk_start_logging("tapdisk-diff"); + + err = tapdisk_server_initialize(NULL, NULL); + if (err) + goto out; + + err = tapdisk_stream_open(&stream1, arg1); + if (err) { + fprintf(stderr, "Failed to open %s: %s\n", + arg1, strerror(-err)); + goto out; + } + + err = tapdisk_stream_open(&stream2, arg2); + if (err) { + fprintf(stderr, "Failed to open %s: %s\n", + arg2, strerror(-err)); + goto out1; + } + + if (stream1.end != stream2.end) { + fprintf(stderr, "Image sizes differ: %"PRIu64" != %"PRIu64"\n", + stream1.end, stream2.end); + err = EINVAL; + goto out2; + } + + tapdisk_server_run(); + +out2: + 
tapdisk_stream_release(&stream2); +out1: + tapdisk_stream_release(&stream1); +out: + vhd_close(&vhd1); + tapdisk_stop_logging(); + + return err ? : stream1.err; + +fail_usage: + usage(stderr); + return 1; +} diff --git a/tools/blktap2/drivers/tapdisk-driver.c b/tools/blktap2/drivers/tapdisk-driver.c new file mode 100644 index 0000000000..ca5629ab73 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-driver.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdlib.h> + +#include "tapdisk-driver.h" +#include "tapdisk-server.h" + +td_driver_t * +tapdisk_driver_allocate(int type, char *name, td_flag_t flags, int storage) +{ + int err; + td_driver_t *driver; + struct tap_disk *ops; + + ops = tapdisk_server_find_driver_interface(type); + if (!ops) + return NULL; + + driver = calloc(1, sizeof(td_driver_t)); + if (!driver) + return NULL; + + err = tapdisk_namedup(&driver->name, name); + if (err) + goto fail; + + driver->ops = ops; + driver->type = type; + driver->storage = storage; + driver->data = calloc(1, ops->private_data_size); + if (!driver->data) + goto fail; + + if (td_flag_test(flags, TD_OPEN_RDONLY)) + td_flag_set(driver->state, TD_DRIVER_RDONLY); + + return driver; + +fail: + free(driver->name); + free(driver->data); + free(driver); + return NULL; +} + +void +tapdisk_driver_free(td_driver_t *driver) +{ + if (!driver) + return; + + if (driver->refcnt) + return; + + if (td_flag_test(driver->state, TD_DRIVER_OPEN)) + EPRINTF("freeing open driver %s (state 0x%08x)\n", + driver->name, driver->state); + + free(driver->name); + free(driver->data); + free(driver); +} + +void +tapdisk_driver_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb) +{ + tapdisk_server_queue_tiocb(tiocb); +} + +void +tapdisk_driver_debug(td_driver_t *driver) +{ + if (driver->ops->td_debug) + driver->ops->td_debug(driver); +} diff --git a/tools/blktap2/drivers/tapdisk-driver.h 
b/tools/blktap2/drivers/tapdisk-driver.h new file mode 100644 index 0000000000..de0a9be233 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-driver.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _TAPDISK_DRIVER_H_ +#define _TAPDISK_DRIVER_H_ + +#include "tapdisk.h" +#include "scheduler.h" +#include "tapdisk-queue.h" + /* bits for td_driver_handle.state */ +#define TD_DRIVER_OPEN 0x0001 /* tapdisk_driver_free() logs a warning if this is still set */ +#define TD_DRIVER_RDONLY 0x0002 /* set by tapdisk_driver_allocate() when TD_OPEN_RDONLY is requested */ + /* per-driver instance, created by tapdisk_driver_allocate() and released by tapdisk_driver_free() */ +struct td_driver_handle { + int type; /* disk type used to look up the driver interface */ + char *name; /* private copy made with tapdisk_namedup() */ + + int storage; /* storage type, stored as passed to tapdisk_driver_allocate() */ + + int refcnt; /* tapdisk_driver_free() does nothing while this is non-zero */ + td_flag_t state; /* TD_DRIVER_* bits */ + + td_disk_info_t info; /* info.size is the bound used by the tapdisk-image.c request checks */ + + void *data; /* zeroed private area of ops->private_data_size bytes */ + struct tap_disk *ops; /* driver interface; ops->td_debug is optional */ + + struct list_head next; +}; + /* returns NULL on unknown type or allocation failure */ +td_driver_t *tapdisk_driver_allocate(int, char *, td_flag_t, int); +void tapdisk_driver_free(td_driver_t *); + +void tapdisk_driver_queue_tiocb(td_driver_t *, struct tiocb *); + +void tapdisk_driver_debug(td_driver_t *); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-filter.c b/tools/blktap2/drivers/tapdisk-filter.c new file mode 100644 index 0000000000..fc018eadbd --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-filter.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include <stdlib.h> +#include <unistd.h> +#include <libaio.h> +#include <syslog.h> +#include <sys/time.h> + +#include "tapdisk-log.h" +#include "tapdisk-filter.h" + +#define RSEED 7 +#define PRE_CHECK 0 +#define POST_CHECK 1 + +#define WRITE_INTEGRITY "buffer integrity failure after write" +#define READ_INTEGRITY "disk integrity failure after read" + +#define DBG(f, a...) tlog_write(TLOG_WARN, f, ##a) + +/* + * simulate IO errors by knocking request size to zero before + * submitting and restoring original size before returning + */ +static inline void +inject_fault(struct tfilter *filter, struct iocb *io) +{ + struct fiocb *fio; + + if (!filter->ffree) + return; + + fio = filter->flist[--filter->ffree]; + + fio->bytes = io->u.c.nbytes; + fio->data = io->data; + io->u.c.nbytes = 0; + io->data = fio; +} + +static inline int +fault_injected(struct tfilter *filter, struct iocb *io) +{ + unsigned long iop = (unsigned long)io->data; + unsigned long start = (unsigned long)filter->fiocbs; + unsigned long end = start + (filter->iocbs * sizeof(struct fiocb)); + + return (iop >= start && iop < end); +} + +static inline void +recover_fault(struct tfilter *filter, struct iocb *io) +{ + struct fiocb *fio = (struct fiocb *)io->data; + + io->u.c.nbytes = fio->bytes; + io->data = fio->data; + + memset(fio, 0, sizeof(struct fiocb)); + filter->flist[filter->ffree++] = fio; +} + +static inline uint64_t +chksum(char *buf) +{ + int i, num = 512 >> 
3; + uint64_t *p = (uint64_t *)buf; + uint64_t sum = 0; + + for (i = 0; i < num; i++) + sum += p[i]; + + return sum; +} + +static inline void +check_hash(struct tfilter *filter, uint64_t sec, char *buf, char *type) +{ + uint64_t sum; + struct dhash *hash; + + hash = filter->dhash + sec; + if (!hash->time.tv_sec) + return; + + sum = chksum(buf); + if (hash->hash != chksum(buf)) { + struct timeval now; + gettimeofday(&now, NULL); + DBG("%s: hash table: 0x%020" PRIx64 " at %012lu.%06lu, " + "from disk: 0x%020" PRIx64 " at %012lu.%06lu\n", + type, hash->hash, hash->time.tv_sec, + hash->time.tv_usec, sum, now.tv_sec, now.tv_usec); + } +} + +static inline void +insert_hash(struct tfilter *filter, uint64_t sec, char *buf) +{ + struct dhash *hash; + + hash = filter->dhash + sec; + hash->hash = chksum(buf); + gettimeofday(&hash->time, NULL); +} + +static void +check_sector(struct tfilter *filter, int type, int rw, uint64_t sec, char *buf) +{ + struct dhash *hash; + + if (sec >= filter->secs) + return; + + hash = filter->dhash + sec; + + if (rw) { + if (type == PRE_CHECK) + insert_hash(filter, sec, buf); + else + check_hash(filter, sec, buf, WRITE_INTEGRITY); + } else if (type == POST_CHECK) { + check_hash(filter, sec, buf, READ_INTEGRITY); + insert_hash(filter, sec, buf); + } +} + +static void +check_data(struct tfilter *filter, int type, struct iocb *io) +{ + int rw; + uint64_t i, sec; + + rw = (io->aio_lio_opcode == IO_CMD_PWRITE); + + for (i = 0; i < io->u.c.nbytes; i += 512) { + char *buf = io->u.c.buf + i; + uint64_t sec = (io->u.c.offset + i) >> 9; + check_sector(filter, type, rw, sec, buf); + } +} + +struct tfilter * +tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs) +{ + int i; + struct tfilter *filter = NULL; + + if (!mode) + return NULL; + + filter = calloc(1, sizeof(struct tfilter)); + if (!filter) + goto fail; + + filter->mode = mode; + filter->secs = secs; + filter->iocbs = iocbs; + + if (filter->mode & TD_INJECT_FAULTS) { + filter->fiocbs = 
calloc(iocbs, sizeof(struct fiocb)); + filter->flist = calloc(iocbs, sizeof(struct fiocb *)); + if (!filter->fiocbs || !filter->flist) + filter->mode &= ~TD_INJECT_FAULTS; + else { + srand(RSEED); + filter->ffree = iocbs; + for (i = 0; i < iocbs; i++) + filter->flist[i] = filter->fiocbs + i; + } + } + + if (filter->mode & TD_CHECK_INTEGRITY) { + filter->dhash = calloc(secs, sizeof(struct dhash)); + if (!filter->dhash) + filter->mode &= ~TD_CHECK_INTEGRITY; + } + + syslog(LOG_WARNING, "WARNING: " + "FILTERING IN MODE 0x%04x\n", filter->mode); + + return filter; + + fail: + tapdisk_free_tfilter(filter); + return NULL; +} + +void +tapdisk_free_tfilter(struct tfilter *filter) +{ + if (!filter) + return; + + free(filter->dhash); + free(filter->flist); + free(filter->fiocbs); + free(filter); +} + +void +tapdisk_filter_iocbs(struct tfilter *filter, struct iocb **iocbs, int num) +{ + int i; + + if (!filter) + return; + + for (i = 0; i < num; i++) { + struct iocb *io = iocbs[i]; + + if (filter->mode & TD_INJECT_FAULTS) { + if ((random() % 100) <= TD_FAULT_RATE) { + inject_fault(filter, io); + continue; + } + } + + if (filter->mode & TD_CHECK_INTEGRITY) + check_data(filter, PRE_CHECK, io); + } +} + +void +tapdisk_filter_events(struct tfilter *filter, struct io_event *events, int num) +{ + int i; + + if (!filter) + return; + + for (i = 0; i < num; i++) { + struct iocb *io = events[i].obj; + + if (filter->mode & TD_INJECT_FAULTS) { + if (fault_injected(filter, io)) { + recover_fault(filter, io); + continue; + } + } + + if (filter->mode & TD_CHECK_INTEGRITY) + check_data(filter, POST_CHECK, io); + } +} diff --git a/tools/blktap2/drivers/tapdisk-filter.h b/tools/blktap2/drivers/tapdisk-filter.h new file mode 100644 index 0000000000..c4e977e4aa --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-filter.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef TAPDISK_FILTER_H +#define TAPDISK_FILTER_H + +#include <libaio.h> +#include <inttypes.h> +#include <time.h> + +#define TD_INJECT_FAULTS 0x00001 /* simulate random IO failures */ +#define TD_CHECK_INTEGRITY 0x00002 /* check data integrity */ + +#define TD_FAULT_RATE 5 + +struct dhash { + uint64_t hash; + struct timeval time; +}; + +struct fiocb { + size_t bytes; + void *data; +}; + +struct tfilter { + int mode; + uint64_t secs; + int iocbs; + + struct dhash *dhash; + + int ffree; + struct fiocb *fiocbs; + struct fiocb **flist; +}; + +struct tfilter *tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs); +void tapdisk_free_tfilter(struct tfilter *); +void tapdisk_filter_iocbs(struct tfilter *, struct iocb **, int); +void tapdisk_filter_events(struct tfilter *, struct io_event *, int); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-image.c b/tools/blktap2/drivers/tapdisk-image.c new file mode 100644 index 0000000000..6da7f48bd8 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-image.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> + +#include "tapdisk-image.h" +#include "tapdisk-driver.h" +#include "tapdisk-server.h" + +#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a) + +td_image_t * +tapdisk_image_allocate(char *file, int type, int storage, + td_flag_t flags, void *private) +{ + int err; + td_image_t *image; + + image = calloc(1, sizeof(td_image_t)); + if (!image) + return NULL; + + err = tapdisk_namedup(&image->name, file); + if (err) { + free(image); + return NULL; + } + + image->type = type; + image->flags = flags; + image->storage = storage; + image->private = private; + INIT_LIST_HEAD(&image->next); + + return image; +} + +void +tapdisk_image_free(td_image_t *image) +{ + if (!image) + return; + + list_del(&image->next); + + free(image->name); + tapdisk_driver_free(image->driver); + free(image); +} + +int +tapdisk_image_check_td_request(td_image_t *image, td_request_t treq) +{ + int rdonly; + td_driver_t *driver; + td_disk_info_t *info; + + driver = image->driver; + if (!driver) + return -ENODEV; + + info = &driver->info; + rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY); + + if (treq.op != TD_OP_READ && treq.op != 
TD_OP_WRITE) + goto fail; + + if (treq.op == TD_OP_WRITE && rdonly) + goto fail; + + if (treq.secs <= 0 || treq.sec + treq.secs > info->size) + goto fail; + + return 0; + +fail: + ERR(-EINVAL, "bad td request on %s (%s, %"PRIu64"): %d at %"PRIu64, + image->name, (rdonly ? "ro" : "rw"), info->size, treq.op, + treq.sec + treq.secs); + return -EINVAL; + +} + +int +tapdisk_image_check_ring_request(td_image_t *image, blkif_request_t *req) +{ + td_driver_t *driver; + td_disk_info_t *info; + int i, psize, rdonly; + uint64_t nsects, total; + + driver = image->driver; + if (!driver) + return -ENODEV; + + nsects = 0; + total = 0; + info = &driver->info; + + rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY); + + if (req->operation != BLKIF_OP_READ && + req->operation != BLKIF_OP_WRITE) + goto fail; + + if (req->operation == BLKIF_OP_WRITE && rdonly) + goto fail; + + if (!req->nr_segments || req->nr_segments > MAX_SEGMENTS_PER_REQ) + goto fail; + + total = 0; + psize = getpagesize(); + + for (i = 0; i < req->nr_segments; i++) { + nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1; + + if (req->seg[i].last_sect >= psize >> 9 || nsects <= 0) + goto fail; + + total += nsects; + } + + if (req->sector_number + nsects > info->size) + goto fail; + + return 0; + +fail: + ERR(-EINVAL, "bad request on %s (%s, %"PRIu64"): id: %"PRIu64": %d at %"PRIu64, + image->name, (rdonly ? "ro" : "rw"), info->size, req->id, + req->operation, req->sector_number + total); + return -EINVAL; +} diff --git a/tools/blktap2/drivers/tapdisk-image.h b/tools/blktap2/drivers/tapdisk-image.h new file mode 100644 index 0000000000..8779dff8b7 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-image.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _TAPDISK_IMAGE_H_ +#define _TAPDISK_IMAGE_H_ + +#include "tapdisk.h" +#include <xen/io/blkif.h> + /* One disk image handle. type, name, flags, storage and private are filled in by tapdisk_image_allocate() (name is a private copy); driver and info are set when the image is opened or loaded; next links the handle into a list. */ +struct td_image_handle { + int type; + char *name; + + td_flag_t flags; + int storage; + + td_driver_t *driver; + td_disk_info_t info; + + void *private; + + struct list_head next; +}; + /* Allocation returns NULL on failure; tapdisk_image_free() accepts NULL. */ +td_image_t *tapdisk_image_allocate(char *, int, int, td_flag_t, void *); +void tapdisk_image_free(td_image_t *); + /* Request validators: 0 if acceptable, -ENODEV without a driver, -EINVAL for a bad request. */ +int tapdisk_image_check_td_request(td_image_t *, td_request_t); +int tapdisk_image_check_ring_request(td_image_t *, blkif_request_t *); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-interface.c b/tools/blktap2/drivers/tapdisk-interface.c new file mode 100644 index 0000000000..58366d0a0b --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-interface.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <errno.h> + +#include "tapdisk.h" +#include "tapdisk-vbd.h" +#include "tapdisk-image.h" +#include "tapdisk-driver.h" +#include "tapdisk-server.h" +#include "tapdisk-interface.h" + +int +td_load(td_image_t *image) +{ + int err; + td_image_t *shared; + td_driver_t *driver; + + shared = tapdisk_server_get_shared_image(image); + if (!shared) + return -ENODEV; + + driver = shared->driver; + if (!driver) + return -EBADF; + + driver->refcnt++; + image->driver = driver; + image->info = driver->info; + + DPRINTF("loaded shared image %s (%d users, state: 0x%08x, type: %d)\n", + driver->name, driver->refcnt, driver->state, driver->type); + return 0; +} + +int +td_open(td_image_t *image) +{ + int err; + td_driver_t *driver; + + driver = image->driver; + if (!driver) { + driver = tapdisk_driver_allocate(image->type, + image->name, + image->flags, + image->storage); + if (!driver) + return -ENOMEM; + } + + if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) { + err = driver->ops->td_open(driver, image->name, image->flags); + if (err) { + if (!image->driver) + tapdisk_driver_free(driver); + return err; + } + + td_flag_set(driver->state, TD_DRIVER_OPEN); + DPRINTF("opened image %s (%d users, state: 0x%08x, type: %d)\n", + driver->name, driver->refcnt + 1, + driver->state, driver->type); + } + + image->driver = driver; + image->info = driver->info; + driver->refcnt++; + return 0; +} + +int +td_close(td_image_t *image) +{ + td_driver_t 
*driver; + + driver = image->driver; + if (!driver) + return -ENODEV; + + driver->refcnt--; + if (!driver->refcnt && td_flag_test(driver->state, TD_DRIVER_OPEN)) { + driver->ops->td_close(driver); + td_flag_clear(driver->state, TD_DRIVER_OPEN); + } + + DPRINTF("closed image %s (%d users, state: 0x%08x, type: %d)\n", + driver->name, driver->refcnt, driver->state, driver->type); + + return 0; +} + +int +td_get_parent_id(td_image_t *image, td_disk_id_t *id) +{ + td_driver_t *driver; + + driver = image->driver; + if (!driver) + return -ENODEV; + + if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) + return -EBADF; + + return driver->ops->td_get_parent_id(driver, id); +} + +int +td_validate_parent(td_image_t *image, td_image_t *parent) +{ + td_driver_t *driver, *pdriver; + + driver = image->driver; + pdriver = parent->driver; + if (!driver || !pdriver) + return -ENODEV; + + if (!td_flag_test(driver->state, TD_DRIVER_OPEN) || + !td_flag_test(pdriver->state, TD_DRIVER_OPEN)) + return -EBADF; + + return 0; + return driver->ops->td_validate_parent(driver, pdriver, 0); +} + +void +td_queue_write(td_image_t *image, td_request_t treq) +{ + int err; + td_driver_t *driver; + + driver = image->driver; + if (!driver) { + err = -ENODEV; + goto fail; + } + + if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) { + err = -EBADF; + goto fail; + } + + err = tapdisk_image_check_td_request(image, treq); + if (err) + goto fail; + + driver->ops->td_queue_write(driver, treq); + return; + +fail: + td_complete_request(treq, err); +} + +void +td_queue_read(td_image_t *image, td_request_t treq) +{ + int err; + td_driver_t *driver; + + driver = image->driver; + if (!driver) { + err = -ENODEV; + goto fail; + } + + if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) { + err = -EBADF; + goto fail; + } + + err = tapdisk_image_check_td_request(image, treq); + if (err) + goto fail; + + driver->ops->td_queue_read(driver, treq); + return; + +fail: + td_complete_request(treq, err); +} + +void 
+td_forward_request(td_request_t treq) +{ + tapdisk_vbd_forward_request(treq); +} + +void +td_complete_request(td_request_t treq, int res) +{ + treq.cb(treq, res); +} + +void +td_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb) +{ + tapdisk_driver_queue_tiocb(driver, tiocb); +} + +void +td_prep_read(struct tiocb *tiocb, int fd, char *buf, size_t bytes, + long long offset, td_queue_callback_t cb, void *arg) +{ + tapdisk_prep_tiocb(tiocb, fd, 0, buf, bytes, offset, cb, arg); +} + +void +td_prep_write(struct tiocb *tiocb, int fd, char *buf, size_t bytes, + long long offset, td_queue_callback_t cb, void *arg) +{ + tapdisk_prep_tiocb(tiocb, fd, 1, buf, bytes, offset, cb, arg); +} + +void +td_debug(td_image_t *image) +{ + td_driver_t *driver; + + driver = image->driver; + if (!driver || !td_flag_test(driver->state, TD_DRIVER_OPEN)) + + return; + + tapdisk_driver_debug(driver); +} diff --git a/tools/blktap2/drivers/tapdisk-interface.h b/tools/blktap2/drivers/tapdisk-interface.h new file mode 100644 index 0000000000..1e48e5811a --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-interface.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _TAPDISK_INTERFACE_H_ +#define _TAPDISK_INTERFACE_H_ + +#include "tapdisk.h" +#include "tapdisk-queue.h" + /* Image lifecycle and parent queries; each returns 0 or a negative errno. */ +int td_open(td_image_t *); +int td_load(td_image_t *); +int td_close(td_image_t *); +int td_get_parent_id(td_image_t *, td_disk_id_t *); +int td_validate_parent(td_image_t *, td_image_t *); + /* Request path: the queue calls report validation failures through td_complete_request() rather than a return value. */ +void td_queue_write(td_image_t *, td_request_t); +void td_queue_read(td_image_t *, td_request_t); +void td_forward_request(td_request_t); +void td_complete_request(td_request_t, int); + +void td_debug(td_image_t *); + /* tiocb helpers; the prep calls take fd, buffer, byte count, offset, callback and callback argument. */ +void td_queue_tiocb(td_driver_t *, struct tiocb *); +void td_prep_read(struct tiocb *, int, char *, size_t, + long long, td_queue_callback_t, void *); +void td_prep_write(struct tiocb *, int, char *, size_t, + long long, td_queue_callback_t, void *); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-ipc.c b/tools/blktap2/drivers/tapdisk-ipc.c new file mode 100644 index 0000000000..3cfdb6c8f8 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-ipc.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + +#include "tapdisk.h" +#include "tapdisk-ipc.h" +#include "tapdisk-vbd.h" +#include "tapdisk-server.h" + +static int +tapdisk_ipc_write_message(int fd, tapdisk_message_t *message, int timeout) +{ + fd_set writefds; + int ret, len, offset; + struct timeval tv, *t; + + t = NULL; + offset = 0; + len = sizeof(tapdisk_message_t); + + if (timeout) { + tv.tv_sec = timeout; + tv.tv_usec = 0; + t = &tv; + } + + DPRINTF("sending '%s' message (uuid = %u)\n", + tapdisk_message_name(message->type), message->cookie); + + while (offset < len) { + FD_ZERO(&writefds); + FD_SET(fd, &writefds); + + /* we don't bother reinitializing tv. at worst, it will wait a + * bit more time than expected. */ + + ret = select(fd + 1, NULL, &writefds, NULL, t); + if (ret == -1) + break; + else if (FD_ISSET(fd, &writefds)) { + ret = write(fd, message + offset, len - offset); + if (ret <= 0) + break; + offset += ret; + } else + break; + } + + if (offset != len) { + EPRINTF("failure writing message\n"); + return -EIO; + } + + return 0; +} + +int +tapdisk_ipc_write(td_ipc_t *ipc, int type) +{ + tapdisk_message_t message; + + if (ipc->wfd == -1) + return 0; + + memset(&message, 0, sizeof(tapdisk_message_t)); + message.type = type; + message.cookie = ipc->uuid; + + return tapdisk_ipc_write_message(ipc->wfd, &message, 2); +} + +int +tapdisk_ipc_write_error(td_ipc_t *ipc, const char *text) +{ + tapdisk_message_t message; + + memset(&message, 0, sizeof(message)); + message.type = TAPDISK_MESSAGE_RUNTIME_ERROR; + message.cookie = ipc->uuid; + snprintf(message.u.string.text, sizeof(message.u.string.text), "%s", text); + + return tapdisk_ipc_write_message(ipc->wfd, &message, 2); +} + +static int +tapdisk_ipc_read_message(int fd, tapdisk_message_t *message, int timeout) +{ + fd_set readfds; + int ret, len, offset; + struct timeval tv, *t; + + t = NULL; + offset = 0; + len = sizeof(tapdisk_message_t); + 
+ if (timeout) { + tv.tv_sec = timeout; + tv.tv_usec = 0; + t = &tv; + } + + memset(message, 0, sizeof(tapdisk_message_t)); + + while (offset < len) { + FD_ZERO(&readfds); + FD_SET(fd, &readfds); + + /* we don't bother reinitializing tv. at worst, it will wait a + * bit more time than expected. */ + + ret = select(fd + 1, &readfds, NULL, NULL, t); + if (ret == -1) + break; + else if (FD_ISSET(fd, &readfds)) { + ret = read(fd, message + offset, len - offset); + if (ret <= 0) + break; + offset += ret; + } else + break; + } + + if (offset != len) { + EPRINTF("failure reading message\n"); + return -EIO; + } + + DPRINTF("received '%s' message (uuid = %u)\n", + tapdisk_message_name(message->type), message->cookie); + + return 0; +} + +int +tapdisk_ipc_read(td_ipc_t *ipc) +{ + int err; + td_vbd_t *vbd; + td_uuid_t uuid; + tapdisk_message_t message; + + err = tapdisk_ipc_read_message(ipc->rfd, &message, 2); + if (err) { + tapdisk_server_check_state(); + return err; + } + + uuid = message.cookie; + vbd = tapdisk_server_get_vbd(uuid); + + if (!vbd && message.type != TAPDISK_MESSAGE_PID) { + EPRINTF("received message for non-existing vbd: %u\n", uuid); + err = -EINVAL; + goto fail; + } + + switch (message.type) { + case TAPDISK_MESSAGE_PID: + err = tapdisk_vbd_initialize(ipc->rfd, ipc->wfd, uuid); + + memset(&message, 0, sizeof(tapdisk_message_t)); + message.cookie = uuid; + + if (!err) { + message.type = TAPDISK_MESSAGE_PID_RSP; + message.u.tapdisk_pid = getpid(); + } else + message.type = TAPDISK_MESSAGE_ERROR; + + return tapdisk_ipc_write_message(ipc->wfd, &message, 0); + + case TAPDISK_MESSAGE_OPEN: + { + image_t image; + char *devname; + td_flag_t flags; + + flags = 0; + + if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY) + flags |= TD_OPEN_RDONLY; + if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_SHARED) + flags |= TD_OPEN_SHAREABLE; + if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_ADD_CACHE) + flags |= TD_OPEN_ADD_CACHE; + if (message.u.params.flags & 
TAPDISK_MESSAGE_FLAG_VHD_INDEX) + flags |= TD_OPEN_VHD_INDEX; + if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_LOG_DIRTY) + flags |= TD_OPEN_LOG_DIRTY; + + err = asprintf(&devname, "%s/%s%d", + BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, + message.u.params.devnum); + if (err == -1) + goto fail; + + err = tapdisk_vbd_open(vbd, + message.u.params.path, + message.drivertype, + message.u.params.storage, + devname, flags); + free(devname); + if (err) + goto fail; + + err = tapdisk_vbd_get_image_info(vbd, &image); + if (err) + goto fail; + + memset(&message, 0, sizeof(tapdisk_message_t)); + message.cookie = uuid; + message.u.image.sectors = image.size; + message.u.image.sector_size = image.secsize; + message.u.image.info = image.info; + message.type = TAPDISK_MESSAGE_OPEN_RSP; + + return tapdisk_ipc_write_message(ipc->wfd, &message, 0); + } + + case TAPDISK_MESSAGE_PAUSE: + tapdisk_vbd_pause(vbd); + return 0; /* response written asynchronously */ + + case TAPDISK_MESSAGE_RESUME: + tapdisk_vbd_resume(vbd, + message.u.params.path, + message.drivertype); + return 0; /* response written asynchronously */ + + case TAPDISK_MESSAGE_CLOSE: + tapdisk_vbd_close(vbd); + return 0; /* response written asynchronously */ + + case TAPDISK_MESSAGE_EXIT: + return 0; + } + + err = -EINVAL; + EPRINTF("received unrecognized message %s, uuid = %d\n", + tapdisk_message_name(message.type), uuid); + +fail: + memset(&message, 0, sizeof(tapdisk_message_t)); + message.cookie = uuid; + message.type = TAPDISK_MESSAGE_ERROR; + tapdisk_ipc_write_message(ipc->wfd, &message, 2); + tapdisk_server_check_state(); + + return -err; +} diff --git a/tools/blktap2/drivers/tapdisk-ipc.h b/tools/blktap2/drivers/tapdisk-ipc.h new file mode 100644 index 0000000000..25eb48cafc --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-ipc.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _TAPDISK_IPC_H_ +#define _TAPDISK_IPC_H_ + +#include "tapdisk-message.h" + /* One control channel: rfd is read from, wfd is written to (tapdisk_ipc_write() treats wfd == -1 as "no channel" and does nothing), uuid is stamped into each outgoing message's cookie. */ +typedef struct td_ipc_handle { + int rfd; + int wfd; + td_uuid_t uuid; +} td_ipc_t; + /* Read and dispatch one message; send a bare message of the given type; send a runtime-error message carrying text. */ +int tapdisk_ipc_read(td_ipc_t *ipc); +int tapdisk_ipc_write(td_ipc_t *ipc, int type); +int tapdisk_ipc_write_error(td_ipc_t *ipc, const char *message); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-log.c b/tools/blktap2/drivers/tapdisk-log.c new file mode 100644 index 0000000000..980affa3a2 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-log.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <syslog.h> +#include <inttypes.h> +#include <sys/time.h> + +#include "tapdisk-log.h" + +#define MAX_ENTRY_LEN 512 +#define MAX_ERROR_MESSAGES 16 + +struct error { + int cnt; + int err; + char *func; + char msg[MAX_ENTRY_LEN]; +}; + +struct ehandle { + int cnt; + int dropped; + struct error errors[MAX_ERROR_MESSAGES]; +}; + +struct tlog { + char *p; + int size; + uint64_t cnt; + char *buf; + int level; + char *file; + int append; +}; + +static struct ehandle tapdisk_err; +static struct tlog tapdisk_log; + +void +open_tlog(char *file, size_t bytes, int level, int append) +{ + tapdisk_log.size = ((bytes + 511) & (~511)); + + if (asprintf(&tapdisk_log.file, "%s.%d", file, getpid()) == -1) + return; + + if (posix_memalign((void **)&tapdisk_log.buf, 512, tapdisk_log.size)) { + free(tapdisk_log.file); + tapdisk_log.buf = NULL; + return; + } + + memset(tapdisk_log.buf, 0, tapdisk_log.size); + + tapdisk_log.p = tapdisk_log.buf; + tapdisk_log.level = level; + tapdisk_log.append = append; +} + +void +close_tlog(void) +{ + if (!tapdisk_log.buf) + return; + + if (tapdisk_log.append) + tlog_flush(); + + free(tapdisk_log.buf); + free(tapdisk_log.file); + + memset(&tapdisk_log, 0, sizeof(struct tlog)); +} + +void +__tlog_write(int level, const char *func, const char *fmt, ...) 
+{ + char *buf; + va_list ap; + struct timeval t; + int ret, len, avail; + + if (!tapdisk_log.buf) + return; + + if (level > tapdisk_log.level) + return; + + avail = tapdisk_log.size - (tapdisk_log.p - tapdisk_log.buf); + if (avail < MAX_ENTRY_LEN) { + if (tapdisk_log.append) + tlog_flush(); + tapdisk_log.p = tapdisk_log.buf; + } + + buf = tapdisk_log.p; + gettimeofday(&t, NULL); + len = snprintf(buf, MAX_ENTRY_LEN - 1, "%08"PRIu64":%010ld.%06ld:" + "%s ", tapdisk_log.cnt, t.tv_sec, t.tv_usec, func); + + va_start(ap, fmt); + ret = vsnprintf(buf + len, MAX_ENTRY_LEN - (len + 1), fmt, ap); + va_end(ap); + + len = (ret < MAX_ENTRY_LEN - (len + 1) ? + len + ret : MAX_ENTRY_LEN - 1); + buf[len] = '\0'; + + tapdisk_log.cnt++; + tapdisk_log.p += len; +} + +void +__tlog_error(int err, const char *func, const char *fmt, ...) +{ + va_list ap; + int i, len, ret; + struct error *e; + struct timeval t; + + err = (err > 0 ? err : -err); + + for (i = 0; i < tapdisk_err.cnt; i++) { + e = &tapdisk_err.errors[i]; + if (e->err == err && e->func == func) { + e->cnt++; + return; + } + } + + if (tapdisk_err.cnt >= MAX_ERROR_MESSAGES) { + tapdisk_err.dropped++; + return; + } + + gettimeofday(&t, NULL); + e = &tapdisk_err.errors[tapdisk_err.cnt]; + + len = snprintf(e->msg, MAX_ENTRY_LEN - 1, "%010ld.%06ld:%s ", + t.tv_sec, t.tv_usec, func); + + va_start(ap, fmt); + ret = vsnprintf(e->msg + len, MAX_ENTRY_LEN - (len + 1), fmt, ap); + va_end(ap); + + len = (ret < MAX_ENTRY_LEN - (len + 1) ? 
+ len + ret : MAX_ENTRY_LEN - 1); + e->msg[len] = '\0'; + + e->cnt++; + e->err = err; + e->func = (char *)func; + tapdisk_err.cnt++; +} + +void +tlog_print_errors(void) +{ + int i; + struct error *e; + + for (i = 0; i < tapdisk_err.cnt; i++) { + e = &tapdisk_err.errors[i]; + syslog(LOG_INFO, "TAPDISK ERROR: errno %d at %s (cnt = %d): " + "%s\n", e->err, e->func, e->cnt, e->msg); + } + + if (tapdisk_err.dropped) + syslog(LOG_INFO, "TAPDISK ERROR: %d other error messages " + "dropped\n", tapdisk_err.dropped); +} + +void +tlog_flush_errors(void) +{ + int i; + struct error *e; + + for (i = 0; i < tapdisk_err.cnt; i++) { + e = &tapdisk_err.errors[i]; + tlog_write(TLOG_WARN, "TAPDISK ERROR: errno %d at %s " + "(cnt = %d): %s\n", e->err, e->func, e->cnt, + e->msg); + } + + if (tapdisk_err.dropped) + tlog_write(TLOG_WARN, "TAPDISK ERROR: %d other error messages " + "dropped\n", tapdisk_err.dropped); +} + +void +tlog_flush(void) +{ + int fd, flags; + size_t size, wsize; + + if (!tapdisk_log.buf) + return; + + flags = O_CREAT | O_WRONLY | O_DIRECT | O_NONBLOCK; + if (!tapdisk_log.append) + flags |= O_TRUNC; + + fd = open(tapdisk_log.file, flags, 0644); + if (fd == -1) + return; + + if (tapdisk_log.append) + if (lseek64(fd, 0, SEEK_END) == (loff_t)-1) + goto out; + + tlog_flush_errors(); + + size = tapdisk_log.p - tapdisk_log.buf; + wsize = ((size + 511) & (~511)); + + memset(tapdisk_log.buf + size, '\n', wsize - size); + write(fd, tapdisk_log.buf, wsize); + + tapdisk_log.p = tapdisk_log.buf; + +out: + close(fd); +} diff --git a/tools/blktap2/drivers/tapdisk-log.h b/tools/blktap2/drivers/tapdisk-log.h new file mode 100644 index 0000000000..ae2a408dd4 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-log.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _TAPDISK_LOG_H_ +#define _TAPDISK_LOG_H_ + /* Log levels, lowest number first; the level given to open_tlog() is the highest one recorded. NOTE(review): size_t is used below without including <stddef.h> -- relies on the includer. */ +#define TLOG_WARN 0 +#define TLOG_INFO 1 +#define TLOG_DBG 2 + /* Log lifecycle: open_tlog() takes the output file prefix, buffer size in bytes, maximum level and an append flag. */ +void open_tlog(char *file, size_t bytes, int level, int append); +void close_tlog(void); +void tlog_flush(void); +void tlog_print_errors(void); + /* printf-style back ends; call them through the tlog_write()/tlog_error() macros, which pass __func__ as func. */ +void __tlog_write(int level, const char *func, const char *fmt, ...) + __attribute__((format(printf, 3, 4))); +void __tlog_error(int err, const char *func, const char *fmt, ...) 
+ __attribute__((format(printf, 3, 4))); + /* Log a formatted message at _level, tagged with the calling function's name. */ +#define tlog_write(_level, _f, _a...) \ + __tlog_write(_level, __func__, _f, ##_a) + /* Record a formatted error for errno _err, tagged with the calling function's name. */ +#define tlog_error(_err, _f, _a...) \ + __tlog_error(_err, __func__, _f, ##_a) + +#endif diff --git a/tools/blktap2/drivers/tapdisk-queue.c b/tools/blktap2/drivers/tapdisk-queue.c new file mode 100644 index 0000000000..5461d415e0 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-queue.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <libaio.h> + +#include "tapdisk.h" +#include "tapdisk-log.h" +#include "tapdisk-queue.h" +#include "tapdisk-filter.h" +#include "atomicio.h" + +#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a) +#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a) +#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a) + +/* + * We used a kernel patch to return an fd associated with the AIO context + * so that we can concurrently poll on synchronous and async descriptors. + * This is signalled by passing 1 as the io context to io_setup. 
+ */
+#define REQUEST_ASYNC_FD 1
+
+/*
+ * Append tiocb to the array of iocbs awaiting submission and link it
+ * behind the previously queued tiocb through ->next.
+ */
+static inline void
+queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	struct iocb *iocb = &tiocb->iocb;
+
+	if (queue->queued) {
+		struct tiocb *prev = (struct tiocb *)
+			queue->iocbs[queue->queued - 1]->data;
+		prev->next = tiocb;
+	}
+
+	queue->iocbs[queue->queued++] = iocb;
+}
+
+/* Return non-zero when the deferred list holds at least one tiocb. */
+static inline int
+deferred_tiocbs(struct tqueue *queue)
+{
+	return (queue->deferred.head != NULL);
+}
+
+/*
+ * Append tiocb to the tail of the deferred list and bump both the
+ * current deferred count and the running deferrals counter.
+ */
+static inline void
+defer_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	struct tlist *list = &queue->deferred;
+
+	if (!list->head)
+		list->head = list->tail = tiocb;
+	else
+		list->tail = list->tail->next = tiocb;
+
+	queue->tiocbs_deferred++;
+	queue->deferrals++;
+}
+
+/*
+ * Move the head of the deferred list (if any) onto the submission array.
+ */
+static inline void
+queue_deferred_tiocb(struct tqueue *queue)
+{
+	struct tlist *list = &queue->deferred;
+
+	if (list->head) {
+		struct tiocb *tiocb = list->head;
+
+		list->head = tiocb->next;
+		if (!list->head)
+			list->tail = NULL;
+
+		queue_tiocb(queue, tiocb);
+		queue->tiocbs_deferred--;
+	}
+}
+
+/* Drain the deferred list until it is empty or the queue reports full. */
+static inline void
+queue_deferred_tiocbs(struct tqueue *queue)
+{
+	while (!tapdisk_queue_full(queue) && deferred_tiocbs(queue))
+		queue_deferred_tiocb(queue);
+}
+
+/*
+ * td_complete may queue more tiocbs
+ *
+ * Translate the raw result into an error code and invoke the tiocb's
+ * callback: 0 when res equals the requested byte count, res itself when
+ * it is negative as an int, -EIO for any other (short) transfer.
+ */
+static void
+complete_tiocb(struct tqueue *queue, struct tiocb *tiocb, unsigned long res)
+{
+	int err;
+	struct iocb *iocb = &tiocb->iocb;
+
+	if (res == iocb->u.c.nbytes)
+		err = 0;
+	else if ((int)res < 0)
+		err = (int)res;
+	else
+		err = -EIO;
+
+	tiocb->cb(tiocb->arg, tiocb, err);
+}
+
+/*
+ * Complete every currently queued tiocb with err and return how many
+ * were queued when the call started.
+ */
+static int
+cancel_tiocbs(struct tqueue *queue, int err)
+{
+	int queued;
+	struct tiocb *tiocb;
+
+	if (!queue->queued)
+		return 0;
+
+	/*
+	 * td_complete may queue more tiocbs, which
+	 * will overwrite the contents of queue->iocbs.
+	 * use a private linked list to keep track
+	 * of the tiocbs we're cancelling.
+	 */
+	tiocb = (struct tiocb *)queue->iocbs[0]->data;
+	queued = queue->queued;
+	/* reset the count first so callbacks may queue new tiocbs */
+	queue->queued = 0;
+
+	for (; tiocb != NULL; tiocb = tiocb->next)
+		complete_tiocb(queue, tiocb, err);
+
+	return queued;
+}
+
+/*
+ * Log a failed io_submit, put the iocbs that were not submitted back on
+ * the queue in split form, and cancel them with err.  Returns the count
+ * reported by cancel_tiocbs().
+ */
+static int
+fail_tiocbs(struct tqueue *queue, int succeeded, int total, int err)
+{
+	ERR(err, "io_submit error: %d of %d failed",
+	    total - succeeded, total);
+
+	/* take any non-submitted, merged iocbs
+	 * off of the queue, split them, and fail them */
+	queue->queued = io_expand_iocbs(&queue->opioctx,
+					queue->iocbs, succeeded, total);
+
+	return cancel_tiocbs(queue, err);
+}
+
+/*
+ * Perform one iocb synchronously: seek to its offset, then transfer
+ * nbytes through atomicio().  Returns the byte count on success or
+ * -errno when the seek fails or the transfer is short.
+ */
+static inline ssize_t
+iocb_rw(struct iocb *iocb)
+{
+	int fd = iocb->aio_fildes;
+	char *buf = iocb->u.c.buf;
+	long long off = iocb->u.c.offset;
+	size_t size = iocb->u.c.nbytes;
+	ssize_t (*func)(int, void *, size_t) =
+		(iocb->aio_lio_opcode == IO_CMD_PWRITE ? vwrite : read);
+
+	if (lseek64(fd, off, SEEK_SET) == (off64_t)-1)
+		return -errno;
+
+	if (atomicio(func, fd, buf, size) != size)
+		return -errno;
+
+	return size;
+}
+
+/*
+ * Synchronous counterpart of submit + complete: filter and merge the
+ * queued iocbs, run each merged iocb with iocb_rw(), split the results
+ * back into per-tiocb events and invoke the callbacks.  Returns the
+ * number of events after splitting.
+ */
+static int
+io_synchronous_rw(struct tqueue *queue)
+{
+	int i, merged, split;
+	struct iocb *iocb;
+	struct tiocb *tiocb;
+	struct io_event *ep;
+
+	if (!queue->queued)
+		return 0;
+
+	tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+	merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+
+	queue->queued = 0;
+
+	/* fill in one event per merged iocb, as the aio layer would */
+	for (i = 0; i < merged; i++) {
+		ep = queue->aio_events + i;
+		iocb = queue->iocbs[i];
+		ep->obj = iocb;
+		ep->res = iocb_rw(iocb);
+	}
+
+	split = io_split(&queue->opioctx, queue->aio_events, merged);
+	tapdisk_filter_events(queue->filter, queue->aio_events, split);
+
+	for (i = split, ep = queue->aio_events; i-- > 0; ep++) {
+		iocb = ep->obj;
+		tiocb = (struct tiocb *)iocb->data;
+		complete_tiocb(queue, tiocb, ep->res);
+	}
+
+	queue_deferred_tiocbs(queue);
+
+	return split;
+}
+
+/*
+ * Initialise queue for size requests.  A non-zero sync selects the
+ * synchronous path; filter is stored as given.  Returns 0 or a negative
+ * error code.
+ */
+int
+tapdisk_init_queue(struct tqueue *queue, int size,
+		   int sync, struct tfilter *filter)
+{
+	int i, err;
+
+	memset(queue, 0,
sizeof(struct tqueue));
+
+	queue->size = size;
+	queue->sync = sync;
+	queue->filter = filter;
+
+	if (sync) {
+		/* set up a pipe so we can return
+		 * a poll fd that won't fire. */
+		if (pipe(queue->dummy_pipe))
+			return -errno;
+		queue->poll_fd = queue->dummy_pipe[0];
+	} else {
+		/* REQUEST_ASYNC_FD asks the patched io_setup for a poll fd */
+		queue->aio_ctx = (io_context_t)REQUEST_ASYNC_FD;
+		queue->poll_fd = io_setup(size, &queue->aio_ctx);
+
+		if (queue->poll_fd < 0) {
+			if (queue->poll_fd == -EAGAIN)
+				DPRINTF("Couldn't setup AIO context. If you "
+					"are trying to concurrently use a "
+					"large number of blktap-based disks, "
+					"you may need to increase the "
+					"system-wide aio request limit. "
+					"(e.g. 'echo 1048576 > /proc/sys/fs/"
+					"aio-max-nr')\n");
+			else
+				DPRINTF("Couldn't get fd for AIO poll "
+					"support. This is probably because "
+					"your kernel does not have the "
+					"aio-poll patch applied.\n");
+			return queue->poll_fd;
+		}
+	}
+
+	/* default error for the two allocations below */
+	err = -ENOMEM;
+	queue->iocbs = calloc(size, sizeof(struct iocb *));
+	queue->aio_events = calloc(size, sizeof(struct io_event));
+	if (!queue->iocbs || !queue->aio_events)
+		goto fail;
+
+	err = opio_init(&queue->opioctx, size);
+	if (err)
+		goto fail;
+
+	return 0;
+
+ fail:
+	tapdisk_free_queue(queue);
+	return err;
+}
+
+/*
+ * Release everything tapdisk_init_queue() set up: the dummy pipe (sync
+ * mode) or the aio context (async mode), the iocb and event arrays, and
+ * the opio context.
+ */
+void
+tapdisk_free_queue(struct tqueue *queue)
+{
+	if (queue->sync) {
+		close(queue->dummy_pipe[0]);
+		close(queue->dummy_pipe[1]);
+	} else
+		io_destroy(queue->aio_ctx);
+
+	free(queue->iocbs);
+	free(queue->aio_events);
+	opio_free(&queue->opioctx);
+}
+
+/*
+ * Log the queue counters and every deferred request at warning level.
+ */
+void
+tapdisk_debug_queue(struct tqueue *queue)
+{
+	struct tiocb *tiocb = queue->deferred.head;
+
+	WARN("TAPDISK QUEUE:\n");
+	WARN("size: %d, sync: %d, queued: %d, iocbs_pending: %d, "
+	     "tiocbs_pending: %d, tiocbs_deferred: %d, deferrals: %"PRIx64"\n",
+	     queue->size, queue->sync, queue->queued, queue->iocbs_pending,
+	     queue->tiocbs_pending, queue->tiocbs_deferred, queue->deferrals);
+
+	if (tiocb) {
+		WARN("deferred:\n");
+		for (; tiocb != NULL; tiocb = tiocb->next) {
+			struct iocb *io = &tiocb->iocb;
+
WARN("%s of %lu bytes at %lld\n",
+			     (io->aio_lio_opcode == IO_CMD_PWRITE ?
+			      "write" : "read"),
+			     io->u.c.nbytes, io->u.c.offset);
+		}
+	}
+}
+
+/*
+ * Prepare tiocb as a write (rw non-zero) or read (rw zero) of size bytes
+ * at offset on fd, and record the completion callback cb with its
+ * argument arg.
+ */
+void
+tapdisk_prep_tiocb(struct tiocb *tiocb, int fd, int rw, char *buf, size_t size,
+		   long long offset, td_queue_callback_t cb, void *arg)
+{
+	struct iocb *iocb = &tiocb->iocb;
+
+	if (rw)
+		io_prep_pwrite(iocb, fd, buf, size, offset);
+	else
+		io_prep_pread(iocb, fd, buf, size, offset);
+
+	/* back-pointer used to recover the tiocb from a completed iocb */
+	iocb->data = tiocb;
+	tiocb->cb = cb;
+	tiocb->arg = arg;
+	tiocb->next = NULL;
+}
+
+/* Queue tiocb for submission, or defer it when the queue is full. */
+void
+tapdisk_queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+	if (!tapdisk_queue_full(queue))
+		queue_tiocb(queue, tiocb);
+	else
+		defer_tiocb(queue, tiocb);
+}
+
+/*
+ * fail_tiocbs may queue more tiocbs
+ *
+ * Submit everything currently queued.  Sync queues are handled by
+ * io_synchronous_rw(); otherwise the iocbs are filtered, merged and
+ * handed to io_submit().  Requests that could not be submitted are
+ * failed.  Returns the number of merged iocbs submitted (the split event
+ * count in sync mode).
+ */
+int
+tapdisk_submit_tiocbs(struct tqueue *queue)
+{
+	int merged, submitted, err = 0;
+
+	if (!queue->queued)
+		return 0;
+
+	if (queue->sync)
+		return io_synchronous_rw(queue);
+
+	tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+	merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+	submitted = io_submit(queue->aio_ctx, merged, queue->iocbs);
+
+	DBG("queued: %d, merged: %d, submitted: %d\n",
+	    queue->queued, merged, submitted);
+
+	if (submitted < 0) {
+		err = submitted;
+		submitted = 0;
+	} else if (submitted < merged)
+		err = -EIO;
+
+	queue->iocbs_pending += submitted;
+	queue->tiocbs_pending += queue->queued;
+	queue->queued = 0;
+
+	/* failed tiocbs were counted as pending above; take them back out */
+	if (err)
+		queue->tiocbs_pending -=
+			fail_tiocbs(queue, submitted, merged, err);
+
+	return submitted;
+}
+
+/*
+ * Call tapdisk_submit_tiocbs() repeatedly until nothing is left queued;
+ * returns the accumulated count.
+ */
+int
+tapdisk_submit_all_tiocbs(struct tqueue *queue)
+{
+	int submitted = 0;
+
+	do {
+		submitted += tapdisk_submit_tiocbs(queue);
+	} while (!tapdisk_queue_empty(queue));
+
+	return submitted;
+}
+
+/*
+ * Collect finished aio events without blocking (min_nr is 0), split them
+ * back into per-tiocb events and run the callbacks.
+ */
+int
+tapdisk_complete_tiocbs(struct tqueue *queue)
+{
+	int i, ret, split;
+	struct iocb *iocb;
+	struct tiocb *tiocb;
+	struct io_event *ep;
+
+	ret = io_getevents(queue->aio_ctx, 0,
+			   queue->size, queue->aio_events, NULL);
+	split =
io_split(&queue->opioctx, queue->aio_events, ret);
+	/* NOTE(review): ret is used here and below without a check for a
+	 * negative (error) result -- confirm io_getevents cannot fail here */
+	tapdisk_filter_events(queue->filter, queue->aio_events, split);
+
+	DBG("events: %d, tiocbs: %d\n", ret, split);
+
+	queue->iocbs_pending -= ret;
+	queue->tiocbs_pending -= split;
+
+	for (i = split, ep = queue->aio_events; i-- > 0; ep++) {
+		iocb = ep->obj;
+		tiocb = (struct tiocb *)iocb->data;
+		complete_tiocb(queue, tiocb, ep->res);
+	}
+
+	/* completions freed slots; pull deferred requests back in */
+	queue_deferred_tiocbs(queue);
+
+	return split;
+}
+
+/*
+ * cancel_tiocbs may queue more tiocbs
+ *
+ * Fail every queued (not yet submitted) tiocb with -EIO.
+ */
+int
+tapdisk_cancel_tiocbs(struct tqueue *queue)
+{
+	return cancel_tiocbs(queue, -EIO);
+}
+
+/*
+ * Call tapdisk_cancel_tiocbs() repeatedly until nothing is left queued;
+ * returns the accumulated count.
+ */
+int
+tapdisk_cancel_all_tiocbs(struct tqueue *queue)
+{
+	int cancelled = 0;
+
+	do {
+		cancelled += tapdisk_cancel_tiocbs(queue);
+	} while (!tapdisk_queue_empty(queue));
+
+	return cancelled;
+}
diff --git a/tools/blktap2/drivers/tapdisk-queue.h b/tools/blktap2/drivers/tapdisk-queue.h
new file mode 100644
index 0000000000..40ff88669c
--- /dev/null
+++ b/tools/blktap2/drivers/tapdisk-queue.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef TAPDISK_QUEUE_H +#define TAPDISK_QUEUE_H + +#include <libaio.h> + +#include "io-optimize.h" + +struct tiocb; +struct tfilter; + +typedef void (*td_queue_callback_t)(void *arg, struct tiocb *, int err); + + +struct tiocb { + td_queue_callback_t cb; + void *arg; + + struct iocb iocb; + struct tiocb *next; +}; + +struct tlist { + struct tiocb *head; + struct tiocb *tail; +}; + +struct tqueue { + int size; + int sync; + + int poll_fd; + io_context_t aio_ctx; + struct opioctx opioctx; + int dummy_pipe[2]; + + int queued; + struct iocb **iocbs; + struct io_event *aio_events; + + /* number of iocbs pending in the aio layer */ + int iocbs_pending; + + /* number of tiocbs pending in the queue -- + * this is likely to be larger than iocbs_pending + * due to request coalescing */ + int tiocbs_pending; + + /* iocbs may be deferred if the aio ring is full. + * tapdisk_queue_complete will ensure deferred + * iocbs are queued as slots become available. 
*/ + struct tlist deferred; + int tiocbs_deferred; + + /* optional tapdisk filter */ + struct tfilter *filter; + + uint64_t deferrals; +}; + +/* + * Interface for request producer (i.e., tapdisk) + * NB: the following functions may cause additional tiocbs to be queued: + * - tapdisk_submit_tiocbs + * - tapdisk_cancel_tiocbs + * - tapdisk_complete_tiocbs + * The *_all_tiocbs variants will handle the first two cases; + * be sure to call submit after calling complete in the third case. + */ +#define tapdisk_queue_count(q) ((q)->queued) +#define tapdisk_queue_empty(q) ((q)->queued == 0) +#define tapdisk_queue_full(q) \ + (((q)->tiocbs_pending + (q)->queued) >= (q)->size) +int tapdisk_init_queue(struct tqueue *, int size, int sync, struct tfilter *); +void tapdisk_free_queue(struct tqueue *); +void tapdisk_debug_queue(struct tqueue *); +void tapdisk_queue_tiocb(struct tqueue *, struct tiocb *); +int tapdisk_submit_tiocbs(struct tqueue *); +int tapdisk_submit_all_tiocbs(struct tqueue *); +int tapdisk_complete_tiocbs(struct tqueue *); +int tapdisk_cancel_tiocbs(struct tqueue *); +int tapdisk_cancel_all_tiocbs(struct tqueue *); +void tapdisk_prep_tiocb(struct tiocb *, int, int, char *, size_t, + long long, td_queue_callback_t, void *); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-ring.c b/tools/blktap2/drivers/tapdisk-ring.c new file mode 100644 index 0000000000..a5d40cb0a1 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-ring.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "tapdisk-ring.h"
+
+/*
+ * Create the listening UNIX-domain control socket at ring->ctlfd_path,
+ * unlinking any stale socket file first.  On success the descriptor is
+ * stored in ring->ctlfd and 0 is returned; otherwise a negative errno.
+ */
+static int
+tapdisk_uring_create_ctlfd(td_uring_t *ring)
+{
+	int fd, err;
+	struct sockaddr_un saddr;
+
+	/* the path plus its terminating NUL must fit in sun_path; bounding
+	 * it by sun_family would reject every path of two or more chars */
+	if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >=
+	    sizeof(saddr.sun_path))
+		return -ENAMETOOLONG;
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1)
+		return -errno;
+
+	memset(&saddr, 0, sizeof(struct sockaddr_un));
+	saddr.sun_family = AF_UNIX;
+	memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
+
+	err = unlink(ring->ctlfd_path);
+	if (err == -1 && errno != ENOENT) {
+		err = -errno;
+		goto fail;
+	}
+
+	err = bind(fd, (struct sockaddr *)&saddr, sizeof(struct sockaddr_un));
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	err = listen(fd, 1);
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	ring->ctlfd = fd;
+	return 0;
+
+fail:
+	close(fd);
+	return err;
+}
+
+/*
+ * Close the control socket (if set), remove its filesystem entry and
+ * free the path string.
+ */
+static void
+tapdisk_uring_destroy_ctlfd(td_uring_t *ring)
+{
+	if (ring->ctlfd) {
+		close(ring->ctlfd);
+		ring->ctlfd = 0;
+	}
+
+	if (ring->ctlfd_path) {
+		unlink(ring->ctlfd_path);
+		free(ring->ctlfd_path);
+		ring->ctlfd_path = NULL;
+	}
+}
+
+/*
+ * Connect to an existing control socket at ring->ctlfd_path.  On success
+ * the descriptor is stored in ring->ctlfd and 0 is returned; otherwise a
+ * negative errno.
+ */
+static int
+tapdisk_uring_connect_ctlfd(td_uring_t *ring)
+{
+	int fd, err;
+	struct sockaddr_un saddr;
+
+	if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >=
+	    sizeof(saddr.sun_path))
+		return -ENAMETOOLONG;
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1)
+		return -errno;
+
+	memset(&saddr, 0, sizeof(struct sockaddr_un));
+	saddr.sun_family = AF_UNIX;
+	memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
+
+	err = connect(fd, (struct sockaddr *)&saddr, sizeof(saddr));
+	if (err == -1) {
+		err = -errno;
+		goto fail;
+	}
+
+	ring->ctlfd = fd;
+	return 0;
+
+fail:
+	close(fd);
+	return err;
+}
+
+/*
+ * Client-side teardown: close the control socket (if set) and free the
+ * path string.  The socket file is left in place.
+ */
+static void
+tapdisk_uring_disconnect_ctlfd(td_uring_t *ring)
+{
+	if (ring->ctlfd)
+		close(ring->ctlfd);
+	free(ring->ctlfd_path);
+	ring->ctlfd_path = NULL;
+}
+
+/*
+ * Create the shared memory object named by ring->shmem_path, size it to
+ * ring->shmem_size and map it at ring->shmem.
+ */
+static int
+tapdisk_uring_create_shmem(td_uring_t *ring)
+{
+	int fd, err;
+
+	fd =
shm_open(ring->shmem_path, O_CREAT | O_RDWR, 0750);
+	if (fd == -1)
+		return -errno;
+
+	err = ftruncate(fd, ring->shmem_size);
+	if (err == -1) {
+		err = -errno;
+		goto out;
+	}
+
+	ring->shmem = mmap(NULL, ring->shmem_size,
+			   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (ring->shmem == MAP_FAILED) {
+		err = -errno;
+		ring->shmem = NULL;
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	/* the mapping stays valid after the descriptor is closed */
+	close(fd);
+	return err;
+}
+
+/*
+ * Unmap the shared region (if mapped), remove the shared memory object
+ * and free the path string.
+ */
+static void
+tapdisk_uring_destroy_shmem(td_uring_t *ring)
+{
+	if (ring->shmem) {
+		munmap(ring->shmem, ring->shmem_size);
+		ring->shmem = NULL;
+	}
+
+	if (ring->shmem_path) {
+		shm_unlink(ring->shmem_path);
+		free(ring->shmem_path);
+		ring->shmem_path = NULL;
+	}
+}
+
+/*
+ * Attach to an existing shared memory object.  The header is mapped
+ * first and copied out so that the cookie and version can be validated
+ * and the sizes read; the full region is then mapped at ring->shmem.
+ * Returns 0, -EINVAL for a bad cookie or version, or a negative errno.
+ */
+static int
+tapdisk_uring_connect_shmem(td_uring_t *ring)
+{
+	int fd, err;
+	td_uring_header_t header, *p;
+
+	/* shm_open takes a mode argument even when O_CREAT is not set */
+	fd = shm_open(ring->shmem_path, O_RDWR, 0);
+	if (fd == -1)
+		return -errno;
+
+	p = mmap(NULL, sizeof(td_uring_header_t),
+		 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (p == MAP_FAILED) {
+		err = -errno;
+		goto out;
+	}
+
+	memcpy(&header, p, sizeof(td_uring_header_t));
+	munmap(p, sizeof(td_uring_header_t));
+
+	if (memcmp(header.cookie,
+		   TAPDISK_URING_COOKIE, sizeof(header.cookie))) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (header.version != TD_URING_CURRENT_VERSION) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	ring->ring_size = header.ring_size;
+	ring->data_size = header.data_size;
+	ring->shmem_size = header.shmem_size;
+
+	ring->shmem = mmap(NULL, ring->shmem_size,
+			   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (ring->shmem == MAP_FAILED) {
+		err = -errno;
+		ring->shmem = NULL;
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	close(fd);
+	return err;
+}
+
+/*
+ * Client-side teardown: unmap the shared region (if mapped) and free the
+ * path string.  The shared memory object itself is left in place.
+ */
+static void
+tapdisk_uring_disconnect_shmem(td_uring_t *ring)
+{
+	if (ring->shmem)
+		munmap(ring->shmem, ring->shmem_size);
+	free(ring->shmem_path);
+	ring->shmem_path = NULL;
+}
+
+/*
+ * Create a ring at location: a control socket at "<location>.cfd" and a
+ * shared memory object "<location>.shm" holding the header, ring_size
+ * bytes of ring and data_size bytes of data.
+ */
+int
+tapdisk_uring_create(td_uring_t *ring, const char *location,
+		    uint32_t ring_size, uint32_t data_size)
+{
+	int fd, err;
+
+	memset(ring, 0,
sizeof(td_uring_t));
+
+	ring->ring_size = ring_size;
+	ring->data_size = data_size;
+	ring->shmem_size = ring_size + data_size + sizeof(td_uring_header_t);
+
+	err = asprintf(&ring->shmem_path, "%s.shm", location);
+	if (err == -1) {
+		ring->shmem_path = NULL;
+		err = -errno;
+		goto fail;
+	}
+
+	err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+	if (err == -1) {
+		ring->ctlfd_path = NULL;
+		err = -errno;
+		goto fail;
+	}
+
+	err = tapdisk_uring_create_ctlfd(ring);
+	if (err)
+		goto fail;
+
+	err = tapdisk_uring_create_shmem(ring);
+	if (err)
+		goto fail;
+
+	/* layout of the shared region: header, then ring, then data */
+	ring->ring_area = (char *)ring->shmem + sizeof(td_uring_header_t);
+	ring->data_area = (char *)ring->ring_area + ring->ring_size;
+
+	return 0;
+
+fail:
+	tapdisk_uring_destroy(ring);
+	return err;
+}
+
+/*
+ * Tear down a ring made by tapdisk_uring_create(), removing the shared
+ * memory object and the control socket.  Always returns 0.
+ */
+int
+tapdisk_uring_destroy(td_uring_t *ring)
+{
+	tapdisk_uring_destroy_shmem(ring);
+	tapdisk_uring_destroy_ctlfd(ring);
+	return 0;
+}
+
+/*
+ * Attach to an existing ring at location: connect to "<location>.cfd"
+ * and map "<location>.shm".  Returns 0 on success.  On failure whatever
+ * was set up is released through tapdisk_uring_disconnect() and a
+ * negative error code is returned.
+ */
+int
+tapdisk_uring_connect(td_uring_t *ring, const char *location)
+{
+	int err;
+
+	memset(ring, 0, sizeof(td_uring_t));
+
+	err = asprintf(&ring->shmem_path, "%s.shm", location);
+	if (err == -1) {
+		ring->shmem_path = NULL;
+		err = -errno;
+		goto fail;
+	}
+
+	err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+	if (err == -1) {
+		ring->ctlfd_path = NULL;
+		err = -errno;
+		goto fail;
+	}
+
+	err = tapdisk_uring_connect_ctlfd(ring);
+	if (err)
+		goto fail;
+
+	err = tapdisk_uring_connect_shmem(ring);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	/* mirrors the cleanup tapdisk_uring_create() does on failure */
+	tapdisk_uring_disconnect(ring);
+	return err;
+}
+
+/*
+ * Detach from a ring joined with tapdisk_uring_connect().  Always
+ * returns 0.
+ */
+int
+tapdisk_uring_disconnect(td_uring_t *ring)
+{
+	tapdisk_uring_disconnect_shmem(ring);
+	tapdisk_uring_disconnect_ctlfd(ring);
+	return 0;
+}
+
+/*
+ * Read exactly one td_uring_message_t from fd, waiting with select().
+ * A non-zero timeout is the select timeout in seconds; zero waits
+ * indefinitely.  Returns 0 on success or -EIO when the full message
+ * could not be read.
+ */
+static int
+tapdisk_ring_read_message(int fd, td_uring_message_t *message, int timeout)
+{
+	fd_set readfds;
+	int ret, len, offset;
+	struct timeval tv, *t;
+
+	t = NULL;
+	offset = 0;
+	len = sizeof(td_uring_message_t);
+
+	if (timeout) {
+		tv.tv_sec = timeout;
+		tv.tv_usec = 0;
+		t = &tv;
+	}
+
+	while (offset < len) {
+		FD_ZERO(&readfds);
+
FD_SET(fd, &readfds); + + /* we don't bother reinitializing tv. at worst, it will wait a + * bit more time than expected. */ + + ret = select(fd + 1, &readfds, NULL, NULL, t); + if (ret == -1) + break; + else if (FD_ISSET(fd, &readfds)) { + ret = read(fd, message + offset, len - offset); + if (ret <= 0) + break; + offset += ret; + } else + break; + } + + if (offset != len) + return -EIO; + + return 0; +} + +static int +tapdisk_ring_write_message(int fd, td_uring_message_t *message, int timeout) +{ + fd_set writefds; + int ret, len, offset; + struct timeval tv, *t; + + t = NULL; + offset = 0; + len = sizeof(td_uring_message_t); + + if (timeout) { + tv.tv_sec = timeout; + tv.tv_usec = 0; + t = &tv; + } + + while (offset < len) { + FD_ZERO(&writefds); + FD_SET(fd, &writefds); + + /* we don't bother reinitializing tv. at worst, it will wait a + * bit more time than expected. */ + + ret = select(fd + 1, NULL, &writefds, NULL, t); + if (ret == -1) + break; + else if (FD_ISSET(fd, &writefds)) { + ret = write(fd, message + offset, len - offset); + if (ret <= 0) + break; + offset += ret; + } else + break; + } + + if (offset != len) + return -EIO; + + return 0; +} + +int +tapdisk_uring_poll(td_uring_t *ring) +{ + int err; + td_uring_message_t message; + + err = tapdisk_uring_read_message(ring->ctlfd, &message, 1); + if (err) + return err; + + if (message.type != TAPDISK_URING_MESSAGE_KICK) + return -EINVAL; + + return 0; +} + +int +tapdisk_uring_kick(td_uring_t *ring) +{ + td_uring_message_t message; + + memset(&message, 0, sizeof(td_uring_message_t)); + message.type = TAPDISK_URING_MESSAGE_KICK; + + return tapdisk_uring_write_message(ring->ctlfd, &message, 1); +} diff --git a/tools/blktap2/drivers/tapdisk-ring.h b/tools/blktap2/drivers/tapdisk-ring.h new file mode 100644 index 0000000000..a70ee10609 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-ring.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _TAPDISK_RING_H_ +#define _TAPDISK_RING_H_ + +#include <inttypes.h> + +#include <xenctrl.h> +#include <xen/io/ring.h> + +typedef struct td_uring td_uring_t; +typedef struct td_uring_header td_uring_header_t; +typedef struct td_uring_request td_uring_request_t; +typedef struct td_uring_response td_uring_response_t; + +struct td_uring { + int ctlfd; + + char *shmem_path; + char *ctlfd_path; + + void *shmem; + void *ring_area; + void *data_area; +}; + +struct td_uring_header { + char cookie[8]; + uint32_t version; + uint32_t shmem_size; + uint32_t ring_size; + uint32_t data_size; + char reserved[4064]; +}; + +struct td_uring_request { + uint8_t op; + uint64_t id; + uint64_t sec; + uint32_t secs; + uint32_t offset; +}; + +struct td_uring_response { + uint8_t op; + uint64_t id; + uint8_t status; +}; + +DEFINE_RING_TYPES(td_uring, td_uring_request_t, td_uring_response_t); + +int tapdisk_uring_create(td_uring_t *, const char *location, + uint32_t ring_size, uint32_t data_size); +int tapdisk_uring_destroy(td_uring_t *); + +int tapdisk_uring_connect(td_uring_t *, const char *location); +int tapdisk_uring_disconnect(td_uring_t *); + +int tapdisk_uring_poll(td_uring_t *); +int tapdisk_uring_kick(td_uring_t *); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-server.c b/tools/blktap2/drivers/tapdisk-server.c new file mode 100644 index 0000000000..c6a3de514e --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-server.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <sys/signal.h> + +#define TAPDISK +#include "tapdisk-utils.h" +#include "tapdisk-server.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" + +#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a) +#define ERR(_err, _f, _a...) 
tlog_error(_err, _f, ##_a) + + tapdisk_server_t server; + +#define tapdisk_server_for_each_vbd(vbd, tmp) \ + list_for_each_entry_safe(vbd, tmp, &server.vbds, next) + +struct tap_disk * +tapdisk_server_find_driver_interface(int type) +{ + int n; + + n = sizeof(dtypes) / sizeof(struct disk_info_t *); + if (type > n) + return NULL; + + return dtypes[type]->drv; +} + +td_image_t * +tapdisk_server_get_shared_image(td_image_t *image) +{ + td_vbd_t *vbd, *tmpv; + td_image_t *img, *tmpi; + + if (!td_flag_test(image->flags, TD_OPEN_SHAREABLE)) + return NULL; + + tapdisk_server_for_each_vbd(vbd, tmpv) + tapdisk_vbd_for_each_image(vbd, img, tmpi) + if (img->type == image->type && + !strcmp(img->name, image->name)) + return img; + + return NULL; +} + +td_vbd_t * +tapdisk_server_get_vbd(uint16_t uuid) +{ + td_vbd_t *vbd, *tmp; + + tapdisk_server_for_each_vbd(vbd, tmp) + if (vbd->uuid == uuid) + return vbd; + + return NULL; +} + +void +tapdisk_server_add_vbd(td_vbd_t *vbd) +{ + list_add_tail(&vbd->next, &server.vbds); +} + +void +tapdisk_server_remove_vbd(td_vbd_t *vbd) +{ + list_del(&vbd->next); + INIT_LIST_HEAD(&vbd->next); + tapdisk_server_check_state(); +} + +void +tapdisk_server_queue_tiocb(struct tiocb *tiocb) +{ + tapdisk_queue_tiocb(&server.aio_queue, tiocb); +} + +void +tapdisk_server_debug(void) +{ + td_vbd_t *vbd, *tmp; + + tapdisk_debug_queue(&server.aio_queue); + + tapdisk_server_for_each_vbd(vbd, tmp) + tapdisk_vbd_debug(vbd); + + tlog_flush(); +} + +void +tapdisk_server_check_state(void) +{ + if (list_empty(&server.vbds)) + server.run = 0; +} + +event_id_t +tapdisk_server_register_event(char mode, int fd, + int timeout, event_cb_t cb, void *data) +{ + return scheduler_register_event(&server.scheduler, + mode, fd, timeout, cb, data); +} + +void +tapdisk_server_unregister_event(event_id_t event) +{ + return scheduler_unregister_event(&server.scheduler, event); +} + +void +tapdisk_server_set_max_timeout(int seconds) +{ + scheduler_set_max_timeout(&server.scheduler, 
seconds);
+}
+
+/* Placeholder: no lock assertions are performed. */
+static void
+tapdisk_server_assert_locks(void)
+{
+
+}
+
+/*
+ * Cap the scheduler's wait at TD_VBD_RETRY_INTERVAL as soon as one vbd
+ * reports that it needs a retry.
+ */
+static void
+tapdisk_server_set_retry_timeout(void)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		if (tapdisk_vbd_retry_needed(vbd)) {
+			tapdisk_server_set_max_timeout(TD_VBD_RETRY_INTERVAL);
+			return;
+		}
+}
+
+/* Run the progress check on every vbd. */
+static void
+tapdisk_server_check_progress(void)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		tapdisk_vbd_check_progress(vbd);
+}
+
+/* Submit everything queued on the server-wide aio queue. */
+static void
+tapdisk_server_submit_tiocbs(void)
+{
+	tapdisk_submit_all_tiocbs(&server.aio_queue);
+}
+
+/* Kick every vbd. */
+static void
+tapdisk_server_kick_responses(void)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		tapdisk_vbd_kick(vbd);
+}
+
+/* Run the state check on every vbd. */
+static void
+tapdisk_server_check_vbds(void)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		tapdisk_vbd_check_state(vbd);
+}
+
+/* Kill the request queue of every vbd. */
+static void
+tapdisk_server_stop_vbds(void)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		tapdisk_vbd_kill_queue(vbd);
+}
+
+/* Write message as an error over the ipc channel of every vbd. */
+static void
+tapdisk_server_send_error(const char *message)
+{
+	td_vbd_t *vbd, *tmp;
+
+	tapdisk_server_for_each_vbd(vbd, tmp)
+		tapdisk_ipc_write_error(&vbd->ipc, message);
+}
+
+/* Scheduler callback: read from the server's ipc channel. */
+static void
+tapdisk_server_read_ipc_message(event_id_t id, char mode, void *private)
+{
+	tapdisk_ipc_read(&server.ipc);
+}
+
+/* Scheduler callback: reap completed requests from the aio queue. */
+static void
+tapdisk_server_aio_queue_event(event_id_t id, char mode, void *private)
+{
+	tapdisk_complete_tiocbs(&server.aio_queue);
+}
+
+/* Unregister the aio queue's poll event and free the queue. */
+static void
+tapdisk_server_free_aio_queue(void)
+{
+	tapdisk_server_unregister_event(server.aio_queue_event_id);
+	tapdisk_free_queue(&server.aio_queue);
+}
+
+/*
+ * Set up the server-wide aio queue and register its poll fd with the
+ * scheduler.  Returns 0 or a negative error code.
+ */
+static int
+tapdisk_server_initialize_aio_queue(void)
+{
+	int err;
+	event_id_t id;
+
+	err = tapdisk_init_queue(&server.aio_queue,
+				 TAPDISK_TIOCBS, 0, NULL);
+	if (err)
+		return err;
+
+	id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+					   server.aio_queue.poll_fd, 0,
+
tapdisk_server_aio_queue_event, + NULL); + if (id < 0) { + tapdisk_free_queue(&server.aio_queue); + return id; + } + + server.aio_queue_event_id = id; + + return 0; +} + +static void +tapdisk_server_close(void) +{ + tapdisk_server_free_aio_queue(); + + if (server.control_event) + scheduler_unregister_event(&server.scheduler, server.control_event); + + if (server.ipc.rfd != -1) + close(server.ipc.rfd); + + if (server.ipc.wfd != -1) + close(server.ipc.wfd); +} + +static void +__tapdisk_server_run(void) +{ + int ret; + + while (server.run) { + tapdisk_server_assert_locks(); + tapdisk_server_set_retry_timeout(); + tapdisk_server_check_progress(); + + ret = scheduler_wait_for_events(&server.scheduler); + if (ret < 0) + DBG(TLOG_WARN, "server wait returned %d\n", ret); + + tapdisk_server_check_vbds(); + tapdisk_server_submit_tiocbs(); + tapdisk_server_kick_responses(); + } +} + +static void +tapdisk_server_signal_handler(int signal) +{ + td_vbd_t *vbd, *tmp; + static int xfsz_error_sent = 0; + + switch (signal) { + case SIGBUS: + case SIGINT: + tapdisk_server_for_each_vbd(vbd, tmp) + tapdisk_vbd_close(vbd); + break; + + case SIGXFSZ: + ERR(EFBIG, "received SIGXFSZ"); + tapdisk_server_stop_vbds(); + if (xfsz_error_sent) + break; + + tapdisk_server_send_error("received SIGXFSZ, closing queues"); + xfsz_error_sent = 1; + break; + + case SIGUSR1: + tapdisk_server_debug(); + break; + } +} + +int +tapdisk_server_initialize(const char *read, const char *write) +{ + int err; + event_id_t event_id; + + event_id = 0; + memset(&server, 0, sizeof(tapdisk_server_t)); + server.ipc.rfd = server.ipc.wfd = -1; + + INIT_LIST_HEAD(&server.vbds); + + if (read) { + server.ipc.rfd = open(read, O_RDWR | O_NONBLOCK); + if (server.ipc.rfd < 0) { + err = -errno; + EPRINTF("FD open failed %s: %d\n", read, err); + goto fail; + } + } + + if (write) { + server.ipc.wfd = open(write, O_RDWR | O_NONBLOCK); + if (server.ipc.wfd < 0) { + err = -errno; + EPRINTF("FD open failed %s, %d\n", write, err); + 
goto fail; + } + } + + scheduler_initialize(&server.scheduler); + + if (read) { + event_id = scheduler_register_event(&server.scheduler, + SCHEDULER_POLL_READ_FD, + server.ipc.rfd, 0, + tapdisk_server_read_ipc_message, + NULL); + if (event_id < 0) { + err = event_id; + goto fail; + } + } + + err = tapdisk_server_initialize_aio_queue(); + if (err) + goto fail; + + server.control_event = event_id; + server.run = 1; + + return 0; + +fail: + if (server.ipc.rfd > 0) + close(server.ipc.rfd); + if (server.ipc.wfd > 0) + close(server.ipc.wfd); + if (event_id > 0) + scheduler_unregister_event(&server.scheduler, + server.control_event); + return err; +} + +int +tapdisk_server_run() +{ + int err; + + err = tapdisk_set_resource_limits(); + if (err) + return err; + + signal(SIGBUS, tapdisk_server_signal_handler); + signal(SIGINT, tapdisk_server_signal_handler); + signal(SIGUSR1, tapdisk_server_signal_handler); + signal(SIGXFSZ, tapdisk_server_signal_handler); + + __tapdisk_server_run(); + tapdisk_server_close(); + + return 0; +} diff --git a/tools/blktap2/drivers/tapdisk-server.h b/tools/blktap2/drivers/tapdisk-server.h new file mode 100644 index 0000000000..09a4e13b81 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-server.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _TAPDISK_SERVER_H_ +#define _TAPDISK_SERVER_H_ + +#include "tapdisk-vbd.h" +#include "tapdisk-queue.h" + +struct tap_disk *tapdisk_server_find_driver_interface(int); + +td_image_t *tapdisk_server_get_shared_image(td_image_t *); + +td_vbd_t *tapdisk_server_get_vbd(td_uuid_t); +void tapdisk_server_add_vbd(td_vbd_t *); +void tapdisk_server_remove_vbd(td_vbd_t *); + +void tapdisk_server_queue_tiocb(struct tiocb *); + +void tapdisk_server_check_state(void); + +event_id_t tapdisk_server_register_event(char, int, int, event_cb_t, void *); +void tapdisk_server_unregister_event(event_id_t); +void tapdisk_server_set_max_timeout(int); + +int tapdisk_server_initialize(const char *, const char *); +int tapdisk_server_run(void); + +#define TAPDISK_TIOCBS (TAPDISK_DATA_REQUESTS + 50) + +typedef struct tapdisk_server { + int run; + td_ipc_t ipc; + struct list_head vbds; + scheduler_t scheduler; + event_id_t control_event; + struct tqueue aio_queue; + event_id_t aio_queue_event_id; +} tapdisk_server_t; + +#endif diff --git 
a/tools/blktap2/drivers/tapdisk-stream.c b/tools/blktap2/drivers/tapdisk-stream.c new file mode 100644 index 0000000000..8fa9d9e0bf --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-stream.c @@ -0,0 +1,600 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <unistd.h> + +#include "list.h" +#include "scheduler.h" +#include "tapdisk-vbd.h" +#include "tapdisk-server.h" + +#define POLL_READ 0 +#define POLL_WRITE 1 + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +struct tapdisk_stream_poll { + int pipe[2]; + int set; +}; + +struct tapdisk_stream_request { + uint64_t sec; + uint32_t secs; + uint64_t seqno; + blkif_request_t blkif_req; + struct list_head next; +}; + +struct tapdisk_stream { + td_vbd_t *vbd; + + unsigned int id; + int in_fd; + int out_fd; + + int err; + + uint64_t cur; + uint64_t start; + uint64_t end; + + uint64_t started; + uint64_t completed; + + struct tapdisk_stream_poll poll; + event_id_t enqueue_event_id; + + struct list_head free_list; + struct list_head pending_list; + struct list_head completed_list; + + struct tapdisk_stream_request requests[MAX_REQUESTS]; +}; + +static unsigned int tapdisk_stream_count; + +static void tapdisk_stream_close_image(struct tapdisk_stream *); + +static void +usage(const char *app, int err) +{ + printf("usage: %s <-n type:/path/to/image> " + "[-c sector count] [-s skip sectors]\n", app); + exit(err); +} + +static inline void +tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p) +{ + p->set = 0; + p->pipe[POLL_READ] = p->pipe[POLL_WRITE] = -1; +} + +static int +tapdisk_stream_poll_open(struct tapdisk_stream_poll *p) +{ + int err; + + tapdisk_stream_poll_initialize(p); + + err = pipe(p->pipe); + if (err) + return -errno; + + err = fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK); + if (err) + goto out; + + err = fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK); + if (err) + goto out; + + return 0; + +out: + close(p->pipe[POLL_READ]); + close(p->pipe[POLL_WRITE]); + tapdisk_stream_poll_initialize(p); + return -errno; +} + +static void +tapdisk_stream_poll_close(struct tapdisk_stream_poll *p) +{ + if (p->pipe[POLL_READ] != -1) + 
close(p->pipe[POLL_READ]); + if (p->pipe[POLL_WRITE] != -1) + close(p->pipe[POLL_WRITE]); + tapdisk_stream_poll_initialize(p); +} + +static inline void +tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p) +{ + int dummy; + + read(p->pipe[POLL_READ], &dummy, sizeof(dummy)); + p->set = 0; +} + +static inline void +tapdisk_stream_poll_set(struct tapdisk_stream_poll *p) +{ + int dummy = 0; + + if (!p->set) { + write(p->pipe[POLL_WRITE], &dummy, sizeof(dummy)); + p->set = 1; + } +} + +static inline int +tapdisk_stream_stop(struct tapdisk_stream *s) +{ + return (list_empty(&s->pending_list) && (s->cur == s->end || s->err)); +} + +static inline void +tapdisk_stream_initialize_request(struct tapdisk_stream_request *req) +{ + memset(req, 0, sizeof(*req)); + INIT_LIST_HEAD(&req->next); +} + +static inline int +tapdisk_stream_request_idx(struct tapdisk_stream *s, + struct tapdisk_stream_request *req) +{ + return (req - s->requests); +} + +static inline struct tapdisk_stream_request * +tapdisk_stream_get_request(struct tapdisk_stream *s) +{ + struct tapdisk_stream_request *req; + + if (list_empty(&s->free_list)) + return NULL; + + req = list_entry(s->free_list.next, + struct tapdisk_stream_request, next); + + list_del_init(&req->next); + tapdisk_stream_initialize_request(req); + + return req; +} + +static void +tapdisk_stream_print_request(struct tapdisk_stream *s, + struct tapdisk_stream_request *sreq) +{ + unsigned long idx = (unsigned long)tapdisk_stream_request_idx(s, sreq); + char *buf = (char *)MMAP_VADDR(s->vbd->ring.vstart, idx, 0); + write(s->out_fd, buf, sreq->secs << SECTOR_SHIFT); +} + +static void +tapdisk_stream_write_data(struct tapdisk_stream *s) +{ + struct tapdisk_stream_request *sreq, *tmp; + + list_for_each_entry_safe(sreq, tmp, &s->completed_list, next) { + if (sreq->seqno != s->completed) + break; + + s->completed++; + tapdisk_stream_print_request(s, sreq); + + list_del_init(&sreq->next); + list_add_tail(&sreq->next, &s->free_list); + } +} + +static 
inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+			       struct tapdisk_stream_request *sreq)
+{
+	struct tapdisk_stream_request *itr;
+
+	list_for_each_entry(itr, &s->completed_list, next)
+		if (sreq->seqno < itr->seqno) {
+			list_add_tail(&sreq->next, &itr->next);
+			return;
+		}
+
+	list_add_tail(&sreq->next, &s->completed_list);
+}
+
+/*
+ * Completion callback: look up the stream request by rsp->id, queue it
+ * on the completed list when the response is OK (otherwise record EIO
+ * and return it to the free list), then write out completed data and
+ * set the stream's poll flag.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+	struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+	struct tapdisk_stream_request *sreq = s->requests + rsp->id;
+
+	list_del_init(&sreq->next);
+
+	if (rsp->status == BLKIF_RSP_OKAY)
+		tapdisk_stream_queue_completed(s, sreq);
+	else {
+		s->err = EIO;
+		list_add_tail(&sreq->next, &s->free_list);
+		/* "0x" prefix needs the hex conversion, not PRIu64 */
+		fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec);
+	}
+
+	tapdisk_stream_write_data(s);
+	tapdisk_stream_poll_set(&s->poll);
+}
+
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+	td_vbd_t *vbd;
+	int i, idx, psize;
+	struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+
+	vbd = s->vbd;
+	tapdisk_stream_poll_clear(&s->poll);
+
+	if (tapdisk_stream_stop(s)) {
+		tapdisk_stream_close_image(s);
+		return;
+	}
+
+	psize = getpagesize();
+
+	while (s->cur < s->end && !s->err) {
+		blkif_request_t *breq;
+		td_vbd_request_t *vreq;
+		struct tapdisk_stream_request *sreq;
+
+		sreq = tapdisk_stream_get_request(s);
+		if (!sreq)
+			break;
+
+		idx                 = tapdisk_stream_request_idx(s, sreq);
+
+		sreq->sec           = s->cur;
+		sreq->secs          = 0;
+		sreq->seqno         = s->started++;
+
+		breq                = &sreq->blkif_req;
+		breq->id            = idx;
+		breq->nr_segments   = 0;
+		breq->sector_number = sreq->sec;
+		breq->operation     = BLKIF_OP_READ;
+
+		for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+			uint32_t secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+			struct blkif_request_segment *seg = breq->seg + i;
+
+			if (!secs)
+				break;
+
+			sreq->secs += secs;
+			s->cur     += secs;
+
+			seg->first_sect = 0;
+			seg->last_sect  = secs - 1;
+			breq->nr_segments++;
+		}
+
+ vreq = vbd->request_list + idx; + + assert(list_empty(&vreq->next)); + assert(vreq->secs_pending == 0); + + memcpy(&vreq->req, breq, sizeof(*breq)); + vbd->received++; + vreq->vbd = vbd; + + tapdisk_vbd_move_request(vreq, &vbd->new_requests); + list_add_tail(&sreq->next, &s->pending_list); + } + + tapdisk_vbd_issue_requests(vbd); +} + +static int +tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type) +{ + int err; + + s->id = tapdisk_stream_count++; + + err = tapdisk_server_initialize(NULL, NULL); + if (err) + goto out; + + err = tapdisk_vbd_initialize(-1, -1, s->id); + if (err) + goto out; + + s->vbd = tapdisk_server_get_vbd(s->id); + if (!s->vbd) { + err = ENODEV; + goto out; + } + + tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s); + + err = tapdisk_vbd_open_vdi(s->vbd, path, type, + TAPDISK_STORAGE_TYPE_DEFAULT, + TD_OPEN_RDONLY); + if (err) + goto out; + + s->vbd->reopened = 1; + err = 0; + +out: + if (err) + fprintf(stderr, "failed to open %s: %d\n", path, err); + return err; +} + +static void +tapdisk_stream_close_image(struct tapdisk_stream *s) +{ + td_vbd_t *vbd; + + vbd = tapdisk_server_get_vbd(s->id); + if (vbd) { + tapdisk_vbd_close_vdi(vbd); + tapdisk_server_remove_vbd(vbd); + free((void *)vbd->ring.vstart); + free(vbd->name); + free(vbd); + s->vbd = NULL; + } +} + +static int +tapdisk_stream_set_position(struct tapdisk_stream *s, + uint64_t count, uint64_t skip) +{ + int err; + image_t image; + + err = tapdisk_vbd_get_image_info(s->vbd, &image); + if (err) { + fprintf(stderr, "failed getting image size: %d\n", err); + return err; + } + + if (count == (uint64_t)-1) + count = image.size - skip; + + if (count + skip > image.size) { + fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n", + (uint64_t) (count + skip), (uint64_t) image.size); + return -EINVAL; + } + + s->start = skip; + s->cur = s->start; + s->end = s->start + count; + + return 0; +} + +static int +tapdisk_stream_initialize_requests(struct 
tapdisk_stream *s) +{ + size_t size; + td_ring_t *ring; + int err, i, psize; + + ring = &s->vbd->ring; + psize = getpagesize(); + size = psize * BLKTAP_MMAP_REGION_SIZE; + + /* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */ + err = posix_memalign((void **)&ring->vstart, psize, size); + if (err) { + fprintf(stderr, "failed to allocate buffers: %d\n", err); + ring->vstart = 0; + return err; + } + + for (i = 0; i < MAX_REQUESTS; i++) { + struct tapdisk_stream_request *req = s->requests + i; + tapdisk_stream_initialize_request(req); + list_add_tail(&req->next, &s->free_list); + } + + return 0; +} + +static int +tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s) +{ + int err; + struct tapdisk_stream_poll *p = &s->poll; + + err = tapdisk_stream_poll_open(p); + if (err) + goto out; + + err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + p->pipe[POLL_READ], 0, + tapdisk_stream_enqueue, s); + if (err < 0) + goto out; + + s->enqueue_event_id = err; + err = 0; + +out: + if (err) + fprintf(stderr, "failed to register event: %d\n", err); + return err; +} + +static void +tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s) +{ + if (s->enqueue_event_id) { + tapdisk_server_unregister_event(s->enqueue_event_id); + s->enqueue_event_id = 0; + } + tapdisk_stream_poll_close(&s->poll); +} + +static inline void +tapdisk_stream_initialize(struct tapdisk_stream *s) +{ + memset(s, 0, sizeof(*s)); + s->in_fd = s->out_fd = -1; + INIT_LIST_HEAD(&s->free_list); + INIT_LIST_HEAD(&s->pending_list); + INIT_LIST_HEAD(&s->completed_list); +} + +static int +tapdisk_stream_open_fds(struct tapdisk_stream *s) +{ + s->out_fd = dup(STDOUT_FILENO); + if (s->out_fd == -1) { + fprintf(stderr, "failed to open output: %d\n", errno); + return errno; + } + + return 0; +} + +static int +tapdisk_stream_open(struct tapdisk_stream *s, const char *path, + int type, uint64_t count, uint64_t skip) +{ + int err; + + tapdisk_stream_initialize(s); + + err = 
tapdisk_stream_open_fds(s); + if (err) + return err; + + err = tapdisk_stream_open_image(s, path, type); + if (err) + return err; + + err = tapdisk_stream_set_position(s, count, skip); + if (err) + return err; + + err = tapdisk_stream_initialize_requests(s); + if (err) + return err; + + err = tapdisk_stream_register_enqueue_event(s); + if (err) + return err; + + return 0; +} + +static void +tapdisk_stream_release(struct tapdisk_stream *s) +{ + close(s->out_fd); + tapdisk_stream_close_image(s); + tapdisk_stream_unregister_enqueue_event(s); +} + +static int +tapdisk_stream_run(struct tapdisk_stream *s) +{ + tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s); + tapdisk_server_run(); + return s->err; +} + +int +main(int argc, char *argv[]) +{ + int c, err, type; + char *params, *path; + uint64_t count, skip; + struct tapdisk_stream stream; + + err = 0; + skip = 0; + count = (uint64_t)-1; + params = NULL; + + while ((c = getopt(argc, argv, "n:c:s:h")) != -1) { + switch (c) { + case 'n': + params = optarg; + break; + case 'c': + count = strtoull(optarg, NULL, 10); + break; + case 's': + skip = strtoull(optarg, NULL, 10); + break; + default: + err = EINVAL; + case 'h': + usage(argv[0], err); + } + } + + if (!params) + usage(argv[0], EINVAL); + + err = tapdisk_parse_disk_type(params, &path, &type); + if (err) { + fprintf(stderr, "invalid argument %s: %d\n", params, err); + return err; + } + + tapdisk_start_logging("tapdisk-stream"); + + err = tapdisk_stream_open(&stream, path, type, count, skip); + if (err) + goto out; + + err = tapdisk_stream_run(&stream); + if (err) + goto out; + + err = 0; + +out: + tapdisk_stream_release(&stream); + tapdisk_stop_logging(); + return err; +} diff --git a/tools/blktap2/drivers/tapdisk-utils.c b/tools/blktap2/drivers/tapdisk-utils.c new file mode 100644 index 0000000000..560f3bf6cc --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-utils.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2008, XenSource Inc. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <linux/fs.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/resource.h> + +#include "tapdisk.h" +#include "disktypes.h" +#include "blktaplib.h" +#include "tapdisk-log.h" +#include "tapdisk-utils.h" + +void +tapdisk_start_logging(const char *name) +{ + static char buf[128]; + + snprintf(buf, sizeof(buf), "%s[%d]", name, getpid()); + openlog(buf, LOG_CONS | LOG_ODELAY, LOG_DAEMON); + open_tlog("/tmp/tapdisk.log", (64 << 10), TLOG_WARN, 0); +} + +void +tapdisk_stop_logging(void) +{ + closelog(); + close_tlog(); +} + +int +tapdisk_set_resource_limits(void) +{ + int err; + struct rlimit rlim; + + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + + err = setrlimit(RLIMIT_MEMLOCK, &rlim); + if (err == -1) { + EPRINTF("RLIMIT_MEMLOCK failed: %d\n", errno); + return -errno; + } + + err = mlockall(MCL_CURRENT | MCL_FUTURE); + if (err == -1) { + EPRINTF("mlockall failed: %d\n", errno); + return -errno; + } + +#define CORE_DUMP +#if defined(CORE_DUMP) + err = setrlimit(RLIMIT_CORE, &rlim); + if (err == -1) + EPRINTF("RLIMIT_CORE failed: %d\n", errno); +#endif + + return 0; +} + +int +tapdisk_namedup(char **dup, const char *name) +{ + *dup = NULL; + + if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN) + return -ENAMETOOLONG; + + *dup = strdup(name); + if (!*dup) + return -ENOMEM; + + return 0; +} + +int +tapdisk_parse_disk_type(const char *params, char **_path, int *_type) +{ + int i, err, size, handle_len; + char *ptr, *path, handle[10]; + + if (strlen(params) + 1 >= MAX_NAME_LEN) + return -ENAMETOOLONG; + + ptr = strchr(params, ':'); + if (!ptr) + return -EINVAL; + + path = ptr + 1; + + handle_len = ptr - params; + if (handle_len > sizeof(handle)) + return -ENAMETOOLONG; + + memcpy(handle, params, handle_len); + handle[handle_len] = '\0'; + + size = sizeof(dtypes) / sizeof(disk_info_t *); + for (i = 0; i < size; i++) { + if 
(strncmp(handle, dtypes[i]->handle, handle_len)) + continue; + + if (dtypes[i]->idnum == -1) + return -ENODEV; + + *_type = dtypes[i]->idnum; + *_path = path; + + return 0; + } + + return -ENODEV; +} + +/*Get Image size, secsize*/ +int +tapdisk_get_image_size(int fd, uint64_t *_sectors, uint32_t *_sector_size) +{ + int ret; + struct stat stat; + uint64_t sectors; + uint32_t sector_size; + + sectors = 0; + sector_size = 0; + *_sectors = 0; + *_sector_size = 0; + + if (fstat(fd, &stat)) { + DPRINTF("ERROR: fstat failed, Couldn't stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + if (ioctl(fd, BLKGETSIZE, §ors)) { + DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image"); + return -EINVAL; + } + + /*Get the sector size*/ +#if defined(BLKSSZGET) + { + int arg; + sector_size = DEFAULT_SECTOR_SIZE; + ioctl(fd, BLKSSZGET, §or_size); + + if (sector_size != DEFAULT_SECTOR_SIZE) + DPRINTF("Note: sector size is %u (not %d)\n", + sector_size, DEFAULT_SECTOR_SIZE); + } +#else + sector_size = DEFAULT_SECTOR_SIZE; +#endif + + } else { + /*Local file? try fstat instead*/ + sectors = (stat.st_size >> SECTOR_SHIFT); + sector_size = DEFAULT_SECTOR_SIZE; + } + + if (sectors == 0) { + sectors = 16836057ULL; + sector_size = DEFAULT_SECTOR_SIZE; + } + + return 0; +} diff --git a/tools/blktap2/drivers/tapdisk-utils.h b/tools/blktap2/drivers/tapdisk-utils.h new file mode 100644 index 0000000000..216c902377 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-utils.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _TAPDISK_UTILS_H_ +#define _TAPDISK_UTILS_H_ + +#include <inttypes.h> + +#define MAX_NAME_LEN 1000 + +void tapdisk_start_logging(const char *); +void tapdisk_stop_logging(void); +int tapdisk_set_resource_limits(void); +int tapdisk_namedup(char **, const char *); +int tapdisk_parse_disk_type(const char *, char **, int *); +int tapdisk_get_image_size(int, uint64_t *, uint32_t *); + +#endif diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap2/drivers/tapdisk-vbd.c new file mode 100644 index 0000000000..1eaaee9634 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-vbd.c @@ -0,0 +1,1758 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <regex.h> +#include <unistd.h> +#include <stdlib.h> +#include <libgen.h> +#include <sys/mman.h> +#include <sys/ioctl.h> + +#include "libvhd.h" +#include "tapdisk-image.h" +#include "tapdisk-driver.h" +#include "tapdisk-server.h" +#include "tapdisk-interface.h" +#include "tapdisk-vbd.h" +#include "blktap2.h" + +#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a) +#define ERR(_err, _f, _a...) 
tlog_error(_err, _f, ##_a) + +#if 1 +#define ASSERT(p) \ + do { \ + if (!(p)) { \ + DPRINTF("Assertion '%s' failed, line %d, " \ + "file %s", #p, __LINE__, __FILE__); \ + *(int*)0 = 0; \ + } \ + } while (0) +#else +#define ASSERT(p) ((void)0) +#endif + + +#define TD_VBD_EIO_RETRIES 10 +#define TD_VBD_EIO_SLEEP 1 +#define TD_VBD_WATCHDOG_TIMEOUT 10 + +static void tapdisk_vbd_ring_event(event_id_t, char, void *); +static void tapdisk_vbd_callback(void *, blkif_response_t *); + +/* + * initialization + */ + +static inline void +tapdisk_vbd_initialize_vreq(td_vbd_request_t *vreq) +{ + memset(vreq, 0, sizeof(td_vbd_request_t)); + INIT_LIST_HEAD(&vreq->next); +} + +int +tapdisk_vbd_initialize(int rfd, int wfd, uint16_t uuid) +{ + int i; + td_vbd_t *vbd; + + vbd = tapdisk_server_get_vbd(uuid); + if (vbd) { + EPRINTF("duplicate vbds! %u\n", uuid); + return -EEXIST; + } + + vbd = calloc(1, sizeof(td_vbd_t)); + if (!vbd) { + EPRINTF("failed to allocate tapdisk state\n"); + return -ENOMEM; + } + + vbd->uuid = uuid; + vbd->ipc.rfd = rfd; + vbd->ipc.wfd = wfd; + vbd->ipc.uuid = uuid; + vbd->ring.fd = -1; + + /* default blktap ring completion */ + vbd->callback = tapdisk_vbd_callback; + vbd->argument = vbd; + + INIT_LIST_HEAD(&vbd->images); + INIT_LIST_HEAD(&vbd->new_requests); + INIT_LIST_HEAD(&vbd->pending_requests); + INIT_LIST_HEAD(&vbd->failed_requests); + INIT_LIST_HEAD(&vbd->completed_requests); + INIT_LIST_HEAD(&vbd->next); + gettimeofday(&vbd->ts, NULL); + + for (i = 0; i < MAX_REQUESTS; i++) + tapdisk_vbd_initialize_vreq(vbd->request_list + i); + + tapdisk_server_add_vbd(vbd); + + return 0; +} + +void +tapdisk_vbd_set_callback(td_vbd_t *vbd, td_vbd_cb_t callback, void *argument) +{ + vbd->callback = callback; + vbd->argument = argument; +} + +static int +tapdisk_vbd_validate_chain(td_vbd_t *vbd) +{ + int err; + td_image_t *image, *parent, *tmp; + + DPRINTF("VBD CHAIN:\n"); + + tapdisk_vbd_for_each_image(vbd, image, tmp) { + DPRINTF("%s: %d\n", image->name, 
image->type);
+
+		if (tapdisk_vbd_is_last_image(vbd, image))
+			break;
+
+		parent = tapdisk_vbd_next_image(image);
+		err    = td_validate_parent(image, parent);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+void
+tapdisk_vbd_close_vdi(td_vbd_t *vbd)
+{
+	td_image_t *image, *tmp;
+
+	tapdisk_vbd_for_each_image(vbd, image, tmp) {
+		td_close(image);
+		tapdisk_image_free(image);
+	}
+
+	INIT_LIST_HEAD(&vbd->images);
+	td_flag_set(vbd->state, TD_VBD_CLOSED);
+}
+
+/*
+ * Insert a block-cache image in front of the first image in the chain
+ * that is opened both read-only and shareable.
+ *
+ * Returns 0 when no such image exists or the cache was inserted, and a
+ * negative errno value otherwise; on failure the chain is left as it was.
+ */
+static int
+tapdisk_vbd_add_block_cache(td_vbd_t *vbd)
+{
+	int err;
+	td_image_t *cache, *image, *target, *tmp;
+
+	target = NULL;
+
+	tapdisk_vbd_for_each_image(vbd, image, tmp)
+		if (td_flag_test(image->flags, TD_OPEN_RDONLY) &&
+		    td_flag_test(image->flags, TD_OPEN_SHAREABLE)) {
+			target = image;
+			break;
+		}
+
+	if (!target)
+		return 0;
+
+	cache = tapdisk_image_allocate(target->name,
+				       DISK_TYPE_BLOCK_CACHE,
+				       target->storage,
+				       target->flags,
+				       target->private);
+	if (!cache)
+		return -ENOMEM;
+
+	/* try to load existing cache */
+	err = td_load(cache);
+	if (!err)
+		goto done;
+
+	/* hack driver to send open() correct image size */
+	if (!target->driver) {
+		err = -ENODEV;
+		goto fail;
+	}
+
+	cache->driver = tapdisk_driver_allocate(cache->type,
+						cache->name,
+						cache->flags,
+						cache->storage);
+	if (!cache->driver) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	cache->driver->info = target->driver->info;
+
+	/* try to open new cache */
+	err = td_open(cache);
+	if (!err)
+		goto done;
+
+fail:
+	/*
+	 * give up: release the cache image allocated above.  target is
+	 * still linked in vbd->images and must not be freed here.
+	 */
+	tapdisk_image_free(cache);
+	return err;
+
+done:
+	/* insert cache before image */
+	list_add(&cache->next, target->next.prev);
+	return 0;
+}
+
+static int
+tapdisk_vbd_add_dirty_log(td_vbd_t *vbd)
+{
+	int err;
+	td_driver_t *driver;
+	td_image_t *log, *parent;
+
+	driver = NULL;
+	log    = NULL;
+
+	parent = tapdisk_vbd_first_image(vbd);
+
+	log    = tapdisk_image_allocate(parent->name,
+					DISK_TYPE_LOG,
+					parent->storage,
+					parent->flags,
+					vbd);
+	if (!log)
+		return -ENOMEM;
+
+	
driver = tapdisk_driver_allocate(log->type, + log->name, + log->flags, + log->storage); + if (!driver) { + err = -ENOMEM; + goto fail; + } + + driver->info = parent->driver->info; + log->driver = driver; + + err = td_open(log); + if (err) + goto fail; + + list_add(&log->next, &vbd->images); + return 0; + +fail: + tapdisk_image_free(log); + return err; +} + +/* + * LVHD hack: have to rescan LVM metadata on pool + * slaves to register lvchanges made on master. FIXME. + */ +static int +tapdisk_vbd_reactivate_volume(const char *name) +{ + int err; + char *cmd; + + DPRINTF("reactivating %s\n", name); + + err = asprintf(&cmd, "lvchange -an %s", name); + if (err == - 1) { + EPRINTF("failed to deactivate %s\n", name); + return -errno; + } + + err = system(cmd); + if (err) { + /* + * Assume that LV deactivation failed because the LV is open, + * in which case the LVM information should be up-to-date and + * we don't need this step anyways (so ignore the error). If + * the failure is due to a non-existent LV, the next command + * (lvchange -ay) will catch it. + * If we want to be more prudent/paranoid, we can instead check + * whether the LV is currently open (a bit more work). 
+ */ + } + + free(cmd); + err = asprintf(&cmd, "lvchange -ay --refresh %s", name); + if (err == - 1) { + EPRINTF("failed to activate %s\n", name); + return -errno; + } + + err = system(cmd); + if (err) + EPRINTF("%s failed: %d\n", cmd, err); + free(cmd); + return err; +} + +static int +tapdisk_vbd_reactivate_volumes(td_vbd_t *vbd, int resume) +{ + int i, cnt, err; + char *name, *new; + vhd_context_t vhd; + vhd_parent_locator_t *loc; + + new = NULL; + name = NULL; + + if (vbd->storage != TAPDISK_STORAGE_TYPE_LVM) + return 0; + + if (!resume && vbd->reactivated) + return 0; + + name = strdup(vbd->name); + if (!name) { + EPRINTF("%s: nomem\n", vbd->name); + return -ENOMEM; + } + + for (cnt = 0; 1; cnt++) { + + /* only need to reactivate child and parent during resume */ + if (resume && cnt == 2) + break; + + err = tapdisk_vbd_reactivate_volume(name); + if (err) + goto fail; + + if (!strstr(name, "VHD")) + break; + + for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); + if (!err) + break; + + libvhd_set_log_level(1); + sleep(TD_VBD_EIO_SLEEP); + } + libvhd_set_log_level(0); + if (err) + goto fail; + + if (vhd.footer.type != HD_TYPE_DIFF) { + vhd_close(&vhd); + break; + } + + loc = NULL; + for (i = 0; i < 8; i++) + if (vhd.header.loc[i].code == PLAT_CODE_MACX) { + loc = vhd.header.loc + i; + break; + } + + if (!loc) { + vhd_close(&vhd); + err = -EINVAL; + goto fail; + } + + free(name); + err = vhd_parent_locator_read(&vhd, loc, &name); + vhd_close(&vhd); + + if (err) { + name = NULL; + goto fail; + } + + /* + * vhd_parent_locator_read returns path relative to child: + * ./VG_XenStorage--<sr-uuid>-VHD--<vdi-uuid> + * we have to convert this to absolute path for lvm + */ + err = asprintf(&new, "/dev/mapper/%s", name + 2); + if (err == -1) { + err = -errno; + goto fail; + } + + free(name); + name = new; + } + + err = 0; + vbd->reactivated = 1; + +out: + free(name); + return err; + +fail: + EPRINTF("failed to reactivate %s: %d\n", 
vbd->name, err); + goto out; +} + +/* + * LVHD hack: + * raw volumes are named /dev/<sr-vg-name>-<sr-uuid>/LV-<sr-uuid> + * vhd volumes are named /dev/<sr-vg-name>-<sr-uuid>/VHD-<sr-uuid> + * + * a live snapshot of a raw volume will result in the writeable volume's + * name changing from the raw to vhd format, but this change will not be + * reflected by xenstore. hence this mess. + */ +static int +tapdisk_vbd_check_file(td_vbd_t *vbd) +{ + int i, err; + regex_t re; + size_t len, max; + regmatch_t matches[4]; + char *new, *src, *dst, error[256]; + + if (vbd->storage != TAPDISK_STORAGE_TYPE_LVM) + return 0; + + err = tapdisk_vbd_reactivate_volume(vbd->name); + if (!err) + return 0; + else + DPRINTF("reactivating %s failed\n", vbd->name); + +#define HEX "[A-Za-z0-9]" +#define UUID HEX"\\{8\\}-"HEX"\\{4\\}-"HEX"\\{4\\}-"HEX"\\{4\\}-"HEX"\\{12\\}" +#define VG "VG_"HEX"\\+" +#define TYPE "\\(LV\\|VHD\\)" +#define RE "\\(/dev/"VG"-"UUID"/\\)"TYPE"\\(-"UUID"\\)" + + err = regcomp(&re, RE, 0); + if (err) + goto regerr; + +#undef HEX +#undef UUID +#undef VG +#undef TYPE +#undef RE + + err = regexec(&re, vbd->name, 4, matches, 0); + if (err) + goto regerr; + + max = strlen("VHD") + 1; + for (i = 1; i < 4; i++) { + if (matches[i].rm_so == -1 || matches[i].rm_eo == -1) { + EPRINTF("%s: failed to tokenize name\n", vbd->name); + err = -EINVAL; + goto out; + } + + max += matches[i].rm_eo - matches[i].rm_so; + } + + new = malloc(max); + if (!new) { + EPRINTF("%s: failed to allocate new name\n", vbd->name); + err = -ENOMEM; + goto out; + } + + src = new; + for (i = 1; i < 4; i++) { + dst = vbd->name + matches[i].rm_so; + len = matches[i].rm_eo - matches[i].rm_so; + + if (i == 2) { + if (memcmp(dst, "LV", len)) { + EPRINTF("%s: bad name format\n", vbd->name); + free(new); + err = -EINVAL; + goto out; + } + + src += sprintf(src, "VHD"); + continue; + } + + memcpy(src, dst, len + 1); + src += len; + } + + *src = '\0'; + + err = tapdisk_vbd_reactivate_volume(new); + if (err) + 
DPRINTF("reactivating %s failed\n", new); + + err = access(new, F_OK); + if (err == -1) { + EPRINTF("neither %s nor %s accessible\n", + vbd->name, new); + err = -errno; + free(new); + goto out; + } + + DPRINTF("couldn't find %s, trying %s\n", vbd->name, new); + + err = 0; + free(vbd->name); + vbd->name = new; + vbd->type = DISK_TYPE_VHD; + +out: + regfree(&re); + return err; + +regerr: + regerror(err, &re, error, sizeof(error)); + EPRINTF("%s: regex failed: %s\n", vbd->name, error); + err = -EINVAL; + goto out; +} + +static int +__tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags) +{ + char *file; + int err, type; + td_flag_t flags; + td_disk_id_t id; + td_image_t *image, *tmp; + struct tfilter *filter = NULL; + + err = tapdisk_vbd_reactivate_volumes(vbd, 0); + if (err) + return err; + + flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags; + file = vbd->name; + type = vbd->type; + + for (;;) { + err = -ENOMEM; + image = tapdisk_image_allocate(file, type, + vbd->storage, flags, vbd); + + if (file != vbd->name) { + free(file); + file = NULL; + } + + if (!image) + goto fail; + + err = td_load(image); + if (err) { + if (err != -ENODEV) + goto fail; + + err = td_open(image); + if (err) + goto fail; + } + + err = td_get_parent_id(image, &id); + if (err && err != TD_NO_PARENT) { + td_close(image); + goto fail; + } + + if (!image->storage) + image->storage = vbd->storage; + + tapdisk_vbd_add_image(vbd, image); + image = NULL; + + if (err == TD_NO_PARENT) + break; + + file = id.name; + type = id.drivertype; + flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE); + } + + if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) { + err = tapdisk_vbd_add_dirty_log(vbd); + if (err) + goto fail; + } + + if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) { + err = tapdisk_vbd_add_block_cache(vbd); + if (err) + goto fail; + } + + err = tapdisk_vbd_validate_chain(vbd); + if (err) + goto fail; + + td_flag_clear(vbd->state, TD_VBD_CLOSED); + + return 0; + +fail: + if (image) + 
tapdisk_image_free(image); + + tapdisk_vbd_close_vdi(vbd); + + return err; +} + +int +tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path, + uint16_t drivertype, uint16_t storage, td_flag_t flags) +{ + int i, err; + struct tap_disk *ops; + + ops = tapdisk_server_find_driver_interface(drivertype); + if (!ops) + return -EINVAL; + DPRINTF("Loaded %s driver for vbd %u %s 0x%08x\n", + ops->disk_type, vbd->uuid, path, flags); + + err = tapdisk_namedup(&vbd->name, path); + if (err) + return err; + + vbd->flags = flags; + vbd->storage = storage; + vbd->type = drivertype; + + for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { + err = __tapdisk_vbd_open_vdi(vbd, 0); + if (err != -EIO) + break; + + sleep(TD_VBD_EIO_SLEEP); + } + if (err) + goto fail; + + return 0; + +fail: + free(vbd->name); + vbd->name = NULL; + return err; +} + +static int +tapdisk_vbd_register_event_watches(td_vbd_t *vbd) +{ + event_id_t id; + + id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + vbd->ring.fd, 0, + tapdisk_vbd_ring_event, vbd); + if (id < 0) + return id; + + vbd->ring_event_id = id; + + return 0; +} + +static void +tapdisk_vbd_unregister_events(td_vbd_t *vbd) +{ + if (vbd->ring_event_id) + tapdisk_server_unregister_event(vbd->ring_event_id); +} + +static int +tapdisk_vbd_map_device(td_vbd_t *vbd, const char *devname) +{ + + int err, psize; + td_ring_t *ring; + + ring = &vbd->ring; + psize = getpagesize(); + + ring->fd = open(devname, O_RDWR); + if (ring->fd == -1) { + err = -errno; + EPRINTF("failed to open %s: %d\n", devname, err); + goto fail; + } + + ring->mem = mmap(0, psize * BLKTAP_MMAP_REGION_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED, ring->fd, 0); + if (ring->mem == MAP_FAILED) { + err = -errno; + EPRINTF("failed to mmap %s: %d\n", devname, err); + goto fail; + } + + ring->sring = (blkif_sring_t *)((unsigned long)ring->mem); + BACK_RING_INIT(&ring->fe_ring, ring->sring, psize); + + ring->vstart = + (unsigned long)ring->mem + (BLKTAP_RING_PAGES * psize); + + ioctl(ring->fd, 
BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE); + + return 0; + +fail: + if (ring->mem && ring->mem != MAP_FAILED) + munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE); + if (ring->fd != -1) + close(ring->fd); + ring->fd = -1; + ring->mem = NULL; + return err; +} + +/* + * Undo tapdisk_vbd_map_device(): unmap the ring region and close the ring + * fd. The ring fields are reset afterwards so that a repeated call neither + * closes nor unmaps anything a second time. Always returns 0. + */ +static int +tapdisk_vbd_unmap_device(td_vbd_t *vbd) +{ + int psize; + td_ring_t *ring; + + ring = &vbd->ring; + psize = getpagesize(); + + /* mem is a pointer: test it against NULL/MAP_FAILED, not with '> 0' */ + if (ring->mem && ring->mem != MAP_FAILED) + munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE); + if (ring->fd != -1) + close(ring->fd); + + /* sring is derived from mem in tapdisk_vbd_map_device(), so drop it too */ + ring->sring = NULL; + ring->mem = NULL; + ring->fd = -1; + + return 0; +} + +int +tapdisk_vbd_open(td_vbd_t *vbd, const char *name, uint16_t type, + uint16_t storage, const char *ring, td_flag_t flags) +{ + int err; + + err = tapdisk_vbd_open_vdi(vbd, name, type, storage, flags); + if (err) + goto out; + + err = tapdisk_vbd_map_device(vbd, ring); + if (err) + goto out; + + err = tapdisk_vbd_register_event_watches(vbd); + if (err) + goto out; + + return 0; + +out: + tapdisk_vbd_close_vdi(vbd); + tapdisk_vbd_unmap_device(vbd); + tapdisk_vbd_unregister_events(vbd); + free(vbd->name); + vbd->name = NULL; + return err; +} + +static void +tapdisk_vbd_queue_count(td_vbd_t *vbd, int *new, + int *pending, int *failed, int *completed) +{ + int n, p, f, c; + td_vbd_request_t *vreq, *tvreq; + + n = 0; + p = 0; + f = 0; + c = 0; + + tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->new_requests) + n++; + + tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->pending_requests) + p++; + + tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->failed_requests) + f++; + + 
tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->completed_requests) + c++; + + *new = n; + *pending = p; + *failed = f; + *completed = c; +} + +static int +tapdisk_vbd_shutdown(td_vbd_t *vbd) +{ + int new, pending, failed, completed; + + if (!list_empty(&vbd->pending_requests)) + return -EAGAIN; + + tapdisk_vbd_kick(vbd); + tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed); + + DPRINTF("%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, " + "failed: 0x%02x, completed: 0x%02x\n", + 
vbd->name, vbd->state, new, pending, failed, completed); + DPRINTF("last activity: %010ld.%06ld, errors: 0x%04"PRIx64", " + "retries: 0x%04"PRIx64", received: 0x%08"PRIx64", " + "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n", + vbd->ts.tv_sec, vbd->ts.tv_usec, + vbd->errors, vbd->retries, vbd->received, vbd->returned, + vbd->kicked); + + tapdisk_vbd_close_vdi(vbd); + tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_CLOSE_RSP); + tapdisk_vbd_unregister_events(vbd); + tapdisk_vbd_unmap_device(vbd); + tapdisk_server_remove_vbd(vbd); + free(vbd->name); + free(vbd); + + tlog_print_errors(); + + return 0; +} + +int +tapdisk_vbd_close(td_vbd_t *vbd) +{ + /* + * don't close if any requests are pending in the aio layer + */ + if (!list_empty(&vbd->pending_requests)) + goto fail; + + /* + * if the queue is still active and we have more + * requests, try to complete them before closing. + */ + if (tapdisk_vbd_queue_ready(vbd) && + (!list_empty(&vbd->new_requests) || + !list_empty(&vbd->failed_requests) || + !list_empty(&vbd->completed_requests))) + goto fail; + + return tapdisk_vbd_shutdown(vbd); + +fail: + td_flag_set(vbd->state, TD_VBD_SHUTDOWN_REQUESTED); + DBG(TLOG_WARN, "%s: requests pending\n", vbd->name); + return -EAGAIN; +} + +/* + * control operations + */ + +void +tapdisk_vbd_debug(td_vbd_t *vbd) +{ + td_image_t *image, *tmp; + int new, pending, failed, completed; + + tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed); + + DBG(TLOG_WARN, "%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, " + "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06ld, " + "errors: 0x%04"PRIx64", retries: 0x%04"PRIx64", received: 0x%08"PRIx64", " + "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n", + vbd->name, vbd->state, new, pending, failed, completed, + vbd->ts.tv_sec, vbd->ts.tv_usec, vbd->errors, vbd->retries, + vbd->received, vbd->returned, vbd->kicked); + + tapdisk_vbd_for_each_image(vbd, image, tmp) + td_debug(image); +} + +static void 
+tapdisk_vbd_drop_log(td_vbd_t *vbd) +{ + if (td_flag_test(vbd->state, TD_VBD_LOG_DROPPED)) + return; + + tapdisk_vbd_debug(vbd); + tlog_flush(); + td_flag_set(vbd->state, TD_VBD_LOG_DROPPED); +} + +int +tapdisk_vbd_get_image_info(td_vbd_t *vbd, image_t *img) +{ + td_image_t *image; + + memset(img, 0, sizeof(image_t)); + + if (list_empty(&vbd->images)) + return -EINVAL; + + image = tapdisk_vbd_first_image(vbd); + img->size = image->info.size; + img->secsize = image->info.sector_size; + img->info = image->info.info; + + return 0; +} + +int +tapdisk_vbd_queue_ready(td_vbd_t *vbd) +{ + return (!td_flag_test(vbd->state, TD_VBD_DEAD) && + !td_flag_test(vbd->state, TD_VBD_CLOSED) && + !td_flag_test(vbd->state, TD_VBD_QUIESCED) && + !td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)); +} + +int +tapdisk_vbd_retry_needed(td_vbd_t *vbd) +{ + return td_flag_test(vbd->state, TD_VBD_RETRY_NEEDED); +} + +int +tapdisk_vbd_lock(td_vbd_t *vbd) +{ + return 0; +} + +int +tapdisk_vbd_quiesce_queue(td_vbd_t *vbd) +{ + if (!list_empty(&vbd->pending_requests)) { + td_flag_set(vbd->state, TD_VBD_QUIESCE_REQUESTED); + return -EAGAIN; + } + + td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED); + td_flag_set(vbd->state, TD_VBD_QUIESCED); + return 0; +} + +int +tapdisk_vbd_start_queue(td_vbd_t *vbd) +{ + td_flag_clear(vbd->state, TD_VBD_QUIESCED); + td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED); + return 0; +} + +int +tapdisk_vbd_kill_queue(td_vbd_t *vbd) +{ + tapdisk_vbd_quiesce_queue(vbd); + td_flag_set(vbd->state, TD_VBD_DEAD); + return 0; +} + +static int +tapdisk_vbd_open_image(td_vbd_t *vbd, td_image_t *image) +{ + int err; + td_image_t *parent; + + err = td_open(image); + if (err) + return err; + + if (!tapdisk_vbd_is_last_image(vbd, image)) { + parent = tapdisk_vbd_next_image(image); + err = td_validate_parent(image, parent); + if (err) { + td_close(image); + return err; + } + } + + return 0; +} + +static int +tapdisk_vbd_close_and_reopen_image(td_vbd_t *vbd, td_image_t 
*image) +{ + int i, err; + + td_close(image); + + for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { + err = tapdisk_vbd_open_image(vbd, image); + if (err != -EIO) + break; + + sleep(TD_VBD_EIO_SLEEP); + } + + if (err) + td_flag_set(vbd->state, TD_VBD_CLOSED); + + return err; +} + +int +tapdisk_vbd_pause(td_vbd_t *vbd) +{ + int err; + + td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED); + + err = tapdisk_vbd_quiesce_queue(vbd); + if (err) + return err; + + tapdisk_vbd_close_vdi(vbd); + + td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); + td_flag_set(vbd->state, TD_VBD_PAUSED); + tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_PAUSE_RSP); + + return 0; +} + +int +tapdisk_vbd_resume(td_vbd_t *vbd, const char *path, uint16_t drivertype) +{ + int i, err; + + if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) { + EPRINTF("resume request for unpaused vbd %s\n", vbd->name); + tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR); + return -EINVAL; + } + + free(vbd->name); + vbd->name = strdup(path); + if (!vbd->name) { + EPRINTF("copying new vbd %s name failed\n", path); + tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR); + return -EINVAL; + } + vbd->type = drivertype; + + for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { + err = tapdisk_vbd_check_file(vbd); + if (err) + goto sleep; + + err = tapdisk_vbd_reactivate_volumes(vbd, 1); + if (err) { + EPRINTF("failed to reactivate %s: %d\n", + vbd->name, err); + goto sleep; + } + + err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT); + if (!err) + break; + + sleep: + sleep(TD_VBD_EIO_SLEEP); + } + + if (err) { + tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR); + return err; + } + + tapdisk_vbd_start_queue(vbd); + td_flag_clear(vbd->state, TD_VBD_PAUSED); + td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); + tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_RESUME_RSP); + + return 0; +} + +int +tapdisk_vbd_kick(td_vbd_t *vbd) +{ + int n; + td_ring_t *ring; + + ring = &vbd->ring; + if (!ring->sring) + return 0; + + n = (ring->fe_ring.rsp_prod_pvt - 
ring->fe_ring.sring->rsp_prod); + if (!n) + return 0; + + vbd->kicked += n; + RING_PUSH_RESPONSES(&ring->fe_ring); + ioctl(ring->fd, BLKTAP_IOCTL_KICK_FE, 0); + + DBG(TLOG_INFO, "kicking %d: rec: 0x%08"PRIx64", ret: 0x%08"PRIx64", kicked: " + "0x%08"PRIx64"\n", n, vbd->received, vbd->returned, vbd->kicked); + + return n; +} + +static inline void +tapdisk_vbd_write_response_to_ring(td_vbd_t *vbd, blkif_response_t *rsp) +{ + td_ring_t *ring; + blkif_response_t *rspp; + + ring = &vbd->ring; + rspp = RING_GET_RESPONSE(&ring->fe_ring, ring->fe_ring.rsp_prod_pvt); + memcpy(rspp, rsp, sizeof(blkif_response_t)); + ring->fe_ring.rsp_prod_pvt++; +} + +static void +tapdisk_vbd_callback(void *arg, blkif_response_t *rsp) +{ + td_vbd_t *vbd = (td_vbd_t *)arg; + tapdisk_vbd_write_response_to_ring(vbd, rsp); +} + +static void +tapdisk_vbd_make_response(td_vbd_t *vbd, td_vbd_request_t *vreq) +{ + blkif_request_t tmp; + blkif_response_t *rsp; + + tmp = vreq->req; + rsp = (blkif_response_t *)&vreq->req; + + rsp->id = tmp.id; + rsp->operation = tmp.operation; + rsp->status = vreq->status; + + DBG(TLOG_DBG, "writing req %d, sec 0x%08"PRIx64", res %d to ring\n", + (int)tmp.id, tmp.sector_number, vreq->status); + + if (rsp->status != BLKIF_RSP_OKAY) + ERR(EIO, "returning BLKIF_RSP %d", rsp->status); + + vbd->returned++; + vbd->callback(vbd->argument, rsp); +} + +void +tapdisk_vbd_check_state(td_vbd_t *vbd) +{ + td_vbd_request_t *vreq, *tmp; + + tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) + if (vreq->num_retries >= TD_VBD_MAX_RETRIES) + tapdisk_vbd_complete_vbd_request(vbd, vreq); + + if (!list_empty(&vbd->new_requests) || + !list_empty(&vbd->failed_requests)) + tapdisk_vbd_issue_requests(vbd); + + tapdisk_vbd_for_each_request(vreq, tmp, &vbd->completed_requests) { + tapdisk_vbd_make_response(vbd, vreq); + list_del(&vreq->next); + tapdisk_vbd_initialize_vreq(vreq); + } + + if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)) + tapdisk_vbd_quiesce_queue(vbd); + + if 
(td_flag_test(vbd->state, TD_VBD_PAUSE_REQUESTED)) + tapdisk_vbd_pause(vbd); + + if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) + tapdisk_vbd_close(vbd); +} + +void +tapdisk_vbd_check_progress(td_vbd_t *vbd) +{ + int diff; + struct timeval now; + + if (list_empty(&vbd->pending_requests)) + return; + + gettimeofday(&now, NULL); + diff = now.tv_sec - vbd->ts.tv_sec; + + if (diff >= TD_VBD_WATCHDOG_TIMEOUT) { + DBG(TLOG_WARN, "%s: watchdog timeout: pending requests " + "idle for %d seconds\n", vbd->name, diff); + tapdisk_vbd_drop_log(vbd); + return; + } + + tapdisk_server_set_max_timeout(TD_VBD_WATCHDOG_TIMEOUT - diff); +} + +/* + * request submission + */ + +static int +tapdisk_vbd_check_queue(td_vbd_t *vbd) +{ + int err; + td_image_t *image; + + if (list_empty(&vbd->images)) + return -ENOSYS; + + if (!tapdisk_vbd_queue_ready(vbd)) + return -EAGAIN; + + if (!vbd->reopened) { + if (td_flag_test(vbd->state, TD_VBD_LOCKING)) { + err = tapdisk_vbd_lock(vbd); + if (err) + return err; + } + + image = tapdisk_vbd_first_image(vbd); + td_flag_set(image->flags, TD_OPEN_STRICT); + + if (tapdisk_vbd_close_and_reopen_image(vbd, image)) + EPRINTF("reopening disks failed\n"); + else { + DPRINTF("reopening disks succeeded\n"); + vbd->reopened = 1; + } + } + + return 0; +} + +void +tapdisk_vbd_complete_vbd_request(td_vbd_t *vbd, td_vbd_request_t *vreq) +{ + if (!vreq->submitting && !vreq->secs_pending) { + if (vreq->status == BLKIF_RSP_ERROR && + vreq->num_retries < TD_VBD_MAX_RETRIES && + !td_flag_test(vbd->state, TD_VBD_DEAD) && + !td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) + tapdisk_vbd_move_request(vreq, &vbd->failed_requests); + else + tapdisk_vbd_move_request(vreq, &vbd->completed_requests); + } +} + +static void +__tapdisk_vbd_complete_td_request(td_vbd_t *vbd, td_vbd_request_t *vreq, + td_request_t treq, int res) +{ + int err; + + err = (res <= 0 ? 
res : -res); + vbd->secs_pending -= treq.secs; + vreq->secs_pending -= treq.secs; + + vreq->blocked = treq.blocked; + + if (err) { + vreq->status = BLKIF_RSP_ERROR; + vreq->error = (vreq->error ? : err); + if (err != -EBUSY) { + vbd->errors++; + ERR(err, "req %"PRIu64": %s 0x%04x secs to " + "0x%08"PRIx64, vreq->req.id, + (treq.op == TD_OP_WRITE ? "write" : "read"), + treq.secs, treq.sec); + } + } + + tapdisk_vbd_complete_vbd_request(vbd, vreq); +} + +static void +__tapdisk_vbd_reissue_td_request(td_vbd_t *vbd, + td_image_t *image, td_request_t treq) +{ + td_image_t *parent; + td_vbd_request_t *vreq; + + vreq = (td_vbd_request_t *)treq.private; + gettimeofday(&vreq->last_try, NULL); + + vreq->submitting++; + + if (tapdisk_vbd_is_last_image(vbd, image)) { + memset(treq.buf, 0, treq.secs << SECTOR_SHIFT); + td_complete_request(treq, 0); + goto done; + } + + parent = tapdisk_vbd_next_image(image); + treq.image = parent; + + /* return zeros for requests that extend beyond end of parent image */ + if (treq.sec + treq.secs > parent->info.size) { + td_request_t clone = treq; + + if (parent->info.size > treq.sec) { + int secs = parent->info.size - treq.sec; + clone.sec += secs; + clone.secs -= secs; + clone.buf += (secs << SECTOR_SHIFT); + treq.secs = secs; + } else + treq.secs = 0; + + memset(clone.buf, 0, clone.secs << SECTOR_SHIFT); + td_complete_request(clone, 0); + + if (!treq.secs) + goto done; + } + + switch (treq.op) { + case TD_OP_WRITE: + td_queue_write(parent, treq); + break; + + case TD_OP_READ: + td_queue_read(parent, treq); + break; + } + +done: + vreq->submitting--; + if (!vreq->secs_pending) + tapdisk_vbd_complete_vbd_request(vbd, vreq); +} + +void +tapdisk_vbd_forward_request(td_request_t treq) +{ + td_vbd_t *vbd; + td_image_t *image; + td_vbd_request_t *vreq; + + image = treq.image; + vbd = (td_vbd_t *)image->private; + vreq = (td_vbd_request_t *)treq.private; + + gettimeofday(&vbd->ts, NULL); + + if (tapdisk_vbd_queue_ready(vbd)) + 
__tapdisk_vbd_reissue_td_request(vbd, image, treq); + else + __tapdisk_vbd_complete_td_request(vbd, vreq, treq, -EIO); +} + +static void +tapdisk_vbd_complete_td_request(td_request_t treq, int res) +{ + td_vbd_t *vbd; + td_image_t *image; + td_vbd_request_t *vreq; + + image = treq.image; + vbd = (td_vbd_t *)image->private; + vreq = (td_vbd_request_t *)treq.private; + + gettimeofday(&vbd->ts, NULL); + DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" " + "secs 0x%04x buf %p op %d res %d\n", image->name, + (int)treq.id, treq.sidx, treq.sec, treq.secs, + treq.buf, (int)vreq->req.operation, res); + + __tapdisk_vbd_complete_td_request(vbd, vreq, treq, res); +} + +static int +tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq) +{ + char *page; + td_ring_t *ring; + td_image_t *image; + td_request_t treq; + uint64_t sector_nr; + blkif_request_t *req; + int i, err, id, nsects; + + req = &vreq->req; + id = req->id; + ring = &vbd->ring; + sector_nr = req->sector_number; + image = tapdisk_vbd_first_image(vbd); + + vreq->submitting = 1; + gettimeofday(&vbd->ts, NULL); + gettimeofday(&vreq->last_try, NULL); + tapdisk_vbd_move_request(vreq, &vbd->pending_requests); + + err = tapdisk_vbd_check_queue(vbd); + if (err) + goto fail; + + err = tapdisk_image_check_ring_request(image, req); + if (err) + goto fail; + + for (i = 0; i < req->nr_segments; i++) { + nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1; + page = (char *)MMAP_VADDR(ring->vstart, + (unsigned long)req->id, i); + page += (req->seg[i].first_sect << SECTOR_SHIFT); + + treq.id = id; + treq.sidx = i; + treq.blocked = 0; + treq.buf = page; + treq.sec = sector_nr; + treq.secs = nsects; + treq.image = image; + treq.cb = tapdisk_vbd_complete_td_request; + treq.cb_data = NULL; + treq.private = vreq; + + DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" secs 0x%04x " + "buf %p op %d\n", image->name, id, i, treq.sec, treq.secs, + treq.buf, (int)req->operation); + + vreq->secs_pending += nsects; + 
vbd->secs_pending += nsects; + + switch (req->operation) { + case BLKIF_OP_WRITE: + treq.op = TD_OP_WRITE; + td_queue_write(image, treq); + break; + + case BLKIF_OP_READ: + treq.op = TD_OP_READ; + td_queue_read(image, treq); + break; + } + + sector_nr += nsects; + } + + err = 0; + +out: + vreq->submitting--; + if (!vreq->secs_pending) { + err = (err ? : vreq->error); + tapdisk_vbd_complete_vbd_request(vbd, vreq); + } + + return err; + +fail: + vreq->status = BLKIF_RSP_ERROR; + goto out; +} + +static int +tapdisk_vbd_reissue_failed_requests(td_vbd_t *vbd) +{ + int err; + struct timeval now; + td_vbd_request_t *vreq, *tmp; + + err = 0; + gettimeofday(&now, NULL); + + tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) { + if (vreq->secs_pending) + continue; + + if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) + goto fail; + + if (vreq->error != -EBUSY && + now.tv_sec - vreq->last_try.tv_sec < TD_VBD_RETRY_INTERVAL) + continue; + + if (vreq->num_retries >= TD_VBD_MAX_RETRIES) { + fail: + DBG(TLOG_INFO, "req %"PRIu64"retried %d times\n", + vreq->req.id, vreq->num_retries); + tapdisk_vbd_complete_vbd_request(vbd, vreq); + continue; + } + + /* + * never fail due to too many retries if we are blocked on a + * dependency + */ + if (vreq->blocked) { + vreq->blocked = 0; + } else { + vbd->retries++; + vreq->num_retries++; + } + vreq->error = 0; + vreq->status = BLKIF_RSP_OKAY; + DBG(TLOG_DBG, "retry #%d of req %"PRIu64", " + "sec 0x%08"PRIx64", nr_segs: %d\n", vreq->num_retries, + vreq->req.id, vreq->req.sector_number, + vreq->req.nr_segments); + + err = tapdisk_vbd_issue_request(vbd, vreq); + if (err) + break; + } + + if (list_empty(&vbd->failed_requests)) + td_flag_clear(vbd->state, TD_VBD_RETRY_NEEDED); + else + td_flag_set(vbd->state, TD_VBD_RETRY_NEEDED); + + return err; +} + +static int +tapdisk_vbd_issue_new_requests(td_vbd_t *vbd) +{ + int err; + td_vbd_request_t *vreq, *tmp; + + tapdisk_vbd_for_each_request(vreq, tmp, &vbd->new_requests) { + err = 
tapdisk_vbd_issue_request(vbd, vreq); + if (err) + return err; + } + + return 0; +} + +static int +tapdisk_vbd_kill_requests(td_vbd_t *vbd) +{ + td_vbd_request_t *vreq, *tmp; + + tapdisk_vbd_for_each_request(vreq, tmp, &vbd->new_requests) { + vreq->status = BLKIF_RSP_ERROR; + tapdisk_vbd_move_request(vreq, &vbd->completed_requests); + } + + tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) { + vreq->status = BLKIF_RSP_ERROR; + tapdisk_vbd_move_request(vreq, &vbd->completed_requests); + } + + return 0; +} + +int +tapdisk_vbd_issue_requests(td_vbd_t *vbd) +{ + int err; + + if (td_flag_test(vbd->state, TD_VBD_DEAD)) + return tapdisk_vbd_kill_requests(vbd); + + if (!tapdisk_vbd_queue_ready(vbd)) + return -EAGAIN; + + err = tapdisk_vbd_reissue_failed_requests(vbd); + if (err) + return err; + + return tapdisk_vbd_issue_new_requests(vbd); +} + +static void +tapdisk_vbd_pull_ring_requests(td_vbd_t *vbd) +{ + int idx; + RING_IDX rp, rc; + td_ring_t *ring; + blkif_request_t *req; + td_vbd_request_t *vreq; + + ring = &vbd->ring; + if (!ring->sring) + return; + + rp = ring->fe_ring.sring->req_prod; + xen_rmb(); + + for (rc = ring->fe_ring.req_cons; rc != rp; rc++) { + req = RING_GET_REQUEST(&ring->fe_ring, rc); + ++ring->fe_ring.req_cons; + + idx = req->id; + vreq = &vbd->request_list[idx]; + + ASSERT(list_empty(&vreq->next)); + ASSERT(vreq->secs_pending == 0); + + memcpy(&vreq->req, req, sizeof(blkif_request_t)); + vbd->received++; + vreq->vbd = vbd; + + tapdisk_vbd_move_request(vreq, &vbd->new_requests); + + DBG(TLOG_DBG, "%s: request %d \n", vbd->name, idx); + } +} + +static int +tapdisk_vbd_pause_ring(td_vbd_t *vbd) +{ + int err; + + if (td_flag_test(vbd->state, TD_VBD_PAUSED)) + return 0; + + td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED); + + err = tapdisk_vbd_quiesce_queue(vbd); + if (err) { + EPRINTF("%s: ring pause request on active queue\n", vbd->name); + return err; + } + + tapdisk_vbd_close_vdi(vbd); + + err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_PAUSE, 
0); + if (err) + EPRINTF("%s: pause ioctl failed: %d\n", vbd->name, errno); + else { + td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); + td_flag_set(vbd->state, TD_VBD_PAUSED); + } + + return err; +} + +static int +tapdisk_vbd_resume_ring(td_vbd_t *vbd) +{ + int i, err, type; + char *path, message[BLKTAP2_MAX_MESSAGE_LEN]; + + memset(message, 0, sizeof(message)); + + if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) { + EPRINTF("%s: resume message for unpaused vbd\n", vbd->name); + return -EINVAL; + } + + err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_REOPEN, &message); + if (err) { + EPRINTF("%s: resume ioctl failed: %d\n", vbd->name, errno); + return err; + } + + err = tapdisk_parse_disk_type(message, &path, &type); + if (err) { + EPRINTF("%s: invalid resume string %s\n", vbd->name, message); + goto out; + } + + free(vbd->name); + vbd->name = strdup(path); + if (!vbd->name) { + EPRINTF("resume malloc failed\n"); + err = -ENOMEM; + goto out; + } + vbd->type = type; + + tapdisk_vbd_start_queue(vbd); + + err = tapdisk_vbd_reactivate_volumes(vbd, 1); + if (err) { + EPRINTF("failed to reactivate %s, %d\n", vbd->name, err); + goto out; + } + + for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { + err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT); + if (err != -EIO) + break; + + sleep(TD_VBD_EIO_SLEEP); + } + +out: + if (!err) { + image_t image; + struct blktap2_params params; + + memset(¶ms, 0, sizeof(params)); + tapdisk_vbd_get_image_info(vbd, &image); + + params.sector_size = image.secsize; + params.capacity = image.size; + snprintf(params.name, sizeof(params.name) - 1, "%s", message); + + ioctl(vbd->ring.fd, BLKTAP2_IOCTL_SET_PARAMS, ¶ms); + td_flag_clear(vbd->state, TD_VBD_PAUSED); + } + + ioctl(vbd->ring.fd, BLKTAP2_IOCTL_RESUME, err); + return err; +} + +static int +tapdisk_vbd_check_ring_message(td_vbd_t *vbd) +{ + if (!vbd->ring.sring) + return -EINVAL; + + switch (vbd->ring.sring->pad[0]) { + case 0: + return 0; + + case BLKTAP2_RING_MESSAGE_PAUSE: + return 
tapdisk_vbd_pause_ring(vbd); + + case BLKTAP2_RING_MESSAGE_RESUME: + return tapdisk_vbd_resume_ring(vbd); + + case BLKTAP2_RING_MESSAGE_CLOSE: + return tapdisk_vbd_close(vbd); + + default: + return -EINVAL; + } +} + +static void +tapdisk_vbd_ring_event(event_id_t id, char mode, void *private) +{ + td_vbd_t *vbd; + + vbd = (td_vbd_t *)private; + + tapdisk_vbd_pull_ring_requests(vbd); + tapdisk_vbd_issue_requests(vbd); + + /* vbd may be destroyed after this call */ + tapdisk_vbd_check_ring_message(vbd); +} + +td_image_t * +tapdisk_vbd_first_image(td_vbd_t *vbd) +{ + return list_entry(vbd->images.next, td_image_t, next); +} diff --git a/tools/blktap2/drivers/tapdisk-vbd.h b/tools/blktap2/drivers/tapdisk-vbd.h new file mode 100644 index 0000000000..ecb22a0762 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk-vbd.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _TAPDISK_VBD_H_ +#define _TAPDISK_VBD_H_ + +#include <sys/time.h> +#include <xenctrl.h> +#include <xen/io/blkif.h> + +#include "tapdisk.h" +#include "scheduler.h" +#include "tapdisk-ipc.h" +#include "tapdisk-image.h" + +#define TD_VBD_MAX_RETRIES 100 +#define TD_VBD_RETRY_INTERVAL 1 + +#define TD_VBD_DEAD 0x0001 +#define TD_VBD_CLOSED 0x0002 +#define TD_VBD_QUIESCE_REQUESTED 0x0004 +#define TD_VBD_QUIESCED 0x0008 +#define TD_VBD_PAUSE_REQUESTED 0x0010 +#define TD_VBD_PAUSED 0x0020 +#define TD_VBD_SHUTDOWN_REQUESTED 0x0040 +#define TD_VBD_LOCKING 0x0080 +#define TD_VBD_RETRY_NEEDED 0x0100 +#define TD_VBD_LOG_DROPPED 0x0200 + +typedef struct td_ring td_ring_t; +typedef struct td_vbd_request td_vbd_request_t; +typedef struct td_vbd_handle td_vbd_t; +typedef void (*td_vbd_cb_t) (void *, blkif_response_t *); + +struct td_ring { + int fd; + char *mem; + blkif_sring_t *sring; + blkif_back_ring_t fe_ring; + unsigned long vstart; +}; + +struct td_vbd_request { + blkif_request_t req; + int16_t status; + + int error; + int blocked; /* blocked on a dependency */ + int submitting; + int secs_pending; + int num_retries; + struct timeval last_try; + + td_vbd_t *vbd; + struct list_head next; +}; + +struct td_vbd_handle { + char *name; + + td_uuid_t uuid; + int type; + + int storage; + + uint8_t reopened; + uint8_t reactivated; + td_flag_t flags; + td_flag_t state; + + td_ipc_t ipc; + + struct list_head images; + + struct list_head 
new_requests; + struct list_head pending_requests; + struct list_head failed_requests; + struct list_head completed_requests; + + td_vbd_request_t request_list[MAX_REQUESTS]; + + td_ring_t ring; + event_id_t ring_event_id; + + td_vbd_cb_t callback; + void *argument; + + struct list_head next; + + struct timeval ts; + + uint64_t received; + uint64_t returned; + uint64_t kicked; + uint64_t secs_pending; + uint64_t retries; + uint64_t errors; +}; + +#define tapdisk_vbd_for_each_request(vreq, tmp, list) \ + list_for_each_entry_safe((vreq), (tmp), (list), next) + +#define tapdisk_vbd_for_each_image(vbd, image, tmp) \ + list_for_each_entry_safe((image), (tmp), &(vbd)->images, next) + +static inline void +tapdisk_vbd_move_request(td_vbd_request_t *vreq, struct list_head *dest) +{ + list_del(&vreq->next); + INIT_LIST_HEAD(&vreq->next); + list_add_tail(&vreq->next, dest); +} + +static inline void +tapdisk_vbd_add_image(td_vbd_t *vbd, td_image_t *image) +{ + list_add_tail(&image->next, &vbd->images); +} + +static inline int +tapdisk_vbd_is_last_image(td_vbd_t *vbd, td_image_t *image) +{ + return list_is_last(&image->next, &vbd->images); +} + +td_image_t * +tapdisk_vbd_first_image(td_vbd_t *vbd); + +static inline td_image_t * +tapdisk_vbd_last_image(td_vbd_t *vbd) +{ + return list_entry(vbd->images.prev, td_image_t, next); +} + +static inline td_image_t * +tapdisk_vbd_next_image(td_image_t *image) +{ + return list_entry(image->next.next, td_image_t, next); +} + +int tapdisk_vbd_initialize(int, int, td_uuid_t); +void tapdisk_vbd_set_callback(td_vbd_t *, td_vbd_cb_t, void *); +int tapdisk_vbd_open(td_vbd_t *, const char *, uint16_t, + uint16_t, const char *, td_flag_t); +int tapdisk_vbd_close(td_vbd_t *); + +int tapdisk_vbd_open_vdi(td_vbd_t *, const char *, + uint16_t, uint16_t, td_flag_t); +void tapdisk_vbd_close_vdi(td_vbd_t *); + +void tapdisk_vbd_forward_request(td_request_t); + +int tapdisk_vbd_get_image_info(td_vbd_t *, image_t *); +int tapdisk_vbd_queue_ready(td_vbd_t 
*); +int tapdisk_vbd_retry_needed(td_vbd_t *); +int tapdisk_vbd_quiesce_queue(td_vbd_t *); +int tapdisk_vbd_start_queue(td_vbd_t *); +int tapdisk_vbd_issue_requests(td_vbd_t *); +int tapdisk_vbd_kill_queue(td_vbd_t *); +int tapdisk_vbd_pause(td_vbd_t *); +int tapdisk_vbd_resume(td_vbd_t *, const char *, uint16_t); +int tapdisk_vbd_kick(td_vbd_t *); +void tapdisk_vbd_check_state(td_vbd_t *); +void tapdisk_vbd_check_progress(td_vbd_t *); +void tapdisk_vbd_debug(td_vbd_t *); + +void tapdisk_vbd_complete_vbd_request(td_vbd_t *, td_vbd_request_t *); + +#endif diff --git a/tools/blktap2/drivers/tapdisk.c b/tools/blktap2/drivers/tapdisk.c new file mode 100644 index 0000000000..db1366afa4 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
/*
 * tapdisk entry point.
 *
 * Expects exactly two arguments, the read and write fifo paths (see
 * usage()).  The process detaches with daemon(), starts logging under
 * the "TAPDISK" tag, initializes the tapdisk server on the two fifos and
 * runs it until it returns.
 *
 * Returns 0 on success, the errno value if daemonizing fails, or the
 * error reported by tapdisk_server_initialize() / tapdisk_server_run().
 */
int
main(int argc, char *argv[])
{
	int err;

	if (argc != 3)
		usage();

	/*
	 * Logging has not been started yet, so a daemon() failure is
	 * reported on stderr directly rather than through EPRINTF.
	 */
	if (daemon(0, 0)) {
		err = errno;
		fprintf(stderr, "failed to daemonize: %d\n", err);
		return err;
	}

	tapdisk_start_logging("TAPDISK");

	err = tapdisk_server_initialize(argv[1], argv[2]);
	if (err) {
		EPRINTF("failed to initialize tapdisk server: %d\n", err);
		goto out;
	}

	err = tapdisk_server_run();

out:
	tapdisk_stop_logging();
	return err;
}
+ * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Some notes on the tap_disk interface: + * + * tap_disk aims to provide a generic interface to easily implement new + * types of image accessors. The structure-of-function-calls is similar + * to disk interfaces used in qemu/denali/etc, with the significant + * difference being the expectation of asynchronous rather than synchronous + * I/O. The asynchronous interface is intended to allow lots of requests to + * be pipelined through a disk, without the disk requiring any of its own + * threads of control. As such, a batch of requests is delivered to the disk + * using: + * + * td_queue_[read,write]() + * + * and passing in a completion callback, which the disk is responsible for + * tracking. Disks should transform these requests as necessary and return + * the resulting iocbs to tapdisk using td_prep_[read,write]() and + * td_queue_tiocb(). + * + * NOTE: tapdisk uses the number of sectors submitted per request as a + * ref count. 
Plugins must use the callback function to communicate the + * completion -- or error -- of every sector submitted to them. + * + * td_get_parent_id returns: + * 0 if parent id successfully retrieved + * TD_NO_PARENT if no parent exists + * -errno on error + */ + +#ifndef _TAPDISK_H_ +#define _TAPDISK_H_ + +#include <time.h> +#include <stdint.h> + +#include "list.h" +#include "blktaplib.h" +#include "disktypes.h" +#include "tapdisk-log.h" +#include "tapdisk-utils.h" + +#define MAX_SEGMENTS_PER_REQ 11 +#define SECTOR_SHIFT 9 +#define DEFAULT_SECTOR_SIZE 512 + +#define TAPDISK_DATA_REQUESTS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ) + +//#define BLK_NOT_ALLOCATED (-99) +#define TD_NO_PARENT 1 + +#define MAX_RAMDISK_SIZE 1024000 /*500MB disk limit*/ + +#define TD_OP_READ 0 +#define TD_OP_WRITE 1 + +#define TD_OPEN_QUIET 0x00001 +#define TD_OPEN_QUERY 0x00002 +#define TD_OPEN_RDONLY 0x00004 +#define TD_OPEN_STRICT 0x00008 +#define TD_OPEN_SHAREABLE 0x00010 +#define TD_OPEN_ADD_CACHE 0x00020 +#define TD_OPEN_VHD_INDEX 0x00040 +#define TD_OPEN_LOG_DIRTY 0x00080 + +#define TD_CREATE_SPARSE 0x00001 +#define TD_CREATE_MULTITYPE 0x00002 + +#define td_flag_set(word, flag) ((word) |= (flag)) +#define td_flag_clear(word, flag) ((word) &= ~(flag)) +#define td_flag_test(word, flag) ((word) & (flag)) + +typedef uint16_t td_uuid_t; +typedef uint32_t td_flag_t; +typedef uint64_t td_sector_t; +typedef struct td_disk_id td_disk_id_t; +typedef struct td_disk_info td_disk_info_t; +typedef struct td_request td_request_t; +typedef struct td_driver_handle td_driver_t; +typedef struct td_image_handle td_image_t; + +/* + * Prototype of the callback to activate as requests complete. 
+ */ +typedef void (*td_callback_t)(td_request_t, int); + +struct td_disk_id { + char *name; + int drivertype; +}; + +struct td_disk_info { + td_sector_t size; + long sector_size; + uint32_t info; +}; + +struct td_request { + int op; + char *buf; + td_sector_t sec; + int secs; + + uint8_t blocked; /* blocked on a dependency */ + + td_image_t *image; + + td_callback_t cb; + void *cb_data; + + uint64_t id; + int sidx; + void *private; +}; + +/* + * Structure describing the interface to a virtual disk implementation. + * See note at the top of this file describing this interface. + */ +struct tap_disk { + const char *disk_type; + td_flag_t flags; + int private_data_size; + int (*td_open) (td_driver_t *, const char *, td_flag_t); + int (*td_close) (td_driver_t *); + int (*td_get_parent_id) (td_driver_t *, td_disk_id_t *); + int (*td_validate_parent) (td_driver_t *, td_driver_t *, td_flag_t); + void (*td_queue_read) (td_driver_t *, td_request_t); + void (*td_queue_write) (td_driver_t *, td_request_t); + void (*td_debug) (td_driver_t *); +}; + +#endif diff --git a/tools/blktap2/drivers/tapdisk2.c b/tools/blktap2/drivers/tapdisk2.c new file mode 100644 index 0000000000..45b27ecc19 --- /dev/null +++ b/tools/blktap2/drivers/tapdisk2.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/ioctl.h> + +#include "tapdisk.h" +#include "blktap2.h" +#include "tapdisk-vbd.h" +#include "tapdisk-utils.h" +#include "tapdisk-server.h" + +#define TAPDISK2_VBD 0 + +#define cprintf(_err, _f, _a...) \ + do { \ + if (child_out) { \ + fprintf(child_out, "%d: " _f, _err, ##_a); \ + fflush(child_out); \ + } \ + } while (0) + +#define CHILD_ERR(_err, _f, _a...) 
\ + do { \ + EPRINTF(_f, ##_a); \ + cprintf(_err, _f, ##_a); \ + } while (0) + +static int channel[2]; +static FILE *child_out; +static struct blktap2_handle handle; + +static int +tapdisk2_prepare_directory(void) +{ + int err; + char *ptr, *name, *start; + + err = access(BLKTAP2_DIRECTORY, W_OK | R_OK); + if (!err) + return 0; + + name = strdup(BLKTAP2_DIRECTORY); + if (!name) + return -ENOMEM; + + start = name; + + for (;;) { + ptr = strchr(start + 1, '/'); + if (ptr) + *ptr = '\0'; + + err = mkdir(name, 0755); + if (err && errno != EEXIST) { + err = -errno; + CHILD_ERR(err, "failed to create directory %s: %d\n", + name, err); + break; + } + + if (!ptr) + break; + else { + *ptr = '/'; + start = ptr + 1; + } + } + + free(name); + return err; +} + +static int +tapdisk2_make_device(char *devname, int major, int minor, int perm) +{ + int err; + struct stat st; + + err = tapdisk2_prepare_directory(); + if (err) + return err; + + if (!access(devname, F_OK)) + if (unlink(devname)) { + CHILD_ERR(errno, "error unlinking %s: %d\n", + devname, errno); + return -errno; + } + + err = mknod(devname, perm, makedev(major, minor)); + if (err) { + CHILD_ERR(errno, "mknod %s failed: %d\n", devname, -errno); + return -errno; + } + + DPRINTF("Created %s device\n", devname); + return 0; +} + +static int +tapdisk2_check_environment(void) +{ + FILE *f; + int err, minor; + char name[256]; + + if (!access(BLKTAP2_CONTROL_DEVICE, R_OK | W_OK)) + return 0; + + memset(name, 0, sizeof(name)); + + f = fopen("/proc/misc", "r"); + if (!f) { + CHILD_ERR(errno, "failed to open /proc/misc: %d\n", errno); + return -errno; + } + + while (fscanf(f, "%d %256s", &minor, name) == 2) + if (!strcmp(name, BLKTAP2_CONTROL_NAME)) { + err = tapdisk2_make_device(BLKTAP2_CONTROL_DEVICE, + MISC_MAJOR_NUMBER, + minor, S_IFCHR | 0600); + goto out; + } + + err = -ENOSYS; + CHILD_ERR(err, "didn't find %s in /proc/misc\n", BLKTAP2_CONTROL_NAME); + +out: + fclose(f); + return err; +} + +static void 
+tapdisk2_free_device(void) +{ + int fd, err; + + fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY); + if (fd == -1) { + CHILD_ERR(errno, "failed to open control device: %d\n", errno); + return; + } + + err = ioctl(fd, BLKTAP2_IOCTL_FREE_TAP, handle.minor); + close(fd); +} + +static int +tapdisk2_prepare_device(void) +{ + char *name; + int fd, err; + + fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY); + if (fd == -1) { + CHILD_ERR(errno, "failed to open control device: %d\n", errno); + return -errno; + } + + err = ioctl(fd, BLKTAP2_IOCTL_ALLOC_TAP, &handle); + close(fd); + if (err == -1) { + CHILD_ERR(errno, "failed to allocate new device: %d\n", errno); + return -errno; + } + + err = asprintf(&name, "%s%d", BLKTAP2_RING_DEVICE, handle.minor); + if (err == -1) { + err = -ENOMEM; + goto fail; + } + + err = tapdisk2_make_device(name, handle.ring, + handle.minor, S_IFCHR | 0600); + free(name); + if (err) { + CHILD_ERR(err, "creating ring device for %d failed: %d\n", + handle.minor, err); + goto fail; + } + + err = asprintf(&name, "%s%d", BLKTAP2_IO_DEVICE, handle.minor); + if (err == -1) { + err = -ENOMEM; + goto fail; + } + + err = tapdisk2_make_device(name, handle.device, + handle.minor, S_IFBLK | 0600); + free(name); + if (err) { + CHILD_ERR(err, "creating IO device for %d failed: %d\n", + handle.minor, err); + goto fail; + } + + DPRINTF("new interface: ring: %u, device: %u, minor: %u\n", + handle.ring, handle.device, handle.minor); + + return 0; + +fail: + tapdisk2_free_device(); + return err; +} + +static int +tapdisk2_open_device(int type, const char *path, const char *name) +{ + int err; + td_vbd_t *vbd; + image_t image; + char *devname; + struct blktap2_params params; + + err = tapdisk_vbd_initialize(-1, -1, TAPDISK2_VBD); + if (err) + return err; + + vbd = tapdisk_server_get_vbd(TAPDISK2_VBD); + if (!vbd) { + err = -ENODEV; + CHILD_ERR(err, "couldn't find vbd\n"); + return err; + } + + err = asprintf(&devname, "%s%d", BLKTAP2_RING_DEVICE, handle.minor); + if (err == -1) 
{ + err = -ENOMEM; + CHILD_ERR(err, "couldn't allocate ring\n"); + return err; + } + + err = tapdisk_vbd_open(vbd, path, type, + TAPDISK_STORAGE_TYPE_DEFAULT, + devname, 0); + free(devname); + if (err) { + CHILD_ERR(err, "vbd open failed: %d\n", err); + return err; + } + + memset(¶ms, 0, sizeof(params)); + tapdisk_vbd_get_image_info(vbd, &image); + + params.capacity = image.size; + params.sector_size = image.secsize; + snprintf(params.name, sizeof(params.name) - 1, "%s", name); + + err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_CREATE_DEVICE, ¶ms); + if (err) { + err = -errno; + CHILD_ERR(err, "create device failed: %d\n", err); + return err; + } + + return 0; +} + +static int +tapdisk2_set_child_fds(void) +{ + int i, err; + + err = dup2(channel[1], STDOUT_FILENO); + if (err == -1) { + CHILD_ERR(errno, "failed duping pipe: %d\n", errno); + return errno; + } + + child_out = fdopen(STDOUT_FILENO, "w"); + if (!child_out) { + CHILD_ERR(errno, "failed setting child_out: %d\n", errno); + return errno; + } + + for (i = 0; i < sysconf(_SC_OPEN_MAX); i++) + if (i != STDOUT_FILENO) + close(i); + + return 0; +} + +static int +tapdisk2_create_device(const char *params) +{ + char *path; + int err, type; + + chdir("/"); + tapdisk_start_logging("tapdisk2"); + + err = tapdisk2_set_child_fds(); + if (err) + goto out; + + err = tapdisk2_check_environment(); + if (err) + goto out; + + err = tapdisk_parse_disk_type(params, &path, &type); + if (err) + goto out; + + err = tapdisk2_prepare_device(); + if (err) + goto out; + + err = tapdisk_server_initialize(NULL, NULL); + if (err) + goto fail; + + err = tapdisk2_open_device(type, path, params); + if (err) + goto fail; + + cprintf(0, "%s%d\n", BLKTAP2_IO_DEVICE, handle.minor); + close(STDOUT_FILENO); + + err = tapdisk_server_run(); + if (err) + goto fail; + + err = 0; + +out: + tapdisk_stop_logging(); + return err; + +fail: + tapdisk2_free_device(); + goto out; +} + +static int +tapdisk2_wait_for_device(void) +{ + int err; + char msg[1024]; + 
FILE *parent_in; + + close(channel[1]); + parent_in = fdopen(channel[0], "r"); + if (!parent_in) { + printf("failed to connect to child: %d\n", errno); + return errno; + } + + memset(msg, 0, sizeof(msg)); + if (fscanf(parent_in, "%d: %1023[^\n]", &err, msg) != 2) { + printf("unrecognized child response\n"); + return EINVAL; + } + + printf("%s\n", msg); + return (err >= 0 ? err : -err); +} + +static void +usage(const char *app, int err) +{ + fprintf(stderr, "usage: %s <-n file>\n", app); + exit(err); +} + +int +main(int argc, char *argv[]) +{ + int c; + char *params; + + params = NULL; + + while ((c = getopt(argc, argv, "n:h")) != -1) { + switch (c) { + case 'n': + params = optarg; + break; + case 'h': + usage(argv[0], 0); + default: + usage(argv[0], EINVAL); + } + } + + if (!params || optind != argc) + usage(argv[0], EINVAL); + + if (pipe(channel) == -1) { + printf("pipe failed: %d\n", errno); + return errno; + } + + switch (fork()) { + case -1: + printf("fork failed: %d\n", errno); + return errno; + case 0: + return tapdisk2_create_device(params); + default: + return tapdisk2_wait_for_device(); + } +} diff --git a/tools/blktap2/drivers/td.c b/tools/blktap2/drivers/td.c new file mode 100644 index 0000000000..f920acd294 --- /dev/null +++ b/tools/blktap2/drivers/td.c @@ -0,0 +1,691 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <unistd.h> +#include <string.h> + +#include "libvhd.h" +#include "vhd-util.h" +#include "tapdisk-utils.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf ( stdout, _f , ## _a ) +#else +#define DFPRINTF(_f, _a...) 
/*
 * Print the supported disk TYPE names to stderr as
 * "TYPE := { a | b | ... }".
 *
 * No semicolon follows the closing while (0): the caller's own ';'
 * completes the statement, so the macro is safe in an unbraced if/else.
 */
#define print_disk_types()						\
	do {								\
		int i;							\
		fprintf(stderr, "TYPE := { ");				\
		fprintf(stderr, "%s", td_disk_types[0]);		\
		for (i = 1; i < TD_TYPE_INVALID; i++)			\
			fprintf(stderr, " | %s", td_disk_types[i]);	\
		fprintf(stderr, " }\n");				\
	} while (0)
td_vdi_fields[0].name); \ + for (i = 1; i < TD_FIELD_INVALID; i++) \ + fprintf(stderr, " | %s", td_vdi_fields[i].name); \ + fprintf(stderr, " }\n"); \ + } while (0) + +void +help(void) +{ + fprintf(stderr, "Tapdisk Utilities: v1.0.0\n"); + fprintf(stderr, "usage: td-util COMMAND [TYPE] [OPTIONS]\n"); + print_commands(); + print_disk_types(); + exit(-1); +} + +struct command * +get_command(char *command) +{ + int i; + + for (i = 0; i < TD_CMD_INVALID; i++) + if (!strcmp(command, commands[i].name)) + return &commands[i]; + + return NULL; +} + +struct vdi_field * +get_field(char *field) +{ + int i; + + for (i = 0; i < TD_FIELD_INVALID; i++) + if (!strcmp(field, td_vdi_fields[i].name)) + return &td_vdi_fields[i]; + + return NULL; +} + +int +get_driver_type(char *type) +{ + int i; + + if (strnlen(type, 25) >= 25) + return -ENAMETOOLONG; + + for (i = 0; i < TD_TYPE_INVALID; i++) + if (!strcmp(type, td_disk_types[i])) + return i; + + return -TD_TYPE_INVALID; +} + +int +td_create(int type, int argc, char *argv[]) +{ + ssize_t mb; + uint64_t size; + char *name, *buf; + int c, i, fd, sparse = 1, fixedsize = 0; + + while ((c = getopt(argc, argv, "hrb")) != -1) { + switch(c) { + case 'r': + sparse = 0; + break; + case 'b': + fixedsize = 1; + break; + default: + fprintf(stderr, "Unknown option %c\n", (char)c); + case 'h': + goto usage; + } + } + + if (optind != (argc - 2)) + goto usage; + + mb = 1 << 20; + size = atoi(argv[optind++]); + size = size << 20; + name = argv[optind]; + + if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { + fprintf(stderr, "Device name too long\n"); + return ENAMETOOLONG; + } + + if (type == TD_TYPE_VHD) { + int cargc = 0; + char sbuf[32], *cargv[10]; + + size >>= 20; + + memset(cargv, 0, sizeof(cargv)); + snprintf(sbuf, sizeof(sbuf) - 1, "%"PRIu64, size); + cargv[cargc++] = "create"; + cargv[cargc++] = "-n"; + cargv[cargc++] = name; + cargv[cargc++] = "-s"; + cargv[cargc++] = sbuf; + if (!sparse) + cargv[cargc++] = "-r"; + if (fixedsize) + 
cargv[cargc++] = "-b"; + + return vhd_util_create(cargc, cargv); + } + + /* generic create */ + if (sparse) { + fprintf(stderr, "Cannot create sparse %s image\n", + td_disk_types[type]); + return EINVAL; + } + + buf = calloc(1, mb); + if (!buf) + return ENOMEM; + + fd = open(name, O_WRONLY | O_DIRECT | O_CREAT | O_TRUNC, 0644); + if (fd == -1) { + free(buf); + return errno; + } + + size >>= 20; + for (i = 0; i < size; i++) + if (write(fd, buf, mb) != mb) { + close(fd); + unlink(name); + free(buf); + return EIO; + } + + close(fd); + free(buf); + return 0; + + usage: + fprintf(stderr, "usage: td-util create %s [-h help] [-r reserve] " + "[-b file_is_fixed_size] <SIZE(MB)> <FILENAME>\n", + td_disk_types[type]); + return EINVAL; +} + +int +td_snapshot(int type, int argc, char *argv[]) +{ + char *cargv[10]; + int c, err, cargc; + struct stat stats; + char *name, *backing, *limit = NULL; + int fixedsize = 0, rawparent = 0; + + if (type != TD_TYPE_VHD) { + fprintf(stderr, "Cannot create snapshot of %s image type\n", + td_disk_types[type]); + return EINVAL; + } + + while ((c = getopt(argc, argv, "hbml:")) != -1) { + switch(c) { + case 'b': + fixedsize = 1; + break; + case 'm': + rawparent = 1; + break; + case 'l': + limit = optarg; + break; + case 'h': + err = 0; + goto usage; + default: + err = EINVAL; + goto usage; + } + } + + if (optind != (argc - 2)) { + err = EINVAL; + goto usage; + } + + name = argv[optind++]; + backing = argv[optind++]; + + if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN || + strnlen(backing, MAX_NAME_LEN) == MAX_NAME_LEN) { + fprintf(stderr, "Device name too long\n"); + return ENAMETOOLONG; + } + + if (stat(backing, &stats) == -1) { + fprintf(stderr, "File %s not found\n", backing); + return errno; + } + + cargc = 0; + memset(cargv, 0, sizeof(cargv)); + cargv[cargc++] = "snapshot"; + cargv[cargc++] = "-n"; + cargv[cargc++] = name; + cargv[cargc++] = "-p"; + cargv[cargc++] = backing; + if (fixedsize) + cargv[cargc++] = "-b"; + if (rawparent) + 
cargv[cargc++] = "-m"; + if (limit) { + cargv[cargc++] = "-l"; + cargv[cargc++] = limit; + } + return vhd_util_snapshot(cargc, cargv); + + usage: + fprintf(stderr, "usage: td-util snapshot %s [-h help] [-m parent_raw] " + "[-b file_is_fixed_size] [-l snapshot depth limit] " + "<FILENAME> <BACKING_FILENAME>\n", td_disk_types[type]); + return err; +} + +int +td_coalesce(int type, int argc, char *argv[]) +{ + int c, ret, cargc; + char *name, *pname, *cargv[3]; + + if (type != TD_TYPE_VHD) { + fprintf(stderr, "Cannot create snapshot of %s image type\n", + td_disk_types[type]); + return EINVAL; + } + + while ((c = getopt(argc, argv, "h")) != -1) { + switch(c) { + default: + fprintf(stderr, "Unknown option %c\n", (char)c); + case 'h': + goto usage; + } + } + + if (optind != (argc - 1)) + goto usage; + + name = argv[optind++]; + + if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { + fprintf(stderr, "Device name too long\n"); + return ENAMETOOLONG; + } + + cargc = 0; + memset(cargv, 0, sizeof(cargv)); + cargv[cargc++] = "coalesce"; + cargv[cargc++] = "-n"; + cargv[cargc++] = name; + ret = vhd_util_coalesce(cargc, cargv); + if (ret) + printf("coalesce failed: %d\n", ret); + + return ret; + + usage: + fprintf(stderr, "usage: td-util coalesce %s [-h help] " + "<FILENAME>\n", td_disk_types[type]); + return EINVAL; +} + +int +td_query(int type, int argc, char *argv[]) +{ + char *name; + int c, size = 0, parent = 0, fields = 0, depth = 0, err = 0; + + while ((c = getopt(argc, argv, "hvpfd")) != -1) { + switch(c) { + case 'v': + size = 1; + break; + case 'p': + parent = 1; + break; + case 'f': + fields = 1; + break; + case 'd': + depth = 1; + break; + case 'h': + err = 0; + goto usage; + default: + err = EINVAL; + goto usage; + } + } + + if (optind != (argc - 1)) { + err = EINVAL; + goto usage; + } + + name = argv[optind++]; + + if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { + fprintf(stderr, "Device name too long\n"); + return ENAMETOOLONG; + } + + if (type == TD_TYPE_VHD) { 
+ vhd_context_t vhd; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); + if (err) { + printf("failed opening %s: %d\n", name, err); + return err; + } + + if (size) + printf("%"PRIu64"\n", vhd.footer.curr_size >> 20); + + if (parent) { + if (vhd.footer.type != HD_TYPE_DIFF) + printf("%s has no parent\n", name); + else { + char *pname; + + err = vhd_parent_locator_get(&vhd, &pname); + if (err) + printf("failed getting parent: %d\n", + err); + else { + printf("%s\n", pname); + free(pname); + } + } + } + + if (fields) { + int ret, hidden; + + ret = vhd_hidden(&vhd, &hidden); + if (ret) { + printf("failed checking 'hidden' field: %d\n", + ret); + err = (err ? : ret); + } else + printf("%s: %d\n", + td_vdi_fields[TD_FIELD_HIDDEN].name, + hidden); + } + + if (depth) { + int ret, length; + + ret = vhd_chain_depth(&vhd, &length); + if (ret) + printf("error checking chain depth: %d\n", ret); + else + printf("chain depth: %d\n", length); + + err = (err ? : ret); + } + + vhd_close(&vhd); + + } else if (type == TD_TYPE_AIO) { + if (size) { + int fd; + uint64_t secs; + uint32_t ssize; + + fd = open(name, O_RDONLY | O_LARGEFILE); + if (fd == -1) { + printf("failed opening %s: %d\n", name, errno); + return -errno; + } + + err = tapdisk_get_image_size(fd, &secs, &ssize); + close(fd); + + if (err) { + printf("failed getting size for %s: %d\n:", + name, err); + return err; + } + + printf("%"PRIu64"\n", secs >> 11); + } + + if (parent) + printf("%s has no parent\n", name); + + if (fields) { + int i; + + for (i = 0; i < TD_FIELD_INVALID; i++) + printf("%s: 0\n", td_vdi_fields[i].name); + } + } + + return err; + + usage: + fprintf(stderr, "usage: td-util query %s [-h help] [-v virtsize] " + "[-p parent] [-f fields] <FILENAME>\n", td_disk_types[type]); + return err; +} + +int +td_set_field(int type, int argc, char *argv[]) +{ + int ret, i, c, cargc; + struct vdi_field *field; + char *name, *value, *cargv[7]; + + if (type != TD_TYPE_VHD) { + fprintf(stderr, "Cannot set fields of %s 
images\n", + td_disk_types[type]); + return EINVAL; + } + + while ((c = getopt(argc, argv, "h")) != -1) { + switch(c) { + default: + fprintf(stderr, "Unknown option %c\n", (char)c); + case 'h': + goto usage; + } + } + + if (optind != (argc - 3)) + goto usage; + + name = argv[optind++]; + + field = get_field(argv[optind]); + if (!field || field->id != TD_FIELD_HIDDEN) { + fprintf(stderr, "Invalid field %s\n", argv[optind]); + goto usage; + } + + value = argv[++optind]; + + cargc = 0; + memset(cargv, 0, sizeof(cargv)); + cargv[cargc++] = "set"; + cargv[cargc++] = "-n"; + cargv[cargc++] = name; + cargv[cargc++] = "-f"; + cargv[cargc++] = field->name; + cargv[cargc++] = "-v"; + cargv[cargc++] = value; + return vhd_util_set_field(cargc, cargv); + + usage: + fprintf(stderr, "usage: td-util set %s [-h help] " + "<FILENAME> <FIELD> <VALUE>\n", td_disk_types[type]); + print_field_names(); + return EINVAL; +} + +int +main(int argc, char *argv[]) +{ + char **cargv; + struct command *cmd; + int cargc, i, type = -1, ret = 0; + +#ifdef CORE_DUMP + struct rlimit rlim; + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_CORE, &rlim) < 0) + fprintf(stderr, "setrlimit failed: %d\n", errno); +#endif + + if (argc < 2) + help(); + + cargc = argc - 1; + cmd = get_command(argv[1]); + if (!cmd) { + fprintf(stderr, "invalid COMMAND %s\n", argv[1]); + help(); + } + + if (cmd->needs_type) { + if (argc < 3) { + fprintf(stderr, "td-util %s requires a TYPE\n", + cmd->name); + print_disk_types(); + exit(-1); + } + + type = get_driver_type(argv[2]); + if (type < 0) { + fprintf(stderr, "invalid TYPE '%s'.\n", argv[2]); + print_disk_types(); + exit(-1); + } + --cargc; + } + + cargv = malloc(sizeof(char *) * cargc); + if (!cargv) + exit(ENOMEM); + + cargv[0] = cmd->name; + for (i = 1; i < cargc; i++) + cargv[i] = argv[i + (argc - cargc)]; + + switch(cmd->id) { + case TD_CMD_CREATE: + ret = td_create(type, cargc, cargv); + break; + case TD_CMD_SNAPSHOT: + ret = 
td_snapshot(type, cargc, cargv); + break; +/* + case TD_CMD_COALESCE: + ret = td_coalesce(type, cargc, cargv); + break; +*/ + case TD_CMD_QUERY: + ret = td_query(type, cargc, cargv); + break; +/* + case TD_CMD_RESIZE: + ret = td_resize(type, cargc, cargv); + break; +*/ + case TD_CMD_SET: + ret = td_set_field(type, cargc, cargv); + break; +/* + case TD_CMD_REPAIR: + ret = td_repair(type, cargc, cargv); + break; + case TD_CMD_FILL: + ret = td_fill(type, cargc, cargv); + break; + case TD_CMD_READ: + ret = td_read(type, cargc, cargv); + break; +*/ + default: + case TD_CMD_INVALID: + ret = EINVAL; + break; + } + + free(cargv); + + return (ret >= 0 ? ret : -ret); +} diff --git a/tools/blktap2/drivers/xmsnap b/tools/blktap2/drivers/xmsnap new file mode 100644 index 0000000000..f14351ba56 --- /dev/null +++ b/tools/blktap2/drivers/xmsnap @@ -0,0 +1,78 @@ +#!/bin/bash + +usage () { echo "USAGE: xmsnap <VM ID> <Backing File>"; } + +# +# Check Usage +# +if [ -n "$1" ] +then + vmid=$1 +else + usage + exit 1 +fi + +if [ -n "$2" ] +then + target=$2 +else + usage + exit 1 +fi + +if [ -e "$target" ] +then + echo "Creating snapshot of file $target for VM $vmid." +else + usage + echo "File $target not found." + exit 1 +fi + +# +# Find the snapshot name: one past the highest existing .snapN suffix +# +directory=`dirname "$target"` +target=`basename "$target"` + +let maxidx=0 +if [ -e "$directory/${target}.snap1" ] +then + for idx in "$directory/${target}".snap* + do + let idx=${idx#$directory/${target}.snap} + if [ "$idx" -gt "$maxidx" ] + then + let maxidx=$idx + fi + done +fi + +snap=${target}.snap`expr $maxidx + 1` + +# +# Pause VM +# +xm pause "$vmid" +if [ "$?" -ne "0" ]; then + exit 1 +fi + + +# +# Snap and reposition the files +# +mv "$directory/$target" "$directory/$snap" +if [ "$?" -ne "0" ]; then + # do not leave the VM paused when the rename fails + xm unpause "$vmid" + exit 1 +fi + +qcow-create 0 "$directory/$target" "$directory/$snap" + +# +# Unpause +# +xm unpause "$vmid" + +exit
\ No newline at end of file diff --git a/tools/blktap2/include/Makefile b/tools/blktap2/include/Makefile new file mode 100644 index 0000000000..7267eac53a --- /dev/null +++ b/tools/blktap2/include/Makefile @@ -0,0 +1,14 @@ +XEN_ROOT := ../../../ +include $(XEN_ROOT)/tools/Rules.mk + +.PHONY: all +all: + +.PHONY: install +install: + $(INSTALL_DIR) -p $(DESTDIR)$(INCLUDEDIR) + + +.PHONY: clean +clean: + @: diff --git a/tools/blktap2/include/atomicio.h b/tools/blktap2/include/atomicio.h new file mode 100644 index 0000000000..7eccf206b3 --- /dev/null +++ b/tools/blktap2/include/atomicio.h @@ -0,0 +1,33 @@ +/* $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $ */ + +/* + * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Ensure all of data on socket comes through. f==read || f==vwrite + */ +size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t); + +#define vwrite (ssize_t (*)(int, void *, size_t))write diff --git a/tools/blktap2/include/blktaplib.h b/tools/blktap2/include/blktaplib.h new file mode 100644 index 0000000000..1824afa943 --- /dev/null +++ b/tools/blktap2/include/blktaplib.h @@ -0,0 +1,249 @@ +/* blktaplib.h + * + * Blktap library userspace code. + * + * Copyright (c) 2007, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __BLKTAPLIB_H__ +#define __BLKTAPLIB_H__ + +#include <syslog.h> +#include <xenctrl.h> +#include <xen/io/blkif.h> + +#if 1 +#define DPRINTF(_f, _a...) syslog(LOG_INFO, _f, ##_a) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a) + +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, XC_PAGE_SIZE) + +/* size of the extra VMA area to map in attached pages. 
*/ +#define BLKTAP_VMA_PAGES BLK_RING_SIZE + +/* blktap IOCTLs: These must correspond with the blktap driver ioctls */ +#define BLKTAP_IOCTL_KICK_FE 1 +#define BLKTAP_IOCTL_KICK_BE 2 +#define BLKTAP_IOCTL_SETMODE 3 +#define BLKTAP_IOCTL_SENDPID 4 +#define BLKTAP_IOCTL_NEWINTF 5 +#define BLKTAP_IOCTL_MINOR 6 +#define BLKTAP_IOCTL_MAJOR 7 +#define BLKTAP_QUERY_ALLOC_REQS 8 +#define BLKTAP_IOCTL_FREEINTF 9 +#define BLKTAP_IOCTL_PRINT_IDXS 100 +#define BLKTAP_IOCTL_BACKDEV_SETUP 200 + +#define PRIO_SPECIAL_IO -9999 + +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ +#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ +#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 +#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 + +#define BLKTAP_MODE_INTERPOSE \ + (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) + +static inline int BLKTAP_MODE_VALID(unsigned long arg) +{ + return ( + ( arg == BLKTAP_MODE_PASSTHROUGH ) || + ( arg == BLKTAP_MODE_INTERCEPT_FE ) || + ( arg == BLKTAP_MODE_INTERPOSE ) ); +} + +#define MAX_REQUESTS BLK_RING_SIZE + +#define BLKTAP_IOCTL_KICK 1 +#define MAX_PENDING_REQS BLK_RING_SIZE +#define BLKTAP_DEV_DIR "/dev/xen" +#define BLKTAP_DEV_NAME "blktap" +#define BACKDEV_NAME "backdev" +#define BLKTAP_DEV_MINOR 0 +#define BLKTAP_CTRL_DIR "/var/run/tap" + +extern int blktap_major; + +#define BLKTAP_RING_PAGES 1 /* Front */ +#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES) + +struct blkif; +struct blkif_info; + +typedef struct { + blkif_request_t req; + int submitting; + int secs_pending; + int16_t status; + int num_retries; + struct timeval last_try; +} pending_req_t; + +typedef struct blkif { + domid_t domid; + long int handle; + + long int pdev; + long int readonly; + + enum { DISCONNECTED, DISCONNECTING, CONNECTED } state; + + struct blkif_ops *ops; + struct blkif *hash_next; + + void *prv; /* device-specific data */ + struct blkif_info *info; /*Image parameter passing */ + pending_req_t pending_list[MAX_REQUESTS]; + int 
devnum; + int fds[2]; + int be_id; + char *backend_path; + int major; + int minor; + pid_t tappid; + int drivertype; + uint16_t cookie; + int err; +} blkif_t; + +typedef struct blkif_info { + char *params; + int readonly; + int storage; +} blkif_info_t; + +typedef struct tapdev_info { + int fd; + char *mem; + blkif_sring_t *sring; + blkif_back_ring_t fe_ring; + unsigned long vstart; + blkif_t *blkif; +} tapdev_info_t; + +typedef struct domid_translate { + unsigned short domid; + unsigned short busid; +} domid_translate_t ; + +typedef struct image { + unsigned long long size; + unsigned long secsize; + unsigned int info; +} image_t; + +typedef struct msg_hdr { + uint16_t type; + uint16_t len; + uint16_t drivertype; + uint16_t cookie; +} msg_hdr_t; + +typedef struct msg_params { + uint8_t readonly; + int path_off; + int path_len; + int storage; +} msg_params_t; + +typedef struct msg_newdev { + uint8_t devnum; + uint16_t domid; +} msg_newdev_t; + +typedef struct msg_pid { + pid_t pid; +} msg_pid_t; + +typedef struct msg_cp { + int cp_uuid_off; + int cp_uuid_len; + int cp_drivertype; +} msg_cp_t; + +typedef struct msg_lock { + int ro; + int enforce; + int uuid_off; + int uuid_len; +} msg_lock_t; + +#define READ 0 +#define WRITE 1 + +/* Control messages between manager and tapdev */ +#define CTLMSG_PARAMS 1 +#define CTLMSG_IMG 2 +#define CTLMSG_IMG_FAIL 3 +#define CTLMSG_NEWDEV 4 +#define CTLMSG_NEWDEV_RSP 5 +#define CTLMSG_NEWDEV_FAIL 6 +#define CTLMSG_CLOSE 7 +#define CTLMSG_CLOSE_RSP 8 +#define CTLMSG_PID 9 +#define CTLMSG_PID_RSP 10 +#define CTLMSG_CHECKPOINT 11 +#define CTLMSG_CHECKPOINT_RSP 12 +#define CTLMSG_LOCK 13 +#define CTLMSG_LOCK_RSP 14 +#define CTLMSG_PAUSE 15 +#define CTLMSG_PAUSE_RSP 16 +#define CTLMSG_RESUME 17 +#define CTLMSG_RESUME_RSP 18 + +#define TAPDISK_STORAGE_TYPE_NFS 1 +#define TAPDISK_STORAGE_TYPE_EXT 2 +#define TAPDISK_STORAGE_TYPE_LVM 3 +#define TAPDISK_STORAGE_TYPE_DEFAULT TAPDISK_STORAGE_TYPE_EXT + +/* Arbitrary values, must match the
underlying driver... */ +#define MAX_TAP_DEV 256 + +/* Accessing attached data page mappings */ +#define MMAP_PAGES \ + (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_vstart,_req,_seg) \ + ((_vstart) + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) + \ + ((_seg) * getpagesize())) + +/* Defines that are only used by library clients */ + +#ifndef __COMPILING_BLKTAP_LIB + +static char *blkif_op_name[] = { + [BLKIF_OP_READ] = "READ", + [BLKIF_OP_WRITE] = "WRITE", +}; + +#endif /* __COMPILING_BLKTAP_LIB */ + +#endif /* __BLKTAPLIB_H__ */ diff --git a/tools/blktap2/include/libvhd-journal.h b/tools/blktap2/include/libvhd-journal.h new file mode 100644 index 0000000000..2f32ff02ca --- /dev/null +++ b/tools/blktap2/include/libvhd-journal.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _VHD_JOURNAL_H_ +#define _VHD_JOURNAL_H_ + +#include <inttypes.h> + +#include "libvhd.h" + +#define VHD_JOURNAL_METADATA 0x01 +#define VHD_JOURNAL_DATA 0x02 + +#define VHD_JOURNAL_HEADER_COOKIE "vjournal" +#define VHD_JOURNAL_ENTRY_COOKIE 0xaaaa12344321aaaa + +typedef struct vhd_journal_header { + char cookie[8]; + uuid_t uuid; + uint64_t vhd_footer_offset; + uint32_t journal_data_entries; + uint32_t journal_metadata_entries; + uint64_t journal_data_offset; + uint64_t journal_metadata_offset; + uint64_t journal_eof; + char pad[448]; +} vhd_journal_header_t; + +typedef struct vhd_journal { + char *jname; + int jfd; + int is_block; /* is jfd a block device */ + vhd_journal_header_t header; + vhd_context_t vhd; +} vhd_journal_t; + +int vhd_journal_create(vhd_journal_t *, const char *file, const char *jfile); +int vhd_journal_open(vhd_journal_t *, const char *file, const char *jfile); +int vhd_journal_add_block(vhd_journal_t *, uint32_t block, char mode); +int vhd_journal_commit(vhd_journal_t *); +int vhd_journal_revert(vhd_journal_t *); +int vhd_journal_close(vhd_journal_t *); +int vhd_journal_remove(vhd_journal_t *); + +#endif diff --git a/tools/blktap2/include/libvhd.h b/tools/blktap2/include/libvhd.h new file mode 100644 index 0000000000..b128ebaf38 --- /dev/null +++ b/tools/blktap2/include/libvhd.h @@ -0,0 +1,308 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef _VHD_LIB_H_ +#define _VHD_LIB_H_ + +#include <string.h> +#include <endian.h> +#include <byteswap.h> +#include <uuid/uuid.h> + +#include "vhd.h" + +/* + * The BE*_IN/BE*_OUT macros byte-swap the pointed-to value in place on + * little-endian hosts and expand to nothing on other hosts. Both branches + * must define the same six names. + */ +#if BYTE_ORDER == LITTLE_ENDIAN + #define BE16_IN(foo) (*(foo)) = bswap_16(*(foo)) + #define BE32_IN(foo) (*(foo)) = bswap_32(*(foo)) + #define BE64_IN(foo) (*(foo)) = bswap_64(*(foo)) + #define BE16_OUT(foo) (*(foo)) = bswap_16(*(foo)) + #define BE32_OUT(foo) (*(foo)) = bswap_32(*(foo)) + #define BE64_OUT(foo) (*(foo)) = bswap_64(*(foo)) +#else + #define BE16_IN(foo) + #define BE32_IN(foo) + #define BE64_IN(foo) + #define BE16_OUT(foo) + #define BE32_OUT(foo) + #define BE64_OUT(foo) +#endif + +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) + +#define VHD_MAX_NAME_LEN 1024 + +#define VHD_BLOCK_SHIFT 21 +#define VHD_BLOCK_SIZE (1ULL << VHD_BLOCK_SHIFT) + +#define UTF_16 "UTF-16" +#define UTF_16LE "UTF-16LE" +#define UTF_16BE "UTF-16BE" + +#define VHD_OPEN_RDONLY 0x00001 +#define VHD_OPEN_RDWR 0x00002 +#define VHD_OPEN_FAST 0x00004 +#define VHD_OPEN_STRICT 0x00008 +#define VHD_OPEN_IGNORE_DISABLED 0x00010 + +#define VHD_FLAG_CREAT_PARENT_RAW 0x00001 + +#define vhd_flag_set(word, flag) ((word) |= (flag)) +#define vhd_flag_clear(word, flag) ((word) &= ~(flag)) +#define vhd_flag_test(word, flag) ((word) & (flag)) + + +#define ENABLE_FAILURE_TESTING +#define FAIL_REPARENT_BEGIN 0 +#define FAIL_REPARENT_LOCATOR 1 +#define FAIL_REPARENT_END 2 +#define FAIL_RESIZE_BEGIN 3 +#define FAIL_RESIZE_DATA_MOVED 4 +#define FAIL_RESIZE_METADATA_MOVED 5 +#define FAIL_RESIZE_END 6 +#define NUM_FAIL_TESTS 7 + +#ifdef ENABLE_FAILURE_TESTING +#define TEST_FAIL_AT(point) \ + if (TEST_FAIL[point]) { \ + printf("Failing at %s\n", ENV_VAR_FAIL[point]); exit(EINVAL); } +#define TEST_FAIL_EXTERN_VARS \ + extern const char* ENV_VAR_FAIL[]; \ + extern int TEST_FAIL[]; +#else +#define TEST_FAIL_AT(point) +#define TEST_FAIL_EXTERN_VARS +#endif // ENABLE_FAILURE_TESTING + + +static const char
VHD_POISON_COOKIE[] = "v_poison"; + +typedef struct hd_ftr vhd_footer_t; +typedef struct dd_hdr vhd_header_t; +typedef struct vhd_bat vhd_bat_t; +typedef struct vhd_batmap vhd_batmap_t; +typedef struct dd_batmap_hdr vhd_batmap_header_t; +typedef struct prt_loc vhd_parent_locator_t; +typedef struct vhd_context vhd_context_t; +typedef uint32_t vhd_flag_creat_t; + +struct vhd_bat { + uint32_t spb; + uint32_t entries; + uint32_t *bat; +}; + +struct vhd_batmap { + vhd_batmap_header_t header; + char *map; +}; + +struct vhd_context { + int fd; + char *file; + int oflags; + int is_block; + + uint32_t spb; + uint32_t bm_secs; + + vhd_header_t header; + vhd_footer_t footer; + vhd_bat_t bat; + vhd_batmap_t batmap; +}; + +static inline uint32_t +secs_round_up(uint64_t bytes) +{ + return ((bytes + (VHD_SECTOR_SIZE - 1)) >> VHD_SECTOR_SHIFT); +} + +static inline uint32_t +secs_round_up_no_zero(uint64_t bytes) +{ + return (secs_round_up(bytes) ? : 1); +} + +static inline uint64_t +vhd_sectors_to_bytes(uint64_t sectors) +{ + return sectors << VHD_SECTOR_SHIFT; +} + +static inline uint64_t +vhd_bytes_padded(uint64_t bytes) +{ + return vhd_sectors_to_bytes(secs_round_up_no_zero(bytes)); +} + +static inline int +vhd_type_dynamic(vhd_context_t *ctx) +{ + return (ctx->footer.type == HD_TYPE_DYNAMIC || + ctx->footer.type == HD_TYPE_DIFF); +} + +static inline int +vhd_creator_tapdisk(vhd_context_t *ctx) +{ + return !strncmp(ctx->footer.crtr_app, "tap", 3); +} + +static inline int +vhd_disabled(vhd_context_t *ctx) +{ + return (!memcmp(ctx->footer.cookie, + VHD_POISON_COOKIE, sizeof(ctx->footer.cookie))); +} + +static inline size_t +vhd_parent_locator_size(vhd_parent_locator_t *loc) +{ + /* + * MICROSOFT_COMPAT + * data_space *should* be in sectors, + * but sometimes we find it in bytes + */ + if (loc->data_space < 512) + return vhd_sectors_to_bytes(loc->data_space); + else if (loc->data_space % 512 == 0) + return loc->data_space; + else + return 0; +} + +static inline int 
+vhd_parent_raw(vhd_context_t *ctx) +{ + return uuid_is_null(ctx->header.prt_uuid); +} + +void libvhd_set_log_level(int); + +int vhd_test_file_fixed(const char *, int *); + +uint32_t vhd_time(time_t time); +size_t vhd_time_to_string(uint32_t timestamp, char *target); +uint32_t vhd_chs(uint64_t size); + +uint32_t vhd_checksum_footer(vhd_footer_t *); +uint32_t vhd_checksum_header(vhd_header_t *); +uint32_t vhd_checksum_batmap(vhd_batmap_t *); + +void vhd_footer_in(vhd_footer_t *); +void vhd_footer_out(vhd_footer_t *); +void vhd_header_in(vhd_header_t *); +void vhd_header_out(vhd_header_t *); +void vhd_bat_in(vhd_bat_t *); +void vhd_bat_out(vhd_bat_t *); +void vhd_batmap_header_in(vhd_batmap_t *); +void vhd_batmap_header_out(vhd_batmap_t *); + +int vhd_validate_footer(vhd_footer_t *footer); +int vhd_validate_header(vhd_header_t *header); +int vhd_validate_batmap_header(vhd_batmap_t *batmap); +int vhd_validate_batmap(vhd_batmap_t *batmap); +int vhd_validate_platform_code(uint32_t code); + +int vhd_open(vhd_context_t *, const char *file, int flags); +void vhd_close(vhd_context_t *); +int vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t); +/* vhd_snapshot: the bytes parameter is optional and can be 0 if the snapshot + * is to have the same size as the (first non-empty) parent */ +int vhd_snapshot(const char *snapshot, uint64_t bytes, const char *parent, + vhd_flag_creat_t); + +int vhd_hidden(vhd_context_t *, int *); +int vhd_chain_depth(vhd_context_t *, int *); + +off64_t vhd_position(vhd_context_t *); +int vhd_seek(vhd_context_t *, off64_t, int); +int vhd_read(vhd_context_t *, void *, size_t); +int vhd_write(vhd_context_t *, void *, size_t); + +int vhd_offset(vhd_context_t *, uint32_t, uint32_t *); + +int vhd_end_of_headers(vhd_context_t *ctx, off64_t *off); +int vhd_end_of_data(vhd_context_t *ctx, off64_t *off); +int vhd_batmap_header_offset(vhd_context_t *ctx, off64_t *off); + +int vhd_get_header(vhd_context_t *); +int 
vhd_get_footer(vhd_context_t *); +int vhd_get_bat(vhd_context_t *); +int vhd_get_batmap(vhd_context_t *); + +void vhd_put_header(vhd_context_t *); +void vhd_put_footer(vhd_context_t *); +void vhd_put_bat(vhd_context_t *); +void vhd_put_batmap(vhd_context_t *); + +int vhd_has_batmap(vhd_context_t *); +int vhd_batmap_test(vhd_context_t *, vhd_batmap_t *, uint32_t); +void vhd_batmap_set(vhd_context_t *, vhd_batmap_t *, uint32_t); +void vhd_batmap_clear(vhd_context_t *, vhd_batmap_t *, uint32_t); + +int vhd_get_phys_size(vhd_context_t *, off64_t *); +int vhd_set_phys_size(vhd_context_t *, off64_t); + +int vhd_bitmap_test(vhd_context_t *, char *, uint32_t); +void vhd_bitmap_set(vhd_context_t *, char *, uint32_t); +void vhd_bitmap_clear(vhd_context_t *, char *, uint32_t); + +int vhd_parent_locator_count(vhd_context_t *); +int vhd_parent_locator_get(vhd_context_t *, char **); +int vhd_parent_locator_read(vhd_context_t *, vhd_parent_locator_t *, char **); +int vhd_find_parent(vhd_context_t *, const char *, char **); +int vhd_parent_locator_write_at(vhd_context_t *, const char *, + off64_t, uint32_t, size_t, + vhd_parent_locator_t *); + +int vhd_header_decode_parent(vhd_context_t *, vhd_header_t *, char **); +int vhd_change_parent(vhd_context_t *, char *parent_path, int raw); + +int vhd_read_footer(vhd_context_t *, vhd_footer_t *); +int vhd_read_footer_at(vhd_context_t *, vhd_footer_t *, off64_t); +int vhd_read_footer_strict(vhd_context_t *, vhd_footer_t *); +int vhd_read_header(vhd_context_t *, vhd_header_t *); +int vhd_read_header_at(vhd_context_t *, vhd_header_t *, off64_t); +int vhd_read_bat(vhd_context_t *, vhd_bat_t *); +int vhd_read_batmap(vhd_context_t *, vhd_batmap_t *); +int vhd_read_bitmap(vhd_context_t *, uint32_t block, char **bufp); +int vhd_read_block(vhd_context_t *, uint32_t block, char **bufp); + +int vhd_write_footer(vhd_context_t *, vhd_footer_t *); +int vhd_write_footer_at(vhd_context_t *, vhd_footer_t *, off64_t); +int vhd_write_header(vhd_context_t *, 
vhd_header_t *); +int vhd_write_header_at(vhd_context_t *, vhd_header_t *, off64_t); +int vhd_write_bat(vhd_context_t *, vhd_bat_t *); +int vhd_write_batmap(vhd_context_t *, vhd_batmap_t *); +int vhd_write_bitmap(vhd_context_t *, uint32_t block, char *bitmap); +int vhd_write_block(vhd_context_t *, uint32_t block, char *data); + +int vhd_io_read(vhd_context_t *, char *, uint64_t, uint32_t); +int vhd_io_write(vhd_context_t *, char *, uint64_t, uint32_t); + +#endif diff --git a/tools/blktap2/include/list.h b/tools/blktap2/include/list.h new file mode 100644 index 0000000000..03a524be01 --- /dev/null +++ b/tools/blktap2/include/list.h @@ -0,0 +1,93 @@ +/* + * list.h + * + * This is a subset of linux's list.h intended to be used in user-space. + * + */ + +#ifndef __LIST_H__ +#define __LIST_H__ + +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +static inline void list_del_init(struct list_head *entry) +{ + 
__list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +static inline int list_is_last(const struct list_head *list, + const struct list_head *head) +{ + return list->next == head; +} + +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#endif /* __LIST_H__ */ diff --git a/tools/blktap2/include/lvm-util.h b/tools/blktap2/include/lvm-util.h new file mode 100644 index 0000000000..95f3320334 --- /dev/null +++ b/tools/blktap2/include/lvm-util.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LVM_UTIL_H_ +#define _LVM_UTIL_H_ + +#include <inttypes.h> + +#define MAX_NAME_SIZE 256 + +#define LVM_SEG_TYPE_LINEAR 1 +#define LVM_SEG_TYPE_UNKNOWN 2 + +struct lv_segment { + uint8_t type; + char device[MAX_NAME_SIZE]; + uint64_t pe_start; + uint64_t pe_size; +}; + +struct lv { + char name[MAX_NAME_SIZE]; + uint64_t size; + uint32_t segments; + struct lv_segment first_segment; +}; + +struct pv { + char name[MAX_NAME_SIZE]; + uint64_t start; +}; + +struct vg { + char name[MAX_NAME_SIZE]; + uint64_t extent_size; + + int pv_cnt; + struct pv *pvs; + + int lv_cnt; + struct lv *lvs; +}; + +int lvm_scan_vg(const char *vg_name, struct vg *vg); +void lvm_free_vg(struct vg *vg); + +#endif diff --git a/tools/blktap2/include/relative-path.h b/tools/blktap2/include/relative-path.h new file mode 100644 index 0000000000..d78f94d023 --- /dev/null +++ b/tools/blktap2/include/relative-path.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _RELATIVE_PATH_H_ +#define _RELATIVE_PATH_H_ + +#include <syslog.h> + +#define DELIMITER '/' +#define MAX_NAME_LEN 1000 + +#define EPRINTF(_f, _a...) 
syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a) + +/* + * returns a relative path from @src to @dest + * result should be freed + */ +char *relative_path_to(char *src, char *dest, int *err); + +#endif diff --git a/tools/blktap2/include/tapdisk-message.h b/tools/blktap2/include/tapdisk-message.h new file mode 100644 index 0000000000..1a86dcb6a3 --- /dev/null +++ b/tools/blktap2/include/tapdisk-message.h @@ -0,0 +1,141 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef _TAPDISK_MESSAGE_H_ +#define _TAPDISK_MESSAGE_H_ + +#include <inttypes.h> + +#define TAPDISK_MESSAGE_MAX_PATH_LENGTH 256 +#define TAPDISK_MESSAGE_STRING_LENGTH 256 + +#define TAPDISK_MESSAGE_FLAG_SHARED 0x01 +#define TAPDISK_MESSAGE_FLAG_RDONLY 0x02 +#define TAPDISK_MESSAGE_FLAG_ADD_CACHE 0x04 +#define TAPDISK_MESSAGE_FLAG_VHD_INDEX 0x08 +#define TAPDISK_MESSAGE_FLAG_LOG_DIRTY 0x10 + +typedef struct tapdisk_message tapdisk_message_t; +typedef uint8_t tapdisk_message_flag_t; +typedef struct tapdisk_message_image tapdisk_message_image_t; +typedef struct tapdisk_message_params tapdisk_message_params_t; +typedef struct tapdisk_message_string tapdisk_message_string_t; + +struct tapdisk_message_params { + tapdisk_message_flag_t flags; + + uint8_t storage; + uint32_t devnum; + uint32_t domid; + uint16_t path_len; + char path[TAPDISK_MESSAGE_MAX_PATH_LENGTH]; +}; + +struct tapdisk_message_image { + uint64_t sectors; + uint32_t sector_size; + uint32_t info; +}; + +struct tapdisk_message_string { + char text[TAPDISK_MESSAGE_STRING_LENGTH]; +}; + +struct tapdisk_message { + uint16_t type; + uint16_t cookie; + uint16_t drivertype; + + union { + pid_t tapdisk_pid; + tapdisk_message_image_t image; + tapdisk_message_params_t params; + tapdisk_message_string_t string; + } u; +}; + +enum tapdisk_message_id { + TAPDISK_MESSAGE_ERROR = 1, + TAPDISK_MESSAGE_RUNTIME_ERROR, + TAPDISK_MESSAGE_PID, + TAPDISK_MESSAGE_PID_RSP, + TAPDISK_MESSAGE_OPEN, + TAPDISK_MESSAGE_OPEN_RSP, + TAPDISK_MESSAGE_PAUSE, + TAPDISK_MESSAGE_PAUSE_RSP, + TAPDISK_MESSAGE_RESUME, + TAPDISK_MESSAGE_RESUME_RSP, + TAPDISK_MESSAGE_CLOSE, + TAPDISK_MESSAGE_CLOSE_RSP, + TAPDISK_MESSAGE_EXIT, +}; + +static inline char * +tapdisk_message_name(enum tapdisk_message_id id) +{ + switch (id) { + case TAPDISK_MESSAGE_ERROR: + return "error"; + + case TAPDISK_MESSAGE_PID: + return "pid"; + + case TAPDISK_MESSAGE_PID_RSP: + return "pid response"; + + case TAPDISK_MESSAGE_OPEN: + return "open"; + + case 
TAPDISK_MESSAGE_OPEN_RSP: + return "open response"; + + case TAPDISK_MESSAGE_PAUSE: + return "pause"; + + case TAPDISK_MESSAGE_PAUSE_RSP: + return "pause response"; + + case TAPDISK_MESSAGE_RESUME: + return "resume"; + + case TAPDISK_MESSAGE_RESUME_RSP: + return "resume response"; + + case TAPDISK_MESSAGE_CLOSE: + return "close"; + + case TAPDISK_MESSAGE_CLOSE_RSP: + return "close response"; + + case TAPDISK_MESSAGE_EXIT: + return "exit"; + + default: + return "unknown"; + } +} + +#endif diff --git a/tools/blktap2/include/vhd-util.h b/tools/blktap2/include/vhd-util.h new file mode 100644 index 0000000000..11f077e2bf --- /dev/null +++ b/tools/blktap2/include/vhd-util.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _VHD_UTIL_H_ +#define _VHD_UTIL_H_ + +int vhd_util_create(int argc, char **argv); +int vhd_util_snapshot(int argc, char **argv); +int vhd_util_query(int argc, char **argv); +int vhd_util_read(int argc, char **argv); +int vhd_util_set_field(int argc, char **argv); +int vhd_util_repair(int argc, char **argv); +int vhd_util_fill(int argc, char **argv); +int vhd_util_resize(int argc, char **argv); +int vhd_util_coalesce(int argc, char **argv); +int vhd_util_modify(int argc, char **argv); +int vhd_util_scan(int argc, char **argv); +int vhd_util_check(int argc, char **argv); +int vhd_util_revert(int argc, char **argv); + +#endif diff --git a/tools/blktap2/include/vhd.h b/tools/blktap2/include/vhd.h new file mode 100644 index 0000000000..4da5f86668 --- /dev/null +++ b/tools/blktap2/include/vhd.h @@ -0,0 +1,221 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __VHD_H__ +#define __VHD_H__ + +#include <asm/types.h> +#include <uuid/uuid.h> +#include <inttypes.h> + +typedef uint32_t u32; +typedef uint64_t u64; + +#define DEBUG 1 + +/* ---------------------------------------------------------------------- */ +/* General definitions. */ +/* ---------------------------------------------------------------------- */ + +#define VHD_SECTOR_SIZE 512 +#define VHD_SECTOR_SHIFT 9 + +/* ---------------------------------------------------------------------- */ +/* This is the generic disk footer, used by all disks. */ +/* ---------------------------------------------------------------------- */ + +struct hd_ftr { + char cookie[8]; /* Identifies original creator of the disk */ + u32 features; /* Feature Support -- see below */ + u32 ff_version; /* (major,minor) version of disk file */ + u64 data_offset; /* Abs. offset from SOF to next structure */ + u32 timestamp; /* Creation time. 
secs since 1/1/2000GMT */ + char crtr_app[4]; /* Creator application */ + u32 crtr_ver; /* Creator version (major,minor) */ + u32 crtr_os; /* Creator host OS */ + u64 orig_size; /* Size at creation (bytes) */ + u64 curr_size; /* Current size of disk (bytes) */ + u32 geometry; /* Disk geometry */ + u32 type; /* Disk type */ + u32 checksum; /* 1's comp sum of this struct. */ + uuid_t uuid; /* Unique disk ID, used for naming parents */ + char saved; /* one-bit -- is this disk/VM in a saved state? */ + char hidden; /* tapdisk-specific field: is this vdi hidden? */ + char reserved[426]; /* padding */ +}; + +/* VHD cookie string. */ +static const char HD_COOKIE[9] = "conectix"; + +/* Feature fields in hd_ftr */ +#define HD_NO_FEATURES 0x00000000 +#define HD_TEMPORARY 0x00000001 /* disk can be deleted on shutdown */ +#define HD_RESERVED 0x00000002 /* NOTE: must always be set */ + +/* Version field in hd_ftr */ +#define HD_FF_VERSION 0x00010000 + +/* Known creator OS type fields in hd_ftr.crtr_os */ +#define HD_CR_OS_WINDOWS 0x5769326B /* (Wi2k) */ +#define HD_CR_OS_MACINTOSH 0x4D616320 /* (Mac ) */ + +/* + * version 0.1: little endian bitmaps + * version 1.1: big endian bitmaps; batmap + * version 1.2: libvhd + * version 1.3: batmap version bump to 1.2 + */ +#define VHD_VERSION(major, minor) (((major) << 16) | ((minor) & 0x0000FFFF)) +#define VHD_CURRENT_VERSION VHD_VERSION(1, 3) + +/* Disk geometry accessor macros. 
*/ +/* Geometry is a triple of (cylinders (2 bytes), tracks (1 byte), and + * sectors-per-track (1 byte)) + */ +#define GEOM_GET_CYLS(_g) (((_g) >> 16) & 0xffff) +#define GEOM_GET_HEADS(_g) (((_g) >> 8) & 0xff) +#define GEOM_GET_SPT(_g) ((_g) & 0xff) + +#define GEOM_ENCODE(_c, _h, _s) (((_c) << 16) | ((_h) << 8) | (_s)) + +/* type field in hd_ftr */ +#define HD_TYPE_NONE 0 +#define HD_TYPE_FIXED 2 /* fixed-allocation disk */ +#define HD_TYPE_DYNAMIC 3 /* dynamic disk */ +#define HD_TYPE_DIFF 4 /* differencing disk */ + +/* String table for hd.type */ +static const char *HD_TYPE_STR[7] = { + "None", /* 0 */ + "Reserved (deprecated)", /* 1 */ + "Fixed hard disk", /* 2 */ + "Dynamic hard disk", /* 3 */ + "Differencing hard disk", /* 4 */ + "Reserved (deprecated)", /* 5 */ + "Reserved (deprecated)" /* 6 */ +}; + +#define HD_TYPE_MAX 6 + +struct prt_loc { + u32 code; /* Platform code -- see defines below. */ + u32 data_space; /* Number of 512-byte sectors to store locator */ + u32 data_len; /* Actual length of parent locator in bytes */ + u32 res; /* Must be zero */ + u64 data_offset; /* Absolute offset of locator data (bytes) */ +}; + +/* Platform Codes */ +#define PLAT_CODE_NONE 0x0 +#define PLAT_CODE_WI2R 0x57693272 /* deprecated */ +#define PLAT_CODE_WI2K 0x5769326B /* deprecated */ +#define PLAT_CODE_W2RU 0x57327275 /* Windows relative path (UTF-16) */ +#define PLAT_CODE_W2KU 0x57326B75 /* Windows absolute path (UTF-16) */ +#define PLAT_CODE_MAC 0x4D616320 /* MacOS alias stored as a blob. */ +#define PLAT_CODE_MACX 0x4D616358 /* File URL (UTF-8), see RFC 2396. */ + +/* ---------------------------------------------------------------------- */ +/* This is the dynamic disk header. */ +/* ---------------------------------------------------------------------- */ + +struct dd_hdr { + char cookie[8]; /* Should contain "cxsparse" */ + u64 data_offset; /* Byte offset of next record. (Unused) 0xffs */ + u64 table_offset; /* Absolute offset to the BAT. 
*/ + u32 hdr_ver; /* Version of the dd_hdr (major,minor) */ + u32 max_bat_size; /* Maximum number of entries in the BAT */ + u32 block_size; /* Block size in bytes. Must be power of 2. */ + u32 checksum; /* Header checksum. 1's comp of all fields. */ + uuid_t prt_uuid; /* ID of the parent disk. */ + u32 prt_ts; /* Modification time of the parent disk */ + u32 res1; /* Reserved. */ + char prt_name[512]; /* Parent unicode name. */ + struct prt_loc loc[8]; /* Parent locator entries. */ + char res2[256]; /* Reserved. */ +}; + +/* VHD cookie string. */ +static const char DD_COOKIE[9] = "cxsparse"; + +/* Version field in hd_ftr */ +#define DD_VERSION 0x00010000 + +/* Default blocksize is 2 meg. */ +#define DD_BLOCKSIZE_DEFAULT 0x00200000 + +#define DD_BLK_UNUSED 0xFFFFFFFF + +struct dd_batmap_hdr { + char cookie[8]; /* should contain "tdbatmap" */ + u64 batmap_offset; /* byte offset to batmap */ + u32 batmap_size; /* batmap size in sectors */ + u32 batmap_version; /* version of batmap */ + u32 checksum; /* batmap checksum -- 1's complement of batmap */ +}; + +static const char VHD_BATMAP_COOKIE[9] = "tdbatmap"; + +/* + * version 1.1: signed char checksum + */ +#define VHD_BATMAP_VERSION(major, minor) (((major) << 16) | ((minor) & 0x0000FFFF)) +#define VHD_BATMAP_CURRENT_VERSION VHD_BATMAP_VERSION(1, 2) + +/* Layout of a dynamic disk: + * + * +-------------------------------------------------+ + * | Mirror image of HD footer (hd_ftr) (512 bytes) | + * +-------------------------------------------------+ + * | Sparse drive header (dd_hdr) (1024 bytes) | + * +-------------------------------------------------+ + * | BAT (Block allocation table) | + * | - Array of absolute sector offsets into the | + * | file (u32). | + * | - Rounded up to a sector boundary. 
| + * | - Unused entries are marked as 0xFFFFFFFF | + * | - max entries in dd_hdr->max_bat_size | + * +-------------------------------------------------+ + * | Data Block 0 | + * | Bitmap (padded to 512 byte sector boundary) | + * | - each bit indicates whether the associated | + * | sector within this block is used. | + * | Data | + * | - power-of-two multiple of sectors. | + * | - default 2MB (4096 * 512) | + * | - Any entries with zero in bitmap should be | + * | zero on disk | + * +-------------------------------------------------+ + * | Data Block 1 | + * +-------------------------------------------------+ + * | ... | + * +-------------------------------------------------+ + * | Data Block n | + * +-------------------------------------------------+ + * | HD Footer (511 bytes) | + * +-------------------------------------------------+ + */ + +#endif diff --git a/tools/blktap2/lvm/Makefile b/tools/blktap2/lvm/Makefile new file mode 100644 index 0000000000..3a726d7c8b --- /dev/null +++ b/tools/blktap2/lvm/Makefile @@ -0,0 +1,38 @@ +XEN_ROOT = ../../../ +BLKTAP_ROOT := ../ +include $(XEN_ROOT)/tools/Rules.mk + +ifeq ($(LVM_UTIL_TEST),y) +TEST := lvm-util +endif + +CFLAGS += -Werror +CFLAGS += -Wno-unused +CFLAGS += -I../include +CFLAGS += -D_GNU_SOURCE + +ifeq ($(CONFIG_X86_64),y) +CFLAGS += -fPIC +endif + +# Get gcc to generate the dependencies for us. +CFLAGS += -Wp,-MD,.$(@F).d +DEPS = .*.d + +LVM-OBJS := lvm-util.o + +all: build + +build: $(TEST) $(LVM-OBJS) + +install: all + +lvm-util: lvm-util.o + $(CC) $(CFLAGS) -DLVM_UTIL -o lvm-util lvm-util.c + +clean: + rm -rf *.o *~ $(DEPS) $(IBIN) + +.PHONY: all build clean install lvm-util + +-include $(DEPS) diff --git a/tools/blktap2/lvm/lvm-util.c b/tools/blktap2/lvm/lvm-util.c new file mode 100644 index 0000000000..b456e0438b --- /dev/null +++ b/tools/blktap2/lvm/lvm-util.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include "lvm-util.h" + +#define _NAME "%255s" +static char line[1024]; + +static inline int +lvm_read_line(FILE *scan) +{ + memset(line, 0, sizeof(line)); + return (fscanf(scan, "%1023[^\n]", line) != 1); +} + +static inline int +lvm_next_line(FILE *scan) +{ + return (fscanf(scan, "%1023[\n]", line) != 1); +} + +static int +lvm_copy_name(char *dst, const char *src, size_t size) +{ + if (strnlen(src, size) == size) + return -ENAMETOOLONG; + + strcpy(dst, src); + return 0; +} + +static int +lvm_parse_pv(struct vg *vg, const char *name, int pvs, uint64_t start) +{ + int i, err; + struct pv *pv; + + pv = NULL; + + if (!vg->pvs) { + vg->pvs = calloc(pvs, sizeof(struct pv)); + if (!vg->pvs) + return -ENOMEM; + } + + for (i = 0; i < pvs; i++) { + pv = vg->pvs + i; + + if (!pv->name[0]) + break; + + if (!strcmp(pv->name, name)) + return -EEXIST; + } + + if (!pv) + return -ENOENT; + + if (i == pvs) + return -ENOMEM; + + err = lvm_copy_name(pv->name, name, sizeof(pv->name) - 1); + if (err) + return err; + + pv->start = start; + return 0; +} + +static int +lvm_open_vg(const char *vgname, struct vg *vg) +{ + FILE *scan; + int i, err, pvs, lvs; + char *cmd, pvname[256]; + uint64_t size, pv_start; + + memset(vg, 0, sizeof(*vg)); + + err = asprintf(&cmd, "/usr/sbin/vgs %s --noheadings --nosuffix --units=b " + "--options=vg_name,vg_extent_size,lv_count,pv_count," + "pv_name,pe_start --unbuffered 2> /dev/null", vgname); + if (err == -1) + return -ENOMEM; + + errno = 0; + scan = popen(cmd, "r"); + if (!scan) { + err = (errno ? 
-errno : ENOMEM); + goto out; + } + + for (;;) { + if (lvm_read_line(scan)) + break; + + err = -EINVAL; + if (sscanf(line, _NAME" %"SCNu64" %d %d "_NAME" %"SCNu64, + vg->name, &size, &lvs, &pvs, pvname, &pv_start) != 6) + goto out; + + if (strcmp(vg->name, vgname)) + goto out; + + err = lvm_parse_pv(vg, pvname, pvs, pv_start); + if (err) + goto out; + + if (lvm_next_line(scan)) + break; + } + + err = -EINVAL; + if (strcmp(vg->name, vgname)) + goto out; + + for (i = 0; i < pvs; i++) + if (!vg->pvs[i].name[0]) + goto out; + + err = -ENOMEM; + vg->lvs = calloc(lvs, sizeof(struct lv)); + if (!vg->lvs) + goto out; + + err = 0; + vg->lv_cnt = lvs; + vg->pv_cnt = pvs; + vg->extent_size = size; + +out: + if (scan) + pclose(scan); + if (err) + lvm_free_vg(vg); + free(cmd); + return err; +} + +static int +lvm_parse_lv_devices(struct vg *vg, struct lv_segment *seg, char *devices) +{ + int i; + uint64_t start, pe_start; + + for (i = 0; i < strlen(devices); i++) + if (strchr(",()", devices[i])) + devices[i] = ' '; + + if (sscanf(devices, _NAME" %"SCNu64, seg->device, &start) != 2) + return -EINVAL; + + pe_start = -1; + for (i = 0; i < vg->pv_cnt; i++) + if (!strcmp(vg->pvs[i].name, seg->device)) { + pe_start = vg->pvs[i].start; + break; + } + + if (pe_start == -1) + return -EINVAL; + + seg->pe_start = (start * vg->extent_size) + pe_start; + return 0; +} + +static int +lvm_scan_lvs(struct vg *vg) +{ + char *cmd; + FILE *scan; + int i, err; + + err = asprintf(&cmd, "/usr/sbin/lvs %s --noheadings --nosuffix --units=b " + "--options=lv_name,lv_size,segtype,seg_count,seg_start," + "seg_size,devices --unbuffered 2> /dev/null", vg->name); + if (err == -1) + return -ENOMEM; + + errno = 0; + scan = popen(cmd, "r"); + if (!scan) { + err = (errno ? 
-errno : -ENOMEM); + goto out; + } + + for (i = 0;;) { + int segs; + struct lv *lv; + struct lv_segment seg; + uint64_t size, seg_start; + char type[32], name[256], dev[256], devices[1024]; + + if (i >= vg->lv_cnt) + break; + + if (lvm_read_line(scan)) { + vg->lv_cnt = i; + break; + } + + err = -EINVAL; + lv = vg->lvs + i; + + if (sscanf(line, _NAME" %"SCNu64" %31s %u %"SCNu64" %"SCNu64" %1023s", + name, &size, type, &segs, &seg_start, + &seg.pe_size, devices) != 7) + goto out; + + if (seg_start) + goto next; + + if (!strcmp(type, "linear")) + seg.type = LVM_SEG_TYPE_LINEAR; + else + seg.type = LVM_SEG_TYPE_UNKNOWN; + + if (lvm_parse_lv_devices(vg, &seg, devices)) + goto out; + + i++; + lv->size = size; + lv->segments = segs; + lv->first_segment = seg; + + err = lvm_copy_name(lv->name, name, sizeof(lv->name) - 1); + if (err) + goto out; + err = -EINVAL; + + next: + if (lvm_next_line(scan)) + goto out; + } + + err = 0; + +out: + if (scan) + pclose(scan); + free(cmd); + return err; +} + +void +lvm_free_vg(struct vg *vg) +{ + free(vg->lvs); + free(vg->pvs); + memset(vg, 0, sizeof(*vg)); +} + +int +lvm_scan_vg(const char *vg_name, struct vg *vg) +{ + int err; + + memset(vg, 0, sizeof(*vg)); + + err = lvm_open_vg(vg_name, vg); + if (err) + return err; + + err = lvm_scan_lvs(vg); + if (err) { + lvm_free_vg(vg); + return err; + } + + return 0; +} + +#ifdef LVM_UTIL +static int +usage(void) +{ + printf("usage: lvm-util <vgname>\n"); + exit(EINVAL); +} + +int +main(int argc, char **argv) +{ + int i, err; + struct vg vg; + struct pv *pv; + struct lv *lv; + struct lv_segment *seg; + + if (argc != 2) + usage(); + + err = lvm_scan_vg(argv[1], &vg); + if (err) { + printf("scan failed: %d\n", err); + return (err >= 0 ? 
err : -err); + } + + + printf("vg %s: extent_size: %"PRIu64", pvs: %d, lvs: %d\n", + vg.name, vg.extent_size, vg.pv_cnt, vg.lv_cnt); + + for (i = 0; i < vg.pv_cnt; i++) { + pv = vg.pvs + i; + printf("pv %s: start %"PRIu64"\n", pv->name, pv->start); + } + + for (i = 0; i < vg.lv_cnt; i++) { + lv = vg.lvs + i; + seg = &lv->first_segment; + printf("lv %s: size: %"PRIu64", segments: %u, type: %u, " + "dev: %s, pe_start: %"PRIu64", pe_size: %"PRIu64"\n", + lv->name, lv->size, lv->segments, seg->type, + seg->device, seg->pe_start, seg->pe_size); + } + + lvm_free_vg(&vg); + return 0; +} +#endif diff --git a/tools/blktap2/vhd/Makefile b/tools/blktap2/vhd/Makefile new file mode 100644 index 0000000000..099a0baca7 --- /dev/null +++ b/tools/blktap2/vhd/Makefile @@ -0,0 +1,55 @@ +XEN_ROOT=../../../ +BLKTAP_ROOT := ../ +include $(XEN_ROOT)/tools/Rules.mk + +IBIN = vhd-util vhd-update +INST_DIR = $(SBINDIR) + +LIBDIR = lib + +CFLAGS += -Werror +CFLAGS += -Wno-unused +CFLAGS += -I../include +CFLAGS += -D_GNU_SOURCE + +ifeq ($(CONFIG_X86_64),y) +CFLAGS += -fPIC +endif + +ifeq ($(VHD_STATIC),y) +CFLAGS += -static +endif + +LIBS := -L$(LIBDIR) -lvhd +LIBS += -luuid + +# Get gcc to generate the dependencies for us. 
+CFLAGS += -Wp,-MD,.$(@F).d +DEPS = .*.d + +all: build + +build: libvhd $(IBIN) + +libvhd: + @set -e + $(MAKE) -C $(LIBDIR) all + +vhd-util: vhd-util.o + $(CC) $(CFLAGS) -o vhd-util vhd-util.o $(LIBS) + +vhd-update: vhd-update.o + $(CC) $(CFLAGS) -o vhd-update vhd-update.o $(LIBS) + +install: all + $(MAKE) -C $(LIBDIR) install + $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR) + $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR) + +clean: + $(MAKE) -C $(LIBDIR) clean + rm -rf *.o *~ $(DEPS) $(IBIN) + +.PHONY: all build clean install libvhd vhd-util vhd-update + +-include $(DEPS) diff --git a/tools/blktap2/vhd/lib/Makefile b/tools/blktap2/vhd/lib/Makefile new file mode 100644 index 0000000000..e26ef86403 --- /dev/null +++ b/tools/blktap2/vhd/lib/Makefile @@ -0,0 +1,73 @@ +XEN_ROOT=../../../../ +BLKTAP_ROOT := ../../ +include $(XEN_ROOT)/tools/Rules.mk + +LIBVHD-MAJOR = 1.0 +LIBVHD-MINOR = 0 +LIBVHD-SONAME = libvhd.so.$(LIBVHD-MAJOR) + +LVM-UTIL-OBJ := $(BLKTAP_ROOT)lvm/lvm-util.o + +LIBVHD-BUILD := libvhd.a + +INST-DIR = $(LIBDIR) + +CFLAGS += -Werror +CFLAGS += -Wno-unused +CFLAGS += -I../../include +CFLAGS += -D_GNU_SOURCE +CFLAGS += -fPIC +CFLAGS += -g + +LIBS := -luuid + +# Get gcc to generate the dependencies for us. 
+CFLAGS += -Wp,-MD,.$(@F).d +DEPS = .*.d + +LIB-SRCS := libvhd.c +LIB-SRCS += libvhd-journal.c +LIB-SRCS += vhd-util-coalesce.c +LIB-SRCS += vhd-util-create.c +LIB-SRCS += vhd-util-fill.c +LIB-SRCS += vhd-util-modify.c +LIB-SRCS += vhd-util-query.c +LIB-SRCS += vhd-util-read.c +LIB-SRCS += vhd-util-repair.c +LIB-SRCS += vhd-util-resize.c +LIB-SRCS += vhd-util-revert.c +LIB-SRCS += vhd-util-set-field.c +LIB-SRCS += vhd-util-snapshot.c +LIB-SRCS += vhd-util-scan.c +LIB-SRCS += vhd-util-check.c +LIB-SRCS += relative-path.c +LIB-SRCS += atomicio.c + +LIB-OBJS = $(patsubst %.c,%.o,$(LIB-SRCS)) +LIB-OBJS += $(LVM-UTIL-OBJ) + +LIBVHD = libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) + +all: build + +build: $(LIBVHD-BUILD) + +libvhd.a: $(LIB-OBJS) + $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG),$(LIBVHD-SONAME) $(SHLIB_CFLAGS) \ + -o libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(LIBS) $^ + ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) libvhd.so.$(LIBVHD-MAJOR) + ln -sf libvhd.so.$(LIBVHD-MAJOR) libvhd.so + $(AR) rc $@ $^ + +install: all + $(INSTALL_DIR) -p $(DESTDIR)$(INST-DIR) + $(INSTALL_DATA) $(LIBVHD) $(DESTDIR)$(INST-DIR) + ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)/libvhd.so.$(LIBVHD-MAJOR) + ln -sf libvhd.so.$(LIBVHD-MAJOR) $(DESTDIR)$(INST-DIR)/libvhd.so + +clean: + rm -rf *.a *.so* *.o *~ $(DEPS) $(LIBVHD) + +.PHONY: all build clean install libvhd + +-include $(DEPS) diff --git a/tools/blktap2/vhd/lib/atomicio.c b/tools/blktap2/vhd/lib/atomicio.c new file mode 100644 index 0000000000..ae0e24b00a --- /dev/null +++ b/tools/blktap2/vhd/lib/atomicio.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved. + * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <errno.h> +#include "atomicio.h" + +/* + * ensure all of data on socket comes through. f==read || f==vwrite + */ +size_t +atomicio(f, fd, _s, n) + ssize_t (*f) (int, void *, size_t); + int fd; + void *_s; + size_t n; +{ + char *s = _s; + size_t pos = 0; + ssize_t res; + + while (n > pos) { + res = (f) (fd, s + pos, n - pos); + switch (res) { + case -1: + if (errno == EINTR || errno == EAGAIN) + continue; + return 0; + case 0: + errno = EPIPE; + return pos; + default: + pos += (size_t)res; + } + } + return (pos); +} + diff --git a/tools/blktap2/vhd/lib/libvhd-journal.c b/tools/blktap2/vhd/lib/libvhd-journal.c new file mode 100644 index 0000000000..c52affea1a --- /dev/null +++ b/tools/blktap2/vhd/lib/libvhd-journal.c @@ -0,0 +1,1534 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + +#include "atomicio.h" +#include "libvhd-journal.h" + +#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_P 1 +#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_C 2 +#define VHD_JOURNAL_ENTRY_TYPE_HEADER 3 +#define VHD_JOURNAL_ENTRY_TYPE_LOCATOR 4 +#define VHD_JOURNAL_ENTRY_TYPE_BAT 5 +#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_H 6 +#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_M 7 +#define VHD_JOURNAL_ENTRY_TYPE_DATA 8 + +typedef struct vhd_journal_entry { + uint64_t cookie; + uint32_t type; + uint32_t size; + uint64_t offset; + uint32_t checksum; +} vhd_journal_entry_t; + +static inline int +vhd_journal_seek(vhd_journal_t *j, off64_t offset, int whence) +{ + off64_t off; + + off = lseek64(j->jfd, offset, whence); + if (off == (off64_t)-1) + return -errno; + + return 0; +} + +static inline off64_t +vhd_journal_position(vhd_journal_t *j) +{ + return lseek64(j->jfd, 0, SEEK_CUR); +} + +static inline int +vhd_journal_read(vhd_journal_t *j, void *buf, size_t size) +{ + ssize_t ret; + + errno = 0; + + ret = atomicio(read, j->jfd, buf, size); + if (ret != size) + return (errno ? -errno : -EIO); + + return 0; +} + +static inline int +vhd_journal_write(vhd_journal_t *j, void *buf, size_t size) +{ + ssize_t ret; + + errno = 0; + + ret = atomicio(vwrite, j->jfd, buf, size); + if (ret != size) + return (errno ? 
-errno : -EIO); + + return 0; +} + +static inline int +vhd_journal_truncate(vhd_journal_t *j, off64_t length) +{ + int err; + + err = ftruncate(j->jfd, length); + if (err == -1) + return -errno; + + return 0; +} + +static inline int +vhd_journal_sync(vhd_journal_t *j) +{ + int err; + + err = fdatasync(j->jfd); + if (err) + return -errno; + + return 0; +} + +static inline void +vhd_journal_header_in(vhd_journal_header_t *header) +{ + BE64_IN(&header->vhd_footer_offset); + BE32_IN(&header->journal_data_entries); + BE32_IN(&header->journal_metadata_entries); + BE64_IN(&header->journal_data_offset); + BE64_IN(&header->journal_metadata_offset); +} + +static inline void +vhd_journal_header_out(vhd_journal_header_t *header) +{ + BE64_OUT(&header->vhd_footer_offset); + BE32_OUT(&header->journal_data_entries); + BE32_OUT(&header->journal_metadata_entries); + BE64_OUT(&header->journal_data_offset); + BE64_OUT(&header->journal_metadata_offset); +} + +static int +vhd_journal_validate_header(vhd_journal_t *j, vhd_journal_header_t *header) +{ + int err; + off64_t eof; + + if (memcmp(header->cookie, + VHD_JOURNAL_HEADER_COOKIE, sizeof(header->cookie))) + return -EINVAL; + + err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET); + if (err) + return err; + + eof = vhd_journal_position(j); + if (eof == (off64_t)-1) + return -errno; + + if (j->header.journal_data_offset > j->header.journal_eof) + return -EINVAL; + + if (j->header.journal_metadata_offset > j->header.journal_eof) + return -EINVAL; + + return 0; +} + +static int +vhd_journal_read_journal_header(vhd_journal_t *j, vhd_journal_header_t *header) +{ + int err; + size_t size; + + size = sizeof(vhd_journal_header_t); + err = vhd_journal_seek(j, 0, SEEK_SET); + if (err) + return err; + + err = vhd_journal_read(j, header, size); + if (err) + return err; + + vhd_journal_header_in(header); + + return vhd_journal_validate_header(j, header); +} + +static int +vhd_journal_write_header(vhd_journal_t *j, vhd_journal_header_t 
*header) +{ + int err; + size_t size; + vhd_journal_header_t h; + + memcpy(&h, header, sizeof(vhd_journal_header_t)); + + err = vhd_journal_validate_header(j, &h); + if (err) + return err; + + vhd_journal_header_out(&h); + size = sizeof(vhd_journal_header_t); + + err = vhd_journal_seek(j, 0, SEEK_SET); + if (err) + return err; + + err = vhd_journal_write(j, &h, size); + if (err) + return err; + + return 0; +} + +static int +vhd_journal_add_journal_header(vhd_journal_t *j) +{ + int err; + off64_t off; + vhd_context_t *vhd; + + vhd = &j->vhd; + memset(&j->header, 0, sizeof(vhd_journal_header_t)); + + err = vhd_seek(vhd, 0, SEEK_END); + if (err) + return err; + + off = vhd_position(vhd); + if (off == (off64_t)-1) + return -errno; + + err = vhd_get_footer(vhd); + if (err) + return err; + + uuid_copy(j->header.uuid, vhd->footer.uuid); + memcpy(j->header.cookie, + VHD_JOURNAL_HEADER_COOKIE, sizeof(j->header.cookie)); + j->header.vhd_footer_offset = off - sizeof(vhd_footer_t); + j->header.journal_eof = sizeof(vhd_journal_header_t); + + return vhd_journal_write_header(j, &j->header); +} + +static void +vhd_journal_entry_in(vhd_journal_entry_t *entry) +{ + BE32_IN(&entry->type); + BE32_IN(&entry->size); + BE64_IN(&entry->offset); + BE64_IN(&entry->cookie); + BE32_IN(&entry->checksum); +} + +static void +vhd_journal_entry_out(vhd_journal_entry_t *entry) +{ + BE32_OUT(&entry->type); + BE32_OUT(&entry->size); + BE64_OUT(&entry->offset); + BE64_OUT(&entry->cookie); + BE32_OUT(&entry->checksum); +} + +static uint32_t +vhd_journal_checksum_entry(vhd_journal_entry_t *entry, char *buf, size_t size) +{ + int i; + unsigned char *blob; + uint32_t checksum, tmp; + + checksum = 0; + tmp = entry->checksum; + entry->checksum = 0; + + blob = (unsigned char *)entry; + for (i = 0; i < sizeof(vhd_journal_entry_t); i++) + checksum += blob[i]; + + blob = (unsigned char *)buf; + for (i = 0; i < size; i++) + checksum += blob[i]; + + entry->checksum = tmp; + return ~checksum; +} + +static int 
+vhd_journal_validate_entry(vhd_journal_entry_t *entry) +{ + if (entry->size == 0) + return -EINVAL; + + if (entry->size & (VHD_SECTOR_SIZE - 1)) + return -EINVAL; + + if (entry->cookie != VHD_JOURNAL_ENTRY_COOKIE) + return -EINVAL; + + return 0; +} + +static int +vhd_journal_read_entry(vhd_journal_t *j, vhd_journal_entry_t *entry) +{ + int err; + + err = vhd_journal_read(j, entry, sizeof(vhd_journal_entry_t)); + if (err) + return err; + + vhd_journal_entry_in(entry); + return vhd_journal_validate_entry(entry); +} + +static int +vhd_journal_write_entry(vhd_journal_t *j, vhd_journal_entry_t *entry) +{ + int err; + vhd_journal_entry_t e; + + err = vhd_journal_validate_entry(entry); + if (err) + return err; + + memcpy(&e, entry, sizeof(vhd_journal_entry_t)); + vhd_journal_entry_out(&e); + + err = vhd_journal_write(j, &e, sizeof(vhd_journal_entry_t)); + if (err) + err; + + return 0; +} + +static int +vhd_journal_validate_entry_data(vhd_journal_entry_t *entry, char *buf) +{ + int err; + uint32_t checksum; + + err = 0; + checksum = vhd_journal_checksum_entry(entry, buf, entry->size); + + if (checksum != entry->checksum) + return -EINVAL; + + return err; +} + +static int +vhd_journal_update(vhd_journal_t *j, off64_t offset, + char *buf, size_t size, uint32_t type) +{ + int err; + off64_t eof; + uint64_t *off, off_bak; + uint32_t *entries; + vhd_journal_entry_t entry; + + entry.type = type; + entry.size = size; + entry.offset = offset; + entry.cookie = VHD_JOURNAL_ENTRY_COOKIE; + entry.checksum = vhd_journal_checksum_entry(&entry, buf, size); + + err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET); + if (err) + return err; + + err = vhd_journal_write_entry(j, &entry); + if (err) + goto fail; + + err = vhd_journal_write(j, buf, size); + if (err) + goto fail; + + if (type == VHD_JOURNAL_ENTRY_TYPE_DATA) { + off = &j->header.journal_data_offset; + entries = &j->header.journal_data_entries; + } else { + off = &j->header.journal_metadata_offset; + entries = 
&j->header.journal_metadata_entries; + } + + off_bak = *off; + if (!(*entries)++) + *off = j->header.journal_eof; + j->header.journal_eof += (size + sizeof(vhd_journal_entry_t)); + + err = vhd_journal_write_header(j, &j->header); + if (err) { + if (!--(*entries)) + *off = off_bak; + j->header.journal_eof -= (size + sizeof(vhd_journal_entry_t)); + goto fail; + } + + return 0; + +fail: + if (!j->is_block) + vhd_journal_truncate(j, j->header.journal_eof); + return err; +} + +static int +vhd_journal_add_footer(vhd_journal_t *j) +{ + int err; + off64_t off; + vhd_context_t *vhd; + vhd_footer_t footer; + + vhd = &j->vhd; + + err = vhd_seek(vhd, 0, SEEK_END); + if (err) + return err; + + off = vhd_position(vhd); + if (off == (off64_t)-1) + return -errno; + + err = vhd_read_footer_at(vhd, &footer, off - sizeof(vhd_footer_t)); + if (err) + return err; + + vhd_footer_out(&footer); + err = vhd_journal_update(j, off - sizeof(vhd_footer_t), + (char *)&footer, + sizeof(vhd_footer_t), + VHD_JOURNAL_ENTRY_TYPE_FOOTER_P); + if (err) + return err; + + if (!vhd_type_dynamic(vhd)) + return 0; + + err = vhd_read_footer_at(vhd, &footer, 0); + if (err) + return err; + + vhd_footer_out(&footer); + err = vhd_journal_update(j, 0, + (char *)&footer, + sizeof(vhd_footer_t), + VHD_JOURNAL_ENTRY_TYPE_FOOTER_C); + + return err; +} + +static int +vhd_journal_add_header(vhd_journal_t *j) +{ + int err; + off64_t off; + vhd_context_t *vhd; + vhd_header_t header; + + vhd = &j->vhd; + + err = vhd_read_header(vhd, &header); + if (err) + return err; + + off = vhd->footer.data_offset; + + vhd_header_out(&header); + err = vhd_journal_update(j, off, + (char *)&header, + sizeof(vhd_header_t), + VHD_JOURNAL_ENTRY_TYPE_HEADER); + + return err; +} + +static int +vhd_journal_add_locators(vhd_journal_t *j) +{ + int i, n, err; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_get_header(vhd); + if (err) + return err; + + n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); + for (i = 0; i < n; i++) { 
+ char *buf; + off64_t off; + size_t size; + vhd_parent_locator_t *loc; + + loc = vhd->header.loc + i; + err = vhd_validate_platform_code(loc->code); + if (err) + return err; + + if (loc->code == PLAT_CODE_NONE) + continue; + + off = loc->data_offset; + size = vhd_parent_locator_size(loc); + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) + return -err; + + err = vhd_seek(vhd, off, SEEK_SET); + if (err) + goto end; + + err = vhd_read(vhd, buf, size); + if (err) + goto end; + + err = vhd_journal_update(j, off, buf, size, + VHD_JOURNAL_ENTRY_TYPE_LOCATOR); + if (err) + goto end; + + err = 0; + + end: + free(buf); + if (err) + break; + } + + return err; +} + +static int +vhd_journal_add_bat(vhd_journal_t *j) +{ + int err; + off64_t off; + size_t size; + vhd_bat_t bat; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_get_header(vhd); + if (err) + return err; + + err = vhd_read_bat(vhd, &bat); + if (err) + return err; + + off = vhd->header.table_offset; + size = vhd_bytes_padded(bat.entries * sizeof(uint32_t)); + + vhd_bat_out(&bat); + err = vhd_journal_update(j, off, (char *)bat.bat, size, + VHD_JOURNAL_ENTRY_TYPE_BAT); + + free(bat.bat); + return err; +} + +static int +vhd_journal_add_batmap(vhd_journal_t *j) +{ + int err; + off64_t off; + size_t size; + vhd_context_t *vhd; + vhd_batmap_t batmap; + + vhd = &j->vhd; + + err = vhd_batmap_header_offset(vhd, &off); + if (err) + return err; + + err = vhd_read_batmap(vhd, &batmap); + if (err) + return err; + + size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr)); + + vhd_batmap_header_out(&batmap); + err = vhd_journal_update(j, off, (char *)&batmap.header, size, + VHD_JOURNAL_ENTRY_TYPE_BATMAP_H); + if (err) + goto out; + + vhd_batmap_header_in(&batmap); + off = batmap.header.batmap_offset; + size = vhd_sectors_to_bytes(batmap.header.batmap_size); + + err = vhd_journal_update(j, off, batmap.map, size, + VHD_JOURNAL_ENTRY_TYPE_BATMAP_M); + +out: + free(batmap.map); + return err; +} + +static 
int +vhd_journal_add_metadata(vhd_journal_t *j) +{ + int err; + off64_t eof; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_journal_add_footer(j); + if (err) + return err; + + if (!vhd_type_dynamic(vhd)) + return 0; + + err = vhd_journal_add_header(j); + if (err) + return err; + + err = vhd_journal_add_locators(j); + if (err) + return err; + + err = vhd_journal_add_bat(j); + if (err) + return err; + + if (vhd_has_batmap(vhd)) { + err = vhd_journal_add_batmap(j); + if (err) + return err; + } + + j->header.journal_data_offset = j->header.journal_eof; + return vhd_journal_write_header(j, &j->header); +} + +static int +__vhd_journal_read_footer(vhd_journal_t *j, + vhd_footer_t *footer, uint32_t type) +{ + int err; + vhd_journal_entry_t entry; + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != type) + return -EINVAL; + + if (entry.size != sizeof(vhd_footer_t)) + return -EINVAL; + + err = vhd_journal_read(j, footer, entry.size); + if (err) + return err; + + vhd_footer_in(footer); + return vhd_validate_footer(footer); +} + +static int +vhd_journal_read_footer(vhd_journal_t *j, vhd_footer_t *footer) +{ + return __vhd_journal_read_footer(j, footer, + VHD_JOURNAL_ENTRY_TYPE_FOOTER_P); +} + +static int +vhd_journal_read_footer_copy(vhd_journal_t *j, vhd_footer_t *footer) +{ + return __vhd_journal_read_footer(j, footer, + VHD_JOURNAL_ENTRY_TYPE_FOOTER_C); +} + +static int +vhd_journal_read_header(vhd_journal_t *j, vhd_header_t *header) +{ + int err; + vhd_journal_entry_t entry; + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_HEADER) + return -EINVAL; + + if (entry.size != sizeof(vhd_header_t)) + return -EINVAL; + + err = vhd_journal_read(j, header, entry.size); + if (err) + return err; + + vhd_header_in(header); + return vhd_validate_header(header); +} + +static int +vhd_journal_read_locators(vhd_journal_t *j, char ***locators, int *locs) +{ + int err, n, _locs; 
+ char **_locators, *buf; + off_t pos; + vhd_journal_entry_t entry; + + _locs = 0; + *locs = 0; + *locators = NULL; + + n = sizeof(j->vhd.header.loc) / sizeof(vhd_parent_locator_t); + _locators = calloc(n, sizeof(char *)); + if (!_locators) + return -ENOMEM; + + for (;;) { + buf = NULL; + + pos = vhd_journal_position(j); + err = vhd_journal_read_entry(j, &entry); + if (err) + goto fail; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_LOCATOR) { + err = vhd_journal_seek(j, pos, SEEK_SET); + if (err) + goto fail; + break; + } + + if (_locs >= n) { + err = -EINVAL; + goto fail; + } + + err = posix_memalign((void **)&buf, + VHD_SECTOR_SIZE, entry.size); + if (err) { + err = -err; + buf = NULL; + goto fail; + } + + err = vhd_journal_read(j, buf, entry.size); + if (err) + goto fail; + + _locators[_locs++] = buf; + err = 0; + } + + + *locs = _locs; + *locators = _locators; + + return 0; + +fail: + if (_locators) { + for (n = 0; n < _locs; n++) + free(_locators[n]); + free(_locators); + } + return err; +} + +static int +vhd_journal_read_bat(vhd_journal_t *j, vhd_bat_t *bat) +{ + int err; + size_t size; + vhd_context_t *vhd; + vhd_journal_entry_t entry; + + vhd = &j->vhd; + + size = vhd_bytes_padded(vhd->header.max_bat_size * sizeof(uint32_t)); + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BAT) + return -EINVAL; + + if (entry.size != size) + return -EINVAL; + + if (entry.offset != vhd->header.table_offset) + return -EINVAL; + + err = posix_memalign((void **)&bat->bat, VHD_SECTOR_SIZE, size); + if (err) + return -err; + + err = vhd_journal_read(j, bat->bat, entry.size); + if (err) + goto fail; + + bat->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT; + bat->entries = vhd->header.max_bat_size; + vhd_bat_in(bat); + + return 0; + +fail: + free(bat->bat); + bat->bat = NULL; + return err; +} + +static int +vhd_journal_read_batmap_header(vhd_journal_t *j, vhd_batmap_t *batmap) +{ + int err; + char *buf; + size_t 
size; + vhd_journal_entry_t entry; + + size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr)); + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_H) + return -EINVAL; + + if (entry.size != size) + return -EINVAL; + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) + return err; + + err = vhd_journal_read(j, buf, entry.size); + if (err) { + free(buf); + return err; + } + + memcpy(&batmap->header, buf, sizeof(batmap->header)); + + vhd_batmap_header_in(batmap); + return vhd_validate_batmap_header(batmap); +} + +static int +vhd_journal_read_batmap_map(vhd_journal_t *j, vhd_batmap_t *batmap) +{ + int err; + vhd_journal_entry_t entry; + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_M) + return -EINVAL; + + if (entry.size != vhd_sectors_to_bytes(batmap->header.batmap_size)) + return -EINVAL; + + if (entry.offset != batmap->header.batmap_offset) + return -EINVAL; + + err = posix_memalign((void **)&batmap->map, + VHD_SECTOR_SIZE, entry.size); + if (err) + return -err; + + err = vhd_journal_read(j, batmap->map, entry.size); + if (err) { + free(batmap->map); + batmap->map = NULL; + return err; + } + + return 0; +} + +static int +vhd_journal_read_batmap(vhd_journal_t *j, vhd_batmap_t *batmap) +{ + int err; + + err = vhd_journal_read_batmap_header(j, batmap); + if (err) + return err; + + err = vhd_journal_read_batmap_map(j, batmap); + if (err) + return err; + + err = vhd_validate_batmap(batmap); + if (err) { + free(batmap->map); + batmap->map = NULL; + return err; + } + + return 0; +} + +static int +vhd_journal_restore_footer(vhd_journal_t *j, vhd_footer_t *footer) +{ + return vhd_write_footer_at(&j->vhd, footer, + j->header.vhd_footer_offset); +} + +static int +vhd_journal_restore_footer_copy(vhd_journal_t *j, vhd_footer_t *footer) +{ + return vhd_write_footer_at(&j->vhd, footer, 0); +} + +static int 
+vhd_journal_restore_header(vhd_journal_t *j, vhd_header_t *header) +{ + off64_t off; + vhd_context_t *vhd; + + vhd = &j->vhd; + off = vhd->footer.data_offset; + + return vhd_write_header_at(&j->vhd, header, off); +} + +static int +vhd_journal_restore_locators(vhd_journal_t *j, char **locators, int locs) +{ + size_t size; + vhd_context_t *vhd; + int i, n, lidx, err; + vhd_parent_locator_t *loc; + + lidx = 0; + vhd = &j->vhd; + + n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); + + for (i = 0; i < n && lidx < locs; i++) { + loc = vhd->header.loc + i; + if (loc->code == PLAT_CODE_NONE) + continue; + + err = vhd_seek(vhd, loc->data_offset, SEEK_SET); + if (err) + return err; + + size = vhd_parent_locator_size(loc); + err = vhd_write(vhd, locators[lidx++], size); + if (err) + return err; + } + + return 0; +} + +static int +vhd_journal_restore_bat(vhd_journal_t *j, vhd_bat_t *bat) +{ + return vhd_write_bat(&j->vhd, bat); +} + +static int +vhd_journal_restore_batmap(vhd_journal_t *j, vhd_batmap_t *batmap) +{ + return vhd_write_batmap(&j->vhd, batmap); +} + +static int +vhd_journal_restore_metadata(vhd_journal_t *j) +{ + off64_t off; + char **locators; + vhd_footer_t copy; + vhd_context_t *vhd; + int i, locs, hlocs, err; + + vhd = &j->vhd; + locs = 0; + hlocs = 0; + locators = NULL; + + err = vhd_journal_seek(j, sizeof(vhd_journal_header_t), SEEK_SET); + if (err) + return err; + + err = vhd_journal_read_footer(j, &vhd->footer); + if (err) + return err; + + if (!vhd_type_dynamic(vhd)) + goto restore; + + err = vhd_journal_read_footer_copy(j, &copy); + if (err) + return err; + + err = vhd_journal_read_header(j, &vhd->header); + if (err) + return err; + + for (hlocs = 0, i = 0; i < vhd_parent_locator_count(vhd); i++) { + if (vhd_validate_platform_code(vhd->header.loc[i].code)) + return err; + + if (vhd->header.loc[i].code != PLAT_CODE_NONE) + hlocs++; + } + + if (hlocs) { + err = vhd_journal_read_locators(j, &locators, &locs); + if (err) + return err; + + if (hlocs != 
locs) { + err = -EINVAL; + goto out; + } + } + + err = vhd_journal_read_bat(j, &vhd->bat); + if (err) + goto out; + + if (vhd_has_batmap(vhd)) { + err = vhd_journal_read_batmap(j, &vhd->batmap); + if (err) + goto out; + } + +restore: + off = vhd_journal_position(j); + if (off == (off64_t)-1) + return -errno; + + if (j->header.journal_data_offset != off) + return -EINVAL; + + err = vhd_journal_restore_footer(j, &vhd->footer); + if (err) + goto out; + + if (!vhd_type_dynamic(vhd)) + goto out; + + err = vhd_journal_restore_footer_copy(j, &copy); + if (err) + goto out; + + err = vhd_journal_restore_header(j, &vhd->header); + if (err) + goto out; + + if (locs) { + err = vhd_journal_restore_locators(j, locators, locs); + if (err) + goto out; + } + + err = vhd_journal_restore_bat(j, &vhd->bat); + if (err) + goto out; + + if (vhd_has_batmap(vhd)) { + err = vhd_journal_restore_batmap(j, &vhd->batmap); + if (err) + goto out; + } + + err = 0; + +out: + if (locators) { + for (i = 0; i < locs; i++) + free(locators[i]); + free(locators); + } + + if (!err && !vhd->is_block) + err = ftruncate(vhd->fd, + j->header.vhd_footer_offset + + sizeof(vhd_footer_t)); + + return err; +} + +static int +vhd_journal_disable_vhd(vhd_journal_t *j) +{ + int err; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_get_footer(vhd); + if (err) + return err; + + memcpy(&vhd->footer.cookie, + VHD_POISON_COOKIE, sizeof(vhd->footer.cookie)); + vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); + + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + return err; + + return 0; +} + +static int +vhd_journal_enable_vhd(vhd_journal_t *j) +{ + int err; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_get_footer(vhd); + if (err) + return err; + + if (!vhd_disabled(vhd)) + return 0; + + memcpy(&vhd->footer.cookie, HD_COOKIE, sizeof(vhd->footer.cookie)); + vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); + + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + return err; + + return 0; 
+} + +int +vhd_journal_close(vhd_journal_t *j) +{ + if (j->jfd) + close(j->jfd); + + vhd_close(&j->vhd); + free(j->jname); + + return 0; +} + +int +vhd_journal_remove(vhd_journal_t *j) +{ + int err; + + err = vhd_journal_enable_vhd(j); + if (err) + return err; + + if (j->jfd) { + close(j->jfd); + if (!j->is_block) + unlink(j->jname); + } + + vhd_close(&j->vhd); + free(j->jname); + + return 0; +} + +int +vhd_journal_open(vhd_journal_t *j, const char *file, const char *jfile) +{ + int err; + vhd_context_t *vhd; + + memset(j, 0, sizeof(vhd_journal_t)); + + j->jfd = -1; + vhd = &j->vhd; + + j->jname = strdup(jfile); + if (j->jname == NULL) + return -ENOMEM; + + j->jfd = open(j->jname, O_LARGEFILE | O_RDWR); + if (j->jfd == -1) { + err = -errno; + goto fail; + } + + err = vhd_test_file_fixed(j->jname, &j->is_block); + if (err) + goto fail; + + vhd->fd = open(file, O_LARGEFILE | O_RDWR | O_DIRECT); + if (vhd->fd == -1) { + err = -errno; + goto fail; + } + + err = vhd_test_file_fixed(file, &vhd->is_block); + if (err) + goto fail; + + err = vhd_journal_read_journal_header(j, &j->header); + if (err) + goto fail; + + err = vhd_journal_restore_metadata(j); + if (err) + goto fail; + + close(vhd->fd); + free(vhd->bat.bat); + free(vhd->batmap.map); + + err = vhd_open(vhd, file, VHD_OPEN_RDWR); + if (err) + goto fail; + + err = vhd_get_bat(vhd); + if (err) + goto fail; + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) + goto fail; + } + + err = vhd_journal_disable_vhd(j); + if (err) + goto fail; + + return 0; + +fail: + vhd_journal_close(j); + return err; +} + +int +vhd_journal_create(vhd_journal_t *j, const char *file, const char *jfile) +{ + char *buf; + int i, err; + size_t size; + off64_t off; + struct stat stats; + + memset(j, 0, sizeof(vhd_journal_t)); + j->jfd = -1; + + j->jname = strdup(jfile); + if (j->jname == NULL) { + err = -ENOMEM; + goto fail1; + } + + if (access(j->jname, F_OK) == 0) { + err = vhd_test_file_fixed(j->jname, &j->is_block); + if 
(err) + goto fail1; + + if (!j->is_block) { + err = -EEXIST; + goto fail1; + } + } + + if (j->is_block) + j->jfd = open(j->jname, O_LARGEFILE | O_RDWR, 0644); + else + j->jfd = open(j->jname, + O_CREAT | O_TRUNC | O_LARGEFILE | O_RDWR, 0644); + if (j->jfd == -1) { + err = -errno; + goto fail1; + } + + err = vhd_open(&j->vhd, file, VHD_OPEN_RDWR | VHD_OPEN_STRICT); + if (err) + goto fail1; + + err = vhd_get_bat(&j->vhd); + if (err) + goto fail2; + + if (vhd_has_batmap(&j->vhd)) { + err = vhd_get_batmap(&j->vhd); + if (err) + goto fail2; + } + + err = vhd_journal_add_journal_header(j); + if (err) + goto fail2; + + err = vhd_journal_add_metadata(j); + if (err) + goto fail2; + + err = vhd_journal_disable_vhd(j); + if (err) + goto fail2; + + err = vhd_journal_sync(j); + if (err) + goto fail2; + + return 0; + +fail1: + if (j->jfd != -1) { + close(j->jfd); + if (!j->is_block) + unlink(j->jname); + } + free(j->jname); + memset(j, 0, sizeof(vhd_journal_t)); + + return err; + +fail2: + vhd_journal_remove(j); + return err; +} + +int +vhd_journal_add_block(vhd_journal_t *j, uint32_t block, char mode) +{ + int err; + char *buf; + off64_t off; + size_t size; + uint64_t blk; + vhd_context_t *vhd; + + buf = NULL; + vhd = &j->vhd; + + if (!vhd_type_dynamic(vhd)) + return -EINVAL; + + err = vhd_get_bat(vhd); + if (err) + return err; + + if (block >= vhd->bat.entries) + return -ERANGE; + + blk = vhd->bat.bat[block]; + if (blk == DD_BLK_UNUSED) + return 0; + + off = vhd_sectors_to_bytes(blk); + + if (mode & VHD_JOURNAL_METADATA) { + size = vhd_sectors_to_bytes(vhd->bm_secs); + + err = vhd_read_bitmap(vhd, block, &buf); + if (err) + return err; + + err = vhd_journal_update(j, off, buf, size, + VHD_JOURNAL_ENTRY_TYPE_DATA); + + free(buf); + + if (err) + return err; + } + + if (mode & VHD_JOURNAL_DATA) { + off += vhd_sectors_to_bytes(vhd->bm_secs); + size = vhd_sectors_to_bytes(vhd->spb); + + err = vhd_read_block(vhd, block, &buf); + if (err) + return err; + + err = vhd_journal_update(j, 
off, buf, size, + VHD_JOURNAL_ENTRY_TYPE_DATA); + free(buf); + + if (err) + return err; + } + + return vhd_journal_sync(j); +} + +/* + * commit indicates the transaction completed + * successfully and we can remove the undo log + */ +int +vhd_journal_commit(vhd_journal_t *j) +{ + int err; + + j->header.journal_data_entries = 0; + j->header.journal_metadata_entries = 0; + j->header.journal_data_offset = 0; + j->header.journal_metadata_offset = 0; + + err = vhd_journal_write_header(j, &j->header); + if (err) + return err; + + if (!j->is_block) + err = vhd_journal_truncate(j, sizeof(vhd_journal_header_t)); + if (err) + return -errno; + + return 0; +} + +/* + * revert indicates the transaction failed + * and we should revert any changes via the undo log + */ +int +vhd_journal_revert(vhd_journal_t *j) +{ + int i, err; + char *buf, *file; + vhd_context_t *vhd; + vhd_journal_entry_t entry; + + err = 0; + vhd = &j->vhd; + buf = NULL; + + file = strdup(vhd->file); + if (!file) + return -ENOMEM; + + vhd_close(&j->vhd); + j->vhd.fd = open(file, O_RDWR | O_DIRECT | O_LARGEFILE); + if (j->vhd.fd == -1) { + free(file); + return -errno; + } + + err = vhd_test_file_fixed(file, &vhd->is_block); + if (err) { + free(file); + return err; + } + + err = vhd_journal_restore_metadata(j); + if (err) { + free(file); + return err; + } + + close(vhd->fd); + free(vhd->bat.bat); + free(vhd->batmap.map); + + err = vhd_open(vhd, file, VHD_OPEN_RDWR); + free(file); + if (err) + return err; + + err = vhd_journal_seek(j, j->header.journal_data_offset, SEEK_SET); + if (err) + return err; + + for (i = 0; i < j->header.journal_data_entries; i++) { + err = vhd_journal_read_entry(j, &entry); + if (err) + goto end; + + err = posix_memalign((void **)&buf, + VHD_SECTOR_SIZE, entry.size); + if (err) { + err = -err; + buf = NULL; + goto end; + } + + err = vhd_journal_read(j, buf, entry.size); + if (err) + goto end; + + err = vhd_journal_validate_entry_data(&entry, buf); + if (err) + goto end; + + err = 
vhd_seek(vhd, entry.offset, SEEK_SET); + if (err) + goto end; + + err = vhd_write(vhd, buf, entry.size); + if (err) + goto end; + + err = 0; + + end: + free(buf); + buf = NULL; + if (err) + break; + } + + if (err) + return err; + + if (!vhd->is_block) { + err = ftruncate(vhd->fd, j->header.vhd_footer_offset + + sizeof(vhd_footer_t)); + if (err) + return -errno; + } + + return vhd_journal_sync(j); +} diff --git a/tools/blktap2/vhd/lib/libvhd.c b/tools/blktap2/vhd/lib/libvhd.c new file mode 100644 index 0000000000..1af30ad1f6 --- /dev/null +++ b/tools/blktap2/vhd/lib/libvhd.c @@ -0,0 +1,3328 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <libgen.h> +#include <iconv.h> +#include <sys/mman.h> + +#include "libvhd.h" +#include "relative-path.h" + +static int libvhd_dbg = 0; + +void +libvhd_set_log_level(int level) +{ + if (level) + libvhd_dbg = 1; +} + +#define VHDLOG(_f, _a...) \ + do { \ + if (libvhd_dbg) \ + syslog(LOG_INFO, "libvhd::%s: "_f, \ + __func__, ##_a); \ + } while (0) + +#define BIT_MASK 0x80 + +#ifdef ENABLE_FAILURE_TESTING +const char* ENV_VAR_FAIL[NUM_FAIL_TESTS] = { + "VHD_UTIL_TEST_FAIL_REPARENT_BEGIN", + "VHD_UTIL_TEST_FAIL_REPARENT_LOCATOR", + "VHD_UTIL_TEST_FAIL_REPARENT_END", + "VHD_UTIL_TEST_FAIL_RESIZE_BEGIN", + "VHD_UTIL_TEST_FAIL_RESIZE_DATA_MOVED", + "VHD_UTIL_TEST_FAIL_RESIZE_METADATA_MOVED", + "VHD_UTIL_TEST_FAIL_RESIZE_END" +}; +int TEST_FAIL[NUM_FAIL_TESTS]; +#endif // ENABLE_FAILURE_TESTING + +static inline int +test_bit (volatile char *addr, int nr) +{ + return ((addr[nr >> 3] << (nr & 7)) & BIT_MASK) != 0; +} + +static inline void +set_bit (volatile char *addr, int nr) +{ + addr[nr >> 3] |= (BIT_MASK >> (nr & 7)); +} + +static inline void +clear_bit (volatile char *addr, int nr) +{ + addr[nr >> 3] &= ~(BIT_MASK >> (nr & 7)); +} + +static inline int +old_test_bit(volatile char *addr, int nr) +{ + return (((uint32_t *)addr)[nr >> 5] >> (nr & 31)) & 1; +} 
+ +static inline void +old_set_bit(volatile char *addr, int nr) +{ + ((uint32_t *)addr)[nr >> 5] |= (1 << (nr & 31)); +} + +static inline void +old_clear_bit(volatile char *addr, int nr) +{ + ((uint32_t *)addr)[nr >> 5] &= ~(1 << (nr & 31)); +} + +void +vhd_footer_in(vhd_footer_t *footer) +{ + BE32_IN(&footer->features); + BE32_IN(&footer->ff_version); + BE64_IN(&footer->data_offset); + BE32_IN(&footer->timestamp); + BE32_IN(&footer->crtr_ver); + BE32_IN(&footer->crtr_os); + BE64_IN(&footer->orig_size); + BE64_IN(&footer->curr_size); + BE32_IN(&footer->geometry); + BE32_IN(&footer->type); + BE32_IN(&footer->checksum); +} + +void +vhd_footer_out(vhd_footer_t *footer) +{ + BE32_OUT(&footer->features); + BE32_OUT(&footer->ff_version); + BE64_OUT(&footer->data_offset); + BE32_OUT(&footer->timestamp); + BE32_OUT(&footer->crtr_ver); + BE32_OUT(&footer->crtr_os); + BE64_OUT(&footer->orig_size); + BE64_OUT(&footer->curr_size); + BE32_OUT(&footer->geometry); + BE32_OUT(&footer->type); + BE32_OUT(&footer->checksum); +} + +void +vhd_header_in(vhd_header_t *header) +{ + int i, n; + + BE64_IN(&header->data_offset); + BE64_IN(&header->table_offset); + BE32_IN(&header->hdr_ver); + BE32_IN(&header->max_bat_size); + BE32_IN(&header->block_size); + BE32_IN(&header->checksum); + BE32_IN(&header->prt_ts); + + n = sizeof(header->loc) / sizeof(vhd_parent_locator_t); + + for (i = 0; i < n; i++) { + BE32_IN(&header->loc[i].code); + BE32_IN(&header->loc[i].data_space); + BE32_IN(&header->loc[i].data_len); + BE64_IN(&header->loc[i].data_offset); + } +} + +void +vhd_header_out(vhd_header_t *header) +{ + int i, n; + + BE64_OUT(&header->data_offset); + BE64_OUT(&header->table_offset); + BE32_OUT(&header->hdr_ver); + BE32_OUT(&header->max_bat_size); + BE32_OUT(&header->block_size); + BE32_OUT(&header->checksum); + BE32_OUT(&header->prt_ts); + + n = sizeof(header->loc) / sizeof(vhd_parent_locator_t); + + for (i = 0; i < n; i++) { + BE32_OUT(&header->loc[i].code); + 
BE32_OUT(&header->loc[i].data_space); + BE32_OUT(&header->loc[i].data_len); + BE64_OUT(&header->loc[i].data_offset); + } +} + +void +vhd_batmap_header_in(vhd_batmap_t *batmap) +{ + BE64_IN(&batmap->header.batmap_offset); + BE32_IN(&batmap->header.batmap_size); + BE32_IN(&batmap->header.batmap_version); + BE32_IN(&batmap->header.checksum); +} + +void +vhd_batmap_header_out(vhd_batmap_t *batmap) +{ + BE64_OUT(&batmap->header.batmap_offset); + BE32_OUT(&batmap->header.batmap_size); + BE32_OUT(&batmap->header.batmap_version); + BE32_OUT(&batmap->header.checksum); +} + +void +vhd_bat_in(vhd_bat_t *bat) +{ + int i; + + for (i = 0; i < bat->entries; i++) + BE32_IN(&bat->bat[i]); +} + +void +vhd_bat_out(vhd_bat_t *bat) +{ + int i; + + for (i = 0; i < bat->entries; i++) + BE32_OUT(&bat->bat[i]); +} + +uint32_t +vhd_checksum_footer(vhd_footer_t *footer) +{ + int i; + unsigned char *blob; + uint32_t checksum, tmp; + + checksum = 0; + tmp = footer->checksum; + footer->checksum = 0; + + blob = (unsigned char *)footer; + for (i = 0; i < sizeof(vhd_footer_t); i++) + checksum += (uint32_t)blob[i]; + + footer->checksum = tmp; + return ~checksum; +} + +int +vhd_validate_footer(vhd_footer_t *footer) +{ + int csize; + uint32_t checksum; + + csize = sizeof(footer->cookie); + if (memcmp(footer->cookie, HD_COOKIE, csize) != 0 && + memcmp(footer->cookie, VHD_POISON_COOKIE, csize) != 0) { + char buf[9]; + memcpy(buf, footer->cookie, 8); + buf[8]= '\0'; + VHDLOG("invalid footer cookie: %s\n", buf); + return -EINVAL; + } + + checksum = vhd_checksum_footer(footer); + if (checksum != footer->checksum) { + /* + * early td-util did not re-calculate + * checksum when marking vhds 'hidden' + */ + if (footer->hidden && + !strncmp(footer->crtr_app, "tap", 3) && + (footer->crtr_ver == VHD_VERSION(0, 1) || + footer->crtr_ver == VHD_VERSION(1, 1))) { + char tmp = footer->hidden; + footer->hidden = 0; + checksum = vhd_checksum_footer(footer); + footer->hidden = tmp; + + if (checksum == footer->checksum) 
+ return 0; + } + + VHDLOG("invalid footer checksum: " + "footer = 0x%08x, calculated = 0x%08x\n", + footer->checksum, checksum); + return -EINVAL; + } + + return 0; +} + +uint32_t +vhd_checksum_header(vhd_header_t *header) +{ + int i; + unsigned char *blob; + uint32_t checksum, tmp; + + checksum = 0; + tmp = header->checksum; + header->checksum = 0; + + blob = (unsigned char *)header; + for (i = 0; i < sizeof(vhd_header_t); i++) + checksum += (uint32_t)blob[i]; + + header->checksum = tmp; + return ~checksum; +} + +int +vhd_validate_header(vhd_header_t *header) +{ + int i, n; + uint32_t checksum; + + if (memcmp(header->cookie, DD_COOKIE, 8) != 0) { + char buf[9]; + memcpy(buf, header->cookie, 8); + buf[8] = '\0'; + VHDLOG("invalid header cookie: %s\n", buf); + return -EINVAL; + } + + if (header->hdr_ver != 0x00010000) { + VHDLOG("invalid header version 0x%08x\n", header->hdr_ver); + return -EINVAL; + } + + if (header->data_offset != 0xFFFFFFFFFFFFFFFF) { + VHDLOG("invalid header data_offset 0x%016"PRIx64"\n", + header->data_offset); + return -EINVAL; + } + + n = sizeof(header->loc) / sizeof(vhd_parent_locator_t); + for (i = 0; i < n; i++) + if (vhd_validate_platform_code(header->loc[i].code)) + return -EINVAL; + + checksum = vhd_checksum_header(header); + if (checksum != header->checksum) { + VHDLOG("invalid header checksum: " + "header = 0x%08x, calculated = 0x%08x\n", + header->checksum, checksum); + return -EINVAL; + } + + return 0; +} + +static inline int +vhd_validate_bat(vhd_bat_t *bat) +{ + if (!bat->bat) + return -EINVAL; + + return 0; +} + +uint32_t +vhd_checksum_batmap(vhd_batmap_t *batmap) +{ + int i, n; + char *blob; + uint32_t checksum; + + blob = batmap->map; + checksum = 0; + + n = vhd_sectors_to_bytes(batmap->header.batmap_size); + + for (i = 0; i < n; i++) { + if (batmap->header.batmap_version == VHD_BATMAP_VERSION(1, 1)) + checksum += (uint32_t)blob[i]; + else + checksum += (uint32_t)(unsigned char)blob[i]; + } + + return ~checksum; +} + +int 
+vhd_validate_batmap_header(vhd_batmap_t *batmap) +{ + if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, 8)) + return -EINVAL; + + if (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION) + return -EINVAL; + + return 0; +} + +int +vhd_validate_batmap(vhd_batmap_t *batmap) +{ + uint32_t checksum; + + if (!batmap->map) + return -EINVAL; + + checksum = vhd_checksum_batmap(batmap); + if (checksum != batmap->header.checksum) + return -EINVAL; + + return 0; +} + +int +vhd_batmap_header_offset(vhd_context_t *ctx, off64_t *_off) +{ + off64_t off; + size_t bat; + + *_off = 0; + + off = ctx->header.table_offset; + bat = ctx->header.max_bat_size * sizeof(uint32_t); + off += vhd_bytes_padded(bat); + + *_off = off; + return 0; +} + +int +vhd_validate_platform_code(uint32_t code) +{ + switch (code) { + case PLAT_CODE_NONE: + case PLAT_CODE_WI2R: + case PLAT_CODE_WI2K: + case PLAT_CODE_W2RU: + case PLAT_CODE_W2KU: + case PLAT_CODE_MAC: + case PLAT_CODE_MACX: + return 0; + default: + VHDLOG("invalid parent locator code %u\n", code); + return -EINVAL; + } +} + +int +vhd_parent_locator_count(vhd_context_t *ctx) +{ + return (sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t)); +} + +int +vhd_hidden(vhd_context_t *ctx, int *hidden) +{ + int err; + + *hidden = 0; + + if (vhd_type_dynamic(ctx) && vhd_creator_tapdisk(ctx) && + (ctx->footer.crtr_ver == VHD_VERSION(0, 1) || + ctx->footer.crtr_ver == VHD_VERSION(1, 1))) { + vhd_footer_t copy; + + err = vhd_read_footer_at(ctx, &copy, 0); + if (err) { + VHDLOG("error reading backup footer of %s: %d\n", + ctx->file, err); + return err; + } + *hidden = copy.hidden; + } else + *hidden = ctx->footer.hidden; + + return 0; +} + +int +vhd_chain_depth(vhd_context_t *ctx, int *depth) +{ + char *file; + int err, cnt; + vhd_context_t vhd, *cur; + + err = 0; + cnt = 0; + *depth = 0; + file = NULL; + cur = ctx; + + for (;;) { + cnt++; + + if (cur->footer.type != HD_TYPE_DIFF) + break; + + if (vhd_parent_raw(cur)) { + cnt++; + break; + } + + err 
= vhd_parent_locator_get(cur, &file); + if (err) { + file = NULL; + break; + } + + if (cur != ctx) { + vhd_close(cur); + cur = NULL; + } + + err = vhd_open(&vhd, file, VHD_OPEN_RDONLY); + if (err) + break; + + cur = &vhd; + free(file); + file = NULL; + } + + free(file); + if (cur && cur != ctx) + vhd_close(cur); + + if (!err) + *depth = cnt; + + return err; +} + +int +vhd_batmap_test(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block) +{ + if (!vhd_has_batmap(ctx) || !batmap->map) + return 0; + + if (block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3))) + return 0; + + return test_bit(batmap->map, block); +} + +void +vhd_batmap_set(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block) +{ + if (!vhd_has_batmap(ctx) || !batmap->map) + return; + + if (block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3))) + return; + + set_bit(batmap->map, block); +} + +void +vhd_batmap_clear(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block) +{ + if (!vhd_has_batmap(ctx) || !batmap->map) + return; + + if (block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3))) + return; + + clear_bit(batmap->map, block); +} + +int +vhd_bitmap_test(vhd_context_t *ctx, char *map, uint32_t block) +{ + if (vhd_creator_tapdisk(ctx) && + ctx->footer.crtr_ver == 0x00000001) + return old_test_bit(map, block); + + return test_bit(map, block); +} + +void +vhd_bitmap_set(vhd_context_t *ctx, char *map, uint32_t block) +{ + if (vhd_creator_tapdisk(ctx) && + ctx->footer.crtr_ver == 0x00000001) + return old_set_bit(map, block); + + return set_bit(map, block); +} + +void +vhd_bitmap_clear(vhd_context_t *ctx, char *map, uint32_t block) +{ + if (vhd_creator_tapdisk(ctx) && + ctx->footer.crtr_ver == 0x00000001) + return old_clear_bit(map, block); + + return clear_bit(map, block); +} + +/* + * returns absolute offset of the first + * byte of the file which is not vhd metadata + */ +int +vhd_end_of_headers(vhd_context_t *ctx, off64_t *end) +{ + int err, i, n; + uint32_t 
bat_bytes; + off64_t eom, bat_end; + vhd_parent_locator_t *loc; + + *end = 0; + + if (!vhd_type_dynamic(ctx)) + return 0; + + eom = ctx->footer.data_offset + sizeof(vhd_header_t); + + bat_bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t)); + bat_end = ctx->header.table_offset + bat_bytes; + + eom = MAX(eom, bat_end); + + if (vhd_has_batmap(ctx)) { + off64_t hdr_end, hdr_secs, map_end, map_secs; + + err = vhd_get_batmap(ctx); + if (err) + return err; + + hdr_secs = secs_round_up_no_zero(sizeof(vhd_batmap_header_t)); + err = vhd_batmap_header_offset(ctx, &hdr_end); + if (err) + return err; + + hdr_end += vhd_sectors_to_bytes(hdr_secs); + eom = MAX(eom, hdr_end); + + map_secs = ctx->batmap.header.batmap_size; + map_end = (ctx->batmap.header.batmap_offset + + vhd_sectors_to_bytes(map_secs)); + eom = MAX(eom, map_end); + } + + /* parent locators */ + n = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t); + + for (i = 0; i < n; i++) { + off64_t loc_end; + + loc = &ctx->header.loc[i]; + if (loc->code == PLAT_CODE_NONE) + continue; + + loc_end = loc->data_offset + vhd_parent_locator_size(loc); + eom = MAX(eom, loc_end); + } + + *end = eom; + return 0; +} + +int +vhd_end_of_data(vhd_context_t *ctx, off64_t *end) +{ + int i, err; + off64_t max; + uint64_t blk; + + if (!vhd_type_dynamic(ctx)) { + err = vhd_seek(ctx, 0, SEEK_END); + if (err) + return err; + + max = vhd_position(ctx); + if (max == (off64_t)-1) + return -errno; + + *end = max - sizeof(vhd_footer_t); + return 0; + } + + err = vhd_end_of_headers(ctx, &max); + if (err) + return err; + + err = vhd_get_bat(ctx); + if (err) + return err; + + max >>= VHD_SECTOR_SHIFT; + + for (i = 0; i < ctx->bat.entries; i++) { + blk = ctx->bat.bat[i]; + + if (blk != DD_BLK_UNUSED) { + blk += ctx->spb + ctx->bm_secs; + max = MAX(blk, max); + } + } + + *end = vhd_sectors_to_bytes(max); + return 0; +} + +uint32_t +vhd_time(time_t time) +{ + struct tm tm; + time_t micro_epoch; + + memset(&tm, 0, sizeof(struct 
tm)); + tm.tm_year = 100; + tm.tm_mon = 0; + tm.tm_mday = 1; + micro_epoch = mktime(&tm); + + return (uint32_t)(time - micro_epoch); +} + +/* + * Stringify the VHD timestamp for printing. + * As with ctime_r, target must be >=26 bytes. + */ +size_t +vhd_time_to_string(uint32_t timestamp, char *target) +{ + char *cr; + struct tm tm; + time_t t1, t2; + + memset(&tm, 0, sizeof(struct tm)); + + /* VHD uses an epoch of 12:00AM, Jan 1, 2000. */ + /* Need to adjust this to the expected epoch of 1970. */ + tm.tm_year = 100; + tm.tm_mon = 0; + tm.tm_mday = 1; + + t1 = mktime(&tm); + t2 = t1 + (time_t)timestamp; + ctime_r(&t2, target); + + /* handle mad ctime_r newline appending. */ + if ((cr = strchr(target, '\n')) != NULL) + *cr = '\0'; + + return (strlen(target)); +} + +/* + * nabbed from vhd specs. + */ +uint32_t +vhd_chs(uint64_t size) +{ + uint32_t secs, cylinders, heads, spt, cth; + + secs = secs_round_up_no_zero(size); + + if (secs > 65535 * 16 * 255) + secs = 65535 * 16 * 255; + + if (secs >= 65535 * 16 * 63) { + spt = 255; + cth = secs / spt; + heads = 16; + } else { + spt = 17; + cth = secs / spt; + heads = (cth + 1023) / 1024; + + if (heads < 4) + heads = 4; + + if (cth >= (heads * 1024) || heads > 16) { + spt = 31; + cth = secs / spt; + heads = 16; + } + + if (cth >= heads * 1024) { + spt = 63; + cth = secs / spt; + heads = 16; + } + } + + cylinders = cth / heads; + + return GEOM_ENCODE(cylinders, heads, spt); +} + +int +vhd_get_footer(vhd_context_t *ctx) +{ + if (!vhd_validate_footer(&ctx->footer)) + return 0; + + return vhd_read_footer(ctx, &ctx->footer); +} + +int +vhd_get_header(vhd_context_t *ctx) +{ + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + if (!vhd_validate_header(&ctx->header)) + return 0; + + return vhd_read_header(ctx, &ctx->header); +} + +int +vhd_get_bat(vhd_context_t *ctx) +{ + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + if (!vhd_validate_bat(&ctx->bat)) + return 0; + + vhd_put_bat(ctx); + return vhd_read_bat(ctx, &ctx->bat); +} + 
+int +vhd_get_batmap(vhd_context_t *ctx) +{ + if (!vhd_has_batmap(ctx)) + return -EINVAL; + + if (!vhd_validate_batmap(&ctx->batmap)) + return 0; + + vhd_put_batmap(ctx); + return vhd_read_batmap(ctx, &ctx->batmap); +} + +void +vhd_put_footer(vhd_context_t *ctx) +{ + memset(&ctx->footer, 0, sizeof(vhd_footer_t)); +} + +void +vhd_put_header(vhd_context_t *ctx) +{ + memset(&ctx->header, 0, sizeof(vhd_header_t)); +} + +void +vhd_put_bat(vhd_context_t *ctx) +{ + if (!vhd_type_dynamic(ctx)) + return; + + free(ctx->bat.bat); + memset(&ctx->bat, 0, sizeof(vhd_bat_t)); +} + +void +vhd_put_batmap(vhd_context_t *ctx) +{ + if (!vhd_type_dynamic(ctx)) + return; + + if (!vhd_has_batmap(ctx)) + return; + + free(ctx->batmap.map); + memset(&ctx->batmap, 0, sizeof(vhd_batmap_t)); +} + +/* + * look for 511 byte footer at end of file + */ +int +vhd_read_short_footer(vhd_context_t *ctx, vhd_footer_t *footer) +{ + int err; + char *buf; + off64_t eof; + + buf = NULL; + + err = vhd_seek(ctx, 0, SEEK_END); + if (err) + goto out; + + eof = vhd_position(ctx); + if (eof == (off64_t)-1) { + err = -errno; + goto out; + } + + err = vhd_seek(ctx, eof - 511, SEEK_SET); + if (err) + goto out; + + err = posix_memalign((void **)&buf, + VHD_SECTOR_SIZE, sizeof(vhd_footer_t)); + if (err) { + buf = NULL; + err = -err; + goto out; + } + + memset(buf, 0, sizeof(vhd_footer_t)); + + /* + * expecting short read here + */ + vhd_read(ctx, buf, sizeof(vhd_footer_t)); + + memcpy(footer, buf, sizeof(vhd_footer_t)); + + vhd_footer_in(footer); + err = vhd_validate_footer(footer); + +out: + if (err) + VHDLOG("%s: failed reading short footer: %d\n", + ctx->file, err); + free(buf); + return err; +} + +int +vhd_read_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off) +{ + int err; + char *buf; + + buf = NULL; + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto out; + + err = posix_memalign((void **)&buf, + VHD_SECTOR_SIZE, sizeof(vhd_footer_t)); + if (err) { + buf = NULL; + err = -err; + goto out; + 
} + + err = vhd_read(ctx, buf, sizeof(vhd_footer_t)); + if (err) + goto out; + + memcpy(footer, buf, sizeof(vhd_footer_t)); + + vhd_footer_in(footer); + err = vhd_validate_footer(footer); + +out: + if (err) + VHDLOG("%s: reading footer at 0x%08"PRIx64" failed: %d\n", + ctx->file, off, err); + free(buf); + return err; +} + +int +vhd_read_footer(vhd_context_t *ctx, vhd_footer_t *footer) +{ + int err; + off64_t off; + + err = vhd_seek(ctx, 0, SEEK_END); + if (err) + return err; + + off = vhd_position(ctx); + if (off == (off64_t)-1) + return -errno; + + err = vhd_read_footer_at(ctx, footer, off - 512); + if (err != -EINVAL) + return err; + + err = vhd_read_short_footer(ctx, footer); + if (err != -EINVAL) + return err; + + if (ctx->oflags & VHD_OPEN_STRICT) + return -EINVAL; + + return vhd_read_footer_at(ctx, footer, 0); +} + +int +vhd_read_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off) +{ + int err; + char *buf; + + buf = NULL; + + if (!vhd_type_dynamic(ctx)) { + err = -EINVAL; + goto out; + } + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto out; + + err = posix_memalign((void **)&buf, + VHD_SECTOR_SIZE, sizeof(vhd_header_t)); + if (err) { + buf = NULL; + err = -err; + goto out; + } + + err = vhd_read(ctx, buf, sizeof(vhd_header_t)); + if (err) + goto out; + + memcpy(header, buf, sizeof(vhd_header_t)); + + vhd_header_in(header); + err = vhd_validate_header(header); + +out: + if (err) + VHDLOG("%s: reading header at 0x%08"PRIx64" failed: %d\n", + ctx->file, off, err); + free(buf); + return err; +} + +int +vhd_read_header(vhd_context_t *ctx, vhd_header_t *header) +{ + int err; + off64_t off; + + if (!vhd_type_dynamic(ctx)) { + VHDLOG("%s is not dynamic!\n", ctx->file); + return -EINVAL; + } + + off = ctx->footer.data_offset; + return vhd_read_header_at(ctx, header, off); +} + +int +vhd_read_bat(vhd_context_t *ctx, vhd_bat_t *bat) +{ + int err; + char *buf; + off64_t off; + size_t size; + + buf = NULL; + + if (!vhd_type_dynamic(ctx)) { + err = 
-EINVAL; + goto fail; + } + + off = ctx->header.table_offset; + size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t)); + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) { + buf = NULL; + err = -err; + goto fail; + } + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto fail; + + err = vhd_read(ctx, buf, size); + if (err) + goto fail; + + bat->spb = ctx->header.block_size >> VHD_SECTOR_SHIFT; + bat->entries = ctx->header.max_bat_size; + bat->bat = (uint32_t *)buf; + + vhd_bat_in(bat); + + return 0; + +fail: + free(buf); + memset(bat, 0, sizeof(vhd_bat_t)); + VHDLOG("%s: failed to read bat: %d\n", ctx->file, err); + return err; +} + +static int +vhd_read_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap) +{ + int err; + char *buf; + off64_t off; + size_t size; + + buf = NULL; + + err = vhd_batmap_header_offset(ctx, &off); + if (err) + goto fail; + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto fail; + + size = vhd_bytes_padded(sizeof(vhd_batmap_header_t)); + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) { + buf = NULL; + err = -err; + goto fail; + } + + err = vhd_read(ctx, buf, size); + if (err) + goto fail; + + memcpy(&batmap->header, buf, sizeof(vhd_batmap_header_t)); + free(buf); + buf = NULL; + + vhd_batmap_header_in(batmap); + + return 0; + +fail: + free(buf); + memset(&batmap->header, 0, sizeof(vhd_batmap_header_t)); + VHDLOG("%s: failed to read batmap header: %d\n", ctx->file, err); + return err; +} + +static int +vhd_read_batmap_map(vhd_context_t *ctx, vhd_batmap_t *batmap) +{ + int err; + char *buf; + off64_t off; + size_t map_size; + + map_size = vhd_sectors_to_bytes(batmap->header.batmap_size); + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, map_size); + if (err) { + buf = NULL; + err = -err; + goto fail; + } + + off = batmap->header.batmap_offset; + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto fail; + + err = vhd_read(ctx, buf, map_size); + if (err) + goto 
fail; + + batmap->map = buf; + return 0; + +fail: + free(buf); + batmap->map = NULL; + VHDLOG("%s: failed to read batmap: %d\n", ctx->file, err); + return err; +} + +int +vhd_read_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap) +{ + int err; + + if (!vhd_has_batmap(ctx)) + return -EINVAL; + + memset(batmap, 0, sizeof(vhd_batmap_t)); + + err = vhd_read_batmap_header(ctx, batmap); + if (err) + return err; + + err = vhd_validate_batmap_header(batmap); + if (err) + return err; + + err = vhd_read_batmap_map(ctx, batmap); + if (err) + return err; + + err = vhd_validate_batmap(batmap); + if (err) + goto fail; + + return 0; + +fail: + free(batmap->map); + memset(batmap, 0, sizeof(vhd_batmap_t)); + return err; +} + +int +vhd_has_batmap(vhd_context_t *ctx) +{ + if (!vhd_type_dynamic(ctx)) + return 0; + + if (!vhd_creator_tapdisk(ctx)) + return 0; + + if (ctx->footer.crtr_ver <= VHD_VERSION(0, 1)) + return 0; + + if (ctx->footer.crtr_ver >= VHD_VERSION(1, 2)) + return 1; + + /* + * VHDs of version 1.1 probably have a batmap, but may not + * if they were updated from version 0.1 via vhd-update. + */ + if (!vhd_validate_batmap_header(&ctx->batmap)) + return 1; + + if (vhd_read_batmap_header(ctx, &ctx->batmap)) + return 0; + + return (!vhd_validate_batmap_header(&ctx->batmap)); +} + +/* + * Is this a block device (with a fixed size)? This affects whether the file + * can be truncated and where the footer is written for VHDs. 
+ */ +int +vhd_test_file_fixed(const char *file, int *is_block) +{ + int err; + struct stat stats; + + err = stat(file, &stats); + if (err == -1) + return -errno; + + *is_block = !!(S_ISBLK(stats.st_mode)); + return err; +} + +int +vhd_find_parent(vhd_context_t *ctx, const char *parent, char **_location) +{ + int err; + char *location, *cpath, *cdir, *path; + + err = 0; + path = NULL; + cpath = NULL; + location = NULL; + *_location = NULL; + + if (!parent) + return -EINVAL; + + if (parent[0] == '/') { + if (!access(parent, R_OK)) { + path = strdup(parent); + if (!path) + return -ENOMEM; + *_location = path; + return 0; + } + } + + /* check parent path relative to child's directory */ + cpath = realpath(ctx->file, NULL); + if (!cpath) { + err = -errno; + goto out; + } + + cdir = dirname(cpath); + if (asprintf(&location, "%s/%s", cdir, parent) == -1) { + err = -errno; + location = NULL; + goto out; + } + + if (!access(location, R_OK)) { + path = realpath(location, NULL); + if (path) { + *_location = path; + return 0; + } + } + err = -errno; + +out: + free(location); + free(cpath); + return err; +} + +static int +vhd_macx_encode_location(char *name, char **out, int *outlen) +{ + iconv_t cd; + int len, err; + size_t ibl, obl; + char *uri, *urip, *uri_utf8, *uri_utf8p, *ret; + + err = 0; + ret = NULL; + *out = NULL; + *outlen = 0; + len = strlen(name) + strlen("file://"); + + ibl = len; + obl = len; + + uri = urip = malloc(ibl + 1); + uri_utf8 = uri_utf8p = malloc(obl); + + if (!uri || !uri_utf8) + return -ENOMEM; + + cd = iconv_open("UTF-8", "ASCII"); + if (cd == (iconv_t)-1) { + err = -errno; + goto out; + } + + sprintf(uri, "file://%s", name); + + if (iconv(cd, &urip, &ibl, &uri_utf8p, &obl) == (size_t)-1 || + ibl || obl) { + err = (errno ? 
-errno : -EIO); + goto out; + } + + ret = malloc(len); + if (!ret) { + err = -ENOMEM; + goto out; + } + + memcpy(ret, uri_utf8, len); + *outlen = len; + *out = ret; + + out: + free(uri); + free(uri_utf8); + if (cd != (iconv_t)-1) + iconv_close(cd); + + return err; +} + +static int +vhd_w2u_encode_location(char *name, char **out, int *outlen) +{ + iconv_t cd; + int len, err; + size_t ibl, obl; + char *uri, *urip, *uri_utf16, *uri_utf16p, *tmp, *ret; + + err = 0; + ret = NULL; + *out = NULL; + *outlen = 0; + cd = (iconv_t) -1; + + /* + * MICROSOFT_COMPAT + * relative paths must start with ".\" + */ + if (name[0] != '/') { + tmp = strstr(name, "./"); + if (tmp == name) + tmp += strlen("./"); + else + tmp = name; + + err = asprintf(&uri, ".\\%s", tmp); + } else + err = asprintf(&uri, "%s", name); + + if (err == -1) + return -ENOMEM; + + tmp = uri; + while (*tmp != '\0') { + if (*tmp == '/') + *tmp = '\\'; + tmp++; + } + + len = strlen(uri); + ibl = len; + obl = len * 2; + urip = uri; + + uri_utf16 = uri_utf16p = malloc(obl); + if (!uri_utf16) { + err = -ENOMEM; + goto out; + } + + /* + * MICROSOFT_COMPAT + * little endian unicode here + */ + cd = iconv_open("UTF-16LE", "ASCII"); + if (cd == (iconv_t)-1) { + err = -errno; + goto out; + } + + if (iconv(cd, &urip, &ibl, &uri_utf16p, &obl) == (size_t)-1 || + ibl || obl) { + err = (errno ? 
-errno : -EIO); + goto out; + } + + len = len * 2; + ret = malloc(len); + if (!ret) { + err = -ENOMEM; + goto out; + } + + memcpy(ret, uri_utf16, len); + *outlen = len; + *out = ret; + err = 0; + + out: + free(uri); + free(uri_utf16); + if (cd != (iconv_t)-1) + iconv_close(cd); + + return err; +} + +static char * +vhd_macx_decode_location(char *in, char *out, int len) +{ + iconv_t cd; + char *name; + size_t ibl, obl; + + name = out; + ibl = obl = len; + + cd = iconv_open("ASCII", "UTF-8"); + if (cd == (iconv_t)-1) + return NULL; + + if (iconv(cd, &in, &ibl, &out, &obl) == (size_t)-1 || ibl) + return NULL; + + iconv_close(cd); + *out = '\0'; + + if (strstr(name, "file://") != name) + return NULL; + + name += strlen("file://"); + + return strdup(name); +} + +static char * +vhd_w2u_decode_location(char *in, char *out, int len, char *utf_type) +{ + iconv_t cd; + char *name, *tmp; + size_t ibl, obl; + + tmp = name = out; + ibl = obl = len; + + cd = iconv_open("ASCII", utf_type); + if (cd == (iconv_t)-1) + return NULL; + + if (iconv(cd, &in, &ibl, &out, &obl) == (size_t)-1 || ibl) + return NULL; + + iconv_close(cd); + *out = '\0'; + + /* TODO: spaces */ + while (tmp != out) { + if (*tmp == '\\') + *tmp = '/'; + tmp++; + } + + if (strstr(name, "C:") == name || strstr(name, "c:") == name) + name += strlen("c:"); + + return strdup(name); +} + +int +vhd_header_decode_parent(vhd_context_t *ctx, vhd_header_t *header, char **buf) +{ + char *code, out[512]; + + if (vhd_creator_tapdisk(ctx) && + ctx->footer.crtr_ver == VHD_VERSION(0, 1)) + code = UTF_16; + else + code = UTF_16BE; + + *buf = vhd_w2u_decode_location(header->prt_name, out, 512, code); + return (*buf == NULL ? 
-EINVAL : 0); +} + +int +vhd_parent_locator_read(vhd_context_t *ctx, + vhd_parent_locator_t *loc, char **parent) +{ + int err, size; + char *raw, *out, *name; + + raw = NULL; + out = NULL; + name = NULL; + *parent = NULL; + + if (ctx->footer.type != HD_TYPE_DIFF) { + err = -EINVAL; + goto out; + } + + switch (loc->code) { + case PLAT_CODE_MACX: + case PLAT_CODE_W2KU: + case PLAT_CODE_W2RU: + break; + default: + err = -EINVAL; + goto out; + } + + err = vhd_seek(ctx, loc->data_offset, SEEK_SET); + if (err) + goto out; + + size = vhd_parent_locator_size(loc); + if (size <= 0) { + err = -EINVAL; + goto out; + } + + err = posix_memalign((void **)&raw, VHD_SECTOR_SIZE, size); + if (err) { + raw = NULL; + err = -err; + goto out; + } + + err = vhd_read(ctx, raw, size); + if (err) + goto out; + + out = malloc(loc->data_len + 1); + if (!out) { + err = -ENOMEM; + goto out; + } + + switch (loc->code) { + case PLAT_CODE_MACX: + name = vhd_macx_decode_location(raw, out, loc->data_len); + break; + case PLAT_CODE_W2KU: + case PLAT_CODE_W2RU: + name = vhd_w2u_decode_location(raw, out, + loc->data_len, UTF_16LE); + break; + } + + if (!name) { + err = -EINVAL; + goto out; + } + + err = 0; + *parent = name; + +out: + free(raw); + free(out); + + if (err) { + VHDLOG("%s: error reading parent locator: %d\n", + ctx->file, err); + VHDLOG("%s: locator: code %u, space 0x%x, len 0x%x, " + "off 0x%"PRIx64"\n", ctx->file, loc->code, loc->data_space, + loc->data_len, loc->data_offset); + } + + return err; +} + +int +vhd_parent_locator_get(vhd_context_t *ctx, char **parent) +{ + int i, n, err; + char *name, *location; + vhd_parent_locator_t *loc; + + err = 0; + *parent = NULL; + + if (ctx->footer.type != HD_TYPE_DIFF) + return -EINVAL; + + n = vhd_parent_locator_count(ctx); + for (i = 0; i < n; i++) { + loc = ctx->header.loc + i; + err = vhd_parent_locator_read(ctx, loc, &name); + if (err) + continue; + + err = vhd_find_parent(ctx, name, &location); + if (err) + VHDLOG("%s: couldn't find parent 
%s (%d)\n", + ctx->file, name, err); + free(name); + + if (!err) { + *parent = location; + return 0; + } + } + + return err; +} + +int +vhd_parent_locator_write_at(vhd_context_t *ctx, + const char *parent, off64_t off, uint32_t code, + size_t max_bytes, vhd_parent_locator_t *loc) +{ + struct stat stats; + int err, len, size; + char *absolute_path, *relative_path, *encoded, *block; + + memset(loc, 0, sizeof(vhd_parent_locator_t)); + + if (ctx->footer.type != HD_TYPE_DIFF) + return -EINVAL; + + absolute_path = NULL; + relative_path = NULL; + encoded = NULL; + block = NULL; + size = 0; + len = 0; + + switch (code) { + case PLAT_CODE_MACX: + case PLAT_CODE_W2KU: + case PLAT_CODE_W2RU: + break; + default: + return -EINVAL; + } + + absolute_path = realpath(parent, NULL); + if (!absolute_path) { + err = -errno; + goto out; + } + + err = stat(absolute_path, &stats); + if (err) { + err = -errno; + goto out; + } + + if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) { + err = -EINVAL; + goto out; + } + + relative_path = relative_path_to(ctx->file, absolute_path, &err); + if (!relative_path || err) { + err = (err ? 
err : -EINVAL); + goto out; + } + + switch (code) { + case PLAT_CODE_MACX: + err = vhd_macx_encode_location(relative_path, &encoded, &len); + break; + case PLAT_CODE_W2KU: + case PLAT_CODE_W2RU: + err = vhd_w2u_encode_location(relative_path, &encoded, &len); + break; + default: + err = -EINVAL; + } + + if (err) + goto out; + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto out; + + size = vhd_bytes_padded(len); + + if (max_bytes && size > max_bytes) { + err = -ENAMETOOLONG; + goto out; + } + + err = posix_memalign((void **)&block, VHD_SECTOR_SIZE, size); + if (err) { + block = NULL; + err = -err; + goto out; + } + + memset(block, 0, size); + memcpy(block, encoded, len); + + err = vhd_write(ctx, block, size); + if (err) + goto out; + + err = 0; + +out: + free(absolute_path); + free(relative_path); + free(encoded); + free(block); + + if (!err) { + loc->res = 0; + loc->code = code; + loc->data_len = len; + /* + * write number of bytes ('size') instead of number of sectors + * into loc->data_space to be compatible with MSFT, even though + * this goes against the specs + */ + loc->data_space = size; + loc->data_offset = off; + } + + return err; +} + +static int +vhd_footer_offset_at_eof(vhd_context_t *ctx, off64_t *off) +{ + int err; + if ((err = vhd_seek(ctx, 0, SEEK_END))) + return errno; + *off = vhd_position(ctx) - sizeof(vhd_footer_t); + return 0; +} + +int +vhd_read_bitmap(vhd_context_t *ctx, uint32_t block, char **bufp) +{ + int err; + char *buf; + size_t size; + off64_t off; + uint64_t blk; + + buf = NULL; + *bufp = NULL; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + err = vhd_get_bat(ctx); + if (err) + return err; + + if (block >= ctx->bat.entries) + return -ERANGE; + + blk = ctx->bat.bat[block]; + if (blk == DD_BLK_UNUSED) + return -EINVAL; + + off = vhd_sectors_to_bytes(blk); + size = vhd_bytes_padded(ctx->spb >> 3); + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + return err; + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); 
+ if (err) + return -err; + + err = vhd_read(ctx, buf, size); + if (err) + goto fail; + + *bufp = buf; + return 0; + +fail: + free(buf); + return err; +} + +int +vhd_read_block(vhd_context_t *ctx, uint32_t block, char **bufp) +{ + int err; + char *buf; + size_t size; + uint64_t blk; + off64_t end, off; + + buf = NULL; + *bufp = NULL; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + err = vhd_get_bat(ctx); + if (err) + return err; + + if (block >= ctx->bat.entries) + return -ERANGE; + + blk = ctx->bat.bat[block]; + if (blk == DD_BLK_UNUSED) + return -EINVAL; + + off = vhd_sectors_to_bytes(blk + ctx->bm_secs); + size = vhd_sectors_to_bytes(ctx->spb); + + err = vhd_footer_offset_at_eof(ctx, &end); + if (err) + return err; + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) { + err = -err; + goto fail; + } + + if (end < off + ctx->header.block_size) { + size = end - off; + memset(buf + size, 0, ctx->header.block_size - size); + } + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto fail; + + err = vhd_read(ctx, buf, size); + if (err) + goto fail; + + *bufp = buf; + return 0; + +fail: + free(buf); + return err; +} + +int +vhd_write_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off) +{ + int err; + vhd_footer_t *f; + + f = NULL; + + err = posix_memalign((void **)&f, + VHD_SECTOR_SIZE, sizeof(vhd_footer_t)); + if (err) { + f = NULL; + err = -err; + goto out; + } + + memcpy(f, footer, sizeof(vhd_footer_t)); + f->checksum = vhd_checksum_footer(f); + + err = vhd_validate_footer(f); + if (err) + goto out; + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto out; + + vhd_footer_out(f); + + err = vhd_write(ctx, f, sizeof(vhd_footer_t)); + +out: + if (err) + VHDLOG("%s: failed writing footer at 0x%08"PRIx64": %d\n", + ctx->file, off, err); + free(f); + return err; +} + +int +vhd_write_footer(vhd_context_t *ctx, vhd_footer_t *footer) +{ + int err; + off64_t off; + + if (ctx->is_block) + err = vhd_footer_offset_at_eof(ctx, &off); 
+ else + err = vhd_end_of_data(ctx, &off); + if (err) + return err; + + err = vhd_write_footer_at(ctx, footer, off); + if (err) + return err; + + if (!vhd_type_dynamic(ctx)) + return 0; + + return vhd_write_footer_at(ctx, footer, 0); +} + +int +vhd_write_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off) +{ + int err; + vhd_header_t *h; + + h = NULL; + + if (!vhd_type_dynamic(ctx)) { + err = -EINVAL; + goto out; + } + + err = posix_memalign((void **)&h, + VHD_SECTOR_SIZE, sizeof(vhd_header_t)); + if (err) { + h = NULL; + err = -err; + goto out; + } + + memcpy(h, header, sizeof(vhd_header_t)); + + h->checksum = vhd_checksum_header(h); + err = vhd_validate_header(h); + if (err) + goto out; + + vhd_header_out(h); + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto out; + + err = vhd_write(ctx, h, sizeof(vhd_header_t)); + +out: + if (err) + VHDLOG("%s: failed writing header at 0x%08"PRIx64": %d\n", + ctx->file, off, err); + free(h); + return err; +} + +int +vhd_write_header(vhd_context_t *ctx, vhd_header_t *header) +{ + int err; + off64_t off; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + off = ctx->footer.data_offset; + return vhd_write_header_at(ctx, header, off); +} + +int +vhd_write_bat(vhd_context_t *ctx, vhd_bat_t *bat) +{ + int err; + off64_t off; + vhd_bat_t b; + size_t size; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + err = vhd_validate_bat(&ctx->bat); + if (err) + return err; + + err = vhd_validate_bat(bat); + if (err) + return err; + + memset(&b, 0, sizeof(vhd_bat_t)); + + off = ctx->header.table_offset; + size = vhd_bytes_padded(bat->entries * sizeof(uint32_t)); + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + return err; + + err = posix_memalign((void **)&b.bat, VHD_SECTOR_SIZE, size); + if (err) + return -err; + + memcpy(b.bat, bat->bat, size); + b.spb = bat->spb; + b.entries = bat->entries; + vhd_bat_out(&b); + + err = vhd_write(ctx, b.bat, size); + free(b.bat); + + return err; +} + +int 
+vhd_write_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap) +{ + int err; + off64_t off; + vhd_batmap_t b; + char *buf, *map; + size_t size, map_size; + + buf = NULL; + map = NULL; + + if (!vhd_has_batmap(ctx)) { + err = -EINVAL; + goto out; + } + + b.header = batmap->header; + b.map = batmap->map; + + b.header.checksum = vhd_checksum_batmap(&b); + err = vhd_validate_batmap(&b); + if (err) + goto out; + + off = b.header.batmap_offset; + map_size = vhd_sectors_to_bytes(b.header.batmap_size); + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto out; + + err = posix_memalign((void **)&map, VHD_SECTOR_SIZE, map_size); + if (err) { + map = NULL; + err = -err; + goto out; + } + + memcpy(map, b.map, map_size); + + err = vhd_write(ctx, map, map_size); + if (err) + goto out; + + err = vhd_batmap_header_offset(ctx, &off); + if (err) + goto out; + + size = vhd_bytes_padded(sizeof(vhd_batmap_header_t)); + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + goto out; + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) { + err = -err; + buf = NULL; + goto out; + } + + vhd_batmap_header_out(&b); + memset(buf, 0, size); + memcpy(buf, &b.header, sizeof(vhd_batmap_header_t)); + + err = vhd_write(ctx, buf, size); + +out: + if (err) + VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err); + free(buf); + free(map); + return 0; +} + +int +vhd_write_bitmap(vhd_context_t *ctx, uint32_t block, char *bitmap) +{ + int err; + off64_t off; + uint64_t blk; + size_t secs, size; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + err = vhd_validate_bat(&ctx->bat); + if (err) + return err; + + if (block >= ctx->bat.entries) + return -ERANGE; + + if ((unsigned long)bitmap & (VHD_SECTOR_SIZE - 1)) + return -EINVAL; + + blk = ctx->bat.bat[block]; + if (blk == DD_BLK_UNUSED) + return -EINVAL; + + off = vhd_sectors_to_bytes(blk); + size = vhd_sectors_to_bytes(ctx->bm_secs); + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + return err; + + err = vhd_write(ctx, bitmap, 
size); + if (err) + return err; + + return 0; +} + +int +vhd_write_block(vhd_context_t *ctx, uint32_t block, char *data) +{ + int err; + off64_t off; + size_t size; + uint64_t blk; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + err = vhd_validate_bat(&ctx->bat); + if (err) + return err; + + if (block >= ctx->bat.entries) + return -ERANGE; + + if ((unsigned long)data & ~(VHD_SECTOR_SIZE -1)) + return -EINVAL; + + blk = ctx->bat.bat[block]; + if (blk == DD_BLK_UNUSED) + return -EINVAL; + + off = vhd_sectors_to_bytes(blk + ctx->bm_secs); + size = vhd_sectors_to_bytes(ctx->spb); + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + return err; + + err = vhd_write(ctx, data, size); + if (err) + return err; + + return 0; +} + +static inline int +namedup(char **dup, const char *name) +{ + *dup = NULL; + + if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN) + return -ENAMETOOLONG; + + *dup = strdup(name); + if (*dup == NULL) + return -ENOMEM; + + return 0; +} + +int +vhd_seek(vhd_context_t *ctx, off64_t offset, int whence) +{ + off64_t off; + + off = lseek64(ctx->fd, offset, whence); + if (off == (off64_t)-1) { + VHDLOG("%s: seek(0x%08"PRIx64", %d) failed: %d\n", + ctx->file, offset, whence, -errno); + return -errno; + } + + return 0; +} + +off64_t +vhd_position(vhd_context_t *ctx) +{ + return lseek64(ctx->fd, 0, SEEK_CUR); +} + +int +vhd_read(vhd_context_t *ctx, void *buf, size_t size) +{ + size_t ret; + + errno = 0; + + ret = read(ctx->fd, buf, size); + if (ret == size) + return 0; + + VHDLOG("%s: read of %zu returned %zd, errno: %d\n", + ctx->file, size, ret, -errno); + + return (errno ? -errno : -EIO); +} + +int +vhd_write(vhd_context_t *ctx, void *buf, size_t size) +{ + size_t ret; + + errno = 0; + + ret = write(ctx->fd, buf, size); + if (ret == size) + return 0; + + VHDLOG("%s: write of %zu returned %zd, errno: %d\n", + ctx->file, size, ret, -errno); + + return (errno ? 
-errno : -EIO); +} + +int +vhd_offset(vhd_context_t *ctx, uint32_t sector, uint32_t *offset) +{ + int err; + uint32_t block; + + if (!vhd_type_dynamic(ctx)) + return sector; + + err = vhd_get_bat(ctx); + if (err) + return err; + + block = sector / ctx->spb; + if (ctx->bat.bat[block] == DD_BLK_UNUSED) + *offset = DD_BLK_UNUSED; + else + *offset = ctx->bat.bat[block] + + ctx->bm_secs + (sector % ctx->spb); + + return 0; +} + +int +vhd_open_fast(vhd_context_t *ctx) +{ + int err; + char *buf; + size_t size; + + size = sizeof(vhd_footer_t) + sizeof(vhd_header_t); + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) { + VHDLOG("failed allocating %s: %d\n", ctx->file, -err); + return -err; + } + + err = vhd_read(ctx, buf, size); + if (err) { + VHDLOG("failed reading %s: %d\n", ctx->file, err); + goto out; + } + + memcpy(&ctx->footer, buf, sizeof(vhd_footer_t)); + vhd_footer_in(&ctx->footer); + err = vhd_validate_footer(&ctx->footer); + if (err) + goto out; + + if (vhd_type_dynamic(ctx)) { + if (ctx->footer.data_offset != sizeof(vhd_footer_t)) + err = vhd_read_header(ctx, &ctx->header); + else { + memcpy(&ctx->header, + buf + sizeof(vhd_footer_t), + sizeof(vhd_header_t)); + vhd_header_in(&ctx->header); + err = vhd_validate_header(&ctx->header); + } + + if (err) + goto out; + + ctx->spb = ctx->header.block_size >> VHD_SECTOR_SHIFT; + ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3); + } + +out: + free(buf); + return err; +} + +int +vhd_open(vhd_context_t *ctx, const char *file, int flags) +{ + int err, oflags; + + if (flags & VHD_OPEN_STRICT) + vhd_flag_clear(flags, VHD_OPEN_FAST); + + memset(ctx, 0, sizeof(vhd_context_t)); + ctx->fd = -1; + ctx->oflags = flags; + + err = namedup(&ctx->file, file); + if (err) + return err; + + oflags = O_DIRECT | O_LARGEFILE; + if (flags & VHD_OPEN_RDONLY) + oflags |= O_RDONLY; + if (flags & VHD_OPEN_RDWR) + oflags |= O_RDWR; + + ctx->fd = open(ctx->file, oflags, 0644); + if (ctx->fd == -1) { + err = -errno; + 
VHDLOG("failed to open %s: %d\n", ctx->file, err); + goto fail; + } + + err = vhd_test_file_fixed(ctx->file, &ctx->is_block); + if (err) + goto fail; + + if (flags & VHD_OPEN_FAST) { + err = vhd_open_fast(ctx); + if (err) + goto fail; + + return 0; + } + + err = vhd_read_footer(ctx, &ctx->footer); + if (err) + goto fail; + + if (!(flags & VHD_OPEN_IGNORE_DISABLED) && vhd_disabled(ctx)) { + err = -EINVAL; + goto fail; + } + + if (vhd_type_dynamic(ctx)) { + err = vhd_read_header(ctx, &ctx->header); + if (err) + goto fail; + + ctx->spb = ctx->header.block_size >> VHD_SECTOR_SHIFT; + ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3); + } + + return 0; + +fail: + if (ctx->fd != -1) + close(ctx->fd); + free(ctx->file); + memset(ctx, 0, sizeof(vhd_context_t)); + return err; +} + +void +vhd_close(vhd_context_t *ctx) +{ + if (ctx->file) + close(ctx->fd); + free(ctx->file); + free(ctx->bat.bat); + free(ctx->batmap.map); + memset(ctx, 0, sizeof(vhd_context_t)); +} + +static inline void +vhd_initialize_footer(vhd_context_t *ctx, int type, uint64_t size) +{ + memset(&ctx->footer, 0, sizeof(vhd_footer_t)); + memcpy(ctx->footer.cookie, HD_COOKIE, sizeof(ctx->footer.cookie)); + ctx->footer.features = HD_RESERVED; + ctx->footer.ff_version = HD_FF_VERSION; + ctx->footer.timestamp = vhd_time(time(NULL)); + ctx->footer.crtr_ver = VHD_CURRENT_VERSION; + ctx->footer.crtr_os = 0x00000000; + ctx->footer.orig_size = size; + ctx->footer.curr_size = size; + ctx->footer.geometry = vhd_chs(size); + ctx->footer.type = type; + ctx->footer.saved = 0; + ctx->footer.data_offset = 0xFFFFFFFFFFFFFFFF; + strcpy(ctx->footer.crtr_app, "tap"); + uuid_generate(ctx->footer.uuid); +} + +static int +vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path) +{ + int err; + iconv_t cd; + size_t ibl, obl; + char *pname, *ppath, *dst; + + err = 0; + pname = NULL; + ppath = NULL; + + /* + * MICROSOFT_COMPAT + * big endian unicode here + */ + cd = iconv_open(UTF_16BE, "ASCII"); + if (cd == 
(iconv_t)-1) { + err = -errno; + goto out; + } + + ppath = strdup(parent_path); + if (!ppath) { + err = -ENOMEM; + goto out; + } + + pname = basename(ppath); + if (!strcmp(pname, "")) { + err = -EINVAL; + goto out; + } + + ibl = strlen(pname); + obl = sizeof(ctx->header.prt_name); + dst = ctx->header.prt_name; + + memset(dst, 0, obl); + + if (iconv(cd, &pname, &ibl, &dst, &obl) == (size_t)-1 || ibl) + err = (errno ? -errno : -EINVAL); + +out: + iconv_close(cd); + free(ppath); + return err; +} + +static off64_t +get_file_size(const char *name) +{ + int fd; + off64_t end; + + fd = open(name, O_LARGEFILE | O_RDONLY); + if (fd == -1) { + VHDLOG("unable to open '%s': %d\n", name, errno); + return -errno; + } + end = lseek64(fd, 0, SEEK_END); + close(fd); + return end; +} + +static int +vhd_initialize_header(vhd_context_t *ctx, const char *parent_path, + uint64_t size, int raw) +{ + int err; + struct stat stats; + vhd_context_t parent; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + memset(&ctx->header, 0, sizeof(vhd_header_t)); + memcpy(ctx->header.cookie, DD_COOKIE, sizeof(ctx->header.cookie)); + ctx->header.data_offset = (uint64_t)-1; + ctx->header.table_offset = VHD_SECTOR_SIZE * 3; /* 1 ftr + 2 hdr */ + ctx->header.hdr_ver = DD_VERSION; + ctx->header.block_size = VHD_BLOCK_SIZE; + ctx->header.prt_ts = 0; + ctx->header.res1 = 0; + ctx->header.max_bat_size = (ctx->footer.curr_size + + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT; + + ctx->footer.data_offset = VHD_SECTOR_SIZE; + + if (ctx->footer.type == HD_TYPE_DYNAMIC) + return 0; + + err = stat(parent_path, &stats); + if (err == -1) + return -errno; + + if (raw) { + ctx->header.prt_ts = vhd_time(stats.st_mtime); + if (!size) + size = get_file_size(parent_path); + } + else { + err = vhd_open(&parent, parent_path, VHD_OPEN_RDONLY); + if (err) + return err; + + ctx->header.prt_ts = vhd_time(stats.st_mtime); + uuid_copy(ctx->header.prt_uuid, parent.footer.uuid); + if (!size) + size = parent.footer.curr_size; + 
vhd_close(&parent); + } + ctx->footer.orig_size = size; + ctx->footer.curr_size = size; + ctx->footer.geometry = vhd_chs(size); + ctx->header.max_bat_size = + (size + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT; + + return vhd_initialize_header_parent_name(ctx, parent_path); +} + +static int +vhd_write_parent_locators(vhd_context_t *ctx, const char *parent) +{ + int i, err; + off64_t off; + uint32_t code; + + code = PLAT_CODE_NONE; + + if (ctx->footer.type != HD_TYPE_DIFF) + return -EINVAL; + + off = ctx->batmap.header.batmap_offset + + vhd_sectors_to_bytes(ctx->batmap.header.batmap_size); + if (off & (VHD_SECTOR_SIZE - 1)) + off = vhd_bytes_padded(off); + + for (i = 0; i < 3; i++) { + switch (i) { + case 0: + code = PLAT_CODE_MACX; + break; + case 1: + code = PLAT_CODE_W2KU; + break; + case 2: + code = PLAT_CODE_W2RU; + break; + } + + err = vhd_parent_locator_write_at(ctx, parent, off, code, + 0, ctx->header.loc + i); + if (err) + return err; + + off += vhd_parent_locator_size(ctx->header.loc + i); + } + + return 0; +} + +int +vhd_change_parent(vhd_context_t *child, char *parent_path, int raw) +{ + int i, err; + char *ppath; + struct stat stats; + vhd_context_t parent; + + ppath = realpath(parent_path, NULL); + if (!ppath) { + VHDLOG("error resolving parent path %s for %s: %d\n", + parent_path, child->file, errno); + return -errno; + } + + err = stat(ppath, &stats); + if (err == -1) { + err = -errno; + goto out; + } + + if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) { + err = -EINVAL; + goto out; + } + + if (raw) { + uuid_clear(child->header.prt_uuid); + } else { + err = vhd_open(&parent, ppath, VHD_OPEN_RDONLY); + if (err) { + VHDLOG("error opening parent %s for %s: %d\n", + ppath, child->file, err); + goto out; + } + uuid_copy(child->header.prt_uuid, parent.footer.uuid); + vhd_close(&parent); + } + + vhd_initialize_header_parent_name(child, ppath); + child->header.prt_ts = vhd_time(stats.st_mtime); + + for (i = 0; i < vhd_parent_locator_count(child); i++) { 
+ vhd_parent_locator_t *loc = child->header.loc + i; + size_t max = vhd_parent_locator_size(loc); + + switch (loc->code) { + case PLAT_CODE_MACX: + case PLAT_CODE_W2KU: + case PLAT_CODE_W2RU: + break; + default: + continue; + } + + err = vhd_parent_locator_write_at(child, ppath, + loc->data_offset, + loc->code, max, loc); + if (err) { + VHDLOG("error writing parent locator %d for %s: %d\n", + i, child->file, err); + goto out; + } + } + + TEST_FAIL_AT(FAIL_REPARENT_LOCATOR); + + err = vhd_write_header(child, &child->header); + if (err) { + VHDLOG("error writing header for %s: %d\n", child->file, err); + goto out; + } + + err = 0; + +out: + free(ppath); + return err; +} + +static int +vhd_create_batmap(vhd_context_t *ctx) +{ + off64_t off; + int err, map_bytes; + vhd_batmap_header_t *header; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + map_bytes = (ctx->header.max_bat_size + 7) >> 3; + header = &ctx->batmap.header; + + memset(header, 0, sizeof(vhd_batmap_header_t)); + memcpy(header->cookie, VHD_BATMAP_COOKIE, sizeof(header->cookie)); + + err = vhd_batmap_header_offset(ctx, &off); + if (err) + return err; + + header->batmap_offset = off + + vhd_bytes_padded(sizeof(vhd_batmap_header_t)); + header->batmap_size = secs_round_up_no_zero(map_bytes); + header->batmap_version = VHD_BATMAP_CURRENT_VERSION; + + map_bytes = vhd_sectors_to_bytes(header->batmap_size); + + err = posix_memalign((void **)&ctx->batmap.map, + VHD_SECTOR_SIZE, map_bytes); + if (err) { + ctx->batmap.map = NULL; + return -err; + } + + memset(ctx->batmap.map, 0, map_bytes); + + return vhd_write_batmap(ctx, &ctx->batmap); +} + +static int +vhd_create_bat(vhd_context_t *ctx) +{ + int i, err; + size_t size; + + if (!vhd_type_dynamic(ctx)) + return -EINVAL; + + size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t)); + err = posix_memalign((void **)&ctx->bat.bat, VHD_SECTOR_SIZE, size); + if (err) { + ctx->bat.bat = NULL; + return err; + } + + memset(ctx->bat.bat, 0, size); + for (i = 0; 
i < ctx->header.max_bat_size; i++) + ctx->bat.bat[i] = DD_BLK_UNUSED; + + err = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET); + if (err) + return err; + + ctx->bat.entries = ctx->header.max_bat_size; + ctx->bat.spb = ctx->header.block_size >> VHD_SECTOR_SHIFT; + + return vhd_write_bat(ctx, &ctx->bat); +} + +static int +vhd_initialize_fixed_disk(vhd_context_t *ctx) +{ + char *buf; + int i, err; + + if (ctx->footer.type != HD_TYPE_FIXED) + return -EINVAL; + + err = vhd_seek(ctx, 0, SEEK_SET); + if (err) + return err; + + buf = mmap(0, VHD_BLOCK_SIZE, PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (buf == MAP_FAILED) + return -errno; + + for (i = 0; i < ctx->footer.curr_size >> VHD_BLOCK_SHIFT; i++) { + err = vhd_write(ctx, buf, VHD_BLOCK_SIZE); + if (err) + goto out; + } + + err = 0; + +out: + munmap(buf, VHD_BLOCK_SIZE); + return err; +} + +int +vhd_get_phys_size(vhd_context_t *ctx, off64_t *size) +{ + int err; + + if ((err = vhd_end_of_data(ctx, size))) + return err; + *size += sizeof(vhd_footer_t); + return 0; +} + +int +vhd_set_phys_size(vhd_context_t *ctx, off64_t size) +{ + off64_t phys_size; + int err; + + err = vhd_get_phys_size(ctx, &phys_size); + if (err) + return err; + if (size < phys_size) { + // would result in data loss + VHDLOG("ERROR: new size (%"PRIu64") < phys size (%"PRIu64")\n", + size, phys_size); + return -EINVAL; + } + return vhd_write_footer_at(ctx, &ctx->footer, + size - sizeof(vhd_footer_t)); +} + +static int +__vhd_create(const char *name, const char *parent, uint64_t bytes, int type, + vhd_flag_creat_t flags) +{ + int err; + off64_t off; + vhd_context_t ctx; + vhd_footer_t *footer; + vhd_header_t *header; + uint64_t size, blks; + + switch (type) { + case HD_TYPE_DIFF: + if (!parent) + return -EINVAL; + case HD_TYPE_FIXED: + case HD_TYPE_DYNAMIC: + break; + default: + return -EINVAL; + } + + if (strnlen(name, VHD_MAX_NAME_LEN - 1) == VHD_MAX_NAME_LEN - 1) + return -ENAMETOOLONG; + + memset(&ctx, 0, sizeof(vhd_context_t)); + 
footer = &ctx.footer; + header = &ctx.header; + blks = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT; + size = blks << VHD_BLOCK_SHIFT; + + ctx.fd = open(name, O_WRONLY | O_CREAT | + O_TRUNC | O_LARGEFILE | O_DIRECT, 0644); + if (ctx.fd == -1) + return -errno; + + ctx.file = strdup(name); + if (!ctx.file) { + err = -ENOMEM; + goto out; + } + + err = vhd_test_file_fixed(ctx.file, &ctx.is_block); + if (err) + goto out; + + vhd_initialize_footer(&ctx, type, size); + + if (type == HD_TYPE_FIXED) { + err = vhd_initialize_fixed_disk(&ctx); + if (err) + goto out; + } else { + int raw = vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW); + err = vhd_initialize_header(&ctx, parent, size, raw); + if (err) + goto out; + + err = vhd_write_footer_at(&ctx, &ctx.footer, 0); + if (err) + goto out; + + err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE); + if (err) + goto out; + + err = vhd_create_batmap(&ctx); + if (err) + goto out; + + err = vhd_create_bat(&ctx); + if (err) + goto out; + + if (type == HD_TYPE_DIFF) { + err = vhd_write_parent_locators(&ctx, parent); + if (err) + goto out; + } + + /* write header again since it may have changed */ + err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE); + if (err) + goto out; + } + + err = vhd_seek(&ctx, 0, SEEK_END); + if (err) + goto out; + + off = vhd_position(&ctx); + if (off == (off64_t)-1) { + err = -errno; + goto out; + } + + if (ctx.is_block) + off -= sizeof(vhd_footer_t); + + err = vhd_write_footer_at(&ctx, &ctx.footer, off); + if (err) + goto out; + + err = 0; + +out: + vhd_close(&ctx); + if (err && !ctx.is_block) + unlink(name); + return err; +} + +int +vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t flags) +{ + return __vhd_create(name, NULL, bytes, type, flags); +} + +int +vhd_snapshot(const char *name, uint64_t bytes, const char *parent, + vhd_flag_creat_t flags) +{ + return __vhd_create(name, parent, bytes, HD_TYPE_DIFF, flags); +} + +static int 
+__vhd_io_fixed_read(vhd_context_t *ctx, + char *buf, uint64_t sec, uint32_t secs) +{ + int err; + + err = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET); + if (err) + return err; + + return vhd_read(ctx, buf, vhd_sectors_to_bytes(secs)); +} + +static void +__vhd_io_dynamic_copy_data(vhd_context_t *ctx, + char *map, int map_off, + char *bitmap, int bitmap_off, + char *dst, char *src, int secs) +{ + int i; + + for (i = 0; i < secs; i++) { + if (test_bit(map, map_off + i)) + goto next; + + if (ctx && !vhd_bitmap_test(ctx, bitmap, bitmap_off + i)) + goto next; + + memcpy(dst, src, VHD_SECTOR_SIZE); + set_bit(map, map_off + i); + + next: + src += VHD_SECTOR_SIZE; + dst += VHD_SECTOR_SIZE; + } +} + +static int +__vhd_io_dynamic_read_link(vhd_context_t *ctx, char *map, + char *buf, uint64_t sector, uint32_t secs) +{ + off64_t off; + uint32_t blk, sec; + int err, cnt, map_off; + char *bitmap, *data, *src; + + map_off = 0; + + do { + blk = sector / ctx->spb; + sec = sector % ctx->spb; + off = ctx->bat.bat[blk]; + data = NULL; + bitmap = NULL; + + if (off == DD_BLK_UNUSED) { + cnt = MIN(secs, ctx->spb); + goto next; + } + + err = vhd_read_bitmap(ctx, blk, &bitmap); + if (err) + return err; + + err = vhd_read_block(ctx, blk, &data); + if (err) { + free(bitmap); + return err; + } + + cnt = MIN(secs, ctx->spb - sec); + src = data + vhd_sectors_to_bytes(sec); + + __vhd_io_dynamic_copy_data(ctx, + map, map_off, + bitmap, sec, + buf, src, cnt); + + next: + free(data); + free(bitmap); + + secs -= cnt; + sector += cnt; + map_off += cnt; + buf += vhd_sectors_to_bytes(cnt); + + } while (secs); + + return 0; +} + +static int +__raw_read_link(char *filename, + char *map, char *buf, uint64_t sec, uint32_t secs) +{ + int fd, err; + off64_t off; + uint64_t size; + char *data; + + err = 0; + errno = 0; + fd = open(filename, O_RDONLY | O_DIRECT | O_LARGEFILE); + if (fd == -1) { + VHDLOG("%s: failed to open: %d\n", filename, -errno); + return -errno; + } + + off = lseek64(fd, 
vhd_sectors_to_bytes(sec), SEEK_SET); + if (off == (off64_t)-1) { + VHDLOG("%s: seek(0x%08"PRIx64") failed: %d\n", + filename, vhd_sectors_to_bytes(sec), -errno); + err = -errno; + goto close; + } + + size = vhd_sectors_to_bytes(secs); + err = posix_memalign((void **)&data, VHD_SECTOR_SIZE, size); + if (err) + goto close; + + err = read(fd, data, size); + if (err != size) { + VHDLOG("%s: reading of %"PRIu64" returned %d, errno: %d\n", + filename, size, err, -errno); + free(data); + err = errno ? -errno : -EIO; + goto close; + } + __vhd_io_dynamic_copy_data(NULL, map, 0, NULL, 0, buf, data, secs); + free(data); + err = 0; + +close: + close(fd); + return err; +} + +static int +__vhd_io_dynamic_read(vhd_context_t *ctx, + char *buf, uint64_t sec, uint32_t secs) +{ + int err; + uint32_t i, done; + char *map, *next; + vhd_context_t parent, *vhd; + + err = vhd_get_bat(ctx); + if (err) + return err; + + vhd = ctx; + next = NULL; + map = calloc(1, secs << (VHD_SECTOR_SHIFT - 3)); + if (!map) + return -ENOMEM; + + memset(buf, 0, vhd_sectors_to_bytes(secs)); + + for (;;) { + err = __vhd_io_dynamic_read_link(vhd, map, buf, sec, secs); + if (err) + goto close; + + for (done = 0, i = 0; i < secs; i++) + if (test_bit(map, i)) + done++; + + if (done == secs) { + err = 0; + goto close; + } + + if (vhd->footer.type == HD_TYPE_DIFF) { + err = vhd_parent_locator_get(vhd, &next); + if (err) + goto close; + if (vhd_parent_raw(vhd)) { + err = __raw_read_link(next, map, buf, sec, + secs); + goto close; + } + } else { + err = 0; + goto close; + } + + if (vhd != ctx) + vhd_close(vhd); + vhd = &parent; + + err = vhd_open(vhd, next, VHD_OPEN_RDONLY); + if (err) + goto out; + + err = vhd_get_bat(vhd); + if (err) + goto close; + + free(next); + next = NULL; + } + +close: + if (vhd != ctx) + vhd_close(vhd); +out: + free(map); + free(next); + return err; +} + +int +vhd_io_read(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs) +{ + if (vhd_sectors_to_bytes(sec + secs) > 
ctx->footer.curr_size) + return -ERANGE; + + if (!vhd_type_dynamic(ctx)) + return __vhd_io_fixed_read(ctx, buf, sec, secs); + + return __vhd_io_dynamic_read(ctx, buf, sec, secs); +} + +static int +__vhd_io_fixed_write(vhd_context_t *ctx, + char *buf, uint64_t sec, uint32_t secs) +{ + int err; + + err = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET); + if (err) + return err; + + return vhd_write(ctx, buf, vhd_sectors_to_bytes(secs)); +} + +static int +__vhd_io_allocate_block(vhd_context_t *ctx, uint32_t block) +{ + char *buf; + size_t size; + off64_t off, max; + int i, err, gap, spp; + + spp = getpagesize() >> VHD_SECTOR_SHIFT; + + err = vhd_end_of_data(ctx, &max); + if (err) + return err; + + gap = 0; + off = max; + max >>= VHD_SECTOR_SHIFT; + + /* data region of segment should begin on page boundary */ + if ((max + ctx->bm_secs) % spp) { + gap = (spp - ((max + ctx->bm_secs) % spp)); + max += gap; + } + + err = vhd_seek(ctx, off, SEEK_SET); + if (err) + return err; + + size = vhd_sectors_to_bytes(ctx->spb + ctx->bm_secs + gap); + buf = mmap(0, size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (buf == MAP_FAILED) + return -errno; + + err = vhd_write(ctx, buf, size); + if (err) + goto out; + + ctx->bat.bat[block] = max; + err = vhd_write_bat(ctx, &ctx->bat); + if (err) + goto out; + + err = 0; + +out: + munmap(buf, size); + return err; +} + +static int +__vhd_io_dynamic_write(vhd_context_t *ctx, + char *buf, uint64_t sector, uint32_t secs) +{ + char *map; + off64_t off; + uint32_t blk, sec; + int i, err, cnt, ret; + + if (vhd_sectors_to_bytes(sector + secs) > ctx->footer.curr_size) + return -ERANGE; + + err = vhd_get_bat(ctx); + if (err) + return err; + + if (vhd_has_batmap(ctx)) { + err = vhd_get_batmap(ctx); + if (err) + return err; + } + + do { + blk = sector / ctx->spb; + sec = sector % ctx->spb; + + off = ctx->bat.bat[blk]; + if (off == DD_BLK_UNUSED) { + err = __vhd_io_allocate_block(ctx, blk); + if (err) + return err; + + off = ctx->bat.bat[blk]; + 
} + + off += ctx->bm_secs + sec; + err = vhd_seek(ctx, vhd_sectors_to_bytes(off), SEEK_SET); + if (err) + return err; + + cnt = MIN(secs, ctx->spb - sec); + err = vhd_write(ctx, buf, vhd_sectors_to_bytes(cnt)); + if (err) + return err; + + if (vhd_has_batmap(ctx) && + vhd_batmap_test(ctx, &ctx->batmap, blk)) + goto next; + + err = vhd_read_bitmap(ctx, blk, &map); + if (err) + return err; + + for (i = 0; i < cnt; i++) + vhd_bitmap_set(ctx, map, sec + i); + + err = vhd_write_bitmap(ctx, blk, map); + if (err) + goto fail; + + if (vhd_has_batmap(ctx)) { + for (i = 0; i < ctx->spb; i++) + if (!vhd_bitmap_test(ctx, map, i)) { + free(map); + goto next; + } + + vhd_batmap_set(ctx, &ctx->batmap, blk); + err = vhd_write_batmap(ctx, &ctx->batmap); + if (err) + goto fail; + } + + free(map); + map = NULL; + + next: + secs -= cnt; + sector += cnt; + buf += vhd_sectors_to_bytes(cnt); + } while (secs); + + err = 0; + +out: + ret = vhd_write_footer(ctx, &ctx->footer); + return (err ? err : ret); + +fail: + free(map); + goto out; +} + +int +vhd_io_write(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs) +{ + if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size) + return -ERANGE; + + if (!vhd_type_dynamic(ctx)) + return __vhd_io_fixed_write(ctx, buf, sec, secs); + + return __vhd_io_dynamic_write(ctx, buf, sec, secs); +} diff --git a/tools/blktap2/vhd/lib/relative-path.c b/tools/blktap2/vhd/lib/relative-path.c new file mode 100644 index 0000000000..8b7cb71fc9 --- /dev/null +++ b/tools/blktap2/vhd/lib/relative-path.c @@ -0,0 +1,299 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include "relative-path.h" + +#define sfree(ptr) \ +do { \ + free(ptr); \ + ptr = NULL; \ +} while (0) + +/* + * count number of tokens between DELIMETER characters + */ +static int +count_nodes(char *path) +{ + int i; + char *tmp; + + if (!path) + return 0; + + for (i = 0, tmp = path; *tmp != '\0'; tmp++) + if (*tmp == DELIMITER) + i++; + + return i; +} + +/* + * return copy of next node in @path, or NULL + * @path is moved to the end of the next node + * @err is set to -errno on failure + * copy should be freed + */ +static char * +next_node(char **path, int *err) +{ + int ret; + char *tmp, *start; + + if (!path || !*path) { + *err = -EINVAL; + return NULL; + } + + *err = 0; + start = *path; + + for (tmp = *path; *tmp != '\0'; tmp++) + if (*tmp == DELIMITER) { + int size; + char *node; + + size = tmp - start + 1; + node = malloc(size); + if (!node) { + *err = -ENOMEM; + return NULL; + } + + ret = snprintf(node, size, "%s", start); + if (ret < 0) { + free(node); + *err = -EINVAL; + return NULL; + } + + *path = tmp; + return node; + } + + return NULL; +} + +/* + * count number of nodes in common betwee @to and @from + * returns number of common nodes, or -errno on failure + */ +static int +count_common_nodes(char *to, char *from) +{ + int err, common; + char *to_node, *from_node; + + if (!to || !from) + return -EINVAL; + + err = 0; + common = 0; + to_node = NULL; + from_node = NULL; + + do { + to_node = next_node(&to, &err); + if (err || !to_node) + break; + + from_node = next_node(&from, &err); + if (err || !from_node) + break; + + if (strncmp(to_node, from_node, MAX_NAME_LEN)) + break; + + ++to; + ++from; + ++common; + sfree(to_node); + sfree(from_node); + + } while (1); + + sfree(to_node); + sfree(from_node); + + if (err) + return err; + + return common; +} + +/* + * construct path of @count '../', './' if @count is zero, or NULL on error + * result should be freed + */ 
+static char * +up_nodes(int count) +{ + char *path, *tmp; + int i, ret, len, size; + + if (!count) + return strdup("./"); + + len = strlen("../"); + size = len * count; + if (size >= MAX_NAME_LEN) + return NULL; + + path = malloc(size + 1); + if (!path) + return NULL; + + tmp = path; + for (i = 0; i < count; i++) { + ret = sprintf(tmp, "../"); + if (ret < 0 || ret != len) { + free(path); + return NULL; + } + tmp += ret; + } + + return path; +} + +/* + * return pointer to @offset'th node of path or NULL on error + */ +static char * +node_offset(char *from, int offset) +{ + char *path; + + if (!from || !offset) + return NULL; + + for (path = from; *path != '\0'; path++) { + if (*path == DELIMITER) + if (--offset == 0) + return path + 1; + } + + return NULL; +} + +/* + * return a relative path from @from to @to + * result should be freed + */ +char * +relative_path_to(char *from, char *to, int *err) +{ + int from_nodes, common; + char *to_absolute, *from_absolute; + char *up, *common_target_path, *relative_path; + + *err = 0; + up = NULL; + to_absolute = NULL; + from_absolute = NULL; + relative_path = NULL; + + if (strnlen(to, MAX_NAME_LEN) == MAX_NAME_LEN || + strnlen(from, MAX_NAME_LEN) == MAX_NAME_LEN) { + EPRINTF("invalid input; max path length is %d\n", + MAX_NAME_LEN); + *err = -ENAMETOOLONG; + return NULL; + } + + to_absolute = realpath(to, NULL); + if (!to_absolute) { + EPRINTF("failed to get absolute path of %s\n", to); + *err = -errno; + goto out; + } + + from_absolute = realpath(from, NULL); + if (!from_absolute) { + EPRINTF("failed to get absolute path of %s\n", from); + *err = -errno; + goto out; + } + + if (strnlen(to_absolute, MAX_NAME_LEN) == MAX_NAME_LEN || + strnlen(from_absolute, MAX_NAME_LEN) == MAX_NAME_LEN) { + EPRINTF("invalid input; max path length is %d\n", + MAX_NAME_LEN); + *err = -ENAMETOOLONG; + goto out; + } + + /* count nodes in source path */ + from_nodes = count_nodes(from_absolute); + + /* count nodes in common */ + common = 
count_common_nodes(to_absolute + 1, from_absolute + 1); + if (common < 0) { + EPRINTF("failed to count common nodes of %s and %s: %d\n", + to_absolute, from_absolute, common); + *err = common; + goto out; + } + + /* move up to common node */ + up = up_nodes(from_nodes - common - 1); + if (!up) { + EPRINTF("failed to allocate relative path for %s: %d\n", + from_absolute, -ENOMEM); + *err = -ENOMEM; + goto out; + } + + /* get path from common node to target */ + common_target_path = node_offset(to_absolute, common + 1); + if (!common_target_path) { + EPRINTF("failed to find common target path to %s: %d\n", + to_absolute, -EINVAL); + *err = -EINVAL; + goto out; + } + + /* get relative path */ + if (asprintf(&relative_path, "%s%s", up, common_target_path) == -1) { + EPRINTF("failed to construct final path %s%s: %d\n", + up, common_target_path, -ENOMEM); + relative_path = NULL; + *err = -ENOMEM; + goto out; + } + +out: + sfree(up); + sfree(to_absolute); + sfree(from_absolute); + + return relative_path; +} diff --git a/tools/blktap2/vhd/lib/vhd-util-check.c b/tools/blktap2/vhd/lib/vhd-util-check.c new file mode 100644 index 0000000000..d7d588088a --- /dev/null +++ b/tools/blktap2/vhd/lib/vhd-util-check.c @@ -0,0 +1,977 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <time.h> +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <libgen.h> +#include <inttypes.h> +#include <sys/stat.h> + +#include "libvhd.h" +#include "vhd-util.h" + +// allow the VHD timestamp to be at most this many seconds into the future to +// account for time skew with NFS servers +#define TIMESTAMP_MAX_SLACK 1800 + +static int +vhd_util_check_zeros(void *buf, size_t size) +{ + int i; + char *p; + + p = buf; + for (i = 0; i < size; i++) + if (p[i]) + return i; + + return 0; +} + +static int +vhd_util_check_footer_opened(vhd_footer_t *footer) +{ + int i, n; + uint32_t *buf; + + buf = (uint32_t *)footer; + n = sizeof(*footer) / sizeof(uint32_t); + + for (i = 0; i < n; i++) + if (buf[i] != 0xc7c7c7c7) + return 0; + + return 1; +} + +static char * +vhd_util_check_validate_footer(vhd_footer_t *footer) +{ + int size; + uint32_t checksum, now; + + size = sizeof(footer->cookie); + if (memcmp(footer->cookie, HD_COOKIE, size)) + return "invalid cookie"; + + 
checksum = vhd_checksum_footer(footer); + if (checksum != footer->checksum) { + if (footer->hidden && + !strncmp(footer->crtr_app, "tap", 3) && + (footer->crtr_ver == VHD_VERSION(0, 1) || + footer->crtr_ver == VHD_VERSION(1, 1))) { + char tmp = footer->hidden; + footer->hidden = 0; + checksum = vhd_checksum_footer(footer); + footer->hidden = tmp; + + if (checksum == footer->checksum) + goto ok; + } + + return "invalid checksum"; + } + +ok: + if (!(footer->features & HD_RESERVED)) + return "invalid 'reserved' feature"; + + if (footer->features & ~(HD_TEMPORARY | HD_RESERVED)) + return "invalid extra features"; + + if (footer->ff_version != HD_FF_VERSION) + return "invalid file format version"; + + if (footer->type != HD_TYPE_DYNAMIC && + footer->type != HD_TYPE_DIFF && + footer->data_offset != ~(0ULL)) + return "invalid data offset"; + + now = vhd_time(time(NULL)); + if (footer->timestamp > now + TIMESTAMP_MAX_SLACK) + return "creation time in future"; + + if (!strncmp(footer->crtr_app, "tap", 3) && + footer->crtr_ver > VHD_CURRENT_VERSION) + return "unsupported tap creator version"; + + if (vhd_chs(footer->curr_size) < footer->geometry) + return "geometry too large"; + + if (footer->type != HD_TYPE_FIXED && + footer->type != HD_TYPE_DYNAMIC && + footer->type != HD_TYPE_DIFF) + return "invalid type"; + + if (footer->saved && footer->saved != 1) + return "invalid 'saved' state"; + + if (footer->hidden && footer->hidden != 1) + return "invalid 'hidden' state"; + + if (vhd_util_check_zeros(footer->reserved, + sizeof(footer->reserved))) + return "invalid 'reserved' bits"; + + return NULL; +} + +static char * +vhd_util_check_validate_header(int fd, vhd_header_t *header) +{ + off64_t eof; + int i, cnt, size; + uint32_t checksum; + + size = sizeof(header->cookie); + if (memcmp(header->cookie, DD_COOKIE, size)) + return "invalid cookie"; + + checksum = vhd_checksum_header(header); + if (checksum != header->checksum) + return "invalid checksum"; + + if (header->hdr_ver != 
0x00010000) + return "invalid header version"; + + if (header->data_offset != ~(0ULL)) + return "invalid data offset"; + + eof = lseek64(fd, 0, SEEK_END); + if (eof == (off64_t)-1) + return "error finding eof"; + + if (header->table_offset <= 0 || + header->table_offset % 512 || + (header->table_offset + + (header->max_bat_size * sizeof(uint32_t)) > + eof - sizeof(vhd_footer_t))) + return "invalid table offset"; + + for (cnt = 0, i = 0; i < sizeof(header->block_size) * 8; i++) + if ((header->block_size >> i) & 1) + cnt++; + + if (cnt != 1) + return "invalid block size"; + + if (header->res1) + return "invalid reserved bits"; + + if (vhd_util_check_zeros(header->res2, sizeof(header->res2))) + return "invalid reserved bits"; + + return NULL; +} + +static char * +vhd_util_check_validate_differencing_header(vhd_context_t *vhd) +{ + vhd_header_t *header; + + header = &vhd->header; + + if (vhd->footer.type == HD_TYPE_DIFF) { + char *parent; + uint32_t now; + + now = vhd_time(time(NULL)); + if (header->prt_ts > now + TIMESTAMP_MAX_SLACK) + return "parent creation time in future"; + + if (vhd_header_decode_parent(vhd, header, &parent)) + return "invalid parent name"; + + free(parent); + } else { + if (vhd_util_check_zeros(header->prt_name, + sizeof(header->prt_name))) + return "invalid non-null parent name"; + + if (vhd_util_check_zeros(header->loc, sizeof(header->loc))) + return "invalid non-null parent locators"; + + if (!uuid_is_null(header->prt_uuid)) + return "invalid non-null parent uuid"; + + if (header->prt_ts) + return "invalid non-zero parent timestamp"; + } + + return NULL; +} + +static char * +vhd_util_check_validate_batmap(vhd_context_t *vhd, vhd_batmap_t *batmap) +{ + int size; + off64_t eof; + uint32_t checksum; + + size = sizeof(batmap->header.cookie); + if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, size)) + return "invalid cookie"; + + if (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION) + return "unsupported batmap version"; + + checksum 
= vhd_checksum_batmap(batmap); + if (checksum != batmap->header.checksum) + return "invalid checksum"; + + if (!batmap->header.batmap_size) + return "invalid size zero"; + + eof = lseek64(vhd->fd, 0, SEEK_END); + if (eof == (off64_t)-1) + return "error finding eof"; + + if (!batmap->header.batmap_offset || + batmap->header.batmap_offset % 512) + return "invalid batmap offset"; + + if ((batmap->header.batmap_offset + + vhd_sectors_to_bytes(batmap->header.batmap_size)) > + eof - sizeof(vhd_footer_t)) + return "invalid batmap size"; + + return NULL; +} + +static char * +vhd_util_check_validate_parent_locator(vhd_context_t *vhd, + vhd_parent_locator_t *loc) +{ + off64_t eof; + + if (vhd_validate_platform_code(loc->code)) + return "invalid platform code"; + + if (loc->code == PLAT_CODE_NONE) { + if (vhd_util_check_zeros(loc, sizeof(*loc))) + return "non-zero locator"; + + return NULL; + } + + if (!loc->data_offset) + return "invalid data offset"; + + if (!loc->data_space) + return "invalid data space"; + + if (!loc->data_len) + return "invalid data length"; + + eof = lseek64(vhd->fd, 0, SEEK_END); + if (eof == (off64_t)-1) + return "error finding eof"; + + if (loc->data_offset + vhd_parent_locator_size(loc) > + eof - sizeof(vhd_footer_t)) + return "invalid size"; + + if (loc->res) + return "invalid reserved bits"; + + return NULL; +} + +static char * +vhd_util_check_validate_parent(vhd_context_t *vhd, const char *ppath) +{ + char *msg; + vhd_context_t parent; + + msg = NULL; + + if (vhd_parent_raw(vhd)) + return msg; + + if (vhd_open(&parent, ppath, + VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED)) + return "error opening parent"; + + if (uuid_compare(vhd->header.prt_uuid, parent.footer.uuid)) { + msg = "invalid parent uuid"; + goto out; + } + +out: + vhd_close(&parent); + return msg; +} + +static int +vhd_util_check_footer(int fd, vhd_footer_t *footer, int ignore) +{ + size_t size; + int err, opened; + char *msg, *buf; + off64_t eof, off; + vhd_footer_t primary, backup; + 
+ memset(&primary, 0, sizeof(primary)); + memset(&backup, 0, sizeof(backup)); + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(primary)); + if (err) { + printf("error allocating buffer: %d\n", err); + return -err; + } + + memset(buf, 0, sizeof(primary)); + + eof = lseek64(fd, 0, SEEK_END); + if (eof == (off64_t)-1) { + err = -errno; + printf("error calculating end of file: %d\n", err); + goto out; + } + + size = ((eof % 512) ? 511 : 512); + eof = lseek64(fd, eof - size, SEEK_SET); + if (eof == (off64_t)-1) { + err = -errno; + printf("error calculating end of file: %d\n", err); + goto out; + } + + err = read(fd, buf, 512); + if (err != size) { + err = (errno ? -errno : -EIO); + printf("error reading primary footer: %d\n", err); + goto out; + } + + memcpy(&primary, buf, sizeof(primary)); + opened = vhd_util_check_footer_opened(&primary); + vhd_footer_in(&primary); + + msg = vhd_util_check_validate_footer(&primary); + if (msg) { + if (opened && ignore) + goto check_backup; + + err = -EINVAL; + printf("primary footer invalid: %s\n", msg); + goto out; + } + + if (primary.type == HD_TYPE_FIXED) { + err = 0; + goto out; + } + +check_backup: + off = lseek64(fd, 0, SEEK_SET); + if (off == (off64_t)-1) { + err = -errno; + printf("error seeking to backup footer: %d\n", err); + goto out; + } + + size = 512; + memset(buf, 0, sizeof(primary)); + + err = read(fd, buf, size); + if (err != size) { + err = (errno ? 
-errno : -EIO); + printf("error reading backup footer: %d\n", err); + goto out; + } + + memcpy(&backup, buf, sizeof(backup)); + vhd_footer_in(&backup); + + msg = vhd_util_check_validate_footer(&backup); + if (msg) { + err = -EINVAL; + printf("backup footer invalid: %s\n", msg); + goto out; + } + + if (memcmp(&primary, &backup, sizeof(primary))) { + if (opened && ignore) { + memcpy(&primary, &backup, sizeof(primary)); + goto ok; + } + + if (backup.hidden && + !strncmp(backup.crtr_app, "tap", 3) && + (backup.crtr_ver == VHD_VERSION(0, 1) || + backup.crtr_ver == VHD_VERSION(1, 1))) { + char cmp, tmp = backup.hidden; + backup.hidden = 0; + cmp = memcmp(&primary, &backup, sizeof(primary)); + backup.hidden = tmp; + if (!cmp) + goto ok; + } + + err = -EINVAL; + printf("primary and backup footers do not match\n"); + goto out; + } + +ok: + err = 0; + memcpy(footer, &primary, sizeof(primary)); + +out: + free(buf); + return err; +} + +static int +vhd_util_check_header(int fd, vhd_footer_t *footer) +{ + int err; + off64_t off; + char *msg, *buf; + vhd_header_t header; + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(header)); + if (err) { + printf("error allocating header: %d\n", err); + return err; + } + + off = footer->data_offset; + off = lseek64(fd, off, SEEK_SET); + if (off == (off64_t)-1) { + err = -errno; + printf("error seeking to header: %d\n", err); + goto out; + } + + err = read(fd, buf, sizeof(header)); + if (err != sizeof(header)) { + err = (errno ? 
-errno : -EIO); + printf("error reading header: %d\n", err); + goto out; + } + + memcpy(&header, buf, sizeof(header)); + vhd_header_in(&header); + + msg = vhd_util_check_validate_header(fd, &header); + if (msg) { + err = -EINVAL; + printf("header is invalid: %s\n", msg); + goto out; + } + + err = 0; + +out: + free(buf); + return err; +} + +static int +vhd_util_check_differencing_header(vhd_context_t *vhd) +{ + char *msg; + + msg = vhd_util_check_validate_differencing_header(vhd); + if (msg) { + printf("differencing header is invalid: %s\n", msg); + return -EINVAL; + } + + return 0; +} + +static int +vhd_util_check_bat(vhd_context_t *vhd) +{ + off64_t eof, eoh; + int i, j, err, block_size; + + err = vhd_seek(vhd, 0, SEEK_END); + if (err) { + printf("error calculating eof: %d\n", err); + return err; + } + + eof = vhd_position(vhd); + if (eof == (off64_t)-1) { + printf("error calculating eof: %d\n", -errno); + return -errno; + } + + /* adjust eof for vhds with short footers */ + if (eof % 512) { + if (eof % 512 != 511) { + printf("invalid file size: 0x%"PRIx64"\n", eof); + return -EINVAL; + } + + eof++; + } + + err = vhd_get_bat(vhd); + if (err) { + printf("error reading bat: %d\n", err); + return err; + } + + err = vhd_end_of_headers(vhd, &eoh); + if (err) { + printf("error calculating end of metadata: %d\n", err); + return err; + } + + eof -= sizeof(vhd_footer_t); + eof >>= VHD_SECTOR_SHIFT; + eoh >>= VHD_SECTOR_SHIFT; + block_size = vhd->spb + vhd->bm_secs; + + for (i = 0; i < vhd->header.max_bat_size; i++) { + uint32_t off = vhd->bat.bat[i]; + if (off == DD_BLK_UNUSED) + continue; + + if (off < eoh) { + printf("block %d (offset 0x%x) clobbers headers\n", + i, off); + return -EINVAL; + } + + if (off + block_size > eof) { + printf("block %d (offset 0x%x) clobbers footer\n", + i, off); + return -EINVAL; + } + + for (j = 0; j < vhd->header.max_bat_size; j++) { + uint32_t joff = vhd->bat.bat[j]; + + if (i == j) + continue; + + if (joff == DD_BLK_UNUSED) + continue; + + 
if (off == joff) + err = -EINVAL; + + if (off > joff && off < joff + block_size) + err = -EINVAL; + + if (off + block_size > joff && + off + block_size < joff + block_size) + err = -EINVAL; + + if (err) { + printf("block %d (offset 0x%x) clobbers " + "block %d (offset 0x%x)\n", + i, off, j, joff); + return err; + } + } + } + + return 0; +} + +static int +vhd_util_check_batmap(vhd_context_t *vhd) +{ + char *msg; + int i, err; + + err = vhd_get_bat(vhd); + if (err) { + printf("error reading bat: %d\n", err); + return err; + } + + err = vhd_get_batmap(vhd); + if (err) { + printf("error reading batmap: %d\n", err); + return err; + } + + msg = vhd_util_check_validate_batmap(vhd, &vhd->batmap); + if (msg) { + printf("batmap is invalid: %s\n", msg); + return -EINVAL; + } + + for (i = 0; i < vhd->header.max_bat_size; i++) { + if (!vhd_batmap_test(vhd, &vhd->batmap, i)) + continue; + + if (vhd->bat.bat[i] == DD_BLK_UNUSED) { + printf("batmap shows unallocated block %d full\n", i); + return -EINVAL; + } + } + + return 0; +} + +static int +vhd_util_check_parent_locators(vhd_context_t *vhd) +{ + int i, n, err; + vhd_parent_locator_t *loc; + char *msg, *file, *ppath, *location, *pname; + int mac, macx, w2ku, w2ru, wi2r, wi2k, found; + + mac = 0; + macx = 0; + w2ku = 0; + w2ru = 0; + wi2r = 0; + wi2k = 0; + found = 0; + pname = NULL; + ppath = NULL; + location = NULL; + + err = vhd_header_decode_parent(vhd, &vhd->header, &pname); + if (err) { + printf("error decoding parent name: %d\n", err); + return err; + } + + n = sizeof(vhd->header.loc) / sizeof(vhd->header.loc[0]); + for (i = 0; i < n; i++) { + ppath = NULL; + location = NULL; + loc = vhd->header.loc + i; + + msg = vhd_util_check_validate_parent_locator(vhd, loc); + if (msg) { + err = -EINVAL; + printf("invalid parent locator %d: %s\n", i, msg); + goto out; + } + + if (loc->code == PLAT_CODE_NONE) + continue; + + switch (loc->code) { + case PLAT_CODE_MACX: + if (macx++) + goto dup; + break; + + case PLAT_CODE_MAC: + if 
(mac++) + goto dup; + break; + + case PLAT_CODE_W2KU: + if (w2ku++) + goto dup; + break; + + case PLAT_CODE_W2RU: + if (w2ru++) + goto dup; + break; + + case PLAT_CODE_WI2R: + if (wi2r++) + goto dup; + break; + + case PLAT_CODE_WI2K: + if (wi2k++) + goto dup; + break; + + default: + err = -EINVAL; + printf("invalid platform code for locator %d\n", i); + goto out; + } + + if (loc->code != PLAT_CODE_MACX && + loc->code != PLAT_CODE_W2RU && + loc->code != PLAT_CODE_W2KU) + continue; + + err = vhd_parent_locator_read(vhd, loc, &ppath); + if (err) { + printf("error reading parent locator %d: %d\n", i, err); + goto out; + } + + file = basename(ppath); + if (strcmp(pname, file)) { + err = -EINVAL; + printf("parent locator %d name (%s) does not match " + "header name (%s)\n", i, file, pname); + goto out; + } + + err = vhd_find_parent(vhd, ppath, &location); + if (err) { + printf("error resolving %s: %d\n", ppath, err); + goto out; + } + + err = access(location, R_OK); + if (err && loc->code == PLAT_CODE_MACX) { + err = -errno; + printf("parent locator %d points to missing file %s " + "(resolved to %s)\n", i, ppath, location); + goto out; + } + + msg = vhd_util_check_validate_parent(vhd, location); + if (msg) { + err = -EINVAL; + printf("invalid parent %s: %s\n", location, msg); + goto out; + } + + found++; + free(ppath); + free(location); + ppath = NULL; + location = NULL; + + continue; + + dup: + printf("duplicate platform code in locator %d: 0x%x\n", + i, loc->code); + err = -EINVAL; + goto out; + } + + if (!found) { + err = -EINVAL; + printf("could not find parent %s\n", pname); + goto out; + } + + err = 0; + +out: + free(pname); + free(ppath); + free(location); + return err; +} + +static void +vhd_util_dump_headers(const char *name) +{ + char *argv[] = { "read", "-p", "-n", (char *)name }; + int argc = sizeof(argv) / sizeof(argv[0]); + + printf("%s appears invalid; dumping metadata\n", name); + vhd_util_read(argc, argv); +} + +static int +vhd_util_check_vhd(const char 
*name, int ignore) +{ + int fd, err; + vhd_context_t vhd; + struct stat stats; + vhd_footer_t footer; + + fd = -1; + memset(&vhd, 0, sizeof(vhd)); + + err = stat(name, &stats); + if (err == -1) { + printf("cannot stat %s: %d\n", name, errno); + return -errno; + } + + if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) { + printf("%s is not a regular file or block device\n", name); + return -EINVAL; + } + + fd = open(name, O_RDONLY | O_DIRECT | O_LARGEFILE); + if (fd == -1) { + printf("error opening %s\n", name); + return -errno; + } + + err = vhd_util_check_footer(fd, &footer, ignore); + if (err) + goto out; + + if (footer.type != HD_TYPE_DYNAMIC && footer.type != HD_TYPE_DIFF) + goto out; + + err = vhd_util_check_header(fd, &footer); + if (err) + goto out; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED); + if (err) + goto out; + + err = vhd_util_check_differencing_header(&vhd); + if (err) + goto out; + + err = vhd_util_check_bat(&vhd); + if (err) + goto out; + + if (vhd_has_batmap(&vhd)) { + err = vhd_util_check_batmap(&vhd); + if (err) + goto out; + } + + if (vhd.footer.type == HD_TYPE_DIFF) { + err = vhd_util_check_parent_locators(&vhd); + if (err) + goto out; + } + + err = 0; + printf("%s is valid\n", name); + +out: + if (err) + vhd_util_dump_headers(name); + if (fd != -1) + close(fd); + vhd_close(&vhd); + return err; +} + +static int +vhd_util_check_parents(const char *name, int ignore) +{ + int err; + vhd_context_t vhd; + char *cur, *parent; + + cur = (char *)name; + + for (;;) { + err = vhd_open(&vhd, cur, + VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED); + if (err) + goto out; + + if (vhd.footer.type != HD_TYPE_DIFF || vhd_parent_raw(&vhd)) { + vhd_close(&vhd); + goto out; + } + + err = vhd_parent_locator_get(&vhd, &parent); + vhd_close(&vhd); + + if (err) { + printf("error getting parent: %d\n", err); + goto out; + } + + if (cur != name) + free(cur); + cur = parent; + + err = vhd_util_check_vhd(cur, ignore); + if (err) + goto 
out; + } + +out: + if (err) + printf("error checking parents: %d\n", err); + if (cur != name) + free(cur); + return err; +} + +int +vhd_util_check(int argc, char **argv) +{ + char *name; + vhd_context_t vhd; + int c, err, ignore, parents; + + if (!argc || !argv) { + err = -EINVAL; + goto usage; + } + + ignore = 0; + parents = 0; + name = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "n:iph")) != -1) { + switch (c) { + case 'n': + name = optarg; + break; + case 'i': + ignore = 1; + break; + case 'p': + parents = 1; + break; + case 'h': + err = 0; + goto usage; + default: + err = -EINVAL; + goto usage; + } + } + + if (!name || optind != argc) { + err = -EINVAL; + goto usage; + } + + err = vhd_util_check_vhd(name, ignore); + if (err) + goto out; + + if (parents) + err = vhd_util_check_parents(name, ignore); + +out: + return err; + +usage: + printf("options: -n <file> [-i ignore missing primary footers] " + "[-p check parents] [-h help]\n"); + return err; +} diff --git a/tools/blktap2/vhd/lib/vhd-util-coalesce.c b/tools/blktap2/vhd/lib/vhd-util-coalesce.c new file mode 100644 index 0000000000..f6461fc687 --- /dev/null +++ b/tools/blktap2/vhd/lib/vhd-util-coalesce.c @@ -0,0 +1,218 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +static int +__raw_io_write(int fd, char* buf, uint64_t sec, uint32_t secs) +{ + off64_t off; + size_t ret; + + errno = 0; + off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET); + if (off == (off64_t)-1) { + printf("raw parent: seek(0x%08"PRIx64") failed: %d\n", + vhd_sectors_to_bytes(sec), -errno); + return -errno; + } + + ret = write(fd, buf, vhd_sectors_to_bytes(secs)); + if (ret == vhd_sectors_to_bytes(secs)) + return 0; + + printf("raw parent: write of 0x%"PRIx64" returned %zd, errno: %d\n", + vhd_sectors_to_bytes(secs), ret, -errno); + return (errno ? 
-errno : -EIO);
+}
+
+/*
+ * Coalesce a single block of the child 'vhd' into its parent.
+ *
+ * Use 'parent' if the parent is VHD, and 'parent_fd' if the parent is raw
+ * (the raw path is taken whenever parent->file is NULL).
+ *
+ * Blocks that are unused in the child's BAT are skipped.  A block flagged in
+ * the child's batmap is written to the parent whole; otherwise only the runs
+ * of sectors set in the block's bitmap are written.
+ *
+ * Returns 0 on success.  A posix_memalign() failure is returned negated;
+ * any other failure returns the error of the call that failed.
+ */
+static int
+vhd_util_coalesce_block(vhd_context_t *vhd, vhd_context_t *parent,
+			int parent_fd, uint64_t block)
+{
+	int i, err;
+	char *buf, *map;
+	uint64_t sec, secs;
+
+	buf = NULL;
+	map = NULL;
+	sec = block * vhd->spb;
+
+	if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+		return 0;
+
+	err = posix_memalign((void **)&buf, 4096, vhd->header.block_size);
+	if (err)
+		return -err;
+
+	err = vhd_io_read(vhd, buf, sec, vhd->spb);
+	if (err)
+		goto done;
+
+	if (vhd_has_batmap(vhd) && vhd_batmap_test(vhd, &vhd->batmap, block)) {
+		if (parent->file)
+			err = vhd_io_write(parent, buf, sec, vhd->spb);
+		else
+			err = __raw_io_write(parent_fd, buf, sec, vhd->spb);
+		goto done;
+	}
+
+	err = vhd_read_bitmap(vhd, block, &map);
+	if (err)
+		goto done;
+
+	for (i = 0; i < vhd->spb; i++) {
+		if (!vhd_bitmap_test(vhd, map, i))
+			continue;
+
+		/* measure the run of consecutive set sectors starting at i */
+		for (secs = 0; i + secs < vhd->spb; secs++)
+			if (!vhd_bitmap_test(vhd, map, i + secs))
+				break;
+
+		if (parent->file)
+			err = vhd_io_write(parent,
+					   buf + vhd_sectors_to_bytes(i),
+					   sec + i, secs);
+		else
+			err = __raw_io_write(parent_fd,
+					     buf + vhd_sectors_to_bytes(i),
+					     sec + i, secs);
+		if (err)
+			goto done;
+
+		/*
+		 * Jump past the run; the loop's own i++ then steps over the
+		 * clear sector (or the end of the block) that terminated it.
+		 */
+		i += secs;
+	}
+
+	err = 0;
+
+done:
+	free(buf);
+	free(map);
+	return err;
+}
+
+/*
+ * 'coalesce' sub-command: walk the BAT of the VHD named by -n and copy each
+ * block into its parent, which is opened either as a VHD or, when
+ * vhd_parent_raw() reports a raw parent, as a plain file descriptor.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error of the first
+ * step that failed.
+ */
+int
+vhd_util_coalesce(int argc, char **argv)
+{
+	int err, c;
+	uint64_t i;
+	char *name, *pname;
+	vhd_context_t vhd, parent;
+	int parent_fd = -1;
+
+	name = NULL;
+	pname = NULL;
+	/* parent.file == NULL is what marks the parent as raw later on */
+	parent.file = NULL;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:h")) != -1) {
+		switch (c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (!name || optind != argc)
+		goto usage;
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	err = vhd_parent_locator_get(&vhd, &pname);
+	if (err) {
+		printf("error finding %s parent: %d\n", name, err);
+		vhd_close(&vhd);
+		return err;
+	}
+
+	if (vhd_parent_raw(&vhd)) {
+		parent_fd = open(pname, O_RDWR | O_DIRECT | O_LARGEFILE, 0644);
+		if (parent_fd == -1) {
+			err = -errno;
+			printf("failed to open parent %s: %d\n", pname, err);
+			/* pname is ours from here on; do not leak it */
+			free(pname);
+			vhd_close(&vhd);
+			return err;
+		}
+	} else {
+		err = vhd_open(&parent, pname, VHD_OPEN_RDWR);
+		if (err) {
+			printf("error opening %s: %d\n", pname, err);
+			free(pname);
+			vhd_close(&vhd);
+			return err;
+		}
+	}
+
+	err = vhd_get_bat(&vhd);
+	if (err)
+		goto done;
+
+	if (vhd_has_batmap(&vhd)) {
+		err = vhd_get_batmap(&vhd);
+		if (err)
+			goto done;
+	}
+
+	for (i = 0; i < vhd.bat.entries; i++) {
+		err = vhd_util_coalesce_block(&vhd, &parent, parent_fd, i);
+		if (err)
+			goto done;
+	}
+
+	err = 0;
+
+ done:
+	free(pname);
+	vhd_close(&vhd);
+	if (parent.file)
+		vhd_close(&parent);
+	else
+		close(parent_fd);
+	return err;
+
+usage:
+	printf("options: <-n name> [-h help]\n");
+	return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-create.c b/tools/blktap2/vhd/lib/vhd-util-create.c
new file mode 100644
index 0000000000..a9bdf05fee
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-create.c
@@ -0,0 +1,80 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * 'create' sub-command: create the VHD named by -n with the size given by -s
+ * in megabytes.  The image is dynamic unless -r is given, in which case it
+ * is created as a fixed image.
+ *
+ * Returns the result of vhd_create(), or -EINVAL on bad usage, including a
+ * -s argument that does not start with a number or that cannot be expressed
+ * in bytes.
+ */
+int
+vhd_util_create(int argc, char **argv)
+{
+	char *name, *end;
+	uint64_t size;
+	int c, sparse, err;
+	vhd_flag_creat_t flags;
+
+	/* stays -EINVAL until a usable -s has been seen */
+	err = -EINVAL;
+	size = 0;
+	sparse = 1;
+	name = NULL;
+	flags = 0;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:s:rh")) != -1) {
+		switch (c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 's':
+			errno = 0;
+			size = strtoull(optarg, &end, 10);
+			/*
+			 * Refuse input with no digits, values strtoull could
+			 * not represent, and values whose MB -> bytes shift
+			 * below would wrap.  Trailing characters after the
+			 * number are still ignored, as before.
+			 */
+			if (end == optarg || errno ||
+			    size > (~(uint64_t)0 >> 20)) {
+				fprintf(stderr, "Invalid size '%s'\n", optarg);
+				goto usage;
+			}
+			err = 0;
+			break;
+		case 'r':
+			sparse = 0;
+			break;
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (err || !name || optind != argc)
+		goto usage;
+
+	return vhd_create(name, size << 20,
+			  (sparse ? HD_TYPE_DYNAMIC : HD_TYPE_FIXED),
+			  flags);
+
+usage:
+	printf("options: <-n name> <-s size (MB)> [-r reserve] [-h help]\n");
+	return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-fill.c b/tools/blktap2/vhd/lib/vhd-util-fill.c
new file mode 100644
index 0000000000..afbfccee48
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-fill.c
@@ -0,0 +1,105 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * 'fill' sub-command: open the VHD named by -n read-write and, for every
+ * BAT slot, read the whole block and write the same data straight back.
+ * (Presumably this is what makes every block end up allocated in the file;
+ * confirm against vhd_io_write.)
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error of the first
+ * step that failed.
+ */
+int
+vhd_util_fill(int argc, char **argv)
+{
+	vhd_context_t vhd;
+	uint64_t blk, per_block;
+	char *data = NULL, *path = NULL;
+	int opt, err;
+
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((opt = getopt(argc, argv, "n:h")) != -1) {
+		/* -h and unrecognised options both end in the usage text */
+		if (opt != 'n')
+			goto usage;
+		path = optarg;
+	}
+
+	if (!path || optind != argc)
+		goto usage;
+
+	err = vhd_open(&vhd, path, VHD_OPEN_RDWR);
+	if (err) {
+		printf("error opening %s: %d\n", path, err);
+		return err;
+	}
+
+	err = vhd_get_bat(&vhd);
+	if (err)
+		goto done;
+
+	err = posix_memalign((void **)&data, 4096, vhd.header.block_size);
+	if (err) {
+		/* posix_memalign reports a positive errno value */
+		err = -err;
+		goto done;
+	}
+
+	/* sectors per block */
+	per_block = vhd.header.block_size >> VHD_SECTOR_SHIFT;
+
+	for (blk = 0; blk < vhd.header.max_bat_size; blk++) {
+		uint64_t first = blk * per_block;
+
+		err = vhd_io_read(&vhd, data, first, per_block);
+		if (!err)
+			err = vhd_io_write(&vhd, data, first, per_block);
+		if (err)
+			goto done;
+	}
+
+	err = 0;
+
+ done:
+	free(data);
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("options: <-n name> [-h help]\n");
+	return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-modify.c b/tools/blktap2/vhd/lib/vhd-util-modify.c
new file mode 100644
index 0000000000..3b07e31b25
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-modify.c
@@ -0,0 +1,132 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT
+ * affect the VHD disk capacity, only the physical size of the file containing
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the
+ * footer in the right location such that resizing the file (manually, as a
+ * separate step) will produce the correct results. If the new file size is
+ * greater than the current file size, the file must first be expanded and then
+ * altered with this operation.
If the new size is smaller than the current
+ * size, the VHD must first be altered with this operation and then the file
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * 'modify' sub-command: on the VHD named by -n, optionally set the physical
+ * size (-s NEW_SIZE) and/or change the parent (-p NEW_PARENT, with -m
+ * marking the new parent as raw).  The size change is applied first.
+ *
+ * Processing stops at the first operation that fails, and that operation's
+ * error is what the caller gets back; a later success can no longer mask it.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error of the failing
+ * step.
+ */
+int
+vhd_util_modify(int argc, char **argv)
+{
+	char *name;
+	vhd_context_t vhd;
+	int err, c, size, parent, parent_raw;
+	off64_t newsize = 0;
+	char *newparent = NULL;
+
+	name = NULL;
+	size = 0;
+	parent = 0;
+	parent_raw = 0;
+
+	/* same guard as the other sub-commands: never hand getopt a NULL argv */
+	if (!argc || !argv)
+		goto usage;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, "n:s:p:mh")) != -1) {
+		switch (c) {
+		case 'n':
+			name = optarg;
+			break;
+		case 's':
+			size = 1;
+			errno = 0;
+			newsize = strtoll(optarg, NULL, 10);
+			if (errno) {
+				fprintf(stderr, "Invalid size '%s'\n", optarg);
+				goto usage;
+			}
+			break;
+		case 'p':
+			parent = 1;
+			newparent = optarg;
+			break;
+		case 'm':
+			parent_raw = 1;
+			break;
+
+		case 'h':
+		default:
+			goto usage;
+		}
+	}
+
+	if (!name || optind != argc)
+		goto usage;
+
+	err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+	if (err) {
+		printf("error opening %s: %d\n", name, err);
+		return err;
+	}
+
+	if (size) {
+		err = vhd_set_phys_size(&vhd, newsize);
+		if (err) {
+			printf("failed to set physical size to %"PRIu64":"
+			       " %d\n", newsize, err);
+			/* keep this error; do not go on to reparent */
+			goto done;
+		}
+	}
+
+	if (parent) {
+		TEST_FAIL_AT(FAIL_REPARENT_BEGIN);
+		err = vhd_change_parent(&vhd, newparent, parent_raw);
+		if (err) {
+			printf("failed to set parent to '%s': %d\n",
+			       newparent, err);
+			goto done;
+		}
+		TEST_FAIL_AT(FAIL_REPARENT_END);
+	}
+
+done:
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("*** Dangerous operations, use with care ***\n");
+	printf("options: <-n name> [-p NEW_PARENT set parent [-m raw]] "
+	       "[-s NEW_SIZE set size] [-h help]\n");
+	return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-query.c b/tools/blktap2/vhd/lib/vhd-util-query.c
new file mode 100644
index 0000000000..3477a17f27
--- /dev/null
+++
b/tools/blktap2/vhd/lib/vhd-util-query.c @@ -0,0 +1,159 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * 'query' sub-command: print selected properties of the VHD named by -n.
+ *
+ *   -v  virtual size in MB (footer curr_size >> 20)
+ *   -s  physical size as reported by vhd_get_phys_size()
+ *   -p  parent name, or a note that the image has no parent
+ *   -f  the 'hidden' field
+ *   -d  chain depth
+ *
+ * Every requested item is attempted even if an earlier one failed; the
+ * first error seen is the one returned.  -h prints the usage text and
+ * returns 0; any other usage problem returns -EINVAL.
+ */
+int
+vhd_util_query(int argc, char **argv)
+{
+	vhd_context_t vhd;
+	off64_t phys_bytes;
+	char *path = NULL;
+	int opt, rc, err;
+	int want_vsize = 0, want_psize = 0, want_parent = 0;
+	int want_fields = 0, want_depth = 0;
+
+	if (!argc || !argv) {
+		err = -EINVAL;
+		goto usage;
+	}
+
+	optind = 0;
+	for (;;) {
+		opt = getopt(argc, argv, "n:vspfdh");
+		if (opt == -1)
+			break;
+
+		if (opt == 'n')
+			path = optarg;
+		else if (opt == 'v')
+			want_vsize = 1;
+		else if (opt == 's')
+			want_psize = 1;
+		else if (opt == 'p')
+			want_parent = 1;
+		else if (opt == 'f')
+			want_fields = 1;
+		else if (opt == 'd')
+			want_depth = 1;
+		else {
+			/* -h asks for help and succeeds; the rest is an error */
+			err = (opt == 'h') ? 0 : -EINVAL;
+			goto usage;
+		}
+	}
+
+	if (!path || optind != argc) {
+		err = -EINVAL;
+		goto usage;
+	}
+
+	err = vhd_open(&vhd, path, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+	if (err) {
+		printf("error opening %s: %d\n", path, err);
+		return err;
+	}
+
+	if (want_vsize)
+		printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+	if (want_psize) {
+		err = vhd_get_phys_size(&vhd, &phys_bytes);
+		if (!err)
+			printf("%"PRIu64"\n", phys_bytes);
+		else
+			printf("failed to get physical size: %d\n", err);
+	}
+
+	if (want_parent) {
+		char *pname;
+
+		rc = 0;
+		if (vhd.footer.type == HD_TYPE_DIFF) {
+			rc = vhd_parent_locator_get(&vhd, &pname);
+			if (!rc) {
+				printf("%s\n", pname);
+				free(pname);
+			} else
+				printf("query failed\n");
+		} else
+			printf("%s has no parent\n", path);
+
+		/* remember only the first failure */
+		if (!err)
+			err = rc;
+	}
+
+	if (want_fields) {
+		int hidden;
+
+		rc = vhd_hidden(&vhd, &hidden);
+		if (!rc)
+			printf("hidden: %d\n", hidden);
+		else
+			printf("error checking 'hidden' field: %d\n", rc);
+
+		if (!err)
+			err = rc;
+	}
+
+	if (want_depth) {
+		int length;
+
+		rc = vhd_chain_depth(&vhd, &length);
+		if (!rc)
+			printf("chain depth: %d\n", length);
+		else
+			printf("error checking chain depth: %d\n", rc);
+
+		if (!err)
+			err = rc;
+	}
+
+	vhd_close(&vhd);
+	return err;
+
+usage:
+	printf("options: <-n name> [-v print virtual size (in MB)] "
+	       "[-s print physical utilization (bytes)] [-p print parent] "
+	       "[-f print fields] [-d print chain depth] [-h help]\n");
+	return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-read.c b/tools/blktap2/vhd/lib/vhd-util-read.c
new file mode 100644
index 0000000000..7b5246c5f7
--- /dev/null
+++ b/tools/blktap2/vhd/lib/vhd-util-read.c
@@ -0,0 +1,742 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of XenSource Inc. nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <inttypes.h> + +#include "libvhd.h" +#include "vhd-util.h" + +#define nsize 15 +static char nbuf[nsize]; + +static inline char * +__xconv(uint64_t num) +{ + snprintf(nbuf, nsize, "%#" PRIx64 , num); + return nbuf; +} + +static inline char * +__dconv(uint64_t num) +{ + snprintf(nbuf, nsize, "%" PRIu64, num); + return nbuf; +} + +#define conv(hex, num) \ + (hex ? __xconv((uint64_t)num) : __dconv((uint64_t)num)) + +static void +vhd_print_header(vhd_context_t *vhd, vhd_header_t *h, int hex) +{ + int err; + uint32_t cksm; + char uuid[37], time_str[26], cookie[9], out[512], *name; + + printf("VHD Header Summary:\n-------------------\n"); + + snprintf(cookie, 9, "%s", h->cookie); + printf("Cookie : %s\n", cookie); + + printf("Data offset (unusd) : %s\n", conv(hex, h->data_offset)); + printf("Table offset : %s\n", conv(hex, h->table_offset)); + printf("Header version : 0x%08x\n", h->hdr_ver); + printf("Max BAT size : %s\n", conv(hex, h->max_bat_size)); + printf("Block size : %s ", conv(hex, h->block_size)); + printf("(%s MB)\n", conv(hex, h->block_size >> 20)); + + err = vhd_header_decode_parent(vhd, h, &name); + printf("Parent name : %s\n", + (err ? 
"failed to read name" : name)); + free(name); + + uuid_unparse(h->prt_uuid, uuid); + printf("Parent UUID : %s\n", uuid); + + vhd_time_to_string(h->prt_ts, time_str); + printf("Parent timestamp : %s\n", time_str); + + cksm = vhd_checksum_header(h); + printf("Checksum : 0x%x|0x%x (%s)\n", h->checksum, cksm, + h->checksum == cksm ? "Good!" : "Bad!"); + printf("\n"); +} + +static void +vhd_print_footer(vhd_footer_t *f, int hex) +{ + uint64_t c, h, s; + uint32_t ff_maj, ff_min, cr_maj, cr_min, cksm, cksm_save; + char time_str[26], creator[5], uuid[37], cookie[9]; + + printf("VHD Footer Summary:\n-------------------\n"); + + snprintf(cookie, 9, "%s", f->cookie); + printf("Cookie : %s\n", cookie); + + printf("Features : (0x%08x) %s%s\n", f->features, + (f->features & HD_TEMPORARY) ? "<TEMP>" : "", + (f->features & HD_RESERVED) ? "<RESV>" : ""); + + ff_maj = f->ff_version >> 16; + ff_min = f->ff_version & 0xffff; + printf("File format version : Major: %d, Minor: %d\n", + ff_maj, ff_min); + + printf("Data offset : %s\n", conv(hex, f->data_offset)); + + vhd_time_to_string(f->timestamp, time_str); + printf("Timestamp : %s\n", time_str); + + memcpy(creator, f->crtr_app, 4); + creator[4] = '\0'; + printf("Creator Application : '%s'\n", creator); + + cr_maj = f->crtr_ver >> 16; + cr_min = f->crtr_ver & 0xffff; + printf("Creator version : Major: %d, Minor: %d\n", + cr_maj, cr_min); + + printf("Creator OS : %s\n", + ((f->crtr_os == HD_CR_OS_WINDOWS) ? "Windows" : + ((f->crtr_os == HD_CR_OS_MACINTOSH) ? 
"Macintosh" :
+		 "Unknown!")));
+
+	printf("Original disk size : %s MB ", conv(hex, f->orig_size >> 20));
+	printf("(%s Bytes)\n", conv(hex, f->orig_size));
+
+	printf("Current disk size : %s MB ", conv(hex, f->curr_size >> 20));
+	printf("(%s Bytes)\n", conv(hex, f->curr_size));
+
+	c = f->geometry >> 16;
+	h = (f->geometry & 0x0000FF00) >> 8;
+	s = f->geometry & 0x000000FF;
+	printf("Geometry : Cyl: %s, ", conv(hex, c));
+	printf("Hds: %s, ", conv(hex, h));
+	printf("Sctrs: %s\n", conv(hex, s));
+	printf(" : = %s MB ", conv(hex, (c * h * s) >> 11));
+	printf("(%s Bytes)\n", conv(hex, c * h * s << 9));
+
+	printf("Disk type : %s\n",
+	       f->type <= HD_TYPE_MAX ?
+	       HD_TYPE_STR[f->type] : "Unknown type!\n");
+
+	cksm = vhd_checksum_footer(f);
+	printf("Checksum : 0x%x|0x%x (%s)\n", f->checksum, cksm,
+	       f->checksum == cksm ? "Good!" : "Bad!");
+
+	uuid_unparse(f->uuid, uuid);
+	printf("UUID : %s\n", uuid);
+
+	printf("Saved state : %s\n", f->saved == 0 ? "No" : "Yes");
+	printf("Hidden : %d\n", f->hidden);
+	printf("\n");
+}
+
+/* Map a parent-locator platform code to its symbolic name for display. */
+static inline char *
+code_name(uint32_t code)
+{
+	switch(code) {
+	case PLAT_CODE_NONE:
+		return "PLAT_CODE_NONE";
+	case PLAT_CODE_WI2R:
+		return "PLAT_CODE_WI2R";
+	case PLAT_CODE_WI2K:
+		return "PLAT_CODE_WI2K";
+	case PLAT_CODE_W2RU:
+		return "PLAT_CODE_W2RU";
+	case PLAT_CODE_W2KU:
+		return "PLAT_CODE_W2KU";
+	case PLAT_CODE_MAC:
+		return "PLAT_CODE_MAC";
+	case PLAT_CODE_MACX:
+		return "PLAT_CODE_MACX";
+	default:
+		return "UNKOWN";
+	}
+}
+
+/*
+ * Print the parent name decoded from one parent locator, or a failure
+ * notice when the locator cannot be read.
+ */
+static void
+vhd_print_parent(vhd_context_t *vhd, vhd_parent_locator_t *loc)
+{
+	int err;
+	char *buf;
+
+	err = vhd_parent_locator_read(vhd, loc, &buf);
+	if (err) {
+		printf("failed to read parent name\n");
+		return;
+	}
+
+	printf(" decoded name : %s\n", buf);
+
+	/*
+	 * The decoded name belongs to the caller (the check code frees the
+	 * buffer it gets from the same call), so release it here.
+	 */
+	free(buf);
+}
+
+static void
+vhd_print_parent_locators(vhd_context_t *vhd, int hex)
+{
+	int i, n;
+	vhd_parent_locator_t *loc;
+
+	printf("VHD Parent Locators:\n--------------------\n");
+
+	n = sizeof(vhd->header.loc) / sizeof(struct prt_loc);
+	for
(i = 0; i < n; i++) { + loc = &vhd->header.loc[i]; + + if (loc->code == PLAT_CODE_NONE) + continue; + + printf("locator: : %d\n", i); + printf(" code : %s\n", + code_name(loc->code)); + printf(" data_space : %s\n", + conv(hex, loc->data_space)); + printf(" data_length : %s\n", + conv(hex, loc->data_len)); + printf(" data_offset : %s\n", + conv(hex, loc->data_offset)); + vhd_print_parent(vhd, loc); + printf("\n"); + } +} + +static void +vhd_print_batmap_header(vhd_batmap_t *batmap, int hex) +{ + uint32_t cksm; + + printf("VHD Batmap Summary:\n-------------------\n"); + printf("Batmap offset : %s\n", + conv(hex, batmap->header.batmap_offset)); + printf("Batmap size (secs) : %s\n", + conv(hex, batmap->header.batmap_size)); + printf("Batmap version : 0x%08x\n", + batmap->header.batmap_version); + + cksm = vhd_checksum_batmap(batmap); + printf("Checksum : 0x%x|0x%x (%s)\n", + batmap->header.checksum, cksm, + (batmap->header.checksum == cksm ? "Good!" : "Bad!")); + printf("\n"); +} + +static inline int +check_block_range(vhd_context_t *vhd, uint64_t block, int hex) +{ + if (block > vhd->header.max_bat_size) { + fprintf(stderr, "block %s past end of file\n", + conv(hex, block)); + return -ERANGE; + } + + return 0; +} + +static int +vhd_print_headers(vhd_context_t *vhd, int hex) +{ + int err; + + vhd_print_footer(&vhd->footer, hex); + + if (vhd_type_dynamic(vhd)) { + vhd_print_header(vhd, &vhd->header, hex); + + if (vhd->footer.type == HD_TYPE_DIFF) + vhd_print_parent_locators(vhd, hex); + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) { + printf("failed to get batmap header\n"); + return err; + } + + vhd_print_batmap_header(&vhd->batmap, hex); + } + } + + return 0; +} + +static int +vhd_dump_headers(const char *name, int hex) +{ + vhd_context_t vhd; + + libvhd_set_log_level(1); + memset(&vhd, 0, sizeof(vhd)); + + printf("\n%s appears invalid; dumping headers\n\n", name); + + vhd.fd = open(name, O_DIRECT | O_LARGEFILE | O_RDONLY); + if (vhd.fd == -1) 
+ return -errno; + + vhd.file = strdup(name); + + vhd_read_footer(&vhd, &vhd.footer); + vhd_read_header(&vhd, &vhd.header); + + vhd_print_footer(&vhd.footer, hex); + vhd_print_header(&vhd, &vhd.header, hex); + + close(vhd.fd); + free(vhd.file); + + return 0; +} + +static int +vhd_print_logical_to_physical(vhd_context_t *vhd, + uint64_t sector, int count, int hex) +{ + int i; + uint32_t blk, lsec; + uint64_t cur, offset; + + if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) { + fprintf(stderr, "sector %s past end of file\n", + conv(hex, sector + count)); + return -ERANGE; + } + + for (i = 0; i < count; i++) { + cur = sector + i; + blk = cur / vhd->spb; + lsec = cur % vhd->spb; + offset = vhd->bat.bat[blk]; + + if (offset != DD_BLK_UNUSED) { + offset += lsec + 1; + offset = vhd_sectors_to_bytes(offset); + } + + printf("logical sector %s: ", conv(hex, cur)); + printf("block number: %s, ", conv(hex, blk)); + printf("sector offset: %s, ", conv(hex, lsec)); + printf("file offset: %s\n", (offset == DD_BLK_UNUSED ? + "not allocated" : conv(hex, offset))); + } + + return 0; +} + +static int +vhd_print_bat(vhd_context_t *vhd, uint64_t block, int count, int hex) +{ + int i; + uint64_t cur, offset; + + if (check_block_range(vhd, block + count, hex)) + return -ERANGE; + + for (i = 0; i < count; i++) { + cur = block + i; + offset = vhd->bat.bat[cur]; + + printf("block: %s: ", conv(hex, cur)); + printf("offset: %s\n", + (offset == DD_BLK_UNUSED ? 
"not allocated" : + conv(hex, vhd_sectors_to_bytes(offset)))); + } + + return 0; +} + +static inline void +write_full(int fd, void* buf, size_t count) +{ + ssize_t num_written = 0; + if (!buf) return; + + + while(count > 0) { + + num_written = write(fd, buf, count); + if (num_written == -1) { + if (errno == EINTR) + continue; + else + return; + } + + count -= num_written; + buf += num_written; + } +} + +static int +vhd_print_bitmap(vhd_context_t *vhd, uint64_t block, int count, int hex) +{ + char *buf; + int i, err; + uint64_t cur; + + if (check_block_range(vhd, block + count, hex)) + return -ERANGE; + + for (i = 0; i < count; i++) { + cur = block + i; + + if (vhd->bat.bat[cur] == DD_BLK_UNUSED) { + printf("block %s not allocated\n", conv(hex, cur)); + continue; + } + + err = vhd_read_bitmap(vhd, cur, &buf); + if (err) + goto out; + + write_full(STDOUT_FILENO, buf, + vhd_sectors_to_bytes(vhd->bm_secs)); + free(buf); + } + + err = 0; +out: + return err; +} + +static int +vhd_test_bitmap(vhd_context_t *vhd, uint64_t sector, int count, int hex) +{ + char *buf; + uint64_t cur; + int i, err, bit; + uint32_t blk, bm_blk, sec; + + if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) { + printf("sector %s past end of file\n", conv(hex, sector)); + return -ERANGE; + } + + bm_blk = -1; + buf = NULL; + + for (i = 0; i < count; i++) { + cur = sector + i; + blk = cur / vhd->spb; + sec = cur % vhd->spb; + + if (blk != bm_blk) { + bm_blk = blk; + free(buf); + buf = NULL; + + if (vhd->bat.bat[blk] != DD_BLK_UNUSED) { + err = vhd_read_bitmap(vhd, blk, &buf); + if (err) + goto out; + } + } + + if (vhd->bat.bat[blk] == DD_BLK_UNUSED) + bit = 0; + else + bit = vhd_bitmap_test(vhd, buf, blk); + + print: + printf("block %s: ", conv(hex, blk)); + printf("sec: %s: %d\n", conv(hex, sec), bit); + } + + err = 0; + out: + free(buf); + return err; +} + +static int +vhd_print_batmap(vhd_context_t *vhd) +{ + int err; + size_t size; + + err = vhd_get_batmap(vhd); + if (err) { + 
printf("failed to read batmap: %d\n", err); + return err; + } + + size = vhd_sectors_to_bytes(vhd->batmap.header.batmap_size); + write_full(STDOUT_FILENO, vhd->batmap.map, size); + + return 0; +} + +static int +vhd_test_batmap(vhd_context_t *vhd, uint64_t block, int count, int hex) +{ + int i, err; + uint64_t cur; + + if (check_block_range(vhd, block + count, hex)) + return -ERANGE; + + err = vhd_get_batmap(vhd); + if (err) { + fprintf(stderr, "failed to get batmap\n"); + return err; + } + + for (i = 0; i < count; i++) { + cur = block + i; + fprintf(stderr, "batmap for block %s: %d\n", conv(hex, cur), + vhd_batmap_test(vhd, &vhd->batmap, cur)); + } + + return 0; +} + +static int +vhd_print_data(vhd_context_t *vhd, uint64_t block, int count, int hex) +{ + char *buf; + int i, err; + uint64_t cur; + + err = 0; + + if (check_block_range(vhd, block + count, hex)) + return -ERANGE; + + for (i = 0; i < count; i++) { + cur = block + i; + + if (vhd->bat.bat[cur] == DD_BLK_UNUSED) { + printf("block %s not allocated\n", conv(hex, cur)); + continue; + } + + err = vhd_read_block(vhd, cur, &buf); + if (err) + break; + + write_full(STDOUT_FILENO, buf, vhd->header.block_size); + free(buf); + } + + return err; +} + +static int +vhd_read_data(vhd_context_t *vhd, uint64_t sec, int count, int hex) +{ + char *buf; + uint64_t cur; + int err, max, secs; + + if (vhd_sectors_to_bytes(sec + count) > vhd->footer.curr_size) + return -ERANGE; + + max = MIN(vhd_sectors_to_bytes(count), VHD_BLOCK_SIZE); + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, max); + if (err) + return -err; + + cur = sec; + while (count) { + secs = MIN((max >> VHD_SECTOR_SHIFT), count); + err = vhd_io_read(vhd, buf, cur, secs); + if (err) + break; + + write_full(STDOUT_FILENO, buf, vhd_sectors_to_bytes(secs)); + + cur += secs; + count -= secs; + } + + free(buf); + return err; +} + +int +vhd_util_read(int argc, char **argv) +{ + char *name; + vhd_context_t vhd; + int c, err, headers, hex; + uint64_t bat, bitmap, 
tbitmap, batmap, tbatmap, data, lsec, count, read; + + err = 0; + hex = 0; + headers = 0; + count = 1; + bat = -1; + bitmap = -1; + tbitmap = -1; + batmap = -1; + tbatmap = -1; + data = -1; + lsec = -1; + read = -1; + name = NULL; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:pt:b:m:i:aj:d:c:r:xh")) != -1) { + switch(c) { + case 'n': + name = optarg; + break; + case 'p': + headers = 1; + break; + case 't': + lsec = strtoul(optarg, NULL, 10); + break; + case 'b': + bat = strtoull(optarg, NULL, 10); + break; + case 'm': + bitmap = strtoull(optarg, NULL, 10); + break; + case 'i': + tbitmap = strtoul(optarg, NULL, 10); + break; + case 'a': + batmap = 1; + break; + case 'j': + tbatmap = strtoull(optarg, NULL, 10); + break; + case 'd': + data = strtoull(optarg, NULL, 10); + break; + case 'r': + read = strtoull(optarg, NULL, 10); + break; + case 'c': + count = strtoul(optarg, NULL, 10); + break; + case 'x': + hex = 1; + break; + case 'h': + default: + goto usage; + } + } + + if (!name || optind != argc) + goto usage; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED); + if (err) { + printf("Failed to open %s: %d\n", name, err); + vhd_dump_headers(name, hex); + return err; + } + + err = vhd_get_bat(&vhd); + if (err) { + printf("Failed to get bat for %s: %d\n", name, err); + goto out; + } + + if (headers) + vhd_print_headers(&vhd, hex); + + if (lsec != -1) { + err = vhd_print_logical_to_physical(&vhd, lsec, count, hex); + if (err) + goto out; + } + + if (bat != -1) { + err = vhd_print_bat(&vhd, bat, count, hex); + if (err) + goto out; + } + + if (bitmap != -1) { + err = vhd_print_bitmap(&vhd, bitmap, count, hex); + if (err) + goto out; + } + + if (tbitmap != -1) { + err = vhd_test_bitmap(&vhd, tbitmap, count, hex); + if (err) + goto out; + } + + if (batmap != -1) { + err = vhd_print_batmap(&vhd); + if (err) + goto out; + } + + if (tbatmap != -1) { + err = vhd_test_batmap(&vhd, tbatmap, count, hex); + if 
(err) + goto out; + } + + if (data != -1) { + err = vhd_print_data(&vhd, data, count, hex); + if (err) + goto out; + } + + if (read != -1) { + err = vhd_read_data(&vhd, read, count, hex); + if (err) + goto out; + } + + err = 0; + + out: + vhd_close(&vhd); + return err; + + usage: + printf("options:\n" + "-h help\n" + "-n name\n" + "-p print VHD headers\n" + "-t sec translate logical sector to VHD location\n" + "-b blk print bat entry\n" + "-m blk print bitmap\n" + "-i sec test bitmap for logical sector\n" + "-a print batmap\n" + "-j blk test batmap for block\n" + "-d blk print data\n" + "-c num num units\n" + "-r sec read num sectors at sec\n" + "-x print in hex\n"); + return EINVAL; +} diff --git a/tools/blktap2/vhd/lib/vhd-util-repair.c b/tools/blktap2/vhd/lib/vhd-util-repair.c new file mode 100644 index 0000000000..a1d2c45c12 --- /dev/null +++ b/tools/blktap2/vhd/lib/vhd-util-repair.c @@ -0,0 +1,84 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +int +vhd_util_repair(int argc, char **argv) +{ + char *name; + int err, c; + off64_t eof; + vhd_context_t vhd; + + name = NULL; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:h")) != -1) { + switch (c) { + case 'n': + name = optarg; + break; + case 'h': + default: + goto usage; + } + } + + if (!name || optind != argc) + goto usage; + + err = vhd_open(&vhd, name, VHD_OPEN_RDWR); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + err = vhd_end_of_data(&vhd, &eof); + if (err) { + printf("error finding end of data: %d\n", err); + goto done; + } + + err = vhd_write_footer_at(&vhd, &vhd.footer, eof); + + done: + vhd_close(&vhd); + return err; + +usage: + printf("options: <-n name> [-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap2/vhd/lib/vhd-util-resize.c b/tools/blktap2/vhd/lib/vhd-util-resize.c new file mode 100644 index 0000000000..0143d7a0d3 --- /dev/null +++ b/tools/blktap2/vhd/lib/vhd-util-resize.c @@ -0,0 +1,1131 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <syslog.h> +#include <inttypes.h> +#include <sys/mman.h> + +#include "libvhd-journal.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf(stdout, _f, ##_a) +#else +#define DFPRINTF(_f, _a...) ((void)0) +#endif + +#define EPRINTF(_f, _a...) 
\ + do { \ + syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \ + DFPRINTF(_f, _a); \ + } while (0) + +typedef struct vhd_block { + uint32_t block; + uint32_t offset; +} vhd_block_t; + +TEST_FAIL_EXTERN_VARS; + +static inline uint32_t +secs_to_blocks_down(vhd_context_t *vhd, uint64_t secs) +{ + return secs / vhd->spb; +} + +static uint32_t +secs_to_blocks_up(vhd_context_t *vhd, uint64_t secs) +{ + uint32_t blocks; + + blocks = secs / vhd->spb; + if (secs % vhd->spb) + blocks++; + + return blocks; +} + +static int +vhd_fixed_shrink(vhd_journal_t *journal, uint64_t secs) +{ + int err; + uint64_t new_eof; + vhd_context_t *vhd; + + vhd = &journal->vhd; + + new_eof = vhd->footer.curr_size - vhd_sectors_to_bytes(secs); + if (new_eof <= sizeof(vhd_footer_t)) + return -EINVAL; + + err = ftruncate(vhd->fd, new_eof); + if (err) + return errno; + + vhd->footer.curr_size = new_eof; + return vhd_write_footer(vhd, &vhd->footer); +} + +static int +vhd_write_zeros(vhd_journal_t *journal, off64_t off, uint64_t size) +{ + int err; + char *buf; + vhd_context_t *vhd; + uint64_t bytes, map; + + vhd = &journal->vhd; + map = MIN(size, VHD_BLOCK_SIZE); + + err = vhd_seek(vhd, off, SEEK_SET); + if (err) + return err; + + buf = mmap(0, map, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (buf == MAP_FAILED) + return -errno; + + do { + bytes = MIN(size, map); + + err = vhd_write(vhd, buf, bytes); + if (err) + break; + + size -= bytes; + } while (size); + + munmap(buf, map); + + return err; +} + +static int +vhd_fixed_grow(vhd_journal_t *journal, uint64_t secs) +{ + int err; + vhd_context_t *vhd; + uint64_t size, eof, new_eof; + + size = vhd_sectors_to_bytes(secs); + vhd = &journal->vhd; + + err = vhd_seek(vhd, 0, SEEK_END); + if (err) + goto out; + + eof = vhd_position(vhd); + if (eof == (off64_t)-1) { + err = -errno; + goto out; + } + + err = vhd_write_zeros(journal, eof - sizeof(vhd_footer_t), size); + if (err) + goto out; + + new_eof = eof + size; + err = vhd_seek(vhd, new_eof, SEEK_SET); 
+ if (err) + goto out; + + vhd->footer.curr_size += size; + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + goto out; + + err = 0; + +out: + return err; +} + +static int +vhd_fixed_resize(vhd_journal_t *journal, uint64_t size) +{ + int err; + vhd_context_t *vhd; + uint64_t cur_secs, new_secs; + + vhd = &journal->vhd; + cur_secs = vhd->footer.curr_size >> VHD_SECTOR_SHIFT; + new_secs = size << (20 - VHD_SECTOR_SHIFT); + + if (cur_secs == new_secs) + return 0; + else if (cur_secs > new_secs) + err = vhd_fixed_shrink(journal, cur_secs - new_secs); + else + err = vhd_fixed_grow(journal, new_secs - cur_secs); + + return err; +} + +static inline void +swap(vhd_block_t *list, int a, int b) +{ + vhd_block_t tmp; + + tmp = list[a]; + list[a] = list[b]; + list[b] = tmp; +} + +static int +partition(vhd_block_t *list, int left, int right, int pidx) +{ + int i, sidx; + long long pval; + + sidx = left; + pval = list[pidx].offset; + swap(list, pidx, right); + + for (i = left; i < right; i++) + if (list[i].offset >= pval) { + swap(list, sidx, i); + ++sidx; + } + + swap(list, right, sidx); + return sidx; +} + +static void +quicksort(vhd_block_t *list, int left, int right) +{ + int pidx, new_pidx; + + if (right < left) + return; + + pidx = left; + new_pidx = partition(list, left, right, pidx); + quicksort(list, left, new_pidx - 1); + quicksort(list, new_pidx + 1, right); +} + +static int +vhd_move_block(vhd_journal_t *journal, uint32_t src, off64_t offset) +{ + int err; + char *buf; + size_t size; + vhd_context_t *vhd; + off64_t off, src_off; + + buf = NULL; + vhd = &journal->vhd; + off = offset; + size = vhd_sectors_to_bytes(vhd->bm_secs); + src_off = vhd->bat.bat[src]; + + if (src_off == DD_BLK_UNUSED) + return -EINVAL; + src_off = vhd_sectors_to_bytes(src_off); + + err = vhd_journal_add_block(journal, src, + VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA); + if (err) + goto out; + + err = vhd_read_bitmap(vhd, src, &buf); + if (err) + goto out; + + err = vhd_seek(vhd, off, 
SEEK_SET); + if (err) + goto out; + + err = vhd_write(vhd, buf, size); + if (err) + goto out; + + free(buf); + buf = NULL; + off += size; + size = vhd_sectors_to_bytes(vhd->spb); + + err = vhd_read_block(vhd, src, &buf); + if (err) + goto out; + + err = vhd_seek(vhd, off, SEEK_SET); + if (err) + goto out; + + err = vhd_write(vhd, buf, size); + if (err) + goto out; + + vhd->bat.bat[src] = offset >> VHD_SECTOR_SHIFT; + + err = vhd_write_zeros(journal, src_off, + vhd_sectors_to_bytes(vhd->bm_secs + vhd->spb)); + +out: + free(buf); + return err; +} + +static int +vhd_clobber_block(vhd_journal_t *journal, uint32_t src, uint32_t dest) +{ + int err; + off64_t off; + vhd_context_t *vhd; + + vhd = &journal->vhd; + off = vhd_sectors_to_bytes(vhd->bat.bat[dest]); + + err = vhd_journal_add_block(journal, dest, + VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA); + if (err) + return err; + + err = vhd_move_block(journal, src, off); + if (err) + return err; + + vhd->bat.bat[dest] = DD_BLK_UNUSED; + + return 0; +} + +/* + * remove a list of blocks from the vhd file + * if a block to be removed: + * - resides at the end of the file: simply clear its bat entry + * - resides elsewhere: move the last block in the file into its position + * and update the bat to reflect this + */ +static int +vhd_defrag_shrink(vhd_journal_t *journal, + vhd_block_t *original_free_list, int free_cnt) +{ + vhd_context_t *vhd; + int i, j, free_idx, err; + vhd_block_t *blocks, *free_list; + + err = 0; + blocks = NULL; + free_list = NULL; + vhd = &journal->vhd; + + blocks = malloc(vhd->bat.entries * sizeof(vhd_block_t)); + if (!blocks) { + err = -ENOMEM; + goto out; + } + + free_list = malloc(free_cnt * sizeof(vhd_block_t)); + if (!free_list) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < vhd->bat.entries; i++) { + blocks[i].block = i; + blocks[i].offset = vhd->bat.bat[i]; + } + + memcpy(free_list, original_free_list, + free_cnt * sizeof(vhd_block_t)); + + /* sort both the to-free list and the bat list + * 
in order of descending file offset */ + quicksort(free_list, 0, free_cnt - 1); + quicksort(blocks, 0, vhd->bat.entries - 1); + + for (i = 0, free_idx = 0; + i < vhd->bat.entries && free_idx < free_cnt; i++) { + vhd_block_t *b = blocks + i; + + if (b->offset == DD_BLK_UNUSED) + continue; + + for (j = free_idx; j < free_cnt; j++) + if (b->block == free_list[j].block) { + /* the last block in the file is in the list of + * blocks to remove; no need to shuffle the + * data -- just clear the bat entry */ + vhd->bat.bat[free_list[j].block] = DD_BLK_UNUSED; + free_idx++; + continue; + } + + err = vhd_clobber_block(journal, b->block, + free_list[free_idx++].block); + if (err) + goto out; + } + + /* clear any bat entries for blocks we did not shuffle */ + for (i = free_idx; i < free_cnt; i++) + vhd->bat.bat[free_list[i].block] = DD_BLK_UNUSED; + +out: + free(blocks); + free(free_list); + + return err; +} + +static int +vhd_clear_bat_entries(vhd_journal_t *journal, uint32_t entries) +{ + int i, err; + vhd_context_t *vhd; + off64_t orig_map_off, new_map_off; + uint32_t orig_entries, new_entries; + + vhd = &journal->vhd; + orig_entries = vhd->header.max_bat_size; + new_entries = orig_entries - entries; + + if (vhd_has_batmap(vhd)) { + err = vhd_batmap_header_offset(vhd, &orig_map_off); + if (err) + return err; + } + + /* update header */ + vhd->header.max_bat_size = new_entries; + err = vhd_write_header(vhd, &vhd->header); + if (err) + return err; + + /* update footer */ + vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size; + vhd->footer.geometry = vhd_chs(vhd->footer.curr_size); + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + return err; + + /* update bat -- we don't reclaim space, just clear entries */ + for (i = new_entries; i < orig_entries; i++) + vhd->bat.bat[i] = 0; + + err = vhd_write_bat(vhd, &vhd->bat); + if (err) + return err; + + /* update this after write_bat so the end of the bat is zeored */ + vhd->bat.entries = new_entries; + + if 
(!vhd_has_batmap(vhd)) + return 0; + + /* zero out old batmap header if new header has moved */ + err = vhd_batmap_header_offset(vhd, &new_map_off); + if (err) + return err; + + if (orig_map_off != new_map_off) { + size_t size; + + size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr)); + + err = vhd_write_zeros(journal, orig_map_off, size); + if (err) + return err; + } + + /* update batmap -- clear entries for freed blocks */ + for (i = new_entries; i < orig_entries; i++) + vhd_batmap_clear(vhd, &vhd->batmap, i); + + err = vhd_write_batmap(vhd, &vhd->batmap); + if (err) + return err; + + return 0; +} + +static int +vhd_dynamic_shrink(vhd_journal_t *journal, uint64_t secs) +{ + off64_t eof; + uint32_t blocks; + vhd_context_t *vhd; + int i, j, err, free_cnt; + struct vhd_block *free_list; + + printf("dynamic shrink not fully implemented\n"); + return -ENOSYS; + + eof = 0; + free_cnt = 0; + free_list = NULL; + vhd = &journal->vhd; + + blocks = secs_to_blocks_down(vhd, secs); + if (blocks == 0) + return 0; + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) + return err; + } + + free_list = malloc(blocks * sizeof(struct vhd_block)); + if (!free_list) + return -ENOMEM; + + for (i = vhd->bat.entries - 1, j = 0; i >= 0 && j < blocks; i--, j++) { + uint32_t blk = vhd->bat.bat[i]; + + if (blk != DD_BLK_UNUSED) { + free_list[free_cnt].block = i; + free_list[free_cnt].offset = blk; + free_cnt++; + } + } + + if (free_cnt) { + err = vhd_defrag_shrink(journal, free_list, free_cnt); + if (err) + goto out; + } + + err = vhd_clear_bat_entries(journal, blocks); + if (err) + goto out; + + /* remove data beyond footer */ + err = vhd_end_of_data(vhd, &eof); + if (err) + goto out; + + err = ftruncate(vhd->fd, eof + sizeof(vhd_footer_t)); + if (err) { + err = -errno; + goto out; + } + + err = 0; + +out: + free(free_list); + return err; +} + +static inline void +vhd_first_data_block(vhd_context_t *vhd, vhd_block_t *block) +{ + int i; + uint32_t blk; + + memset(block, 0, 
sizeof(vhd_block_t)); + + for (i = 0; i < vhd->bat.entries; i++) { + blk = vhd->bat.bat[i]; + + if (blk != DD_BLK_UNUSED) { + if (!block->offset || blk < block->offset) { + block->block = i; + block->offset = blk; + } + } + } +} + +static inline uint32_t +vhd_next_block_offset(vhd_context_t *vhd) +{ + int i; + uint32_t blk, end, spp, next; + + next = 0; + spp = getpagesize() >> VHD_SECTOR_SHIFT; + + for (i = 0; i < vhd->bat.entries; i++) { + blk = vhd->bat.bat[i]; + + if (blk != DD_BLK_UNUSED) { + end = blk + vhd->spb + vhd->bm_secs; + next = MAX(next, end); + } + } + + return next; +} + +static inline int +in_range(off64_t off, off64_t start, off64_t size) +{ + return (start < off && start + size > off); +} + +#define SKIP_HEADER 0x01 +#define SKIP_BAT 0x02 +#define SKIP_BATMAP 0x04 +#define SKIP_PLOC 0x08 +#define SKIP_DATA 0x10 + +static inline int +skip_check(int mode, int type) +{ + return mode & type; +} + +static int +vhd_check_for_clobber(vhd_context_t *vhd, off64_t off, int mode) +{ + int i, n; + char *msg; + size_t size; + vhd_block_t fb; + vhd_parent_locator_t *loc; + + msg = NULL; + + if (!vhd_type_dynamic(vhd)) + return 0; + + if (off < VHD_SECTOR_SIZE) { + msg = "backup footer"; + goto fail; + } + + if (!skip_check(mode, SKIP_HEADER)) + if (in_range(off, + vhd->footer.data_offset, sizeof(vhd_header_t))) { + msg = "header"; + goto fail; + } + + if (!skip_check(mode, SKIP_BAT)) + if (in_range(off, vhd->header.table_offset, + vhd_bytes_padded(vhd->header.max_bat_size * + sizeof(uint32_t)))) { + msg = "bat"; + goto fail; + } + + if (!skip_check(mode, SKIP_BATMAP)) + if (vhd_has_batmap(vhd) && + in_range(off, vhd->batmap.header.batmap_offset, + vhd_bytes_padded(vhd->batmap.header.batmap_size))) { + msg = "batmap"; + goto fail; + } + + if (!skip_check(mode, SKIP_PLOC)) { + n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); + for (i = 0; i < n; i++) { + loc = vhd->header.loc + i; + if (loc->code == PLAT_CODE_NONE) + continue; + + size = 
vhd_parent_locator_size(loc); + if (in_range(off, loc->data_offset, size)) { + msg = "parent locator"; + goto fail; + } + } + } + + if (!skip_check(mode, SKIP_DATA)) { + vhd_first_data_block(vhd, &fb); + if (fb.offset && in_range(off, + vhd_sectors_to_bytes(fb.offset), + VHD_BLOCK_SIZE)) { + msg = "data block"; + goto fail; + } + } + + return 0; + +fail: + EPRINTF("write to 0x%08"PRIx64" would clobber %s\n", off, msg); + return -EINVAL; +} + +/* + * take any metadata after the bat (@eob) and shift it + */ +static int +vhd_shift_metadata(vhd_journal_t *journal, off64_t eob, + size_t bat_needed, size_t map_needed) +{ + int i, n, err; + vhd_context_t *vhd; + size_t size_needed; + char *buf, **locators; + vhd_parent_locator_t *loc; + + vhd = &journal->vhd; + size_needed = bat_needed + map_needed; + + n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); + + locators = calloc(n, sizeof(char *)); + if (!locators) + return -ENOMEM; + + for (i = 0; i < n; i++) { + size_t size; + + loc = vhd->header.loc + i; + if (loc->code == PLAT_CODE_NONE) + continue; + + if (loc->data_offset < eob) + continue; + + size = vhd_parent_locator_size(loc); + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) { + err = -err; + buf = NULL; + goto out; + } + + err = vhd_seek(vhd, loc->data_offset, SEEK_SET); + if (err) + goto out; + + err = vhd_read(vhd, buf, size); + if (err) + goto out; + + locators[i] = buf; + } + + for (i = 0; i < n; i++) { + off64_t off; + size_t size; + + if (!locators[i]) + continue; + + loc = vhd->header.loc + i; + off = loc->data_offset + size_needed; + size = vhd_parent_locator_size(loc); + + if (vhd_check_for_clobber(vhd, off + size, SKIP_PLOC)) { + EPRINTF("%s: shifting locator %d would clobber data\n", + vhd->file, i); + return -EINVAL; + } + + err = vhd_seek(vhd, off, SEEK_SET); + if (err) + goto out; + + err = vhd_write(vhd, locators[i], size); + if (err) + goto out; + + free(locators[i]); + locators[i] = NULL; + loc->data_offset = off; + 
+ /* write the new header after writing the new bat */ + } + + if (vhd_has_batmap(vhd) && vhd->batmap.header.batmap_offset > eob) { + vhd->batmap.header.batmap_offset += bat_needed; + + /* write the new batmap after writing the new bat */ + } + + err = 0; + +out: + for (i = 0; i < n; i++) + free(locators[i]); + free(locators); + + return err; +} + +static int +vhd_add_bat_entries(vhd_journal_t *journal, int entries) +{ + int i, err; + off64_t off; + vhd_bat_t new_bat; + vhd_context_t *vhd; + uint32_t new_entries; + vhd_batmap_t new_batmap; + uint64_t bat_size, new_bat_size, map_size, new_map_size; + + vhd = &journal->vhd; + new_entries = vhd->header.max_bat_size + entries; + + bat_size = vhd_bytes_padded(vhd->header.max_bat_size * + sizeof(uint32_t)); + new_bat_size = vhd_bytes_padded(new_entries * sizeof(uint32_t)); + + map_size = vhd_bytes_padded((vhd->header.max_bat_size + 7) >> 3); + new_map_size = vhd_bytes_padded((new_entries + 7) >> 3); + + off = vhd->header.table_offset + new_bat_size; + if (vhd_check_for_clobber(vhd, off, SKIP_BAT | SKIP_BATMAP)) { + EPRINTF("%s: writing new bat of 0x%"PRIx64" bytes " + "at 0x%08"PRIx64" would clobber data\n", + vhd->file, new_bat_size, vhd->header.table_offset); + return -EINVAL; + } + + if (vhd_has_batmap(vhd)) { + off = vhd->batmap.header.batmap_offset + new_map_size; + if (vhd_check_for_clobber(vhd, off, 0)) { + EPRINTF("%s: writing new batmap of 0x%"PRIx64" bytes" + " at 0x%08"PRIx64" would clobber data\n", vhd->file, + new_map_size, vhd->batmap.header.batmap_offset); + return -EINVAL; + } + } + + /* update header */ + vhd->header.max_bat_size = new_entries; + err = vhd_write_header(vhd, &vhd->header); + if (err) + return err; + + /* update footer */ + vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size; + vhd->footer.geometry = vhd_chs(vhd->footer.curr_size); + vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + return err; + + /* 
allocate new bat */ + err = posix_memalign((void **)&new_bat.bat, VHD_SECTOR_SIZE, new_bat_size); + if (err) + return -err; + + new_bat.spb = vhd->bat.spb; + new_bat.entries = new_entries; + memcpy(new_bat.bat, vhd->bat.bat, bat_size); + for (i = vhd->bat.entries; i < new_entries; i++) + new_bat.bat[i] = DD_BLK_UNUSED; + + /* write new bat */ + err = vhd_write_bat(vhd, &new_bat); + if (err) { + free(new_bat.bat); + return err; + } + + /* update in-memory bat */ + free(vhd->bat.bat); + vhd->bat = new_bat; + + if (!vhd_has_batmap(vhd)) + return 0; + + /* allocate new batmap */ + err = posix_memalign((void **)&new_batmap.map, + VHD_SECTOR_SIZE, new_map_size); + if (err) + return err; + + new_batmap.header = vhd->batmap.header; + new_batmap.header.batmap_size = secs_round_up_no_zero(new_map_size); + memcpy(new_batmap.map, vhd->batmap.map, map_size); + memset(new_batmap.map + map_size, 0, new_map_size - map_size); + + /* write new batmap */ + err = vhd_write_batmap(vhd, &new_batmap); + if (err) { + free(new_batmap.map); + return err; + } + + /* update in-memory batmap */ + free(vhd->batmap.map); + vhd->batmap = new_batmap; + + return 0; +} + +static int +vhd_dynamic_grow(vhd_journal_t *journal, uint64_t secs) +{ + int i, err; + off64_t eob, eom; + vhd_context_t *vhd; + vhd_block_t first_block; + uint64_t blocks, size_needed; + uint64_t bat_needed, bat_size, bat_avail, bat_bytes, bat_secs; + uint64_t map_needed, map_size, map_avail, map_bytes, map_secs; + + vhd = &journal->vhd; + + size_needed = 0; + bat_needed = 0; + map_needed = 0; + + /* number of vhd blocks to add */ + blocks = secs_to_blocks_up(vhd, secs); + + /* size in bytes needed for new bat entries */ + bat_needed = blocks * sizeof(uint32_t); + map_needed = (blocks >> 3) + 1; + + /* available bytes in current bat */ + bat_bytes = vhd->header.max_bat_size * sizeof(uint32_t); + bat_secs = secs_round_up_no_zero(bat_bytes); + bat_size = vhd_sectors_to_bytes(bat_secs); + bat_avail = bat_size - bat_bytes; + + if 
(vhd_has_batmap(vhd)) { + /* available bytes in current batmap (one bit per bat entry) */ + map_bytes = (vhd->header.max_bat_size + 7) >> 3; + map_secs = vhd->batmap.header.batmap_size; + map_size = vhd_sectors_to_bytes(map_secs); + map_avail = map_size - map_bytes; + } else { + map_needed = 0; + map_avail = 0; + } + + /* we have enough space already; just extend the bat */ + if (bat_needed <= bat_avail && map_needed <= map_avail) + goto add_entries; + + /* we need to add new sectors to the bat */ + if (bat_needed > bat_avail) { + bat_needed -= bat_avail; + bat_needed = vhd_bytes_padded(bat_needed); + } else + bat_needed = 0; + + /* we need to add new sectors to the batmap */ + if (map_needed > map_avail) { + map_needed -= map_avail; + map_needed = vhd_bytes_padded(map_needed); + } else + map_needed = 0; + + /* how many additional bytes do we need? */ + size_needed = bat_needed + map_needed; + + /* calculate space between end of headers and beginning of data */ + err = vhd_end_of_headers(vhd, &eom); + if (err) + return err; + + eob = vhd->header.table_offset + vhd_sectors_to_bytes(bat_secs); + vhd_first_data_block(vhd, &first_block); + + /* no blocks allocated; just shift post-bat metadata */ + if (!first_block.offset) + goto shift_metadata; + + /* + * not enough space -- + * move vhd data blocks to the end of the file to make room + */ + do { + off64_t new_off, bm_size, gap_size; + + new_off = vhd_sectors_to_bytes(vhd_next_block_offset(vhd)); + + /* data region of segment should begin on page boundary */ + bm_size = vhd_sectors_to_bytes(vhd->bm_secs); + if ((new_off + bm_size) % 4096) { + gap_size = 4096 - ((new_off + bm_size) % 4096); + + err = vhd_write_zeros(journal, new_off, gap_size); + if (err) + return err; + + new_off += gap_size; + } + + err = vhd_move_block(journal, first_block.block, new_off); + if (err) + return err; + + vhd_first_data_block(vhd, &first_block); + + } while (eom + size_needed >= vhd_sectors_to_bytes(first_block.offset)); + + 
TEST_FAIL_AT(FAIL_RESIZE_DATA_MOVED); + +shift_metadata: + /* shift any metadata after the bat to make room for new bat sectors */ + err = vhd_shift_metadata(journal, eob, bat_needed, map_needed); + if (err) + return err; + + TEST_FAIL_AT(FAIL_RESIZE_METADATA_MOVED); + +add_entries: + return vhd_add_bat_entries(journal, blocks); +} + +static int +vhd_dynamic_resize(vhd_journal_t *journal, uint64_t size) +{ + int err; + vhd_context_t *vhd; + uint64_t cur_secs, new_secs; + + vhd = &journal->vhd; + cur_secs = vhd->footer.curr_size >> VHD_SECTOR_SHIFT; + new_secs = size << (20 - VHD_SECTOR_SHIFT); + + if (cur_secs == new_secs) + return 0; + + err = vhd_get_header(vhd); + if (err) + return err; + + err = vhd_get_bat(vhd); + if (err) + return err; + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) + return err; + } + + if (cur_secs > new_secs) + err = vhd_dynamic_shrink(journal, cur_secs - new_secs); + else + err = vhd_dynamic_grow(journal, new_secs - cur_secs); + + return err; +} + +static int +vhd_util_resize_check_creator(const char *name) +{ + int err; + vhd_context_t vhd; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_STRICT); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + if (!vhd_creator_tapdisk(&vhd)) { + printf("%s not created by xen; resize not supported\n", name); + err = -EINVAL; + } + + vhd_close(&vhd); + return err; +} + +int +vhd_util_resize(int argc, char **argv) +{ + char *name, *jname; + uint64_t size; + int c, err, jerr; + vhd_journal_t journal; + vhd_context_t *vhd; + + err = -EINVAL; + size = 0; + name = NULL; + jname = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "n:j:s:h")) != -1) { + switch (c) { + case 'n': + name = optarg; + break; + case 'j': + jname = optarg; + break; + case 's': + err = 0; + size = strtoull(optarg, NULL, 10); + break; + case 'h': + default: + goto usage; + } + } + + if (err || !name || !jname || argc != optind) + goto usage; + + err = 
vhd_util_resize_check_creator(name); + if (err) + return err; + + libvhd_set_log_level(1); + err = vhd_journal_create(&journal, name, jname); + if (err) { + printf("creating journal failed: %d\n", err); + return err; + } + + vhd = &journal.vhd; + + err = vhd_get_footer(vhd); + if (err) + goto out; + + TEST_FAIL_AT(FAIL_RESIZE_BEGIN); + + if (vhd_type_dynamic(vhd)) + err = vhd_dynamic_resize(&journal, size); + else + err = vhd_fixed_resize(&journal, size); + + TEST_FAIL_AT(FAIL_RESIZE_END); + +out: + if (err) { + printf("resize failed: %d\n", err); + jerr = vhd_journal_revert(&journal); + } else + jerr = vhd_journal_commit(&journal); + + if (jerr) { + printf("closing journal failed: %d\n", jerr); + vhd_journal_close(&journal); + } else + vhd_journal_remove(&journal); + + return (err ? : jerr); + +usage: + printf("options: <-n name> <-j journal> <-s size (in MB)> [-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap2/vhd/lib/vhd-util-revert.c b/tools/blktap2/vhd/lib/vhd-util-revert.c new file mode 100644 index 0000000000..dab6e8b950 --- /dev/null +++ b/tools/blktap2/vhd/lib/vhd-util-revert.c @@ -0,0 +1,106 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Altering operations: + * + * 1. Change the parent pointer to another file. + * 2. Change the size of the file containing the VHD image. This does NOT + * affect the VHD disk capacity, only the physical size of the file containing + * the VHD. Naturally, it is not possible to set the file size to be less than + * what the VHD utilizes. + * The operation doesn't actually change the file size, but it writes the + * footer in the right location such that resizing the file (manually, as a + * separate step) will produce the correct results. If the new file size is + * greater than the current file size, the file must first be expanded and then + * altered with this operation. If the new size is smaller than the current + * size, the VHD must first be altered with this operation and then the file + * must be shrunk. Failing to resize the file will result in a corrupted VHD. 
+*/ + +#include <errno.h> +//#include <fcntl.h> +#include <stdio.h> +//#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" +#include "libvhd-journal.h" + +int +vhd_util_revert(int argc, char **argv) +{ + char *name, *jname; + vhd_journal_t journal; + int c, err; + + name = NULL; + jname = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "n:j:h")) != -1) { + switch (c) { + case 'n': + name = optarg; + break; + case 'j': + jname = optarg; + break; + case 'h': + default: + goto usage; + } + } + + if (!name || !jname || argc != optind) + goto usage; + + libvhd_set_log_level(1); + err = vhd_journal_open(&journal, name, jname); + if (err) { + printf("opening journal failed: %d\n", err); + return err; + } + + err = vhd_journal_revert(&journal); + if (err) { + printf("reverting journal failed: %d\n", err); + vhd_journal_close(&journal); + return err; + } + + err = vhd_journal_remove(&journal); + if (err) { + printf("removing journal failed: %d\n", err); + vhd_journal_close(&journal); + return err; + } + + return 0; + +usage: + printf("options: <-n name> <-j journal> [-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap2/vhd/lib/vhd-util-scan.c b/tools/blktap2/vhd/lib/vhd-util-scan.c new file mode 100644 index 0000000000..4ecfb52e7d --- /dev/null +++ b/tools/blktap2/vhd/lib/vhd-util-scan.c @@ -0,0 +1,1315 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include <glob.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <fnmatch.h> + +#include "list.h" +#include "libvhd.h" +#include "lvm-util.h" + +#define VHD_SCAN_FAST 0x01 +#define VHD_SCAN_PRETTY 0x02 +#define VHD_SCAN_VOLUME 0x04 +#define VHD_SCAN_NOFAIL 0x08 +#define VHD_SCAN_VERBOSE 0x10 +#define VHD_SCAN_PARENTS 0x20 + +#define VHD_TYPE_RAW_FILE 0x01 +#define VHD_TYPE_VHD_FILE 0x02 +#define VHD_TYPE_RAW_VOLUME 0x04 +#define VHD_TYPE_VHD_VOLUME 0x08 + +static inline int +target_volume(uint8_t type) +{ + return (type == VHD_TYPE_RAW_VOLUME || type == VHD_TYPE_VHD_VOLUME); +} + +static inline int +target_vhd(uint8_t type) +{ + return (type == VHD_TYPE_VHD_FILE || type == VHD_TYPE_VHD_VOLUME); +} + +struct target { + char name[VHD_MAX_NAME_LEN]; + char device[VHD_MAX_NAME_LEN]; + uint64_t size; + uint64_t start; + uint64_t end; + uint8_t type; +}; + +struct iterator { + int cur; + int cur_size; + int max_size; + struct target 
*targets; +}; + +struct vhd_image { + char *name; + char *parent; + uint64_t capacity; + off64_t size; + uint8_t hidden; + int error; + char *message; + + struct target *target; + + struct list_head sibling; + struct list_head children; + struct vhd_image *parent_image; +}; + +struct vhd_scan { + int cur; + int size; + + int lists_cur; + int lists_size; + + struct vhd_image **images; + struct vhd_image **lists; +}; + +static int flags; +static struct vg vg; +static struct vhd_scan scan; + +static int +vhd_util_scan_pretty_allocate_list(int cnt) +{ + int i; + struct vhd_image *list; + + memset(&scan, 0, sizeof(scan)); + + scan.lists_cur = 1; + scan.lists_size = 10; + + scan.lists = calloc(scan.lists_size, sizeof(struct vhd_image *)); + if (!scan.lists) + goto fail; + + scan.lists[0] = calloc(cnt, sizeof(struct vhd_image)); + if (!scan.lists[0]) + goto fail; + + scan.images = calloc(cnt, sizeof(struct vhd_image *)); + if (!scan.images) + goto fail; + + for (i = 0; i < cnt; i++) + scan.images[i] = scan.lists[0] + i; + + scan.cur = 0; + scan.size = cnt; + + return 0; + +fail: + if (scan.lists) { + free(scan.lists[0]); + free(scan.lists); + } + + free(scan.images); + memset(&scan, 0, sizeof(scan)); + return -ENOMEM; +} + +static void +vhd_util_scan_pretty_free_list(void) +{ + int i; + + if (scan.lists) { + for (i = 0; i < scan.lists_cur; i++) + free(scan.lists[i]); + free(scan.lists); + } + + free(scan.images); + memset(&scan, 0, sizeof(scan)); +} + +static int +vhd_util_scan_pretty_add_image(struct vhd_image *image) +{ + int i; + struct vhd_image *img; + + for (i = 0; i < scan.cur; i++) { + img = scan.images[i]; + if (!strcmp(img->name, image->name)) + return 0; + } + + if (scan.cur >= scan.size) { + struct vhd_image *new, **list; + + if (scan.lists_cur >= scan.lists_size) { + list = realloc(scan.lists, scan.lists_size * 2 * + sizeof(struct vhd_image *)); + if (!list) + return -ENOMEM; + + scan.lists_size *= 2; + scan.lists = list; + } + + new = calloc(scan.size, 
sizeof(struct vhd_image)); + if (!new) + return -ENOMEM; + + scan.lists[scan.lists_cur++] = new; + scan.size *= 2; + + list = realloc(scan.images, scan.size * + sizeof(struct vhd_image *)); + if (!list) + return -ENOMEM; + + scan.images = list; + for (i = 0; i + scan.cur < scan.size; i++) + scan.images[i + scan.cur] = new + i; + } + + img = scan.images[scan.cur]; + INIT_LIST_HEAD(&img->sibling); + INIT_LIST_HEAD(&img->children); + + img->capacity = image->capacity; + img->size = image->size; + img->hidden = image->hidden; + img->error = image->error; + img->message = image->message; + + img->name = strdup(image->name); + if (!img->name) + goto fail; + + if (image->parent) { + img->parent = strdup(image->parent); + if (!img->parent) + goto fail; + } + + scan.cur++; + return 0; + +fail: + free(img->name); + free(img->parent); + memset(img, 0, sizeof(*img)); + return -ENOMEM; +} + +static int +vhd_util_scan_pretty_image_compare(const void *lhs, const void *rhs) +{ + struct vhd_image *l, *r; + + l = *(struct vhd_image **)lhs; + r = *(struct vhd_image **)rhs; + + return strcmp(l->name, r->name); +} + +static void +vhd_util_scan_print_image_indent(struct vhd_image *image, int tab) +{ + char *pad, *name, *pmsg, *parent; + + pad = (tab ? " " : ""); + name = image->name; + parent = (image->parent ? 
: "none"); + + if ((flags & VHD_SCAN_PRETTY) && image->parent && !image->parent_image) + pmsg = " (not found in scan)"; + else + pmsg = ""; + + if (!(flags & VHD_SCAN_VERBOSE)) { + name = basename(image->name); + if (image->parent) + parent = basename(image->parent); + } + + if (image->error) + printf("%*svhd=%s scan-error=%d error-message='%s'\n", + tab, pad, image->name, image->error, image->message); + else + printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u " + "parent=%s%s\n", tab, pad, name, image->capacity, + image->size, image->hidden, parent, pmsg); +} + +static void +vhd_util_scan_pretty_print_tree(struct vhd_image *image, int depth) +{ + struct vhd_image *img, *tmp; + + vhd_util_scan_print_image_indent(image, depth * 3); + + list_for_each_entry_safe(img, tmp, &image->children, sibling) + if (!img->hidden) + vhd_util_scan_pretty_print_tree(img, depth + 1); + + list_for_each_entry_safe(img, tmp, &image->children, sibling) + if (img->hidden) + vhd_util_scan_pretty_print_tree(img, depth + 1); + + free(image->name); + free(image->parent); + + image->name = NULL; + image->parent = NULL; +} + +static void +vhd_util_scan_pretty_print_images(void) +{ + int i; + struct vhd_image *image, **parentp, *parent, *keyp, key; + + qsort(scan.images, scan.cur, sizeof(scan.images[0]), + vhd_util_scan_pretty_image_compare); + + for (i = 0; i < scan.cur; i++) { + image = scan.images[i]; + + if (!image->parent) { + image->parent_image = NULL; + continue; + } + + memset(&key, 0, sizeof(key)); + key.name = image->parent; + keyp = &key; + + parentp = bsearch(&keyp, scan.images, scan.cur, + sizeof(scan.images[0]), + vhd_util_scan_pretty_image_compare); + if (!parentp) { + image->parent_image = NULL; + continue; + } + + parent = *parentp; + image->parent_image = parent; + list_add_tail(&image->sibling, &parent->children); + } + + for (i = 0; i < scan.cur; i++) { + image = scan.images[i]; + + if (image->parent_image || !image->hidden) + continue; + + 
vhd_util_scan_pretty_print_tree(image, 0); + } + + for (i = 0; i < scan.cur; i++) { + image = scan.images[i]; + + if (!image->name || image->parent_image) + continue; + + vhd_util_scan_pretty_print_tree(image, 0); + } + + for (i = 0; i < scan.cur; i++) { + image = scan.images[i]; + + if (!image->name) + continue; + + vhd_util_scan_pretty_print_tree(image, 0); + } +} + +static void +vhd_util_scan_print_image(struct vhd_image *image) +{ + int err; + + if (!image->error && (flags & VHD_SCAN_PRETTY)) { + err = vhd_util_scan_pretty_add_image(image); + if (!err) + return; + + if (!image->error) { + image->error = err; + image->message = "allocating memory"; + } + } + + vhd_util_scan_print_image_indent(image, 0); +} + +static int +vhd_util_scan_error(const char *file, int err) +{ + struct vhd_image image; + + memset(&image, 0, sizeof(image)); + image.name = (char *)file; + image.error = err; + image.message = "failure scanning target"; + + vhd_util_scan_print_image(&image); + + /* + if (flags & VHD_SCAN_NOFAIL) + return 0; + */ + + return err; +} + +static vhd_parent_locator_t * +vhd_util_scan_get_parent_locator(vhd_context_t *vhd) +{ + int i; + vhd_parent_locator_t *loc; + + loc = NULL; + + for (i = 0; i < 8; i++) { + if (vhd->header.loc[i].code == PLAT_CODE_MACX) { + loc = vhd->header.loc + i; + break; + } + + if (vhd->header.loc[i].code == PLAT_CODE_W2RU) + loc = vhd->header.loc + i; + + if (!loc && vhd->header.loc[i].code != PLAT_CODE_NONE) + loc = vhd->header.loc + i; + } + + return loc; +} + +static inline int +copy_name(char *dst, const char *src) +{ + if (snprintf(dst, VHD_MAX_NAME_LEN, "%s", src) < VHD_MAX_NAME_LEN) + return 0; + + return -ENAMETOOLONG; +} + +/* + * LVHD stores realpath(parent) in parent locators, so + * /dev/<vol-group>/<lv-name> becomes /dev/mapper/<vol--group>-<lv--name> + */ +static int +vhd_util_scan_extract_volume_name(char *dst, const char *src) +{ + int err; + char copy[VHD_MAX_NAME_LEN], *name, *s, *c; + + name = strrchr(src, '/'); + if 
(!name) + name = (char *)src; + + /* convert single dashes to slashes, double dashes to single dashes */ + for (c = copy, s = name; *s != '\0'; s++, c++) { + if (*s == '-') { + if (s[1] != '-') + *c = '/'; + else { + s++; + *c = '-'; + } + } else + *c = *s; + } + + *c = '\0'; + c = strrchr(copy, '/'); + if (c == name) { + /* unrecognized format */ + strcpy(dst, src); + return -EINVAL; + } + + strcpy(dst, ++c); + return 0; +} + +static int +vhd_util_scan_get_volume_parent(vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + char name[VHD_MAX_NAME_LEN]; + vhd_parent_locator_t *loc, copy; + + if (flags & VHD_SCAN_FAST) { + err = vhd_header_decode_parent(vhd, + &vhd->header, &image->parent); + if (!err) + goto found; + } + + loc = vhd_util_scan_get_parent_locator(vhd); + if (!loc) + return -EINVAL; + + copy = *loc; + copy.data_offset += image->target->start; + err = vhd_parent_locator_read(vhd, ©, &image->parent); + if (err) + return err; + +found: + err = vhd_util_scan_extract_volume_name(name, image->parent); + if (!err) + return copy_name(image->parent, name); + + return 0; +} + +static int +vhd_util_scan_get_parent(vhd_context_t *vhd, struct vhd_image *image) +{ + int i, err; + vhd_parent_locator_t *loc; + + if (!target_vhd(image->target->type)) { + image->parent = NULL; + return 0; + } + + loc = NULL; + + if (target_volume(image->target->type)) + return vhd_util_scan_get_volume_parent(vhd, image); + + if (flags & VHD_SCAN_FAST) { + err = vhd_header_decode_parent(vhd, + &vhd->header, &image->parent); + if (!err) + return 0; + } else { + /* + * vhd_parent_locator_get checks for the existence of the + * parent file. if this call succeeds, all is well; if not, + * we'll try to return whatever string we have before failing + * outright. 
+ */ + err = vhd_parent_locator_get(vhd, &image->parent); + if (!err) + return 0; + } + + loc = vhd_util_scan_get_parent_locator(vhd); + if (!loc) + return -EINVAL; + + return vhd_parent_locator_read(vhd, loc, &image->parent); +} + +static int +vhd_util_scan_get_hidden(vhd_context_t *vhd, struct vhd_image *image) +{ + int err, hidden; + + err = 0; + hidden = 0; + + if (target_vhd(image->target->type)) + err = vhd_hidden(vhd, &hidden); + else + hidden = 1; + + if (err) + return err; + + image->hidden = hidden; + return 0; +} + +static int +vhd_util_scan_get_size(vhd_context_t *vhd, struct vhd_image *image) +{ + image->size = image->target->size; + + if (target_vhd(image->target->type)) + image->capacity = vhd->footer.curr_size; + else + image->capacity = image->size; + + return 0; +} + +static int +vhd_util_scan_open_file(vhd_context_t *vhd, struct vhd_image *image) +{ + int err, vhd_flags; + + if (!target_vhd(image->target->type)) + return 0; + + vhd_flags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED; + if (flags & VHD_SCAN_FAST) + vhd_flags |= VHD_OPEN_FAST; + + err = vhd_open(vhd, image->name, vhd_flags); + if (err) { + vhd->file = NULL; + image->message = "opening file"; + image->error = err; + return image->error; + } + + return 0; +} + +static int +vhd_util_scan_read_volume_headers(vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + char *buf; + size_t size; + struct target *target; + + buf = NULL; + target = image->target; + size = sizeof(vhd_footer_t) + sizeof(vhd_header_t); + + err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size); + if (err) { + buf = NULL; + image->message = "allocating image"; + image->error = -err; + goto out; + } + + err = vhd_seek(vhd, target->start, SEEK_SET); + if (err) { + image->message = "seeking to headers"; + image->error = err; + goto out; + } + + err = vhd_read(vhd, buf, size); + if (err) { + image->message = "reading headers"; + image->error = err; + goto out; + } + + memcpy(&vhd->footer, buf, 
sizeof(vhd_footer_t)); + vhd_footer_in(&vhd->footer); + err = vhd_validate_footer(&vhd->footer); + if (err) { + image->message = "invalid footer"; + image->error = err; + goto out; + } + + /* lvhd vhds should always be dynamic */ + if (vhd_type_dynamic(vhd)) { + if (vhd->footer.data_offset != sizeof(vhd_footer_t)) + err = vhd_read_header_at(vhd, &vhd->header, + vhd->footer.data_offset + + target->start); + else { + memcpy(&vhd->header, + buf + sizeof(vhd_footer_t), + sizeof(vhd_header_t)); + vhd_header_in(&vhd->header); + err = vhd_validate_header(&vhd->header); + } + + if (err) { + image->message = "reading header"; + image->error = err; + goto out; + } + + vhd->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT; + vhd->bm_secs = secs_round_up_no_zero(vhd->spb >> 3); + } + +out: + free(buf); + return image->error; +} + +static int +vhd_util_scan_open_volume(vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + struct target *target; + + target = image->target; + memset(vhd, 0, sizeof(*vhd)); + vhd->oflags = VHD_OPEN_RDONLY | VHD_OPEN_FAST; + + if (target->end - target->start < 4096) { + image->message = "device too small"; + image->error = -EINVAL; + return image->error; + } + + vhd->file = strdup(image->name); + if (!vhd->file) { + image->message = "allocating device"; + image->error = -ENOMEM; + return image->error; + } + + vhd->fd = open(target->device, O_RDONLY | O_DIRECT | O_LARGEFILE); + if (vhd->fd == -1) { + free(vhd->file); + vhd->file = NULL; + + image->message = "opening device"; + image->error = -errno; + return image->error; + } + + if (target_vhd(target->type)) + return vhd_util_scan_read_volume_headers(vhd, image); + + return 0; +} + +static int +vhd_util_scan_open(vhd_context_t *vhd, struct vhd_image *image) +{ + struct target *target; + + target = image->target; + + if (target_volume(image->target->type) || !(flags & VHD_SCAN_PRETTY)) + image->name = target->name; + else { + image->name = realpath(target->name, NULL); + if (!image->name) { + 
image->name = target->name; + image->message = "resolving name"; + image->error = -errno; + return image->error; + } + } + + if (target_volume(target->type)) + return vhd_util_scan_open_volume(vhd, image); + else + return vhd_util_scan_open_file(vhd, image); +} + +static int +vhd_util_scan_init_file_target(struct target *target, + const char *file, uint8_t type) +{ + int err; + struct stat stats; + + err = stat(file, &stats); + if (err == -1) + return -errno; + + err = copy_name(target->name, file); + if (err) + return err; + + err = copy_name(target->device, file); + if (err) + return err; + + target->type = type; + target->start = 0; + target->size = stats.st_size; + target->end = stats.st_size; + + return 0; +} + +static int +vhd_util_scan_init_volume_target(struct target *target, + struct lv *lv, uint8_t type) +{ + int err; + + if (lv->first_segment.type != LVM_SEG_TYPE_LINEAR) + return -ENOSYS; + + err = copy_name(target->name, lv->name); + if (err) + return err; + + err = copy_name(target->device, lv->first_segment.device); + if (err) + return err; + + target->type = type; + target->size = lv->size; + target->start = lv->first_segment.pe_start; + target->end = target->start + lv->first_segment.pe_size; + + return 0; +} + +static int +iterator_init(struct iterator *itr, int cnt, struct target *targets) +{ + memset(itr, 0, sizeof(*itr)); + + itr->targets = malloc(sizeof(struct target) * cnt); + if (!itr->targets) + return -ENOMEM; + + memcpy(itr->targets, targets, sizeof(struct target) * cnt); + + itr->cur = 0; + itr->cur_size = cnt; + itr->max_size = cnt; + + return 0; +} + +static struct target * +iterator_next(struct iterator *itr) +{ + if (itr->cur == itr->cur_size) + return NULL; + + return itr->targets + itr->cur++; +} + +static int +iterator_add_file(struct iterator *itr, + struct target *target, const char *parent, uint8_t type) +{ + int i; + struct target *t; + char *lname, *rname; + + for (i = 0; i < itr->cur_size; i++) { + t = itr->targets + i; + 
lname = basename((char *)t->name); + rname = basename((char *)parent); + + if (!strcmp(lname, rname)) + return -EEXIST; + } + + return vhd_util_scan_init_file_target(target, parent, type); +} + +/* + * fill @target from the first logical volume in the scanned volume group + * whose name matches @parent. returns -EEXIST if a target with that name + * is already in the iterator and -ENOENT if no volume matches + */ +static int +iterator_add_volume(struct iterator *itr, + struct target *target, const char *parent, uint8_t type) +{ + int i, err; + struct lv *lv; + + lv = NULL; + err = -ENOENT; + + for (i = 0; i < itr->cur_size; i++) + if (!strcmp(parent, itr->targets[i].name)) + return -EEXIST; + + for (i = 0; i < vg.lv_cnt; i++) { + err = fnmatch(parent, vg.lvs[i].name, FNM_PATHNAME); + if (err != FNM_NOMATCH) { + lv = vg.lvs + i; + break; + } + } + + /* + * FNM_NOMATCH only means that no volume matched, which is reported + * as -ENOENT below. any other non-zero value (an fnmatch failure, + * or the initial -ENOENT when the group has no volumes) is returned + * as is. FNM_PATHNAME is a flag, not a return code, so it must not + * be used in this comparison + */ + if (err && err != FNM_NOMATCH) + return err; + + if (!lv) + return -ENOENT; + + return vhd_util_scan_init_volume_target(target, lv, type); +} + +/* + * append @parent to the iterator's target array, doubling the array when + * it is full. a duplicate (-EEXIST) is silently ignored and reported as + * success; on any other failure the unused slot is zeroed again + */ +static int +iterator_add(struct iterator *itr, const char *parent, uint8_t type) +{ + int err; + struct target *target; + + if (itr->cur_size == itr->max_size) { + struct target *new; + + new = realloc(itr->targets, + sizeof(struct target) * + itr->max_size * 2); + if (!new) + return -ENOMEM; + + itr->max_size *= 2; + itr->targets = new; + } + + target = itr->targets + itr->cur_size; + + if (target_volume(type)) + err = iterator_add_volume(itr, target, parent, type); + else + err = iterator_add_file(itr, target, parent, type); + + if (err) + memset(target, 0, sizeof(*target)); + else + itr->cur_size++; + + return (err == -EEXIST ? 0 : err); +} + +static void +iterator_free(struct iterator *itr) +{ + free(itr->targets); + memset(itr, 0, sizeof(*itr)); +} + +static void +vhd_util_scan_add_parent(struct iterator *itr, + vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + uint8_t type; + + if (vhd_parent_raw(vhd)) + type = target_volume(image->target->type) ? + VHD_TYPE_RAW_VOLUME : VHD_TYPE_RAW_FILE; + else + type = target_volume(image->target->type) ? 
+ VHD_TYPE_VHD_VOLUME : VHD_TYPE_VHD_FILE; + + err = iterator_add(itr, image->parent, type); + if (err) + vhd_util_scan_error(image->parent, err); +} + +static int +vhd_util_scan_targets(int cnt, struct target *targets) +{ + int ret, err; + vhd_context_t vhd; + struct iterator itr; + struct target *target; + struct vhd_image image; + + ret = 0; + err = 0; + + err = iterator_init(&itr, cnt, targets); + if (err) + return err; + + while ((target = iterator_next(&itr))) { + memset(&vhd, 0, sizeof(vhd)); + memset(&image, 0, sizeof(image)); + + image.target = target; + + err = vhd_util_scan_open(&vhd, &image); + if (err) { + ret = -EAGAIN; + goto end; + } + + err = vhd_util_scan_get_size(&vhd, &image); + if (err) { + ret = -EAGAIN; + image.message = "getting physical size"; + image.error = err; + goto end; + } + + err = vhd_util_scan_get_hidden(&vhd, &image); + if (err) { + ret = -EAGAIN; + image.message = "checking 'hidden' field"; + image.error = err; + goto end; + } + + if (vhd.footer.type == HD_TYPE_DIFF) { + err = vhd_util_scan_get_parent(&vhd, &image); + if (err) { + ret = -EAGAIN; + image.message = "getting parent"; + image.error = err; + goto end; + } + } + + end: + vhd_util_scan_print_image(&image); + + if (flags & VHD_SCAN_PARENTS && image.parent) + vhd_util_scan_add_parent(&itr, &vhd, &image); + + if (vhd.file) + vhd_close(&vhd); + if (image.name != target->name) + free(image.name); + free(image.parent); + + if (err && !(flags & VHD_SCAN_NOFAIL)) + break; + } + + iterator_free(&itr); + + if (flags & VHD_SCAN_NOFAIL) + return ret; + + return err; +} + +static int +vhd_util_scan_targets_pretty(int cnt, struct target *targets) +{ + int err; + + err = vhd_util_scan_pretty_allocate_list(cnt); + if (err) { + printf("scan failed: no memory\n"); + return -ENOMEM; + } + + err = vhd_util_scan_targets(cnt, targets); + + vhd_util_scan_pretty_print_images(); + vhd_util_scan_pretty_free_list(); + + return ((flags & VHD_SCAN_NOFAIL) ? 
0 : err); +} + +/* + * build the target array for a file scan: first every path matching the + * glob pattern @filter (if any), then the @cnt explicit @names. on success + * the array and its length are handed back through @_targets and @_total + * and the caller owns the array + */ +static int +vhd_util_scan_find_file_targets(int cnt, char **names, + const char *filter, + struct target **_targets, int *_total) +{ + glob_t g; + struct target *targets; + int i, globs, err, total; + + total = cnt; + globs = 0; + *_total = 0; + *_targets = NULL; + + memset(&g, 0, sizeof(g)); + + if (filter) { + int gflags = ((flags & VHD_SCAN_FAST) ? GLOB_NOSORT : 0); + + errno = 0; + err = glob(filter, gflags, vhd_util_scan_error, &g); + + switch (err) { + case GLOB_NOSPACE: + err = -ENOMEM; + break; + case GLOB_ABORTED: + err = -EIO; + break; + case GLOB_NOMATCH: + err = -errno; + break; + } + + if (err) { + vhd_util_scan_error(filter, err); + /* a failed glob may still have filled in partial results */ + globfree(&g); + return err; + } + + globs = g.gl_pathc; + total += globs; + } + + targets = calloc(total, sizeof(struct target)); + if (!targets) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < g.gl_pathc; i++) { + err = vhd_util_scan_init_file_target(targets + i, + g.gl_pathv[i], + VHD_TYPE_VHD_FILE); + if (err) { + vhd_util_scan_error(g.gl_pathv[i], err); + if (!(flags & VHD_SCAN_NOFAIL)) + goto out; + } + } + + for (i = 0; i + globs < total; i++) { + err = vhd_util_scan_init_file_target(targets + i + globs, + names[i], + VHD_TYPE_VHD_FILE); + if (err) { + vhd_util_scan_error(names[i], err); + if (!(flags & VHD_SCAN_NOFAIL)) + goto out; + } + } + + err = 0; + *_total = total; + *_targets = targets; + +out: + if (err) + free(targets); + if (filter) + globfree(&g); + + return err; +} + +/* exchange the entries at indices @dst and @src of @lvs */ +static inline void +swap_volume(struct lv *lvs, int dst, int src) +{ + struct lv copy, *ldst, *lsrc; + + if (dst == src) + return; + + lsrc = lvs + src; + ldst = lvs + dst; + + memcpy(&copy, ldst, sizeof(copy)); + memcpy(ldst, lsrc, sizeof(*ldst)); + memcpy(lsrc, &copy, sizeof(copy)); +} + +static int +vhd_util_scan_sort_volumes(struct lv *lvs, int cnt, + const char *filter, int *_matches) +{ + struct lv *lv; + int i, err, matches; + + matches = 0; + *_matches = 0; + + if (!filter) + return 0; + + for (i = 0; i < cnt; i++) { + lv = lvs + i; 
+ + err = fnmatch(filter, lv->name, FNM_PATHNAME); + if (err) { + if (err != FNM_NOMATCH) { + vhd_util_scan_error(lv->name, err); + if (!(flags & VHD_SCAN_NOFAIL)) + return err; + } + + continue; + } + + swap_volume(lvs, matches++, i); + } + + *_matches = matches; + return 0; +} + +static int +vhd_util_scan_find_volume_targets(int cnt, char **names, + const char *volume, const char *filter, + struct target **_targets, int *_total) +{ + struct target *targets; + int i, err, total, matches; + + *_total = 0; + *_targets = NULL; + targets = NULL; + + err = lvm_scan_vg(volume, &vg); + if (err) + return err; + + err = vhd_util_scan_sort_volumes(vg.lvs, vg.lv_cnt, + filter, &matches); + if (err) + goto out; + + total = matches; + for (i = 0; i < cnt; i++) { + err = vhd_util_scan_sort_volumes(vg.lvs + total, + vg.lv_cnt - total, + names[i], &matches); + if (err) + goto out; + + total += matches; + } + + targets = calloc(total, sizeof(struct target)); + if (!targets) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < total; i++) { + err = vhd_util_scan_init_volume_target(targets + i, + vg.lvs + i, + VHD_TYPE_VHD_VOLUME); + if (err) { + vhd_util_scan_error(vg.lvs[i].name, err); + if (!(flags & VHD_SCAN_NOFAIL)) + goto out; + } + } + + err = 0; + *_total = total; + *_targets = targets; + +out: + if (err) + free(targets); + return err; +} + +static int +vhd_util_scan_find_targets(int cnt, char **names, + const char *volume, const char *filter, + struct target **targets, int *total) +{ + if (flags & VHD_SCAN_VOLUME) + return vhd_util_scan_find_volume_targets(cnt, names, + volume, filter, + targets, total); + return vhd_util_scan_find_file_targets(cnt, names, + filter, targets, total); +} + +int +vhd_util_scan(int argc, char **argv) +{ + int c, ret, err, cnt; + char *filter, *volume; + struct target *targets; + + cnt = 0; + ret = 0; + err = 0; + flags = 0; + filter = NULL; + volume = NULL; + targets = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "m:fcl:pavh")) != 
-1) { + switch (c) { + case 'm': + filter = optarg; + break; + case 'f': + flags |= VHD_SCAN_FAST; + break; + case 'c': + flags |= VHD_SCAN_NOFAIL; + break; + case 'l': + volume = optarg; + flags |= VHD_SCAN_VOLUME; + break; + case 'p': + flags |= VHD_SCAN_PRETTY; + break; + case 'a': + flags |= VHD_SCAN_PARENTS; + break; + case 'v': + flags |= VHD_SCAN_VERBOSE; + break; + case 'h': + goto usage; + default: + err = -EINVAL; + goto usage; + } + } + + if (!filter && argc - optind == 0) { + err = -EINVAL; + goto usage; + } + + if (flags & VHD_SCAN_PRETTY) + flags &= ~VHD_SCAN_FAST; + + err = vhd_util_scan_find_targets(argc - optind, argv + optind, + volume, filter, &targets, &cnt); + if (err) { + printf("scan failed: %d\n", err); + return err; + } + + if (!cnt) + return 0; + + if (flags & VHD_SCAN_PRETTY) + err = vhd_util_scan_targets_pretty(cnt, targets); + else + err = vhd_util_scan_targets(cnt, targets); + + free(targets); + lvm_free_vg(&vg); + + return ((flags & VHD_SCAN_NOFAIL) ? 0 : err); + +usage: + printf("usage: [OPTIONS] FILES\n" + "options: [-m match filter] [-f fast] [-c continue on failure] " + "[-l LVM volume] [-p pretty print] [-a scan parents] " + "[-v verbose] [-h help]\n"); + return err; +} diff --git a/tools/blktap2/vhd/lib/vhd-util-set-field.c b/tools/blktap2/vhd/lib/vhd-util-set-field.c new file mode 100644 index 0000000000..ac185735d9 --- /dev/null +++ b/tools/blktap2/vhd/lib/vhd-util-set-field.c @@ -0,0 +1,106 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +int +vhd_util_set_field(int argc, char **argv) +{ + long value; + int err, c; + off64_t eof; + vhd_context_t vhd; + char *name, *field; + + err = -EINVAL; + value = 0; + name = NULL; + field = NULL; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:f:v:h")) != -1) { + switch (c) { + case 'n': + name = optarg; + break; + case 'f': + field = optarg; + break; + case 'v': + err = 0; + value = strtol(optarg, NULL, 10); + break; + case 'h': + default: + goto usage; + } + } + + if (!name || !field || optind != argc || err) + goto usage; + + if (strnlen(field, 25) >= 25) { + printf("invalid field\n"); + goto usage; + } + + if (strcmp(field, "hidden")) { + printf("invalid field %s\n", field); + goto usage; + } + + if (value < 0 || value > 255) { + printf("invalid value %ld\n", value); + goto usage; + } + + err = vhd_open(&vhd, name, VHD_OPEN_RDWR); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + vhd.footer.hidden = (char)value; + + err = vhd_write_footer(&vhd, &vhd.footer); + + done: + vhd_close(&vhd); + return err; + +usage: + printf("options: <-n name> <-f field> <-v value> [-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap2/vhd/lib/vhd-util-snapshot.c b/tools/blktap2/vhd/lib/vhd-util-snapshot.c new file mode 100644 index 0000000000..75960f96ea --- /dev/null +++ b/tools/blktap2/vhd/lib/vhd-util-snapshot.c @@ -0,0 +1,216 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#include <errno.h> +#include <stdio.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +static int +vhd_util_find_snapshot_target(const char *name, char **result, int *parent_raw) +{ + int i, err; + char *target; + vhd_context_t vhd; + + *parent_raw = 0; + *result = NULL; + + target = strdup(name); + if (!target) + return -ENOMEM; + + for (;;) { + err = vhd_open(&vhd, target, VHD_OPEN_RDONLY); + if (err) + return err; + + if (vhd.footer.type != HD_TYPE_DIFF) + goto out; + + err = vhd_get_bat(&vhd); + if (err) + goto out; + + for (i = 0; i < vhd.bat.entries; i++) + if (vhd.bat.bat[i] != DD_BLK_UNUSED) + goto out; + + free(target); + err = vhd_parent_locator_get(&vhd, &target); + if (err) + goto out; + + if (vhd_parent_raw(&vhd)) { + *parent_raw = 1; + goto out; + } + + vhd_close(&vhd); + } + +out: + vhd_close(&vhd); + if (err) + free(target); + else + *result = target; + + return err; +} + +static int +vhd_util_check_depth(const char *name, int *depth) +{ + int err; + vhd_context_t vhd; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); + if (err) + return err; + + err = vhd_chain_depth(&vhd, depth); + vhd_close(&vhd); + + return err; +} + +int +vhd_util_snapshot(int argc, char **argv) +{ + vhd_flag_creat_t flags; + int c, err, prt_raw, limit; + char *name, *pname, *ppath, *backing; + uint64_t size; + vhd_context_t vhd; + + name = NULL; + pname = NULL; + ppath = NULL; + backing = NULL; + size = 0; + flags = 0; + limit = 0; + + if (!argc || !argv) { + err = -EINVAL; + goto usage; + } + + optind = 0; + while ((c = getopt(argc, argv, "n:p:l:mh")) != -1) { + switch (c) { + case 'n': + name = optarg; + break; + case 'p': + pname = optarg; + break; + case 'l': + limit = strtol(optarg, NULL, 10); + break; + case 'm': + vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW); + break; + case 'h': + err = 0; + goto usage; + default: + err = -EINVAL; + goto usage; + } + } + + if (!name || !pname || optind != argc) { + err = -EINVAL; + goto 
usage; + } + + ppath = realpath(pname, NULL); + if (!ppath) + return -errno; + + if (vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) { + backing = strdup(ppath); + if (!backing) { + err = -ENOMEM; + goto out; + } + } else { + err = vhd_util_find_snapshot_target(ppath, &backing, &prt_raw); + if (err) { + backing = NULL; + goto out; + } + + /* + * if the sizes of the parent chain are non-uniform, we need to + * pick the right size: that of the supplied parent + */ + if (strcmp(ppath, backing)) { + err = vhd_open(&vhd, ppath, VHD_OPEN_RDONLY); + if (err) + goto out; + size = vhd.footer.curr_size; + vhd_close(&vhd); + } + + if (prt_raw) + vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW); + } + + if (limit && !vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) { + int depth; + + err = vhd_util_check_depth(backing, &depth); + if (err) + printf("error checking snapshot depth: %d\n", err); + else if (depth + 1 > limit) { + err = -ENOSPC; + printf("snapshot depth exceeded: " + "current depth: %d, limit: %d\n", depth, limit); + } + + if (err) + goto out; + } + + err = vhd_snapshot(name, size, backing, flags); + +out: + free(ppath); + free(backing); + + return err; + +usage: + printf("options: <-n name> <-p parent name> [-l snapshot depth limit]" + " [-m parent_is_raw] [-h help]\n"); + return err; +} diff --git a/tools/blktap2/vhd/vhd-update.c b/tools/blktap2/vhd/vhd-update.c new file mode 100644 index 0000000000..fbc23cc7ae --- /dev/null +++ b/tools/blktap2/vhd/vhd-update.c @@ -0,0 +1,261 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Before updating a VHD file, we create a journal consisting of: + * - all data at the beginning of the file, up to and including the BAT + * - each allocated bitmap (existing at the same offset in the journal as + * its corresponding bitmap in the original file) + * Updates are performed in place by writing appropriately + * transformed versions of journaled bitmaps to the original file. 
+ */ +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <endian.h> +#include <byteswap.h> + +#include "atomicio.h" +#include "libvhd.h" +#include "libvhd-journal.h" + +static void +usage(void) +{ + printf("usage: vhd-update <-n name> [-j existing journal] [-h]\n"); + exit(EINVAL); +} + +/* + * update vhd creator version to reflect its new bitmap ordering + */ +static inline int +update_creator_version(vhd_journal_t *journal) +{ + journal->vhd.footer.crtr_ver = VHD_VERSION(1, 1); + return vhd_write_footer(&journal->vhd, &journal->vhd.footer); +} + +static int +journal_bitmaps(vhd_journal_t *journal) +{ + int i, err; + + for (i = 0; i < journal->vhd.bat.entries; i++) { + err = vhd_journal_add_block(journal, i, VHD_JOURNAL_METADATA); + if (err) + return err; + } + + return 0; +} + +/* + * older VHD bitmaps were little endian + * and bits within a word were set from right to left + */ +static inline int +old_test_bit(int nr, volatile void * addr) +{ + return (((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >> + (nr % (sizeof(unsigned long)*8))) & 1; +} + +/* + * new VHD bitmaps are big endian + * and bits within a word are set from left to right + */ +#define BIT_MASK 0x80 +static inline void +new_set_bit (int nr, volatile char *addr) +{ + addr[nr >> 3] |= (BIT_MASK >> (nr & 7)); +} + +static void +convert_bitmap(char *in, char *out, int bytes) +{ + int i; + + memset(out, 0, bytes); + + for (i = 0; i < bytes << 3; i++) + if (old_test_bit(i, (void *)in)) + new_set_bit(i, out); +} + +static int +update_vhd(vhd_journal_t *journal, int rollback) +{ + int i, err; + size_t size; + char *buf, *converted; + + buf = NULL; + converted = NULL; + + size = vhd_bytes_padded(journal->vhd.spb / 8); + err = posix_memalign((void **)&converted, 512, size); + if (err) { + converted = NULL; + goto out; + } + + for (i = 0; i < journal->vhd.bat.entries; i++) { + if (journal->vhd.bat.bat[i] == DD_BLK_UNUSED) + continue; + + err 
= vhd_read_bitmap(&journal->vhd, i, &buf); + if (err) + goto out; + + if (rollback) + memcpy(converted, buf, size); + else + convert_bitmap(buf, converted, size); + + free(buf); + + err = vhd_write_bitmap(&journal->vhd, i, converted); + if (err) + goto out; + } + + err = 0; + out: + free(converted); + return err; +} + +static int +open_journal(vhd_journal_t *journal, const char *file, const char *jfile) +{ + int err; + + err = vhd_journal_create(journal, file, jfile); + if (err) { + printf("error creating journal for %s: %d\n", file, err); + return err; + } + + return 0; +} + +static int +close_journal(vhd_journal_t *journal, int err) +{ + if (err) + err = vhd_journal_revert(journal); + else + err = vhd_journal_commit(journal); + + if (err) + return vhd_journal_close(journal); + else + return vhd_journal_remove(journal); +} + +int +main(int argc, char **argv) +{ + char *file, *jfile; + int c, err, rollback; + vhd_journal_t journal; + + file = NULL; + jfile = NULL; + rollback = 0; + + while ((c = getopt(argc, argv, "n:j:rh")) != -1) { + switch(c) { + case 'n': + file = optarg; + break; + case 'j': + jfile = optarg; + err = access(jfile, R_OK); + if (err == -1) { + printf("invalid journal arg %s\n", jfile); + return -errno; + } + break; + case 'r': + /* add a rollback option for debugging which + * pushes journalled bitmaps to original file + * without transforming them */ + rollback = 1; + break; + default: + usage(); + } + } + + if (!file) + usage(); + + if (rollback && !jfile) { + printf("rollback requires a journal argument\n"); + usage(); + } + + err = open_journal(&journal, file, jfile); + if (err) + return err; + + if (!vhd_creator_tapdisk(&journal.vhd) || + journal.vhd.footer.crtr_ver != VHD_VERSION(0, 1) || + journal.vhd.footer.type == HD_TYPE_FIXED) { + err = 0; + goto out; + } + + err = journal_bitmaps(&journal); + if (err) { + /* no changes to vhd file yet, + * so close the journal and bail */ + vhd_journal_close(&journal); + return err; + } + + err = 
update_vhd(&journal, rollback); + if (err) { + printf("update failed: %d; saving journal\n", err); + goto out; + } + + err = update_creator_version(&journal); + if (err) { + printf("failed to udpate creator version: %d\n", err); + goto out; + } + + err = 0; + +out: + err = close_journal(&journal, err); + return err; +} diff --git a/tools/blktap2/vhd/vhd-util.c b/tools/blktap2/vhd/vhd-util.c new file mode 100644 index 0000000000..944a59e395 --- /dev/null +++ b/tools/blktap2/vhd/vhd-util.c @@ -0,0 +1,160 @@ +/* Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "libvhd.h" +#include "vhd-util.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf(stdout, _f , ##_a) +#else +#define DFPRINTF(_f, _a...) ((void)0) +#endif + +typedef int (*vhd_util_func_t) (int, char **); + +struct command { + char *name; + vhd_util_func_t func; +}; + +struct command commands[] = { + { .name = "create", .func = vhd_util_create }, + { .name = "snapshot", .func = vhd_util_snapshot }, + { .name = "query", .func = vhd_util_query }, + { .name = "read", .func = vhd_util_read }, + { .name = "set", .func = vhd_util_set_field }, + { .name = "repair", .func = vhd_util_repair }, + { .name = "resize", .func = vhd_util_resize }, + { .name = "fill", .func = vhd_util_fill }, + { .name = "coalesce", .func = vhd_util_coalesce }, + { .name = "modify", .func = vhd_util_modify }, + { .name = "scan", .func = vhd_util_scan }, + { .name = "check", .func = vhd_util_check }, + { .name = "revert", .func = vhd_util_revert }, +}; + +#define print_commands() \ + do { \ + int i, n; \ + n = sizeof(commands) / sizeof(struct command); \ + printf("COMMAND := { "); \ + printf("%s", commands[0].name); \ + for (i = 1; i < n; i++) \ + printf(" | %s", commands[i].name); \ + printf(" }\n"); \ + } while (0) + +TEST_FAIL_EXTERN_VARS; + +void +help(void) +{ + printf("usage: vhd-util COMMAND [OPTIONS]\n"); + print_commands(); + exit(0); +} + +struct command * 
+get_command(char *command) +{ + int i, n; + + if (strnlen(command, 25) >= 25) + return NULL; + + n = sizeof(commands) / sizeof (struct command); + + for (i = 0; i < n; i++) + if (!strcmp(command, commands[i].name)) + return &commands[i]; + + return NULL; +} + +int +main(int argc, char *argv[]) +{ + char **cargv; + struct command *cmd; + int cargc, i, cnt, ret; + +#ifdef CORE_DUMP + #include <sys/resource.h> + struct rlimit rlim; + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_CORE, &rlim) < 0) + fprintf(stderr, "setrlimit failed: %d\n", errno); +#endif + + ret = 0; + + if (argc < 2) + help(); + + cargc = argc - 1; + cmd = get_command(argv[1]); + if (!cmd) { + fprintf(stderr, "invalid COMMAND %s\n", argv[1]); + help(); + } + + cargv = malloc(sizeof(char *) * cargc); + if (!cargv) + exit(ENOMEM); + + cnt = 1; + cargv[0] = cmd->name; + for (i = 1; i < cargc; i++) { + char *arg = argv[i + (argc - cargc)]; + + if (!strcmp(arg, "--debug")) { + libvhd_set_log_level(1); + continue; + } + + cargv[cnt++] = arg; + } + +#ifdef ENABLE_FAILURE_TESTING + for (i = 0; i < NUM_FAIL_TESTS; i++) { + TEST_FAIL[i] = 0; + if (getenv(ENV_VAR_FAIL[i])) + TEST_FAIL[i] = 1; + } +#endif // ENABLE_FAILURE_TESTING + + ret = cmd->func(cnt, cargv); + + free(cargv); + + return (ret >= 0 ? ret : -ret); +} diff --git a/tools/check/check_uuid_devel b/tools/check/check_uuid_devel new file mode 100755 index 0000000000..0a90b15eea --- /dev/null +++ b/tools/check/check_uuid_devel @@ -0,0 +1,6 @@ +#!/bin/sh +# CHECK-BUILD + +. 
./funcs.sh + +has_header uuid/uuid.h || fail "missing uuid headers (package uuid-dev)" diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py index bd499a728a..88a0cbe259 100644 --- a/tools/python/xen/xend/XendDomainInfo.py +++ b/tools/python/xen/xend/XendDomainInfo.py @@ -27,6 +27,7 @@ Author: Mike Wray <mike.wray@hp.com> import logging import time import threading +import thread import re import copy import os @@ -535,6 +536,25 @@ class XendDomainInfo: @raise XendError: Failed pausing a domain """ try: + bepath="/local/domain/0/backend/" + if(self.domid): + + dev = xstransact.List(bepath + 'vbd' + "/%d" % (self.domid,)) + for x in dev: + path = self.getDeviceController('vbd').readBackend(x, 'params') + if path and path.startswith('/dev/xen/blktap-2'): + #Figure out the sysfs path. + pattern = re.compile('/dev/xen/blktap-2/tapdev(\d+)$') + ctrlid = pattern.search(path) + ctrl = '/sys/class/blktap2/blktap' + ctrlid.group(1) + #pause the disk + f = open(ctrl + '/pause', 'w') + f.write('pause'); + f.close() + except Exception, ex: + log.warn('Could not pause blktap disk.'); + + try: xc.domain_pause(self.domid) self._stateSet(DOM_STATE_PAUSED) except Exception, ex: @@ -547,6 +567,26 @@ class XendDomainInfo: @raise XendError: Failed unpausing a domain """ try: + bepath="/local/domain/0/backend/" + if(self.domid): + dev = xstransact.List(bepath + "vbd" + "/%d" % (self.domid,)) + for x in dev: + path = self.getDeviceController('vbd').readBackend(x, 'params') + if path and path.startswith('/dev/xen/blktap-2'): + #Figure out the sysfs path. 
+ pattern = re.compile('/dev/xen/blktap-2/tapdev(\d+)$') + ctrlid = pattern.search(path) + ctrl = '/sys/class/blktap2/blktap' + ctrlid.group(1) + #unpause the disk + if(os.path.exists(ctrl + '/resume')): + f = open(ctrl + '/resume', 'w'); + f.write('resume'); + f.close(); + + except Exception, ex: + log.warn('Could not unpause blktap disk: %s' % str(ex)); + + try: xc.domain_unpause(self.domid) self._stateSet(DOM_STATE_RUNNING) except Exception, ex: @@ -1171,6 +1211,15 @@ class XendDomainInfo: rc = None if self.domid is not None: + + #new blktap implementation may need a sysfs write after everything is torn down. + dev = self.getDeviceController(deviceClass).convertToDeviceNumber(devid) + path = self.getDeviceController(deviceClass).readBackend(dev, 'params') + if path and path.startswith('/dev/xen/blktap-2'): + frontpath = self.getDeviceController(deviceClass).frontendPath(dev) + backpath = xstransact.Read(frontpath, "backend") + thread.start_new_thread(self.getDeviceController(deviceClass).finishDeviceCleanup, (backpath, path)) + rc = self.getDeviceController(deviceClass).destroyDevice(devid, force) if not force and rm_cfg: # The backend path, other than the device itself, diff --git a/tools/python/xen/xend/server/BlktapController.py b/tools/python/xen/xend/server/BlktapController.py index 36c1d0688e..4c7f334968 100644 --- a/tools/python/xen/xend/server/BlktapController.py +++ b/tools/python/xen/xend/server/BlktapController.py @@ -1,5 +1,6 @@ # Copyright (c) 2005, XenSource Ltd. 
- +import string, re +import subprocess from xen.xend.server.blkif import BlkifController from xen.xend.XendLogging import log @@ -7,6 +8,11 @@ from xen.xend.XendLogging import log phantomDev = 0; phantomId = 0; +TAPDISK_SYSFS = '/sys/class/blktap2' +TAPDISK_BINARY = '/usr/sbin/tapdisk2' +TAPDISK_DEVICE = '/dev/xen/blktap-2/tapdev' +TAPDISK_CONTROL = TAPDISK_SYSFS + '/blktap' + blktap_disk_types = [ 'aio', 'sync', @@ -14,10 +20,33 @@ blktap_disk_types = [ 'ram', 'qcow', 'qcow2', - + 'vhd', 'ioemu', 'tapdisk', ] + +def doexec(args, inputtext=None): + """Execute a subprocess, then return its return code, stdout and stderr""" + proc = subprocess.Popen(args,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE,close_fds=True) + (stdout,stderr) = proc.communicate(inputtext) + rc = proc.returncode + return (rc,stdout,stderr) + +def parseDeviceString(device): + if device.find('/dev') == -1: + raise Exception, 'invalid tap device: ' + device + + pattern = re.compile(TAPDISK_DEVICE + '(\d+)$') + groups = pattern.search(device) + if not groups: + raise Exception, 'malformed tap device: ' + device + + minor = groups.group(1) + control = TAPDISK_CONTROL + minor + + return minor, device, control + + class BlktapController(BlkifController): def __init__(self, vm): @@ -86,3 +115,24 @@ class BlktapController(BlkifController): return (devid, back, front) + def createDevice(self, config): + + uname = config.get('uname', '') + (typ, subtyp, params, file) = string.split(uname, ':', 3) + if typ in ('tap'): + if subtyp in ('tapdisk'): + if params in ('ioemu', 'qcow2', 'vmdk', 'sync'): + log.warn('WARNING: using deprecated blktap module'); + return BlkifController.createDevice(self, config); + + cmd = [ TAPDISK_BINARY, '-n', '%s:%s' % (params, file) ] + (rc,stdout,stderr) = doexec(cmd) + + minor, device, control = parseDeviceString(stdout) + + #modify the configuration to attach as a vbd, now that the + #device is configured. 
Then continue to create the device + config.update({'uname' : 'phy:' + device.rstrip()}) + self.deviceClass='vbd' + + return BlkifController.createDevice(self, config); diff --git a/tools/python/xen/xend/server/DevController.py b/tools/python/xen/xend/server/DevController.py index 6c2bb09ca6..ed46dd4803 100644 --- a/tools/python/xen/xend/server/DevController.py +++ b/tools/python/xen/xend/server/DevController.py @@ -27,8 +27,8 @@ from xen.xend.server.DevConstants import * from xen.xend.xenstore.xstransact import xstransact, complete from xen.xend.xenstore.xswatch import xswatch - -import os +import xen.xend.server.DevConstants +import os, re xoptions = XendOptions.instance() @@ -238,6 +238,34 @@ class DevController: # xstransact.Remove(self.devicePath()) ?? Below is the same ? self.vm._removeVm("device/%s/%d" % (self.deviceClass, dev)) + # The new blocktap implementation requires a sysfs signal to close + # out disks. This function is called from a thread when the + # domain is detached from the disk. + def finishDeviceCleanup(self, backpath, path): + """Perform any device specific cleanup + + @backpath backend xenstore path. + @path frontend device path + + """ + + if path and path.startswith('/dev/xen/blktap-2'): + + #Figure out what we're going to wait on. + self.waitForBackend_destroy(backpath) + + #Figure out the sysfs path. + pattern = re.compile('/dev/xen/blktap-2/tapdev(\d+)$') + ctrlid = pattern.search(path) + ctrl = '/sys/class/blktap2/blktap' + ctrlid.group(1) + + #Close out the disk + f = open(ctrl + '/remove', 'w') + f.write('remove'); + f.close() + + return + def configurations(self, transaction = None): return map(lambda x: self.configuration(x, transaction), self.deviceIDs(transaction)) |