diff options
author | jchesterfield@dhcp92.uk.xensource.com <jchesterfield@dhcp92.uk.xensource.com> | 2006-07-13 10:13:26 +0100 |
---|---|---|
committer | jchesterfield@dhcp92.uk.xensource.com <jchesterfield@dhcp92.uk.xensource.com> | 2006-07-13 10:13:26 +0100 |
commit | 0da81aa1d4a70baefa42b4e5ff1bbf670abc2711 (patch) | |
tree | bb0c9f29e962352c1e9949f5e10699847fb2d652 /tools | |
parent | 0929bd9fc08ffc28978dad3208422948adb46811 (diff) | |
download | xen-0da81aa1d4a70baefa42b4e5ff1bbf670abc2711.tar.gz xen-0da81aa1d4a70baefa42b4e5ff1bbf670abc2711.tar.bz2 xen-0da81aa1d4a70baefa42b4e5ff1bbf670abc2711.zip |
Added blktap support. Includes kernel driver (enabled as CONFIG_XEN_BLKDEV_TAP=y) and userspace tools. The userspace daemon (blktapctrl) is enabled by default when xend is activated. For further information on using and configuring blktap see tools/blktap/README.
Diffstat (limited to 'tools')
120 files changed, 14531 insertions, 4 deletions
diff --git a/tools/Makefile b/tools/Makefile index ac41f2f321..2a42254e32 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -16,6 +16,8 @@ SUBDIRS-y += guest-headers SUBDIRS-$(VTPM_TOOLS) += vtpm_manager SUBDIRS-$(VTPM_TOOLS) += vtpm SUBDIRS-y += xenstat +SUBDIRS-y += libaio +SUBDIRS-y += blktap # These don't cross-compile ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH)) diff --git a/tools/blktap/Makefile b/tools/blktap/Makefile new file mode 100644 index 0000000000..fb194f3203 --- /dev/null +++ b/tools/blktap/Makefile @@ -0,0 +1,28 @@ +XEN_ROOT = ../.. +include $(XEN_ROOT)/tools/Rules.mk + +SUBDIRS-y := +SUBDIRS-y += lib +SUBDIRS-y += drivers + +.PHONY: all +all: build + +.PHONY: build +build: mk-symlinks + @set -e; for subdir in $(SUBDIRS-y); do \ + $(MAKE) -C $$subdir all; \ + done + +.PHONY: install +install: + @set -e; for subdir in $(SUBDIRS-y); do \ + $(MAKE) -C $$subdir install; \ + done + +.PHONY: clean +clean: + rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS + @set -e; for subdir in $(SUBDIRS-y); do \ + $(MAKE) -C $$subdir clean; \ + done diff --git a/tools/blktap/README b/tools/blktap/README new file mode 100644 index 0000000000..5e4108030e --- /dev/null +++ b/tools/blktap/README @@ -0,0 +1,122 @@ +Blktap Userspace Tools + Library +================================ + +Andrew Warfield and Julian Chesterfield +16th June 2006 + +{firstname.lastname}@cl.cam.ac.uk + +The blktap userspace toolkit provides a user-level disk I/O +interface. The blktap mechanism involves a kernel driver that acts +similarly to the existing Xen/Linux blkback driver, and a set of +associated user-level libraries. Using these tools, blktap allows +virtual block devices presented to VMs to be implemented in userspace +and to be backed by raw partitions, files, network, etc. + +The key benefit of blktap is that it makes it easy and fast to write +arbitrary block backends, and that these user-level backends actually +perform very well. 
Specifically: + +- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse + formats and other compression features can be easily implemented. + +- Accessing file-based images from userspace avoids problems related + to flushing dirty pages which are present in the Linux loopback + driver. (Specifically, doing a large number of writes to an + NFS-backed image doesn't result in the OOM killer going berserk.) + +- Per-disk handler processes enable easier userspace policing of block + resources, and process-granularity QoS techniques (disk scheduling + and related tools) may be trivially applied to block devices. + +- It's very easy to take advantage of userspace facilities such as + networking libraries, compression utilities, peer-to-peer + file-sharing systems and so on to build more complex block backends. + +- Crashes are contained -- incremental development/debugging is very + fast. + +How it works (in one paragraph): + +Working in conjunction with the kernel blktap driver, all disk I/O +requests from VMs are passed to the userspace daemon (using a shared +memory interface) through a character device. Each active disk is +mapped to an individual device node, allowing per-disk processes to +implement individual block devices where desired. The userspace +drivers are implemented using asynchronous (Linux libaio), +O_DIRECT-based calls to preserve the unbuffered, batched and +asynchronous request dispatch achieved with the existing blkback +code. We provide a simple, asynchronous virtual disk interface that +makes it quite easy to add new disk implementations. + +As of June 2006 the currently supported disk formats are: + + - Raw Images (both on partitions and in image files) + - File-backed Qcow disks + - Standalone sparse Qcow disks + - Fast shareable RAM disk between VMs (requires some form of cluster-based + filesystem support e.g. 
OCFS2 in the guest kernel) + - Some VMDK images - your mileage may vary + +Raw and QCow images have asynchronous backends and so should perform +fairly well. VMDK is based directly on the qemu vmdk driver, which is +synchronous (a.k.a. slow). + +Build and Installation Instructions +=================================== + +Make sure to configure the blktap backend driver in your dom0 kernel. It +will cooperate fine with the existing backend driver, so you can +experiment with tap disks without breaking existing VM configs. + +To build the tools separately, run "make && make install" in +tools/blktap. + + +Using the Tools +=============== + +Prepare the image for booting. For qcow files use the qcow utilities +installed earlier. e.g. qcow-create generates a blank standalone image +or a file-backed CoW image. img2qcow takes an existing image or +partition and creates a sparse, standalone qcow-based file. + +The userspace disk agent is configured to start automatically via xend +(alternatively you can start it manually => 'blktapctrl') + +Customise the VM config file to use the 'tap' handler, followed by the +driver type. e.g. for a raw image such as a file or partition: + +disk = ['tap:aio:<FILENAME>,sda1,w'] + +e.g. for a qcow image: + +disk = ['tap:qcow:<FILENAME>,sda1,w'] + + +Mounting images in Dom0 using the blktap driver +=============================================== +Tap (and blkback) disks are also mountable in Dom0 without requiring an +active VM to attach. You will need to build a xenlinux Dom0 kernel that +includes the blkfront driver (e.g. the default 'make world' or +'make kernels' build). Simply use the xm command-line tool to activate +the backend disks, and blkfront will generate a virtual block device that +can be accessed in the same way as a loop device or partition: + +e.g. 
for a raw image file <FILENAME> that would normally be mounted using +the loopback driver (such as 'mount -o loop <FILENAME> /mnt/disk'), do the +following: + +xm block-attach 0 tap:aio:<FILENAME> /dev/xvda1 w 0 +mount /dev/xvda1 /mnt/disk <--- don't use loop driver + +In this way, you can use any of the userspace device-type drivers built +with the blktap userspace toolkit to open and mount disks such as qcow +or vmdk images: + +xm block-attach 0 tap:qcow:<FILENAME> /dev/xvda1 w 0 +mount /dev/xvda1 /mnt/disk + + + + diff --git a/tools/blktap/drivers/Makefile b/tools/blktap/drivers/Makefile new file mode 100644 index 0000000000..6601a4d005 --- /dev/null +++ b/tools/blktap/drivers/Makefile @@ -0,0 +1,76 @@ +XEN_ROOT = ../../.. +include $(XEN_ROOT)/tools/Rules.mk + +INCLUDES += -I.. -I../lib + +INSTALL = install +INSTALL_PROG = $(INSTALL) -m0755 +IBIN = blktapctrl tapdisk +QCOW_UTIL = img2qcow qcow2raw qcow-create +INSTALL_DIR = /usr/sbin +LIBAIO_DIR = ../../libaio/src + +CFLAGS += -fPIC +CFLAGS += -Wall +CFLAGS += -Werror +CFLAGS += -Wno-unused +CFLAGS += -g3 +CFLAGS += -fno-strict-aliasing +CFLAGS += -I $(XEN_LIBXC) -I $(LIBAIO_DIR) +CFLAGS += $(INCLUDES) -I. -I../../xenstore +CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE +CFLAGS += -D_GNU_SOURCE + +# Get gcc to generate the dependencies for us. +CFLAGS += -Wp,-MD,.$(@F).d +DEPS = .*.d + +THREADLIB := -lpthread -lz +LIBS := -L. -L.. 
-L../lib +LIBS += -L$(XEN_LIBXC) +LIBS += -lblktap +LIBS += -lcrypto +LIBS += -lz +LIBS += -L$(XEN_XENSTORE) -lxenstore + +AIOLIBS := -L $(LIBAIO_DIR) +AIOLIBS += -laio +AIOLIBS += -static + +BLK-OBJS := block-aio.o +BLK-OBJS += block-sync.o +BLK-OBJS += block-vmdk.o +BLK-OBJS += block-ram.o +BLK-OBJS += block-qcow.o +BLK-OBJS += aes.o + +all: $(IBIN) qcow-util + +LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse) + + +blktapctrl: + $(CC) $(CFLAGS) -o blktapctrl $(LIBS) blktapctrl.c + +tapdisk: $(BLK-OBJS) + $(CC) $(CFLAGS) -o tapdisk $(BLK-OBJS) tapdisk.c \ + $(AIOLIBS) $(LIBS) + + +qcow-util: $(BLK-OBJS) + $(CC) $(CFLAGS) -o img2qcow $(BLK-OBJS) img2qcow.c \ + $(AIOLIBS) $(LIBS) + $(CC) $(CFLAGS) -o qcow2raw $(BLK-OBJS) qcow2raw.c \ + $(AIOLIBS) $(LIBS) + $(CC) $(CFLAGS) -o qcow-create $(BLK-OBJS) qcow-create.c \ + $(AIOLIBS) $(LIBS) + +install: all + $(INSTALL_PROG) $(IBIN) $(QCOW_UTIL) $(DESTDIR)$(INSTALL_DIR) + +clean: + rm -rf *.o *~ $(DEPS) xen TAGS $(IBIN) $(LIB) $(QCOW_UTIL) + +.PHONY: clean install + +-include $(DEPS) diff --git a/tools/blktap/drivers/aes.c b/tools/blktap/drivers/aes.c new file mode 100644 index 0000000000..4d83fac957 --- /dev/null +++ b/tools/blktap/drivers/aes.c @@ -0,0 +1,1319 @@ +/** + * + * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project. + */ +/* + * rijndael-alg-fst.c + * + * @version 3.0 (December 2000) + * + * Optimised ANSI C code for the Rijndael cipher (now AES) + * + * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be> + * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be> + * @author Paulo Barreto <paulo.barreto@terra.com.br> + * + * This code is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +//#include "vl.h" +#include <inttypes.h> +#include <string.h> +#include "aes.h" + +//#define NDEBUG +#include <assert.h> + +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; + +#define MAXKC (256/32) +#define MAXKB (256/8) +#define MAXNR 14 + +/* This controls loop-unrolling in aes_core.c */ +#undef FULL_UNROLL +# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3])) +# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); } + +/* +Te0[x] = S [x].[02, 01, 01, 03]; +Te1[x] = S [x].[03, 02, 01, 01]; +Te2[x] = S [x].[01, 03, 02, 01]; +Te3[x] = S [x].[01, 01, 03, 02]; +Te4[x] = S [x].[01, 01, 01, 01]; + +Td0[x] = Si[x].[0e, 09, 0d, 0b]; +Td1[x] = Si[x].[0b, 0e, 09, 0d]; +Td2[x] = Si[x].[0d, 0b, 0e, 09]; +Td3[x] = Si[x].[09, 0d, 0b, 0e]; +Td4[x] = Si[x].[01, 01, 01, 01]; +*/ + +static const u32 Te0[256] = { + 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU, + 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U, + 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU, + 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU, + 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U, + 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU, + 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU, + 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU, + 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU, + 0x6c36365aU, 
0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU, + 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U, + 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU, + 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU, + 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U, + 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU, + 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU, + 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU, + 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU, + 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU, + 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U, + 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU, + 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU, + 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU, + 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU, + 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U, + 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U, + 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U, + 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U, + 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU, + 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U, + 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U, + 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU, + 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU, + 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U, + 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U, + 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U, + 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU, + 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U, + 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU, + 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U, + 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU, + 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U, + 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U, + 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU, + 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U, + 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U, + 0xd86c6cb4U, 
0xac5656faU, 0xf3f4f407U, 0xcfeaea25U, + 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U, + 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U, + 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U, + 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U, + 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U, + 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU, + 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U, + 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U, + 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U, + 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U, + 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U, + 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U, + 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU, + 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U, + 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U, + 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U, + 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU, +}; +static const u32 Te1[256] = { + 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, + 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, + 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU, + 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U, + 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU, + 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U, + 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU, + 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U, + 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U, + 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU, + 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U, + 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U, + 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U, + 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU, + 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U, + 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U, + 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU, + 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U, + 0xf6a45252U, 0x4d763b3bU, 
0x61b7d6d6U, 0xce7db3b3U, + 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U, + 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU, + 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU, + 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U, + 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU, + 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU, + 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U, + 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU, + 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U, + 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU, + 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U, + 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U, + 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U, + 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU, + 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U, + 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU, + 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U, + 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU, + 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U, + 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U, + 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU, + 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU, + 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU, + 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U, + 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U, + 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU, + 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U, + 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU, + 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U, + 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU, + 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U, + 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU, + 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU, + 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U, + 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU, + 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U, + 0x91178686U, 0x5899c1c1U, 
0x273a1d1dU, 0xb9279e9eU, + 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U, + 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U, + 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U, + 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU, + 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU, + 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U, + 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU, + 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U, +}; +static const u32 Te2[256] = { + 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, + 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, + 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU, + 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U, + 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU, + 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U, + 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU, + 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U, + 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U, + 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU, + 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U, + 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U, + 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U, + 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU, + 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U, + 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U, + 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU, + 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U, + 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U, + 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U, + 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU, + 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU, + 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U, + 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU, + 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU, + 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U, + 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU, + 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 
0xa8e34ba8U, + 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU, + 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U, + 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U, + 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U, + 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU, + 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U, + 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU, + 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U, + 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU, + 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U, + 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U, + 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU, + 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU, + 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU, + 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U, + 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U, + 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU, + 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U, + 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU, + 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U, + 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU, + 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U, + 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU, + 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU, + 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U, + 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU, + 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U, + 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU, + 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U, + 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U, + 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U, + 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU, + 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU, + 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U, + 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU, + 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U, +}; +static const u32 Te3[256] = { + + 
0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, + 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U, + 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U, + 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU, + 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU, + 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU, + 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U, + 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU, + 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU, + 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U, + 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U, + 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU, + 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU, + 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU, + 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU, + 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU, + 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U, + 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU, + 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU, + 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U, + 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U, + 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U, + 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U, + 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U, + 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU, + 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U, + 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU, + 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU, + 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U, + 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U, + 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U, + 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU, + 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U, + 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU, + 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU, + 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U, + 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U, + 
0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU, + 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U, + 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU, + 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U, + 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U, + 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U, + 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U, + 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU, + 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U, + 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU, + 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U, + 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU, + 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U, + 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU, + 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU, + 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU, + 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU, + 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U, + 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U, + 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U, + 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U, + 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U, + 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U, + 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU, + 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U, + 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU, + 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU, +}; +static const u32 Te4[256] = { + 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU, + 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U, + 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU, + 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U, + 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU, + 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U, + 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU, + 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U, + 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U, + 0x36363636U, 
0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU, + 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U, + 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U, + 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U, + 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU, + 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U, + 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U, + 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU, + 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U, + 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U, + 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U, + 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU, + 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU, + 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U, + 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU, + 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU, + 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U, + 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU, + 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U, + 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU, + 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U, + 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U, + 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U, + 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU, + 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U, + 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU, + 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U, + 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU, + 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U, + 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U, + 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU, + 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU, + 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU, + 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U, + 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U, + 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU, + 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U, + 0x6c6c6c6cU, 
0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU, + 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U, + 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU, + 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U, + 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU, + 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU, + 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U, + 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU, + 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U, + 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU, + 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U, + 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U, + 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U, + 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU, + 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU, + 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U, + 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU, + 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U, +}; +static const u32 Td0[256] = { + 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U, + 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U, + 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U, + 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU, + 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U, + 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U, + 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU, + 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U, + 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU, + 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U, + 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U, + 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U, + 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U, + 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU, + 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U, + 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU, + 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U, + 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU, + 0x342e539dU, 0xa2f355a0U, 
0x058ae132U, 0xa4f6eb75U, + 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U, + 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U, + 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU, + 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U, + 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU, + 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U, + 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU, + 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U, + 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU, + 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU, + 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U, + 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU, + 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U, + 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU, + 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U, + 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U, + 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U, + 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU, + 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U, + 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U, + 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU, + 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U, + 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U, + 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U, + 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U, + 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U, + 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU, + 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U, + 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U, + 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U, + 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U, + 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U, + 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU, + 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU, + 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU, + 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU, + 0xb3671d5aU, 0x92dbd252U, 
0xe9105633U, 0x6dd64713U, + 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U, + 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU, + 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU, + 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U, + 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU, + 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U, + 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U, + 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U, +}; +static const u32 Td1[256] = { + 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU, + 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U, + 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU, + 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U, + 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U, + 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U, + 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U, + 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U, + 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U, + 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU, + 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU, + 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU, + 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U, + 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU, + 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U, + 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U, + 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U, + 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU, + 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU, + 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U, + 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU, + 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U, + 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU, + 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU, + 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U, + 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U, + 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U, + 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 
0x3a24362eU, + 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U, + 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU, + 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U, + 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U, + 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U, + 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU, + 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U, + 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U, + 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U, + 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U, + 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U, + 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U, + 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU, + 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU, + 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U, + 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU, + 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U, + 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU, + 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU, + 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U, + 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU, + 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U, + 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U, + 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U, + 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U, + 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U, + 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U, + 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U, + 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU, + 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U, + 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U, + 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU, + 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U, + 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U, + 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U, + 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U, +}; +static const u32 Td2[256] = { + 
0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U, + 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U, + 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U, + 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U, + 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU, + 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U, + 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U, + 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U, + 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U, + 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU, + 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U, + 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U, + 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU, + 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U, + 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U, + 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U, + 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U, + 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U, + 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U, + 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU, + + 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U, + 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U, + 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U, + 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U, + 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U, + 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU, + 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU, + 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U, + 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU, + 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U, + 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU, + 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU, + 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU, + 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU, + 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U, + 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U, + 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U, + 
0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U, + 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U, + 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U, + 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U, + 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU, + 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU, + 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U, + 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U, + 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU, + 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU, + 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U, + 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U, + 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U, + 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U, + 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U, + 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U, + 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U, + 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU, + 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U, + 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U, + 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U, + 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U, + 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U, + 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U, + 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU, + 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U, + 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U, +}; +static const u32 Td3[256] = { + 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU, + 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU, + 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U, + 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U, + 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU, + 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU, + 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U, + 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU, + 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U, + 0xe14fb6beU, 
0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU, + 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U, + 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U, + 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U, + 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U, + 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U, + 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU, + 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU, + 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U, + 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U, + 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU, + 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU, + 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U, + 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U, + 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U, + 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U, + 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU, + 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U, + 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U, + 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU, + 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU, + 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U, + 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U, + 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U, + 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU, + 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U, + 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U, + 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U, + 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U, + 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U, + 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U, + 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U, + 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU, + 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U, + 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U, + 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU, + 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU, + 0x267809cdU, 
0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U, + 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU, + 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U, + 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U, + 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U, + 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U, + 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U, + 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U, + 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU, + 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU, + 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU, + 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU, + 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U, + 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U, + 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U, + 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU, + 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U, + 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U, +}; +static const u32 Td4[256] = { + 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U, + 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U, + 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU, + 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU, + 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U, + 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U, + 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U, + 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU, + 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U, + 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU, + 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU, + 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU, + 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U, + 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U, + 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U, + 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U, + 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U, + 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U, + 0xd4d4d4d4U, 0xa4a4a4a4U, 
0x5c5c5c5cU, 0xccccccccU, + 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U, + 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U, + 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU, + 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U, + 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U, + 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U, + 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU, + 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U, + 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U, + 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU, + 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U, + 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U, + 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU, + 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U, + 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU, + 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU, + 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U, + 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U, + 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U, + 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U, + 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU, + 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U, + 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U, + 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU, + 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU, + 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU, + 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U, + 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU, + 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U, + 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U, + 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U, + 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U, + 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU, + 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U, + 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU, + 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU, + 0x93939393U, 0xc9c9c9c9U, 
0x9c9c9c9cU, 0xefefefefU, + 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU, + 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U, + 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU, + 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U, + 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU, + 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U, + 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U, + 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU, +}; +static const u32 rcon[] = { + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ +}; + +/** + * Expand the cipher key into the encryption key schedule. + */ +int AES_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key) { + + u32 *rk; + int i = 0; + u32 temp; + + if (!userKey || !key) + return -1; + if (bits != 128 && bits != 192 && bits != 256) + return -2; + + rk = key->rd_key; + + if (bits==128) + key->rounds = 10; + else if (bits==192) + key->rounds = 12; + else + key->rounds = 14; + + rk[0] = GETU32(userKey ); + rk[1] = GETU32(userKey + 4); + rk[2] = GETU32(userKey + 8); + rk[3] = GETU32(userKey + 12); + if (bits == 128) { + while (1) { + temp = rk[3]; + rk[4] = rk[0] ^ + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[(temp ) & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24) ] & 0x000000ff) ^ + rcon[i]; + rk[5] = rk[1] ^ rk[4]; + rk[6] = rk[2] ^ rk[5]; + rk[7] = rk[3] ^ rk[6]; + if (++i == 10) { + return 0; + } + rk += 4; + } + } + rk[4] = GETU32(userKey + 16); + rk[5] = GETU32(userKey + 20); + if (bits == 192) { + while (1) { + temp = rk[ 5]; + rk[ 6] = rk[ 0] ^ + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[(temp ) & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24) ] & 0x000000ff) ^ + rcon[i]; + rk[ 7] = rk[ 1] ^ rk[ 6]; + rk[ 8] = rk[ 2] ^ rk[ 7]; + rk[ 9] = 
rk[ 3] ^ rk[ 8]; + if (++i == 8) { + return 0; + } + rk[10] = rk[ 4] ^ rk[ 9]; + rk[11] = rk[ 5] ^ rk[10]; + rk += 6; + } + } + rk[6] = GETU32(userKey + 24); + rk[7] = GETU32(userKey + 28); + if (bits == 256) { + while (1) { + temp = rk[ 7]; + rk[ 8] = rk[ 0] ^ + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[(temp ) & 0xff] & 0x0000ff00) ^ + (Te4[(temp >> 24) ] & 0x000000ff) ^ + rcon[i]; + rk[ 9] = rk[ 1] ^ rk[ 8]; + rk[10] = rk[ 2] ^ rk[ 9]; + rk[11] = rk[ 3] ^ rk[10]; + if (++i == 7) { + return 0; + } + temp = rk[11]; + rk[12] = rk[ 4] ^ + (Te4[(temp >> 24) ] & 0xff000000) ^ + (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(temp ) & 0xff] & 0x000000ff); + rk[13] = rk[ 5] ^ rk[12]; + rk[14] = rk[ 6] ^ rk[13]; + rk[15] = rk[ 7] ^ rk[14]; + + rk += 8; + } + } + return 0; +} + +/** + * Expand the cipher key into the decryption key schedule. + */ +int AES_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key) { + + u32 *rk; + int i, j, status; + u32 temp; + + /* first, start with an encryption schedule */ + status = AES_set_encrypt_key(userKey, bits, key); + if (status < 0) + return status; + + rk = key->rd_key; + + /* invert the order of the round keys: */ + for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { + temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; + temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; + temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; + temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; + } + /* apply the inverse MixColumn transform to all round keys but the first and the last: */ + for (i = 1; i < (key->rounds); i++) { + rk += 4; + rk[0] = + Td0[Te4[(rk[0] >> 24) ] & 0xff] ^ + Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[(rk[0] ) & 0xff] & 0xff]; + rk[1] = + Td0[Te4[(rk[1] >> 24) ] & 0xff] ^ + Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[1] >> 
8) & 0xff] & 0xff] ^ + Td3[Te4[(rk[1] ) & 0xff] & 0xff]; + rk[2] = + Td0[Te4[(rk[2] >> 24) ] & 0xff] ^ + Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[(rk[2] ) & 0xff] & 0xff]; + rk[3] = + Td0[Te4[(rk[3] >> 24) ] & 0xff] ^ + Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^ + Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^ + Td3[Te4[(rk[3] ) & 0xff] & 0xff]; + } + return 0; +} + +#ifndef AES_ASM +/* + * Encrypt a single block + * in and out can overlap + */ +void AES_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key) { + + const u32 *rk; + u32 s0, s1, s2, s3, t0, t1, t2, t3; +#ifndef FULL_UNROLL + int r; +#endif /* ?FULL_UNROLL */ + + assert(in && out && key); + rk = key->rd_key; + + /* + * map byte array block to cipher state + * and add initial round key: + */ + s0 = GETU32(in ) ^ rk[0]; + s1 = GETU32(in + 4) ^ rk[1]; + s2 = GETU32(in + 8) ^ rk[2]; + s3 = GETU32(in + 12) ^ rk[3]; +#ifdef FULL_UNROLL + /* round 1: */ + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7]; + /* round 2: */ + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11]; + /* round 3: */ + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13]; + t2 = Te0[s2 
>> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15]; + /* round 4: */ + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19]; + /* round 5: */ + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23]; + /* round 6: */ + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27]; + /* round 7: */ + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31]; + /* round 8: */ + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 
0xff] ^ rk[33]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35]; + /* round 9: */ + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39]; + if (key->rounds > 10) { + /* round 10: */ + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43]; + /* round 11: */ + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44]; + t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47]; + if (key->rounds > 12) { + /* round 12: */ + s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48]; + s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49]; + s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50]; + s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51]; + /* round 13: */ + t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52]; 
+ t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53]; + t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54]; + t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55]; + } + } + rk += key->rounds << 2; +#else /* !FULL_UNROLL */ + /* + * Nr - 1 full rounds: + */ + r = key->rounds >> 1; + for (;;) { + t0 = + Te0[(s0 >> 24) ] ^ + Te1[(s1 >> 16) & 0xff] ^ + Te2[(s2 >> 8) & 0xff] ^ + Te3[(s3 ) & 0xff] ^ + rk[4]; + t1 = + Te0[(s1 >> 24) ] ^ + Te1[(s2 >> 16) & 0xff] ^ + Te2[(s3 >> 8) & 0xff] ^ + Te3[(s0 ) & 0xff] ^ + rk[5]; + t2 = + Te0[(s2 >> 24) ] ^ + Te1[(s3 >> 16) & 0xff] ^ + Te2[(s0 >> 8) & 0xff] ^ + Te3[(s1 ) & 0xff] ^ + rk[6]; + t3 = + Te0[(s3 >> 24) ] ^ + Te1[(s0 >> 16) & 0xff] ^ + Te2[(s1 >> 8) & 0xff] ^ + Te3[(s2 ) & 0xff] ^ + rk[7]; + + rk += 8; + if (--r == 0) { + break; + } + + s0 = + Te0[(t0 >> 24) ] ^ + Te1[(t1 >> 16) & 0xff] ^ + Te2[(t2 >> 8) & 0xff] ^ + Te3[(t3 ) & 0xff] ^ + rk[0]; + s1 = + Te0[(t1 >> 24) ] ^ + Te1[(t2 >> 16) & 0xff] ^ + Te2[(t3 >> 8) & 0xff] ^ + Te3[(t0 ) & 0xff] ^ + rk[1]; + s2 = + Te0[(t2 >> 24) ] ^ + Te1[(t3 >> 16) & 0xff] ^ + Te2[(t0 >> 8) & 0xff] ^ + Te3[(t1 ) & 0xff] ^ + rk[2]; + s3 = + Te0[(t3 >> 24) ] ^ + Te1[(t0 >> 16) & 0xff] ^ + Te2[(t1 >> 8) & 0xff] ^ + Te3[(t2 ) & 0xff] ^ + rk[3]; + } +#endif /* ?FULL_UNROLL */ + /* + * apply last round and + * map cipher state to byte array block: + */ + s0 = + (Te4[(t0 >> 24) ] & 0xff000000) ^ + (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(t3 ) & 0xff] & 0x000000ff) ^ + rk[0]; + PUTU32(out , s0); + s1 = + (Te4[(t1 >> 24) ] & 0xff000000) ^ + (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(t0 ) & 0xff] & 0x000000ff) ^ + rk[1]; + PUTU32(out + 4, s1); + s2 = + (Te4[(t2 >> 24) ] & 0xff000000) ^ + (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(t1 ) & 
0xff] & 0x000000ff) ^ + rk[2]; + PUTU32(out + 8, s2); + s3 = + (Te4[(t3 >> 24) ] & 0xff000000) ^ + (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[(t2 ) & 0xff] & 0x000000ff) ^ + rk[3]; + PUTU32(out + 12, s3); +} + +/* + * Decrypt a single block + * in and out can overlap + */ +void AES_decrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key) { + + const u32 *rk; + u32 s0, s1, s2, s3, t0, t1, t2, t3; +#ifndef FULL_UNROLL + int r; +#endif /* ?FULL_UNROLL */ + + assert(in && out && key); + rk = key->rd_key; + + /* + * map byte array block to cipher state + * and add initial round key: + */ + s0 = GETU32(in ) ^ rk[0]; + s1 = GETU32(in + 4) ^ rk[1]; + s2 = GETU32(in + 8) ^ rk[2]; + s3 = GETU32(in + 12) ^ rk[3]; +#ifdef FULL_UNROLL + /* round 1: */ + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7]; + /* round 2: */ + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11]; + /* round 3: */ + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] 
^ rk[15]; + /* round 4: */ + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19]; + /* round 5: */ + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23]; + /* round 6: */ + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27]; + /* round 7: */ + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31]; + /* round 8: */ + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) 
& 0xff] ^ Td3[t0 & 0xff] ^ rk[35]; + /* round 9: */ + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39]; + if (key->rounds > 10) { + /* round 10: */ + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43]; + /* round 11: */ + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47]; + if (key->rounds > 12) { + /* round 12: */ + s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48]; + s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49]; + s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50]; + s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51]; + /* round 13: */ + t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52]; + t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53]; + t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ 
Td3[s3 & 0xff] ^ rk[54]; + t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55]; + } + } + rk += key->rounds << 2; +#else /* !FULL_UNROLL */ + /* + * Nr - 1 full rounds: + */ + r = key->rounds >> 1; + for (;;) { + t0 = + Td0[(s0 >> 24) ] ^ + Td1[(s3 >> 16) & 0xff] ^ + Td2[(s2 >> 8) & 0xff] ^ + Td3[(s1 ) & 0xff] ^ + rk[4]; + t1 = + Td0[(s1 >> 24) ] ^ + Td1[(s0 >> 16) & 0xff] ^ + Td2[(s3 >> 8) & 0xff] ^ + Td3[(s2 ) & 0xff] ^ + rk[5]; + t2 = + Td0[(s2 >> 24) ] ^ + Td1[(s1 >> 16) & 0xff] ^ + Td2[(s0 >> 8) & 0xff] ^ + Td3[(s3 ) & 0xff] ^ + rk[6]; + t3 = + Td0[(s3 >> 24) ] ^ + Td1[(s2 >> 16) & 0xff] ^ + Td2[(s1 >> 8) & 0xff] ^ + Td3[(s0 ) & 0xff] ^ + rk[7]; + + rk += 8; + if (--r == 0) { + break; + } + + s0 = + Td0[(t0 >> 24) ] ^ + Td1[(t3 >> 16) & 0xff] ^ + Td2[(t2 >> 8) & 0xff] ^ + Td3[(t1 ) & 0xff] ^ + rk[0]; + s1 = + Td0[(t1 >> 24) ] ^ + Td1[(t0 >> 16) & 0xff] ^ + Td2[(t3 >> 8) & 0xff] ^ + Td3[(t2 ) & 0xff] ^ + rk[1]; + s2 = + Td0[(t2 >> 24) ] ^ + Td1[(t1 >> 16) & 0xff] ^ + Td2[(t0 >> 8) & 0xff] ^ + Td3[(t3 ) & 0xff] ^ + rk[2]; + s3 = + Td0[(t3 >> 24) ] ^ + Td1[(t2 >> 16) & 0xff] ^ + Td2[(t1 >> 8) & 0xff] ^ + Td3[(t0 ) & 0xff] ^ + rk[3]; + } +#endif /* ?FULL_UNROLL */ + /* + * apply last round and + * map cipher state to byte array block: + */ + s0 = + (Td4[(t0 >> 24) ] & 0xff000000) ^ + (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[(t1 ) & 0xff] & 0x000000ff) ^ + rk[0]; + PUTU32(out , s0); + s1 = + (Td4[(t1 >> 24) ] & 0xff000000) ^ + (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[(t2 ) & 0xff] & 0x000000ff) ^ + rk[1]; + PUTU32(out + 4, s1); + s2 = + (Td4[(t2 >> 24) ] & 0xff000000) ^ + (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ + (Td4[(t3 ) & 0xff] & 0x000000ff) ^ + rk[2]; + PUTU32(out + 8, s2); + s3 = + (Td4[(t3 >> 24) ] & 0xff000000) ^ + (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ + (Td4[(t1 >> 8) & 0xff] & 
0x0000ff00) ^ + (Td4[(t0 ) & 0xff] & 0x000000ff) ^ + rk[3]; + PUTU32(out + 12, s3); +} + +#endif /* AES_ASM */ + +void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, + const unsigned long length, const AES_KEY *key, + unsigned char *ivec, const int enc) +{ + + unsigned long n; + unsigned long len = length; + unsigned char tmp[AES_BLOCK_SIZE]; + + assert(in && out && key && ivec); + + if (enc) { + while (len >= AES_BLOCK_SIZE) { + for(n=0; n < AES_BLOCK_SIZE; ++n) + tmp[n] = in[n] ^ ivec[n]; + AES_encrypt(tmp, out, key); + memcpy(ivec, out, AES_BLOCK_SIZE); + len -= AES_BLOCK_SIZE; + in += AES_BLOCK_SIZE; + out += AES_BLOCK_SIZE; + } + if (len) { + for(n=0; n < len; ++n) + tmp[n] = in[n] ^ ivec[n]; + for(n=len; n < AES_BLOCK_SIZE; ++n) + tmp[n] = ivec[n]; + AES_encrypt(tmp, tmp, key); + memcpy(out, tmp, AES_BLOCK_SIZE); + memcpy(ivec, tmp, AES_BLOCK_SIZE); + } + } else { + while (len >= AES_BLOCK_SIZE) { + memcpy(tmp, in, AES_BLOCK_SIZE); + AES_decrypt(in, out, key); + for(n=0; n < AES_BLOCK_SIZE; ++n) + out[n] ^= ivec[n]; + memcpy(ivec, tmp, AES_BLOCK_SIZE); + len -= AES_BLOCK_SIZE; + in += AES_BLOCK_SIZE; + out += AES_BLOCK_SIZE; + } + if (len) { + memcpy(tmp, in, AES_BLOCK_SIZE); + AES_decrypt(tmp, tmp, key); + for(n=0; n < len; ++n) + out[n] = tmp[n] ^ ivec[n]; + memcpy(ivec, tmp, AES_BLOCK_SIZE); + } + } +} diff --git a/tools/blktap/drivers/aes.h b/tools/blktap/drivers/aes.h new file mode 100644 index 0000000000..a0167eb7d5 --- /dev/null +++ b/tools/blktap/drivers/aes.h @@ -0,0 +1,26 @@ +#ifndef QEMU_AES_H +#define QEMU_AES_H + +#define AES_MAXNR 14 +#define AES_BLOCK_SIZE 16 + +struct aes_key_st { + uint32_t rd_key[4 *(AES_MAXNR + 1)]; + int rounds; +}; +typedef struct aes_key_st AES_KEY; + +int AES_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); +int AES_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); + +void AES_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY 
*key); +void AES_decrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, + const unsigned long length, const AES_KEY *key, + unsigned char *ivec, const int enc); + +#endif diff --git a/tools/blktap/drivers/blktapctrl.c b/tools/blktap/drivers/blktapctrl.c new file mode 100644 index 0000000000..f4ade5b780 --- /dev/null +++ b/tools/blktap/drivers/blktapctrl.c @@ -0,0 +1,704 @@ +/* + * blktapctrl.c + * + * userspace controller for the blktap disks. + * As requests for new block devices arrive, + * the controller spawns off a separate process + * per-disk. + * + * + * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/user.h> +#include <err.h> +#include <errno.h> +#include <sys/types.h> +#include <linux/types.h> +#include <signal.h> +#include <fcntl.h> +#include <sys/poll.h> +#include <sys/ioctl.h> +#include <string.h> +#include <unistd.h> +#include <xs.h> +#include <printf.h> +#include <sys/time.h> +#include <syslog.h> + +#include "blktaplib.h" +#include "blktapctrl.h" +#include "tapdisk.h" + +#define NUM_POLL_FDS 2 +#define MSG_SIZE 4096 +#define MAX_TIMEOUT 10 +#define MAX_RAND_VAL 0xFFFF + +int run = 1; +int max_timeout = MAX_TIMEOUT; +int ctlfd = 0; + +static int open_ctrl_socket(char *devname); +static int write_msg(int fd, int msgtype, void *ptr, void *ptr2); +static int read_msg(int fd, int msgtype, void *ptr); +static driver_list_entry_t *active_disks[MAX_DISK_TYPES]; + +void sig_handler(int sig) +{ + run = 0; +} + +static void init_driver_list(void) +{ + int i; + + for (i = 0; i < MAX_DISK_TYPES; i++) + active_disks[i] = NULL; + return; +} + +static void init_rng(void) +{ + static uint32_t seed; + struct timeval tv; + + gettimeofday(&tv, NULL); + seed = tv.tv_usec; + srand48(seed); + return; +} + +static void make_blktap_dev(char *devname, int major, int minor) +{ + struct stat st; + + if (lstat(devname, &st) != 0) { + /*Need to create device*/ + if (mkdir(BLKTAP_DEV_DIR, 0755) == 0) + DPRINTF("Created %s directory\n",BLKTAP_DEV_DIR); + if (mknod(devname, S_IFCHR|0600, + makedev(major, minor)) == 0) + DPRINTF("Created %s device\n",devname); + } else DPRINTF("%s device already exists\n",devname); +} + +static int get_new_dev(int *major, int *minor, blkif_t *blkif) +{ + domid_translate_t tr; + int ret; + char 
*devname; + + tr.domid = blkif->domid; + tr.busid = (unsigned short)blkif->be_id; + ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr ); + + if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) { + DPRINTF("Incorrect Dev ID [%d]\n",ret); + return -1; + } + + *minor = ret; + *major = ioctl(ctlfd, BLKTAP_IOCTL_MAJOR, ret ); + if (*major < 0) { + DPRINTF("Incorrect Major ID [%d]\n",*major); + return -1; + } + + asprintf(&devname,"%s/%s%d",BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, *minor); + make_blktap_dev(devname,*major,*minor); + DPRINTF("Received device id %d and major %d, " + "sent domid %d and be_id %d\n", + *minor, *major, tr.domid, tr.busid); + return 0; +} + +static int get_tapdisk_pid(blkif_t *blkif) +{ + int ret; + + if ((ret = write_msg(blkif->fds[WRITE], CTLMSG_PID, blkif, NULL)) + <= 0) { + DPRINTF("Write_msg failed - CTLMSG_PID(%d)\n", ret); + return -EINVAL; + } + + if ((ret = read_msg(blkif->fds[READ], CTLMSG_PID_RSP, blkif)) + <= 0) { + DPRINTF("Read_msg failure - CTLMSG_PID(%d)\n", ret); + return -EINVAL; + } + return 1; +} + +static blkif_t *test_path(char *path, char **dev, int *type) +{ + char *ptr, handle[10]; + int i, size; + + size = sizeof(dtypes)/sizeof(disk_info_t *); + *type = MAX_DISK_TYPES + 1; + + if ( (ptr = strstr(path, ":"))!=NULL) { + memcpy(handle, path, (ptr - path)); + *dev = ptr + 1; + ptr = handle + (ptr - path); + *ptr = '\0'; + DPRINTF("Detected handle: [%s]\n",handle); + + for (i = 0; i < size; i++) { + if (strncmp(handle, dtypes[i]->handle, (ptr - path)) + ==0) { + *type = dtypes[i]->idnum; + + if (dtypes[i]->single_handler == 1) { + /* Check whether tapdisk process + already exists */ + if (active_disks[dtypes[i]->idnum] + == NULL) return NULL; + else + return active_disks[dtypes[i]->idnum]->blkif; + } + } + } + } else *dev = NULL; + + return NULL; +} + +static void add_disktype(blkif_t *blkif, int type) +{ + driver_list_entry_t *entry, *ptr, *last; + + if (type > MAX_DISK_TYPES) return; + + entry = malloc(sizeof(driver_list_entry_t)); + entry->blkif = 
blkif; + entry->next = NULL; + ptr = active_disks[type]; + + if (ptr == NULL) { + active_disks[type] = entry; + entry->prev = NULL; + return; + } + + while (ptr != NULL) { + last = ptr; + ptr = ptr->next; + } + + /*We've found the end of the list*/ + last->next = entry; + entry->prev = last; + + return; +} + +static int del_disktype(blkif_t *blkif) +{ + driver_list_entry_t *ptr, *cur, *last; + int type = blkif->drivertype, count = 0, close = 0; + + if (type > MAX_DISK_TYPES) return 1; + + ptr = active_disks[type]; + last = NULL; + while (ptr != NULL) { + count++; + if (blkif == ptr->blkif) { + cur = ptr; + if (ptr->next != NULL) { + /*There's more later in the chain*/ + if (!last) { + /*We're first in the list*/ + active_disks[type] = ptr->next; + ptr = ptr->next; + ptr->prev = NULL; + } + else { + /*We're sandwiched*/ + last->next = ptr->next; + ptr = ptr->next; + ptr->prev = last; + } + + } else if (last) { + /*There's more earlier in the chain*/ + last->next = NULL; + } else { + /*We're the only entry*/ + active_disks[type] = NULL; + if(dtypes[type]->single_handler == 1) + close = 1; + } + DPRINTF("DEL_DISKTYPE: Freeing entry\n"); + free(cur); + if (dtypes[type]->single_handler == 0) close = 1; + + return close; + } + last = ptr; + ptr = ptr->next; + } + DPRINTF("DEL_DISKTYPE: No match\n"); + return 1; +} + +static int write_msg(int fd, int msgtype, void *ptr, void *ptr2) +{ + blkif_t *blkif; + blkif_info_t *blk; + msg_hdr_t *msg; + msg_newdev_t *msg_dev; + char *p, *buf, *path; + int msglen, len, ret; + fd_set writefds; + struct timeval timeout; + image_t *image, *img; + uint32_t seed; + + blkif = (blkif_t *)ptr; + blk = blkif->info; + image = blkif->prv; + len = 0; + + switch (msgtype) + { + case CTLMSG_PARAMS: + path = (char *)ptr2; + DPRINTF("Write_msg called: CTLMSG_PARAMS, sending [%s, %s]\n", + blk->params, path); + + msglen = sizeof(msg_hdr_t) + strlen(path) + 1; + buf = malloc(msglen); + + /*Assign header fields*/ + msg = (msg_hdr_t *)buf; + msg->type = 
CTLMSG_PARAMS; + msg->len = msglen; + msg->drivertype = blkif->drivertype; + + gettimeofday(&timeout, NULL); + msg->cookie = blkif->cookie; + DPRINTF("Generated cookie, %d\n",blkif->cookie); + + /*Copy blk->params to msg*/ + p = buf + sizeof(msg_hdr_t); + memcpy(p, path, strlen(path) + 1); + + break; + + case CTLMSG_NEWDEV: + DPRINTF("Write_msg called: CTLMSG_NEWDEV\n"); + + msglen = sizeof(msg_hdr_t) + sizeof(msg_newdev_t); + buf = malloc(msglen); + + /*Assign header fields*/ + msg = (msg_hdr_t *)buf; + msg->type = CTLMSG_NEWDEV; + msg->len = msglen; + msg->drivertype = blkif->drivertype; + msg->cookie = blkif->cookie; + + msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t)); + msg_dev->devnum = blkif->minor; + msg_dev->domid = blkif->domid; + + break; + + case CTLMSG_CLOSE: + DPRINTF("Write_msg called: CTLMSG_CLOSE\n"); + + msglen = sizeof(msg_hdr_t); + buf = malloc(msglen); + + /*Assign header fields*/ + msg = (msg_hdr_t *)buf; + msg->type = CTLMSG_CLOSE; + msg->len = msglen; + msg->drivertype = blkif->drivertype; + msg->cookie = blkif->cookie; + + break; + + case CTLMSG_PID: + DPRINTF("Write_msg called: CTLMSG_PID\n"); + + msglen = sizeof(msg_hdr_t); + buf = malloc(msglen); + + /*Assign header fields*/ + msg = (msg_hdr_t *)buf; + msg->type = CTLMSG_PID; + msg->len = msglen; + msg->drivertype = blkif->drivertype; + msg->cookie = blkif->cookie; + + break; + + default: + return -1; + } + + /*Now send the message*/ + ret = 0; + FD_ZERO(&writefds); + FD_SET(fd,&writefds); + timeout.tv_sec = max_timeout; /*Wait for up to max_timeout seconds*/ + timeout.tv_usec = 0; + if (select(fd+1, (fd_set *) 0, &writefds, + (fd_set *) 0, &timeout) > 0) { + len = write(fd, buf, msglen); + if (len == -1) DPRINTF("Write failed: (%d)\n",errno); + } + free(buf); + + return len; +} + +static int read_msg(int fd, int msgtype, void *ptr) +{ + blkif_t *blkif; + blkif_info_t *blk; + msg_hdr_t *msg; + msg_pid_t *msg_pid; + char *p, *buf; + int msglen = MSG_SIZE, len, ret; + fd_set readfds; + 
struct timeval timeout; + image_t *image, *img; + + + blkif = (blkif_t *)ptr; + blk = blkif->info; + image = blkif->prv; + + buf = malloc(MSG_SIZE); + + ret = 0; + FD_ZERO(&readfds); + FD_SET(fd,&readfds); + timeout.tv_sec = max_timeout; /*Wait for up to max_timeout seconds*/ + timeout.tv_usec = 0; + if (select(fd+1, &readfds, (fd_set *) 0, + (fd_set *) 0, &timeout) > 0) { + ret = read(fd, buf, msglen); + + } + if (ret > 0) { + msg = (msg_hdr_t *)buf; + switch (msg->type) + { + case CTLMSG_IMG: + img = (image_t *)(buf + sizeof(msg_hdr_t)); + image->size = img->size; + image->secsize = img->secsize; + image->info = img->info; + + DPRINTF("Received CTLMSG_IMG: %lu, %lu, %lu\n", + image->size, image->secsize, image->info); + if(msgtype != CTLMSG_IMG) ret = 0; + break; + + case CTLMSG_IMG_FAIL: + DPRINTF("Received CTLMSG_IMG_FAIL, " + "unable to open image\n"); + ret = 0; + break; + + case CTLMSG_NEWDEV_RSP: + DPRINTF("Received CTLMSG_NEWDEV_RSP\n"); + if(msgtype != CTLMSG_NEWDEV_RSP) ret = 0; + break; + + case CTLMSG_NEWDEV_FAIL: + DPRINTF("Received CTLMSG_NEWDEV_FAIL\n"); + ret = 0; + break; + + case CTLMSG_CLOSE_RSP: + DPRINTF("Received CTLMSG_CLOSE_RSP\n"); + if (msgtype != CTLMSG_CLOSE_RSP) ret = 0; + break; + + case CTLMSG_PID_RSP: + DPRINTF("Received CTLMSG_PID_RSP\n"); + if (msgtype != CTLMSG_PID_RSP) ret = 0; + else { + msg_pid = (msg_pid_t *) + (buf + sizeof(msg_hdr_t)); + blkif->tappid = msg_pid->pid; + DPRINTF("\tPID: [%d]\n",blkif->tappid); + } + break; + default: + DPRINTF("UNKNOWN MESSAGE TYPE RECEIVED\n"); + ret = 0; + break; + } + } + + free(buf); + + return ret; + +} + +int blktapctrl_new_blkif(blkif_t *blkif) +{ + blkif_info_t *blk; + int major, minor, fd_read, fd_write, type, new; + char *rdctldev, *wrctldev, *cmd, *ptr; + image_t *image; + blkif_t *exist = NULL; + + DPRINTF("Received a poll for a new vbd\n"); + if ( ((blk=blkif->info) != NULL) && (blk->params != NULL) ) { + if (get_new_dev(&major, &minor, blkif)<0) + return -1; + + exist = 
test_path(blk->params, &ptr, &type); + blkif->drivertype = type; + blkif->cookie = lrand48() % MAX_RAND_VAL; + + if (!exist) { + DPRINTF("Process does not exist:\n"); + asprintf(&rdctldev, "/dev/xen/tapctrlread%d", minor); + blkif->fds[READ] = open_ctrl_socket(rdctldev); + + + asprintf(&wrctldev, "/dev/xen/tapctrlwrite%d", minor); + blkif->fds[WRITE] = open_ctrl_socket(wrctldev); + + if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) + goto fail; + + /*launch the new process*/ + asprintf(&cmd, "tapdisk %s %s", wrctldev, rdctldev); + DPRINTF("Launching process, CMDLINE [%s]\n",cmd); + if (system(cmd) == -1) { + DPRINTF("Unable to fork, cmdline: [%s]\n",cmd); + return -1; + } + + free(rdctldev); + free(wrctldev); + free(cmd); + } else { + DPRINTF("Process exists!\n"); + blkif->fds[READ] = exist->fds[READ]; + blkif->fds[WRITE] = exist->fds[WRITE]; + } + + add_disktype(blkif, type); + blkif->major = major; + blkif->minor = minor; + + image = (image_t *)malloc(sizeof(image_t)); + blkif->prv = (void *)image; + blkif->ops = &tapdisk_ops; + + /*Retrieve the PID of the new process*/ + if (get_tapdisk_pid(blkif) <= 0) { + DPRINTF("Unable to contact disk process\n"); + goto fail; + } + + /* Both of the following read and write calls will block up to + * max_timeout val*/ + if (write_msg(blkif->fds[WRITE], CTLMSG_PARAMS, blkif, ptr) + <= 0) { + DPRINTF("Write_msg failed - CTLMSG_PARAMS\n"); + goto fail; + } + + if (read_msg(blkif->fds[READ], CTLMSG_IMG, blkif) <= 0) { + DPRINTF("Read_msg failure - CTLMSG_IMG\n"); + goto fail; + } + + } else return -1; + + return 0; +fail: + ioctl(ctlfd, BLKTAP_IOCTL_FREEINTF, minor); + return -EINVAL; +} + +int map_new_blktapctrl(blkif_t *blkif) +{ + DPRINTF("Received a poll for a new devmap\n"); + if (write_msg(blkif->fds[WRITE], CTLMSG_NEWDEV, blkif, NULL) <= 0) { + DPRINTF("Write_msg failed - CTLMSG_NEWDEV\n"); + return -EINVAL; + } + + if (read_msg(blkif->fds[READ], CTLMSG_NEWDEV_RSP, blkif) <= 0) { + DPRINTF("Read_msg failed - 
CTLMSG_NEWDEV_RSP\n"); + return -EINVAL; + } + DPRINTF("Exiting map_new_blktapctrl\n"); + + return blkif->minor - 1; +} + +int unmap_blktapctrl(blkif_t *blkif) +{ + DPRINTF("Unmapping vbd\n"); + + if (write_msg(blkif->fds[WRITE], CTLMSG_CLOSE, blkif, NULL) <= 0) { + DPRINTF("Write_msg failed - CTLMSG_CLOSE\n"); + return -EINVAL; + } + + if (del_disktype(blkif)) { + close(blkif->fds[WRITE]); + close(blkif->fds[READ]); + + } + return 0; +} + +int open_ctrl_socket(char *devname) +{ + int ret; + int ipc_fd; + char *cmd; + fd_set socks; + struct timeval timeout; + + ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO); + if ( (ret != 0) && (errno != EEXIST) ) { + DPRINTF("ERROR: pipe failed (%d)\n", errno); + exit(0); + } + + ipc_fd = open(devname,O_RDWR|O_NONBLOCK); + + if (ipc_fd < 0) { + DPRINTF("FD open failed\n"); + return -1; + } + + return ipc_fd; +} + +static void print_drivers(void) +{ + int i, size; + + size = sizeof(dtypes)/sizeof(disk_info_t *); + DPRINTF("blktapctrl: v1.0.0\n"); + for (i = 0; i < size; i++) + DPRINTF("Found driver: [%s]\n",dtypes[i]->name); +} + +int main(int argc, char *argv[]) +{ + char *devname; + tapdev_info_t *ctlinfo; + int tap_pfd, store_pfd, xs_fd, ret, timeout, pfd_count; + struct xs_handle *h; + struct pollfd pfd[NUM_POLL_FDS]; + pid_t process; + + __init_blkif(); + openlog("BLKTAPCTRL", LOG_CONS|LOG_ODELAY, LOG_DAEMON); + + print_drivers(); + init_driver_list(); + init_rng(); + + register_new_blkif_hook(blktapctrl_new_blkif); + register_new_devmap_hook(map_new_blktapctrl); + register_new_unmap_hook(unmap_blktapctrl); + + /*Attach to blktap0 */ + asprintf(&devname,"%s/%s0", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME); + make_blktap_dev(devname,254,0); + ctlfd = open(devname, O_RDWR); + if (ctlfd == -1) { + DPRINTF("blktap0 open failed\n"); + goto open_failed; + } + + /* Set up store connection and watch. 
*/ + h = xs_daemon_open(); + if (h == NULL) { + DPRINTF("xs_daemon_open failed -- " + "is xenstore running?\n"); + goto open_failed; + } + + ret = add_blockdevice_probe_watch(h, "Domain-0"); + if (ret != 0) { + DPRINTF("adding device probewatch\n"); + goto open_failed; + } + + ioctl(ctlfd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE ); + + process = getpid(); + ret = ioctl(ctlfd, BLKTAP_IOCTL_SENDPID, process ); + + /*Static pollhooks*/ + pfd_count = 0; + tap_pfd = pfd_count++; + pfd[tap_pfd].fd = ctlfd; + pfd[tap_pfd].events = POLLIN; + + store_pfd = pfd_count++; + pfd[store_pfd].fd = xs_fileno(h); + pfd[store_pfd].events = POLLIN; + + while (run) { + timeout = 1000; /*Milliseconds*/ + ret = poll(pfd, pfd_count, timeout); + + if (ret > 0) { + if (pfd[store_pfd].revents) { + ret = xs_fire_next_watch(h); + } + } + } + + ioctl(ctlfd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH ); + close(ctlfd); + closelog(); + + return 0; + + open_failed: + DPRINTF("Unable to start blktapctrl\n"); + closelog(); + return -1; +} diff --git a/tools/blktap/drivers/blktapctrl.h b/tools/blktap/drivers/blktapctrl.h new file mode 100644 index 0000000000..4a5e59577e --- /dev/null +++ b/tools/blktap/drivers/blktapctrl.h @@ -0,0 +1,55 @@ +/* blktapctrl.h + * + * controller image utils. 
+ * + * (c) 2004-6 Andrew Warfield and Julian Chesterfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + + +static inline long int tapdisk_get_size(blkif_t *blkif) +{ + image_t *img = (image_t *)blkif->prv; + return img->size; +} + +static inline long int tapdisk_get_secsize(blkif_t *blkif) +{ + image_t *img = (image_t *)blkif->prv; + return img->secsize; +} + +static inline unsigned tapdisk_get_info(blkif_t *blkif) +{ + image_t *img = (image_t *)blkif->prv; + return img->info; +} + +struct blkif_ops tapdisk_ops = { + .get_size = tapdisk_get_size, + .get_secsize = tapdisk_get_secsize, + .get_info = tapdisk_get_info, +}; diff --git a/tools/blktap/drivers/block-aio.c b/tools/blktap/drivers/block-aio.c new file mode 100644 index 0000000000..ebcfc35f56 --- /dev/null +++ b/tools/blktap/drivers/block-aio.c @@ -0,0 +1,327 @@ +/* block-aio.c + * + * libaio-based raw disk implementation. + * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + * NB: This code is not thread-safe. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + + +#include <errno.h> +#include <libaio.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include "tapdisk.h" + + +/** + * We used a kernel patch to return an fd associated with the AIO context + * so that we can concurrently poll on synchronous and async descriptors. + * This is signalled by passing 1 as the io context to io_setup. + */ +#define REQUEST_ASYNC_FD 1 + +#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ * 8) + +struct pending_aio { + td_callback_t cb; + int id; + void *private; +}; + +struct tdaio_state { + int fd; + + /* libaio state */ + io_context_t aio_ctx; + struct iocb iocb_list [MAX_AIO_REQS]; + struct iocb *iocb_free [MAX_AIO_REQS]; + struct pending_aio pending_aio[MAX_AIO_REQS]; + int iocb_free_count; + struct iocb *iocb_queue[MAX_AIO_REQS]; + int iocb_queued; + int poll_fd; /* NB: we require aio_poll support */ + struct io_event aio_events[MAX_AIO_REQS]; +}; + +#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list) + +/*Get Image size, secsize*/ +static int get_image_info(struct td_state *s, int fd) +{ + int ret; + long size; + unsigned long total_size; + struct statvfs statBuf; + struct stat stat; + + ret = fstat(fd, &stat); + if (ret != 0) { + DPRINTF("ERROR: fstat failed, Couldn't stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + s->size = 0; + if 
(ioctl(fd,BLKGETSIZE,&s->size)!=0) { + DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image"); + return -EINVAL; + } + + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(s->size << SECTOR_SHIFT), + (long long unsigned)s->size); + + /*Get the sector size*/ +#if defined(BLKSSZGET) + { + int arg; + s->sector_size = DEFAULT_SECTOR_SIZE; + ioctl(fd, BLKSSZGET, &s->sector_size); + + if (s->sector_size != DEFAULT_SECTOR_SIZE) + DPRINTF("Note: sector size is %ld (not %d)\n", + s->sector_size, DEFAULT_SECTOR_SIZE); + } +#else + s->sector_size = DEFAULT_SECTOR_SIZE; +#endif + + } else { + /*Local file? try fstat instead*/ + s->size = (stat.st_size >> SECTOR_SHIFT); + s->sector_size = DEFAULT_SECTOR_SIZE; + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(s->size << SECTOR_SHIFT), + (long long unsigned)s->size); + } + + if (s->size == 0) { + s->size =((uint64_t) 16836057); + s->sector_size = DEFAULT_SECTOR_SIZE; + } + s->info = 0; + + return 0; +} + +/* Open the disk file and initialize aio state. */ +int tdaio_open (struct td_state *s, const char *name) +{ + int i, fd, ret = 0; + struct tdaio_state *prv = (struct tdaio_state *)s->private; + s->private = prv; + + DPRINTF("XXX: block-aio open('%s')", name); + /* Initialize AIO */ + prv->iocb_free_count = MAX_AIO_REQS; + prv->iocb_queued = 0; + + prv->aio_ctx = (io_context_t) REQUEST_ASYNC_FD; + prv->poll_fd = io_setup(MAX_AIO_REQS, &prv->aio_ctx); + + if (prv->poll_fd < 0) { + ret = prv->poll_fd; + DPRINTF("Couldn't get fd for AIO poll support. This is " + "probably because your kernel does not have the " + "aio-poll patch applied.\n"); + goto done; + } + + for (i=0;i<MAX_AIO_REQS;i++) + prv->iocb_free[i] = &prv->iocb_list[i]; + + /* Open the file */ + fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE); + + if ( (fd == -1) && (errno == EINVAL) ) { + + /* Maybe O_DIRECT isn't supported. 
*/ + fd = open(name, O_RDWR | O_LARGEFILE); + if (fd != -1) DPRINTF("WARNING: Accessing image without" + "O_DIRECT! (%s)\n", name); + + } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); + + if (fd == -1) { + DPRINTF("Unable to open [%s] (%d)!\n", name, 0 - errno); + ret = 0 - errno; + goto done; + } + + prv->fd = fd; + + ret = get_image_info(s, fd); +done: + return ret; +} + +int tdaio_queue_read(struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct iocb *io; + struct pending_aio *pio; + struct tdaio_state *prv = (struct tdaio_state *)s->private; + int size = nb_sectors * s->sector_size; + uint64_t offset = sector * (uint64_t)s->sector_size; + long ioidx; + + if (prv->iocb_free_count == 0) + return -ENOMEM; + io = prv->iocb_free[--prv->iocb_free_count]; + + ioidx = IOCB_IDX(prv, io); + pio = &prv->pending_aio[ioidx]; + pio->cb = cb; + pio->id = id; + pio->private = private; + + io_prep_pread(io, prv->fd, buf, size, offset); + io->data = (void *)ioidx; + + prv->iocb_queue[prv->iocb_queued++] = io; + + return 0; +} + +int tdaio_queue_write(struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct iocb *io; + struct pending_aio *pio; + struct tdaio_state *prv = (struct tdaio_state *)s->private; + int size = nb_sectors * s->sector_size; + uint64_t offset = sector * (uint64_t)s->sector_size; + long ioidx; + + if (prv->iocb_free_count == 0) + return -ENOMEM; + io = prv->iocb_free[--prv->iocb_free_count]; + + ioidx = IOCB_IDX(prv, io); + pio = &prv->pending_aio[ioidx]; + pio->cb = cb; + pio->id = id; + pio->private = private; + + io_prep_pwrite(io, prv->fd, buf, size, offset); + io->data = (void *)ioidx; + + prv->iocb_queue[prv->iocb_queued++] = io; + + return 0; +} + +int tdaio_submit(struct td_state *s) +{ + int ret; + struct tdaio_state *prv = (struct tdaio_state *)s->private; + + ret = io_submit(prv->aio_ctx, prv->iocb_queued, 
prv->iocb_queue); + + /* XXX: TODO: Handle error conditions here. */ + + /* Success case: */ + prv->iocb_queued = 0; + + return ret; +} + +int *tdaio_get_fd(struct td_state *s) +{ + struct tdaio_state *prv = (struct tdaio_state *)s->private; + int *fds, i; + + fds = malloc(sizeof(int) * MAX_IOFD); + /*initialise the FD array*/ + for(i=0;i<MAX_IOFD;i++) fds[i] = 0; + + fds[0] = prv->poll_fd; + + return fds; +} + +int tdaio_close(struct td_state *s) +{ + struct tdaio_state *prv = (struct tdaio_state *)s->private; + + io_destroy(prv->aio_ctx); + close(prv->fd); + + return 0; +} + +int tdaio_do_callbacks(struct td_state *s, int sid) +{ + int ret, i, rsp = 0; + struct io_event *ep; + struct tdaio_state *prv = (struct tdaio_state *)s->private; + + /* Non-blocking test for completed io. */ + ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events, + NULL); + + for (ep=prv->aio_events,i=ret; i-->0; ep++) { + struct iocb *io = ep->obj; + struct pending_aio *pio; + + pio = &prv->pending_aio[(long)io->data]; + + if (ep->res != io->u.c.nbytes) { + /* TODO: handle this case better. */ + DPRINTF("AIO did less than I asked it to. \n"); + } + rsp += pio->cb(s, ep->res2, pio->id, pio->private); + + prv->iocb_free[prv->iocb_free_count++] = io; + } + return rsp; +} + +struct tap_disk tapdisk_aio = { + "tapdisk_aio", + sizeof(struct tdaio_state), + tdaio_open, + tdaio_queue_read, + tdaio_queue_write, + tdaio_submit, + tdaio_get_fd, + tdaio_close, + tdaio_do_callbacks, +}; diff --git a/tools/blktap/drivers/block-qcow.c b/tools/blktap/drivers/block-qcow.c new file mode 100644 index 0000000000..7eab8c9834 --- /dev/null +++ b/tools/blktap/drivers/block-qcow.c @@ -0,0 +1,1369 @@ +/* block-qcow.c + * + * Asynchronous Qemu copy-on-write disk implementation. 
+ * Code based on the Qemu implementation + * (see copyright notice below) + * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + */ + +/* + * Block driver for the QCOW format + * + * Copyright (c) 2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files(the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> +#include <zlib.h> +#include <inttypes.h> +#include <libaio.h> +#include <openssl/md5.h> +#include "bswap.h" +#include "aes.h" +#include "tapdisk.h" + +#if 1 +#define ASSERT(_p) \ + if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#else +#define ASSERT(_p) ((void)0) +#endif + + +/******AIO DEFINES******/ +#define REQUEST_ASYNC_FD 1 +#define MAX_QCOW_IDS 0xFFFF +#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ * 8) + +struct pending_aio { + td_callback_t cb; + int id; + void *private; + int nb_sectors; + char *buf; + uint64_t sector; + int qcow_idx; +}; + +#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list) + +#define ZERO_TEST(_b) (_b | 0x00) + +/**************************************************************/ +/* QEMU COW block driver with compression and encryption support */ + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define XEN_MAGIC (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb) +#define QCOW_VERSION 1 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define 
QCOW_OFLAG_COMPRESSED (1LL << 63) + +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t mtime; + uint64_t size; /* in bytes */ + uint8_t cluster_bits; + uint8_t l2_bits; + uint32_t crypt_method; + uint64_t l1_table_offset; +} QCowHeader; + +/*Extended header for Xen enhancements*/ +typedef struct QCowHeader_ext { + uint32_t xmagic; + uint32_t cksum; + uint32_t min_cluster_alloc; +} QCowHeader_ext; + +#define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/ + +struct tdqcow_state { + int fd; /*Main Qcow file descriptor */ + uint64_t fd_end; /*Store a local record of file length */ + int bfd; /*Backing file descriptor*/ + char *name; /*Record of the filename*/ + int poll_pipe[2]; /*dummy fd for polling on */ + int encrypted; /*File contents are encrypted or plain*/ + int cluster_bits; /*Determines length of cluster as + *indicated by file hdr*/ + int cluster_size; /*Length of cluster*/ + int cluster_sectors; /*Number of sectors per cluster*/ + int cluster_alloc; /*Blktap fix for allocating full + *extents*/ + int min_cluster_alloc; /*Blktap historical extent alloc*/ + int l2_bits; /*Size of L2 table entry*/ + int l2_size; /*Full table size*/ + int l1_size; /*L1 table size*/ + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; /*L1 table offset from beginning of + *file*/ + uint64_t *l1_table; /*L1 table entries*/ + uint64_t *l2_cache; /*We maintain a cache of size + *L2_CACHE_SIZE of most read entries*/ + uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /*L2 cache entries*/ + uint32_t l2_cache_counts[L2_CACHE_SIZE]; /*Cache access record*/ + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint8_t *sector_lock; /*Locking bitmap for AIO reads/writes*/ + uint64_t cluster_cache_offset; /**/ + uint32_t crypt_method; /*current crypt method, 0 if no + *key yet */ + uint32_t crypt_method_header; /**/ + AES_KEY aes_encrypt_key; /*AES key*/ + 
AES_KEY aes_decrypt_key; /*AES key*/ + /* libaio state */ + io_context_t aio_ctx; + int nr_reqs [MAX_QCOW_IDS]; + struct iocb iocb_list [MAX_AIO_REQS]; + struct iocb *iocb_free [MAX_AIO_REQS]; + struct pending_aio pending_aio[MAX_AIO_REQS]; + int iocb_free_count; + struct iocb *iocb_queue[MAX_AIO_REQS]; + int iocb_queued; + int poll_fd; /* NB: we require aio_poll support */ + struct io_event aio_events[MAX_AIO_REQS]; +}; + +static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset); + +/* + * Allocate the zeroed sector_lock array (bs->size bytes), create the AIO + * context and its poll fd, and reset the iocb free list and the per-id + * request counters. Returns 0 on success, -1 on failure; on failure the + * sector_lock array is released again. + */ +static int init_aio_state(struct td_state *bs) +{ + int i; + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + long ioidx; + + /*Initialize Locking bitmap*/ + s->sector_lock = calloc(1, bs->size); + + if (!s->sector_lock) { + DPRINTF("Failed to allocate sector lock\n"); + goto fail; + } + + /* Initialize AIO */ + s->iocb_free_count = MAX_AIO_REQS; + s->iocb_queued = 0; + + /*Signal kernel to create Poll FD for Async completion events*/ + s->aio_ctx = (io_context_t) REQUEST_ASYNC_FD; + s->poll_fd = io_setup(MAX_AIO_REQS, &s->aio_ctx); + + if (s->poll_fd < 0) { + DPRINTF("Retrieving Async poll fd failed\n"); + goto fail; + } + + for (i=0;i<MAX_AIO_REQS;i++) + s->iocb_free[i] = &s->iocb_list[i]; + for (i=0;i<MAX_QCOW_IDS;i++) + s->nr_reqs[i] = 0; + DPRINTF("AIO state initialised\n"); + + return 0; + + fail: + /* free(NULL) is a no-op; clear the pointer so a later cleanup cannot free it twice */ + free(s->sector_lock); + s->sector_lock = NULL; + return -1; +} + +/* + *Test if block is zero.
+ * Return: + * 1 for TRUE + * 0 for FALSE + */ +static inline int IS_ZERO(char *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) { + /*if not zero, return false*/ + if (ZERO_TEST(*(buf + i))) return 0; + } + return 1; +} + +static uint32_t gen_cksum(char *ptr, int len) +{ + unsigned char *md; + uint32_t ret; + + md = malloc(MD5_DIGEST_LENGTH); + + if(!md) return 0; + + if (MD5((unsigned char *)ptr, len, md) != md) return 0; + + memcpy(&ret, md, sizeof(uint32_t)); + free(md); + return ret; +} + +static int qcow_set_key(struct td_state *bs, const char *key) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for (i = 0; i < len; i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; +#if 0 + /* test */ + { + uint8_t in[16]; + uint8_t out[16]; + uint8_t tmp[16]; + for (i=0; i<16; i++) + in[i] = i; + AES_encrypt(in, tmp, &s->aes_encrypt_key); + AES_decrypt(tmp, out, &s->aes_decrypt_key); + for (i = 0; i < 16; i++) + DPRINTF(" %02x", tmp[i]); + DPRINTF("\n"); + for (i = 0; i < 16; i++) + DPRINTF(" %02x", out[i]); + DPRINTF("\n"); + } +#endif + return 0; +} + +static int async_read(struct tdqcow_state *s, int fd, int size, + uint64_t offset, + char *buf, td_callback_t cb, + int id, uint64_t sector, int qcow_idx, void *private) +{ + struct iocb *io; + struct pending_aio *pio; + long ioidx; + + io = s->iocb_free[--s->iocb_free_count]; + + ioidx = IOCB_IDX(s, io); + pio = &s->pending_aio[ioidx]; + pio->cb = cb; + pio->id = id; + pio->private = private; + pio->nb_sectors = size/512; + pio->buf = buf; + pio->sector = sector; + pio->qcow_idx = qcow_idx; + + io_prep_pread(io, fd, buf, size, 
offset); + io->data = (void *)ioidx; + + s->iocb_queue[s->iocb_queued++] = io; + + return 1; +} + +static int async_write(struct tdqcow_state *s, int fd, int size, + uint64_t offset, + char *buf, td_callback_t cb, + int id, uint64_t sector, int qcow_idx, void *private) +{ + struct iocb *io; + struct pending_aio *pio; + long ioidx; + + io = s->iocb_free[--s->iocb_free_count]; + + ioidx = IOCB_IDX(s, io); + pio = &s->pending_aio[ioidx]; + pio->cb = cb; + pio->id = id; + pio->private = private; + pio->nb_sectors = size/512; + pio->buf = buf; + pio->sector = sector; + pio->qcow_idx = qcow_idx; + + io_prep_pwrite(io, fd, buf, size, offset); + io->data = (void *)ioidx; + + s->iocb_queue[s->iocb_queued++] = io; + + return 1; +} + +/*TODO: Fix sector span!*/ +static int aio_can_lock(struct tdqcow_state *s, uint64_t sector) +{ + return (s->sector_lock[sector] ? 0 : 1); +} + +static int aio_lock(struct tdqcow_state *s, uint64_t sector) +{ + return ++s->sector_lock[sector]; +} + +static void aio_unlock(struct tdqcow_state *s, uint64_t sector) +{ + if (!s->sector_lock[sector]) return; + + --s->sector_lock[sector]; + return; +} + +/*TODO - Use a freelist*/ +static int get_free_idx(struct tdqcow_state *s) +{ + int i; + + for(i = 0; i < MAX_QCOW_IDS; i++) { + if(s->nr_reqs[i] == 0) return i; + } + return -1; +} + +/* + * The crypt function is compatible with the linux cryptoloop + * algorithm for < 4 GB images. NOTE: out_buf == in_buf is + * supported . + */ +static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for (i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + + +/* 'allocate' is: + * + * 0 to not allocate. 
+ * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. + */ +static uint64_t get_cluster_offset(struct td_state *bs, + uint64_t offset, int allocate, + int compressed_size, + int n_start, int n_end) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector; + char *tmp_ptr, *tmp_ptr2, *l2_ptr, *l1_ptr; + uint64_t l2_offset, *l2_table, cluster_offset, tmp; + uint32_t min_count; + int new_l2_table; + + /*Check L1 table for the extent offset*/ + l1_index = offset >> (s->l2_bits + s->cluster_bits); + l2_offset = s->l1_table[l1_index]; + new_l2_table = 0; + if (!l2_offset) { + if (!allocate) + return 0; + /* + * allocating a new l2 entry + extent + * at the end of the file, we must also + * update the L1 entry safely. + */ + l2_offset = s->fd_end; + + /* round to cluster size */ + l2_offset = (l2_offset + s->cluster_size - 1) + & ~(s->cluster_size - 1); + + /* update the L1 entry */ + s->l1_table[l1_index] = l2_offset; + tmp = cpu_to_be64(l2_offset); + + /*Truncate file for L2 table + *(initialised to zero in case we crash)*/ + ftruncate(s->fd, l2_offset + (s->l2_size * sizeof(uint64_t))); + s->fd_end += (s->l2_size * sizeof(uint64_t)); + + /*Update the L1 table entry on disk + * (for O_DIRECT we write 4KByte blocks)*/ + l1_sector = (l1_index * sizeof(uint64_t)) >> 12; + l1_ptr = (char *)s->l1_table + (l1_sector << 12); + + if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) { + DPRINTF("ERROR allocating memory for L1 table\n"); + } + memcpy(tmp_ptr, l1_ptr, 4096); + + /* + * Issue non-asynchronous L1 write. + * For safety, we must ensure that + * entry is written before blocks. 
+ */ + lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET); + if (write(s->fd, tmp_ptr, 4096) != 4096) + return 0; + free(tmp_ptr); + + new_l2_table = 1; + goto cache_miss; + } else if (s->min_cluster_alloc == s->l2_size) { + /*Fast-track the request*/ + cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t)); + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + return cluster_offset + (l2_index * s->cluster_size); + } + + /*Check to see if L2 entry is already cached*/ + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for (j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + l2_table = s->l2_cache + (i << s->l2_bits); + goto found; + } + } + +cache_miss: + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = s->l2_cache + (min_index << s->l2_bits); + + /*If extent pre-allocated, read table from disk, + *otherwise write new table to disk*/ + if (new_l2_table) { + /*Should we allocate the whole extent? 
Adjustable parameter.*/ + if (s->cluster_alloc == s->l2_size) { + cluster_offset = l2_offset + + (s->l2_size * sizeof(uint64_t)); + cluster_offset = (cluster_offset + s->cluster_size - 1) + & ~(s->cluster_size - 1); + ftruncate(s->fd, cluster_offset + + (s->cluster_size * s->l2_size)); + s->fd_end = cluster_offset + + (s->cluster_size * s->l2_size); + for (i = 0; i < s->l2_size; i++) { + l2_table[i] = cpu_to_be64(cluster_offset + + (i*s->cluster_size)); + } + } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + + lseek(s->fd, l2_offset, SEEK_SET); + if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } else { + lseek(s->fd, l2_offset, SEEK_SET); + if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } + + /*Update the cache entries*/ + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + +found: + /*The extent is split into 's->l2_size' blocks of + *size 's->cluster_size'*/ + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + cluster_offset = be64_to_cpu(l2_table[l2_index]); + + if (!cluster_offset || + ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) { + if (!allocate) + return 0; + + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && + (n_end - n_start) < s->cluster_sectors) { + /* cluster is already allocated but compressed, we must + decompress it in the case it is not completely + overwritten */ + if (decompress_cluster(s, cluster_offset) < 0) + return 0; + cluster_offset = lseek(s->fd, 0, SEEK_END); + cluster_offset = (cluster_offset + s->cluster_size - 1) + & ~(s->cluster_size - 1); + /* write the cluster content - not asynchronous */ + lseek(s->fd, cluster_offset, SEEK_SET); + if (write(s->fd, s->cluster_cache, s->cluster_size) != + s->cluster_size) + return -1; + } else { + /* allocate a new cluster */ + cluster_offset = lseek(s->fd, 0, SEEK_END); + if (allocate == 1) { + /* round to cluster 
size */ + cluster_offset = + (cluster_offset + s->cluster_size - 1) + & ~(s->cluster_size - 1); + ftruncate(s->fd, cluster_offset + + s->cluster_size); + /* if encrypted, we must initialize the cluster + content which won't be written */ + if (s->crypt_method && + (n_end - n_start) < s->cluster_sectors) { + uint64_t start_sect; + start_sect = (offset & + ~(s->cluster_size - 1)) + >> 9; + memset(s->cluster_data + 512, + 0xaa, 512); + for (i = 0; i < s->cluster_sectors;i++) + { + if (i < n_start || i >= n_end) + { + encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, 1, + &s->aes_encrypt_key); + lseek(s->fd, cluster_offset + i * 512, SEEK_SET); + if (write(s->fd, s->cluster_data, 512) != 512) + return -1; + } + } + } + } else { + cluster_offset |= QCOW_OFLAG_COMPRESSED | + (uint64_t)compressed_size + << (63 - s->cluster_bits); + } + } + /* update L2 table */ + tmp = cpu_to_be64(cluster_offset); + l2_table[l2_index] = tmp; + + /*For IO_DIRECT we write 4KByte blocks*/ + l2_sector = (l2_index * sizeof(uint64_t)) >> 12; + l2_ptr = (char *)l2_table + (l2_sector << 12); + + if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) { + DPRINTF("ERROR allocating memory for L1 table\n"); + } + memcpy(tmp_ptr2, l2_ptr, 4096); + aio_lock(s, offset >> 9); + async_write(s, s->fd, 4096, l2_offset + (l2_sector << 12), + tmp_ptr2, 0, -2, offset >> 9, 0, NULL); + } + return cluster_offset; +} + +static void init_cluster_cache(struct td_state *bs) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + uint32_t count = 0; + int i, cluster_entries; + + cluster_entries = s->cluster_size / 512; + DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n", + cluster_entries, s->cluster_size); + + for (i = 0; i < bs->size; i += cluster_entries) { + if (get_cluster_offset(bs, i << 9, 0, 0, 0, 1)) count++; + if (count >= L2_CACHE_SIZE) return; + } + DPRINTF("Finished cluster initialisation, added %d entries\n", count); + return; 
+} + +static int qcow_is_allocated(struct td_state *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + + int index_in_cluster, n; + uint64_t cluster_offset; + + cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + *pnum = n; + return (cluster_offset != 0); +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + (out_len != out_buf_size) ) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset) +{ + int ret, csize; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + csize = cluster_offset >> (63 - s->cluster_bits); + csize &= (s->cluster_size - 1); + lseek(s->fd, coffset, SEEK_SET); + ret = read(s->fd, s->cluster_data, csize); + if (ret != csize) + return -1; + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data, csize) < 0) { + return -1; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +/* Open the disk file and initialize qcow state. 
*/ +int tdqcow_open (struct td_state *bs, const char *name) +{ + int fd, len, i, shift, ret, size, l1_table_size; + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + char *buf; + QCowHeader *header; + QCowHeader_ext *exthdr; + uint32_t cksum; + + DPRINTF("QCOW: Opening %s\n",name); + /* set up a pipe so that we can hand back a poll fd that won't fire.*/ + ret = pipe(s->poll_pipe); + if (ret != 0) + return (0 - errno); + + fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE); + if (fd < 0) { + DPRINTF("Unable to open %s (%d)\n",name,0 - errno); + return -1; + } + + s->fd = fd; + asprintf(&s->name,"%s", name); + + ASSERT(sizeof(header) < 512); + + ret = posix_memalign((void **)&buf, 512, 512); + if (ret != 0) goto fail; + + if (read(fd, buf, 512) != 512) + goto fail; + + header = (QCowHeader *)buf; + be32_to_cpus(&header->magic); + be32_to_cpus(&header->version); + be64_to_cpus(&header->backing_file_offset); + be32_to_cpus(&header->backing_file_size); + be32_to_cpus(&header->mtime); + be64_to_cpus(&header->size); + be32_to_cpus(&header->crypt_method); + be64_to_cpus(&header->l1_table_offset); + + if (header->magic != QCOW_MAGIC || header->version > QCOW_VERSION) + goto fail; + if (header->size <= 1 || header->cluster_bits < 9) + goto fail; + if (header->crypt_method > QCOW_CRYPT_AES) + goto fail; + s->crypt_method_header = header->crypt_method; + if (s->crypt_method_header) + s->encrypted = 1; + s->cluster_bits = header->cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = header->l2_bits; + s->l2_size = 1 << s->l2_bits; + s->cluster_alloc = s->l2_size; + bs->size = header->size / 512; + s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + + /* read the level 1 table */ + shift = s->cluster_bits + s->l2_bits; + s->l1_size = (header->size + (1LL << shift) - 1) >> shift; + + s->l1_table_offset = header->l1_table_offset; + + /*allocate a 4Kbyte multiple of memory*/ + l1_table_size = 
s->l1_size * sizeof(uint64_t); + if (l1_table_size % 4096 > 0) { + l1_table_size = ((l1_table_size >> 12) + 1) << 12; + } + ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size); + if (ret != 0) goto fail; + memset(s->l1_table, 0x00, l1_table_size); + + DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n", + (long long)s->l1_table_offset, + (int) (s->l1_size * sizeof(uint64_t)), + l1_table_size); + + lseek(fd, s->l1_table_offset, SEEK_SET); + if (read(fd, s->l1_table, l1_table_size) != l1_table_size) + goto fail; +/* for(i = 0;i < s->l1_size; i++) { + //be64_to_cpus(&s->l1_table[i]); + DPRINTF("L1[%d] => %llu\n", i, s->l1_table[i]); + }*/ + + /* alloc L2 cache */ + size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t); + ret = posix_memalign((void **)&s->l2_cache, 4096, size); + if(ret != 0) goto fail; + + size = s->cluster_size; + ret = posix_memalign((void **)&s->cluster_cache, 4096, size); + if(ret != 0) goto fail; + + ret = posix_memalign((void **)&s->cluster_data, 4096, size); + if(ret != 0) goto fail; + s->cluster_cache_offset = -1; + + /* read the backing file name */ + s->bfd = -1; + if (header->backing_file_offset != 0) { + DPRINTF("Reading backing file data\n"); + len = header->backing_file_size; + if (len > 1023) + len = 1023; + + /*TODO - Fix read size for O_DIRECT and use original fd!*/ + fd = open(name, O_RDONLY | O_LARGEFILE); + + lseek(fd, header->backing_file_offset, SEEK_SET); + if (read(fd, bs->backing_file, len) != len) + goto fail; + bs->backing_file[len] = '\0'; + close(fd); + /***********************************/ + + /*Open backing file*/ + fd = open(bs->backing_file, O_RDONLY | O_DIRECT | O_LARGEFILE); + if (fd < 0) { + DPRINTF("Unable to open backing file: %s\n", + bs->backing_file); + goto fail; + } + s->bfd = fd; + s->cluster_alloc = 1; /*Cannot use pre-alloc*/ + } + + bs->sector_size = 512; + bs->info = 0; + + /*Detect min_cluster_alloc*/ + s->min_cluster_alloc = 1; /*Default*/ + if (s->bfd == -1 && (s->l1_table_offset % 
4096 == 0) ) { + /*We test to see if the xen magic # exists*/ + exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader)); + be32_to_cpus(&exthdr->xmagic); + if(exthdr->xmagic != XEN_MAGIC) + goto end_xenhdr; + + /*Finally check the L1 table cksum*/ + be32_to_cpus(&exthdr->cksum); + cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t)); + if(exthdr->cksum != cksum) + goto end_xenhdr; + + be32_to_cpus(&exthdr->min_cluster_alloc); + s->min_cluster_alloc = exthdr->min_cluster_alloc; + } + + end_xenhdr: + if (init_aio_state(bs)!=0) { + DPRINTF("Unable to initialise AIO state\n"); + goto fail; + } + s->fd_end = lseek(s->fd, 0, SEEK_END); + + return 0; + +fail: + DPRINTF("QCOW Open failed\n"); + free(s->l1_table); + free(s->l2_cache); + free(s->cluster_cache); + free(s->cluster_data); + close(fd); + return -1; +} + + int tdqcow_queue_read(struct td_state *bs, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + int ret = 0, index_in_cluster, n, i, qcow_idx, asubmit = 0; + uint64_t cluster_offset; + + /*Check we can get a lock*/ + for (i = 0; i < nb_sectors; i++) + if (!aio_can_lock(s, sector + i)) { + DPRINTF("AIO_CAN_LOCK failed [%llu]\n", + (long long) sector + i); + return -EBUSY; + } + + /*We store a local record of the request*/ + qcow_idx = get_free_idx(s); + while (nb_sectors > 0) { + cluster_offset = + get_cluster_offset(bs, sector << 9, 0, 0, 0, 0); + index_in_cluster = sector & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + + if (s->iocb_free_count == 0 || !aio_lock(s, sector)) { + DPRINTF("AIO_LOCK or iocb_free_count (%d) failed" + "[%llu]\n", s->iocb_free_count, + (long long) sector); + return -ENOMEM; + } + + if (!cluster_offset && (s->bfd > 0)) { + s->nr_reqs[qcow_idx]++; + asubmit += async_read(s, s->bfd, n * 512, sector << 9, + buf, cb, id, sector, + qcow_idx, private); + } 
else if(!cluster_offset) { + memset(buf, 0, 512 * n); + aio_unlock(s, sector); + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + if (decompress_cluster(s, cluster_offset) < 0) { + ret = -1; + goto done; + } + memcpy(buf, s->cluster_cache + index_in_cluster * 512, + 512 * n); + } else { + s->nr_reqs[qcow_idx]++; + asubmit += async_read(s, s->fd, n * 512, + (cluster_offset + + index_in_cluster * 512), + buf, cb, id, sector, + qcow_idx, private); + } + nb_sectors -= n; + sector += n; + buf += n * 512; + } +done: + /*Callback if no async requests outstanding*/ + if (!asubmit) return cb(bs, ret == -1 ? -1 : 0, id, private); + + return 0; +} + + int tdqcow_queue_write(struct td_state *bs, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + int ret = 0, index_in_cluster, n, i, qcow_idx, asubmit = 0; + uint64_t cluster_offset; + + /*Check we can get a lock*/ + for (i = 0; i < nb_sectors; i++) + if (!aio_can_lock(s, sector + i)) { + DPRINTF("AIO_CAN_LOCK failed [%llu]\n", + (long long) (sector + i)); + return -EBUSY; + } + + /*We store a local record of the request*/ + qcow_idx = get_free_idx(s); + while (nb_sectors > 0) { + index_in_cluster = sector & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + + if (s->iocb_free_count == 0 || !aio_lock(s, sector)){ + DPRINTF("AIO_LOCK or iocb_free_count (%d) failed" + "[%llu]\n", s->iocb_free_count, + (long long) sector); + return -ENOMEM; + } + + if (!IS_ZERO(buf,n * 512)) { + + cluster_offset = get_cluster_offset(bs, sector << 9, + 1, 0, + index_in_cluster, + index_in_cluster+n + ); + if (!cluster_offset) { + DPRINTF("Ooops, no write cluster offset!\n"); + ret = -1; + goto done; + } + + if (s->crypt_method) { + encrypt_sectors(s, sector, s->cluster_data, + (unsigned char *)buf, n, 1, + &s->aes_encrypt_key); + s->nr_reqs[qcow_idx]++; + asubmit += 
async_write(s, s->fd, n * 512, + (cluster_offset + + index_in_cluster*512), + (char *)s->cluster_data, + cb, id, sector, + qcow_idx, private); + } else { + s->nr_reqs[qcow_idx]++; + asubmit += async_write(s, s->fd, n * 512, + (cluster_offset + + index_in_cluster*512), + buf, cb, id, sector, + qcow_idx, private); + } + } else { + /*Write data contains zeros, but we must check to see + if cluster already allocated*/ + cluster_offset = get_cluster_offset(bs, sector << 9, + 0, 0, + index_in_cluster, + index_in_cluster+n + ); + if(cluster_offset) { + if (s->crypt_method) { + encrypt_sectors(s, sector, + s->cluster_data, + (unsigned char *)buf, + n, 1, + &s->aes_encrypt_key); + s->nr_reqs[qcow_idx]++; + asubmit += async_write(s, s->fd, + n * 512, + (cluster_offset+ + index_in_cluster * 512), + (char *)s->cluster_data, cb, id, sector, + qcow_idx, private); + } else { + s->nr_reqs[qcow_idx]++; + asubmit += async_write(s, s->fd, n*512, + cluster_offset + index_in_cluster * 512, + buf, cb, id, sector, + qcow_idx, private); + } + } + else aio_unlock(s, sector); + } + nb_sectors -= n; + sector += n; + buf += n * 512; + } + s->cluster_cache_offset = -1; /* disable compressed cache */ + +done: + /*Callback if no async requests outstanding*/ + if (!asubmit) return cb(bs, ret == -1 ? -1 : 0, id, private); + + return 0; +} + +int tdqcow_submit(struct td_state *bs) +{ + int ret; + struct tdqcow_state *prv = (struct tdqcow_state *)bs->private; + + ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue); + + /* XXX: TODO: Handle error conditions here. 
*/ + + /* Success case: */ + prv->iocb_queued = 0; + + return ret; +} + + +int *tdqcow_get_fd(struct td_state *bs) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + int *fds, i; + + fds = malloc(sizeof(int) * MAX_IOFD); + /*initialise the FD array*/ + for(i=0;i<MAX_IOFD;i++) fds[i] = 0; + + fds[0] = s->poll_fd; + return fds; +} + +int tdqcow_close(struct td_state *bs) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + uint32_t cksum, out; + int fd, offset; + + /*Update the hdr cksum*/ + if(s->min_cluster_alloc == s->l2_size) { + cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t)); + printf("Writing cksum: %d",cksum); + fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/ + offset = sizeof(QCowHeader) + sizeof(uint32_t); + lseek(fd, offset, SEEK_SET); + out = cpu_to_be32(cksum); + write(fd, &out, sizeof(uint32_t)); + close(fd); + } + + free(s->name); + free(s->l1_table); + free(s->l2_cache); + free(s->cluster_cache); + free(s->cluster_data); + close(s->fd); + return 0; +} + +int tdqcow_do_callbacks(struct td_state *s, int sid) +{ + int ret, i, rsp = 0,*ptr; + struct io_event *ep; + struct tdqcow_state *prv = (struct tdqcow_state *)s->private; + + if (sid > MAX_IOFD) return 1; + + /* Non-blocking test for completed io. */ + ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events, + NULL); + + for (ep=prv->aio_events, i = ret; i-->0; ep++) { + struct iocb *io = ep->obj; + struct pending_aio *pio; + + pio = &prv->pending_aio[(long)io->data]; + + if (ep->res != io->u.c.nbytes) { + /* TODO: handle this case better. 
*/ + ptr = (int *)&ep->res; + DPRINTF("AIO did less than I asked it to " + "[%lu,%lu,%d]\n", + ep->res, io->u.c.nbytes, *ptr); + } + aio_unlock(prv, pio->sector); + if (pio->id >= 0) { + if (prv->crypt_method) + encrypt_sectors(prv, pio->sector, + (unsigned char *)pio->buf, + (unsigned char *)pio->buf, + pio->nb_sectors, 0, + &prv->aes_decrypt_key); + prv->nr_reqs[pio->qcow_idx]--; + if (prv->nr_reqs[pio->qcow_idx] == 0) + rsp += pio->cb(s, ep->res2, pio->id, + pio->private); + } else if (pio->id == -2) free(pio->buf); + + prv->iocb_free[prv->iocb_free_count++] = io; + } + return rsp; +} + +int qcow_create(const char *filename, uint64_t total_size, + const char *backing_file, int flags) +{ + int fd, header_size, backing_filename_len, l1_size, i; + int shift, length, adjust, ret = 0; + QCowHeader header; + QCowHeader_ext exthdr; + char backing_filename[1024], *ptr; + uint64_t tmp, size; + struct stat st; + + DPRINTF("Qcow_create: size %llu\n",(long long unsigned)total_size); + + fd = open(filename, + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, + 0644); + if (fd < 0) + return -1; + + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(QCOW_VERSION); + + /*Create extended header fields*/ + exthdr.xmagic = cpu_to_be32(XEN_MAGIC); + + header_size = sizeof(header) + sizeof(QCowHeader_ext); + backing_filename_len = 0; + size = (total_size >> SECTOR_SHIFT); + if (backing_file) { + if (strcmp(backing_file, "fat:")) { + const char *p; + /* XXX: this is a hack: we do not attempt to + *check for URL like syntax */ + p = strchr(backing_file, ':'); + if (p && (p - backing_file) >= 2) { + /* URL like but exclude "c:" like filenames */ + strncpy(backing_filename, backing_file, + sizeof(backing_filename)); + } else { + realpath(backing_file, backing_filename); + if (stat(backing_filename, &st) != 0) { + return -1; + } + } + header.backing_file_offset = cpu_to_be64(header_size); + backing_filename_len = 
strlen(backing_filename); + header.backing_file_size = cpu_to_be32( + backing_filename_len); + header_size += backing_filename_len; + + /*Set to the backing file size*/ + size = (st.st_size >> SECTOR_SHIFT); + DPRINTF("Backing file size detected: %lld sectors" + "(total %lld [%lld MB])\n", + (long long)total_size, + (long long)(total_size << SECTOR_SHIFT), + (long long)(total_size >> 11)); + } else { + backing_file = NULL; + DPRINTF("Setting file size: %lld (total %lld)\n", + (long long) total_size, + (long long) (total_size << SECTOR_SHIFT)); + } + header.mtime = cpu_to_be32(st.st_mtime); + header.cluster_bits = 9; /* 512 byte cluster to avoid copying + unmodifyed sectors */ + header.l2_bits = 12; /* 32 KB L2 tables */ + exthdr.min_cluster_alloc = cpu_to_be32(1); + } else { + DPRINTF("Setting file size: %lld sectors" + "(total %lld [%lld MB])\n", + (long long) size, + (long long) (size << SECTOR_SHIFT), + (long long) (size >> 11)); + header.cluster_bits = 12; /* 4 KB clusters */ + header.l2_bits = 9; /* 4 KB L2 tables */ + exthdr.min_cluster_alloc = cpu_to_be32(1 << 9); + } + /*Set the header size value*/ + header.size = cpu_to_be64(size * 512); + + header_size = (header_size + 7) & ~7; + if (header_size % 4096 > 0) { + header_size = ((header_size >> 12) + 1) << 12; + } + + shift = header.cluster_bits + header.l2_bits; + l1_size = ((size * 512) + (1LL << shift) - 1) >> shift; + + header.l1_table_offset = cpu_to_be64(header_size); + DPRINTF("L1 Table offset: %d, size %d\n", + header_size, + (int)(l1_size * sizeof(uint64_t))); + if (flags) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + ptr = calloc(1, l1_size * sizeof(uint64_t)); + exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t))); + printf("Created cksum: %d\n",exthdr.cksum); + free(ptr); + + /* write all the data */ + ret += write(fd, &header, sizeof(header)); + ret += write(fd, &exthdr, sizeof(exthdr)); + if 
(backing_file) { + ret += write(fd, backing_filename, backing_filename_len); + } + lseek(fd, header_size, SEEK_SET); + tmp = 0; + for (i = 0;i < l1_size; i++) { + ret += write(fd, &tmp, sizeof(tmp)); + } + + /*adjust file length to 4 KByte boundary*/ + length = header_size + l1_size * sizeof(uint64_t); + if (length % 4096 > 0) { + length = ((length >> 12) + 1) << 12; + ftruncate(fd, length); + DPRINTF("Adjusted filelength to %d for 4 " + "Kbyte alignment\n",length); + } + + close(fd); + + return 0; +} + +int qcow_make_empty(struct td_state *bs) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + + memset(s->l1_table, 0, l1_length); + lseek(s->fd, s->l1_table_offset, SEEK_SET); + if (write(s->fd, s->l1_table, l1_length) < 0) + return -1; + ftruncate(s->fd, s->l1_table_offset + l1_length); + + memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + + return 0; +} + +int qcow_get_cluster_size(struct td_state *bs) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + + return s->cluster_size; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +int qcow_compress_cluster(struct td_state *bs, int64_t sector_num, + const uint8_t *buf) +{ + struct tdqcow_state *s = (struct tdqcow_state *)bs->private; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + if (!out_buf) + return -1; + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + free(out_buf); + return -1; + } + + strm.avail_in = s->cluster_size; + strm.next_in = 
(uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + free(out_buf); + deflateEnd(&strm); + return -1; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + //tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors); + } else { + cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, + out_len, 0, 0); + cluster_offset &= s->cluster_offset_mask; + lseek(s->fd, cluster_offset, SEEK_SET); + if (write(s->fd, out_buf, out_len) != out_len) { + free(out_buf); + return -1; + } + } + + free(out_buf); + return 0; +} + +struct tap_disk tapdisk_qcow = { + "tapdisk_qcow", + sizeof(struct tdqcow_state), + tdqcow_open, + tdqcow_queue_read, + tdqcow_queue_write, + tdqcow_submit, + tdqcow_get_fd, + tdqcow_close, + tdqcow_do_callbacks, +}; + diff --git a/tools/blktap/drivers/block-ram.c b/tools/blktap/drivers/block-ram.c new file mode 100644 index 0000000000..4c378ed427 --- /dev/null +++ b/tools/blktap/drivers/block-ram.c @@ -0,0 +1,296 @@ +/* block-ram.c + * + * Fast Ramdisk implementation. 
+ * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> +#include "tapdisk.h" + +#define MAX_DISK_SIZE 1024000 /*500MB disk limit*/ + +char *img; +long int disksector_size; +long int disksize; +long int diskinfo; +static int connections = 0; + +struct tdram_state { + int fd; + int poll_pipe[2]; /* dummy fd for polling on */ +}; + +/*Get Image size, secsize*/ +static int get_image_info(struct td_state *s, int fd) +{ + int ret; + long size; + unsigned long total_size; + struct statvfs statBuf; + struct stat stat; + + ret = fstat(fd, &stat); + if (ret != 0) { + DPRINTF("ERROR: fstat failed, Couldn't stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + s->size = 0; + if (ioctl(fd,BLKGETSIZE,&s->size)!=0) { + DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image"); + return -EINVAL; + } + + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(s->size << SECTOR_SHIFT), + (long long unsigned)s->size); + + /*Get the sector size*/ +#if defined(BLKSSZGET) + { + int arg; + s->sector_size = DEFAULT_SECTOR_SIZE; + ioctl(fd, BLKSSZGET, &s->sector_size); + + if (s->sector_size != DEFAULT_SECTOR_SIZE) + DPRINTF("Note: sector size is %ld (not %d)\n", + s->sector_size, DEFAULT_SECTOR_SIZE); + } +#else + s->sector_size = DEFAULT_SECTOR_SIZE; +#endif + + } else { + /*Local file? 
try fstat instead*/ + s->size = (stat.st_size >> SECTOR_SHIFT); + s->sector_size = DEFAULT_SECTOR_SIZE; + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(s->size << SECTOR_SHIFT), + (long long unsigned)s->size); + } + + if (s->size == 0) { + s->size =((uint64_t) MAX_DISK_SIZE); + s->sector_size = DEFAULT_SECTOR_SIZE; + } + s->info = 0; + + /*Store variables locally*/ + disksector_size = s->sector_size; + disksize = s->size; + diskinfo = s->info; + DPRINTF("Image sector_size: \n\t[%lu]\n", + s->sector_size); + + return 0; +} + +/* Open the disk file and initialize ram state. */ +int tdram_open (struct td_state *s, const char *name) +{ + int i, fd, ret = 0, count = 0; + struct tdram_state *prv = (struct tdram_state *)s->private; + uint64_t size; + char *p; + s->private = prv; + + connections++; + + /* set up a pipe so that we can hand back a poll fd that won't fire.*/ + ret = pipe(prv->poll_pipe); + if (ret != 0) + return (0 - errno); + + if (connections > 1) { + s->sector_size = disksector_size; + s->size = disksize; + s->info = diskinfo; + DPRINTF("Image already open, returning parameters:\n"); + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(s->size << SECTOR_SHIFT), + (long long unsigned)s->size); + DPRINTF("Image sector_size: \n\t[%lu]\n", + s->sector_size); + + prv->fd = -1; + goto done; + } + + /* Open the file */ + fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE); + + if ((fd == -1) && (errno == EINVAL)) { + + /* Maybe O_DIRECT isn't supported. */ + fd = open(name, O_RDWR | O_LARGEFILE); + if (fd != -1) DPRINTF("WARNING: Accessing image without" + "O_DIRECT! 
(%s)\n", name); + + } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); + + if (fd == -1) { + DPRINTF("Unable to open [%s]!\n",name); + ret = 0 - errno; + goto done; + } + + prv->fd = fd; + + ret = get_image_info(s, fd); + size = MAX_DISK_SIZE; + + if (s->size > size) { + DPRINTF("Disk exceeds limit, must be less than [%d]MB", + (MAX_DISK_SIZE<<SECTOR_SHIFT)>>20); + return -ENOMEM; + } + + /*Read the image into memory*/ + p = img = malloc(s->size << SECTOR_SHIFT); + if (img == NULL) { + DPRINTF("Mem malloc failed\n"); + return -1; + } + DPRINTF("Reading %llu bytes.......",(long long unsigned)s->size << SECTOR_SHIFT); + + for (i = 0; i < s->size; i++) { + ret = read(prv->fd, p, s->sector_size); + if (ret != s->sector_size) { + ret = 0 - errno; + break; + } else { + count += ret; + p = img + count; + } + } + DPRINTF("[%d]\n",count); + if (count != s->size << SECTOR_SHIFT) { + ret = -1; + } else { + ret = 0; + } + +done: + return ret; +} + + int tdram_queue_read(struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct tdram_state *prv = (struct tdram_state *)s->private; + int size = nb_sectors * s->sector_size; + uint64_t offset = sector * (uint64_t)s->sector_size; + int ret; + + memcpy(buf, img + offset, size); + ret = size; + + cb(s, (ret < 0) ? ret: 0, id, private); + + return ret; +} + + int tdram_queue_write(struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct tdram_state *prv = (struct tdram_state *)s->private; + int size = nb_sectors * s->sector_size; + uint64_t offset = sector * (uint64_t)s->sector_size; + int ret; + + /*We assume that write access is controlled at a higher level for multiple disks*/ + memcpy(img + offset, buf, size); + ret = size; + + cb(s, (ret < 0) ? 
ret : 0, id, private); + + return ret; +} + +int tdram_submit(struct td_state *s) +{ + return 0; +} + + +int *tdram_get_fd(struct td_state *s) +{ + struct tdram_state *prv = (struct tdram_state *)s->private; + int *fds, i; + + fds = malloc(sizeof(int) * MAX_IOFD); + /*initialise the FD array*/ + for(i=0;i<MAX_IOFD;i++) fds[i] = 0; + + fds[0] = prv->poll_pipe[0]; + return fds; +} + +int tdram_close(struct td_state *s) +{ + struct tdram_state *prv = (struct tdram_state *)s->private; + + connections--; + + return 0; +} + +int tdram_do_callbacks(struct td_state *s, int sid) +{ + /* always ask for a kick */ + return 1; +} + +struct tap_disk tapdisk_ram = { + "tapdisk_ram", + sizeof(struct tdram_state), + tdram_open, + tdram_queue_read, + tdram_queue_write, + tdram_submit, + tdram_get_fd, + tdram_close, + tdram_do_callbacks, +}; + diff --git a/tools/blktap/drivers/block-sync.c b/tools/blktap/drivers/block-sync.c new file mode 100644 index 0000000000..77865cc1ab --- /dev/null +++ b/tools/blktap/drivers/block-sync.c @@ -0,0 +1,242 @@ +/* block-sync.c + * + * simple slow synchronous raw disk implementation. 
+ * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include "tapdisk.h" + +struct tdsync_state { + int fd; + int poll_pipe[2]; /* dummy fd for polling on */ +}; + +/*Get Image size, secsize*/ +static int get_image_info(struct td_state *s, int fd) +{ + int ret; + long size; + unsigned long total_size; + struct statvfs statBuf; + struct stat stat; + + ret = fstat(fd, &stat); + if (ret != 0) { + DPRINTF("ERROR: fstat failed, Couldn't stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + s->size = 0; + if (ioctl(fd,BLKGETSIZE,&s->size)!=0) { + DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image"); + return -EINVAL; + } + + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(s->size << SECTOR_SHIFT), + (long long unsigned)s->size); + + /*Get the sector size*/ +#if defined(BLKSSZGET) + { + int arg; + s->sector_size = DEFAULT_SECTOR_SIZE; + ioctl(fd, BLKSSZGET, &s->sector_size); + + if (s->sector_size != DEFAULT_SECTOR_SIZE) + DPRINTF("Note: sector size is %ld (not %d)\n", + s->sector_size, DEFAULT_SECTOR_SIZE); + } +#else + s->sector_size = DEFAULT_SECTOR_SIZE; +#endif + + } else { + /*Local file? try fstat instead*/ + s->size = (stat.st_size >> SECTOR_SHIFT); + s->sector_size = DEFAULT_SECTOR_SIZE; + DPRINTF("Image size: \n\tpre sector_shift [%lluu]\n\tpost " + "sector_shift [%lluu]\n", + (long long unsigned)(s->size << SECTOR_SHIFT), + (long long unsigned)s->size); + } + + if (s->size == 0) + return -EINVAL; + + s->info = 0; + + return 0; +} + +/* Open the disk file and initialize aio state. 
*/ +int tdsync_open (struct td_state *s, const char *name) +{ + int i, fd, ret = 0; + struct tdsync_state *prv = (struct tdsync_state *)s->private; + s->private = prv; + + /* set up a pipe so that we can hand back a poll fd that won't fire.*/ + ret = pipe(prv->poll_pipe); + if (ret != 0) + return (0 - errno); + + /* Open the file */ + fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE); + + if ( (fd == -1) && (errno == EINVAL) ) { + + /* Maybe O_DIRECT isn't supported. */ + fd = open(name, O_RDWR | O_LARGEFILE); + if (fd != -1) DPRINTF("WARNING: Accessing image without" + "O_DIRECT! (%s)\n", name); + + } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); + + if (fd == -1) { + DPRINTF("Unable to open [%s]!\n",name); + ret = 0 - errno; + goto done; + } + + prv->fd = fd; + + ret = get_image_info(s, fd); +done: + return ret; +} + + int tdsync_queue_read(struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct tdsync_state *prv = (struct tdsync_state *)s->private; + int size = nb_sectors * s->sector_size; + uint64_t offset = sector * (uint64_t)s->sector_size; + int ret; + + ret = lseek(prv->fd, offset, SEEK_SET); + if (ret != (off_t)-1) { + ret = read(prv->fd, buf, size); + if (ret != size) { + ret = 0 - errno; + } else { + ret = 1; + } + } else ret = 0 - errno; + + cb(s, (ret < 0) ? ret: 0, id, private); + + return 1; +} + + int tdsync_queue_write(struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct tdsync_state *prv = (struct tdsync_state *)s->private; + int size = nb_sectors * s->sector_size; + uint64_t offset = sector * (uint64_t)s->sector_size; + int ret = 0; + + ret = lseek(prv->fd, offset, SEEK_SET); + if (ret != (off_t)-1) { + ret = write(prv->fd, buf, size); + if (ret != size) { + ret = 0 - errno; + } else { + ret = 1; + } + } else ret = 0 - errno; + + cb(s, (ret < 0) ? 
ret : 0, id, private); + + return 1; +} + +int tdsync_submit(struct td_state *s) +{ + return 0; +} + + +int *tdsync_get_fd(struct td_state *s) +{ + struct tdsync_state *prv = (struct tdsync_state *)s->private; + + int *fds, i; + + fds = malloc(sizeof(int) * MAX_IOFD); + /*initialise the FD array*/ + for(i=0;i<MAX_IOFD;i++) fds[i] = 0; + + fds[0] = prv->poll_pipe[0]; + return fds; +} + +int tdsync_close(struct td_state *s) +{ + struct tdsync_state *prv = (struct tdsync_state *)s->private; + + close(prv->fd); + close(prv->poll_pipe[0]); + close(prv->poll_pipe[1]); + + return 0; +} + +int tdsync_do_callbacks(struct td_state *s, int sid) +{ + /* always ask for a kick */ + return 1; +} + +struct tap_disk tapdisk_sync = { + "tapdisk_sync", + sizeof(struct tdsync_state), + tdsync_open, + tdsync_queue_read, + tdsync_queue_write, + tdsync_submit, + tdsync_get_fd, + tdsync_close, + tdsync_do_callbacks, +}; + diff --git a/tools/blktap/drivers/block-vmdk.c b/tools/blktap/drivers/block-vmdk.c new file mode 100644 index 0000000000..437cd5c01f --- /dev/null +++ b/tools/blktap/drivers/block-vmdk.c @@ -0,0 +1,415 @@ +/* block-vmdk.c + * + * VMware Disk format implementation. + * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + * This is largely the same as the vmdk driver in Qemu, I've just twisted it + * to match our interfaces. 
The original (BSDish) Copyright message appears + * below: + */ + +/* + * Block driver for the VMDK format + * + * Copyright (c) 2004 Fabrice Bellard + * Copyright (c) 2005 Filip Navara + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> +#include "tapdisk.h" +#include "bswap.h" + +#define safer_free(_x) \ + do { \ + if (NULL != _x) { \ + free(_x); \ + (_x) = NULL; \ + } \ + } while (0) ; + +#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') +#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') + +typedef struct { + uint32_t version; + uint32_t flags; + uint32_t disk_sectors; + uint32_t granularity; + uint32_t l1dir_offset; + uint32_t l1dir_size; + uint32_t file_sectors; + uint32_t cylinders; + uint32_t heads; + uint32_t sectors_per_track; +} VMDK3Header; + +typedef struct { + uint32_t version; + uint32_t flags; + int64_t capacity; + int64_t granularity; + int64_t desc_offset; + int64_t desc_size; + int32_t num_gtes_per_gte; + int64_t rgd_offset; + int64_t gd_offset; + int64_t grain_offset; + char filler[1]; + char check_bytes[4]; +} __attribute__((packed)) VMDK4Header; + +#define L2_CACHE_SIZE 16 + +struct tdvmdk_state { + int fd; + int poll_pipe[2]; /* dummy fd for polling on */ + + unsigned int l1_size; + int64_t l1_table_offset; + int64_t l1_backup_table_offset; + uint32_t l1_entry_sectors; + unsigned int l2_size; + + uint32_t *l1_table; + uint32_t *l1_backup_table; + uint32_t *l2_cache; + uint32_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + + unsigned int cluster_sectors; +}; + + +/* Open the disk file and initialize aio state. 
*/ +static int tdvmdk_open (struct td_state *s, const char *name) +{ + int ret, fd; + int l1_size, i; + uint32_t magic; + struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private; + + /* set up a pipe so that we can hand back a poll fd that won't fire.*/ + ret = pipe(prv->poll_pipe); + if (ret != 0) + return -1; + + /* Open the file */ + fd = open(name, O_RDWR | O_LARGEFILE); + + if ( (fd == -1) && (errno == EINVAL) ) { + + /* Maybe O_DIRECT isn't supported. */ + fd = open(name, O_RDWR | O_LARGEFILE); + if (fd != -1) DPRINTF("WARNING: Accessing image without" + "O_DIRECT! (%s)\n", name); + + } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); + + if (fd == -1) { + DPRINTF("Unable to open [%s]!\n",name); + ret = 0 - errno; + return -1; + } + + prv->fd = fd; + + /* Grok the vmdk header. */ + if ((ret = read(fd, &magic, sizeof(magic))) != sizeof(magic)) + goto fail; + magic = be32_to_cpu(magic); + if (magic == VMDK3_MAGIC) { + VMDK3Header header; + if (read(fd, &header, sizeof(header)) != + sizeof(header)) + goto fail; + prv->cluster_sectors = le32_to_cpu(header.granularity); + prv->l2_size = 1 << 9; + prv->l1_size = 1 << 6; + s->size = le32_to_cpu(header.disk_sectors); + prv->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9; + prv->l1_backup_table_offset = 0; + prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors; + } else if (magic == VMDK4_MAGIC) { + VMDK4Header header; + + if (read(fd, &header, sizeof(header)) != sizeof(header)) + goto fail; + s->size = le32_to_cpu(header.capacity); + prv->cluster_sectors = le32_to_cpu(header.granularity); + prv->l2_size = le32_to_cpu(header.num_gtes_per_gte); + prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors; + if (prv->l1_entry_sectors <= 0) + goto fail; + prv->l1_size = (s->size + prv->l1_entry_sectors - 1) + / prv->l1_entry_sectors; + prv->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9; + prv->l1_backup_table_offset = + le64_to_cpu(header.gd_offset) << 9; + } else { + goto fail; 
+ } + /* read the L1 table */ + l1_size = prv->l1_size * sizeof(uint32_t); + prv->l1_table = malloc(l1_size); + if (!prv->l1_table) + goto fail; + if (lseek(fd, prv->l1_table_offset, SEEK_SET) == -1) + goto fail; + if (read(fd, prv->l1_table, l1_size) != l1_size) + goto fail; + for (i = 0; i < prv->l1_size; i++) { + le32_to_cpus(&prv->l1_table[i]); + } + + if (prv->l1_backup_table_offset) { + prv->l1_backup_table = malloc(l1_size); + if (!prv->l1_backup_table) + goto fail; + if (lseek(fd, prv->l1_backup_table_offset, SEEK_SET) == -1) + goto fail; + if (read(fd, prv->l1_backup_table, l1_size) != l1_size) + goto fail; + for(i = 0; i < prv->l1_size; i++) { + le32_to_cpus(&prv->l1_backup_table[i]); + } + } + + prv->l2_cache = malloc(prv->l2_size * L2_CACHE_SIZE *sizeof(uint32_t)); + if (!prv->l2_cache) + goto fail; + prv->fd = fd; + DPRINTF("VMDK File opened successfully\n"); + return 0; + +fail: + DPRINTF("VMDK File open failed.\n"); + safer_free(prv->l1_backup_table); + free(prv->l1_table); + free(prv->l2_cache); + close(fd); + return -1; +} + +static uint64_t get_cluster_offset(struct td_state *s, + uint64_t offset, int allocate) +{ + struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private; + unsigned int l1_index, l2_offset, l2_index; + int min_index, i, j; + uint32_t min_count, *l2_table, tmp; + uint64_t cluster_offset; + + l1_index = (offset >> 9) / prv->l1_entry_sectors; + if (l1_index >= prv->l1_size) + return 0; + l2_offset = prv->l1_table[l1_index]; + if (!l2_offset) + return 0; + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == prv->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++prv->l2_cache_counts[i] == 0xffffffff) { + for(j = 0; j < L2_CACHE_SIZE; j++) { + prv->l2_cache_counts[j] >>= 1; + } + } + l2_table = prv->l2_cache + (i * prv->l2_size); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for (i = 0; i < L2_CACHE_SIZE; i++) { + if 
(prv->l2_cache_counts[i] < min_count) { + min_count = prv->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = prv->l2_cache + (min_index * prv->l2_size); + lseek(prv->fd, (int64_t)l2_offset * 512, SEEK_SET); + if (read(prv->fd, l2_table, prv->l2_size * sizeof(uint32_t)) != + prv->l2_size * sizeof(uint32_t)) + return 0; + prv->l2_cache_offsets[min_index] = l2_offset; + prv->l2_cache_counts[min_index] = 1; + found: + l2_index = ((offset >> 9) / prv->cluster_sectors) % prv->l2_size; + cluster_offset = le32_to_cpu(l2_table[l2_index]); + if (!cluster_offset) { + if (!allocate) + return 0; + cluster_offset = lseek(prv->fd, 0, SEEK_END); + ftruncate(prv->fd, cluster_offset + + (prv->cluster_sectors << 9)); + cluster_offset >>= 9; + /* update L2 table */ + tmp = cpu_to_le32(cluster_offset); + l2_table[l2_index] = tmp; + lseek(prv->fd, ((int64_t)l2_offset * 512) + + (l2_index * sizeof(tmp)), SEEK_SET); + if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp)) + return 0; + /* update backup L2 table */ + if (prv->l1_backup_table_offset != 0) { + l2_offset = prv->l1_backup_table[l1_index]; + lseek(prv->fd, ((int64_t)l2_offset * 512) + + (l2_index * sizeof(tmp)), SEEK_SET); + if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp)) + return 0; + } + } + cluster_offset <<= 9; + return cluster_offset; +} + +static int tdvmdk_queue_read(struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private; + int index_in_cluster, n; + uint64_t cluster_offset; + int ret = 0; + while (nb_sectors > 0) { + cluster_offset = get_cluster_offset(s, sector << 9, 0); + index_in_cluster = sector % prv->cluster_sectors; + n = prv->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + if (!cluster_offset) { + memset(buf, 0, 512 * n); + } else { + lseek(prv->fd, cluster_offset + index_in_cluster * 512, + SEEK_SET); + ret = read(prv->fd, buf, n * 512); + if (ret != 
n * 512) { + ret = -1; + goto done; + } + } + nb_sectors -= n; + sector += n; + buf += n * 512; + } +done: + cb(s, ret == -1 ? -1 : 0, id, private); + + return 1; +} + +static int tdvmdk_queue_write(struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *private) +{ + struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private; + int index_in_cluster, n; + uint64_t cluster_offset; + int ret = 0; + + + while (nb_sectors > 0) { + index_in_cluster = sector & (prv->cluster_sectors - 1); + n = prv->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + cluster_offset = get_cluster_offset(s, sector << 9, 1); + if (!cluster_offset) { + ret = -1; + goto done; + } + lseek(prv->fd, cluster_offset + index_in_cluster * 512, + SEEK_SET); + ret = write(prv->fd, buf, n * 512); + if (ret != n * 512) { + ret = -1; + goto done; + } + nb_sectors -= n; + sector += n; + buf += n * 512; + } +done: + cb(s, ret == -1 ? -1 : 0, id, private); + + return 1; +} + +static int tdvmdk_submit(struct td_state *s) +{ + return 0; +} + + +static int *tdvmdk_get_fd(struct td_state *s) +{ + struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private; + int *fds, i; + + fds = malloc(sizeof(int) * MAX_IOFD); + /*initialise the FD array*/ + for (i=0;i<MAX_IOFD;i++) fds[i] = 0; + + fds[0] = prv->poll_pipe[0]; + return fds; +} + +static int tdvmdk_close(struct td_state *s) +{ + struct tdvmdk_state *prv = (struct tdvmdk_state *)s->private; + + safer_free(prv->l1_table); + safer_free(prv->l1_backup_table); + safer_free(prv->l2_cache); + close(prv->fd); + close(prv->poll_pipe[0]); + close(prv->poll_pipe[1]); + return 0; +} + +static int tdvmdk_do_callbacks(struct td_state *s, int sid) +{ + /* always ask for a kick */ + return 1; +} + +struct tap_disk tapdisk_vmdk = { + "tapdisk_vmdk", + sizeof(struct tdvmdk_state), + tdvmdk_open, + tdvmdk_queue_read, + tdvmdk_queue_write, + tdvmdk_submit, + tdvmdk_get_fd, + tdvmdk_close, + 
tdvmdk_do_callbacks, +}; + diff --git a/tools/blktap/drivers/bswap.h b/tools/blktap/drivers/bswap.h new file mode 100644 index 0000000000..bb9de92b25 --- /dev/null +++ b/tools/blktap/drivers/bswap.h @@ -0,0 +1,202 @@ +#ifndef BSWAP_H +#define BSWAP_H + +//#include "config-host.h" + +#include <inttypes.h> + +#ifdef HAVE_BYTESWAP_H +#include <byteswap.h> +#else + +#define bswap_16(x) \ +({ \ + uint16_t __x = (x); \ + ((uint16_t)( \ + (((uint16_t)(__x) & (uint16_t)0x00ffU) << 8) | \ + (((uint16_t)(__x) & (uint16_t)0xff00U) >> 8) )); \ +}) + +#define bswap_32(x) \ +({ \ + uint32_t __x = (x); \ + ((uint32_t)( \ + (((uint32_t)(__x) & (uint32_t)0x000000ffUL) << 24) | \ + (((uint32_t)(__x) & (uint32_t)0x0000ff00UL) << 8) | \ + (((uint32_t)(__x) & (uint32_t)0x00ff0000UL) >> 8) | \ + (((uint32_t)(__x) & (uint32_t)0xff000000UL) >> 24) )); \ +}) + +#define bswap_64(x) \ +({ \ + uint64_t __x = (x); \ + ((uint64_t)( \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000000000ffULL) << 56) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000000000ff00ULL) << 40) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000ff000000ULL) << 8) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000ff00000000ULL) >> 8) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \ + (uint64_t)(((uint64_t)(__x) & (uint64_t)0xff00000000000000ULL) >> 56) )); \ +}) + +#endif /* !HAVE_BYTESWAP_H */ + +static inline uint16_t bswap16(uint16_t x) +{ + return bswap_16(x); +} + +static inline uint32_t bswap32(uint32_t x) +{ + return bswap_32(x); +} + +static inline uint64_t bswap64(uint64_t x) +{ + return bswap_64(x); +} + +static inline void bswap16s(uint16_t *s) +{ + *s = bswap16(*s); +} + +static inline void bswap32s(uint32_t *s) +{ + *s = bswap32(*s); +} + +static inline void bswap64s(uint64_t *s) +{ + *s = bswap64(*s); +} + 
+#if defined(WORDS_BIGENDIAN) +#define be_bswap(v, size) (v) +#define le_bswap(v, size) bswap ## size(v) +#define be_bswaps(v, size) +#define le_bswaps(p, size) *p = bswap ## size(*p); +#else +#define le_bswap(v, size) (v) +#define be_bswap(v, size) bswap ## size(v) +#define le_bswaps(v, size) +#define be_bswaps(p, size) *p = bswap ## size(*p); +#endif + +#define CPU_CONVERT(endian, size, type)\ +static inline type endian ## size ## _to_cpu(type v)\ +{\ + return endian ## _bswap(v, size);\ +}\ +\ +static inline type cpu_to_ ## endian ## size(type v)\ +{\ + return endian ## _bswap(v, size);\ +}\ +\ +static inline void endian ## size ## _to_cpus(type *p)\ +{\ + endian ## _bswaps(p, size)\ +}\ +\ +static inline void cpu_to_ ## endian ## size ## s(type *p)\ +{\ + endian ## _bswaps(p, size)\ +}\ +\ +static inline type endian ## size ## _to_cpup(const type *p)\ +{\ + return endian ## size ## _to_cpu(*p);\ +}\ +\ +static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\ +{\ + *p = cpu_to_ ## endian ## size(v);\ +} + +CPU_CONVERT(be, 16, uint16_t) +CPU_CONVERT(be, 32, uint32_t) +CPU_CONVERT(be, 64, uint64_t) + +CPU_CONVERT(le, 16, uint16_t) +CPU_CONVERT(le, 32, uint32_t) +CPU_CONVERT(le, 64, uint64_t) + +/* unaligned versions (optimized for frequent unaligned accesses)*/ + +#if defined(__i386__) || defined(__powerpc__) + +#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v) +#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v) +#define le16_to_cpupu(p) le16_to_cpup(p) +#define le32_to_cpupu(p) le32_to_cpup(p) + +#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v) +#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v) + +#else + +static inline void cpu_to_le16wu(uint16_t *p, uint16_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v; + p1[1] = v >> 8; +} + +static inline void cpu_to_le32wu(uint32_t *p, uint32_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v; + p1[1] = v >> 8; + p1[2] = v >> 16; + p1[3] = v >> 24; +} + +static inline uint16_t le16_to_cpupu(const uint16_t *p) +{ + 
const uint8_t *p1 = (const uint8_t *)p; + return p1[0] | (p1[1] << 8); +} + +static inline uint32_t le32_to_cpupu(const uint32_t *p) +{ + const uint8_t *p1 = (const uint8_t *)p; + return p1[0] | (p1[1] << 8) | (p1[2] << 16) | (p1[3] << 24); +} + +static inline void cpu_to_be16wu(uint16_t *p, uint16_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v >> 8; + p1[1] = v; +} + +static inline void cpu_to_be32wu(uint32_t *p, uint32_t v) +{ + uint8_t *p1 = (uint8_t *)p; + + p1[0] = v >> 24; + p1[1] = v >> 16; + p1[2] = v >> 8; + p1[3] = v; +} + +#endif + +#ifdef WORDS_BIGENDIAN +#define cpu_to_32wu cpu_to_be32wu +#else +#define cpu_to_32wu cpu_to_le32wu +#endif + +#undef le_bswap +#undef be_bswap +#undef le_bswaps +#undef be_bswaps + +#endif /* BSWAP_H */ diff --git a/tools/blktap/drivers/img2qcow.c b/tools/blktap/drivers/img2qcow.c new file mode 100644 index 0000000000..2c9974c7fc --- /dev/null +++ b/tools/blktap/drivers/img2qcow.c @@ -0,0 +1,289 @@ +/* img2qcow.c + * + * Generates a qcow format disk and fills it from an existing image. 
+ * + * (c) 2006 Julian Chesterfield and Andrew Warfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> +#include "tapdisk.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ) +#else +#define DFPRINTF(_f, _a...) 
((void)0) +#endif + +#define TAPDISK 1 +#define BLOCK_PROCESSSZ 4096 + +static int maxfds, *io_fd, running = 1, complete = 0; +static int returned_events = 0, submit_events = 0; +static uint64_t prev = 0; +static char output[25]; + +void print_bytes(void *ptr, int length) { + + int i,k; + unsigned char *p = ptr; + + DFPRINTF("Buf dump, length %d:\n",length); + for (k = 0; k < length; k++) { + DFPRINTF("%x",*p); + *p++; + if(k % 16 == 0) DFPRINTF("\n"); + else if(k % 2 == 0) DFPRINTF(" "); + } + DFPRINTF("\n"); + return; +} + +void debug_output(uint64_t progress, uint64_t size) +{ + uint64_t blocks = size/20; + + /*Output progress every 5% */ + if (progress/blocks > prev) { + memcpy(output+prev+1,"=>",2); + prev++; + DFPRINTF("\r%s %llu%%", output, + (long long)(prev-1)*5); + } + return; +} + +static inline void LOCAL_FD_SET(fd_set *readfds) +{ + FD_SET(io_fd[0], readfds); + maxfds = io_fd[0] + 1; + + return; +} + +static int get_image_info(struct td_state *s, int fd) +{ + int ret; + long size; + unsigned long total_size; + struct statvfs statBuf; + struct stat stat; + + ret = fstat(fd, &stat); + if (ret != 0) { + DFPRINTF("ERROR: fstat failed, Couldn't stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + s->size = 0; + if (ioctl(fd,BLKGETSIZE,&s->size)!=0) { + DFPRINTF("ERR: BLKGETSIZE failed, " + "couldn't stat image"); + return -EINVAL; + } + + DFPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(s->size << SECTOR_SHIFT), + (long long unsigned)s->size); + + /*Get the sector size*/ +#if defined(BLKSSZGET) + { + int arg; + s->sector_size = DEFAULT_SECTOR_SIZE; + ioctl(fd, BLKSSZGET, &s->sector_size); + + if (s->sector_size != DEFAULT_SECTOR_SIZE) + DFPRINTF("Note: sector size is %ld (not %d)\n", + s->sector_size, DEFAULT_SECTOR_SIZE); + } +#else + s->sector_size = DEFAULT_SECTOR_SIZE; +#endif + + } else { + /*Local file? 
try fstat instead*/ + s->size = (stat.st_size >> SECTOR_SHIFT); + s->sector_size = DEFAULT_SECTOR_SIZE; + DFPRINTF("Image size: [%llu]\n", + (long long unsigned)s->size); + } + + return 0; +} + +static int send_responses(struct td_state *s, int res, int idx, void *private) +{ + if (res < 0) DFPRINTF("AIO FAILURE: res [%d]!\n",res); + + returned_events++; + + free(private); + return 0; +} + +int main(int argc, char *argv[]) +{ + struct tap_disk *drv; + struct td_state *s; + int ret = -1, fd, len; + fd_set readfds; + struct timeval timeout; + uint64_t i; + char *buf; + + if (argc != 3) { + fprintf(stderr, "Qcow-utils: v1.0.0\n"); + fprintf(stderr, "usage: %s <QCOW FILENAME> <SRC IMAGE>\n", + argv[0]); + exit(-1); + } + + s = malloc(sizeof(struct td_state)); + + /*Open image*/ + fd = open(argv[2], O_RDONLY | O_LARGEFILE); + + if (fd == -1) { + DFPRINTF("Unable to open [%s], (err %d)!\n",argv[2],0 - errno); + exit(-1); + } + + get_image_info(s, fd); + + /*Create qcow file*/ + ret = qcow_create(argv[1],s->size<<SECTOR_SHIFT,NULL,0); + + if (ret < 0) { + DFPRINTF("Unable to create QCOW file\n"); + exit(-1); + } else DFPRINTF("Qcow file created: size %llu sectors\n", + (long long unsigned)s->size); + + drv = &tapdisk_qcow; + s->private = malloc(drv->private_data_size); + + /*Open qcow file*/ + if (drv->td_open(s, argv[1])!=0) { + DFPRINTF("Unable to open Qcow file [%s]\n",argv[1]); + exit(-1); + } + + io_fd = drv->td_get_fd(s); + + /*Initialise the output string*/ + memset(output,0x20,25); + output[0] = '['; + output[22] = ']'; + output[23] = '\0'; + DFPRINTF("%s",output); + + i = 0; + while (running) { + timeout.tv_sec = 0; + + if (!complete) { + /*Read sector from image*/ + if (lseek(fd, i, SEEK_SET) == (off_t)-1) { + DFPRINTF("Unable to access file offset %llu\n", + (long long)i); + exit(-1); + } + + if( (ret = posix_memalign((void **)&buf, + BLOCK_PROCESSSZ, + BLOCK_PROCESSSZ)) != 0) { + DFPRINTF("Unable to read memalign buf (%d)\n",ret); + exit(-1); + } + + /*We 
attempt to read 4k sized blocks*/ + len = read(fd, buf, BLOCK_PROCESSSZ); + if (len < 512) { + DFPRINTF("Unable to read sector %llu\n", + (long long unsigned) (i >> 9)); + complete = 1; + continue; + } + + if (len % 512) { + len = (len >> 9) << 9; + } + + ret = drv->td_queue_write(s, i >> 9, + len >> 9, buf, + send_responses, 0, buf); + + if (!ret) submit_events++; + + if (ret < 0) { + DFPRINTF("UNABLE TO WRITE block [%llu]\n", + (long long unsigned) (i >> 9)); + } else i += len; + + if (i >> 9 == s->size) complete = 1; + + debug_output(i,s->size << 9); + + if ((submit_events % 10 == 0) || complete) + drv->td_submit(s); + timeout.tv_usec = 0; + + } else { + timeout.tv_usec = 1000; + if (!submit_events) running = 0; + } + + + /*Check AIO FD*/ + LOCAL_FD_SET(&readfds); + ret = select(maxfds + 1, &readfds, (fd_set *) 0, + (fd_set *) 0, &timeout); + + if (ret > 0) drv->td_do_callbacks(s, 0); + if (complete && (returned_events == submit_events)) + running = 0; + } + memcpy(output+prev+1,"=",1); + DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output); + drv->td_close(s); + free(s->private); + free(s); + + return 0; +} diff --git a/tools/blktap/drivers/qcow-create.c b/tools/blktap/drivers/qcow-create.c new file mode 100644 index 0000000000..be473934e8 --- /dev/null +++ b/tools/blktap/drivers/qcow-create.c @@ -0,0 +1,80 @@ +/* qcow-create.c + * + * Generates a qcow format disk. 
+ * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> +#include "tapdisk.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ) +#else +#define DFPRINTF(_f, _a...) 
((void)0) +#endif + + +int main(int argc, char *argv[]) +{ + int ret = -1; + uint64_t size; + + if ( (argc < 3) || (argc > 4) ) { + fprintf(stderr, "Qcow-utils: v1.0.0\n"); + fprintf(stderr, + "usage: %s <SIZE(MB)> <FILENAME> " + "[<BACKING_FILENAME>]\n", + argv[0]); + exit(-1); + } + + size = atoi(argv[1]); + size = size << 20; + DFPRINTF("Creating file size %llu\n",(long long unsigned)size); + switch(argc) { + case 3: + ret = qcow_create(argv[2],size,NULL,0); + break; + case 4: + ret = qcow_create(argv[2],size,argv[3],0); + break; + } + if (ret < 0) DPRINTF("Unable to create QCOW file\n"); + else DPRINTF("QCOW file successfully created\n"); + + return 0; +} diff --git a/tools/blktap/drivers/qcow2raw.c b/tools/blktap/drivers/qcow2raw.c new file mode 100644 index 0000000000..a7abc1bfa5 --- /dev/null +++ b/tools/blktap/drivers/qcow2raw.c @@ -0,0 +1,346 @@ +/* qcow2raw.c + * + * Generates raw image data from an existing qcow image + * + * (c) 2006 Julian Chesterfield and Andrew Warfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> +#include "tapdisk.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ) +#else +#define DFPRINTF(_f, _a...) ((void)0) +#endif + +#define TAPDISK 1 +#define BLOCK_PROCESSSZ 4096 + +static int maxfds, *qcowio_fd, *aio_fd, running = 1, complete = 0; +static int read_complete = 0, write_complete = 0; +static int returned_read_events = 0, returned_write_events = 0; +static int submit_events = 0; +static uint32_t read_idx = 0, write_idx = 0; +struct tap_disk *drv1, *drv2; +struct td_state *sqcow, *saio; +static uint64_t prev = 0, written = 0; +static char output[25]; + +void print_bytes(void *ptr, int length) { + + int i,k; + unsigned char *p = ptr; + + DFPRINTF("Buf dump, length %d:\n",length); + for (k = 0; k < length; k++) { + DFPRINTF("%x",*p); + *p++; + if (k % 16 == 0) DFPRINTF("\n"); + else if (k % 2 == 0) DFPRINTF(" "); + } + DFPRINTF("\n"); + return; +} + +void debug_output(uint64_t progress, uint64_t size) +{ + /*Output progress every 5% */ + uint64_t blocks = size/20; + + if (progress/blocks > prev) { + memcpy(output+prev+1,"=>",2); + prev++; + DFPRINTF("\r%s %llu%%", + output, (long long)((prev-1)*5)); + } + return; +} + +static inline void LOCAL_FD_SET(fd_set *readfds) +{ + FD_SET(qcowio_fd[0], readfds); + FD_SET(aio_fd[0], readfds); 
+ + maxfds = (qcowio_fd[0] > aio_fd[0] ? qcowio_fd[0] : aio_fd[0]) + 1; + + return; +} + +static int send_write_responses(struct td_state *s, int res, int idx, void *private) +{ + if (res < 0) { + DFPRINTF("AIO FAILURE: res [%d]!\n",res); + return 0; + } + written += BLOCK_PROCESSSZ; + returned_write_events++; + write_idx = idx; + if (complete && (returned_write_events == submit_events)) + write_complete = 1; + + debug_output(written, s->size << 9); + free(private); + return 0; +} + +static int send_read_responses(struct td_state *s, int res, int idx, void *private) +{ + int ret; + + if (res < 0) DFPRINTF("AIO FAILURE: res [%d]!\n",res); + + returned_read_events++; + read_idx = idx; + if (complete && (returned_read_events == submit_events)) + read_complete = 1; + + ret = drv2->td_queue_write(saio, idx, BLOCK_PROCESSSZ>>9, private, + send_write_responses, idx, private); + if (ret != 0) { + DFPRINTF("ERROR in submitting queue write!\n"); + return 0; + } + + if ( (complete && returned_read_events == submit_events) || + (returned_read_events % 10 == 0) ) { + drv2->td_submit(saio); + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret = -1, fd, len,input; + long int size; + fd_set readfds; + struct timeval timeout; + uint64_t i; + char *buf; + struct stat finfo; + + if (argc != 3) { + fprintf(stderr, "Qcow-utils: v1.0.0\n"); + fprintf(stderr, "usage: %s <Dest File descriptor> " + "<Qcow SRC IMAGE>\n", + argv[0]); + exit(-1); + } + + sqcow = malloc(sizeof(struct td_state)); + saio = malloc(sizeof(struct td_state)); + + /*Open qcow source file*/ + drv1 = &tapdisk_qcow; + sqcow->private = malloc(drv1->private_data_size); + + if (drv1->td_open(sqcow, argv[2])!=0) { + DFPRINTF("Unable to open Qcow file [%s]\n",argv[2]); + exit(-1); + } else DFPRINTF("QCOW file opened, size %llu\n", + (long long unsigned)sqcow->size); + + qcowio_fd = drv1->td_get_fd(sqcow); + + /*Setup aio destination file*/ + ret = stat(argv[1],&finfo); + if (ret == -1) { + /*Check errno*/ + 
switch(errno) { + case ENOENT: + /*File doesn't exist, create*/ + fd = open(argv[1], + O_RDWR | O_LARGEFILE | O_CREAT, 0644); + if (fd < 0) { + DFPRINTF("ERROR creating file [%s] " + "(errno %d)\n", + argv[1], 0 - errno); + exit(-1); + } + if (ftruncate(fd, (off_t)sqcow->size<<9) < 0) { + DFPRINTF("Unable to create file " + "[%s] of size %llu (errno %d). " + "Exiting...\n", + argv[1], + (long long unsigned)sqcow->size<<9, + 0 - errno); + close(fd); + exit(-1); + } + close(fd); + break; + case ENXIO: + DFPRINTF("ERROR Device [%s] does not exist\n",argv[1]); + exit(-1); + default: + DFPRINTF("An error occurred opening Device [%s] " + "(errno %d)\n", + argv[1], 0 - errno); + exit(-1); + } + } else { + fprintf(stderr, "WARNING: All existing data in " + "%s will be overwritten.\nDo you wish to continue? " + "(y or n) ", + argv[1]); + if (getchar() != 'y') { + DFPRINTF("Exiting...\n"); + exit(-1); + } + + /*TODO - Test the existing file or device for adequate space*/ + fd = open(argv[1], O_RDWR | O_LARGEFILE); + if (fd < 0) { + DFPRINTF("ERROR: opening file [%s] (errno %d)\n", + argv[1], 0 - errno); + exit(-1); + } + + if (S_ISBLK(finfo.st_mode)) { + if(ioctl(fd,BLKGETSIZE,&size)!=0) { + DFPRINTF("ERROR: BLKGETSIZE failed, " + "couldn't stat image [%s]\n", + argv[1]); + close(fd); + exit(-1); + } + if (size < sqcow->size<<9) { + DFPRINTF("ERROR: Not enough space on device " + "%s (%lu bytes available, %llu bytes required\n", + argv[1], size, + (long long unsigned)sqcow->size<<9); + close(fd); + exit(-1); + } + } else { + if (ftruncate(fd, (off_t)sqcow->size<<9) < 0) { + DFPRINTF("Unable to create file " + "[%s] of size %llu (errno %d). 
" + "Exiting...\n", + argv[1], + (long long unsigned)sqcow->size<<9, + 0 - errno); + close(fd); + exit(-1); + } else DFPRINTF("File [%s] truncated to length %llu " + "(%llu)\n", + argv[1], + (long long unsigned)sqcow->size<<9, + (long long unsigned)sqcow->size); + } + close(fd); + } + + /*Open aio destination file*/ + drv2 = &tapdisk_aio; + saio->private = malloc(drv2->private_data_size); + + if (drv2->td_open(saio, argv[1])!=0) { + DFPRINTF("Unable to open Qcow file [%s]\n", argv[1]); + exit(-1); + } + + aio_fd = drv2->td_get_fd(saio); + + /*Initialise the output string*/ + memset(output,0x20,25); + output[0] = '['; + output[22] = ']'; + output[23] = '\0'; + DFPRINTF("%s",output); + + i = 0; + while (running) { + timeout.tv_sec = 0; + + if (!complete) { + /*Read Pages from qcow image*/ + if ( (ret = posix_memalign((void **)&buf, + BLOCK_PROCESSSZ, + BLOCK_PROCESSSZ)) + != 0) { + DFPRINTF("Unable to alloc memory (%d)\n",ret); + exit(-1); + } + + /*Attempt to read 4k sized blocks*/ + ret = drv1->td_queue_read(sqcow, i>>9, + BLOCK_PROCESSSZ>>9, buf, + send_read_responses, i>>9, buf); + + if (ret < 0) { + DFPRINTF("UNABLE TO READ block [%llu]\n", + (long long unsigned)i); + exit(-1); + } else { + i += BLOCK_PROCESSSZ; + submit_events++; + } + + if (i >= sqcow->size<<9) { + complete = 1; + } + + if ((submit_events % 10 == 0) || complete) + drv1->td_submit(sqcow); + timeout.tv_usec = 0; + + } else { + timeout.tv_usec = 1000; + if (!submit_events) running = 0; + } + + + /*Check AIO FD*/ + LOCAL_FD_SET(&readfds); + ret = select(maxfds + 1, &readfds, (fd_set *) 0, + (fd_set *) 0, &timeout); + + if (ret > 0) { + if (FD_ISSET(qcowio_fd[0], &readfds)) + drv1->td_do_callbacks(sqcow, 0); + if (FD_ISSET(aio_fd[0], &readfds)) + drv2->td_do_callbacks(saio, 0); + } + if (complete && (returned_write_events == submit_events)) + running = 0; + } + memcpy(output+prev+1,"=",1); + DFPRINTF("\r%s 100%%\nTRANSFER COMPLETE\n\n", output); + + return 0; +} diff --git 
a/tools/blktap/drivers/tapdisk.c b/tools/blktap/drivers/tapdisk.c new file mode 100644 index 0000000000..f817a89a46 --- /dev/null +++ b/tools/blktap/drivers/tapdisk.c @@ -0,0 +1,671 @@ +/* tapdisk.c + * + * separate disk process, spawned by blktapctrl. Inherits code from driver + * plugins + * + * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield. + * + */ + +#define MSG_SIZE 4096 +#define TAPDISK + +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <string.h> +#include <signal.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/poll.h> +#include <unistd.h> +#include <errno.h> +#include <pthread.h> +#include <time.h> +#include <err.h> +#include <poll.h> +#include <sys/statvfs.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include "blktaplib.h" +#include "tapdisk.h" + +#if 1 +#define ASSERT(_p) \ + if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#else +#define ASSERT(_p) ((void)0) +#endif + +#define INPUT 0 +#define OUTPUT 1 + +static int maxfds, fds[2], run = 1; + +static pid_t process; +int connected_disks = 0; +fd_list_entry_t *fd_start = NULL; + +void usage(void) +{ + fprintf(stderr, "blktap-utils: v1.0.0\n"); + fprintf(stderr, "usage: tapdisk <READ fifo> <WRITE fifo>\n"); + exit(-1); +} + +void daemonize(void) +{ + int i; + + if (getppid()==1) return; /* already a daemon */ + if (fork() != 0) exit(0); + +#if 0 + /*Set new program session ID and close all descriptors*/ + setsid(); + for (i = getdtablesize(); i >= 0; --i) close(i); + + /*Send all I/O to /dev/null */ + i = open("/dev/null",O_RDWR); + dup(i); + dup(i); +#endif + return; +} + +static void unmap_disk(struct td_state *s) +{ + tapdev_info_t *info = s->ring_info; + struct tap_disk *drv = s->drv; + fd_list_entry_t *ptr, *prev; + + drv->td_close(s); + + if (info != NULL && info->mem > 0) + munmap(info->mem, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE); + + ptr = s->fd_entry; + prev 
= ptr->prev; + + if (prev) { + /*There are entries earlier in the list*/ + prev->next = ptr->next; + if (ptr->next) { + ptr = ptr->next; + ptr->prev = prev; + } + } else { + /*We are the first entry in list*/ + if (ptr->next) { + ptr = ptr->next; + fd_start = ptr; + ptr->prev = NULL; + } else fd_start = NULL; + } + + close(info->fd); + + free(s->fd_entry); + free(s->blkif); + free(s->ring_info); + free(s); + + return; + +} + +void sig_handler(int sig) +{ + /*Received signal to close. If no disks are active, we close app.*/ + + if (connected_disks < 1) run = 0; +} + +static inline int LOCAL_FD_SET(fd_set *readfds) +{ + fd_list_entry_t *ptr; + int i; + + ptr = fd_start; + while (ptr != NULL) { + if (ptr->tap_fd) { + FD_SET(ptr->tap_fd, readfds); + for (i = 0; i < MAX_IOFD; i++) { + if (ptr->io_fd[i]) + FD_SET(ptr->io_fd[i], readfds); + maxfds = (ptr->io_fd[i] > maxfds ? + ptr->io_fd[i]: maxfds); + } + maxfds = (ptr->tap_fd > maxfds ? ptr->tap_fd: maxfds); + } + ptr = ptr->next; + } + + return 0; +} + +static inline fd_list_entry_t *add_fd_entry(int tap_fd, int io_fd[MAX_IOFD], struct td_state *s) +{ + fd_list_entry_t *ptr, *last, *entry; + int i; + DPRINTF("Adding fd_list_entry\n"); + + /*Add to linked list*/ + s->fd_entry = entry = malloc(sizeof(fd_list_entry_t)); + entry->tap_fd = tap_fd; + for (i = 0; i < MAX_IOFD; i++) entry->io_fd[i] = io_fd[i]; + entry->s = s; + entry->next = NULL; + + ptr = fd_start; + if (ptr == NULL) { + /*We are the first entry*/ + fd_start = entry; + entry->prev = NULL; + goto finish; + } + + while (ptr != NULL) { + last = ptr; + ptr = ptr->next; + } + last->next = entry; + entry->prev = last; + + finish: + return entry; +} + +static inline struct td_state *get_state(int cookie) +{ + fd_list_entry_t *ptr; + + ptr = fd_start; + while (ptr != NULL) { + if (ptr->cookie == cookie) return ptr->s; + ptr = ptr->next; + } + return NULL; +} + +static struct tap_disk *get_driver(int drivertype) +{ + /* blktapctrl has passed us the driver type */ + + 
return dtypes[drivertype]->drv; +} + +static struct td_state *state_init(void) +{ + int i; + struct td_state *s; + blkif_t *blkif; + + s = malloc(sizeof(struct td_state)); + blkif = s->blkif = malloc(sizeof(blkif_t)); + s->ring_info = malloc(sizeof(tapdev_info_t)); + + for (i = 0; i < MAX_REQUESTS; i++) + blkif->pending_list[i].count = 0; + + return s; +} + +static int map_new_dev(struct td_state *s, int minor) +{ + int tap_fd; + tapdev_info_t *info = s->ring_info; + char *devname; + fd_list_entry_t *ptr; + + asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor); + tap_fd = open(devname, O_RDWR); + if (tap_fd == -1) + { + DPRINTF("open failed on dev %s!",devname); + goto fail; + } + info->fd = tap_fd; + + /*Map the shared memory*/ + info->mem = mmap(0, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0); + if ((long int)info->mem == -1) + { + DPRINTF("mmap failed on dev %s!\n",devname); + goto fail; + } + + /* assign the rings to the mapped memory */ + info->sring = (blkif_sring_t *)((unsigned long)info->mem); + BACK_RING_INIT(&info->fe_ring, info->sring, PAGE_SIZE); + + info->vstart = + (unsigned long)info->mem + (BLKTAP_RING_PAGES << PAGE_SHIFT); + + ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process ); + ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE ); + free(devname); + + /*Update the fd entry*/ + ptr = fd_start; + while (ptr != NULL) { + if (s == ptr->s) { + ptr->tap_fd = tap_fd; + break; + } + ptr = ptr->next; + } + + return minor; + + fail: + free(devname); + return -1; +} + +static int read_msg(char *buf) +{ + int length, len, msglen, tap_fd, *io_fd; + char *ptr, *path; + image_t *img; + struct timeval timeout; + msg_hdr_t *msg; + msg_newdev_t *msg_dev; + msg_pid_t *msg_pid; + struct tap_disk *drv; + int ret = -1; + struct td_state *s = NULL; + fd_list_entry_t *entry; + + length = read(fds[READ], buf, MSG_SIZE); + + if (length > 0 && length >= sizeof(msg_hdr_t)) + { + msg = (msg_hdr_t *)buf; + 
DPRINTF("Tapdisk: Received msg, len %d, type %d, UID %d\n", + length,msg->type,msg->cookie); + + switch (msg->type) { + case CTLMSG_PARAMS: + ptr = buf + sizeof(msg_hdr_t); + len = (length - sizeof(msg_hdr_t)); + path = calloc(1, len); + + memcpy(path, ptr, len); + DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path); + + /*Assign driver*/ + drv = get_driver(msg->drivertype); + if (drv == NULL) + goto params_done; + + DPRINTF("Loaded driver: name [%s], type [%d]\n", + drv->disk_type, msg->drivertype); + + /* Allocate the disk structs */ + s = state_init(); + if (s == NULL) + goto params_done; + + s->drv = drv; + s->private = malloc(drv->private_data_size); + if (s->private == NULL) { + free(s); + goto params_done; + } + + /*Open file*/ + ret = drv->td_open(s, path); + io_fd = drv->td_get_fd(s); + + entry = add_fd_entry(0, io_fd, s); + entry->cookie = msg->cookie; + DPRINTF("Entered cookie %d\n",entry->cookie); + + memset(buf, 0x00, MSG_SIZE); + + params_done: + if (ret == 0) { + msglen = sizeof(msg_hdr_t) + sizeof(image_t); + msg->type = CTLMSG_IMG; + img = (image_t *)(buf + sizeof(msg_hdr_t)); + img->size = s->size; + img->secsize = s->sector_size; + img->info = s->info; + } else { + msglen = sizeof(msg_hdr_t); + msg->type = CTLMSG_IMG_FAIL; + msg->len = msglen; + } + len = write(fds[WRITE], buf, msglen); + free(path); + return 1; + + + + case CTLMSG_NEWDEV: + msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t)); + + s = get_state(msg->cookie); + DPRINTF("Retrieving state, cookie %d.....[%s]\n",msg->cookie, (s == NULL ? "FAIL":"OK")); + if (s != NULL) { + ret = ((map_new_dev(s, msg_dev->devnum) + == msg_dev->devnum ? 0: -1)); + connected_disks++; + } + + memset(buf, 0x00, MSG_SIZE); + msglen = sizeof(msg_hdr_t); + msg->type = (ret == 0 ? 
CTLMSG_NEWDEV_RSP + : CTLMSG_NEWDEV_FAIL); + msg->len = msglen; + + len = write(fds[WRITE], buf, msglen); + return 1; + + case CTLMSG_CLOSE: + s = get_state(msg->cookie); + if (s) unmap_disk(s); + + connected_disks--; + sig_handler(SIGINT); + + return 1; + + case CTLMSG_PID: + memset(buf, 0x00, MSG_SIZE); + msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t); + msg->type = CTLMSG_PID_RSP; + msg->len = msglen; + + msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t)); + process = getpid(); + msg_pid->pid = process; + + len = write(fds[WRITE], buf, msglen); + return 1; + + default: + return 0; + } + } + return 0; +} + +static inline int write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp) +{ + tapdev_info_t *info = s->ring_info; + blkif_response_t *rsp_d; + + rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt); + memcpy(rsp_d, rsp, sizeof(blkif_response_t)); + wmb(); + info->fe_ring.rsp_prod_pvt++; + + return 0; +} + +static inline void kick_responses(struct td_state *s) +{ + tapdev_info_t *info = s->ring_info; + + if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod) + { + RING_PUSH_RESPONSES(&info->fe_ring); + ioctl(info->fd, BLKTAP_IOCTL_KICK_FE); + } +} + +void io_done(struct td_state *s, int sid) +{ + struct tap_disk *drv = s->drv; + + if (!run) return; /*We have received signal to close*/ + + if (drv->td_do_callbacks(s, sid) > 0) kick_responses(s); + + return; +} + +int send_responses(struct td_state *s, int res, int idx, void *private) +{ + blkif_request_t *req; + int responses_queued = 0; + blkif_t *blkif = s->blkif; + + req = &blkif->pending_list[idx].req; + + if ( (idx > MAX_REQUESTS-1) || + (blkif->pending_list[idx].count == 0) ) + { + DPRINTF("invalid index returned(%u)!\n", idx); + return 0; + } + + if (res != 0) { + DPRINTF("*** request error %d! 
\n", res); + return 0; + } + + blkif->pending_list[idx].count--; + + if (blkif->pending_list[idx].count == 0) + { + blkif_request_t tmp; + blkif_response_t *rsp; + + tmp = blkif->pending_list[idx].req; + rsp = (blkif_response_t *)req; + + rsp->id = tmp.id; + rsp->operation = tmp.operation; + rsp->status = blkif->pending_list[idx].status; + + write_rsp_to_ring(s, rsp); + responses_queued++; + } + return responses_queued; +} + +static void get_io_request(struct td_state *s) +{ + RING_IDX rp, rc, j, i, ret; + blkif_request_t *req; + int idx, nsects; + uint64_t sector_nr; + char *page; + int early = 0; /* count early completions */ + struct tap_disk *drv = s->drv; + blkif_t *blkif = s->blkif; + tapdev_info_t *info = s->ring_info; + + if (!run) return; /*We have received signal to close*/ + + rp = info->fe_ring.sring->req_prod; + rmb(); + for (j = info->fe_ring.req_cons; j != rp; j++) + { + int done = 0; + + req = NULL; + req = RING_GET_REQUEST(&info->fe_ring, j); + ++info->fe_ring.req_cons; + + if (req == NULL) continue; + + idx = req->id; + ASSERT(blkif->pending_list[idx].count == 0); + memcpy(&blkif->pending_list[idx].req, req, sizeof(*req)); + blkif->pending_list[idx].status = BLKIF_RSP_OKAY; + blkif->pending_list[idx].count = req->nr_segments; + + sector_nr = req->sector_number; + + for (i = 0; i < req->nr_segments; i++) { + nsects = req->seg[i].last_sect - + req->seg[i].first_sect + 1; + + if ((req->seg[i].last_sect >= PAGE_SIZE >> 9) || + (nsects <= 0)) + continue; + + page = (char *)MMAP_VADDR(info->vstart, + (unsigned long)req->id, i); + page += (req->seg[i].first_sect << SECTOR_SHIFT); + + if (sector_nr >= s->size) { + DPRINTF("Sector request failed:\n"); + DPRINTF("%s request, idx [%d,%d] size [%llu], " + "sector [%llu,%llu]\n", + (req->operation == BLKIF_OP_WRITE ? 
+ "WRITE" : "READ"), + idx,i, + (long long unsigned) + nsects<<SECTOR_SHIFT, + (long long unsigned) + sector_nr<<SECTOR_SHIFT, + (long long unsigned) sector_nr); + continue; + } + + switch (req->operation) + { + case BLKIF_OP_WRITE: + ret = drv->td_queue_write(s, sector_nr, + nsects, page, send_responses, + idx, NULL); + if (ret > 0) early += ret; + else if (ret == -EBUSY) { + /* + * TODO: Sector is locked * + * Need to put req back on queue * + */ + } + break; + case BLKIF_OP_READ: + ret = drv->td_queue_read(s, sector_nr, + nsects, page, send_responses, + idx, NULL); + if (ret > 0) early += ret; + else if (ret == -EBUSY) { + /* + * TODO: Sector is locked * + * Need to put req back on queue * + */ + } + break; + default: + DPRINTF("Unknown block operation\n"); + break; + } + sector_nr += nsects; + } + } + + /*Batch done*/ + drv->td_submit(s); + + if (early > 0) + io_done(s,10); + + return; +} + +int main(int argc, char *argv[]) +{ + int len, msglen, ret, i; + char *p, *buf; + fd_set readfds, writefds; + struct timeval timeout; + fd_list_entry_t *ptr; + struct tap_disk *drv; + struct td_state *s; + + if (argc != 3) usage(); + + daemonize(); + + openlog("TAPDISK", LOG_CONS|LOG_ODELAY, LOG_DAEMON); + /*Setup signal handlers*/ + signal (SIGBUS, sig_handler); + signal (SIGINT, sig_handler); + + /*Open the control channel*/ + fds[READ] = open(argv[1],O_RDWR|O_NONBLOCK); + fds[WRITE] = open(argv[2],O_RDWR|O_NONBLOCK); + + if ( (fds[READ] < 0) || (fds[WRITE] < 0) ) + { + DPRINTF("FD open failed [%d,%d]\n",fds[READ], fds[WRITE]); + exit(-1); + } + + buf = calloc(MSG_SIZE, 1); + + if (buf == NULL) + { + DPRINTF("ERROR: allocating memory.\n"); + exit(-1); + } + + while (run) + { + ret = 0; + FD_ZERO(&readfds); + FD_SET(fds[READ], &readfds); + maxfds = fds[READ]; + + /*Set all tap fds*/ + LOCAL_FD_SET(&readfds); + + timeout.tv_sec = 0; + timeout.tv_usec = 1000; + + /*Wait for incoming messages*/ + ret = select(maxfds + 1, &readfds, (fd_set *) 0, + (fd_set *) 0, &timeout); + + 
if (ret > 0) + { + ptr = fd_start; + while (ptr != NULL) { + if (FD_ISSET(ptr->tap_fd, &readfds)) + get_io_request(ptr->s); + for (i = 0; i < MAX_IOFD; i++) { + if (ptr->io_fd[i] && + FD_ISSET(ptr->io_fd[i], &readfds)) + io_done(ptr->s, i); + } + + ptr = ptr->next; + } + + if (FD_ISSET(fds[READ], &readfds)) + read_msg(buf); + } + } + free(buf); + close(fds[READ]); + close(fds[WRITE]); + + ptr = fd_start; + while (ptr != NULL) { + s = ptr->s; + drv = s->drv; + + unmap_disk(s); + drv->td_close(s); + free(s->private); + free(s->blkif); + free(s->ring_info); + free(s); + close(ptr->tap_fd); + ptr = ptr->next; + } + closelog(); + + return 0; +} diff --git a/tools/blktap/drivers/tapdisk.h b/tools/blktap/drivers/tapdisk.h new file mode 100644 index 0000000000..1f03156456 --- /dev/null +++ b/tools/blktap/drivers/tapdisk.h @@ -0,0 +1,211 @@ +/* tapdisk.h + * + * Generic disk interface for blktap-based image adapters. + * + * (c) 2006 Andrew Warfield and Julian Chesterfield + * + * Some notes on the tap_disk interface: + * + * tap_disk aims to provide a generic interface to easily implement new + * types of image accessors. The structure-of-function-calls is similar + * to disk interfaces used in qemu/denali/etc, with the significant + * difference being the expectation of asynchronous rather than synchronous + * I/O. The asynchronous interface is intended to allow lots of requests to + * be pipelined through a disk, without the disk requiring any of its own + * threads of control. As such, a batch of requests is delivered to the disk + * using: + * + * td_queue_[read,write]() + * + * and passing in a completion callback, which the disk is responsible for + * tracking. The end of a back is marked with a call to: + * + * td_submit() + * + * The disk implementation must provide a file handle, which is used to + * indicate that it needs to do work. 
tapdisk will add this file handle + * (returned from td_get_fd()) to it's poll set, and will call into the disk + * using td_do_callbacks() whenever there is data pending. + * + * Two disk implementations demonstrate how this interface may be used to + * implement disks with both asynchronous and synchronous calls. block-aio.c + * maps this interface down onto the linux libaio calls, while block-sync uses + * normal posix read/write. + * + * A few things to realize about the sync case, which doesn't need to defer + * io completions: + * + * - td_queue_[read,write]() call read/write directly, and then call the + * callback immediately. The MUST then return a value greater than 0 + * in order to tell tapdisk that requests have finished early, and to + * force responses to be kicked to the clents. + * + * - The fd used for poll is an otherwise unused pipe, which allows poll to + * be safely called without ever returning anything. + * + */ + +#ifndef TAPDISK_H_ +#define TAPDISK_H_ + +#include <stdint.h> +#include <syslog.h> +#include "blktaplib.h" + +/*If enabled, log all debug messages to syslog*/ +#if 1 +#define DPRINTF(_f, _a...) syslog( LOG_DEBUG, _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +/* Things disks need to know about, these should probably be in a higher-level + * header. */ +#define MAX_REQUESTS 64 +#define MAX_SEGMENTS_PER_REQ 11 +#define SECTOR_SHIFT 9 +#define DEFAULT_SECTOR_SIZE 512 + +/* This structure represents the state of an active virtual disk. */ +struct td_state { + void *private; + void *drv; + void *blkif; + void *image; + void *ring_info; + void *fd_entry; + char backing_file[1024]; /*Used by differencing disks, e.g. qcow*/ + long int sector_size; + uint64_t size; + long int info; +}; + +/* Prototype of the callback to activate as requests complete. */ +typedef int (*td_callback_t)(struct td_state *s, int res, int id, void *prv); + +/* Structure describing the interface to a virtual disk implementation. 
*/ +/* See note at the top of this file describing this interface. */ +struct tap_disk { + const char *disk_type; + int private_data_size; + int (*td_open) (struct td_state *s, const char *name); + int (*td_queue_read) (struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *prv); + int (*td_queue_write) (struct td_state *s, uint64_t sector, + int nb_sectors, char *buf, td_callback_t cb, + int id, void *prv); + int (*td_submit) (struct td_state *s); + int *(*td_get_fd) (struct td_state *s); + int (*td_close) (struct td_state *s); + int (*td_do_callbacks)(struct td_state *s, int sid); +}; + +typedef struct disk_info { + int idnum; + char name[50]; /* e.g. "RAMDISK" */ + char handle[10]; /* xend handle, e.g. 'ram' */ + int single_handler; /* is there a single controller for all */ + /* instances of disk type? */ +#ifdef TAPDISK + struct tap_disk *drv; +#endif +} disk_info_t; + +void debug_fe_ring(struct td_state *s); + +extern struct tap_disk tapdisk_aio; +extern struct tap_disk tapdisk_sync; +extern struct tap_disk tapdisk_vmdk; +extern struct tap_disk tapdisk_ram; +extern struct tap_disk tapdisk_qcow; + +#define MAX_DISK_TYPES 20 +#define MAX_IOFD 2 + +#define DISK_TYPE_AIO 0 +#define DISK_TYPE_SYNC 1 +#define DISK_TYPE_VMDK 2 +#define DISK_TYPE_RAM 3 +#define DISK_TYPE_QCOW 4 + + +/*Define Individual Disk Parameters here */ +static disk_info_t aio_disk = { + DISK_TYPE_AIO, + "raw image (aio)", + "aio", + 0, +#ifdef TAPDISK + &tapdisk_aio, +#endif +}; + +static disk_info_t sync_disk = { + DISK_TYPE_SYNC, + "raw image (sync)", + "sync", + 0, +#ifdef TAPDISK + &tapdisk_sync, +#endif +}; + +static disk_info_t vmdk_disk = { + DISK_TYPE_VMDK, + "vmware image (vmdk)", + "vmdk", + 1, +#ifdef TAPDISK + &tapdisk_vmdk, +#endif +}; + +static disk_info_t ram_disk = { + DISK_TYPE_RAM, + "ramdisk image (ram)", + "ram", + 1, +#ifdef TAPDISK + &tapdisk_ram, +#endif +}; + +static disk_info_t qcow_disk = { + DISK_TYPE_QCOW, + "qcow disk 
# tools/blktap/lib/Makefile
#
# Builds the blktap userspace library in two flavours from the same objects:
#   libblktap.so.$(MAJOR).$(MINOR)  shared object (plus the usual symlinks)
#   libblktap.a                     static archive
# Toolchain settings (CC, CFLAGS, LIBDIR, XEN_LIBXC, XEN_XENSTORE, ...) come
# from tools/Rules.mk.

XEN_ROOT = ../../..
include $(XEN_ROOT)/tools/Rules.mk

MAJOR    = 3.0
MINOR    = 0
SONAME   = libblktap.so.$(MAJOR)

BLKTAP_INSTALL_DIR = /usr/sbin

INSTALL            = install
INSTALL_PROG       = $(INSTALL) -m0755
# Headers are data, not programs: install them without the execute bits.
INSTALL_DATA       = $(INSTALL) -m0644
INSTALL_DIR        = $(INSTALL) -d -m0755

INCLUDES += -I. -I.. -I $(XEN_LIBXC) -I $(XEN_XENSTORE)

LIBS     := -lz

SRCS     :=
SRCS     += xenbus.c blkif.c xs_api.c

CFLAGS   += -Werror
CFLAGS   += -Wno-unused
CFLAGS   += -fno-strict-aliasing -fPIC
CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
# get asprintf():
CFLAGS   += -D _GNU_SOURCE

# Get gcc to generate the dependencies for us.
CFLAGS   += -Wp,-MD,.$(@F).d
CFLAGS   += $(INCLUDES)
DEPS     = .*.d

OBJS     = $(patsubst %.c,%.o,$(SRCS))
IBINS   :=

LIB      = libblktap.a libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)

all: build

build:
	$(MAKE) libblktap

install: all
	$(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR)
	$(INSTALL_DIR) -p $(DESTDIR)/usr/include
	$(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
	$(INSTALL_DATA) blktaplib.h $(DESTDIR)/usr/include

clean:
	rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS

# libblktap is a phony umbrella target: it links the shared object, refreshes
# the version symlinks and rebuilds the static archive on every invocation.
libblktap: $(OBJS)
	$(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared         \
	      -L$(XEN_XENSTORE) -l xenstore                       \
	      -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
	ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
	ln -sf libblktap.so.$(MAJOR) $@.so
	# The static archive must hold the object files themselves; archiving
	# the linked shared object yields a .a that cannot be linked statically.
	$(AR) rc libblktap.a $^

.PHONY: TAGS all build clean install libblktap

TAGS:
	etags -t $(SRCS) *.h

-include $(DEPS)
+ * (c) 2005 Andrew Warfield and Julian Chesterfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <err.h> +#include <unistd.h> + +#include "blktaplib.h" + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) 
((void)0) +#endif + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) + +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + ((blkif->domid != domid) || (blkif->handle != handle)) ) + blkif = blkif->hash_next; + return blkif; +} + +blkif_t *alloc_blkif(domid_t domid) +{ + blkif_t *blkif; + DPRINTF("Alloc_blkif called [%d]\n",domid); + blkif = (blkif_t *)malloc(sizeof(blkif_t)); + if (!blkif) + return NULL; + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->devnum = -1; + return blkif; +} + +/*Controller callbacks*/ +static int (*new_devmap_hook)(blkif_t *blkif) = NULL; +void register_new_devmap_hook(int (*fn)(blkif_t *blkif)) +{ + new_devmap_hook = fn; +} + +static int (*new_unmap_hook)(blkif_t *blkif) = NULL; +void register_new_unmap_hook(int (*fn)(blkif_t *blkif)) +{ + new_unmap_hook = fn; +} + +static int (*new_blkif_hook)(blkif_t *blkif) = NULL; +void register_new_blkif_hook(int (*fn)(blkif_t *blkif)) +{ + new_blkif_hook = fn; +} + +int blkif_init(blkif_t *blkif, long int handle, long int pdev, + long int readonly) +{ + domid_t domid; + blkif_t **pblkif; + int devnum; + + if (blkif == NULL) + return -EINVAL; + + domid = blkif->domid; + blkif->handle = handle; + blkif->pdev = pdev; + blkif->readonly = readonly; + + /* + * Call out to the new_blkif_hook. + * The tap application should define this, + * and it should return having set blkif->ops + * + */ + if (new_blkif_hook == NULL) + { + DPRINTF("Probe detected a new blkif, but no new_blkif_hook!"); + return -1; + } + if (new_blkif_hook(blkif)!=0) { + DPRINTF("BLKIF: Image open failed\n"); + return -1; + } + + /* Now wire it in. 
*/ + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + DPRINTF("Created hash entry: %d [%d,%ld]\n", + BLKIF_HASH(domid, handle), domid, handle); + + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && + ((*pblkif)->handle == handle) ) + { + DPRINTF("Could not create blkif: already exists\n"); + return -1; + } + pblkif = &(*pblkif)->hash_next; + } + blkif->hash_next = NULL; + *pblkif = blkif; + + if (new_devmap_hook == NULL) + { + DPRINTF("Probe setting up new blkif but no devmap hook!"); + return -1; + } + + devnum = new_devmap_hook(blkif); + if (devnum == -1) + return -1; + blkif->devnum = devnum; + + return 0; +} + +void free_blkif(blkif_t *blkif) +{ + blkif_t **pblkif, *curs; + image_t *image; + + pblkif = &blkif_hash[BLKIF_HASH(blkif->domid, blkif->handle)]; + while ( (curs = *pblkif) != NULL ) + { + if ( blkif == curs ) + { + *pblkif = curs->hash_next; + } + pblkif = &curs->hash_next; + } + if (blkif != NULL) { + if ((image=(image_t *)blkif->prv)!=NULL) { + free(blkif->prv); + } + if (blkif->info!=NULL) { + free(blkif->info); + } + if (new_unmap_hook != NULL) new_unmap_hook(blkif); + free(blkif); + } +} + +void __init_blkif(void) +{ + memset(blkif_hash, 0, sizeof(blkif_hash)); +} diff --git a/tools/blktap/lib/blktaplib.h b/tools/blktap/lib/blktaplib.h new file mode 100644 index 0000000000..ceab6b7d51 --- /dev/null +++ b/tools/blktap/lib/blktaplib.h @@ -0,0 +1,223 @@ +/* blktaplib.h + * + * Blktap library userspace code. 
+ * + * (c) 2005 Andrew Warfield and Julian Chesterfield + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __BLKTAPLIB_H__ +#define __BLKTAPLIB_H__ + +#include <xenctrl.h> +#include <sys/user.h> +#include <xen/xen.h> +#include <xen/io/blkif.h> +#include <xen/io/ring.h> +#include <xs.h> +#include <sys/types.h> +#include <unistd.h> + +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) + +/* size of the extra VMA area to map in attached pages. 
*/ +#define BLKTAP_VMA_PAGES BLK_RING_SIZE + +/* blktap IOCTLs: These must correspond with the blktap driver ioctls*/ +#define BLKTAP_IOCTL_KICK_FE 1 +#define BLKTAP_IOCTL_KICK_BE 2 +#define BLKTAP_IOCTL_SETMODE 3 +#define BLKTAP_IOCTL_SENDPID 4 +#define BLKTAP_IOCTL_NEWINTF 5 +#define BLKTAP_IOCTL_MINOR 6 +#define BLKTAP_IOCTL_MAJOR 7 +#define BLKTAP_QUERY_ALLOC_REQS 8 +#define BLKTAP_IOCTL_FREEINTF 9 +#define BLKTAP_IOCTL_PRINT_IDXS 100 + +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ +#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ +#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 +#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 + +#define BLKTAP_MODE_INTERPOSE \ + (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) + +static inline int BLKTAP_MODE_VALID(unsigned long arg) +{ + return ( + ( arg == BLKTAP_MODE_PASSTHROUGH ) || + ( arg == BLKTAP_MODE_INTERCEPT_FE ) || + ( arg == BLKTAP_MODE_INTERPOSE ) ); +} + +#define MAX_REQUESTS 64 + +#define BLKTAP_IOCTL_KICK 1 +#define MAX_PENDING_REQS 64 +#define BLKTAP_DEV_DIR "/dev/xen" +#define BLKTAP_DEV_NAME "blktap" +#define BLKTAP_DEV_MAJOR 254 +#define BLKTAP_DEV_MINOR 0 + +#define BLKTAP_RING_PAGES 1 /* Front */ +#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES) + +struct blkif; + +typedef struct { + blkif_request_t req; + struct blkif *blkif; + int count; + int16_t status; +} pending_req_t; + +struct blkif_ops { + long int (*get_size)(struct blkif *blkif); + long int (*get_secsize)(struct blkif *blkif); + unsigned (*get_info)(struct blkif *blkif); +}; + +typedef struct blkif { + domid_t domid; + long int handle; + + long int pdev; + long int readonly; + + enum { DISCONNECTED, DISCONNECTING, CONNECTED } state; + + struct blkif_ops *ops; + struct blkif *hash_next; + + void *prv; /* device-specific data */ + void *info; /*Image parameter passing */ + pending_req_t pending_list[MAX_REQUESTS]; + int devnum; + int fds[2]; + int be_id; + int major; + int minor; + pid_t tappid; + int 
drivertype; + uint16_t cookie; +} blkif_t; + +typedef struct blkif_info { + char *params; +} blkif_info_t; + +void register_new_devmap_hook(int (*fn)(blkif_t *blkif)); +void register_new_unmap_hook(int (*fn)(blkif_t *blkif)); +void register_new_blkif_hook(int (*fn)(blkif_t *blkif)); +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); +blkif_t *alloc_blkif(domid_t domid); +int blkif_init(blkif_t *blkif, long int handle, long int pdev, + long int readonly); +void free_blkif(blkif_t *blkif); +void __init_blkif(void); + +typedef struct tapdev_info { + int fd; + char *mem; + blkif_sring_t *sring; + blkif_back_ring_t fe_ring; + unsigned long vstart; + blkif_t *blkif; +} tapdev_info_t; + +typedef struct domid_translate { + unsigned short domid; + unsigned short busid; +} domid_translate_t ; + +typedef struct image { + long int size; + long int secsize; + long int info; +} image_t; + +typedef struct msg_hdr { + uint16_t type; + uint16_t len; + uint16_t drivertype; + uint16_t cookie; +} msg_hdr_t; + +typedef struct msg_newdev { + uint8_t devnum; + uint16_t domid; +} msg_newdev_t; + +typedef struct msg_pid { + pid_t pid; +} msg_pid_t; + +#define READ 0 +#define WRITE 1 + +/*Control Messages between manager and tapdev*/ +#define CTLMSG_PARAMS 1 +#define CTLMSG_IMG 2 +#define CTLMSG_IMG_FAIL 3 +#define CTLMSG_NEWDEV 4 +#define CTLMSG_NEWDEV_RSP 5 +#define CTLMSG_NEWDEV_FAIL 6 +#define CTLMSG_CLOSE 7 +#define CTLMSG_CLOSE_RSP 8 +#define CTLMSG_PID 9 +#define CTLMSG_PID_RSP 10 + +/* xenstore/xenbus: */ +extern int add_blockdevice_probe_watch(struct xs_handle *h, + const char *domname); +int xs_fire_next_watch(struct xs_handle *h); + + +/* Abitrary values, must match the underlying driver... 
*/ +#define MAX_PENDING_REQS 64 +#define MAX_TAP_DEV 100 + +/* Accessing attached data page mappings */ +#define MMAP_PAGES \ + (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_vstart,_req,_seg) \ + ((_vstart) + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) + +/* Defines that are only used by library clients */ + +#ifndef __COMPILING_BLKTAP_LIB + +static char *blkif_op_name[] = { + [BLKIF_OP_READ] = "READ", + [BLKIF_OP_WRITE] = "WRITE", +}; + +#endif /* __COMPILING_BLKTAP_LIB */ + +#endif /* __BLKTAPLIB_H__ */ diff --git a/tools/blktap/lib/list.h b/tools/blktap/lib/list.h new file mode 100644 index 0000000000..bda5f46a38 --- /dev/null +++ b/tools/blktap/lib/list.h @@ -0,0 +1,55 @@ +/* + * list.h + * + * This is a subset of linux's list.h intended to be used in user-space. + * + */ + +#ifndef __LIST_H__ +#define __LIST_H__ + +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ 
+ &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#endif /* __LIST_H__ */ diff --git a/tools/blktap/lib/xenbus.c b/tools/blktap/lib/xenbus.c new file mode 100644 index 0000000000..91cdd00536 --- /dev/null +++ b/tools/blktap/lib/xenbus.c @@ -0,0 +1,387 @@ +/* + * xenbus.c + * + * xenbus interface to the blocktap. + * + * this handles the top-half of integration with block devices through the + * store -- the tap driver negotiates the device channel etc, while the + * userland tap client needs to sort out the disk parameters etc. + * + * (c) 2005 Andrew Warfield and Julian Chesterfield + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <printf.h> +#include <string.h> +#include <err.h> +#include <stdarg.h> +#include <errno.h> +#include <xs.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <poll.h> +#include <time.h> +#include <sys/time.h> +#include "blktaplib.h" +#include "list.h" +#include "xs_api.h" + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +struct backend_info +{ + /* our communications channel */ + blkif_t *blkif; + + long int frontend_id; + long int pdev; + long int readonly; + + char *backpath; + char *frontpath; + + struct list_head list; +}; + +static LIST_HEAD(belist); + +static int strsep_len(const char *str, char c, unsigned int len) +{ + unsigned int i; + + for (i = 0; str[i]; i++) + if (str[i] == c) { + if (len == 0) + return i; + len--; + } + return (len == 0) ? 
/*
 * get_be_id - extract the numeric backend id from a xenstore backend path.
 *
 * @str: backend path; the id is whatever follows the 7th '/' separator,
 *       e.g. ".../backend/tap/1/2049" yields 2049.
 *
 * Returns the id as parsed by atoi(), or -1 if @str is NULL or does not
 * contain the 7th separator.
 */
static int get_be_id(const char *str)
{
	int sep;

	if (str == NULL)
		return -1;

	sep = strsep_len(str, '/', 6);

	/*
	 * strsep_len() returns a negative value when there are too few
	 * separators, and the string length (index of the terminator) when
	 * it runs off the end exactly as its counter reaches zero.  Only an
	 * index that really addresses a '/' is usable.
	 */
	if ( (sep < 0) || (str[sep] != '/') )
		return -1;

	/*
	 * Parse in place: the remainder of the path is already
	 * NUL-terminated, so no bounded scratch buffer is needed.
	 */
	return atoi(str + sep + 1);
}
(%s)\n", bepath); + goto fail; + } + + deverr = xs_gather(h, bepath, "physical-device", "%li", &pdev, NULL); + if (!deverr) { + DPRINTF("pdev set to %ld\n",pdev); + if (be->pdev && be->pdev != pdev) { + DPRINTF("changing physical-device not supported"); + goto fail; + } + be->pdev = pdev; + } + + /*Check to see if device is to be opened read-only*/ + asprintf(&path, "%s/%s", bepath, "read-only"); + if (xs_exists(h, path)) + be->readonly = 1; + + if (be->blkif == NULL) { + + /* Front end dir is a number, which is used as the handle. */ + p = strrchr(be->frontpath, '/') + 1; + handle = strtoul(p, NULL, 0); + + be->blkif = alloc_blkif(be->frontend_id); + + if (be->blkif == NULL) + goto fail; + + be->blkif->be_id = get_be_id(bepath); + + /*Insert device specific info*/ + blk = malloc(sizeof(blkif_info_t)); + if (!blk) { + DPRINTF("Out of memory - blkif_info_t\n"); + goto fail; + } + er = xs_gather(h, bepath, "params", NULL, &blk->params, NULL); + if (er) + goto fail; + be->blkif->info = blk; + + if (deverr) { + /*Dev number was not available, try to set manually*/ + pdev = convert_dev_name_to_num(blk->params); + be->pdev = pdev; + } + + er = blkif_init(be->blkif, handle, be->pdev, be->readonly); + + if (er != 0) { + DPRINTF("Unable to open device %s\n",blk->params); + goto fail; + } + + DPRINTF("[BECHG]: ADDED A NEW BLKIF (%s)\n", bepath); + } + /* Supply the information about the device to xenstore */ + er = xs_printf(h, be->backpath, "sectors", "%lu", + be->blkif->ops->get_size(be->blkif)); + + if (er == 0) { + DPRINTF("ERROR: Failed writing sectors"); + goto fail; + } + + er = xs_printf(h, be->backpath, "sector-size", "%lu", + be->blkif->ops->get_secsize(be->blkif)); + + if (er == 0) { + DPRINTF("ERROR: Failed writing sector-size"); + goto fail; + } + + er = xs_printf(h, be->backpath, "info", "%u", + be->blkif->ops->get_info(be->blkif)); + + if (er == 0) { + DPRINTF("ERROR: Failed writing info"); + goto fail; + } + + be->blkif->state = CONNECTED; + DPRINTF("[SETUP] 
Complete\n\n"); + goto close; + +fail: + if ( (be != NULL) && (be->blkif != NULL) ) + backend_remove(h, be); +close: + if (path) + free(path); + return; +} + +/** + * Xenstore watch callback entry point. This code replaces the hotplug scripts, + * and as soon as the xenstore backend driver entries are created, this script + * gets called. + */ +static void ueblktap_probe(struct xs_handle *h, struct xenbus_watch *w, + const char *bepath_im) +{ + struct backend_info *be = NULL; + char *frontend = NULL, *bepath = NULL, *p; + int er, len; + blkif_t *blkif; + + + bepath = strdup(bepath_im); + + if (!bepath) { + DPRINTF("No path\n"); + return; + } + + /* + *asserts that xenstore structure is always 7 levels deep + *e.g. /local/domain/0/backend/vbd/1/2049 + */ + len = strsep_len(bepath, '/', 7); + if (len < 0) + goto free_be; + bepath[len] = '\0'; + + be = malloc(sizeof(*be)); + if (!be) { + DPRINTF("ERROR: allocating backend structure\n"); + goto free_be; + } + memset(be, 0, sizeof(*be)); + frontend = NULL; + + er = xs_gather(h, bepath, + "frontend-id", "%li", &be->frontend_id, + "frontend", NULL, &frontend, + NULL); + + if (er) { + /* + *Unable to find frontend entries, + *bus-id is no longer valid + */ + DPRINTF("ERROR: Frontend-id check failed, removing backend: " + "[%s]\n",bepath); + + /** + * BE info should already exist, + * free new mem and find old entry + */ + free(be); + be = be_lookup_be(bepath); + if ( (be != NULL) && (be->blkif != NULL) ) + backend_remove(h, be); + else goto free_be; + if (bepath) + free(bepath); + return; + } + + /* Are we already tracking this device? 
*/ + if (be_exists_be(bepath)) { + goto free_be; + } + + be->backpath = bepath; + be->frontpath = frontend; + + list_add(&be->list, &belist); + + DPRINTF("[PROBE]\tADDED NEW DEVICE (%s)\n", bepath); + DPRINTF("\tFRONTEND (%s),(%ld)\n", frontend,be->frontend_id); + + ueblktap_setup(h, bepath); + return; + + free_be: + if (frontend) + free(frontend); + if (bepath) + free(bepath); + if (be) + free(be); + return; +} + +/** + *We set a general watch on the backend vbd directory + *ueblktap_probe is called for every update + *Our job is to monitor for new entries. As they + *are created, we initalise the state and attach a disk. + */ + +int add_blockdevice_probe_watch(struct xs_handle *h, const char *domname) +{ + char *domid, *path; + struct xenbus_watch *vbd_watch; + int er; + + domid = get_dom_domid(h, domname); + + DPRINTF("%s: %s\n", + domname, (domid != NULL) ? domid : "[ not found! ]"); + + asprintf(&path, "/local/domain/%s/backend/tap", domid); + if (path == NULL) + return -ENOMEM; + + vbd_watch = (struct xenbus_watch *)malloc(sizeof(struct xenbus_watch)); + if (!vbd_watch) { + DPRINTF("ERROR: unable to malloc vbd_watch [%s]\n", path); + return -EINVAL; + } + vbd_watch->node = path; + vbd_watch->callback = ueblktap_probe; + er = register_xenbus_watch(h, vbd_watch); + if (er == 0) { + DPRINTF("ERROR: adding vbd probe watch %s\n", path); + return -EINVAL; + } + return 0; +} diff --git a/tools/blktap/lib/xs_api.c b/tools/blktap/lib/xs_api.c new file mode 100644 index 0000000000..44abcf2080 --- /dev/null +++ b/tools/blktap/lib/xs_api.c @@ -0,0 +1,364 @@ +/* + * xs_api.c + * + * blocktap interface functions to xenstore + * + * (c) 2005 Andrew Warfield and Julian Chesterfield + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software 
packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <printf.h> +#include <string.h> +#include <err.h> +#include <stdarg.h> +#include <errno.h> +#include <xs.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <poll.h> +#include "blktaplib.h" +#include "list.h" +#include "xs_api.h" + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +static LIST_HEAD(watches); +#define BASE_DEV_VAL 2048 + +int xs_gather(struct xs_handle *xs, const char *dir, ...) 
+{ + va_list ap; + const char *name; + char *path, **e; + int ret = 0, num,i; + unsigned int len; + xs_transaction_t xth; + +again: + if ( (xth = xs_transaction_start(xs)) == XBT_NULL) { + DPRINTF("unable to start xs trasanction\n"); + ret = ENOMEM; + return ret; + } + + va_start(ap, dir); + while ( (ret == 0) && (name = va_arg(ap, char *)) != NULL) { + const char *fmt = va_arg(ap, char *); + void *result = va_arg(ap, void *); + char *p; + + if (asprintf(&path, "%s/%s", dir, name) == -1) + { + printf("allocation error in xs_gather!\n"); + ret = ENOMEM; + break; + } + + p = xs_read(xs, xth, path, &len); + + + free(path); + if (p == NULL) { + ret = ENOENT; + break; + } + if (fmt) { + if (sscanf(p, fmt, result) == 0) + ret = EINVAL; + free(p); + } else + *(char **)result = p; + } + va_end(ap); + + if (!xs_transaction_end(xs, xth, ret)) { + if (ret == 0 && errno == EAGAIN) + goto again; + else + ret = errno; + } + + return ret; +} + + +/* Single printf and write: returns -errno or 0. */ +int xs_printf(struct xs_handle *h, const char *dir, const char *node, + const char *fmt, ...) +{ + char *buf, *path; + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vasprintf(&buf, fmt, ap); + va_end(ap); + + asprintf(&path, "%s/%s", dir, node); + + if ( (path == NULL) || (buf == NULL) ) + return 0; + + ret = xs_write(h, XBT_NULL, path, buf, strlen(buf)+1); + + free(buf); + free(path); + + return ret; +} + + +int xs_exists(struct xs_handle *h, const char *path) +{ + char **d; + unsigned int num; + xs_transaction_t xth; + + if ( (xth = xs_transaction_start(h)) == XBT_NULL) { + printf("unable to start xs trasanction\n"); + return 0; + } + + d = xs_directory(h, xth, path, &num); + xs_transaction_end(h, xth, 0); + if (d == NULL) + return 0; + free(d); + return 1; +} + + + +/** + * This assumes that the domain name we are looking for is unique. 
+ * Name parameter Domain-0 + * + * Returns the domid string read from xenstore (the caller frees it), + * or NULL when no domain carries that name or xenstore cannot be read. + */ +char *get_dom_domid(struct xs_handle *h, const char *name) +{ + char **e, *val, *domid = NULL; + unsigned int num, len; + unsigned int i; + char *path; + xs_transaction_t xth; + + if ( (xth = xs_transaction_start(h)) == XBT_NULL) { + warn("unable to start xs trasanction\n"); + return NULL; + } + + e = xs_directory(h, xth, "/local/domain", &num); + if (e == NULL) + num = 0; + + /* Advancing i in the loop header keeps 'continue' from spinning forever. */ + for (i = 0; i < num; i++) { + if (asprintf(&path, "/local/domain/%s/name", e[i]) == -1) + break; + val = xs_read(h, xth, path, &len); + free(path); + if (val == NULL) + continue; + + if (strcmp(val, name) == 0) { + /* match! */ + free(val); + if (asprintf(&path, "/local/domain/%s/domid", e[i]) == -1) + break; + domid = xs_read(h, xth, path, &len); + free(path); + break; + } + free(val); + } + xs_transaction_end(h, xth, 0); + + free(e); + return domid; +} + +/** + * Translate a device name into a device number: + * /dev/sdX[N] -> BASE_DEV_VAL + 16*idx + N + * /dev/hdX[N] -> majors[idx/2]*256 + N + * /dev/xvdX[N] -> 202*256 + 16*idx + N + * plxN -> N + * where idx is the position of letter X in "abcdefghijklmnop". + * Any other name yields BASE_DEV_VAL. + */ +int convert_dev_name_to_num(char *name) { + /* Constant prefixes: nothing is allocated, so nothing can leak. */ + const char *p_sd = "/dev/sd", *p_hd = "/dev/hd"; + const char *p_xvd = "/dev/xvd", *p_plx = "plx"; + const char *alpha = "abcdefghijklmnop", *ptr; + char *p; + int majors[10] = {3,22,33,34,56,57,88,89,90,91}; + int i; + + if (strstr(name, p_sd) != NULL) { + p = name + strlen(p_sd); + for(i = 0, ptr = alpha; i < strlen(alpha); i++) { + if(*ptr == *p) + break; + ptr++; + } + /* Skip the drive letter, but never step past the terminator. */ + if (*p != '\0') + p++; + return BASE_DEV_VAL + (16*i) + atoi(p); + } else if (strstr(name, p_hd) != NULL) { + p = name + strlen(p_hd); + for (i = 0, ptr = alpha; i < strlen(alpha); i++) { + if(*ptr == *p) break; + ptr++; + } + if (*p != '\0') + p++; + return (majors[i/2]*256) + atoi(p); + + } else if (strstr(name, p_xvd) != NULL) { + p = name + strlen(p_xvd); + for(i = 0, ptr = alpha; i < strlen(alpha); i++) { + if(*ptr == *p) break; + ptr++; + } + if (*p != '\0') + p++; + return (202*256) + (16*i) + atoi(p); + + } else if (strstr(name, p_plx) != NULL) { + p = name + strlen(p_plx); + return atoi(p); + + } else { + DPRINTF("Unknown device type, setting to default.\n"); + return BASE_DEV_VAL; + } +} + +/** + * A little paranoia: we don't just
trust token. + */ +static struct xenbus_watch *find_watch(const char *token) +{ + struct xenbus_watch *i, *cmp; + + cmp = (void *)strtoul(token, NULL, 16); + + list_for_each_entry(i, &watches, list) + if (i == cmp) + return i; + return NULL; +} + +/** + * Register callback to watch this node. + * like xs_watch, return 0 on failure (a token collision included) + */ +int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch) +{ + /* Pointer in ascii is the token. */ + char token[sizeof(watch) * 2 + 1]; + int er; + + sprintf(token, "%lX", (long)watch); + if (find_watch(token)) + { + DPRINTF("watch collision!\n"); + /* Failure is reported as 0; -EINVAL would read as success to callers. */ + return 0; + } + + er = xs_watch(h, watch->node, token); + if (er != 0) { + list_add(&watch->list, &watches); + } + + return er; +} + +int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch) +{ + char token[sizeof(watch) * 2 + 1]; + int er; + + sprintf(token, "%lX", (long)watch); + if (!find_watch(token)) + { + DPRINTF("no such watch!\n"); + return -EINVAL; + } + + + er = xs_unwatch(h, watch->node, token); + list_del(&watch->list); + + if (er == 0) + DPRINTF("XENBUS Failed to release watch %s: %i\n", + watch->node, er); + return 0; +} + +/** + * Re-register callbacks to all watches. + */ +void reregister_xenbus_watches(struct xs_handle *h) +{ + struct xenbus_watch *watch; + char token[sizeof(watch) * 2 + 1]; + + list_for_each_entry(watch, &watches, list) { + sprintf(token, "%lX", (long)watch); + xs_watch(h, watch->node, token); + } +} + +/** + * based on watch_thread() + */ +int xs_fire_next_watch(struct xs_handle *h) +{ + char **res; + char *token; + char *node = NULL; + struct xenbus_watch *w; + int er; + unsigned int num; + + res = xs_read_watch(h, &num); + if (res == NULL) + return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0...
*/ + + node = res[XS_WATCH_PATH]; + token = res[XS_WATCH_TOKEN]; + + w = find_watch(token); + if (!w) + { + DPRINTF("unregistered watch fired\n"); + goto done; + } + w->callback(h, w, node); + + done: + free(res); + return 1; +} diff --git a/tools/blktap/lib/xs_api.h b/tools/blktap/lib/xs_api.h new file mode 100644 index 0000000000..c4183a2dde --- /dev/null +++ b/tools/blktap/lib/xs_api.h @@ -0,0 +1,50 @@ +/* + * xs_api.h + * + * (c) 2005 Andrew Warfield and Julian Chesterfield + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/* A registered xenstore watch; the ASCII form of its address is the watch token. */ +struct xenbus_watch +{ + struct list_head list; /* link in the list of registered watches */ + char *node; /* xenstore path handed to xs_watch() */ + void (*callback)(struct xs_handle *h, + struct xenbus_watch *, + const char *node); /* run by xs_fire_next_watch() with the fired path */ +}; + +int xs_gather(struct xs_handle *xs, const char *dir, ...); +int xs_printf(struct xs_handle *h, const char *dir, const char *node, + const char *fmt, ...); +int xs_exists(struct xs_handle *h, const char *path); +char *get_dom_domid(struct xs_handle *h, const char *name); +int convert_dev_name_to_num(char *name); +int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch); +int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch); +void reregister_xenbus_watches(struct xs_handle *h); +int xs_fire_next_watch(struct xs_handle *h); diff --git a/tools/examples/Makefile b/tools/examples/Makefile index c5ccb6c8af..1280081577 100644 --- a/tools/examples/Makefile +++ b/tools/examples/Makefile @@ -26,6 +26,7 @@ XEN_SCRIPTS += network-route vif-route XEN_SCRIPTS += network-nat vif-nat XEN_SCRIPTS += block XEN_SCRIPTS += block-enbd block-nbd +XEN_SCRIPTS += blktap XEN_SCRIPTS += vtpm vtpm-delete XEN_SCRIPTS += xen-hotplug-cleanup XEN_SCRIPTS += external-device-migrate diff --git a/tools/examples/blktap b/tools/examples/blktap new file mode 100644 index 0000000000..ba9f4ee52f --- /dev/null +++ b/tools/examples/blktap @@ -0,0 +1,15 @@ +#!/bin/sh + +# Copyright (c) 2005, XenSource Ltd. + +dir=$(dirname "$0") +.
"$dir/xen-hotplug-common.sh" + +findCommand "$@" + +# POSIX test compares strings with '='; '==' is a bashism that breaks under a strict /bin/sh. +if [ "$command" = 'add' ] +then + success +fi + +exit 0 diff --git a/tools/examples/xen-backend.agent b/tools/examples/xen-backend.agent index e662015da2..3a01a2c7ea 100755 --- a/tools/examples/xen-backend.agent +++ b/tools/examples/xen-backend.agent @@ -7,6 +7,9 @@ PATH=/etc/xen/scripts:$PATH claim_lock xenbus_hotplug_global case "$XENBUS_TYPE" in + tap) + /etc/xen/scripts/blktap "$ACTION" + ;; vbd) /etc/xen/scripts/block "$ACTION" ;; diff --git a/tools/examples/xen-backend.rules b/tools/examples/xen-backend.rules index 91f0b06107..21c6d8c8fc 100644 --- a/tools/examples/xen-backend.rules +++ b/tools/examples/xen-backend.rules @@ -1,3 +1,4 @@ +SUBSYSTEM=="xen-backend", KERNEL=="tap*", RUN+="/etc/xen/scripts/blktap $env{ACTION}" SUBSYSTEM=="xen-backend", KERNEL=="vbd*", RUN+="/etc/xen/scripts/block $env{ACTION}" SUBSYSTEM=="xen-backend", KERNEL=="vtpm*", RUN+="/etc/xen/scripts/vtpm $env{ACTION}" SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="online", RUN+="$env{script} online" diff --git a/tools/libaio/COPYING b/tools/libaio/COPYING new file mode 100644 index 0000000000..c4792dd27a --- /dev/null +++ b/tools/libaio/COPYING @@ -0,0 +1,515 @@ + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users.
+ + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations +below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. 
Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. +^L + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. 
+ + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it +becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. +^L + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". 
+ + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control +compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. 
+ + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. +^L + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. +^L + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. +^L + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. +^L + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply, and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License +may add an explicit geographical distribution limitation excluding those +countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. +^L + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS +^L + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms +of the ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
+It is safest to attach them to the start of each source file to most +effectively convey the exclusion of warranty; and each file should +have at least the "copyright" line and a pointer to where the full +notice is found. + + + <one line to give the library's name and a brief idea of what it +does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper +mail. + +You should also get your employer (if you work as a programmer) or +your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James +Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+ + diff --git a/tools/libaio/ChangeLog b/tools/libaio/ChangeLog new file mode 100644 index 0000000000..ddcf6e3841 --- /dev/null +++ b/tools/libaio/ChangeLog @@ -0,0 +1,43 @@ +0.4.0 + - remove libredhat-kernel + - add rough outline for man pages + - make the compiled io_getevents() add the extra parameter and + pass the timeout for updating as per 2.5 + - fixes for ia64, now works + - fixes for x86-64 + - powerpc support from Gianni Tedesco <gianni@ecsc.co.uk> + - disable the NULL check in harness/cases/4.t on ia64: ia64 + maps the 0 page and causes this check to fail. + +0.3.15 + - use real syscall interface, but don't break source compatibility + yet (that will happen with 0.4.0) + +0.3.13 + - add test cases + +0.3.11 + - use library versioning of libredhat-kernel to always provide a + fallback + +0.3.9 + - add io_queue_release function + +0.3.8 + - make clean deletes libredhat-kernel.so.1 + - const struct timespec * + - add make srpm target + +0.3.7 + - fix assembly function .types + - export io_getevents + - fix io_submit function prototype to match the kernel + - provide /usr/lib/libredhat-kernel.so link for compilation + (do NOT link against libredhat-kernel.so directly) + - fix soname to libaio.so.1 + - fix dummy libredhat-kernel's soname + - work around nfs bug + - provide and install libredhat-kernel.so.1 stub + - Makefile improvements + - make sure dummy libredhat-kernel.so only returns -ENOSYS + diff --git a/tools/libaio/INSTALL b/tools/libaio/INSTALL new file mode 100644 index 0000000000..29b907797a --- /dev/null +++ b/tools/libaio/INSTALL @@ -0,0 +1,18 @@ +To install the library, execute the command: + + make prefix=`pwd`/usr install + +which will install the binaries and header files into the directory +usr. Set prefix=/usr to get them installed into the main system. + +Please note: Do not attempt to install on the system the +"libredhat-kernel.so" file. 
It is a dummy shared library +provided only for the purpose of being able to bootstrap +this facility while running on systems without the correct +libredhat-kernel.so built. The contents of the included +libredhat-kernel.so are only stubs; this library is NOT +functional for anything except the internal purpose of +linking libaio.so against the provided stubs. At runtime, +libaio.so requires a real libredhat-kernel.so library; this +is provided by the Red Hat kernel RPM packages with async +I/O functionality. diff --git a/tools/libaio/Makefile b/tools/libaio/Makefile new file mode 100644 index 0000000000..06d8775e33 --- /dev/null +++ b/tools/libaio/Makefile @@ -0,0 +1,40 @@ +NAME=libaio +SPECFILE=$(NAME).spec +VERSION=$(shell awk '/Version:/ { print $$2 }' $(SPECFILE)) +RELEASE=$(shell awk '/Release:/ { print $$2 }' $(SPECFILE)) +CVSTAG = $(NAME)_$(subst .,-,$(VERSION))_$(subst .,-,$(RELEASE)) +RPMBUILD=$(shell `which rpmbuild >&/dev/null` && echo "rpmbuild" || echo "rpm") + +prefix=/usr +includedir=$(prefix)/include +libdir=$(prefix)/lib + +default: all + +all: + @$(MAKE) -C src + +install: all + +clean: + @$(MAKE) -C src clean + @$(MAKE) -C harness clean + +tag-archive: + @cvs -Q tag -F $(CVSTAG) + +create-archive: tag-archive + @rm -rf /tmp/$(NAME) + @cd /tmp; cvs -Q -d $(CVSROOT) export -r$(CVSTAG) $(NAME) || echo GRRRrrrrr -- ignore [export aborted] + @mv /tmp/$(NAME) /tmp/$(NAME)-$(VERSION) + @cd /tmp; tar czSpf $(NAME)-$(VERSION).tar.gz $(NAME)-$(VERSION) + @rm -rf /tmp/$(NAME)-$(VERSION) + @cp /tmp/$(NAME)-$(VERSION).tar.gz . + @rm -f /tmp/$(NAME)-$(VERSION).tar.gz + @echo " " + @echo "The final archive is ./$(NAME)-$(VERSION).tar.gz." 
+ +archive: clean tag-archive create-archive + +srpm: create-archive + $(RPMBUILD) --define "_sourcedir `pwd`" --define "_srcrpmdir `pwd`" --nodeps -bs $(SPECFILE) diff --git a/tools/libaio/TODO b/tools/libaio/TODO new file mode 100644 index 0000000000..0a9ac15b19 --- /dev/null +++ b/tools/libaio/TODO @@ -0,0 +1,4 @@ +- Write man pages. +- Make -static links against libaio work. +- Fallback on userspace if the kernel calls return -ENOSYS. + diff --git a/tools/libaio/harness/Makefile b/tools/libaio/harness/Makefile new file mode 100644 index 0000000000..d2483fdda2 --- /dev/null +++ b/tools/libaio/harness/Makefile @@ -0,0 +1,37 @@ +# foo. +TEST_SRCS:=$(shell find cases/ -name \*.t | sort -n -t/ -k2) +PROGS:=$(patsubst %.t,%.p,$(TEST_SRCS)) +HARNESS_SRCS:=main.c +# io_queue.c + +CFLAGS=-Wall -Werror -g -O -laio +#-lpthread -lrt + +all: $(PROGS) + +$(PROGS): %.p: %.t $(HARNESS_SRCS) + $(CC) $(CFLAGS) -DTEST_NAME=\"$<\" -o $@ main.c + +clean: + rm -f $(PROGS) *.o runtests.out rofile wofile rwfile + +.PHONY: + +testdir/rofile: .PHONY + rm -f $@ + echo "test" >$@ + chmod 400 $@ + +testdir/wofile: .PHONY + rm -f $@ + echo "test" >$@ + chmod 200 $@ + +testdir/rwfile: .PHONY + rm -f $@ + echo "test" >$@ + chmod 600 $@ + +check: $(PROGS) testdir/rofile testdir/rwfile testdir/wofile + ./runtests.sh $(PROGS) + diff --git a/tools/libaio/harness/README b/tools/libaio/harness/README new file mode 100644 index 0000000000..5557370589 --- /dev/null +++ b/tools/libaio/harness/README @@ -0,0 +1,19 @@ +Notes on running this test suite: + +To run the test suite, run "make check". All test cases should pass +and there should be 0 fails. + +Several of the test cases require a directory on the filesystem under +test for the creation of test files, as well as the generation of +error conditions. The test cases assume the directories (or symlinks +to directories) are as follows: + + testdir/ + - used for general read/write test cases. 
Must have at + least as much free space as the machine has RAM (up + to 768MB). + testdir.enospc/ + - a filesystem that has space for writing 8KB out, but + fails with -ENOSPC beyond 8KB. + testdir.ext2/ + - must be an ext2 filesystem. diff --git a/tools/libaio/harness/attic/0.t b/tools/libaio/harness/attic/0.t new file mode 100644 index 0000000000..033e62c1b2 --- /dev/null +++ b/tools/libaio/harness/attic/0.t @@ -0,0 +1,9 @@ +/* 0.t + Test harness check: okay. +*/ +int test_main(void) +{ + printf("test_main: okay\n"); + return 0; +} + diff --git a/tools/libaio/harness/attic/1.t b/tools/libaio/harness/attic/1.t new file mode 100644 index 0000000000..799ffd179a --- /dev/null +++ b/tools/libaio/harness/attic/1.t @@ -0,0 +1,9 @@ +/* 1.t + Test harness check: fail. +*/ +int test_main(void) +{ + printf("test_main: fail\n"); + return 1; +} + diff --git a/tools/libaio/harness/cases/10.t b/tools/libaio/harness/cases/10.t new file mode 100644 index 0000000000..9d3beb2fdb --- /dev/null +++ b/tools/libaio/harness/cases/10.t @@ -0,0 +1,53 @@ +/* 10.t - uses testdir.enospc/rwfile +- Check results on out-of-space and out-of-quota. (10.t) + - write that fills filesystem but does not go over should succeed + - write that fills filesystem and goes over should be partial + - write to full filesystem should return -ENOSPC + - read beyond end of file after ENOSPC should return 0 +*/ +#include "aio_setup.h" + +#include <sys/time.h> +#include <sys/resource.h> +#include <unistd.h> + +int test_main(void) +{ +/* Note: changing either of these requires updating the ext2-enospc.img + * filesystem image. Also, if SIZE is less than PAGE_SIZE, problems + * crop up due to ext2's preallocation. 
+ */ +#define LIMIT 65536 +#define SIZE 65536 + char *buf; + int rwfd; + int status = 0, res; + + rwfd = open("testdir.enospc/rwfile", O_RDWR|O_CREAT|O_TRUNC, 0600); + assert(rwfd != -1); + res = ftruncate(rwfd, 0); assert(res == 0); + buf = malloc(SIZE); assert(buf != NULL); + memset(buf, 0, SIZE); + + + status |= attempt_rw(rwfd, buf, SIZE, LIMIT-SIZE, WRITE, SIZE); + status |= attempt_rw(rwfd, buf, SIZE, LIMIT-SIZE, READ, SIZE); + + status |= attempt_rw(rwfd, buf, SIZE, LIMIT, WRITE, -ENOSPC); + status |= attempt_rw(rwfd, buf, SIZE, LIMIT, READ, 0); + + res = ftruncate(rwfd, 0); assert(res == 0); + + status |= attempt_rw(rwfd, buf, SIZE, 1+LIMIT-SIZE, WRITE, SIZE-1); + status |= attempt_rw(rwfd, buf, SIZE, 1+LIMIT-SIZE, READ, SIZE-1); + status |= attempt_rw(rwfd, buf, SIZE, LIMIT, READ, 0); + + status |= attempt_rw(rwfd, buf, SIZE, LIMIT, WRITE, -ENOSPC); + status |= attempt_rw(rwfd, buf, SIZE, LIMIT, READ, 0); + status |= attempt_rw(rwfd, buf, 0, LIMIT, WRITE, 0); + + res = close(rwfd); assert(res == 0); + res = unlink("testdir.enospc/rwfile"); assert(res == 0); + return status; +} + diff --git a/tools/libaio/harness/cases/11.t b/tools/libaio/harness/cases/11.t new file mode 100644 index 0000000000..efcf6d45f3 --- /dev/null +++ b/tools/libaio/harness/cases/11.t @@ -0,0 +1,39 @@ +/* 11.t - uses testdir/rwfile +- repeated read / write of same page (to check accounting) (11.t) +*/ +#include "aio_setup.h" + +#include <sys/time.h> +#include <sys/resource.h> +#include <unistd.h> + +int test_main(void) +{ +#define COUNT 1000000 +#define SIZE 256 + char *buf; + int rwfd; + int status = 0; + int i; + + rwfd = open("testdir/rwfile", O_RDWR|O_CREAT|O_TRUNC, 0600); + assert(rwfd != -1); + buf = malloc(SIZE); assert(buf != NULL); + memset(buf, 0, SIZE); + + for (i=0; i<COUNT; i++) { + status |= attempt_rw(rwfd, buf, SIZE, 0, WRITE_SILENT, SIZE); + if (status) + break; + } + printf("completed %d out of %d writes\n", i, COUNT); + for (i=0; i<COUNT; i++) { + status |= 
attempt_rw(rwfd, buf, SIZE, 0, READ_SILENT, SIZE); + if (status) + break; + } + printf("completed %d out of %d reads\n", i, COUNT); + + return status; +} + diff --git a/tools/libaio/harness/cases/12.t b/tools/libaio/harness/cases/12.t new file mode 100644 index 0000000000..3499204440 --- /dev/null +++ b/tools/libaio/harness/cases/12.t @@ -0,0 +1,49 @@ +/* 12.t +- ioctx access across fork() (12.t) + */ +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <signal.h> + +#include "aio_setup.h" + +void test_child(void) +{ + int res; + res = attempt_io_submit(io_ctx, 0, NULL, -EINVAL); + fflush(stdout); + _exit(res); +} + +int test_main(void) +{ + int res, status; + pid_t pid; + + if (attempt_io_submit(io_ctx, 0, NULL, 0)) + return 1; + + sigblock(sigmask(SIGCHLD) | siggetmask()); + fflush(NULL); + pid = fork(); assert(pid != -1); + + if (pid == 0) + test_child(); + + res = waitpid(pid, &status, 0); + + if (WIFEXITED(status)) { + int failed = (WEXITSTATUS(status) != 0); + printf("child exited with status %d%s\n", WEXITSTATUS(status), + failed ? 
" -- FAILED" : ""); + return failed; + } + + /* anything else: failed */ + if (WIFSIGNALED(status)) + printf("child killed by signal %d -- FAILED.\n", + WTERMSIG(status)); + + return 1; +} diff --git a/tools/libaio/harness/cases/13.t b/tools/libaio/harness/cases/13.t new file mode 100644 index 0000000000..5f18005b6a --- /dev/null +++ b/tools/libaio/harness/cases/13.t @@ -0,0 +1,66 @@ +/* 13.t - uses testdir/rwfile +- Submit multiple writes larger than aio-max-size (deadlocks on older + aio code) +*/ +#include "aio_setup.h" + +#include <sys/time.h> +#include <sys/resource.h> +#include <unistd.h> + +int test_main(void) +{ +#define SIZE (1024 * 1024) +#define IOS 8 + struct iocb iocbs[IOS]; + struct iocb *iocb_list[IOS]; + char *bufs[IOS]; + int rwfd; + int status = 0, res; + int i; + + rwfd = open("testdir/rwfile", O_RDWR|O_CREAT|O_TRUNC, 0600); + assert(rwfd != -1); + res = ftruncate(rwfd, 0); assert(res == 0); + + for (i=0; i<IOS; i++) { + bufs[i] = malloc(SIZE); + assert(bufs[i] != NULL); + memset(bufs[i], 0, SIZE); + + io_prep_pwrite(&iocbs[i], rwfd, bufs[i], SIZE, i * SIZE); + iocb_list[i] = &iocbs[i]; + } + + status |= attempt_io_submit(io_ctx, IOS, iocb_list, IOS); + + for (i=0; i<IOS; i++) { + struct timespec ts = { tv_sec: 30, tv_nsec: 0 }; + struct io_event event; + struct iocb *iocb; + + res = io_getevents(io_ctx, 0, 1, &event, &ts); + if (res != 1) { + status |= 1; + printf("io_getevents failed [%d] with res=%d [%s]\n", + i, res, (res < 0) ? strerror(-res) : "okay"); + break; + } + + if (event.res != SIZE) + status |= 1; + + iocb = (void *)event.obj; + printf("event[%d]: write[%d] %s, returned: %ld [%s]\n", + i, (int)(iocb - &iocbs[0]), + (event.res != SIZE) ? "failed" : "okay", + (long)event.res, + (event.res < 0) ? 
strerror(-event.res) : "okay" + ); + } + + res = ftruncate(rwfd, 0); assert(res == 0); + res = close(rwfd); assert(res == 0); + return status; +} + diff --git a/tools/libaio/harness/cases/14.t b/tools/libaio/harness/cases/14.t new file mode 100644 index 0000000000..514622b569 --- /dev/null +++ b/tools/libaio/harness/cases/14.t @@ -0,0 +1,90 @@ +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <signal.h> + +#include "aio_setup.h" +#include <sys/mman.h> + +#define SIZE 768*1024*1024 + +//just submit an I/O + +int test_child(void) +{ + char *buf; + int rwfd; + int res; + long size; + struct iocb iocb; + struct iocb *iocbs[] = { &iocb }; + int loop = 10; + int i; + + aio_setup(1024); + + size = SIZE; + + printf("size = %ld\n", size); + + rwfd = open("testdir/rwfile", O_RDWR); assert(rwfd != +-1); + res = ftruncate(rwfd, 0); assert(res == 0); + buf = malloc(size); assert(buf != +NULL); + + for(i=0;i<loop;i++) { + + switch(i%2) { + case 0: + io_prep_pwrite(&iocb, rwfd, buf, size, 0); + break; + case 1: + io_prep_pread(&iocb, rwfd, buf, size, 0); + } + + res = io_submit(io_ctx, 1, iocbs); + if (res != 1) { + printf("child: submit: io_submit res=%d [%s]\n", res, +strerror(-res)); + _exit(1); + } + } + + res = ftruncate(rwfd, 0); assert(res == 0); + + _exit(0); +} + +/* from 12.t */ +int test_main(void) +{ + int res, status; + pid_t pid; + + if (attempt_io_submit(io_ctx, 0, NULL, 0)) + return 1; + + sigblock(sigmask(SIGCHLD) | siggetmask()); + fflush(NULL); + pid = fork(); assert(pid != -1); + + if (pid == 0) + test_child(); + + res = waitpid(pid, &status, 0); + + if (WIFEXITED(status)) { + int failed = (WEXITSTATUS(status) != 0); + printf("child exited with status %d%s\n", WEXITSTATUS(status), + failed ? 
" -- FAILED" : ""); + return failed; + } + + /* anything else: failed */ + if (WIFSIGNALED(status)) + printf("child killed by signal %d -- FAILED.\n", + WTERMSIG(status)); + + return 1; +} diff --git a/tools/libaio/harness/cases/2.t b/tools/libaio/harness/cases/2.t new file mode 100644 index 0000000000..3a0212d698 --- /dev/null +++ b/tools/libaio/harness/cases/2.t @@ -0,0 +1,41 @@ +/* 2.t +- io_setup (#2) + - with invalid context pointer + - with maxevents <= 0 + - with an already initialized ctxp +*/ + +int attempt(int n, io_context_t *ctxp, int expect) +{ + int res; + + printf("expect %3d: io_setup(%5d, %p) = ", expect, n, ctxp); + fflush(stdout); + res = io_setup(n, ctxp); + printf("%3d [%s]%s\n", res, strerror(-res), + (res != expect) ? " -- FAILED" : ""); + if (res != expect) + return 1; + + return 0; +} + +int test_main(void) +{ + io_context_t ctx; + int status = 0; + + ctx = NULL; + status |= attempt(-1000, KERNEL_RW_POINTER, -EFAULT); + status |= attempt( 1000, KERNEL_RW_POINTER, -EFAULT); + status |= attempt( 0, KERNEL_RW_POINTER, -EFAULT); + status |= attempt(-1000, &ctx, -EINVAL); + status |= attempt( -1, &ctx, -EINVAL); + status |= attempt( 0, &ctx, -EINVAL); + assert(ctx == NULL); + status |= attempt( 1, &ctx, 0); + status |= attempt( 1, &ctx, -EINVAL); + + return status; +} + diff --git a/tools/libaio/harness/cases/3.t b/tools/libaio/harness/cases/3.t new file mode 100644 index 0000000000..7773d80f06 --- /dev/null +++ b/tools/libaio/harness/cases/3.t @@ -0,0 +1,25 @@ +/* 3.t +- io_submit/io_getevents with invalid addresses (3.t) + +*/ +#include "aio_setup.h" + +int test_main(void) +{ + struct iocb a, b; + struct iocb *good_ios[] = { &a, &b }; + struct iocb *bad1_ios[] = { NULL, &b }; + struct iocb *bad2_ios[] = { KERNEL_RW_POINTER, &a }; + int status = 0; + + status |= attempt_io_submit(BAD_CTX, 1, good_ios, -EINVAL); + status |= attempt_io_submit( io_ctx, 0, good_ios, 0); + status |= attempt_io_submit( io_ctx, 1, NULL, -EFAULT); + status |= 
attempt_io_submit( io_ctx, 1, (void *)-1, -EFAULT); + status |= attempt_io_submit( io_ctx, 2, bad1_ios, -EFAULT); + status |= attempt_io_submit( io_ctx, 2, bad2_ios, -EFAULT); + status |= attempt_io_submit( io_ctx, -1, good_ios, -EINVAL); + + return status; +} + diff --git a/tools/libaio/harness/cases/4.t b/tools/libaio/harness/cases/4.t new file mode 100644 index 0000000000..972b4f24b1 --- /dev/null +++ b/tools/libaio/harness/cases/4.t @@ -0,0 +1,72 @@ +/* 4.t +- read of descriptor without read permission (4.t) +- write to descriptor without write permission (4.t) +- check that O_APPEND writes actually append + +*/ +#include "aio_setup.h" + +#define SIZE 512 +#define READ 'r' +#define WRITE 'w' +int attempt(int fd, void *buf, int count, long long pos, int rw, int expect) +{ + struct iocb iocb; + int res; + + switch(rw) { + case READ: io_prep_pread (&iocb, fd, buf, count, pos); break; + case WRITE: io_prep_pwrite(&iocb, fd, buf, count, pos); break; + } + + printf("expect %3d: (%c), res = ", expect, rw); + fflush(stdout); + res = sync_submit(&iocb); + printf("%3d [%s]%s\n", res, (res <= 0) ? strerror(-res) : "Success", + (res != expect) ? 
" -- FAILED" : ""); + if (res != expect) + return 1; + + return 0; +} + +int test_main(void) +{ + char buf[SIZE]; + int rofd, wofd, rwfd; + int status = 0, res; + + memset(buf, 0, SIZE); + + rofd = open("testdir/rofile", O_RDONLY); assert(rofd != -1); + wofd = open("testdir/wofile", O_WRONLY); assert(wofd != -1); + rwfd = open("testdir/rwfile", O_RDWR); assert(rwfd != -1); + + status |= attempt(rofd, buf, SIZE, 0, WRITE, -EBADF); + status |= attempt(wofd, buf, SIZE, 0, READ, -EBADF); + status |= attempt(rwfd, buf, SIZE, 0, WRITE, SIZE); + status |= attempt(rwfd, buf, SIZE, 0, READ, SIZE); + status |= attempt(rwfd, buf, SIZE, -1, READ, -EINVAL); + status |= attempt(rwfd, buf, SIZE, -1, WRITE, -EINVAL); + + rwfd = open("testdir/rwfile", O_RDWR|O_APPEND); assert(rwfd != -1); + res = ftruncate(rwfd, 0); assert(res == 0); + status |= attempt(rwfd, buf, SIZE, 0, READ, 0); + status |= attempt(rwfd, "1234", 4, 0, WRITE, 4); + status |= attempt(rwfd, "5678", 4, 0, WRITE, 4); + memset(buf, 0, SIZE); + status |= attempt(rwfd, buf, SIZE, 0, READ, 8); + printf("read after append: [%s]\n", buf); + assert(memcmp(buf, "12345678", 8) == 0); + + status |= attempt(rwfd, KERNEL_RW_POINTER, SIZE, 0, READ, -EFAULT); + status |= attempt(rwfd, KERNEL_RW_POINTER, SIZE, 0, WRITE, -EFAULT); + + /* Some architectures map the 0 page. Ugh. */ +#if !defined(__ia64__) + status |= attempt(rwfd, NULL, SIZE, 0, WRITE, -EFAULT); +#endif + + return status; +} + diff --git a/tools/libaio/harness/cases/5.t b/tools/libaio/harness/cases/5.t new file mode 100644 index 0000000000..7669fd7006 --- /dev/null +++ b/tools/libaio/harness/cases/5.t @@ -0,0 +1,47 @@ +/* 5.t +- Write from a mmap() of the same file. 
(5.t) +*/ +#include "aio_setup.h" +#include <sys/mman.h> + +int test_main(void) +{ + int page_size = getpagesize(); +#define SIZE 512 + char *buf; + int rwfd; + int status = 0, res; + + rwfd = open("testdir/rwfile", O_RDWR); assert(rwfd != -1); + res = ftruncate(rwfd, 512); assert(res == 0); + + buf = mmap(0, page_size, PROT_READ|PROT_WRITE, MAP_SHARED, rwfd, 0); + assert(buf != (char *)-1); + + status |= attempt_rw(rwfd, buf, SIZE, 0, WRITE, SIZE); + status |= attempt_rw(rwfd, buf, SIZE, 0, READ, SIZE); + + res = munmap(buf, page_size); assert(res == 0); + buf = mmap(0, page_size, PROT_READ|PROT_WRITE, MAP_SHARED, rwfd, 0); + assert(buf != (char *)-1); + + status |= attempt_rw(rwfd, buf, SIZE, 0, READ, SIZE); + status |= attempt_rw(rwfd, buf, SIZE, 0, WRITE, SIZE); + + res = munmap(buf, page_size); assert(res == 0); + buf = mmap(0, page_size, PROT_READ, MAP_SHARED, rwfd, 0); + assert(buf != (char *)-1); + + status |= attempt_rw(rwfd, buf, SIZE, 0, WRITE, SIZE); + status |= attempt_rw(rwfd, buf, SIZE, 0, READ, -EFAULT); + + res = munmap(buf, page_size); assert(res == 0); + buf = mmap(0, page_size, PROT_WRITE, MAP_SHARED, rwfd, 0); + assert(buf != (char *)-1); + + status |= attempt_rw(rwfd, buf, SIZE, 0, READ, SIZE); + status |= attempt_rw(rwfd, buf, SIZE, 0, WRITE, -EFAULT); + + return status; +} + diff --git a/tools/libaio/harness/cases/6.t b/tools/libaio/harness/cases/6.t new file mode 100644 index 0000000000..cea4b01c96 --- /dev/null +++ b/tools/libaio/harness/cases/6.t @@ -0,0 +1,57 @@ +/* 6.t +- huge reads (pinned pages) (6.t) +- huge writes (6.t) +*/ +#include "aio_setup.h" +#include <sys/mman.h> + +long getmemsize(void) +{ + FILE *f = fopen("/proc/meminfo", "r"); + long size; + int gotit = 0; + char str[256]; + + assert(f != NULL); + while (NULL != fgets(str, 255, f)) { + str[255] = 0; + if (0 == memcmp(str, "MemTotal:", 9)) { + if (1 == sscanf(str + 9, "%ld", &size)) { + gotit = 1; + break; + } + } + } + fclose(f); + + assert(gotit != 0); + return size; +} 
+ +int test_main(void) +{ + char *buf; + int rwfd; + int status = 0, res; + long size; + + size = getmemsize(); + printf("size = %ld\n", size); + assert(size >= (16 * 1024)); + if (size > (768 * 1024)) + size = 768 * 1024; + size *= 1024; + + rwfd = open("testdir/rwfile", O_RDWR); assert(rwfd != -1); + res = ftruncate(rwfd, 0); assert(res == 0); + buf = malloc(size); assert(buf != NULL); + + //memset(buf, 0, size); + status |= attempt_rw(rwfd, buf, size, 0, WRITE, size); + status |= attempt_rw(rwfd, buf, size, 0, READ, size); + + //res = ftruncate(rwfd, 0); assert(res == 0); + + return status; +} + diff --git a/tools/libaio/harness/cases/7.t b/tools/libaio/harness/cases/7.t new file mode 100644 index 0000000000..d2d6cbc653 --- /dev/null +++ b/tools/libaio/harness/cases/7.t @@ -0,0 +1,27 @@ +/* 7.t +- Write overlapping the file size rlimit boundary: should be a short + write. (7.t) +- Write at the file size rlimit boundary: should give EFBIG. (I think + the spec requires that you do NOT deliver SIGXFSZ in this case, where + you would do so for sync IO.) (7.t) +- Special case: a write of zero bytes at or beyond the file size rlimit + boundary must return success. (7.t) +*/ + +#include <sys/resource.h> + +void SET_RLIMIT(long long limit) +{ + struct rlimit rlim; + int res; + + rlim.rlim_cur = limit; assert(rlim.rlim_cur == limit); + rlim.rlim_max = limit; assert(rlim.rlim_max == limit); + + res = setrlimit(RLIMIT_FSIZE, &rlim); assert(res == 0); +} + +#define LIMIT 8192 +#define FILENAME "testdir/rwfile" + +#include "common-7-8.h" diff --git a/tools/libaio/harness/cases/8.t b/tools/libaio/harness/cases/8.t new file mode 100644 index 0000000000..8a3d83ec94 --- /dev/null +++ b/tools/libaio/harness/cases/8.t @@ -0,0 +1,49 @@ +/* 8.t +- Ditto for the above three tests at the offset maximum (largest + possible ext2/3 file size.) 
(8.t) + */ +#include <sys/vfs.h> + +#define EXT2_OLD_SUPER_MAGIC 0xEF51 +#define EXT2_SUPER_MAGIC 0xEF53 + +long long get_fs_limit(int fd) +{ + struct statfs s; + int res; + long long lim = 0; + + res = fstatfs(fd, &s); assert(res == 0); + + switch(s.f_type) { + case EXT2_OLD_SUPER_MAGIC: + case EXT2_SUPER_MAGIC: +#if 0 + { + long long tmp; + tmp = s.f_bsize / 4; + /* 12 direct + indirect block + dind + tind */ + lim = 12 + tmp + tmp * tmp + tmp * tmp * tmp; + lim *= s.f_bsize; + printf("limit(%ld) = %Ld\n", (long)s.f_bsize, lim); + } +#endif + switch(s.f_bsize) { + case 4096: lim = 2199023251456; break; + default: + printf("unknown ext2 blocksize %ld\n", (long)s.f_bsize); + exit(3); + } + break; + default: + printf("unknown filesystem 0x%08lx\n", (long)s.f_type); + exit(3); + } + return lim; +} + +#define SET_RLIMIT(x) do ; while (0) +#define LIMIT get_fs_limit(rwfd) +#define FILENAME "testdir.ext2/rwfile" + +#include "common-7-8.h" diff --git a/tools/libaio/harness/cases/aio_setup.h b/tools/libaio/harness/cases/aio_setup.h new file mode 100644 index 0000000000..37c96189b2 --- /dev/null +++ b/tools/libaio/harness/cases/aio_setup.h @@ -0,0 +1,98 @@ +io_context_t io_ctx; +#define BAD_CTX ((io_context_t)-1) + +void aio_setup(int n) +{ + int res = io_queue_init(n, &io_ctx); + if (res != 0) { + printf("io_queue_setup(%d) returned %d (%s)\n", + n, res, strerror(-res)); + exit(3); + } +} + +int attempt_io_submit(io_context_t ctx, long nr, struct iocb *ios[], int expect) +{ + int res; + + printf("expect %3d: io_submit(%10p, %3ld, %10p) = ", expect, ctx, nr, ios); + fflush(stdout); + res = io_submit(ctx, nr, ios); + printf("%3d [%s]%s\n", res, (res <= 0) ? strerror(-res) : "", + (res != expect) ? 
" -- FAILED" : ""); + if (res != expect) + return 1; + + return 0; +} + +int sync_submit(struct iocb *iocb) +{ + struct io_event event; + struct iocb *iocbs[] = { iocb }; + int res; + + /* 30 second timeout should be enough */ + struct timespec ts; + ts.tv_sec = 30; + ts.tv_nsec = 0; + + res = io_submit(io_ctx, 1, iocbs); + if (res != 1) { + printf("sync_submit: io_submit res=%d [%s]\n", res, strerror(-res)); + return res; + } + + res = io_getevents(io_ctx, 0, 1, &event, &ts); + if (res != 1) { + printf("sync_submit: io_getevents res=%d [%s]\n", res, strerror(-res)); + return res; + } + return event.res; +} + +#define SETUP aio_setup(1024) + + +#define READ 'r' +#define WRITE 'w' +#define READ_SILENT 'R' +#define WRITE_SILENT 'W' +int attempt_rw(int fd, void *buf, int count, long long pos, int rw, int expect) +{ + struct iocb iocb; + int res; + int silent = 0; + + switch(rw) { + case READ_SILENT: + silent = 1; + case READ: + io_prep_pread (&iocb, fd, buf, count, pos); + break; + case WRITE_SILENT: + silent = 1; + case WRITE: + io_prep_pwrite(&iocb, fd, buf, count, pos); + break; + } + + if (!silent) { + printf("expect %5d: (%c), res = ", expect, rw); + fflush(stdout); + } + res = sync_submit(&iocb); + if (!silent || res != expect) { + if (silent) + printf("expect %5d: (%c), res = ", expect, rw); + printf("%5d [%s]%s\n", res, + (res <= 0) ? strerror(-res) : "Success", + (res != expect) ? 
" -- FAILED" : ""); + } + + if (res != expect) + return 1; + + return 0; +} + diff --git a/tools/libaio/harness/cases/common-7-8.h b/tools/libaio/harness/cases/common-7-8.h new file mode 100644 index 0000000000..3ec2bb439d --- /dev/null +++ b/tools/libaio/harness/cases/common-7-8.h @@ -0,0 +1,37 @@ +/* common-7-8.h +*/ +#include "aio_setup.h" + +#include <unistd.h> + +#define SIZE 512 + +int test_main(void) +{ + char *buf; + int rwfd; + int status = 0, res; + long long limit; + + rwfd = open(FILENAME, O_RDWR); assert(rwfd != -1); + res = ftruncate(rwfd, 0); assert(res == 0); + buf = malloc(SIZE); assert(buf != NULL); + memset(buf, 0, SIZE); + + limit = LIMIT; + + SET_RLIMIT(limit); + + status |= attempt_rw(rwfd, buf, SIZE, limit-SIZE, WRITE, SIZE); + status |= attempt_rw(rwfd, buf, SIZE, limit-SIZE, READ, SIZE); + + status |= attempt_rw(rwfd, buf, SIZE, 1+limit-SIZE, WRITE, SIZE-1); + status |= attempt_rw(rwfd, buf, SIZE, 1+limit-SIZE, READ, SIZE-1); + + status |= attempt_rw(rwfd, buf, SIZE, limit, WRITE, -EFBIG); + status |= attempt_rw(rwfd, buf, SIZE, limit, READ, 0); + status |= attempt_rw(rwfd, buf, 0, limit, WRITE, 0); + + return status; +} + diff --git a/tools/libaio/harness/main.c b/tools/libaio/harness/main.c new file mode 100644 index 0000000000..74b2764620 --- /dev/null +++ b/tools/libaio/harness/main.c @@ -0,0 +1,39 @@ +#include <stdio.h> +#include <errno.h> +#include <assert.h> +#include <stdlib.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <libaio.h> + +#if defined(__i386__) +#define KERNEL_RW_POINTER ((void *)0xc0010000) +#else +//#warning Not really sure where kernel memory is. Guessing. +#define KERNEL_RW_POINTER ((void *)0xffffffffc0010000) +#endif + + +char test_name[] = TEST_NAME; + +#include TEST_NAME + +int main(void) +{ + int res; + +#if defined(SETUP) + SETUP; +#endif + + res = test_main(); + printf("test %s completed %s.\n", test_name, + res ? 
"FAILED" : "PASSED" + ); + fflush(stdout); + return res ? 1 : 0; +} diff --git a/tools/libaio/harness/runtests.sh b/tools/libaio/harness/runtests.sh new file mode 100644 index 0000000000..d763d88b31 --- /dev/null +++ b/tools/libaio/harness/runtests.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +passes=0 +fails=0 + +echo "Test run starting at" `date` + +while [ $# -ge 1 ] ; do + this_test=$1 + shift + echo "Starting $this_test" + $this_test 2>&1 + res=$? + if [ $res -eq 0 ] ; then str="" ; passes=$[passes + 1] ; else str=" -- FAILED" ; fails=$[fails + 1] ; fi + echo "Completed $this_test with $res$str". +done + +echo "Pass: $passes Fail: $fails" +echo "Test run complete at" `date` diff --git a/tools/libaio/libaio.spec b/tools/libaio/libaio.spec new file mode 100644 index 0000000000..1f16c91b07 --- /dev/null +++ b/tools/libaio/libaio.spec @@ -0,0 +1,177 @@ +Name: libaio +Version: 0.3.104 +Release: 1 +Summary: Linux-native asynchronous I/O access library +Copyright: LGPL +Group: System Environment/Libraries +Source: %{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-root +# Fix ExclusiveArch as we implement this functionality on more architectures +ExclusiveArch: i386 x86_64 ia64 s390 s390x ppc ppc64 ppc64pseries ppc64iseries alpha alphaev6 + +%description +The Linux-native asynchronous I/O facility ("async I/O", or "aio") has a +richer API and capability set than the simple POSIX async I/O facility. +This library, libaio, provides the Linux-native API for async I/O. +The POSIX async I/O facility requires this library in order to provide +kernel-accelerated async I/O capabilities, as do applications which +require the Linux-native async I/O API. + +%package devel +Summary: Development files for Linux-native asynchronous I/O access +Group: Development/System +Requires: libaio +Provides: libaio.so.1 + +%description devel +This package provides header files to include and libraries to link with +for the Linux-native asynchronous I/O facility ("async I/O", or "aio"). 
+ +%prep +%setup + +%build +make + +%install +[ "$RPM_BUILD_ROOT" != "/" ] && rm -rf $RPM_BUILD_ROOT + +make install prefix=$RPM_BUILD_ROOT/usr \ + libdir=$RPM_BUILD_ROOT/%{_libdir} \ + root=$RPM_BUILD_ROOT + +%clean +[ "$RPM_BUILD_ROOT" != "/" ] && rm -rf $RPM_BUILD_ROOT + +%post -p /sbin/ldconfig + +%postun -p /sbin/ldconfig + +%files +%defattr(-,root,root) +%attr(0755,root,root) %{_libdir}/libaio.so.* +%doc COPYING TODO + +%files devel +%defattr(-,root,root) +%attr(0644,root,root) %{_includedir}/* +%attr(0755,root,root) %{_libdir}/libaio.so +%attr(0644,root,root) %{_libdir}/libaio.a + +%changelog +* Fri Apr 1 2005 Jeff Moyer <jmoyer@redhat.com> - 0.3.104-1 +- Add Alpha architecture support. (Sergey Tikhonov <tsv@solvo.ru>) + +* Tue Jan 25 2005 Jeff Moyer <jmoyer@redhat.com> - 0.3.103-1 +- Fix SONAME breakage. In changing file names around, I also changed the + SONAME, which is a no no. + +* Thu Oct 14 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.102-1 +- S390 asm had a bug; I forgot to update the clobber list. Lucky for me, + newer compilers complain about such things. +- Also update the s390 asm to look more like the new kernel variants. + +* Wed Oct 13 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.101-1 +- Revert syscall return values to be -ERRNO. This was an inadvertant bug + introduced when clobber lists changed. +- add ppc64pseries and ppc64iseries to exclusivearch + +* Tue Sep 14 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.100-1 +- Switch around the tests for _PPC_ and _powerpc64_ so that the ppc64 + platforms get the right padding. + +* Wed Jul 14 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.99-4 +- Ok, there was a race in moving the cvs module. Someone rebuild from + the old cvs into fc3. *sigh* bumping rev. + +* Wed Jul 14 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.99-3 +- Actually provide libaio.so.1. + +* Tue Mar 30 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.99-2 +- Apparently the 0.3.93 patch was not meant for 0.3.96. Backed it out. 
+ +* Tue Mar 30 2004 Jeff Moyer <jmoyer@redhat.com> - 0.3.99-1 +- Fix compat calls. +- make library .so.1.0.0 and make symlinks properly. +- Fix header file for inclusion in c++ code. + +* Thu Feb 26 2004 Jeff Moyer <jmoyer@redhat.com> 0.3.98-2 +- bah. fix version nr in changelog. + +* Thu Feb 26 2004 Jeff Moyer <jmoyer@redhat.com> 0.3.98-1 +- fix compiler warnings. + +* Thu Feb 26 2004 Jeff Moyer <jmoyer@redhat.com> 0.3.97-2 +- make srpm was using rpm to do a build. changed that to use rpmbuild if + it exists, and fallback to rpm if it doesn't. + +* Tue Feb 24 2004 Jeff Moyer <jmoyer@redhat.com> 0.3.97-1 +- Use libc syscall(2) instead of rolling our own calling mechanism. This + change is inspired due to a failure to build with newer gcc, since clobber + lists were wrong. +- Add -fpic to the CFLAGS for all architectures. Should address bz #109457. +- change a #include from <linux/types.h> to <sys/types.h>. Fixes a build + issue on s390. + +* Wed Jul 7 2003 Bill Nottingham <notting@redhat.com> 0.3.96-3 +- fix paths on lib64 arches + +* Wed Jun 18 2003 Michael K. Johnson <johnsonm@redhat.com> 0.3.96-2 +- optimization in io_getevents from Arjan van de Ven in 0.3.96-1 +- deal with ia64 in 0.3.96-2 + +* Wed May 28 2003 Michael K. Johnson <johnsonm@redhat.com> 0.3.95-1 +- ppc bugfix from Julie DeWandel + +* Tue May 20 2003 Michael K. Johnson <johnsonm@redhat.com> 0.3.94-1 +- symbol versioning fix from Ulrich Drepper + +* Mon Jan 27 2003 Benjamin LaHaise <bcrl@redhat.com> +- bump to 0.3.93-3 for rebuild. 
+ +* Mon Dec 16 2002 Benjamin LaHaise <bcrl@redhat.com> +- libaio 0.3.93 test release +- add powerpc support from Gianni Tedesco <gianni@ecsc.co.uk> +- add s/390 support from Arnd Bergmann <arnd@bergmann-dalldorf.de> + +* Fri Sep 12 2002 Benjamin LaHaise <bcrl@redhat.com> +- libaio 0.3.92 test release +- build on x86-64 + +* Thu Sep 12 2002 Benjamin LaHaise <bcrl@redhat.com> +- libaio 0.3.91 test release +- build on ia64 +- remove libredhat-kernel from the .spec file + +* Thu Sep 5 2002 Benjamin LaHaise <bcrl@redhat.com> +- libaio 0.3.90 test release + +* Mon Apr 29 2002 Benjamin LaHaise <bcrl@redhat.com> +- add requires initscripts >= 6.47-1 to get boot time libredhat-kernel + linkage correct. +- typo fix + +* Thu Apr 25 2002 Benjamin LaHaise <bcrl@redhat.com> +- make /usr/lib/libredhat-kernel.so point to /lib/libredhat-kernel.so.1.0.0 + +* Mon Apr 15 2002 Tim Powers <timp@redhat.com> +- make the post scriptlet not use /bin/sh + +* Sat Apr 12 2002 Benjamin LaHaise <bcrl@redhat.com> +- add /lib/libredhat-kernel* to %files. + +* Fri Apr 12 2002 Benjamin LaHaise <bcrl@redhat.com> +- make the dummy install as /lib/libredhat-kernel.so.1.0.0 so + that ldconfig will link against it if no other is installed. + +* Tue Jan 22 2002 Benjamin LaHaise <bcrl@redhat.com> +- add io_getevents + +* Tue Jan 22 2002 Michael K. Johnson <johnsonm@redhat.com> +- Make linker happy with /usr/lib symlink for libredhat-kernel.so + +* Mon Jan 21 2002 Michael K. Johnson <johnsonm@redhat.com> +- Added stub library + +* Sun Jan 20 2002 Michael K. 
Johnson <johnsonm@redhat.com> +- Initial packaging diff --git a/tools/libaio/man/aio.3 b/tools/libaio/man/aio.3 new file mode 100644 index 0000000000..6dc3c63a8f --- /dev/null +++ b/tools/libaio/man/aio.3 @@ -0,0 +1,315 @@ +.TH aio 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +aio \- Asynchronous IO +.SH SYNOPSIS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.fi +.SH DESCRIPTION +The POSIX.1b standard defines a new set of I/O operations which can +significantly reduce the time an application spends waiting at I/O. The +new functions allow a program to initiate one or more I/O operations and +then immediately resume normal work while the I/O operations are +executed in parallel. This functionality is available if the +.IR "unistd.h" +file defines the symbol +.B "_POSIX_ASYNCHRONOUS_IO" +. + +These functions are part of the library with realtime functions named +.IR "librt" . +They are not actually part of the +.IR "libc" +binary. +The implementation of these functions can be done using support in the +kernel (if available) or using an implementation based on threads at +userlevel. In the latter case it might be necessary to link applications +with the thread library +.IR "libpthread" +in addition to +.IR "librt" +and +.IR "libaio" +. + +All AIO operations operate on files which were opened previously. There +might be arbitrarily many operations running for one file. The +asynchronous I/O operations are controlled using a data structure named +.IR "struct aiocb" . +It is defined in +.IR "aio.h" + as follows. + +.nf +struct aiocb +{ + int aio_fildes; /* File descriptor. */ + int aio_lio_opcode; /* Operation to be performed. */ + int aio_reqprio; /* Request priority offset. */ + volatile void *aio_buf; /* Location of buffer. */ + size_t aio_nbytes; /* Length of transfer. */ + struct sigevent aio_sigevent; /* Signal number and value. */ + + /* Internal members.
*/ + struct aiocb *__next_prio; + int __abs_prio; + int __policy; + int __error_code; + __ssize_t __return_value; + +#ifndef __USE_FILE_OFFSET64 + __off_t aio_offset; /* File offset. */ + char __pad[sizeof (__off64_t) - sizeof (__off_t)]; +#else + __off64_t aio_offset; /* File offset. */ +#endif + char __unused[32]; +}; + +.fi +The POSIX.1b standard mandates that the +.IR "struct aiocb" +structure +contains at least the members described in the following table. There +might be more elements which are used by the implementation, but +depending upon these elements is not portable and is highly deprecated. + +.TP +.IR "int aio_fildes" +This element specifies the file descriptor to be used for the +operation. It must be a legal descriptor, otherwise the operation will +fail. + +The device on which the file is opened must allow the seek operation. +I.e., it is not possible to use any of the AIO operations on devices +like terminals where an +.IR "lseek" + call would lead to an error. +.TP +.IR "off_t aio_offset" +This element specifies the offset in the file at which the operation (input +or output) is performed. Since the operations are carried out in arbitrary +order and more than one operation for one file descriptor can be +started, one cannot expect a current read/write position of the file +descriptor. +.TP +.IR "volatile void *aio_buf" +This is a pointer to the buffer with the data to be written or the place +where the read data is stored. +.TP +.IR "size_t aio_nbytes" +This element specifies the length of the buffer pointed to by +.IR "aio_buf" +. +.TP +.IR "int aio_reqprio" +If the platform has defined +.B "_POSIX_PRIORITIZED_IO" +and +.B "_POSIX_PRIORITY_SCHEDULING" +, the AIO requests are +processed based on the current scheduling priority. The +.IR "aio_reqprio" +element can then be used to lower the priority of the +AIO operation. 
+.TP +.IR "struct sigevent aio_sigevent" +This element specifies how the calling process is notified once the +operation terminates. If the +.IR "sigev_notify" +element is +.B "SIGEV_NONE" +, no notification is sent. If it is +.B "SIGEV_SIGNAL" +, +the signal determined by +.IR "sigev_signo" +is sent. Otherwise, +.IR "sigev_notify" +must be +.B "SIGEV_THREAD" +. In this case, a thread +is created which starts executing the function pointed to by +.IR "sigev_notify_function" +. +.TP +.IR "int aio_lio_opcode" +This element is only used by the +.IR "lio_listio" + and +.IR "lio_listio64" + functions. Since these functions allow an +arbitrary number of operations to start at once, and each operation can be +input or output (or nothing), the information must be stored in the +control block. The possible values are: +.TP +.B "LIO_READ" +Start a read operation. Read from the file at position +.IR "aio_offset" + and store the next +.IR "aio_nbytes" + bytes in the +buffer pointed to by +.IR "aio_buf" +. +.TP +.B "LIO_WRITE" +Start a write operation. Write +.IR "aio_nbytes" +bytes starting at +.IR "aio_buf" +into the file starting at position +.IR "aio_offset" +. +.TP +.B "LIO_NOP" +Do nothing for this control block. This value is useful sometimes when +an array of +.IR "struct aiocb" +values contains holes, i.e., some of the +values must not be handled although the whole array is presented to the +.IR "lio_listio" +function. + +When the sources are compiled using +.B "_FILE_OFFSET_BITS == 64" +on a +32 bit machine, this type is in fact +.IR "struct aiocb64" +, since the LFS +interface transparently replaces the +.IR "struct aiocb" +definition. +.PP +For use with the AIO functions defined in the LFS, there is a similar type +defined which replaces the types of the appropriate members with larger +types but otherwise is equivalent to +.IR "struct aiocb" +. Particularly, +all member names are the same. + +.nf +/* The same for the 64bit offsets. 
Please note that the members aio_fildes + to __return_value have to be the same in aiocb and aiocb64. */ +#ifdef __USE_LARGEFILE64 +struct aiocb64 +{ + int aio_fildes; /* File descriptor. */ + int aio_lio_opcode; /* Operation to be performed. */ + int aio_reqprio; /* Request priority offset. */ + volatile void *aio_buf; /* Location of buffer. */ + size_t aio_nbytes; /* Length of transfer. */ + struct sigevent aio_sigevent; /* Signal number and value. */ + + /* Internal members. */ + struct aiocb *__next_prio; + int __abs_prio; + int __policy; + int __error_code; + __ssize_t __return_value; + + __off64_t aio_offset; /* File offset. */ + char __unused[32]; +}; + +.fi +.TP +.IR "int aio_fildes" +This element specifies the file descriptor which is used for the +operation. It must be a legal descriptor since otherwise the operation +fails for obvious reasons. +The device on which the file is opened must allow the seek operation. +I.e., it is not possible to use any of the AIO operations on devices +like terminals where an +.IR "lseek" + call would lead to an error. +.TP +.IR "off64_t aio_offset" +This element specifies at which offset in the file the operation (input +or output) is performed. Since the operations are carried out in arbitrary +order and more than one operation for one file descriptor can be +started, one cannot expect a current read/write position of the file +descriptor. +.TP +.IR "volatile void *aio_buf" +This is a pointer to the buffer with the data to be written or the place +where the read data is stored. +.TP +.IR "size_t aio_nbytes" +This element specifies the length of the buffer pointed to by +.IR "aio_buf" +. +.TP +.IR "int aio_reqprio" +If for the platform +.B "_POSIX_PRIORITIZED_IO" +and +.B "_POSIX_PRIORITY_SCHEDULING" +are defined, the AIO requests are +processed based on the current scheduling priority. The +.IR "aio_reqprio" +element can then be used to lower the priority of the +AIO operation.
+.TP +.IR "struct sigevent aio_sigevent" +This element specifies how the calling process is notified once the +operation terminates. If the +.IR "sigev_notify" +element is +.B "SIGEV_NONE" +, no notification is sent. If it is +.B "SIGEV_SIGNAL" +, +the signal determined by +.IR "sigev_signo" +is sent. Otherwise, +.IR "sigev_notify" + must be +.B "SIGEV_THREAD" +, in which case a thread +is created which starts executing the function pointed to by +.IR "sigev_notify_function" +. +.TP +.IR "int aio_lio_opcode" +This element is only used by the +.IR "lio_listio" +and +.IR "lio_listio64" +functions. Since these functions allow an +arbitrary number of operations to start at once, and since each operation can be +input or output (or nothing), the information must be stored in the +control block. See the description of +.IR "struct aiocb" +for a description +of the possible values. +.PP +When the sources are compiled using +.B "_FILE_OFFSET_BITS == 64" +on a +32 bit machine, this type is available under the name +.IR "struct aiocb64" +, since the LFS transparently replaces the old interface.
+.SH "RETURN VALUES" +.SH ERRORS +.SH "SEE ALSO" +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_cancel.3 b/tools/libaio/man/aio_cancel.3 new file mode 100644 index 0000000000..502c83c3da --- /dev/null +++ b/tools/libaio/man/aio_cancel.3 @@ -0,0 +1,137 @@ +.TH aio_cancel 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_cancel - Cancel asynchronous I/O requests +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_cancel (int fildes " , struct aiocb *aiocbp " )" +.fi +.SH DESCRIPTION +When one or more requests are asynchronously processed, it might be +useful in some situations to cancel a selected operation, e.g., if it +becomes obvious that the written data is no longer accurate and would +have to be overwritten soon. As an example, assume an application, which +writes data in files in a situation where new incoming data would have +to be written in a file which will be updated by an enqueued request. +The POSIX AIO implementation provides such a function, but this function +is not capable of forcing the cancellation of the request. It is up to the +implementation to decide whether it is possible to cancel the operation +or not. Therefore using this function is merely a hint. +.B "The libaio implementation does not implement the cancel operation in the" +.B "POSIX libraries". +.PP +The +.IR aio_cancel +function can be used to cancel one or more +outstanding requests. If the +.IR aiocbp +parameter is +.IR NULL +, the +function tries to cancel all of the outstanding requests which would process +the file descriptor +.IR fildes +(i.e., whose +.IR aio_fildes +member +is +.IR fildes +). 
If +.IR aiocbp " is not" +.IR NULL +, +.IR aio_cancel +attempts to cancel the specific request pointed to by +.IR aiocbp. + +For requests which were successfully canceled, the normal notification +about the termination of the request should take place. I.e., depending +on the +.IR "struct sigevent" +object which controls this, nothing +happens, a signal is sent or a thread is started. If the request cannot +be canceled, it terminates the usual way after performing the operation. +After a request is successfully canceled, a call to +.IR aio_error +with +a reference to this request as the parameter will return +.B ECANCELED +and a call to +.IR aio_return +will return +.IR -1. +If the request wasn't canceled and is still running the error status is +still +.B EINPROGRESS. +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +, this +function is in fact +.IR aio_cancel64 +since the LFS interface +transparently replaces the normal implementation. + +.SH "RETURN VALUES" +.TP +.B AIO_CANCELED +If there were +requests which haven't terminated and which were successfully canceled. +.TP +.B AIO_NOTCANCELED +If there is one or more requests left which couldn't be canceled. +In this case +.IR aio_error +must be used to find out which of the, perhaps multiple, requests (if +.IR aiocbp +is +.IR NULL +) weren't successfully canceled. +.TP +.B AIO_ALLDONE +If all +requests already terminated at the time +.IR aio_cancel +is called the +return value is +.BR AIO_ALLDONE . +.SH ERRORS +If an error occurred during the execution of +.IR aio_cancel +the +function returns +.IR -1 +and sets +.IR errno +to one of the following +values. +.TP +.B EBADF +The file descriptor +.IR fildes +is not valid. +.TP +.B ENOSYS +.IR aio_cancel +is not implemented.
+.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_cancel64.3 b/tools/libaio/man/aio_cancel64.3 new file mode 100644 index 0000000000..ede775be5e --- /dev/null +++ b/tools/libaio/man/aio_cancel64.3 @@ -0,0 +1,50 @@ +.TH aio_cancel64 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +aio_cancel64 \- Cancel asynchronous I/O requests +.SH SYNOPSIS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_cancel64 (int fildes, struct aiocb64 *aiocbp)" +.fi +.SH DESCRIPTION +This function is similar to +.IR aio_cancel +with the only difference +that the argument is a reference to a variable of type +.IR "struct aiocb64" +. + +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +, this +function is available under the name +.IR aio_cancel +and so +transparently replaces the interface for small files on 32 bit +machines. +.SH "RETURN VALUES" +See aio_cancel(3). +.SH ERRORS +See aio_cancel(3).
+.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_error.3 b/tools/libaio/man/aio_error.3 new file mode 100644 index 0000000000..12b82cf894 --- /dev/null +++ b/tools/libaio/man/aio_error.3 @@ -0,0 +1,81 @@ +.TH aio_error 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_error \- Getting the Status of AIO Operations +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_error (const struct aiocb *aiocbp)" +.fi +.SH DESCRIPTION +The function +.IR aio_error +determines the error state of the request described by the +.IR "struct aiocb" +variable pointed to by +.I aiocbp +. + +When the operation is performed truly asynchronously (as with +.IR "aio_read" +and +.IR "aio_write" +and with +.IR "lio_listio" +when the mode is +.IR "LIO_NOWAIT" +), one sometimes needs to know whether a +specific request already terminated and if so, what the result was. +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +this function is in fact +.IR "aio_error64" +since the LFS interface transparently replaces the normal implementation. +.SH "RETURN VALUES" +If the request has not yet terminated the value returned is always +.IR "EINPROGRESS" +. Once the request has terminated the value +.IR "aio_error" +returns is either +.I 0 +if the request completed successfully or it returns the value which would be stored in the +.IR "errno" +variable if the request would have been done using +.IR "read" +, +.IR "write" +, or +.IR "fsync" +. +.SH ERRORS +.TP +.IR "ENOSYS" +if it is not implemented. 
It +could also return +.TP +.IR "EINVAL" +if the +.I aiocbp +parameter does not +refer to an asynchronous operation whose return status is not yet known. +.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_error64.3 b/tools/libaio/man/aio_error64.3 new file mode 100644 index 0000000000..3333161d9a --- /dev/null +++ b/tools/libaio/man/aio_error64.3 @@ -0,0 +1,64 @@ +.TH aio_error64 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_error64 \- Return errors +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_error64 (const struct aiocb64 *aiocbp)" +.fi +.SH DESCRIPTION +This function is similar to +.IR aio_error +with the only difference +that the argument is a reference to a variable of type +.IR "struct aiocb64". +.PP +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +this +function is available under the name +.IR aio_error +and so +transparently replaces the interface for small files on 32 bit +machines. +.SH "RETURN VALUES" +If the request has not yet terminated the value returned is always +.IR "EINPROGRESS" +. Once the request has terminated the value +.IR "aio_error" +returns is either +.I 0 +if the request completed successfully or it returns the value which would be stored in the +.IR "errno" +variable if the request would have been done using +.IR "read" +, +.IR "write" +, or +.IR "fsync" +. +.SH ERRORS +See +.IR aio_error(3). 
+.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_fsync.3 b/tools/libaio/man/aio_fsync.3 new file mode 100644 index 0000000000..637f0f63d4 --- /dev/null +++ b/tools/libaio/man/aio_fsync.3 @@ -0,0 +1,139 @@ +.TH aio_fsync 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +aio_fsync \- Synchronize a file's complete in-core state with that on disk +.SH SYNOPSIS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_fsync (int op, struct aiocb *aiocbp)" +.fi +.SH DESCRIPTION +.PP +When dealing with asynchronous operations it is sometimes necessary to +get into a consistent state. This would mean for AIO that one wants to +know whether a certain request or a group of requests were processed. +This could be done by waiting for the notification sent by the system +after the operation terminated, but this sometimes would mean wasting +resources (mainly computation time). Instead POSIX.1b defines two +functions which will help with most kinds of consistency. +.PP +The +.IR aio_fsync +and +.IR "aio_fsync64" +functions are only available +if the symbol +.IR "_POSIX_SYNCHRONIZED_IO" +is defined in +.I unistd.h +. + +Calling this function forces all I/O operations queued at the +time of the function call and operating on the file descriptor +.IR "aiocbp->aio_fildes" +into the synchronized I/O completion state. The +.IR "aio_fsync" +function returns +immediately but the notification through the method described in +.IR "aiocbp->aio_sigevent" +will happen only after all requests for this +file descriptor have terminated and the file is synchronized.
This also +means that requests for this very same file descriptor which are queued +after the synchronization request are not affected. + +If +.IR "op" +is +.IR "O_DSYNC" +the synchronization happens as with a call +to +.IR "fdatasync" . +Otherwise +.IR "op" +should be +.IR "O_SYNC" +and +the synchronization happens as with +.IR "fsync" +. + +As long as the synchronization has not happened, a call to +.IR "aio_error" +with the reference to the object pointed to by +.IR "aiocbp" +returns +.IR "EINPROGRESS" . +Once the synchronization is +done +.IR "aio_error" +returns +.IR 0 +if the synchronization was +successful. Otherwise the value returned is the value to which the +.IR "fsync" +or +.IR "fdatasync" +function would have set the +.IR "errno" +variable. In this case nothing can be assumed about the +consistency for the data written to this file descriptor. + +.SH "RETURN VALUES" +The return value of this function is +.IR 0 +if the request was +successfully enqueued. Otherwise the function returns +.IR -1 +and sets +.IR "errno". +.SH ERRORS +.TP +.B EAGAIN +The request could not be enqueued due to temporary lack of resources. +.TP +.B EBADF +The file descriptor +.IR "aiocbp->aio_fildes" +is not valid or not open +for writing. +.TP +.B EINVAL +The implementation does not support I/O synchronization or the +.IR "op" +parameter is other than +.IR "O_DSYNC" +and +.IR "O_SYNC" +. +.TP +.B ENOSYS +This function is not implemented. +.PP +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" + this +function is in fact +.IR "aio_fsync64" +since the LFS interface +transparently replaces the normal implementation.
+.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_fsync64.3 b/tools/libaio/man/aio_fsync64.3 new file mode 100644 index 0000000000..5dce22dda9 --- /dev/null +++ b/tools/libaio/man/aio_fsync64.3 @@ -0,0 +1,51 @@ +.TH aio_fsync64 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_fsync64 \- Synchronize a file's complete in-core state with that on disk +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_fsync64 (int op, struct aiocb64 *aiocbp)" +.fi +.SH DESCRIPTION +This function is similar to +.IR aio_fsync +with the only difference +that the argument is a reference to a variable of type +.IR "struct aiocb64". + +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +this +function is available under the name +.IR aio_fsync +and so +transparently replaces the interface for small files on 32 bit +machines. +.SH "RETURN VALUES" +See +.IR aio_fsync. +.SH ERRORS +See +.IR aio_fsync. 
+.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_init.3 b/tools/libaio/man/aio_init.3 new file mode 100644 index 0000000000..3b0ec95a83 --- /dev/null +++ b/tools/libaio/man/aio_init.3 @@ -0,0 +1,96 @@ +.TH aio_init 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_init \- How to optimize the AIO implementation +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "void aio_init (const struct aioinit *init)" +.fi +.SH DESCRIPTION + +The POSIX standard does not specify how the AIO functions are +implemented. They could be system calls, but it is also possible to +emulate them at userlevel. + +At the point of this writing, the available implementation is a userlevel +implementation which uses threads for handling the enqueued requests. +While this implementation requires making some decisions about +limitations, hard limitations are something which is best avoided +in the GNU C library. Therefore, the GNU C library provides a means +for tuning the AIO implementation according to the individual use. + +.BI "struct aioinit" +.PP +This data type is used to pass the configuration or tunable parameters +to the implementation. The program has to initialize the members of +this struct and pass it to the implementation using the +.IR aio_init +function. +.TP +.B "int aio_threads" +This member specifies the maximal number of threads which may be used +at any one time. +.TP +.B "int aio_num" +This number provides an estimate on the maximal number of simultaneously +enqueued requests. +.TP +.B "int aio_locks" +Unused. +.TP +.B "int aio_usedba" +Unused. +.TP +.B "int aio_debug" +Unused. +.TP +.B "int aio_numusers" +Unused. 
+.TP +.B "int aio_reserved[2]" +Unused. +.PP +This function must be called before any other AIO function. Calling it +is completely voluntary, as it is only meant to help the AIO +implementation perform better. + +Before calling the +.IR aio_init +function, the members of a variable of +type +.IR "struct aioinit" +must be initialized. Then a reference to +this variable is passed as the parameter to +.IR aio_init +which itself +may or may not pay attention to the hints. + +It is an extension which follows a proposal from the SGI implementation in +.IR "Irix 6" . +It is not covered by POSIX.1b or Unix98. +.SH "RETURN VALUES" +The function has no return value. +.SH ERRORS +The function has no error cases defined. +.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_read.3 b/tools/libaio/man/aio_read.3 new file mode 100644 index 0000000000..5bcb6c8a11 --- /dev/null +++ b/tools/libaio/man/aio_read.3 @@ -0,0 +1,146 @@ +.TH aio_read 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +aio_read \- Initiate an asynchronous read operation +.SH SYNOPSIS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_read (struct aiocb *aiocbp)" +.fi +.SH DESCRIPTION +This function initiates an asynchronous read operation. It +immediately returns after the operation was enqueued or when an +error was encountered. + +The first +.IR "aiocbp->aio_nbytes" +bytes of the file for which +.IR "aiocbp->aio_fildes" +is a descriptor are written to the buffer +starting at +.IR "aiocbp->aio_buf" . +Reading starts at the absolute +position +.IR "aiocbp->aio_offset" +in the file.
+ +If prioritized I/O is supported by the platform the +.IR "aiocbp->aio_reqprio" +value is used to adjust the priority before +the request is actually enqueued. + +The calling process is notified about the termination of the read +request according to the +.IR "aiocbp->aio_sigevent" +value. + +.SH "RETURN VALUES" +When +.IR "aio_read" +returns, the return value is zero if no error +occurred that can be found before the process is enqueued. If such an +early error is found, the function returns +.IR -1 +and sets +.IR "errno". + +.PP +If +.IR "aio_read" +returns zero, the current status of the request +can be queried using +.IR "aio_error" +and +.IR "aio_return" +functions. +As long as the value returned by +.IR "aio_error" +is +.IR "EINPROGRESS" +the operation has not yet completed. If +.IR "aio_error" +returns zero, +the operation successfully terminated, otherwise the value is to be +interpreted as an error code. If the function terminated, the result of +the operation can be obtained using a call to +.IR "aio_return" . +The +returned value is the same as an equivalent call to +.IR "read" +would +have returned. +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +this +function is in fact +.IR "aio_read64" +since the LFS interface transparently +replaces the normal implementation. + +.SH ERRORS +In the case of an early error: +.TP +.B EAGAIN +The request was not enqueued due to (temporarily) exceeded resource +limitations. +.TP +.B ENOSYS +The +.IR "aio_read" +function is not implemented. +.TP +.B EBADF +The +.IR "aiocbp->aio_fildes" +descriptor is not valid. This condition +need not be recognized before enqueueing the request and so this error +might also be signaled asynchronously. +.TP +.B EINVAL +The +.IR "aiocbp->aio_offset" +or +.IR "aiocbp->aio_reqprio" +value is +invalid. This condition need not be recognized before enqueueing the +request and so this error might also be signaled asynchronously.
+ +.PP +In the case of a normal return, possible error codes returned by +.IR "aio_error" +are: +.TP +.B EBADF +The +.IR "aiocbp->aio_fildes" +descriptor is not valid. +.TP +.B ECANCELED +The operation was canceled before the operation was finished +.TP +.B EINVAL +The +.IR "aiocbp->aio_offset" +value is invalid. +.PP +.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_read64.3 b/tools/libaio/man/aio_read64.3 new file mode 100644 index 0000000000..8e407a5591 --- /dev/null +++ b/tools/libaio/man/aio_read64.3 @@ -0,0 +1,60 @@ +.TH aio_read64 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_read64 \- Initiate an asynchronous read operation +.SH SYNOPSYS +.nf +.B #include <errno.h> +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_read64 (struct aiocb *aiocbp)" +.fi +.SH DESCRIPTION +This function is similar to the +.IR "aio_read" +function. The only +difference is that on +.IR "32 bit" +machines, the file descriptor should +be opened in the large file mode. Internally, +.IR "aio_read64" +uses +functionality equivalent to +.IR "lseek64" +to position the file descriptor correctly for the reading, +as opposed to +.IR "lseek" +functionality used in +.IR "aio_read". + +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +, this +function is available under the name +.IR "aio_read" +and so transparently +replaces the interface for small files on 32 bit machines. +.SH "RETURN VALUES" +See +.IR aio_read. +.SH ERRORS +See +.IR aio_read. 
+.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_return.3 b/tools/libaio/man/aio_return.3 new file mode 100644 index 0000000000..1e3335fdb6 --- /dev/null +++ b/tools/libaio/man/aio_return.3 @@ -0,0 +1,71 @@ +.TH aio_return 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_return \- Retrieve status of asynchronous I/O operation +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "ssize_t aio_return (const struct aiocb *aiocbp)" +.fi +.SH DESCRIPTION +This function can be used to retrieve the return status of the operation +carried out by the request described in the variable pointed to by +.IR aiocbp +. As long as the error status of this request as returned +by +.IR aio_error +is +.IR EINPROGRESS +the return of this function is +undefined. + +Once the request is finished this function can be used exactly once to +retrieve the return value. Following calls might lead to undefined +behavior. +When the sources are compiled with +.B "_FILE_OFFSET_BITS == 64" +this function is in fact +.IR aio_return64 +since the LFS interface +transparently replaces the normal implementation. +.SH "RETURN VALUES" +The return value itself is the value which would have been +returned by the +.IR read +, +.IR write +, or +.IR fsync +call. +.SH ERRORS +The function can return +.TP +.B ENOSYS +if it is not implemented. +.TP +.B EINVAL +if the +.IR aiocbp +parameter does not +refer to an asynchronous operation whose return status is not yet known. 
+.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_return64.3 b/tools/libaio/man/aio_return64.3 new file mode 100644 index 0000000000..7e78362b32 --- /dev/null +++ b/tools/libaio/man/aio_return64.3 @@ -0,0 +1,51 @@ +.TH aio_return64 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +aio_return64 \- Retrieve status of asynchronous I/O operation +.SH SYNOPSIS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "ssize_t aio_return64 (const struct aiocb64 *aiocbp)" +.fi +.SH DESCRIPTION +This function is similar to +.IR "aio_return" +with the only difference +that the argument is a reference to a variable of type +.IR "struct aiocb64". + +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +this +function is available under the name +.IR "aio_return" +and so +transparently replaces the interface for small files on 32 bit +machines. +.SH "RETURN VALUES" +See +.IR aio_return. +.SH ERRORS +See +.IR aio_return. +.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_suspend.3 b/tools/libaio/man/aio_suspend.3 new file mode 100644 index 0000000000..cae1b65691 --- /dev/null +++ b/tools/libaio/man/aio_suspend.3 @@ -0,0 +1,123 @@ +.TH aio_suspend 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +aio_suspend \- Wait until one or more requests of a specific set terminates.
+.SH SYNOPSYS +.nf +.B "#include <errno.h>" +.sp +.br +.B "#include <aio.h>" +.sp +.br +.BI "int aio_suspend (const struct aiocb *const list[], int nent, const struct timespec *timeout)" +.fi +.SH DESCRIPTION +Another method of synchronization is to wait until one or more requests of a +specific set terminated. This could be achieved by the +.IR "aio_*" +functions to notify the initiating process about the termination but in +some situations this is not the ideal solution. In a program which +constantly updates clients somehow connected to the server it is not +always the best solution to go round robin since some connections might +be slow. On the other hand letting the +.IR "aio_*" +function notify the +caller might also be not the best solution since whenever the process +works on preparing data for on client it makes no sense to be +interrupted by a notification since the new client will not be handled +before the current client is served. For situations like this +.IR "aio_suspend" +should be used. +.PP +When calling this function, the calling thread is suspended until at +least one of the requests pointed to by the +.IR "nent" +elements of the +array +.IR "list" +has completed. If any of the requests has already +completed at the time +.IR "aio_suspend" +is called, the function returns +immediately. Whether a request has terminated or not is determined by +comparing the error status of the request with +.IR "EINPROGRESS" +. If +an element of +.IR "list" +is +.IR "NULL" +, the entry is simply ignored. + +If no request has finished, the calling process is suspended. If +.IR "timeout" +is +.IR "NULL" +, the process is not woken until a request +has finished. If +.IR "timeout" +is not +.IR "NULL" +, the process remains +suspended at least as long as specified in +.IR "timeout" +. In this case, +.IR "aio_suspend" +returns with an error. 
+.PP +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +this +function is in fact +.IR "aio_suspend64" +since the LFS interface +transparently replaces the normal implementation. +.SH "RETURN VALUES" +The return value of the function is +.IR 0 +if one or more requests +from the +.IR "list" +have terminated. Otherwise the function returns +.IR -1 +and +.IR "errno" +is set. +.SH ERRORS +.TP +.B EAGAIN +None of the requests from the +.IR "list" +completed in the time specified +by +.IR "timeout" +. +.TP +.B EINTR +A signal interrupted the +.IR "aio_suspend" +function. This signal might +also be sent by the AIO implementation while signalling the termination +of one of the requests. +.TP +.B ENOSYS +The +.IR "aio_suspend" +function is not implemented. +.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_suspend64.3 b/tools/libaio/man/aio_suspend64.3 new file mode 100644 index 0000000000..2f289ecceb --- /dev/null +++ b/tools/libaio/man/aio_suspend64.3 @@ -0,0 +1,51 @@ +.TH aio_suspend64 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_suspend64 \- Wait until one or more requests of a specific set terminates +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_suspend64 (const struct aiocb64 *const list[], int nent, const struct timespec *timeout)" +.fi +.SH DESCRIPTION +This function is similar to +.IR "aio_suspend" +with the only difference +that the argument is a reference to a variable of type +.IR "struct aiocb64". 
+ +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +this +function is available under the name +.IR "aio_suspend" +and so +transparently replaces the interface for small files on 32 bit +machines. +.SH "RETURN VALUES" +See +.IR aio_suspend. +.SH ERRORS +See +.IR aio_suspend. +.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_write(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_write.3 b/tools/libaio/man/aio_write.3 new file mode 100644 index 0000000000..7c0cfd0bf7 --- /dev/null +++ b/tools/libaio/man/aio_write.3 @@ -0,0 +1,176 @@ +.TH aio_write 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_write \- Initiate an asynchronous write operation +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_write (struct aiocb * aiocbp);" +.fi +.SH DESCRIPTION +This function initiates an asynchronous write operation. The function +call immediately returns after the operation was enqueued or if before +this happens an error was encountered. + +The first +.IR "aiocbp->aio_nbytes" +bytes from the buffer starting at +.IR "aiocbp->aio_buf" +are written to the file for which +.IR "aiocbp->aio_fildes" +is an descriptor, starting at the absolute +position +.IR "aiocbp->aio_offset" +in the file. + +If prioritized I/O is supported by the platform, the +.IR "aiocbp->aio_reqprio " +value is used to adjust the priority before +the request is actually enqueued. + +The calling process is notified about the termination of the read +request according to the +.IR "aiocbp->aio_sigevent" +value. + +When +.IR "aio_write" +returns, the return value is zero if no error +occurred that can be found before the process is enqueued. 
If such an +early error is found the function returns +.IR -1 +and sets +.IR "errno" +to one of the following values. + +.TP +.B EAGAIN +The request was not enqueued due to (temporarily) exceeded resource +limitations. +.TP +.B ENOSYS +The +.IR "aio_write" +function is not implemented. +.TP +.B EBADF +The +.IR "aiocbp->aio_fildes" +descriptor is not valid. This condition +may not be recognized before enqueueing the request, and so this error +might also be signaled asynchronously. +.TP +.B EINVAL +The +.IR "aiocbp->aio_offset" +or +.IR "aiocbp->aio_reqprio" +value is +invalid. This condition may not be recognized before enqueueing the +request and so this error might also be signaled asynchronously. +.PP + +In the case +.IR "aio_write" +returns zero, the current status of the +request can be queried using +.IR "aio_error" +and +.IR "aio_return" +functions. As long as the value returned by +.IR "aio_error" +is +.IR "EINPROGRESS" +the operation has not yet completed. If +.IR "aio_error" +returns zero, the operation successfully terminated, +otherwise the value is to be interpreted as an error code. If the +function terminated, the result of the operation can be get using a call +to +.IR "aio_return" +. The returned value is the same as an equivalent +call to +.IR "read" +would have returned. Possible error codes returned +by +.IR "aio_error" +are: + +.TP +.B EBADF +The +.IR "aiocbp->aio_fildes" +descriptor is not valid. +.TP +.B ECANCELED +The operation was canceled before the operation was finished. +.TP +.B EINVAL +The +.IR "aiocbp->aio_offset" +value is invalid. +.PP +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +, this +function is in fact +.IR "aio_write64" +since the LFS interface transparently +replaces the normal implementation. +.SH "RETURN VALUES" +When +.IR "aio_write" +returns, the return value is zero if no error +occurred that can be found before the process is enqueued. 
If such an +early error is found the function returns +.IR -1 +and sets +.IR "errno" +to one of the following values. +.SH ERRORS +.TP +.B EAGAIN +The request was not enqueued due to (temporarily) exceeded resource +limitations. +.TP +.B ENOSYS +The +.IR "aio_write" +function is not implemented. +.TP +.B EBADF +The +.IR "aiocbp->aio_fildes" +descriptor is not valid. This condition +may not be recognized before enqueueing the request, and so this error +might also be signaled asynchronously. +.TP +.B EINVAL +The +.IR "aiocbp->aio_offset" +or +.IR "aiocbp->aio_reqprio" +value is +invalid. This condition may not be recognized before enqueueing the +request and so this error might also be signaled asynchronously. +.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write64(3), +.BR errno(3), diff --git a/tools/libaio/man/aio_write64.3 b/tools/libaio/man/aio_write64.3 new file mode 100644 index 0000000000..1080903aca --- /dev/null +++ b/tools/libaio/man/aio_write64.3 @@ -0,0 +1,61 @@ +.TH aio_write64 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +aio_write64 \- Initiate an asynchronous write operation +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <aio.h> +.sp +.br +.BI "int aio_write64 (struct aiocb *aiocbp)" +.fi +.SH DESCRIPTION +This function is similar to the +.IR "aio_write" +function. The only +difference is that on +.IR "32 bit" +machines the file descriptor should +be opened in the large file mode. Internally +.IR "aio_write64" +uses +functionality equivalent to +.IR "lseek64" +to position the file descriptor correctly for the writing, +as opposed to +.IR "lseek" +functionality used in +.IR "aio_write". 
+ +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +, this +function is available under the name +.IR "aio_write" +and so transparently +replaces the interface for small files on 32 bit machines. +.SH "RETURN VALUES" +See +.IR aio_write. +.SH ERRORS +See +.IR aio_write. +.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR errno(3), diff --git a/tools/libaio/man/io.3 b/tools/libaio/man/io.3 new file mode 100644 index 0000000000..d910a689f5 --- /dev/null +++ b/tools/libaio/man/io.3 @@ -0,0 +1,351 @@ +.TH io 3 2002-09-12 "Linux 2.4" Linux IO" +.SH NAME +io \- Asynchronous IO +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <libio.h> +.sp +.fi +.SH DESCRIPTION +The libaio library defines a new set of I/O operations which can +significantly reduce the time an application spends waiting at I/O. The +new functions allow a program to initiate one or more I/O operations and +then immediately resume normal work while the I/O operations are +executed in parallel. + +These functions are part of the library with realtime functions named +.IR "libaio" +. They are not actually part of the +.IR "libc" +binary. +The implementation of these functions can be done using support in the +kernel. + +All IO operations operate on files which were opened previously. There +might be arbitrarily many operations running for one file. The +asynchronous I/O operations are controlled using a data structure named +.IR "struct iocb" +It is defined in +.IR "libio.h" +as follows. 
+ +.nf + +typedef struct io_context *io_context_t; + +typedef enum io_iocb_cmd { + IO_CMD_PREAD = 0, + IO_CMD_PWRITE = 1, + + IO_CMD_FSYNC = 2, + IO_CMD_FDSYNC = 3, + + IO_CMD_POLL = 5, + IO_CMD_NOOP = 6, +} io_iocb_cmd_t; + +struct io_iocb_common { + void *buf; + unsigned __pad1; + long nbytes; + unsigned __pad2; + long long offset; + long long __pad3, __pad4; +}; /* result code is the amount read or -'ve errno */ + + +struct iocb { + void *data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; + union { + struct io_iocb_common c; + struct io_iocb_vector v; + struct io_iocb_poll poll; + struct io_iocb_sockaddr saddr; + } u; +}; + + +.fi +.TP +.IR "int aio_fildes" +This element specifies the file descriptor to be used for the +operation. It must be a legal descriptor, otherwise the operation will +fail. + +The device on which the file is opened must allow the seek operation. +I.e., it is not possible to use any of the IO operations on devices +like terminals where an +.IR "lseek" +call would lead to an error. +.TP +.IR "long u.c.offset" +This element specifies the offset in the file at which the operation (input +or output) is performed. Since the operations are carried out in arbitrary +order and more than one operation for one file descriptor can be +started, one cannot expect a current read/write position of the file +descriptor. +.TP +.IR "void *buf" +This is a pointer to the buffer with the data to be written or the place +where the read data is stored. +.TP +.IR "long u.c.nbytes" +This element specifies the length of the buffer pointed to by +.IR "io_buf" +. +.TP +.IR "int aio_reqprio" +Is not currently used. +.TP +.B "IO_CMD_PREAD" +Start a read operation. Read from the file at position +.IR "u.c.offset" +and store the next +.IR "u.c.nbytes" +bytes in the +buffer pointed to by +.IR "buf" +. +.TP +.B "IO_CMD_PWRITE" +Start a write operation. 
Write +.IR "u.c.nbytes" +bytes starting at +.IR "buf" +into the file starting at position +.IR "u.c.offset" +. +.TP +.B "IO_CMD_NOP" +Do nothing for this control block. This value is useful sometimes when +an array of +.IR "struct iocb" +values contains holes, i.e., some of the +values must not be handled although the whole array is presented to the +.IR "io_submit" +function. +.TP +.B "IO_CMD_FSYNC" +.TP +.B "IO_CMD_POLL" +This is experimental. +.SH EXAMPLE +.nf +/* + * Simplistic version of copy command using async i/o + * + * From: Stephen Hemminger <shemminger@osdl.org> + * Copy file by using a async I/O state machine. + * 1. Start read request + * 2. When read completes turn it into a write request + * 3. When write completes decrement counter and free resources + * + * + * Usage: aiocp file(s) desination + */ + +#include <unistd.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/param.h> +#include <fcntl.h> +#include <errno.h> + +#include <libaio.h> + +#define AIO_BLKSIZE (64*1024) +#define AIO_MAXIO 32 + +static int busy = 0; // # of I/O's in flight +static int tocopy = 0; // # of blocks left to copy +static int dstfd = -1; // destination file descriptor +static const char *dstname = NULL; +static const char *srcname = NULL; + + +/* Fatal error handler */ +static void io_error(const char *func, int rc) +{ + if (rc == -ENOSYS) + fprintf(stderr, "AIO not in this kernel\n"); + else if (rc < 0 && -rc < sys_nerr) + fprintf(stderr, "%s: %s\n", func, sys_errlist[-rc]); + else + fprintf(stderr, "%s: error %d\n", func, rc); + + if (dstfd > 0) + close(dstfd); + if (dstname) + unlink(dstname); + exit(1); +} + +/* + * Write complete callback. 
+ * Adjust counts and free resources + */ +static void wr_done(io_context_t ctx, struct iocb *iocb, long res, long res2) +{ + if (res2 != 0) { + io_error("aio write", res2); + } + if (res != iocb->u.c.nbytes) { + fprintf(stderr, "write missed bytes expect %d got %d\n", iocb->u.c.nbytes, res2); + exit(1); + } + --tocopy; + --busy; + free(iocb->u.c.buf); + + memset(iocb, 0xff, sizeof(iocb)); // paranoia + free(iocb); + write(2, "w", 1); +} + +/* + * Read complete callback. + * Change read iocb into a write iocb and start it. + */ +static void rd_done(io_context_t ctx, struct iocb *iocb, long res, long res2) +{ + /* library needs accessors to look at iocb? */ + int iosize = iocb->u.c.nbytes; + char *buf = iocb->u.c.buf; + off_t offset = iocb->u.c.offset; + + if (res2 != 0) + io_error("aio read", res2); + if (res != iosize) { + fprintf(stderr, "read missing bytes expect %d got %d\n", iocb->u.c.nbytes, res); + exit(1); + } + + + /* turn read into write */ + io_prep_pwrite(iocb, dstfd, buf, iosize, offset); + io_set_callback(iocb, wr_done); + if (1 != (res = io_submit(ctx, 1, &iocb))) + io_error("io_submit write", res); + write(2, "r", 1); +} + + +int main(int argc, char *const *argv) +{ + int srcfd; + struct stat st; + off_t length = 0, offset = 0; + io_context_t myctx; + + if (argc != 3 || argv[1][0] == '-') { + fprintf(stderr, "Usage: aiocp SOURCE DEST"); + exit(1); + } + if ((srcfd = open(srcname = argv[1], O_RDONLY)) < 0) { + perror(srcname); + exit(1); + } + if (fstat(srcfd, &st) < 0) { + perror("fstat"); + exit(1); + } + length = st.st_size; + + if ((dstfd = open(dstname = argv[2], O_WRONLY | O_CREAT, 0666)) < 0) { + close(srcfd); + perror(dstname); + exit(1); + } + + /* initialize state machine */ + memset(&myctx, 0, sizeof(myctx)); + io_queue_init(AIO_MAXIO, &myctx); + tocopy = howmany(length, AIO_BLKSIZE); + + while (tocopy > 0) { + int i, rc; + /* Submit as many reads as once as possible upto AIO_MAXIO */ + int n = MIN(MIN(AIO_MAXIO - busy, AIO_MAXIO / 2), + 
howmany(length - offset, AIO_BLKSIZE)); + if (n > 0) { + struct iocb *ioq[n]; + + for (i = 0; i < n; i++) { + struct iocb *io = (struct iocb *) malloc(sizeof(struct iocb)); + int iosize = MIN(length - offset, AIO_BLKSIZE); + char *buf = (char *) malloc(iosize); + + if (NULL == buf || NULL == io) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + + io_prep_pread(io, srcfd, buf, iosize, offset); + io_set_callback(io, rd_done); + ioq[i] = io; + offset += iosize; + } + + rc = io_submit(myctx, n, ioq); + if (rc < 0) + io_error("io_submit", rc); + + busy += n; + } + + // Handle IO's that have completed + rc = io_queue_run(myctx); + if (rc < 0) + io_error("io_queue_run", rc); + + // if we have maximum number of i/o's in flight + // then wait for one to complete + if (busy == AIO_MAXIO) { + rc = io_queue_wait(myctx, NULL); + if (rc < 0) + io_error("io_queue_wait", rc); + } + + } + + close(srcfd); + close(dstfd); + exit(0); +} + +/* + * Results look like: + * [alanm@toolbox ~/MOT3]$ ../taio kernel-source-2.4.8-0.4g.ppc.rpm abc + * rrrrrrrrrrrrrrrwwwrwrrwwrrwrwwrrwrwrwwrrwrwrrrrwwrwwwrrwrrrwwwwwwwwwwwwwwwww + * rrrrrrrrrrrrrrwwwrrwrwrwrwrrwwwwwwwwwwwwwwrrrrrrrrrrrrrrrrrrwwwwrwrwwrwrwrwr + * wrrrrrrrwwwwwwwwwwwwwrrrwrrrwrrwrwwwwwwwwwwrrrrwwrwrrrrrrrrrrrwwwwwwwwwwwrww + * wwwrrrrrrrrwwrrrwwrwrwrwwwrrrrrrrwwwrrwwwrrwrwwwwwwwwrrrrrrrwwwrrrrrrrwwwwww + * wwwwwwwrwrrrrrrrrwrrwrrwrrwrwrrrwrrrwrrrwrwwwwwwwwwwwwwwwwwwrrrwwwrrrrrrrrrr + * rrwrrrrrrwrrwwwwwwwwwwwwwwwwrwwwrrwrwwrrrrrrrrrrrrrrrrrrrwwwwwwwwwwwwwwwwwww + * rrrrrwrrwrwrwrrwrrrwwwwwwwwrrrrwrrrwrwwrwrrrwrrwrrrrwwwwwwwrwrwwwwrwwrrrwrrr + * rrrwwwwwwwrrrrwwrrrrrrrrrrrrwrwrrrrwwwwwwwwwwwwwwrwrrrrwwwwrwrrrrwrwwwrrrwww + * rwwrrrrrrrwrrrrrrrrrrrrwwwwrrrwwwrwrrwwwwwwwwwwwwwwwwwwwwwrrrrrrrwwwwwwwrw + */ +.fi +.SH "SEE ALSO" +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR 
io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_cancel.1 b/tools/libaio/man/io_cancel.1 new file mode 100644 index 0000000000..16e898a7de --- /dev/null +++ b/tools/libaio/man/io_cancel.1 @@ -0,0 +1,21 @@ +.\"/* sys_io_cancel: +.\" * Attempts to cancel an iocb previously passed to io_submit. If +.\" * the operation is successfully cancelled, the resulting event is +.\" * copied into the memory pointed to by result without being placed +.\" * into the completion queue and 0 is returned. May fail with +.\" * -EFAULT if any of the data structures pointed to are invalid. +.\" * May fail with -EINVAL if aio_context specified by ctx_id is +.\" * invalid. May fail with -EAGAIN if the iocb specified was not +.\" * cancelled. Will fail with -ENOSYS if not implemented. +.\" */ +.\" +.TH io_cancel 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_cancel \- cancel io requests +.SH SYNOPSIS +.B #include <errno.h> +.br +.B #include <libaio.h> +.LP +.BI "int io_cancel(io_context_t " ctx ", struct iocb *" iocb ", struct io_event *" result ");" + diff --git a/tools/libaio/man/io_cancel.3 b/tools/libaio/man/io_cancel.3 new file mode 100644 index 0000000000..9a16084a5b --- /dev/null +++ b/tools/libaio/man/io_cancel.3 @@ -0,0 +1,65 @@ +.TH io_cancel 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_cancel \- Cancel io requests +.SH SYNOPSIS +.nf +.B #include <errno.h> +.sp +.br +.B #include <libaio.h> +.sp +.br +.BI "int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *result)" +.br +.sp +struct iocb { + void *data; /* Return in the io completion event */ + unsigned key; /* For use in identifying io requests */ + short aio_lio_opcode; + short aio_reqprio; /* Not used */ + int aio_fildes; +}; +.fi +.SH DESCRIPTION +Attempts to cancel an iocb previously passed to io_submit. If +the operation is successfully cancelled, the resulting event is +copied into the memory pointed to by result without being placed +into the completion queue.
+.PP +When one or more requests are asynchronously processed, it might be +useful in some situations to cancel a selected operation, e.g., if it +becomes obvious that the written data is no longer accurate and would +have to be overwritten soon. As an example, assume an application, which +writes data in files in a situation where new incoming data would have +to be written in a file which will be updated by an enqueued request. +.SH "RETURN VALUES" +0 is returned on success, otherwise an errno value is returned. +.SH ERRORS +.TP +.B EFAULT +If any of the data structures pointed to are invalid. +.TP +.B EINVAL +If the aio_context specified by ctx is +invalid. +.TP +.B EAGAIN +If the iocb specified was not +cancelled. +.TP +.B ENOSYS +if not implemented. +.SH "SEE ALSO" +.BR io(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_destroy.1 b/tools/libaio/man/io_destroy.1 new file mode 100644 index 0000000000..177683b8e0 --- /dev/null +++ b/tools/libaio/man/io_destroy.1 @@ -0,0 +1,17 @@ +.\"/* sys_io_destroy: +.\" * Destroy the aio_context specified. May cancel any outstanding +.\" * AIOs and block on completion. Will fail with -ENOSYS if not +.\" * implemented. May fail with -EFAULT if the context pointed to +.\" * is invalid. +.\" */ +.\" libaio provides this as io_queue_release.
+.TH io_destroy 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_destroy \- destroy an io context +.SH SYNOPSIS +.B #include <errno.h> +.br +.B #include <libaio.h> +.LP +.BI "int io_destroy(io_context_t " ctx ");" + diff --git a/tools/libaio/man/io_fsync.3 b/tools/libaio/man/io_fsync.3 new file mode 100644 index 0000000000..53eb63d278 --- /dev/null +++ b/tools/libaio/man/io_fsync.3 @@ -0,0 +1,82 @@ +.\" static inline int io_fsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd) +.\" { +.\" io_prep_fsync(iocb, fd); +.\" io_set_callback(iocb, cb); +.\" return io_submit(ctx, 1, &iocb); +.\" } +.TH io_fsync 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +io_fsync \- Synchronize a file's complete in-core state with that on disk +.SH SYNOPSIS +.nf +.B #include <errno.h> +.sp +.br +.B #include <libaio.h> +.sp +.br +.BI "int io_fsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd)" +.sp +struct iocb { + void *data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; +.sp +typedef void (*io_callback_t)(io_context_t ctx, struct iocb *iocb, long res, long res2); +.sp +.fi +.SH DESCRIPTION +When dealing with asynchronous operations it is sometimes necessary to +get into a consistent state. This would mean for AIO that one wants to +know whether a certain request or a group of requests were processed. +This could be done by waiting for the notification sent by the system +after the operation terminated, but this sometimes would mean wasting +resources (mainly computation time). +.PP +Calling this function forces all I/O operations queued at the +time of the function call operating on the file descriptor +.IR "iocb->aio_fildes" +into the synchronized I/O completion state. The +.IR "io_fsync" +function returns +immediately but the notification through the method described in +.IR "io_callback" +will happen only after all requests for this +file descriptor have terminated and the file is synchronized.
This also +means that requests for this very same file descriptor which are queued +after the synchronization request are not affected. +.SH "RETURN VALUES" +Returns 0 on success, otherwise returns errno. +.SH ERRORS +.TP +.B EFAULT +.I iocb +referenced data outside of the program's accessible address space. +.TP +.B EINVAL +.I ctx +refers to an uninitialized aio context, or the iocb pointed to by +.I iocb +is improperly initialized. +.TP +.B EBADF +The iocb contains a file descriptor that does not exist. +.TP +.B EINVAL +The file specified in the iocb does not support the given io operation. +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_getevents(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_getevents.1 b/tools/libaio/man/io_getevents.1 new file mode 100644 index 0000000000..27730b9959 --- /dev/null +++ b/tools/libaio/man/io_getevents.1 @@ -0,0 +1,29 @@ +./"/* io_getevents: +./" * Attempts to read at least min_nr events and up to nr events from +./" * the completion queue for the aio_context specified by ctx_id. May +./" * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, +./" * if nr is out of range, if when is out of range. May fail with +./" * -EFAULT if any of the memory specified to is invalid. May return +./" * 0 or < min_nr if no events are available and the timeout specified +./" * by when has elapsed, where when == NULL specifies an infinite +./" * timeout. Note that the timeout pointed to by when is relative and +./" * will be updated if not NULL and the operation blocks. Will fail +./" * with -ENOSYS if not implemented.
+./" */ +./"asmlinkage long sys_io_getevents(io_context_t ctx_id, +./" long min_nr, +./" long nr, +./" struct io_event *events, +./" struct timespec *timeout) +./" +.TH io_getevents 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_getevents \- read resulting events from io requests +.SH SYNOPSIS +.B #include <errno.h> +.br +.B #include <libaio.h> +.sp +.BI "int io_getevents(io_context_t " ctx ", long " min_nr ", long " nr ", struct io_events *" events "[], struct timespec *" timeout ");" + + diff --git a/tools/libaio/man/io_getevents.3 b/tools/libaio/man/io_getevents.3 new file mode 100644 index 0000000000..8e9ddc866a --- /dev/null +++ b/tools/libaio/man/io_getevents.3 @@ -0,0 +1,79 @@ +./"/* io_getevents: +./" * Attempts to read at least min_nr events and up to nr events from +./" * the completion queue for the aio_context specified by ctx_id. May +./" * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, +./" * if nr is out of range, if when is out of range. May fail with +./" * -EFAULT if any of the memory specified to is invalid. May return +./" * 0 or < min_nr if no events are available and the timeout specified +./" * by when has elapsed, where when == NULL specifies an infinite +./" * timeout. Note that the timeout pointed to by when is relative and +./" * will be updated if not NULL and the operation blocks. Will fail +./" * with -ENOSYS if not implemented. 
+./" */ +./"asmlinkage long sys_io_getevents(io_context_t ctx_id, +./" long min_nr, +./" long nr, +./" struct io_event *events, +./" struct timespec *timeout) +./" +.TH io_getevents 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_getevents \- Read resulting events from io requests +.SH SYNOPSIS +.nf +.B #include <errno.h> +.sp +.br +.B #include <libaio.h> +.br +.sp +struct iocb { + void *data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; +.sp +struct io_event { + unsigned PADDED(data, __pad1); + unsigned PADDED(obj, __pad2); + unsigned PADDED(res, __pad3); + unsigned PADDED(res2, __pad4); +}; +.sp +.BI "int io_getevents(io_context_t " ctx ", long " nr ", struct io_event *" events "[], struct timespec *" timeout ");" + +.fi +.SH DESCRIPTION +Attempts to read up to nr events from +the completion queue for the aio_context specified by ctx. +.SH "RETURN VALUES" +May return +0 if no events are available and the timeout specified +by when has elapsed, where when == NULL specifies an infinite +timeout. Note that the timeout pointed to by when is relative and +will be updated if not NULL and the operation blocks. Will fail +with ENOSYS if not implemented. +.SH ERRORS +.TP +.B EINVAL +if ctx_id is invalid, if min_nr is out of range, +if nr is out of range, if when is out of range. +.TP +.B EFAULT +if any of the memory specified to is invalid. 
+.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_prep_fsync.3 b/tools/libaio/man/io_prep_fsync.3 new file mode 100644 index 0000000000..4cf935acaf --- /dev/null +++ b/tools/libaio/man/io_prep_fsync.3 @@ -0,0 +1,89 @@ +./" static inline void io_prep_fsync(struct iocb *iocb, int fd) +./" { +./" memset(iocb, 0, sizeof(*iocb)); +./" iocb->aio_fildes = fd; +./" iocb->aio_lio_opcode = IO_CMD_FSYNC; +./" iocb->aio_reqprio = 0; +./" } +.TH io_prep_fsync 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +io_prep_fsync \- Synchronize a file's complete in-core state with that on disk +.SH SYNOPSYS +.nf +.B #include <errno.h> +.br +.sp +.B #include <libaio.h> +.br +.sp +.BI "static inline void io_prep_fsync(struct iocb *iocb, int fd)" +.sp +struct iocb { + void *data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; +.sp +.fi +.SH DESCRIPTION +This is an inline convenience function for setting up an iocbv for a FSYNC request. +.br +The file for which +.TP +.IR "iocb->aio_fildes = fd" +is a descriptor is set up with +the command +.TP +.IR "iocb->aio_lio_opcode = IO_CMD_FSYNC: +. +.PP +The io_prep_fsync() function shall set up an IO_CMD_FSYNC operation +to asynchronously force all I/O +operations associated with the file indicated by the file +descriptor aio_fildes member of the iocb structure referenced by +the iocb argument and queued at the time of the call to +io_submit() to the synchronized I/O completion state. The function +call shall return when the synchronization request has been +initiated or queued to the file or device (even when the data +cannot be synchronized immediately). 
+ +All currently queued I/O operations shall be completed as if by a call +to fsync(); that is, as defined for synchronized I/O file +integrity completion. If the +operation queued by io_prep_fsync() fails, then, as for fsync(), +outstanding I/O operations are not guaranteed to have +been completed. + +If io_prep_fsync() succeeds, then it is only the I/O that was queued +at the time of the call to io_submit() that is guaranteed to be +forced to the relevant completion state. The completion of +subsequent I/O on the file descriptor is not guaranteed to be +completed in a synchronized fashion. +.PP +This function returns immediately . To schedule the operation, the +function +.IR io_submit +must be called. +.PP +Simultaneous asynchronous operations using the same iocb produce +undefined results. +.SH "RETURN VALUES" +None +.SH ERRORS +None +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_prep_pread.3 b/tools/libaio/man/io_prep_pread.3 new file mode 100644 index 0000000000..5938aecc6b --- /dev/null +++ b/tools/libaio/man/io_prep_pread.3 @@ -0,0 +1,79 @@ +./" static inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) +./" { +./" memset(iocb, 0, sizeof(*iocb)); +./" iocb->aio_fildes = fd; +./" iocb->aio_lio_opcode = IO_CMD_PREAD; +./" iocb->aio_reqprio = 0; +./" iocb->u.c.buf = buf; +./" iocb->u.c.nbytes = count; +./" iocb->u.c.offset = offset; +./" } +.TH io_prep_pread 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +io_prep_pread \- Set up asynchronous read +.SH SYNOPSYS +.nf +.B #include <errno.h> +.sp +.br +.B #include <libaio.h> +.br +.sp +.BI "inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) +" +.sp +struct iocb 
{ + void *data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; +.fi +.SH DESCRIPTION +.IR io_prep_pread +is an inline convenience function designed to facilitate the initialization of +the iocb for an asynchronous read operation. + +The first +.TP +.IR "iocb->u.c.nbytes = count" +bytes of the file for which +.TP +.IR "iocb->aio_fildes = fd" +is a descriptor are written to the buffer +starting at +.TP +.IR "iocb->u.c.buf = buf" +. +.br +Reading starts at the absolute position +.TP +.IR "ioc->u.c.offset = offset" +in the file. +.PP +This function returns immediately . To schedule the operation, the +function +.IR io_submit +must be called. +.PP +Simultaneous asynchronous operations using the same iocb produce +undefined results. +.SH "RETURN VALUES" +None +.SH ERRORS +None +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_prep_pwrite.3 b/tools/libaio/man/io_prep_pwrite.3 new file mode 100644 index 0000000000..68b3500587 --- /dev/null +++ b/tools/libaio/man/io_prep_pwrite.3 @@ -0,0 +1,77 @@ +./" static inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) +./" { +./" memset(iocb, 0, sizeof(*iocb)); +./" iocb->aio_fildes = fd; +./" iocb->aio_lio_opcode = IO_CMD_PWRITE; +./" iocb->aio_reqprio = 0; +./" iocb->u.c.buf = buf; +./" iocb->u.c.nbytes = count; +./" iocb->u.c.offset = offset; +./" } +.TH io_prep_pwrite 3 2002-09-12 "Linux 2.4" Linux AIO" +.SH NAME +io_prep_pwrite \- Set up iocb for asynchronous writes +.SH SYNOPSYS +.nf +.B #include <errno.h> +.br +.sp +.B #include <libaio.h> +.br +.sp +.BI "inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) +" +.sp +struct iocb { + void 
*data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; +.fi +.SH DESCRIPTION +io_prep_write is a convenicence function for setting up parallel writes. + +The first +.TP +.IR "iocb->u.c.nbytes = count" +bytes of the file for which +.TP +.IR "iocb->aio_fildes = fd" +is a descriptor are written from the buffer +starting at +.TP +.IR "iocb->u.c.buf = buf" +. +.br +Writing starts at the absolute position +.TP +.IR "ioc->u.c.offset = offset" +in the file. +.PP +This function returns immediately . To schedule the operation, the +function +.IR io_submit +must be called. +.PP +Simultaneous asynchronous operations using the same iocb produce +undefined results. +.SH "RETURN VALUES" +None +.SH ERRORS +None +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_queue_init.3 b/tools/libaio/man/io_queue_init.3 new file mode 100644 index 0000000000..317f631cfc --- /dev/null +++ b/tools/libaio/man/io_queue_init.3 @@ -0,0 +1,63 @@ +.TH io_queue_init 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_queue_init \- Initialize asynchronous io state machine + +.SH SYNOPSIS +.nf +.B #include <errno.h> +.br +.sp +.B #include <libaio.h> +.br +.sp +.BI "int io_queue_init(int maxevents, io_context_t *ctx );" +.sp +.fi +.SH DESCRIPTION +.B io_queue_init +Attempts to create an aio context capable of receiving at least +.IR maxevents +events. +.IR ctx +must point to an aio context that already exists and must be initialized +to +.IR 0 +before the call. +If the operation is successful, *cxtp is filled with the resulting handle. +.SH "RETURN VALUES" +On success, +.B io_queue_init +returns 0. Otherwise, -error is return, where +error is one of the Exxx values defined in the Errors section. 
+.SH ERRORS +.TP +.B EFAULT +.I iocbs +referenced data outside of the program's accessible address space. +.TP +.B EINVAL +.I maxevents +is <= 0 or +.IR ctx +is an invalid memory location. +.TP +.B ENOSYS +Not implemented +.TP +.B EAGAIN +.IR "maxevents > max_aio_reqs" +where max_aio_reqs is a tunable value. +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_queue_release.3 b/tools/libaio/man/io_queue_release.3 new file mode 100644 index 0000000000..06b9ec033d --- /dev/null +++ b/tools/libaio/man/io_queue_release.3 @@ -0,0 +1,48 @@ +.TH io_queue_release 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_queue_release \- Release the context associated with the userspace handle +.SH SYNOPSIS +.nf +.B #include <errno.h> +.br +.B #include <libaio.h> +.br +.sp +.BI "int io_queue_release(io_context_t ctx)" +.sp +.SH DESCRIPTION +.B io_queue_release +destroys the context associated with the userspace handle. May cancel any outstanding +AIOs and block on completion. + +.B cts. +.SH "RETURN VALUES" +On success, +.B io_queue_release +returns 0. Otherwise, -error is returned, where +error is one of the Exxx values defined in the Errors section.
+.SH ERRORS +.TP +.B EINVAL +.I ctx +refers to an uninitialized aio context, the iocb pointed to by +.I iocbs +contains an improperly initialized iocb, +.TP +.B ENOSYS +Not implemented +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) + diff --git a/tools/libaio/man/io_queue_run.3 b/tools/libaio/man/io_queue_run.3 new file mode 100644 index 0000000000..57dd417875 --- /dev/null +++ b/tools/libaio/man/io_queue_run.3 @@ -0,0 +1,50 @@ +.TH io_queue_run 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_queue_run \- Handle completed io requests +.SH SYNOPSIS +.nf +.B #include <errno.h> +.br +.sp +.B #include <libaio.h> +.br +.sp +.BI "int io_queue_run(io_context_t ctx );" +.sp +.fi +.SH DESCRIPTION +.B io_queue_run +Attempts to read all the events from +the completion queue for the aio_context specified by ctx_id. +.SH "RETURN VALUES" +May return +0 if no events are available. +Will fail with -ENOSYS if not implemented. +.SH ERRORS +.TP +.B EFAULT +.I iocbs +referenced data outside of the program's accessible address space.
+.TP +.B EINVAL +.I ctx +refers to an uninitialized aio context, the iocb pointed to by +.I iocbs +contains an improperly initialized iocb, +.TP +.B ENOSYS +Not implemented +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_queue_wait.3 b/tools/libaio/man/io_queue_wait.3 new file mode 100644 index 0000000000..2306663eae --- /dev/null +++ b/tools/libaio/man/io_queue_wait.3 @@ -0,0 +1,56 @@ +.TH io_queue_wait 2 2002-09-03 "Linux 2.4" "Linux AIO" +.SH NAME +io_queue_wait \- Wait for io requests to complete +.SH SYNOPSIS +.nf +.B #include <errno.h> +.br +.sp +.B #include <libaio.h> +.br +.sp +.BI "int io_queue_wait(io_context_t ctx, const struct timespec *timeout);" +.fi +.SH DESCRIPTION +Attempts to read an event from +the completion queue for the aio_context specified by ctx_id. +.SH "RETURN VALUES" +May return +0 if no events are available and the timeout specified +by when has elapsed, where when == NULL specifies an infinite +timeout. Note that the timeout pointed to by when is relative and +will be updated if not NULL and the operation blocks. Will fail +with -ENOSYS if not implemented. +.SH "RETURN VALUES" +On success, +.B io_queue_wait +returns 0. Otherwise, -error is returned, where +error is one of the Exxx values defined in the Errors section. +.SH ERRORS +.TP +.B EFAULT +.I iocbs +referenced data outside of the program's accessible address space.
+.TP +.B EINVAL +.I ctx +refers to an uninitialized aio context, the iocb pointed to by +.I iocbs +contains an improperly initialized iocb, +.TP +.B ENOSYS +Not implemented +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_set_callback(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_set_callback.3 b/tools/libaio/man/io_set_callback.3 new file mode 100644 index 0000000000..a8ca789eb2 --- /dev/null +++ b/tools/libaio/man/io_set_callback.3 @@ -0,0 +1,44 @@ +./"static inline void io_set_callback(struct iocb *iocb, io_callback_t cb) +.TH io_set_callback 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +io_set_callback \- Set up io completion callback function +.SH SYNOPSIS +.nf +.B #include <errno.h> +.br +.sp +.B #include <libaio.h> +.br +.sp +.BI "static inline void io_set_callback(struct iocb *iocb, io_callback_t cb)" +.sp +struct iocb { + void *data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; +.sp +typedef void (*io_callback_t)(io_context_t ctx, struct iocb *iocb, long res, long res2); +.sp +.fi +.SH DESCRIPTION +The callback is not done if the caller uses raw events from +io_getevents, only with the library helpers. +.SH "RETURN VALUES" +.SH ERRORS +.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_submit(3), +.BR errno(3) diff --git a/tools/libaio/man/io_setup.1 b/tools/libaio/man/io_setup.1 new file mode 100644 index 0000000000..68690e1e73 --- /dev/null +++ b/tools/libaio/man/io_setup.1 @@ -0,0 +1,15 @@ +./"/* sys_io_setup: +./" * Create an aio_context capable of receiving at least nr_events.
+./" * ctxp must not point to an aio_context that already exists, and +./" * must be initialized to 0 prior to the call. On successful +./" * creation of the aio_context, *ctxp is filled in with the resulting +./" * handle. May fail with -EINVAL if *ctxp is not initialized, +./" * if the specified nr_events exceeds internal limits. May fail +./" * with -EAGAIN if the specified nr_events exceeds the user's limit +./" * of available events. May fail with -ENOMEM if insufficient kernel +./" * resources are available. May fail with -EFAULT if an invalid +./" * pointer is passed for ctxp. Will fail with -ENOSYS if not +./" * implemented. +./" */ +./" -- note: libaio is actually providing io_queue_init and io_queue_grow +./" as separate functions. For now io_setup is the same as io_queue_grow. diff --git a/tools/libaio/man/io_submit.1 b/tools/libaio/man/io_submit.1 new file mode 100644 index 0000000000..f66e80f1b5 --- /dev/null +++ b/tools/libaio/man/io_submit.1 @@ -0,0 +1,109 @@ +.TH io_submit 2 2002-09-02 "Linux 2.4" "Linux AIO" +.SH NAME +io_submit \- submit io requests +.SH SYNOPSIS +.B #include <errno.h> +.br +.B #include <libaio.h> +.LP +.BI "int io_submit(io_context_t " ctx ", long " nr ", struct iocb *" iocbs "[]);" +.SH DESCRIPTION +.B io_submit +submits to the io_context +.I ctx +up to +.I nr +I/O requests pointed to by the vector +.IR iocbs . + +The +.B iocb +structure is defined as something like +.sp +.RS +.nf +struct iocb { + void *data; +.\" unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; +.fi +.RE +.sp +.I data +is a an opaque pointer which will upon completion be returned in the +.B io_event +structure by +.BR io_getevents (2). +.\" and io_wait(2) +Callers will typically use this to point directly or indirectly to a +callback function. +.sp +.I aio_lio_opcode +is the I/O operation requested. 
Callers will typically set this and the +arguments to the I/O operation calling the +.BR io_prep_ (3) +function corresponding to the operation. +.sp +.I aio_reqprio +is the priority of the request. Higher values have more priority; the +normal priority is 0. +.sp +.I aio_fildes +is the file descriptor for the I/O operation. +Callers will typically set this and the +arguments to the I/O operation calling the +.BR io_prep_ *(3) +function corresponding to the operation. +.sp +The caller may not modify the contents or resubmit a submitted +.B iocb +structure until after the operation completes or is canceled. +The implementation of +.BR io_submit (2) +is permitted to modify reserved fields of the +.B iocb +structure. +.SH "RETURN VALUES" +If able to submit at least one iocb, +.B io_submit +returns the number of iocbs submitted successfully. Otherwise, +.RI - error +is returned, where +.I error +is one of the Exxx values defined in the Errors section. +.SH ERRORS +.TP +.B EFAULT +.I iocbs +referenced data outside of the program's accessible address space. +.TP +.B EINVAL +.I nr +is negative, +.I ctx +refers to an uninitialized aio context, the iocb pointed to by +.IR iocbs [0] +is improperly initialized or specifies an unsupported operation. +.TP +.B EBADF +The iocb pointed to by +.IR iocbs [0] +contains a file descriptor that does not exist. +.TP +.B EAGAIN +Insufficient resources were available to queue any operations. +.SH "SEE ALSO" +.BR io_setup (2), +.BR io_destroy (2), +.BR io_getevents (2), +.\".BR io_wait (2), +.BR io_prep_pread (3), +.BR io_prep_pwrite (3), +.BR io_prep_fsync (3), +.BR io_prep_fdsync (3), +.BR io_prep_noop (3), +.BR io_cancel (2), +.BR errno (3) diff --git a/tools/libaio/man/io_submit.3 b/tools/libaio/man/io_submit.3 new file mode 100644 index 0000000000..b6966efd8b --- /dev/null +++ b/tools/libaio/man/io_submit.3 @@ -0,0 +1,135 @@ +./"/* sys_io_submit: +./" * Queue the nr iocbs pointed to by iocbpp for processing. 
Returns +./" * the number of iocbs queued. May return -EINVAL if the aio_context +./" * specified by ctx_id is invalid, if nr is < 0, if the iocb at +./" * *iocbpp[0] is not properly initialized, if the operation specified +./" * is invalid for the file descriptor in the iocb. May fail with +./" * -EFAULT if any of the data structures point to invalid data. May +./" * fail with -EBADF if the file descriptor specified in the first +./" * iocb is invalid. May fail with -EAGAIN if insufficient resources +./" * are available to queue any iocbs. Will return 0 if nr is 0. Will +./" * fail with -ENOSYS if not implemented. +./" */ +.TH io_submit 2 2002-09-02 "Linux 2.4" "Linux AIO" +.SH NAME +io_submit \- Submit io requests +.SH SYNOPSIS +.nf +.B #include <errno.h> +.br +.sp +.B #include <libaio.h> +.br +.sp +.BI "int io_submit(io_context_t " ctx ", long " nr ", struct iocb *" iocbs "[]);" +.sp +struct iocb { + void *data; + unsigned key; + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; +}; +.fi +.SH DESCRIPTION +.B io_submit +submits +.I nr +iocbs for processing for a given io context ctx. + +The +.IR "io_submit" +function can be used to enqueue an arbitrary +number of read and write requests at one time. The requests can all be +meant for the same file, all for different files or every solution in +between. + +.IR "io_submit" +gets the +.IR "nr" +requests from the array pointed to +by +.IR "iocbs" +. The operation to be performed is determined by the +.IR "aio_lio_opcode" +member in each element of +.IR "iocbs" +. If this +field is +.B "IO_CMD_PREAD" +a read operation is enqueued, similar to a call +of +.IR "io_prep_pread" +for this element of the array (except that the way +the termination is signalled is different, as we will see below). If +the +.IR "aio_lio_opcode" +member is +.B "IO_CMD_PWRITE" +a write operation +is enqueued. Otherwise the +.IR "aio_lio_opcode" +must be +.B "IO_CMD_NOP" +in which case this element of +.IR "iocbs" +is simply ignored. 
This +``operation'' is useful in situations where one has a fixed array of +.IR "struct iocb" +elements from which only a few need to be handled at +a time. Another situation is where the +.IR "io_submit" +call was +canceled before all requests are processed and the remaining requests have to be reissued. + +The other members of each element of the array pointed to by +.IR "iocbs" +must have values suitable for the operation as described in +the documentation for +.IR "io_prep_pread" +and +.IR "io_prep_pwrite" +above. + +The function returns immediately after +having enqueued all the requests. +On success, +.B io_submit +returns the number of iocbs submitted successfully. Otherwise, -error is returned, where +error is one of the Exxx values defined in the Errors section. +.PP +If an error is detected, then the behavior is undefined. +.PP +Simultaneous asynchronous operations using the same iocb produce +undefined results. +.SH ERRORS +.TP +.B EFAULT +.I iocbs +referenced data outside of the program's accessible address space. +.TP +.B EINVAL +.I ctx +refers to an uninitialized aio context, the iocb pointed to by +.I iocbs +contains an improperly initialized iocb, +.TP +.B EBADF +The iocb contains a file descriptor that does not exist. +.TP +.B EINVAL +The file specified in the iocb does not support the given io operation.
+.SH "SEE ALSO" +.BR io(3), +.BR io_cancel(3), +.BR io_fsync(3), +.BR io_getevents(3), +.BR io_prep_fsync(3), +.BR io_prep_pread(3), +.BR io_prep_pwrite(3), +.BR io_queue_init(3), +.BR io_queue_release(3), +.BR io_queue_run(3), +.BR io_queue_wait(3), +.BR io_set_callback(3), +.BR errno(3) diff --git a/tools/libaio/man/lio_listio.3 b/tools/libaio/man/lio_listio.3 new file mode 100644 index 0000000000..9b5b5e4eb5 --- /dev/null +++ b/tools/libaio/man/lio_listio.3 @@ -0,0 +1,229 @@ +.TH lio_listio 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +lio_listio \- List directed I/O +.SH SYNOPSIS +.B #include <errno.h> +.br +.B #include <libaio.h> +.LP +.BI "int lio_listio (int mode, struct aiocb *const list[], int nent, struct sigevent *sig)" +.nf +.SH DESCRIPTION + +Besides these functions with the more or less traditional interface, +POSIX.1b also defines a function which can initiate more than one +operation at a time, and which can handle freely mixed read and write +operations. It is therefore similar to a combination of +.IR readv +and +.IR "writev" +. + +The +.IR "lio_listio" +function can be used to enqueue an arbitrary +number of read and write requests at one time. The requests can all be +meant for the same file, all for different files or every solution in +between. + +.IR "lio_listio" +gets the +.IR "nent" +requests from the array pointed to +by +.IR "list" +. The operation to be performed is determined by the +.IR "aio_lio_opcode" +member in each element of +.IR "list" +. If this +field is +.B "LIO_READ" +a read operation is enqueued, similar to a call +of +.IR "aio_read" +for this element of the array (except that the way +the termination is signalled is different, as we will see below). If +the +.IR "aio_lio_opcode" +member is +.B "LIO_WRITE" +a write operation +is enqueued. Otherwise the +.IR "aio_lio_opcode" +must be +.B "LIO_NOP" +in which case this element of +.IR "list" +is simply ignored.
This +``operation'' is useful in situations where one has a fixed array of +.IR "struct aiocb" +elements from which only a few need to be handled at +a time. Another situation is where the +.IR "lio_listio" +call was +canceled before all requests are processed and the remaining requests have to be reissued. + +The other members of each element of the array pointed to by +.IR "list" +must have values suitable for the operation as described in +the documentation for +.IR "aio_read" +and +.IR "aio_write" +above. + +The +.IR "mode" +argument determines how +.IR "lio_listio" +behaves after +having enqueued all the requests. If +.IR "mode" +is +.B "LIO_WAIT" +it +waits until all requests terminated. Otherwise +.IR "mode" +must be +.B "LIO_NOWAIT" +and in this case the function returns immediately after +having enqueued all the requests. In this case the caller gets a +notification of the termination of all requests according to the +.IR "sig" +parameter. If +.IR "sig" +is +.B "NULL" +no notification is +send. Otherwise a signal is sent or a thread is started, just as +described in the description for +.IR "aio_read" +or +.IR "aio_write" +. + +When the sources are compiled with +.B "_FILE_OFFSET_BITS == 64" +, this +function is in fact +.IR "lio_listio64" +since the LFS interface +transparently replaces the normal implementation. +.SH "RETURN VALUES" +If +.IR "mode" +is +.B "LIO_WAIT" +, the return value of +.IR "lio_listio" +is +.IR 0 +when all requests completed successfully. Otherwise the +function return +.IR 1 +and +.IR "errno" +is set accordingly. To find +out which request or requests failed one has to use the +.IR "aio_error" +function on all the elements of the array +.IR "list" +. + +In case +.IR "mode" +is +.B "LIO_NOWAIT" +, the function returns +.IR 0 +if +all requests were enqueued correctly. The current state of the requests +can be found using +.IR "aio_error" +and +.IR "aio_return" +as described +above. 
If +.IR "lio_listio" +returns +.IR -1 +in this mode, the +global variable +.IR "errno" +is set accordingly. If a request did not +yet terminate, a call to +.IR "aio_error" +returns +.B "EINPROGRESS" +. If +the value is different, the request is finished and the error value (or + +.IR 0 +) is returned and the result of the operation can be retrieved +using +.IR "aio_return" +. +.SH ERRORS +Possible values for +.IR "errno" +are: + +.TP +.B EAGAIN +The resources necessary to queue all the requests are not available at +the moment. The error status for each element of +.IR "list" +must be +checked to determine which request failed. + +Another reason could be that the system wide limit of AIO requests is +exceeded. This cannot be the case for the implementation on GNU systems +since no arbitrary limits exist. +.TP +.B EINVAL +The +.IR "mode" +parameter is invalid or +.IR "nent" +is larger than +.B "AIO_LISTIO_MAX" +. +.TP +.B EIO +One or more of the request's I/O operations failed. The error status of +each request should be checked to determine which one failed. +.TP +.B ENOSYS +The +.IR "lio_listio" +function is not supported. +.PP + +If the +.IR "mode" +parameter is +.B "LIO_NOWAIT" +and the caller cancels +a request, the error status for this request returned by +.IR "aio_error" +is +.B "ECANCELED" +. 
+.SH "SEE ALSO" +.BR aio(3), +.BR aio_cancel(3), +.BR aio_cancel64(3), +.BR aio_error(3), +.BR aio_error64(3), +.BR aio_fsync(3), +.BR aio_fsync64(3), +.BR aio_init(3), +.BR aio_read(3), +.BR aio_read64(3), +.BR aio_return(3), +.BR aio_return64(3), +.BR aio_suspend(3), +.BR aio_suspend64(3), +.BR aio_write(3), +.BR aio_write64(3) diff --git a/tools/libaio/man/lio_listio64.3 b/tools/libaio/man/lio_listio64.3 new file mode 100644 index 0000000000..97f69556c0 --- /dev/null +++ b/tools/libaio/man/lio_listio64.3 @@ -0,0 +1,39 @@ +.TH lio_listio64 3 2002-09-12 "Linux 2.4" "Linux AIO" +.SH NAME +lio_listio64 \- List directed I/O +.SH SYNOPSIS +.B #include <errno.h> +.br +.B #include <libaio.h> +.LP +.BI "int lio_listio64 (int mode, struct aiocb *const list[], int nent, struct sigevent *sig)" +.nf +.SH DESCRIPTION +This function is similar to the +.IR "lio_listio" +function. The only +difference is that on +.IR "32 bit" +machines, the file descriptor should +be opened in the large file mode. Internally, +.IR "lio_listio64" +uses +functionality equivalent to +.IR "lseek64" +to position the file descriptor correctly for the reading or +writing, as opposed to +.IR "lseek" +functionality used in +.IR "lio_listio". + +When the sources are compiled with +.IR "_FILE_OFFSET_BITS == 64" +, this +function is available under the name +.IR "lio_listio" +and so +transparently replaces the interface for small files on 32 bit +machines. +.SH "RETURN VALUES" +.SH ERRORS +.SH "SEE ALSO" diff --git a/tools/libaio/src/Makefile b/tools/libaio/src/Makefile new file mode 100644 index 0000000000..8d134cc005 --- /dev/null +++ b/tools/libaio/src/Makefile @@ -0,0 +1,64 @@ +prefix=/usr +includedir=$(prefix)/include +libdir=$(prefix)/lib + +ARCH := $(shell uname -m | sed -e s/i.86/i386/) +CFLAGS := -nostdlib -nostartfiles -Wall -I.
-g -fomit-frame-pointer -O2 -fPIC +SO_CFLAGS=-shared $(CFLAGS) +L_CFLAGS=$(CFLAGS) +LINK_FLAGS= + +soname=libaio.so.1 +minor=0 +micro=1 +libname=$(soname).$(minor).$(micro) +all_targets += libaio.a $(libname) + +all: $(all_targets) + +# libaio provided functions +libaio_srcs := io_queue_init.c io_queue_release.c +libaio_srcs += io_queue_wait.c io_queue_run.c + +# real syscalls +libaio_srcs += io_getevents.c io_submit.c io_cancel.c +libaio_srcs += io_setup.c io_destroy.c + +# internal functions +libaio_srcs += raw_syscall.c + +# old symbols +libaio_srcs += compat-0_1.c + +libaio_objs := $(patsubst %.c,%.ol,$(libaio_srcs)) +libaio_sobjs := $(patsubst %.c,%.os,$(libaio_srcs)) + +$(libaio_objs) $(libaio_sobjs): libaio.h vsys_def.h + +%.os: %.c + $(CC) $(SO_CFLAGS) -c -o $@ $< + +%.ol: %.c + $(CC) $(L_CFLAGS) -c -o $@ $< + + +libaio.a: $(libaio_objs) + rm -f libaio.a + ar r libaio.a $^ + ranlib libaio.a + +$(libname): $(libaio_sobjs) libaio.map + $(CC) $(SO_CFLAGS) -Wl,--version-script=libaio.map -Wl,-soname=$(soname) -o $@ $(libaio_sobjs) $(LINK_FLAGS) + +install: $(all_targets) + install -D -m 644 libaio.h $(includedir)/libaio.h + install -D -m 644 libaio.a $(libdir)/libaio.a + install -D -m 755 $(libname) $(libdir)/$(libname) + ln -sf $(libname) $(libdir)/$(soname) + ln -sf $(libname) $(libdir)/libaio.so + +$(libaio_objs): libaio.h + +clean: + rm -f $(all_targets) $(libaio_objs) $(libaio_sobjs) $(soname).new + rm -f *.so* *.a *.o diff --git a/tools/libaio/src/compat-0_1.c b/tools/libaio/src/compat-0_1.c new file mode 100644 index 0000000000..136396f996 --- /dev/null +++ b/tools/libaio/src/compat-0_1.c @@ -0,0 +1,62 @@ +/* libaio Linux async I/O interface + + compat-0_1.c : compatibility symbols for libaio 0.1.x-0.3.x + + Copyright 2002 Red Hat, Inc. 
+ + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> +#include <asm/errno.h> + +#include "libaio.h" +#include "vsys_def.h" + +#include "syscall.h" + + +/* ABI change. Provide partial compatibility on this one for now. */ +SYMVER(compat0_1_io_cancel, io_cancel, 0.1); +int compat0_1_io_cancel(io_context_t ctx, struct iocb *iocb) +{ + struct io_event event; + + /* FIXME: the old ABI would return the event on the completion queue */ + return io_cancel(ctx, iocb, &event); +} + +SYMVER(compat0_1_io_queue_wait, io_queue_wait, 0.1); +int compat0_1_io_queue_wait(io_context_t ctx, struct timespec *when) +{ + struct timespec timeout; + if (when) + timeout = *when; + return io_getevents(ctx, 0, 0, NULL, when ? &timeout : NULL); +} + + +/* ABI change. Provide backwards compatibility for this one. */ +SYMVER(compat0_1_io_getevents, io_getevents, 0.1); +int compat0_1_io_getevents(io_context_t ctx_id, long nr, + struct io_event *events, + const struct timespec *const_timeout) +{ + struct timespec timeout; + if (const_timeout) + timeout = *const_timeout; + return io_getevents(ctx_id, 1, nr, events, + const_timeout ? 
&timeout : NULL); +} + diff --git a/tools/libaio/src/io_cancel.c b/tools/libaio/src/io_cancel.c new file mode 100644 index 0000000000..2f0f5f4aa0 --- /dev/null +++ b/tools/libaio/src/io_cancel.c @@ -0,0 +1,23 @@ +/* io_cancel.c + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <libaio.h> +#include "syscall.h" + +io_syscall3(int, io_cancel_0_4, io_cancel, io_context_t, ctx, struct iocb *, iocb, struct io_event *, event) +DEFSYMVER(io_cancel_0_4, io_cancel, 0.4) diff --git a/tools/libaio/src/io_destroy.c b/tools/libaio/src/io_destroy.c new file mode 100644 index 0000000000..0ab6bd1743 --- /dev/null +++ b/tools/libaio/src/io_destroy.c @@ -0,0 +1,23 @@ +/* io_destroy + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <errno.h> +#include <libaio.h> +#include "syscall.h" + +io_syscall1(int, io_destroy, io_destroy, io_context_t, ctx) diff --git a/tools/libaio/src/io_getevents.c b/tools/libaio/src/io_getevents.c new file mode 100644 index 0000000000..5a0517402d --- /dev/null +++ b/tools/libaio/src/io_getevents.c @@ -0,0 +1,57 @@ +/* io_getevents.c + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <libaio.h> +#include <errno.h> +#include <stdlib.h> +#include <time.h> +#include "syscall.h" + +io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx, long, min_nr, long, nr, struct io_event *, events, struct timespec *, timeout) + +#define AIO_RING_MAGIC 0xa10a10a1 + +/* Ben will hate me for this */ +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ +}; + +int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct io_event * events, struct timespec * timeout) +{ + struct aio_ring *ring; + ring = (struct aio_ring*)ctx; + if (ring==NULL || ring->magic != AIO_RING_MAGIC) + goto do_syscall; + if (timeout!=NULL && timeout->tv_sec == 0 && timeout->tv_nsec == 0) { + if (ring->head == ring->tail) + return 0; + } + +do_syscall: + return __io_getevents_0_4(ctx, min_nr, nr, events, timeout); +} + +DEFSYMVER(io_getevents_0_4, io_getevents, 0.4) diff --git a/tools/libaio/src/io_queue_init.c b/tools/libaio/src/io_queue_init.c new file mode 100644 index 0000000000..563d1375a4 --- /dev/null +++ b/tools/libaio/src/io_queue_init.c @@ -0,0 +1,33 @@ +/* io_queue_init.c + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <libaio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <errno.h> + +#include "syscall.h" + +int io_queue_init(int maxevents, io_context_t *ctxp) +{ + if (maxevents > 0) { + *ctxp = NULL; + return io_setup(maxevents, ctxp); + } + return -EINVAL; +} diff --git a/tools/libaio/src/io_queue_release.c b/tools/libaio/src/io_queue_release.c new file mode 100644 index 0000000000..94bbb867a0 --- /dev/null +++ b/tools/libaio/src/io_queue_release.c @@ -0,0 +1,27 @@ +/* io_queue_release.c + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <libaio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <errno.h> + +/* Releases the context by handing it to io_destroy(); that call's result is returned unchanged. */ +int io_queue_release(io_context_t ctx) +{ + return io_destroy(ctx); +} diff --git a/tools/libaio/src/io_queue_run.c b/tools/libaio/src/io_queue_run.c new file mode 100644 index 0000000000..e0132f4009 --- /dev/null +++ b/tools/libaio/src/io_queue_run.c @@ -0,0 +1,39 @@ +/* io_queue_run.c + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <libaio.h> +#include <errno.h> +#include <stdlib.h> +#include <time.h> + +/* Fetches completed events one at a time with a { 0, 0 } timeout and calls the callback stored in each event's data field. Returns the first io_getevents() result that is not 1. */ +int io_queue_run(io_context_t ctx) +{ + static struct timespec timeout = { 0, 0 }; + struct io_event event; + int ret; + + /* FIXME: batch requests? 
*/ + while (1 == (ret = io_getevents(ctx, 0, 1, &event, &timeout))) { + /* event.data carries the callback pointer, event.obj the iocb that completed */ + io_callback_t cb = (io_callback_t)event.data; + struct iocb *iocb = event.obj; + + cb(ctx, iocb, event.res, event.res2); + } + + return ret; +} diff --git a/tools/libaio/src/io_queue_wait.c b/tools/libaio/src/io_queue_wait.c new file mode 100644 index 0000000000..538d2f3b7b --- /dev/null +++ b/tools/libaio/src/io_queue_wait.c @@ -0,0 +1,31 @@ +/* io_queue_wait.c + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#define NO_SYSCALL_ERRNO +#include <sys/types.h> +#include <libaio.h> +#include <errno.h> +#include "syscall.h" + +struct timespec; + +/* Calls io_getevents() with 0, 0 and a NULL event buffer, forwarding the caller's timeout and returning its result. NOTE(review): DEFSYMVER is defined outside this chunk; presumably it exports this function as io_queue_wait at version 0.4 - confirm. */ +int io_queue_wait_0_4(io_context_t ctx, struct timespec *timeout) +{ + return io_getevents(ctx, 0, 0, NULL, timeout); +} +DEFSYMVER(io_queue_wait_0_4, io_queue_wait, 0.4) diff --git a/tools/libaio/src/io_setup.c b/tools/libaio/src/io_setup.c new file mode 100644 index 0000000000..4ba1afc993 --- /dev/null +++ b/tools/libaio/src/io_setup.c @@ -0,0 +1,23 @@ +/* io_setup + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. 
+ + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <errno.h> +#include <libaio.h> +#include "syscall.h" + +io_syscall2(int, io_setup, io_setup, int, maxevents, io_context_t *, ctxp) diff --git a/tools/libaio/src/io_submit.c b/tools/libaio/src/io_submit.c new file mode 100644 index 0000000000..e22ba54960 --- /dev/null +++ b/tools/libaio/src/io_submit.c @@ -0,0 +1,23 @@ +/* io_submit + libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <errno.h> +#include <libaio.h> +#include "syscall.h" + +io_syscall3(int, io_submit, io_submit, io_context_t, ctx, long, nr, struct iocb **, iocbs) diff --git a/tools/libaio/src/libaio.h b/tools/libaio/src/libaio.h new file mode 100644 index 0000000000..657460128a --- /dev/null +++ b/tools/libaio/src/libaio.h @@ -0,0 +1,222 @@ +/* /usr/include/libaio.h + * + * Copyright 2000,2001,2002 Red Hat, Inc. + * + * Written by Benjamin LaHaise <bcrl@redhat.com> + * + * libaio Linux async I/O interface + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef __LIBAIO_H +#define __LIBAIO_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <string.h> + +struct timespec; +struct sockaddr; +struct iovec; +struct iocb; + +typedef struct io_context *io_context_t; + +typedef enum io_iocb_cmd { + IO_CMD_PREAD = 0, + IO_CMD_PWRITE = 1, + + IO_CMD_FSYNC = 2, + IO_CMD_FDSYNC = 3, + + IO_CMD_POLL = 5, + IO_CMD_NOOP = 6, +} io_iocb_cmd_t; + +#if defined(__i386__) /* little endian, 32 bits */ +#define PADDED(x, y) x; unsigned y +#define PADDEDptr(x, y) x; unsigned y +#define PADDEDul(x, y) unsigned long x; unsigned y +#elif defined(__ia64__) || defined(__x86_64__) || defined(__alpha__) +#define PADDED(x, y) x, y +#define PADDEDptr(x, y) x +#define PADDEDul(x, y) unsigned long x +#elif defined(__powerpc64__) /* big endian, 64 bits */ +#define PADDED(x, y) unsigned y; x +#define PADDEDptr(x,y) x +#define PADDEDul(x, y) unsigned long x +#elif defined(__PPC__) /* big endian, 32 bits */ +#define PADDED(x, y) unsigned y; x +#define PADDEDptr(x, y) unsigned y; x +#define PADDEDul(x, y) unsigned y; unsigned long x +#elif defined(__s390x__) /* big endian, 64 bits */ +#define PADDED(x, y) unsigned y; x +#define PADDEDptr(x,y) x +#define PADDEDul(x, y) unsigned long x +#elif defined(__s390__) /* big endian, 32 bits */ +#define PADDED(x, y) unsigned y; x +#define PADDEDptr(x, y) unsigned y; x +#define PADDEDul(x, y) unsigned y; unsigned long x +#else +#error endian? 
+#endif + +struct io_iocb_poll { + PADDED(int events, __pad1); +}; /* result code is the set of result flags or -'ve errno */ + +struct io_iocb_sockaddr { + struct sockaddr *addr; + int len; +}; /* result code is the length of the sockaddr, or -'ve errno */ + +struct io_iocb_common { + PADDEDptr(void *buf, __pad1); + PADDEDul(nbytes, __pad2); + long long offset; + long long __pad3, __pad4; +}; /* result code is the amount read or -'ve errno */ + +struct io_iocb_vector { + const struct iovec *vec; + int nr; + long long offset; +}; /* result code is the amount read or -'ve errno */ + +struct iocb { + PADDEDptr(void *data, __pad1); /* Return in the io completion event */ + PADDED(unsigned key, __pad2); /* For use in identifying io requests */ + + short aio_lio_opcode; + short aio_reqprio; + int aio_fildes; + + union { + struct io_iocb_common c; + struct io_iocb_vector v; + struct io_iocb_poll poll; + struct io_iocb_sockaddr saddr; + } u; +}; + +struct io_event { + PADDEDptr(void *data, __pad1); + PADDEDptr(struct iocb *obj, __pad2); + PADDEDul(res, __pad3); + PADDEDul(res2, __pad4); +}; + +#undef PADDED +#undef PADDEDptr +#undef PADDEDul + +typedef void (*io_callback_t)(io_context_t ctx, struct iocb *iocb, long res, long res2); + +/* library wrappers */ +extern int io_queue_init(int maxevents, io_context_t *ctxp); +/*extern int io_queue_grow(io_context_t ctx, int new_maxevents);*/ +extern int io_queue_release(io_context_t ctx); +/*extern int io_queue_wait(io_context_t ctx, struct timespec *timeout);*/ +extern int io_queue_run(io_context_t ctx); + +/* Actual syscalls */ +extern int io_setup(int maxevents, io_context_t *ctxp); +extern int io_destroy(io_context_t ctx); +extern int io_submit(io_context_t ctx, long nr, struct iocb *ios[]); +extern int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *evt); +extern int io_getevents(io_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout); + + +static inline void 
io_set_callback(struct iocb *iocb, io_callback_t cb) +{ + iocb->data = (void *)cb; +} + +static inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) +{ + memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_PREAD; + iocb->aio_reqprio = 0; + iocb->u.c.buf = buf; + iocb->u.c.nbytes = count; + iocb->u.c.offset = offset; +} + +static inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) +{ + memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_PWRITE; + iocb->aio_reqprio = 0; + iocb->u.c.buf = buf; + iocb->u.c.nbytes = count; + iocb->u.c.offset = offset; +} + +static inline void io_prep_poll(struct iocb *iocb, int fd, int events) +{ + memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_POLL; + iocb->aio_reqprio = 0; + iocb->u.poll.events = events; +} + +static inline int io_poll(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd, int events) +{ + io_prep_poll(iocb, fd, events); + io_set_callback(iocb, cb); + return io_submit(ctx, 1, &iocb); +} + +static inline void io_prep_fsync(struct iocb *iocb, int fd) +{ + memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_FSYNC; + iocb->aio_reqprio = 0; +} + +static inline int io_fsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd) +{ + io_prep_fsync(iocb, fd); + io_set_callback(iocb, cb); + return io_submit(ctx, 1, &iocb); +} + +static inline void io_prep_fdsync(struct iocb *iocb, int fd) +{ + memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_FDSYNC; + iocb->aio_reqprio = 0; +} + +static inline int io_fdsync(io_context_t ctx, struct iocb *iocb, io_callback_t cb, int fd) +{ + io_prep_fdsync(iocb, fd); + io_set_callback(iocb, cb); + return io_submit(ctx, 1, &iocb); +} + +#ifdef __cplusplus +} +#endif + +#endif /* __LIBAIO_H */ diff --git 
a/tools/libaio/src/libaio.map b/tools/libaio/src/libaio.map new file mode 100644 index 0000000000..dc37725960 --- /dev/null +++ b/tools/libaio/src/libaio.map @@ -0,0 +1,22 @@ +LIBAIO_0.1 { + global: + io_queue_init; + io_queue_run; + io_queue_wait; + io_queue_release; + io_cancel; + io_submit; + io_getevents; + local: + *; + +}; + +LIBAIO_0.4 { + global: + io_setup; + io_destroy; + io_cancel; + io_getevents; + io_queue_wait; +} LIBAIO_0.1; diff --git a/tools/libaio/src/raw_syscall.c b/tools/libaio/src/raw_syscall.c new file mode 100644 index 0000000000..3c8d7fa6d9 --- /dev/null +++ b/tools/libaio/src/raw_syscall.c @@ -0,0 +1,18 @@ +#include "syscall.h" + +#if defined(__ia64__) +/* based on code from glibc by Jes Sorensen */ +__asm__(".text\n" + ".globl __ia64_aio_raw_syscall\n" + "__ia64_aio_raw_syscall:\n" + "alloc r2=ar.pfs,1,0,8,0\n" + "mov r15=r32\n" + "break 0x100000\n" + ";;" + "br.ret.sptk.few b0\n" + ".size __ia64_aio_raw_syscall, . - __ia64_aio_raw_syscall\n" + ".endp __ia64_aio_raw_syscall" +); +#endif + +; diff --git a/tools/libaio/src/syscall-alpha.h b/tools/libaio/src/syscall-alpha.h new file mode 100644 index 0000000000..467b74f07e --- /dev/null +++ b/tools/libaio/src/syscall-alpha.h @@ -0,0 +1,209 @@ +#define __NR_io_setup 398 +#define __NR_io_destroy 399 +#define __NR_io_getevents 400 +#define __NR_io_submit 401 +#define __NR_io_cancel 402 + +#define inline_syscall_r0_asm +#define inline_syscall_r0_out_constraint "=v" + +#define inline_syscall_clobbers \ + "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", \ + "$22", "$23", "$24", "$25", "$27", "$28", "memory" + +#define inline_syscall0(name, args...) 
\ +{ \ + register long _sc_0 inline_syscall_r0_asm; \ + register long _sc_19 __asm__("$19"); \ + \ + _sc_0 = name; \ + __asm__ __volatile__ \ + ("callsys # %0 %1 <= %2" \ + : inline_syscall_r0_out_constraint (_sc_0), \ + "=r"(_sc_19) \ + : "0"(_sc_0) \ + : inline_syscall_clobbers, \ + "$16", "$17", "$18", "$20", "$21"); \ + _sc_ret = _sc_0, _sc_err = _sc_19; \ +} + +#define inline_syscall1(name,arg1) \ +{ \ + register long _sc_0 inline_syscall_r0_asm; \ + register long _sc_16 __asm__("$16"); \ + register long _sc_19 __asm__("$19"); \ + \ + _sc_0 = name; \ + _sc_16 = (long) (arg1); \ + __asm__ __volatile__ \ + ("callsys # %0 %1 <= %2 %3" \ + : inline_syscall_r0_out_constraint (_sc_0), \ + "=r"(_sc_19), "=r"(_sc_16) \ + : "0"(_sc_0), "2"(_sc_16) \ + : inline_syscall_clobbers, \ + "$17", "$18", "$20", "$21"); \ + _sc_ret = _sc_0, _sc_err = _sc_19; \ +} + +#define inline_syscall2(name,arg1,arg2) \ +{ \ + register long _sc_0 inline_syscall_r0_asm; \ + register long _sc_16 __asm__("$16"); \ + register long _sc_17 __asm__("$17"); \ + register long _sc_19 __asm__("$19"); \ + \ + _sc_0 = name; \ + _sc_16 = (long) (arg1); \ + _sc_17 = (long) (arg2); \ + __asm__ __volatile__ \ + ("callsys # %0 %1 <= %2 %3 %4" \ + : inline_syscall_r0_out_constraint (_sc_0), \ + "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17) \ + : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17) \ + : inline_syscall_clobbers, \ + "$18", "$20", "$21"); \ + _sc_ret = _sc_0, _sc_err = _sc_19; \ +} + +#define inline_syscall3(name,arg1,arg2,arg3) \ +{ \ + register long _sc_0 inline_syscall_r0_asm; \ + register long _sc_16 __asm__("$16"); \ + register long _sc_17 __asm__("$17"); \ + register long _sc_18 __asm__("$18"); \ + register long _sc_19 __asm__("$19"); \ + \ + _sc_0 = name; \ + _sc_16 = (long) (arg1); \ + _sc_17 = (long) (arg2); \ + _sc_18 = (long) (arg3); \ + __asm__ __volatile__ \ + ("callsys # %0 %1 <= %2 %3 %4 %5" \ + : inline_syscall_r0_out_constraint (_sc_0), \ + "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17), \ + "=r"(_sc_18) 
\ + : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17), \ + "4"(_sc_18) \ + : inline_syscall_clobbers, "$20", "$21"); \ + _sc_ret = _sc_0, _sc_err = _sc_19; \ +} + +#define inline_syscall4(name,arg1,arg2,arg3,arg4) \ +{ \ + register long _sc_0 inline_syscall_r0_asm; \ + register long _sc_16 __asm__("$16"); \ + register long _sc_17 __asm__("$17"); \ + register long _sc_18 __asm__("$18"); \ + register long _sc_19 __asm__("$19"); \ + \ + _sc_0 = name; \ + _sc_16 = (long) (arg1); \ + _sc_17 = (long) (arg2); \ + _sc_18 = (long) (arg3); \ + _sc_19 = (long) (arg4); \ + __asm__ __volatile__ \ + ("callsys # %0 %1 <= %2 %3 %4 %5 %6" \ + : inline_syscall_r0_out_constraint (_sc_0), \ + "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17), \ + "=r"(_sc_18) \ + : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17), \ + "4"(_sc_18), "1"(_sc_19) \ + : inline_syscall_clobbers, "$20", "$21"); \ + _sc_ret = _sc_0, _sc_err = _sc_19; \ +} + +#define inline_syscall5(name,arg1,arg2,arg3,arg4,arg5) \ +{ \ + register long _sc_0 inline_syscall_r0_asm; \ + register long _sc_16 __asm__("$16"); \ + register long _sc_17 __asm__("$17"); \ + register long _sc_18 __asm__("$18"); \ + register long _sc_19 __asm__("$19"); \ + register long _sc_20 __asm__("$20"); \ + \ + _sc_0 = name; \ + _sc_16 = (long) (arg1); \ + _sc_17 = (long) (arg2); \ + _sc_18 = (long) (arg3); \ + _sc_19 = (long) (arg4); \ + _sc_20 = (long) (arg5); \ + __asm__ __volatile__ \ + ("callsys # %0 %1 <= %2 %3 %4 %5 %6 %7" \ + : inline_syscall_r0_out_constraint (_sc_0), \ + "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17), \ + "=r"(_sc_18), "=r"(_sc_20) \ + : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17), \ + "4"(_sc_18), "1"(_sc_19), "5"(_sc_20) \ + : inline_syscall_clobbers, "$21"); \ + _sc_ret = _sc_0, _sc_err = _sc_19; \ +} + +#define inline_syscall6(name,arg1,arg2,arg3,arg4,arg5,arg6) \ +{ \ + register long _sc_0 inline_syscall_r0_asm; \ + register long _sc_16 __asm__("$16"); \ + register long _sc_17 __asm__("$17"); \ + register long _sc_18 __asm__("$18"); \ + register long _sc_19 
__asm__("$19"); \ + register long _sc_20 __asm__("$20"); \ + register long _sc_21 __asm__("$21"); \ + \ + _sc_0 = name; \ + _sc_16 = (long) (arg1); \ + _sc_17 = (long) (arg2); \ + _sc_18 = (long) (arg3); \ + _sc_19 = (long) (arg4); \ + _sc_20 = (long) (arg5); \ + _sc_21 = (long) (arg6); \ + __asm__ __volatile__ \ + ("callsys # %0 %1 <= %2 %3 %4 %5 %6 %7 %8" \ + : inline_syscall_r0_out_constraint (_sc_0), \ + "=r"(_sc_19), "=r"(_sc_16), "=r"(_sc_17), \ + "=r"(_sc_18), "=r"(_sc_20), "=r"(_sc_21) \ + : "0"(_sc_0), "2"(_sc_16), "3"(_sc_17), "4"(_sc_18), \ + "1"(_sc_19), "5"(_sc_20), "6"(_sc_21) \ + : inline_syscall_clobbers); \ + _sc_ret = _sc_0, _sc_err = _sc_19; \ +} + +#define INLINE_SYSCALL1(name, nr, args...) \ +({ \ + long _sc_ret, _sc_err; \ + inline_syscall##nr(__NR_##name, args); \ + if (_sc_err != 0) \ + { \ + _sc_ret = -(_sc_ret); \ + } \ + _sc_ret; \ +}) + +#define io_syscall1(type,fname,sname,type1,arg1) \ +type fname(type1 arg1) \ +{ \ + return (type)INLINE_SYSCALL1(sname, 1, arg1); \ +} + +#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2) \ +type fname(type1 arg1,type2 arg2) \ +{ \ + return (type)INLINE_SYSCALL1(sname, 2, arg1, arg2); \ +} + +#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,type3,arg3) \ +type fname(type1 arg1,type2 arg2,type3 arg3) \ +{ \ + return (type)INLINE_SYSCALL1(sname, 3, arg1, arg2, arg3); \ +} + +#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ +type fname (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \ +{ \ + return (type)INLINE_SYSCALL1(sname, 4, arg1, arg2, arg3, arg4); \ +} + +#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ +type fname (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \ +{ \ + return (type)INLINE_SYSCALL1(sname, 5, arg1, arg2, arg3, arg4, arg5);\ +} diff --git a/tools/libaio/src/syscall-i386.h b/tools/libaio/src/syscall-i386.h new file mode 100644 index 0000000000..9576975a19 --- /dev/null 
+++ b/tools/libaio/src/syscall-i386.h @@ -0,0 +1,72 @@ +#define __NR_io_setup 245 +#define __NR_io_destroy 246 +#define __NR_io_getevents 247 +#define __NR_io_submit 248 +#define __NR_io_cancel 249 + +#define io_syscall1(type,fname,sname,type1,arg1) \ +type fname(type1 arg1) \ +{ \ +long __res; \ +__asm__ volatile ("xchgl %%edi,%%ebx\n" \ + "int $0x80\n" \ + "xchgl %%edi,%%ebx" \ + : "=a" (__res) \ + : "0" (__NR_##sname),"D" ((long)(arg1))); \ +return __res; \ +} + +#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2) \ +type fname(type1 arg1,type2 arg2) \ +{ \ +long __res; \ +__asm__ volatile ("xchgl %%edi,%%ebx\n" \ + "int $0x80\n" \ + "xchgl %%edi,%%ebx" \ + : "=a" (__res) \ + : "0" (__NR_##sname),"D" ((long)(arg1)),"c" ((long)(arg2))); \ +return __res; \ +} + +#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,type3,arg3) \ +type fname(type1 arg1,type2 arg2,type3 arg3) \ +{ \ +long __res; \ +__asm__ volatile ("xchgl %%edi,%%ebx\n" \ + "int $0x80\n" \ + "xchgl %%edi,%%ebx" \ + : "=a" (__res) \ + : "0" (__NR_##sname),"D" ((long)(arg1)),"c" ((long)(arg2)), \ + "d" ((long)(arg3))); \ +return __res; \ +} + +#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ +type fname (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \ +{ \ +long __res; \ +__asm__ volatile ("xchgl %%edi,%%ebx\n" \ + "int $0x80\n" \ + "xchgl %%edi,%%ebx" \ + : "=a" (__res) \ + : "0" (__NR_##sname),"D" ((long)(arg1)),"c" ((long)(arg2)), \ + "d" ((long)(arg3)),"S" ((long)(arg4))); \ +return __res; \ +} + +#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ +type fname (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \ +{ \ +long __res; \ +long tmp; \ +__asm__ volatile ("movl %%ebx,%7\n" \ + "movl %2,%%ebx\n" \ + "int $0x80\n" \ + "movl %7,%%ebx" \ + : "=a" (__res) \ + : "0" (__NR_##sname),"rm" ((long)(arg1)),"c" ((long)(arg2)), \ + "d" ((long)(arg3)),"S" ((long)(arg4)),"D" ((long)(arg5)), \ + "m" 
(tmp)); \ +return __res; \ +} diff --git a/tools/libaio/src/syscall-ia64.h b/tools/libaio/src/syscall-ia64.h new file mode 100644 index 0000000000..2f6a01a4a9 --- /dev/null +++ b/tools/libaio/src/syscall-ia64.h @@ -0,0 +1,44 @@ +#define __NR_io_setup 1238 +#define __NR_io_destroy 1239 +#define __NR_io_getevents 1240 +#define __NR_io_submit 1241 +#define __NR_io_cancel 1242 + +/* Emits a global assembly stub named fname that loads __NR_<sname> into r15 and issues "break 0x100000"; when r10 is -1 the value in r8 is negated before returning. */ +#define __ia64_raw_syscall(fname, sname) \ + __asm__ (".text\n" \ + ".globl " SYMSTR(fname) "\n" \ + SYMSTR(fname) ":\n" \ + " mov r15=" SYMSTR( __NR_ ## sname ) "\n" \ + " break 0x100000\n" \ + " ;;\n" \ + " cmp.eq p6,p0=-1,r10\n" \ + " ;;\n" \ + " (p6) sub r8=0,r8\n" \ + " br.ret.sptk.few b0\n" \ + ".size " SYMSTR(fname) ", . - " SYMSTR(fname) "\n" \ + ".endp " SYMSTR(fname) "\n" \ + ); + +/* io_syscall0 takes a single name, so that name is used both for the stub and for the __NR_ lookup (the raw macro needs two arguments). */ +#define io_syscall0(type, name) \ + extern type name(void); \ + __ia64_raw_syscall(name, name); + +/* io_syscall1..5 declare fname with the given argument list and emit its stub for syscall sname. */ +#define io_syscall1(type, fname, sname, type1, arg1) \ + extern type fname(type1 arg1); \ + __ia64_raw_syscall(fname, sname); + +#define io_syscall2(type, fname, sname, type1, arg1, type2, arg2) \ + extern type fname(type1 arg1, type2 arg2); \ + __ia64_raw_syscall(fname, sname); + +#define io_syscall3(type, fname, sname, type1, arg1, type2, arg2, type3, arg3) \ + extern type fname(type1 arg1, type2 arg2, type3 arg3); \ + __ia64_raw_syscall(fname, sname); + +#define io_syscall4(type, fname, sname, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + extern type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4); \ + __ia64_raw_syscall(fname, sname); + +#define io_syscall5(type, fname, sname, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) \ + extern type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5); \ + __ia64_raw_syscall(fname, sname); diff --git a/tools/libaio/src/syscall-ppc.h b/tools/libaio/src/syscall-ppc.h new file mode 100644 index 0000000000..ca70dd2092 --- /dev/null +++ b/tools/libaio/src/syscall-ppc.h @@ -0,0 +1,94 @@ +#define __NR_io_setup 227 +#define 
__NR_io_destroy 228 +#define __NR_io_getevents 229 +#define __NR_io_submit 230 +#define __NR_io_cancel 231 + +/* On powerpc a system call basically clobbers the same registers like a + * function call, with the exception of LR (which is needed for the + * "sc; bnslr" sequence) and CR (where only CR0.SO is clobbered to signal + * an error return status). + */ + +#define __syscall_nr(nr, type, name, args...) \ + unsigned long __sc_ret, __sc_err; \ + { \ + register unsigned long __sc_0 __asm__ ("r0"); \ + register unsigned long __sc_3 __asm__ ("r3"); \ + register unsigned long __sc_4 __asm__ ("r4"); \ + register unsigned long __sc_5 __asm__ ("r5"); \ + register unsigned long __sc_6 __asm__ ("r6"); \ + register unsigned long __sc_7 __asm__ ("r7"); \ + register unsigned long __sc_8 __asm__ ("r8"); \ + \ + __sc_loadargs_##nr(name, args); \ + __asm__ __volatile__ \ + ("sc \n\t" \ + "mfcr %0 " \ + : "=&r" (__sc_0), \ + "=&r" (__sc_3), "=&r" (__sc_4), \ + "=&r" (__sc_5), "=&r" (__sc_6), \ + "=&r" (__sc_7), "=&r" (__sc_8) \ + : __sc_asm_input_##nr \ + : "cr0", "ctr", "memory", \ + "r9", "r10","r11", "r12"); \ + __sc_ret = __sc_3; \ + __sc_err = __sc_0; \ + } \ + if (__sc_err & 0x10000000) return -((int)__sc_ret); \ + return (type) __sc_ret + +#define __sc_loadargs_0(name, dummy...) 
\ + __sc_0 = __NR_##name +#define __sc_loadargs_1(name, arg1) \ + __sc_loadargs_0(name); \ + __sc_3 = (unsigned long) (arg1) +#define __sc_loadargs_2(name, arg1, arg2) \ + __sc_loadargs_1(name, arg1); \ + __sc_4 = (unsigned long) (arg2) +#define __sc_loadargs_3(name, arg1, arg2, arg3) \ + __sc_loadargs_2(name, arg1, arg2); \ + __sc_5 = (unsigned long) (arg3) +#define __sc_loadargs_4(name, arg1, arg2, arg3, arg4) \ + __sc_loadargs_3(name, arg1, arg2, arg3); \ + __sc_6 = (unsigned long) (arg4) +#define __sc_loadargs_5(name, arg1, arg2, arg3, arg4, arg5) \ + __sc_loadargs_4(name, arg1, arg2, arg3, arg4); \ + __sc_7 = (unsigned long) (arg5) + +#define __sc_asm_input_0 "0" (__sc_0) +#define __sc_asm_input_1 __sc_asm_input_0, "1" (__sc_3) +#define __sc_asm_input_2 __sc_asm_input_1, "2" (__sc_4) +#define __sc_asm_input_3 __sc_asm_input_2, "3" (__sc_5) +#define __sc_asm_input_4 __sc_asm_input_3, "4" (__sc_6) +#define __sc_asm_input_5 __sc_asm_input_4, "5" (__sc_7) + +#define io_syscall1(type,fname,sname,type1,arg1) \ +type fname(type1 arg1) \ +{ \ + __syscall_nr(1, type, sname, arg1); \ +} + +#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2) \ +type fname(type1 arg1, type2 arg2) \ +{ \ + __syscall_nr(2, type, sname, arg1, arg2); \ +} + +#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,type3,arg3) \ +type fname(type1 arg1, type2 arg2, type3 arg3) \ +{ \ + __syscall_nr(3, type, sname, arg1, arg2, arg3); \ +} + +#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ +type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4) \ +{ \ + __syscall_nr(4, type, sname, arg1, arg2, arg3, arg4); \ +} + +#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5) \ +type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ +{ \ + __syscall_nr(5, type, sname, arg1, arg2, arg3, arg4, arg5); \ +} diff --git a/tools/libaio/src/syscall-s390.h b/tools/libaio/src/syscall-s390.h new file mode 
100644 index 0000000000..3ec5ee34ee --- /dev/null +++ b/tools/libaio/src/syscall-s390.h @@ -0,0 +1,131 @@ +#define __NR_io_setup 243 +#define __NR_io_destroy 244 +#define __NR_io_getevents 245 +#define __NR_io_submit 246 +#define __NR_io_cancel 247 + +#define io_svc_clobber "1", "cc", "memory" + +#define io_syscall1(type,fname,sname,type1,arg1) \ +type fname(type1 arg1) { \ + register type1 __arg1 asm("2") = arg1; \ + register long __svcres asm("2"); \ + long __res; \ + __asm__ __volatile__ ( \ + " .if %1 < 256\n" \ + " svc %b1\n" \ + " .else\n" \ + " la %%r1,%1\n" \ + " svc 0\n" \ + " .endif" \ + : "=d" (__svcres) \ + : "i" (__NR_##sname), \ + "0" (__arg1) \ + : io_svc_clobber ); \ + __res = __svcres; \ + return (type) __res; \ +} + +#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2) \ +type fname(type1 arg1, type2 arg2) { \ + register type1 __arg1 asm("2") = arg1; \ + register type2 __arg2 asm("3") = arg2; \ + register long __svcres asm("2"); \ + long __res; \ + __asm__ __volatile__ ( \ + " .if %1 < 256\n" \ + " svc %b1\n" \ + " .else\n" \ + " la %%r1,%1\n" \ + " svc 0\n" \ + " .endif" \ + : "=d" (__svcres) \ + : "i" (__NR_##sname), \ + "0" (__arg1), \ + "d" (__arg2) \ + : io_svc_clobber ); \ + __res = __svcres; \ + return (type) __res; \ +} + +#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2, \ + type3,arg3) \ +type fname(type1 arg1, type2 arg2, type3 arg3) { \ + register type1 __arg1 asm("2") = arg1; \ + register type2 __arg2 asm("3") = arg2; \ + register type3 __arg3 asm("4") = arg3; \ + register long __svcres asm("2"); \ + long __res; \ + __asm__ __volatile__ ( \ + " .if %1 < 256\n" \ + " svc %b1\n" \ + " .else\n" \ + " la %%r1,%1\n" \ + " svc 0\n" \ + " .endif" \ + : "=d" (__svcres) \ + : "i" (__NR_##sname), \ + "0" (__arg1), \ + "d" (__arg2), \ + "d" (__arg3) \ + : io_svc_clobber ); \ + __res = __svcres; \ + return (type) __res; \ +} + +#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2, \ + type3,arg3,type4,arg4) \ +type 
fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + register type1 __arg1 asm("2") = arg1; \ + register type2 __arg2 asm("3") = arg2; \ + register type3 __arg3 asm("4") = arg3; \ + register type4 __arg4 asm("5") = arg4; \ + register long __svcres asm("2"); \ + long __res; \ + __asm__ __volatile__ ( \ + " .if %1 < 256\n" \ + " svc %b1\n" \ + " .else\n" \ + " la %%r1,%1\n" \ + " svc 0\n" \ + " .endif" \ + : "=d" (__svcres) \ + : "i" (__NR_##sname), \ + "0" (__arg1), \ + "d" (__arg2), \ + "d" (__arg3), \ + "d" (__arg4) \ + : io_svc_clobber ); \ + __res = __svcres; \ + return (type) __res; \ +} + +#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2, \ + type3,arg3,type4,arg4,type5,arg5) \ +type fname(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + register type1 __arg1 asm("2") = arg1; \ + register type2 __arg2 asm("3") = arg2; \ + register type3 __arg3 asm("4") = arg3; \ + register type4 __arg4 asm("5") = arg4; \ + register type5 __arg5 asm("6") = arg5; \ + register long __svcres asm("2"); \ + long __res; \ + __asm__ __volatile__ ( \ + " .if %1 < 256\n" \ + " svc %b1\n" \ + " .else\n" \ + " la %%r1,%1\n" \ + " svc 0\n" \ + " .endif" \ + : "=d" (__svcres) \ + : "i" (__NR_##sname), \ + "0" (__arg1), \ + "d" (__arg2), \ + "d" (__arg3), \ + "d" (__arg4), \ + "d" (__arg5) \ + : io_svc_clobber ); \ + __res = __svcres; \ + return (type) __res; \ +} diff --git a/tools/libaio/src/syscall-x86_64.h b/tools/libaio/src/syscall-x86_64.h new file mode 100644 index 0000000000..9361856723 --- /dev/null +++ b/tools/libaio/src/syscall-x86_64.h @@ -0,0 +1,63 @@ +#define __NR_io_setup 206 +#define __NR_io_destroy 207 +#define __NR_io_getevents 208 +#define __NR_io_submit 209 +#define __NR_io_cancel 210 + +#define __syscall_clobber "r11","rcx","memory" +#define __syscall "syscall" + +#define io_syscall1(type,fname,sname,type1,arg1) \ +type fname(type1 arg1) \ +{ \ +long __res; \ +__asm__ volatile (__syscall \ + : "=a" (__res) \ + : "0" 
(__NR_##sname),"D" ((long)(arg1)) : __syscall_clobber ); \ +return __res; \ +} + +#define io_syscall2(type,fname,sname,type1,arg1,type2,arg2) \ +type fname(type1 arg1,type2 arg2) \ +{ \ +long __res; \ +__asm__ volatile (__syscall \ + : "=a" (__res) \ + : "0" (__NR_##sname),"D" ((long)(arg1)),"S" ((long)(arg2)) : __syscall_clobber ); \ +return __res; \ +} + +#define io_syscall3(type,fname,sname,type1,arg1,type2,arg2,type3,arg3) \ +type fname(type1 arg1,type2 arg2,type3 arg3) \ +{ \ +long __res; \ +__asm__ volatile (__syscall \ + : "=a" (__res) \ + : "0" (__NR_##sname),"D" ((long)(arg1)),"S" ((long)(arg2)), \ + "d" ((long)(arg3)) : __syscall_clobber); \ +return __res; \ +} + +#define io_syscall4(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ +type fname (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \ +{ \ +long __res; \ +__asm__ volatile ("movq %5,%%r10 ;" __syscall \ + : "=a" (__res) \ + : "0" (__NR_##sname),"D" ((long)(arg1)),"S" ((long)(arg2)), \ + "d" ((long)(arg3)),"g" ((long)(arg4)) : __syscall_clobber,"r10" ); \ +return __res; \ +} + +#define io_syscall5(type,fname,sname,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ +type fname (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \ +{ \ +long __res; \ +__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall \ + : "=a" (__res) \ + : "0" (__NR_##sname),"D" ((long)(arg1)),"S" ((long)(arg2)), \ + "d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5)) : \ + __syscall_clobber,"r8","r10" ); \ +return __res; \ +} diff --git a/tools/libaio/src/syscall.h b/tools/libaio/src/syscall.h new file mode 100644 index 0000000000..0283825817 --- /dev/null +++ b/tools/libaio/src/syscall.h @@ -0,0 +1,27 @@ +#include <sys/syscall.h> +#include <unistd.h> + +#define _SYMSTR(str) #str +#define SYMSTR(str) _SYMSTR(str) + +#define SYMVER(compat_sym, orig_sym, ver_sym) \ + __asm__(".symver " SYMSTR(compat_sym) "," SYMSTR(orig_sym) "@LIBAIO_" SYMSTR(ver_sym)); + +#define 
DEFSYMVER(compat_sym, orig_sym, ver_sym) \ + __asm__(".symver " SYMSTR(compat_sym) "," SYMSTR(orig_sym) "@@LIBAIO_" SYMSTR(ver_sym)); + +#if defined(__i386__) +#include "syscall-i386.h" +#elif defined(__x86_64__) +#include "syscall-x86_64.h" +#elif defined(__ia64__) +#include "syscall-ia64.h" +#elif defined(__PPC__) +#include "syscall-ppc.h" +#elif defined(__s390__) +#include "syscall-s390.h" +#elif defined(__alpha__) +#include "syscall-alpha.h" +#else +#error "add syscall-arch.h" +#endif diff --git a/tools/libaio/src/vsys_def.h b/tools/libaio/src/vsys_def.h new file mode 100644 index 0000000000..13d032e330 --- /dev/null +++ b/tools/libaio/src/vsys_def.h @@ -0,0 +1,24 @@ +/* libaio Linux async I/O interface + Copyright 2002 Red Hat, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +extern int vsys_io_setup(unsigned nr_reqs, io_context_t *ctxp); +extern int vsys_io_destroy(io_context_t ctx); +extern int vsys_io_submit(io_context_t ctx, long nr, struct iocb *iocbs[]); +extern int vsys_io_cancel(io_context_t ctx, struct iocb *iocb); +extern int vsys_io_wait(io_context_t ctx, struct iocb *iocb, const struct timespec *when); +extern int vsys_io_getevents(io_context_t ctx_id, long nr, struct io_event *events, const struct timespec *timeout); + diff --git a/tools/misc/xend b/tools/misc/xend index cd35438090..e9bd8e18c7 100644 --- a/tools/misc/xend +++ b/tools/misc/xend @@ -92,6 +92,10 @@ def start_xenstored(): def start_consoled(): if os.fork() == 0: os.execvp('xenconsoled', ['xenconsoled']) + +def start_blktapctrl(): + if os.fork() == 0: + os.execvp('blktapctrl', ['blktapctrl']) def main(): try: @@ -106,16 +110,19 @@ def main(): elif sys.argv[1] == 'start': start_xenstored() start_consoled() + start_blktapctrl() return daemon.start() elif sys.argv[1] == 'trace_start': start_xenstored() start_consoled() + start_blktapctrl() return daemon.start(trace=1) elif sys.argv[1] == 'stop': return daemon.stop() elif sys.argv[1] == 'restart': start_xenstored() start_consoled() + start_blktapctrl() return daemon.stop() or daemon.start() elif sys.argv[1] == 'status': return daemon.status() diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py index e9b21c7ce5..14f9f4311a 100644 --- a/tools/python/xen/xend/XendDomainInfo.py +++ b/tools/python/xen/xend/XendDomainInfo.py @@ -1701,6 +1701,7 @@ def addControllerClass(device_class, cls): from xen.xend.server import blkif, netif, tpmif, pciif, iopif, irqif, usbif +from xen.xend.server.BlktapController import BlktapController addControllerClass('vbd', 
blkif.BlkifController) addControllerClass('vif', netif.NetifController) addControllerClass('vtpm', tpmif.TPMifController) @@ -1708,3 +1709,4 @@ addControllerClass('pci', pciif.PciController) addControllerClass('ioports', iopif.IOPortsController) addControllerClass('irq', irqif.IRQController) addControllerClass('usb', usbif.UsbifController) +addControllerClass('tap', BlktapController) diff --git a/tools/python/xen/xend/server/BlktapController.py b/tools/python/xen/xend/server/BlktapController.py new file mode 100644 index 0000000000..062769a061 --- /dev/null +++ b/tools/python/xen/xend/server/BlktapController.py @@ -0,0 +1,14 @@ +# Copyright (c) 2005, XenSource Ltd. + + +from xen.xend.server.blkif import BlkifController + + +class BlktapController(BlkifController): + def __init__(self, vm): + BlkifController.__init__(self, vm) + + def frontendRoot(self): + """@see DevController#frontendRoot""" + + return "%s/device/vbd" % self.vm.getDomainPath() diff --git a/tools/python/xen/xm/create.py b/tools/python/xen/xm/create.py index e5d0273465..4af00f458d 100644 --- a/tools/python/xen/xm/create.py +++ b/tools/python/xen/xm/create.py @@ -479,7 +479,13 @@ def configure_disks(config_devs, vals): """Create the config for disks (virtual block devices). 
""" for (uname, dev, mode, backend) in vals.disk: - config_vbd = ['vbd', + + if uname.startswith('tap:'): + cls = 'tap' + else: + cls = 'vbd' + + config_vbd = [cls, ['uname', uname], ['dev', dev ], ['mode', mode ] ] diff --git a/tools/python/xen/xm/main.py b/tools/python/xen/xm/main.py index 791c18eacd..f34ad0947e 100644 --- a/tools/python/xen/xm/main.py +++ b/tools/python/xen/xm/main.py @@ -994,7 +994,13 @@ def xm_block_attach(args): arg_check(args, 'block-attach', 4, 5) dom = args[0] - vbd = ['vbd', + + if args[1].startswith('tap:'): + cls = 'tap' + else: + cls = 'vbd' + + vbd = [cls, ['uname', args[1]], ['dev', args[2]], ['mode', args[3]]] diff --git a/tools/xenstore/Makefile b/tools/xenstore/Makefile index d6b143e1c6..c8a6a483d8 100644 --- a/tools/xenstore/Makefile +++ b/tools/xenstore/Makefile @@ -35,7 +35,7 @@ XENSTORED_Linux = xenstored_linux.o XENSTORED_OBJS += $(XENSTORED_$(OS)) .PHONY: all -all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-control xenstore-ls +all: libxenstore.so libxenstore.a xenstored $(CLIENTS) xs_tdb_dump xenstore-control xenstore-ls test_interleaved_transactions: test_interleaved_transactions.o $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@ @@ -90,6 +90,9 @@ talloc_test.o: talloc.c libxenstore.so: xs.opic xs_lib.opic $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-soname -Wl,libxenstore.so -shared -o $@ $^ -lpthread +libxenstore.a: xs.o xs_lib.o + $(AR) rcs $@ $^ + .PHONY: clean clean: testsuite-clean rm -f *.o *.opic *.so @@ -172,7 +175,7 @@ install: all $(INSTALL_PROG) xenstore-control $(DESTDIR)/usr/bin $(INSTALL_PROG) xenstore-ls $(DESTDIR)/usr/bin $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR) - $(INSTALL_LIBS) libxenstore.so $(DESTDIR)/usr/$(LIBDIR) + $(INSTALL_DATA) libxenstore.* $(DESTDIR)/usr/$(LIBDIR) $(INSTALL_DATA) xs.h $(DESTDIR)/usr/include $(INSTALL_DATA) xs_lib.h $(DESTDIR)/usr/include |