Diffstat (limited to 'block')
59 files changed, 50927 insertions, 0 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
new file mode 100644
index 00000000..58ef2ef3
--- /dev/null
+++ b/block/Makefile.objs
@@ -0,0 +1,44 @@
+block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o
+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
+block-obj-y += qed-check.o
+block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
+block-obj-y += quorum.o
+block-obj-y += parallels.o blkdebug.o blkverify.o
+block-obj-y += block-backend.o snapshot.o qapi.o
+block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
+block-obj-$(CONFIG_POSIX) += raw-posix.o
+block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
+block-obj-y += null.o mirror.o io.o
+block-obj-y += throttle-groups.o
+
+block-obj-y += nbd.o nbd-client.o sheepdog.o
+block-obj-$(CONFIG_LIBISCSI) += iscsi.o
+block-obj-$(CONFIG_LIBNFS) += nfs.o
+block-obj-$(CONFIG_CURL) += curl.o
+block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
+block-obj-$(CONFIG_LIBSSH2) += ssh.o
+block-obj-y += accounting.o
+block-obj-y += write-threshold.o
+
+common-obj-y += stream.o
+common-obj-y += commit.o
+common-obj-y += backup.o
+
+iscsi.o-cflags     := $(LIBISCSI_CFLAGS)
+iscsi.o-libs       := $(LIBISCSI_LIBS)
+curl.o-cflags      := $(CURL_CFLAGS)
+curl.o-libs        := $(CURL_LIBS)
+rbd.o-cflags       := $(RBD_CFLAGS)
+rbd.o-libs         := $(RBD_LIBS)
+gluster.o-cflags   := $(GLUSTERFS_CFLAGS)
+gluster.o-libs     := $(GLUSTERFS_LIBS)
+ssh.o-cflags       := $(LIBSSH2_CFLAGS)
+ssh.o-libs         := $(LIBSSH2_LIBS)
+archipelago.o-libs := $(ARCHIPELAGO_LIBS)
+block-obj-m        += dmg.o
+dmg.o-libs         := $(BZIP2_LIBS)
+qcow.o-libs        := -lz
+linux-aio.o-libs   := -laio
diff --git a/block/accounting.c b/block/accounting.c
new file mode 100644
index 00000000..01d594ff
--- /dev/null
+++ b/block/accounting.c
@@ -0,0 +1,63 @@
+/*
+ * QEMU System Emulator block accounting
+ *
+ * Copyright (c) 2011 Christoph Hellwig
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/accounting.h"
+#include "block/block_int.h"
+#include "qemu/timer.h"
+
+void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
+                      int64_t bytes, enum BlockAcctType type)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    cookie->bytes = bytes;
+    cookie->start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    cookie->type = type;
+}
+
+void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie)
+{
+    assert(cookie->type < BLOCK_MAX_IOTYPE);
+
+    stats->nr_bytes[cookie->type] += cookie->bytes;
+    stats->nr_ops[cookie->type]++;
+    stats->total_time_ns[cookie->type] +=
+        qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - cookie->start_time_ns;
+}
+
+
+void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
+                               unsigned int nb_sectors)
+{
+    if (stats->wr_highest_sector < sector_num + nb_sectors - 1) {
+        stats->wr_highest_sector = sector_num + nb_sectors - 1;
+    }
+}
+
+void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
+                      int num_requests)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+    stats->merged[type] += num_requests;
+}
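The two main helpers in accounting.c bracket a single guest request: block_acct_start() stashes the byte count and a start timestamp in the cookie, and block_acct_done() folds them into the per-type counters. A minimal sketch of that pattern as a device model might use it (illustrative only, not part of the patch; it assumes the blk_get_stats() accessor and the BLOCK_ACCT_READ enumerator from the block-backend/accounting headers, with blk and qiov standing in for the device's BlockBackend and request vector):

    BlockAcctCookie cookie;

    /* record request size and start time */
    block_acct_start(blk_get_stats(blk), &cookie, qiov->size, BLOCK_ACCT_READ);
    /* ... submit the read and wait for it to complete ... */
    /* fold bytes, op count and elapsed time into the READ counters */
    block_acct_done(blk_get_stats(blk), &cookie);

On completion the BLOCK_ACCT_READ slot of BlockAcctStats gains one op, qiov->size bytes, and the elapsed QEMU_CLOCK_REALTIME time; these are the numbers later reported through query-blockstats.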
diff --git a/block/archipelago.c b/block/archipelago.c
new file mode 100644
index 00000000..855655c6
--- /dev/null
+++ b/block/archipelago.c
@@ -0,0 +1,1084 @@
+/*
+ * QEMU Block driver for Archipelago
+ *
+ * Copyright (C) 2014 Chrysostomos Nanakos <cnanakos@grnet.gr>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * VM Image on Archipelago volume is specified like this:
+ *
+ * file.driver=archipelago,file.volume=<volumename>
+ * [,file.mport=<mapperd_port>[,file.vport=<vlmcd_port>]
+ * [,file.segment=<segment_name>]]
+ *
+ * or
+ *
+ * file=archipelago:<volumename>[/mport=<mapperd_port>[:vport=<vlmcd_port>][:
+ * segment=<segment_name>]]
+ *
+ * 'archipelago' is the protocol.
+ *
+ * 'mport' is the port number on which mapperd is listening. This is optional
+ * and if not specified, QEMU will make Archipelago to use the default port.
+ *
+ * 'vport' is the port number on which vlmcd is listening. This is optional
+ * and if not specified, QEMU will make Archipelago to use the default port.
+ *
+ * 'segment' is the name of the shared memory segment Archipelago stack
+ * is using. This is optional and if not specified, QEMU will make Archipelago
+ * to use the default value, 'archipelago'.
+ * + * Examples: + * + * file.driver=archipelago,file.volume=my_vm_volume + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123 + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, + *  file.vport=1234 + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, + *  file.vport=1234,file.segment=my_segment + * + * or + * + * file=archipelago:my_vm_volume + * file=archipelago:my_vm_volume/mport=123 + * file=archipelago:my_vm_volume/mport=123:vport=1234 + * file=archipelago:my_vm_volume/mport=123:vport=1234:segment=my_segment + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/error-report.h" +#include "qemu/thread.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qmp/qjson.h" +#include "qemu/atomic.h" + +#include <inttypes.h> +#include <xseg/xseg.h> +#include <xseg/protocol.h> + +#define MAX_REQUEST_SIZE    524288 + +#define ARCHIPELAGO_OPT_VOLUME      "volume" +#define ARCHIPELAGO_OPT_SEGMENT     "segment" +#define ARCHIPELAGO_OPT_MPORT       "mport" +#define ARCHIPELAGO_OPT_VPORT       "vport" +#define ARCHIPELAGO_DFL_MPORT       1001 +#define ARCHIPELAGO_DFL_VPORT       501 + +#define archipelagolog(fmt, ...) \ +    do {                         \ +        fprintf(stderr, "archipelago\t%-24s: " fmt, __func__, ##__VA_ARGS__); \ +    } while (0) + +typedef enum { +    ARCHIP_OP_READ, +    ARCHIP_OP_WRITE, +    ARCHIP_OP_FLUSH, +    ARCHIP_OP_VOLINFO, +    ARCHIP_OP_TRUNCATE, +} ARCHIPCmd; + +typedef struct ArchipelagoAIOCB { +    BlockAIOCB common; +    QEMUBH *bh; +    struct BDRVArchipelagoState *s; +    QEMUIOVector *qiov; +    ARCHIPCmd cmd; +    int status; +    int64_t size; +    int64_t ret; +} ArchipelagoAIOCB; + +typedef struct BDRVArchipelagoState { +    ArchipelagoAIOCB *event_acb; +    char *volname; +    char *segment_name; +    uint64_t size; +    /* Archipelago specific */ +    struct xseg *xseg; +    struct xseg_port *port; +    xport srcport; +    xport sport; +    xport mportno; +    xport vportno; +    QemuMutex archip_mutex; +    QemuCond archip_cond; +    bool is_signaled; +    /* Request handler specific */ +    QemuThread request_th; +    QemuCond request_cond; +    QemuMutex request_mutex; +    bool th_is_signaled; +    bool stopping; +} BDRVArchipelagoState; + +typedef struct ArchipelagoSegmentedRequest { +    size_t count; +    size_t total; +    int ref; +    int failed; +} ArchipelagoSegmentedRequest; + +typedef struct AIORequestData { +    const char *volname; +    off_t offset; +    size_t size; +    uint64_t bufidx; +    int ret; +    int op; +    ArchipelagoAIOCB *aio_cb; +    ArchipelagoSegmentedRequest *segreq; +} AIORequestData; + +static void qemu_archipelago_complete_aio(void *opaque); + +static void init_local_signal(struct xseg *xseg, xport sport, xport srcport) +{ +    if (xseg && (sport != srcport)) { +        xseg_init_local_signal(xseg, srcport); +        sport = srcport; +    } +} + +static void archipelago_finish_aiocb(AIORequestData *reqdata) +{ +    if (reqdata->aio_cb->ret != reqdata->segreq->total) { +        reqdata->aio_cb->ret = -EIO; +    } else if (reqdata->aio_cb->ret == reqdata->segreq->total) { +        reqdata->aio_cb->ret = 0; +    } +    reqdata->aio_cb->bh = aio_bh_new( +                        bdrv_get_aio_context(reqdata->aio_cb->common.bs), +                        qemu_archipelago_complete_aio, reqdata +                        ); +    qemu_bh_schedule(reqdata->aio_cb->bh); +} + +static int wait_reply(struct xseg *xseg, xport 
srcport, struct xseg_port *port, +                      struct xseg_request *expected_req) +{ +    struct xseg_request *req; +    xseg_prepare_wait(xseg, srcport); +    void *psd = xseg_get_signal_desc(xseg, port); +    while (1) { +        req = xseg_receive(xseg, srcport, X_NONBLOCK); +        if (req) { +            if (req != expected_req) { +                archipelagolog("Unknown received request\n"); +                xseg_put_request(xseg, req, srcport); +            } else if (!(req->state & XS_SERVED)) { +                return -1; +            } else { +                break; +            } +        } +        xseg_wait_signal(xseg, psd, 100000UL); +    } +    xseg_cancel_wait(xseg, srcport); +    return 0; +} + +static void xseg_request_handler(void *state) +{ +    BDRVArchipelagoState *s = (BDRVArchipelagoState *) state; +    void *psd = xseg_get_signal_desc(s->xseg, s->port); +    qemu_mutex_lock(&s->request_mutex); + +    while (!s->stopping) { +        struct xseg_request *req; +        void *data; +        xseg_prepare_wait(s->xseg, s->srcport); +        req = xseg_receive(s->xseg, s->srcport, X_NONBLOCK); +        if (req) { +            AIORequestData *reqdata; +            ArchipelagoSegmentedRequest *segreq; +            xseg_get_req_data(s->xseg, req, (void **)&reqdata); + +            switch (reqdata->op) { +            case ARCHIP_OP_READ: +                data = xseg_get_data(s->xseg, req); +                segreq = reqdata->segreq; +                segreq->count += req->serviced; + +                qemu_iovec_from_buf(reqdata->aio_cb->qiov, reqdata->bufidx, +                                    data, +                                    req->serviced); + +                xseg_put_request(s->xseg, req, s->srcport); + +                if (atomic_fetch_dec(&segreq->ref) == 1) { +                    if (!segreq->failed) { +                        reqdata->aio_cb->ret = segreq->count; +                        archipelago_finish_aiocb(reqdata); +                        g_free(segreq); +                    } else { +                        g_free(segreq); +                        g_free(reqdata); +                    } +                } else { +                    g_free(reqdata); +                } +                break; +            case ARCHIP_OP_WRITE: +            case ARCHIP_OP_FLUSH: +                segreq = reqdata->segreq; +                segreq->count += req->serviced; +                xseg_put_request(s->xseg, req, s->srcport); + +                if (atomic_fetch_dec(&segreq->ref) == 1) { +                    if (!segreq->failed) { +                        reqdata->aio_cb->ret = segreq->count; +                        archipelago_finish_aiocb(reqdata); +                        g_free(segreq); +                    } else { +                        g_free(segreq); +                        g_free(reqdata); +                    } +                } else { +                    g_free(reqdata); +                } +                break; +            case ARCHIP_OP_VOLINFO: +            case ARCHIP_OP_TRUNCATE: +                s->is_signaled = true; +                qemu_cond_signal(&s->archip_cond); +                break; +            } +        } else { +            xseg_wait_signal(s->xseg, psd, 100000UL); +        } +        xseg_cancel_wait(s->xseg, s->srcport); +    } + +    s->th_is_signaled = true; +    qemu_cond_signal(&s->request_cond); +    qemu_mutex_unlock(&s->request_mutex); +    qemu_thread_exit(NULL); +} + +static int 
qemu_archipelago_xseg_init(BDRVArchipelagoState *s) +{ +    if (xseg_initialize()) { +        archipelagolog("Cannot initialize XSEG\n"); +        goto err_exit; +    } + +    s->xseg = xseg_join("posix", s->segment_name, +                        "posixfd", NULL); +    if (!s->xseg) { +        archipelagolog("Cannot join XSEG shared memory segment\n"); +        goto err_exit; +    } +    s->port = xseg_bind_dynport(s->xseg); +    s->srcport = s->port->portno; +    init_local_signal(s->xseg, s->sport, s->srcport); +    return 0; + +err_exit: +    return -1; +} + +static int qemu_archipelago_init(BDRVArchipelagoState *s) +{ +    int ret; + +    ret = qemu_archipelago_xseg_init(s); +    if (ret < 0) { +        error_report("Cannot initialize XSEG. Aborting..."); +        goto err_exit; +    } + +    qemu_cond_init(&s->archip_cond); +    qemu_mutex_init(&s->archip_mutex); +    qemu_cond_init(&s->request_cond); +    qemu_mutex_init(&s->request_mutex); +    s->th_is_signaled = false; +    qemu_thread_create(&s->request_th, "xseg_io_th", +                       (void *) xseg_request_handler, +                       (void *) s, QEMU_THREAD_JOINABLE); + +err_exit: +    return ret; +} + +static void qemu_archipelago_complete_aio(void *opaque) +{ +    AIORequestData *reqdata = (AIORequestData *) opaque; +    ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb; + +    qemu_bh_delete(aio_cb->bh); +    aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret); +    aio_cb->status = 0; + +    qemu_aio_unref(aio_cb); +    g_free(reqdata); +} + +static void xseg_find_port(char *pstr, const char *needle, xport *aport) +{ +    const char *a; +    char *endptr = NULL; +    unsigned long port; +    if (strstart(pstr, needle, &a)) { +        if (strlen(a) > 0) { +            port = strtoul(a, &endptr, 10); +            if (strlen(endptr)) { +                *aport = -2; +                return; +            } +            *aport = (xport) port; +        } +    } +} + +static void xseg_find_segment(char *pstr, const char *needle, +                              char **segment_name) +{ +    const char *a; +    if (strstart(pstr, needle, &a)) { +        if (strlen(a) > 0) { +            *segment_name = g_strdup(a); +        } +    } +} + +static void parse_filename_opts(const char *filename, Error **errp, +                                char **volume, char **segment_name, +                                xport *mport, xport *vport) +{ +    const char *start; +    char *tokens[4], *ds; +    int idx; +    xport lmport = NoPort, lvport = NoPort; + +    strstart(filename, "archipelago:", &start); + +    ds = g_strdup(start); +    tokens[0] = strtok(ds, "/"); +    tokens[1] = strtok(NULL, ":"); +    tokens[2] = strtok(NULL, ":"); +    tokens[3] = strtok(NULL, "\0"); + +    if (!strlen(tokens[0])) { +        error_setg(errp, "volume name must be specified first"); +        g_free(ds); +        return; +    } + +    for (idx = 1; idx < 4; idx++) { +        if (tokens[idx] != NULL) { +            if (strstart(tokens[idx], "mport=", NULL)) { +                xseg_find_port(tokens[idx], "mport=", &lmport); +            } +            if (strstart(tokens[idx], "vport=", NULL)) { +                xseg_find_port(tokens[idx], "vport=", &lvport); +            } +            if (strstart(tokens[idx], "segment=", NULL)) { +                xseg_find_segment(tokens[idx], "segment=", segment_name); +            } +        } +    } + +    if ((lmport == -2) || (lvport == -2)) { +        error_setg(errp, "mport and/or vport 
must be set"); +        g_free(ds); +        return; +    } +    *volume = g_strdup(tokens[0]); +    *mport = lmport; +    *vport = lvport; +    g_free(ds); +} + +static void archipelago_parse_filename(const char *filename, QDict *options, +                                       Error **errp) +{ +    const char *start; +    char *volume = NULL, *segment_name = NULL; +    xport mport = NoPort, vport = NoPort; + +    if (qdict_haskey(options, ARCHIPELAGO_OPT_VOLUME) +            || qdict_haskey(options, ARCHIPELAGO_OPT_SEGMENT) +            || qdict_haskey(options, ARCHIPELAGO_OPT_MPORT) +            || qdict_haskey(options, ARCHIPELAGO_OPT_VPORT)) { +        error_setg(errp, "volume/mport/vport/segment and a file name may not" +                         " be specified at the same time"); +        return; +    } + +    if (!strstart(filename, "archipelago:", &start)) { +        error_setg(errp, "File name must start with 'archipelago:'"); +        return; +    } + +    if (!strlen(start) || strstart(start, "/", NULL)) { +        error_setg(errp, "volume name must be specified"); +        return; +    } + +    parse_filename_opts(filename, errp, &volume, &segment_name, &mport, &vport); + +    if (volume) { +        qdict_put(options, ARCHIPELAGO_OPT_VOLUME, qstring_from_str(volume)); +        g_free(volume); +    } +    if (segment_name) { +        qdict_put(options, ARCHIPELAGO_OPT_SEGMENT, +                  qstring_from_str(segment_name)); +        g_free(segment_name); +    } +    if (mport != NoPort) { +        qdict_put(options, ARCHIPELAGO_OPT_MPORT, qint_from_int(mport)); +    } +    if (vport != NoPort) { +        qdict_put(options, ARCHIPELAGO_OPT_VPORT, qint_from_int(vport)); +    } +} + +static QemuOptsList archipelago_runtime_opts = { +    .name = "archipelago", +    .head = QTAILQ_HEAD_INITIALIZER(archipelago_runtime_opts.head), +    .desc = { +        { +            .name = ARCHIPELAGO_OPT_VOLUME, +            .type = QEMU_OPT_STRING, +            .help = "Name of the volume image", +        }, +        { +            .name = ARCHIPELAGO_OPT_SEGMENT, +            .type = QEMU_OPT_STRING, +            .help = "Name of the Archipelago shared memory segment", +        }, +        { +            .name = ARCHIPELAGO_OPT_MPORT, +            .type = QEMU_OPT_NUMBER, +            .help = "Archipelago mapperd port number" +        }, +        { +            .name = ARCHIPELAGO_OPT_VPORT, +            .type = QEMU_OPT_NUMBER, +            .help = "Archipelago vlmcd port number" + +        }, +        { /* end of list */ } +    }, +}; + +static int qemu_archipelago_open(BlockDriverState *bs, +                                 QDict *options, +                                 int bdrv_flags, +                                 Error **errp) +{ +    int ret = 0; +    const char *volume, *segment_name; +    QemuOpts *opts; +    Error *local_err = NULL; +    BDRVArchipelagoState *s = bs->opaque; + +    opts = qemu_opts_create(&archipelago_runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto err_exit; +    } + +    s->mportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_MPORT, +                                     ARCHIPELAGO_DFL_MPORT); +    s->vportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_VPORT, +                                     ARCHIPELAGO_DFL_VPORT); + +    segment_name = qemu_opt_get(opts, ARCHIPELAGO_OPT_SEGMENT); +    if (segment_name == 
NULL) { +        s->segment_name = g_strdup("archipelago"); +    } else { +        s->segment_name = g_strdup(segment_name); +    } + +    volume = qemu_opt_get(opts, ARCHIPELAGO_OPT_VOLUME); +    if (volume == NULL) { +        error_setg(errp, "archipelago block driver requires the 'volume'" +                   " option"); +        ret = -EINVAL; +        goto err_exit; +    } +    s->volname = g_strdup(volume); + +    /* Initialize XSEG, join shared memory segment */ +    ret = qemu_archipelago_init(s); +    if (ret < 0) { +        error_setg(errp, "cannot initialize XSEG and join shared " +                   "memory segment"); +        goto err_exit; +    } + +    qemu_opts_del(opts); +    return 0; + +err_exit: +    g_free(s->volname); +    g_free(s->segment_name); +    qemu_opts_del(opts); +    return ret; +} + +static void qemu_archipelago_close(BlockDriverState *bs) +{ +    int r, targetlen; +    char *target; +    struct xseg_request *req; +    BDRVArchipelagoState *s = bs->opaque; + +    s->stopping = true; + +    qemu_mutex_lock(&s->request_mutex); +    while (!s->th_is_signaled) { +        qemu_cond_wait(&s->request_cond, +                       &s->request_mutex); +    } +    qemu_mutex_unlock(&s->request_mutex); +    qemu_thread_join(&s->request_th); +    qemu_cond_destroy(&s->request_cond); +    qemu_mutex_destroy(&s->request_mutex); + +    qemu_cond_destroy(&s->archip_cond); +    qemu_mutex_destroy(&s->archip_mutex); + +    targetlen = strlen(s->volname); +    req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); +    if (!req) { +        archipelagolog("Cannot get XSEG request\n"); +        goto err_exit; +    } +    r = xseg_prep_request(s->xseg, req, targetlen, 0); +    if (r < 0) { +        xseg_put_request(s->xseg, req, s->srcport); +        archipelagolog("Cannot prepare XSEG close request\n"); +        goto err_exit; +    } + +    target = xseg_get_target(s->xseg, req); +    memcpy(target, s->volname, targetlen); +    req->size = req->datalen; +    req->offset = 0; +    req->op = X_CLOSE; + +    xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); +    if (p == NoPort) { +        xseg_put_request(s->xseg, req, s->srcport); +        archipelagolog("Cannot submit XSEG close request\n"); +        goto err_exit; +    } + +    xseg_signal(s->xseg, p); +    wait_reply(s->xseg, s->srcport, s->port, req); + +    xseg_put_request(s->xseg, req, s->srcport); + +err_exit: +    g_free(s->volname); +    g_free(s->segment_name); +    xseg_quit_local_signal(s->xseg, s->srcport); +    xseg_leave_dynport(s->xseg, s->port); +    xseg_leave(s->xseg); +} + +static int qemu_archipelago_create_volume(Error **errp, const char *volname, +                                          char *segment_name, +                                          uint64_t size, xport mportno, +                                          xport vportno) +{ +    int ret, targetlen; +    struct xseg *xseg = NULL; +    struct xseg_request *req; +    struct xseg_request_clone *xclone; +    struct xseg_port *port; +    xport srcport = NoPort, sport = NoPort; +    char *target; + +    /* Try default values if none has been set */ +    if (mportno == (xport) -1) { +        mportno = ARCHIPELAGO_DFL_MPORT; +    } + +    if (vportno == (xport) -1) { +        vportno = ARCHIPELAGO_DFL_VPORT; +    } + +    if (xseg_initialize()) { +        error_setg(errp, "Cannot initialize XSEG"); +        return -1; +    } + +    xseg = xseg_join("posix", segment_name, +                     "posixfd", NULL); + +    if (!xseg) 
{ +        error_setg(errp, "Cannot join XSEG shared memory segment"); +        return -1; +    } + +    port = xseg_bind_dynport(xseg); +    srcport = port->portno; +    init_local_signal(xseg, sport, srcport); + +    req = xseg_get_request(xseg, srcport, mportno, X_ALLOC); +    if (!req) { +        error_setg(errp, "Cannot get XSEG request"); +        return -1; +    } + +    targetlen = strlen(volname); +    ret = xseg_prep_request(xseg, req, targetlen, +                            sizeof(struct xseg_request_clone)); +    if (ret < 0) { +        error_setg(errp, "Cannot prepare XSEG request"); +        goto err_exit; +    } + +    target = xseg_get_target(xseg, req); +    if (!target) { +        error_setg(errp, "Cannot get XSEG target."); +        goto err_exit; +    } +    memcpy(target, volname, targetlen); +    xclone = (struct xseg_request_clone *) xseg_get_data(xseg, req); +    memset(xclone->target, 0 , XSEG_MAX_TARGETLEN); +    xclone->targetlen = 0; +    xclone->size = size; +    req->offset = 0; +    req->size = req->datalen; +    req->op = X_CLONE; + +    xport p = xseg_submit(xseg, req, srcport, X_ALLOC); +    if (p == NoPort) { +        error_setg(errp, "Could not submit XSEG request"); +        goto err_exit; +    } +    xseg_signal(xseg, p); + +    ret = wait_reply(xseg, srcport, port, req); +    if (ret < 0) { +        error_setg(errp, "wait_reply() error."); +    } + +    xseg_put_request(xseg, req, srcport); +    xseg_quit_local_signal(xseg, srcport); +    xseg_leave_dynport(xseg, port); +    xseg_leave(xseg); +    return ret; + +err_exit: +    xseg_put_request(xseg, req, srcport); +    xseg_quit_local_signal(xseg, srcport); +    xseg_leave_dynport(xseg, port); +    xseg_leave(xseg); +    return -1; +} + +static int qemu_archipelago_create(const char *filename, +                                   QemuOpts *options, +                                   Error **errp) +{ +    int ret = 0; +    uint64_t total_size = 0; +    char *volname = NULL, *segment_name = NULL; +    const char *start; +    xport mport = NoPort, vport = NoPort; + +    if (!strstart(filename, "archipelago:", &start)) { +        error_setg(errp, "File name must start with 'archipelago:'"); +        return -1; +    } + +    if (!strlen(start) || strstart(start, "/", NULL)) { +        error_setg(errp, "volume name must be specified"); +        return -1; +    } + +    parse_filename_opts(filename, errp, &volname, &segment_name, &mport, +                        &vport); +    total_size = ROUND_UP(qemu_opt_get_size_del(options, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); + +    if (segment_name == NULL) { +        segment_name = g_strdup("archipelago"); +    } + +    /* Create an Archipelago volume */ +    ret = qemu_archipelago_create_volume(errp, volname, segment_name, +                                         total_size, mport, +                                         vport); + +    g_free(volname); +    g_free(segment_name); +    return ret; +} + +static const AIOCBInfo archipelago_aiocb_info = { +    .aiocb_size = sizeof(ArchipelagoAIOCB), +}; + +static int archipelago_submit_request(BDRVArchipelagoState *s, +                                        uint64_t bufidx, +                                        size_t count, +                                        off_t offset, +                                        ArchipelagoAIOCB *aio_cb, +                                        ArchipelagoSegmentedRequest *segreq, +                                        int op) +{ +    int ret, 
targetlen; +    char *target; +    void *data = NULL; +    struct xseg_request *req; +    AIORequestData *reqdata = g_new(AIORequestData, 1); + +    targetlen = strlen(s->volname); +    req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); +    if (!req) { +        archipelagolog("Cannot get XSEG request\n"); +        goto err_exit2; +    } +    ret = xseg_prep_request(s->xseg, req, targetlen, count); +    if (ret < 0) { +        archipelagolog("Cannot prepare XSEG request\n"); +        goto err_exit; +    } +    target = xseg_get_target(s->xseg, req); +    if (!target) { +        archipelagolog("Cannot get XSEG target\n"); +        goto err_exit; +    } +    memcpy(target, s->volname, targetlen); +    req->size = count; +    req->offset = offset; + +    switch (op) { +    case ARCHIP_OP_READ: +        req->op = X_READ; +        break; +    case ARCHIP_OP_WRITE: +        req->op = X_WRITE; +        break; +    case ARCHIP_OP_FLUSH: +        req->op = X_FLUSH; +        break; +    } +    reqdata->volname = s->volname; +    reqdata->offset = offset; +    reqdata->size = count; +    reqdata->bufidx = bufidx; +    reqdata->aio_cb = aio_cb; +    reqdata->segreq = segreq; +    reqdata->op = op; + +    xseg_set_req_data(s->xseg, req, reqdata); +    if (op == ARCHIP_OP_WRITE) { +        data = xseg_get_data(s->xseg, req); +        if (!data) { +            archipelagolog("Cannot get XSEG data\n"); +            goto err_exit; +        } +        qemu_iovec_to_buf(aio_cb->qiov, bufidx, data, count); +    } + +    xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); +    if (p == NoPort) { +        archipelagolog("Could not submit XSEG request\n"); +        goto err_exit; +    } +    xseg_signal(s->xseg, p); +    return 0; + +err_exit: +    g_free(reqdata); +    xseg_put_request(s->xseg, req, s->srcport); +    return -EIO; +err_exit2: +    g_free(reqdata); +    return -EIO; +} + +static int archipelago_aio_segmented_rw(BDRVArchipelagoState *s, +                                        size_t count, +                                        off_t offset, +                                        ArchipelagoAIOCB *aio_cb, +                                        int op) +{ +    int ret, segments_nr; +    size_t pos = 0; +    ArchipelagoSegmentedRequest *segreq; + +    segreq = g_new0(ArchipelagoSegmentedRequest, 1); + +    if (op == ARCHIP_OP_FLUSH) { +        segments_nr = 1; +    } else { +        segments_nr = (int)(count / MAX_REQUEST_SIZE) + \ +                      ((count % MAX_REQUEST_SIZE) ? 
1 : 0); +    } +    segreq->total = count; +    atomic_mb_set(&segreq->ref, segments_nr); + +    while (segments_nr > 1) { +        ret = archipelago_submit_request(s, pos, +                                            MAX_REQUEST_SIZE, +                                            offset + pos, +                                            aio_cb, segreq, op); + +        if (ret < 0) { +            goto err_exit; +        } +        count -= MAX_REQUEST_SIZE; +        pos += MAX_REQUEST_SIZE; +        segments_nr--; +    } +    ret = archipelago_submit_request(s, pos, count, offset + pos, +                                     aio_cb, segreq, op); + +    if (ret < 0) { +        goto err_exit; +    } +    return 0; + +err_exit: +    segreq->failed = 1; +    if (atomic_fetch_sub(&segreq->ref, segments_nr) == segments_nr) { +        g_free(segreq); +    } +    return ret; +} + +static BlockAIOCB *qemu_archipelago_aio_rw(BlockDriverState *bs, +                                           int64_t sector_num, +                                           QEMUIOVector *qiov, +                                           int nb_sectors, +                                           BlockCompletionFunc *cb, +                                           void *opaque, +                                           int op) +{ +    ArchipelagoAIOCB *aio_cb; +    BDRVArchipelagoState *s = bs->opaque; +    int64_t size, off; +    int ret; + +    aio_cb = qemu_aio_get(&archipelago_aiocb_info, bs, cb, opaque); +    aio_cb->cmd = op; +    aio_cb->qiov = qiov; + +    aio_cb->ret = 0; +    aio_cb->s = s; +    aio_cb->status = -EINPROGRESS; + +    off = sector_num * BDRV_SECTOR_SIZE; +    size = nb_sectors * BDRV_SECTOR_SIZE; +    aio_cb->size = size; + +    ret = archipelago_aio_segmented_rw(s, size, off, +                                       aio_cb, op); +    if (ret < 0) { +        goto err_exit; +    } +    return &aio_cb->common; + +err_exit: +    error_report("qemu_archipelago_aio_rw(): I/O Error"); +    qemu_aio_unref(aio_cb); +    return NULL; +} + +static BlockAIOCB *qemu_archipelago_aio_readv(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, +                                   opaque, ARCHIP_OP_READ); +} + +static BlockAIOCB *qemu_archipelago_aio_writev(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, +                                   opaque, ARCHIP_OP_WRITE); +} + +static int64_t archipelago_volume_info(BDRVArchipelagoState *s) +{ +    uint64_t size; +    int ret, targetlen; +    struct xseg_request *req; +    struct xseg_reply_info *xinfo; +    AIORequestData *reqdata = g_new(AIORequestData, 1); + +    const char *volname = s->volname; +    targetlen = strlen(volname); +    req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); +    if (!req) { +        archipelagolog("Cannot get XSEG request\n"); +        goto err_exit2; +    } +    ret = xseg_prep_request(s->xseg, req, targetlen, +                            sizeof(struct xseg_reply_info)); +    if (ret < 0) { +        archipelagolog("Cannot prepare XSEG request\n"); +        goto err_exit; +    } +    char *target = xseg_get_target(s->xseg, req); +    if (!target) { +        archipelagolog("Cannot get XSEG target\n"); +        
goto err_exit; +    } +    memcpy(target, volname, targetlen); +    req->size = req->datalen; +    req->offset = 0; +    req->op = X_INFO; + +    reqdata->op = ARCHIP_OP_VOLINFO; +    reqdata->volname = volname; +    xseg_set_req_data(s->xseg, req, reqdata); + +    xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); +    if (p == NoPort) { +        archipelagolog("Cannot submit XSEG request\n"); +        goto err_exit; +    } +    xseg_signal(s->xseg, p); +    qemu_mutex_lock(&s->archip_mutex); +    while (!s->is_signaled) { +        qemu_cond_wait(&s->archip_cond, &s->archip_mutex); +    } +    s->is_signaled = false; +    qemu_mutex_unlock(&s->archip_mutex); + +    xinfo = (struct xseg_reply_info *) xseg_get_data(s->xseg, req); +    size = xinfo->size; +    xseg_put_request(s->xseg, req, s->srcport); +    g_free(reqdata); +    s->size = size; +    return size; + +err_exit: +    xseg_put_request(s->xseg, req, s->srcport); +err_exit2: +    g_free(reqdata); +    return -EIO; +} + +static int64_t qemu_archipelago_getlength(BlockDriverState *bs) +{ +    int64_t ret; +    BDRVArchipelagoState *s = bs->opaque; + +    ret = archipelago_volume_info(s); +    return ret; +} + +static int qemu_archipelago_truncate(BlockDriverState *bs, int64_t offset) +{ +    int ret, targetlen; +    struct xseg_request *req; +    BDRVArchipelagoState *s = bs->opaque; +    AIORequestData *reqdata = g_new(AIORequestData, 1); + +    const char *volname = s->volname; +    targetlen = strlen(volname); +    req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); +    if (!req) { +        archipelagolog("Cannot get XSEG request\n"); +        goto err_exit2; +    } + +    ret = xseg_prep_request(s->xseg, req, targetlen, 0); +    if (ret < 0) { +        archipelagolog("Cannot prepare XSEG request\n"); +        goto err_exit; +    } +    char *target = xseg_get_target(s->xseg, req); +    if (!target) { +        archipelagolog("Cannot get XSEG target\n"); +        goto err_exit; +    } +    memcpy(target, volname, targetlen); +    req->offset = offset; +    req->op = X_TRUNCATE; + +    reqdata->op = ARCHIP_OP_TRUNCATE; +    reqdata->volname = volname; + +    xseg_set_req_data(s->xseg, req, reqdata); + +    xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); +    if (p == NoPort) { +        archipelagolog("Cannot submit XSEG request\n"); +        goto err_exit; +    } + +    xseg_signal(s->xseg, p); +    qemu_mutex_lock(&s->archip_mutex); +    while (!s->is_signaled) { +        qemu_cond_wait(&s->archip_cond, &s->archip_mutex); +    } +    s->is_signaled = false; +    qemu_mutex_unlock(&s->archip_mutex); +    xseg_put_request(s->xseg, req, s->srcport); +    g_free(reqdata); +    return 0; + +err_exit: +    xseg_put_request(s->xseg, req, s->srcport); +err_exit2: +    g_free(reqdata); +    return -EIO; +} + +static QemuOptsList qemu_archipelago_create_opts = { +    .name = "archipelago-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(qemu_archipelago_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { /* end of list */ } +    } +}; + +static BlockAIOCB *qemu_archipelago_aio_flush(BlockDriverState *bs, +        BlockCompletionFunc *cb, void *opaque) +{ +    return qemu_archipelago_aio_rw(bs, 0, NULL, 0, cb, opaque, +                                   ARCHIP_OP_FLUSH); +} + +static BlockDriver bdrv_archipelago = { +    .format_name         = "archipelago", +    .protocol_name     
  = "archipelago", +    .instance_size       = sizeof(BDRVArchipelagoState), +    .bdrv_parse_filename = archipelago_parse_filename, +    .bdrv_file_open      = qemu_archipelago_open, +    .bdrv_close          = qemu_archipelago_close, +    .bdrv_create         = qemu_archipelago_create, +    .bdrv_getlength      = qemu_archipelago_getlength, +    .bdrv_truncate       = qemu_archipelago_truncate, +    .bdrv_aio_readv      = qemu_archipelago_aio_readv, +    .bdrv_aio_writev     = qemu_archipelago_aio_writev, +    .bdrv_aio_flush      = qemu_archipelago_aio_flush, +    .bdrv_has_zero_init  = bdrv_has_zero_init_1, +    .create_opts         = &qemu_archipelago_create_opts, +}; + +static void bdrv_archipelago_init(void) +{ +    bdrv_register(&bdrv_archipelago); +} + +block_init(bdrv_archipelago_init); diff --git a/block/backup.c b/block/backup.c new file mode 100644 index 00000000..965654d5 --- /dev/null +++ b/block/backup.c @@ -0,0 +1,549 @@ +/* + * QEMU backup + * + * Copyright (C) 2013 Proxmox Server Solutions + * + * Authors: + *  Dietmar Maurer (dietmar@proxmox.com) + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include <stdio.h> +#include <errno.h> +#include <unistd.h> + +#include "trace.h" +#include "block/block.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "qapi/qmp/qerror.h" +#include "qemu/ratelimit.h" + +#define BACKUP_CLUSTER_BITS 16 +#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS) +#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE) + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct CowRequest { +    int64_t start; +    int64_t end; +    QLIST_ENTRY(CowRequest) list; +    CoQueue wait_queue; /* coroutines blocked on this request */ +} CowRequest; + +typedef struct BackupBlockJob { +    BlockJob common; +    BlockDriverState *target; +    /* bitmap for sync=incremental */ +    BdrvDirtyBitmap *sync_bitmap; +    MirrorSyncMode sync_mode; +    RateLimit limit; +    BlockdevOnError on_source_error; +    BlockdevOnError on_target_error; +    CoRwlock flush_rwlock; +    uint64_t sectors_read; +    HBitmap *bitmap; +    QLIST_HEAD(, CowRequest) inflight_reqs; +} BackupBlockJob; + +/* See if in-flight requests overlap and wait for them to complete */ +static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, +                                                       int64_t start, +                                                       int64_t end) +{ +    CowRequest *req; +    bool retry; + +    do { +        retry = false; +        QLIST_FOREACH(req, &job->inflight_reqs, list) { +            if (end > req->start && start < req->end) { +                qemu_co_queue_wait(&req->wait_queue); +                retry = true; +                break; +            } +        } +    } while (retry); +} + +/* Keep track of an in-flight request */ +static void cow_request_begin(CowRequest *req, BackupBlockJob *job, +                                     int64_t start, int64_t end) +{ +    req->start = start; +    req->end = end; +    qemu_co_queue_init(&req->wait_queue); +    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list); +} + +/* Forget about a completed request */ +static void cow_request_end(CowRequest *req) +{ +    QLIST_REMOVE(req, list); +    qemu_co_queue_restart_all(&req->wait_queue); +} + +static int coroutine_fn backup_do_cow(BlockDriverState *bs, +                                      int64_t sector_num, 
int nb_sectors, +                                      bool *error_is_read) +{ +    BackupBlockJob *job = (BackupBlockJob *)bs->job; +    CowRequest cow_request; +    struct iovec iov; +    QEMUIOVector bounce_qiov; +    void *bounce_buffer = NULL; +    int ret = 0; +    int64_t start, end; +    int n; + +    qemu_co_rwlock_rdlock(&job->flush_rwlock); + +    start = sector_num / BACKUP_SECTORS_PER_CLUSTER; +    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER); + +    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors); + +    wait_for_overlapping_requests(job, start, end); +    cow_request_begin(&cow_request, job, start, end); + +    for (; start < end; start++) { +        if (hbitmap_get(job->bitmap, start)) { +            trace_backup_do_cow_skip(job, start); +            continue; /* already copied */ +        } + +        trace_backup_do_cow_process(job, start); + +        n = MIN(BACKUP_SECTORS_PER_CLUSTER, +                job->common.len / BDRV_SECTOR_SIZE - +                start * BACKUP_SECTORS_PER_CLUSTER); + +        if (!bounce_buffer) { +            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE); +        } +        iov.iov_base = bounce_buffer; +        iov.iov_len = n * BDRV_SECTOR_SIZE; +        qemu_iovec_init_external(&bounce_qiov, &iov, 1); + +        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n, +                            &bounce_qiov); +        if (ret < 0) { +            trace_backup_do_cow_read_fail(job, start, ret); +            if (error_is_read) { +                *error_is_read = true; +            } +            goto out; +        } + +        if (buffer_is_zero(iov.iov_base, iov.iov_len)) { +            ret = bdrv_co_write_zeroes(job->target, +                                       start * BACKUP_SECTORS_PER_CLUSTER, +                                       n, BDRV_REQ_MAY_UNMAP); +        } else { +            ret = bdrv_co_writev(job->target, +                                 start * BACKUP_SECTORS_PER_CLUSTER, n, +                                 &bounce_qiov); +        } +        if (ret < 0) { +            trace_backup_do_cow_write_fail(job, start, ret); +            if (error_is_read) { +                *error_is_read = false; +            } +            goto out; +        } + +        hbitmap_set(job->bitmap, start, 1); + +        /* Publish progress, guest I/O counts as progress too.  Note that the +         * offset field is an opaque progress value, it is not a disk offset. 
+         */ +        job->sectors_read += n; +        job->common.offset += n * BDRV_SECTOR_SIZE; +    } + +out: +    if (bounce_buffer) { +        qemu_vfree(bounce_buffer); +    } + +    cow_request_end(&cow_request); + +    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret); + +    qemu_co_rwlock_unlock(&job->flush_rwlock); + +    return ret; +} + +static int coroutine_fn backup_before_write_notify( +        NotifierWithReturn *notifier, +        void *opaque) +{ +    BdrvTrackedRequest *req = opaque; +    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS; +    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS; + +    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); +    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + +    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL); +} + +static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ +    BackupBlockJob *s = container_of(job, BackupBlockJob, common); + +    if (speed < 0) { +        error_setg(errp, QERR_INVALID_PARAMETER, "speed"); +        return; +    } +    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static void backup_iostatus_reset(BlockJob *job) +{ +    BackupBlockJob *s = container_of(job, BackupBlockJob, common); + +    bdrv_iostatus_reset(s->target); +} + +static const BlockJobDriver backup_job_driver = { +    .instance_size  = sizeof(BackupBlockJob), +    .job_type       = BLOCK_JOB_TYPE_BACKUP, +    .set_speed      = backup_set_speed, +    .iostatus_reset = backup_iostatus_reset, +}; + +static BlockErrorAction backup_error_action(BackupBlockJob *job, +                                            bool read, int error) +{ +    if (read) { +        return block_job_error_action(&job->common, job->common.bs, +                                      job->on_source_error, true, error); +    } else { +        return block_job_error_action(&job->common, job->target, +                                      job->on_target_error, false, error); +    } +} + +typedef struct { +    int ret; +} BackupCompleteData; + +static void backup_complete(BlockJob *job, void *opaque) +{ +    BackupBlockJob *s = container_of(job, BackupBlockJob, common); +    BackupCompleteData *data = opaque; + +    bdrv_unref(s->target); + +    block_job_completed(job, data->ret); +    g_free(data); +} + +static bool coroutine_fn yield_and_check(BackupBlockJob *job) +{ +    if (block_job_is_cancelled(&job->common)) { +        return true; +    } + +    /* we need to yield so that bdrv_drain_all() returns. 
+     * (without, VM does not reboot) +     */ +    if (job->common.speed) { +        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, +                                                      job->sectors_read); +        job->sectors_read = 0; +        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); +    } else { +        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); +    } + +    if (block_job_is_cancelled(&job->common)) { +        return true; +    } + +    return false; +} + +static int coroutine_fn backup_run_incremental(BackupBlockJob *job) +{ +    bool error_is_read; +    int ret = 0; +    int clusters_per_iter; +    uint32_t granularity; +    int64_t sector; +    int64_t cluster; +    int64_t end; +    int64_t last_cluster = -1; +    BlockDriverState *bs = job->common.bs; +    HBitmapIter hbi; + +    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); +    clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); +    bdrv_dirty_iter_init(job->sync_bitmap, &hbi); + +    /* Find the next dirty sector(s) */ +    while ((sector = hbitmap_iter_next(&hbi)) != -1) { +        cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + +        /* Fake progress updates for any clusters we skipped */ +        if (cluster != last_cluster + 1) { +            job->common.offset += ((cluster - last_cluster - 1) * +                                   BACKUP_CLUSTER_SIZE); +        } + +        for (end = cluster + clusters_per_iter; cluster < end; cluster++) { +            do { +                if (yield_and_check(job)) { +                    return ret; +                } +                ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, +                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read); +                if ((ret < 0) && +                    backup_error_action(job, error_is_read, -ret) == +                    BLOCK_ERROR_ACTION_REPORT) { +                    return ret; +                } +            } while (ret < 0); +        } + +        /* If the bitmap granularity is smaller than the backup granularity, +         * we need to advance the iterator pointer to the next cluster. 
*/ +        if (granularity < BACKUP_CLUSTER_SIZE) { +            bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); +        } + +        last_cluster = cluster - 1; +    } + +    /* Play some final catchup with the progress meter */ +    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); +    if (last_cluster + 1 < end) { +        job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); +    } + +    return ret; +} + +static void coroutine_fn backup_run(void *opaque) +{ +    BackupBlockJob *job = opaque; +    BackupCompleteData *data; +    BlockDriverState *bs = job->common.bs; +    BlockDriverState *target = job->target; +    BlockdevOnError on_target_error = job->on_target_error; +    NotifierWithReturn before_write = { +        .notify = backup_before_write_notify, +    }; +    int64_t start, end; +    int ret = 0; + +    QLIST_INIT(&job->inflight_reqs); +    qemu_co_rwlock_init(&job->flush_rwlock); + +    start = 0; +    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + +    job->bitmap = hbitmap_alloc(end, 0); + +    bdrv_set_enable_write_cache(target, true); +    bdrv_set_on_error(target, on_target_error, on_target_error); +    bdrv_iostatus_enable(target); + +    bdrv_add_before_write_notifier(bs, &before_write); + +    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) { +        while (!block_job_is_cancelled(&job->common)) { +            /* Yield until the job is cancelled.  We just let our before_write +             * notify callback service CoW requests. */ +            job->common.busy = false; +            qemu_coroutine_yield(); +            job->common.busy = true; +        } +    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { +        ret = backup_run_incremental(job); +    } else { +        /* Both FULL and TOP SYNC_MODE's require copying.. */ +        for (; start < end; start++) { +            bool error_is_read; +            if (yield_and_check(job)) { +                break; +            } + +            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) { +                int i, n; +                int alloced = 0; + +                /* Check to see if these blocks are already in the +                 * backing file. */ + +                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) { +                    /* bdrv_is_allocated() only returns true/false based +                     * on the first set of sectors it comes across that +                     * are are all in the same state. +                     * For that reason we must verify each sector in the +                     * backup cluster length.  We end up copying more than +                     * needed but at some point that is always the case. */ +                    alloced = +                        bdrv_is_allocated(bs, +                                start * BACKUP_SECTORS_PER_CLUSTER + i, +                                BACKUP_SECTORS_PER_CLUSTER - i, &n); +                    i += n; + +                    if (alloced == 1 || n == 0) { +                        break; +                    } +                } + +                /* If the above loop never found any sectors that are in +                 * the topmost image, skip this backup. */ +                if (alloced == 0) { +                    continue; +                } +            } +            /* FULL sync mode we copy the whole drive. 
*/ +            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER, +                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read); +            if (ret < 0) { +                /* Depending on error action, fail now or retry cluster */ +                BlockErrorAction action = +                    backup_error_action(job, error_is_read, -ret); +                if (action == BLOCK_ERROR_ACTION_REPORT) { +                    break; +                } else { +                    start--; +                    continue; +                } +            } +        } +    } + +    notifier_with_return_remove(&before_write); + +    /* wait until pending backup_do_cow() calls have completed */ +    qemu_co_rwlock_wrlock(&job->flush_rwlock); +    qemu_co_rwlock_unlock(&job->flush_rwlock); + +    if (job->sync_bitmap) { +        BdrvDirtyBitmap *bm; +        if (ret < 0 || block_job_is_cancelled(&job->common)) { +            /* Merge the successor back into the parent, delete nothing. */ +            bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); +            assert(bm); +        } else { +            /* Everything is fine, delete this bitmap and install the backup. */ +            bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); +            assert(bm); +        } +    } +    hbitmap_free(job->bitmap); + +    bdrv_iostatus_disable(target); +    bdrv_op_unblock_all(target, job->common.blocker); + +    data = g_malloc(sizeof(*data)); +    data->ret = ret; +    block_job_defer_to_main_loop(&job->common, backup_complete, data); +} + +void backup_start(BlockDriverState *bs, BlockDriverState *target, +                  int64_t speed, MirrorSyncMode sync_mode, +                  BdrvDirtyBitmap *sync_bitmap, +                  BlockdevOnError on_source_error, +                  BlockdevOnError on_target_error, +                  BlockCompletionFunc *cb, void *opaque, +                  Error **errp) +{ +    int64_t len; + +    assert(bs); +    assert(target); +    assert(cb); + +    if (bs == target) { +        error_setg(errp, "Source and target cannot be the same"); +        return; +    } + +    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || +         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && +        !bdrv_iostatus_is_enabled(bs)) { +        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); +        return; +    } + +    if (!bdrv_is_inserted(bs)) { +        error_setg(errp, "Device is not inserted: %s", +                   bdrv_get_device_name(bs)); +        return; +    } + +    if (!bdrv_is_inserted(target)) { +        error_setg(errp, "Device is not inserted: %s", +                   bdrv_get_device_name(target)); +        return; +    } + +    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) { +        return; +    } + +    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) { +        return; +    } + +    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { +        if (!sync_bitmap) { +            error_setg(errp, "must provide a valid bitmap name for " +                             "\"incremental\" sync mode"); +            return; +        } + +        /* Create a new bitmap, and freeze/disable this one. 
*/ +        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) { +            return; +        } +    } else if (sync_bitmap) { +        error_setg(errp, +                   "a sync_bitmap was provided to backup_run, " +                   "but received an incompatible sync_mode (%s)", +                   MirrorSyncMode_lookup[sync_mode]); +        return; +    } + +    len = bdrv_getlength(bs); +    if (len < 0) { +        error_setg_errno(errp, -len, "unable to get length for '%s'", +                         bdrv_get_device_name(bs)); +        goto error; +    } + +    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, +                                           cb, opaque, errp); +    if (!job) { +        goto error; +    } + +    bdrv_op_block_all(target, job->common.blocker); + +    job->on_source_error = on_source_error; +    job->on_target_error = on_target_error; +    job->target = target; +    job->sync_mode = sync_mode; +    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? +                       sync_bitmap : NULL; +    job->common.len = len; +    job->common.co = qemu_coroutine_create(backup_run); +    qemu_coroutine_enter(job->common.co, job); +    return; + + error: +    if (sync_bitmap) { +        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); +    } +} diff --git a/block/blkdebug.c b/block/blkdebug.c new file mode 100644 index 00000000..bc247f46 --- /dev/null +++ b/block/blkdebug.c @@ -0,0 +1,805 @@ +/* + * Block protocol for I/O error injection + * + * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" + +typedef struct BDRVBlkdebugState { +    int state; +    int new_state; + +    QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX]; +    QSIMPLEQ_HEAD(, BlkdebugRule) active_rules; +    QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs; +} BDRVBlkdebugState; + +typedef struct BlkdebugAIOCB { +    BlockAIOCB common; +    QEMUBH *bh; +    int ret; +} BlkdebugAIOCB; + +typedef struct BlkdebugSuspendedReq { +    Coroutine *co; +    char *tag; +    QLIST_ENTRY(BlkdebugSuspendedReq) next; +} BlkdebugSuspendedReq; + +static const AIOCBInfo blkdebug_aiocb_info = { +    .aiocb_size    = sizeof(BlkdebugAIOCB), +}; + +enum { +    ACTION_INJECT_ERROR, +    ACTION_SET_STATE, +    ACTION_SUSPEND, +}; + +typedef struct BlkdebugRule { +    BlkDebugEvent event; +    int action; +    int state; +    union { +        struct { +            int error; +            int immediately; +            int once; +            int64_t sector; +        } inject; +        struct { +            int new_state; +        } set_state; +        struct { +            char *tag; +        } suspend; +    } options; +    QLIST_ENTRY(BlkdebugRule) next; +    QSIMPLEQ_ENTRY(BlkdebugRule) active_next; +} BlkdebugRule; + +static QemuOptsList inject_error_opts = { +    .name = "inject-error", +    .head = QTAILQ_HEAD_INITIALIZER(inject_error_opts.head), +    .desc = { +        { +            .name = "event", +            .type = QEMU_OPT_STRING, +        }, +        { +            .name = "state", +            .type = QEMU_OPT_NUMBER, +        }, +        { +            .name = "errno", +            .type = QEMU_OPT_NUMBER, +        }, +        { +            .name = "sector", +            .type = QEMU_OPT_NUMBER, +        }, +        { +            .name = "once", +            .type = QEMU_OPT_BOOL, +        }, +        { +            .name = "immediately", +            .type = QEMU_OPT_BOOL, +        }, +        { /* end of list */ } +    }, +}; + +static QemuOptsList set_state_opts = { +    .name = "set-state", +    .head = QTAILQ_HEAD_INITIALIZER(set_state_opts.head), +    .desc = { +        { +            .name = "event", +            .type = QEMU_OPT_STRING, +        }, +        { +            .name = "state", +            .type = QEMU_OPT_NUMBER, +        }, +        { +            .name = "new_state", +            .type = QEMU_OPT_NUMBER, +        }, +        { /* end of list */ } +    }, +}; + +static QemuOptsList *config_groups[] = { +    &inject_error_opts, +    &set_state_opts, +    NULL +}; + +static const char *event_names[BLKDBG_EVENT_MAX] = { +    [BLKDBG_L1_UPDATE]                      = "l1_update", +    [BLKDBG_L1_GROW_ALLOC_TABLE]            = "l1_grow.alloc_table", +    [BLKDBG_L1_GROW_WRITE_TABLE]            = "l1_grow.write_table", +    [BLKDBG_L1_GROW_ACTIVATE_TABLE]         = "l1_grow.activate_table", + +    [BLKDBG_L2_LOAD]                        = "l2_load", +    [BLKDBG_L2_UPDATE]                      = "l2_update", +    [BLKDBG_L2_UPDATE_COMPRESSED]           = "l2_update_compressed", +    [BLKDBG_L2_ALLOC_COW_READ]              = "l2_alloc.cow_read", +    [BLKDBG_L2_ALLOC_WRITE]                 = "l2_alloc.write", + +    [BLKDBG_READ_AIO]                       = "read_aio", +    [BLKDBG_READ_BACKING_AIO]               = "read_backing_aio", +    [BLKDBG_READ_COMPRESSED]       
         = "read_compressed", + +    [BLKDBG_WRITE_AIO]                      = "write_aio", +    [BLKDBG_WRITE_COMPRESSED]               = "write_compressed", + +    [BLKDBG_VMSTATE_LOAD]                   = "vmstate_load", +    [BLKDBG_VMSTATE_SAVE]                   = "vmstate_save", + +    [BLKDBG_COW_READ]                       = "cow_read", +    [BLKDBG_COW_WRITE]                      = "cow_write", + +    [BLKDBG_REFTABLE_LOAD]                  = "reftable_load", +    [BLKDBG_REFTABLE_GROW]                  = "reftable_grow", +    [BLKDBG_REFTABLE_UPDATE]                = "reftable_update", + +    [BLKDBG_REFBLOCK_LOAD]                  = "refblock_load", +    [BLKDBG_REFBLOCK_UPDATE]                = "refblock_update", +    [BLKDBG_REFBLOCK_UPDATE_PART]           = "refblock_update_part", +    [BLKDBG_REFBLOCK_ALLOC]                 = "refblock_alloc", +    [BLKDBG_REFBLOCK_ALLOC_HOOKUP]          = "refblock_alloc.hookup", +    [BLKDBG_REFBLOCK_ALLOC_WRITE]           = "refblock_alloc.write", +    [BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS]    = "refblock_alloc.write_blocks", +    [BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE]     = "refblock_alloc.write_table", +    [BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE]    = "refblock_alloc.switch_table", + +    [BLKDBG_CLUSTER_ALLOC]                  = "cluster_alloc", +    [BLKDBG_CLUSTER_ALLOC_BYTES]            = "cluster_alloc_bytes", +    [BLKDBG_CLUSTER_FREE]                   = "cluster_free", + +    [BLKDBG_FLUSH_TO_OS]                    = "flush_to_os", +    [BLKDBG_FLUSH_TO_DISK]                  = "flush_to_disk", + +    [BLKDBG_PWRITEV_RMW_HEAD]               = "pwritev_rmw.head", +    [BLKDBG_PWRITEV_RMW_AFTER_HEAD]         = "pwritev_rmw.after_head", +    [BLKDBG_PWRITEV_RMW_TAIL]               = "pwritev_rmw.tail", +    [BLKDBG_PWRITEV_RMW_AFTER_TAIL]         = "pwritev_rmw.after_tail", +    [BLKDBG_PWRITEV]                        = "pwritev", +    [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero", +    [BLKDBG_PWRITEV_DONE]                   = "pwritev_done", + +    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare", +}; + +static int get_event_by_name(const char *name, BlkDebugEvent *event) +{ +    int i; + +    for (i = 0; i < BLKDBG_EVENT_MAX; i++) { +        if (!strcmp(event_names[i], name)) { +            *event = i; +            return 0; +        } +    } + +    return -1; +} + +struct add_rule_data { +    BDRVBlkdebugState *s; +    int action; +}; + +static int add_rule(void *opaque, QemuOpts *opts, Error **errp) +{ +    struct add_rule_data *d = opaque; +    BDRVBlkdebugState *s = d->s; +    const char* event_name; +    BlkDebugEvent event; +    struct BlkdebugRule *rule; + +    /* Find the right event for the rule */ +    event_name = qemu_opt_get(opts, "event"); +    if (!event_name) { +        error_setg(errp, "Missing event name for rule"); +        return -1; +    } else if (get_event_by_name(event_name, &event) < 0) { +        error_setg(errp, "Invalid event name \"%s\"", event_name); +        return -1; +    } + +    /* Set attributes common for all actions */ +    rule = g_malloc0(sizeof(*rule)); +    *rule = (struct BlkdebugRule) { +        .event  = event, +        .action = d->action, +        .state  = qemu_opt_get_number(opts, "state", 0), +    }; + +    /* Parse action-specific options */ +    switch (d->action) { +    case ACTION_INJECT_ERROR: +        rule->options.inject.error = qemu_opt_get_number(opts, "errno", EIO); +        rule->options.inject.once  = qemu_opt_get_bool(opts, "once", 0); +        
rule->options.inject.immediately = +            qemu_opt_get_bool(opts, "immediately", 0); +        rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1); +        break; + +    case ACTION_SET_STATE: +        rule->options.set_state.new_state = +            qemu_opt_get_number(opts, "new_state", 0); +        break; + +    case ACTION_SUSPEND: +        rule->options.suspend.tag = +            g_strdup(qemu_opt_get(opts, "tag")); +        break; +    }; + +    /* Add the rule */ +    QLIST_INSERT_HEAD(&s->rules[event], rule, next); + +    return 0; +} + +static void remove_rule(BlkdebugRule *rule) +{ +    switch (rule->action) { +    case ACTION_INJECT_ERROR: +    case ACTION_SET_STATE: +        break; +    case ACTION_SUSPEND: +        g_free(rule->options.suspend.tag); +        break; +    } + +    QLIST_REMOVE(rule, next); +    g_free(rule); +} + +static int read_config(BDRVBlkdebugState *s, const char *filename, +                       QDict *options, Error **errp) +{ +    FILE *f = NULL; +    int ret; +    struct add_rule_data d; +    Error *local_err = NULL; + +    if (filename) { +        f = fopen(filename, "r"); +        if (f == NULL) { +            error_setg_errno(errp, errno, "Could not read blkdebug config file"); +            return -errno; +        } + +        ret = qemu_config_parse(f, config_groups, filename); +        if (ret < 0) { +            error_setg(errp, "Could not parse blkdebug config file"); +            ret = -EINVAL; +            goto fail; +        } +    } + +    qemu_config_parse_qdict(options, config_groups, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    d.s = s; +    d.action = ACTION_INJECT_ERROR; +    qemu_opts_foreach(&inject_error_opts, add_rule, &d, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    d.action = ACTION_SET_STATE; +    qemu_opts_foreach(&set_state_opts, add_rule, &d, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    ret = 0; +fail: +    qemu_opts_reset(&inject_error_opts); +    qemu_opts_reset(&set_state_opts); +    if (f) { +        fclose(f); +    } +    return ret; +} + +/* Valid blkdebug filenames look like blkdebug:path/to/config:path/to/image */ +static void blkdebug_parse_filename(const char *filename, QDict *options, +                                    Error **errp) +{ +    const char *c; + +    /* Parse the blkdebug: prefix */ +    if (!strstart(filename, "blkdebug:", &filename)) { +        /* There was no prefix; therefore, all options have to be already +           present in the QDict (except for the filename) */ +        qdict_put(options, "x-image", qstring_from_str(filename)); +        return; +    } + +    /* Parse config file path */ +    c = strchr(filename, ':'); +    if (c == NULL) { +        error_setg(errp, "blkdebug requires both config file and image path"); +        return; +    } + +    if (c != filename) { +        QString *config_path; +        config_path = qstring_from_substr(filename, 0, c - filename - 1); +        qdict_put(options, "config", config_path); +    } + +    /* TODO Allow multi-level nesting and set file.filename here */ +    filename = c + 1; +    qdict_put(options, "x-image", qstring_from_str(filename)); +} + +static QemuOptsList runtime_opts = { +    .name = "blkdebug", +    .head = 
QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "config", +            .type = QEMU_OPT_STRING, +            .help = "Path to the configuration file", +        }, +        { +            .name = "x-image", +            .type = QEMU_OPT_STRING, +            .help = "[internal use only, will be removed]", +        }, +        { +            .name = "align", +            .type = QEMU_OPT_SIZE, +            .help = "Required alignment in bytes", +        }, +        { /* end of list */ } +    }, +}; + +static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, +                         Error **errp) +{ +    BDRVBlkdebugState *s = bs->opaque; +    QemuOpts *opts; +    Error *local_err = NULL; +    const char *config; +    uint64_t align; +    int ret; + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto out; +    } + +    /* Read rules from config file or command line options */ +    config = qemu_opt_get(opts, "config"); +    ret = read_config(s, config, options, errp); +    if (ret) { +        goto out; +    } + +    /* Set initial state */ +    s->state = 1; + +    /* Open the backing file */ +    assert(bs->file == NULL); +    ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image", +                          bs, &child_file, false, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto out; +    } + +    /* Set request alignment */ +    align = qemu_opt_get_size(opts, "align", bs->request_alignment); +    if (align > 0 && align < INT_MAX && !(align & (align - 1))) { +        bs->request_alignment = align; +    } else { +        error_setg(errp, "Invalid alignment"); +        ret = -EINVAL; +        goto fail_unref; +    } + +    ret = 0; +    goto out; + +fail_unref: +    bdrv_unref(bs->file); +out: +    qemu_opts_del(opts); +    return ret; +} + +static void error_callback_bh(void *opaque) +{ +    struct BlkdebugAIOCB *acb = opaque; +    qemu_bh_delete(acb->bh); +    acb->common.cb(acb->common.opaque, acb->ret); +    qemu_aio_unref(acb); +} + +static BlockAIOCB *inject_error(BlockDriverState *bs, +    BlockCompletionFunc *cb, void *opaque, BlkdebugRule *rule) +{ +    BDRVBlkdebugState *s = bs->opaque; +    int error = rule->options.inject.error; +    struct BlkdebugAIOCB *acb; +    QEMUBH *bh; +    bool immediately = rule->options.inject.immediately; + +    if (rule->options.inject.once) { +        QSIMPLEQ_REMOVE(&s->active_rules, rule, BlkdebugRule, active_next); +        remove_rule(rule); +    } + +    if (immediately) { +        return NULL; +    } + +    acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque); +    acb->ret = -error; + +    bh = aio_bh_new(bdrv_get_aio_context(bs), error_callback_bh, acb); +    acb->bh = bh; +    qemu_bh_schedule(bh); + +    return &acb->common; +} + +static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs, +    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +    BlockCompletionFunc *cb, void *opaque) +{ +    BDRVBlkdebugState *s = bs->opaque; +    BlkdebugRule *rule = NULL; + +    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { +        if (rule->options.inject.sector == -1 || +            (rule->options.inject.sector >= sector_num && +             rule->options.inject.sector < sector_num + nb_sectors)) { +            break; +        } +   
 } + +    if (rule && rule->options.inject.error) { +        return inject_error(bs, cb, opaque, rule); +    } + +    return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque); +} + +static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs, +    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +    BlockCompletionFunc *cb, void *opaque) +{ +    BDRVBlkdebugState *s = bs->opaque; +    BlkdebugRule *rule = NULL; + +    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { +        if (rule->options.inject.sector == -1 || +            (rule->options.inject.sector >= sector_num && +             rule->options.inject.sector < sector_num + nb_sectors)) { +            break; +        } +    } + +    if (rule && rule->options.inject.error) { +        return inject_error(bs, cb, opaque, rule); +    } + +    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque); +} + +static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs, +    BlockCompletionFunc *cb, void *opaque) +{ +    BDRVBlkdebugState *s = bs->opaque; +    BlkdebugRule *rule = NULL; + +    QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { +        if (rule->options.inject.sector == -1) { +            break; +        } +    } + +    if (rule && rule->options.inject.error) { +        return inject_error(bs, cb, opaque, rule); +    } + +    return bdrv_aio_flush(bs->file, cb, opaque); +} + + +static void blkdebug_close(BlockDriverState *bs) +{ +    BDRVBlkdebugState *s = bs->opaque; +    BlkdebugRule *rule, *next; +    int i; + +    for (i = 0; i < BLKDBG_EVENT_MAX; i++) { +        QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { +            remove_rule(rule); +        } +    } +} + +static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule) +{ +    BDRVBlkdebugState *s = bs->opaque; +    BlkdebugSuspendedReq r; + +    r = (BlkdebugSuspendedReq) { +        .co         = qemu_coroutine_self(), +        .tag        = g_strdup(rule->options.suspend.tag), +    }; + +    remove_rule(rule); +    QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next); + +    printf("blkdebug: Suspended request '%s'\n", r.tag); +    qemu_coroutine_yield(); +    printf("blkdebug: Resuming request '%s'\n", r.tag); + +    QLIST_REMOVE(&r, next); +    g_free(r.tag); +} + +static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule, +    bool injected) +{ +    BDRVBlkdebugState *s = bs->opaque; + +    /* Only process rules for the current state */ +    if (rule->state && rule->state != s->state) { +        return injected; +    } + +    /* Take the action */ +    switch (rule->action) { +    case ACTION_INJECT_ERROR: +        if (!injected) { +            QSIMPLEQ_INIT(&s->active_rules); +            injected = true; +        } +        QSIMPLEQ_INSERT_HEAD(&s->active_rules, rule, active_next); +        break; + +    case ACTION_SET_STATE: +        s->new_state = rule->options.set_state.new_state; +        break; + +    case ACTION_SUSPEND: +        suspend_request(bs, rule); +        break; +    } +    return injected; +} + +static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event) +{ +    BDRVBlkdebugState *s = bs->opaque; +    struct BlkdebugRule *rule, *next; +    bool injected; + +    assert((int)event >= 0 && event < BLKDBG_EVENT_MAX); + +    injected = false; +    s->new_state = s->state; +    QLIST_FOREACH_SAFE(rule, &s->rules[event], next, next) { +        injected = process_rule(bs, rule, injected); +    } +    s->state = s->new_state; +} + +static int 
blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event, +                                     const char *tag) +{ +    BDRVBlkdebugState *s = bs->opaque; +    struct BlkdebugRule *rule; +    BlkDebugEvent blkdebug_event; + +    if (get_event_by_name(event, &blkdebug_event) < 0) { +        return -ENOENT; +    } + + +    rule = g_malloc(sizeof(*rule)); +    *rule = (struct BlkdebugRule) { +        .event  = blkdebug_event, +        .action = ACTION_SUSPEND, +        .state  = 0, +        .options.suspend.tag = g_strdup(tag), +    }; + +    QLIST_INSERT_HEAD(&s->rules[blkdebug_event], rule, next); + +    return 0; +} + +static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) +{ +    BDRVBlkdebugState *s = bs->opaque; +    BlkdebugSuspendedReq *r, *next; + +    QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) { +        if (!strcmp(r->tag, tag)) { +            qemu_coroutine_enter(r->co, NULL); +            return 0; +        } +    } +    return -ENOENT; +} + +static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs, +                                            const char *tag) +{ +    BDRVBlkdebugState *s = bs->opaque; +    BlkdebugSuspendedReq *r, *r_next; +    BlkdebugRule *rule, *next; +    int i, ret = -ENOENT; + +    for (i = 0; i < BLKDBG_EVENT_MAX; i++) { +        QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { +            if (rule->action == ACTION_SUSPEND && +                !strcmp(rule->options.suspend.tag, tag)) { +                remove_rule(rule); +                ret = 0; +            } +        } +    } +    QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) { +        if (!strcmp(r->tag, tag)) { +            qemu_coroutine_enter(r->co, NULL); +            ret = 0; +        } +    } +    return ret; +} + +static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag) +{ +    BDRVBlkdebugState *s = bs->opaque; +    BlkdebugSuspendedReq *r; + +    QLIST_FOREACH(r, &s->suspended_reqs, next) { +        if (!strcmp(r->tag, tag)) { +            return true; +        } +    } +    return false; +} + +static int64_t blkdebug_getlength(BlockDriverState *bs) +{ +    return bdrv_getlength(bs->file); +} + +static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) +{ +    return bdrv_truncate(bs->file, offset); +} + +static void blkdebug_refresh_filename(BlockDriverState *bs) +{ +    QDict *opts; +    const QDictEntry *e; +    bool force_json = false; + +    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) { +        if (strcmp(qdict_entry_key(e), "config") && +            strcmp(qdict_entry_key(e), "x-image") && +            strcmp(qdict_entry_key(e), "image") && +            strncmp(qdict_entry_key(e), "image.", strlen("image."))) +        { +            force_json = true; +            break; +        } +    } + +    if (force_json && !bs->file->full_open_options) { +        /* The config file cannot be recreated, so creating a plain filename +         * is impossible */ +        return; +    } + +    if (!force_json && bs->file->exact_filename[0]) { +        snprintf(bs->exact_filename, sizeof(bs->exact_filename), +                 "blkdebug:%s:%s", +                 qdict_get_try_str(bs->options, "config") ?: "", +                 bs->file->exact_filename); +    } + +    opts = qdict_new(); +    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkdebug"))); + +    QINCREF(bs->file->full_open_options); +    qdict_put_obj(opts, "image", QOBJECT(bs->file->full_open_options)); + +    
for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) { +        if (strcmp(qdict_entry_key(e), "x-image") && +            strcmp(qdict_entry_key(e), "image") && +            strncmp(qdict_entry_key(e), "image.", strlen("image."))) +        { +            qobject_incref(qdict_entry_value(e)); +            qdict_put_obj(opts, qdict_entry_key(e), qdict_entry_value(e)); +        } +    } + +    bs->full_open_options = opts; +} + +static BlockDriver bdrv_blkdebug = { +    .format_name            = "blkdebug", +    .protocol_name          = "blkdebug", +    .instance_size          = sizeof(BDRVBlkdebugState), + +    .bdrv_parse_filename    = blkdebug_parse_filename, +    .bdrv_file_open         = blkdebug_open, +    .bdrv_close             = blkdebug_close, +    .bdrv_getlength         = blkdebug_getlength, +    .bdrv_truncate          = blkdebug_truncate, +    .bdrv_refresh_filename  = blkdebug_refresh_filename, + +    .bdrv_aio_readv         = blkdebug_aio_readv, +    .bdrv_aio_writev        = blkdebug_aio_writev, +    .bdrv_aio_flush         = blkdebug_aio_flush, + +    .bdrv_debug_event           = blkdebug_debug_event, +    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint, +    .bdrv_debug_remove_breakpoint +                                = blkdebug_debug_remove_breakpoint, +    .bdrv_debug_resume          = blkdebug_debug_resume, +    .bdrv_debug_is_suspended    = blkdebug_debug_is_suspended, +}; + +static void bdrv_blkdebug_init(void) +{ +    bdrv_register(&bdrv_blkdebug); +} + +block_init(bdrv_blkdebug_init); diff --git a/block/blkverify.c b/block/blkverify.c new file mode 100644 index 00000000..d277e632 --- /dev/null +++ b/block/blkverify.c @@ -0,0 +1,360 @@ +/* + * Block protocol for block driver correctness testing + * + * Copyright (C) 2010 IBM, Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include <stdarg.h> +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "block/block_int.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qstring.h" + +typedef struct { +    BlockDriverState *test_file; +} BDRVBlkverifyState; + +typedef struct BlkverifyAIOCB BlkverifyAIOCB; +struct BlkverifyAIOCB { +    BlockAIOCB common; +    QEMUBH *bh; + +    /* Request metadata */ +    bool is_write; +    int64_t sector_num; +    int nb_sectors; + +    int ret;                    /* first completed request's result */ +    unsigned int done;          /* completion counter */ + +    QEMUIOVector *qiov;         /* user I/O vector */ +    QEMUIOVector raw_qiov;      /* cloned I/O vector for raw file */ +    void *buf;                  /* buffer for raw file I/O */ + +    void (*verify)(BlkverifyAIOCB *acb); +}; + +static const AIOCBInfo blkverify_aiocb_info = { +    .aiocb_size         = sizeof(BlkverifyAIOCB), +}; + +static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb, +                                             const char *fmt, ...) +{ +    va_list ap; + +    va_start(ap, fmt); +    fprintf(stderr, "blkverify: %s sector_num=%" PRId64 " nb_sectors=%d ", +            acb->is_write ? 
"write" : "read", acb->sector_num, +            acb->nb_sectors); +    vfprintf(stderr, fmt, ap); +    fprintf(stderr, "\n"); +    va_end(ap); +    exit(1); +} + +/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */ +static void blkverify_parse_filename(const char *filename, QDict *options, +                                     Error **errp) +{ +    const char *c; +    QString *raw_path; + + +    /* Parse the blkverify: prefix */ +    if (!strstart(filename, "blkverify:", &filename)) { +        /* There was no prefix; therefore, all options have to be already +           present in the QDict (except for the filename) */ +        qdict_put(options, "x-image", qstring_from_str(filename)); +        return; +    } + +    /* Parse the raw image filename */ +    c = strchr(filename, ':'); +    if (c == NULL) { +        error_setg(errp, "blkverify requires raw copy and original image path"); +        return; +    } + +    /* TODO Implement option pass-through and set raw.filename here */ +    raw_path = qstring_from_substr(filename, 0, c - filename - 1); +    qdict_put(options, "x-raw", raw_path); + +    /* TODO Allow multi-level nesting and set file.filename here */ +    filename = c + 1; +    qdict_put(options, "x-image", qstring_from_str(filename)); +} + +static QemuOptsList runtime_opts = { +    .name = "blkverify", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "x-raw", +            .type = QEMU_OPT_STRING, +            .help = "[internal use only, will be removed]", +        }, +        { +            .name = "x-image", +            .type = QEMU_OPT_STRING, +            .help = "[internal use only, will be removed]", +        }, +        { /* end of list */ } +    }, +}; + +static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, +                          Error **errp) +{ +    BDRVBlkverifyState *s = bs->opaque; +    QemuOpts *opts; +    Error *local_err = NULL; +    int ret; + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    /* Open the raw file */ +    assert(bs->file == NULL); +    ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options, +                          "raw", bs, &child_file, false, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto fail; +    } + +    /* Open the test file */ +    assert(s->test_file == NULL); +    ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options, +                          "test", bs, &child_format, false, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        s->test_file = NULL; +        goto fail; +    } + +    ret = 0; +fail: +    qemu_opts_del(opts); +    return ret; +} + +static void blkverify_close(BlockDriverState *bs) +{ +    BDRVBlkverifyState *s = bs->opaque; + +    bdrv_unref(s->test_file); +    s->test_file = NULL; +} + +static int64_t blkverify_getlength(BlockDriverState *bs) +{ +    BDRVBlkverifyState *s = bs->opaque; + +    return bdrv_getlength(s->test_file); +} + +static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, +                                         int64_t sector_num, QEMUIOVector *qiov, +                                         int nb_sectors, +                                         
BlockCompletionFunc *cb, +                                         void *opaque) +{ +    BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque); + +    acb->bh = NULL; +    acb->is_write = is_write; +    acb->sector_num = sector_num; +    acb->nb_sectors = nb_sectors; +    acb->ret = -EINPROGRESS; +    acb->done = 0; +    acb->qiov = qiov; +    acb->buf = NULL; +    acb->verify = NULL; +    return acb; +} + +static void blkverify_aio_bh(void *opaque) +{ +    BlkverifyAIOCB *acb = opaque; + +    qemu_bh_delete(acb->bh); +    if (acb->buf) { +        qemu_iovec_destroy(&acb->raw_qiov); +        qemu_vfree(acb->buf); +    } +    acb->common.cb(acb->common.opaque, acb->ret); +    qemu_aio_unref(acb); +} + +static void blkverify_aio_cb(void *opaque, int ret) +{ +    BlkverifyAIOCB *acb = opaque; + +    switch (++acb->done) { +    case 1: +        acb->ret = ret; +        break; + +    case 2: +        if (acb->ret != ret) { +            blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret); +        } + +        if (acb->verify) { +            acb->verify(acb); +        } + +        acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), +                             blkverify_aio_bh, acb); +        qemu_bh_schedule(acb->bh); +        break; +    } +} + +static void blkverify_verify_readv(BlkverifyAIOCB *acb) +{ +    ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov); +    if (offset != -1) { +        blkverify_err(acb, "contents mismatch in sector %" PRId64, +                      acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); +    } +} + +static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    BDRVBlkverifyState *s = bs->opaque; +    BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov, +                                            nb_sectors, cb, opaque); + +    acb->verify = blkverify_verify_readv; +    acb->buf = qemu_blockalign(bs->file, qiov->size); +    qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); +    qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); + +    bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, +                   blkverify_aio_cb, acb); +    bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors, +                   blkverify_aio_cb, acb); +    return &acb->common; +} + +static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    BDRVBlkverifyState *s = bs->opaque; +    BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov, +                                            nb_sectors, cb, opaque); + +    bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors, +                    blkverify_aio_cb, acb); +    bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, +                    blkverify_aio_cb, acb); +    return &acb->common; +} + +static BlockAIOCB *blkverify_aio_flush(BlockDriverState *bs, +                                       BlockCompletionFunc *cb, +                                       void *opaque) +{ +    BDRVBlkverifyState *s = bs->opaque; + +    /* Only flush test file, the raw file is not important */ +    return bdrv_aio_flush(s->test_file, cb, opaque); +} + +static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, +                                                  BlockDriverState 
*candidate) +{ +    BDRVBlkverifyState *s = bs->opaque; + +    bool perm = bdrv_recurse_is_first_non_filter(bs->file, candidate); + +    if (perm) { +        return true; +    } + +    return bdrv_recurse_is_first_non_filter(s->test_file, candidate); +} + +/* Propagate AioContext changes to ->test_file */ +static void blkverify_detach_aio_context(BlockDriverState *bs) +{ +    BDRVBlkverifyState *s = bs->opaque; + +    bdrv_detach_aio_context(s->test_file); +} + +static void blkverify_attach_aio_context(BlockDriverState *bs, +                                         AioContext *new_context) +{ +    BDRVBlkverifyState *s = bs->opaque; + +    bdrv_attach_aio_context(s->test_file, new_context); +} + +static void blkverify_refresh_filename(BlockDriverState *bs) +{ +    BDRVBlkverifyState *s = bs->opaque; + +    /* bs->file has already been refreshed */ +    bdrv_refresh_filename(s->test_file); + +    if (bs->file->full_open_options && s->test_file->full_open_options) { +        QDict *opts = qdict_new(); +        qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkverify"))); + +        QINCREF(bs->file->full_open_options); +        qdict_put_obj(opts, "raw", QOBJECT(bs->file->full_open_options)); +        QINCREF(s->test_file->full_open_options); +        qdict_put_obj(opts, "test", QOBJECT(s->test_file->full_open_options)); + +        bs->full_open_options = opts; +    } + +    if (bs->file->exact_filename[0] && s->test_file->exact_filename[0]) { +        snprintf(bs->exact_filename, sizeof(bs->exact_filename), +                 "blkverify:%s:%s", +                 bs->file->exact_filename, s->test_file->exact_filename); +    } +} + +static BlockDriver bdrv_blkverify = { +    .format_name                      = "blkverify", +    .protocol_name                    = "blkverify", +    .instance_size                    = sizeof(BDRVBlkverifyState), + +    .bdrv_parse_filename              = blkverify_parse_filename, +    .bdrv_file_open                   = blkverify_open, +    .bdrv_close                       = blkverify_close, +    .bdrv_getlength                   = blkverify_getlength, +    .bdrv_refresh_filename            = blkverify_refresh_filename, + +    .bdrv_aio_readv                   = blkverify_aio_readv, +    .bdrv_aio_writev                  = blkverify_aio_writev, +    .bdrv_aio_flush                   = blkverify_aio_flush, + +    .bdrv_attach_aio_context          = blkverify_attach_aio_context, +    .bdrv_detach_aio_context          = blkverify_detach_aio_context, + +    .is_filter                        = true, +    .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, +}; + +static void bdrv_blkverify_init(void) +{ +    bdrv_register(&bdrv_blkverify); +} + +block_init(bdrv_blkverify_init); diff --git a/block/block-backend.c b/block/block-backend.c new file mode 100644 index 00000000..aee8a120 --- /dev/null +++ b/block/block-backend.c @@ -0,0 +1,920 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014 Red Hat, Inc. + * + * Authors: + *  Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later.  See the COPYING.LIB file in the top-level directory. 
+ */ + +#include "sysemu/block-backend.h" +#include "block/block_int.h" +#include "sysemu/blockdev.h" +#include "qapi-event.h" + +/* Number of coroutines to reserve per attached device model */ +#define COROUTINE_POOL_RESERVATION 64 + +struct BlockBackend { +    char *name; +    int refcnt; +    BlockDriverState *bs; +    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */ +    QTAILQ_ENTRY(BlockBackend) link; /* for blk_backends */ + +    void *dev;                  /* attached device model, if any */ +    /* TODO change to DeviceState when all users are qdevified */ +    const BlockDevOps *dev_ops; +    void *dev_opaque; +}; + +typedef struct BlockBackendAIOCB { +    BlockAIOCB common; +    QEMUBH *bh; +    int ret; +} BlockBackendAIOCB; + +static const AIOCBInfo block_backend_aiocb_info = { +    .aiocb_size = sizeof(BlockBackendAIOCB), +}; + +static void drive_info_del(DriveInfo *dinfo); + +/* All the BlockBackends (except for hidden ones) */ +static QTAILQ_HEAD(, BlockBackend) blk_backends = +    QTAILQ_HEAD_INITIALIZER(blk_backends); + +/* + * Create a new BlockBackend with @name, with a reference count of one. + * @name must not be null or empty. + * Fail if a BlockBackend with this name already exists. + * Store an error through @errp on failure, unless it's null. + * Return the new BlockBackend on success, null on failure. + */ +BlockBackend *blk_new(const char *name, Error **errp) +{ +    BlockBackend *blk; + +    assert(name && name[0]); +    if (!id_wellformed(name)) { +        error_setg(errp, "Invalid device name"); +        return NULL; +    } +    if (blk_by_name(name)) { +        error_setg(errp, "Device with id '%s' already exists", name); +        return NULL; +    } +    if (bdrv_find_node(name)) { +        error_setg(errp, +                   "Device name '%s' conflicts with an existing node name", +                   name); +        return NULL; +    } + +    blk = g_new0(BlockBackend, 1); +    blk->name = g_strdup(name); +    blk->refcnt = 1; +    QTAILQ_INSERT_TAIL(&blk_backends, blk, link); +    return blk; +} + +/* + * Create a new BlockBackend with a new BlockDriverState attached. + * Otherwise just like blk_new(), which see. + */ +BlockBackend *blk_new_with_bs(const char *name, Error **errp) +{ +    BlockBackend *blk; +    BlockDriverState *bs; + +    blk = blk_new(name, errp); +    if (!blk) { +        return NULL; +    } + +    bs = bdrv_new_root(); +    blk->bs = bs; +    bs->blk = blk; +    return blk; +} + +/* + * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState. + * + * Just as with bdrv_open(), after having called this function the reference to + * @options belongs to the block layer (even on failure). + * + * TODO: Remove @filename and @flags; it should be possible to specify a whole + * BDS tree just by specifying the @options QDict (or @reference, + * alternatively). At the time of adding this function, this is not possible, + * though, so callers of this function have to be able to specify @filename and + * @flags. 
+ */ +BlockBackend *blk_new_open(const char *name, const char *filename, +                           const char *reference, QDict *options, int flags, +                           Error **errp) +{ +    BlockBackend *blk; +    int ret; + +    blk = blk_new_with_bs(name, errp); +    if (!blk) { +        QDECREF(options); +        return NULL; +    } + +    ret = bdrv_open(&blk->bs, filename, reference, options, flags, NULL, errp); +    if (ret < 0) { +        blk_unref(blk); +        return NULL; +    } + +    return blk; +} + +static void blk_delete(BlockBackend *blk) +{ +    assert(!blk->refcnt); +    assert(!blk->dev); +    if (blk->bs) { +        assert(blk->bs->blk == blk); +        blk->bs->blk = NULL; +        bdrv_unref(blk->bs); +        blk->bs = NULL; +    } +    /* Avoid double-remove after blk_hide_on_behalf_of_hmp_drive_del() */ +    if (blk->name[0]) { +        QTAILQ_REMOVE(&blk_backends, blk, link); +    } +    g_free(blk->name); +    drive_info_del(blk->legacy_dinfo); +    g_free(blk); +} + +static void drive_info_del(DriveInfo *dinfo) +{ +    if (!dinfo) { +        return; +    } +    qemu_opts_del(dinfo->opts); +    g_free(dinfo->serial); +    g_free(dinfo); +} + +/* + * Increment @blk's reference count. + * @blk must not be null. + */ +void blk_ref(BlockBackend *blk) +{ +    blk->refcnt++; +} + +/* + * Decrement @blk's reference count. + * If this drops it to zero, destroy @blk. + * For convenience, do nothing if @blk is null. + */ +void blk_unref(BlockBackend *blk) +{ +    if (blk) { +        assert(blk->refcnt > 0); +        if (!--blk->refcnt) { +            blk_delete(blk); +        } +    } +} + +/* + * Return the BlockBackend after @blk. + * If @blk is null, return the first one. + * Else, return @blk's next sibling, which may be null. + * + * To iterate over all BlockBackends, do + * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) { + *     ... + * } + */ +BlockBackend *blk_next(BlockBackend *blk) +{ +    return blk ? QTAILQ_NEXT(blk, link) : QTAILQ_FIRST(&blk_backends); +} + +/* + * Return @blk's name, a non-null string. + * Wart: the name is empty iff @blk has been hidden with + * blk_hide_on_behalf_of_hmp_drive_del(). + */ +const char *blk_name(BlockBackend *blk) +{ +    return blk->name; +} + +/* + * Return the BlockBackend with name @name if it exists, else null. + * @name must not be null. + */ +BlockBackend *blk_by_name(const char *name) +{ +    BlockBackend *blk; + +    assert(name); +    QTAILQ_FOREACH(blk, &blk_backends, link) { +        if (!strcmp(name, blk->name)) { +            return blk; +        } +    } +    return NULL; +} + +/* + * Return the BlockDriverState attached to @blk if any, else null. + */ +BlockDriverState *blk_bs(BlockBackend *blk) +{ +    return blk->bs; +} + +/* + * Return @blk's DriveInfo if any, else null. + */ +DriveInfo *blk_legacy_dinfo(BlockBackend *blk) +{ +    return blk->legacy_dinfo; +} + +/* + * Set @blk's DriveInfo to @dinfo, and return it. + * @blk must not have a DriveInfo set already. + * No other BlockBackend may have the same DriveInfo set. + */ +DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo) +{ +    assert(!blk->legacy_dinfo); +    return blk->legacy_dinfo = dinfo; +} + +/* + * Return the BlockBackend with DriveInfo @dinfo. + * It must exist. 
+ */ +BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo) +{ +    BlockBackend *blk; + +    QTAILQ_FOREACH(blk, &blk_backends, link) { +        if (blk->legacy_dinfo == dinfo) { +            return blk; +        } +    } +    abort(); +} + +/* + * Hide @blk. + * @blk must not have been hidden already. + * Make attached BlockDriverState, if any, anonymous. + * Once hidden, @blk is invisible to all functions that don't receive + * it as argument.  For example, blk_by_name() won't return it. + * Strictly for use by do_drive_del(). + * TODO get rid of it! + */ +void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk) +{ +    QTAILQ_REMOVE(&blk_backends, blk, link); +    blk->name[0] = 0; +    if (blk->bs) { +        bdrv_make_anon(blk->bs); +    } +} + +/* + * Attach device model @dev to @blk. + * Return 0 on success, -EBUSY when a device model is attached already. + */ +int blk_attach_dev(BlockBackend *blk, void *dev) +/* TODO change to DeviceState *dev when all users are qdevified */ +{ +    if (blk->dev) { +        return -EBUSY; +    } +    blk_ref(blk); +    blk->dev = dev; +    bdrv_iostatus_reset(blk->bs); +    return 0; +} + +/* + * Attach device model @dev to @blk. + * @blk must not have a device model attached already. + * TODO qdevified devices don't use this, remove when devices are qdevified + */ +void blk_attach_dev_nofail(BlockBackend *blk, void *dev) +{ +    if (blk_attach_dev(blk, dev) < 0) { +        abort(); +    } +} + +/* + * Detach device model @dev from @blk. + * @dev must be currently attached to @blk. + */ +void blk_detach_dev(BlockBackend *blk, void *dev) +/* TODO change to DeviceState *dev when all users are qdevified */ +{ +    assert(blk->dev == dev); +    blk->dev = NULL; +    blk->dev_ops = NULL; +    blk->dev_opaque = NULL; +    bdrv_set_guest_block_size(blk->bs, 512); +    blk_unref(blk); +} + +/* + * Return the device model attached to @blk if any, else null. + */ +void *blk_get_attached_dev(BlockBackend *blk) +/* TODO change to return DeviceState * when all users are qdevified */ +{ +    return blk->dev; +} + +/* + * Set @blk's device model callbacks to @ops. + * @opaque is the opaque argument to pass to the callbacks. + * This is for use by device models. + */ +void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, +                     void *opaque) +{ +    blk->dev_ops = ops; +    blk->dev_opaque = opaque; +} + +/* + * Notify @blk's attached device model of media change. + * If @load is true, notify of media load. + * Else, notify of media eject. + * Also send DEVICE_TRAY_MOVED events as appropriate. + */ +void blk_dev_change_media_cb(BlockBackend *blk, bool load) +{ +    if (blk->dev_ops && blk->dev_ops->change_media_cb) { +        bool tray_was_closed = !blk_dev_is_tray_open(blk); + +        blk->dev_ops->change_media_cb(blk->dev_opaque, load); +        if (tray_was_closed) { +            /* tray open */ +            qapi_event_send_device_tray_moved(blk_name(blk), +                                              true, &error_abort); +        } +        if (load) { +            /* tray close */ +            qapi_event_send_device_tray_moved(blk_name(blk), +                                              false, &error_abort); +        } +    } +} + +/* + * Does @blk's attached device model have removable media? + * %true if no device model is attached. 
+ */ +bool blk_dev_has_removable_media(BlockBackend *blk) +{ +    return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb); +} + +/* + * Notify @blk's attached device model of a media eject request. + * If @force is true, the medium is about to be yanked out forcefully. + */ +void blk_dev_eject_request(BlockBackend *blk, bool force) +{ +    if (blk->dev_ops && blk->dev_ops->eject_request_cb) { +        blk->dev_ops->eject_request_cb(blk->dev_opaque, force); +    } +} + +/* + * Does @blk's attached device model have a tray, and is it open? + */ +bool blk_dev_is_tray_open(BlockBackend *blk) +{ +    if (blk->dev_ops && blk->dev_ops->is_tray_open) { +        return blk->dev_ops->is_tray_open(blk->dev_opaque); +    } +    return false; +} + +/* + * Does @blk's attached device model have the medium locked? + * %false if the device model has no such lock. + */ +bool blk_dev_is_medium_locked(BlockBackend *blk) +{ +    if (blk->dev_ops && blk->dev_ops->is_medium_locked) { +        return blk->dev_ops->is_medium_locked(blk->dev_opaque); +    } +    return false; +} + +/* + * Notify @blk's attached device model of a backend size change. + */ +void blk_dev_resize_cb(BlockBackend *blk) +{ +    if (blk->dev_ops && blk->dev_ops->resize_cb) { +        blk->dev_ops->resize_cb(blk->dev_opaque); +    } +} + +void blk_iostatus_enable(BlockBackend *blk) +{ +    bdrv_iostatus_enable(blk->bs); +} + +static int blk_check_byte_request(BlockBackend *blk, int64_t offset, +                                  size_t size) +{ +    int64_t len; + +    if (size > INT_MAX) { +        return -EIO; +    } + +    if (!blk_is_inserted(blk)) { +        return -ENOMEDIUM; +    } + +    len = blk_getlength(blk); +    if (len < 0) { +        return len; +    } + +    if (offset < 0) { +        return -EIO; +    } + +    if (offset > len || len - offset < size) { +        return -EIO; +    } + +    return 0; +} + +static int blk_check_request(BlockBackend *blk, int64_t sector_num, +                             int nb_sectors) +{ +    if (sector_num < 0 || sector_num > INT64_MAX / BDRV_SECTOR_SIZE) { +        return -EIO; +    } + +    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { +        return -EIO; +    } + +    return blk_check_byte_request(blk, sector_num * BDRV_SECTOR_SIZE, +                                  nb_sectors * BDRV_SECTOR_SIZE); +} + +int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf, +             int nb_sectors) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_read(blk->bs, sector_num, buf, nb_sectors); +} + +int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, +                         int nb_sectors) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_read_unthrottled(blk->bs, sector_num, buf, nb_sectors); +} + +int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, +              int nb_sectors) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_write(blk->bs, sector_num, buf, nb_sectors); +} + +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, +                     int nb_sectors, BdrvRequestFlags flags) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_write_zeroes(blk->bs, sector_num, 
nb_sectors, flags); +} + +static void error_callback_bh(void *opaque) +{ +    struct BlockBackendAIOCB *acb = opaque; +    qemu_bh_delete(acb->bh); +    acb->common.cb(acb->common.opaque, acb->ret); +    qemu_aio_unref(acb); +} + +static BlockAIOCB *abort_aio_request(BlockBackend *blk, BlockCompletionFunc *cb, +                                     void *opaque, int ret) +{ +    struct BlockBackendAIOCB *acb; +    QEMUBH *bh; + +    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque); +    acb->ret = ret; + +    bh = aio_bh_new(blk_get_aio_context(blk), error_callback_bh, acb); +    acb->bh = bh; +    qemu_bh_schedule(bh); + +    return &acb->common; +} + +BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num, +                                 int nb_sectors, BdrvRequestFlags flags, +                                 BlockCompletionFunc *cb, void *opaque) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return abort_aio_request(blk, cb, opaque, ret); +    } + +    return bdrv_aio_write_zeroes(blk->bs, sector_num, nb_sectors, flags, +                                 cb, opaque); +} + +int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count) +{ +    int ret = blk_check_byte_request(blk, offset, count); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_pread(blk->bs, offset, buf, count); +} + +int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count) +{ +    int ret = blk_check_byte_request(blk, offset, count); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_pwrite(blk->bs, offset, buf, count); +} + +int64_t blk_getlength(BlockBackend *blk) +{ +    return bdrv_getlength(blk->bs); +} + +void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr) +{ +    bdrv_get_geometry(blk->bs, nb_sectors_ptr); +} + +int64_t blk_nb_sectors(BlockBackend *blk) +{ +    return bdrv_nb_sectors(blk->bs); +} + +BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num, +                          QEMUIOVector *iov, int nb_sectors, +                          BlockCompletionFunc *cb, void *opaque) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return abort_aio_request(blk, cb, opaque, ret); +    } + +    return bdrv_aio_readv(blk->bs, sector_num, iov, nb_sectors, cb, opaque); +} + +BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num, +                           QEMUIOVector *iov, int nb_sectors, +                           BlockCompletionFunc *cb, void *opaque) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return abort_aio_request(blk, cb, opaque, ret); +    } + +    return bdrv_aio_writev(blk->bs, sector_num, iov, nb_sectors, cb, opaque); +} + +BlockAIOCB *blk_aio_flush(BlockBackend *blk, +                          BlockCompletionFunc *cb, void *opaque) +{ +    return bdrv_aio_flush(blk->bs, cb, opaque); +} + +BlockAIOCB *blk_aio_discard(BlockBackend *blk, +                            int64_t sector_num, int nb_sectors, +                            BlockCompletionFunc *cb, void *opaque) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return abort_aio_request(blk, cb, opaque, ret); +    } + +    return bdrv_aio_discard(blk->bs, sector_num, nb_sectors, cb, opaque); +} + +void blk_aio_cancel(BlockAIOCB *acb) +{ +    bdrv_aio_cancel(acb); +} + +void blk_aio_cancel_async(BlockAIOCB *acb) +{ +    
bdrv_aio_cancel_async(acb); +} + +int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs) +{ +    int i, ret; + +    for (i = 0; i < num_reqs; i++) { +        ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors); +        if (ret < 0) { +            return ret; +        } +    } + +    return bdrv_aio_multiwrite(blk->bs, reqs, num_reqs); +} + +int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf) +{ +    return bdrv_ioctl(blk->bs, req, buf); +} + +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, +                          BlockCompletionFunc *cb, void *opaque) +{ +    return bdrv_aio_ioctl(blk->bs, req, buf, cb, opaque); +} + +int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_co_discard(blk->bs, sector_num, nb_sectors); +} + +int blk_co_flush(BlockBackend *blk) +{ +    return bdrv_co_flush(blk->bs); +} + +int blk_flush(BlockBackend *blk) +{ +    return bdrv_flush(blk->bs); +} + +int blk_flush_all(void) +{ +    return bdrv_flush_all(); +} + +void blk_drain(BlockBackend *blk) +{ +    bdrv_drain(blk->bs); +} + +void blk_drain_all(void) +{ +    bdrv_drain_all(); +} + +BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read) +{ +    return bdrv_get_on_error(blk->bs, is_read); +} + +BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, +                                      int error) +{ +    return bdrv_get_error_action(blk->bs, is_read, error); +} + +void blk_error_action(BlockBackend *blk, BlockErrorAction action, +                      bool is_read, int error) +{ +    bdrv_error_action(blk->bs, action, is_read, error); +} + +int blk_is_read_only(BlockBackend *blk) +{ +    return bdrv_is_read_only(blk->bs); +} + +int blk_is_sg(BlockBackend *blk) +{ +    return bdrv_is_sg(blk->bs); +} + +int blk_enable_write_cache(BlockBackend *blk) +{ +    return bdrv_enable_write_cache(blk->bs); +} + +void blk_set_enable_write_cache(BlockBackend *blk, bool wce) +{ +    bdrv_set_enable_write_cache(blk->bs, wce); +} + +void blk_invalidate_cache(BlockBackend *blk, Error **errp) +{ +    bdrv_invalidate_cache(blk->bs, errp); +} + +int blk_is_inserted(BlockBackend *blk) +{ +    return bdrv_is_inserted(blk->bs); +} + +void blk_lock_medium(BlockBackend *blk, bool locked) +{ +    bdrv_lock_medium(blk->bs, locked); +} + +void blk_eject(BlockBackend *blk, bool eject_flag) +{ +    bdrv_eject(blk->bs, eject_flag); +} + +int blk_get_flags(BlockBackend *blk) +{ +    return bdrv_get_flags(blk->bs); +} + +int blk_get_max_transfer_length(BlockBackend *blk) +{ +    return blk->bs->bl.max_transfer_length; +} + +void blk_set_guest_block_size(BlockBackend *blk, int align) +{ +    bdrv_set_guest_block_size(blk->bs, align); +} + +void *blk_blockalign(BlockBackend *blk, size_t size) +{ +    return qemu_blockalign(blk ? 
blk->bs : NULL, size); +} + +bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp) +{ +    return bdrv_op_is_blocked(blk->bs, op, errp); +} + +void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason) +{ +    bdrv_op_unblock(blk->bs, op, reason); +} + +void blk_op_block_all(BlockBackend *blk, Error *reason) +{ +    bdrv_op_block_all(blk->bs, reason); +} + +void blk_op_unblock_all(BlockBackend *blk, Error *reason) +{ +    bdrv_op_unblock_all(blk->bs, reason); +} + +AioContext *blk_get_aio_context(BlockBackend *blk) +{ +    return bdrv_get_aio_context(blk->bs); +} + +void blk_set_aio_context(BlockBackend *blk, AioContext *new_context) +{ +    bdrv_set_aio_context(blk->bs, new_context); +} + +void blk_add_aio_context_notifier(BlockBackend *blk, +        void (*attached_aio_context)(AioContext *new_context, void *opaque), +        void (*detach_aio_context)(void *opaque), void *opaque) +{ +    bdrv_add_aio_context_notifier(blk->bs, attached_aio_context, +                                  detach_aio_context, opaque); +} + +void blk_remove_aio_context_notifier(BlockBackend *blk, +                                     void (*attached_aio_context)(AioContext *, +                                                                  void *), +                                     void (*detach_aio_context)(void *), +                                     void *opaque) +{ +    bdrv_remove_aio_context_notifier(blk->bs, attached_aio_context, +                                     detach_aio_context, opaque); +} + +void blk_add_close_notifier(BlockBackend *blk, Notifier *notify) +{ +    bdrv_add_close_notifier(blk->bs, notify); +} + +void blk_io_plug(BlockBackend *blk) +{ +    bdrv_io_plug(blk->bs); +} + +void blk_io_unplug(BlockBackend *blk) +{ +    bdrv_io_unplug(blk->bs); +} + +BlockAcctStats *blk_get_stats(BlockBackend *blk) +{ +    return bdrv_get_stats(blk->bs); +} + +void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, +                  BlockCompletionFunc *cb, void *opaque) +{ +    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque); +} + +int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num, +                                     int nb_sectors, BdrvRequestFlags flags) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_co_write_zeroes(blk->bs, sector_num, nb_sectors, flags); +} + +int blk_write_compressed(BlockBackend *blk, int64_t sector_num, +                         const uint8_t *buf, int nb_sectors) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_write_compressed(blk->bs, sector_num, buf, nb_sectors); +} + +int blk_truncate(BlockBackend *blk, int64_t offset) +{ +    return bdrv_truncate(blk->bs, offset); +} + +int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) +{ +    int ret = blk_check_request(blk, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    return bdrv_discard(blk->bs, sector_num, nb_sectors); +} + +int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, +                     int64_t pos, int size) +{ +    return bdrv_save_vmstate(blk->bs, buf, pos, size); +} + +int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size) +{ +    return bdrv_load_vmstate(blk->bs, buf, pos, size); +} + +int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz) +{ +    return 
bdrv_probe_blocksizes(blk->bs, bsz); +} + +int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo) +{ +    return bdrv_probe_geometry(blk->bs, geo); +} diff --git a/block/bochs.c b/block/bochs.c new file mode 100644 index 00000000..199ac2b9 --- /dev/null +++ b/block/bochs.c @@ -0,0 +1,277 @@ +/* + * Block driver for the various disk image formats used by Bochs + * Currently only for "growing" type in read-only mode + * + * Copyright (c) 2005 Alex Beregszaszi + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" + +/**************************************************************/ + +#define HEADER_MAGIC "Bochs Virtual HD Image" +#define HEADER_VERSION 0x00020000 +#define HEADER_V1 0x00010000 +#define HEADER_SIZE 512 + +#define REDOLOG_TYPE "Redolog" +#define GROWING_TYPE "Growing" + +// not allocated: 0xffffffff + +// always little-endian +struct bochs_header { +    char magic[32];     /* "Bochs Virtual HD Image" */ +    char type[16];      /* "Redolog" */ +    char subtype[16];   /* "Undoable" / "Volatile" / "Growing" */ +    uint32_t version; +    uint32_t header;    /* size of header */ + +    uint32_t catalog;   /* num of entries */ +    uint32_t bitmap;    /* bitmap size */ +    uint32_t extent;    /* extent size */ + +    union { +        struct { +            uint32_t reserved;  /* for ??? 
*/ +            uint64_t disk;      /* disk size */ +            char padding[HEADER_SIZE - 64 - 20 - 12]; +        } QEMU_PACKED redolog; +        struct { +            uint64_t disk;      /* disk size */ +            char padding[HEADER_SIZE - 64 - 20 - 8]; +        } QEMU_PACKED redolog_v1; +        char padding[HEADER_SIZE - 64 - 20]; +    } extra; +} QEMU_PACKED; + +typedef struct BDRVBochsState { +    CoMutex lock; +    uint32_t *catalog_bitmap; +    uint32_t catalog_size; + +    uint32_t data_offset; + +    uint32_t bitmap_blocks; +    uint32_t extent_blocks; +    uint32_t extent_size; +} BDRVBochsState; + +static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    const struct bochs_header *bochs = (const void *)buf; + +    if (buf_size < HEADER_SIZE) +	return 0; + +    if (!strcmp(bochs->magic, HEADER_MAGIC) && +	!strcmp(bochs->type, REDOLOG_TYPE) && +	!strcmp(bochs->subtype, GROWING_TYPE) && +	((le32_to_cpu(bochs->version) == HEADER_VERSION) || +	(le32_to_cpu(bochs->version) == HEADER_V1))) +	return 100; + +    return 0; +} + +static int bochs_open(BlockDriverState *bs, QDict *options, int flags, +                      Error **errp) +{ +    BDRVBochsState *s = bs->opaque; +    uint32_t i; +    struct bochs_header bochs; +    int ret; + +    bs->read_only = 1; // no write support yet + +    ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)); +    if (ret < 0) { +        return ret; +    } + +    if (strcmp(bochs.magic, HEADER_MAGIC) || +        strcmp(bochs.type, REDOLOG_TYPE) || +        strcmp(bochs.subtype, GROWING_TYPE) || +	((le32_to_cpu(bochs.version) != HEADER_VERSION) && +	(le32_to_cpu(bochs.version) != HEADER_V1))) { +        error_setg(errp, "Image not in Bochs format"); +        return -EINVAL; +    } + +    if (le32_to_cpu(bochs.version) == HEADER_V1) { +        bs->total_sectors = le64_to_cpu(bochs.extra.redolog_v1.disk) / 512; +    } else { +        bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; +    } + +    /* Limit to 1M entries to avoid unbounded allocation. This is what is +     * needed for the largest image that bximage can create (~8 TB). 
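+     * (Rough arithmetic only: 0x100000 catalog entries times the 8 MB maximum
+     * extent size enforced below works out to 8 TB.)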
*/ +    s->catalog_size = le32_to_cpu(bochs.catalog); +    if (s->catalog_size > 0x100000) { +        error_setg(errp, "Catalog size is too large"); +        return -EFBIG; +    } + +    s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size); +    if (s->catalog_size && s->catalog_bitmap == NULL) { +        error_setg(errp, "Could not allocate memory for catalog"); +        return -ENOMEM; +    } + +    ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, +                     s->catalog_size * 4); +    if (ret < 0) { +        goto fail; +    } + +    for (i = 0; i < s->catalog_size; i++) +	le32_to_cpus(&s->catalog_bitmap[i]); + +    s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4); + +    s->bitmap_blocks = 1 + (le32_to_cpu(bochs.bitmap) - 1) / 512; +    s->extent_blocks = 1 + (le32_to_cpu(bochs.extent) - 1) / 512; + +    s->extent_size = le32_to_cpu(bochs.extent); +    if (s->extent_size < BDRV_SECTOR_SIZE) { +        /* bximage actually never creates extents smaller than 4k */ +        error_setg(errp, "Extent size must be at least 512"); +        ret = -EINVAL; +        goto fail; +    } else if (!is_power_of_2(s->extent_size)) { +        error_setg(errp, "Extent size %" PRIu32 " is not a power of two", +                   s->extent_size); +        ret = -EINVAL; +        goto fail; +    } else if (s->extent_size > 0x800000) { +        error_setg(errp, "Extent size %" PRIu32 " is too large", +                   s->extent_size); +        ret = -EINVAL; +        goto fail; +    } + +    if (s->catalog_size < DIV_ROUND_UP(bs->total_sectors, +                                       s->extent_size / BDRV_SECTOR_SIZE)) +    { +        error_setg(errp, "Catalog size is too small for this disk size"); +        ret = -EINVAL; +        goto fail; +    } + +    qemu_co_mutex_init(&s->lock); +    return 0; + +fail: +    g_free(s->catalog_bitmap); +    return ret; +} + +static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) +{ +    BDRVBochsState *s = bs->opaque; +    uint64_t offset = sector_num * 512; +    uint64_t extent_index, extent_offset, bitmap_offset; +    char bitmap_entry; +    int ret; + +    // seek to sector +    extent_index = offset / s->extent_size; +    extent_offset = (offset % s->extent_size) / 512; + +    if (s->catalog_bitmap[extent_index] == 0xffffffff) { +	return 0; /* not allocated */ +    } + +    bitmap_offset = s->data_offset + +        (512 * (uint64_t) s->catalog_bitmap[extent_index] * +        (s->extent_blocks + s->bitmap_blocks)); + +    /* read in bitmap for current extent */ +    ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), +                     &bitmap_entry, 1); +    if (ret < 0) { +        return ret; +    } + +    if (!((bitmap_entry >> (extent_offset % 8)) & 1)) { +	return 0; /* not allocated */ +    } + +    return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); +} + +static int bochs_read(BlockDriverState *bs, int64_t sector_num, +                    uint8_t *buf, int nb_sectors) +{ +    int ret; + +    while (nb_sectors > 0) { +        int64_t block_offset = seek_to_sector(bs, sector_num); +        if (block_offset < 0) { +            return block_offset; +        } else if (block_offset > 0) { +            ret = bdrv_pread(bs->file, block_offset, buf, 512); +            if (ret < 0) { +                return ret; +            } +        } else { +            memset(buf, 0, 512); +        } +        nb_sectors--; +        sector_num++; +        buf += 512; +    } +    
return 0; +} + +static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num, +                                      uint8_t *buf, int nb_sectors) +{ +    int ret; +    BDRVBochsState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = bochs_read(bs, sector_num, buf, nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static void bochs_close(BlockDriverState *bs) +{ +    BDRVBochsState *s = bs->opaque; +    g_free(s->catalog_bitmap); +} + +static BlockDriver bdrv_bochs = { +    .format_name	= "bochs", +    .instance_size	= sizeof(BDRVBochsState), +    .bdrv_probe		= bochs_probe, +    .bdrv_open		= bochs_open, +    .bdrv_read          = bochs_co_read, +    .bdrv_close		= bochs_close, +}; + +static void bdrv_bochs_init(void) +{ +    bdrv_register(&bdrv_bochs); +} + +block_init(bdrv_bochs_init); diff --git a/block/cloop.c b/block/cloop.c new file mode 100644 index 00000000..f328be06 --- /dev/null +++ b/block/cloop.c @@ -0,0 +1,283 @@ +/* + * QEMU Block driver for CLOOP images + * + * Copyright (c) 2004 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> + +/* Maximum compressed block size */ +#define MAX_BLOCK_SIZE (64 * 1024 * 1024) + +typedef struct BDRVCloopState { +    CoMutex lock; +    uint32_t block_size; +    uint32_t n_blocks; +    uint64_t *offsets; +    uint32_t sectors_per_block; +    uint32_t current_block; +    uint8_t *compressed_block; +    uint8_t *uncompressed_block; +    z_stream zstream; +} BDRVCloopState; + +static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    const char *magic_version_2_0 = "#!/bin/sh\n" +        "#V2.0 Format\n" +        "modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n"; +    int length = strlen(magic_version_2_0); +    if (length > buf_size) { +        length = buf_size; +    } +    if (!memcmp(magic_version_2_0, buf, length)) { +        return 2; +    } +    return 0; +} + +static int cloop_open(BlockDriverState *bs, QDict *options, int flags, +                      Error **errp) +{ +    BDRVCloopState *s = bs->opaque; +    uint32_t offsets_size, max_compressed_block_size = 1, i; +    int ret; + +    bs->read_only = 1; + +    /* read header */ +    ret = bdrv_pread(bs->file, 128, &s->block_size, 4); +    if (ret < 0) { +        return ret; +    } +    s->block_size = be32_to_cpu(s->block_size); +    if (s->block_size % 512) { +        error_setg(errp, "block_size %" PRIu32 " must be a multiple of 512", +                   s->block_size); +        return -EINVAL; +    } +    if (s->block_size == 0) { +        error_setg(errp, "block_size cannot be zero"); +        return -EINVAL; +    } + +    /* cloop's create_compressed_fs.c warns about block sizes beyond 256 KB but +     * we can accept more.  Prevent ridiculous values like 4 GB - 1 since we +     * need a buffer this big. +     */ +    if (s->block_size > MAX_BLOCK_SIZE) { +        error_setg(errp, "block_size %" PRIu32 " must be %u MB or less", +                   s->block_size, +                   MAX_BLOCK_SIZE / (1024 * 1024)); +        return -EINVAL; +    } + +    ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4); +    if (ret < 0) { +        return ret; +    } +    s->n_blocks = be32_to_cpu(s->n_blocks); + +    /* read offsets */ +    if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) { +        /* Prevent integer overflow */ +        error_setg(errp, "n_blocks %" PRIu32 " must be %zu or less", +                   s->n_blocks, +                   (UINT32_MAX - 1) / sizeof(uint64_t)); +        return -EINVAL; +    } +    offsets_size = (s->n_blocks + 1) * sizeof(uint64_t); +    if (offsets_size > 512 * 1024 * 1024) { +        /* Prevent ridiculous offsets_size which causes memory allocation to +         * fail or overflows bdrv_pread() size.  In practice the 512 MB +         * offsets[] limit supports 16 TB images at 256 KB block size. 
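+         * (512 MB of 8-byte entries is 64M offsets; at 256 KB per block that
+         * covers 64M * 256 KB = 16 TB of image data.)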
+         */ +        error_setg(errp, "image requires too many offsets, " +                   "try increasing block size"); +        return -EINVAL; +    } + +    s->offsets = g_try_malloc(offsets_size); +    if (s->offsets == NULL) { +        error_setg(errp, "Could not allocate offsets table"); +        return -ENOMEM; +    } + +    ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size); +    if (ret < 0) { +        goto fail; +    } + +    for (i = 0; i < s->n_blocks + 1; i++) { +        uint64_t size; + +        s->offsets[i] = be64_to_cpu(s->offsets[i]); +        if (i == 0) { +            continue; +        } + +        if (s->offsets[i] < s->offsets[i - 1]) { +            error_setg(errp, "offsets not monotonically increasing at " +                       "index %" PRIu32 ", image file is corrupt", i); +            ret = -EINVAL; +            goto fail; +        } + +        size = s->offsets[i] - s->offsets[i - 1]; + +        /* Compressed blocks should be smaller than the uncompressed block size +         * but maybe compression performed poorly so the compressed block is +         * actually bigger.  Clamp down on unrealistic values to prevent +         * ridiculous s->compressed_block allocation. +         */ +        if (size > 2 * MAX_BLOCK_SIZE) { +            error_setg(errp, "invalid compressed block size at index %" PRIu32 +                       ", image file is corrupt", i); +            ret = -EINVAL; +            goto fail; +        } + +        if (size > max_compressed_block_size) { +            max_compressed_block_size = size; +        } +    } + +    /* initialize zlib engine */ +    s->compressed_block = g_try_malloc(max_compressed_block_size + 1); +    if (s->compressed_block == NULL) { +        error_setg(errp, "Could not allocate compressed_block"); +        ret = -ENOMEM; +        goto fail; +    } + +    s->uncompressed_block = g_try_malloc(s->block_size); +    if (s->uncompressed_block == NULL) { +        error_setg(errp, "Could not allocate uncompressed_block"); +        ret = -ENOMEM; +        goto fail; +    } + +    if (inflateInit(&s->zstream) != Z_OK) { +        ret = -EINVAL; +        goto fail; +    } +    s->current_block = s->n_blocks; + +    s->sectors_per_block = s->block_size/512; +    bs->total_sectors = s->n_blocks * s->sectors_per_block; +    qemu_co_mutex_init(&s->lock); +    return 0; + +fail: +    g_free(s->offsets); +    g_free(s->compressed_block); +    g_free(s->uncompressed_block); +    return ret; +} + +static inline int cloop_read_block(BlockDriverState *bs, int block_num) +{ +    BDRVCloopState *s = bs->opaque; + +    if (s->current_block != block_num) { +        int ret; +        uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num]; + +        ret = bdrv_pread(bs->file, s->offsets[block_num], s->compressed_block, +                         bytes); +        if (ret != bytes) { +            return -1; +        } + +        s->zstream.next_in = s->compressed_block; +        s->zstream.avail_in = bytes; +        s->zstream.next_out = s->uncompressed_block; +        s->zstream.avail_out = s->block_size; +        ret = inflateReset(&s->zstream); +        if (ret != Z_OK) { +            return -1; +        } +        ret = inflate(&s->zstream, Z_FINISH); +        if (ret != Z_STREAM_END || s->zstream.total_out != s->block_size) { +            return -1; +        } + +        s->current_block = block_num; +    } +    return 0; +} + +static int cloop_read(BlockDriverState *bs, int64_t sector_num, +                   
 uint8_t *buf, int nb_sectors) +{ +    BDRVCloopState *s = bs->opaque; +    int i; + +    for (i = 0; i < nb_sectors; i++) { +        uint32_t sector_offset_in_block = +            ((sector_num + i) % s->sectors_per_block), +            block_num = (sector_num + i) / s->sectors_per_block; +        if (cloop_read_block(bs, block_num) != 0) { +            return -1; +        } +        memcpy(buf + i * 512, +            s->uncompressed_block + sector_offset_in_block * 512, 512); +    } +    return 0; +} + +static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num, +                                      uint8_t *buf, int nb_sectors) +{ +    int ret; +    BDRVCloopState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = cloop_read(bs, sector_num, buf, nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static void cloop_close(BlockDriverState *bs) +{ +    BDRVCloopState *s = bs->opaque; +    g_free(s->offsets); +    g_free(s->compressed_block); +    g_free(s->uncompressed_block); +    inflateEnd(&s->zstream); +} + +static BlockDriver bdrv_cloop = { +    .format_name    = "cloop", +    .instance_size  = sizeof(BDRVCloopState), +    .bdrv_probe     = cloop_probe, +    .bdrv_open      = cloop_open, +    .bdrv_read      = cloop_co_read, +    .bdrv_close     = cloop_close, +}; + +static void bdrv_cloop_init(void) +{ +    bdrv_register(&bdrv_cloop); +} + +block_init(bdrv_cloop_init); diff --git a/block/commit.c b/block/commit.c new file mode 100644 index 00000000..7312a5bd --- /dev/null +++ b/block/commit.c @@ -0,0 +1,274 @@ +/* + * Live block commit + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + *  Jeff Cody   <jcody@redhat.com> + *  Based on stream.c by Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "qapi/qmp/qerror.h" +#include "qemu/ratelimit.h" + +enum { +    /* +     * Size of data buffer for populating the image file.  This should be large +     * enough to process multiple clusters in a single call, so that populating +     * contiguous regions of the image is efficient. 
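+     * (With the default 64 KB qcow2 cluster size, for example, one 512 KB
+     * buffer covers eight clusters per call.)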
+     */ +    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */ +}; + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct CommitBlockJob { +    BlockJob common; +    RateLimit limit; +    BlockDriverState *active; +    BlockDriverState *top; +    BlockDriverState *base; +    BlockdevOnError on_error; +    int base_flags; +    int orig_overlay_flags; +    char *backing_file_str; +} CommitBlockJob; + +static int coroutine_fn commit_populate(BlockDriverState *bs, +                                        BlockDriverState *base, +                                        int64_t sector_num, int nb_sectors, +                                        void *buf) +{ +    int ret = 0; + +    ret = bdrv_read(bs, sector_num, buf, nb_sectors); +    if (ret) { +        return ret; +    } + +    ret = bdrv_write(base, sector_num, buf, nb_sectors); +    if (ret) { +        return ret; +    } + +    return 0; +} + +typedef struct { +    int ret; +} CommitCompleteData; + +static void commit_complete(BlockJob *job, void *opaque) +{ +    CommitBlockJob *s = container_of(job, CommitBlockJob, common); +    CommitCompleteData *data = opaque; +    BlockDriverState *active = s->active; +    BlockDriverState *top = s->top; +    BlockDriverState *base = s->base; +    BlockDriverState *overlay_bs; +    int ret = data->ret; + +    if (!block_job_is_cancelled(&s->common) && ret == 0) { +        /* success */ +        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str); +    } + +    /* restore base open flags here if appropriate (e.g., change the base back +     * to r/o). These reopens do not need to be atomic, since we won't abort +     * even on failure here */ +    if (s->base_flags != bdrv_get_flags(base)) { +        bdrv_reopen(base, s->base_flags, NULL); +    } +    overlay_bs = bdrv_find_overlay(active, top); +    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { +        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); +    } +    g_free(s->backing_file_str); +    block_job_completed(&s->common, ret); +    g_free(data); +} + +static void coroutine_fn commit_run(void *opaque) +{ +    CommitBlockJob *s = opaque; +    CommitCompleteData *data; +    BlockDriverState *top = s->top; +    BlockDriverState *base = s->base; +    int64_t sector_num, end; +    int ret = 0; +    int n = 0; +    void *buf = NULL; +    int bytes_written = 0; +    int64_t base_len; + +    ret = s->common.len = bdrv_getlength(top); + + +    if (s->common.len < 0) { +        goto out; +    } + +    ret = base_len = bdrv_getlength(base); +    if (base_len < 0) { +        goto out; +    } + +    if (base_len < s->common.len) { +        ret = bdrv_truncate(base, s->common.len); +        if (ret) { +            goto out; +        } +    } + +    end = s->common.len >> BDRV_SECTOR_BITS; +    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE); + +    for (sector_num = 0; sector_num < end; sector_num += n) { +        uint64_t delay_ns = 0; +        bool copy; + +wait: +        /* Note that even when no rate limit is applied we need to yield +         * with no pending I/O here so that bdrv_drain_all() returns. 
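+         * (delay_ns is still 0 here unless the rate limiter asked for a
+         * back-off below, so most iterations this is just a cooperative yield.)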
+         */ +        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); +        if (block_job_is_cancelled(&s->common)) { +            break; +        } +        /* Copy if allocated above the base */ +        ret = bdrv_is_allocated_above(top, base, sector_num, +                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, +                                      &n); +        copy = (ret == 1); +        trace_commit_one_iteration(s, sector_num, n, ret); +        if (copy) { +            if (s->common.speed) { +                delay_ns = ratelimit_calculate_delay(&s->limit, n); +                if (delay_ns > 0) { +                    goto wait; +                } +            } +            ret = commit_populate(top, base, sector_num, n, buf); +            bytes_written += n * BDRV_SECTOR_SIZE; +        } +        if (ret < 0) { +            if (s->on_error == BLOCKDEV_ON_ERROR_STOP || +                s->on_error == BLOCKDEV_ON_ERROR_REPORT|| +                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) { +                goto out; +            } else { +                n = 0; +                continue; +            } +        } +        /* Publish progress */ +        s->common.offset += n * BDRV_SECTOR_SIZE; +    } + +    ret = 0; + +out: +    qemu_vfree(buf); + +    data = g_malloc(sizeof(*data)); +    data->ret = ret; +    block_job_defer_to_main_loop(&s->common, commit_complete, data); +} + +static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ +    CommitBlockJob *s = container_of(job, CommitBlockJob, common); + +    if (speed < 0) { +        error_setg(errp, QERR_INVALID_PARAMETER, "speed"); +        return; +    } +    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static const BlockJobDriver commit_job_driver = { +    .instance_size = sizeof(CommitBlockJob), +    .job_type      = BLOCK_JOB_TYPE_COMMIT, +    .set_speed     = commit_set_speed, +}; + +void commit_start(BlockDriverState *bs, BlockDriverState *base, +                  BlockDriverState *top, int64_t speed, +                  BlockdevOnError on_error, BlockCompletionFunc *cb, +                  void *opaque, const char *backing_file_str, Error **errp) +{ +    CommitBlockJob *s; +    BlockReopenQueue *reopen_queue = NULL; +    int orig_overlay_flags; +    int orig_base_flags; +    BlockDriverState *overlay_bs; +    Error *local_err = NULL; + +    if ((on_error == BLOCKDEV_ON_ERROR_STOP || +         on_error == BLOCKDEV_ON_ERROR_ENOSPC) && +        !bdrv_iostatus_is_enabled(bs)) { +        error_setg(errp, "Invalid parameter combination"); +        return; +    } + +    assert(top != bs); +    if (top == base) { +        error_setg(errp, "Invalid files for merge: top and base are the same"); +        return; +    } + +    overlay_bs = bdrv_find_overlay(bs, top); + +    if (overlay_bs == NULL) { +        error_setg(errp, "Could not find overlay image for %s:", top->filename); +        return; +    } + +    orig_base_flags    = bdrv_get_flags(base); +    orig_overlay_flags = bdrv_get_flags(overlay_bs); + +    /* convert base & overlay_bs to r/w, if necessary */ +    if (!(orig_base_flags & BDRV_O_RDWR)) { +        reopen_queue = bdrv_reopen_queue(reopen_queue, base, +                                         orig_base_flags | BDRV_O_RDWR); +    } +    if (!(orig_overlay_flags & BDRV_O_RDWR)) { +        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, +                                         orig_overlay_flags | 
BDRV_O_RDWR); +    } +    if (reopen_queue) { +        bdrv_reopen_multiple(reopen_queue, &local_err); +        if (local_err != NULL) { +            error_propagate(errp, local_err); +            return; +        } +    } + + +    s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp); +    if (!s) { +        return; +    } + +    s->base   = base; +    s->top    = top; +    s->active = bs; + +    s->base_flags          = orig_base_flags; +    s->orig_overlay_flags  = orig_overlay_flags; + +    s->backing_file_str = g_strdup(backing_file_str); + +    s->on_error = on_error; +    s->common.co = qemu_coroutine_create(commit_run); + +    trace_commit_start(bs, base, top, s, s->common.co, opaque); +    qemu_coroutine_enter(s->common.co, s); +} diff --git a/block/curl.c b/block/curl.c new file mode 100644 index 00000000..032cc8ae --- /dev/null +++ b/block/curl.c @@ -0,0 +1,825 @@ +/* + * QEMU Block driver for CURL images + * + * Copyright (c) 2009 Alexander Graf <agraf@suse.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "block/block_int.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qstring.h" +#include <curl/curl.h> + +// #define DEBUG_CURL +// #define DEBUG_VERBOSE + +#ifdef DEBUG_CURL +#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) do { } while (0) +#endif + +#if LIBCURL_VERSION_NUM >= 0x071000 +/* The multi interface timer callback was introduced in 7.16.0 */ +#define NEED_CURL_TIMER_CALLBACK +#define HAVE_SOCKET_ACTION +#endif + +#ifndef HAVE_SOCKET_ACTION +/* If curl_multi_socket_action isn't available, define it statically here in + * terms of curl_multi_socket. Note that ev_bitmask will be ignored, which is + * less efficient but still safe. 
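+ * (Without the ev_bitmask hint libcurl has to check the socket state itself
+ * to see which events are pending, but the outcome is the same.)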
*/ +static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, +                                            curl_socket_t sockfd, +                                            int ev_bitmask, +                                            int *running_handles) +{ +    return curl_multi_socket(multi_handle, sockfd, running_handles); +} +#define curl_multi_socket_action __curl_multi_socket_action +#endif + +#define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ +                   CURLPROTO_FTP | CURLPROTO_FTPS | \ +                   CURLPROTO_TFTP) + +#define CURL_NUM_STATES 8 +#define CURL_NUM_ACB    8 +#define SECTOR_SIZE     512 +#define READ_AHEAD_DEFAULT (256 * 1024) +#define CURL_TIMEOUT_DEFAULT 5 +#define CURL_TIMEOUT_MAX 10000 + +#define FIND_RET_NONE   0 +#define FIND_RET_OK     1 +#define FIND_RET_WAIT   2 + +#define CURL_BLOCK_OPT_URL       "url" +#define CURL_BLOCK_OPT_READAHEAD "readahead" +#define CURL_BLOCK_OPT_SSLVERIFY "sslverify" +#define CURL_BLOCK_OPT_TIMEOUT "timeout" +#define CURL_BLOCK_OPT_COOKIE    "cookie" + +struct BDRVCURLState; + +typedef struct CURLAIOCB { +    BlockAIOCB common; +    QEMUBH *bh; +    QEMUIOVector *qiov; + +    int64_t sector_num; +    int nb_sectors; + +    size_t start; +    size_t end; +} CURLAIOCB; + +typedef struct CURLState +{ +    struct BDRVCURLState *s; +    CURLAIOCB *acb[CURL_NUM_ACB]; +    CURL *curl; +    curl_socket_t sock_fd; +    char *orig_buf; +    size_t buf_start; +    size_t buf_off; +    size_t buf_len; +    char range[128]; +    char errmsg[CURL_ERROR_SIZE]; +    char in_use; +} CURLState; + +typedef struct BDRVCURLState { +    CURLM *multi; +    QEMUTimer timer; +    size_t len; +    CURLState states[CURL_NUM_STATES]; +    char *url; +    size_t readahead_size; +    bool sslverify; +    uint64_t timeout; +    char *cookie; +    bool accept_range; +    AioContext *aio_context; +} BDRVCURLState; + +static void curl_clean_state(CURLState *s); +static void curl_multi_do(void *arg); +static void curl_multi_read(void *arg); + +#ifdef NEED_CURL_TIMER_CALLBACK +static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) +{ +    BDRVCURLState *s = opaque; + +    DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms); +    if (timeout_ms == -1) { +        timer_del(&s->timer); +    } else { +        int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000; +        timer_mod(&s->timer, +                  qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns); +    } +    return 0; +} +#endif + +static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, +                        void *userp, void *sp) +{ +    BDRVCURLState *s; +    CURLState *state = NULL; +    curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&state); +    state->sock_fd = fd; +    s = state->s; + +    DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); +    switch (action) { +        case CURL_POLL_IN: +            aio_set_fd_handler(s->aio_context, fd, curl_multi_read, +                               NULL, state); +            break; +        case CURL_POLL_OUT: +            aio_set_fd_handler(s->aio_context, fd, NULL, curl_multi_do, state); +            break; +        case CURL_POLL_INOUT: +            aio_set_fd_handler(s->aio_context, fd, curl_multi_read, +                               curl_multi_do, state); +            break; +        case CURL_POLL_REMOVE: +            aio_set_fd_handler(s->aio_context, fd, NULL, NULL, NULL); +            break; +    } + +    return 0; +} + +static size_t curl_header_cb(void *ptr, size_t 
size, size_t nmemb, void *opaque) +{ +    BDRVCURLState *s = opaque; +    size_t realsize = size * nmemb; +    const char *accept_line = "Accept-Ranges: bytes"; + +    if (realsize >= strlen(accept_line) +        && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) { +        s->accept_range = true; +    } + +    return realsize; +} + +static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) +{ +    CURLState *s = ((CURLState*)opaque); +    size_t realsize = size * nmemb; +    int i; + +    DPRINTF("CURL: Just reading %zd bytes\n", realsize); + +    if (!s || !s->orig_buf) +        return 0; + +    if (s->buf_off >= s->buf_len) { +        /* buffer full, read nothing */ +        return 0; +    } +    realsize = MIN(realsize, s->buf_len - s->buf_off); +    memcpy(s->orig_buf + s->buf_off, ptr, realsize); +    s->buf_off += realsize; + +    for(i=0; i<CURL_NUM_ACB; i++) { +        CURLAIOCB *acb = s->acb[i]; + +        if (!acb) +            continue; + +        if ((s->buf_off >= acb->end)) { +            qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start, +                                acb->end - acb->start); +            acb->common.cb(acb->common.opaque, 0); +            qemu_aio_unref(acb); +            s->acb[i] = NULL; +        } +    } + +    return realsize; +} + +static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, +                         CURLAIOCB *acb) +{ +    int i; +    size_t end = start + len; + +    for (i=0; i<CURL_NUM_STATES; i++) { +        CURLState *state = &s->states[i]; +        size_t buf_end = (state->buf_start + state->buf_off); +        size_t buf_fend = (state->buf_start + state->buf_len); + +        if (!state->orig_buf) +            continue; +        if (!state->buf_off) +            continue; + +        // Does the existing buffer cover our section? +        if ((start >= state->buf_start) && +            (start <= buf_end) && +            (end >= state->buf_start) && +            (end <= buf_end)) +        { +            char *buf = state->orig_buf + (start - state->buf_start); + +            qemu_iovec_from_buf(acb->qiov, 0, buf, len); +            acb->common.cb(acb->common.opaque, 0); + +            return FIND_RET_OK; +        } + +        // Wait for unfinished chunks +        if (state->in_use && +            (start >= state->buf_start) && +            (start <= buf_fend) && +            (end >= state->buf_start) && +            (end <= buf_fend)) +        { +            int j; + +            acb->start = start - state->buf_start; +            acb->end = acb->start + len; + +            for (j=0; j<CURL_NUM_ACB; j++) { +                if (!state->acb[j]) { +                    state->acb[j] = acb; +                    return FIND_RET_WAIT; +                } +            } +        } +    } + +    return FIND_RET_NONE; +} + +static void curl_multi_check_completion(BDRVCURLState *s) +{ +    int msgs_in_queue; + +    /* Try to find done transfers, so we can free the easy +     * handle again. 
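+     * (Only CURLMSG_DONE messages from curl_multi_info_read() indicate a
+     * finished transfer; the loop below stops after handling one of them.)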
*/ +    for (;;) { +        CURLMsg *msg; +        msg = curl_multi_info_read(s->multi, &msgs_in_queue); + +        /* Quit when there are no more completions */ +        if (!msg) +            break; + +        if (msg->msg == CURLMSG_DONE) { +            CURLState *state = NULL; +            curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, +                              (char **)&state); + +            /* ACBs for successful messages get completed in curl_read_cb */ +            if (msg->data.result != CURLE_OK) { +                int i; +                static int errcount = 100; + +                /* Don't lose the original error message from curl, since +                 * it contains extra data. +                 */ +                if (errcount > 0) { +                    error_report("curl: %s", state->errmsg); +                    if (--errcount == 0) { +                        error_report("curl: further errors suppressed"); +                    } +                } + +                for (i = 0; i < CURL_NUM_ACB; i++) { +                    CURLAIOCB *acb = state->acb[i]; + +                    if (acb == NULL) { +                        continue; +                    } + +                    acb->common.cb(acb->common.opaque, -EPROTO); +                    qemu_aio_unref(acb); +                    state->acb[i] = NULL; +                } +            } + +            curl_clean_state(state); +            break; +        } +    } +} + +static void curl_multi_do(void *arg) +{ +    CURLState *s = (CURLState *)arg; +    int running; +    int r; + +    if (!s->s->multi) { +        return; +    } + +    do { +        r = curl_multi_socket_action(s->s->multi, s->sock_fd, 0, &running); +    } while(r == CURLM_CALL_MULTI_PERFORM); + +} + +static void curl_multi_read(void *arg) +{ +    CURLState *s = (CURLState *)arg; + +    curl_multi_do(arg); +    curl_multi_check_completion(s->s); +} + +static void curl_multi_timeout_do(void *arg) +{ +#ifdef NEED_CURL_TIMER_CALLBACK +    BDRVCURLState *s = (BDRVCURLState *)arg; +    int running; + +    if (!s->multi) { +        return; +    } + +    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); + +    curl_multi_check_completion(s); +#else +    abort(); +#endif +} + +static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s) +{ +    CURLState *state = NULL; +    int i, j; + +    do { +        for (i=0; i<CURL_NUM_STATES; i++) { +            for (j=0; j<CURL_NUM_ACB; j++) +                if (s->states[i].acb[j]) +                    continue; +            if (s->states[i].in_use) +                continue; + +            state = &s->states[i]; +            state->in_use = 1; +            break; +        } +        if (!state) { +            aio_poll(bdrv_get_aio_context(bs), true); +        } +    } while(!state); + +    if (!state->curl) { +        state->curl = curl_easy_init(); +        if (!state->curl) { +            return NULL; +        } +        curl_easy_setopt(state->curl, CURLOPT_URL, s->url); +        curl_easy_setopt(state->curl, CURLOPT_SSL_VERIFYPEER, +                         (long) s->sslverify); +        if (s->cookie) { +            curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie); +        } +        curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, (long)s->timeout); +        curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, +                         (void *)curl_read_cb); +        curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); +        
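+        /* CURLOPT_PRIVATE stores a back-pointer to this CURLState so that
+         * curl_sock_cb() and curl_multi_check_completion() can recover it
+         * via CURLINFO_PRIVATE. */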
curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state); +        curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1); +        curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); +        curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); +        curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); +        curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + +        /* Restrict supported protocols to avoid security issues in the more +         * obscure protocols.  For example, do not allow POP3/SMTP/IMAP see +         * CVE-2013-0249. +         * +         * Restricting protocols is only supported from 7.19.4 upwards. +         */ +#if LIBCURL_VERSION_NUM >= 0x071304 +        curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS); +        curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS); +#endif + +#ifdef DEBUG_VERBOSE +        curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); +#endif +    } + +    state->s = s; + +    return state; +} + +static void curl_clean_state(CURLState *s) +{ +    if (s->s->multi) +        curl_multi_remove_handle(s->s->multi, s->curl); +    s->in_use = 0; +} + +static void curl_parse_filename(const char *filename, QDict *options, +                                Error **errp) +{ +    qdict_put(options, CURL_BLOCK_OPT_URL, qstring_from_str(filename)); +} + +static void curl_detach_aio_context(BlockDriverState *bs) +{ +    BDRVCURLState *s = bs->opaque; +    int i; + +    for (i = 0; i < CURL_NUM_STATES; i++) { +        if (s->states[i].in_use) { +            curl_clean_state(&s->states[i]); +        } +        if (s->states[i].curl) { +            curl_easy_cleanup(s->states[i].curl); +            s->states[i].curl = NULL; +        } +        g_free(s->states[i].orig_buf); +        s->states[i].orig_buf = NULL; +    } +    if (s->multi) { +        curl_multi_cleanup(s->multi); +        s->multi = NULL; +    } + +    timer_del(&s->timer); +} + +static void curl_attach_aio_context(BlockDriverState *bs, +                                    AioContext *new_context) +{ +    BDRVCURLState *s = bs->opaque; + +    aio_timer_init(new_context, &s->timer, +                   QEMU_CLOCK_REALTIME, SCALE_NS, +                   curl_multi_timeout_do, s); + +    assert(!s->multi); +    s->multi = curl_multi_init(); +    s->aio_context = new_context; +    curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); +#ifdef NEED_CURL_TIMER_CALLBACK +    curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); +    curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); +#endif +} + +static QemuOptsList runtime_opts = { +    .name = "curl", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = CURL_BLOCK_OPT_URL, +            .type = QEMU_OPT_STRING, +            .help = "URL to open", +        }, +        { +            .name = CURL_BLOCK_OPT_READAHEAD, +            .type = QEMU_OPT_SIZE, +            .help = "Readahead size", +        }, +        { +            .name = CURL_BLOCK_OPT_SSLVERIFY, +            .type = QEMU_OPT_BOOL, +            .help = "Verify SSL certificate" +        }, +        { +            .name = CURL_BLOCK_OPT_TIMEOUT, +            .type = QEMU_OPT_NUMBER, +            .help = "Curl timeout" +        }, +        { +            .name = CURL_BLOCK_OPT_COOKIE, +            .type = QEMU_OPT_STRING, +            .help = "Pass the cookie or list of cookies with each request" +        }, +        { /* end of list */ } +    }, +}; + +static 
int curl_open(BlockDriverState *bs, QDict *options, int flags, +                     Error **errp) +{ +    BDRVCURLState *s = bs->opaque; +    CURLState *state = NULL; +    QemuOpts *opts; +    Error *local_err = NULL; +    const char *file; +    const char *cookie; +    double d; + +    static int inited = 0; + +    if (flags & BDRV_O_RDWR) { +        error_setg(errp, "curl block device does not support writes"); +        return -EROFS; +    } + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        goto out_noclean; +    } + +    s->readahead_size = qemu_opt_get_size(opts, CURL_BLOCK_OPT_READAHEAD, +                                          READ_AHEAD_DEFAULT); +    if ((s->readahead_size & 0x1ff) != 0) { +        error_setg(errp, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512", +                   s->readahead_size); +        goto out_noclean; +    } + +    s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT, +                                     CURL_TIMEOUT_DEFAULT); +    if (s->timeout > CURL_TIMEOUT_MAX) { +        error_setg(errp, "timeout parameter is too large or negative"); +        goto out_noclean; +    } + +    s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true); + +    cookie = qemu_opt_get(opts, CURL_BLOCK_OPT_COOKIE); +    s->cookie = g_strdup(cookie); + +    file = qemu_opt_get(opts, CURL_BLOCK_OPT_URL); +    if (file == NULL) { +        error_setg(errp, "curl block driver requires an 'url' option"); +        goto out_noclean; +    } + +    if (!inited) { +        curl_global_init(CURL_GLOBAL_ALL); +        inited = 1; +    } + +    DPRINTF("CURL: Opening %s\n", file); +    s->aio_context = bdrv_get_aio_context(bs); +    s->url = g_strdup(file); +    state = curl_init_state(bs, s); +    if (!state) +        goto out_noclean; + +    // Get file size + +    s->accept_range = false; +    curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1); +    curl_easy_setopt(state->curl, CURLOPT_HEADERFUNCTION, +                     curl_header_cb); +    curl_easy_setopt(state->curl, CURLOPT_HEADERDATA, s); +    if (curl_easy_perform(state->curl)) +        goto out; +    curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d); +    if (d) +        s->len = (size_t)d; +    else if(!s->len) +        goto out; +    if ((!strncasecmp(s->url, "http://", strlen("http://")) +        || !strncasecmp(s->url, "https://", strlen("https://"))) +        && !s->accept_range) { +        pstrcpy(state->errmsg, CURL_ERROR_SIZE, +                "Server does not support 'range' (byte ranges)."); +        goto out; +    } +    DPRINTF("CURL: Size = %zd\n", s->len); + +    curl_clean_state(state); +    curl_easy_cleanup(state->curl); +    state->curl = NULL; + +    curl_attach_aio_context(bs, bdrv_get_aio_context(bs)); + +    qemu_opts_del(opts); +    return 0; + +out: +    error_setg(errp, "CURL: Error opening file: %s", state->errmsg); +    curl_easy_cleanup(state->curl); +    state->curl = NULL; +out_noclean: +    g_free(s->cookie); +    g_free(s->url); +    qemu_opts_del(opts); +    return -EINVAL; +} + +static const AIOCBInfo curl_aiocb_info = { +    .aiocb_size         = sizeof(CURLAIOCB), +}; + + +static void curl_readv_bh_cb(void *p) +{ +    CURLState *state; +    int running; + +    CURLAIOCB *acb = p; +    BDRVCURLState *s = acb->common.bs->opaque; + +    qemu_bh_delete(acb->bh); +    acb->bh = NULL; + +    size_t 
start = acb->sector_num * SECTOR_SIZE; +    size_t end; + +    // In case we have the requested data already (e.g. read-ahead), +    // we can just call the callback and be done. +    switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) { +        case FIND_RET_OK: +            qemu_aio_unref(acb); +            // fall through +        case FIND_RET_WAIT: +            return; +        default: +            break; +    } + +    // No cache found, so let's start a new request +    state = curl_init_state(acb->common.bs, s); +    if (!state) { +        acb->common.cb(acb->common.opaque, -EIO); +        qemu_aio_unref(acb); +        return; +    } + +    acb->start = 0; +    acb->end = (acb->nb_sectors * SECTOR_SIZE); + +    state->buf_off = 0; +    g_free(state->orig_buf); +    state->buf_start = start; +    state->buf_len = acb->end + s->readahead_size; +    end = MIN(start + state->buf_len, s->len) - 1; +    state->orig_buf = g_try_malloc(state->buf_len); +    if (state->buf_len && state->orig_buf == NULL) { +        curl_clean_state(state); +        acb->common.cb(acb->common.opaque, -ENOMEM); +        qemu_aio_unref(acb); +        return; +    } +    state->acb[0] = acb; + +    snprintf(state->range, 127, "%zd-%zd", start, end); +    DPRINTF("CURL (AIO): Reading %d at %zd (%s)\n", +            (acb->nb_sectors * SECTOR_SIZE), start, state->range); +    curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range); + +    curl_multi_add_handle(s->multi, state->curl); + +    /* Tell curl it needs to kick things off */ +    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); +} + +static BlockAIOCB *curl_aio_readv(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    CURLAIOCB *acb; + +    acb = qemu_aio_get(&curl_aiocb_info, bs, cb, opaque); + +    acb->qiov = qiov; +    acb->sector_num = sector_num; +    acb->nb_sectors = nb_sectors; + +    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), curl_readv_bh_cb, acb); +    qemu_bh_schedule(acb->bh); +    return &acb->common; +} + +static void curl_close(BlockDriverState *bs) +{ +    BDRVCURLState *s = bs->opaque; + +    DPRINTF("CURL: Close\n"); +    curl_detach_aio_context(bs); + +    g_free(s->cookie); +    g_free(s->url); +} + +static int64_t curl_getlength(BlockDriverState *bs) +{ +    BDRVCURLState *s = bs->opaque; +    return s->len; +} + +static BlockDriver bdrv_http = { +    .format_name                = "http", +    .protocol_name              = "http", + +    .instance_size              = sizeof(BDRVCURLState), +    .bdrv_parse_filename        = curl_parse_filename, +    .bdrv_file_open             = curl_open, +    .bdrv_close                 = curl_close, +    .bdrv_getlength             = curl_getlength, + +    .bdrv_aio_readv             = curl_aio_readv, + +    .bdrv_detach_aio_context    = curl_detach_aio_context, +    .bdrv_attach_aio_context    = curl_attach_aio_context, +}; + +static BlockDriver bdrv_https = { +    .format_name                = "https", +    .protocol_name              = "https", + +    .instance_size              = sizeof(BDRVCURLState), +    .bdrv_parse_filename        = curl_parse_filename, +    .bdrv_file_open             = curl_open, +    .bdrv_close                 = curl_close, +    .bdrv_getlength             = curl_getlength, + +    .bdrv_aio_readv             = curl_aio_readv, + +    .bdrv_detach_aio_context    = curl_detach_aio_context, +    .bdrv_attach_aio_context    = 
curl_attach_aio_context, +}; + +static BlockDriver bdrv_ftp = { +    .format_name                = "ftp", +    .protocol_name              = "ftp", + +    .instance_size              = sizeof(BDRVCURLState), +    .bdrv_parse_filename        = curl_parse_filename, +    .bdrv_file_open             = curl_open, +    .bdrv_close                 = curl_close, +    .bdrv_getlength             = curl_getlength, + +    .bdrv_aio_readv             = curl_aio_readv, + +    .bdrv_detach_aio_context    = curl_detach_aio_context, +    .bdrv_attach_aio_context    = curl_attach_aio_context, +}; + +static BlockDriver bdrv_ftps = { +    .format_name                = "ftps", +    .protocol_name              = "ftps", + +    .instance_size              = sizeof(BDRVCURLState), +    .bdrv_parse_filename        = curl_parse_filename, +    .bdrv_file_open             = curl_open, +    .bdrv_close                 = curl_close, +    .bdrv_getlength             = curl_getlength, + +    .bdrv_aio_readv             = curl_aio_readv, + +    .bdrv_detach_aio_context    = curl_detach_aio_context, +    .bdrv_attach_aio_context    = curl_attach_aio_context, +}; + +static BlockDriver bdrv_tftp = { +    .format_name                = "tftp", +    .protocol_name              = "tftp", + +    .instance_size              = sizeof(BDRVCURLState), +    .bdrv_parse_filename        = curl_parse_filename, +    .bdrv_file_open             = curl_open, +    .bdrv_close                 = curl_close, +    .bdrv_getlength             = curl_getlength, + +    .bdrv_aio_readv             = curl_aio_readv, + +    .bdrv_detach_aio_context    = curl_detach_aio_context, +    .bdrv_attach_aio_context    = curl_attach_aio_context, +}; + +static void curl_block_init(void) +{ +    bdrv_register(&bdrv_http); +    bdrv_register(&bdrv_https); +    bdrv_register(&bdrv_ftp); +    bdrv_register(&bdrv_ftps); +    bdrv_register(&bdrv_tftp); +} + +block_init(curl_block_init); diff --git a/block/dmg.c b/block/dmg.c new file mode 100644 index 00000000..9f252816 --- /dev/null +++ b/block/dmg.c @@ -0,0 +1,725 @@ +/* + * QEMU Block driver for DMG images + * + * Copyright (c) 2004 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/bswap.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include <zlib.h> +#ifdef CONFIG_BZIP2 +#include <bzlib.h> +#endif +#include <glib.h> + +enum { +    /* Limit chunk sizes to prevent unreasonable amounts of memory being used +     * or truncating when converting to 32-bit types +     */ +    DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ +    DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +}; + +typedef struct BDRVDMGState { +    CoMutex lock; +    /* each chunk contains a certain number of sectors, +     * offsets[i] is the offset in the .dmg file, +     * lengths[i] is the length of the compressed chunk, +     * sectors[i] is the sector beginning at offsets[i], +     * sectorcounts[i] is the number of sectors in that chunk, +     * the sectors array is ordered +     * 0<=i<n_chunks */ + +    uint32_t n_chunks; +    uint32_t* types; +    uint64_t* offsets; +    uint64_t* lengths; +    uint64_t* sectors; +    uint64_t* sectorcounts; +    uint32_t current_chunk; +    uint8_t *compressed_chunk; +    uint8_t *uncompressed_chunk; +    z_stream zstream; +#ifdef CONFIG_BZIP2 +    bz_stream bzstream; +#endif +} BDRVDMGState; + +static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    int len; + +    if (!filename) { +        return 0; +    } + +    len = strlen(filename); +    if (len > 4 && !strcmp(filename + len - 4, ".dmg")) { +        return 2; +    } +    return 0; +} + +static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result) +{ +    uint64_t buffer; +    int ret; + +    ret = bdrv_pread(bs->file, offset, &buffer, 8); +    if (ret < 0) { +        return ret; +    } + +    *result = be64_to_cpu(buffer); +    return 0; +} + +static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) +{ +    uint32_t buffer; +    int ret; + +    ret = bdrv_pread(bs->file, offset, &buffer, 4); +    if (ret < 0) { +        return ret; +    } + +    *result = be32_to_cpu(buffer); +    return 0; +} + +static inline uint64_t buff_read_uint64(const uint8_t *buffer, int64_t offset) +{ +    return be64_to_cpu(*(uint64_t *)&buffer[offset]); +} + +static inline uint32_t buff_read_uint32(const uint8_t *buffer, int64_t offset) +{ +    return be32_to_cpu(*(uint32_t *)&buffer[offset]); +} + +/* Increase max chunk sizes, if necessary.  This function is used to calculate + * the buffer sizes needed for compressed/uncompressed chunk I/O. + */ +static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, +                                  uint32_t *max_compressed_size, +                                  uint32_t *max_sectors_per_chunk) +{ +    uint32_t compressed_size = 0; +    uint32_t uncompressed_sectors = 0; + +    switch (s->types[chunk]) { +    case 0x80000005: /* zlib compressed */ +    case 0x80000006: /* bzip2 compressed */ +        compressed_size = s->lengths[chunk]; +        uncompressed_sectors = s->sectorcounts[chunk]; +        break; +    case 1: /* copy */ +        uncompressed_sectors = (s->lengths[chunk] + 511) / 512; +        break; +    case 2: /* zero */ +        /* as the all-zeroes block may be large, it is treated specially: the +         * sector is not copied from a large buffer, a simple memset is used +         * instead. Therefore uncompressed_sectors does not need to be set. 
*/ +        break; +    } + +    if (compressed_size > *max_compressed_size) { +        *max_compressed_size = compressed_size; +    } +    if (uncompressed_sectors > *max_sectors_per_chunk) { +        *max_sectors_per_chunk = uncompressed_sectors; +    } +} + +static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) +{ +    int64_t length; +    int64_t offset = 0; +    uint8_t buffer[515]; +    int i, ret; + +    /* bdrv_getlength returns a multiple of block size (512), rounded up. Since +     * dmg images can have odd sizes, try to look for the "koly" magic which +     * marks the begin of the UDIF trailer (512 bytes). This magic can be found +     * in the last 511 bytes of the second-last sector or the first 4 bytes of +     * the last sector (search space: 515 bytes) */ +    length = bdrv_getlength(file_bs); +    if (length < 0) { +        error_setg_errno(errp, -length, +            "Failed to get file size while reading UDIF trailer"); +        return length; +    } else if (length < 512) { +        error_setg(errp, "dmg file must be at least 512 bytes long"); +        return -EINVAL; +    } +    if (length > 511 + 512) { +        offset = length - 511 - 512; +    } +    length = length < 515 ? length : 515; +    ret = bdrv_pread(file_bs, offset, buffer, length); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Failed while reading UDIF trailer"); +        return ret; +    } +    for (i = 0; i < length - 3; i++) { +        if (buffer[i] == 'k' && buffer[i+1] == 'o' && +            buffer[i+2] == 'l' && buffer[i+3] == 'y') { +            return offset + i; +        } +    } +    error_setg(errp, "Could not locate UDIF trailer in dmg file"); +    return -EINVAL; +} + +/* used when building the sector table */ +typedef struct DmgHeaderState { +    /* used internally by dmg_read_mish_block to remember offsets of blocks +     * across calls */ +    uint64_t data_fork_offset; +    /* exported for dmg_open */ +    uint32_t max_compressed_size; +    uint32_t max_sectors_per_chunk; +} DmgHeaderState; + +static bool dmg_is_known_block_type(uint32_t entry_type) +{ +    switch (entry_type) { +    case 0x00000001:    /* uncompressed */ +    case 0x00000002:    /* zeroes */ +    case 0x80000005:    /* zlib */ +#ifdef CONFIG_BZIP2 +    case 0x80000006:    /* bzip2 */ +#endif +        return true; +    default: +        return false; +    } +} + +static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, +                               uint8_t *buffer, uint32_t count) +{ +    uint32_t type, i; +    int ret; +    size_t new_size; +    uint32_t chunk_count; +    int64_t offset = 0; +    uint64_t data_offset; +    uint64_t in_offset = ds->data_fork_offset; +    uint64_t out_offset; + +    type = buff_read_uint32(buffer, offset); +    /* skip data that is not a valid MISH block (invalid magic or too small) */ +    if (type != 0x6d697368 || count < 244) { +        /* assume success for now */ +        return 0; +    } + +    /* chunk offsets are relative to this sector number */ +    out_offset = buff_read_uint64(buffer, offset + 8); + +    /* location in data fork for (compressed) blob (in bytes) */ +    data_offset = buff_read_uint64(buffer, offset + 0x18); +    in_offset += data_offset; + +    /* move to begin of chunk entries */ +    offset += 204; + +    chunk_count = (count - 204) / 40; +    new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); +    s->types = g_realloc(s->types, new_size / 2); +    s->offsets = g_realloc(s->offsets, new_size); +    
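+    /* new_size is sized for the uint64_t arrays; types[] holds 32-bit
+     * entries, hence the new_size / 2 above. */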
s->lengths = g_realloc(s->lengths, new_size); +    s->sectors = g_realloc(s->sectors, new_size); +    s->sectorcounts = g_realloc(s->sectorcounts, new_size); + +    for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { +        s->types[i] = buff_read_uint32(buffer, offset); +        if (!dmg_is_known_block_type(s->types[i])) { +            chunk_count--; +            i--; +            offset += 40; +            continue; +        } + +        /* sector number */ +        s->sectors[i] = buff_read_uint64(buffer, offset + 8); +        s->sectors[i] += out_offset; + +        /* sector count */ +        s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); + +        /* all-zeroes sector (type 2) does not need to be "uncompressed" and can +         * therefore be unbounded. */ +        if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { +            error_report("sector count %" PRIu64 " for chunk %" PRIu32 +                         " is larger than max (%u)", +                         s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); +            ret = -EINVAL; +            goto fail; +        } + +        /* offset in (compressed) data fork */ +        s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); +        s->offsets[i] += in_offset; + +        /* length in (compressed) data fork */ +        s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); + +        if (s->lengths[i] > DMG_LENGTHS_MAX) { +            error_report("length %" PRIu64 " for chunk %" PRIu32 +                         " is larger than max (%u)", +                         s->lengths[i], i, DMG_LENGTHS_MAX); +            ret = -EINVAL; +            goto fail; +        } + +        update_max_chunk_size(s, i, &ds->max_compressed_size, +                              &ds->max_sectors_per_chunk); +        offset += 40; +    } +    s->n_chunks += chunk_count; +    return 0; + +fail: +    return ret; +} + +static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, +                                  uint64_t info_begin, uint64_t info_length) +{ +    BDRVDMGState *s = bs->opaque; +    int ret; +    uint32_t count, rsrc_data_offset; +    uint8_t *buffer = NULL; +    uint64_t info_end; +    uint64_t offset; + +    /* read offset from begin of resource fork (info_begin) to resource data */ +    ret = read_uint32(bs, info_begin, &rsrc_data_offset); +    if (ret < 0) { +        goto fail; +    } else if (rsrc_data_offset > info_length) { +        ret = -EINVAL; +        goto fail; +    } + +    /* read length of resource data */ +    ret = read_uint32(bs, info_begin + 8, &count); +    if (ret < 0) { +        goto fail; +    } else if (count == 0 || rsrc_data_offset + count > info_length) { +        ret = -EINVAL; +        goto fail; +    } + +    /* begin of resource data (consisting of one or more resources) */ +    offset = info_begin + rsrc_data_offset; + +    /* end of resource data (there is possibly a following resource map +     * which will be ignored). 
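+     * Resources whose data does not start with the 'mish' magic are simply
+     * ignored by dmg_read_mish_block().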
*/ +    info_end = offset + count; + +    /* read offsets (mish blocks) from one or more resources in resource data */ +    while (offset < info_end) { +        /* size of following resource */ +        ret = read_uint32(bs, offset, &count); +        if (ret < 0) { +            goto fail; +        } else if (count == 0 || count > info_end - offset) { +            ret = -EINVAL; +            goto fail; +        } +        offset += 4; + +        buffer = g_realloc(buffer, count); +        ret = bdrv_pread(bs->file, offset, buffer, count); +        if (ret < 0) { +            goto fail; +        } + +        ret = dmg_read_mish_block(s, ds, buffer, count); +        if (ret < 0) { +            goto fail; +        } +        /* advance offset by size of resource */ +        offset += count; +    } +    ret = 0; + +fail: +    g_free(buffer); +    return ret; +} + +static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds, +                              uint64_t info_begin, uint64_t info_length) +{ +    BDRVDMGState *s = bs->opaque; +    int ret; +    uint8_t *buffer = NULL; +    char *data_begin, *data_end; + +    /* Have at least some length to avoid NULL for g_malloc. Attempt to set a +     * safe upper cap on the data length. A test sample had a XML length of +     * about 1 MiB. */ +    if (info_length == 0 || info_length > 16 * 1024 * 1024) { +        ret = -EINVAL; +        goto fail; +    } + +    buffer = g_malloc(info_length + 1); +    buffer[info_length] = '\0'; +    ret = bdrv_pread(bs->file, info_begin, buffer, info_length); +    if (ret != info_length) { +        ret = -EINVAL; +        goto fail; +    } + +    /* look for <data>...</data>. The data is 284 (0x11c) bytes after base64 +     * decode. The actual data element has 431 (0x1af) bytes which includes tabs +     * and line feeds. */ +    data_end = (char *)buffer; +    while ((data_begin = strstr(data_end, "<data>")) != NULL) { +        guchar *mish; +        gsize out_len = 0; + +        data_begin += 6; +        data_end = strstr(data_begin, "</data>"); +        /* malformed XML? 
*/ +        if (data_end == NULL) { +            ret = -EINVAL; +            goto fail; +        } +        *data_end++ = '\0'; +        mish = g_base64_decode(data_begin, &out_len); +        ret = dmg_read_mish_block(s, ds, mish, (uint32_t)out_len); +        g_free(mish); +        if (ret < 0) { +            goto fail; +        } +    } +    ret = 0; + +fail: +    g_free(buffer); +    return ret; +} + +static int dmg_open(BlockDriverState *bs, QDict *options, int flags, +                    Error **errp) +{ +    BDRVDMGState *s = bs->opaque; +    DmgHeaderState ds; +    uint64_t rsrc_fork_offset, rsrc_fork_length; +    uint64_t plist_xml_offset, plist_xml_length; +    int64_t offset; +    int ret; + +    bs->read_only = 1; +    s->n_chunks = 0; +    s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; +    /* used by dmg_read_mish_block to keep track of the current I/O position */ +    ds.data_fork_offset = 0; +    ds.max_compressed_size = 1; +    ds.max_sectors_per_chunk = 1; + +    /* locate the UDIF trailer */ +    offset = dmg_find_koly_offset(bs->file, errp); +    if (offset < 0) { +        ret = offset; +        goto fail; +    } + +    /* offset of data fork (DataForkOffset) */ +    ret = read_uint64(bs, offset + 0x18, &ds.data_fork_offset); +    if (ret < 0) { +        goto fail; +    } else if (ds.data_fork_offset > offset) { +        ret = -EINVAL; +        goto fail; +    } + +    /* offset of resource fork (RsrcForkOffset) */ +    ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset); +    if (ret < 0) { +        goto fail; +    } +    ret = read_uint64(bs, offset + 0x30, &rsrc_fork_length); +    if (ret < 0) { +        goto fail; +    } +    if (rsrc_fork_offset >= offset || +        rsrc_fork_length > offset - rsrc_fork_offset) { +        ret = -EINVAL; +        goto fail; +    } +    /* offset of property list (XMLOffset) */ +    ret = read_uint64(bs, offset + 0xd8, &plist_xml_offset); +    if (ret < 0) { +        goto fail; +    } +    ret = read_uint64(bs, offset + 0xe0, &plist_xml_length); +    if (ret < 0) { +        goto fail; +    } +    if (plist_xml_offset >= offset || +        plist_xml_length > offset - plist_xml_offset) { +        ret = -EINVAL; +        goto fail; +    } +    ret = read_uint64(bs, offset + 0x1ec, (uint64_t *)&bs->total_sectors); +    if (ret < 0) { +        goto fail; +    } +    if (bs->total_sectors < 0) { +        ret = -EINVAL; +        goto fail; +    } +    if (rsrc_fork_length != 0) { +        ret = dmg_read_resource_fork(bs, &ds, +                                     rsrc_fork_offset, rsrc_fork_length); +        if (ret < 0) { +            goto fail; +        } +    } else if (plist_xml_length != 0) { +        ret = dmg_read_plist_xml(bs, &ds, plist_xml_offset, plist_xml_length); +        if (ret < 0) { +            goto fail; +        } +    } else { +        ret = -EINVAL; +        goto fail; +    } + +    /* initialize zlib engine */ +    s->compressed_chunk = qemu_try_blockalign(bs->file, +                                              ds.max_compressed_size + 1); +    s->uncompressed_chunk = qemu_try_blockalign(bs->file, +                                                512 * ds.max_sectors_per_chunk); +    if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) { +        ret = -ENOMEM; +        goto fail; +    } + +    if (inflateInit(&s->zstream) != Z_OK) { +        ret = -EINVAL; +        goto fail; +    } + +    s->current_chunk = s->n_chunks; + +    qemu_co_mutex_init(&s->lock); +    return 0; + +fail: +    
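+    /* The error path frees everything allocated so far itself; dmg_close()
+     * is not expected to run after a failed open. */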
g_free(s->types); +    g_free(s->offsets); +    g_free(s->lengths); +    g_free(s->sectors); +    g_free(s->sectorcounts); +    qemu_vfree(s->compressed_chunk); +    qemu_vfree(s->uncompressed_chunk); +    return ret; +} + +static inline int is_sector_in_chunk(BDRVDMGState* s, +                uint32_t chunk_num, uint64_t sector_num) +{ +    if (chunk_num >= s->n_chunks || s->sectors[chunk_num] > sector_num || +            s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) { +        return 0; +    } else { +        return -1; +    } +} + +static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) +{ +    /* binary search */ +    uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3; +    while (chunk1 != chunk2) { +        chunk3 = (chunk1 + chunk2) / 2; +        if (s->sectors[chunk3] > sector_num) { +            chunk2 = chunk3; +        } else if (s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) { +            return chunk3; +        } else { +            chunk1 = chunk3; +        } +    } +    return s->n_chunks; /* error */ +} + +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) +{ +    BDRVDMGState *s = bs->opaque; + +    if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { +        int ret; +        uint32_t chunk = search_chunk(s, sector_num); +#ifdef CONFIG_BZIP2 +        uint64_t total_out; +#endif + +        if (chunk >= s->n_chunks) { +            return -1; +        } + +        s->current_chunk = s->n_chunks; +        switch (s->types[chunk]) { /* block entry type */ +        case 0x80000005: { /* zlib compressed */ +            /* we need to buffer, because only the chunk as whole can be +             * inflated. */ +            ret = bdrv_pread(bs->file, s->offsets[chunk], +                             s->compressed_chunk, s->lengths[chunk]); +            if (ret != s->lengths[chunk]) { +                return -1; +            } + +            s->zstream.next_in = s->compressed_chunk; +            s->zstream.avail_in = s->lengths[chunk]; +            s->zstream.next_out = s->uncompressed_chunk; +            s->zstream.avail_out = 512 * s->sectorcounts[chunk]; +            ret = inflateReset(&s->zstream); +            if (ret != Z_OK) { +                return -1; +            } +            ret = inflate(&s->zstream, Z_FINISH); +            if (ret != Z_STREAM_END || +                s->zstream.total_out != 512 * s->sectorcounts[chunk]) { +                return -1; +            } +            break; } +#ifdef CONFIG_BZIP2 +        case 0x80000006: /* bzip2 compressed */ +            /* we need to buffer, because only the chunk as whole can be +             * inflated. 
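+             * Unlike the zlib path above, a fresh bzip2 stream is initialised
+             * and torn down for every chunk.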
*/ +            ret = bdrv_pread(bs->file, s->offsets[chunk], +                             s->compressed_chunk, s->lengths[chunk]); +            if (ret != s->lengths[chunk]) { +                return -1; +            } + +            ret = BZ2_bzDecompressInit(&s->bzstream, 0, 0); +            if (ret != BZ_OK) { +                return -1; +            } +            s->bzstream.next_in = (char *)s->compressed_chunk; +            s->bzstream.avail_in = (unsigned int) s->lengths[chunk]; +            s->bzstream.next_out = (char *)s->uncompressed_chunk; +            s->bzstream.avail_out = (unsigned int) 512 * s->sectorcounts[chunk]; +            ret = BZ2_bzDecompress(&s->bzstream); +            total_out = ((uint64_t)s->bzstream.total_out_hi32 << 32) + +                        s->bzstream.total_out_lo32; +            BZ2_bzDecompressEnd(&s->bzstream); +            if (ret != BZ_STREAM_END || +                total_out != 512 * s->sectorcounts[chunk]) { +                return -1; +            } +            break; +#endif /* CONFIG_BZIP2 */ +        case 1: /* copy */ +            ret = bdrv_pread(bs->file, s->offsets[chunk], +                             s->uncompressed_chunk, s->lengths[chunk]); +            if (ret != s->lengths[chunk]) { +                return -1; +            } +            break; +        case 2: /* zero */ +            /* see dmg_read, it is treated specially. No buffer needs to be +             * pre-filled, the zeroes can be set directly. */ +            break; +        } +        s->current_chunk = chunk; +    } +    return 0; +} + +static int dmg_read(BlockDriverState *bs, int64_t sector_num, +                    uint8_t *buf, int nb_sectors) +{ +    BDRVDMGState *s = bs->opaque; +    int i; + +    for (i = 0; i < nb_sectors; i++) { +        uint32_t sector_offset_in_chunk; +        if (dmg_read_chunk(bs, sector_num + i) != 0) { +            return -1; +        } +        /* Special case: current chunk is all zeroes. Do not perform a memcpy as +         * s->uncompressed_chunk may be too small to cover the large all-zeroes +         * section. 
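+         * The sectors are instead zeroed one at a time with memset() below.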
dmg_read_chunk is called to find s->current_chunk */ +        if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ +            memset(buf + i * 512, 0, 512); +            continue; +        } +        sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; +        memcpy(buf + i * 512, +               s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); +    } +    return 0; +} + +static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num, +                                    uint8_t *buf, int nb_sectors) +{ +    int ret; +    BDRVDMGState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = dmg_read(bs, sector_num, buf, nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static void dmg_close(BlockDriverState *bs) +{ +    BDRVDMGState *s = bs->opaque; + +    g_free(s->types); +    g_free(s->offsets); +    g_free(s->lengths); +    g_free(s->sectors); +    g_free(s->sectorcounts); +    qemu_vfree(s->compressed_chunk); +    qemu_vfree(s->uncompressed_chunk); + +    inflateEnd(&s->zstream); +} + +static BlockDriver bdrv_dmg = { +    .format_name    = "dmg", +    .instance_size  = sizeof(BDRVDMGState), +    .bdrv_probe     = dmg_probe, +    .bdrv_open      = dmg_open, +    .bdrv_read      = dmg_co_read, +    .bdrv_close     = dmg_close, +}; + +static void bdrv_dmg_init(void) +{ +    bdrv_register(&bdrv_dmg); +} + +block_init(bdrv_dmg_init); diff --git a/block/gluster.c b/block/gluster.c new file mode 100644 index 00000000..1eb3a8c3 --- /dev/null +++ b/block/gluster.c @@ -0,0 +1,833 @@ +/* + * GlusterFS backend for QEMU + * + * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#include <glusterfs/api/glfs.h> +#include "block/block_int.h" +#include "qemu/uri.h" + +typedef struct GlusterAIOCB { +    int64_t size; +    int ret; +    QEMUBH *bh; +    Coroutine *coroutine; +    AioContext *aio_context; +} GlusterAIOCB; + +typedef struct BDRVGlusterState { +    struct glfs *glfs; +    struct glfs_fd *fd; +} BDRVGlusterState; + +typedef struct GlusterConf { +    char *server; +    int port; +    char *volname; +    char *image; +    char *transport; +} GlusterConf; + +static void qemu_gluster_gconf_free(GlusterConf *gconf) +{ +    if (gconf) { +        g_free(gconf->server); +        g_free(gconf->volname); +        g_free(gconf->image); +        g_free(gconf->transport); +        g_free(gconf); +    } +} + +static int parse_volume_options(GlusterConf *gconf, char *path) +{ +    char *p, *q; + +    if (!path) { +        return -EINVAL; +    } + +    /* volume */ +    p = q = path + strspn(path, "/"); +    p += strcspn(p, "/"); +    if (*p == '\0') { +        return -EINVAL; +    } +    gconf->volname = g_strndup(q, p - q); + +    /* image */ +    p += strspn(p, "/"); +    if (*p == '\0') { +        return -EINVAL; +    } +    gconf->image = g_strdup(p); +    return 0; +} + +/* + * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...] + * + * 'gluster' is the protocol. + * + * 'transport' specifies the transport type used to connect to gluster + * management daemon (glusterd). Valid transport types are + * tcp, unix and rdma. If a transport type isn't specified, then tcp + * type is assumed. + * + * 'server' specifies the server where the volume file specification for + * the given volume resides. 
This can be either hostname, ipv4 address + * or ipv6 address. ipv6 address needs to be within square brackets [ ]. + * If transport type is 'unix', then 'server' field should not be specified. + * The 'socket' field needs to be populated with the path to unix domain + * socket. + * + * 'port' is the port number on which glusterd is listening. This is optional + * and if not specified, QEMU will send 0 which will make gluster to use the + * default port. If the transport type is unix, then 'port' should not be + * specified. + * + * 'volname' is the name of the gluster volume which contains the VM image. + * + * 'image' is the path to the actual VM image that resides on gluster volume. + * + * Examples: + * + * file=gluster://1.2.3.4/testvol/a.img + * file=gluster+tcp://1.2.3.4/testvol/a.img + * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img + * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img + * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img + * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img + * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket + * file=gluster+rdma://1.2.3.4:24007/testvol/a.img + */ +static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) +{ +    URI *uri; +    QueryParams *qp = NULL; +    bool is_unix = false; +    int ret = 0; + +    uri = uri_parse(filename); +    if (!uri) { +        return -EINVAL; +    } + +    /* transport */ +    if (!uri->scheme || !strcmp(uri->scheme, "gluster")) { +        gconf->transport = g_strdup("tcp"); +    } else if (!strcmp(uri->scheme, "gluster+tcp")) { +        gconf->transport = g_strdup("tcp"); +    } else if (!strcmp(uri->scheme, "gluster+unix")) { +        gconf->transport = g_strdup("unix"); +        is_unix = true; +    } else if (!strcmp(uri->scheme, "gluster+rdma")) { +        gconf->transport = g_strdup("rdma"); +    } else { +        ret = -EINVAL; +        goto out; +    } + +    ret = parse_volume_options(gconf, uri->path); +    if (ret < 0) { +        goto out; +    } + +    qp = query_params_parse(uri->query); +    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) { +        ret = -EINVAL; +        goto out; +    } + +    if (is_unix) { +        if (uri->server || uri->port) { +            ret = -EINVAL; +            goto out; +        } +        if (strcmp(qp->p[0].name, "socket")) { +            ret = -EINVAL; +            goto out; +        } +        gconf->server = g_strdup(qp->p[0].value); +    } else { +        gconf->server = g_strdup(uri->server ? 
uri->server : "localhost"); +        gconf->port = uri->port; +    } + +out: +    if (qp) { +        query_params_free(qp); +    } +    uri_free(uri); +    return ret; +} + +static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, +                                      Error **errp) +{ +    struct glfs *glfs = NULL; +    int ret; +    int old_errno; + +    ret = qemu_gluster_parseuri(gconf, filename); +    if (ret < 0) { +        error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/" +                   "volname/image[?socket=...]"); +        errno = -ret; +        goto out; +    } + +    glfs = glfs_new(gconf->volname); +    if (!glfs) { +        goto out; +    } + +    ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server, +            gconf->port); +    if (ret < 0) { +        goto out; +    } + +    /* +     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when +     * GlusterFS makes GF_LOG_* macros available to libgfapi users. +     */ +    ret = glfs_set_logging(glfs, "-", 4); +    if (ret < 0) { +        goto out; +    } + +    ret = glfs_init(glfs); +    if (ret) { +        error_setg_errno(errp, errno, +                         "Gluster connection failed for server=%s port=%d " +                         "volume=%s image=%s transport=%s", gconf->server, +                         gconf->port, gconf->volname, gconf->image, +                         gconf->transport); + +        /* glfs_init sometimes doesn't set errno although docs suggest that */ +        if (errno == 0) +            errno = EINVAL; + +        goto out; +    } +    return glfs; + +out: +    if (glfs) { +        old_errno = errno; +        glfs_fini(glfs); +        errno = old_errno; +    } +    return NULL; +} + +static void qemu_gluster_complete_aio(void *opaque) +{ +    GlusterAIOCB *acb = (GlusterAIOCB *)opaque; + +    qemu_bh_delete(acb->bh); +    acb->bh = NULL; +    qemu_coroutine_enter(acb->coroutine, NULL); +} + +/* + * AIO callback routine called from GlusterFS thread. 
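+ *
+ * Rough completion path, as implemented below:
+ *   glfs_*_async() completes -> gluster_finish_aiocb()   (gluster thread)
+ *     -> aio_bh_new() + qemu_bh_schedule()               (defer to QEMU)
+ *   qemu_gluster_complete_aio()                          (AioContext thread)
+ *     -> qemu_coroutine_enter(acb->coroutine)            (resume the request)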
+ */ +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) +{ +    GlusterAIOCB *acb = (GlusterAIOCB *)arg; + +    if (!ret || ret == acb->size) { +        acb->ret = 0; /* Success */ +    } else if (ret < 0) { +        acb->ret = ret; /* Read/Write failed */ +    } else { +        acb->ret = -EIO; /* Partial read/write - fail it */ +    } + +    acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb); +    qemu_bh_schedule(acb->bh); +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { +    .name = "gluster", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "filename", +            .type = QEMU_OPT_STRING, +            .help = "URL to the gluster image", +        }, +        { /* end of list */ } +    }, +}; + +static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) +{ +    assert(open_flags != NULL); + +    *open_flags |= O_BINARY; + +    if (bdrv_flags & BDRV_O_RDWR) { +        *open_flags |= O_RDWR; +    } else { +        *open_flags |= O_RDONLY; +    } + +    if ((bdrv_flags & BDRV_O_NOCACHE)) { +        *open_flags |= O_DIRECT; +    } +} + +static int qemu_gluster_open(BlockDriverState *bs,  QDict *options, +                             int bdrv_flags, Error **errp) +{ +    BDRVGlusterState *s = bs->opaque; +    int open_flags = 0; +    int ret = 0; +    GlusterConf *gconf = g_new0(GlusterConf, 1); +    QemuOpts *opts; +    Error *local_err = NULL; +    const char *filename; + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto out; +    } + +    filename = qemu_opt_get(opts, "filename"); + +    s->glfs = qemu_gluster_init(gconf, filename, errp); +    if (!s->glfs) { +        ret = -errno; +        goto out; +    } + +    qemu_gluster_parse_flags(bdrv_flags, &open_flags); + +    s->fd = glfs_open(s->glfs, gconf->image, open_flags); +    if (!s->fd) { +        ret = -errno; +    } + +out: +    qemu_opts_del(opts); +    qemu_gluster_gconf_free(gconf); +    if (!ret) { +        return ret; +    } +    if (s->fd) { +        glfs_close(s->fd); +    } +    if (s->glfs) { +        glfs_fini(s->glfs); +    } +    return ret; +} + +typedef struct BDRVGlusterReopenState { +    struct glfs *glfs; +    struct glfs_fd *fd; +} BDRVGlusterReopenState; + + +static int qemu_gluster_reopen_prepare(BDRVReopenState *state, +                                       BlockReopenQueue *queue, Error **errp) +{ +    int ret = 0; +    BDRVGlusterReopenState *reop_s; +    GlusterConf *gconf = NULL; +    int open_flags = 0; + +    assert(state != NULL); +    assert(state->bs != NULL); + +    state->opaque = g_new0(BDRVGlusterReopenState, 1); +    reop_s = state->opaque; + +    qemu_gluster_parse_flags(state->flags, &open_flags); + +    gconf = g_new0(GlusterConf, 1); + +    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp); +    if (reop_s->glfs == NULL) { +        ret = -errno; +        goto exit; +    } + +    reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); +    if (reop_s->fd == NULL) { +        /* reops->glfs will be cleaned up in _abort */ +        ret = -errno; +        goto exit; +    } + +exit: +    /* state->opaque will be freed in either the _abort or _commit */ +    qemu_gluster_gconf_free(gconf); +    return ret; +} + +static void 
qemu_gluster_reopen_commit(BDRVReopenState *state) +{ +    BDRVGlusterReopenState *reop_s = state->opaque; +    BDRVGlusterState *s = state->bs->opaque; + + +    /* close the old */ +    if (s->fd) { +        glfs_close(s->fd); +    } +    if (s->glfs) { +        glfs_fini(s->glfs); +    } + +    /* use the newly opened image / connection */ +    s->fd         = reop_s->fd; +    s->glfs       = reop_s->glfs; + +    g_free(state->opaque); +    state->opaque = NULL; + +    return; +} + + +static void qemu_gluster_reopen_abort(BDRVReopenState *state) +{ +    BDRVGlusterReopenState *reop_s = state->opaque; + +    if (reop_s == NULL) { +        return; +    } + +    if (reop_s->fd) { +        glfs_close(reop_s->fd); +    } + +    if (reop_s->glfs) { +        glfs_fini(reop_s->glfs); +    } + +    g_free(state->opaque); +    state->opaque = NULL; + +    return; +} + +#ifdef CONFIG_GLUSTERFS_ZEROFILL +static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ +    int ret; +    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); +    BDRVGlusterState *s = bs->opaque; +    off_t size = nb_sectors * BDRV_SECTOR_SIZE; +    off_t offset = sector_num * BDRV_SECTOR_SIZE; + +    acb->size = size; +    acb->ret = 0; +    acb->coroutine = qemu_coroutine_self(); +    acb->aio_context = bdrv_get_aio_context(bs); + +    ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb); +    if (ret < 0) { +        ret = -errno; +        goto out; +    } + +    qemu_coroutine_yield(); +    ret = acb->ret; + +out: +    g_slice_free(GlusterAIOCB, acb); +    return ret; +} + +static inline bool gluster_supports_zerofill(void) +{ +    return 1; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, +        int64_t size) +{ +    return glfs_zerofill(fd, offset, size); +} + +#else +static inline bool gluster_supports_zerofill(void) +{ +    return 0; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, +        int64_t size) +{ +    return 0; +} +#endif + +static int qemu_gluster_create(const char *filename, +                               QemuOpts *opts, Error **errp) +{ +    struct glfs *glfs; +    struct glfs_fd *fd; +    int ret = 0; +    int prealloc = 0; +    int64_t total_size = 0; +    char *tmp = NULL; +    GlusterConf *gconf = g_new0(GlusterConf, 1); + +    glfs = qemu_gluster_init(gconf, filename, errp); +    if (!glfs) { +        ret = -errno; +        goto out; +    } + +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); + +    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); +    if (!tmp || !strcmp(tmp, "off")) { +        prealloc = 0; +    } else if (!strcmp(tmp, "full") && +               gluster_supports_zerofill()) { +        prealloc = 1; +    } else { +        error_setg(errp, "Invalid preallocation mode: '%s'" +            " or GlusterFS doesn't support zerofill API", +            tmp); +        ret = -EINVAL; +        goto out; +    } + +    fd = glfs_creat(glfs, gconf->image, +        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); +    if (!fd) { +        ret = -errno; +    } else { +        if (!glfs_ftruncate(fd, total_size)) { +            if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) { +                ret = -errno; +            } +        } else { +            ret = -errno; +        } + +        if (glfs_close(fd) != 0) { +            ret = 
-errno; +        } +    } +out: +    g_free(tmp); +    qemu_gluster_gconf_free(gconf); +    if (glfs) { +        glfs_fini(glfs); +    } +    return ret; +} + +static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) +{ +    int ret; +    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); +    BDRVGlusterState *s = bs->opaque; +    size_t size = nb_sectors * BDRV_SECTOR_SIZE; +    off_t offset = sector_num * BDRV_SECTOR_SIZE; + +    acb->size = size; +    acb->ret = 0; +    acb->coroutine = qemu_coroutine_self(); +    acb->aio_context = bdrv_get_aio_context(bs); + +    if (write) { +        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, +            &gluster_finish_aiocb, acb); +    } else { +        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0, +            &gluster_finish_aiocb, acb); +    } + +    if (ret < 0) { +        ret = -errno; +        goto out; +    } + +    qemu_coroutine_yield(); +    ret = acb->ret; + +out: +    g_slice_free(GlusterAIOCB, acb); +    return ret; +} + +static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) +{ +    int ret; +    BDRVGlusterState *s = bs->opaque; + +    ret = glfs_ftruncate(s->fd, offset); +    if (ret < 0) { +        return -errno; +    } + +    return 0; +} + +static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ +    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); +} + +static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ +    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); +} + +static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) +{ +    int ret; +    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); +    BDRVGlusterState *s = bs->opaque; + +    acb->size = 0; +    acb->ret = 0; +    acb->coroutine = qemu_coroutine_self(); +    acb->aio_context = bdrv_get_aio_context(bs); + +    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); +    if (ret < 0) { +        ret = -errno; +        goto out; +    } + +    qemu_coroutine_yield(); +    ret = acb->ret; + +out: +    g_slice_free(GlusterAIOCB, acb); +    return ret; +} + +#ifdef CONFIG_GLUSTERFS_DISCARD +static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors) +{ +    int ret; +    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); +    BDRVGlusterState *s = bs->opaque; +    size_t size = nb_sectors * BDRV_SECTOR_SIZE; +    off_t offset = sector_num * BDRV_SECTOR_SIZE; + +    acb->size = 0; +    acb->ret = 0; +    acb->coroutine = qemu_coroutine_self(); +    acb->aio_context = bdrv_get_aio_context(bs); + +    ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); +    if (ret < 0) { +        ret = -errno; +        goto out; +    } + +    qemu_coroutine_yield(); +    ret = acb->ret; + +out: +    g_slice_free(GlusterAIOCB, acb); +    return ret; +} +#endif + +static int64_t qemu_gluster_getlength(BlockDriverState *bs) +{ +    BDRVGlusterState *s = bs->opaque; +    int64_t ret; + +    ret = glfs_lseek(s->fd, 0, SEEK_END); +    if (ret < 0) { +        return -errno; +    } else { +        return ret; +    } +} + +static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs) +{ +    BDRVGlusterState *s = bs->opaque; +    struct stat st; +    int ret; + +    ret = 
glfs_fstat(s->fd, &st); +    if (ret < 0) { +        return -errno; +    } else { +        return st.st_blocks * 512; +    } +} + +static void qemu_gluster_close(BlockDriverState *bs) +{ +    BDRVGlusterState *s = bs->opaque; + +    if (s->fd) { +        glfs_close(s->fd); +        s->fd = NULL; +    } +    glfs_fini(s->glfs); +} + +static int qemu_gluster_has_zero_init(BlockDriverState *bs) +{ +    /* GlusterFS volume could be backed by a block device */ +    return 0; +} + +static QemuOptsList qemu_gluster_create_opts = { +    .name = "qemu-gluster-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_PREALLOC, +            .type = QEMU_OPT_STRING, +            .help = "Preallocation mode (allowed values: off, full)" +        }, +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_gluster = { +    .format_name                  = "gluster", +    .protocol_name                = "gluster", +    .instance_size                = sizeof(BDRVGlusterState), +    .bdrv_needs_filename          = true, +    .bdrv_file_open               = qemu_gluster_open, +    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare, +    .bdrv_reopen_commit           = qemu_gluster_reopen_commit, +    .bdrv_reopen_abort            = qemu_gluster_reopen_abort, +    .bdrv_close                   = qemu_gluster_close, +    .bdrv_create                  = qemu_gluster_create, +    .bdrv_getlength               = qemu_gluster_getlength, +    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, +    .bdrv_truncate                = qemu_gluster_truncate, +    .bdrv_co_readv                = qemu_gluster_co_readv, +    .bdrv_co_writev               = qemu_gluster_co_writev, +    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk, +    .bdrv_has_zero_init           = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD +    .bdrv_co_discard              = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL +    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes, +#endif +    .create_opts                  = &qemu_gluster_create_opts, +}; + +static BlockDriver bdrv_gluster_tcp = { +    .format_name                  = "gluster", +    .protocol_name                = "gluster+tcp", +    .instance_size                = sizeof(BDRVGlusterState), +    .bdrv_needs_filename          = true, +    .bdrv_file_open               = qemu_gluster_open, +    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare, +    .bdrv_reopen_commit           = qemu_gluster_reopen_commit, +    .bdrv_reopen_abort            = qemu_gluster_reopen_abort, +    .bdrv_close                   = qemu_gluster_close, +    .bdrv_create                  = qemu_gluster_create, +    .bdrv_getlength               = qemu_gluster_getlength, +    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, +    .bdrv_truncate                = qemu_gluster_truncate, +    .bdrv_co_readv                = qemu_gluster_co_readv, +    .bdrv_co_writev               = qemu_gluster_co_writev, +    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk, +    .bdrv_has_zero_init           = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD +    .bdrv_co_discard              = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL +    
.bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes, +#endif +    .create_opts                  = &qemu_gluster_create_opts, +}; + +static BlockDriver bdrv_gluster_unix = { +    .format_name                  = "gluster", +    .protocol_name                = "gluster+unix", +    .instance_size                = sizeof(BDRVGlusterState), +    .bdrv_needs_filename          = true, +    .bdrv_file_open               = qemu_gluster_open, +    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare, +    .bdrv_reopen_commit           = qemu_gluster_reopen_commit, +    .bdrv_reopen_abort            = qemu_gluster_reopen_abort, +    .bdrv_close                   = qemu_gluster_close, +    .bdrv_create                  = qemu_gluster_create, +    .bdrv_getlength               = qemu_gluster_getlength, +    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, +    .bdrv_truncate                = qemu_gluster_truncate, +    .bdrv_co_readv                = qemu_gluster_co_readv, +    .bdrv_co_writev               = qemu_gluster_co_writev, +    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk, +    .bdrv_has_zero_init           = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD +    .bdrv_co_discard              = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL +    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes, +#endif +    .create_opts                  = &qemu_gluster_create_opts, +}; + +static BlockDriver bdrv_gluster_rdma = { +    .format_name                  = "gluster", +    .protocol_name                = "gluster+rdma", +    .instance_size                = sizeof(BDRVGlusterState), +    .bdrv_needs_filename          = true, +    .bdrv_file_open               = qemu_gluster_open, +    .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare, +    .bdrv_reopen_commit           = qemu_gluster_reopen_commit, +    .bdrv_reopen_abort            = qemu_gluster_reopen_abort, +    .bdrv_close                   = qemu_gluster_close, +    .bdrv_create                  = qemu_gluster_create, +    .bdrv_getlength               = qemu_gluster_getlength, +    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, +    .bdrv_truncate                = qemu_gluster_truncate, +    .bdrv_co_readv                = qemu_gluster_co_readv, +    .bdrv_co_writev               = qemu_gluster_co_writev, +    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk, +    .bdrv_has_zero_init           = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD +    .bdrv_co_discard              = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL +    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes, +#endif +    .create_opts                  = &qemu_gluster_create_opts, +}; + +static void bdrv_gluster_init(void) +{ +    bdrv_register(&bdrv_gluster_rdma); +    bdrv_register(&bdrv_gluster_unix); +    bdrv_register(&bdrv_gluster_tcp); +    bdrv_register(&bdrv_gluster); +} + +block_init(bdrv_gluster_init); diff --git a/block/io.c b/block/io.c new file mode 100644 index 00000000..d4bc83b3 --- /dev/null +++ b/block/io.c @@ -0,0 +1,2610 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "trace.h" +#include "block/blockjob.h" +#include "block/block_int.h" +#include "block/throttle-groups.h" +#include "qemu/error-report.h" + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque); +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, +                                         int64_t sector_num, int nb_sectors, +                                         QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, +                                         int64_t sector_num, int nb_sectors, +                                         QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, +    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, +    BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, +    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, +    BdrvRequestFlags flags); +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, +                                         int64_t sector_num, +                                         QEMUIOVector *qiov, +                                         int nb_sectors, +                                         BdrvRequestFlags flags, +                                         BlockCompletionFunc *cb, +                                         void *opaque, +                                         bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + +/* throttling disk I/O limits */ +void bdrv_set_io_limits(BlockDriverState *bs, +                        ThrottleConfig *cfg) +{ +    int i; + +    throttle_group_config(bs, cfg); + +    for (i = 0; i < 2; i++) { +        qemu_co_enter_next(&bs->throttled_reqs[i]); +    } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ +    bool drained = false; +    bool enabled = bs->io_limits_enabled; +    int i; + +    bs->io_limits_enabled = false; + +    for (i = 0; i < 2; i++) { +        while (qemu_co_enter_next(&bs->throttled_reqs[i])) { +            drained = true; +        } +    } + +    bs->io_limits_enabled = enabled; + +    return drained; +} + +void 
bdrv_io_limits_disable(BlockDriverState *bs) +{ +    bs->io_limits_enabled = false; +    bdrv_start_throttled_reqs(bs); +    throttle_group_unregister_bs(bs); +} + +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) +{ +    assert(!bs->io_limits_enabled); +    throttle_group_register_bs(bs, group); +    bs->io_limits_enabled = true; +} + +void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) +{ +    /* this bs is not part of any group */ +    if (!bs->throttle_state) { +        return; +    } + +    /* this bs is a part of the same group than the one we want */ +    if (!g_strcmp0(throttle_group_get_name(bs), group)) { +        return; +    } + +    /* need to change the group this bs belong to */ +    bdrv_io_limits_disable(bs); +    bdrv_io_limits_enable(bs, group); +} + +void bdrv_setup_io_funcs(BlockDriver *bdrv) +{ +    /* Block drivers without coroutine functions need emulation */ +    if (!bdrv->bdrv_co_readv) { +        bdrv->bdrv_co_readv = bdrv_co_readv_em; +        bdrv->bdrv_co_writev = bdrv_co_writev_em; + +        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if +         * the block driver lacks aio we need to emulate that too. +         */ +        if (!bdrv->bdrv_aio_readv) { +            /* add AIO emulation layer */ +            bdrv->bdrv_aio_readv = bdrv_aio_readv_em; +            bdrv->bdrv_aio_writev = bdrv_aio_writev_em; +        } +    } +} + +void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) +{ +    BlockDriver *drv = bs->drv; +    Error *local_err = NULL; + +    memset(&bs->bl, 0, sizeof(bs->bl)); + +    if (!drv) { +        return; +    } + +    /* Take some limits from the children as a default */ +    if (bs->file) { +        bdrv_refresh_limits(bs->file, &local_err); +        if (local_err) { +            error_propagate(errp, local_err); +            return; +        } +        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; +        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; +        bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment; +        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; +    } else { +        bs->bl.min_mem_alignment = 512; +        bs->bl.opt_mem_alignment = getpagesize(); +    } + +    if (bs->backing_hd) { +        bdrv_refresh_limits(bs->backing_hd, &local_err); +        if (local_err) { +            error_propagate(errp, local_err); +            return; +        } +        bs->bl.opt_transfer_length = +            MAX(bs->bl.opt_transfer_length, +                bs->backing_hd->bl.opt_transfer_length); +        bs->bl.max_transfer_length = +            MIN_NON_ZERO(bs->bl.max_transfer_length, +                         bs->backing_hd->bl.max_transfer_length); +        bs->bl.opt_mem_alignment = +            MAX(bs->bl.opt_mem_alignment, +                bs->backing_hd->bl.opt_mem_alignment); +        bs->bl.min_mem_alignment = +            MAX(bs->bl.min_mem_alignment, +                bs->backing_hd->bl.min_mem_alignment); +    } + +    /* Then let the driver override it */ +    if (drv->bdrv_refresh_limits) { +        drv->bdrv_refresh_limits(bs, errp); +    } +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. 
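+ * In other words, bs->copy_on_read is a plain counter: it only drops back to
+ * zero once every bdrv_enable_copy_on_read() call has been matched by a
+ * bdrv_disable_copy_on_read() call.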
+ */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ +    bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ +    assert(bs->copy_on_read > 0); +    bs->copy_on_read--; +} + +/* Check if any requests are in-flight (including throttled requests) */ +static bool bdrv_requests_pending(BlockDriverState *bs) +{ +    if (!QLIST_EMPTY(&bs->tracked_requests)) { +        return true; +    } +    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { +        return true; +    } +    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { +        return true; +    } +    if (bs->file && bdrv_requests_pending(bs->file)) { +        return true; +    } +    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { +        return true; +    } +    return false; +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + * + * Only this BlockDriverState's AioContext is run, so in-flight requests must + * not depend on events in other AioContexts.  In that case, use + * bdrv_drain_all() instead. + */ +void bdrv_drain(BlockDriverState *bs) +{ +    bool busy = true; + +    while (busy) { +        /* Keep iterating */ +         bdrv_flush_io_queue(bs); +         busy = bdrv_requests_pending(bs); +         busy |= aio_poll(bdrv_get_aio_context(bs), busy); +    } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + */ +void bdrv_drain_all(void) +{ +    /* Always run first iteration so any pending completion BHs run */ +    bool busy = true; +    BlockDriverState *bs = NULL; +    GSList *aio_ctxs = NULL, *ctx; + +    while ((bs = bdrv_next(bs))) { +        AioContext *aio_context = bdrv_get_aio_context(bs); + +        aio_context_acquire(aio_context); +        if (bs->job) { +            block_job_pause(bs->job); +        } +        aio_context_release(aio_context); + +        if (!g_slist_find(aio_ctxs, aio_context)) { +            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); +        } +    } + +    /* Note that completion of an asynchronous I/O operation can trigger any +     * number of other I/O operations on other devices---for example a +     * coroutine can submit an I/O request to another device in response to +     * request completion.  Therefore we must keep looping until there was no +     * more activity rather than simply draining each device independently. 
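+     * Each AioContext is acquired while its devices are flushed and polled,
+     * so request completions run under the proper context.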
+     */ +    while (busy) { +        busy = false; + +        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { +            AioContext *aio_context = ctx->data; +            bs = NULL; + +            aio_context_acquire(aio_context); +            while ((bs = bdrv_next(bs))) { +                if (aio_context == bdrv_get_aio_context(bs)) { +                    bdrv_flush_io_queue(bs); +                    if (bdrv_requests_pending(bs)) { +                        busy = true; +                        aio_poll(aio_context, busy); +                    } +                } +            } +            busy |= aio_poll(aio_context, false); +            aio_context_release(aio_context); +        } +    } + +    bs = NULL; +    while ((bs = bdrv_next(bs))) { +        AioContext *aio_context = bdrv_get_aio_context(bs); + +        aio_context_acquire(aio_context); +        if (bs->job) { +            block_job_resume(bs->job); +        } +        aio_context_release(aio_context); +    } +    g_slist_free(aio_ctxs); +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. + */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ +    if (req->serialising) { +        req->bs->serialising_in_flight--; +    } + +    QLIST_REMOVE(req, list); +    qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, +                                  BlockDriverState *bs, +                                  int64_t offset, +                                  unsigned int bytes, bool is_write) +{ +    *req = (BdrvTrackedRequest){ +        .bs = bs, +        .offset         = offset, +        .bytes          = bytes, +        .is_write       = is_write, +        .co             = qemu_coroutine_self(), +        .serialising    = false, +        .overlap_offset = offset, +        .overlap_bytes  = bytes, +    }; + +    qemu_co_queue_init(&req->wait_queue); + +    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ +    int64_t overlap_offset = req->offset & ~(align - 1); +    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) +                               - overlap_offset; + +    if (!req->serialising) { +        req->bs->serialising_in_flight++; +        req->serialising = true; +    } + +    req->overlap_offset = MIN(req->overlap_offset, overlap_offset); +    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, +                            int64_t sector_num, int nb_sectors, +                            int64_t *cluster_sector_num, +                            int *cluster_nb_sectors) +{ +    BlockDriverInfo bdi; + +    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { +        *cluster_sector_num = sector_num; +        *cluster_nb_sectors = nb_sectors; +    } else { +        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; +        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); +        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + +                                            nb_sectors, c); +    } +} + +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ +    BlockDriverInfo bdi; +    int ret; + +    ret = bdrv_get_info(bs, &bdi); +   
 if (ret < 0 || bdi.cluster_size == 0) { +        return bs->request_alignment; +    } else { +        return bdi.cluster_size; +    } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, +                                     int64_t offset, unsigned int bytes) +{ +    /*        aaaa   bbbb */ +    if (offset >= req->overlap_offset + req->overlap_bytes) { +        return false; +    } +    /* bbbb   aaaa        */ +    if (req->overlap_offset >= offset + bytes) { +        return false; +    } +    return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ +    BlockDriverState *bs = self->bs; +    BdrvTrackedRequest *req; +    bool retry; +    bool waited = false; + +    if (!bs->serialising_in_flight) { +        return false; +    } + +    do { +        retry = false; +        QLIST_FOREACH(req, &bs->tracked_requests, list) { +            if (req == self || (!req->serialising && !self->serialising)) { +                continue; +            } +            if (tracked_request_overlaps(req, self->overlap_offset, +                                         self->overlap_bytes)) +            { +                /* Hitting this means there was a reentrant request, for +                 * example, a block driver issuing nested requests.  This must +                 * never happen since it means deadlock. +                 */ +                assert(qemu_coroutine_self() != req->co); + +                /* If the request is already (indirectly) waiting for us, or +                 * will wait for us as soon as it wakes up, then just go on +                 * (instead of producing a deadlock in the former case). */ +                if (!req->waiting_for) { +                    self->waiting_for = req; +                    qemu_co_queue_wait(&req->wait_queue); +                    self->waiting_for = NULL; +                    retry = true; +                    waited = true; +                    break; +                } +            } +        } +    } while (retry); + +    return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, +                                   size_t size) +{ +    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { +        return -EIO; +    } + +    if (!bdrv_is_inserted(bs)) { +        return -ENOMEDIUM; +    } + +    if (offset < 0) { +        return -EIO; +    } + +    return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, +                              int nb_sectors) +{ +    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { +        return -EIO; +    } + +    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, +                                   nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { +    BlockDriverState *bs; +    int64_t offset; +    QEMUIOVector *qiov; +    bool is_write; +    int ret; +    BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ +    RwCo *rwco = opaque; + +    if (!rwco->is_write) { +        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, +                                      rwco->qiov->size, rwco->qiov, +                                      rwco->flags); +    } else { +        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, +                                       rwco->qiov->size, rwco->qiov, +                                       rwco->flags); +    } +} + +/* + * Process a vectored synchronous request using coroutines 
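+ *
+ * If the caller is already in coroutine context the request is issued
+ * directly; otherwise a coroutine is created and the AioContext is polled
+ * until rwco.ret changes from NOT_DONE.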
+ */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, +                        QEMUIOVector *qiov, bool is_write, +                        BdrvRequestFlags flags) +{ +    Coroutine *co; +    RwCo rwco = { +        .bs = bs, +        .offset = offset, +        .qiov = qiov, +        .is_write = is_write, +        .ret = NOT_DONE, +        .flags = flags, +    }; + +    /** +     * In sync call context, when the vcpu is blocked, this throttling timer +     * will not fire; so the I/O throttling function has to be disabled here +     * if it has been enabled. +     */ +    if (bs->io_limits_enabled) { +        fprintf(stderr, "Disabling I/O throttling on '%s' due " +                        "to synchronous I/O.\n", bdrv_get_device_name(bs)); +        bdrv_io_limits_disable(bs); +    } + +    if (qemu_in_coroutine()) { +        /* Fast-path if already in coroutine context */ +        bdrv_rw_co_entry(&rwco); +    } else { +        AioContext *aio_context = bdrv_get_aio_context(bs); + +        co = qemu_coroutine_create(bdrv_rw_co_entry); +        qemu_coroutine_enter(co, &rwco); +        while (rwco.ret == NOT_DONE) { +            aio_poll(aio_context, true); +        } +    } +    return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, +                      int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ +    QEMUIOVector qiov; +    struct iovec iov = { +        .iov_base = (void *)buf, +        .iov_len = nb_sectors * BDRV_SECTOR_SIZE, +    }; + +    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { +        return -EINVAL; +    } + +    qemu_iovec_init_external(&qiov, &iov, 1); +    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, +                        &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, +              uint8_t *buf, int nb_sectors) +{ +    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, +                          uint8_t *buf, int nb_sectors) +{ +    bool enabled; +    int ret; + +    enabled = bs->io_limits_enabled; +    bs->io_limits_enabled = false; +    ret = bdrv_read(bs, sector_num, buf, nb_sectors); +    bs->io_limits_enabled = enabled; +    return ret; +} + +/* Return < 0 if error. Important errors are: +  -EIO         generic I/O error (may happen for all errors) +  -ENOMEDIUM   No media inserted. +  -EINVAL      Invalid sector number or nb_sectors +  -EACCES      Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, +               const uint8_t *buf, int nb_sectors) +{ +    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, +                      int nb_sectors, BdrvRequestFlags flags) +{ +    return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, +                      BDRV_REQ_ZERO_WRITE | flags); +} + +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). 
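+ * Ranges that bdrv_get_block_status() already reports as BDRV_BLOCK_ZERO are
+ * skipped without issuing any write.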
+ * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). + */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ +    int64_t target_sectors, ret, nb_sectors, sector_num = 0; +    int n; + +    target_sectors = bdrv_nb_sectors(bs); +    if (target_sectors < 0) { +        return target_sectors; +    } + +    for (;;) { +        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); +        if (nb_sectors <= 0) { +            return 0; +        } +        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); +        if (ret < 0) { +            error_report("error getting block status at sector %" PRId64 ": %s", +                         sector_num, strerror(-ret)); +            return ret; +        } +        if (ret & BDRV_BLOCK_ZERO) { +            sector_num += n; +            continue; +        } +        ret = bdrv_write_zeroes(bs, sector_num, n, flags); +        if (ret < 0) { +            error_report("error writing zeroes at sector %" PRId64 ": %s", +                         sector_num, strerror(-ret)); +            return ret; +        } +        sector_num += n; +    } +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ +    QEMUIOVector qiov; +    struct iovec iov = { +        .iov_base = (void *)buf, +        .iov_len = bytes, +    }; +    int ret; + +    if (bytes < 0) { +        return -EINVAL; +    } + +    qemu_iovec_init_external(&qiov, &iov, 1); +    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); +    if (ret < 0) { +        return ret; +    } + +    return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ +    int ret; + +    ret = bdrv_prwv_co(bs, offset, qiov, true, 0); +    if (ret < 0) { +        return ret; +    } + +    return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, +                const void *buf, int bytes) +{ +    QEMUIOVector qiov; +    struct iovec iov = { +        .iov_base   = (void *) buf, +        .iov_len    = bytes, +    }; + +    if (bytes < 0) { +        return -EINVAL; +    } + +    qemu_iovec_init_external(&qiov, &iov, 1); +    return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, +    const void *buf, int count) +{ +    int ret; + +    ret = bdrv_pwrite(bs, offset, buf, count); +    if (ret < 0) { +        return ret; +    } + +    /* No flush needed for cache modes that already do it */ +    if (bs->enable_write_cache) { +        bdrv_flush(bs); +    } + +    return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ +    /* Perform I/O through a temporary buffer so that users who scribble over +     * their read buffer while the operation is in progress do not end up +     * modifying the image file.  This is critical for zero-copy guest I/O +     * where anything might happen inside guest memory. +     */ +    void *bounce_buffer; + +    BlockDriver *drv = bs->drv; +    struct iovec iov; +    QEMUIOVector bounce_qiov; +    int64_t cluster_sector_num; +    int cluster_nb_sectors; +    size_t skip_bytes; +    int ret; + +    /* Cover entire cluster so no additional backing file I/O is required when +     * allocating cluster in the image file. 
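+     * (Hypothetical example: with a 64 KiB cluster size, a 4 KiB copy-on-read
+     * request is widened to the surrounding 64 KiB cluster, which is read into
+     * a bounce buffer and written back to the image in a single pass.)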
+     */ +    bdrv_round_to_clusters(bs, sector_num, nb_sectors, +                           &cluster_sector_num, &cluster_nb_sectors); + +    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, +                                   cluster_sector_num, cluster_nb_sectors); + +    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; +    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); +    if (bounce_buffer == NULL) { +        ret = -ENOMEM; +        goto err; +    } + +    qemu_iovec_init_external(&bounce_qiov, &iov, 1); + +    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, +                             &bounce_qiov); +    if (ret < 0) { +        goto err; +    } + +    if (drv->bdrv_co_write_zeroes && +        buffer_is_zero(bounce_buffer, iov.iov_len)) { +        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, +                                      cluster_nb_sectors, 0); +    } else { +        /* This does not change the data on the disk, it is not necessary +         * to flush even in cache=writethrough mode. +         */ +        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, +                                  &bounce_qiov); +    } + +    if (ret < 0) { +        /* It might be okay to ignore write errors for guest requests.  If this +         * is a deliberate copy-on-read then we don't want to ignore the error. +         * Simply report it in all cases. +         */ +        goto err; +    } + +    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; +    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, +                        nb_sectors * BDRV_SECTOR_SIZE); + +err: +    qemu_vfree(bounce_buffer); +    return ret; +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. + */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, +    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, +    int64_t align, QEMUIOVector *qiov, int flags) +{ +    BlockDriver *drv = bs->drv; +    int ret; + +    int64_t sector_num = offset >> BDRV_SECTOR_BITS; +    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + +    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); +    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); +    assert(!qiov || bytes == qiov->size); + +    /* Handle Copy on Read and associated serialisation */ +    if (flags & BDRV_REQ_COPY_ON_READ) { +        /* If we touch the same cluster it counts as an overlap.  This +         * guarantees that allocating writes will be serialized and not race +         * with each other for the same cluster.  For example, in copy-on-read +         * it ensures that the CoR read and write operations are atomic and +         * guest writes cannot interleave between them. 
*/ +        mark_request_serialising(req, bdrv_get_cluster_size(bs)); +    } + +    wait_serialising_requests(req); + +    if (flags & BDRV_REQ_COPY_ON_READ) { +        int pnum; + +        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); +        if (ret < 0) { +            goto out; +        } + +        if (!ret || pnum != nb_sectors) { +            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); +            goto out; +        } +    } + +    /* Forward the request to the BlockDriver */ +    if (!bs->zero_beyond_eof) { +        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); +    } else { +        /* Read zeros after EOF */ +        int64_t total_sectors, max_nb_sectors; + +        total_sectors = bdrv_nb_sectors(bs); +        if (total_sectors < 0) { +            ret = total_sectors; +            goto out; +        } + +        max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), +                                  align >> BDRV_SECTOR_BITS); +        if (nb_sectors < max_nb_sectors) { +            ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); +        } else if (max_nb_sectors > 0) { +            QEMUIOVector local_qiov; + +            qemu_iovec_init(&local_qiov, qiov->niov); +            qemu_iovec_concat(&local_qiov, qiov, 0, +                              max_nb_sectors * BDRV_SECTOR_SIZE); + +            ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, +                                     &local_qiov); + +            qemu_iovec_destroy(&local_qiov); +        } else { +            ret = 0; +        } + +        /* Reading beyond end of file is supposed to produce zeroes */ +        if (ret == 0 && total_sectors < sector_num + nb_sectors) { +            uint64_t offset = MAX(0, total_sectors - sector_num); +            uint64_t bytes = (sector_num + nb_sectors - offset) * +                              BDRV_SECTOR_SIZE; +            qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); +        } +    } + +out: +    return ret; +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, +    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, +    BdrvRequestFlags flags) +{ +    BlockDriver *drv = bs->drv; +    BdrvTrackedRequest req; + +    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ +    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); +    uint8_t *head_buf = NULL; +    uint8_t *tail_buf = NULL; +    QEMUIOVector local_qiov; +    bool use_local_qiov = false; +    int ret; + +    if (!drv) { +        return -ENOMEDIUM; +    } + +    ret = bdrv_check_byte_request(bs, offset, bytes); +    if (ret < 0) { +        return ret; +    } + +    if (bs->copy_on_read) { +        flags |= BDRV_REQ_COPY_ON_READ; +    } + +    /* throttling disk I/O */ +    if (bs->io_limits_enabled) { +        throttle_group_co_io_limits_intercept(bs, bytes, false); +    } + +    /* Align read if necessary by padding qiov */ +    if (offset & (align - 1)) { +        head_buf = qemu_blockalign(bs, align); +        qemu_iovec_init(&local_qiov, qiov->niov + 2); +        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); +        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); +        use_local_qiov = true; + +        bytes += offset & (align - 1); +        offset = offset & ~(align - 1); +    } + +    if ((offset + bytes) & (align - 1)) { +        if (!use_local_qiov) { +            
qemu_iovec_init(&local_qiov, qiov->niov + 1); +            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); +            use_local_qiov = true; +        } +        tail_buf = qemu_blockalign(bs, align); +        qemu_iovec_add(&local_qiov, tail_buf, +                       align - ((offset + bytes) & (align - 1))); + +        bytes = ROUND_UP(bytes, align); +    } + +    tracked_request_begin(&req, bs, offset, bytes, false); +    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, +                              use_local_qiov ? &local_qiov : qiov, +                              flags); +    tracked_request_end(&req); + +    if (use_local_qiov) { +        qemu_iovec_destroy(&local_qiov); +        qemu_vfree(head_buf); +        qemu_vfree(tail_buf); +    } + +    return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +    BdrvRequestFlags flags) +{ +    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { +        return -EINVAL; +    } + +    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, +                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, +    int nb_sectors, QEMUIOVector *qiov) +{ +    trace_bdrv_co_readv(bs, sector_num, nb_sectors); + +    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ +    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + +    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, +                            BDRV_REQ_COPY_ON_READ); +} + +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ +    BlockDriver *drv = bs->drv; +    QEMUIOVector qiov; +    struct iovec iov = {0}; +    int ret = 0; + +    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, +                                        BDRV_REQUEST_MAX_SECTORS); + +    while (nb_sectors > 0 && !ret) { +        int num = nb_sectors; + +        /* Align request.  Block drivers can expect the "bulk" of the request +         * to be aligned. +         */ +        if (bs->bl.write_zeroes_alignment +            && num > bs->bl.write_zeroes_alignment) { +            if (sector_num % bs->bl.write_zeroes_alignment != 0) { +                /* Make a small request up to the first aligned sector.  */ +                num = bs->bl.write_zeroes_alignment; +                num -= sector_num % bs->bl.write_zeroes_alignment; +            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { +                /* Shorten the request to the last aligned sector.  num cannot +                 * underflow because num > bs->bl.write_zeroes_alignment. 
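+                 * (Hypothetical numbers: with write_zeroes_alignment 8,
+                 * sector_num 0 and num 21, num is trimmed to 16 so that the
+                 * request ends on an aligned sector boundary.)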
+                 */ +                num -= (sector_num + num) % bs->bl.write_zeroes_alignment; +            } +        } + +        /* limit request size */ +        if (num > max_write_zeroes) { +            num = max_write_zeroes; +        } + +        ret = -ENOTSUP; +        /* First try the efficient write zeroes operation */ +        if (drv->bdrv_co_write_zeroes) { +            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); +        } + +        if (ret == -ENOTSUP) { +            /* Fall back to bounce buffer if write zeroes is unsupported */ +            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, +                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER); +            num = MIN(num, max_xfer_len); +            iov.iov_len = num * BDRV_SECTOR_SIZE; +            if (iov.iov_base == NULL) { +                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); +                if (iov.iov_base == NULL) { +                    ret = -ENOMEM; +                    goto fail; +                } +                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); +            } +            qemu_iovec_init_external(&qiov, &iov, 1); + +            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + +            /* Keep bounce buffer around if it is big enough for all +             * all future requests. +             */ +            if (num < max_xfer_len) { +                qemu_vfree(iov.iov_base); +                iov.iov_base = NULL; +            } +        } + +        sector_num += num; +        nb_sectors -= num; +    } + +fail: +    qemu_vfree(iov.iov_base); +    return ret; +} + +/* + * Forwards an already correctly aligned write request to the BlockDriver. + */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, +    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, +    QEMUIOVector *qiov, int flags) +{ +    BlockDriver *drv = bs->drv; +    bool waited; +    int ret; + +    int64_t sector_num = offset >> BDRV_SECTOR_BITS; +    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + +    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); +    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); +    assert(!qiov || bytes == qiov->size); + +    waited = wait_serialising_requests(req); +    assert(!waited || !req->serialising); +    assert(req->overlap_offset <= offset); +    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + +    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + +    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && +        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && +        qemu_iovec_is_zero(qiov)) { +        flags |= BDRV_REQ_ZERO_WRITE; +        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { +            flags |= BDRV_REQ_MAY_UNMAP; +        } +    } + +    if (ret < 0) { +        /* Do nothing, write notifier decided to fail this request */ +    } else if (flags & BDRV_REQ_ZERO_WRITE) { +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); +        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); +    } else { +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV); +        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); +    } +    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + +    if (ret == 0 && !bs->enable_write_cache) { +        ret = bdrv_co_flush(bs); +    } + +    bdrv_set_dirty(bs, sector_num, nb_sectors); + +    block_acct_highest_sector(&bs->stats, sector_num, 
nb_sectors); + +    if (ret >= 0) { +        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); +    } + +    return ret; +} + +static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, +                                                int64_t offset, +                                                unsigned int bytes, +                                                BdrvRequestFlags flags, +                                                BdrvTrackedRequest *req) +{ +    uint8_t *buf = NULL; +    QEMUIOVector local_qiov; +    struct iovec iov; +    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); +    unsigned int head_padding_bytes, tail_padding_bytes; +    int ret = 0; + +    head_padding_bytes = offset & (align - 1); +    tail_padding_bytes = align - ((offset + bytes) & (align - 1)); + + +    assert(flags & BDRV_REQ_ZERO_WRITE); +    if (head_padding_bytes || tail_padding_bytes) { +        buf = qemu_blockalign(bs, align); +        iov = (struct iovec) { +            .iov_base   = buf, +            .iov_len    = align, +        }; +        qemu_iovec_init_external(&local_qiov, &iov, 1); +    } +    if (head_padding_bytes) { +        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); + +        /* RMW the unaligned part before head. */ +        mark_request_serialising(req, align); +        wait_serialising_requests(req); +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); +        ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, +                                  align, &local_qiov, 0); +        if (ret < 0) { +            goto fail; +        } +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + +        memset(buf + head_padding_bytes, 0, zero_bytes); +        ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, +                                   &local_qiov, +                                   flags & ~BDRV_REQ_ZERO_WRITE); +        if (ret < 0) { +            goto fail; +        } +        offset += zero_bytes; +        bytes -= zero_bytes; +    } + +    assert(!bytes || (offset & (align - 1)) == 0); +    if (bytes >= align) { +        /* Write the aligned part in the middle. */ +        uint64_t aligned_bytes = bytes & ~(align - 1); +        ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, +                                   NULL, flags); +        if (ret < 0) { +            goto fail; +        } +        bytes -= aligned_bytes; +        offset += aligned_bytes; +    } + +    assert(!bytes || (offset & (align - 1)) == 0); +    if (bytes) { +        assert(align == tail_padding_bytes + bytes); +        /* RMW the unaligned part after tail. 
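+         * (Hypothetical example: with align 512 and 300 bytes remaining, the
+         * full 512-byte tail block is read back, its first 300 bytes are
+         * zeroed and the whole block is written out again.)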
*/ +        mark_request_serialising(req, align); +        wait_serialising_requests(req); +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); +        ret = bdrv_aligned_preadv(bs, req, offset, align, +                                  align, &local_qiov, 0); +        if (ret < 0) { +            goto fail; +        } +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + +        memset(buf, 0, bytes); +        ret = bdrv_aligned_pwritev(bs, req, offset, align, +                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); +    } +fail: +    qemu_vfree(buf); +    return ret; + +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, +    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, +    BdrvRequestFlags flags) +{ +    BdrvTrackedRequest req; +    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ +    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); +    uint8_t *head_buf = NULL; +    uint8_t *tail_buf = NULL; +    QEMUIOVector local_qiov; +    bool use_local_qiov = false; +    int ret; + +    if (!bs->drv) { +        return -ENOMEDIUM; +    } +    if (bs->read_only) { +        return -EPERM; +    } + +    ret = bdrv_check_byte_request(bs, offset, bytes); +    if (ret < 0) { +        return ret; +    } + +    /* throttling disk I/O */ +    if (bs->io_limits_enabled) { +        throttle_group_co_io_limits_intercept(bs, bytes, true); +    } + +    /* +     * Align write if necessary by performing a read-modify-write cycle. +     * Pad qiov with the read parts and be sure to have a tracked request not +     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. +     */ +    tracked_request_begin(&req, bs, offset, bytes, true); + +    if (!qiov) { +        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); +        goto out; +    } + +    if (offset & (align - 1)) { +        QEMUIOVector head_qiov; +        struct iovec head_iov; + +        mark_request_serialising(&req, align); +        wait_serialising_requests(&req); + +        head_buf = qemu_blockalign(bs, align); +        head_iov = (struct iovec) { +            .iov_base   = head_buf, +            .iov_len    = align, +        }; +        qemu_iovec_init_external(&head_qiov, &head_iov, 1); + +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); +        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, +                                  align, &head_qiov, 0); +        if (ret < 0) { +            goto fail; +        } +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + +        qemu_iovec_init(&local_qiov, qiov->niov + 2); +        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); +        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); +        use_local_qiov = true; + +        bytes += offset & (align - 1); +        offset = offset & ~(align - 1); +    } + +    if ((offset + bytes) & (align - 1)) { +        QEMUIOVector tail_qiov; +        struct iovec tail_iov; +        size_t tail_bytes; +        bool waited; + +        mark_request_serialising(&req, align); +        waited = wait_serialising_requests(&req); +        assert(!waited || !use_local_qiov); + +        tail_buf = qemu_blockalign(bs, align); +        tail_iov = (struct iovec) { +            .iov_base   = tail_buf, +            .iov_len    = align, +        }; +        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + +        BLKDBG_EVENT(bs, 
BLKDBG_PWRITEV_RMW_TAIL); +        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, +                                  align, &tail_qiov, 0); +        if (ret < 0) { +            goto fail; +        } +        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + +        if (!use_local_qiov) { +            qemu_iovec_init(&local_qiov, qiov->niov + 1); +            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); +            use_local_qiov = true; +        } + +        tail_bytes = (offset + bytes) & (align - 1); +        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + +        bytes = ROUND_UP(bytes, align); +    } + +    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, +                               use_local_qiov ? &local_qiov : qiov, +                               flags); + +fail: + +    if (use_local_qiov) { +        qemu_iovec_destroy(&local_qiov); +    } +    qemu_vfree(head_buf); +    qemu_vfree(tail_buf); +out: +    tracked_request_end(&req); +    return ret; +} + +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +    BdrvRequestFlags flags) +{ +    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { +        return -EINVAL; +    } + +    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, +                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, +    int nb_sectors, QEMUIOVector *qiov) +{ +    trace_bdrv_co_writev(bs, sector_num, nb_sectors); + +    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, +                                      int64_t sector_num, int nb_sectors, +                                      BdrvRequestFlags flags) +{ +    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + +    if (!(bs->open_flags & BDRV_O_UNMAP)) { +        flags &= ~BDRV_REQ_MAY_UNMAP; +    } + +    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, +                             BDRV_REQ_ZERO_WRITE | flags); +} + +int bdrv_flush_all(void) +{ +    BlockDriverState *bs = NULL; +    int result = 0; + +    while ((bs = bdrv_next(bs))) { +        AioContext *aio_context = bdrv_get_aio_context(bs); +        int ret; + +        aio_context_acquire(aio_context); +        ret = bdrv_flush(bs); +        if (ret < 0 && !result) { +            result = ret; +        } +        aio_context_release(aio_context); +    } + +    return result; +} + +typedef struct BdrvCoGetBlockStatusData { +    BlockDriverState *bs; +    BlockDriverState *base; +    int64_t sector_num; +    int nb_sectors; +    int *pnum; +    int64_t ret; +    bool done; +} BdrvCoGetBlockStatusData; + +/* + * Returns the allocation status of the specified sectors. + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes + * beyond the end of the disk image it will be clamped. 
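+ *
+ * (The returned value combines BDRV_BLOCK_* flags such as BDRV_BLOCK_DATA,
+ * BDRV_BLOCK_ZERO and BDRV_BLOCK_ALLOCATED; when BDRV_BLOCK_OFFSET_VALID is
+ * set, the upper bits also carry the offset at which the data can be found.)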
+ */ +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, +                                                     int64_t sector_num, +                                                     int nb_sectors, int *pnum) +{ +    int64_t total_sectors; +    int64_t n; +    int64_t ret, ret2; + +    total_sectors = bdrv_nb_sectors(bs); +    if (total_sectors < 0) { +        return total_sectors; +    } + +    if (sector_num >= total_sectors) { +        *pnum = 0; +        return 0; +    } + +    n = total_sectors - sector_num; +    if (n < nb_sectors) { +        nb_sectors = n; +    } + +    if (!bs->drv->bdrv_co_get_block_status) { +        *pnum = nb_sectors; +        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; +        if (bs->drv->protocol_name) { +            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); +        } +        return ret; +    } + +    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); +    if (ret < 0) { +        *pnum = 0; +        return ret; +    } + +    if (ret & BDRV_BLOCK_RAW) { +        assert(ret & BDRV_BLOCK_OFFSET_VALID); +        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, +                                     *pnum, pnum); +    } + +    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { +        ret |= BDRV_BLOCK_ALLOCATED; +    } else { +        if (bdrv_unallocated_blocks_are_zero(bs)) { +            ret |= BDRV_BLOCK_ZERO; +        } else if (bs->backing_hd) { +            BlockDriverState *bs2 = bs->backing_hd; +            int64_t nb_sectors2 = bdrv_nb_sectors(bs2); +            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { +                ret |= BDRV_BLOCK_ZERO; +            } +        } +    } + +    if (bs->file && +        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && +        (ret & BDRV_BLOCK_OFFSET_VALID)) { +        int file_pnum; + +        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, +                                        *pnum, &file_pnum); +        if (ret2 >= 0) { +            /* Ignore errors.  This is just providing extra information, it +             * is useful but not necessary. +             */ +            if (!file_pnum) { +                /* !file_pnum indicates an offset at or beyond the EOF; it is +                 * perfectly valid for the format block driver to point to such +                 * offsets, so catch it and mark everything as zero */ +                ret |= BDRV_BLOCK_ZERO; +            } else { +                /* Limit request to the range reported by the protocol driver */ +                *pnum = file_pnum; +                ret |= (ret2 & BDRV_BLOCK_ZERO); +            } +        } +    } + +    return ret; +} + +static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, +        BlockDriverState *base, +        int64_t sector_num, +        int nb_sectors, +        int *pnum) +{ +    BlockDriverState *p; +    int64_t ret = 0; + +    assert(bs != base); +    for (p = bs; p != base; p = p->backing_hd) { +        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); +        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { +            break; +        } +        /* [sector_num, pnum] unallocated on this layer, which could be only +         * the first part of [sector_num, nb_sectors].  
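+         * (E.g. if this layer reports only the first 10 of 100 requested
+         * sectors as unallocated, the next lower layer is queried for at most
+         * those 10 sectors.)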
*/ +        nb_sectors = MIN(nb_sectors, *pnum); +    } +    return ret; +} + +/* Coroutine wrapper for bdrv_get_block_status_above() */ +static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) +{ +    BdrvCoGetBlockStatusData *data = opaque; + +    data->ret = bdrv_co_get_block_status_above(data->bs, data->base, +                                               data->sector_num, +                                               data->nb_sectors, +                                               data->pnum); +    data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_get_block_status_above(). + * + * See bdrv_co_get_block_status_above() for details. + */ +int64_t bdrv_get_block_status_above(BlockDriverState *bs, +                                    BlockDriverState *base, +                                    int64_t sector_num, +                                    int nb_sectors, int *pnum) +{ +    Coroutine *co; +    BdrvCoGetBlockStatusData data = { +        .bs = bs, +        .base = base, +        .sector_num = sector_num, +        .nb_sectors = nb_sectors, +        .pnum = pnum, +        .done = false, +    }; + +    if (qemu_in_coroutine()) { +        /* Fast-path if already in coroutine context */ +        bdrv_get_block_status_above_co_entry(&data); +    } else { +        AioContext *aio_context = bdrv_get_aio_context(bs); + +        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); +        qemu_coroutine_enter(co, &data); +        while (!data.done) { +            aio_poll(aio_context, true); +        } +    } +    return data.ret; +} + +int64_t bdrv_get_block_status(BlockDriverState *bs, +                              int64_t sector_num, +                              int nb_sectors, int *pnum) +{ +    return bdrv_get_block_status_above(bs, bs->backing_hd, +                                       sector_num, nb_sectors, pnum); +} + +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, +                                   int nb_sectors, int *pnum) +{ +    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); +    if (ret < 0) { +        return ret; +    } +    return !!(ret & BDRV_BLOCK_ALLOCATED); +} + +/* + * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive).  BASE can be NULL to check if the given + * sector is allocated in any image of the chain.  Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + *  the specified sector) that are known to be in the same + *  allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, +                            BlockDriverState *base, +                            int64_t sector_num, +                            int nb_sectors, int *pnum) +{ +    BlockDriverState *intermediate; +    int ret, n = nb_sectors; + +    intermediate = top; +    while (intermediate && intermediate != base) { +        int pnum_inter; +        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, +                                &pnum_inter); +        if (ret < 0) { +            return ret; +        } else if (ret) { +            *pnum = pnum_inter; +            return 1; +        } + +        /* +         * [sector_num, nb_sectors] is unallocated on top but intermediate +         * might have +         * +         * [sector_num+x, nr_sectors] allocated. 
+         */ +        if (n > pnum_inter && +            (intermediate == top || +             sector_num + pnum_inter < intermediate->total_sectors)) { +            n = pnum_inter; +        } + +        intermediate = intermediate->backing_hd; +    } + +    *pnum = n; +    return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, +                          const uint8_t *buf, int nb_sectors) +{ +    BlockDriver *drv = bs->drv; +    int ret; + +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (!drv->bdrv_write_compressed) { +        return -ENOTSUP; +    } +    ret = bdrv_check_request(bs, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } + +    assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + +    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, +                      int64_t pos, int size) +{ +    QEMUIOVector qiov; +    struct iovec iov = { +        .iov_base   = (void *) buf, +        .iov_len    = size, +    }; + +    qemu_iovec_init_external(&qiov, &iov, 1); +    return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ +    BlockDriver *drv = bs->drv; + +    if (!drv) { +        return -ENOMEDIUM; +    } else if (drv->bdrv_save_vmstate) { +        return drv->bdrv_save_vmstate(bs, qiov, pos); +    } else if (bs->file) { +        return bdrv_writev_vmstate(bs->file, qiov, pos); +    } + +    return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, +                      int64_t pos, int size) +{ +    BlockDriver *drv = bs->drv; +    if (!drv) +        return -ENOMEDIUM; +    if (drv->bdrv_load_vmstate) +        return drv->bdrv_load_vmstate(bs, buf, pos, size); +    if (bs->file) +        return bdrv_load_vmstate(bs->file, buf, pos, size); +    return -ENOTSUP; +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, +                           QEMUIOVector *qiov, int nb_sectors, +                           BlockCompletionFunc *cb, void *opaque) +{ +    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + +    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, +                                 cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, +                            QEMUIOVector *qiov, int nb_sectors, +                            BlockCompletionFunc *cb, void *opaque) +{ +    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + +    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, +                                 cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, +        BlockCompletionFunc *cb, void *opaque) +{ +    trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + +    return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, +                                 BDRV_REQ_ZERO_WRITE | flags, +                                 cb, opaque, true); +} + + +typedef struct MultiwriteCB { +    int error; +    int num_requests; +    int num_callbacks; +    struct { +        BlockCompletionFunc *cb; +        void *opaque; +        QEMUIOVector *free_qiov; +    } callbacks[]; +} MultiwriteCB; + +static void 
multiwrite_user_cb(MultiwriteCB *mcb) +{ +    int i; + +    for (i = 0; i < mcb->num_callbacks; i++) { +        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); +        if (mcb->callbacks[i].free_qiov) { +            qemu_iovec_destroy(mcb->callbacks[i].free_qiov); +        } +        g_free(mcb->callbacks[i].free_qiov); +    } +} + +static void multiwrite_cb(void *opaque, int ret) +{ +    MultiwriteCB *mcb = opaque; + +    trace_multiwrite_cb(mcb, ret); + +    if (ret < 0 && !mcb->error) { +        mcb->error = ret; +    } + +    mcb->num_requests--; +    if (mcb->num_requests == 0) { +        multiwrite_user_cb(mcb); +        g_free(mcb); +    } +} + +static int multiwrite_req_compare(const void *a, const void *b) +{ +    const BlockRequest *req1 = a, *req2 = b; + +    /* +     * Note that we can't simply subtract req2->sector from req1->sector +     * here as that could overflow the return value. +     */ +    if (req1->sector > req2->sector) { +        return 1; +    } else if (req1->sector < req2->sector) { +        return -1; +    } else { +        return 0; +    } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, +    int num_reqs, MultiwriteCB *mcb) +{ +    int i, outidx; + +    // Sort requests by start sector +    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + +    // Check if adjacent requests touch the same clusters. If so, combine them, +    // filling up gaps with zero sectors. +    outidx = 0; +    for (i = 1; i < num_reqs; i++) { +        int merge = 0; +        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + +        // Handle exactly sequential writes and overlapping writes. +        if (reqs[i].sector <= oldreq_last) { +            merge = 1; +        } + +        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { +            merge = 0; +        } + +        if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + +            reqs[i].nb_sectors > bs->bl.max_transfer_length) { +            merge = 0; +        } + +        if (merge) { +            size_t size; +            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); +            qemu_iovec_init(qiov, +                reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + +            // Add the first request to the merged one. If the requests are +            // overlapping, drop the last sectors of the first request. 
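+            // (Hypothetical example: a request for sectors 0-7 followed by one
+            // for sectors 4-11 merges into a single request covering sectors
+            // 0-11, with the overlapping sectors 4-7 taken from the second
+            // request.)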
+            size = (reqs[i].sector - reqs[outidx].sector) << 9; +            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + +            // We should need to add any zeros between the two requests +            assert (reqs[i].sector <= oldreq_last); + +            // Add the second request +            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + +            // Add tail of first request, if necessary +            if (qiov->size < reqs[outidx].qiov->size) { +                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, +                                  reqs[outidx].qiov->size - qiov->size); +            } + +            reqs[outidx].nb_sectors = qiov->size >> 9; +            reqs[outidx].qiov = qiov; + +            mcb->callbacks[i].free_qiov = reqs[outidx].qiov; +        } else { +            outidx++; +            reqs[outidx].sector     = reqs[i].sector; +            reqs[outidx].nb_sectors = reqs[i].nb_sectors; +            reqs[outidx].qiov       = reqs[i].qiov; +        } +    } + +    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + +    return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. + */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ +    MultiwriteCB *mcb; +    int i; + +    /* don't submit writes if we don't have a medium */ +    if (bs->drv == NULL) { +        for (i = 0; i < num_reqs; i++) { +            reqs[i].error = -ENOMEDIUM; +        } +        return -1; +    } + +    if (num_reqs == 0) { +        return 0; +    } + +    // Create MultiwriteCB structure +    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); +    mcb->num_requests = 0; +    mcb->num_callbacks = num_reqs; + +    for (i = 0; i < num_reqs; i++) { +        mcb->callbacks[i].cb = reqs[i].cb; +        mcb->callbacks[i].opaque = reqs[i].opaque; +    } + +    // Check for mergable requests +    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + +    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + +    /* Run the aio requests. 
*/ +    mcb->num_requests = num_reqs; +    for (i = 0; i < num_reqs; i++) { +        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, +                              reqs[i].nb_sectors, reqs[i].flags, +                              multiwrite_cb, mcb, +                              true); +    } + +    return 0; +} + +void bdrv_aio_cancel(BlockAIOCB *acb) +{ +    qemu_aio_ref(acb); +    bdrv_aio_cancel_async(acb); +    while (acb->refcnt > 1) { +        if (acb->aiocb_info->get_aio_context) { +            aio_poll(acb->aiocb_info->get_aio_context(acb), true); +        } else if (acb->bs) { +            aio_poll(bdrv_get_aio_context(acb->bs), true); +        } else { +            abort(); +        } +    } +    qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request normally complete. + * In either case the completion callback must be called. */ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ +    if (acb->aiocb_info->cancel_async) { +        acb->aiocb_info->cancel_async(acb); +    } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { +    BlockAIOCB common; +    QEMUBH *bh; +    int ret; +    /* vector translation state */ +    QEMUIOVector *qiov; +    uint8_t *bounce; +    int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { +    .aiocb_size         = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ +    BlockAIOCBSync *acb = opaque; + +    if (!acb->is_write && acb->ret >= 0) { +        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); +    } +    qemu_vfree(acb->bounce); +    acb->common.cb(acb->common.opaque, acb->ret); +    qemu_bh_delete(acb->bh); +    acb->bh = NULL; +    qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, +                                      int64_t sector_num, +                                      QEMUIOVector *qiov, +                                      int nb_sectors, +                                      BlockCompletionFunc *cb, +                                      void *opaque, +                                      int is_write) + +{ +    BlockAIOCBSync *acb; + +    acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); +    acb->is_write = is_write; +    acb->qiov = qiov; +    acb->bounce = qemu_try_blockalign(bs, qiov->size); +    acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + +    if (acb->bounce == NULL) { +        acb->ret = -ENOMEM; +    } else if (is_write) { +        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); +        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); +    } else { +        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); +    } + +    qemu_bh_schedule(acb->bh); + +    return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct 
BlockAIOCBCoroutine { +    BlockAIOCB common; +    BlockRequest req; +    bool is_write; +    bool need_bh; +    bool *done; +    QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { +    .aiocb_size         = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ +    if (!acb->need_bh) { +        acb->common.cb(acb->common.opaque, acb->req.error); +        qemu_aio_unref(acb); +    } +} + +static void bdrv_co_em_bh(void *opaque) +{ +    BlockAIOCBCoroutine *acb = opaque; + +    assert(!acb->need_bh); +    qemu_bh_delete(acb->bh); +    bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ +    acb->need_bh = false; +    if (acb->req.error != -EINPROGRESS) { +        BlockDriverState *bs = acb->common.bs; + +        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); +        qemu_bh_schedule(acb->bh); +    } +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ +    BlockAIOCBCoroutine *acb = opaque; +    BlockDriverState *bs = acb->common.bs; + +    if (!acb->is_write) { +        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, +            acb->req.nb_sectors, acb->req.qiov, acb->req.flags); +    } else { +        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, +            acb->req.nb_sectors, acb->req.qiov, acb->req.flags); +    } + +    bdrv_co_complete(acb); +} + +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, +                                         int64_t sector_num, +                                         QEMUIOVector *qiov, +                                         int nb_sectors, +                                         BdrvRequestFlags flags, +                                         BlockCompletionFunc *cb, +                                         void *opaque, +                                         bool is_write) +{ +    Coroutine *co; +    BlockAIOCBCoroutine *acb; + +    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); +    acb->need_bh = true; +    acb->req.error = -EINPROGRESS; +    acb->req.sector = sector_num; +    acb->req.nb_sectors = nb_sectors; +    acb->req.qiov = qiov; +    acb->req.flags = flags; +    acb->is_write = is_write; + +    co = qemu_coroutine_create(bdrv_co_do_rw); +    qemu_coroutine_enter(co, acb); + +    bdrv_co_maybe_schedule_bh(acb); +    return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ +    BlockAIOCBCoroutine *acb = opaque; +    BlockDriverState *bs = acb->common.bs; + +    acb->req.error = bdrv_co_flush(bs); +    bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, +        BlockCompletionFunc *cb, void *opaque) +{ +    trace_bdrv_aio_flush(bs, opaque); + +    Coroutine *co; +    BlockAIOCBCoroutine *acb; + +    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); +    acb->need_bh = true; +    acb->req.error = -EINPROGRESS; + +    co = qemu_coroutine_create(bdrv_aio_flush_co_entry); +    qemu_coroutine_enter(co, acb); + +    bdrv_co_maybe_schedule_bh(acb); +    return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ +    BlockAIOCBCoroutine *acb = opaque; +    BlockDriverState *bs = acb->common.bs; + +    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); +    bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, +        int64_t sector_num, int 
nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    Coroutine *co; +    BlockAIOCBCoroutine *acb; + +    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + +    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); +    acb->need_bh = true; +    acb->req.error = -EINPROGRESS; +    acb->req.sector = sector_num; +    acb->req.nb_sectors = nb_sectors; +    co = qemu_coroutine_create(bdrv_aio_discard_co_entry); +    qemu_coroutine_enter(co, acb); + +    bdrv_co_maybe_schedule_bh(acb); +    return &acb->common; +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, +                   BlockCompletionFunc *cb, void *opaque) +{ +    BlockAIOCB *acb; + +    acb = g_slice_alloc(aiocb_info->aiocb_size); +    acb->aiocb_info = aiocb_info; +    acb->bs = bs; +    acb->cb = cb; +    acb->opaque = opaque; +    acb->refcnt = 1; +    return acb; +} + +void qemu_aio_ref(void *p) +{ +    BlockAIOCB *acb = p; +    acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ +    BlockAIOCB *acb = p; +    assert(acb->refcnt > 0); +    if (--acb->refcnt == 0) { +        g_slice_free1(acb->aiocb_info->aiocb_size, acb); +    } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { +    Coroutine *coroutine; +    int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ +    CoroutineIOCompletion *co = opaque; + +    co->ret = ret; +    qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, +                                      int nb_sectors, QEMUIOVector *iov, +                                      bool is_write) +{ +    CoroutineIOCompletion co = { +        .coroutine = qemu_coroutine_self(), +    }; +    BlockAIOCB *acb; + +    if (is_write) { +        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, +                                       bdrv_co_io_em_complete, &co); +    } else { +        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, +                                      bdrv_co_io_em_complete, &co); +    } + +    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); +    if (!acb) { +        return -EIO; +    } +    qemu_coroutine_yield(); + +    return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, +                                         int64_t sector_num, int nb_sectors, +                                         QEMUIOVector *iov) +{ +    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, +                                         int64_t sector_num, int nb_sectors, +                                         QEMUIOVector *iov) +{ +    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ +    RwCo *rwco = opaque; + +    rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ +    int ret; + +    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || +        bdrv_is_sg(bs)) { +        return 0; +    } + +    /* Write back cached data to the OS even with cache=unsafe */ +    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); +    if (bs->drv->bdrv_co_flush_to_os) { +        ret = bs->drv->bdrv_co_flush_to_os(bs); +        if (ret < 0) { +            return ret; +        } +    } + + 
   /* But don't actually force it to the disk with cache=unsafe */ +    if (bs->open_flags & BDRV_O_NO_FLUSH) { +        goto flush_parent; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); +    if (bs->drv->bdrv_co_flush_to_disk) { +        ret = bs->drv->bdrv_co_flush_to_disk(bs); +    } else if (bs->drv->bdrv_aio_flush) { +        BlockAIOCB *acb; +        CoroutineIOCompletion co = { +            .coroutine = qemu_coroutine_self(), +        }; + +        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); +        if (acb == NULL) { +            ret = -EIO; +        } else { +            qemu_coroutine_yield(); +            ret = co.ret; +        } +    } else { +        /* +         * Some block drivers always operate in either writethrough or unsafe +         * mode and don't support bdrv_flush therefore. Usually qemu doesn't +         * know how the server works (because the behaviour is hardcoded or +         * depends on server-side configuration), so we can't ensure that +         * everything is safe on disk. Returning an error doesn't work because +         * that would break guests even if the server operates in writethrough +         * mode. +         * +         * Let's hope the user knows what he's doing. +         */ +        ret = 0; +    } +    if (ret < 0) { +        return ret; +    } + +    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH +     * in the case of cache=unsafe, so there are no useless flushes. +     */ +flush_parent: +    return bdrv_co_flush(bs->file); +} + +int bdrv_flush(BlockDriverState *bs) +{ +    Coroutine *co; +    RwCo rwco = { +        .bs = bs, +        .ret = NOT_DONE, +    }; + +    if (qemu_in_coroutine()) { +        /* Fast-path if already in coroutine context */ +        bdrv_flush_co_entry(&rwco); +    } else { +        AioContext *aio_context = bdrv_get_aio_context(bs); + +        co = qemu_coroutine_create(bdrv_flush_co_entry); +        qemu_coroutine_enter(co, &rwco); +        while (rwco.ret == NOT_DONE) { +            aio_poll(aio_context, true); +        } +    } + +    return rwco.ret; +} + +typedef struct DiscardCo { +    BlockDriverState *bs; +    int64_t sector_num; +    int nb_sectors; +    int ret; +} DiscardCo; +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ +    DiscardCo *rwco = opaque; + +    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, +                                 int nb_sectors) +{ +    int max_discard, ret; + +    if (!bs->drv) { +        return -ENOMEDIUM; +    } + +    ret = bdrv_check_request(bs, sector_num, nb_sectors); +    if (ret < 0) { +        return ret; +    } else if (bs->read_only) { +        return -EPERM; +    } + +    /* Do nothing if disabled.  
*/ +    if (!(bs->open_flags & BDRV_O_UNMAP)) { +        return 0; +    } + +    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { +        return 0; +    } + +    bdrv_set_dirty(bs, sector_num, nb_sectors); + +    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); +    while (nb_sectors > 0) { +        int ret; +        int num = nb_sectors; + +        /* align request */ +        if (bs->bl.discard_alignment && +            num >= bs->bl.discard_alignment && +            sector_num % bs->bl.discard_alignment) { +            if (num > bs->bl.discard_alignment) { +                num = bs->bl.discard_alignment; +            } +            num -= sector_num % bs->bl.discard_alignment; +        } + +        /* limit request size */ +        if (num > max_discard) { +            num = max_discard; +        } + +        if (bs->drv->bdrv_co_discard) { +            ret = bs->drv->bdrv_co_discard(bs, sector_num, num); +        } else { +            BlockAIOCB *acb; +            CoroutineIOCompletion co = { +                .coroutine = qemu_coroutine_self(), +            }; + +            acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, +                                            bdrv_co_io_em_complete, &co); +            if (acb == NULL) { +                return -EIO; +            } else { +                qemu_coroutine_yield(); +                ret = co.ret; +            } +        } +        if (ret && ret != -ENOTSUP) { +            return ret; +        } + +        sector_num += num; +        nb_sectors -= num; +    } +    return 0; +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ +    Coroutine *co; +    DiscardCo rwco = { +        .bs = bs, +        .sector_num = sector_num, +        .nb_sectors = nb_sectors, +        .ret = NOT_DONE, +    }; + +    if (qemu_in_coroutine()) { +        /* Fast-path if already in coroutine context */ +        bdrv_discard_co_entry(&rwco); +    } else { +        AioContext *aio_context = bdrv_get_aio_context(bs); + +        co = qemu_coroutine_create(bdrv_discard_co_entry); +        qemu_coroutine_enter(co, &rwco); +        while (rwco.ret == NOT_DONE) { +            aio_poll(aio_context, true); +        } +    } + +    return rwco.ret; +} + +/* needed for generic scsi interface */ + +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ +    BlockDriver *drv = bs->drv; + +    if (drv && drv->bdrv_ioctl) +        return drv->bdrv_ioctl(bs, req, buf); +    return -ENOTSUP; +} + +BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, +        unsigned long int req, void *buf, +        BlockCompletionFunc *cb, void *opaque) +{ +    BlockDriver *drv = bs->drv; + +    if (drv && drv->bdrv_aio_ioctl) +        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); +    return NULL; +} + +void *qemu_blockalign(BlockDriverState *bs, size_t size) +{ +    return qemu_memalign(bdrv_opt_mem_align(bs), size); +} + +void *qemu_blockalign0(BlockDriverState *bs, size_t size) +{ +    return memset(qemu_blockalign(bs, size), 0, size); +} + +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ +    size_t align = bdrv_opt_mem_align(bs); + +    /* Ensure that NULL is never returned on success */ +    assert(align > 0); +    if (size == 0) { +        size = align; +    } + +    return qemu_try_memalign(align, size); +} + +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +{ +    void *mem = qemu_try_blockalign(bs, size); + +    if (mem) { +        memset(mem, 
0, size); +    } + +    return mem; +} + +/* + * Check if all memory in this vector is sector aligned. + */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ +    int i; +    size_t alignment = bdrv_min_mem_align(bs); + +    for (i = 0; i < qiov->niov; i++) { +        if ((uintptr_t) qiov->iov[i].iov_base % alignment) { +            return false; +        } +        if (qiov->iov[i].iov_len % alignment) { +            return false; +        } +    } + +    return true; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, +                                    NotifierWithReturn *notifier) +{ +    notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ +    BlockDriver *drv = bs->drv; +    if (drv && drv->bdrv_io_plug) { +        drv->bdrv_io_plug(bs); +    } else if (bs->file) { +        bdrv_io_plug(bs->file); +    } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ +    BlockDriver *drv = bs->drv; +    if (drv && drv->bdrv_io_unplug) { +        drv->bdrv_io_unplug(bs); +    } else if (bs->file) { +        bdrv_io_unplug(bs->file); +    } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ +    BlockDriver *drv = bs->drv; +    if (drv && drv->bdrv_flush_io_queue) { +        drv->bdrv_flush_io_queue(bs); +    } else if (bs->file) { +        bdrv_flush_io_queue(bs->file); +    } +    bdrv_start_throttled_reqs(bs); +} diff --git a/block/iscsi.c b/block/iscsi.c new file mode 100644 index 00000000..93f1ee4c --- /dev/null +++ b/block/iscsi.c @@ -0,0 +1,1819 @@ +/* + * QEMU Block driver for iSCSI images + * + * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> + * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "config-host.h" + +#include <poll.h> +#include <math.h> +#include <arpa/inet.h> +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "qemu/bitops.h" +#include "qemu/bitmap.h" +#include "block/block_int.h" +#include "block/scsi.h" +#include "qemu/iov.h" +#include "sysemu/sysemu.h" +#include "qmp-commands.h" +#include "qapi/qmp/qstring.h" + +#include <iscsi/iscsi.h> +#include <iscsi/scsi-lowlevel.h> + +#ifdef __linux__ +#include <scsi/sg.h> +#include <block/scsi.h> +#endif + +typedef struct IscsiLun { +    struct iscsi_context *iscsi; +    AioContext *aio_context; +    int lun; +    enum scsi_inquiry_peripheral_device_type type; +    int block_size; +    uint64_t num_blocks; +    int events; +    QEMUTimer *nop_timer; +    QEMUTimer *event_timer; +    struct scsi_inquiry_logical_block_provisioning lbp; +    struct scsi_inquiry_block_limits bl; +    unsigned char *zeroblock; +    unsigned long *allocationmap; +    int cluster_sectors; +    bool use_16_for_rw; +    bool write_protected; +    bool lbpme; +    bool lbprz; +    bool dpofua; +    bool has_write_same; +    bool force_next_flush; +    bool request_timed_out; +} IscsiLun; + +typedef struct IscsiTask { +    int status; +    int complete; +    int retries; +    int do_retry; +    struct scsi_task *task; +    Coroutine *co; +    QEMUBH *bh; +    IscsiLun *iscsilun; +    QEMUTimer retry_timer; +    bool force_next_flush; +} IscsiTask; + +typedef struct IscsiAIOCB { +    BlockAIOCB common; +    QEMUIOVector *qiov; +    QEMUBH *bh; +    IscsiLun *iscsilun; +    struct scsi_task *task; +    uint8_t *buf; +    int status; +    int64_t sector_num; +    int nb_sectors; +#ifdef __linux__ +    sg_io_hdr_t *ioh; +#endif +} IscsiAIOCB; + +/* libiscsi uses time_t so its enough to process events every second */ +#define EVENT_INTERVAL 1000 +#define NOP_INTERVAL 5000 +#define MAX_NOP_FAILURES 3 +#define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) +static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768}; + +/* this threshold is a trade-off knob to choose between + * the potential additional overhead of an extra GET_LBA_STATUS request + * vs. unnecessarily reading a lot of zero sectors over the wire. + * If a read request is greater or equal than ISCSI_CHECKALLOC_THRES + * sectors we check the allocation status of the area covered by the + * request first if the allocationmap indicates that the area might be + * unallocated. 
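+ * With the value of 64 defined below this means requests of 32 KiB or more
+ * (64 QEMU sectors of 512 bytes each).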
*/ +#define ISCSI_CHECKALLOC_THRES 64 + +static void +iscsi_bh_cb(void *p) +{ +    IscsiAIOCB *acb = p; + +    qemu_bh_delete(acb->bh); + +    g_free(acb->buf); +    acb->buf = NULL; + +    acb->common.cb(acb->common.opaque, acb->status); + +    if (acb->task != NULL) { +        scsi_free_scsi_task(acb->task); +        acb->task = NULL; +    } + +    qemu_aio_unref(acb); +} + +static void +iscsi_schedule_bh(IscsiAIOCB *acb) +{ +    if (acb->bh) { +        return; +    } +    acb->bh = aio_bh_new(acb->iscsilun->aio_context, iscsi_bh_cb, acb); +    qemu_bh_schedule(acb->bh); +} + +static void iscsi_co_generic_bh_cb(void *opaque) +{ +    struct IscsiTask *iTask = opaque; +    iTask->complete = 1; +    qemu_bh_delete(iTask->bh); +    qemu_coroutine_enter(iTask->co, NULL); +} + +static void iscsi_retry_timer_expired(void *opaque) +{ +    struct IscsiTask *iTask = opaque; +    iTask->complete = 1; +    if (iTask->co) { +        qemu_coroutine_enter(iTask->co, NULL); +    } +} + +static inline unsigned exp_random(double mean) +{ +    return -mean * log((double)rand() / RAND_MAX); +} + +/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced + * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION + * macro was introduced in 1.11.0. So use the API_VERSION macro as + * a hint that the macros are defined and define them ourselves + * otherwise to keep the required libiscsi version at 1.9.0 */ +#if !defined(LIBISCSI_API_VERSION) +#define QEMU_SCSI_STATUS_TASK_SET_FULL  0x28 +#define QEMU_SCSI_STATUS_TIMEOUT        0x0f000002 +#else +#define QEMU_SCSI_STATUS_TASK_SET_FULL  SCSI_STATUS_TASK_SET_FULL +#define QEMU_SCSI_STATUS_TIMEOUT        SCSI_STATUS_TIMEOUT +#endif + +static void +iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, +                        void *command_data, void *opaque) +{ +    struct IscsiTask *iTask = opaque; +    struct scsi_task *task = command_data; + +    iTask->status = status; +    iTask->do_retry = 0; +    iTask->task = task; + +    if (status != SCSI_STATUS_GOOD) { +        if (iTask->retries++ < ISCSI_CMD_RETRIES) { +            if (status == SCSI_STATUS_CHECK_CONDITION +                && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { +                error_report("iSCSI CheckCondition: %s", +                             iscsi_get_error(iscsi)); +                iTask->do_retry = 1; +                goto out; +            } +            if (status == SCSI_STATUS_BUSY || +                status == QEMU_SCSI_STATUS_TIMEOUT || +                status == QEMU_SCSI_STATUS_TASK_SET_FULL) { +                unsigned retry_time = +                    exp_random(iscsi_retry_times[iTask->retries - 1]); +                if (status == QEMU_SCSI_STATUS_TIMEOUT) { +                    /* make sure the request is rescheduled AFTER the +                     * reconnect is initiated */ +                    retry_time = EVENT_INTERVAL * 2; +                    iTask->iscsilun->request_timed_out = true; +                } +                error_report("iSCSI Busy/TaskSetFull/TimeOut" +                             " (retry #%u in %u ms): %s", +                             iTask->retries, retry_time, +                             iscsi_get_error(iscsi)); +                aio_timer_init(iTask->iscsilun->aio_context, +                               &iTask->retry_timer, QEMU_CLOCK_REALTIME, +                               SCALE_MS, iscsi_retry_timer_expired, iTask); +                timer_mod(&iTask->retry_timer, +                          
qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + retry_time); +                iTask->do_retry = 1; +                return; +            } +        } +        error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); +    } else { +        iTask->iscsilun->force_next_flush |= iTask->force_next_flush; +    } + +out: +    if (iTask->co) { +        iTask->bh = aio_bh_new(iTask->iscsilun->aio_context, +                               iscsi_co_generic_bh_cb, iTask); +        qemu_bh_schedule(iTask->bh); +    } else { +        iTask->complete = 1; +    } +} + +static void iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask) +{ +    *iTask = (struct IscsiTask) { +        .co         = qemu_coroutine_self(), +        .iscsilun   = iscsilun, +    }; +} + +static void +iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data, +                    void *private_data) +{ +    IscsiAIOCB *acb = private_data; + +    acb->status = -ECANCELED; +    iscsi_schedule_bh(acb); +} + +static void +iscsi_aio_cancel(BlockAIOCB *blockacb) +{ +    IscsiAIOCB *acb = (IscsiAIOCB *)blockacb; +    IscsiLun *iscsilun = acb->iscsilun; + +    if (acb->status != -EINPROGRESS) { +        return; +    } + +    /* send a task mgmt call to the target to cancel the task on the target */ +    iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task, +                                     iscsi_abort_task_cb, acb); + +} + +static const AIOCBInfo iscsi_aiocb_info = { +    .aiocb_size         = sizeof(IscsiAIOCB), +    .cancel_async       = iscsi_aio_cancel, +}; + + +static void iscsi_process_read(void *arg); +static void iscsi_process_write(void *arg); + +static void +iscsi_set_events(IscsiLun *iscsilun) +{ +    struct iscsi_context *iscsi = iscsilun->iscsi; +    int ev = iscsi_which_events(iscsi); + +    if (ev != iscsilun->events) { +        aio_set_fd_handler(iscsilun->aio_context, +                           iscsi_get_fd(iscsi), +                           (ev & POLLIN) ? iscsi_process_read : NULL, +                           (ev & POLLOUT) ? iscsi_process_write : NULL, +                           iscsilun); +        iscsilun->events = ev; +    } +} + +static void iscsi_timed_check_events(void *opaque) +{ +    IscsiLun *iscsilun = opaque; + +    /* check for timed out requests */ +    iscsi_service(iscsilun->iscsi, 0); + +    if (iscsilun->request_timed_out) { +        iscsilun->request_timed_out = false; +        iscsi_reconnect(iscsilun->iscsi); +    } + +    /* newer versions of libiscsi may return zero events. Ensure we are able +     * to return to service once this situation changes. 
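+     * iscsi_set_events() below re-installs the fd handlers whenever the
+     * event mask changed, and the timer_mod() re-arms this periodic check.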
*/ +    iscsi_set_events(iscsilun); + +    timer_mod(iscsilun->event_timer, +              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); +} + +static void +iscsi_process_read(void *arg) +{ +    IscsiLun *iscsilun = arg; +    struct iscsi_context *iscsi = iscsilun->iscsi; + +    iscsi_service(iscsi, POLLIN); +    iscsi_set_events(iscsilun); +} + +static void +iscsi_process_write(void *arg) +{ +    IscsiLun *iscsilun = arg; +    struct iscsi_context *iscsi = iscsilun->iscsi; + +    iscsi_service(iscsi, POLLOUT); +    iscsi_set_events(iscsilun); +} + +static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun) +{ +    return sector * iscsilun->block_size / BDRV_SECTOR_SIZE; +} + +static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) +{ +    return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; +} + +static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, +                                      IscsiLun *iscsilun) +{ +    if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size || +        (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) { +            error_report("iSCSI misaligned request: " +                         "iscsilun->block_size %u, sector_num %" PRIi64 +                         ", nb_sectors %d", +                         iscsilun->block_size, sector_num, nb_sectors); +            return 0; +    } +    return 1; +} + +static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun) +{ +    return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, +                                                       iscsilun), +                                       iscsilun->cluster_sectors)); +} + +static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, +                                    int nb_sectors) +{ +    if (iscsilun->allocationmap == NULL) { +        return; +    } +    bitmap_set(iscsilun->allocationmap, +               sector_num / iscsilun->cluster_sectors, +               DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); +} + +static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, +                                      int nb_sectors) +{ +    int64_t cluster_num, nb_clusters; +    if (iscsilun->allocationmap == NULL) { +        return; +    } +    cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); +    nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors +                  - cluster_num; +    if (nb_clusters > 0) { +        bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters); +    } +} + +static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, +                                        int64_t sector_num, int nb_sectors, +                                        QEMUIOVector *iov) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct IscsiTask iTask; +    uint64_t lba; +    uint32_t num_sectors; +    int fua; + +    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { +        return -EINVAL; +    } + +    if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { +        error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len " +                     "of %d sectors", nb_sectors, bs->bl.max_transfer_length); +        return -EINVAL; +    } + +    lba = sector_qemu2lun(sector_num, iscsilun); +    num_sectors = sector_qemu2lun(nb_sectors, iscsilun); +    iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: +    fua = iscsilun->dpofua && !bs->enable_write_cache; +    
iTask.force_next_flush = !fua; +    if (iscsilun->use_16_for_rw) { +        iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, +                                        NULL, num_sectors * iscsilun->block_size, +                                        iscsilun->block_size, 0, 0, fua, 0, 0, +                                        iscsi_co_generic_cb, &iTask); +    } else { +        iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, +                                        NULL, num_sectors * iscsilun->block_size, +                                        iscsilun->block_size, 0, 0, fua, 0, 0, +                                        iscsi_co_generic_cb, &iTask); +    } +    if (iTask.task == NULL) { +        return -ENOMEM; +    } +    scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, +                          iov->niov); +    while (!iTask.complete) { +        iscsi_set_events(iscsilun); +        qemu_coroutine_yield(); +    } + +    if (iTask.task != NULL) { +        scsi_free_scsi_task(iTask.task); +        iTask.task = NULL; +    } + +    if (iTask.do_retry) { +        iTask.complete = 0; +        goto retry; +    } + +    if (iTask.status != SCSI_STATUS_GOOD) { +        return -EIO; +    } + +    iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + +    return 0; +} + + +static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, +                                             int64_t sector_num, int nb_sectors) +{ +    unsigned long size; +    if (iscsilun->allocationmap == NULL) { +        return true; +    } +    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); +    return !(find_next_bit(iscsilun->allocationmap, size, +                           sector_num / iscsilun->cluster_sectors) == size); +} + +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, +                                                  int64_t sector_num, +                                                  int nb_sectors, int *pnum) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct scsi_get_lba_status *lbas = NULL; +    struct scsi_lba_status_descriptor *lbasd = NULL; +    struct IscsiTask iTask; +    int64_t ret; + +    iscsi_co_init_iscsitask(iscsilun, &iTask); + +    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { +        ret = -EINVAL; +        goto out; +    } + +    /* default to all sectors allocated */ +    ret = BDRV_BLOCK_DATA; +    ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; +    *pnum = nb_sectors; + +    /* LUN does not support logical block provisioning */ +    if (!iscsilun->lbpme) { +        goto out; +    } + +retry: +    if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, +                                  sector_qemu2lun(sector_num, iscsilun), +                                  8 + 16, iscsi_co_generic_cb, +                                  &iTask) == NULL) { +        ret = -ENOMEM; +        goto out; +    } + +    while (!iTask.complete) { +        iscsi_set_events(iscsilun); +        qemu_coroutine_yield(); +    } + +    if (iTask.do_retry) { +        if (iTask.task != NULL) { +            scsi_free_scsi_task(iTask.task); +            iTask.task = NULL; +        } +        iTask.complete = 0; +        goto retry; +    } + +    if (iTask.status != SCSI_STATUS_GOOD) { +        /* in case the get_lba_status_callout fails (i.e. 
+         * because the device is busy or the cmd is not +         * supported) we pretend all blocks are allocated +         * for backwards compatibility */ +        goto out; +    } + +    lbas = scsi_datain_unmarshall(iTask.task); +    if (lbas == NULL) { +        ret = -EIO; +        goto out; +    } + +    lbasd = &lbas->descriptors[0]; + +    if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { +        ret = -EIO; +        goto out; +    } + +    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); + +    if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || +        lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { +        ret &= ~BDRV_BLOCK_DATA; +        if (iscsilun->lbprz) { +            ret |= BDRV_BLOCK_ZERO; +        } +    } + +    if (ret & BDRV_BLOCK_ZERO) { +        iscsi_allocationmap_clear(iscsilun, sector_num, *pnum); +    } else { +        iscsi_allocationmap_set(iscsilun, sector_num, *pnum); +    } + +    if (*pnum > nb_sectors) { +        *pnum = nb_sectors; +    } +out: +    if (iTask.task != NULL) { +        scsi_free_scsi_task(iTask.task); +    } +    return ret; +} + +static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, +                                       int64_t sector_num, int nb_sectors, +                                       QEMUIOVector *iov) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct IscsiTask iTask; +    uint64_t lba; +    uint32_t num_sectors; + +    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { +        return -EINVAL; +    } + +    if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { +        error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len " +                     "of %d sectors", nb_sectors, bs->bl.max_transfer_length); +        return -EINVAL; +    } + +    if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && +        !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { +        int64_t ret; +        int pnum; +        ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum); +        if (ret < 0) { +            return ret; +        } +        if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) { +            qemu_iovec_memset(iov, 0, 0x00, iov->size); +            return 0; +        } +    } + +    lba = sector_qemu2lun(sector_num, iscsilun); +    num_sectors = sector_qemu2lun(nb_sectors, iscsilun); + +    iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: +    if (iscsilun->use_16_for_rw) { +        iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba, +                                       num_sectors * iscsilun->block_size, +                                       iscsilun->block_size, 0, 0, 0, 0, 0, +                                       iscsi_co_generic_cb, &iTask); +    } else { +        iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba, +                                       num_sectors * iscsilun->block_size, +                                       iscsilun->block_size, +                                       0, 0, 0, 0, 0, +                                       iscsi_co_generic_cb, &iTask); +    } +    if (iTask.task == NULL) { +        return -ENOMEM; +    } +    scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); + +    while (!iTask.complete) { +        iscsi_set_events(iscsilun); +        qemu_coroutine_yield(); +    } + +    if (iTask.task != NULL) { +        scsi_free_scsi_task(iTask.task); +        iTask.task = NULL; +    } + +    if 
(iTask.do_retry) { +        iTask.complete = 0; +        goto retry; +    } + +    if (iTask.status != SCSI_STATUS_GOOD) { +        return -EIO; +    } + +    return 0; +} + +static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct IscsiTask iTask; + +    if (!iscsilun->force_next_flush) { +        return 0; +    } +    iscsilun->force_next_flush = false; + +    iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: +    if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, +                                      0, iscsi_co_generic_cb, &iTask) == NULL) { +        return -ENOMEM; +    } + +    while (!iTask.complete) { +        iscsi_set_events(iscsilun); +        qemu_coroutine_yield(); +    } + +    if (iTask.task != NULL) { +        scsi_free_scsi_task(iTask.task); +        iTask.task = NULL; +    } + +    if (iTask.do_retry) { +        iTask.complete = 0; +        goto retry; +    } + +    if (iTask.status != SCSI_STATUS_GOOD) { +        return -EIO; +    } + +    return 0; +} + +#ifdef __linux__ +static void +iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, +                     void *command_data, void *opaque) +{ +    IscsiAIOCB *acb = opaque; + +    g_free(acb->buf); +    acb->buf = NULL; + +    acb->status = 0; +    if (status < 0) { +        error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s", +                     iscsi_get_error(iscsi)); +        acb->status = -EIO; +    } + +    acb->ioh->driver_status = 0; +    acb->ioh->host_status   = 0; +    acb->ioh->resid         = 0; + +#define SG_ERR_DRIVER_SENSE    0x08 + +    if (status == SCSI_STATUS_CHECK_CONDITION && acb->task->datain.size >= 2) { +        int ss; + +        acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE; + +        acb->ioh->sb_len_wr = acb->task->datain.size - 2; +        ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ? +             acb->ioh->mx_sb_len : acb->ioh->sb_len_wr; +        memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss); +    } + +    iscsi_schedule_bh(acb); +} + +static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, +        unsigned long int req, void *buf, +        BlockCompletionFunc *cb, void *opaque) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct iscsi_context *iscsi = iscsilun->iscsi; +    struct iscsi_data data; +    IscsiAIOCB *acb; + +    assert(req == SG_IO); + +    acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); + +    acb->iscsilun = iscsilun; +    acb->bh          = NULL; +    acb->status      = -EINPROGRESS; +    acb->buf         = NULL; +    acb->ioh         = buf; + +    acb->task = malloc(sizeof(struct scsi_task)); +    if (acb->task == NULL) { +        error_report("iSCSI: Failed to allocate task for scsi command. 
%s", +                     iscsi_get_error(iscsi)); +        qemu_aio_unref(acb); +        return NULL; +    } +    memset(acb->task, 0, sizeof(struct scsi_task)); + +    switch (acb->ioh->dxfer_direction) { +    case SG_DXFER_TO_DEV: +        acb->task->xfer_dir = SCSI_XFER_WRITE; +        break; +    case SG_DXFER_FROM_DEV: +        acb->task->xfer_dir = SCSI_XFER_READ; +        break; +    default: +        acb->task->xfer_dir = SCSI_XFER_NONE; +        break; +    } + +    acb->task->cdb_size = acb->ioh->cmd_len; +    memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len); +    acb->task->expxferlen = acb->ioh->dxfer_len; + +    data.size = 0; +    if (acb->task->xfer_dir == SCSI_XFER_WRITE) { +        if (acb->ioh->iovec_count == 0) { +            data.data = acb->ioh->dxferp; +            data.size = acb->ioh->dxfer_len; +        } else { +            scsi_task_set_iov_out(acb->task, +                                 (struct scsi_iovec *) acb->ioh->dxferp, +                                 acb->ioh->iovec_count); +        } +    } + +    if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, +                                 iscsi_aio_ioctl_cb, +                                 (data.size > 0) ? &data : NULL, +                                 acb) != 0) { +        scsi_free_scsi_task(acb->task); +        qemu_aio_unref(acb); +        return NULL; +    } + +    /* tell libiscsi to read straight into the buffer we got from ioctl */ +    if (acb->task->xfer_dir == SCSI_XFER_READ) { +        if (acb->ioh->iovec_count == 0) { +            scsi_task_add_data_in_buffer(acb->task, +                                         acb->ioh->dxfer_len, +                                         acb->ioh->dxferp); +        } else { +            scsi_task_set_iov_in(acb->task, +                                 (struct scsi_iovec *) acb->ioh->dxferp, +                                 acb->ioh->iovec_count); +        } +    } + +    iscsi_set_events(iscsilun); + +    return &acb->common; +} + +static void ioctl_cb(void *opaque, int status) +{ +    int *p_status = opaque; +    *p_status = status; +} + +static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ +    IscsiLun *iscsilun = bs->opaque; +    int status; + +    switch (req) { +    case SG_GET_VERSION_NUM: +        *(int *)buf = 30000; +        break; +    case SG_GET_SCSI_ID: +        ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; +        break; +    case SG_IO: +        status = -EINPROGRESS; +        iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status); + +        while (status == -EINPROGRESS) { +            aio_poll(iscsilun->aio_context, true); +        } + +        return 0; +    default: +        return -1; +    } +    return 0; +} +#endif + +static int64_t +iscsi_getlength(BlockDriverState *bs) +{ +    IscsiLun *iscsilun = bs->opaque; +    int64_t len; + +    len  = iscsilun->num_blocks; +    len *= iscsilun->block_size; + +    return len; +} + +static int +coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, +                                   int nb_sectors) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct IscsiTask iTask; +    struct unmap_list list; + +    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { +        return -EINVAL; +    } + +    if (!iscsilun->lbp.lbpu) { +        /* UNMAP is not supported by the target */ +        return 0; +    } + +    list.lba = sector_qemu2lun(sector_num, iscsilun); +    list.num = sector_qemu2lun(nb_sectors, iscsilun); 
+ +    iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: +    if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, +                     iscsi_co_generic_cb, &iTask) == NULL) { +        return -ENOMEM; +    } + +    while (!iTask.complete) { +        iscsi_set_events(iscsilun); +        qemu_coroutine_yield(); +    } + +    if (iTask.task != NULL) { +        scsi_free_scsi_task(iTask.task); +        iTask.task = NULL; +    } + +    if (iTask.do_retry) { +        iTask.complete = 0; +        goto retry; +    } + +    if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { +        /* the target might fail with a check condition if it +           is not happy with the alignment of the UNMAP request +           we silently fail in this case */ +        return 0; +    } + +    if (iTask.status != SCSI_STATUS_GOOD) { +        return -EIO; +    } + +    iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + +    return 0; +} + +static int +coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, +                                   int nb_sectors, BdrvRequestFlags flags) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct IscsiTask iTask; +    uint64_t lba; +    uint32_t nb_blocks; +    bool use_16_for_ws = iscsilun->use_16_for_rw; + +    if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { +        return -EINVAL; +    } + +    if (flags & BDRV_REQ_MAY_UNMAP) { +        if (!use_16_for_ws && !iscsilun->lbp.lbpws10) { +            /* WRITESAME10 with UNMAP is unsupported try WRITESAME16 */ +            use_16_for_ws = true; +        } +        if (use_16_for_ws && !iscsilun->lbp.lbpws) { +            /* WRITESAME16 with UNMAP is not supported by the target, +             * fall back and try WRITESAME10/16 without UNMAP */ +            flags &= ~BDRV_REQ_MAY_UNMAP; +            use_16_for_ws = iscsilun->use_16_for_rw; +        } +    } + +    if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { +        /* WRITESAME without UNMAP is not supported by the target */ +        return -ENOTSUP; +    } + +    lba = sector_qemu2lun(sector_num, iscsilun); +    nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + +    if (iscsilun->zeroblock == NULL) { +        iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); +        if (iscsilun->zeroblock == NULL) { +            return -ENOMEM; +        } +    } + +    iscsi_co_init_iscsitask(iscsilun, &iTask); +    iTask.force_next_flush = true; +retry: +    if (use_16_for_ws) { +        iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, +                                            iscsilun->zeroblock, iscsilun->block_size, +                                            nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), +                                            0, 0, iscsi_co_generic_cb, &iTask); +    } else { +        iTask.task = iscsi_writesame10_task(iscsilun->iscsi, iscsilun->lun, lba, +                                            iscsilun->zeroblock, iscsilun->block_size, +                                            nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), +                                            0, 0, iscsi_co_generic_cb, &iTask); +    } +    if (iTask.task == NULL) { +        return -ENOMEM; +    } + +    while (!iTask.complete) { +        iscsi_set_events(iscsilun); +        qemu_coroutine_yield(); +    } + +    if (iTask.status == SCSI_STATUS_CHECK_CONDITION && +        iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST && +        (iTask.task->sense.ascq == 
SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE || +         iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB)) { +        /* WRITE SAME is not supported by the target */ +        iscsilun->has_write_same = false; +        scsi_free_scsi_task(iTask.task); +        return -ENOTSUP; +    } + +    if (iTask.task != NULL) { +        scsi_free_scsi_task(iTask.task); +        iTask.task = NULL; +    } + +    if (iTask.do_retry) { +        iTask.complete = 0; +        goto retry; +    } + +    if (iTask.status != SCSI_STATUS_GOOD) { +        return -EIO; +    } + +    if (flags & BDRV_REQ_MAY_UNMAP) { +        iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); +    } else { +        iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); +    } + +    return 0; +} + +static void parse_chap(struct iscsi_context *iscsi, const char *target, +                       Error **errp) +{ +    QemuOptsList *list; +    QemuOpts *opts; +    const char *user = NULL; +    const char *password = NULL; + +    list = qemu_find_opts("iscsi"); +    if (!list) { +        return; +    } + +    opts = qemu_opts_find(list, target); +    if (opts == NULL) { +        opts = QTAILQ_FIRST(&list->head); +        if (!opts) { +            return; +        } +    } + +    user = qemu_opt_get(opts, "user"); +    if (!user) { +        return; +    } + +    password = qemu_opt_get(opts, "password"); +    if (!password) { +        error_setg(errp, "CHAP username specified but no password was given"); +        return; +    } + +    if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { +        error_setg(errp, "Failed to set initiator username and password"); +    } +} + +static void parse_header_digest(struct iscsi_context *iscsi, const char *target, +                                Error **errp) +{ +    QemuOptsList *list; +    QemuOpts *opts; +    const char *digest = NULL; + +    list = qemu_find_opts("iscsi"); +    if (!list) { +        return; +    } + +    opts = qemu_opts_find(list, target); +    if (opts == NULL) { +        opts = QTAILQ_FIRST(&list->head); +        if (!opts) { +            return; +        } +    } + +    digest = qemu_opt_get(opts, "header-digest"); +    if (!digest) { +        return; +    } + +    if (!strcmp(digest, "CRC32C")) { +        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C); +    } else if (!strcmp(digest, "NONE")) { +        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE); +    } else if (!strcmp(digest, "CRC32C-NONE")) { +        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C_NONE); +    } else if (!strcmp(digest, "NONE-CRC32C")) { +        iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); +    } else { +        error_setg(errp, "Invalid header-digest setting : %s", digest); +    } +} + +static char *parse_initiator_name(const char *target) +{ +    QemuOptsList *list; +    QemuOpts *opts; +    const char *name; +    char *iscsi_name; +    UuidInfo *uuid_info; + +    list = qemu_find_opts("iscsi"); +    if (list) { +        opts = qemu_opts_find(list, target); +        if (!opts) { +            opts = QTAILQ_FIRST(&list->head); +        } +        if (opts) { +            name = qemu_opt_get(opts, "initiator-name"); +            if (name) { +                return g_strdup(name); +            } +        } +    } + +    uuid_info = qmp_query_uuid(NULL); +    if (strcmp(uuid_info->UUID, UUID_NONE) == 0) { +        name = qemu_get_vm_name(); +    } else { +        name = uuid_info->UUID; +    } +    iscsi_name = 
g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", +                                 name ? ":" : "", name ? name : ""); +    qapi_free_UuidInfo(uuid_info); +    return iscsi_name; +} + +static int parse_timeout(const char *target) +{ +    QemuOptsList *list; +    QemuOpts *opts; +    const char *timeout; + +    list = qemu_find_opts("iscsi"); +    if (list) { +        opts = qemu_opts_find(list, target); +        if (!opts) { +            opts = QTAILQ_FIRST(&list->head); +        } +        if (opts) { +            timeout = qemu_opt_get(opts, "timeout"); +            if (timeout) { +                return atoi(timeout); +            } +        } +    } + +    return 0; +} + +static void iscsi_nop_timed_event(void *opaque) +{ +    IscsiLun *iscsilun = opaque; + +    if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) { +        error_report("iSCSI: NOP timeout. Reconnecting..."); +        iscsilun->request_timed_out = true; +    } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { +        error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages."); +        return; +    } + +    timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); +    iscsi_set_events(iscsilun); +} + +static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) +{ +    struct scsi_task *task = NULL; +    struct scsi_readcapacity10 *rc10 = NULL; +    struct scsi_readcapacity16 *rc16 = NULL; +    int retries = ISCSI_CMD_RETRIES;  + +    do { +        if (task != NULL) { +            scsi_free_scsi_task(task); +            task = NULL; +        } + +        switch (iscsilun->type) { +        case TYPE_DISK: +            task = iscsi_readcapacity16_sync(iscsilun->iscsi, iscsilun->lun); +            if (task != NULL && task->status == SCSI_STATUS_GOOD) { +                rc16 = scsi_datain_unmarshall(task); +                if (rc16 == NULL) { +                    error_setg(errp, "iSCSI: Failed to unmarshall readcapacity16 data."); +                } else { +                    iscsilun->block_size = rc16->block_length; +                    iscsilun->num_blocks = rc16->returned_lba + 1; +                    iscsilun->lbpme = !!rc16->lbpme; +                    iscsilun->lbprz = !!rc16->lbprz; +                    iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); +                } +            } +            break; +        case TYPE_ROM: +            task = iscsi_readcapacity10_sync(iscsilun->iscsi, iscsilun->lun, 0, 0); +            if (task != NULL && task->status == SCSI_STATUS_GOOD) { +                rc10 = scsi_datain_unmarshall(task); +                if (rc10 == NULL) { +                    error_setg(errp, "iSCSI: Failed to unmarshall readcapacity10 data."); +                } else { +                    iscsilun->block_size = rc10->block_size; +                    if (rc10->lba == 0) { +                        /* blank disk loaded */ +                        iscsilun->num_blocks = 0; +                    } else { +                        iscsilun->num_blocks = rc10->lba + 1; +                    } +                } +            } +            break; +        default: +            return; +        } +    } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION +             && task->sense.key == SCSI_SENSE_UNIT_ATTENTION +             && retries-- > 0); + +    if (task == NULL || task->status != SCSI_STATUS_GOOD) { +        error_setg(errp, "iSCSI: failed to send readcapacity10 command."); +    } 
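+    /* A block size of zero or one that is not a multiple of BDRV_SECTOR_SIZE
+     * (512 bytes) cannot be used by the QEMU block layer. */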
else if (!iscsilun->block_size || +               iscsilun->block_size % BDRV_SECTOR_SIZE) { +        error_setg(errp, "iSCSI: the target returned an invalid " +                   "block size of %d.", iscsilun->block_size); +    } +    if (task) { +        scsi_free_scsi_task(task); +    } +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { +    .name = "iscsi", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "filename", +            .type = QEMU_OPT_STRING, +            .help = "URL to the iscsi image", +        }, +        { /* end of list */ } +    }, +}; + +static struct scsi_task *iscsi_do_inquiry(struct iscsi_context *iscsi, int lun, +                                          int evpd, int pc, void **inq, Error **errp) +{ +    int full_size; +    struct scsi_task *task = NULL; +    task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, 64); +    if (task == NULL || task->status != SCSI_STATUS_GOOD) { +        goto fail; +    } +    full_size = scsi_datain_getfullsize(task); +    if (full_size > task->datain.size) { +        scsi_free_scsi_task(task); + +        /* we need more data for the full list */ +        task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, full_size); +        if (task == NULL || task->status != SCSI_STATUS_GOOD) { +            goto fail; +        } +    } + +    *inq = scsi_datain_unmarshall(task); +    if (*inq == NULL) { +        error_setg(errp, "iSCSI: failed to unmarshall inquiry datain blob"); +        goto fail_with_err; +    } + +    return task; + +fail: +    error_setg(errp, "iSCSI: Inquiry command failed : %s", +               iscsi_get_error(iscsi)); +fail_with_err: +    if (task != NULL) { +        scsi_free_scsi_task(task); +    } +    return NULL; +} + +static void iscsi_detach_aio_context(BlockDriverState *bs) +{ +    IscsiLun *iscsilun = bs->opaque; + +    aio_set_fd_handler(iscsilun->aio_context, +                       iscsi_get_fd(iscsilun->iscsi), +                       NULL, NULL, NULL); +    iscsilun->events = 0; + +    if (iscsilun->nop_timer) { +        timer_del(iscsilun->nop_timer); +        timer_free(iscsilun->nop_timer); +        iscsilun->nop_timer = NULL; +    } +    if (iscsilun->event_timer) { +        timer_del(iscsilun->event_timer); +        timer_free(iscsilun->event_timer); +        iscsilun->event_timer = NULL; +    } +} + +static void iscsi_attach_aio_context(BlockDriverState *bs, +                                     AioContext *new_context) +{ +    IscsiLun *iscsilun = bs->opaque; + +    iscsilun->aio_context = new_context; +    iscsi_set_events(iscsilun); + +    /* Set up a timer for sending out iSCSI NOPs */ +    iscsilun->nop_timer = aio_timer_new(iscsilun->aio_context, +                                        QEMU_CLOCK_REALTIME, SCALE_MS, +                                        iscsi_nop_timed_event, iscsilun); +    timer_mod(iscsilun->nop_timer, +              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); + +    /* Set up a timer for periodic calls to iscsi_set_events and to +     * scan for command timeout */ +    iscsilun->event_timer = aio_timer_new(iscsilun->aio_context, +                                          QEMU_CLOCK_REALTIME, SCALE_MS, +                                          iscsi_timed_check_events, iscsilun); +    timer_mod(iscsilun->event_timer, +              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); +} + +static void iscsi_modesense_sync(IscsiLun *iscsilun) +{ +    struct scsi_task 
*task; +    struct scsi_mode_sense *ms = NULL; +    iscsilun->write_protected = false; +    iscsilun->dpofua = false; + +    task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, +                                 1, SCSI_MODESENSE_PC_CURRENT, +                                 0x3F, 0, 255); +    if (task == NULL) { +        error_report("iSCSI: Failed to send MODE_SENSE(6) command: %s", +                     iscsi_get_error(iscsilun->iscsi)); +        goto out; +    } + +    if (task->status != SCSI_STATUS_GOOD) { +        error_report("iSCSI: Failed MODE_SENSE(6), LUN assumed writable"); +        goto out; +    } +    ms = scsi_datain_unmarshall(task); +    if (!ms) { +        error_report("iSCSI: Failed to unmarshall MODE_SENSE(6) data: %s", +                     iscsi_get_error(iscsilun->iscsi)); +        goto out; +    } +    iscsilun->write_protected = ms->device_specific_parameter & 0x80; +    iscsilun->dpofua          = ms->device_specific_parameter & 0x10; + +out: +    if (task) { +        scsi_free_scsi_task(task); +    } +} + +/* + * We support iscsi url's on the form + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun> + */ +static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, +                      Error **errp) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct iscsi_context *iscsi = NULL; +    struct iscsi_url *iscsi_url = NULL; +    struct scsi_task *task = NULL; +    struct scsi_inquiry_standard *inq = NULL; +    struct scsi_inquiry_supported_pages *inq_vpd; +    char *initiator_name = NULL; +    QemuOpts *opts; +    Error *local_err = NULL; +    const char *filename; +    int i, ret = 0, timeout = 0; + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto out; +    } + +    filename = qemu_opt_get(opts, "filename"); + +    iscsi_url = iscsi_parse_full_url(iscsi, filename); +    if (iscsi_url == NULL) { +        error_setg(errp, "Failed to parse URL : %s", filename); +        ret = -EINVAL; +        goto out; +    } + +    memset(iscsilun, 0, sizeof(IscsiLun)); + +    initiator_name = parse_initiator_name(iscsi_url->target); + +    iscsi = iscsi_create_context(initiator_name); +    if (iscsi == NULL) { +        error_setg(errp, "iSCSI: Failed to create iSCSI context."); +        ret = -ENOMEM; +        goto out; +    } + +    if (iscsi_set_targetname(iscsi, iscsi_url->target)) { +        error_setg(errp, "iSCSI: Failed to set target name."); +        ret = -EINVAL; +        goto out; +    } + +    if (iscsi_url->user[0] != '\0') { +        ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user, +                                              iscsi_url->passwd); +        if (ret != 0) { +            error_setg(errp, "Failed to set initiator username and password"); +            ret = -EINVAL; +            goto out; +        } +    } + +    /* check if we got CHAP username/password via the options */ +    parse_chap(iscsi, iscsi_url->target, &local_err); +    if (local_err != NULL) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto out; +    } + +    if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) { +        error_setg(errp, "iSCSI: Failed to set session type to normal."); +        ret = -EINVAL; +        goto out; +    } + +    iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); + +    /* check if 
we got HEADER_DIGEST via the options */ +    parse_header_digest(iscsi, iscsi_url->target, &local_err); +    if (local_err != NULL) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto out; +    } + +    /* timeout handling is broken in libiscsi before 1.15.0 */ +    timeout = parse_timeout(iscsi_url->target); +#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621 +    iscsi_set_timeout(iscsi, timeout); +#else +    if (timeout) { +        error_report("iSCSI: ignoring timeout value for libiscsi <1.15.0"); +    } +#endif + +    if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) { +        error_setg(errp, "iSCSI: Failed to connect to LUN : %s", +            iscsi_get_error(iscsi)); +        ret = -EINVAL; +        goto out; +    } + +    iscsilun->iscsi = iscsi; +    iscsilun->aio_context = bdrv_get_aio_context(bs); +    iscsilun->lun   = iscsi_url->lun; +    iscsilun->has_write_same = true; + +    task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 0, 0, +                            (void **) &inq, errp); +    if (task == NULL) { +        ret = -EINVAL; +        goto out; +    } +    iscsilun->type = inq->periperal_device_type; +    scsi_free_scsi_task(task); +    task = NULL; + +    iscsi_modesense_sync(iscsilun); + +    /* Check the write protect flag of the LUN if we want to write */ +    if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && +        iscsilun->write_protected) { +        error_setg(errp, "Cannot open a write protected LUN as read-write"); +        ret = -EACCES; +        goto out; +    } + +    iscsi_readcapacity_sync(iscsilun, &local_err); +    if (local_err != NULL) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto out; +    } +    bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); +    bs->request_alignment = iscsilun->block_size; + +    /* We don't have any emulation for devices other than disks and CD-ROMs, so +     * this must be sg ioctl compatible. We force it to be sg, otherwise qemu +     * will try to read from the device to guess the image format. 
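+     * The generic block layer checks bs->sg and skips image format probing
+     * for such devices, treating them as raw SCSI passthrough.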
+     */ +    if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) { +        bs->sg = 1; +    } + +    task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, +                            SCSI_INQUIRY_PAGECODE_SUPPORTED_VPD_PAGES, +                            (void **) &inq_vpd, errp); +    if (task == NULL) { +        ret = -EINVAL; +        goto out; +    } +    for (i = 0; i < inq_vpd->num_pages; i++) { +        struct scsi_task *inq_task; +        struct scsi_inquiry_logical_block_provisioning *inq_lbp; +        struct scsi_inquiry_block_limits *inq_bl; +        switch (inq_vpd->pages[i]) { +        case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING: +            inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, +                                        SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING, +                                        (void **) &inq_lbp, errp); +            if (inq_task == NULL) { +                ret = -EINVAL; +                goto out; +            } +            memcpy(&iscsilun->lbp, inq_lbp, +                   sizeof(struct scsi_inquiry_logical_block_provisioning)); +            scsi_free_scsi_task(inq_task); +            break; +        case SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS: +            inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, +                                    SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS, +                                    (void **) &inq_bl, errp); +            if (inq_task == NULL) { +                ret = -EINVAL; +                goto out; +            } +            memcpy(&iscsilun->bl, inq_bl, +                   sizeof(struct scsi_inquiry_block_limits)); +            scsi_free_scsi_task(inq_task); +            break; +        default: +            break; +        } +    } +    scsi_free_scsi_task(task); +    task = NULL; + +    iscsi_attach_aio_context(bs, iscsilun->aio_context); + +    /* Guess the internal cluster (page) size of the iscsi target by the means +     * of opt_unmap_gran. 
Transfer the unmap granularity only if it has a +     * reasonable size */ +    if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 4 * 1024 && +        iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { +        iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * +                                     iscsilun->block_size) >> BDRV_SECTOR_BITS; +        if (iscsilun->lbprz) { +            iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); +            if (iscsilun->allocationmap == NULL) { +                ret = -ENOMEM; +            } +        } +    } + +out: +    qemu_opts_del(opts); +    g_free(initiator_name); +    if (iscsi_url != NULL) { +        iscsi_destroy_url(iscsi_url); +    } +    if (task != NULL) { +        scsi_free_scsi_task(task); +    } + +    if (ret) { +        if (iscsi != NULL) { +            if (iscsi_is_logged_in(iscsi)) { +                iscsi_logout_sync(iscsi); +            } +            iscsi_destroy_context(iscsi); +        } +        memset(iscsilun, 0, sizeof(IscsiLun)); +    } +    return ret; +} + +static void iscsi_close(BlockDriverState *bs) +{ +    IscsiLun *iscsilun = bs->opaque; +    struct iscsi_context *iscsi = iscsilun->iscsi; + +    iscsi_detach_aio_context(bs); +    if (iscsi_is_logged_in(iscsi)) { +        iscsi_logout_sync(iscsi); +    } +    iscsi_destroy_context(iscsi); +    g_free(iscsilun->zeroblock); +    g_free(iscsilun->allocationmap); +    memset(iscsilun, 0, sizeof(IscsiLun)); +} + +static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun) +{ +    return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1); +} + +static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) +{ +    /* We don't actually refresh here, but just return data queried in +     * iscsi_open(): iscsi targets don't change their limits. */ + +    IscsiLun *iscsilun = bs->opaque; +    uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff; + +    if (iscsilun->bl.max_xfer_len) { +        max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len); +    } + +    bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun); + +    if (iscsilun->lbp.lbpu) { +        if (iscsilun->bl.max_unmap < 0xffffffff) { +            bs->bl.max_discard = +                sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun); +        } +        bs->bl.discard_alignment = +            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); +    } + +    if (iscsilun->bl.max_ws_len < 0xffffffff) { +        bs->bl.max_write_zeroes = +            sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun); +    } +    if (iscsilun->lbp.lbpws) { +        bs->bl.write_zeroes_alignment = +            sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); +    } +    bs->bl.opt_transfer_length = +        sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); +} + +/* Note that this will not re-establish a connection with an iSCSI target - it + * is effectively a NOP.  
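+ * All it does is verify that a write protected LUN is not reopened for
+ * read-write access.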
*/ +static int iscsi_reopen_prepare(BDRVReopenState *state, +                                BlockReopenQueue *queue, Error **errp) +{ +    IscsiLun *iscsilun = state->bs->opaque; + +    if (state->flags & BDRV_O_RDWR && iscsilun->write_protected) { +        error_setg(errp, "Cannot open a write protected LUN as read-write"); +        return -EACCES; +    } +    return 0; +} + +static int iscsi_truncate(BlockDriverState *bs, int64_t offset) +{ +    IscsiLun *iscsilun = bs->opaque; +    Error *local_err = NULL; + +    if (iscsilun->type != TYPE_DISK) { +        return -ENOTSUP; +    } + +    iscsi_readcapacity_sync(iscsilun, &local_err); +    if (local_err != NULL) { +        error_free(local_err); +        return -EIO; +    } + +    if (offset > iscsi_getlength(bs)) { +        return -EINVAL; +    } + +    if (iscsilun->allocationmap != NULL) { +        g_free(iscsilun->allocationmap); +        iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); +    } + +    return 0; +} + +static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int ret = 0; +    int64_t total_size = 0; +    BlockDriverState *bs; +    IscsiLun *iscsilun = NULL; +    QDict *bs_options; + +    bs = bdrv_new(); + +    /* Read out options */ +    total_size = DIV_ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                              BDRV_SECTOR_SIZE); +    bs->opaque = g_new0(struct IscsiLun, 1); +    iscsilun = bs->opaque; + +    bs_options = qdict_new(); +    qdict_put(bs_options, "filename", qstring_from_str(filename)); +    ret = iscsi_open(bs, bs_options, 0, NULL); +    QDECREF(bs_options); + +    if (ret != 0) { +        goto out; +    } +    iscsi_detach_aio_context(bs); +    if (iscsilun->type != TYPE_DISK) { +        ret = -ENODEV; +        goto out; +    } +    if (bs->total_sectors < total_size) { +        ret = -ENOSPC; +        goto out; +    } + +    ret = 0; +out: +    if (iscsilun->iscsi != NULL) { +        iscsi_destroy_context(iscsilun->iscsi); +    } +    g_free(bs->opaque); +    bs->opaque = NULL; +    bdrv_unref(bs); +    return ret; +} + +static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    IscsiLun *iscsilun = bs->opaque; +    bdi->unallocated_blocks_are_zero = iscsilun->lbprz; +    bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; +    bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; +    return 0; +} + +static QemuOptsList iscsi_create_opts = { +    .name = "iscsi-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_iscsi = { +    .format_name     = "iscsi", +    .protocol_name   = "iscsi", + +    .instance_size   = sizeof(IscsiLun), +    .bdrv_needs_filename = true, +    .bdrv_file_open  = iscsi_open, +    .bdrv_close      = iscsi_close, +    .bdrv_create     = iscsi_create, +    .create_opts     = &iscsi_create_opts, +    .bdrv_reopen_prepare  = iscsi_reopen_prepare, + +    .bdrv_getlength  = iscsi_getlength, +    .bdrv_get_info   = iscsi_get_info, +    .bdrv_truncate   = iscsi_truncate, +    .bdrv_refresh_limits = iscsi_refresh_limits, + +    .bdrv_co_get_block_status = iscsi_co_get_block_status, +    .bdrv_co_discard      = iscsi_co_discard, +    .bdrv_co_write_zeroes = iscsi_co_write_zeroes, +    .bdrv_co_readv         = iscsi_co_readv, 
+    .bdrv_co_writev        = iscsi_co_writev, +    .bdrv_co_flush_to_disk = iscsi_co_flush, + +#ifdef __linux__ +    .bdrv_ioctl       = iscsi_ioctl, +    .bdrv_aio_ioctl   = iscsi_aio_ioctl, +#endif + +    .bdrv_detach_aio_context = iscsi_detach_aio_context, +    .bdrv_attach_aio_context = iscsi_attach_aio_context, +}; + +static QemuOptsList qemu_iscsi_opts = { +    .name = "iscsi", +    .head = QTAILQ_HEAD_INITIALIZER(qemu_iscsi_opts.head), +    .desc = { +        { +            .name = "user", +            .type = QEMU_OPT_STRING, +            .help = "username for CHAP authentication to target", +        },{ +            .name = "password", +            .type = QEMU_OPT_STRING, +            .help = "password for CHAP authentication to target", +        },{ +            .name = "header-digest", +            .type = QEMU_OPT_STRING, +            .help = "HeaderDigest setting. " +                    "{CRC32C|CRC32C-NONE|NONE-CRC32C|NONE}", +        },{ +            .name = "initiator-name", +            .type = QEMU_OPT_STRING, +            .help = "Initiator iqn name to use when connecting", +        },{ +            .name = "timeout", +            .type = QEMU_OPT_NUMBER, +            .help = "Request timeout in seconds (default 0 = no timeout)", +        }, +        { /* end of list */ } +    }, +}; + +static void iscsi_block_init(void) +{ +    bdrv_register(&bdrv_iscsi); +    qemu_add_opts(&qemu_iscsi_opts); +} + +block_init(iscsi_block_init); diff --git a/block/linux-aio.c b/block/linux-aio.c new file mode 100644 index 00000000..c991443c --- /dev/null +++ b/block/linux-aio.c @@ -0,0 +1,337 @@ +/* + * Linux native AIO support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/raw-aio.h" +#include "qemu/event_notifier.h" + +#include <libaio.h> + +/* + * Queue size (per-device). + * + * XXX: eventually we need to communicate this to the guest and/or make it + *      tunable by the guest.  If we get more outstanding requests at a time + *      than this we will get EAGAIN from io_submit which is communicated to + *      the guest as an I/O error. + */ +#define MAX_EVENTS 128 + +#define MAX_QUEUED_IO  128 + +struct qemu_laiocb { +    BlockAIOCB common; +    struct qemu_laio_state *ctx; +    struct iocb iocb; +    ssize_t ret; +    size_t nbytes; +    QEMUIOVector *qiov; +    bool is_read; +    QSIMPLEQ_ENTRY(qemu_laiocb) next; +}; + +typedef struct { +    int plugged; +    unsigned int n; +    bool blocked; +    QSIMPLEQ_HEAD(, qemu_laiocb) pending; +} LaioQueue; + +struct qemu_laio_state { +    io_context_t ctx; +    EventNotifier e; + +    /* io queue for submit at batch */ +    LaioQueue io_q; + +    /* I/O completion processing */ +    QEMUBH *completion_bh; +    struct io_event events[MAX_EVENTS]; +    int event_idx; +    int event_max; +}; + +static void ioq_submit(struct qemu_laio_state *s); + +static inline ssize_t io_event_ret(struct io_event *ev) +{ +    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res); +} + +/* + * Completes an AIO request (calls the callback and frees the ACB). 
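+ * A result equal to the request size is reported as success (0).  A short
+ * result on a read is treated as end-of-file and the rest of the iovec is
+ * zero-padded; a short write has no such meaning and becomes -EINVAL.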
+ */ +static void qemu_laio_process_completion(struct qemu_laio_state *s, +    struct qemu_laiocb *laiocb) +{ +    int ret; + +    ret = laiocb->ret; +    if (ret != -ECANCELED) { +        if (ret == laiocb->nbytes) { +            ret = 0; +        } else if (ret >= 0) { +            /* Short reads mean EOF, pad with zeros. */ +            if (laiocb->is_read) { +                qemu_iovec_memset(laiocb->qiov, ret, 0, +                    laiocb->qiov->size - ret); +            } else { +                ret = -EINVAL; +            } +        } +    } +    laiocb->common.cb(laiocb->common.opaque, ret); + +    qemu_aio_unref(laiocb); +} + +/* The completion BH fetches completed I/O requests and invokes their + * callbacks. + * + * The function is somewhat tricky because it supports nested event loops, for + * example when a request callback invokes aio_poll().  In order to do this, + * the completion events array and index are kept in qemu_laio_state.  The BH + * reschedules itself as long as there are completions pending so it will + * either be called again in a nested event loop or will be called after all + * events have been completed.  When there are no events left to complete, the + * BH returns without rescheduling. + */ +static void qemu_laio_completion_bh(void *opaque) +{ +    struct qemu_laio_state *s = opaque; + +    /* Fetch more completion events when empty */ +    if (s->event_idx == s->event_max) { +        do { +            struct timespec ts = { 0 }; +            s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, +                                        s->events, &ts); +        } while (s->event_max == -EINTR); + +        s->event_idx = 0; +        if (s->event_max <= 0) { +            s->event_max = 0; +            return; /* no more events */ +        } +    } + +    /* Reschedule so nested event loops see currently pending completions */ +    qemu_bh_schedule(s->completion_bh); + +    /* Process completion events */ +    while (s->event_idx < s->event_max) { +        struct iocb *iocb = s->events[s->event_idx].obj; +        struct qemu_laiocb *laiocb = +                container_of(iocb, struct qemu_laiocb, iocb); + +        laiocb->ret = io_event_ret(&s->events[s->event_idx]); +        s->event_idx++; + +        qemu_laio_process_completion(s, laiocb); +    } + +    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { +        ioq_submit(s); +    } +} + +static void qemu_laio_completion_cb(EventNotifier *e) +{ +    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); + +    if (event_notifier_test_and_clear(&s->e)) { +        qemu_bh_schedule(s->completion_bh); +    } +} + +static void laio_cancel(BlockAIOCB *blockacb) +{ +    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb; +    struct io_event event; +    int ret; + +    if (laiocb->ret != -EINPROGRESS) { +        return; +    } +    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event); +    laiocb->ret = -ECANCELED; +    if (ret != 0) { +        /* iocb is not cancelled, cb will be called by the event loop later */ +        return; +    } + +    laiocb->common.cb(laiocb->common.opaque, laiocb->ret); +} + +static const AIOCBInfo laio_aiocb_info = { +    .aiocb_size         = sizeof(struct qemu_laiocb), +    .cancel_async       = laio_cancel, +}; + +static void ioq_init(LaioQueue *io_q) +{ +    QSIMPLEQ_INIT(&io_q->pending); +    io_q->plugged = 0; +    io_q->n = 0; +    io_q->blocked = false; +} + +static void ioq_submit(struct qemu_laio_state *s) +{ +    int ret, 
len; +    struct qemu_laiocb *aiocb; +    struct iocb *iocbs[MAX_QUEUED_IO]; +    QSIMPLEQ_HEAD(, qemu_laiocb) completed; + +    do { +        len = 0; +        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) { +            iocbs[len++] = &aiocb->iocb; +            if (len == MAX_QUEUED_IO) { +                break; +            } +        } + +        ret = io_submit(s->ctx, len, iocbs); +        if (ret == -EAGAIN) { +            break; +        } +        if (ret < 0) { +            abort(); +        } + +        s->io_q.n -= ret; +        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb); +        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed); +    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending)); +    s->io_q.blocked = (s->io_q.n > 0); +} + +void laio_io_plug(BlockDriverState *bs, void *aio_ctx) +{ +    struct qemu_laio_state *s = aio_ctx; + +    s->io_q.plugged++; +} + +void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) +{ +    struct qemu_laio_state *s = aio_ctx; + +    assert(s->io_q.plugged > 0 || !unplug); + +    if (unplug && --s->io_q.plugged > 0) { +        return; +    } + +    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { +        ioq_submit(s); +    } +} + +BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque, int type) +{ +    struct qemu_laio_state *s = aio_ctx; +    struct qemu_laiocb *laiocb; +    struct iocb *iocbs; +    off_t offset = sector_num * 512; + +    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque); +    laiocb->nbytes = nb_sectors * 512; +    laiocb->ctx = s; +    laiocb->ret = -EINPROGRESS; +    laiocb->is_read = (type == QEMU_AIO_READ); +    laiocb->qiov = qiov; + +    iocbs = &laiocb->iocb; + +    switch (type) { +    case QEMU_AIO_WRITE: +        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); +	break; +    case QEMU_AIO_READ: +        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset); +	break; +    /* Currently Linux kernel does not support other operations */ +    default: +        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n", +                        __func__, type); +        goto out_free_aiocb; +    } +    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); + +    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); +    s->io_q.n++; +    if (!s->io_q.blocked && +        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) { +        ioq_submit(s); +    } +    return &laiocb->common; + +out_free_aiocb: +    qemu_aio_unref(laiocb); +    return NULL; +} + +void laio_detach_aio_context(void *s_, AioContext *old_context) +{ +    struct qemu_laio_state *s = s_; + +    aio_set_event_notifier(old_context, &s->e, NULL); +    qemu_bh_delete(s->completion_bh); +} + +void laio_attach_aio_context(void *s_, AioContext *new_context) +{ +    struct qemu_laio_state *s = s_; + +    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); +    aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb); +} + +void *laio_init(void) +{ +    struct qemu_laio_state *s; + +    s = g_malloc0(sizeof(*s)); +    if (event_notifier_init(&s->e, false) < 0) { +        goto out_free_state; +    } + +    if (io_setup(MAX_EVENTS, &s->ctx) != 0) { +        goto out_close_efd; +    } + +    ioq_init(&s->io_q); + +    return s; + +out_close_efd: +    event_notifier_cleanup(&s->e); +out_free_state: +    g_free(s); +    
return NULL; +} + +void laio_cleanup(void *s_) +{ +    struct qemu_laio_state *s = s_; + +    event_notifier_cleanup(&s->e); + +    if (io_destroy(s->ctx) != 0) { +        fprintf(stderr, "%s: destroy AIO context %p failed\n", +                        __func__, &s->ctx); +    } +    g_free(s); +} diff --git a/block/mirror.c b/block/mirror.c new file mode 100644 index 00000000..b2fb4b9b --- /dev/null +++ b/block/mirror.c @@ -0,0 +1,828 @@ +/* + * Image mirroring + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + *  Paolo Bonzini  <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "block/blockjob.h" +#include "block/block_int.h" +#include "qapi/qmp/qerror.h" +#include "qemu/ratelimit.h" +#include "qemu/bitmap.h" + +#define SLICE_TIME    100000000ULL /* ns */ +#define MAX_IN_FLIGHT 16 +#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20) + +/* The mirroring buffer is a list of granularity-sized chunks. + * Free chunks are organized in a list. + */ +typedef struct MirrorBuffer { +    QSIMPLEQ_ENTRY(MirrorBuffer) next; +} MirrorBuffer; + +typedef struct MirrorBlockJob { +    BlockJob common; +    RateLimit limit; +    BlockDriverState *target; +    BlockDriverState *base; +    /* The name of the graph node to replace */ +    char *replaces; +    /* The BDS to replace */ +    BlockDriverState *to_replace; +    /* Used to block operations on the drive-mirror-replace target */ +    Error *replace_blocker; +    bool is_none_mode; +    BlockdevOnError on_source_error, on_target_error; +    bool synced; +    bool should_complete; +    int64_t sector_num; +    int64_t granularity; +    size_t buf_size; +    int64_t bdev_length; +    unsigned long *cow_bitmap; +    BdrvDirtyBitmap *dirty_bitmap; +    HBitmapIter hbi; +    uint8_t *buf; +    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; +    int buf_free_count; + +    unsigned long *in_flight_bitmap; +    int in_flight; +    int sectors_in_flight; +    int ret; +    bool unmap; +    bool waiting_for_io; +} MirrorBlockJob; + +typedef struct MirrorOp { +    MirrorBlockJob *s; +    QEMUIOVector qiov; +    int64_t sector_num; +    int nb_sectors; +} MirrorOp; + +static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, +                                            int error) +{ +    s->synced = false; +    if (read) { +        return block_job_error_action(&s->common, s->common.bs, +                                      s->on_source_error, true, error); +    } else { +        return block_job_error_action(&s->common, s->target, +                                      s->on_target_error, false, error); +    } +} + +static void mirror_iteration_done(MirrorOp *op, int ret) +{ +    MirrorBlockJob *s = op->s; +    struct iovec *iov; +    int64_t chunk_num; +    int i, nb_chunks, sectors_per_chunk; + +    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret); + +    s->in_flight--; +    s->sectors_in_flight -= op->nb_sectors; +    iov = op->qiov.iov; +    for (i = 0; i < op->qiov.niov; i++) { +        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base; +        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next); +        s->buf_free_count++; +    } + +    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; +    chunk_num = op->sector_num / sectors_per_chunk; +    nb_chunks = op->nb_sectors / sectors_per_chunk; +    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); +    if (ret >= 0) { +  
      if (s->cow_bitmap) { +            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); +        } +        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE; +    } + +    qemu_iovec_destroy(&op->qiov); +    g_slice_free(MirrorOp, op); + +    if (s->waiting_for_io) { +        qemu_coroutine_enter(s->common.co, NULL); +    } +} + +static void mirror_write_complete(void *opaque, int ret) +{ +    MirrorOp *op = opaque; +    MirrorBlockJob *s = op->s; +    if (ret < 0) { +        BlockErrorAction action; + +        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); +        action = mirror_error_action(s, false, -ret); +        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { +            s->ret = ret; +        } +    } +    mirror_iteration_done(op, ret); +} + +static void mirror_read_complete(void *opaque, int ret) +{ +    MirrorOp *op = opaque; +    MirrorBlockJob *s = op->s; +    if (ret < 0) { +        BlockErrorAction action; + +        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); +        action = mirror_error_action(s, true, -ret); +        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { +            s->ret = ret; +        } + +        mirror_iteration_done(op, ret); +        return; +    } +    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors, +                    mirror_write_complete, op); +} + +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) +{ +    BlockDriverState *source = s->common.bs; +    int nb_sectors, sectors_per_chunk, nb_chunks; +    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; +    uint64_t delay_ns = 0; +    MirrorOp *op; +    int pnum; +    int64_t ret; + +    s->sector_num = hbitmap_iter_next(&s->hbi); +    if (s->sector_num < 0) { +        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); +        s->sector_num = hbitmap_iter_next(&s->hbi); +        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); +        assert(s->sector_num >= 0); +    } + +    hbitmap_next_sector = s->sector_num; +    sector_num = s->sector_num; +    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; +    end = s->bdev_length / BDRV_SECTOR_SIZE; + +    /* Extend the QEMUIOVector to include all adjacent blocks that will +     * be copied in this operation. +     * +     * We have to do this if we have no backing file yet in the destination, +     * and the cluster size is very large.  Then we need to do COW ourselves. +     * The first time a cluster is copied, copy it entirely.  Note that, +     * because both the granularity and the cluster size are powers of two, +     * the number of sectors to copy cannot exceed one cluster. +     * +     * We also want to extend the QEMUIOVector to include more adjacent +     * dirty blocks if possible, to limit the number of I/O operations and +     * run efficiently even with a small granularity. +     */ +    nb_chunks = 0; +    nb_sectors = 0; +    next_sector = sector_num; +    next_chunk = sector_num / sectors_per_chunk; + +    /* Wait for I/O to this cluster (from a previous iteration) to be done.  
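+     * The chunk's bit in in_flight_bitmap acts as a per-chunk lock: it is
+     * cleared by mirror_iteration_done(), which also re-enters this
+     * coroutine while waiting_for_io is set.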
*/ +    while (test_bit(next_chunk, s->in_flight_bitmap)) { +        trace_mirror_yield_in_flight(s, sector_num, s->in_flight); +        s->waiting_for_io = true; +        qemu_coroutine_yield(); +        s->waiting_for_io = false; +    } + +    do { +        int added_sectors, added_chunks; + +        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) || +            test_bit(next_chunk, s->in_flight_bitmap)) { +            assert(nb_sectors > 0); +            break; +        } + +        added_sectors = sectors_per_chunk; +        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) { +            bdrv_round_to_clusters(s->target, +                                   next_sector, added_sectors, +                                   &next_sector, &added_sectors); + +            /* On the first iteration, the rounding may make us copy +             * sectors before the first dirty one. +             */ +            if (next_sector < sector_num) { +                assert(nb_sectors == 0); +                sector_num = next_sector; +                next_chunk = next_sector / sectors_per_chunk; +            } +        } + +        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors)); +        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk; + +        /* When doing COW, it may happen that there is not enough space for +         * a full cluster.  Wait if that is the case. +         */ +        while (nb_chunks == 0 && s->buf_free_count < added_chunks) { +            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); +            s->waiting_for_io = true; +            qemu_coroutine_yield(); +            s->waiting_for_io = false; +        } +        if (s->buf_free_count < nb_chunks + added_chunks) { +            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); +            break; +        } + +        /* We have enough free space to copy these sectors.  */ +        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks); + +        nb_sectors += added_sectors; +        nb_chunks += added_chunks; +        next_sector += added_sectors; +        next_chunk += added_chunks; +        if (!s->synced && s->common.speed) { +            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors); +        } +    } while (delay_ns == 0 && next_sector < end); + +    /* Allocate a MirrorOp that is used as an AIO callback.  */ +    op = g_slice_new(MirrorOp); +    op->s = s; +    op->sector_num = sector_num; +    op->nb_sectors = nb_sectors; + +    /* Now make a QEMUIOVector taking enough granularity-sized chunks +     * from s->buf_free. +     */ +    qemu_iovec_init(&op->qiov, nb_chunks); +    next_sector = sector_num; +    while (nb_chunks-- > 0) { +        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); +        size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size; + +        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); +        s->buf_free_count--; +        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining)); + +        /* Advance the HBitmapIter in parallel, so that we do not examine +         * the same sector twice. +         */ +        if (next_sector > hbitmap_next_sector +            && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { +            hbitmap_next_sector = hbitmap_iter_next(&s->hbi); +        } + +        next_sector += sectors_per_chunk; +    } + +    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); + +    /* Copy the dirty cluster.  
*/ +    s->in_flight++; +    s->sectors_in_flight += nb_sectors; +    trace_mirror_one_iteration(s, sector_num, nb_sectors); + +    ret = bdrv_get_block_status_above(source, NULL, sector_num, +                                      nb_sectors, &pnum); +    if (ret < 0 || pnum < nb_sectors || +            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { +        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, +                       mirror_read_complete, op); +    } else if (ret & BDRV_BLOCK_ZERO) { +        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, +                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0, +                              mirror_write_complete, op); +    } else { +        assert(!(ret & BDRV_BLOCK_DATA)); +        bdrv_aio_discard(s->target, sector_num, op->nb_sectors, +                         mirror_write_complete, op); +    } +    return delay_ns; +} + +static void mirror_free_init(MirrorBlockJob *s) +{ +    int granularity = s->granularity; +    size_t buf_size = s->buf_size; +    uint8_t *buf = s->buf; + +    assert(s->buf_free_count == 0); +    QSIMPLEQ_INIT(&s->buf_free); +    while (buf_size != 0) { +        MirrorBuffer *cur = (MirrorBuffer *)buf; +        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next); +        s->buf_free_count++; +        buf_size -= granularity; +        buf += granularity; +    } +} + +static void mirror_drain(MirrorBlockJob *s) +{ +    while (s->in_flight > 0) { +        s->waiting_for_io = true; +        qemu_coroutine_yield(); +        s->waiting_for_io = false; +    } +} + +typedef struct { +    int ret; +} MirrorExitData; + +static void mirror_exit(BlockJob *job, void *opaque) +{ +    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); +    MirrorExitData *data = opaque; +    AioContext *replace_aio_context = NULL; + +    if (s->to_replace) { +        replace_aio_context = bdrv_get_aio_context(s->to_replace); +        aio_context_acquire(replace_aio_context); +    } + +    if (s->should_complete && data->ret == 0) { +        BlockDriverState *to_replace = s->common.bs; +        if (s->to_replace) { +            to_replace = s->to_replace; +        } +        if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { +            bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); +        } +        bdrv_swap(s->target, to_replace); +        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { +            /* drop the bs loop chain formed by the swap: break the loop then +             * trigger the unref from the top one */ +            BlockDriverState *p = s->base->backing_hd; +            bdrv_set_backing_hd(s->base, NULL); +            bdrv_unref(p); +        } +    } +    if (s->to_replace) { +        bdrv_op_unblock_all(s->to_replace, s->replace_blocker); +        error_free(s->replace_blocker); +        bdrv_unref(s->to_replace); +    } +    if (replace_aio_context) { +        aio_context_release(replace_aio_context); +    } +    g_free(s->replaces); +    bdrv_unref(s->target); +    block_job_completed(&s->common, data->ret); +    g_free(data); +} + +static void coroutine_fn mirror_run(void *opaque) +{ +    MirrorBlockJob *s = opaque; +    MirrorExitData *data; +    BlockDriverState *bs = s->common.bs; +    int64_t sector_num, end, length; +    uint64_t last_pause_ns; +    BlockDriverInfo bdi; +    char backing_filename[2]; /* we only need 2 characters because we are only +                                 checking for a NULL string */ +    int ret = 0; +    int n; + 
+    if (block_job_is_cancelled(&s->common)) { +        goto immediate_exit; +    } + +    s->bdev_length = bdrv_getlength(bs); +    if (s->bdev_length < 0) { +        ret = s->bdev_length; +        goto immediate_exit; +    } else if (s->bdev_length == 0) { +        /* Report BLOCK_JOB_READY and wait for complete. */ +        block_job_event_ready(&s->common); +        s->synced = true; +        while (!block_job_is_cancelled(&s->common) && !s->should_complete) { +            block_job_yield(&s->common); +        } +        s->common.cancelled = false; +        goto immediate_exit; +    } + +    length = DIV_ROUND_UP(s->bdev_length, s->granularity); +    s->in_flight_bitmap = bitmap_new(length); + +    /* If we have no backing file yet in the destination, we cannot let +     * the destination do COW.  Instead, we copy sectors around the +     * dirty data if needed.  We need a bitmap to do that. +     */ +    bdrv_get_backing_filename(s->target, backing_filename, +                              sizeof(backing_filename)); +    if (backing_filename[0] && !s->target->backing_hd) { +        ret = bdrv_get_info(s->target, &bdi); +        if (ret < 0) { +            goto immediate_exit; +        } +        if (s->granularity < bdi.cluster_size) { +            s->buf_size = MAX(s->buf_size, bdi.cluster_size); +            s->cow_bitmap = bitmap_new(length); +        } +    } + +    end = s->bdev_length / BDRV_SECTOR_SIZE; +    s->buf = qemu_try_blockalign(bs, s->buf_size); +    if (s->buf == NULL) { +        ret = -ENOMEM; +        goto immediate_exit; +    } + +    mirror_free_init(s); + +    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +    if (!s->is_none_mode) { +        /* First part, loop on the sectors and initialize the dirty bitmap.  */ +        BlockDriverState *base = s->base; +        for (sector_num = 0; sector_num < end; ) { +            /* Just to make sure we are not exceeding int limit. 
*/ +            int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, +                                 end - sector_num); +            int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + +            if (now - last_pause_ns > SLICE_TIME) { +                last_pause_ns = now; +                block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); +            } + +            if (block_job_is_cancelled(&s->common)) { +                goto immediate_exit; +            } + +            ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); + +            if (ret < 0) { +                goto immediate_exit; +            } + +            assert(n > 0); +            if (ret == 1) { +                bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); +            } +            sector_num += n; +        } +    } + +    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); +    for (;;) { +        uint64_t delay_ns = 0; +        int64_t cnt; +        bool should_complete; + +        if (s->ret < 0) { +            ret = s->ret; +            goto immediate_exit; +        } + +        cnt = bdrv_get_dirty_count(s->dirty_bitmap); +        /* s->common.offset contains the number of bytes already processed so +         * far, cnt is the number of dirty sectors remaining and +         * s->sectors_in_flight is the number of sectors currently being +         * processed; together those are the current total operation length */ +        s->common.len = s->common.offset + +                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; + +        /* Note that even when no rate limit is applied we need to yield +         * periodically with no pending I/O so that bdrv_drain_all() returns. +         * We do so every SLICE_TIME nanoseconds, or when there is an error, +         * or when the source is clean, whichever comes first. +         */ +        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME && +            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { +            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || +                (cnt == 0 && s->in_flight > 0)) { +                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); +                s->waiting_for_io = true; +                qemu_coroutine_yield(); +                s->waiting_for_io = false; +                continue; +            } else if (cnt != 0) { +                delay_ns = mirror_iteration(s); +            } +        } + +        should_complete = false; +        if (s->in_flight == 0 && cnt == 0) { +            trace_mirror_before_flush(s); +            ret = bdrv_flush(s->target); +            if (ret < 0) { +                if (mirror_error_action(s, false, -ret) == +                    BLOCK_ERROR_ACTION_REPORT) { +                    goto immediate_exit; +                } +            } else { +                /* We're out of the streaming phase.  From now on, if the job +                 * is cancelled we will actually complete all pending I/O and +                 * report completion.  This way, block-job-cancel will leave +                 * the target in a consistent state. 
+                 */ +                if (!s->synced) { +                    block_job_event_ready(&s->common); +                    s->synced = true; +                } + +                should_complete = s->should_complete || +                    block_job_is_cancelled(&s->common); +                cnt = bdrv_get_dirty_count(s->dirty_bitmap); +            } +        } + +        if (cnt == 0 && should_complete) { +            /* The dirty bitmap is not updated while operations are pending. +             * If we're about to exit, wait for pending operations before +             * calling bdrv_get_dirty_count(bs), or we may exit while the +             * source has dirty data to copy! +             * +             * Note that I/O can be submitted by the guest while +             * mirror_populate runs. +             */ +            trace_mirror_before_drain(s, cnt); +            bdrv_drain(bs); +            cnt = bdrv_get_dirty_count(s->dirty_bitmap); +        } + +        ret = 0; +        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); +        if (!s->synced) { +            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); +            if (block_job_is_cancelled(&s->common)) { +                break; +            } +        } else if (!should_complete) { +            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); +            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); +        } else if (cnt == 0) { +            /* The two disks are in sync.  Exit and report successful +             * completion. +             */ +            assert(QLIST_EMPTY(&bs->tracked_requests)); +            s->common.cancelled = false; +            break; +        } +        last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +    } + +immediate_exit: +    if (s->in_flight > 0) { +        /* We get here only if something went wrong.  Either the job failed, +         * or it was cancelled prematurely so that we do not guarantee that +         * the target is a copy of the source. 
+         */ +        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common))); +        mirror_drain(s); +    } + +    assert(s->in_flight == 0); +    qemu_vfree(s->buf); +    g_free(s->cow_bitmap); +    g_free(s->in_flight_bitmap); +    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); +    bdrv_iostatus_disable(s->target); + +    data = g_malloc(sizeof(*data)); +    data->ret = ret; +    block_job_defer_to_main_loop(&s->common, mirror_exit, data); +} + +static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ +    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + +    if (speed < 0) { +        error_setg(errp, QERR_INVALID_PARAMETER, "speed"); +        return; +    } +    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static void mirror_iostatus_reset(BlockJob *job) +{ +    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + +    bdrv_iostatus_reset(s->target); +} + +static void mirror_complete(BlockJob *job, Error **errp) +{ +    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); +    Error *local_err = NULL; +    int ret; + +    ret = bdrv_open_backing_file(s->target, NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        return; +    } +    if (!s->synced) { +        error_setg(errp, QERR_BLOCK_JOB_NOT_READY, +                   bdrv_get_device_name(job->bs)); +        return; +    } + +    /* check the target bs is not blocked and block all operations on it */ +    if (s->replaces) { +        AioContext *replace_aio_context; + +        s->to_replace = check_to_replace_node(s->replaces, &local_err); +        if (!s->to_replace) { +            error_propagate(errp, local_err); +            return; +        } + +        replace_aio_context = bdrv_get_aio_context(s->to_replace); +        aio_context_acquire(replace_aio_context); + +        error_setg(&s->replace_blocker, +                   "block device is in use by block-job-complete"); +        bdrv_op_block_all(s->to_replace, s->replace_blocker); +        bdrv_ref(s->to_replace); + +        aio_context_release(replace_aio_context); +    } + +    s->should_complete = true; +    block_job_enter(&s->common); +} + +static const BlockJobDriver mirror_job_driver = { +    .instance_size = sizeof(MirrorBlockJob), +    .job_type      = BLOCK_JOB_TYPE_MIRROR, +    .set_speed     = mirror_set_speed, +    .iostatus_reset= mirror_iostatus_reset, +    .complete      = mirror_complete, +}; + +static const BlockJobDriver commit_active_job_driver = { +    .instance_size = sizeof(MirrorBlockJob), +    .job_type      = BLOCK_JOB_TYPE_COMMIT, +    .set_speed     = mirror_set_speed, +    .iostatus_reset +                   = mirror_iostatus_reset, +    .complete      = mirror_complete, +}; + +static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, +                             const char *replaces, +                             int64_t speed, uint32_t granularity, +                             int64_t buf_size, +                             BlockdevOnError on_source_error, +                             BlockdevOnError on_target_error, +                             bool unmap, +                             BlockCompletionFunc *cb, +                             void *opaque, Error **errp, +                             const BlockJobDriver *driver, +                             bool is_none_mode, BlockDriverState *base) +{ +    MirrorBlockJob *s; + +    if (granularity == 0) { +        granularity = 
bdrv_get_default_bitmap_granularity(target); +    } + +    assert ((granularity & (granularity - 1)) == 0); + +    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || +         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && +        !bdrv_iostatus_is_enabled(bs)) { +        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); +        return; +    } + +    if (buf_size < 0) { +        error_setg(errp, "Invalid parameter 'buf-size'"); +        return; +    } + +    if (buf_size == 0) { +        buf_size = DEFAULT_MIRROR_BUF_SIZE; +    } + +    s = block_job_create(driver, bs, speed, cb, opaque, errp); +    if (!s) { +        return; +    } + +    s->replaces = g_strdup(replaces); +    s->on_source_error = on_source_error; +    s->on_target_error = on_target_error; +    s->target = target; +    s->is_none_mode = is_none_mode; +    s->base = base; +    s->granularity = granularity; +    s->buf_size = ROUND_UP(buf_size, granularity); +    s->unmap = unmap; + +    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); +    if (!s->dirty_bitmap) { +        g_free(s->replaces); +        block_job_release(bs); +        return; +    } +    bdrv_set_enable_write_cache(s->target, true); +    bdrv_set_on_error(s->target, on_target_error, on_target_error); +    bdrv_iostatus_enable(s->target); +    s->common.co = qemu_coroutine_create(mirror_run); +    trace_mirror_start(bs, s, s->common.co, opaque); +    qemu_coroutine_enter(s->common.co, s); +} + +void mirror_start(BlockDriverState *bs, BlockDriverState *target, +                  const char *replaces, +                  int64_t speed, uint32_t granularity, int64_t buf_size, +                  MirrorSyncMode mode, BlockdevOnError on_source_error, +                  BlockdevOnError on_target_error, +                  bool unmap, +                  BlockCompletionFunc *cb, +                  void *opaque, Error **errp) +{ +    bool is_none_mode; +    BlockDriverState *base; + +    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) { +        error_setg(errp, "Sync mode 'incremental' not supported"); +        return; +    } +    is_none_mode = mode == MIRROR_SYNC_MODE_NONE; +    base = mode == MIRROR_SYNC_MODE_TOP ? 
bs->backing_hd : NULL; +    mirror_start_job(bs, target, replaces, +                     speed, granularity, buf_size, +                     on_source_error, on_target_error, unmap, cb, opaque, errp, +                     &mirror_job_driver, is_none_mode, base); +} + +void commit_active_start(BlockDriverState *bs, BlockDriverState *base, +                         int64_t speed, +                         BlockdevOnError on_error, +                         BlockCompletionFunc *cb, +                         void *opaque, Error **errp) +{ +    int64_t length, base_length; +    int orig_base_flags; +    int ret; +    Error *local_err = NULL; + +    orig_base_flags = bdrv_get_flags(base); + +    if (bdrv_reopen(base, bs->open_flags, errp)) { +        return; +    } + +    length = bdrv_getlength(bs); +    if (length < 0) { +        error_setg_errno(errp, -length, +                         "Unable to determine length of %s", bs->filename); +        goto error_restore_flags; +    } + +    base_length = bdrv_getlength(base); +    if (base_length < 0) { +        error_setg_errno(errp, -base_length, +                         "Unable to determine length of %s", base->filename); +        goto error_restore_flags; +    } + +    if (length > base_length) { +        ret = bdrv_truncate(base, length); +        if (ret < 0) { +            error_setg_errno(errp, -ret, +                            "Top image %s is larger than base image %s, and " +                             "resize of base image failed", +                             bs->filename, base->filename); +            goto error_restore_flags; +        } +    } + +    bdrv_ref(base); +    mirror_start_job(bs, base, NULL, speed, 0, 0, +                     on_error, on_error, false, cb, opaque, &local_err, +                     &commit_active_job_driver, false, base); +    if (local_err) { +        error_propagate(errp, local_err); +        goto error_restore_flags; +    } + +    return; + +error_restore_flags: +    /* ignore error and errp for bdrv_reopen, because we want to propagate +     * the original error */ +    bdrv_reopen(base, orig_base_flags, NULL); +    return; +} diff --git a/block/nbd-client.c b/block/nbd-client.c new file mode 100644 index 00000000..e1bb9198 --- /dev/null +++ b/block/nbd-client.c @@ -0,0 +1,407 @@ +/* + * QEMU Block driver for  NBD + * + * Copyright (C) 2008 Bull S.A.S. + *     Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + * Some parts: + *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "nbd-client.h" +#include "qemu/sockets.h" + +#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) +#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs)) + +static void nbd_recv_coroutines_enter_all(NbdClientSession *s) +{ +    int i; + +    for (i = 0; i < MAX_NBD_REQUESTS; i++) { +        if (s->recv_coroutine[i]) { +            qemu_coroutine_enter(s->recv_coroutine[i], NULL); +        } +    } +} + +static void nbd_teardown_connection(BlockDriverState *bs) +{ +    NbdClientSession *client = nbd_get_client_session(bs); + +    /* finish any pending coroutines */ +    shutdown(client->sock, 2); +    nbd_recv_coroutines_enter_all(client); + +    nbd_client_detach_aio_context(bs); +    closesocket(client->sock); +    client->sock = -1; +} + +static void nbd_reply_ready(void *opaque) +{ +    BlockDriverState *bs = opaque; +    NbdClientSession *s = nbd_get_client_session(bs); +    uint64_t i; +    int ret; + +    if (s->reply.handle == 0) { +        /* No reply already in flight.  Fetch a header.  It is possible +         * that another thread has done the same thing in parallel, so +         * the socket is not readable anymore. +         */ +        ret = nbd_receive_reply(s->sock, &s->reply); +        if (ret == -EAGAIN) { +            return; +        } +        if (ret < 0) { +            s->reply.handle = 0; +            goto fail; +        } +    } + +    /* There's no need for a mutex on the receive side, because the +     * handler acts as a synchronization point and ensures that only +     * one coroutine is called until the reply finishes.  
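+     * The reply handle encodes the request's slot index XORed with a
+     * per-session pointer value (see HANDLE_TO_INDEX above); an index that
+     * falls outside MAX_NBD_REQUESTS therefore indicates a corrupted reply
+     * and the connection is torn down.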
*/ +    i = HANDLE_TO_INDEX(s, s->reply.handle); +    if (i >= MAX_NBD_REQUESTS) { +        goto fail; +    } + +    if (s->recv_coroutine[i]) { +        qemu_coroutine_enter(s->recv_coroutine[i], NULL); +        return; +    } + +fail: +    nbd_teardown_connection(bs); +} + +static void nbd_restart_write(void *opaque) +{ +    BlockDriverState *bs = opaque; + +    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine, NULL); +} + +static int nbd_co_send_request(BlockDriverState *bs, +                               struct nbd_request *request, +                               QEMUIOVector *qiov, int offset) +{ +    NbdClientSession *s = nbd_get_client_session(bs); +    AioContext *aio_context; +    int rc, ret, i; + +    qemu_co_mutex_lock(&s->send_mutex); + +    for (i = 0; i < MAX_NBD_REQUESTS; i++) { +        if (s->recv_coroutine[i] == NULL) { +            s->recv_coroutine[i] = qemu_coroutine_self(); +            break; +        } +    } + +    assert(i < MAX_NBD_REQUESTS); +    request->handle = INDEX_TO_HANDLE(s, i); +    s->send_coroutine = qemu_coroutine_self(); +    aio_context = bdrv_get_aio_context(bs); + +    aio_set_fd_handler(aio_context, s->sock, +                       nbd_reply_ready, nbd_restart_write, bs); +    if (qiov) { +        if (!s->is_unix) { +            socket_set_cork(s->sock, 1); +        } +        rc = nbd_send_request(s->sock, request); +        if (rc >= 0) { +            ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, +                                offset, request->len); +            if (ret != request->len) { +                rc = -EIO; +            } +        } +        if (!s->is_unix) { +            socket_set_cork(s->sock, 0); +        } +    } else { +        rc = nbd_send_request(s->sock, request); +    } +    aio_set_fd_handler(aio_context, s->sock, nbd_reply_ready, NULL, bs); +    s->send_coroutine = NULL; +    qemu_co_mutex_unlock(&s->send_mutex); +    return rc; +} + +static void nbd_co_receive_reply(NbdClientSession *s, +    struct nbd_request *request, struct nbd_reply *reply, +    QEMUIOVector *qiov, int offset) +{ +    int ret; + +    /* Wait until we're woken up by the read handler.  TODO: perhaps +     * peek at the next reply and avoid yielding if it's ours?  */ +    qemu_coroutine_yield(); +    *reply = s->reply; +    if (reply->handle != request->handle) { +        reply->error = EIO; +    } else { +        if (qiov && reply->error == 0) { +            ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, +                                offset, request->len); +            if (ret != request->len) { +                reply->error = EIO; +            } +        } + +        /* Tell the read handler to read another header.  */ +        s->reply.handle = 0; +    } +} + +static void nbd_coroutine_start(NbdClientSession *s, +   struct nbd_request *request) +{ +    /* Poor man semaphore.  The free_sema is locked when no other request +     * can be accepted, and unlocked after receiving one reply.  */ +    if (s->in_flight >= MAX_NBD_REQUESTS - 1) { +        qemu_co_mutex_lock(&s->free_sema); +        assert(s->in_flight < MAX_NBD_REQUESTS); +    } +    s->in_flight++; + +    /* s->recv_coroutine[i] is set as soon as we get the send_lock.  
*/ +} + +static void nbd_coroutine_end(NbdClientSession *s, +    struct nbd_request *request) +{ +    int i = HANDLE_TO_INDEX(s, request->handle); +    s->recv_coroutine[i] = NULL; +    if (s->in_flight-- == MAX_NBD_REQUESTS) { +        qemu_co_mutex_unlock(&s->free_sema); +    } +} + +static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, +                          int nb_sectors, QEMUIOVector *qiov, +                          int offset) +{ +    NbdClientSession *client = nbd_get_client_session(bs); +    struct nbd_request request = { .type = NBD_CMD_READ }; +    struct nbd_reply reply; +    ssize_t ret; + +    request.from = sector_num * 512; +    request.len = nb_sectors * 512; + +    nbd_coroutine_start(client, &request); +    ret = nbd_co_send_request(bs, &request, NULL, 0); +    if (ret < 0) { +        reply.error = -ret; +    } else { +        nbd_co_receive_reply(client, &request, &reply, qiov, offset); +    } +    nbd_coroutine_end(client, &request); +    return -reply.error; + +} + +static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, +                           int nb_sectors, QEMUIOVector *qiov, +                           int offset) +{ +    NbdClientSession *client = nbd_get_client_session(bs); +    struct nbd_request request = { .type = NBD_CMD_WRITE }; +    struct nbd_reply reply; +    ssize_t ret; + +    if (!bdrv_enable_write_cache(bs) && +        (client->nbdflags & NBD_FLAG_SEND_FUA)) { +        request.type |= NBD_CMD_FLAG_FUA; +    } + +    request.from = sector_num * 512; +    request.len = nb_sectors * 512; + +    nbd_coroutine_start(client, &request); +    ret = nbd_co_send_request(bs, &request, qiov, offset); +    if (ret < 0) { +        reply.error = -ret; +    } else { +        nbd_co_receive_reply(client, &request, &reply, NULL, 0); +    } +    nbd_coroutine_end(client, &request); +    return -reply.error; +} + +/* qemu-nbd has a limit of slightly less than 1M per request.  Try to + * remain aligned to 4K. 
*/ +#define NBD_MAX_SECTORS 2040 + +int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, +                        int nb_sectors, QEMUIOVector *qiov) +{ +    int offset = 0; +    int ret; +    while (nb_sectors > NBD_MAX_SECTORS) { +        ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); +        if (ret < 0) { +            return ret; +        } +        offset += NBD_MAX_SECTORS * 512; +        sector_num += NBD_MAX_SECTORS; +        nb_sectors -= NBD_MAX_SECTORS; +    } +    return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, +                         int nb_sectors, QEMUIOVector *qiov) +{ +    int offset = 0; +    int ret; +    while (nb_sectors > NBD_MAX_SECTORS) { +        ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); +        if (ret < 0) { +            return ret; +        } +        offset += NBD_MAX_SECTORS * 512; +        sector_num += NBD_MAX_SECTORS; +        nb_sectors -= NBD_MAX_SECTORS; +    } +    return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_co_flush(BlockDriverState *bs) +{ +    NbdClientSession *client = nbd_get_client_session(bs); +    struct nbd_request request = { .type = NBD_CMD_FLUSH }; +    struct nbd_reply reply; +    ssize_t ret; + +    if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) { +        return 0; +    } + +    if (client->nbdflags & NBD_FLAG_SEND_FUA) { +        request.type |= NBD_CMD_FLAG_FUA; +    } + +    request.from = 0; +    request.len = 0; + +    nbd_coroutine_start(client, &request); +    ret = nbd_co_send_request(bs, &request, NULL, 0); +    if (ret < 0) { +        reply.error = -ret; +    } else { +        nbd_co_receive_reply(client, &request, &reply, NULL, 0); +    } +    nbd_coroutine_end(client, &request); +    return -reply.error; +} + +int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, +                          int nb_sectors) +{ +    NbdClientSession *client = nbd_get_client_session(bs); +    struct nbd_request request = { .type = NBD_CMD_TRIM }; +    struct nbd_reply reply; +    ssize_t ret; + +    if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { +        return 0; +    } +    request.from = sector_num * 512; +    request.len = nb_sectors * 512; + +    nbd_coroutine_start(client, &request); +    ret = nbd_co_send_request(bs, &request, NULL, 0); +    if (ret < 0) { +        reply.error = -ret; +    } else { +        nbd_co_receive_reply(client, &request, &reply, NULL, 0); +    } +    nbd_coroutine_end(client, &request); +    return -reply.error; + +} + +void nbd_client_detach_aio_context(BlockDriverState *bs) +{ +    aio_set_fd_handler(bdrv_get_aio_context(bs), +                       nbd_get_client_session(bs)->sock, NULL, NULL, NULL); +} + +void nbd_client_attach_aio_context(BlockDriverState *bs, +                                   AioContext *new_context) +{ +    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sock, +                       nbd_reply_ready, NULL, bs); +} + +void nbd_client_close(BlockDriverState *bs) +{ +    NbdClientSession *client = nbd_get_client_session(bs); +    struct nbd_request request = { +        .type = NBD_CMD_DISC, +        .from = 0, +        .len = 0 +    }; + +    if (client->sock == -1) { +        return; +    } + +    nbd_send_request(client->sock, &request); + +    nbd_teardown_connection(bs); +} + +int nbd_client_init(BlockDriverState *bs, int sock, const char *export, +                   
 Error **errp) +{ +    NbdClientSession *client = nbd_get_client_session(bs); +    int ret; + +    /* NBD handshake */ +    logout("session init %s\n", export); +    qemu_set_block(sock); +    ret = nbd_receive_negotiate(sock, export, +                                &client->nbdflags, &client->size, errp); +    if (ret < 0) { +        logout("Failed to negotiate with the NBD server\n"); +        closesocket(sock); +        return ret; +    } + +    qemu_co_mutex_init(&client->send_mutex); +    qemu_co_mutex_init(&client->free_sema); +    client->sock = sock; + +    /* Now that we're connected, set the socket to be non-blocking and +     * kick the reply mechanism.  */ +    qemu_set_nonblock(sock); +    nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs)); + +    logout("Established connection with NBD server\n"); +    return 0; +} diff --git a/block/nbd-client.h b/block/nbd-client.h new file mode 100644 index 00000000..e8413408 --- /dev/null +++ b/block/nbd-client.h @@ -0,0 +1,53 @@ +#ifndef NBD_CLIENT_H +#define NBD_CLIENT_H + +#include "qemu-common.h" +#include "block/nbd.h" +#include "block/block_int.h" + +/* #define DEBUG_NBD */ + +#if defined(DEBUG_NBD) +#define logout(fmt, ...) \ +    fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +#define MAX_NBD_REQUESTS    16 + +typedef struct NbdClientSession { +    int sock; +    uint32_t nbdflags; +    off_t size; + +    CoMutex send_mutex; +    CoMutex free_sema; +    Coroutine *send_coroutine; +    int in_flight; + +    Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; +    struct nbd_reply reply; + +    bool is_unix; +} NbdClientSession; + +NbdClientSession *nbd_get_client_session(BlockDriverState *bs); + +int nbd_client_init(BlockDriverState *bs, int sock, const char *export_name, +                    Error **errp); +void nbd_client_close(BlockDriverState *bs); + +int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, +                          int nb_sectors); +int nbd_client_co_flush(BlockDriverState *bs); +int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, +                         int nb_sectors, QEMUIOVector *qiov); +int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, +                        int nb_sectors, QEMUIOVector *qiov); + +void nbd_client_detach_aio_context(BlockDriverState *bs); +void nbd_client_attach_aio_context(BlockDriverState *bs, +                                   AioContext *new_context); + +#endif /* NBD_CLIENT_H */ diff --git a/block/nbd.c b/block/nbd.c new file mode 100644 index 00000000..21761861 --- /dev/null +++ b/block/nbd.c @@ -0,0 +1,448 @@ +/* + * QEMU Block driver for  NBD + * + * Copyright (C) 2008 Bull S.A.S. + *     Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + * Some parts: + *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/nbd-client.h" +#include "qemu/uri.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qemu/sockets.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qjson.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" + +#include <sys/types.h> +#include <unistd.h> + +#define EN_OPTSTR ":exportname=" + +typedef struct BDRVNBDState { +    NbdClientSession client; +    QemuOpts *socket_opts; +} BDRVNBDState; + +static int nbd_parse_uri(const char *filename, QDict *options) +{ +    URI *uri; +    const char *p; +    QueryParams *qp = NULL; +    int ret = 0; +    bool is_unix; + +    uri = uri_parse(filename); +    if (!uri) { +        return -EINVAL; +    } + +    /* transport */ +    if (!strcmp(uri->scheme, "nbd")) { +        is_unix = false; +    } else if (!strcmp(uri->scheme, "nbd+tcp")) { +        is_unix = false; +    } else if (!strcmp(uri->scheme, "nbd+unix")) { +        is_unix = true; +    } else { +        ret = -EINVAL; +        goto out; +    } + +    p = uri->path ? uri->path : "/"; +    p += strspn(p, "/"); +    if (p[0]) { +        qdict_put(options, "export", qstring_from_str(p)); +    } + +    qp = query_params_parse(uri->query); +    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) { +        ret = -EINVAL; +        goto out; +    } + +    if (is_unix) { +        /* nbd+unix:///export?socket=path */ +        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) { +            ret = -EINVAL; +            goto out; +        } +        qdict_put(options, "path", qstring_from_str(qp->p[0].value)); +    } else { +        QString *host; +        /* nbd[+tcp]://host[:port]/export */ +        if (!uri->server) { +            ret = -EINVAL; +            goto out; +        } + +        /* strip braces from literal IPv6 address */ +        if (uri->server[0] == '[') { +            host = qstring_from_substr(uri->server, 1, +                                       strlen(uri->server) - 2); +        } else { +            host = qstring_from_str(uri->server); +        } + +        qdict_put(options, "host", host); +        if (uri->port) { +            char* port_str = g_strdup_printf("%d", uri->port); +            qdict_put(options, "port", qstring_from_str(port_str)); +            g_free(port_str); +        } +    } + +out: +    if (qp) { +        query_params_free(qp); +    } +    uri_free(uri); +    return ret; +} + +static void nbd_parse_filename(const char *filename, QDict *options, +                               Error **errp) +{ +    char *file; +    char *export_name; +    const char *host_spec; +    const char *unixpath; + +    if (qdict_haskey(options, "host") +        || qdict_haskey(options, "port") +        || qdict_haskey(options, "path")) +    { +        error_setg(errp, "host/port/path and a file name may not be specified " +                         "at the same time"); +        return; +    } + +    if (strstr(filename, "://")) { +        int ret = nbd_parse_uri(filename, options); +        if (ret < 0) { +            
error_setg(errp, "No valid URL specified"); +        } +        return; +    } + +    file = g_strdup(filename); + +    export_name = strstr(file, EN_OPTSTR); +    if (export_name) { +        if (export_name[strlen(EN_OPTSTR)] == 0) { +            goto out; +        } +        export_name[0] = 0; /* truncate 'file' */ +        export_name += strlen(EN_OPTSTR); + +        qdict_put(options, "export", qstring_from_str(export_name)); +    } + +    /* extract the host_spec - fail if it's not nbd:... */ +    if (!strstart(file, "nbd:", &host_spec)) { +        error_setg(errp, "File name string for NBD must start with 'nbd:'"); +        goto out; +    } + +    if (!*host_spec) { +        goto out; +    } + +    /* are we a UNIX or TCP socket? */ +    if (strstart(host_spec, "unix:", &unixpath)) { +        qdict_put(options, "path", qstring_from_str(unixpath)); +    } else { +        InetSocketAddress *addr = NULL; + +        addr = inet_parse(host_spec, errp); +        if (!addr) { +            goto out; +        } + +        qdict_put(options, "host", qstring_from_str(addr->host)); +        qdict_put(options, "port", qstring_from_str(addr->port)); +        qapi_free_InetSocketAddress(addr); +    } + +out: +    g_free(file); +} + +static void nbd_config(BDRVNBDState *s, QDict *options, char **export, +                       Error **errp) +{ +    Error *local_err = NULL; + +    if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { +        if (qdict_haskey(options, "path")) { +            error_setg(errp, "path and host may not be used at the same time."); +        } else { +            error_setg(errp, "one of path and host must be specified."); +        } +        return; +    } + +    s->client.is_unix = qdict_haskey(options, "path"); +    s->socket_opts = qemu_opts_create(&socket_optslist, NULL, 0, +                                      &error_abort); + +    qemu_opts_absorb_qdict(s->socket_opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        return; +    } + +    if (!qemu_opt_get(s->socket_opts, "port")) { +        qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT, +                            &error_abort); +    } + +    *export = g_strdup(qdict_get_try_str(options, "export")); +    if (*export) { +        qdict_del(options, "export"); +    } +} + +NbdClientSession *nbd_get_client_session(BlockDriverState *bs) +{ +    BDRVNBDState *s = bs->opaque; +    return &s->client; +} + +static int nbd_establish_connection(BlockDriverState *bs, Error **errp) +{ +    BDRVNBDState *s = bs->opaque; +    int sock; + +    if (s->client.is_unix) { +        sock = unix_connect_opts(s->socket_opts, errp, NULL, NULL); +    } else { +        sock = inet_connect_opts(s->socket_opts, errp, NULL, NULL); +        if (sock >= 0) { +            socket_set_nodelay(sock); +        } +    } + +    /* Failed to establish connection */ +    if (sock < 0) { +        logout("Failed to establish connection to NBD server\n"); +        return -EIO; +    } + +    return sock; +} + +static int nbd_open(BlockDriverState *bs, QDict *options, int flags, +                    Error **errp) +{ +    BDRVNBDState *s = bs->opaque; +    char *export = NULL; +    int result, sock; +    Error *local_err = NULL; + +    /* Pop the config into our state object. Exit if invalid. 
*/ +    nbd_config(s, options, &export, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        return -EINVAL; +    } + +    /* establish TCP connection, return error if it fails +     * TODO: Configurable retry-until-timeout behaviour. +     */ +    sock = nbd_establish_connection(bs, errp); +    if (sock < 0) { +        g_free(export); +        return sock; +    } + +    /* NBD handshake */ +    result = nbd_client_init(bs, sock, export, errp); +    g_free(export); +    return result; +} + +static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, +                        int nb_sectors, QEMUIOVector *qiov) +{ +    return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov); +} + +static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, +                         int nb_sectors, QEMUIOVector *qiov) +{ +    return nbd_client_co_writev(bs, sector_num, nb_sectors, qiov); +} + +static int nbd_co_flush(BlockDriverState *bs) +{ +    return nbd_client_co_flush(bs); +} + +static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) +{ +    bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS; +    bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS; +} + +static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, +                          int nb_sectors) +{ +    return nbd_client_co_discard(bs, sector_num, nb_sectors); +} + +static void nbd_close(BlockDriverState *bs) +{ +    BDRVNBDState *s = bs->opaque; + +    qemu_opts_del(s->socket_opts); +    nbd_client_close(bs); +} + +static int64_t nbd_getlength(BlockDriverState *bs) +{ +    BDRVNBDState *s = bs->opaque; + +    return s->client.size; +} + +static void nbd_detach_aio_context(BlockDriverState *bs) +{ +    nbd_client_detach_aio_context(bs); +} + +static void nbd_attach_aio_context(BlockDriverState *bs, +                                   AioContext *new_context) +{ +    nbd_client_attach_aio_context(bs, new_context); +} + +static void nbd_refresh_filename(BlockDriverState *bs) +{ +    QDict *opts = qdict_new(); +    const char *path   = qdict_get_try_str(bs->options, "path"); +    const char *host   = qdict_get_try_str(bs->options, "host"); +    const char *port   = qdict_get_try_str(bs->options, "port"); +    const char *export = qdict_get_try_str(bs->options, "export"); + +    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd"))); + +    if (path && export) { +        snprintf(bs->exact_filename, sizeof(bs->exact_filename), +                 "nbd+unix:///%s?socket=%s", export, path); +    } else if (path && !export) { +        snprintf(bs->exact_filename, sizeof(bs->exact_filename), +                 "nbd+unix://?socket=%s", path); +    } else if (!path && export && port) { +        snprintf(bs->exact_filename, sizeof(bs->exact_filename), +                 "nbd://%s:%s/%s", host, port, export); +    } else if (!path && export && !port) { +        snprintf(bs->exact_filename, sizeof(bs->exact_filename), +                 "nbd://%s/%s", host, export); +    } else if (!path && !export && port) { +        snprintf(bs->exact_filename, sizeof(bs->exact_filename), +                 "nbd://%s:%s", host, port); +    } else if (!path && !export && !port) { +        snprintf(bs->exact_filename, sizeof(bs->exact_filename), +                 "nbd://%s", host); +    } + +    if (path) { +        qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(path))); +    } else if (port) { +        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); +        
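+        /* Illustrative round trip (example values): host "example.org" with
+         * port "10809" reproduces the "nbd://example.org:10809[/export]"
+         * form written to bs->exact_filename above. */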
qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(port))); +    } else { +        qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); +    } +    if (export) { +        qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export))); +    } + +    bs->full_open_options = opts; +} + +static BlockDriver bdrv_nbd = { +    .format_name                = "nbd", +    .protocol_name              = "nbd", +    .instance_size              = sizeof(BDRVNBDState), +    .bdrv_parse_filename        = nbd_parse_filename, +    .bdrv_file_open             = nbd_open, +    .bdrv_co_readv              = nbd_co_readv, +    .bdrv_co_writev             = nbd_co_writev, +    .bdrv_close                 = nbd_close, +    .bdrv_co_flush_to_os        = nbd_co_flush, +    .bdrv_co_discard            = nbd_co_discard, +    .bdrv_refresh_limits        = nbd_refresh_limits, +    .bdrv_getlength             = nbd_getlength, +    .bdrv_detach_aio_context    = nbd_detach_aio_context, +    .bdrv_attach_aio_context    = nbd_attach_aio_context, +    .bdrv_refresh_filename      = nbd_refresh_filename, +}; + +static BlockDriver bdrv_nbd_tcp = { +    .format_name                = "nbd", +    .protocol_name              = "nbd+tcp", +    .instance_size              = sizeof(BDRVNBDState), +    .bdrv_parse_filename        = nbd_parse_filename, +    .bdrv_file_open             = nbd_open, +    .bdrv_co_readv              = nbd_co_readv, +    .bdrv_co_writev             = nbd_co_writev, +    .bdrv_close                 = nbd_close, +    .bdrv_co_flush_to_os        = nbd_co_flush, +    .bdrv_co_discard            = nbd_co_discard, +    .bdrv_refresh_limits        = nbd_refresh_limits, +    .bdrv_getlength             = nbd_getlength, +    .bdrv_detach_aio_context    = nbd_detach_aio_context, +    .bdrv_attach_aio_context    = nbd_attach_aio_context, +    .bdrv_refresh_filename      = nbd_refresh_filename, +}; + +static BlockDriver bdrv_nbd_unix = { +    .format_name                = "nbd", +    .protocol_name              = "nbd+unix", +    .instance_size              = sizeof(BDRVNBDState), +    .bdrv_parse_filename        = nbd_parse_filename, +    .bdrv_file_open             = nbd_open, +    .bdrv_co_readv              = nbd_co_readv, +    .bdrv_co_writev             = nbd_co_writev, +    .bdrv_close                 = nbd_close, +    .bdrv_co_flush_to_os        = nbd_co_flush, +    .bdrv_co_discard            = nbd_co_discard, +    .bdrv_refresh_limits        = nbd_refresh_limits, +    .bdrv_getlength             = nbd_getlength, +    .bdrv_detach_aio_context    = nbd_detach_aio_context, +    .bdrv_attach_aio_context    = nbd_attach_aio_context, +    .bdrv_refresh_filename      = nbd_refresh_filename, +}; + +static void bdrv_nbd_init(void) +{ +    bdrv_register(&bdrv_nbd); +    bdrv_register(&bdrv_nbd_tcp); +    bdrv_register(&bdrv_nbd_unix); +} + +block_init(bdrv_nbd_init); diff --git a/block/nfs.c b/block/nfs.c new file mode 100644 index 00000000..02eb4e46 --- /dev/null +++ b/block/nfs.c @@ -0,0 +1,516 @@ +/* + * QEMU Block driver for native access to files on NFS shares + * + * Copyright (c) 2014 Peter Lieven <pl@kamp.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software 
is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config-host.h" + +#include <poll.h> +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "block/block_int.h" +#include "trace.h" +#include "qemu/iov.h" +#include "qemu/uri.h" +#include "sysemu/sysemu.h" +#include <nfsc/libnfs.h> + +#define QEMU_NFS_MAX_READAHEAD_SIZE 1048576 + +typedef struct NFSClient { +    struct nfs_context *context; +    struct nfsfh *fh; +    int events; +    bool has_zero_init; +    AioContext *aio_context; +} NFSClient; + +typedef struct NFSRPC { +    int ret; +    int complete; +    QEMUIOVector *iov; +    struct stat *st; +    Coroutine *co; +    QEMUBH *bh; +    NFSClient *client; +} NFSRPC; + +static void nfs_process_read(void *arg); +static void nfs_process_write(void *arg); + +static void nfs_set_events(NFSClient *client) +{ +    int ev = nfs_which_events(client->context); +    if (ev != client->events) { +        aio_set_fd_handler(client->aio_context, +                           nfs_get_fd(client->context), +                           (ev & POLLIN) ? nfs_process_read : NULL, +                           (ev & POLLOUT) ? 
nfs_process_write : NULL, +                           client); + +    } +    client->events = ev; +} + +static void nfs_process_read(void *arg) +{ +    NFSClient *client = arg; +    nfs_service(client->context, POLLIN); +    nfs_set_events(client); +} + +static void nfs_process_write(void *arg) +{ +    NFSClient *client = arg; +    nfs_service(client->context, POLLOUT); +    nfs_set_events(client); +} + +static void nfs_co_init_task(NFSClient *client, NFSRPC *task) +{ +    *task = (NFSRPC) { +        .co             = qemu_coroutine_self(), +        .client         = client, +    }; +} + +static void nfs_co_generic_bh_cb(void *opaque) +{ +    NFSRPC *task = opaque; +    task->complete = 1; +    qemu_bh_delete(task->bh); +    qemu_coroutine_enter(task->co, NULL); +} + +static void +nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, +                  void *private_data) +{ +    NFSRPC *task = private_data; +    task->ret = ret; +    if (task->ret > 0 && task->iov) { +        if (task->ret <= task->iov->size) { +            qemu_iovec_from_buf(task->iov, 0, data, task->ret); +        } else { +            task->ret = -EIO; +        } +    } +    if (task->ret == 0 && task->st) { +        memcpy(task->st, data, sizeof(struct stat)); +    } +    if (task->ret < 0) { +        error_report("NFS Error: %s", nfs_get_error(nfs)); +    } +    if (task->co) { +        task->bh = aio_bh_new(task->client->aio_context, +                              nfs_co_generic_bh_cb, task); +        qemu_bh_schedule(task->bh); +    } else { +        task->complete = 1; +    } +} + +static int coroutine_fn nfs_co_readv(BlockDriverState *bs, +                                     int64_t sector_num, int nb_sectors, +                                     QEMUIOVector *iov) +{ +    NFSClient *client = bs->opaque; +    NFSRPC task; + +    nfs_co_init_task(client, &task); +    task.iov = iov; + +    if (nfs_pread_async(client->context, client->fh, +                        sector_num * BDRV_SECTOR_SIZE, +                        nb_sectors * BDRV_SECTOR_SIZE, +                        nfs_co_generic_cb, &task) != 0) { +        return -ENOMEM; +    } + +    while (!task.complete) { +        nfs_set_events(client); +        qemu_coroutine_yield(); +    } + +    if (task.ret < 0) { +        return task.ret; +    } + +    /* zero pad short reads */ +    if (task.ret < iov->size) { +        qemu_iovec_memset(iov, task.ret, 0, iov->size - task.ret); +    } + +    return 0; +} + +static int coroutine_fn nfs_co_writev(BlockDriverState *bs, +                                        int64_t sector_num, int nb_sectors, +                                        QEMUIOVector *iov) +{ +    NFSClient *client = bs->opaque; +    NFSRPC task; +    char *buf = NULL; + +    nfs_co_init_task(client, &task); + +    buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE); +    if (nb_sectors && buf == NULL) { +        return -ENOMEM; +    } + +    qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE); + +    if (nfs_pwrite_async(client->context, client->fh, +                         sector_num * BDRV_SECTOR_SIZE, +                         nb_sectors * BDRV_SECTOR_SIZE, +                         buf, nfs_co_generic_cb, &task) != 0) { +        g_free(buf); +        return -ENOMEM; +    } + +    while (!task.complete) { +        nfs_set_events(client); +        qemu_coroutine_yield(); +    } + +    g_free(buf); + +    if (task.ret != nb_sectors * BDRV_SECTOR_SIZE) { +        return task.ret < 0 ? 
task.ret : -EIO; +    } + +    return 0; +} + +static int coroutine_fn nfs_co_flush(BlockDriverState *bs) +{ +    NFSClient *client = bs->opaque; +    NFSRPC task; + +    nfs_co_init_task(client, &task); + +    if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb, +                        &task) != 0) { +        return -ENOMEM; +    } + +    while (!task.complete) { +        nfs_set_events(client); +        qemu_coroutine_yield(); +    } + +    return task.ret; +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { +    .name = "nfs", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "filename", +            .type = QEMU_OPT_STRING, +            .help = "URL to the NFS file", +        }, +        { /* end of list */ } +    }, +}; + +static void nfs_detach_aio_context(BlockDriverState *bs) +{ +    NFSClient *client = bs->opaque; + +    aio_set_fd_handler(client->aio_context, +                       nfs_get_fd(client->context), +                       NULL, NULL, NULL); +    client->events = 0; +} + +static void nfs_attach_aio_context(BlockDriverState *bs, +                                   AioContext *new_context) +{ +    NFSClient *client = bs->opaque; + +    client->aio_context = new_context; +    nfs_set_events(client); +} + +static void nfs_client_close(NFSClient *client) +{ +    if (client->context) { +        if (client->fh) { +            nfs_close(client->context, client->fh); +        } +        aio_set_fd_handler(client->aio_context, +                           nfs_get_fd(client->context), +                           NULL, NULL, NULL); +        nfs_destroy_context(client->context); +    } +    memset(client, 0, sizeof(NFSClient)); +} + +static void nfs_file_close(BlockDriverState *bs) +{ +    NFSClient *client = bs->opaque; +    nfs_client_close(client); +} + +static int64_t nfs_client_open(NFSClient *client, const char *filename, +                               int flags, Error **errp) +{ +    int ret = -EINVAL, i; +    struct stat st; +    URI *uri; +    QueryParams *qp = NULL; +    char *file = NULL, *strp = NULL; + +    uri = uri_parse(filename); +    if (!uri) { +        error_setg(errp, "Invalid URL specified"); +        goto fail; +    } +    if (!uri->server) { +        error_setg(errp, "Invalid URL specified"); +        goto fail; +    } +    strp = strrchr(uri->path, '/'); +    if (strp == NULL) { +        error_setg(errp, "Invalid URL specified"); +        goto fail; +    } +    file = g_strdup(strp); +    *strp = 0; + +    client->context = nfs_init_context(); +    if (client->context == NULL) { +        error_setg(errp, "Failed to init NFS context"); +        goto fail; +    } + +    qp = query_params_parse(uri->query); +    for (i = 0; i < qp->n; i++) { +        unsigned long long val; +        if (!qp->p[i].value) { +            error_setg(errp, "Value for NFS parameter expected: %s", +                       qp->p[i].name); +            goto fail; +        } +        if (parse_uint_full(qp->p[i].value, &val, 0)) { +            error_setg(errp, "Illegal value for NFS parameter: %s", +                       qp->p[i].name); +            goto fail; +        } +        if (!strcmp(qp->p[i].name, "uid")) { +            nfs_set_uid(client->context, val); +        } else if (!strcmp(qp->p[i].name, "gid")) { +            nfs_set_gid(client->context, val); +        } else if (!strcmp(qp->p[i].name, "tcp-syncnt")) { +            nfs_set_tcp_syncnt(client->context, val); +#ifdef 
LIBNFS_FEATURE_READAHEAD +        } else if (!strcmp(qp->p[i].name, "readahead")) { +            if (val > QEMU_NFS_MAX_READAHEAD_SIZE) { +                error_report("NFS Warning: Truncating NFS readahead" +                             " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE); +                val = QEMU_NFS_MAX_READAHEAD_SIZE; +            } +            nfs_set_readahead(client->context, val); +#endif +        } else { +            error_setg(errp, "Unknown NFS parameter name: %s", +                       qp->p[i].name); +            goto fail; +        } +    } + +    ret = nfs_mount(client->context, uri->server, uri->path); +    if (ret < 0) { +        error_setg(errp, "Failed to mount nfs share: %s", +                   nfs_get_error(client->context)); +        goto fail; +    } + +    if (flags & O_CREAT) { +        ret = nfs_creat(client->context, file, 0600, &client->fh); +        if (ret < 0) { +            error_setg(errp, "Failed to create file: %s", +                       nfs_get_error(client->context)); +            goto fail; +        } +    } else { +        ret = nfs_open(client->context, file, flags, &client->fh); +        if (ret < 0) { +            error_setg(errp, "Failed to open file : %s", +                       nfs_get_error(client->context)); +            goto fail; +        } +    } + +    ret = nfs_fstat(client->context, client->fh, &st); +    if (ret < 0) { +        error_setg(errp, "Failed to fstat file: %s", +                   nfs_get_error(client->context)); +        goto fail; +    } + +    ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE); +    client->has_zero_init = S_ISREG(st.st_mode); +    goto out; +fail: +    nfs_client_close(client); +out: +    if (qp) { +        query_params_free(qp); +    } +    uri_free(uri); +    g_free(file); +    return ret; +} + +static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, +                         Error **errp) { +    NFSClient *client = bs->opaque; +    int64_t ret; +    QemuOpts *opts; +    Error *local_err = NULL; + +    client->aio_context = bdrv_get_aio_context(bs); + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto out; +    } +    ret = nfs_client_open(client, qemu_opt_get(opts, "filename"), +                          (flags & BDRV_O_RDWR) ? 
O_RDWR : O_RDONLY, +                          errp); +    if (ret < 0) { +        goto out; +    } +    bs->total_sectors = ret; +    ret = 0; +out: +    qemu_opts_del(opts); +    return ret; +} + +static QemuOptsList nfs_create_opts = { +    .name = "nfs-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(nfs_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { /* end of list */ } +    } +}; + +static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp) +{ +    int ret = 0; +    int64_t total_size = 0; +    NFSClient *client = g_new0(NFSClient, 1); + +    client->aio_context = qemu_get_aio_context(); + +    /* Read out options */ +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); + +    ret = nfs_client_open(client, url, O_CREAT, errp); +    if (ret < 0) { +        goto out; +    } +    ret = nfs_ftruncate(client->context, client->fh, total_size); +    nfs_client_close(client); +out: +    g_free(client); +    return ret; +} + +static int nfs_has_zero_init(BlockDriverState *bs) +{ +    NFSClient *client = bs->opaque; +    return client->has_zero_init; +} + +static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) +{ +    NFSClient *client = bs->opaque; +    NFSRPC task = {0}; +    struct stat st; + +    task.st = &st; +    if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb, +                        &task) != 0) { +        return -ENOMEM; +    } + +    while (!task.complete) { +        nfs_set_events(client); +        aio_poll(client->aio_context, true); +    } + +    return (task.ret < 0 ? task.ret : st.st_blocks * 512); +} + +static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) +{ +    NFSClient *client = bs->opaque; +    return nfs_ftruncate(client->context, client->fh, offset); +} + +static BlockDriver bdrv_nfs = { +    .format_name                    = "nfs", +    .protocol_name                  = "nfs", + +    .instance_size                  = sizeof(NFSClient), +    .bdrv_needs_filename            = true, +    .create_opts                    = &nfs_create_opts, + +    .bdrv_has_zero_init             = nfs_has_zero_init, +    .bdrv_get_allocated_file_size   = nfs_get_allocated_file_size, +    .bdrv_truncate                  = nfs_file_truncate, + +    .bdrv_file_open                 = nfs_file_open, +    .bdrv_close                     = nfs_file_close, +    .bdrv_create                    = nfs_file_create, + +    .bdrv_co_readv                  = nfs_co_readv, +    .bdrv_co_writev                 = nfs_co_writev, +    .bdrv_co_flush_to_disk          = nfs_co_flush, + +    .bdrv_detach_aio_context        = nfs_detach_aio_context, +    .bdrv_attach_aio_context        = nfs_attach_aio_context, +}; + +static void nfs_block_init(void) +{ +    bdrv_register(&bdrv_nfs); +} + +block_init(nfs_block_init); diff --git a/block/null.c b/block/null.c new file mode 100644 index 00000000..7d083233 --- /dev/null +++ b/block/null.c @@ -0,0 +1,222 @@ +/* + * Null block driver + * + * Authors: + *  Fam Zheng <famz@redhat.com> + * + * Copyright (C) 2014 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
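+ *
+ * Illustrative usage (example command-line values): this file registers the
+ * "null-co" and "null-aio" protocol drivers defined below, e.g.
+ *
+ *   -drive if=none,driver=null-co,size=1G,latency-ns=100000
+ *
+ * where "size" and "latency-ns" are the runtime options declared in this
+ * file.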
+ */ + +#include "block/block_int.h" + +#define NULL_OPT_LATENCY "latency-ns" + +typedef struct { +    int64_t length; +    int64_t latency_ns; +} BDRVNullState; + +static QemuOptsList runtime_opts = { +    .name = "null", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "filename", +            .type = QEMU_OPT_STRING, +            .help = "", +        }, +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "size of the null block", +        }, +        { +            .name = NULL_OPT_LATENCY, +            .type = QEMU_OPT_NUMBER, +            .help = "nanoseconds (approximated) to wait " +                    "before completing request", +        }, +        { /* end of list */ } +    }, +}; + +static int null_file_open(BlockDriverState *bs, QDict *options, int flags, +                          Error **errp) +{ +    QemuOpts *opts; +    BDRVNullState *s = bs->opaque; +    int ret = 0; + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &error_abort); +    s->length = +        qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); +    s->latency_ns = +        qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0); +    if (s->latency_ns < 0) { +        error_setg(errp, "latency-ns is invalid"); +        ret = -EINVAL; +    } +    qemu_opts_del(opts); +    return ret; +} + +static void null_close(BlockDriverState *bs) +{ +} + +static int64_t null_getlength(BlockDriverState *bs) +{ +    BDRVNullState *s = bs->opaque; +    return s->length; +} + +static coroutine_fn int null_co_common(BlockDriverState *bs) +{ +    BDRVNullState *s = bs->opaque; + +    if (s->latency_ns) { +        co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME, +                        s->latency_ns); +    } +    return 0; +} + +static coroutine_fn int null_co_readv(BlockDriverState *bs, +                                      int64_t sector_num, int nb_sectors, +                                      QEMUIOVector *qiov) +{ +    return null_co_common(bs); +} + +static coroutine_fn int null_co_writev(BlockDriverState *bs, +                                       int64_t sector_num, int nb_sectors, +                                       QEMUIOVector *qiov) +{ +    return null_co_common(bs); +} + +static coroutine_fn int null_co_flush(BlockDriverState *bs) +{ +    return null_co_common(bs); +} + +typedef struct { +    BlockAIOCB common; +    QEMUBH *bh; +    QEMUTimer timer; +} NullAIOCB; + +static const AIOCBInfo null_aiocb_info = { +    .aiocb_size = sizeof(NullAIOCB), +}; + +static void null_bh_cb(void *opaque) +{ +    NullAIOCB *acb = opaque; +    acb->common.cb(acb->common.opaque, 0); +    qemu_bh_delete(acb->bh); +    qemu_aio_unref(acb); +} + +static void null_timer_cb(void *opaque) +{ +    NullAIOCB *acb = opaque; +    acb->common.cb(acb->common.opaque, 0); +    timer_deinit(&acb->timer); +    qemu_aio_unref(acb); +} + +static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, +                                          BlockCompletionFunc *cb, +                                          void *opaque) +{ +    NullAIOCB *acb; +    BDRVNullState *s = bs->opaque; + +    acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); +    /* Only emulate latency after vcpu is running. 
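+     * With a non-zero latency_ns the request is completed from a
+     * QEMU_CLOCK_REALTIME timer armed just below; otherwise a bottom half
+     * completes it on the next iteration of the event loop.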
*/ +    if (s->latency_ns) { +        aio_timer_init(bdrv_get_aio_context(bs), &acb->timer, +                       QEMU_CLOCK_REALTIME, SCALE_NS, +                       null_timer_cb, acb); +        timer_mod_ns(&acb->timer, +                     qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns); +    } else { +        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); +        qemu_bh_schedule(acb->bh); +    } +    return &acb->common; +} + +static BlockAIOCB *null_aio_readv(BlockDriverState *bs, +                                  int64_t sector_num, QEMUIOVector *qiov, +                                  int nb_sectors, +                                  BlockCompletionFunc *cb, +                                  void *opaque) +{ +    return null_aio_common(bs, cb, opaque); +} + +static BlockAIOCB *null_aio_writev(BlockDriverState *bs, +                                   int64_t sector_num, QEMUIOVector *qiov, +                                   int nb_sectors, +                                   BlockCompletionFunc *cb, +                                   void *opaque) +{ +    return null_aio_common(bs, cb, opaque); +} + +static BlockAIOCB *null_aio_flush(BlockDriverState *bs, +                                  BlockCompletionFunc *cb, +                                  void *opaque) +{ +    return null_aio_common(bs, cb, opaque); +} + +static int null_reopen_prepare(BDRVReopenState *reopen_state, +                               BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +static BlockDriver bdrv_null_co = { +    .format_name            = "null-co", +    .protocol_name          = "null-co", +    .instance_size          = sizeof(BDRVNullState), + +    .bdrv_file_open         = null_file_open, +    .bdrv_close             = null_close, +    .bdrv_getlength         = null_getlength, + +    .bdrv_co_readv          = null_co_readv, +    .bdrv_co_writev         = null_co_writev, +    .bdrv_co_flush_to_disk  = null_co_flush, +    .bdrv_reopen_prepare    = null_reopen_prepare, +}; + +static BlockDriver bdrv_null_aio = { +    .format_name            = "null-aio", +    .protocol_name          = "null-aio", +    .instance_size          = sizeof(BDRVNullState), + +    .bdrv_file_open         = null_file_open, +    .bdrv_close             = null_close, +    .bdrv_getlength         = null_getlength, + +    .bdrv_aio_readv         = null_aio_readv, +    .bdrv_aio_writev        = null_aio_writev, +    .bdrv_aio_flush         = null_aio_flush, +    .bdrv_reopen_prepare    = null_reopen_prepare, +}; + +static void bdrv_null_init(void) +{ +    bdrv_register(&bdrv_null_co); +    bdrv_register(&bdrv_null_aio); +} + +block_init(bdrv_null_init); diff --git a/block/parallels.c b/block/parallels.c new file mode 100644 index 00000000..046b5684 --- /dev/null +++ b/block/parallels.c @@ -0,0 +1,758 @@ +/* + * Block driver for Parallels disk image format + * + * Copyright (c) 2007 Alex Beregszaszi + * Copyright (c) 2015 Denis V. Lunev <den@openvz.org> + * + * This code was originally based on comparing different disk images created + * by Parallels. 
Currently it is based on opened OpenVZ sources + * available at + *     http://git.openvz.org/?p=ploop;a=summary + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qemu/bitmap.h" +#include "qapi/util.h" + +/**************************************************************/ + +#define HEADER_MAGIC "WithoutFreeSpace" +#define HEADER_MAGIC2 "WithouFreSpacExt" +#define HEADER_VERSION 2 +#define HEADER_INUSE_MAGIC  (0x746F6E59) + +#define DEFAULT_CLUSTER_SIZE 1048576        /* 1 MiB */ + + +// always little-endian +typedef struct ParallelsHeader { +    char magic[16]; // "WithoutFreeSpace" +    uint32_t version; +    uint32_t heads; +    uint32_t cylinders; +    uint32_t tracks; +    uint32_t bat_entries; +    uint64_t nb_sectors; +    uint32_t inuse; +    uint32_t data_off; +    char padding[12]; +} QEMU_PACKED ParallelsHeader; + + +typedef enum ParallelsPreallocMode { +    PRL_PREALLOC_MODE_FALLOCATE = 0, +    PRL_PREALLOC_MODE_TRUNCATE = 1, +    PRL_PREALLOC_MODE_MAX = 2, +} ParallelsPreallocMode; + +static const char *prealloc_mode_lookup[] = { +    "falloc", +    "truncate", +    NULL, +}; + + +typedef struct BDRVParallelsState { +    /** Locking is conservative, the lock protects +     *   - image file extending (truncate, fallocate) +     *   - any access to block allocation table +     */ +    CoMutex lock; + +    ParallelsHeader *header; +    uint32_t header_size; +    bool header_unclean; + +    unsigned long *bat_dirty_bmap; +    unsigned int  bat_dirty_block; + +    uint32_t *bat_bitmap; +    unsigned int bat_size; + +    int64_t  data_end; +    uint64_t prealloc_size; +    ParallelsPreallocMode prealloc_mode; + +    unsigned int tracks; + +    unsigned int off_multiplier; +} BDRVParallelsState; + + +#define PARALLELS_OPT_PREALLOC_MODE     "prealloc-mode" +#define PARALLELS_OPT_PREALLOC_SIZE     "prealloc-size" + +static QemuOptsList parallels_runtime_opts = { +    .name = "parallels", +    .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head), +    .desc = { +        { +            .name = PARALLELS_OPT_PREALLOC_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Preallocation size on image expansion", +            .def_value_str = "128MiB", +        }, +        { +            .name = PARALLELS_OPT_PREALLOC_MODE, +            .type = QEMU_OPT_STRING, +            .help = "Preallocation mode on image expansion " +                    "(allowed 
values: falloc, truncate)", +            .def_value_str = "falloc", +        }, +        { /* end of list */ }, +    }, +}; + + +static int64_t bat2sect(BDRVParallelsState *s, uint32_t idx) +{ +    return (uint64_t)le32_to_cpu(s->bat_bitmap[idx]) * s->off_multiplier; +} + +static uint32_t bat_entry_off(uint32_t idx) +{ +    return sizeof(ParallelsHeader) + sizeof(uint32_t) * idx; +} + +static int64_t seek_to_sector(BDRVParallelsState *s, int64_t sector_num) +{ +    uint32_t index, offset; + +    index = sector_num / s->tracks; +    offset = sector_num % s->tracks; + +    /* not allocated */ +    if ((index >= s->bat_size) || (s->bat_bitmap[index] == 0)) { +        return -1; +    } +    return bat2sect(s, index) + offset; +} + +static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num, +        int nb_sectors) +{ +    int ret = s->tracks - sector_num % s->tracks; +    return MIN(nb_sectors, ret); +} + +static int64_t block_status(BDRVParallelsState *s, int64_t sector_num, +                            int nb_sectors, int *pnum) +{ +    int64_t start_off = -2, prev_end_off = -2; + +    *pnum = 0; +    while (nb_sectors > 0 || start_off == -2) { +        int64_t offset = seek_to_sector(s, sector_num); +        int to_end; + +        if (start_off == -2) { +            start_off = offset; +            prev_end_off = offset; +        } else if (offset != prev_end_off) { +            break; +        } + +        to_end = cluster_remainder(s, sector_num, nb_sectors); +        nb_sectors -= to_end; +        sector_num += to_end; +        *pnum += to_end; + +        if (offset > 0) { +            prev_end_off += to_end; +        } +    } +    return start_off; +} + +static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, +                                 int nb_sectors, int *pnum) +{ +    BDRVParallelsState *s = bs->opaque; +    uint32_t idx, to_allocate, i; +    int64_t pos, space; + +    pos = block_status(s, sector_num, nb_sectors, pnum); +    if (pos > 0) { +        return pos; +    } + +    idx = sector_num / s->tracks; +    if (idx >= s->bat_size) { +        return -EINVAL; +    } + +    to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx; +    space = to_allocate * s->tracks; +    if (s->data_end + space > bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS) { +        int ret; +        space += s->prealloc_size; +        if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { +            ret = bdrv_write_zeroes(bs->file, s->data_end, space, 0); +        } else { +            ret = bdrv_truncate(bs->file, +                                (s->data_end + space) << BDRV_SECTOR_BITS); +        } +        if (ret < 0) { +            return ret; +        } +    } + +    for (i = 0; i < to_allocate; i++) { +        s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier); +        s->data_end += s->tracks; +        bitmap_set(s->bat_dirty_bmap, +                   bat_entry_off(idx) / s->bat_dirty_block, 1); +    } + +    return bat2sect(s, idx) + sector_num % s->tracks; +} + + +static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) +{ +    BDRVParallelsState *s = bs->opaque; +    unsigned long size = DIV_ROUND_UP(s->header_size, s->bat_dirty_block); +    unsigned long bit; + +    qemu_co_mutex_lock(&s->lock); + +    bit = find_first_bit(s->bat_dirty_bmap, size); +    while (bit < size) { +        uint32_t off = bit * s->bat_dirty_block; +        uint32_t to_write = s->bat_dirty_block; +        int ret; + +        if (off + 
to_write > s->header_size) { +            to_write = s->header_size - off; +        } +        ret = bdrv_pwrite(bs->file, off, (uint8_t *)s->header + off, to_write); +        if (ret < 0) { +            qemu_co_mutex_unlock(&s->lock); +            return ret; +        } +        bit = find_next_bit(s->bat_dirty_bmap, size, bit + 1); +    } +    bitmap_zero(s->bat_dirty_bmap, size); + +    qemu_co_mutex_unlock(&s->lock); +    return 0; +} + + +static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, int *pnum) +{ +    BDRVParallelsState *s = bs->opaque; +    int64_t offset; + +    qemu_co_mutex_lock(&s->lock); +    offset = block_status(s, sector_num, nb_sectors, pnum); +    qemu_co_mutex_unlock(&s->lock); + +    if (offset < 0) { +        return 0; +    } + +    return (offset << BDRV_SECTOR_BITS) | +        BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; +} + +static coroutine_fn int parallels_co_writev(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVParallelsState *s = bs->opaque; +    uint64_t bytes_done = 0; +    QEMUIOVector hd_qiov; +    int ret = 0; + +    qemu_iovec_init(&hd_qiov, qiov->niov); + +    while (nb_sectors > 0) { +        int64_t position; +        int n, nbytes; + +        qemu_co_mutex_lock(&s->lock); +        position = allocate_clusters(bs, sector_num, nb_sectors, &n); +        qemu_co_mutex_unlock(&s->lock); +        if (position < 0) { +            ret = (int)position; +            break; +        } + +        nbytes = n << BDRV_SECTOR_BITS; + +        qemu_iovec_reset(&hd_qiov); +        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); + +        ret = bdrv_co_writev(bs->file, position, n, &hd_qiov); +        if (ret < 0) { +            break; +        } + +        nb_sectors -= n; +        sector_num += n; +        bytes_done += nbytes; +    } + +    qemu_iovec_destroy(&hd_qiov); +    return ret; +} + +static coroutine_fn int parallels_co_readv(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVParallelsState *s = bs->opaque; +    uint64_t bytes_done = 0; +    QEMUIOVector hd_qiov; +    int ret = 0; + +    qemu_iovec_init(&hd_qiov, qiov->niov); + +    while (nb_sectors > 0) { +        int64_t position; +        int n, nbytes; + +        qemu_co_mutex_lock(&s->lock); +        position = block_status(s, sector_num, nb_sectors, &n); +        qemu_co_mutex_unlock(&s->lock); + +        nbytes = n << BDRV_SECTOR_BITS; + +        if (position < 0) { +            qemu_iovec_memset(qiov, bytes_done, 0, nbytes); +        } else { +            qemu_iovec_reset(&hd_qiov); +            qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); + +            ret = bdrv_co_readv(bs->file, position, n, &hd_qiov); +            if (ret < 0) { +                break; +            } +        } + +        nb_sectors -= n; +        sector_num += n; +        bytes_done += nbytes; +    } + +    qemu_iovec_destroy(&hd_qiov); +    return ret; +} + + +static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, +                           BdrvCheckMode fix) +{ +    BDRVParallelsState *s = bs->opaque; +    int64_t size, prev_off, high_off; +    int ret; +    uint32_t i; +    bool flush_bat = false; +    int cluster_size = s->tracks << BDRV_SECTOR_BITS; + +    size = bdrv_getlength(bs->file); +    if (size < 0) { +        res->check_errors++; +        return size; +    } + +    if (s->header_unclean) { +        
fprintf(stderr, "%s image was not closed correctly\n", +                fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR"); +        res->corruptions++; +        if (fix & BDRV_FIX_ERRORS) { +            /* parallels_close will do the job right */ +            res->corruptions_fixed++; +            s->header_unclean = false; +        } +    } + +    res->bfi.total_clusters = s->bat_size; +    res->bfi.compressed_clusters = 0; /* compression is not supported */ + +    high_off = 0; +    prev_off = 0; +    for (i = 0; i < s->bat_size; i++) { +        int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS; +        if (off == 0) { +            prev_off = 0; +            continue; +        } + +        /* cluster outside the image */ +        if (off > size) { +            fprintf(stderr, "%s cluster %u is outside image\n", +                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); +            res->corruptions++; +            if (fix & BDRV_FIX_ERRORS) { +                prev_off = 0; +                s->bat_bitmap[i] = 0; +                res->corruptions_fixed++; +                flush_bat = true; +                continue; +            } +        } + +        res->bfi.allocated_clusters++; +        if (off > high_off) { +            high_off = off; +        } + +        if (prev_off != 0 && (prev_off + cluster_size) != off) { +            res->bfi.fragmented_clusters++; +        } +        prev_off = off; +    } + +    if (flush_bat) { +        ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size); +        if (ret < 0) { +            res->check_errors++; +            return ret; +        } +    } + +    res->image_end_offset = high_off + cluster_size; +    if (size > res->image_end_offset) { +        int64_t count; +        count = DIV_ROUND_UP(size - res->image_end_offset, cluster_size); +        fprintf(stderr, "%s space leaked at the end of the image %" PRId64 "\n", +                fix & BDRV_FIX_LEAKS ? 
"Repairing" : "ERROR", +                size - res->image_end_offset); +        res->leaks += count; +        if (fix & BDRV_FIX_LEAKS) { +            ret = bdrv_truncate(bs->file, res->image_end_offset); +            if (ret < 0) { +                res->check_errors++; +                return ret; +            } +            res->leaks_fixed += count; +        } +    } + +    return 0; +} + + +static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int64_t total_size, cl_size; +    uint8_t tmp[BDRV_SECTOR_SIZE]; +    Error *local_err = NULL; +    BlockDriverState *file; +    uint32_t bat_entries, bat_sectors; +    ParallelsHeader header; +    int ret; + +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); +    cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, +                          DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE); + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        return ret; +    } + +    file = NULL; +    ret = bdrv_open(&file, filename, NULL, NULL, +                    BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        return ret; +    } +    ret = bdrv_truncate(file, 0); +    if (ret < 0) { +        goto exit; +    } + +    bat_entries = DIV_ROUND_UP(total_size, cl_size); +    bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size); +    bat_sectors = (bat_sectors *  cl_size) >> BDRV_SECTOR_BITS; + +    memset(&header, 0, sizeof(header)); +    memcpy(header.magic, HEADER_MAGIC2, sizeof(header.magic)); +    header.version = cpu_to_le32(HEADER_VERSION); +    /* don't care much about geometry, it is not used on image level */ +    header.heads = cpu_to_le32(16); +    header.cylinders = cpu_to_le32(total_size / BDRV_SECTOR_SIZE / 16 / 32); +    header.tracks = cpu_to_le32(cl_size >> BDRV_SECTOR_BITS); +    header.bat_entries = cpu_to_le32(bat_entries); +    header.nb_sectors = cpu_to_le64(DIV_ROUND_UP(total_size, BDRV_SECTOR_SIZE)); +    header.data_off = cpu_to_le32(bat_sectors); + +    /* write all the data */ +    memset(tmp, 0, sizeof(tmp)); +    memcpy(tmp, &header, sizeof(header)); + +    ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); +    if (ret < 0) { +        goto exit; +    } +    ret = bdrv_write_zeroes(file, 1, bat_sectors - 1, 0); +    if (ret < 0) { +        goto exit; +    } +    ret = 0; + +done: +    bdrv_unref(file); +    return ret; + +exit: +    error_setg_errno(errp, -ret, "Failed to create Parallels image"); +    goto done; +} + + +static int parallels_probe(const uint8_t *buf, int buf_size, +                           const char *filename) +{ +    const ParallelsHeader *ph = (const void *)buf; + +    if (buf_size < sizeof(ParallelsHeader)) { +        return 0; +    } + +    if ((!memcmp(ph->magic, HEADER_MAGIC, 16) || +           !memcmp(ph->magic, HEADER_MAGIC2, 16)) && +           (le32_to_cpu(ph->version) == HEADER_VERSION)) { +        return 100; +    } + +    return 0; +} + +static int parallels_update_header(BlockDriverState *bs) +{ +    BDRVParallelsState *s = bs->opaque; +    unsigned size = MAX(bdrv_opt_mem_align(bs->file), sizeof(ParallelsHeader)); + +    if (size > s->header_size) { +        size = s->header_size; +    } +    return bdrv_pwrite_sync(bs->file, 0, s->header, size); +} + +static int parallels_open(BlockDriverState *bs, QDict *options, int flags, +   
                       Error **errp) +{ +    BDRVParallelsState *s = bs->opaque; +    ParallelsHeader ph; +    int ret, size, i; +    QemuOpts *opts = NULL; +    Error *local_err = NULL; +    char *buf; + +    ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph)); +    if (ret < 0) { +        goto fail; +    } + +    bs->total_sectors = le64_to_cpu(ph.nb_sectors); + +    if (le32_to_cpu(ph.version) != HEADER_VERSION) { +        goto fail_format; +    } +    if (!memcmp(ph.magic, HEADER_MAGIC, 16)) { +        s->off_multiplier = 1; +        bs->total_sectors = 0xffffffff & bs->total_sectors; +    } else if (!memcmp(ph.magic, HEADER_MAGIC2, 16)) { +        s->off_multiplier = le32_to_cpu(ph.tracks); +    } else { +        goto fail_format; +    } + +    s->tracks = le32_to_cpu(ph.tracks); +    if (s->tracks == 0) { +        error_setg(errp, "Invalid image: Zero sectors per track"); +        ret = -EINVAL; +        goto fail; +    } +    if (s->tracks > INT32_MAX/513) { +        error_setg(errp, "Invalid image: Too big cluster"); +        ret = -EFBIG; +        goto fail; +    } + +    s->bat_size = le32_to_cpu(ph.bat_entries); +    if (s->bat_size > INT_MAX / sizeof(uint32_t)) { +        error_setg(errp, "Catalog too large"); +        ret = -EFBIG; +        goto fail; +    } + +    size = bat_entry_off(s->bat_size); +    s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file)); +    s->header = qemu_try_blockalign(bs->file, s->header_size); +    if (s->header == NULL) { +        ret = -ENOMEM; +        goto fail; +    } +    s->data_end = le32_to_cpu(ph.data_off); +    if (s->data_end == 0) { +        s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE); +    } +    if (s->data_end < s->header_size) { +        /* there is not enough unused space to fit to block align between BAT +           and actual data. We can't avoid read-modify-write... */ +        s->header_size = size; +    } + +    ret = bdrv_pread(bs->file, 0, s->header, s->header_size); +    if (ret < 0) { +        goto fail; +    } +    s->bat_bitmap = (uint32_t *)(s->header + 1); + +    for (i = 0; i < s->bat_size; i++) { +        int64_t off = bat2sect(s, i); +        if (off >= s->data_end) { +            s->data_end = off + s->tracks; +        } +    } + +    if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) { +        /* Image was not closed correctly. 
The check is mandatory */
+        s->header_unclean = true;
+        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
+            error_setg(errp, "parallels: Image was not closed correctly; "
+                       "cannot be opened read/write");
+            ret = -EACCES;
+            goto fail;
+        }
+    }
+
+    opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, &local_err);
+    if (local_err != NULL) {
+        goto fail_options;
+    }
+
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err != NULL) {
+        goto fail_options;
+    }
+
+    s->prealloc_size =
+        qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0);
+    s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS);
+    buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE);
+    s->prealloc_mode = qapi_enum_parse(prealloc_mode_lookup, buf,
+            PRL_PREALLOC_MODE_MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err);
+    g_free(buf);
+    if (local_err != NULL) {
+        goto fail_options;
+    }
+    if (!bdrv_has_zero_init(bs->file) ||
+            bdrv_truncate(bs->file, bdrv_getlength(bs->file)) != 0) {
+        s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE;
+    }
+
+    if (flags & BDRV_O_RDWR) {
+        s->header->inuse = cpu_to_le32(HEADER_INUSE_MAGIC);
+        ret = parallels_update_header(bs);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    s->bat_dirty_block = 4 * getpagesize();
+    s->bat_dirty_bmap =
+        bitmap_new(DIV_ROUND_UP(s->header_size, s->bat_dirty_block));
+
+    qemu_co_mutex_init(&s->lock);
+    return 0;
+
+fail_format:
+    error_setg(errp, "Image not in Parallels format");
+    ret = -EINVAL;
+fail:
+    qemu_vfree(s->header);
+    return ret;
+
+fail_options:
+    error_propagate(errp, local_err);
+    ret = -EINVAL;
+    goto fail;
+}
+
+
+static void parallels_close(BlockDriverState *bs)
+{
+    BDRVParallelsState *s = bs->opaque;
+
+    if (bs->open_flags & BDRV_O_RDWR) {
+        s->header->inuse = 0;
+        parallels_update_header(bs);
+    }
+
+    if (bs->open_flags & BDRV_O_RDWR) {
+        bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS);
+    }
+
+    g_free(s->bat_dirty_bmap);
+    qemu_vfree(s->header);
+}
+
+static QemuOptsList parallels_create_opts = {
+    .name = "parallels-create-opts",
+    .head = QTAILQ_HEAD_INITIALIZER(parallels_create_opts.head),
+    .desc = {
+        {
+            .name = BLOCK_OPT_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Virtual disk size",
+        },
+        {
+            .name = BLOCK_OPT_CLUSTER_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Parallels image cluster size",
+            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE),
+        },
+        { /* end of list */ }
+    }
+};
+
+static BlockDriver bdrv_parallels = {
+    .format_name	= "parallels",
+    .instance_size	= sizeof(BDRVParallelsState),
+    .bdrv_probe		= parallels_probe,
+    .bdrv_open		= parallels_open,
+    .bdrv_close		= parallels_close,
+    .bdrv_co_get_block_status = parallels_co_get_block_status,
+    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
+    .bdrv_co_flush_to_os      = parallels_co_flush_to_os,
+    .bdrv_co_readv  = parallels_co_readv,
+    .bdrv_co_writev = parallels_co_writev,
+
+    .bdrv_create    = parallels_create,
+    .bdrv_check     = parallels_check,
+    .create_opts    = &parallels_create_opts,
+};
+
+static void bdrv_parallels_init(void)
+{
+    bdrv_register(&bdrv_parallels);
+}
+
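+/*
+ * Worked example of the mapping done by seek_to_sector()/bat2sect() above
+ * (illustrative numbers): with the default 1 MiB cluster size, tracks is
+ * 2048 sectors, so guest sector 5000 maps to BAT index 5000 / 2048 = 2 and
+ * in-cluster offset 5000 % 2048 = 904; the host sector is then
+ * le32_to_cpu(bat_bitmap[2]) * off_multiplier + 904.
+ */
+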
+block_init(bdrv_parallels_init); diff --git a/block/qapi.c b/block/qapi.c new file mode 100644 index 00000000..2ce50971 --- /dev/null +++ b/block/qapi.c @@ -0,0 +1,666 @@ +/* + * Block layer qmp and info dump related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/qapi.h" +#include "block/block_int.h" +#include "block/throttle-groups.h" +#include "block/write-threshold.h" +#include "qmp-commands.h" +#include "qapi-visit.h" +#include "qapi/qmp-output-visitor.h" +#include "qapi/qmp/types.h" +#include "sysemu/block-backend.h" + +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) +{ +    ImageInfo **p_image_info; +    BlockDriverState *bs0; +    BlockDeviceInfo *info = g_malloc0(sizeof(*info)); + +    info->file                   = g_strdup(bs->filename); +    info->ro                     = bs->read_only; +    info->drv                    = g_strdup(bs->drv->format_name); +    info->encrypted              = bs->encrypted; +    info->encryption_key_missing = bdrv_key_required(bs); + +    info->cache = g_new(BlockdevCacheInfo, 1); +    *info->cache = (BlockdevCacheInfo) { +        .writeback      = bdrv_enable_write_cache(bs), +        .direct         = !!(bs->open_flags & BDRV_O_NOCACHE), +        .no_flush       = !!(bs->open_flags & BDRV_O_NO_FLUSH), +    }; + +    if (bs->node_name[0]) { +        info->has_node_name = true; +        info->node_name = g_strdup(bs->node_name); +    } + +    if (bs->backing_file[0]) { +        info->has_backing_file = true; +        info->backing_file = g_strdup(bs->backing_file); +    } + +    info->backing_file_depth = bdrv_get_backing_file_depth(bs); +    info->detect_zeroes = bs->detect_zeroes; + +    if (bs->io_limits_enabled) { +        ThrottleConfig cfg; + +        throttle_group_get_config(bs, &cfg); + +        info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg; +        info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg; +        info->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg; + +        info->iops    = cfg.buckets[THROTTLE_OPS_TOTAL].avg; +        info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg; +        info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg; + +        info->has_bps_max     = cfg.buckets[THROTTLE_BPS_TOTAL].max; +        info->bps_max         = cfg.buckets[THROTTLE_BPS_TOTAL].max; +        info->has_bps_rd_max  = cfg.buckets[THROTTLE_BPS_READ].max; +        info->bps_rd_max      = 
cfg.buckets[THROTTLE_BPS_READ].max; +        info->has_bps_wr_max  = cfg.buckets[THROTTLE_BPS_WRITE].max; +        info->bps_wr_max      = cfg.buckets[THROTTLE_BPS_WRITE].max; + +        info->has_iops_max    = cfg.buckets[THROTTLE_OPS_TOTAL].max; +        info->iops_max        = cfg.buckets[THROTTLE_OPS_TOTAL].max; +        info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; +        info->iops_rd_max     = cfg.buckets[THROTTLE_OPS_READ].max; +        info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; +        info->iops_wr_max     = cfg.buckets[THROTTLE_OPS_WRITE].max; + +        info->has_iops_size = cfg.op_size; +        info->iops_size = cfg.op_size; + +        info->has_group = true; +        info->group = g_strdup(throttle_group_get_name(bs)); +    } + +    info->write_threshold = bdrv_write_threshold_get(bs); + +    bs0 = bs; +    p_image_info = &info->image; +    while (1) { +        Error *local_err = NULL; +        bdrv_query_image_info(bs0, p_image_info, &local_err); +        if (local_err) { +            error_propagate(errp, local_err); +            qapi_free_BlockDeviceInfo(info); +            return NULL; +        } +        if (bs0->drv && bs0->backing_hd) { +            bs0 = bs0->backing_hd; +            (*p_image_info)->has_backing_image = true; +            p_image_info = &((*p_image_info)->backing_image); +        } else { +            break; +        } +    } + +    return info; +} + +/* + * Returns 0 on success, with *p_list either set to describe snapshot + * information, or NULL because there are no snapshots.  Returns -errno on + * error, with *p_list untouched. + */ +int bdrv_query_snapshot_info_list(BlockDriverState *bs, +                                  SnapshotInfoList **p_list, +                                  Error **errp) +{ +    int i, sn_count; +    QEMUSnapshotInfo *sn_tab = NULL; +    SnapshotInfoList *info_list, *cur_item = NULL, *head = NULL; +    SnapshotInfo *info; + +    sn_count = bdrv_snapshot_list(bs, &sn_tab); +    if (sn_count < 0) { +        const char *dev = bdrv_get_device_name(bs); +        switch (sn_count) { +        case -ENOMEDIUM: +            error_setg(errp, "Device '%s' is not inserted", dev); +            break; +        case -ENOTSUP: +            error_setg(errp, +                       "Device '%s' does not support internal snapshots", +                       dev); +            break; +        default: +            error_setg_errno(errp, -sn_count, +                             "Can't list snapshots of device '%s'", dev); +            break; +        } +        return sn_count; +    } + +    for (i = 0; i < sn_count; i++) { +        info = g_new0(SnapshotInfo, 1); +        info->id            = g_strdup(sn_tab[i].id_str); +        info->name          = g_strdup(sn_tab[i].name); +        info->vm_state_size = sn_tab[i].vm_state_size; +        info->date_sec      = sn_tab[i].date_sec; +        info->date_nsec     = sn_tab[i].date_nsec; +        info->vm_clock_sec  = sn_tab[i].vm_clock_nsec / 1000000000; +        info->vm_clock_nsec = sn_tab[i].vm_clock_nsec % 1000000000; + +        info_list = g_new0(SnapshotInfoList, 1); +        info_list->value = info; + +        /* XXX: waiting for the qapi to support qemu-queue.h types */ +        if (!cur_item) { +            head = cur_item = info_list; +        } else { +            cur_item->next = info_list; +            cur_item = info_list; +        } + +    } + +    g_free(sn_tab); +    *p_list = head; +    return 0; +} + +/** + * bdrv_query_image_info: + * @bs: 
block device to examine + * @p_info: location to store image information + * @errp: location to store error information + * + * Store "flat" image information in @p_info. + * + * "Flat" means it does *not* query backing image information, + * i.e. (*pinfo)->has_backing_image will be set to false and + * (*pinfo)->backing_image to NULL even when the image does in fact have + * a backing image. + * + * @p_info will be set only on success. On error, store error in @errp. + */ +void bdrv_query_image_info(BlockDriverState *bs, +                           ImageInfo **p_info, +                           Error **errp) +{ +    int64_t size; +    const char *backing_filename; +    BlockDriverInfo bdi; +    int ret; +    Error *err = NULL; +    ImageInfo *info; + +    size = bdrv_getlength(bs); +    if (size < 0) { +        error_setg_errno(errp, -size, "Can't get size of device '%s'", +                         bdrv_get_device_name(bs)); +        return; +    } + +    info = g_new0(ImageInfo, 1); +    info->filename        = g_strdup(bs->filename); +    info->format          = g_strdup(bdrv_get_format_name(bs)); +    info->virtual_size    = size; +    info->actual_size     = bdrv_get_allocated_file_size(bs); +    info->has_actual_size = info->actual_size >= 0; +    if (bdrv_is_encrypted(bs)) { +        info->encrypted = true; +        info->has_encrypted = true; +    } +    if (bdrv_get_info(bs, &bdi) >= 0) { +        if (bdi.cluster_size != 0) { +            info->cluster_size = bdi.cluster_size; +            info->has_cluster_size = true; +        } +        info->dirty_flag = bdi.is_dirty; +        info->has_dirty_flag = true; +    } +    info->format_specific     = bdrv_get_specific_info(bs); +    info->has_format_specific = info->format_specific != NULL; + +    backing_filename = bs->backing_file; +    if (backing_filename[0] != '\0') { +        char *backing_filename2 = g_malloc0(PATH_MAX); +        info->backing_filename = g_strdup(backing_filename); +        info->has_backing_filename = true; +        bdrv_get_full_backing_filename(bs, backing_filename2, PATH_MAX, &err); +        if (err) { +            error_propagate(errp, err); +            qapi_free_ImageInfo(info); +            g_free(backing_filename2); +            return; +        } + +        if (strcmp(backing_filename, backing_filename2) != 0) { +            info->full_backing_filename = +                        g_strdup(backing_filename2); +            info->has_full_backing_filename = true; +        } + +        if (bs->backing_format[0]) { +            info->backing_filename_format = g_strdup(bs->backing_format); +            info->has_backing_filename_format = true; +        } +        g_free(backing_filename2); +    } + +    ret = bdrv_query_snapshot_info_list(bs, &info->snapshots, &err); +    switch (ret) { +    case 0: +        if (info->snapshots) { +            info->has_snapshots = true; +        } +        break; +    /* recoverable error */ +    case -ENOMEDIUM: +    case -ENOTSUP: +        error_free(err); +        break; +    default: +        error_propagate(errp, err); +        qapi_free_ImageInfo(info); +        return; +    } + +    *p_info = info; +} + +/* @p_info will be set only on success. 
*/ +static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, +                            Error **errp) +{ +    BlockInfo *info = g_malloc0(sizeof(*info)); +    BlockDriverState *bs = blk_bs(blk); +    info->device = g_strdup(blk_name(blk)); +    info->type = g_strdup("unknown"); +    info->locked = blk_dev_is_medium_locked(blk); +    info->removable = blk_dev_has_removable_media(blk); + +    if (blk_dev_has_removable_media(blk)) { +        info->has_tray_open = true; +        info->tray_open = blk_dev_is_tray_open(blk); +    } + +    if (bdrv_iostatus_is_enabled(bs)) { +        info->has_io_status = true; +        info->io_status = bs->iostatus; +    } + +    if (!QLIST_EMPTY(&bs->dirty_bitmaps)) { +        info->has_dirty_bitmaps = true; +        info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs); +    } + +    if (bs->drv) { +        info->has_inserted = true; +        info->inserted = bdrv_block_device_info(bs, errp); +        if (info->inserted == NULL) { +            goto err; +        } +    } + +    *p_info = info; +    return; + + err: +    qapi_free_BlockInfo(info); +} + +static BlockStats *bdrv_query_stats(const BlockDriverState *bs, +                                    bool query_backing) +{ +    BlockStats *s; + +    s = g_malloc0(sizeof(*s)); + +    if (bdrv_get_device_name(bs)[0]) { +        s->has_device = true; +        s->device = g_strdup(bdrv_get_device_name(bs)); +    } + +    if (bdrv_get_node_name(bs)[0]) { +        s->has_node_name = true; +        s->node_name = g_strdup(bdrv_get_node_name(bs)); +    } + +    s->stats = g_malloc0(sizeof(*s->stats)); +    s->stats->rd_bytes = bs->stats.nr_bytes[BLOCK_ACCT_READ]; +    s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE]; +    s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ]; +    s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE]; +    s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ]; +    s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE]; +    s->stats->wr_highest_offset = +        bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE; +    s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH]; +    s->stats->wr_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_WRITE]; +    s->stats->rd_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_READ]; +    s->stats->flush_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_FLUSH]; + +    if (bs->file) { +        s->has_parent = true; +        s->parent = bdrv_query_stats(bs->file, query_backing); +    } + +    if (query_backing && bs->backing_hd) { +        s->has_backing = true; +        s->backing = bdrv_query_stats(bs->backing_hd, query_backing); +    } + +    return s; +} + +BlockInfoList *qmp_query_block(Error **errp) +{ +    BlockInfoList *head = NULL, **p_next = &head; +    BlockBackend *blk; +    Error *local_err = NULL; + +    for (blk = blk_next(NULL); blk; blk = blk_next(blk)) { +        BlockInfoList *info = g_malloc0(sizeof(*info)); +        bdrv_query_info(blk, &info->value, &local_err); +        if (local_err) { +            error_propagate(errp, local_err); +            goto err; +        } + +        *p_next = info; +        p_next = &info->next; +    } + +    return head; + + err: +    qapi_free_BlockInfoList(head); +    return NULL; +} + +BlockStatsList *qmp_query_blockstats(bool has_query_nodes, +                                     bool query_nodes, +                                     Error **errp) +{ +    BlockStatsList *head = NULL, **p_next = &head; +    BlockDriverState *bs = NULL; + +    /* 
Just to be safe if query_nodes is not always initialized */ +    query_nodes = has_query_nodes && query_nodes; + +    while ((bs = query_nodes ? bdrv_next_node(bs) : bdrv_next(bs))) { +        BlockStatsList *info = g_malloc0(sizeof(*info)); +        AioContext *ctx = bdrv_get_aio_context(bs); + +        aio_context_acquire(ctx); +        info->value = bdrv_query_stats(bs, !query_nodes); +        aio_context_release(ctx); + +        *p_next = info; +        p_next = &info->next; +    } + +    return head; +} + +#define NB_SUFFIXES 4 + +static char *get_human_readable_size(char *buf, int buf_size, int64_t size) +{ +    static const char suffixes[NB_SUFFIXES] = {'K', 'M', 'G', 'T'}; +    int64_t base; +    int i; + +    if (size <= 999) { +        snprintf(buf, buf_size, "%" PRId64, size); +    } else { +        base = 1024; +        for (i = 0; i < NB_SUFFIXES; i++) { +            if (size < (10 * base)) { +                snprintf(buf, buf_size, "%0.1f%c", +                         (double)size / base, +                         suffixes[i]); +                break; +            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) { +                snprintf(buf, buf_size, "%" PRId64 "%c", +                         ((size + (base >> 1)) / base), +                         suffixes[i]); +                break; +            } +            base = base * 1024; +        } +    } +    return buf; +} + +void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f, +                        QEMUSnapshotInfo *sn) +{ +    char buf1[128], date_buf[128], clock_buf[128]; +    struct tm tm; +    time_t ti; +    int64_t secs; + +    if (!sn) { +        func_fprintf(f, +                     "%-10s%-20s%7s%20s%15s", +                     "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK"); +    } else { +        ti = sn->date_sec; +        localtime_r(&ti, &tm); +        strftime(date_buf, sizeof(date_buf), +                 "%Y-%m-%d %H:%M:%S", &tm); +        secs = sn->vm_clock_nsec / 1000000000; +        snprintf(clock_buf, sizeof(clock_buf), +                 "%02d:%02d:%02d.%03d", +                 (int)(secs / 3600), +                 (int)((secs / 60) % 60), +                 (int)(secs % 60), +                 (int)((sn->vm_clock_nsec / 1000000) % 1000)); +        func_fprintf(f, +                     "%-10s%-20s%7s%20s%15s", +                     sn->id_str, sn->name, +                     get_human_readable_size(buf1, sizeof(buf1), +                                             sn->vm_state_size), +                     date_buf, +                     clock_buf); +    } +} + +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, +                       QDict *dict); +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, +                       QList *list); + +static void dump_qobject(fprintf_function func_fprintf, void *f, +                         int comp_indent, QObject *obj) +{ +    switch (qobject_type(obj)) { +        case QTYPE_QINT: { +            QInt *value = qobject_to_qint(obj); +            func_fprintf(f, "%" PRId64, qint_get_int(value)); +            break; +        } +        case QTYPE_QSTRING: { +            QString *value = qobject_to_qstring(obj); +            func_fprintf(f, "%s", qstring_get_str(value)); +            break; +        } +        case QTYPE_QDICT: { +            QDict *value = qobject_to_qdict(obj); +            dump_qdict(func_fprintf, f, comp_indent, value); +            break; +        } +        case 
QTYPE_QLIST: { +            QList *value = qobject_to_qlist(obj); +            dump_qlist(func_fprintf, f, comp_indent, value); +            break; +        } +        case QTYPE_QFLOAT: { +            QFloat *value = qobject_to_qfloat(obj); +            func_fprintf(f, "%g", qfloat_get_double(value)); +            break; +        } +        case QTYPE_QBOOL: { +            QBool *value = qobject_to_qbool(obj); +            func_fprintf(f, "%s", qbool_get_bool(value) ? "true" : "false"); +            break; +        } +        default: +            abort(); +    } +} + +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, +                       QList *list) +{ +    const QListEntry *entry; +    int i = 0; + +    for (entry = qlist_first(list); entry; entry = qlist_next(entry), i++) { +        qtype_code type = qobject_type(entry->value); +        bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); +        const char *format = composite ? "%*s[%i]:\n" : "%*s[%i]: "; + +        func_fprintf(f, format, indentation * 4, "", i); +        dump_qobject(func_fprintf, f, indentation + 1, entry->value); +        if (!composite) { +            func_fprintf(f, "\n"); +        } +    } +} + +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, +                       QDict *dict) +{ +    const QDictEntry *entry; + +    for (entry = qdict_first(dict); entry; entry = qdict_next(dict, entry)) { +        qtype_code type = qobject_type(entry->value); +        bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); +        const char *format = composite ? "%*s%s:\n" : "%*s%s: "; +        char key[strlen(entry->key) + 1]; +        int i; + +        /* replace dashes with spaces in key (variable) names */ +        for (i = 0; entry->key[i]; i++) { +            key[i] = entry->key[i] == '-' ? 
' ' : entry->key[i]; +        } +        key[i] = 0; + +        func_fprintf(f, format, indentation * 4, "", key); +        dump_qobject(func_fprintf, f, indentation + 1, entry->value); +        if (!composite) { +            func_fprintf(f, "\n"); +        } +    } +} + +void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, +                                   ImageInfoSpecific *info_spec) +{ +    QmpOutputVisitor *ov = qmp_output_visitor_new(); +    QObject *obj, *data; + +    visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), &info_spec, NULL, +                                 &error_abort); +    obj = qmp_output_get_qobject(ov); +    assert(qobject_type(obj) == QTYPE_QDICT); +    data = qdict_get(qobject_to_qdict(obj), "data"); +    dump_qobject(func_fprintf, f, 1, data); +    qmp_output_visitor_cleanup(ov); +} + +void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, +                          ImageInfo *info) +{ +    char size_buf[128], dsize_buf[128]; +    if (!info->has_actual_size) { +        snprintf(dsize_buf, sizeof(dsize_buf), "unavailable"); +    } else { +        get_human_readable_size(dsize_buf, sizeof(dsize_buf), +                                info->actual_size); +    } +    get_human_readable_size(size_buf, sizeof(size_buf), info->virtual_size); +    func_fprintf(f, +                 "image: %s\n" +                 "file format: %s\n" +                 "virtual size: %s (%" PRId64 " bytes)\n" +                 "disk size: %s\n", +                 info->filename, info->format, size_buf, +                 info->virtual_size, +                 dsize_buf); + +    if (info->has_encrypted && info->encrypted) { +        func_fprintf(f, "encrypted: yes\n"); +    } + +    if (info->has_cluster_size) { +        func_fprintf(f, "cluster_size: %" PRId64 "\n", +                       info->cluster_size); +    } + +    if (info->has_dirty_flag && info->dirty_flag) { +        func_fprintf(f, "cleanly shut down: no\n"); +    } + +    if (info->has_backing_filename) { +        func_fprintf(f, "backing file: %s", info->backing_filename); +        if (info->has_full_backing_filename) { +            func_fprintf(f, " (actual path: %s)", info->full_backing_filename); +        } +        func_fprintf(f, "\n"); +        if (info->has_backing_filename_format) { +            func_fprintf(f, "backing file format: %s\n", +                         info->backing_filename_format); +        } +    } + +    if (info->has_snapshots) { +        SnapshotInfoList *elem; + +        func_fprintf(f, "Snapshot list:\n"); +        bdrv_snapshot_dump(func_fprintf, f, NULL); +        func_fprintf(f, "\n"); + +        /* Ideally bdrv_snapshot_dump() would operate on SnapshotInfoList but +         * we convert to the block layer's native QEMUSnapshotInfo for now. 
+         */ +        for (elem = info->snapshots; elem; elem = elem->next) { +            QEMUSnapshotInfo sn = { +                .vm_state_size = elem->value->vm_state_size, +                .date_sec = elem->value->date_sec, +                .date_nsec = elem->value->date_nsec, +                .vm_clock_nsec = elem->value->vm_clock_sec * 1000000000ULL + +                                 elem->value->vm_clock_nsec, +            }; + +            pstrcpy(sn.id_str, sizeof(sn.id_str), elem->value->id); +            pstrcpy(sn.name, sizeof(sn.name), elem->value->name); +            bdrv_snapshot_dump(func_fprintf, f, &sn); +            func_fprintf(f, "\n"); +        } +    } + +    if (info->has_format_specific) { +        func_fprintf(f, "Format specific information:\n"); +        bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific); +    } +} diff --git a/block/qcow.c b/block/qcow.c new file mode 100644 index 00000000..01fba54c --- /dev/null +++ b/block/qcow.c @@ -0,0 +1,1036 @@ +/* + * Block driver for the QCOW format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> +#include "qapi/qmp/qerror.h" +#include "crypto/cipher.h" +#include "migration/migration.h" + +/**************************************************************/ +/* QEMU COW block driver with compression and encryption support */ + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define QCOW_VERSION 1 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES  1 + +#define QCOW_OFLAG_COMPRESSED (1LL << 63) + +typedef struct QCowHeader { +    uint32_t magic; +    uint32_t version; +    uint64_t backing_file_offset; +    uint32_t backing_file_size; +    uint32_t mtime; +    uint64_t size; /* in bytes */ +    uint8_t cluster_bits; +    uint8_t l2_bits; +    uint16_t padding; +    uint32_t crypt_method; +    uint64_t l1_table_offset; +} QEMU_PACKED QCowHeader; + +#define L2_CACHE_SIZE 16 + +typedef struct BDRVQcowState { +    int cluster_bits; +    int cluster_size; +    int cluster_sectors; +    int l2_bits; +    int l2_size; +    unsigned int l1_size; +    uint64_t cluster_offset_mask; +    uint64_t l1_table_offset; +    uint64_t *l1_table; +    uint64_t *l2_cache; +    uint64_t l2_cache_offsets[L2_CACHE_SIZE]; +    uint32_t l2_cache_counts[L2_CACHE_SIZE]; +    uint8_t *cluster_cache; +    uint8_t *cluster_data; +    uint64_t cluster_cache_offset; +    QCryptoCipher *cipher; /* NULL if no key yet */ +    uint32_t crypt_method_header; +    CoMutex lock; +    Error *migration_blocker; +} BDRVQcowState; + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); + +static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    const QCowHeader *cow_header = (const void *)buf; + +    if (buf_size >= sizeof(QCowHeader) && +        be32_to_cpu(cow_header->magic) == QCOW_MAGIC && +        be32_to_cpu(cow_header->version) == QCOW_VERSION) +        return 100; +    else +        return 0; +} + +static int qcow_open(BlockDriverState *bs, QDict *options, int flags, +                     Error **errp) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int len, i, shift; +    int ret; +    QCowHeader header; + +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); +    if (ret < 0) { +        goto fail; +    } +    be32_to_cpus(&header.magic); +    be32_to_cpus(&header.version); +    be64_to_cpus(&header.backing_file_offset); +    be32_to_cpus(&header.backing_file_size); +    be32_to_cpus(&header.mtime); +    be64_to_cpus(&header.size); +    be32_to_cpus(&header.crypt_method); +    be64_to_cpus(&header.l1_table_offset); + +    if (header.magic != QCOW_MAGIC) { +        error_setg(errp, "Image not in qcow format"); +        ret = -EINVAL; +        goto fail; +    } +    if (header.version != QCOW_VERSION) { +        char version[64]; +        snprintf(version, sizeof(version), "QCOW version %" PRIu32, +                 header.version); +        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, +                   bdrv_get_device_or_node_name(bs), "qcow", version); +        ret = -ENOTSUP; +        goto fail; +    } + +    if (header.size <= 1) { +        error_setg(errp, "Image size is too small (must be at least 2 bytes)"); +        ret = -EINVAL; +        goto fail; +    } +    if (header.cluster_bits < 9 || header.cluster_bits > 16) { +        error_setg(errp, "Cluster size must be between 512 and 64k"); +        ret = -EINVAL; +        goto fail; +    } + +    /* l2_bits specifies number of entries; storing a uint64_t in each 
entry, +     * so bytes = num_entries << 3. */ +    if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) { +        error_setg(errp, "L2 table size must be between 512 and 64k"); +        ret = -EINVAL; +        goto fail; +    } + +    if (header.crypt_method > QCOW_CRYPT_AES) { +        error_setg(errp, "invalid encryption method in qcow header"); +        ret = -EINVAL; +        goto fail; +    } +    if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { +        error_setg(errp, "AES cipher not available"); +        ret = -EINVAL; +        goto fail; +    } +    s->crypt_method_header = header.crypt_method; +    if (s->crypt_method_header) { +        bs->encrypted = 1; +    } +    s->cluster_bits = header.cluster_bits; +    s->cluster_size = 1 << s->cluster_bits; +    s->cluster_sectors = 1 << (s->cluster_bits - 9); +    s->l2_bits = header.l2_bits; +    s->l2_size = 1 << s->l2_bits; +    bs->total_sectors = header.size / 512; +    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + +    /* read the level 1 table */ +    shift = s->cluster_bits + s->l2_bits; +    if (header.size > UINT64_MAX - (1LL << shift)) { +        error_setg(errp, "Image too large"); +        ret = -EINVAL; +        goto fail; +    } else { +        uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift; +        if (l1_size > INT_MAX / sizeof(uint64_t)) { +            error_setg(errp, "Image too large"); +            ret = -EINVAL; +            goto fail; +        } +        s->l1_size = l1_size; +    } + +    s->l1_table_offset = header.l1_table_offset; +    s->l1_table = g_try_new(uint64_t, s->l1_size); +    if (s->l1_table == NULL) { +        error_setg(errp, "Could not allocate memory for L1 table"); +        ret = -ENOMEM; +        goto fail; +    } + +    ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, +               s->l1_size * sizeof(uint64_t)); +    if (ret < 0) { +        goto fail; +    } + +    for(i = 0;i < s->l1_size; i++) { +        be64_to_cpus(&s->l1_table[i]); +    } + +    /* alloc L2 cache (max. 
64k * 16 * 8 = 8 MB) */ +    s->l2_cache = +        qemu_try_blockalign(bs->file, +                            s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); +    if (s->l2_cache == NULL) { +        error_setg(errp, "Could not allocate L2 table cache"); +        ret = -ENOMEM; +        goto fail; +    } +    s->cluster_cache = g_malloc(s->cluster_size); +    s->cluster_data = g_malloc(s->cluster_size); +    s->cluster_cache_offset = -1; + +    /* read the backing file name */ +    if (header.backing_file_offset != 0) { +        len = header.backing_file_size; +        if (len > 1023 || len >= sizeof(bs->backing_file)) { +            error_setg(errp, "Backing file name too long"); +            ret = -EINVAL; +            goto fail; +        } +        ret = bdrv_pread(bs->file, header.backing_file_offset, +                   bs->backing_file, len); +        if (ret < 0) { +            goto fail; +        } +        bs->backing_file[len] = '\0'; +    } + +    /* Disable migration when qcow images are used */ +    error_setg(&s->migration_blocker, "The qcow format used by node '%s' " +               "does not support live migration", +               bdrv_get_device_or_node_name(bs)); +    migrate_add_blocker(s->migration_blocker); + +    qemu_co_mutex_init(&s->lock); +    return 0; + + fail: +    g_free(s->l1_table); +    qemu_vfree(s->l2_cache); +    g_free(s->cluster_cache); +    g_free(s->cluster_data); +    return ret; +} + + +/* We have nothing to do for QCOW reopen, stubs just return + * success */ +static int qcow_reopen_prepare(BDRVReopenState *state, +                               BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +static int qcow_set_key(BlockDriverState *bs, const char *key) +{ +    BDRVQcowState *s = bs->opaque; +    uint8_t keybuf[16]; +    int len, i; +    Error *err; + +    memset(keybuf, 0, 16); +    len = strlen(key); +    if (len > 16) +        len = 16; +    /* XXX: we could compress the chars to 7 bits to increase +       entropy */ +    for(i = 0;i < len;i++) { +        keybuf[i] = key[i]; +    } +    assert(bs->encrypted); + +    qcrypto_cipher_free(s->cipher); +    s->cipher = qcrypto_cipher_new( +        QCRYPTO_CIPHER_ALG_AES_128, +        QCRYPTO_CIPHER_MODE_CBC, +        keybuf, G_N_ELEMENTS(keybuf), +        &err); + +    if (!s->cipher) { +        /* XXX would be nice if errors in this method could +         * be properly propagate to the caller. Would need +         * the bdrv_set_key() API signature to be fixed. */ +        error_free(err); +        return -1; +    } +    return 0; +} + +/* The crypt function is compatible with the linux cryptoloop +   algorithm for < 4 GB images. 
NOTE: out_buf == in_buf is +   supported */ +static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num, +                           uint8_t *out_buf, const uint8_t *in_buf, +                           int nb_sectors, bool enc, Error **errp) +{ +    union { +        uint64_t ll[2]; +        uint8_t b[16]; +    } ivec; +    int i; +    int ret; + +    for(i = 0; i < nb_sectors; i++) { +        ivec.ll[0] = cpu_to_le64(sector_num); +        ivec.ll[1] = 0; +        if (qcrypto_cipher_setiv(s->cipher, +                                 ivec.b, G_N_ELEMENTS(ivec.b), +                                 errp) < 0) { +            return -1; +        } +        if (enc) { +            ret = qcrypto_cipher_encrypt(s->cipher, +                                         in_buf, +                                         out_buf, +                                         512, +                                         errp); +        } else { +            ret = qcrypto_cipher_decrypt(s->cipher, +                                         in_buf, +                                         out_buf, +                                         512, +                                         errp); +        } +        if (ret < 0) { +            return -1; +        } +        sector_num++; +        in_buf += 512; +        out_buf += 512; +    } +    return 0; +} + +/* 'allocate' is: + * + * 0 to not allocate. + * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. + */ +static uint64_t get_cluster_offset(BlockDriverState *bs, +                                   uint64_t offset, int allocate, +                                   int compressed_size, +                                   int n_start, int n_end) +{ +    BDRVQcowState *s = bs->opaque; +    int min_index, i, j, l1_index, l2_index; +    uint64_t l2_offset, *l2_table, cluster_offset, tmp; +    uint32_t min_count; +    int new_l2_table; + +    l1_index = offset >> (s->l2_bits + s->cluster_bits); +    l2_offset = s->l1_table[l1_index]; +    new_l2_table = 0; +    if (!l2_offset) { +        if (!allocate) +            return 0; +        /* allocate a new l2 entry */ +        l2_offset = bdrv_getlength(bs->file); +        /* round to cluster size */ +        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); +        /* update the L1 entry */ +        s->l1_table[l1_index] = l2_offset; +        tmp = cpu_to_be64(l2_offset); +        if (bdrv_pwrite_sync(bs->file, +                s->l1_table_offset + l1_index * sizeof(tmp), +                &tmp, sizeof(tmp)) < 0) +            return 0; +        new_l2_table = 1; +    } +    for(i = 0; i < L2_CACHE_SIZE; i++) { +        if (l2_offset == s->l2_cache_offsets[i]) { +            /* increment the hit count */ +            if (++s->l2_cache_counts[i] == 0xffffffff) { +                for(j = 0; j < L2_CACHE_SIZE; j++) { +                    s->l2_cache_counts[j] >>= 1; +                } +            } +            l2_table = s->l2_cache + (i << s->l2_bits); +            goto found; +        } +    } +    /* not found: load a new entry in the least used one */ +    min_index = 0; +    min_count = 0xffffffff; +    for(i = 0; i < L2_CACHE_SIZE; i++) { +        if (s->l2_cache_counts[i] < min_count) { +            min_count = s->l2_cache_counts[i]; +            min_index = i; +        } +    } +    l2_table = 
s->l2_cache + (min_index << s->l2_bits); +    if (new_l2_table) { +        memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); +        if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, +                s->l2_size * sizeof(uint64_t)) < 0) +            return 0; +    } else { +        if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != +            s->l2_size * sizeof(uint64_t)) +            return 0; +    } +    s->l2_cache_offsets[min_index] = l2_offset; +    s->l2_cache_counts[min_index] = 1; + found: +    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); +    cluster_offset = be64_to_cpu(l2_table[l2_index]); +    if (!cluster_offset || +        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { +        if (!allocate) +            return 0; +        /* allocate a new cluster */ +        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && +            (n_end - n_start) < s->cluster_sectors) { +            /* if the cluster is already compressed, we must +               decompress it in the case it is not completely +               overwritten */ +            if (decompress_cluster(bs, cluster_offset) < 0) +                return 0; +            cluster_offset = bdrv_getlength(bs->file); +            cluster_offset = (cluster_offset + s->cluster_size - 1) & +                ~(s->cluster_size - 1); +            /* write the cluster content */ +            if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != +                s->cluster_size) +                return -1; +        } else { +            cluster_offset = bdrv_getlength(bs->file); +            if (allocate == 1) { +                /* round to cluster size */ +                cluster_offset = (cluster_offset + s->cluster_size - 1) & +                    ~(s->cluster_size - 1); +                bdrv_truncate(bs->file, cluster_offset + s->cluster_size); +                /* if encrypted, we must initialize the cluster +                   content which won't be written */ +                if (bs->encrypted && +                    (n_end - n_start) < s->cluster_sectors) { +                    uint64_t start_sect; +                    assert(s->cipher); +                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9; +                    memset(s->cluster_data + 512, 0x00, 512); +                    for(i = 0; i < s->cluster_sectors; i++) { +                        if (i < n_start || i >= n_end) { +                            Error *err = NULL; +                            if (encrypt_sectors(s, start_sect + i, +                                                s->cluster_data, +                                                s->cluster_data + 512, 1, +                                                true, &err) < 0) { +                                error_free(err); +                                errno = EIO; +                                return -1; +                            } +                            if (bdrv_pwrite(bs->file, cluster_offset + i * 512, +                                            s->cluster_data, 512) != 512) +                                return -1; +                        } +                    } +                } +            } else if (allocate == 2) { +                cluster_offset |= QCOW_OFLAG_COMPRESSED | +                    (uint64_t)compressed_size << (63 - s->cluster_bits); +            } +        } +        /* update L2 table */ +        tmp = cpu_to_be64(cluster_offset); +        l2_table[l2_index] = tmp; +   
     if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), +                &tmp, sizeof(tmp)) < 0) +            return 0; +    } +    return cluster_offset; +} + +static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, int *pnum) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster, n; +    uint64_t cluster_offset; + +    qemu_co_mutex_lock(&s->lock); +    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); +    qemu_co_mutex_unlock(&s->lock); +    index_in_cluster = sector_num & (s->cluster_sectors - 1); +    n = s->cluster_sectors - index_in_cluster; +    if (n > nb_sectors) +        n = nb_sectors; +    *pnum = n; +    if (!cluster_offset) { +        return 0; +    } +    if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->cipher) { +        return BDRV_BLOCK_DATA; +    } +    cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); +    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset; +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, +                             const uint8_t *buf, int buf_size) +{ +    z_stream strm1, *strm = &strm1; +    int ret, out_len; + +    memset(strm, 0, sizeof(*strm)); + +    strm->next_in = (uint8_t *)buf; +    strm->avail_in = buf_size; +    strm->next_out = out_buf; +    strm->avail_out = out_buf_size; + +    ret = inflateInit2(strm, -12); +    if (ret != Z_OK) +        return -1; +    ret = inflate(strm, Z_FINISH); +    out_len = strm->next_out - out_buf; +    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || +        out_len != out_buf_size) { +        inflateEnd(strm); +        return -1; +    } +    inflateEnd(strm); +    return 0; +} + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ +    BDRVQcowState *s = bs->opaque; +    int ret, csize; +    uint64_t coffset; + +    coffset = cluster_offset & s->cluster_offset_mask; +    if (s->cluster_cache_offset != coffset) { +        csize = cluster_offset >> (63 - s->cluster_bits); +        csize &= (s->cluster_size - 1); +        ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); +        if (ret != csize) +            return -1; +        if (decompress_buffer(s->cluster_cache, s->cluster_size, +                              s->cluster_data, csize) < 0) { +            return -1; +        } +        s->cluster_cache_offset = coffset; +    } +    return 0; +} + +static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, +                         int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster; +    int ret = 0, n; +    uint64_t cluster_offset; +    struct iovec hd_iov; +    QEMUIOVector hd_qiov; +    uint8_t *buf; +    void *orig_buf; +    Error *err = NULL; + +    if (qiov->niov > 1) { +        buf = orig_buf = qemu_try_blockalign(bs, qiov->size); +        if (buf == NULL) { +            return -ENOMEM; +        } +    } else { +        orig_buf = NULL; +        buf = (uint8_t *)qiov->iov->iov_base; +    } + +    qemu_co_mutex_lock(&s->lock); + +    while (nb_sectors != 0) { +        /* prepare next request */ +        cluster_offset = get_cluster_offset(bs, sector_num << 9, +                                                 0, 0, 0, 0); +        index_in_cluster = sector_num & (s->cluster_sectors - 1); +        n = s->cluster_sectors - index_in_cluster; +        if (n > nb_sectors) { +            n = nb_sectors; +        } + +        if 
(!cluster_offset) { +            if (bs->backing_hd) { +                /* read from the base image */ +                hd_iov.iov_base = (void *)buf; +                hd_iov.iov_len = n * 512; +                qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); +                qemu_co_mutex_unlock(&s->lock); +                ret = bdrv_co_readv(bs->backing_hd, sector_num, +                                    n, &hd_qiov); +                qemu_co_mutex_lock(&s->lock); +                if (ret < 0) { +                    goto fail; +                } +            } else { +                /* Note: in this case, no need to wait */ +                memset(buf, 0, 512 * n); +            } +        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { +            /* add AIO support for compressed blocks ? */ +            if (decompress_cluster(bs, cluster_offset) < 0) { +                goto fail; +            } +            memcpy(buf, +                   s->cluster_cache + index_in_cluster * 512, 512 * n); +        } else { +            if ((cluster_offset & 511) != 0) { +                goto fail; +            } +            hd_iov.iov_base = (void *)buf; +            hd_iov.iov_len = n * 512; +            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); +            qemu_co_mutex_unlock(&s->lock); +            ret = bdrv_co_readv(bs->file, +                                (cluster_offset >> 9) + index_in_cluster, +                                n, &hd_qiov); +            qemu_co_mutex_lock(&s->lock); +            if (ret < 0) { +                break; +            } +            if (bs->encrypted) { +                assert(s->cipher); +                if (encrypt_sectors(s, sector_num, buf, buf, +                                    n, false, &err) < 0) { +                    goto fail; +                } +            } +        } +        ret = 0; + +        nb_sectors -= n; +        sector_num += n; +        buf += n * 512; +    } + +done: +    qemu_co_mutex_unlock(&s->lock); + +    if (qiov->niov > 1) { +        qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size); +        qemu_vfree(orig_buf); +    } + +    return ret; + +fail: +    error_free(err); +    ret = -EIO; +    goto done; +} + +static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, +                          int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster; +    uint64_t cluster_offset; +    const uint8_t *src_buf; +    int ret = 0, n; +    uint8_t *cluster_data = NULL; +    struct iovec hd_iov; +    QEMUIOVector hd_qiov; +    uint8_t *buf; +    void *orig_buf; + +    s->cluster_cache_offset = -1; /* disable compressed cache */ + +    if (qiov->niov > 1) { +        buf = orig_buf = qemu_try_blockalign(bs, qiov->size); +        if (buf == NULL) { +            return -ENOMEM; +        } +        qemu_iovec_to_buf(qiov, 0, buf, qiov->size); +    } else { +        orig_buf = NULL; +        buf = (uint8_t *)qiov->iov->iov_base; +    } + +    qemu_co_mutex_lock(&s->lock); + +    while (nb_sectors != 0) { + +        index_in_cluster = sector_num & (s->cluster_sectors - 1); +        n = s->cluster_sectors - index_in_cluster; +        if (n > nb_sectors) { +            n = nb_sectors; +        } +        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, +                                            index_in_cluster, +                                            index_in_cluster + n); +        if (!cluster_offset || (cluster_offset & 511) != 0) { +      
      ret = -EIO; +            break; +        } +        if (bs->encrypted) { +            Error *err = NULL; +            assert(s->cipher); +            if (!cluster_data) { +                cluster_data = g_malloc0(s->cluster_size); +            } +            if (encrypt_sectors(s, sector_num, cluster_data, buf, +                                n, true, &err) < 0) { +                error_free(err); +                ret = -EIO; +                break; +            } +            src_buf = cluster_data; +        } else { +            src_buf = buf; +        } + +        hd_iov.iov_base = (void *)src_buf; +        hd_iov.iov_len = n * 512; +        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); +        qemu_co_mutex_unlock(&s->lock); +        ret = bdrv_co_writev(bs->file, +                             (cluster_offset >> 9) + index_in_cluster, +                             n, &hd_qiov); +        qemu_co_mutex_lock(&s->lock); +        if (ret < 0) { +            break; +        } +        ret = 0; + +        nb_sectors -= n; +        sector_num += n; +        buf += n * 512; +    } +    qemu_co_mutex_unlock(&s->lock); + +    if (qiov->niov > 1) { +        qemu_vfree(orig_buf); +    } +    g_free(cluster_data); + +    return ret; +} + +static void qcow_close(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; + +    qcrypto_cipher_free(s->cipher); +    s->cipher = NULL; +    g_free(s->l1_table); +    qemu_vfree(s->l2_cache); +    g_free(s->cluster_cache); +    g_free(s->cluster_data); + +    migrate_del_blocker(s->migration_blocker); +    error_free(s->migration_blocker); +} + +static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int header_size, backing_filename_len, l1_size, shift, i; +    QCowHeader header; +    uint8_t *tmp; +    int64_t total_size = 0; +    char *backing_file = NULL; +    int flags = 0; +    Error *local_err = NULL; +    int ret; +    BlockDriverState *qcow_bs; + +    /* Read out options */ +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); +    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); +    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { +        flags |= BLOCK_FLAG_ENCRYPT; +    } + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto cleanup; +    } + +    qcow_bs = NULL; +    ret = bdrv_open(&qcow_bs, filename, NULL, NULL, +                    BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto cleanup; +    } + +    ret = bdrv_truncate(qcow_bs, 0); +    if (ret < 0) { +        goto exit; +    } + +    memset(&header, 0, sizeof(header)); +    header.magic = cpu_to_be32(QCOW_MAGIC); +    header.version = cpu_to_be32(QCOW_VERSION); +    header.size = cpu_to_be64(total_size); +    header_size = sizeof(header); +    backing_filename_len = 0; +    if (backing_file) { +        if (strcmp(backing_file, "fat:")) { +            header.backing_file_offset = cpu_to_be64(header_size); +            backing_filename_len = strlen(backing_file); +            header.backing_file_size = cpu_to_be32(backing_filename_len); +            header_size += backing_filename_len; +        } else { +            /* special backing file for vvfat */ +            backing_file = NULL; +        } +        header.cluster_bits = 9; /* 512 byte cluster to avoid copying +                                   
 unmodified sectors */ +        header.l2_bits = 12; /* 32 KB L2 tables */ +    } else { +        header.cluster_bits = 12; /* 4 KB clusters */ +        header.l2_bits = 9; /* 4 KB L2 tables */ +    } +    header_size = (header_size + 7) & ~7; +    shift = header.cluster_bits + header.l2_bits; +    l1_size = (total_size + (1LL << shift) - 1) >> shift; + +    header.l1_table_offset = cpu_to_be64(header_size); +    if (flags & BLOCK_FLAG_ENCRYPT) { +        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); +    } else { +        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); +    } + +    /* write all the data */ +    ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); +    if (ret != sizeof(header)) { +        goto exit; +    } + +    if (backing_file) { +        ret = bdrv_pwrite(qcow_bs, sizeof(header), +            backing_file, backing_filename_len); +        if (ret != backing_filename_len) { +            goto exit; +        } +    } + +    tmp = g_malloc0(BDRV_SECTOR_SIZE); +    for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ +        BDRV_SECTOR_SIZE); i++) { +        ret = bdrv_pwrite(qcow_bs, header_size + +            BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); +        if (ret != BDRV_SECTOR_SIZE) { +            g_free(tmp); +            goto exit; +        } +    } + +    g_free(tmp); +    ret = 0; +exit: +    bdrv_unref(qcow_bs); +cleanup: +    g_free(backing_file); +    return ret; +} + +static int qcow_make_empty(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    uint32_t l1_length = s->l1_size * sizeof(uint64_t); +    int ret; + +    memset(s->l1_table, 0, l1_length); +    if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, +            l1_length) < 0) +        return -1; +    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); +    if (ret < 0) +        return ret; + +    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); +    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); +    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + +    return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned +   tables to avoid losing bytes in alignment */ +static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, +                                 const uint8_t *buf, int nb_sectors) +{ +    BDRVQcowState *s = bs->opaque; +    z_stream strm; +    int ret, out_len; +    uint8_t *out_buf; +    uint64_t cluster_offset; + +    if (nb_sectors != s->cluster_sectors) { +        ret = -EINVAL; + +        /* Zero-pad last write if image size is not cluster aligned */ +        if (sector_num + nb_sectors == bs->total_sectors && +            nb_sectors < s->cluster_sectors) { +            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); +            memset(pad_buf, 0, s->cluster_size); +            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); +            ret = qcow_write_compressed(bs, sector_num, +                                        pad_buf, s->cluster_sectors); +            qemu_vfree(pad_buf); +        } +        return ret; +    } + +    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + +    /* best compression, small window, no zlib header */ +    memset(&strm, 0, sizeof(strm)); +    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, +                       Z_DEFLATED, -12, +                       9, Z_DEFAULT_STRATEGY); +    if (ret != 0) { +        ret = -EINVAL; +        goto fail; +    } + +   
 strm.avail_in = s->cluster_size; +    strm.next_in = (uint8_t *)buf; +    strm.avail_out = s->cluster_size; +    strm.next_out = out_buf; + +    ret = deflate(&strm, Z_FINISH); +    if (ret != Z_STREAM_END && ret != Z_OK) { +        deflateEnd(&strm); +        ret = -EINVAL; +        goto fail; +    } +    out_len = strm.next_out - out_buf; + +    deflateEnd(&strm); + +    if (ret != Z_STREAM_END || out_len >= s->cluster_size) { +        /* could not compress: write normal cluster */ +        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); +        if (ret < 0) { +            goto fail; +        } +    } else { +        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, +                                            out_len, 0, 0); +        if (cluster_offset == 0) { +            ret = -EIO; +            goto fail; +        } + +        cluster_offset &= s->cluster_offset_mask; +        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); +        if (ret < 0) { +            goto fail; +        } +    } + +    ret = 0; +fail: +    g_free(out_buf); +    return ret; +} + +static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVQcowState *s = bs->opaque; +    bdi->cluster_size = s->cluster_size; +    return 0; +} + +static QemuOptsList qcow_create_opts = { +    .name = "qcow-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(qcow_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_BACKING_FILE, +            .type = QEMU_OPT_STRING, +            .help = "File name of a base image" +        }, +        { +            .name = BLOCK_OPT_ENCRYPT, +            .type = QEMU_OPT_BOOL, +            .help = "Encrypt the image", +            .def_value_str = "off" +        }, +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_qcow = { +    .format_name	= "qcow", +    .instance_size	= sizeof(BDRVQcowState), +    .bdrv_probe		= qcow_probe, +    .bdrv_open		= qcow_open, +    .bdrv_close		= qcow_close, +    .bdrv_reopen_prepare    = qcow_reopen_prepare, +    .bdrv_create            = qcow_create, +    .bdrv_has_zero_init     = bdrv_has_zero_init_1, +    .supports_backing       = true, + +    .bdrv_co_readv          = qcow_co_readv, +    .bdrv_co_writev         = qcow_co_writev, +    .bdrv_co_get_block_status   = qcow_co_get_block_status, + +    .bdrv_set_key           = qcow_set_key, +    .bdrv_make_empty        = qcow_make_empty, +    .bdrv_write_compressed  = qcow_write_compressed, +    .bdrv_get_info          = qcow_get_info, + +    .create_opts            = &qcow_create_opts, +}; + +static void bdrv_qcow_init(void) +{ +    bdrv_register(&bdrv_qcow); +} + +block_init(bdrv_qcow_init); diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c new file mode 100644 index 00000000..53b8afc3 --- /dev/null +++ b/block/qcow2-cache.c @@ -0,0 +1,350 @@ +/* + * L2/refcount table cache for the QCOW2 format + * + * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to 
the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/block_int.h" +#include "qemu-common.h" +#include "qcow2.h" +#include "trace.h" + +typedef struct Qcow2CachedTable { +    int64_t  offset; +    bool     dirty; +    uint64_t lru_counter; +    int      ref; +} Qcow2CachedTable; + +struct Qcow2Cache { +    Qcow2CachedTable       *entries; +    struct Qcow2Cache      *depends; +    int                     size; +    bool                    depends_on_flush; +    void                   *table_array; +    uint64_t                lru_counter; +}; + +static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs, +                    Qcow2Cache *c, int table) +{ +    BDRVQcowState *s = bs->opaque; +    return (uint8_t *) c->table_array + (size_t) table * s->cluster_size; +} + +static inline int qcow2_cache_get_table_idx(BlockDriverState *bs, +                  Qcow2Cache *c, void *table) +{ +    BDRVQcowState *s = bs->opaque; +    ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; +    int idx = table_offset / s->cluster_size; +    assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0); +    return idx; +} + +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) +{ +    BDRVQcowState *s = bs->opaque; +    Qcow2Cache *c; + +    c = g_new0(Qcow2Cache, 1); +    c->size = num_tables; +    c->entries = g_try_new0(Qcow2CachedTable, num_tables); +    c->table_array = qemu_try_blockalign(bs->file, +                                         (size_t) num_tables * s->cluster_size); + +    if (!c->entries || !c->table_array) { +        qemu_vfree(c->table_array); +        g_free(c->entries); +        g_free(c); +        c = NULL; +    } + +    return c; +} + +int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c) +{ +    int i; + +    for (i = 0; i < c->size; i++) { +        assert(c->entries[i].ref == 0); +    } + +    qemu_vfree(c->table_array); +    g_free(c->entries); +    g_free(c); + +    return 0; +} + +static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) +{ +    int ret; + +    ret = qcow2_cache_flush(bs, c->depends); +    if (ret < 0) { +        return ret; +    } + +    c->depends = NULL; +    c->depends_on_flush = false; + +    return 0; +} + +static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) +{ +    BDRVQcowState *s = bs->opaque; +    int ret = 0; + +    if (!c->entries[i].dirty || !c->entries[i].offset) { +        return 0; +    } + +    trace_qcow2_cache_entry_flush(qemu_coroutine_self(), +                                  c == s->l2_table_cache, i); + +    if (c->depends) { +        ret = qcow2_cache_flush_dependency(bs, c); +    } else if (c->depends_on_flush) { +        ret = bdrv_flush(bs->file); +        if (ret >= 0) { +            c->depends_on_flush = false; +        } +    } + +    if (ret < 0) { +        return ret; +    } + +    if (c == 
s->refcount_block_cache) { +        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, +                c->entries[i].offset, s->cluster_size); +    } else if (c == s->l2_table_cache) { +        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, +                c->entries[i].offset, s->cluster_size); +    } else { +        ret = qcow2_pre_write_overlap_check(bs, 0, +                c->entries[i].offset, s->cluster_size); +    } + +    if (ret < 0) { +        return ret; +    } + +    if (c == s->refcount_block_cache) { +        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); +    } else if (c == s->l2_table_cache) { +        BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); +    } + +    ret = bdrv_pwrite(bs->file, c->entries[i].offset, +                      qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); +    if (ret < 0) { +        return ret; +    } + +    c->entries[i].dirty = false; + +    return 0; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ +    BDRVQcowState *s = bs->opaque; +    int result = 0; +    int ret; +    int i; + +    trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); + +    for (i = 0; i < c->size; i++) { +        ret = qcow2_cache_entry_flush(bs, c, i); +        if (ret < 0 && result != -ENOSPC) { +            result = ret; +        } +    } + +    if (result == 0) { +        ret = bdrv_flush(bs->file); +        if (ret < 0) { +            result = ret; +        } +    } + +    return result; +} + +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, +    Qcow2Cache *dependency) +{ +    int ret; + +    if (dependency->depends) { +        ret = qcow2_cache_flush_dependency(bs, dependency); +        if (ret < 0) { +            return ret; +        } +    } + +    if (c->depends && (c->depends != dependency)) { +        ret = qcow2_cache_flush_dependency(bs, c); +        if (ret < 0) { +            return ret; +        } +    } + +    c->depends = dependency; +    return 0; +} + +void qcow2_cache_depends_on_flush(Qcow2Cache *c) +{ +    c->depends_on_flush = true; +} + +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) +{ +    int ret, i; + +    ret = qcow2_cache_flush(bs, c); +    if (ret < 0) { +        return ret; +    } + +    for (i = 0; i < c->size; i++) { +        assert(c->entries[i].ref == 0); +        c->entries[i].offset = 0; +        c->entries[i].lru_counter = 0; +    } + +    c->lru_counter = 0; + +    return 0; +} + +static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, +    uint64_t offset, void **table, bool read_from_disk) +{ +    BDRVQcowState *s = bs->opaque; +    int i; +    int ret; +    int lookup_index; +    uint64_t min_lru_counter = UINT64_MAX; +    int min_lru_index = -1; + +    trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, +                          offset, read_from_disk); + +    /* Check if the table is already cached */ +    i = lookup_index = (offset / s->cluster_size * 4) % c->size; +    do { +        const Qcow2CachedTable *t = &c->entries[i]; +        if (t->offset == offset) { +            goto found; +        } +        if (t->ref == 0 && t->lru_counter < min_lru_counter) { +            min_lru_counter = t->lru_counter; +            min_lru_index = i; +        } +        if (++i == c->size) { +            i = 0; +        } +    } while (i != lookup_index); + +    if (min_lru_index == -1) { +        /* This can't happen in current synchronous code, but leave the check +         * here as a reminder for 
whoever starts using AIO with the cache */ +        abort(); +    } + +    /* Cache miss: write a table back and replace it */ +    i = min_lru_index; +    trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), +                                        c == s->l2_table_cache, i); + +    ret = qcow2_cache_entry_flush(bs, c, i); +    if (ret < 0) { +        return ret; +    } + +    trace_qcow2_cache_get_read(qemu_coroutine_self(), +                               c == s->l2_table_cache, i); +    c->entries[i].offset = 0; +    if (read_from_disk) { +        if (c == s->l2_table_cache) { +            BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); +        } + +        ret = bdrv_pread(bs->file, offset, qcow2_cache_get_table_addr(bs, c, i), +                         s->cluster_size); +        if (ret < 0) { +            return ret; +        } +    } + +    c->entries[i].offset = offset; + +    /* And return the right table */ +found: +    c->entries[i].ref++; +    *table = qcow2_cache_get_table_addr(bs, c, i); + +    trace_qcow2_cache_get_done(qemu_coroutine_self(), +                               c == s->l2_table_cache, i); + +    return 0; +} + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, +    void **table) +{ +    return qcow2_cache_do_get(bs, c, offset, table, true); +} + +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, +    void **table) +{ +    return qcow2_cache_do_get(bs, c, offset, table, false); +} + +void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) +{ +    int i = qcow2_cache_get_table_idx(bs, c, *table); + +    c->entries[i].ref--; +    *table = NULL; + +    if (c->entries[i].ref == 0) { +        c->entries[i].lru_counter = ++c->lru_counter; +    } + +    assert(c->entries[i].ref >= 0); +} + +void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, +     void *table) +{ +    int i = qcow2_cache_get_table_idx(bs, c, table); +    assert(c->entries[i].offset != 0); +    c->entries[i].dirty = true; +} diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c new file mode 100644 index 00000000..7e94fe70 --- /dev/null +++ b/block/qcow2-cluster.c @@ -0,0 +1,1887 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include <zlib.h> + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "trace.h" + +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, +                        bool exact_size) +{ +    BDRVQcowState *s = bs->opaque; +    int new_l1_size2, ret, i; +    uint64_t *new_l1_table; +    int64_t old_l1_table_offset, old_l1_size; +    int64_t new_l1_table_offset, new_l1_size; +    uint8_t data[12]; + +    if (min_size <= s->l1_size) +        return 0; + +    /* Do a sanity check on min_size before trying to calculate new_l1_size +     * (this prevents overflows during the while loop for the calculation of +     * new_l1_size) */ +    if (min_size > INT_MAX / sizeof(uint64_t)) { +        return -EFBIG; +    } + +    if (exact_size) { +        new_l1_size = min_size; +    } else { +        /* Bump size up to reduce the number of times we have to grow */ +        new_l1_size = s->l1_size; +        if (new_l1_size == 0) { +            new_l1_size = 1; +        } +        while (min_size > new_l1_size) { +            new_l1_size = (new_l1_size * 3 + 1) / 2; +        } +    } + +    if (new_l1_size > INT_MAX / sizeof(uint64_t)) { +        return -EFBIG; +    } + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", +            s->l1_size, new_l1_size); +#endif + +    new_l1_size2 = sizeof(uint64_t) * new_l1_size; +    new_l1_table = qemu_try_blockalign(bs->file, +                                       align_offset(new_l1_size2, 512)); +    if (new_l1_table == NULL) { +        return -ENOMEM; +    } +    memset(new_l1_table, 0, align_offset(new_l1_size2, 512)); + +    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); + +    /* write new table (align to cluster) */ +    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); +    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); +    if (new_l1_table_offset < 0) { +        qemu_vfree(new_l1_table); +        return new_l1_table_offset; +    } + +    ret = qcow2_cache_flush(bs, s->refcount_block_cache); +    if (ret < 0) { +        goto fail; +    } + +    /* the L1 position has not yet been updated, so these clusters must +     * indeed be completely free */ +    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, +                                        new_l1_size2); +    if (ret < 0) { +        goto fail; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); +    for(i = 0; i < s->l1_size; i++) +        new_l1_table[i] = cpu_to_be64(new_l1_table[i]); +    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); +    if (ret < 0) +        goto fail; +    for(i = 0; i < s->l1_size; i++) +        new_l1_table[i] = be64_to_cpu(new_l1_table[i]); + +    /* set new table */ +    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); +    cpu_to_be32w((uint32_t*)data, new_l1_size); +    stq_be_p(data + 4, new_l1_table_offset); +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); +    if (ret < 0) { +        goto fail; +    } +    qemu_vfree(s->l1_table); +    old_l1_table_offset = s->l1_table_offset; +    s->l1_table_offset = new_l1_table_offset; +    s->l1_table = new_l1_table; +    old_l1_size = s->l1_size; +    s->l1_size = new_l1_size; +    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t), +                        QCOW2_DISCARD_OTHER); +    return 0; + fail: +    qemu_vfree(new_l1_table); +    qcow2_free_clusters(bs, 
 new_l1_table_offset, new_l1_size2, +                        QCOW2_DISCARD_OTHER); +    return ret; +} + +/* + * l2_load + * + * Loads an L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns 0 on success, or a negative errno if the L2 table could not be + * read from the image file. + */ + +static int l2_load(BlockDriverState *bs, uint64_t l2_offset, +    uint64_t **l2_table) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); + +    return ret; +} + +/* + * Writes one sector of the L1 table to the disk (can't update single entries + * and we really don't want bdrv_pread to perform a read-modify-write) + */ +#define L1_ENTRIES_PER_SECTOR (512 / 8) +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 }; +    int l1_start_index; +    int i, ret; + +    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); +    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size; +         i++) +    { +        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); +    } + +    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, +            s->l1_table_offset + 8 * l1_start_index, sizeof(buf)); +    if (ret < 0) { +        return ret; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); +    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, +        buf, sizeof(buf)); +    if (ret < 0) { +        return ret; +    } + +    return 0; +} + +/* + * l2_allocate + * + * Allocate a new l2 entry in the file. If l1_index points to an already + * used entry in the L1 table (i.e. we are doing a copy on write for the L2 + * table), copy the contents of the old L2 table into the newly allocated one. + * Otherwise the new table is initialized with zeros. 
+ * + */ + +static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t old_l2_offset; +    uint64_t *l2_table = NULL; +    int64_t l2_offset; +    int ret; + +    old_l2_offset = s->l1_table[l1_index]; + +    trace_qcow2_l2_allocate(bs, l1_index); + +    /* allocate a new l2 entry */ + +    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); +    if (l2_offset < 0) { +        ret = l2_offset; +        goto fail; +    } + +    ret = qcow2_cache_flush(bs, s->refcount_block_cache); +    if (ret < 0) { +        goto fail; +    } + +    /* allocate a new entry in the l2 cache */ + +    trace_qcow2_l2_allocate_get_empty(bs, l1_index); +    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); +    if (ret < 0) { +        goto fail; +    } + +    l2_table = *table; + +    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { +        /* if there was no old l2 table, clear the new table */ +        memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); +    } else { +        uint64_t* old_table; + +        /* if there was an old l2 table, read it from the disk */ +        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); +        ret = qcow2_cache_get(bs, s->l2_table_cache, +            old_l2_offset & L1E_OFFSET_MASK, +            (void**) &old_table); +        if (ret < 0) { +            goto fail; +        } + +        memcpy(l2_table, old_table, s->cluster_size); + +        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table); +    } + +    /* write the l2 table to the file */ +    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); + +    trace_qcow2_l2_allocate_write_l2(bs, l1_index); +    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); +    ret = qcow2_cache_flush(bs, s->l2_table_cache); +    if (ret < 0) { +        goto fail; +    } + +    /* update the L1 entry */ +    trace_qcow2_l2_allocate_write_l1(bs, l1_index); +    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; +    ret = qcow2_write_l1_entry(bs, l1_index); +    if (ret < 0) { +        goto fail; +    } + +    *table = l2_table; +    trace_qcow2_l2_allocate_done(bs, l1_index, 0); +    return 0; + +fail: +    trace_qcow2_l2_allocate_done(bs, l1_index, ret); +    if (l2_table != NULL) { +        qcow2_cache_put(bs, s->l2_table_cache, (void**) table); +    } +    s->l1_table[l1_index] = old_l2_offset; +    if (l2_offset > 0) { +        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), +                            QCOW2_DISCARD_ALWAYS); +    } +    return ret; +} + +/* + * Checks how many clusters in a given L2 table are contiguous in the image + * file. As soon as one of the flags in the bitmask stop_flags changes compared + * to the first cluster, the search is stopped and the cluster is not counted + * as contiguous. 
(This allows it, for example, to stop at the first compressed + * cluster which may require a different handling) + */ +static int count_contiguous_clusters(int nb_clusters, int cluster_size, +        uint64_t *l2_table, uint64_t stop_flags) +{ +    int i; +    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; +    uint64_t first_entry = be64_to_cpu(l2_table[0]); +    uint64_t offset = first_entry & mask; + +    if (!offset) +        return 0; + +    assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED); + +    for (i = 0; i < nb_clusters; i++) { +        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; +        if (offset + (uint64_t) i * cluster_size != l2_entry) { +            break; +        } +    } + +	return i; +} + +static int count_contiguous_free_clusters(int nb_clusters, uint64_t *l2_table) +{ +    int i; + +    for (i = 0; i < nb_clusters; i++) { +        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); + +        if (type != QCOW2_CLUSTER_UNALLOCATED) { +            break; +        } +    } + +    return i; +} + +/* The crypt function is compatible with the linux cryptoloop +   algorithm for < 4 GB images. NOTE: out_buf == in_buf is +   supported */ +int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, +                          uint8_t *out_buf, const uint8_t *in_buf, +                          int nb_sectors, bool enc, +                          Error **errp) +{ +    union { +        uint64_t ll[2]; +        uint8_t b[16]; +    } ivec; +    int i; +    int ret; + +    for(i = 0; i < nb_sectors; i++) { +        ivec.ll[0] = cpu_to_le64(sector_num); +        ivec.ll[1] = 0; +        if (qcrypto_cipher_setiv(s->cipher, +                                 ivec.b, G_N_ELEMENTS(ivec.b), +                                 errp) < 0) { +            return -1; +        } +        if (enc) { +            ret = qcrypto_cipher_encrypt(s->cipher, +                                         in_buf, +                                         out_buf, +                                         512, +                                         errp); +        } else { +            ret = qcrypto_cipher_decrypt(s->cipher, +                                         in_buf, +                                         out_buf, +                                         512, +                                         errp); +        } +        if (ret < 0) { +            return -1; +        } +        sector_num++; +        in_buf += 512; +        out_buf += 512; +    } +    return 0; +} + +static int coroutine_fn copy_sectors(BlockDriverState *bs, +                                     uint64_t start_sect, +                                     uint64_t cluster_offset, +                                     int n_start, int n_end) +{ +    BDRVQcowState *s = bs->opaque; +    QEMUIOVector qiov; +    struct iovec iov; +    int n, ret; + +    n = n_end - n_start; +    if (n <= 0) { +        return 0; +    } + +    iov.iov_len = n * BDRV_SECTOR_SIZE; +    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); +    if (iov.iov_base == NULL) { +        return -ENOMEM; +    } + +    qemu_iovec_init_external(&qiov, &iov, 1); + +    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + +    if (!bs->drv) { +        ret = -ENOMEDIUM; +        goto out; +    } + +    /* Call .bdrv_co_readv() directly instead of using the public block-layer +     * interface.  
 This avoids double I/O throttling and request tracking, +     * which can lead to deadlock when block layer copy-on-read is enabled. +     */ +    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); +    if (ret < 0) { +        goto out; +    } + +    if (bs->encrypted) { +        Error *err = NULL; +        assert(s->cipher); +        if (qcow2_encrypt_sectors(s, start_sect + n_start, +                                  iov.iov_base, iov.iov_base, n, +                                  true, &err) < 0) { +            ret = -EIO; +            error_free(err); +            goto out; +        } +    } + +    ret = qcow2_pre_write_overlap_check(bs, 0, +            cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE); +    if (ret < 0) { +        goto out; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); +    ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); +    if (ret < 0) { +        goto out; +    } + +    ret = 0; +out: +    qemu_vfree(iov.iov_base); +    return ret; +} + + +/* + * get_cluster_offset + * + * For a given offset of the disk image, find the cluster offset in + * qcow2 file. The offset is stored in *cluster_offset. + * + * on entry, *num is the number of contiguous sectors we'd like to + * access following offset. + * + * on exit, *num is the number of contiguous sectors we can read. + * + * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error + * cases. + */ +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, +    int *num, uint64_t *cluster_offset) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int l2_index; +    uint64_t l1_index, l2_offset, *l2_table; +    int l1_bits, c; +    unsigned int index_in_cluster, nb_clusters; +    uint64_t nb_available, nb_needed; +    int ret; + +    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); +    nb_needed = *num + index_in_cluster; + +    l1_bits = s->l2_bits + s->cluster_bits; + +    /* compute how many bytes there are between the offset and +     * the end of the l1 entry +     */ + +    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); + +    /* compute the number of available sectors */ + +    nb_available = (nb_available >> 9) + index_in_cluster; + +    if (nb_needed > nb_available) { +        nb_needed = nb_available; +    } +    assert(nb_needed <= INT_MAX); + +    *cluster_offset = 0; + +    /* seek the l2 offset in the l1 table */ + +    l1_index = offset >> l1_bits; +    if (l1_index >= s->l1_size) { +        ret = QCOW2_CLUSTER_UNALLOCATED; +        goto out; +    } + +    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; +    if (!l2_offset) { +        ret = QCOW2_CLUSTER_UNALLOCATED; +        goto out; +    } + +    if (offset_into_cluster(s, l2_offset)) { +        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 +                                " unaligned (L1 index: %#" PRIx64 ")", +                                l2_offset, l1_index); +        return -EIO; +    } + +    /* load the l2 table in memory */ + +    ret = l2_load(bs, l2_offset, &l2_table); +    if (ret < 0) { +        return ret; +    } + +    /* find the cluster offset for the given disk offset */ + +    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); +    *cluster_offset = be64_to_cpu(l2_table[l2_index]); + +    /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */ +    nb_clusters = size_to_clusters(s, nb_needed << 9); + +    ret = qcow2_get_cluster_type(*cluster_offset); +    
switch (ret) { +    case QCOW2_CLUSTER_COMPRESSED: +        /* Compressed clusters can only be processed one by one */ +        c = 1; +        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; +        break; +    case QCOW2_CLUSTER_ZERO: +        if (s->qcow_version < 3) { +            qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found" +                                    " in pre-v3 image (L2 offset: %#" PRIx64 +                                    ", L2 index: %#x)", l2_offset, l2_index); +            ret = -EIO; +            goto fail; +        } +        c = count_contiguous_clusters(nb_clusters, s->cluster_size, +                &l2_table[l2_index], QCOW_OFLAG_ZERO); +        *cluster_offset = 0; +        break; +    case QCOW2_CLUSTER_UNALLOCATED: +        /* how many empty clusters ? */ +        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); +        *cluster_offset = 0; +        break; +    case QCOW2_CLUSTER_NORMAL: +        /* how many allocated clusters ? */ +        c = count_contiguous_clusters(nb_clusters, s->cluster_size, +                &l2_table[l2_index], QCOW_OFLAG_ZERO); +        *cluster_offset &= L2E_OFFSET_MASK; +        if (offset_into_cluster(s, *cluster_offset)) { +            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#" +                                    PRIx64 " unaligned (L2 offset: %#" PRIx64 +                                    ", L2 index: %#x)", *cluster_offset, +                                    l2_offset, l2_index); +            ret = -EIO; +            goto fail; +        } +        break; +    default: +        abort(); +    } + +    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + +    nb_available = (c * s->cluster_sectors); + +out: +    if (nb_available > nb_needed) +        nb_available = nb_needed; + +    *num = nb_available - index_in_cluster; + +    return ret; + +fail: +    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); +    return ret; +} + +/* + * get_cluster_table + * + * for a given disk offset, load (and allocate if needed) + * the l2 table. + * + * the l2 table offset in the qcow2 file and the cluster index + * in the l2 table are given to the caller. 
 + * + * Returns 0 on success, -errno in failure case + */ +static int get_cluster_table(BlockDriverState *bs, uint64_t offset, +                             uint64_t **new_l2_table, +                             int *new_l2_index) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int l2_index; +    uint64_t l1_index, l2_offset; +    uint64_t *l2_table = NULL; +    int ret; + +    /* seek the l2 offset in the l1 table */ + +    l1_index = offset >> (s->l2_bits + s->cluster_bits); +    if (l1_index >= s->l1_size) { +        ret = qcow2_grow_l1_table(bs, l1_index + 1, false); +        if (ret < 0) { +            return ret; +        } +    } + +    assert(l1_index < s->l1_size); +    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; +    if (offset_into_cluster(s, l2_offset)) { +        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 +                                " unaligned (L1 index: %#" PRIx64 ")", +                                l2_offset, l1_index); +        return -EIO; +    } + +    /* seek the l2 table of the given l2 offset */ + +    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { +        /* load the l2 table in memory */ +        ret = l2_load(bs, l2_offset, &l2_table); +        if (ret < 0) { +            return ret; +        } +    } else { +        /* First allocate a new L2 table (and do COW if needed) */ +        ret = l2_allocate(bs, l1_index, &l2_table); +        if (ret < 0) { +            return ret; +        } + +        /* Then decrease the refcount of the old table */ +        if (l2_offset) { +            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), +                                QCOW2_DISCARD_OTHER); +        } +    } + +    /* find the cluster offset for the given disk offset */ + +    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + +    *new_l2_table = l2_table; +    *new_l2_index = l2_index; + +    return 0; +} + +/* + * alloc_compressed_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * If the offset is not found, allocate a new compressed cluster. + * + * Return the cluster offset if successful, + * Return 0, otherwise. + * + */ + +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, +                                               uint64_t offset, +                                               int compressed_size) +{ +    BDRVQcowState *s = bs->opaque; +    int l2_index, ret; +    uint64_t *l2_table; +    int64_t cluster_offset; +    int nb_csectors; + +    ret = get_cluster_table(bs, offset, &l2_table, &l2_index); +    if (ret < 0) { +        return 0; +    } + +    /* Compression can't overwrite anything. Fail if the cluster was already +     * allocated. 
*/ +    cluster_offset = be64_to_cpu(l2_table[l2_index]); +    if (cluster_offset & L2E_OFFSET_MASK) { +        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +        return 0; +    } + +    cluster_offset = qcow2_alloc_bytes(bs, compressed_size); +    if (cluster_offset < 0) { +        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +        return 0; +    } + +    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - +                  (cluster_offset >> 9); + +    cluster_offset |= QCOW_OFLAG_COMPRESSED | +                      ((uint64_t)nb_csectors << s->csize_shift); + +    /* update L2 table */ + +    /* compressed clusters never have the copied flag */ + +    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); +    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); +    l2_table[l2_index] = cpu_to_be64(cluster_offset); +    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + +    return cluster_offset; +} + +static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    if (r->nb_sectors == 0) { +        return 0; +    } + +    qemu_co_mutex_unlock(&s->lock); +    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, +                       r->offset / BDRV_SECTOR_SIZE, +                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); +    qemu_co_mutex_lock(&s->lock); + +    if (ret < 0) { +        return ret; +    } + +    /* +     * Before we update the L2 table to actually point to the new cluster, we +     * need to be sure that the refcounts have been increased and COW was +     * handled. +     */ +    qcow2_cache_depends_on_flush(s->l2_table_cache); + +    return 0; +} + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) +{ +    BDRVQcowState *s = bs->opaque; +    int i, j = 0, l2_index, ret; +    uint64_t *old_cluster, *l2_table; +    uint64_t cluster_offset = m->alloc_offset; + +    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); +    assert(m->nb_clusters > 0); + +    old_cluster = g_try_new(uint64_t, m->nb_clusters); +    if (old_cluster == NULL) { +        ret = -ENOMEM; +        goto err; +    } + +    /* copy content of unmodified sectors */ +    ret = perform_cow(bs, m, &m->cow_start); +    if (ret < 0) { +        goto err; +    } + +    ret = perform_cow(bs, m, &m->cow_end); +    if (ret < 0) { +        goto err; +    } + +    /* Update L2 table. */ +    if (s->use_lazy_refcounts) { +        qcow2_mark_dirty(bs); +    } +    if (qcow2_need_accurate_refcounts(s)) { +        qcow2_cache_set_dependency(bs, s->l2_table_cache, +                                   s->refcount_block_cache); +    } + +    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); +    if (ret < 0) { +        goto err; +    } +    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); + +    assert(l2_index + m->nb_clusters <= s->l2_size); +    for (i = 0; i < m->nb_clusters; i++) { +        /* if two concurrent writes happen to the same unallocated cluster +	 * each write allocates separate cluster and writes data concurrently. +	 * The first one to complete updates l2 table with pointer to its +	 * cluster the second one has to do RMW (which is done above by +	 * copy_sectors()), update l2 table with its cluster pointer and free +	 * old cluster. 
This is what this loop does */ +        if(l2_table[l2_index + i] != 0) +            old_cluster[j++] = l2_table[l2_index + i]; + +        l2_table[l2_index + i] = cpu_to_be64((cluster_offset + +                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); +     } + + +    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + +    /* +     * If this was a COW, we need to decrease the refcount of the old cluster. +     * Also flush bs->file to get the right order for L2 and refcount update. +     * +     * Don't discard clusters that reach a refcount of 0 (e.g. compressed +     * clusters), the next write will reuse them anyway. +     */ +    if (j != 0) { +        for (i = 0; i < j; i++) { +            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, +                                    QCOW2_DISCARD_NEVER); +        } +    } + +    ret = 0; +err: +    g_free(old_cluster); +    return ret; + } + +/* + * Returns the number of contiguous clusters that can be used for an allocating + * write, but require COW to be performed (this includes yet unallocated space, + * which must copy from the backing file) + */ +static int count_cow_clusters(BDRVQcowState *s, int nb_clusters, +    uint64_t *l2_table, int l2_index) +{ +    int i; + +    for (i = 0; i < nb_clusters; i++) { +        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); +        int cluster_type = qcow2_get_cluster_type(l2_entry); + +        switch(cluster_type) { +        case QCOW2_CLUSTER_NORMAL: +            if (l2_entry & QCOW_OFLAG_COPIED) { +                goto out; +            } +            break; +        case QCOW2_CLUSTER_UNALLOCATED: +        case QCOW2_CLUSTER_COMPRESSED: +        case QCOW2_CLUSTER_ZERO: +            break; +        default: +            abort(); +        } +    } + +out: +    assert(i <= nb_clusters); +    return i; +} + +/* + * Check if there already is an AIO write request in flight which allocates + * the same cluster. In this case we need to wait until the previous + * request has completed and updated the L2 table accordingly. + * + * Returns: + *   0       if there was no dependency. *cur_bytes indicates the number of + *           bytes from guest_offset that can be read before the next + *           dependency must be processed (or the request is complete) + * + *   -EAGAIN if we had to wait for another request, previously gathered + *           information on cluster allocation may be invalid now. The caller + *           must start over anyway, so consider *cur_bytes undefined. + */ +static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, +    uint64_t *cur_bytes, QCowL2Meta **m) +{ +    BDRVQcowState *s = bs->opaque; +    QCowL2Meta *old_alloc; +    uint64_t bytes = *cur_bytes; + +    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { + +        uint64_t start = guest_offset; +        uint64_t end = start + bytes; +        uint64_t old_start = l2meta_cow_start(old_alloc); +        uint64_t old_end = l2meta_cow_end(old_alloc); + +        if (end <= old_start || start >= old_end) { +            /* No intersection */ +        } else { +            if (start < old_start) { +                /* Stop at the start of a running allocation */ +                bytes = old_start - start; +            } else { +                bytes = 0; +            } + +            /* Stop if already an l2meta exists. 
After yielding, it wouldn't +             * be valid any more, so we'd have to clean up the old L2Metas +             * and deal with requests depending on them before starting to +             * gather new ones. Not worth the trouble. */ +            if (bytes == 0 && *m) { +                *cur_bytes = 0; +                return 0; +            } + +            if (bytes == 0) { +                /* Wait for the dependency to complete. We need to recheck +                 * the free/allocated clusters when we continue. */ +                qemu_co_mutex_unlock(&s->lock); +                qemu_co_queue_wait(&old_alloc->dependent_requests); +                qemu_co_mutex_lock(&s->lock); +                return -EAGAIN; +            } +        } +    } + +    /* Make sure that existing clusters and new allocations are only used up to +     * the next dependency if we shortened the request above */ +    *cur_bytes = bytes; + +    return 0; +} + +/* + * Checks how many already allocated clusters that don't require a copy on + * write there are at the given guest_offset (up to *bytes). If + * *host_offset is not zero, only physically contiguous clusters beginning at + * this host offset are counted. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + *   0:     if no allocated clusters are available at the given offset. + *          *bytes is normally unchanged. It is set to 0 if the cluster + *          is allocated and doesn't need COW, but doesn't have the right + *          physical offset. + * + *   1:     if allocated clusters that don't require a COW are available at + *          the requested offset. *bytes may have decreased and describes + *          the length of the area that can be written to. + * + *  -errno: in error cases + */ +static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, +    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ +    BDRVQcowState *s = bs->opaque; +    int l2_index; +    uint64_t cluster_offset; +    uint64_t *l2_table; +    uint64_t nb_clusters; +    unsigned int keep_clusters; +    int ret; + +    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, +                              *bytes); + +    assert(*host_offset == 0 ||    offset_into_cluster(s, guest_offset) +                                == offset_into_cluster(s, *host_offset)); + +    /* +     * Calculate the number of clusters to look for. We stop at L2 table +     * boundaries to keep things simple. 
+     */ +    nb_clusters = +        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + +    l2_index = offset_to_l2_index(s, guest_offset); +    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); +    assert(nb_clusters <= INT_MAX); + +    /* Find L2 entry for the first involved cluster */ +    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); +    if (ret < 0) { +        return ret; +    } + +    cluster_offset = be64_to_cpu(l2_table[l2_index]); + +    /* Check how many clusters are already allocated and don't need COW */ +    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL +        && (cluster_offset & QCOW_OFLAG_COPIED)) +    { +        /* If a specific host_offset is required, check it */ +        bool offset_matches = +            (cluster_offset & L2E_OFFSET_MASK) == *host_offset; + +        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) { +            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " +                                    "%#llx unaligned (guest offset: %#" PRIx64 +                                    ")", cluster_offset & L2E_OFFSET_MASK, +                                    guest_offset); +            ret = -EIO; +            goto out; +        } + +        if (*host_offset != 0 && !offset_matches) { +            *bytes = 0; +            ret = 0; +            goto out; +        } + +        /* We keep all QCOW_OFLAG_COPIED clusters */ +        keep_clusters = +            count_contiguous_clusters(nb_clusters, s->cluster_size, +                                      &l2_table[l2_index], +                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); +        assert(keep_clusters <= nb_clusters); + +        *bytes = MIN(*bytes, +                 keep_clusters * s->cluster_size +                 - offset_into_cluster(s, guest_offset)); + +        ret = 1; +    } else { +        ret = 0; +    } + +    /* Cleanup */ +out: +    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + +    /* Only return a host offset if we actually made progress. Otherwise we +     * would make requirements for handle_alloc() that it can't fulfill */ +    if (ret > 0) { +        *host_offset = (cluster_offset & L2E_OFFSET_MASK) +                     + offset_into_cluster(s, guest_offset); +    } + +    return ret; +} + +/* + * Allocates new clusters for the given guest_offset. + * + * At most *nb_clusters are allocated, and on return *nb_clusters is updated to + * contain the number of clusters that have been allocated and are contiguous + * in the image file. + * + * If *host_offset is non-zero, it specifies the offset in the image file at + * which the new clusters must start. *nb_clusters can be 0 on return in this + * case if the cluster at host_offset is already in use. If *host_offset is + * zero, the clusters can be allocated anywhere in the image file. + * + * *host_offset is updated to contain the offset into the image file at which + * the first allocated cluster starts. + * + * Return 0 on success and -errno in error cases. -EAGAIN means that the + * function has been waiting for another request and the allocation must be + * restarted, but the whole request should not be failed. 
+ */ +static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, +                                   uint64_t *host_offset, uint64_t *nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; + +    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, +                                         *host_offset, *nb_clusters); + +    /* Allocate new clusters */ +    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); +    if (*host_offset == 0) { +        int64_t cluster_offset = +            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); +        if (cluster_offset < 0) { +            return cluster_offset; +        } +        *host_offset = cluster_offset; +        return 0; +    } else { +        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); +        if (ret < 0) { +            return ret; +        } +        *nb_clusters = ret; +        return 0; +    } +} + +/* + * Allocates new clusters for an area that either is yet unallocated or needs a + * copy on write. If *host_offset is non-zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + *   0:     if no clusters could be allocated. *bytes is set to 0, + *          *host_offset is left unchanged. + * + *   1:     if new clusters were allocated. *bytes may be decreased if the + *          new allocation doesn't cover all of the requested area. + *          *host_offset is updated to contain the host offset of the first + *          newly allocated cluster. + * + *  -errno: in error cases + */ +static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, +    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ +    BDRVQcowState *s = bs->opaque; +    int l2_index; +    uint64_t *l2_table; +    uint64_t entry; +    uint64_t nb_clusters; +    int ret; + +    uint64_t alloc_cluster_offset; + +    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, +                             *bytes); +    assert(*bytes > 0); + +    /* +     * Calculate the number of clusters to look for. We stop at L2 table +     * boundaries to keep things simple. +     */ +    nb_clusters = +        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + +    l2_index = offset_to_l2_index(s, guest_offset); +    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); +    assert(nb_clusters <= INT_MAX); + +    /* Find L2 entry for the first involved cluster */ +    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); +    if (ret < 0) { +        return ret; +    } + +    entry = be64_to_cpu(l2_table[l2_index]); + +    /* For the moment, overwrite compressed clusters one by one */ +    if (entry & QCOW_OFLAG_COMPRESSED) { +        nb_clusters = 1; +    } else { +        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); +    } + +    /* This function is only called when there were no non-COW clusters, so if +     * we can't find any unallocated or COW clusters either, something is +     * wrong with our code. 
*/ +    assert(nb_clusters > 0); + +    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + +    /* Allocate, if necessary at a given offset in the image file */ +    alloc_cluster_offset = start_of_cluster(s, *host_offset); +    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, +                                  &nb_clusters); +    if (ret < 0) { +        goto fail; +    } + +    /* Can't extend contiguous allocation */ +    if (nb_clusters == 0) { +        *bytes = 0; +        return 0; +    } + +    /* !*host_offset would overwrite the image header and is reserved for "no +     * host offset preferred". If 0 was a valid host offset, it'd trigger the +     * following overlap check; do that now to avoid having an invalid value in +     * *host_offset. */ +    if (!alloc_cluster_offset) { +        ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset, +                                            nb_clusters * s->cluster_size); +        assert(ret < 0); +        goto fail; +    } + +    /* +     * Save info needed for meta data update. +     * +     * requested_sectors: Number of sectors from the start of the first +     * newly allocated cluster to the end of the (possibly shortened +     * before) write request. +     * +     * avail_sectors: Number of sectors from the start of the first +     * newly allocated to the end of the last newly allocated cluster. +     * +     * nb_sectors: The number of sectors from the start of the first +     * newly allocated cluster to the end of the area that the write +     * request actually writes to (excluding COW at the end) +     */ +    int requested_sectors = +        (*bytes + offset_into_cluster(s, guest_offset)) +        >> BDRV_SECTOR_BITS; +    int avail_sectors = nb_clusters +                        << (s->cluster_bits - BDRV_SECTOR_BITS); +    int alloc_n_start = offset_into_cluster(s, guest_offset) +                        >> BDRV_SECTOR_BITS; +    int nb_sectors = MIN(requested_sectors, avail_sectors); +    QCowL2Meta *old_m = *m; + +    *m = g_malloc0(sizeof(**m)); + +    **m = (QCowL2Meta) { +        .next           = old_m, + +        .alloc_offset   = alloc_cluster_offset, +        .offset         = start_of_cluster(s, guest_offset), +        .nb_clusters    = nb_clusters, +        .nb_available   = nb_sectors, + +        .cow_start = { +            .offset     = 0, +            .nb_sectors = alloc_n_start, +        }, +        .cow_end = { +            .offset     = nb_sectors * BDRV_SECTOR_SIZE, +            .nb_sectors = avail_sectors - nb_sectors, +        }, +    }; +    qemu_co_queue_init(&(*m)->dependent_requests); +    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); + +    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); +    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) +                         - offset_into_cluster(s, guest_offset)); +    assert(*bytes != 0); + +    return 1; + +fail: +    if (*m && (*m)->nb_clusters > 0) { +        QLIST_REMOVE(*m, next_in_flight); +    } +    return ret; +} + +/* + * alloc_cluster_offset + * + * For a given offset on the virtual disk, find the cluster offset in qcow2 + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster was already allocated, m->nb_clusters is set to 0 and + * other fields in m are meaningless. + * + * If the cluster is newly allocated, m->nb_clusters is set to the number of + * contiguous clusters that have been allocated. 
 In this case, the other + * fields of m are valid and contain information about the first allocated + * cluster. + * + * If the request conflicts with another write request in flight, the coroutine + * is queued and will be reentered when the dependency has completed. + * + * Return 0 on success and -errno in error cases + */ +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, +    int *num, uint64_t *host_offset, QCowL2Meta **m) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t start, remaining; +    uint64_t cluster_offset; +    uint64_t cur_bytes; +    int ret; + +    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num); + +    assert((offset & ~BDRV_SECTOR_MASK) == 0); + +again: +    start = offset; +    remaining = (uint64_t)*num << BDRV_SECTOR_BITS; +    cluster_offset = 0; +    *host_offset = 0; +    cur_bytes = 0; +    *m = NULL; + +    while (true) { + +        if (!*host_offset) { +            *host_offset = start_of_cluster(s, cluster_offset); +        } + +        assert(remaining >= cur_bytes); + +        start           += cur_bytes; +        remaining       -= cur_bytes; +        cluster_offset  += cur_bytes; + +        if (remaining == 0) { +            break; +        } + +        cur_bytes = remaining; + +        /* +         * Now start gathering as many contiguous clusters as possible: +         * +         * 1. Check for overlaps with in-flight allocations +         * +         *      a) Overlap not in the first cluster -> shorten this request and +         *         let the caller handle the rest in its next loop iteration. +         * +         *      b) Real overlaps of two requests. Yield and restart the search +         *         for contiguous clusters (the situation could have changed +         *         while we were sleeping) +         * +         *      c) TODO: Request starts in the same cluster as the in-flight +         *         allocation ends. Shorten the COW of the in-flight allocation, +         *         set cluster_offset to write to the same cluster and set up +         *         the right synchronisation between the in-flight request and +         *         the new one. +         */ +        ret = handle_dependencies(bs, start, &cur_bytes, m); +        if (ret == -EAGAIN) { +            /* Currently handle_dependencies() doesn't yield if we already had +             * an allocation. If it did, we would have to clean up the L2Meta +             * structs before starting over. */ +            assert(*m == NULL); +            goto again; +        } else if (ret < 0) { +            return ret; +        } else if (cur_bytes == 0) { +            break; +        } else { +            /* handle_dependencies() may have decreased cur_bytes (shortened +             * the allocations below) so that the next dependency is processed +             * correctly during the next loop iteration. */ +        } + +        /* +         * 2. Count contiguous COPIED clusters. +         */ +        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); +        if (ret < 0) { +            return ret; +        } else if (ret) { +            continue; +        } else if (cur_bytes == 0) { +            break; +        } + +        /* +         * 3. If the request still hasn't completed, allocate new clusters, +         *    considering any cluster_offset of steps 1c or 2. 
+         */ +        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); +        if (ret < 0) { +            return ret; +        } else if (ret) { +            continue; +        } else { +            assert(cur_bytes == 0); +            break; +        } +    } + +    *num -= remaining >> BDRV_SECTOR_BITS; +    assert(*num > 0); +    assert(*host_offset != 0); + +    return 0; +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, +                             const uint8_t *buf, int buf_size) +{ +    z_stream strm1, *strm = &strm1; +    int ret, out_len; + +    memset(strm, 0, sizeof(*strm)); + +    strm->next_in = (uint8_t *)buf; +    strm->avail_in = buf_size; +    strm->next_out = out_buf; +    strm->avail_out = out_buf_size; + +    ret = inflateInit2(strm, -12); +    if (ret != Z_OK) +        return -1; +    ret = inflate(strm, Z_FINISH); +    out_len = strm->next_out - out_buf; +    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || +        out_len != out_buf_size) { +        inflateEnd(strm); +        return -1; +    } +    inflateEnd(strm); +    return 0; +} + +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ +    BDRVQcowState *s = bs->opaque; +    int ret, csize, nb_csectors, sector_offset; +    uint64_t coffset; + +    coffset = cluster_offset & s->cluster_offset_mask; +    if (s->cluster_cache_offset != coffset) { +        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; +        sector_offset = coffset & 511; +        csize = nb_csectors * 512 - sector_offset; +        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); +        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); +        if (ret < 0) { +            return ret; +        } +        if (decompress_buffer(s->cluster_cache, s->cluster_size, +                              s->cluster_data + sector_offset, csize) < 0) { +            return -EIO; +        } +        s->cluster_cache_offset = coffset; +    } +    return 0; +} + +/* + * This discards as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of discarded + * clusters. + */ +static int discard_single_l2(BlockDriverState *bs, uint64_t offset, +                             uint64_t nb_clusters, enum qcow2_discard_type type, +                             bool full_discard) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l2_table; +    int l2_index; +    int ret; +    int i; + +    ret = get_cluster_table(bs, offset, &l2_table, &l2_index); +    if (ret < 0) { +        return ret; +    } + +    /* Limit nb_clusters to one L2 table */ +    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); +    assert(nb_clusters <= INT_MAX); + +    for (i = 0; i < nb_clusters; i++) { +        uint64_t old_l2_entry; + +        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]); + +        /* +         * If full_discard is false, make sure that a discarded area reads back +         * as zeroes for v3 images (we cannot do it for v2 without actually +         * writing a zero-filled buffer). We can skip the operation if the +         * cluster is already marked as zero, or if it's unallocated and we +         * don't have a backing file. +         * +         * TODO We might want to use bdrv_get_block_status(bs) here, but we're +         * holding s->lock, so that doesn't work today. 
+         * +         * If full_discard is true, the sector should not read back as zeroes, +         * but rather fall through to the backing file. +         */ +        switch (qcow2_get_cluster_type(old_l2_entry)) { +            case QCOW2_CLUSTER_UNALLOCATED: +                if (full_discard || !bs->backing_hd) { +                    continue; +                } +                break; + +            case QCOW2_CLUSTER_ZERO: +                if (!full_discard) { +                    continue; +                } +                break; + +            case QCOW2_CLUSTER_NORMAL: +            case QCOW2_CLUSTER_COMPRESSED: +                break; + +            default: +                abort(); +        } + +        /* First remove L2 entries */ +        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); +        if (!full_discard && s->qcow_version >= 3) { +            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); +        } else { +            l2_table[l2_index + i] = cpu_to_be64(0); +        } + +        /* Then decrease the refcount */ +        qcow2_free_any_clusters(bs, old_l2_entry, 1, type); +    } + +    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + +    return nb_clusters; +} + +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, +    int nb_sectors, enum qcow2_discard_type type, bool full_discard) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t end_offset; +    uint64_t nb_clusters; +    int ret; + +    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); + +    /* Round start up and end down */ +    offset = align_offset(offset, s->cluster_size); +    end_offset = start_of_cluster(s, end_offset); + +    if (offset > end_offset) { +        return 0; +    } + +    nb_clusters = size_to_clusters(s, end_offset - offset); + +    s->cache_discards = true; + +    /* Each L2 table is handled by its own loop iteration */ +    while (nb_clusters > 0) { +        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard); +        if (ret < 0) { +            goto fail; +        } + +        nb_clusters -= ret; +        offset += (ret * s->cluster_size); +    } + +    ret = 0; +fail: +    s->cache_discards = false; +    qcow2_process_discards(bs, ret); + +    return ret; +} + +/* + * This zeroes as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of zeroed + * clusters. 
+ */ +static int zero_single_l2(BlockDriverState *bs, uint64_t offset, +                          uint64_t nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l2_table; +    int l2_index; +    int ret; +    int i; + +    ret = get_cluster_table(bs, offset, &l2_table, &l2_index); +    if (ret < 0) { +        return ret; +    } + +    /* Limit nb_clusters to one L2 table */ +    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); +    assert(nb_clusters <= INT_MAX); + +    for (i = 0; i < nb_clusters; i++) { +        uint64_t old_offset; + +        old_offset = be64_to_cpu(l2_table[l2_index + i]); + +        /* Update L2 entries */ +        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); +        if (old_offset & QCOW_OFLAG_COMPRESSED) { +            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); +            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); +        } else { +            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); +        } +    } + +    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + +    return nb_clusters; +} + +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t nb_clusters; +    int ret; + +    /* The zero flag is only supported by version 3 and newer */ +    if (s->qcow_version < 3) { +        return -ENOTSUP; +    } + +    /* Each L2 table is handled by its own loop iteration */ +    nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); + +    s->cache_discards = true; + +    while (nb_clusters > 0) { +        ret = zero_single_l2(bs, offset, nb_clusters); +        if (ret < 0) { +            goto fail; +        } + +        nb_clusters -= ret; +        offset += (ret * s->cluster_size); +    } + +    ret = 0; +fail: +    s->cache_discards = false; +    qcow2_process_discards(bs, ret); + +    return ret; +} + +/* + * Expands all zero clusters in a specific L1 table (or deallocates them, for + * non-backed non-pre-allocated zero clusters). + * + * l1_entries and *visited_l1_entries are used to keep track of progress for + * status_cb(). l1_entries contains the total number of L1 entries and + * *visited_l1_entries counts all visited L1 entries. 
+ */ +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, +                                      int l1_size, int64_t *visited_l1_entries, +                                      int64_t l1_entries, +                                      BlockDriverAmendStatusCB *status_cb) +{ +    BDRVQcowState *s = bs->opaque; +    bool is_active_l1 = (l1_table == s->l1_table); +    uint64_t *l2_table = NULL; +    int ret; +    int i, j; + +    if (!is_active_l1) { +        /* inactive L2 tables require a buffer to be stored in when loading +         * them from disk */ +        l2_table = qemu_try_blockalign(bs->file, s->cluster_size); +        if (l2_table == NULL) { +            return -ENOMEM; +        } +    } + +    for (i = 0; i < l1_size; i++) { +        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; +        bool l2_dirty = false; +        uint64_t l2_refcount; + +        if (!l2_offset) { +            /* unallocated */ +            (*visited_l1_entries)++; +            if (status_cb) { +                status_cb(bs, *visited_l1_entries, l1_entries); +            } +            continue; +        } + +        if (offset_into_cluster(s, l2_offset)) { +            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" +                                    PRIx64 " unaligned (L1 index: %#x)", +                                    l2_offset, i); +            ret = -EIO; +            goto fail; +        } + +        if (is_active_l1) { +            /* get active L2 tables from cache */ +            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, +                    (void **)&l2_table); +        } else { +            /* load inactive L2 tables from disk */ +            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE, +                    (void *)l2_table, s->cluster_sectors); +        } +        if (ret < 0) { +            goto fail; +        } + +        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, +                                 &l2_refcount); +        if (ret < 0) { +            goto fail; +        } + +        for (j = 0; j < s->l2_size; j++) { +            uint64_t l2_entry = be64_to_cpu(l2_table[j]); +            int64_t offset = l2_entry & L2E_OFFSET_MASK; +            int cluster_type = qcow2_get_cluster_type(l2_entry); +            bool preallocated = offset != 0; + +            if (cluster_type != QCOW2_CLUSTER_ZERO) { +                continue; +            } + +            if (!preallocated) { +                if (!bs->backing_hd) { +                    /* not backed; therefore we can simply deallocate the +                     * cluster */ +                    l2_table[j] = 0; +                    l2_dirty = true; +                    continue; +                } + +                offset = qcow2_alloc_clusters(bs, s->cluster_size); +                if (offset < 0) { +                    ret = offset; +                    goto fail; +                } + +                if (l2_refcount > 1) { +                    /* For shared L2 tables, set the refcount accordingly (it is +                     * already 1 and needs to be l2_refcount) */ +                    ret = qcow2_update_cluster_refcount(bs, +                            offset >> s->cluster_bits, +                            refcount_diff(1, l2_refcount), false, +                            QCOW2_DISCARD_OTHER); +                    if (ret < 0) { +                        qcow2_free_clusters(bs, offset, s->cluster_size, +                                            
QCOW2_DISCARD_OTHER); +                        goto fail; +                    } +                } +            } + +            if (offset_into_cluster(s, offset)) { +                qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " +                                        "%#" PRIx64 " unaligned (L2 offset: %#" +                                        PRIx64 ", L2 index: %#x)", offset, +                                        l2_offset, j); +                if (!preallocated) { +                    qcow2_free_clusters(bs, offset, s->cluster_size, +                                        QCOW2_DISCARD_ALWAYS); +                } +                ret = -EIO; +                goto fail; +            } + +            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); +            if (ret < 0) { +                if (!preallocated) { +                    qcow2_free_clusters(bs, offset, s->cluster_size, +                                        QCOW2_DISCARD_ALWAYS); +                } +                goto fail; +            } + +            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE, +                                    s->cluster_sectors, 0); +            if (ret < 0) { +                if (!preallocated) { +                    qcow2_free_clusters(bs, offset, s->cluster_size, +                                        QCOW2_DISCARD_ALWAYS); +                } +                goto fail; +            } + +            if (l2_refcount == 1) { +                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); +            } else { +                l2_table[j] = cpu_to_be64(offset); +            } +            l2_dirty = true; +        } + +        if (is_active_l1) { +            if (l2_dirty) { +                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); +                qcow2_cache_depends_on_flush(s->l2_table_cache); +            } +            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); +        } else { +            if (l2_dirty) { +                ret = qcow2_pre_write_overlap_check(bs, +                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset, +                        s->cluster_size); +                if (ret < 0) { +                    goto fail; +                } + +                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE, +                        (void *)l2_table, s->cluster_sectors); +                if (ret < 0) { +                    goto fail; +                } +            } +        } + +        (*visited_l1_entries)++; +        if (status_cb) { +            status_cb(bs, *visited_l1_entries, l1_entries); +        } +    } + +    ret = 0; + +fail: +    if (l2_table) { +        if (!is_active_l1) { +            qemu_vfree(l2_table); +        } else { +            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); +        } +    } +    return ret; +} + +/* + * For backed images, expands all zero clusters on the image. For non-backed + * images, deallocates all non-pre-allocated zero clusters (and claims the + * allocation for pre-allocated ones). This is important for downgrading to a + * qcow2 version which doesn't yet support metadata zero clusters. 
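Stripped of the I/O and cache handling, the per-entry logic of expand_zero_clusters_in_l1 above is a small decision: a zero cluster with no host offset and no backing file can become plain unallocated, any other zero cluster must end up backed by a real cluster of literal zeroes, and OFLAG_COPIED may only be set when the L2 table is referenced exactly once. A compact sketch of just that decision (host-endian values, a hypothetical alloc callback standing in for qcow2_alloc_clusters, no zero-writing or refcount bookkeeping):

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

#define OFLAG_COPIED (1ULL << 63)
#define OFFSET_MASK  0x00fffffffffffe00ULL   /* L2E_OFFSET_MASK-style mask */

/* What should a zero-flagged L2 entry become once the zero flag must go away?
 * 'alloc' stands in for qcow2_alloc_clusters(); writing the zeroes and the
 * refcount bookkeeping are left out of this sketch. */
static uint64_t expand_zero_entry(uint64_t entry, bool has_backing,
                                  uint64_t l2_refcount,
                                  uint64_t (*alloc)(void))
{
    uint64_t offset = entry & OFFSET_MASK;

    if (offset == 0) {
        if (!has_backing) {
            return 0;                  /* not backed: plain unallocated entry */
        }
        offset = alloc();              /* backed: needs a real zeroed cluster */
    }
    /* COPIED is only valid if exactly one L1 entry references this table */
    return l2_refcount == 1 ? (offset | OFLAG_COPIED) : offset;
}

static uint64_t fake_alloc(void) { return 0x80000; }

int main(void)
{
    printf("%#" PRIx64 "\n", expand_zero_entry(0x1, false, 1, fake_alloc));
    printf("%#" PRIx64 "\n", expand_zero_entry(0x1, true, 2, fake_alloc));
    return 0;
}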
+ */ +int qcow2_expand_zero_clusters(BlockDriverState *bs, +                               BlockDriverAmendStatusCB *status_cb) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l1_table = NULL; +    int64_t l1_entries = 0, visited_l1_entries = 0; +    int ret; +    int i, j; + +    if (status_cb) { +        l1_entries = s->l1_size; +        for (i = 0; i < s->nb_snapshots; i++) { +            l1_entries += s->snapshots[i].l1_size; +        } +    } + +    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, +                                     &visited_l1_entries, l1_entries, +                                     status_cb); +    if (ret < 0) { +        goto fail; +    } + +    /* Inactive L1 tables may point to active L2 tables - therefore it is +     * necessary to flush the L2 table cache before trying to access the L2 +     * tables pointed to by inactive L1 entries (else we might try to expand +     * zero clusters that have already been expanded); furthermore, it is also +     * necessary to empty the L2 table cache, since it may contain tables which +     * are now going to be modified directly on disk, bypassing the cache. +     * qcow2_cache_empty() does both for us. */ +    ret = qcow2_cache_empty(bs, s->l2_table_cache); +    if (ret < 0) { +        goto fail; +    } + +    for (i = 0; i < s->nb_snapshots; i++) { +        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) + +                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE; + +        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); + +        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset / +                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors); +        if (ret < 0) { +            goto fail; +        } + +        for (j = 0; j < s->snapshots[i].l1_size; j++) { +            be64_to_cpus(&l1_table[j]); +        } + +        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, +                                         &visited_l1_entries, l1_entries, +                                         status_cb); +        if (ret < 0) { +            goto fail; +        } +    } + +    ret = 0; + +fail: +    g_free(l1_table); +    return ret; +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c new file mode 100644 index 00000000..0b6c302e --- /dev/null +++ b/block/qcow2-refcount.c @@ -0,0 +1,2457 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "qemu/range.h" + +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, +                            int64_t offset, int64_t length, uint64_t addend, +                            bool decrease, enum qcow2_discard_type type); + +static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index); + +static void set_refcount_ro0(void *refcount_array, uint64_t index, +                             uint64_t value); +static void set_refcount_ro1(void *refcount_array, uint64_t index, +                             uint64_t value); +static void set_refcount_ro2(void *refcount_array, uint64_t index, +                             uint64_t value); +static void set_refcount_ro3(void *refcount_array, uint64_t index, +                             uint64_t value); +static void set_refcount_ro4(void *refcount_array, uint64_t index, +                             uint64_t value); +static void set_refcount_ro5(void *refcount_array, uint64_t index, +                             uint64_t value); +static void set_refcount_ro6(void *refcount_array, uint64_t index, +                             uint64_t value); + + +static Qcow2GetRefcountFunc *const get_refcount_funcs[] = { +    &get_refcount_ro0, +    &get_refcount_ro1, +    &get_refcount_ro2, +    &get_refcount_ro3, +    &get_refcount_ro4, +    &get_refcount_ro5, +    &get_refcount_ro6 +}; + +static Qcow2SetRefcountFunc *const set_refcount_funcs[] = { +    &set_refcount_ro0, +    &set_refcount_ro1, +    &set_refcount_ro2, +    &set_refcount_ro3, +    &set_refcount_ro4, +    &set_refcount_ro5, +    &set_refcount_ro6 +}; + + +/*********************************************************/ +/* refcount handling */ + +int qcow2_refcount_init(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int refcount_table_size2, i; +    int ret; + +    assert(s->refcount_order >= 0 && s->refcount_order <= 6); + +    s->get_refcount = get_refcount_funcs[s->refcount_order]; +    s->set_refcount = set_refcount_funcs[s->refcount_order]; + +    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); +    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); +    s->refcount_table = g_try_malloc(refcount_table_size2); + +    if (s->refcount_table_size > 0) { +        if (s->refcount_table == NULL) { +            ret = -ENOMEM; +            goto fail; +        } +        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); +        ret = bdrv_pread(bs->file, s->refcount_table_offset, +                         s->refcount_table, refcount_table_size2); +        if (ret < 0) { +            goto fail; +        } +        
for(i = 0; i < s->refcount_table_size; i++) +            be64_to_cpus(&s->refcount_table[i]); +    } +    return 0; + fail: +    return ret; +} + +void qcow2_refcount_close(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    g_free(s->refcount_table); +} + + +static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index) +{ +    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1; +} + +static void set_refcount_ro0(void *refcount_array, uint64_t index, +                             uint64_t value) +{ +    assert(!(value >> 1)); +    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8)); +    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8); +} + +static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index) +{ +    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4))) +           & 0x3; +} + +static void set_refcount_ro1(void *refcount_array, uint64_t index, +                             uint64_t value) +{ +    assert(!(value >> 2)); +    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4))); +    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4)); +} + +static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index) +{ +    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2))) +           & 0xf; +} + +static void set_refcount_ro2(void *refcount_array, uint64_t index, +                             uint64_t value) +{ +    assert(!(value >> 4)); +    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2))); +    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2)); +} + +static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index) +{ +    return ((const uint8_t *)refcount_array)[index]; +} + +static void set_refcount_ro3(void *refcount_array, uint64_t index, +                             uint64_t value) +{ +    assert(!(value >> 8)); +    ((uint8_t *)refcount_array)[index] = value; +} + +static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index) +{ +    return be16_to_cpu(((const uint16_t *)refcount_array)[index]); +} + +static void set_refcount_ro4(void *refcount_array, uint64_t index, +                             uint64_t value) +{ +    assert(!(value >> 16)); +    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value); +} + +static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index) +{ +    return be32_to_cpu(((const uint32_t *)refcount_array)[index]); +} + +static void set_refcount_ro5(void *refcount_array, uint64_t index, +                             uint64_t value) +{ +    assert(!(value >> 32)); +    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value); +} + +static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index) +{ +    return be64_to_cpu(((const uint64_t *)refcount_array)[index]); +} + +static void set_refcount_ro6(void *refcount_array, uint64_t index, +                             uint64_t value) +{ +    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value); +} + + +static int load_refcount_block(BlockDriverState *bs, +                               int64_t refcount_block_offset, +                               void **refcount_block) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); +    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, +        refcount_block); + +    return ret; +} + +/* + * Retrieves the refcount 
of the cluster given by its index and stores it in + * *refcount. Returns 0 on success and -errno on failure. + */ +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, +                       uint64_t *refcount) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t refcount_table_index, block_index; +    int64_t refcount_block_offset; +    int ret; +    void *refcount_block; + +    refcount_table_index = cluster_index >> s->refcount_block_bits; +    if (refcount_table_index >= s->refcount_table_size) { +        *refcount = 0; +        return 0; +    } +    refcount_block_offset = +        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; +    if (!refcount_block_offset) { +        *refcount = 0; +        return 0; +    } + +    if (offset_into_cluster(s, refcount_block_offset)) { +        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64 +                                " unaligned (reftable index: %#" PRIx64 ")", +                                refcount_block_offset, refcount_table_index); +        return -EIO; +    } + +    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, +                          &refcount_block); +    if (ret < 0) { +        return ret; +    } + +    block_index = cluster_index & (s->refcount_block_size - 1); +    *refcount = s->get_refcount(refcount_block, block_index); + +    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); + +    return 0; +} + +/* + * Rounds the refcount table size up to avoid growing the table for each single + * refcount block that is allocated. + */ +static unsigned int next_refcount_table_size(BDRVQcowState *s, +    unsigned int min_size) +{ +    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; +    unsigned int refcount_table_clusters = +        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); + +    while (min_clusters > refcount_table_clusters) { +        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; +    } + +    return refcount_table_clusters << (s->cluster_bits - 3); +} + + +/* Checks if two offsets are described by the same refcount block */ +static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, +    uint64_t offset_b) +{ +    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits); +    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits); + +    return (block_a == block_b); +} + +/* + * Loads a refcount block. If it doesn't exist yet, it is allocated first + * (including growing the refcount table if needed). 
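The fixed-width accessor pairs earlier in this file (get/set_refcount_ro0 through ro6) exist because refcounts narrower than a byte have to be packed several to a byte. For refcount_order 0..3 the pattern generalizes: one entry is (1 << order) bits wide and (8 >> order) entries share a byte. A standalone sketch of a generic accessor equivalent to the sub-byte cases (illustrative only; QEMU keeps the unrolled per-order functions, and orders 4..6 additionally need big-endian conversion):

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

/* Generic accessors for refcount_order 0..3 (1-, 2-, 4- and 8-bit refcounts
 * packed into a plain byte array). */
static uint64_t get_refcount_small(const uint8_t *a, unsigned order, uint64_t i)
{
    unsigned bits = 1u << order;          /* width of one refcount */
    unsigned per_byte = 8u >> order;      /* refcounts per byte */
    unsigned shift = bits * (i % per_byte);
    return (a[i / per_byte] >> shift) & ((1u << bits) - 1);
}

static void set_refcount_small(uint8_t *a, unsigned order, uint64_t i,
                               uint64_t value)
{
    unsigned bits = 1u << order;
    unsigned per_byte = 8u >> order;
    unsigned shift = bits * (i % per_byte);
    uint8_t mask = ((1u << bits) - 1) << shift;

    assert(value < (1u << bits));         /* mirrors the asserts above */
    a[i / per_byte] = (a[i / per_byte] & ~mask) | (value << shift);
}

int main(void)
{
    uint8_t block[8] = { 0 };
    set_refcount_small(block, 1, 5, 3);   /* order 1: 2-bit refcounts */
    printf("%" PRIu64 "\n", get_refcount_small(block, 1, 5));
    return 0;
}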
+ * + * Returns 0 on success or -errno in error case + */ +static int alloc_refcount_block(BlockDriverState *bs, +                                int64_t cluster_index, void **refcount_block) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int refcount_table_index; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + +    /* Find the refcount block for the given cluster */ +    refcount_table_index = cluster_index >> s->refcount_block_bits; + +    if (refcount_table_index < s->refcount_table_size) { + +        uint64_t refcount_block_offset = +            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; + +        /* If it's already there, we're done */ +        if (refcount_block_offset) { +            if (offset_into_cluster(s, refcount_block_offset)) { +                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" +                                        PRIx64 " unaligned (reftable index: " +                                        "%#x)", refcount_block_offset, +                                        refcount_table_index); +                return -EIO; +            } + +             return load_refcount_block(bs, refcount_block_offset, +                                        refcount_block); +        } +    } + +    /* +     * If we came here, we need to allocate something. Something is at least +     * a cluster for the new refcount block. It may also include a new refcount +     * table if the old refcount table is too small. +     * +     * Note that allocating clusters here needs some special care: +     * +     * - We can't use the normal qcow2_alloc_clusters(), it would try to +     *   increase the refcount and very likely we would end up with an endless +     *   recursion. Instead we must place the refcount blocks in a way that +     *   they can describe them themselves. +     * +     * - We need to consider that at this point we are inside update_refcounts +     *   and potentially doing an initial refcount increase. This means that +     *   some clusters have already been allocated by the caller, but their +     *   refcount isn't accurate yet. If we allocate clusters for metadata, we +     *   need to return -EAGAIN to signal the caller that it needs to restart +     *   the search for free clusters. 
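The "describe themselves" case mentioned in the comment above is plain index arithmetic: one refcount block covers refcount_block_size consecutive clusters, so if the freshly allocated block's own cluster index falls into the range it is being created for, its refcount entry lives inside the block itself and can be set directly, without any recursive allocation. A small sketch of that check (the cluster and block sizes are made-up example values; QEMU derives them from the image header):

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    unsigned cluster_bits = 16;                       /* 64 KiB clusters */
    unsigned refcount_block_bits = cluster_bits - 1;  /* e.g. 16-bit refcounts */
    uint64_t refcount_block_size = 1ULL << refcount_block_bits;

    uint64_t cluster_index = 1000;         /* cluster we want to reference */
    uint64_t new_block = 0x4230000;        /* offset of the new refcount block */
    uint64_t block_cluster = new_block >> cluster_bits;

    /* Same test as in_same_refcount_block(): do both clusters fall into the
     * range covered by a single refcount block? */
    bool self_describing =
        (block_cluster >> refcount_block_bits) ==
        (cluster_index >> refcount_block_bits);

    if (self_describing) {
        /* The block's own refcount entry is inside the block itself */
        uint64_t own_index = block_cluster & (refcount_block_size - 1);
        printf("set entry %" PRIu64 " of the new block to 1\n", own_index);
    } else {
        /* Otherwise update_refcount() handles it; it can recurse at most
         * twice before reaching a block that does describe itself. */
        printf("fall back to update_refcount()\n");
    }
    return 0;
}

In every other case the code relies on update_refcount(), and the later -EAGAIN return tells the caller to redo its own search for free clusters because the new metadata may have taken the ones it had picked.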
+     * +     * - alloc_clusters_noref and qcow2_free_clusters may load a different +     *   refcount block into the cache +     */ + +    *refcount_block = NULL; + +    /* We write to the refcount table, so we might depend on L2 tables */ +    ret = qcow2_cache_flush(bs, s->l2_table_cache); +    if (ret < 0) { +        return ret; +    } + +    /* Allocate the refcount block itself and mark it as used */ +    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); +    if (new_block < 0) { +        return new_block; +    } + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 +        " at %" PRIx64 "\n", +        refcount_table_index, cluster_index << s->cluster_bits, new_block); +#endif + +    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { +        /* Zero the new refcount block before updating it */ +        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, +                                    refcount_block); +        if (ret < 0) { +            goto fail_block; +        } + +        memset(*refcount_block, 0, s->cluster_size); + +        /* The block describes itself, need to update the cache */ +        int block_index = (new_block >> s->cluster_bits) & +            (s->refcount_block_size - 1); +        s->set_refcount(*refcount_block, block_index, 1); +    } else { +        /* Described somewhere else. This can recurse at most twice before we +         * arrive at a block that describes itself. */ +        ret = update_refcount(bs, new_block, s->cluster_size, 1, false, +                              QCOW2_DISCARD_NEVER); +        if (ret < 0) { +            goto fail_block; +        } + +        ret = qcow2_cache_flush(bs, s->refcount_block_cache); +        if (ret < 0) { +            goto fail_block; +        } + +        /* Initialize the new refcount block only after updating its refcount, +         * update_refcount uses the refcount cache itself */ +        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, +                                    refcount_block); +        if (ret < 0) { +            goto fail_block; +        } + +        memset(*refcount_block, 0, s->cluster_size); +    } + +    /* Now the new refcount block needs to be written to disk */ +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); +    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block); +    ret = qcow2_cache_flush(bs, s->refcount_block_cache); +    if (ret < 0) { +        goto fail_block; +    } + +    /* If the refcount table is big enough, just hook the block up there */ +    if (refcount_table_index < s->refcount_table_size) { +        uint64_t data64 = cpu_to_be64(new_block); +        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); +        ret = bdrv_pwrite_sync(bs->file, +            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), +            &data64, sizeof(data64)); +        if (ret < 0) { +            goto fail_block; +        } + +        s->refcount_table[refcount_table_index] = new_block; + +        /* The new refcount block may be where the caller intended to put its +         * data, so let it restart the search. */ +        return -EAGAIN; +    } + +    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); + +    /* +     * If we come here, we need to grow the refcount table. Again, a new +     * refcount table needs some space and we can't simply allocate to avoid +     * endless recursion. 
+     * +     * Therefore let's grab new refcount blocks at the end of the image, which +     * will describe themselves and the new refcount table. This way we can +     * reference them only in the new table and do the switch to the new +     * refcount table at once without producing an inconsistent state in +     * between. +     */ +    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); + +    /* Calculate the number of refcount blocks needed so far; this will be the +     * basis for calculating the index of the first cluster used for the +     * self-describing refcount structures which we are about to create. +     * +     * Because we reached this point, there cannot be any refcount entries for +     * cluster_index or higher indices yet. However, because new_block has been +     * allocated to describe that cluster (and it will assume this role later +     * on), we cannot use that index; also, new_block may actually have a higher +     * cluster index than cluster_index, so it needs to be taken into account +     * here (and 1 needs to be added to its value because that cluster is used). +     */ +    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1, +                                            (new_block >> s->cluster_bits) + 1), +                                        s->refcount_block_size); + +    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { +        return -EFBIG; +    } + +    /* And now we need at least one block more for the new metadata */ +    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); +    uint64_t last_table_size; +    uint64_t blocks_clusters; +    do { +        uint64_t table_clusters = +            size_to_clusters(s, table_size * sizeof(uint64_t)); +        blocks_clusters = 1 + +            ((table_clusters + s->refcount_block_size - 1) +            / s->refcount_block_size); +        uint64_t meta_clusters = table_clusters + blocks_clusters; + +        last_table_size = table_size; +        table_size = next_refcount_table_size(s, blocks_used + +            ((meta_clusters + s->refcount_block_size - 1) +            / s->refcount_block_size)); + +    } while (last_table_size != table_size); + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", +        s->refcount_table_size, table_size); +#endif + +    /* Create the new refcount table and blocks */ +    uint64_t meta_offset = (blocks_used * s->refcount_block_size) * +        s->cluster_size; +    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; +    uint64_t *new_table = g_try_new0(uint64_t, table_size); +    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size); + +    assert(table_size > 0 && blocks_clusters > 0); +    if (new_table == NULL || new_blocks == NULL) { +        ret = -ENOMEM; +        goto fail_table; +    } + +    /* Fill the new refcount table */ +    memcpy(new_table, s->refcount_table, +        s->refcount_table_size * sizeof(uint64_t)); +    new_table[refcount_table_index] = new_block; + +    int i; +    for (i = 0; i < blocks_clusters; i++) { +        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); +    } + +    /* Fill the refcount blocks */ +    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); +    int block = 0; +    for (i = 0; i < table_clusters + blocks_clusters; i++) { +        s->set_refcount(new_blocks, block++, 1); +    } + +    /* Write refcount blocks to disk */ +    BLKDBG_EVENT(bs->file, 
BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); +    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, +        blocks_clusters * s->cluster_size); +    g_free(new_blocks); +    new_blocks = NULL; +    if (ret < 0) { +        goto fail_table; +    } + +    /* Write refcount table to disk */ +    for(i = 0; i < table_size; i++) { +        cpu_to_be64s(&new_table[i]); +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); +    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, +        table_size * sizeof(uint64_t)); +    if (ret < 0) { +        goto fail_table; +    } + +    for(i = 0; i < table_size; i++) { +        be64_to_cpus(&new_table[i]); +    } + +    /* Hook up the new refcount table in the qcow2 header */ +    uint8_t data[12]; +    cpu_to_be64w((uint64_t*)data, table_offset); +    cpu_to_be32w((uint32_t*)(data + 8), table_clusters); +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), +        data, sizeof(data)); +    if (ret < 0) { +        goto fail_table; +    } + +    /* And switch it in memory */ +    uint64_t old_table_offset = s->refcount_table_offset; +    uint64_t old_table_size = s->refcount_table_size; + +    g_free(s->refcount_table); +    s->refcount_table = new_table; +    s->refcount_table_size = table_size; +    s->refcount_table_offset = table_offset; + +    /* Free old table. */ +    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), +                        QCOW2_DISCARD_OTHER); + +    ret = load_refcount_block(bs, new_block, refcount_block); +    if (ret < 0) { +        return ret; +    } + +    /* If we were trying to do the initial refcount update for some cluster +     * allocation, we might have used the same clusters to store newly +     * allocated metadata. Make the caller search some new space. */ +    return -EAGAIN; + +fail_table: +    g_free(new_blocks); +    g_free(new_table); +fail_block: +    if (*refcount_block != NULL) { +        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); +    } +    return ret; +} + +void qcow2_process_discards(BlockDriverState *bs, int ret) +{ +    BDRVQcowState *s = bs->opaque; +    Qcow2DiscardRegion *d, *next; + +    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { +        QTAILQ_REMOVE(&s->discards, d, next); + +        /* Discard is optional, ignore the return value */ +        if (ret >= 0) { +            bdrv_discard(bs->file, +                         d->offset >> BDRV_SECTOR_BITS, +                         d->bytes >> BDRV_SECTOR_BITS); +        } + +        g_free(d); +    } +} + +static void update_refcount_discard(BlockDriverState *bs, +                                    uint64_t offset, uint64_t length) +{ +    BDRVQcowState *s = bs->opaque; +    Qcow2DiscardRegion *d, *p, *next; + +    QTAILQ_FOREACH(d, &s->discards, next) { +        uint64_t new_start = MIN(offset, d->offset); +        uint64_t new_end = MAX(offset + length, d->offset + d->bytes); + +        if (new_end - new_start <= length + d->bytes) { +            /* There can't be any overlap, areas ending up here have no +             * references any more and therefore shouldn't get freed another +             * time. 
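update_refcount_discard above keeps the pending-discard list small by growing an existing region whenever the newly freed range touches it, and by coalescing regions that become adjacent as a result. A toy array-based version of the same bookkeeping (no QTAILQ, and only one neighbour is folded per call to keep the sketch short):

#include <inttypes.h>
#include <stdio.h>

/* Toy discard queue: freed ranges never overlap, so a new range either
 * touches an existing region (and extends it) or becomes a new region. */
typedef struct { uint64_t offset, bytes; } Region;

static unsigned add_discard(Region *r, unsigned n, uint64_t offset, uint64_t bytes)
{
    unsigned i, j;

    for (i = 0; i < n; i++) {
        uint64_t start = offset < r[i].offset ? offset : r[i].offset;
        uint64_t end1 = offset + bytes, end2 = r[i].offset + r[i].bytes;
        uint64_t end = end1 > end2 ? end1 : end2;

        if (end - start <= bytes + r[i].bytes) {     /* adjacent (no overlap) */
            r[i].offset = start;
            r[i].bytes = end - start;
            break;
        }
    }
    if (i == n) {                                    /* nothing adjacent: append */
        r[n].offset = offset;
        r[n].bytes = bytes;
        return n + 1;
    }

    /* The grown region may now touch another one; fold one neighbour in
     * (the real code walks the whole list). */
    for (j = 0; j < n; j++) {
        if (j != i && (r[j].offset == r[i].offset + r[i].bytes ||
                       r[i].offset == r[j].offset + r[j].bytes)) {
            r[i].offset = r[i].offset < r[j].offset ? r[i].offset : r[j].offset;
            r[i].bytes += r[j].bytes;
            r[j] = r[--n];                           /* remove r[j] */
            break;
        }
    }
    return n;
}

int main(void)
{
    Region r[8];
    unsigned n = 0;
    n = add_discard(r, n, 0x20000, 0x10000);
    n = add_discard(r, n, 0x40000, 0x10000);
    n = add_discard(r, n, 0x30000, 0x10000);   /* bridges the two regions */
    printf("%u region(s), first: %#" PRIx64 "+%#" PRIx64 "\n",
           n, r[0].offset, r[0].bytes);
    return 0;
}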
*/ +            assert(d->bytes + length == new_end - new_start); +            d->offset = new_start; +            d->bytes = new_end - new_start; +            goto found; +        } +    } + +    d = g_malloc(sizeof(*d)); +    *d = (Qcow2DiscardRegion) { +        .bs     = bs, +        .offset = offset, +        .bytes  = length, +    }; +    QTAILQ_INSERT_TAIL(&s->discards, d, next); + +found: +    /* Merge discard requests if they are adjacent now */ +    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { +        if (p == d +            || p->offset > d->offset + d->bytes +            || d->offset > p->offset + p->bytes) +        { +            continue; +        } + +        /* Still no overlap possible */ +        assert(p->offset == d->offset + d->bytes +            || d->offset == p->offset + p->bytes); + +        QTAILQ_REMOVE(&s->discards, p, next); +        d->offset = MIN(d->offset, p->offset); +        d->bytes += p->bytes; +        g_free(p); +    } +} + +/* XXX: cache several refcount block clusters ? */ +/* @addend is the absolute value of the addend; if @decrease is set, @addend + * will be subtracted from the current refcount, otherwise it will be added */ +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, +                                                   int64_t offset, +                                                   int64_t length, +                                                   uint64_t addend, +                                                   bool decrease, +                                                   enum qcow2_discard_type type) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t start, last, cluster_offset; +    void *refcount_block = NULL; +    int64_t old_table_index = -1; +    int ret; + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 +            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "", +            addend); +#endif +    if (length < 0) { +        return -EINVAL; +    } else if (length == 0) { +        return 0; +    } + +    if (decrease) { +        qcow2_cache_set_dependency(bs, s->refcount_block_cache, +            s->l2_table_cache); +    } + +    start = start_of_cluster(s, offset); +    last = start_of_cluster(s, offset + length - 1); +    for(cluster_offset = start; cluster_offset <= last; +        cluster_offset += s->cluster_size) +    { +        int block_index; +        uint64_t refcount; +        int64_t cluster_index = cluster_offset >> s->cluster_bits; +        int64_t table_index = cluster_index >> s->refcount_block_bits; + +        /* Load the refcount block and allocate it if needed */ +        if (table_index != old_table_index) { +            if (refcount_block) { +                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); +            } +            ret = alloc_refcount_block(bs, cluster_index, &refcount_block); +            if (ret < 0) { +                goto fail; +            } +        } +        old_table_index = table_index; + +        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, +                                     refcount_block); + +        /* we can update the count and save it */ +        block_index = cluster_index & (s->refcount_block_size - 1); + +        refcount = s->get_refcount(refcount_block, block_index); +        if (decrease ? 
(refcount - addend > refcount) +                     : (refcount + addend < refcount || +                        refcount + addend > s->refcount_max)) +        { +            ret = -EINVAL; +            goto fail; +        } +        if (decrease) { +            refcount -= addend; +        } else { +            refcount += addend; +        } +        if (refcount == 0 && cluster_index < s->free_cluster_index) { +            s->free_cluster_index = cluster_index; +        } +        s->set_refcount(refcount_block, block_index, refcount); + +        if (refcount == 0 && s->discard_passthrough[type]) { +            update_refcount_discard(bs, cluster_offset, s->cluster_size); +        } +    } + +    ret = 0; +fail: +    if (!s->cache_discards) { +        qcow2_process_discards(bs, ret); +    } + +    /* Write last changed block to disk */ +    if (refcount_block) { +        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); +    } + +    /* +     * Try do undo any updates if an error is returned (This may succeed in +     * some cases like ENOSPC for allocating a new refcount block) +     */ +    if (ret < 0) { +        int dummy; +        dummy = update_refcount(bs, offset, cluster_offset - offset, addend, +                                !decrease, QCOW2_DISCARD_NEVER); +        (void)dummy; +    } + +    return ret; +} + +/* + * Increases or decreases the refcount of a given cluster. + * + * @addend is the absolute value of the addend; if @decrease is set, @addend + * will be subtracted from the current refcount, otherwise it will be added. + * + * On success 0 is returned; on failure -errno is returned. + */ +int qcow2_update_cluster_refcount(BlockDriverState *bs, +                                  int64_t cluster_index, +                                  uint64_t addend, bool decrease, +                                  enum qcow2_discard_type type) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, +                          decrease, type); +    if (ret < 0) { +        return ret; +    } + +    return 0; +} + + + +/*********************************************************/ +/* cluster allocation functions */ + + + +/* return < 0 if error */ +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t i, nb_clusters, refcount; +    int ret; + +    /* We can't allocate clusters if they may still be queued for discard. 
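The validity test near the top of update_refcount above leans on unsigned wrap-around: when decreasing, refcount - addend can only compare greater than refcount if the subtraction wrapped below zero, and when increasing, refcount + addend can only compare smaller if it wrapped past 2^64; the result must also still fit the refcount width. A minimal illustration of that guard (the 16-bit refcount_max is just an example value):

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

/* Returns false if applying the change would underflow, overflow, or exceed
 * the refcount width, the same conditions update_refcount() rejects. */
static bool refcount_change_ok(uint64_t refcount, uint64_t addend,
                               bool decrease, uint64_t refcount_max)
{
    if (decrease) {
        return refcount - addend <= refcount;      /* no wrap below zero */
    }
    return refcount + addend >= refcount &&        /* no wrap past 2^64 */
           refcount + addend <= refcount_max;      /* fits the entry width */
}

int main(void)
{
    uint64_t refcount_max = 0xffff;                /* e.g. 16-bit refcounts */
    printf("%d\n", refcount_change_ok(0, 1, true, refcount_max));       /* 0 */
    printf("%d\n", refcount_change_ok(0xffff, 1, false, refcount_max)); /* 0 */
    printf("%d\n", refcount_change_ok(1, 1, true, refcount_max));       /* 1 */
    return 0;
}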
*/ +    if (s->cache_discards) { +        qcow2_process_discards(bs, 0); +    } + +    nb_clusters = size_to_clusters(s, size); +retry: +    for(i = 0; i < nb_clusters; i++) { +        uint64_t next_cluster_index = s->free_cluster_index++; +        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount); + +        if (ret < 0) { +            return ret; +        } else if (refcount != 0) { +            goto retry; +        } +    } + +    /* Make sure that all offsets in the "allocated" range are representable +     * in an int64_t */ +    if (s->free_cluster_index > 0 && +        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) +    { +        return -EFBIG; +    } + +#ifdef DEBUG_ALLOC2 +    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", +            size, +            (s->free_cluster_index - nb_clusters) << s->cluster_bits); +#endif +    return (s->free_cluster_index - nb_clusters) << s->cluster_bits; +} + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) +{ +    int64_t offset; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); +    do { +        offset = alloc_clusters_noref(bs, size); +        if (offset < 0) { +            return offset; +        } + +        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); +    } while (ret == -EAGAIN); + +    if (ret < 0) { +        return ret; +    } + +    return offset; +} + +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, +                                int64_t nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t cluster_index, refcount; +    uint64_t i; +    int ret; + +    assert(nb_clusters >= 0); +    if (nb_clusters == 0) { +        return 0; +    } + +    do { +        /* Check how many clusters there are free */ +        cluster_index = offset >> s->cluster_bits; +        for(i = 0; i < nb_clusters; i++) { +            ret = qcow2_get_refcount(bs, cluster_index++, &refcount); +            if (ret < 0) { +                return ret; +            } else if (refcount != 0) { +                break; +            } +        } + +        /* And then allocate them */ +        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false, +                              QCOW2_DISCARD_NEVER); +    } while (ret == -EAGAIN); + +    if (ret < 0) { +        return ret; +    } + +    return i; +} + +/* only used to allocate compressed sectors. We try to allocate +   contiguous sectors. 
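The retry label in alloc_clusters_noref above implements a forward-only, first-fit search: any allocated cluster inside the candidate window resets the run, but the scan position never moves backwards, so the function returns the first run of nb_clusters consecutive free clusters at or after s->free_cluster_index. A sketch of the same search over a plain in-memory refcount array (no on-disk refcount blocks, no error paths):

#include <inttypes.h>
#include <stdio.h>

/* Find nb consecutive clusters with refcount 0, scanning forward from
 * *free_cluster_index the way alloc_clusters_noref() does: an allocated
 * cluster restarts the run, the scan position only moves forward.
 * Returns the first cluster index of the run. */
static uint64_t find_free_run(const uint16_t *refcounts, uint64_t nb_total,
                              uint64_t *free_cluster_index, uint64_t nb)
{
    uint64_t i = 0;

    while (i < nb) {
        uint64_t next = (*free_cluster_index)++;
        if (next >= nb_total) {
            return UINT64_MAX;            /* out of clusters in this sketch */
        }
        if (refcounts[next] != 0) {
            i = 0;                        /* run broken: start counting again */
        } else {
            i++;
        }
    }
    return *free_cluster_index - nb;
}

int main(void)
{
    uint16_t refcounts[] = { 1, 1, 0, 1, 0, 0, 0, 0 };
    uint64_t free_cluster_index = 0;
    uint64_t first = find_free_run(refcounts, 8, &free_cluster_index, 3);
    printf("run starts at cluster %" PRIu64 "\n", first);   /* prints 4 */
    return 0;
}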
size must be <= cluster_size */ +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t offset; +    size_t free_in_cluster; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); +    assert(size > 0 && size <= s->cluster_size); +    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset)); + +    offset = s->free_byte_offset; + +    if (offset) { +        uint64_t refcount; +        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount); +        if (ret < 0) { +            return ret; +        } + +        if (refcount == s->refcount_max) { +            offset = 0; +        } +    } + +    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); +    do { +        if (!offset || free_in_cluster < size) { +            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); +            if (new_cluster < 0) { +                return new_cluster; +            } + +            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { +                offset = new_cluster; +            } +        } + +        assert(offset); +        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); +    } while (ret == -EAGAIN); +    if (ret < 0) { +        return ret; +    } + +    /* The cluster refcount was incremented; refcount blocks must be flushed +     * before the caller's L2 table updates. */ +    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); + +    s->free_byte_offset = offset + size; +    if (!offset_into_cluster(s, s->free_byte_offset)) { +        s->free_byte_offset = 0; +    } + +    return offset; +} + +void qcow2_free_clusters(BlockDriverState *bs, +                          int64_t offset, int64_t size, +                          enum qcow2_discard_type type) +{ +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); +    ret = update_refcount(bs, offset, size, 1, true, type); +    if (ret < 0) { +        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); +        /* TODO Remember the clusters to free them later and avoid leaking */ +    } +} + +/* + * Free a cluster using its L2 entry (handles clusters of all types, e.g. + * normal cluster, compressed cluster, etc.) 
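qcow2_free_any_clusters just below has to recover the byte range occupied by a compressed cluster from the L2 entry alone: the sector count sits in the bits directly under the flag bits and the host offset in the low bits, with positions that depend on the cluster size. A standalone sketch of that decoding (field layout shown for 64 KiB clusters; the example entry is made up):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    unsigned cluster_bits = 16;                       /* 64 KiB clusters */
    unsigned csize_shift = 62 - (cluster_bits - 8);   /* 54 for this size */
    uint64_t csize_mask = (1ULL << (cluster_bits - 8)) - 1;
    uint64_t cluster_offset_mask = (1ULL << csize_shift) - 1;

    /* A made-up compressed L2 entry: flag bit 62, size field 5 (meaning six
     * 512-byte sectors in total), data starting at host offset 0x123400. */
    uint64_t l2_entry = (1ULL << 62) | ((uint64_t)5 << csize_shift) | 0x123400;

    uint64_t nb_csectors = ((l2_entry >> csize_shift) & csize_mask) + 1;
    uint64_t host_offset = (l2_entry & cluster_offset_mask) & ~511ULL;

    /* qcow2_free_any_clusters() frees exactly this byte range */
    printf("free %" PRIu64 " bytes at %#" PRIx64 "\n",
           nb_csectors * 512, host_offset);
    return 0;
}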
+ */ +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, +                             int nb_clusters, enum qcow2_discard_type type) +{ +    BDRVQcowState *s = bs->opaque; + +    switch (qcow2_get_cluster_type(l2_entry)) { +    case QCOW2_CLUSTER_COMPRESSED: +        { +            int nb_csectors; +            nb_csectors = ((l2_entry >> s->csize_shift) & +                           s->csize_mask) + 1; +            qcow2_free_clusters(bs, +                (l2_entry & s->cluster_offset_mask) & ~511, +                nb_csectors * 512, type); +        } +        break; +    case QCOW2_CLUSTER_NORMAL: +    case QCOW2_CLUSTER_ZERO: +        if (l2_entry & L2E_OFFSET_MASK) { +            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) { +                qcow2_signal_corruption(bs, false, -1, -1, +                                        "Cannot free unaligned cluster %#llx", +                                        l2_entry & L2E_OFFSET_MASK); +            } else { +                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, +                                    nb_clusters << s->cluster_bits, type); +            } +        } +        break; +    case QCOW2_CLUSTER_UNALLOCATED: +        break; +    default: +        abort(); +    } +} + + + +/*********************************************************/ +/* snapshots and image creation */ + + + +/* update the refcounts of snapshots and the copied flag */ +int qcow2_update_snapshot_refcount(BlockDriverState *bs, +    int64_t l1_table_offset, int l1_size, int addend) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount; +    bool l1_allocated = false; +    int64_t old_offset, old_l2_offset; +    int i, j, l1_modified = 0, nb_csectors; +    int ret; + +    assert(addend >= -1 && addend <= 1); + +    l2_table = NULL; +    l1_table = NULL; +    l1_size2 = l1_size * sizeof(uint64_t); + +    s->cache_discards = true; + +    /* WARNING: qcow2_snapshot_goto relies on this function not using the +     * l1_table_offset when it is the current s->l1_table_offset! Be careful +     * when changing this! 
*/ +    if (l1_table_offset != s->l1_table_offset) { +        l1_table = g_try_malloc0(align_offset(l1_size2, 512)); +        if (l1_size2 && l1_table == NULL) { +            ret = -ENOMEM; +            goto fail; +        } +        l1_allocated = true; + +        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); +        if (ret < 0) { +            goto fail; +        } + +        for(i = 0;i < l1_size; i++) +            be64_to_cpus(&l1_table[i]); +    } else { +        assert(l1_size == s->l1_size); +        l1_table = s->l1_table; +        l1_allocated = false; +    } + +    for(i = 0; i < l1_size; i++) { +        l2_offset = l1_table[i]; +        if (l2_offset) { +            old_l2_offset = l2_offset; +            l2_offset &= L1E_OFFSET_MASK; + +            if (offset_into_cluster(s, l2_offset)) { +                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" +                                        PRIx64 " unaligned (L1 index: %#x)", +                                        l2_offset, i); +                ret = -EIO; +                goto fail; +            } + +            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, +                (void**) &l2_table); +            if (ret < 0) { +                goto fail; +            } + +            for(j = 0; j < s->l2_size; j++) { +                uint64_t cluster_index; + +                offset = be64_to_cpu(l2_table[j]); +                old_offset = offset; +                offset &= ~QCOW_OFLAG_COPIED; + +                switch (qcow2_get_cluster_type(offset)) { +                    case QCOW2_CLUSTER_COMPRESSED: +                        nb_csectors = ((offset >> s->csize_shift) & +                                       s->csize_mask) + 1; +                        if (addend != 0) { +                            ret = update_refcount(bs, +                                (offset & s->cluster_offset_mask) & ~511, +                                nb_csectors * 512, abs(addend), addend < 0, +                                QCOW2_DISCARD_SNAPSHOT); +                            if (ret < 0) { +                                goto fail; +                            } +                        } +                        /* compressed clusters are never modified */ +                        refcount = 2; +                        break; + +                    case QCOW2_CLUSTER_NORMAL: +                    case QCOW2_CLUSTER_ZERO: +                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) { +                            qcow2_signal_corruption(bs, true, -1, -1, "Data " +                                                    "cluster offset %#llx " +                                                    "unaligned (L2 offset: %#" +                                                    PRIx64 ", L2 index: %#x)", +                                                    offset & L2E_OFFSET_MASK, +                                                    l2_offset, j); +                            ret = -EIO; +                            goto fail; +                        } + +                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; +                        if (!cluster_index) { +                            /* unallocated */ +                            refcount = 0; +                            break; +                        } +                        if (addend != 0) { +                            ret = qcow2_update_cluster_refcount(bs, +                                    
cluster_index, abs(addend), addend < 0, +                                    QCOW2_DISCARD_SNAPSHOT); +                            if (ret < 0) { +                                goto fail; +                            } +                        } + +                        ret = qcow2_get_refcount(bs, cluster_index, &refcount); +                        if (ret < 0) { +                            goto fail; +                        } +                        break; + +                    case QCOW2_CLUSTER_UNALLOCATED: +                        refcount = 0; +                        break; + +                    default: +                        abort(); +                } + +                if (refcount == 1) { +                    offset |= QCOW_OFLAG_COPIED; +                } +                if (offset != old_offset) { +                    if (addend > 0) { +                        qcow2_cache_set_dependency(bs, s->l2_table_cache, +                            s->refcount_block_cache); +                    } +                    l2_table[j] = cpu_to_be64(offset); +                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, +                                                 l2_table); +                } +            } + +            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + +            if (addend != 0) { +                ret = qcow2_update_cluster_refcount(bs, l2_offset >> +                                                        s->cluster_bits, +                                                    abs(addend), addend < 0, +                                                    QCOW2_DISCARD_SNAPSHOT); +                if (ret < 0) { +                    goto fail; +                } +            } +            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, +                                     &refcount); +            if (ret < 0) { +                goto fail; +            } else if (refcount == 1) { +                l2_offset |= QCOW_OFLAG_COPIED; +            } +            if (l2_offset != old_l2_offset) { +                l1_table[i] = l2_offset; +                l1_modified = 1; +            } +        } +    } + +    ret = bdrv_flush(bs); +fail: +    if (l2_table) { +        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); +    } + +    s->cache_discards = false; +    qcow2_process_discards(bs, ret); + +    /* Update L1 only if it isn't deleted anyway (addend = -1) */ +    if (ret == 0 && addend >= 0 && l1_modified) { +        for (i = 0; i < l1_size; i++) { +            cpu_to_be64s(&l1_table[i]); +        } + +        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); + +        for (i = 0; i < l1_size; i++) { +            be64_to_cpus(&l1_table[i]); +        } +    } +    if (l1_allocated) +        g_free(l1_table); +    return ret; +} + + + + +/*********************************************************/ +/* refcount checking functions */ + + +static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries) +{ +    /* This assertion holds because there is no way we can address more than +     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because +     * offsets have to be representable in bytes); due to every cluster +     * corresponding to one refcount entry, we are well below that limit */ +    assert(entries < (UINT64_C(1) << (64 - 9))); + +    /* Thanks to the assertion this will not overflow, because +     * s->refcount_order < 7. 
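refcount_array_byte_size is fixed-point arithmetic: entries << refcount_order is the total number of refcount bits, and dividing by eight with rounding up turns that into bytes. A quick standalone check of the formula across the supported refcount widths:

#include <inttypes.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Bytes needed to store 'entries' refcounts that are 2^order bits wide,
 * matching the computation in refcount_array_byte_size(). */
static uint64_t bytes_for(uint64_t entries, unsigned refcount_order)
{
    return DIV_ROUND_UP(entries << refcount_order, 8);
}

int main(void)
{
    for (unsigned order = 0; order <= 6; order++) {
        printf("order %u: 1000 entries -> %" PRIu64 " bytes\n",
               order, bytes_for(1000, order));
    }
    return 0;
}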
+     * (note: x << s->refcount_order == x * s->refcount_bits) */
+    return DIV_ROUND_UP(entries << s->refcount_order, 8);
+}
+
+/**
+ * Reallocates *array so that it can hold new_size entries. *size must contain
+ * the current number of entries in *array. If the reallocation fails, *array
+ * and *size will not be modified and -errno will be returned. If the
+ * reallocation is successful, *array will be set to the new buffer, *size
+ * will be set to new_size and 0 will be returned. The size of the reallocated
+ * refcount array buffer will be aligned to a cluster boundary, and the newly
+ * allocated area will be zeroed.
+ */
+static int realloc_refcount_array(BDRVQcowState *s, void **array,
+                                  int64_t *size, int64_t new_size)
+{
+    int64_t old_byte_size, new_byte_size;
+    void *new_ptr;
+
+    /* Round to clusters so the array can be directly written to disk */
+    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
+                    * s->cluster_size;
+    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
+                    * s->cluster_size;
+
+    if (new_byte_size == old_byte_size) {
+        *size = new_size;
+        return 0;
+    }
+
+    assert(new_byte_size > 0);
+
+    if (new_byte_size > SIZE_MAX) {
+        return -ENOMEM;
+    }
+
+    new_ptr = g_try_realloc(*array, new_byte_size);
+    if (!new_ptr) {
+        return -ENOMEM;
+    }
+
+    if (new_byte_size > old_byte_size) {
+        memset((char *)new_ptr + old_byte_size, 0,
+               new_byte_size - old_byte_size);
+    }
+
+    *array = new_ptr;
+    *size  = new_size;
+
+    return 0;
+}
+
+/*
+ * Increases the refcount for a range of clusters in a given refcount table.
+ * This is used to construct a temporary refcount table out of L1 and L2 tables
+ * which can be compared to the refcount table saved in the image.
+ *
+ * Modifies the number of errors in res.
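inc_refcounts below is the workhorse of the consistency check: every byte range occupied by data or metadata bumps a counter for each cluster it touches in an in-memory refcount array, and that array is later compared against the refcount table stored in the image. A toy version over a plain fixed-size array (16-bit counters, no dynamic growth, no overflow reporting):

#include <inttypes.h>
#include <stdio.h>

#define CLUSTER_BITS 16
#define CLUSTER_SIZE (1u << CLUSTER_BITS)

/* Count one reference for every cluster overlapped by [offset, offset+size) */
static void inc_refcounts_simple(uint16_t *refcounts, uint64_t nb_clusters,
                                 uint64_t offset, uint64_t size)
{
    if (size == 0) {
        return;
    }
    uint64_t first = offset >> CLUSTER_BITS;
    uint64_t last = (offset + size - 1) >> CLUSTER_BITS;

    for (uint64_t k = first; k <= last && k < nb_clusters; k++) {
        refcounts[k]++;               /* the real code also checks overflow */
    }
}

int main(void)
{
    uint16_t refcounts[16] = { 0 };

    inc_refcounts_simple(refcounts, 16, 0, CLUSTER_SIZE);          /* header  */
    inc_refcounts_simple(refcounts, 16, 3 * CLUSTER_SIZE, 100);    /* partial */
    inc_refcounts_simple(refcounts, 16, 3 * CLUSTER_SIZE, 100);    /* again   */

    printf("cluster 0: %u, cluster 3: %u\n", refcounts[0], refcounts[3]);
    return 0;
}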
+ */ +static int inc_refcounts(BlockDriverState *bs, +                         BdrvCheckResult *res, +                         void **refcount_table, +                         int64_t *refcount_table_size, +                         int64_t offset, int64_t size) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t start, last, cluster_offset, k, refcount; +    int ret; + +    if (size <= 0) { +        return 0; +    } + +    start = start_of_cluster(s, offset); +    last = start_of_cluster(s, offset + size - 1); +    for(cluster_offset = start; cluster_offset <= last; +        cluster_offset += s->cluster_size) { +        k = cluster_offset >> s->cluster_bits; +        if (k >= *refcount_table_size) { +            ret = realloc_refcount_array(s, refcount_table, +                                         refcount_table_size, k + 1); +            if (ret < 0) { +                res->check_errors++; +                return ret; +            } +        } + +        refcount = s->get_refcount(*refcount_table, k); +        if (refcount == s->refcount_max) { +            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 +                    "\n", cluster_offset); +            res->corruptions++; +            continue; +        } +        s->set_refcount(*refcount_table, k, refcount + 1); +    } + +    return 0; +} + +/* Flags for check_refcounts_l1() and check_refcounts_l2() */ +enum { +    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */ +}; + +/* + * Increases the refcount in the given refcount table for the all clusters + * referenced in the L2 table. While doing so, performs some checks on L2 + * entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, +                              void **refcount_table, +                              int64_t *refcount_table_size, int64_t l2_offset, +                              int flags) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l2_table, l2_entry; +    uint64_t next_contiguous_offset = 0; +    int i, l2_size, nb_csectors, ret; + +    /* Read L2 table from disk */ +    l2_size = s->l2_size * sizeof(uint64_t); +    l2_table = g_malloc(l2_size); + +    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); +    if (ret < 0) { +        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); +        res->check_errors++; +        goto fail; +    } + +    /* Do the actual checks */ +    for(i = 0; i < s->l2_size; i++) { +        l2_entry = be64_to_cpu(l2_table[i]); + +        switch (qcow2_get_cluster_type(l2_entry)) { +        case QCOW2_CLUSTER_COMPRESSED: +            /* Compressed clusters don't have QCOW_OFLAG_COPIED */ +            if (l2_entry & QCOW_OFLAG_COPIED) { +                fprintf(stderr, "ERROR: cluster %" PRId64 ": " +                    "copied flag must never be set for compressed " +                    "clusters\n", l2_entry >> s->cluster_bits); +                l2_entry &= ~QCOW_OFLAG_COPIED; +                res->corruptions++; +            } + +            /* Mark cluster as used */ +            nb_csectors = ((l2_entry >> s->csize_shift) & +                           s->csize_mask) + 1; +            l2_entry &= s->cluster_offset_mask; +            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, +                                l2_entry & ~511, nb_csectors * 512); +            if (ret < 0) { +                goto fail; +            } + +        
    if (flags & CHECK_FRAG_INFO) { +                res->bfi.allocated_clusters++; +                res->bfi.compressed_clusters++; + +                /* Compressed clusters are fragmented by nature.  Since they +                 * take up sub-sector space but we only have sector granularity +                 * I/O we need to re-read the same sectors even for adjacent +                 * compressed clusters. +                 */ +                res->bfi.fragmented_clusters++; +            } +            break; + +        case QCOW2_CLUSTER_ZERO: +            if ((l2_entry & L2E_OFFSET_MASK) == 0) { +                break; +            } +            /* fall through */ + +        case QCOW2_CLUSTER_NORMAL: +        { +            uint64_t offset = l2_entry & L2E_OFFSET_MASK; + +            if (flags & CHECK_FRAG_INFO) { +                res->bfi.allocated_clusters++; +                if (next_contiguous_offset && +                    offset != next_contiguous_offset) { +                    res->bfi.fragmented_clusters++; +                } +                next_contiguous_offset = offset + s->cluster_size; +            } + +            /* Mark cluster as used */ +            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, +                                offset, s->cluster_size); +            if (ret < 0) { +                goto fail; +            } + +            /* Correct offsets are cluster aligned */ +            if (offset_into_cluster(s, offset)) { +                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " +                    "properly aligned; L2 entry corrupted.\n", offset); +                res->corruptions++; +            } +            break; +        } + +        case QCOW2_CLUSTER_UNALLOCATED: +            break; + +        default: +            abort(); +        } +    } + +    g_free(l2_table); +    return 0; + +fail: +    g_free(l2_table); +    return ret; +} + +/* + * Increases the refcount for the L1 table, its L2 tables and all referenced + * clusters in the given refcount table. While doing so, performs some checks + * on L1 and L2 entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. 
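+ *
+ * The clusters of the L1 table itself are accounted for first; each non-zero
+ * L1 entry then has its L2 table cluster refcounted and alignment-checked
+ * before its entries are processed by check_refcounts_l2().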
+ */ +static int check_refcounts_l1(BlockDriverState *bs, +                              BdrvCheckResult *res, +                              void **refcount_table, +                              int64_t *refcount_table_size, +                              int64_t l1_table_offset, int l1_size, +                              int flags) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l1_table = NULL, l2_offset, l1_size2; +    int i, ret; + +    l1_size2 = l1_size * sizeof(uint64_t); + +    /* Mark L1 table as used */ +    ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, +                        l1_table_offset, l1_size2); +    if (ret < 0) { +        goto fail; +    } + +    /* Read L1 table entries from disk */ +    if (l1_size2 > 0) { +        l1_table = g_try_malloc(l1_size2); +        if (l1_table == NULL) { +            ret = -ENOMEM; +            res->check_errors++; +            goto fail; +        } +        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); +        if (ret < 0) { +            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); +            res->check_errors++; +            goto fail; +        } +        for(i = 0;i < l1_size; i++) +            be64_to_cpus(&l1_table[i]); +    } + +    /* Do the actual checks */ +    for(i = 0; i < l1_size; i++) { +        l2_offset = l1_table[i]; +        if (l2_offset) { +            /* Mark L2 table as used */ +            l2_offset &= L1E_OFFSET_MASK; +            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, +                                l2_offset, s->cluster_size); +            if (ret < 0) { +                goto fail; +            } + +            /* L2 tables are cluster aligned */ +            if (offset_into_cluster(s, l2_offset)) { +                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " +                    "cluster aligned; L1 entry corrupted\n", l2_offset); +                res->corruptions++; +            } + +            /* Process and check L2 entries */ +            ret = check_refcounts_l2(bs, res, refcount_table, +                                     refcount_table_size, l2_offset, flags); +            if (ret < 0) { +                goto fail; +            } +        } +    } +    g_free(l1_table); +    return 0; + +fail: +    g_free(l1_table); +    return ret; +} + +/* + * Checks the OFLAG_COPIED flag for all L1 and L2 entries. + * + * This function does not print an error message nor does it increment + * check_errors if qcow2_get_refcount fails (this is because such an error will + * have been already detected and sufficiently signaled by the calling function + * (qcow2_check_refcounts) by the time this function is called). 
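+ *
+ * QCOW_OFLAG_COPIED must be set if and only if the refcount of the referenced
+ * cluster is exactly 1; mismatches are reported and, when BDRV_FIX_ERRORS is
+ * set, repaired by rewriting the affected L1 entry or L2 table.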
+ */ +static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, +                              BdrvCheckMode fix) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); +    int ret; +    uint64_t refcount; +    int i, j; + +    for (i = 0; i < s->l1_size; i++) { +        uint64_t l1_entry = s->l1_table[i]; +        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK; +        bool l2_dirty = false; + +        if (!l2_offset) { +            continue; +        } + +        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, +                                 &refcount); +        if (ret < 0) { +            /* don't print message nor increment check_errors */ +            continue; +        } +        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { +            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " +                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n", +                    fix & BDRV_FIX_ERRORS ? "Repairing" : +                                            "ERROR", +                    i, l1_entry, refcount); +            if (fix & BDRV_FIX_ERRORS) { +                s->l1_table[i] = refcount == 1 +                               ? l1_entry |  QCOW_OFLAG_COPIED +                               : l1_entry & ~QCOW_OFLAG_COPIED; +                ret = qcow2_write_l1_entry(bs, i); +                if (ret < 0) { +                    res->check_errors++; +                    goto fail; +                } +                res->corruptions_fixed++; +            } else { +                res->corruptions++; +            } +        } + +        ret = bdrv_pread(bs->file, l2_offset, l2_table, +                         s->l2_size * sizeof(uint64_t)); +        if (ret < 0) { +            fprintf(stderr, "ERROR: Could not read L2 table: %s\n", +                    strerror(-ret)); +            res->check_errors++; +            goto fail; +        } + +        for (j = 0; j < s->l2_size; j++) { +            uint64_t l2_entry = be64_to_cpu(l2_table[j]); +            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK; +            int cluster_type = qcow2_get_cluster_type(l2_entry); + +            if ((cluster_type == QCOW2_CLUSTER_NORMAL) || +                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { +                ret = qcow2_get_refcount(bs, +                                         data_offset >> s->cluster_bits, +                                         &refcount); +                if (ret < 0) { +                    /* don't print message nor increment check_errors */ +                    continue; +                } +                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { +                    fprintf(stderr, "%s OFLAG_COPIED data cluster: " +                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n", +                            fix & BDRV_FIX_ERRORS ? "Repairing" : +                                                    "ERROR", +                            l2_entry, refcount); +                    if (fix & BDRV_FIX_ERRORS) { +                        l2_table[j] = cpu_to_be64(refcount == 1 +                                    ? 
l2_entry |  QCOW_OFLAG_COPIED +                                    : l2_entry & ~QCOW_OFLAG_COPIED); +                        l2_dirty = true; +                        res->corruptions_fixed++; +                    } else { +                        res->corruptions++; +                    } +                } +            } +        } + +        if (l2_dirty) { +            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, +                                                l2_offset, s->cluster_size); +            if (ret < 0) { +                fprintf(stderr, "ERROR: Could not write L2 table; metadata " +                        "overlap check failed: %s\n", strerror(-ret)); +                res->check_errors++; +                goto fail; +            } + +            ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size); +            if (ret < 0) { +                fprintf(stderr, "ERROR: Could not write L2 table: %s\n", +                        strerror(-ret)); +                res->check_errors++; +                goto fail; +            } +        } +    } + +    ret = 0; + +fail: +    qemu_vfree(l2_table); +    return ret; +} + +/* + * Checks consistency of refblocks and accounts for each refblock in + * *refcount_table. + */ +static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, +                           BdrvCheckMode fix, bool *rebuild, +                           void **refcount_table, int64_t *nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t i, size; +    int ret; + +    for(i = 0; i < s->refcount_table_size; i++) { +        uint64_t offset, cluster; +        offset = s->refcount_table[i]; +        cluster = offset >> s->cluster_bits; + +        /* Refcount blocks are cluster aligned */ +        if (offset_into_cluster(s, offset)) { +            fprintf(stderr, "ERROR refcount block %" PRId64 " is not " +                "cluster aligned; refcount table entry corrupted\n", i); +            res->corruptions++; +            *rebuild = true; +            continue; +        } + +        if (cluster >= *nb_clusters) { +            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n", +                    fix & BDRV_FIX_ERRORS ? 
"Repairing" : "ERROR", i); + +            if (fix & BDRV_FIX_ERRORS) { +                int64_t new_nb_clusters; + +                if (offset > INT64_MAX - s->cluster_size) { +                    ret = -EINVAL; +                    goto resize_fail; +                } + +                ret = bdrv_truncate(bs->file, offset + s->cluster_size); +                if (ret < 0) { +                    goto resize_fail; +                } +                size = bdrv_getlength(bs->file); +                if (size < 0) { +                    ret = size; +                    goto resize_fail; +                } + +                new_nb_clusters = size_to_clusters(s, size); +                assert(new_nb_clusters >= *nb_clusters); + +                ret = realloc_refcount_array(s, refcount_table, +                                             nb_clusters, new_nb_clusters); +                if (ret < 0) { +                    res->check_errors++; +                    return ret; +                } + +                if (cluster >= *nb_clusters) { +                    ret = -EINVAL; +                    goto resize_fail; +                } + +                res->corruptions_fixed++; +                ret = inc_refcounts(bs, res, refcount_table, nb_clusters, +                                    offset, s->cluster_size); +                if (ret < 0) { +                    return ret; +                } +                /* No need to check whether the refcount is now greater than 1: +                 * This area was just allocated and zeroed, so it can only be +                 * exactly 1 after inc_refcounts() */ +                continue; + +resize_fail: +                res->corruptions++; +                *rebuild = true; +                fprintf(stderr, "ERROR could not resize image: %s\n", +                        strerror(-ret)); +            } else { +                res->corruptions++; +            } +            continue; +        } + +        if (offset != 0) { +            ret = inc_refcounts(bs, res, refcount_table, nb_clusters, +                                offset, s->cluster_size); +            if (ret < 0) { +                return ret; +            } +            if (s->get_refcount(*refcount_table, cluster) != 1) { +                fprintf(stderr, "ERROR refcount block %" PRId64 +                        " refcount=%" PRIu64 "\n", i, +                        s->get_refcount(*refcount_table, cluster)); +                res->corruptions++; +                *rebuild = true; +            } +        } +    } + +    return 0; +} + +/* + * Calculates an in-memory refcount table. 
+ */ +static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, +                               BdrvCheckMode fix, bool *rebuild, +                               void **refcount_table, int64_t *nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t i; +    QCowSnapshot *sn; +    int ret; + +    if (!*refcount_table) { +        int64_t old_size = 0; +        ret = realloc_refcount_array(s, refcount_table, +                                     &old_size, *nb_clusters); +        if (ret < 0) { +            res->check_errors++; +            return ret; +        } +    } + +    /* header */ +    ret = inc_refcounts(bs, res, refcount_table, nb_clusters, +                        0, s->cluster_size); +    if (ret < 0) { +        return ret; +    } + +    /* current L1 table */ +    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, +                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO); +    if (ret < 0) { +        return ret; +    } + +    /* snapshots */ +    for (i = 0; i < s->nb_snapshots; i++) { +        sn = s->snapshots + i; +        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, +                                 sn->l1_table_offset, sn->l1_size, 0); +        if (ret < 0) { +            return ret; +        } +    } +    ret = inc_refcounts(bs, res, refcount_table, nb_clusters, +                        s->snapshots_offset, s->snapshots_size); +    if (ret < 0) { +        return ret; +    } + +    /* refcount data */ +    ret = inc_refcounts(bs, res, refcount_table, nb_clusters, +                        s->refcount_table_offset, +                        s->refcount_table_size * sizeof(uint64_t)); +    if (ret < 0) { +        return ret; +    } + +    return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters); +} + +/* + * Compares the actual reference count for each cluster in the image against the + * refcount as reported by the refcount structures on-disk. + */ +static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, +                              BdrvCheckMode fix, bool *rebuild, +                              int64_t *highest_cluster, +                              void *refcount_table, int64_t nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t i; +    uint64_t refcount1, refcount2; +    int ret; + +    for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { +        ret = qcow2_get_refcount(bs, i, &refcount1); +        if (ret < 0) { +            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", +                    i, strerror(-ret)); +            res->check_errors++; +            continue; +        } + +        refcount2 = s->get_refcount(refcount_table, i); + +        if (refcount1 > 0 || refcount2 > 0) { +            *highest_cluster = i; +        } + +        if (refcount1 != refcount2) { +            /* Check if we're allowed to fix the mismatch */ +            int *num_fixed = NULL; +            if (refcount1 == 0) { +                *rebuild = true; +            } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { +                num_fixed = &res->leaks_fixed; +            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { +                num_fixed = &res->corruptions_fixed; +            } + +            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64 +                    " reference=%" PRIu64 "\n", +                   num_fixed != NULL     ? "Repairing" : +                   refcount1 < refcount2 ? 
"ERROR" : +                                           "Leaked", +                   i, refcount1, refcount2); + +            if (num_fixed) { +                ret = update_refcount(bs, i << s->cluster_bits, 1, +                                      refcount_diff(refcount1, refcount2), +                                      refcount1 > refcount2, +                                      QCOW2_DISCARD_ALWAYS); +                if (ret >= 0) { +                    (*num_fixed)++; +                    continue; +                } +            } + +            /* And if we couldn't, print an error */ +            if (refcount1 < refcount2) { +                res->corruptions++; +            } else { +                res->leaks++; +            } +        } +    } +} + +/* + * Allocates clusters using an in-memory refcount table (IMRT) in contrast to + * the on-disk refcount structures. + * + * On input, *first_free_cluster tells where to start looking, and need not + * actually be a free cluster; the returned offset will not be before that + * cluster.  On output, *first_free_cluster points to the first gap found, even + * if that gap was too small to be used as the returned offset. + * + * Note that *first_free_cluster is a cluster index whereas the return value is + * an offset. + */ +static int64_t alloc_clusters_imrt(BlockDriverState *bs, +                                   int cluster_count, +                                   void **refcount_table, +                                   int64_t *imrt_nb_clusters, +                                   int64_t *first_free_cluster) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t cluster = *first_free_cluster, i; +    bool first_gap = true; +    int contiguous_free_clusters; +    int ret; + +    /* Starting at *first_free_cluster, find a range of at least cluster_count +     * continuously free clusters */ +    for (contiguous_free_clusters = 0; +         cluster < *imrt_nb_clusters && +         contiguous_free_clusters < cluster_count; +         cluster++) +    { +        if (!s->get_refcount(*refcount_table, cluster)) { +            contiguous_free_clusters++; +            if (first_gap) { +                /* If this is the first free cluster found, update +                 * *first_free_cluster accordingly */ +                *first_free_cluster = cluster; +                first_gap = false; +            } +        } else if (contiguous_free_clusters) { +            contiguous_free_clusters = 0; +        } +    } + +    /* If contiguous_free_clusters is greater than zero, it contains the number +     * of continuously free clusters until the current cluster; the first free +     * cluster in the current "gap" is therefore +     * cluster - contiguous_free_clusters */ + +    /* If no such range could be found, grow the in-memory refcount table +     * accordingly to append free clusters at the end of the image */ +    if (contiguous_free_clusters < cluster_count) { +        /* contiguous_free_clusters clusters are already empty at the image end; +         * we need cluster_count clusters; therefore, we have to allocate +         * cluster_count - contiguous_free_clusters new clusters at the end of +         * the image (which is the current value of cluster; note that cluster +         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond +         * the image end) */ +        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters, +                                     cluster + cluster_count +                        
             - contiguous_free_clusters); +        if (ret < 0) { +            return ret; +        } +    } + +    /* Go back to the first free cluster */ +    cluster -= contiguous_free_clusters; +    for (i = 0; i < cluster_count; i++) { +        s->set_refcount(*refcount_table, cluster + i, 1); +    } + +    return cluster << s->cluster_bits; +} + +/* + * Creates a new refcount structure based solely on the in-memory information + * given through *refcount_table. All necessary allocations will be reflected + * in that array. + * + * On success, the old refcount structure is leaked (it will be covered by the + * new refcount structure). + */ +static int rebuild_refcount_structure(BlockDriverState *bs, +                                      BdrvCheckResult *res, +                                      void **refcount_table, +                                      int64_t *nb_clusters) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; +    int64_t refblock_offset, refblock_start, refblock_index; +    uint32_t reftable_size = 0; +    uint64_t *on_disk_reftable = NULL; +    void *on_disk_refblock; +    int ret = 0; +    struct { +        uint64_t reftable_offset; +        uint32_t reftable_clusters; +    } QEMU_PACKED reftable_offset_and_clusters; + +    qcow2_cache_empty(bs, s->refcount_block_cache); + +write_refblocks: +    for (; cluster < *nb_clusters; cluster++) { +        if (!s->get_refcount(*refcount_table, cluster)) { +            continue; +        } + +        refblock_index = cluster >> s->refcount_block_bits; +        refblock_start = refblock_index << s->refcount_block_bits; + +        /* Don't allocate a cluster in a refblock already written to disk */ +        if (first_free_cluster < refblock_start) { +            first_free_cluster = refblock_start; +        } +        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, +                                              nb_clusters, &first_free_cluster); +        if (refblock_offset < 0) { +            fprintf(stderr, "ERROR allocating refblock: %s\n", +                    strerror(-refblock_offset)); +            res->check_errors++; +            ret = refblock_offset; +            goto fail; +        } + +        if (reftable_size <= refblock_index) { +            uint32_t old_reftable_size = reftable_size; +            uint64_t *new_on_disk_reftable; + +            reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t), +                                     s->cluster_size) / sizeof(uint64_t); +            new_on_disk_reftable = g_try_realloc(on_disk_reftable, +                                                 reftable_size * +                                                 sizeof(uint64_t)); +            if (!new_on_disk_reftable) { +                res->check_errors++; +                ret = -ENOMEM; +                goto fail; +            } +            on_disk_reftable = new_on_disk_reftable; + +            memset(on_disk_reftable + old_reftable_size, 0, +                   (reftable_size - old_reftable_size) * sizeof(uint64_t)); + +            /* The offset we have for the reftable is now no longer valid; +             * this will leak that range, but we can easily fix that by running +             * a leak-fixing check after this rebuild operation */ +            reftable_offset = -1; +        } +        on_disk_reftable[refblock_index] = refblock_offset; + +        /* If this is apparently the last refblock (for now), try to squeeze the +  
       * reftable in */ +        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits && +            reftable_offset < 0) +        { +            uint64_t reftable_clusters = size_to_clusters(s, reftable_size * +                                                          sizeof(uint64_t)); +            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, +                                                  refcount_table, nb_clusters, +                                                  &first_free_cluster); +            if (reftable_offset < 0) { +                fprintf(stderr, "ERROR allocating reftable: %s\n", +                        strerror(-reftable_offset)); +                res->check_errors++; +                ret = reftable_offset; +                goto fail; +            } +        } + +        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, +                                            s->cluster_size); +        if (ret < 0) { +            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); +            goto fail; +        } + +        /* The size of *refcount_table is always cluster-aligned, therefore the +         * write operation will not overflow */ +        on_disk_refblock = (void *)((char *) *refcount_table + +                                    refblock_index * s->cluster_size); + +        ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE, +                         on_disk_refblock, s->cluster_sectors); +        if (ret < 0) { +            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); +            goto fail; +        } + +        /* Go to the end of this refblock */ +        cluster = refblock_start + s->refcount_block_size - 1; +    } + +    if (reftable_offset < 0) { +        uint64_t post_refblock_start, reftable_clusters; + +        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size); +        reftable_clusters = size_to_clusters(s, +                                             reftable_size * sizeof(uint64_t)); +        /* Not pretty but simple */ +        if (first_free_cluster < post_refblock_start) { +            first_free_cluster = post_refblock_start; +        } +        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, +                                              refcount_table, nb_clusters, +                                              &first_free_cluster); +        if (reftable_offset < 0) { +            fprintf(stderr, "ERROR allocating reftable: %s\n", +                    strerror(-reftable_offset)); +            res->check_errors++; +            ret = reftable_offset; +            goto fail; +        } + +        goto write_refblocks; +    } + +    assert(on_disk_reftable); + +    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { +        cpu_to_be64s(&on_disk_reftable[refblock_index]); +    } + +    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, +                                        reftable_size * sizeof(uint64_t)); +    if (ret < 0) { +        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); +        goto fail; +    } + +    assert(reftable_size < INT_MAX / sizeof(uint64_t)); +    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, +                      reftable_size * sizeof(uint64_t)); +    if (ret < 0) { +        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); +        goto fail; +    } + +    /* Enter new reftable into the image header */ +    
cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset, +                 reftable_offset); +    cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters, +                 size_to_clusters(s, reftable_size * sizeof(uint64_t))); +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, +                                              refcount_table_offset), +                           &reftable_offset_and_clusters, +                           sizeof(reftable_offset_and_clusters)); +    if (ret < 0) { +        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret)); +        goto fail; +    } + +    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { +        be64_to_cpus(&on_disk_reftable[refblock_index]); +    } +    s->refcount_table = on_disk_reftable; +    s->refcount_table_offset = reftable_offset; +    s->refcount_table_size = reftable_size; + +    return 0; + +fail: +    g_free(on_disk_reftable); +    return ret; +} + +/* + * Checks an image for refcount consistency. + * + * Returns 0 if no errors are found, the number of errors in case the image is + * detected as corrupted, and -errno when an internal error occurred. + */ +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, +                          BdrvCheckMode fix) +{ +    BDRVQcowState *s = bs->opaque; +    BdrvCheckResult pre_compare_res; +    int64_t size, highest_cluster, nb_clusters; +    void *refcount_table = NULL; +    bool rebuild = false; +    int ret; + +    size = bdrv_getlength(bs->file); +    if (size < 0) { +        res->check_errors++; +        return size; +    } + +    nb_clusters = size_to_clusters(s, size); +    if (nb_clusters > INT_MAX) { +        res->check_errors++; +        return -EFBIG; +    } + +    res->bfi.total_clusters = +        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); + +    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table, +                              &nb_clusters); +    if (ret < 0) { +        goto fail; +    } + +    /* In case we don't need to rebuild the refcount structure (but want to fix +     * something), this function is immediately called again, in which case the +     * result should be ignored */ +    pre_compare_res = *res; +    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table, +                      nb_clusters); + +    if (rebuild && (fix & BDRV_FIX_ERRORS)) { +        BdrvCheckResult old_res = *res; +        int fresh_leaks = 0; + +        fprintf(stderr, "Rebuilding refcount structure\n"); +        ret = rebuild_refcount_structure(bs, res, &refcount_table, +                                         &nb_clusters); +        if (ret < 0) { +            goto fail; +        } + +        res->corruptions = 0; +        res->leaks = 0; + +        /* Because the old reftable has been exchanged for a new one the +         * references have to be recalculated */ +        rebuild = false; +        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters)); +        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table, +                                  &nb_clusters); +        if (ret < 0) { +            goto fail; +        } + +        if (fix & BDRV_FIX_LEAKS) { +            /* The old refcount structures are now leaked, fix it; the result +             * can be ignored, aside from leaks which were introduced by +             * rebuild_refcount_structure() that could not be fixed */ +            BdrvCheckResult saved_res = *res; +            *res = 
(BdrvCheckResult){ 0 }; + +            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild, +                              &highest_cluster, refcount_table, nb_clusters); +            if (rebuild) { +                fprintf(stderr, "ERROR rebuilt refcount structure is still " +                        "broken\n"); +            } + +            /* Any leaks accounted for here were introduced by +             * rebuild_refcount_structure() because that function has created a +             * new refcount structure from scratch */ +            fresh_leaks = res->leaks; +            *res = saved_res; +        } + +        if (res->corruptions < old_res.corruptions) { +            res->corruptions_fixed += old_res.corruptions - res->corruptions; +        } +        if (res->leaks < old_res.leaks) { +            res->leaks_fixed += old_res.leaks - res->leaks; +        } +        res->leaks += fresh_leaks; +    } else if (fix) { +        if (rebuild) { +            fprintf(stderr, "ERROR need to rebuild refcount structures\n"); +            res->check_errors++; +            ret = -EIO; +            goto fail; +        } + +        if (res->leaks || res->corruptions) { +            *res = pre_compare_res; +            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster, +                              refcount_table, nb_clusters); +        } +    } + +    /* check OFLAG_COPIED */ +    ret = check_oflag_copied(bs, res, fix); +    if (ret < 0) { +        goto fail; +    } + +    res->image_end_offset = (highest_cluster + 1) * s->cluster_size; +    ret = 0; + +fail: +    g_free(refcount_table); + +    return ret; +} + +#define overlaps_with(ofs, sz) \ +    ranges_overlap(offset, size, ofs, sz) + +/* + * Checks if the given offset into the image file is actually free to use by + * looking for overlaps with important metadata sections (L1/L2 tables etc.), + * i.e. a sanity check without relying on the refcount tables. + * + * The ign parameter specifies what checks not to perform (being a bitmask of + * QCow2MetadataOverlap values), i.e., what sections to ignore. + * + * Returns: + * - 0 if writing to this offset will not affect the mentioned metadata + * - a positive QCow2MetadataOverlap value indicating one overlapping section + * - a negative value (-errno) indicating an error while performing a check, + *   e.g. 
when bdrv_read failed on QCOW2_OL_INACTIVE_L2 + */ +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, +                                 int64_t size) +{ +    BDRVQcowState *s = bs->opaque; +    int chk = s->overlap_check & ~ign; +    int i, j; + +    if (!size) { +        return 0; +    } + +    if (chk & QCOW2_OL_MAIN_HEADER) { +        if (offset < s->cluster_size) { +            return QCOW2_OL_MAIN_HEADER; +        } +    } + +    /* align range to test to cluster boundaries */ +    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); +    offset = start_of_cluster(s, offset); + +    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { +        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { +            return QCOW2_OL_ACTIVE_L1; +        } +    } + +    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { +        if (overlaps_with(s->refcount_table_offset, +            s->refcount_table_size * sizeof(uint64_t))) { +            return QCOW2_OL_REFCOUNT_TABLE; +        } +    } + +    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { +        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { +            return QCOW2_OL_SNAPSHOT_TABLE; +        } +    } + +    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { +        for (i = 0; i < s->nb_snapshots; i++) { +            if (s->snapshots[i].l1_size && +                overlaps_with(s->snapshots[i].l1_table_offset, +                s->snapshots[i].l1_size * sizeof(uint64_t))) { +                return QCOW2_OL_INACTIVE_L1; +            } +        } +    } + +    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { +        for (i = 0; i < s->l1_size; i++) { +            if ((s->l1_table[i] & L1E_OFFSET_MASK) && +                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, +                s->cluster_size)) { +                return QCOW2_OL_ACTIVE_L2; +            } +        } +    } + +    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { +        for (i = 0; i < s->refcount_table_size; i++) { +            if ((s->refcount_table[i] & REFT_OFFSET_MASK) && +                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, +                s->cluster_size)) { +                return QCOW2_OL_REFCOUNT_BLOCK; +            } +        } +    } + +    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { +        for (i = 0; i < s->nb_snapshots; i++) { +            uint64_t l1_ofs = s->snapshots[i].l1_table_offset; +            uint32_t l1_sz  = s->snapshots[i].l1_size; +            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); +            uint64_t *l1 = g_try_malloc(l1_sz2); +            int ret; + +            if (l1_sz2 && l1 == NULL) { +                return -ENOMEM; +            } + +            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); +            if (ret < 0) { +                g_free(l1); +                return ret; +            } + +            for (j = 0; j < l1_sz; j++) { +                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; +                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { +                    g_free(l1); +                    return QCOW2_OL_INACTIVE_L2; +                } +            } + +            g_free(l1); +        } +    } + +    return 0; +} + +static const char *metadata_ol_names[] = { +    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header", +    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table", +    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table", 
+    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", +    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", +    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", +    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table", +    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table", +}; + +/* + * First performs a check for metadata overlaps (through + * qcow2_check_metadata_overlap); if that fails with a negative value (error + * while performing a check), that value is returned. If an impending overlap + * is detected, the BDS will be made unusable, the qcow2 file marked corrupt + * and -EIO returned. + * + * Returns 0 if there were neither overlaps nor errors while checking for + * overlaps; or a negative value (-errno) on error. + */ +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, +                                  int64_t size) +{ +    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); + +    if (ret < 0) { +        return ret; +    } else if (ret > 0) { +        int metadata_ol_bitnr = ctz32(ret); +        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); + +        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " +                                "write on metadata (overlaps with %s)", +                                metadata_ol_names[metadata_ol_bitnr]); +        return -EIO; +    } + +    return 0; +} diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c new file mode 100644 index 00000000..b6f58c13 --- /dev/null +++ b/block/qcow2-snapshot.c @@ -0,0 +1,734 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "qemu/error-report.h" + +void qcow2_free_snapshots(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    int i; + +    for(i = 0; i < s->nb_snapshots; i++) { +        g_free(s->snapshots[i].name); +        g_free(s->snapshots[i].id_str); +    } +    g_free(s->snapshots); +    s->snapshots = NULL; +    s->nb_snapshots = 0; +} + +int qcow2_read_snapshots(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshotHeader h; +    QCowSnapshotExtraData extra; +    QCowSnapshot *sn; +    int i, id_str_size, name_size; +    int64_t offset; +    uint32_t extra_data_size; +    int ret; + +    if (!s->nb_snapshots) { +        s->snapshots = NULL; +        s->snapshots_size = 0; +        return 0; +    } + +    offset = s->snapshots_offset; +    s->snapshots = g_new0(QCowSnapshot, s->nb_snapshots); + +    for(i = 0; i < s->nb_snapshots; i++) { +        /* Read statically sized part of the snapshot header */ +        offset = align_offset(offset, 8); +        ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); +        if (ret < 0) { +            goto fail; +        } + +        offset += sizeof(h); +        sn = s->snapshots + i; +        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); +        sn->l1_size = be32_to_cpu(h.l1_size); +        sn->vm_state_size = be32_to_cpu(h.vm_state_size); +        sn->date_sec = be32_to_cpu(h.date_sec); +        sn->date_nsec = be32_to_cpu(h.date_nsec); +        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); +        extra_data_size = be32_to_cpu(h.extra_data_size); + +        id_str_size = be16_to_cpu(h.id_str_size); +        name_size = be16_to_cpu(h.name_size); + +        /* Read extra data */ +        ret = bdrv_pread(bs->file, offset, &extra, +                         MIN(sizeof(extra), extra_data_size)); +        if (ret < 0) { +            goto fail; +        } +        offset += extra_data_size; + +        if (extra_data_size >= 8) { +            sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large); +        } + +        if (extra_data_size >= 16) { +            sn->disk_size = be64_to_cpu(extra.disk_size); +        } else { +            sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; +        } + +        /* Read snapshot ID */ +        sn->id_str = g_malloc(id_str_size + 1); +        ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); +        if (ret < 0) { +            goto fail; +        } +        offset += id_str_size; +        sn->id_str[id_str_size] = '\0'; + +        /* Read snapshot name */ +        sn->name = g_malloc(name_size + 1); +        ret = bdrv_pread(bs->file, offset, sn->name, name_size); +        if (ret < 0) { +            goto fail; +        } +        offset += name_size; +        sn->name[name_size] = '\0'; + +        if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) { +            ret = -EFBIG; +            goto fail; +        } +    } + +    assert(offset - s->snapshots_offset <= INT_MAX); +    s->snapshots_size = offset - s->snapshots_offset; +    return 0; + +fail: +    qcow2_free_snapshots(bs); +    return ret; +} + +/* add at the end of the file a new list of snapshots */ +static int qcow2_write_snapshots(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *sn; +    QCowSnapshotHeader h; +    QCowSnapshotExtraData extra; +    int i, name_size, id_str_size, snapshots_size; +    struct { +        uint32_t nb_snapshots; +        uint64_t 
snapshots_offset; +    } QEMU_PACKED header_data; +    int64_t offset, snapshots_offset = 0; +    int ret; + +    /* compute the size of the snapshots */ +    offset = 0; +    for(i = 0; i < s->nb_snapshots; i++) { +        sn = s->snapshots + i; +        offset = align_offset(offset, 8); +        offset += sizeof(h); +        offset += sizeof(extra); +        offset += strlen(sn->id_str); +        offset += strlen(sn->name); + +        if (offset > QCOW_MAX_SNAPSHOTS_SIZE) { +            ret = -EFBIG; +            goto fail; +        } +    } + +    assert(offset <= INT_MAX); +    snapshots_size = offset; + +    /* Allocate space for the new snapshot list */ +    snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); +    offset = snapshots_offset; +    if (offset < 0) { +        ret = offset; +        goto fail; +    } +    ret = bdrv_flush(bs); +    if (ret < 0) { +        goto fail; +    } + +    /* The snapshot list position has not yet been updated, so these clusters +     * must indeed be completely free */ +    ret = qcow2_pre_write_overlap_check(bs, 0, offset, snapshots_size); +    if (ret < 0) { +        goto fail; +    } + + +    /* Write all snapshots to the new list */ +    for(i = 0; i < s->nb_snapshots; i++) { +        sn = s->snapshots + i; +        memset(&h, 0, sizeof(h)); +        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); +        h.l1_size = cpu_to_be32(sn->l1_size); +        /* If it doesn't fit in 32 bit, older implementations should treat it +         * as a disk-only snapshot rather than truncate the VM state */ +        if (sn->vm_state_size <= 0xffffffff) { +            h.vm_state_size = cpu_to_be32(sn->vm_state_size); +        } +        h.date_sec = cpu_to_be32(sn->date_sec); +        h.date_nsec = cpu_to_be32(sn->date_nsec); +        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); +        h.extra_data_size = cpu_to_be32(sizeof(extra)); + +        memset(&extra, 0, sizeof(extra)); +        extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size); +        extra.disk_size = cpu_to_be64(sn->disk_size); + +        id_str_size = strlen(sn->id_str); +        name_size = strlen(sn->name); +        assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX); +        h.id_str_size = cpu_to_be16(id_str_size); +        h.name_size = cpu_to_be16(name_size); +        offset = align_offset(offset, 8); + +        ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); +        if (ret < 0) { +            goto fail; +        } +        offset += sizeof(h); + +        ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); +        if (ret < 0) { +            goto fail; +        } +        offset += sizeof(extra); + +        ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); +        if (ret < 0) { +            goto fail; +        } +        offset += id_str_size; + +        ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); +        if (ret < 0) { +            goto fail; +        } +        offset += name_size; +    } + +    /* +     * Update the header to point to the new snapshot table. This requires the +     * new table and its refcounts to be stable on disk. 
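+     * nb_snapshots and snapshots_offset are adjacent in the header (verified
+     * by the QEMU_BUILD_BUG_ON below), so both fields are updated with a
+     * single bdrv_pwrite_sync() call.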
+     */ +    ret = bdrv_flush(bs); +    if (ret < 0) { +        goto fail; +    } + +    QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != +        offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); + +    header_data.nb_snapshots        = cpu_to_be32(s->nb_snapshots); +    header_data.snapshots_offset    = cpu_to_be64(snapshots_offset); + +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), +                           &header_data, sizeof(header_data)); +    if (ret < 0) { +        goto fail; +    } + +    /* free the old snapshot table */ +    qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size, +                        QCOW2_DISCARD_SNAPSHOT); +    s->snapshots_offset = snapshots_offset; +    s->snapshots_size = snapshots_size; +    return 0; + +fail: +    if (snapshots_offset > 0) { +        qcow2_free_clusters(bs, snapshots_offset, snapshots_size, +                            QCOW2_DISCARD_ALWAYS); +    } +    return ret; +} + +static void find_new_snapshot_id(BlockDriverState *bs, +                                 char *id_str, int id_str_size) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *sn; +    int i; +    unsigned long id, id_max = 0; + +    for(i = 0; i < s->nb_snapshots; i++) { +        sn = s->snapshots + i; +        id = strtoul(sn->id_str, NULL, 10); +        if (id > id_max) +            id_max = id; +    } +    snprintf(id_str, id_str_size, "%lu", id_max + 1); +} + +static int find_snapshot_by_id_and_name(BlockDriverState *bs, +                                        const char *id, +                                        const char *name) +{ +    BDRVQcowState *s = bs->opaque; +    int i; + +    if (id && name) { +        for (i = 0; i < s->nb_snapshots; i++) { +            if (!strcmp(s->snapshots[i].id_str, id) && +                !strcmp(s->snapshots[i].name, name)) { +                return i; +            } +        } +    } else if (id) { +        for (i = 0; i < s->nb_snapshots; i++) { +            if (!strcmp(s->snapshots[i].id_str, id)) { +                return i; +            } +        } +    } else if (name) { +        for (i = 0; i < s->nb_snapshots; i++) { +            if (!strcmp(s->snapshots[i].name, name)) { +                return i; +            } +        } +    } + +    return -1; +} + +static int find_snapshot_by_id_or_name(BlockDriverState *bs, +                                       const char *id_or_name) +{ +    int ret; + +    ret = find_snapshot_by_id_and_name(bs, id_or_name, NULL); +    if (ret >= 0) { +        return ret; +    } +    return find_snapshot_by_id_and_name(bs, NULL, id_or_name); +} + +/* if no id is provided, a new one is constructed */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *new_snapshot_list = NULL; +    QCowSnapshot *old_snapshot_list = NULL; +    QCowSnapshot sn1, *sn = &sn1; +    int i, ret; +    uint64_t *l1_table = NULL; +    int64_t l1_table_offset; + +    if (s->nb_snapshots >= QCOW_MAX_SNAPSHOTS) { +        return -EFBIG; +    } + +    memset(sn, 0, sizeof(*sn)); + +    /* Generate an ID */ +    find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); + +    /* Check that the ID is unique */ +    if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { +        return -EEXIST; +    } + +    /* Populate sn with passed data */ +    sn->id_str = g_strdup(sn_info->id_str); +    sn->name = g_strdup(sn_info->name); + +    
sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; +    sn->vm_state_size = sn_info->vm_state_size; +    sn->date_sec = sn_info->date_sec; +    sn->date_nsec = sn_info->date_nsec; +    sn->vm_clock_nsec = sn_info->vm_clock_nsec; + +    /* Allocate the L1 table of the snapshot and copy the current one there. */ +    l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); +    if (l1_table_offset < 0) { +        ret = l1_table_offset; +        goto fail; +    } + +    sn->l1_table_offset = l1_table_offset; +    sn->l1_size = s->l1_size; + +    l1_table = g_try_new(uint64_t, s->l1_size); +    if (s->l1_size && l1_table == NULL) { +        ret = -ENOMEM; +        goto fail; +    } + +    for(i = 0; i < s->l1_size; i++) { +        l1_table[i] = cpu_to_be64(s->l1_table[i]); +    } + +    ret = qcow2_pre_write_overlap_check(bs, 0, sn->l1_table_offset, +                                        s->l1_size * sizeof(uint64_t)); +    if (ret < 0) { +        goto fail; +    } + +    ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, +                      s->l1_size * sizeof(uint64_t)); +    if (ret < 0) { +        goto fail; +    } + +    g_free(l1_table); +    l1_table = NULL; + +    /* +     * Increase the refcounts of all clusters and make sure everything is +     * stable on disk before updating the snapshot table to contain a pointer +     * to the new L1 table. +     */ +    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); +    if (ret < 0) { +        goto fail; +    } + +    /* Append the new snapshot to the snapshot list */ +    new_snapshot_list = g_new(QCowSnapshot, s->nb_snapshots + 1); +    if (s->snapshots) { +        memcpy(new_snapshot_list, s->snapshots, +               s->nb_snapshots * sizeof(QCowSnapshot)); +        old_snapshot_list = s->snapshots; +    } +    s->snapshots = new_snapshot_list; +    s->snapshots[s->nb_snapshots++] = *sn; + +    ret = qcow2_write_snapshots(bs); +    if (ret < 0) { +        g_free(s->snapshots); +        s->snapshots = old_snapshot_list; +        s->nb_snapshots--; +        goto fail; +    } + +    g_free(old_snapshot_list); + +    /* The VM state isn't needed any more in the active L1 table; in fact, it +     * hurts by causing expensive COW for the next snapshot. 
*/ +    qcow2_discard_clusters(bs, qcow2_vm_state_offset(s), +                           align_offset(sn->vm_state_size, s->cluster_size) +                                >> BDRV_SECTOR_BITS, +                           QCOW2_DISCARD_NEVER, false); + +#ifdef DEBUG_ALLOC +    { +      BdrvCheckResult result = {0}; +      qcow2_check_refcounts(bs, &result, 0); +    } +#endif +    return 0; + +fail: +    g_free(sn->id_str); +    g_free(sn->name); +    g_free(l1_table); + +    return ret; +} + +/* copy the snapshot 'snapshot_name' into the current disk image */ +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *sn; +    int i, snapshot_index; +    int cur_l1_bytes, sn_l1_bytes; +    int ret; +    uint64_t *sn_l1_table = NULL; + +    /* Search the snapshot */ +    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); +    if (snapshot_index < 0) { +        return -ENOENT; +    } +    sn = &s->snapshots[snapshot_index]; + +    if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) { +        error_report("qcow2: Loading snapshots with different disk " +            "size is not implemented"); +        ret = -ENOTSUP; +        goto fail; +    } + +    /* +     * Make sure that the current L1 table is big enough to contain the whole +     * L1 table of the snapshot. If the snapshot L1 table is smaller, the +     * current one must be padded with zeros. +     */ +    ret = qcow2_grow_l1_table(bs, sn->l1_size, true); +    if (ret < 0) { +        goto fail; +    } + +    cur_l1_bytes = s->l1_size * sizeof(uint64_t); +    sn_l1_bytes = sn->l1_size * sizeof(uint64_t); + +    /* +     * Copy the snapshot L1 table to the current L1 table. +     * +     * Before overwriting the old current L1 table on disk, make sure to +     * increase all refcounts for the clusters referenced by the new one. +     * Decrease the refcount referenced by the old one only when the L1 +     * table is overwritten. +     */ +    sn_l1_table = g_try_malloc0(cur_l1_bytes); +    if (cur_l1_bytes && sn_l1_table == NULL) { +        ret = -ENOMEM; +        goto fail; +    } + +    ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); +    if (ret < 0) { +        goto fail; +    } + +    ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, +                                         sn->l1_size, 1); +    if (ret < 0) { +        goto fail; +    } + +    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, +                                        s->l1_table_offset, cur_l1_bytes); +    if (ret < 0) { +        goto fail; +    } + +    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, +                           cur_l1_bytes); +    if (ret < 0) { +        goto fail; +    } + +    /* +     * Decrease refcount of clusters of current L1 table. +     * +     * At this point, the in-memory s->l1_table points to the old L1 table, +     * whereas on disk we already have the new one. +     * +     * qcow2_update_snapshot_refcount special cases the current L1 table to use +     * the in-memory data instead of really using the offset to load a new one, +     * which is why this works. +     */ +    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, +                                         s->l1_size, -1); + +    /* +     * Now update the in-memory L1 table to be in sync with the on-disk one. We +     * need to do this even if updating refcounts failed. 
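+     * (Even on failure the on-disk L1 table already points at the snapshot's
+     * clusters, so the in-memory copy has to follow it regardless.)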
+     */ +    for(i = 0;i < s->l1_size; i++) { +        s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); +    } + +    if (ret < 0) { +        goto fail; +    } + +    g_free(sn_l1_table); +    sn_l1_table = NULL; + +    /* +     * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed +     * when we decreased the refcount of the old snapshot. +     */ +    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); +    if (ret < 0) { +        goto fail; +    } + +#ifdef DEBUG_ALLOC +    { +        BdrvCheckResult result = {0}; +        qcow2_check_refcounts(bs, &result, 0); +    } +#endif +    return 0; + +fail: +    g_free(sn_l1_table); +    return ret; +} + +int qcow2_snapshot_delete(BlockDriverState *bs, +                          const char *snapshot_id, +                          const char *name, +                          Error **errp) +{ +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot sn; +    int snapshot_index, ret; + +    /* Search the snapshot */ +    snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); +    if (snapshot_index < 0) { +        error_setg(errp, "Can't find the snapshot"); +        return -ENOENT; +    } +    sn = s->snapshots[snapshot_index]; + +    /* Remove it from the snapshot list */ +    memmove(s->snapshots + snapshot_index, +            s->snapshots + snapshot_index + 1, +            (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); +    s->nb_snapshots--; +    ret = qcow2_write_snapshots(bs); +    if (ret < 0) { +        error_setg_errno(errp, -ret, +                         "Failed to remove snapshot from snapshot list"); +        return ret; +    } + +    /* +     * The snapshot is now unused, clean up. If we fail after this point, we +     * won't recover but just leak clusters. +     */ +    g_free(sn.id_str); +    g_free(sn.name); + +    /* +     * Now decrease the refcounts of clusters referenced by the snapshot and +     * free the L1 table. 
+     */ +    ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, +                                         sn.l1_size, -1); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Failed to free the cluster and L1 table"); +        return ret; +    } +    qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), +                        QCOW2_DISCARD_SNAPSHOT); + +    /* must update the copied flag on the current cluster offsets */ +    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); +    if (ret < 0) { +        error_setg_errno(errp, -ret, +                         "Failed to update snapshot status in disk"); +        return ret; +    } + +#ifdef DEBUG_ALLOC +    { +        BdrvCheckResult result = {0}; +        qcow2_check_refcounts(bs, &result, 0); +    } +#endif +    return 0; +} + +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ +    BDRVQcowState *s = bs->opaque; +    QEMUSnapshotInfo *sn_tab, *sn_info; +    QCowSnapshot *sn; +    int i; + +    if (!s->nb_snapshots) { +        *psn_tab = NULL; +        return s->nb_snapshots; +    } + +    sn_tab = g_new0(QEMUSnapshotInfo, s->nb_snapshots); +    for(i = 0; i < s->nb_snapshots; i++) { +        sn_info = sn_tab + i; +        sn = s->snapshots + i; +        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), +                sn->id_str); +        pstrcpy(sn_info->name, sizeof(sn_info->name), +                sn->name); +        sn_info->vm_state_size = sn->vm_state_size; +        sn_info->date_sec = sn->date_sec; +        sn_info->date_nsec = sn->date_nsec; +        sn_info->vm_clock_nsec = sn->vm_clock_nsec; +    } +    *psn_tab = sn_tab; +    return s->nb_snapshots; +} + +int qcow2_snapshot_load_tmp(BlockDriverState *bs, +                            const char *snapshot_id, +                            const char *name, +                            Error **errp) +{ +    int i, snapshot_index; +    BDRVQcowState *s = bs->opaque; +    QCowSnapshot *sn; +    uint64_t *new_l1_table; +    int new_l1_bytes; +    int ret; + +    assert(bs->read_only); + +    /* Search the snapshot */ +    snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); +    if (snapshot_index < 0) { +        error_setg(errp, +                   "Can't find snapshot"); +        return -ENOENT; +    } +    sn = &s->snapshots[snapshot_index]; + +    /* Allocate and read in the snapshot's L1 table */ +    if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { +        error_setg(errp, "Snapshot L1 table too large"); +        return -EFBIG; +    } +    new_l1_bytes = sn->l1_size * sizeof(uint64_t); +    new_l1_table = qemu_try_blockalign(bs->file, +                                       align_offset(new_l1_bytes, 512)); +    if (new_l1_table == NULL) { +        return -ENOMEM; +    } + +    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); +    if (ret < 0) { +        error_setg(errp, "Failed to read l1 table for snapshot"); +        qemu_vfree(new_l1_table); +        return ret; +    } + +    /* Switch the L1 table */ +    qemu_vfree(s->l1_table); + +    s->l1_size = sn->l1_size; +    s->l1_table_offset = sn->l1_table_offset; +    s->l1_table = new_l1_table; + +    for(i = 0;i < s->l1_size; i++) { +        be64_to_cpus(&s->l1_table[i]); +    } + +    return 0; +} diff --git a/block/qcow2.c b/block/qcow2.c new file mode 100644 index 00000000..76c331b3 --- /dev/null +++ b/block/qcow2.c @@ -0,0 +1,2987 @@ +/* + * Block driver for the QCOW version 
2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> +#include "block/qcow2.h" +#include "qemu/error-report.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qbool.h" +#include "qapi/util.h" +#include "qapi/qmp/types.h" +#include "qapi-event.h" +#include "trace.h" +#include "qemu/option_int.h" + +/* +  Differences with QCOW: + +  - Support for multiple incremental snapshots. +  - Memory management by reference counts. +  - Clusters which have a reference count of one have the bit +    QCOW_OFLAG_COPIED to optimize write performance. +  - Size of compressed clusters is stored in sectors to reduce bit usage +    in the cluster offsets. +  - Support for storing additional data (such as the VM state) in the +    snapshots. +  - If a backing store is used, the cluster size is not constrained +    (could be backported to QCOW). +  - L2 tables have always a size of one cluster. 
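+
+  (The compressed-cluster encoding mentioned above stores the compressed size,
+  counted in 512-byte sectors, in the upper bits of the cluster descriptor;
+  the matching csize_shift, csize_mask and cluster_offset_mask values are
+  derived in qcow2_open() below.)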
+*/ + + +typedef struct { +    uint32_t magic; +    uint32_t len; +} QEMU_PACKED QCowExtension; + +#define  QCOW2_EXT_MAGIC_END 0 +#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA +#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 + +static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    const QCowHeader *cow_header = (const void *)buf; + +    if (buf_size >= sizeof(QCowHeader) && +        be32_to_cpu(cow_header->magic) == QCOW_MAGIC && +        be32_to_cpu(cow_header->version) >= 2) +        return 100; +    else +        return 0; +} + + +/*  + * read qcow2 extension and fill bs + * start reading from start_offset + * finish reading upon magic of value 0 or when end_offset reached + * unknown magic is skipped (future extension this version knows nothing about) + * return 0 upon success, non-0 otherwise + */ +static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, +                                 uint64_t end_offset, void **p_feature_table, +                                 Error **errp) +{ +    BDRVQcowState *s = bs->opaque; +    QCowExtension ext; +    uint64_t offset; +    int ret; + +#ifdef DEBUG_EXT +    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); +#endif +    offset = start_offset; +    while (offset < end_offset) { + +#ifdef DEBUG_EXT +        /* Sanity check */ +        if (offset > s->cluster_size) +            printf("qcow2_read_extension: suspicious offset %lu\n", offset); + +        printf("attempting to read extended header in offset %lu\n", offset); +#endif + +        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext)); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " +                             "pread fail from offset %" PRIu64, offset); +            return 1; +        } +        be32_to_cpus(&ext.magic); +        be32_to_cpus(&ext.len); +        offset += sizeof(ext); +#ifdef DEBUG_EXT +        printf("ext.magic = 0x%x\n", ext.magic); +#endif +        if (offset > end_offset || ext.len > end_offset - offset) { +            error_setg(errp, "Header extension too large"); +            return -EINVAL; +        } + +        switch (ext.magic) { +        case QCOW2_EXT_MAGIC_END: +            return 0; + +        case QCOW2_EXT_MAGIC_BACKING_FORMAT: +            if (ext.len >= sizeof(bs->backing_format)) { +                error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32 +                           " too large (>=%zu)", ext.len, +                           sizeof(bs->backing_format)); +                return 2; +            } +            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); +            if (ret < 0) { +                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " +                                 "Could not read format name"); +                return 3; +            } +            bs->backing_format[ext.len] = '\0'; +            s->image_backing_format = g_strdup(bs->backing_format); +#ifdef DEBUG_EXT +            printf("Qcow2: Got format extension %s\n", bs->backing_format); +#endif +            break; + +        case QCOW2_EXT_MAGIC_FEATURE_TABLE: +            if (p_feature_table != NULL) { +                void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); +                ret = bdrv_pread(bs->file, offset , feature_table, ext.len); +                if (ret < 0) { +                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " +                    
                 "Could not read table"); +                    return ret; +                } + +                *p_feature_table = feature_table; +            } +            break; + +        default: +            /* unknown magic - save it in case we need to rewrite the header */ +            { +                Qcow2UnknownHeaderExtension *uext; + +                uext = g_malloc0(sizeof(*uext)  + ext.len); +                uext->magic = ext.magic; +                uext->len = ext.len; +                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); + +                ret = bdrv_pread(bs->file, offset , uext->data, uext->len); +                if (ret < 0) { +                    error_setg_errno(errp, -ret, "ERROR: unknown extension: " +                                     "Could not read data"); +                    return ret; +                } +            } +            break; +        } + +        offset += ((ext.len + 7) & ~7); +    } + +    return 0; +} + +static void cleanup_unknown_header_ext(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    Qcow2UnknownHeaderExtension *uext, *next; + +    QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { +        QLIST_REMOVE(uext, next); +        g_free(uext); +    } +} + +static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, +    Error **errp, const char *fmt, ...) +{ +    char msg[64]; +    va_list ap; + +    va_start(ap, fmt); +    vsnprintf(msg, sizeof(msg), fmt, ap); +    va_end(ap); + +    error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, +               bdrv_get_device_or_node_name(bs), "qcow2", msg); +} + +static void report_unsupported_feature(BlockDriverState *bs, +    Error **errp, Qcow2Feature *table, uint64_t mask) +{ +    char *features = g_strdup(""); +    char *old; + +    while (table && table->name[0] != '\0') { +        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { +            if (mask & (1ULL << table->bit)) { +                old = features; +                features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "", +                                           table->name); +                g_free(old); +                mask &= ~(1ULL << table->bit); +            } +        } +        table++; +    } + +    if (mask) { +        old = features; +        features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64, +                                   old, *old ? ", " : "", mask); +        g_free(old); +    } + +    report_unsupported(bs, errp, "%s", features); +    g_free(features); +} + +/* + * Sets the dirty bit and flushes afterwards if necessary. + * + * The incompatible_features bit is only set if the image file header was + * updated successfully.  Therefore it is not required to check the return + * value of this function. 
+ */ +int qcow2_mark_dirty(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t val; +    int ret; + +    assert(s->qcow_version >= 3); + +    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { +        return 0; /* already dirty */ +    } + +    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); +    ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), +                      &val, sizeof(val)); +    if (ret < 0) { +        return ret; +    } +    ret = bdrv_flush(bs->file); +    if (ret < 0) { +        return ret; +    } + +    /* Only treat image as dirty if the header was updated successfully */ +    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; +    return 0; +} + +/* + * Clears the dirty bit and flushes before if necessary.  Only call this + * function when there are no pending requests, it does not guard against + * concurrent requests dirtying the image. + */ +static int qcow2_mark_clean(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; + +    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { +        int ret; + +        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + +        ret = bdrv_flush(bs); +        if (ret < 0) { +            return ret; +        } + +        return qcow2_update_header(bs); +    } +    return 0; +} + +/* + * Marks the image as corrupt. + */ +int qcow2_mark_corrupt(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; + +    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; +    return qcow2_update_header(bs); +} + +/* + * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes + * before if necessary. + */ +int qcow2_mark_consistent(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; + +    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { +        int ret = bdrv_flush(bs); +        if (ret < 0) { +            return ret; +        } + +        s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; +        return qcow2_update_header(bs); +    } +    return 0; +} + +static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, +                       BdrvCheckMode fix) +{ +    int ret = qcow2_check_refcounts(bs, result, fix); +    if (ret < 0) { +        return ret; +    } + +    if (fix && result->check_errors == 0 && result->corruptions == 0) { +        ret = qcow2_mark_clean(bs); +        if (ret < 0) { +            return ret; +        } +        return qcow2_mark_consistent(bs); +    } +    return ret; +} + +static int validate_table_offset(BlockDriverState *bs, uint64_t offset, +                                 uint64_t entries, size_t entry_len) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t size; + +    /* Use signed INT64_MAX as the maximum even for uint64_t header fields, +     * because values will be passed to qemu functions taking int64_t. 
*/ +    if (entries > INT64_MAX / entry_len) { +        return -EINVAL; +    } + +    size = entries * entry_len; + +    if (INT64_MAX - size < offset) { +        return -EINVAL; +    } + +    /* Tables must be cluster aligned */ +    if (offset & (s->cluster_size - 1)) { +        return -EINVAL; +    } + +    return 0; +} + +static QemuOptsList qcow2_runtime_opts = { +    .name = "qcow2", +    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), +    .desc = { +        { +            .name = QCOW2_OPT_LAZY_REFCOUNTS, +            .type = QEMU_OPT_BOOL, +            .help = "Postpone refcount updates", +        }, +        { +            .name = QCOW2_OPT_DISCARD_REQUEST, +            .type = QEMU_OPT_BOOL, +            .help = "Pass guest discard requests to the layer below", +        }, +        { +            .name = QCOW2_OPT_DISCARD_SNAPSHOT, +            .type = QEMU_OPT_BOOL, +            .help = "Generate discard requests when snapshot related space " +                    "is freed", +        }, +        { +            .name = QCOW2_OPT_DISCARD_OTHER, +            .type = QEMU_OPT_BOOL, +            .help = "Generate discard requests when other clusters are freed", +        }, +        { +            .name = QCOW2_OPT_OVERLAP, +            .type = QEMU_OPT_STRING, +            .help = "Selects which overlap checks to perform from a range of " +                    "templates (none, constant, cached, all)", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_TEMPLATE, +            .type = QEMU_OPT_STRING, +            .help = "Selects which overlap checks to perform from a range of " +                    "templates (none, constant, cached, all)", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, +            .type = QEMU_OPT_BOOL, +            .help = "Check for unintended writes into the main qcow2 header", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1, +            .type = QEMU_OPT_BOOL, +            .help = "Check for unintended writes into the active L1 table", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2, +            .type = QEMU_OPT_BOOL, +            .help = "Check for unintended writes into an active L2 table", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, +            .type = QEMU_OPT_BOOL, +            .help = "Check for unintended writes into the refcount table", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, +            .type = QEMU_OPT_BOOL, +            .help = "Check for unintended writes into a refcount block", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, +            .type = QEMU_OPT_BOOL, +            .help = "Check for unintended writes into the snapshot table", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1, +            .type = QEMU_OPT_BOOL, +            .help = "Check for unintended writes into an inactive L1 table", +        }, +        { +            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2, +            .type = QEMU_OPT_BOOL, +            .help = "Check for unintended writes into an inactive L2 table", +        }, +        { +            .name = QCOW2_OPT_CACHE_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Maximum combined metadata (L2 tables and refcount blocks) " +                    "cache size", +        }, +        { +            .name = QCOW2_OPT_L2_CACHE_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = 
"Maximum L2 table cache size", +        }, +        { +            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Maximum refcount block cache size", +        }, +        { /* end of list */ } +    }, +}; + +static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { +    [QCOW2_OL_MAIN_HEADER_BITNR]    = QCOW2_OPT_OVERLAP_MAIN_HEADER, +    [QCOW2_OL_ACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L1, +    [QCOW2_OL_ACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L2, +    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, +    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, +    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, +    [QCOW2_OL_INACTIVE_L1_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L1, +    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2, +}; + +static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, +                             uint64_t *l2_cache_size, +                             uint64_t *refcount_cache_size, Error **errp) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t combined_cache_size; +    bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set; + +    combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE); +    l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE); +    refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE); + +    combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0); +    *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0); +    *refcount_cache_size = qemu_opt_get_size(opts, +                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0); + +    if (combined_cache_size_set) { +        if (l2_cache_size_set && refcount_cache_size_set) { +            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE +                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set " +                       "the same time"); +            return; +        } else if (*l2_cache_size > combined_cache_size) { +            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed " +                       QCOW2_OPT_CACHE_SIZE); +            return; +        } else if (*refcount_cache_size > combined_cache_size) { +            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed " +                       QCOW2_OPT_CACHE_SIZE); +            return; +        } + +        if (l2_cache_size_set) { +            *refcount_cache_size = combined_cache_size - *l2_cache_size; +        } else if (refcount_cache_size_set) { +            *l2_cache_size = combined_cache_size - *refcount_cache_size; +        } else { +            *refcount_cache_size = combined_cache_size +                                 / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1); +            *l2_cache_size = combined_cache_size - *refcount_cache_size; +        } +    } else { +        if (!l2_cache_size_set && !refcount_cache_size_set) { +            *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE, +                                 (uint64_t)DEFAULT_L2_CACHE_CLUSTERS +                                 * s->cluster_size); +            *refcount_cache_size = *l2_cache_size +                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO; +        } else if (!l2_cache_size_set) { +            *l2_cache_size = *refcount_cache_size +                           * DEFAULT_L2_REFCOUNT_SIZE_RATIO; +        } else if 
(!refcount_cache_size_set) { +            *refcount_cache_size = *l2_cache_size +                                 / DEFAULT_L2_REFCOUNT_SIZE_RATIO; +        } +    } +} + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, +                      Error **errp) +{ +    BDRVQcowState *s = bs->opaque; +    unsigned int len, i; +    int ret = 0; +    QCowHeader header; +    QemuOpts *opts = NULL; +    Error *local_err = NULL; +    uint64_t ext_end; +    uint64_t l1_vm_state_index; +    const char *opt_overlap_check, *opt_overlap_check_template; +    int overlap_check_template = 0; +    uint64_t l2_cache_size, refcount_cache_size; + +    ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not read qcow2 header"); +        goto fail; +    } +    be32_to_cpus(&header.magic); +    be32_to_cpus(&header.version); +    be64_to_cpus(&header.backing_file_offset); +    be32_to_cpus(&header.backing_file_size); +    be64_to_cpus(&header.size); +    be32_to_cpus(&header.cluster_bits); +    be32_to_cpus(&header.crypt_method); +    be64_to_cpus(&header.l1_table_offset); +    be32_to_cpus(&header.l1_size); +    be64_to_cpus(&header.refcount_table_offset); +    be32_to_cpus(&header.refcount_table_clusters); +    be64_to_cpus(&header.snapshots_offset); +    be32_to_cpus(&header.nb_snapshots); + +    if (header.magic != QCOW_MAGIC) { +        error_setg(errp, "Image is not in qcow2 format"); +        ret = -EINVAL; +        goto fail; +    } +    if (header.version < 2 || header.version > 3) { +        report_unsupported(bs, errp, "QCOW version %" PRIu32, header.version); +        ret = -ENOTSUP; +        goto fail; +    } + +    s->qcow_version = header.version; + +    /* Initialise cluster size */ +    if (header.cluster_bits < MIN_CLUSTER_BITS || +        header.cluster_bits > MAX_CLUSTER_BITS) { +        error_setg(errp, "Unsupported cluster size: 2^%" PRIu32, +                   header.cluster_bits); +        ret = -EINVAL; +        goto fail; +    } + +    s->cluster_bits = header.cluster_bits; +    s->cluster_size = 1 << s->cluster_bits; +    s->cluster_sectors = 1 << (s->cluster_bits - 9); + +    /* Initialise version 3 header fields */ +    if (header.version == 2) { +        header.incompatible_features    = 0; +        header.compatible_features      = 0; +        header.autoclear_features       = 0; +        header.refcount_order           = 4; +        header.header_length            = 72; +    } else { +        be64_to_cpus(&header.incompatible_features); +        be64_to_cpus(&header.compatible_features); +        be64_to_cpus(&header.autoclear_features); +        be32_to_cpus(&header.refcount_order); +        be32_to_cpus(&header.header_length); + +        if (header.header_length < 104) { +            error_setg(errp, "qcow2 header too short"); +            ret = -EINVAL; +            goto fail; +        } +    } + +    if (header.header_length > s->cluster_size) { +        error_setg(errp, "qcow2 header exceeds cluster size"); +        ret = -EINVAL; +        goto fail; +    } + +    if (header.header_length > sizeof(header)) { +        s->unknown_header_fields_size = header.header_length - sizeof(header); +        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); +        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, +                         s->unknown_header_fields_size); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not read unknown 
qcow2 header " +                             "fields"); +            goto fail; +        } +    } + +    if (header.backing_file_offset > s->cluster_size) { +        error_setg(errp, "Invalid backing file offset"); +        ret = -EINVAL; +        goto fail; +    } + +    if (header.backing_file_offset) { +        ext_end = header.backing_file_offset; +    } else { +        ext_end = 1 << header.cluster_bits; +    } + +    /* Handle feature bits */ +    s->incompatible_features    = header.incompatible_features; +    s->compatible_features      = header.compatible_features; +    s->autoclear_features       = header.autoclear_features; + +    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { +        void *feature_table = NULL; +        qcow2_read_extensions(bs, header.header_length, ext_end, +                              &feature_table, NULL); +        report_unsupported_feature(bs, errp, feature_table, +                                   s->incompatible_features & +                                   ~QCOW2_INCOMPAT_MASK); +        ret = -ENOTSUP; +        g_free(feature_table); +        goto fail; +    } + +    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { +        /* Corrupt images may not be written to unless they are being repaired +         */ +        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { +            error_setg(errp, "qcow2: Image is corrupt; cannot be opened " +                       "read/write"); +            ret = -EACCES; +            goto fail; +        } +    } + +    /* Check support for various header values */ +    if (header.refcount_order > 6) { +        error_setg(errp, "Reference count entry width too large; may not " +                   "exceed 64 bits"); +        ret = -EINVAL; +        goto fail; +    } +    s->refcount_order = header.refcount_order; +    s->refcount_bits = 1 << s->refcount_order; +    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); +    s->refcount_max += s->refcount_max - 1; + +    if (header.crypt_method > QCOW_CRYPT_AES) { +        error_setg(errp, "Unsupported encryption method: %" PRIu32, +                   header.crypt_method); +        ret = -EINVAL; +        goto fail; +    } +    if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { +        error_setg(errp, "AES cipher not available"); +        ret = -EINVAL; +        goto fail; +    } +    s->crypt_method_header = header.crypt_method; +    if (s->crypt_method_header) { +        bs->encrypted = 1; +    } + +    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ +    s->l2_size = 1 << s->l2_bits; +    /* 2^(s->refcount_order - 3) is the refcount width in bytes */ +    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3); +    s->refcount_block_size = 1 << s->refcount_block_bits; +    bs->total_sectors = header.size / 512; +    s->csize_shift = (62 - (s->cluster_bits - 8)); +    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; +    s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + +    s->refcount_table_offset = header.refcount_table_offset; +    s->refcount_table_size = +        header.refcount_table_clusters << (s->cluster_bits - 3); + +    if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) { +        error_setg(errp, "Reference count table too large"); +        ret = -EINVAL; +        goto fail; +    } + +    ret = validate_table_offset(bs, s->refcount_table_offset, +                                s->refcount_table_size, sizeof(uint64_t)); +    if (ret < 0) { +        error_setg(errp, 
"Invalid reference count table offset"); +        goto fail; +    } + +    /* Snapshot table offset/length */ +    if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) { +        error_setg(errp, "Too many snapshots"); +        ret = -EINVAL; +        goto fail; +    } + +    ret = validate_table_offset(bs, header.snapshots_offset, +                                header.nb_snapshots, +                                sizeof(QCowSnapshotHeader)); +    if (ret < 0) { +        error_setg(errp, "Invalid snapshot table offset"); +        goto fail; +    } + +    /* read the level 1 table */ +    if (header.l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { +        error_setg(errp, "Active L1 table too large"); +        ret = -EFBIG; +        goto fail; +    } +    s->l1_size = header.l1_size; + +    l1_vm_state_index = size_to_l1(s, header.size); +    if (l1_vm_state_index > INT_MAX) { +        error_setg(errp, "Image is too big"); +        ret = -EFBIG; +        goto fail; +    } +    s->l1_vm_state_index = l1_vm_state_index; + +    /* the L1 table must contain at least enough entries to put +       header.size bytes */ +    if (s->l1_size < s->l1_vm_state_index) { +        error_setg(errp, "L1 table is too small"); +        ret = -EINVAL; +        goto fail; +    } + +    ret = validate_table_offset(bs, header.l1_table_offset, +                                header.l1_size, sizeof(uint64_t)); +    if (ret < 0) { +        error_setg(errp, "Invalid L1 table offset"); +        goto fail; +    } +    s->l1_table_offset = header.l1_table_offset; + + +    if (s->l1_size > 0) { +        s->l1_table = qemu_try_blockalign(bs->file, +            align_offset(s->l1_size * sizeof(uint64_t), 512)); +        if (s->l1_table == NULL) { +            error_setg(errp, "Could not allocate L1 table"); +            ret = -ENOMEM; +            goto fail; +        } +        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, +                         s->l1_size * sizeof(uint64_t)); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not read L1 table"); +            goto fail; +        } +        for(i = 0;i < s->l1_size; i++) { +            be64_to_cpus(&s->l1_table[i]); +        } +    } + +    /* get L2 table/refcount block cache size from command line options */ +    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size, +                     &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    l2_cache_size /= s->cluster_size; +    if (l2_cache_size < MIN_L2_CACHE_SIZE) { +        l2_cache_size = MIN_L2_CACHE_SIZE; +    } +    if (l2_cache_size > INT_MAX) { +        error_setg(errp, "L2 cache size too big"); +        ret = -EINVAL; +        goto fail; +    } + +    refcount_cache_size /= s->cluster_size; +    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) { +        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE; +    } +    if (refcount_cache_size > INT_MAX) { +        error_setg(errp, "Refcount cache size too big"); +        ret = -EINVAL; +        goto fail; +    } + +    /* alloc L2 table/refcount block cache */ +    s->l2_table_cache = qcow2_cache_create(bs, l2_cache_size); +    s->refcount_block_cache = qcow2_cache_create(bs, 
refcount_cache_size); +    if (s->l2_table_cache == NULL || s->refcount_block_cache == NULL) { +        error_setg(errp, "Could not allocate metadata caches"); +        ret = -ENOMEM; +        goto fail; +    } + +    s->cluster_cache = g_malloc(s->cluster_size); +    /* one more sector for decompressed data alignment */ +    s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS +                                                    * s->cluster_size + 512); +    if (s->cluster_data == NULL) { +        error_setg(errp, "Could not allocate temporary cluster buffer"); +        ret = -ENOMEM; +        goto fail; +    } + +    s->cluster_cache_offset = -1; +    s->flags = flags; + +    ret = qcow2_refcount_init(bs); +    if (ret != 0) { +        error_setg_errno(errp, -ret, "Could not initialize refcount handling"); +        goto fail; +    } + +    QLIST_INIT(&s->cluster_allocs); +    QTAILQ_INIT(&s->discards); + +    /* read qcow2 extensions */ +    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL, +        &local_err)) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    /* read the backing file name */ +    if (header.backing_file_offset != 0) { +        len = header.backing_file_size; +        if (len > MIN(1023, s->cluster_size - header.backing_file_offset) || +            len >= sizeof(bs->backing_file)) { +            error_setg(errp, "Backing file name too long"); +            ret = -EINVAL; +            goto fail; +        } +        ret = bdrv_pread(bs->file, header.backing_file_offset, +                         bs->backing_file, len); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not read backing file name"); +            goto fail; +        } +        bs->backing_file[len] = '\0'; +        s->image_backing_file = g_strdup(bs->backing_file); +    } + +    /* Internal snapshots */ +    s->snapshots_offset = header.snapshots_offset; +    s->nb_snapshots = header.nb_snapshots; + +    ret = qcow2_read_snapshots(bs); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not read snapshots"); +        goto fail; +    } + +    /* Clear unknown autoclear feature bits */ +    if (!bs->read_only && !(flags & BDRV_O_INCOMING) && s->autoclear_features) { +        s->autoclear_features = 0; +        ret = qcow2_update_header(bs); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not update qcow2 header"); +            goto fail; +        } +    } + +    /* Initialise locks */ +    qemu_co_mutex_init(&s->lock); + +    /* Repair image if dirty */ +    if (!(flags & (BDRV_O_CHECK | BDRV_O_INCOMING)) && !bs->read_only && +        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { +        BdrvCheckResult result = {0}; + +        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not repair dirty image"); +            goto fail; +        } +    } + +    /* Enable lazy_refcounts according to image and command line options */ +    s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, +        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); + +    s->discard_passthrough[QCOW2_DISCARD_NEVER] = false; +    s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; +    s->discard_passthrough[QCOW2_DISCARD_REQUEST] = +        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, +                          flags & BDRV_O_UNMAP); +    
s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = +        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); +    s->discard_passthrough[QCOW2_DISCARD_OTHER] = +        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + +    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP); +    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE); +    if (opt_overlap_check_template && opt_overlap_check && +        strcmp(opt_overlap_check_template, opt_overlap_check)) +    { +        error_setg(errp, "Conflicting values for qcow2 options '" +                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE +                   "' ('%s')", opt_overlap_check, opt_overlap_check_template); +        ret = -EINVAL; +        goto fail; +    } +    if (!opt_overlap_check) { +        opt_overlap_check = opt_overlap_check_template ?: "cached"; +    } + +    if (!strcmp(opt_overlap_check, "none")) { +        overlap_check_template = 0; +    } else if (!strcmp(opt_overlap_check, "constant")) { +        overlap_check_template = QCOW2_OL_CONSTANT; +    } else if (!strcmp(opt_overlap_check, "cached")) { +        overlap_check_template = QCOW2_OL_CACHED; +    } else if (!strcmp(opt_overlap_check, "all")) { +        overlap_check_template = QCOW2_OL_ALL; +    } else { +        error_setg(errp, "Unsupported value '%s' for qcow2 option " +                   "'overlap-check'. Allowed are either of the following: " +                   "none, constant, cached, all", opt_overlap_check); +        ret = -EINVAL; +        goto fail; +    } + +    s->overlap_check = 0; +    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { +        /* overlap-check defines a template bitmask, but every flag may be +         * overwritten through the associated boolean option */ +        s->overlap_check |= +            qemu_opt_get_bool(opts, overlap_bool_option_names[i], +                              overlap_check_template & (1 << i)) << i; +    } + +    qemu_opts_del(opts); +    opts = NULL; + +    if (s->use_lazy_refcounts && s->qcow_version < 3) { +        error_setg(errp, "Lazy refcounts require a qcow2 image with at least " +                   "qemu 1.1 compatibility level"); +        ret = -EINVAL; +        goto fail; +    } + +#ifdef DEBUG_ALLOC +    { +        BdrvCheckResult result = {0}; +        qcow2_check_refcounts(bs, &result, 0); +    } +#endif +    return ret; + + fail: +    qemu_opts_del(opts); +    g_free(s->unknown_header_fields); +    cleanup_unknown_header_ext(bs); +    qcow2_free_snapshots(bs); +    qcow2_refcount_close(bs); +    qemu_vfree(s->l1_table); +    /* else pre-write overlap checks in cache_destroy may crash */ +    s->l1_table = NULL; +    if (s->l2_table_cache) { +        qcow2_cache_destroy(bs, s->l2_table_cache); +    } +    if (s->refcount_block_cache) { +        qcow2_cache_destroy(bs, s->refcount_block_cache); +    } +    g_free(s->cluster_cache); +    qemu_vfree(s->cluster_data); +    return ret; +} + +static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) +{ +    BDRVQcowState *s = bs->opaque; + +    bs->bl.write_zeroes_alignment = s->cluster_sectors; +} + +static int qcow2_set_key(BlockDriverState *bs, const char *key) +{ +    BDRVQcowState *s = bs->opaque; +    uint8_t keybuf[16]; +    int len, i; +    Error *err = NULL; + +    memset(keybuf, 0, 16); +    len = strlen(key); +    if (len > 16) +        len = 16; +    /* XXX: we could compress the chars to 7 bits to increase +       entropy */ +    for(i = 0;i < len;i++) { +        
keybuf[i] = key[i]; +    } +    assert(bs->encrypted); + +    qcrypto_cipher_free(s->cipher); +    s->cipher = qcrypto_cipher_new( +        QCRYPTO_CIPHER_ALG_AES_128, +        QCRYPTO_CIPHER_MODE_CBC, +        keybuf, G_N_ELEMENTS(keybuf), +        &err); + +    if (!s->cipher) { +        /* XXX would be nice if errors in this method could +         * be properly propagate to the caller. Would need +         * the bdrv_set_key() API signature to be fixed. */ +        error_free(err); +        return -1; +    } +    return 0; +} + +/* We have no actual commit/abort logic for qcow2, but we need to write out any + * unwritten data if we reopen read-only. */ +static int qcow2_reopen_prepare(BDRVReopenState *state, +                                BlockReopenQueue *queue, Error **errp) +{ +    int ret; + +    if ((state->flags & BDRV_O_RDWR) == 0) { +        ret = bdrv_flush(state->bs); +        if (ret < 0) { +            return ret; +        } + +        ret = qcow2_mark_clean(state->bs); +        if (ret < 0) { +            return ret; +        } +    } + +    return 0; +} + +static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, int *pnum) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t cluster_offset; +    int index_in_cluster, ret; +    int64_t status = 0; + +    *pnum = nb_sectors; +    qemu_co_mutex_lock(&s->lock); +    ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); +    qemu_co_mutex_unlock(&s->lock); +    if (ret < 0) { +        return ret; +    } + +    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && +        !s->cipher) { +        index_in_cluster = sector_num & (s->cluster_sectors - 1); +        cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); +        status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; +    } +    if (ret == QCOW2_CLUSTER_ZERO) { +        status |= BDRV_BLOCK_ZERO; +    } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { +        status |= BDRV_BLOCK_DATA; +    } +    return status; +} + +/* handle reading after the end of the backing file */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, +                  int64_t sector_num, int nb_sectors) +{ +    int n1; +    if ((sector_num + nb_sectors) <= bs->total_sectors) +        return nb_sectors; +    if (sector_num >= bs->total_sectors) +        n1 = 0; +    else +        n1 = bs->total_sectors - sector_num; + +    qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); + +    return n1; +} + +static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, +                          int remaining_sectors, QEMUIOVector *qiov) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster, n1; +    int ret; +    int cur_nr_sectors; /* number of sectors in current iteration */ +    uint64_t cluster_offset = 0; +    uint64_t bytes_done = 0; +    QEMUIOVector hd_qiov; +    uint8_t *cluster_data = NULL; + +    qemu_iovec_init(&hd_qiov, qiov->niov); + +    qemu_co_mutex_lock(&s->lock); + +    while (remaining_sectors != 0) { + +        /* prepare next request */ +        cur_nr_sectors = remaining_sectors; +        if (s->cipher) { +            cur_nr_sectors = MIN(cur_nr_sectors, +                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); +        } + +        ret = qcow2_get_cluster_offset(bs, sector_num << 9, +            &cur_nr_sectors, &cluster_offset); +        if (ret < 0) { +            goto fail; +        } + +        index_in_cluster = sector_num & 
(s->cluster_sectors - 1); + +        qemu_iovec_reset(&hd_qiov); +        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, +            cur_nr_sectors * 512); + +        switch (ret) { +        case QCOW2_CLUSTER_UNALLOCATED: + +            if (bs->backing_hd) { +                /* read from the base image */ +                n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, +                    sector_num, cur_nr_sectors); +                if (n1 > 0) { +                    QEMUIOVector local_qiov; + +                    qemu_iovec_init(&local_qiov, hd_qiov.niov); +                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, +                                      n1 * BDRV_SECTOR_SIZE); + +                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); +                    qemu_co_mutex_unlock(&s->lock); +                    ret = bdrv_co_readv(bs->backing_hd, sector_num, +                                        n1, &local_qiov); +                    qemu_co_mutex_lock(&s->lock); + +                    qemu_iovec_destroy(&local_qiov); + +                    if (ret < 0) { +                        goto fail; +                    } +                } +            } else { +                /* Note: in this case, no need to wait */ +                qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); +            } +            break; + +        case QCOW2_CLUSTER_ZERO: +            qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); +            break; + +        case QCOW2_CLUSTER_COMPRESSED: +            /* add AIO support for compressed blocks ? */ +            ret = qcow2_decompress_cluster(bs, cluster_offset); +            if (ret < 0) { +                goto fail; +            } + +            qemu_iovec_from_buf(&hd_qiov, 0, +                s->cluster_cache + index_in_cluster * 512, +                512 * cur_nr_sectors); +            break; + +        case QCOW2_CLUSTER_NORMAL: +            if ((cluster_offset & 511) != 0) { +                ret = -EIO; +                goto fail; +            } + +            if (bs->encrypted) { +                assert(s->cipher); + +                /* +                 * For encrypted images, read everything into a temporary +                 * contiguous buffer on which the AES functions can work. 
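+                 *
+                 * The buffer is sized for QCOW_MAX_CRYPT_CLUSTERS clusters,
+                 * which matches the cap applied to cur_nr_sectors at the top
+                 * of the loop, so a single iteration always fits.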
+                 */ +                if (!cluster_data) { +                    cluster_data = +                        qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS +                                                      * s->cluster_size); +                    if (cluster_data == NULL) { +                        ret = -ENOMEM; +                        goto fail; +                    } +                } + +                assert(cur_nr_sectors <= +                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); +                qemu_iovec_reset(&hd_qiov); +                qemu_iovec_add(&hd_qiov, cluster_data, +                    512 * cur_nr_sectors); +            } + +            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); +            qemu_co_mutex_unlock(&s->lock); +            ret = bdrv_co_readv(bs->file, +                                (cluster_offset >> 9) + index_in_cluster, +                                cur_nr_sectors, &hd_qiov); +            qemu_co_mutex_lock(&s->lock); +            if (ret < 0) { +                goto fail; +            } +            if (bs->encrypted) { +                assert(s->cipher); +                Error *err = NULL; +                if (qcow2_encrypt_sectors(s, sector_num,  cluster_data, +                                          cluster_data, cur_nr_sectors, false, +                                          &err) < 0) { +                    error_free(err); +                    ret = -EIO; +                    goto fail; +                } +                qemu_iovec_from_buf(qiov, bytes_done, +                    cluster_data, 512 * cur_nr_sectors); +            } +            break; + +        default: +            g_assert_not_reached(); +            ret = -EIO; +            goto fail; +        } + +        remaining_sectors -= cur_nr_sectors; +        sector_num += cur_nr_sectors; +        bytes_done += cur_nr_sectors * 512; +    } +    ret = 0; + +fail: +    qemu_co_mutex_unlock(&s->lock); + +    qemu_iovec_destroy(&hd_qiov); +    qemu_vfree(cluster_data); + +    return ret; +} + +static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, +                           int64_t sector_num, +                           int remaining_sectors, +                           QEMUIOVector *qiov) +{ +    BDRVQcowState *s = bs->opaque; +    int index_in_cluster; +    int ret; +    int cur_nr_sectors; /* number of sectors in current iteration */ +    uint64_t cluster_offset; +    QEMUIOVector hd_qiov; +    uint64_t bytes_done = 0; +    uint8_t *cluster_data = NULL; +    QCowL2Meta *l2meta = NULL; + +    trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, +                                 remaining_sectors); + +    qemu_iovec_init(&hd_qiov, qiov->niov); + +    s->cluster_cache_offset = -1; /* disable compressed cache */ + +    qemu_co_mutex_lock(&s->lock); + +    while (remaining_sectors != 0) { + +        l2meta = NULL; + +        trace_qcow2_writev_start_part(qemu_coroutine_self()); +        index_in_cluster = sector_num & (s->cluster_sectors - 1); +        cur_nr_sectors = remaining_sectors; +        if (bs->encrypted && +            cur_nr_sectors > +            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { +            cur_nr_sectors = +                QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster; +        } + +        ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, +            &cur_nr_sectors, &cluster_offset, &l2meta); +        if (ret < 0) { +            goto fail; +        } + + 
       assert((cluster_offset & 511) == 0); + +        qemu_iovec_reset(&hd_qiov); +        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, +            cur_nr_sectors * 512); + +        if (bs->encrypted) { +            Error *err = NULL; +            assert(s->cipher); +            if (!cluster_data) { +                cluster_data = qemu_try_blockalign(bs->file, +                                                   QCOW_MAX_CRYPT_CLUSTERS +                                                   * s->cluster_size); +                if (cluster_data == NULL) { +                    ret = -ENOMEM; +                    goto fail; +                } +            } + +            assert(hd_qiov.size <= +                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); +            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); + +            if (qcow2_encrypt_sectors(s, sector_num, cluster_data, +                                      cluster_data, cur_nr_sectors, +                                      true, &err) < 0) { +                error_free(err); +                ret = -EIO; +                goto fail; +            } + +            qemu_iovec_reset(&hd_qiov); +            qemu_iovec_add(&hd_qiov, cluster_data, +                cur_nr_sectors * 512); +        } + +        ret = qcow2_pre_write_overlap_check(bs, 0, +                cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, +                cur_nr_sectors * BDRV_SECTOR_SIZE); +        if (ret < 0) { +            goto fail; +        } + +        qemu_co_mutex_unlock(&s->lock); +        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); +        trace_qcow2_writev_data(qemu_coroutine_self(), +                                (cluster_offset >> 9) + index_in_cluster); +        ret = bdrv_co_writev(bs->file, +                             (cluster_offset >> 9) + index_in_cluster, +                             cur_nr_sectors, &hd_qiov); +        qemu_co_mutex_lock(&s->lock); +        if (ret < 0) { +            goto fail; +        } + +        while (l2meta != NULL) { +            QCowL2Meta *next; + +            ret = qcow2_alloc_cluster_link_l2(bs, l2meta); +            if (ret < 0) { +                goto fail; +            } + +            /* Take the request off the list of running requests */ +            if (l2meta->nb_clusters != 0) { +                QLIST_REMOVE(l2meta, next_in_flight); +            } + +            qemu_co_queue_restart_all(&l2meta->dependent_requests); + +            next = l2meta->next; +            g_free(l2meta); +            l2meta = next; +        } + +        remaining_sectors -= cur_nr_sectors; +        sector_num += cur_nr_sectors; +        bytes_done += cur_nr_sectors * 512; +        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); +    } +    ret = 0; + +fail: +    qemu_co_mutex_unlock(&s->lock); + +    while (l2meta != NULL) { +        QCowL2Meta *next; + +        if (l2meta->nb_clusters != 0) { +            QLIST_REMOVE(l2meta, next_in_flight); +        } +        qemu_co_queue_restart_all(&l2meta->dependent_requests); + +        next = l2meta->next; +        g_free(l2meta); +        l2meta = next; +    } + +    qemu_iovec_destroy(&hd_qiov); +    qemu_vfree(cluster_data); +    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); + +    return ret; +} + +static void qcow2_close(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    qemu_vfree(s->l1_table); +    /* else pre-write overlap checks in cache_destroy may crash */ +    s->l1_table = NULL; + +    if 
(!(bs->open_flags & BDRV_O_INCOMING)) { +        int ret1, ret2; + +        ret1 = qcow2_cache_flush(bs, s->l2_table_cache); +        ret2 = qcow2_cache_flush(bs, s->refcount_block_cache); + +        if (ret1) { +            error_report("Failed to flush the L2 table cache: %s", +                         strerror(-ret1)); +        } +        if (ret2) { +            error_report("Failed to flush the refcount block cache: %s", +                         strerror(-ret2)); +        } + +        if (!ret1 && !ret2) { +            qcow2_mark_clean(bs); +        } +    } + +    qcow2_cache_destroy(bs, s->l2_table_cache); +    qcow2_cache_destroy(bs, s->refcount_block_cache); + +    qcrypto_cipher_free(s->cipher); +    s->cipher = NULL; + +    g_free(s->unknown_header_fields); +    cleanup_unknown_header_ext(bs); + +    g_free(s->image_backing_file); +    g_free(s->image_backing_format); + +    g_free(s->cluster_cache); +    qemu_vfree(s->cluster_data); +    qcow2_refcount_close(bs); +    qcow2_free_snapshots(bs); +} + +static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) +{ +    BDRVQcowState *s = bs->opaque; +    int flags = s->flags; +    QCryptoCipher *cipher = NULL; +    QDict *options; +    Error *local_err = NULL; +    int ret; + +    /* +     * Backing files are read-only which makes all of their metadata immutable, +     * that means we don't have to worry about reopening them here. +     */ + +    cipher = s->cipher; +    s->cipher = NULL; + +    qcow2_close(bs); + +    bdrv_invalidate_cache(bs->file, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        return; +    } + +    memset(s, 0, sizeof(BDRVQcowState)); +    options = qdict_clone_shallow(bs->options); + +    ret = qcow2_open(bs, options, flags, &local_err); +    QDECREF(options); +    if (local_err) { +        error_setg(errp, "Could not reopen qcow2 layer: %s", +                   error_get_pretty(local_err)); +        error_free(local_err); +        return; +    } else if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); +        return; +    } + +    s->cipher = cipher; +} + +static size_t header_ext_add(char *buf, uint32_t magic, const void *s, +    size_t len, size_t buflen) +{ +    QCowExtension *ext_backing_fmt = (QCowExtension*) buf; +    size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); + +    if (buflen < ext_len) { +        return -ENOSPC; +    } + +    *ext_backing_fmt = (QCowExtension) { +        .magic  = cpu_to_be32(magic), +        .len    = cpu_to_be32(len), +    }; +    memcpy(buf + sizeof(QCowExtension), s, len); + +    return ext_len; +} + +/* + * Updates the qcow2 header, including the variable length parts of it, i.e. + * the backing file name and all extensions. qcow2 was not designed to allow + * such changes, so if we run out of space (we can only use the first cluster) + * this function may fail. + * + * Returns 0 on success, -errno in error cases. 
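+ *
+ * As a rough summary of what this function produces (not a format change),
+ * the first cluster is laid out in this order:
+ *
+ *     QCowHeader (v2 or v3 length)
+ *     unknown header fields preserved from open
+ *     backing format header extension
+ *     feature name table extension
+ *     unknown header extensions
+ *     end-of-extensions marker
+ *     backing file name (pointed to by the header, not an extension)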
+ */ +int qcow2_update_header(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    QCowHeader *header; +    char *buf; +    size_t buflen = s->cluster_size; +    int ret; +    uint64_t total_size; +    uint32_t refcount_table_clusters; +    size_t header_length; +    Qcow2UnknownHeaderExtension *uext; + +    buf = qemu_blockalign(bs, buflen); + +    /* Header structure */ +    header = (QCowHeader*) buf; + +    if (buflen < sizeof(*header)) { +        ret = -ENOSPC; +        goto fail; +    } + +    header_length = sizeof(*header) + s->unknown_header_fields_size; +    total_size = bs->total_sectors * BDRV_SECTOR_SIZE; +    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); + +    *header = (QCowHeader) { +        /* Version 2 fields */ +        .magic                  = cpu_to_be32(QCOW_MAGIC), +        .version                = cpu_to_be32(s->qcow_version), +        .backing_file_offset    = 0, +        .backing_file_size      = 0, +        .cluster_bits           = cpu_to_be32(s->cluster_bits), +        .size                   = cpu_to_be64(total_size), +        .crypt_method           = cpu_to_be32(s->crypt_method_header), +        .l1_size                = cpu_to_be32(s->l1_size), +        .l1_table_offset        = cpu_to_be64(s->l1_table_offset), +        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset), +        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), +        .nb_snapshots           = cpu_to_be32(s->nb_snapshots), +        .snapshots_offset       = cpu_to_be64(s->snapshots_offset), + +        /* Version 3 fields */ +        .incompatible_features  = cpu_to_be64(s->incompatible_features), +        .compatible_features    = cpu_to_be64(s->compatible_features), +        .autoclear_features     = cpu_to_be64(s->autoclear_features), +        .refcount_order         = cpu_to_be32(s->refcount_order), +        .header_length          = cpu_to_be32(header_length), +    }; + +    /* For older versions, write a shorter header */ +    switch (s->qcow_version) { +    case 2: +        ret = offsetof(QCowHeader, incompatible_features); +        break; +    case 3: +        ret = sizeof(*header); +        break; +    default: +        ret = -EINVAL; +        goto fail; +    } + +    buf += ret; +    buflen -= ret; +    memset(buf, 0, buflen); + +    /* Preserve any unknown field in the header */ +    if (s->unknown_header_fields_size) { +        if (buflen < s->unknown_header_fields_size) { +            ret = -ENOSPC; +            goto fail; +        } + +        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size); +        buf += s->unknown_header_fields_size; +        buflen -= s->unknown_header_fields_size; +    } + +    /* Backing file format header extension */ +    if (s->image_backing_format) { +        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, +                             s->image_backing_format, +                             strlen(s->image_backing_format), +                             buflen); +        if (ret < 0) { +            goto fail; +        } + +        buf += ret; +        buflen -= ret; +    } + +    /* Feature table */ +    Qcow2Feature features[] = { +        { +            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, +            .bit  = QCOW2_INCOMPAT_DIRTY_BITNR, +            .name = "dirty bit", +        }, +        { +            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, +            .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR, +            .name = "corrupt bit", +        }, +   
     { +            .type = QCOW2_FEAT_TYPE_COMPATIBLE, +            .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, +            .name = "lazy refcounts", +        }, +    }; + +    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, +                         features, sizeof(features), buflen); +    if (ret < 0) { +        goto fail; +    } +    buf += ret; +    buflen -= ret; + +    /* Keep unknown header extensions */ +    QLIST_FOREACH(uext, &s->unknown_header_ext, next) { +        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); +        if (ret < 0) { +            goto fail; +        } + +        buf += ret; +        buflen -= ret; +    } + +    /* End of header extensions */ +    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); +    if (ret < 0) { +        goto fail; +    } + +    buf += ret; +    buflen -= ret; + +    /* Backing file name */ +    if (s->image_backing_file) { +        size_t backing_file_len = strlen(s->image_backing_file); + +        if (buflen < backing_file_len) { +            ret = -ENOSPC; +            goto fail; +        } + +        /* Using strncpy is ok here, since buf is not NUL-terminated. */ +        strncpy(buf, s->image_backing_file, buflen); + +        header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); +        header->backing_file_size   = cpu_to_be32(backing_file_len); +    } + +    /* Write the new header */ +    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); +    if (ret < 0) { +        goto fail; +    } + +    ret = 0; +fail: +    qemu_vfree(header); +    return ret; +} + +static int qcow2_change_backing_file(BlockDriverState *bs, +    const char *backing_file, const char *backing_fmt) +{ +    BDRVQcowState *s = bs->opaque; + +    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); +    pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + +    g_free(s->image_backing_file); +    g_free(s->image_backing_format); + +    s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL; +    s->image_backing_format = backing_fmt ? 
g_strdup(bs->backing_format) : NULL;
+
+    return qcow2_update_header(bs);
+}
+
+static int preallocate(BlockDriverState *bs)
+{
+    uint64_t nb_sectors;
+    uint64_t offset;
+    uint64_t host_offset = 0;
+    int num;
+    int ret;
+    QCowL2Meta *meta;
+
+    nb_sectors = bdrv_nb_sectors(bs);
+    offset = 0;
+
+    while (nb_sectors) {
+        num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS);
+        ret = qcow2_alloc_cluster_offset(bs, offset, &num,
+                                         &host_offset, &meta);
+        if (ret < 0) {
+            return ret;
+        }
+
+        while (meta) {
+            QCowL2Meta *next = meta->next;
+
+            ret = qcow2_alloc_cluster_link_l2(bs, meta);
+            if (ret < 0) {
+                qcow2_free_any_clusters(bs, meta->alloc_offset,
+                                        meta->nb_clusters, QCOW2_DISCARD_NEVER);
+                return ret;
+            }
+
+            /* There are no dependent requests, but we need to remove our
+             * request from the list of in-flight requests */
+            QLIST_REMOVE(meta, next_in_flight);
+
+            g_free(meta);
+            meta = next;
+        }
+
+        /* TODO Preallocate data if requested */
+
+        nb_sectors -= num;
+        offset += num << BDRV_SECTOR_BITS;
+    }
+
+    /*
+     * It is expected that the image file is large enough to actually contain
+     * all of the allocated clusters (otherwise we get failing reads after
+     * EOF). Extend the image to the last allocated sector.
+     */
+    if (host_offset != 0) {
+        uint8_t buf[BDRV_SECTOR_SIZE];
+        memset(buf, 0, BDRV_SECTOR_SIZE);
+        ret = bdrv_write(bs->file, (host_offset >> BDRV_SECTOR_BITS) + num - 1,
+                         buf, 1);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+static int qcow2_create2(const char *filename, int64_t total_size,
+                         const char *backing_file, const char *backing_format,
+                         int flags, size_t cluster_size, PreallocMode prealloc,
+                         QemuOpts *opts, int version, int refcount_order,
+                         Error **errp)
+{
+    /* Calculate cluster_bits */
+    int cluster_bits;
+    cluster_bits = ctz32(cluster_size);
+    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
+        (1 << cluster_bits) != cluster_size)
+    {
+        error_setg(errp, "Cluster size must be a power of two between %d and "
+                   "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
+        return -EINVAL;
+    }
+
+    /*
+     * Open the image file and write a minimal qcow2 header.
+     *
+     * We keep things simple and start with a zero-sized image. We also
+     * do without refcount blocks or an L1 table for now. We'll fix the
+     * inconsistency later.
+     *
+     * We do need a refcount table because growing the refcount table means
+     * allocating two new refcount blocks - the second of which would be at
+     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
+     * size for any qcow2 image.
+     */ +    BlockDriverState* bs; +    QCowHeader *header; +    uint64_t* refcount_table; +    Error *local_err = NULL; +    int ret; + +    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { +        /* Note: The following calculation does not need to be exact; if it is a +         * bit off, either some bytes will be "leaked" (which is fine) or we +         * will need to increase the file size by some bytes (which is fine, +         * too, as long as the bulk is allocated here). Therefore, using +         * floating point arithmetic is fine. */ +        int64_t meta_size = 0; +        uint64_t nreftablee, nrefblocke, nl1e, nl2e; +        int64_t aligned_total_size = align_offset(total_size, cluster_size); +        int refblock_bits, refblock_size; +        /* refcount entry size in bytes */ +        double rces = (1 << refcount_order) / 8.; + +        /* see qcow2_open() */ +        refblock_bits = cluster_bits - (refcount_order - 3); +        refblock_size = 1 << refblock_bits; + +        /* header: 1 cluster */ +        meta_size += cluster_size; + +        /* total size of L2 tables */ +        nl2e = aligned_total_size / cluster_size; +        nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t)); +        meta_size += nl2e * sizeof(uint64_t); + +        /* total size of L1 tables */ +        nl1e = nl2e * sizeof(uint64_t) / cluster_size; +        nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t)); +        meta_size += nl1e * sizeof(uint64_t); + +        /* total size of refcount blocks +         * +         * note: every host cluster is reference-counted, including metadata +         * (even refcount blocks are recursively included). +         * Let: +         *   a = total_size (this is the guest disk size) +         *   m = meta size not including refcount blocks and refcount tables +         *   c = cluster size +         *   y1 = number of refcount blocks entries +         *   y2 = meta size including everything +         *   rces = refcount entry size in bytes +         * then, +         *   y1 = (y2 + a)/c +         *   y2 = y1 * rces + y1 * rces * sizeof(u64) / c + m +         * we can get y1: +         *   y1 = (a + m) / (c - rces - rces * sizeof(u64) / c) +         */ +        nrefblocke = (aligned_total_size + meta_size + cluster_size) +                   / (cluster_size - rces - rces * sizeof(uint64_t) +                                                 / cluster_size); +        meta_size += DIV_ROUND_UP(nrefblocke, refblock_size) * cluster_size; + +        /* total size of refcount tables */ +        nreftablee = nrefblocke / refblock_size; +        nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t)); +        meta_size += nreftablee * sizeof(uint64_t); + +        qemu_opt_set_number(opts, BLOCK_OPT_SIZE, +                            aligned_total_size + meta_size, &error_abort); +        qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_lookup[prealloc], +                     &error_abort); +    } + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        return ret; +    } + +    bs = NULL; +    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, +                    NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        return ret; +    } + +    /* Write the header */ +    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); +    header = g_malloc0(cluster_size); +    
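+
+    /*
+     * Sketch of the minimal layout written below (offsets are multiples of
+     * the chosen cluster_size, 64 KiB by default):
+     *
+     *   cluster 0: qcow2 header (this buffer)
+     *   cluster 1: refcount table, one cluster long
+     *   cluster 2: first refcount block
+     *
+     * These are the three clusters accounted for by the
+     * qcow2_alloc_clusters(bs, 3 * cluster_size) call further down, once the
+     * image has been reopened through the qcow2 driver.
+     */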
*header = (QCowHeader) { +        .magic                      = cpu_to_be32(QCOW_MAGIC), +        .version                    = cpu_to_be32(version), +        .cluster_bits               = cpu_to_be32(cluster_bits), +        .size                       = cpu_to_be64(0), +        .l1_table_offset            = cpu_to_be64(0), +        .l1_size                    = cpu_to_be32(0), +        .refcount_table_offset      = cpu_to_be64(cluster_size), +        .refcount_table_clusters    = cpu_to_be32(1), +        .refcount_order             = cpu_to_be32(refcount_order), +        .header_length              = cpu_to_be32(sizeof(*header)), +    }; + +    if (flags & BLOCK_FLAG_ENCRYPT) { +        header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES); +    } else { +        header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); +    } + +    if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { +        header->compatible_features |= +            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); +    } + +    ret = bdrv_pwrite(bs, 0, header, cluster_size); +    g_free(header); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not write qcow2 header"); +        goto out; +    } + +    /* Write a refcount table with one refcount block */ +    refcount_table = g_malloc0(2 * cluster_size); +    refcount_table[0] = cpu_to_be64(2 * cluster_size); +    ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size); +    g_free(refcount_table); + +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not write refcount table"); +        goto out; +    } + +    bdrv_unref(bs); +    bs = NULL; + +    /* +     * And now open the image and make it consistent first (i.e. increase the +     * refcount of the cluster that is occupied by the header and the refcount +     * table) +     */ +    ret = bdrv_open(&bs, filename, NULL, NULL, +                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, +                    &bdrv_qcow2, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto out; +    } + +    ret = qcow2_alloc_clusters(bs, 3 * cluster_size); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " +                         "header and refcount table"); +        goto out; + +    } else if (ret != 0) { +        error_report("Huh, first cluster in empty image is already in use?"); +        abort(); +    } + +    /* Okay, now that we have a valid image, let's give it the right size */ +    ret = bdrv_truncate(bs, total_size); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not resize image"); +        goto out; +    } + +    /* Want a backing file? 
There you go.*/ +    if (backing_file) { +        ret = bdrv_change_backing_file(bs, backing_file, backing_format); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not assign backing file '%s' " +                             "with format '%s'", backing_file, backing_format); +            goto out; +        } +    } + +    /* And if we're supposed to preallocate metadata, do that now */ +    if (prealloc != PREALLOC_MODE_OFF) { +        BDRVQcowState *s = bs->opaque; +        qemu_co_mutex_lock(&s->lock); +        ret = preallocate(bs); +        qemu_co_mutex_unlock(&s->lock); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not preallocate metadata"); +            goto out; +        } +    } + +    bdrv_unref(bs); +    bs = NULL; + +    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ +    ret = bdrv_open(&bs, filename, NULL, NULL, +                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, +                    &bdrv_qcow2, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        goto out; +    } + +    ret = 0; +out: +    if (bs) { +        bdrv_unref(bs); +    } +    return ret; +} + +static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    char *backing_file = NULL; +    char *backing_fmt = NULL; +    char *buf = NULL; +    uint64_t size = 0; +    int flags = 0; +    size_t cluster_size = DEFAULT_CLUSTER_SIZE; +    PreallocMode prealloc; +    int version = 3; +    uint64_t refcount_bits = 16; +    int refcount_order; +    Error *local_err = NULL; +    int ret; + +    /* Read out options */ +    size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                    BDRV_SECTOR_SIZE); +    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); +    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); +    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { +        flags |= BLOCK_FLAG_ENCRYPT; +    } +    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, +                                         DEFAULT_CLUSTER_SIZE); +    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); +    prealloc = qapi_enum_parse(PreallocMode_lookup, buf, +                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, +                               &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto finish; +    } +    g_free(buf); +    buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL); +    if (!buf) { +        /* keep the default */ +    } else if (!strcmp(buf, "0.10")) { +        version = 2; +    } else if (!strcmp(buf, "1.1")) { +        version = 3; +    } else { +        error_setg(errp, "Invalid compatibility level: '%s'", buf); +        ret = -EINVAL; +        goto finish; +    } + +    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_LAZY_REFCOUNTS, false)) { +        flags |= BLOCK_FLAG_LAZY_REFCOUNTS; +    } + +    if (backing_file && prealloc != PREALLOC_MODE_OFF) { +        error_setg(errp, "Backing file and preallocation cannot be used at " +                   "the same time"); +        ret = -EINVAL; +        goto finish; +    } + +    if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { +        error_setg(errp, "Lazy refcounts only supported with compatibility " +                   "level 1.1 and above (use compat=1.1 or greater)"); +        ret = -EINVAL; +        goto finish; +    } + +    refcount_bits = 
qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, +                                            refcount_bits); +    if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) { +        error_setg(errp, "Refcount width must be a power of two and may not " +                   "exceed 64 bits"); +        ret = -EINVAL; +        goto finish; +    } + +    if (version < 3 && refcount_bits != 16) { +        error_setg(errp, "Different refcount widths than 16 bits require " +                   "compatibility level 1.1 or above (use compat=1.1 or " +                   "greater)"); +        ret = -EINVAL; +        goto finish; +    } + +    refcount_order = ctz32(refcount_bits); + +    ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, +                        cluster_size, prealloc, opts, version, refcount_order, +                        &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +    } + +finish: +    g_free(backing_file); +    g_free(backing_fmt); +    g_free(buf); +    return ret; +} + +static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ +    int ret; +    BDRVQcowState *s = bs->opaque; + +    /* Emulate misaligned zero writes */ +    if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { +        return -ENOTSUP; +    } + +    /* Whatever is left can use real zero clusters */ +    qemu_co_mutex_lock(&s->lock); +    ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, +        nb_sectors); +    qemu_co_mutex_unlock(&s->lock); + +    return ret; +} + +static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors) +{ +    int ret; +    BDRVQcowState *s = bs->opaque; + +    qemu_co_mutex_lock(&s->lock); +    ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, +        nb_sectors, QCOW2_DISCARD_REQUEST, false); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static int qcow2_truncate(BlockDriverState *bs, int64_t offset) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t new_l1_size; +    int ret; + +    if (offset & 511) { +        error_report("The new size must be a multiple of 512"); +        return -EINVAL; +    } + +    /* cannot proceed if image has snapshots */ +    if (s->nb_snapshots) { +        error_report("Can't resize an image which has snapshots"); +        return -ENOTSUP; +    } + +    /* shrinking is currently not supported */ +    if (offset < bs->total_sectors * 512) { +        error_report("qcow2 doesn't support shrinking images yet"); +        return -ENOTSUP; +    } + +    new_l1_size = size_to_l1(s, offset); +    ret = qcow2_grow_l1_table(bs, new_l1_size, true); +    if (ret < 0) { +        return ret; +    } + +    /* write updated header.size */ +    offset = cpu_to_be64(offset); +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), +                           &offset, sizeof(uint64_t)); +    if (ret < 0) { +        return ret; +    } + +    s->l1_vm_state_index = new_l1_size; +    return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned +   tables to avoid losing bytes in alignment */ +static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, +                                  const uint8_t *buf, int nb_sectors) +{ +    BDRVQcowState *s = bs->opaque; +    z_stream strm; +    int ret, out_len; +    uint8_t *out_buf; +    uint64_t cluster_offset; + +    if (nb_sectors == 0) { +        
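+        /* A zero-length request is how callers (such as qemu-img convert)
+         * conventionally finish a sequence of compressed writes. */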
/* align end of file to a sector boundary to ease reading with +           sector based I/Os */ +        cluster_offset = bdrv_getlength(bs->file); +        return bdrv_truncate(bs->file, cluster_offset); +    } + +    if (nb_sectors != s->cluster_sectors) { +        ret = -EINVAL; + +        /* Zero-pad last write if image size is not cluster aligned */ +        if (sector_num + nb_sectors == bs->total_sectors && +            nb_sectors < s->cluster_sectors) { +            uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); +            memset(pad_buf, 0, s->cluster_size); +            memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); +            ret = qcow2_write_compressed(bs, sector_num, +                                         pad_buf, s->cluster_sectors); +            qemu_vfree(pad_buf); +        } +        return ret; +    } + +    out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + +    /* best compression, small window, no zlib header */ +    memset(&strm, 0, sizeof(strm)); +    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, +                       Z_DEFLATED, -12, +                       9, Z_DEFAULT_STRATEGY); +    if (ret != 0) { +        ret = -EINVAL; +        goto fail; +    } + +    strm.avail_in = s->cluster_size; +    strm.next_in = (uint8_t *)buf; +    strm.avail_out = s->cluster_size; +    strm.next_out = out_buf; + +    ret = deflate(&strm, Z_FINISH); +    if (ret != Z_STREAM_END && ret != Z_OK) { +        deflateEnd(&strm); +        ret = -EINVAL; +        goto fail; +    } +    out_len = strm.next_out - out_buf; + +    deflateEnd(&strm); + +    if (ret != Z_STREAM_END || out_len >= s->cluster_size) { +        /* could not compress: write normal cluster */ +        ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); +        if (ret < 0) { +            goto fail; +        } +    } else { +        cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, +            sector_num << 9, out_len); +        if (!cluster_offset) { +            ret = -EIO; +            goto fail; +        } +        cluster_offset &= s->cluster_offset_mask; + +        ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len); +        if (ret < 0) { +            goto fail; +        } + +        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); +        ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); +        if (ret < 0) { +            goto fail; +        } +    } + +    ret = 0; +fail: +    g_free(out_buf); +    return ret; +} + +static int make_completely_empty(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    int ret, l1_clusters; +    int64_t offset; +    uint64_t *new_reftable = NULL; +    uint64_t rt_entry, l1_size2; +    struct { +        uint64_t l1_offset; +        uint64_t reftable_offset; +        uint32_t reftable_clusters; +    } QEMU_PACKED l1_ofs_rt_ofs_cls; + +    ret = qcow2_cache_empty(bs, s->l2_table_cache); +    if (ret < 0) { +        goto fail; +    } + +    ret = qcow2_cache_empty(bs, s->refcount_block_cache); +    if (ret < 0) { +        goto fail; +    } + +    /* Refcounts will be broken utterly */ +    ret = qcow2_mark_dirty(bs); +    if (ret < 0) { +        goto fail; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + +    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); +    l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t); + +    /* After this call, neither the in-memory nor the on-disk refcount +     * information accurately describe the actual 
references */ + +    ret = bdrv_write_zeroes(bs->file, s->l1_table_offset / BDRV_SECTOR_SIZE, +                            l1_clusters * s->cluster_sectors, 0); +    if (ret < 0) { +        goto fail_broken_refcounts; +    } +    memset(s->l1_table, 0, l1_size2); + +    BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE); + +    /* Overwrite enough clusters at the beginning of the sectors to place +     * the refcount table, a refcount block and the L1 table in; this may +     * overwrite parts of the existing refcount and L1 table, which is not +     * an issue because the dirty flag is set, complete data loss is in fact +     * desired and partial data loss is consequently fine as well */ +    ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE, +                            (2 + l1_clusters) * s->cluster_size / +                            BDRV_SECTOR_SIZE, 0); +    /* This call (even if it failed overall) may have overwritten on-disk +     * refcount structures; in that case, the in-memory refcount information +     * will probably differ from the on-disk information which makes the BDS +     * unusable */ +    if (ret < 0) { +        goto fail_broken_refcounts; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); +    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); + +    /* "Create" an empty reftable (one cluster) directly after the image +     * header and an empty L1 table three clusters after the image header; +     * the cluster between those two will be used as the first refblock */ +    cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size); +    cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size); +    cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1); +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset), +                           &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); +    if (ret < 0) { +        goto fail_broken_refcounts; +    } + +    s->l1_table_offset = 3 * s->cluster_size; + +    new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t)); +    if (!new_reftable) { +        ret = -ENOMEM; +        goto fail_broken_refcounts; +    } + +    s->refcount_table_offset = s->cluster_size; +    s->refcount_table_size   = s->cluster_size / sizeof(uint64_t); + +    g_free(s->refcount_table); +    s->refcount_table = new_reftable; +    new_reftable = NULL; + +    /* Now the in-memory refcount information again corresponds to the on-disk +     * information (reftable is empty and no refblocks (the refblock cache is +     * empty)); however, this means some clusters (e.g. 
the image header) are +     * referenced, but not refcounted, but the normal qcow2 code assumes that +     * the in-memory information is always correct */ + +    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + +    /* Enter the first refblock into the reftable */ +    rt_entry = cpu_to_be64(2 * s->cluster_size); +    ret = bdrv_pwrite_sync(bs->file, s->cluster_size, +                           &rt_entry, sizeof(rt_entry)); +    if (ret < 0) { +        goto fail_broken_refcounts; +    } +    s->refcount_table[0] = 2 * s->cluster_size; + +    s->free_cluster_index = 0; +    assert(3 + l1_clusters <= s->refcount_block_size); +    offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2); +    if (offset < 0) { +        ret = offset; +        goto fail_broken_refcounts; +    } else if (offset > 0) { +        error_report("First cluster in emptied image is in use"); +        abort(); +    } + +    /* Now finally the in-memory information corresponds to the on-disk +     * structures and is correct */ +    ret = qcow2_mark_clean(bs); +    if (ret < 0) { +        goto fail; +    } + +    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size); +    if (ret < 0) { +        goto fail; +    } + +    return 0; + +fail_broken_refcounts: +    /* The BDS is unusable at this point. If we wanted to make it usable, we +     * would have to call qcow2_refcount_close(), qcow2_refcount_init(), +     * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init() +     * again. However, because the functions which could have caused this error +     * path to be taken are used by those functions as well, it's very likely +     * that that sequence will fail as well. Therefore, just eject the BDS. */ +    bs->drv = NULL; + +fail: +    g_free(new_reftable); +    return ret; +} + +static int qcow2_make_empty(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    uint64_t start_sector; +    int sector_step = INT_MAX / BDRV_SECTOR_SIZE; +    int l1_clusters, ret = 0; + +    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); + +    if (s->qcow_version >= 3 && !s->snapshots && +        3 + l1_clusters <= s->refcount_block_size) { +        /* The following function only works for qcow2 v3 images (it requires +         * the dirty flag) and only as long as there are no snapshots (because +         * it completely empties the image). Furthermore, the L1 table and three +         * additional clusters (image header, refcount table, one refcount +         * block) have to fit inside one refcount block. */ +        return make_completely_empty(bs); +    } + +    /* This fallback code simply discards every active cluster; this is slow, +     * but works in all cases */ +    for (start_sector = 0; start_sector < bs->total_sectors; +         start_sector += sector_step) +    { +        /* As this function is generally used after committing an external +         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the +         * default action for this kind of discard is to pass the discard, +         * which will ideally result in an actually smaller image file, as +         * is probably desired. 
*/ +        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE, +                                     MIN(sector_step, +                                         bs->total_sectors - start_sector), +                                     QCOW2_DISCARD_SNAPSHOT, true); +        if (ret < 0) { +            break; +        } +    } + +    return ret; +} + +static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    int ret; + +    qemu_co_mutex_lock(&s->lock); +    ret = qcow2_cache_flush(bs, s->l2_table_cache); +    if (ret < 0) { +        qemu_co_mutex_unlock(&s->lock); +        return ret; +    } + +    if (qcow2_need_accurate_refcounts(s)) { +        ret = qcow2_cache_flush(bs, s->refcount_block_cache); +        if (ret < 0) { +            qemu_co_mutex_unlock(&s->lock); +            return ret; +        } +    } +    qemu_co_mutex_unlock(&s->lock); + +    return 0; +} + +static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVQcowState *s = bs->opaque; +    bdi->unallocated_blocks_are_zero = true; +    bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3); +    bdi->cluster_size = s->cluster_size; +    bdi->vm_state_offset = qcow2_vm_state_offset(s); +    return 0; +} + +static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); + +    *spec_info = (ImageInfoSpecific){ +        .kind  = IMAGE_INFO_SPECIFIC_KIND_QCOW2, +        { +            .qcow2 = g_new(ImageInfoSpecificQCow2, 1), +        }, +    }; +    if (s->qcow_version == 2) { +        *spec_info->qcow2 = (ImageInfoSpecificQCow2){ +            .compat             = g_strdup("0.10"), +            .refcount_bits      = s->refcount_bits, +        }; +    } else if (s->qcow_version == 3) { +        *spec_info->qcow2 = (ImageInfoSpecificQCow2){ +            .compat             = g_strdup("1.1"), +            .lazy_refcounts     = s->compatible_features & +                                  QCOW2_COMPAT_LAZY_REFCOUNTS, +            .has_lazy_refcounts = true, +            .corrupt            = s->incompatible_features & +                                  QCOW2_INCOMPAT_CORRUPT, +            .has_corrupt        = true, +            .refcount_bits      = s->refcount_bits, +        }; +    } + +    return spec_info; +} + +#if 0 +static void dump_refcounts(BlockDriverState *bs) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t nb_clusters, k, k1, size; +    int refcount; + +    size = bdrv_getlength(bs->file); +    nb_clusters = size_to_clusters(s, size); +    for(k = 0; k < nb_clusters;) { +        k1 = k; +        refcount = get_refcount(bs, k); +        k++; +        while (k < nb_clusters && get_refcount(bs, k) == refcount) +            k++; +        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, +               k - k1); +    } +} +#endif + +static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, +                              int64_t pos) +{ +    BDRVQcowState *s = bs->opaque; +    int64_t total_sectors = bs->total_sectors; +    bool zero_beyond_eof = bs->zero_beyond_eof; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); +    bs->zero_beyond_eof = false; +    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); +    bs->zero_beyond_eof = zero_beyond_eof; + +    /* bdrv_co_do_writev will have increased the total_sectors value to include +     * the VM state - the VM state 
is however not an actual part of the block +     * device, therefore, we need to restore the old value. */ +    bs->total_sectors = total_sectors; + +    return ret; +} + +static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, +                              int64_t pos, int size) +{ +    BDRVQcowState *s = bs->opaque; +    bool zero_beyond_eof = bs->zero_beyond_eof; +    int ret; + +    BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); +    bs->zero_beyond_eof = false; +    ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); +    bs->zero_beyond_eof = zero_beyond_eof; + +    return ret; +} + +/* + * Downgrades an image's version. To achieve this, any incompatible features + * have to be removed. + */ +static int qcow2_downgrade(BlockDriverState *bs, int target_version, +                           BlockDriverAmendStatusCB *status_cb) +{ +    BDRVQcowState *s = bs->opaque; +    int current_version = s->qcow_version; +    int ret; + +    if (target_version == current_version) { +        return 0; +    } else if (target_version > current_version) { +        return -EINVAL; +    } else if (target_version != 2) { +        return -EINVAL; +    } + +    if (s->refcount_order != 4) { +        /* we would have to convert the image to a refcount_order == 4 image +         * here; however, since qemu (at the time of writing this) does not +         * support anything different than 4 anyway, there is no point in doing +         * so right now; however, we should error out (if qemu supports this in +         * the future and this code has not been adapted) */ +        error_report("qcow2_downgrade: Image refcount orders other than 4 are " +                     "currently not supported."); +        return -ENOTSUP; +    } + +    /* clear incompatible features */ +    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { +        ret = qcow2_mark_clean(bs); +        if (ret < 0) { +            return ret; +        } +    } + +    /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in +     * the first place; if that happens nonetheless, returning -ENOTSUP is the +     * best thing to do anyway */ + +    if (s->incompatible_features) { +        return -ENOTSUP; +    } + +    /* since we can ignore compatible features, we can set them to 0 as well */ +    s->compatible_features = 0; +    /* if lazy refcounts have been used, they have already been fixed through +     * clearing the dirty flag */ + +    /* clearing autoclear features is trivial */ +    s->autoclear_features = 0; + +    ret = qcow2_expand_zero_clusters(bs, status_cb); +    if (ret < 0) { +        return ret; +    } + +    s->qcow_version = target_version; +    ret = qcow2_update_header(bs); +    if (ret < 0) { +        s->qcow_version = current_version; +        return ret; +    } +    return 0; +} + +static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, +                               BlockDriverAmendStatusCB *status_cb) +{ +    BDRVQcowState *s = bs->opaque; +    int old_version = s->qcow_version, new_version = old_version; +    uint64_t new_size = 0; +    const char *backing_file = NULL, *backing_format = NULL; +    bool lazy_refcounts = s->use_lazy_refcounts; +    const char *compat = NULL; +    uint64_t cluster_size = s->cluster_size; +    bool encrypt; +    int ret; +    QemuOptDesc *desc = opts->list->desc; + +    while (desc && desc->name) { +        if (!qemu_opt_find(opts, desc->name)) { +            /* only change explicitly defined options */ +            desc++; +            
continue; +        } + +        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) { +            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL); +            if (!compat) { +                /* preserve default */ +            } else if (!strcmp(compat, "0.10")) { +                new_version = 2; +            } else if (!strcmp(compat, "1.1")) { +                new_version = 3; +            } else { +                fprintf(stderr, "Unknown compatibility level %s.\n", compat); +                return -EINVAL; +            } +        } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) { +            fprintf(stderr, "Cannot change preallocation mode.\n"); +            return -ENOTSUP; +        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) { +            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); +        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) { +            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE); +        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) { +            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); +        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) { +            encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT, +                                        !!s->cipher); + +            if (encrypt != !!s->cipher) { +                fprintf(stderr, "Changing the encryption flag is not " +                        "supported.\n"); +                return -ENOTSUP; +            } +        } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) { +            cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, +                                             cluster_size); +            if (cluster_size != s->cluster_size) { +                fprintf(stderr, "Changing the cluster size is not " +                        "supported.\n"); +                return -ENOTSUP; +            } +        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) { +            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS, +                                               lazy_refcounts); +        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) { +            error_report("Cannot change refcount entry width"); +            return -ENOTSUP; +        } else { +            /* if this assertion fails, this probably means a new option was +             * added without having it covered here */ +            assert(false); +        } + +        desc++; +    } + +    if (new_version != old_version) { +        if (new_version > old_version) { +            /* Upgrade */ +            s->qcow_version = new_version; +            ret = qcow2_update_header(bs); +            if (ret < 0) { +                s->qcow_version = old_version; +                return ret; +            } +        } else { +            ret = qcow2_downgrade(bs, new_version, status_cb); +            if (ret < 0) { +                return ret; +            } +        } +    } + +    if (backing_file || backing_format) { +        ret = qcow2_change_backing_file(bs, +                    backing_file ?: s->image_backing_file, +                    backing_format ?: s->image_backing_format); +        if (ret < 0) { +            return ret; +        } +    } + +    if (s->use_lazy_refcounts != lazy_refcounts) { +        if (lazy_refcounts) { +            if (s->qcow_version < 3) { +                fprintf(stderr, "Lazy refcounts only supported with compatibility " +                        "level 1.1 and above (use compat=1.1 or 
greater)\n"); +                return -EINVAL; +            } +            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; +            ret = qcow2_update_header(bs); +            if (ret < 0) { +                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; +                return ret; +            } +            s->use_lazy_refcounts = true; +        } else { +            /* make image clean first */ +            ret = qcow2_mark_clean(bs); +            if (ret < 0) { +                return ret; +            } +            /* now disallow lazy refcounts */ +            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; +            ret = qcow2_update_header(bs); +            if (ret < 0) { +                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; +                return ret; +            } +            s->use_lazy_refcounts = false; +        } +    } + +    if (new_size) { +        ret = bdrv_truncate(bs, new_size); +        if (ret < 0) { +            return ret; +        } +    } + +    return 0; +} + +/* + * If offset or size are negative, respectively, they will not be included in + * the BLOCK_IMAGE_CORRUPTED event emitted. + * fatal will be ignored for read-only BDS; corruptions found there will always + * be considered non-fatal. + */ +void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, +                             int64_t size, const char *message_format, ...) +{ +    BDRVQcowState *s = bs->opaque; +    const char *node_name; +    char *message; +    va_list ap; + +    fatal = fatal && !bs->read_only; + +    if (s->signaled_corruption && +        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT))) +    { +        return; +    } + +    va_start(ap, message_format); +    message = g_strdup_vprintf(message_format, ap); +    va_end(ap); + +    if (fatal) { +        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further " +                "corruption events will be suppressed\n", message); +    } else { +        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal " +                "corruption events will be suppressed\n", message); +    } + +    node_name = bdrv_get_node_name(bs); +    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), +                                          *node_name != '\0', node_name, +                                          message, offset >= 0, offset, +                                          size >= 0, size, +                                          fatal, &error_abort); +    g_free(message); + +    if (fatal) { +        qcow2_mark_corrupt(bs); +        bs->drv = NULL; /* make BDS unusable */ +    } + +    s->signaled_corruption = true; +} + +static QemuOptsList qcow2_create_opts = { +    .name = "qcow2-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_COMPAT_LEVEL, +            .type = QEMU_OPT_STRING, +            .help = "Compatibility level (0.10 or 1.1)" +        }, +        { +            .name = BLOCK_OPT_BACKING_FILE, +            .type = QEMU_OPT_STRING, +            .help = "File name of a base image" +        }, +        { +            .name = BLOCK_OPT_BACKING_FMT, +            .type = QEMU_OPT_STRING, +            .help = "Image format of the base image" +        }, +        { +            .name = BLOCK_OPT_ENCRYPT, 
+            .type = QEMU_OPT_BOOL, +            .help = "Encrypt the image", +            .def_value_str = "off" +        }, +        { +            .name = BLOCK_OPT_CLUSTER_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "qcow2 cluster size", +            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) +        }, +        { +            .name = BLOCK_OPT_PREALLOC, +            .type = QEMU_OPT_STRING, +            .help = "Preallocation mode (allowed values: off, metadata, " +                    "falloc, full)" +        }, +        { +            .name = BLOCK_OPT_LAZY_REFCOUNTS, +            .type = QEMU_OPT_BOOL, +            .help = "Postpone refcount updates", +            .def_value_str = "off" +        }, +        { +            .name = BLOCK_OPT_REFCOUNT_BITS, +            .type = QEMU_OPT_NUMBER, +            .help = "Width of a reference count entry in bits", +            .def_value_str = "16" +        }, +        { /* end of list */ } +    } +}; + +BlockDriver bdrv_qcow2 = { +    .format_name        = "qcow2", +    .instance_size      = sizeof(BDRVQcowState), +    .bdrv_probe         = qcow2_probe, +    .bdrv_open          = qcow2_open, +    .bdrv_close         = qcow2_close, +    .bdrv_reopen_prepare  = qcow2_reopen_prepare, +    .bdrv_create        = qcow2_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, +    .bdrv_co_get_block_status = qcow2_co_get_block_status, +    .bdrv_set_key       = qcow2_set_key, + +    .bdrv_co_readv          = qcow2_co_readv, +    .bdrv_co_writev         = qcow2_co_writev, +    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os, + +    .bdrv_co_write_zeroes   = qcow2_co_write_zeroes, +    .bdrv_co_discard        = qcow2_co_discard, +    .bdrv_truncate          = qcow2_truncate, +    .bdrv_write_compressed  = qcow2_write_compressed, +    .bdrv_make_empty        = qcow2_make_empty, + +    .bdrv_snapshot_create   = qcow2_snapshot_create, +    .bdrv_snapshot_goto     = qcow2_snapshot_goto, +    .bdrv_snapshot_delete   = qcow2_snapshot_delete, +    .bdrv_snapshot_list     = qcow2_snapshot_list, +    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, +    .bdrv_get_info          = qcow2_get_info, +    .bdrv_get_specific_info = qcow2_get_specific_info, + +    .bdrv_save_vmstate    = qcow2_save_vmstate, +    .bdrv_load_vmstate    = qcow2_load_vmstate, + +    .supports_backing           = true, +    .bdrv_change_backing_file   = qcow2_change_backing_file, + +    .bdrv_refresh_limits        = qcow2_refresh_limits, +    .bdrv_invalidate_cache      = qcow2_invalidate_cache, + +    .create_opts         = &qcow2_create_opts, +    .bdrv_check          = qcow2_check, +    .bdrv_amend_options  = qcow2_amend_options, +}; + +static void bdrv_qcow2_init(void) +{ +    bdrv_register(&bdrv_qcow2); +} + +block_init(bdrv_qcow2_init); diff --git a/block/qcow2.h b/block/qcow2.h new file mode 100644 index 00000000..4b5a6afc --- /dev/null +++ b/block/qcow2.h @@ -0,0 +1,592 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_QCOW2_H +#define BLOCK_QCOW2_H + +#include "crypto/cipher.h" +#include "block/coroutine.h" + +//#define DEBUG_ALLOC +//#define DEBUG_ALLOC2 +//#define DEBUG_EXT + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES  1 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 +#define QCOW_MAX_SNAPSHOTS 65536 + +/* 8 MB refcount table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_REFTABLE_SIZE 0x800000 + +/* 32 MB L1 table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_L1_SIZE 0x2000000 + +/* Allow for an average of 1k per snapshot table entry, should be plenty of + * space for snapshot names and IDs */ +#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) + +/* indicate that the refcount of the referenced cluster is exactly one. */ +#define QCOW_OFLAG_COPIED     (1ULL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1ULL << 62) +/* The cluster reads as all zeros */ +#define QCOW_OFLAG_ZERO (1ULL << 0) + +#define MIN_CLUSTER_BITS 9 +#define MAX_CLUSTER_BITS 21 + +/* Must be at least 2 to cover COW */ +#define MIN_L2_CACHE_SIZE 2 /* clusters */ + +/* Must be at least 4 to cover all cases of refcount table growth */ +#define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */ + +/* Whichever is more */ +#define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */ +#define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */ + +/* The refblock cache needs only a fourth of the L2 cache size to cover as many + * clusters */ +#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4 + +#define DEFAULT_CLUSTER_SIZE 65536 + + +#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts" +#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" +#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" +#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" +#define QCOW2_OPT_OVERLAP "overlap-check" +#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template" +#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" +#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" +#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" +#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table" +#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block" +#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table" +#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1" +#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2" +#define QCOW2_OPT_CACHE_SIZE "cache-size" +#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size" +#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size" + +typedef struct QCowHeader { +    uint32_t magic; +    uint32_t version; +    uint64_t backing_file_offset; +    
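+    /* The backing file name (not NUL-terminated) is stored at
+     * backing_file_offset; backing_file_size is its length in bytes.
+     * Both are zero if the image has no backing file. */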
uint32_t backing_file_size; +    uint32_t cluster_bits; +    uint64_t size; /* in bytes */ +    uint32_t crypt_method; +    uint32_t l1_size; /* XXX: save number of clusters instead ? */ +    uint64_t l1_table_offset; +    uint64_t refcount_table_offset; +    uint32_t refcount_table_clusters; +    uint32_t nb_snapshots; +    uint64_t snapshots_offset; + +    /* The following fields are only valid for version >= 3 */ +    uint64_t incompatible_features; +    uint64_t compatible_features; +    uint64_t autoclear_features; + +    uint32_t refcount_order; +    uint32_t header_length; +} QEMU_PACKED QCowHeader; + +typedef struct QEMU_PACKED QCowSnapshotHeader { +    /* header is 8 byte aligned */ +    uint64_t l1_table_offset; + +    uint32_t l1_size; +    uint16_t id_str_size; +    uint16_t name_size; + +    uint32_t date_sec; +    uint32_t date_nsec; + +    uint64_t vm_clock_nsec; + +    uint32_t vm_state_size; +    uint32_t extra_data_size; /* for extension */ +    /* extra data follows */ +    /* id_str follows */ +    /* name follows  */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { +    uint64_t vm_state_size_large; +    uint64_t disk_size; +} QCowSnapshotExtraData; + + +typedef struct QCowSnapshot { +    uint64_t l1_table_offset; +    uint32_t l1_size; +    char *id_str; +    char *name; +    uint64_t disk_size; +    uint64_t vm_state_size; +    uint32_t date_sec; +    uint32_t date_nsec; +    uint64_t vm_clock_nsec; +} QCowSnapshot; + +struct Qcow2Cache; +typedef struct Qcow2Cache Qcow2Cache; + +typedef struct Qcow2UnknownHeaderExtension { +    uint32_t magic; +    uint32_t len; +    QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; +    uint8_t data[]; +} Qcow2UnknownHeaderExtension; + +enum { +    QCOW2_FEAT_TYPE_INCOMPATIBLE    = 0, +    QCOW2_FEAT_TYPE_COMPATIBLE      = 1, +    QCOW2_FEAT_TYPE_AUTOCLEAR       = 2, +}; + +/* Incompatible feature bits */ +enum { +    QCOW2_INCOMPAT_DIRTY_BITNR   = 0, +    QCOW2_INCOMPAT_CORRUPT_BITNR = 1, +    QCOW2_INCOMPAT_DIRTY         = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, +    QCOW2_INCOMPAT_CORRUPT       = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR, + +    QCOW2_INCOMPAT_MASK          = QCOW2_INCOMPAT_DIRTY +                                 | QCOW2_INCOMPAT_CORRUPT, +}; + +/* Compatible feature bits */ +enum { +    QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, +    QCOW2_COMPAT_LAZY_REFCOUNTS       = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + +    QCOW2_COMPAT_FEAT_MASK            = QCOW2_COMPAT_LAZY_REFCOUNTS, +}; + +enum qcow2_discard_type { +    QCOW2_DISCARD_NEVER = 0, +    QCOW2_DISCARD_ALWAYS, +    QCOW2_DISCARD_REQUEST, +    QCOW2_DISCARD_SNAPSHOT, +    QCOW2_DISCARD_OTHER, +    QCOW2_DISCARD_MAX +}; + +typedef struct Qcow2Feature { +    uint8_t type; +    uint8_t bit; +    char    name[46]; +} QEMU_PACKED Qcow2Feature; + +typedef struct Qcow2DiscardRegion { +    BlockDriverState *bs; +    uint64_t offset; +    uint64_t bytes; +    QTAILQ_ENTRY(Qcow2DiscardRegion) next; +} Qcow2DiscardRegion; + +typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array, +                                      uint64_t index); +typedef void Qcow2SetRefcountFunc(void *refcount_array, +                                  uint64_t index, uint64_t value); + +typedef struct BDRVQcowState { +    int cluster_bits; +    int cluster_size; +    int cluster_sectors; +    int l2_bits; +    int l2_size; +    int l1_size; +    int l1_vm_state_index; +    int refcount_block_bits; +    int refcount_block_size; +    int csize_shift; +    int csize_mask; +    
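+    /* csize_shift, csize_mask and cluster_offset_mask (below) split a
+     * compressed cluster descriptor into its compressed size and host
+     * offset parts; cf. the use of cluster_offset_mask in
+     * qcow2_write_compressed(). */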
uint64_t cluster_offset_mask; +    uint64_t l1_table_offset; +    uint64_t *l1_table; + +    Qcow2Cache* l2_table_cache; +    Qcow2Cache* refcount_block_cache; + +    uint8_t *cluster_cache; +    uint8_t *cluster_data; +    uint64_t cluster_cache_offset; +    QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; + +    uint64_t *refcount_table; +    uint64_t refcount_table_offset; +    uint32_t refcount_table_size; +    uint64_t free_cluster_index; +    uint64_t free_byte_offset; + +    CoMutex lock; + +    QCryptoCipher *cipher; /* current cipher, NULL if no key yet */ +    uint32_t crypt_method_header; +    uint64_t snapshots_offset; +    int snapshots_size; +    unsigned int nb_snapshots; +    QCowSnapshot *snapshots; + +    int flags; +    int qcow_version; +    bool use_lazy_refcounts; +    int refcount_order; +    int refcount_bits; +    uint64_t refcount_max; + +    Qcow2GetRefcountFunc *get_refcount; +    Qcow2SetRefcountFunc *set_refcount; + +    bool discard_passthrough[QCOW2_DISCARD_MAX]; + +    int overlap_check; /* bitmask of Qcow2MetadataOverlap values */ +    bool signaled_corruption; + +    uint64_t incompatible_features; +    uint64_t compatible_features; +    uint64_t autoclear_features; + +    size_t unknown_header_fields_size; +    void* unknown_header_fields; +    QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; +    QTAILQ_HEAD (, Qcow2DiscardRegion) discards; +    bool cache_discards; + +    /* Backing file path and format as stored in the image (this is not the +     * effective path/format, which may be the result of a runtime option +     * override) */ +    char *image_backing_file; +    char *image_backing_format; +} BDRVQcowState; + +struct QCowAIOCB; + +typedef struct Qcow2COWRegion { +    /** +     * Offset of the COW region in bytes from the start of the first cluster +     * touched by the request. +     */ +    uint64_t    offset; + +    /** Number of sectors to copy */ +    int         nb_sectors; +} Qcow2COWRegion; + +/** + * Describes an in-flight (part of a) write request that writes to clusters + * that are not referenced in their L2 table yet. + */ +typedef struct QCowL2Meta +{ +    /** Guest offset of the first newly allocated cluster */ +    uint64_t offset; + +    /** Host offset of the first newly allocated cluster */ +    uint64_t alloc_offset; + +    /** +     * Number of sectors from the start of the first allocated cluster to +     * the end of the (possibly shortened) request +     */ +    int nb_available; + +    /** Number of newly allocated clusters */ +    int nb_clusters; + +    /** +     * Requests that overlap with this allocation and wait to be restarted +     * when the allocating request has completed. +     */ +    CoQueue dependent_requests; + +    /** +     * The COW Region between the start of the first allocated cluster and the +     * area the guest actually writes to. +     */ +    Qcow2COWRegion cow_start; + +    /** +     * The COW Region between the area the guest actually writes to and the +     * end of the last allocated cluster. 
+     */ +    Qcow2COWRegion cow_end; + +    /** Pointer to next L2Meta of the same write request */ +    struct QCowL2Meta *next; + +    QLIST_ENTRY(QCowL2Meta) next_in_flight; +} QCowL2Meta; + +enum { +    QCOW2_CLUSTER_UNALLOCATED, +    QCOW2_CLUSTER_NORMAL, +    QCOW2_CLUSTER_COMPRESSED, +    QCOW2_CLUSTER_ZERO +}; + +typedef enum QCow2MetadataOverlap { +    QCOW2_OL_MAIN_HEADER_BITNR    = 0, +    QCOW2_OL_ACTIVE_L1_BITNR      = 1, +    QCOW2_OL_ACTIVE_L2_BITNR      = 2, +    QCOW2_OL_REFCOUNT_TABLE_BITNR = 3, +    QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4, +    QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5, +    QCOW2_OL_INACTIVE_L1_BITNR    = 6, +    QCOW2_OL_INACTIVE_L2_BITNR    = 7, + +    QCOW2_OL_MAX_BITNR            = 8, + +    QCOW2_OL_NONE           = 0, +    QCOW2_OL_MAIN_HEADER    = (1 << QCOW2_OL_MAIN_HEADER_BITNR), +    QCOW2_OL_ACTIVE_L1      = (1 << QCOW2_OL_ACTIVE_L1_BITNR), +    QCOW2_OL_ACTIVE_L2      = (1 << QCOW2_OL_ACTIVE_L2_BITNR), +    QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR), +    QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR), +    QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR), +    QCOW2_OL_INACTIVE_L1    = (1 << QCOW2_OL_INACTIVE_L1_BITNR), +    /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv +     * reads. */ +    QCOW2_OL_INACTIVE_L2    = (1 << QCOW2_OL_INACTIVE_L2_BITNR), +} QCow2MetadataOverlap; + +/* Perform all overlap checks which can be done in constant time */ +#define QCOW2_OL_CONSTANT \ +    (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \ +     QCOW2_OL_SNAPSHOT_TABLE) + +/* Perform all overlap checks which don't require disk access */ +#define QCOW2_OL_CACHED \ +    (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \ +     QCOW2_OL_INACTIVE_L1) + +/* Perform all overlap checks */ +#define QCOW2_OL_ALL \ +    (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) + +#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL + +#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL + +static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) +{ +    return offset & ~(s->cluster_size - 1); +} + +static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) +{ +    return offset & (s->cluster_size - 1); +} + +static inline uint64_t size_to_clusters(BDRVQcowState *s, uint64_t size) +{ +    return (size + (s->cluster_size - 1)) >> s->cluster_bits; +} + +static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size) +{ +    int shift = s->cluster_bits + s->l2_bits; +    return (size + (1ULL << shift) - 1) >> shift; +} + +static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset) +{ +    return (offset >> s->cluster_bits) & (s->l2_size - 1); +} + +static inline int64_t align_offset(int64_t offset, int n) +{ +    offset = (offset + n - 1) & ~(n - 1); +    return offset; +} + +static inline int64_t qcow2_vm_state_offset(BDRVQcowState *s) +{ +    return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static inline uint64_t qcow2_max_refcount_clusters(BDRVQcowState *s) +{ +    return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits; +} + +static inline int qcow2_get_cluster_type(uint64_t l2_entry) +{ +    if (l2_entry & QCOW_OFLAG_COMPRESSED) { +        return QCOW2_CLUSTER_COMPRESSED; +    } else if (l2_entry & QCOW_OFLAG_ZERO) { +        return QCOW2_CLUSTER_ZERO; +    } else if (!(l2_entry & L2E_OFFSET_MASK)) { +        
return QCOW2_CLUSTER_UNALLOCATED; +    } else { +        return QCOW2_CLUSTER_NORMAL; +    } +} + +/* Check whether refcounts are eager or lazy */ +static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s) +{ +    return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); +} + +static inline uint64_t l2meta_cow_start(QCowL2Meta *m) +{ +    return m->offset + m->cow_start.offset; +} + +static inline uint64_t l2meta_cow_end(QCowL2Meta *m) +{ +    return m->offset + m->cow_end.offset +        + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); +} + +static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) +{ +    return r1 > r2 ? r1 - r2 : r2 - r1; +} + +// FIXME Need qcow2_ prefix to global functions + +/* qcow2.c functions */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, +                  int64_t sector_num, int nb_sectors); + +int qcow2_mark_dirty(BlockDriverState *bs); +int qcow2_mark_corrupt(BlockDriverState *bs); +int qcow2_mark_consistent(BlockDriverState *bs); +int qcow2_update_header(BlockDriverState *bs); + +void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, +                             int64_t size, const char *message_format, ...) +                             GCC_FMT_ATTR(5, 6); + +/* qcow2-refcount.c functions */ +int qcow2_refcount_init(BlockDriverState *bs); +void qcow2_refcount_close(BlockDriverState *bs); + +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, +                       uint64_t *refcount); + +int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, +                                  uint64_t addend, bool decrease, +                                  enum qcow2_discard_type type); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, +                                int64_t nb_clusters); +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); +void qcow2_free_clusters(BlockDriverState *bs, +                          int64_t offset, int64_t size, +                          enum qcow2_discard_type type); +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, +                             int nb_clusters, enum qcow2_discard_type type); + +int qcow2_update_snapshot_refcount(BlockDriverState *bs, +    int64_t l1_table_offset, int l1_size, int addend); + +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, +                          BdrvCheckMode fix); + +void qcow2_process_discards(BlockDriverState *bs, int ret); + +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, +                                 int64_t size); +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, +                                  int64_t size); + +/* qcow2-cluster.c functions */ +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, +                        bool exact_size); +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); +void qcow2_l2_cache_reset(BlockDriverState *bs); +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); +int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, +                          uint8_t *out_buf, const uint8_t *in_buf, +                          int nb_sectors, bool enc, Error **errp); + +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, +    int *num, uint64_t *cluster_offset); +int 
qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, +    int *num, uint64_t *host_offset, QCowL2Meta **m); +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, +                                         uint64_t offset, +                                         int compressed_size); + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, +    int nb_sectors, enum qcow2_discard_type type, bool full_discard); +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); + +int qcow2_expand_zero_clusters(BlockDriverState *bs, +                               BlockDriverAmendStatusCB *status_cb); + +/* qcow2-snapshot.c functions */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, +                          const char *snapshot_id, +                          const char *name, +                          Error **errp); +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, +                            const char *snapshot_id, +                            const char *name, +                            Error **errp); + +void qcow2_free_snapshots(BlockDriverState *bs); +int qcow2_read_snapshots(BlockDriverState *bs); + +/* qcow2-cache.c functions */ +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); + +void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, +     void *table); +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, +    Qcow2Cache *dependency); +void qcow2_cache_depends_on_flush(Qcow2Cache *c); + +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c); + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, +    void **table); +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, +    void **table); +void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); + +#endif diff --git a/block/qed-check.c b/block/qed-check.c new file mode 100644 index 00000000..36ecd290 --- /dev/null +++ b/block/qed-check.c @@ -0,0 +1,250 @@ +/* + * QEMU Enhanced Disk Format Consistency Check + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
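The qcow2-cache.c interface declared above follows a strict get/modify/put discipline: a table is pinned with qcow2_cache_get(), modified in place, marked dirty, and released with qcow2_cache_put(); write-back happens lazily unless the caller forces qcow2_cache_flush(). A minimal sketch of that pattern (hypothetical helper, not part of this patch; assumes an opened qcow2 image and a valid L2 table offset):

static int example_zero_l2_entry(BlockDriverState *bs, uint64_t l2_offset,
                                 int l2_index)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table;
    int ret;

    /* Pin the L2 table through the cache */
    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                          (void **) &l2_table);
    if (ret < 0) {
        return ret;
    }

    l2_table[l2_index] = cpu_to_be64(0);    /* entries are big-endian on disk */
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);

    /* Unpin; the dirty table is written back lazily ... */
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /* ... unless the caller needs it on disk right now. */
    return qcow2_cache_flush(bs, s->l2_table_cache);
}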
+ * + */ + +#include "qed.h" + +typedef struct { +    BDRVQEDState *s; +    BdrvCheckResult *result; +    bool fix;                           /* whether to fix invalid offsets */ + +    uint64_t nclusters; +    uint32_t *used_clusters;            /* referenced cluster bitmap */ + +    QEDRequest request; +} QEDCheck; + +static bool qed_test_bit(uint32_t *bitmap, uint64_t n) { +    return !!(bitmap[n / 32] & (1 << (n % 32))); +} + +static void qed_set_bit(uint32_t *bitmap, uint64_t n) { +    bitmap[n / 32] |= 1 << (n % 32); +} + +/** + * Set bitmap bits for clusters + * + * @check:          Check structure + * @offset:         Starting offset in bytes + * @n:              Number of clusters + */ +static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset, +                                  unsigned int n) +{ +    uint64_t cluster = qed_bytes_to_clusters(check->s, offset); +    unsigned int corruptions = 0; + +    while (n-- != 0) { +        /* Clusters should only be referenced once */ +        if (qed_test_bit(check->used_clusters, cluster)) { +            corruptions++; +        } + +        qed_set_bit(check->used_clusters, cluster); +        cluster++; +    } + +    check->result->corruptions += corruptions; +    return corruptions == 0; +} + +/** + * Check an L2 table + * + * @ret:            Number of invalid cluster offsets + */ +static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) +{ +    BDRVQEDState *s = check->s; +    unsigned int i, num_invalid = 0; +    uint64_t last_offset = 0; + +    for (i = 0; i < s->table_nelems; i++) { +        uint64_t offset = table->offsets[i]; + +        if (qed_offset_is_unalloc_cluster(offset) || +            qed_offset_is_zero_cluster(offset)) { +            continue; +        } +        check->result->bfi.allocated_clusters++; +        if (last_offset && (last_offset + s->header.cluster_size != offset)) { +            check->result->bfi.fragmented_clusters++; +        } +        last_offset = offset; + +        /* Detect invalid cluster offset */ +        if (!qed_check_cluster_offset(s, offset)) { +            if (check->fix) { +                table->offsets[i] = 0; +                check->result->corruptions_fixed++; +            } else { +                check->result->corruptions++; +            } + +            num_invalid++; +            continue; +        } + +        qed_set_used_clusters(check, offset, 1); +    } + +    return num_invalid; +} + +/** + * Descend tables and check each cluster is referenced once only + */ +static int qed_check_l1_table(QEDCheck *check, QEDTable *table) +{ +    BDRVQEDState *s = check->s; +    unsigned int i, num_invalid_l1 = 0; +    int ret, last_error = 0; + +    /* Mark L1 table clusters used */ +    qed_set_used_clusters(check, s->header.l1_table_offset, +                          s->header.table_size); + +    for (i = 0; i < s->table_nelems; i++) { +        unsigned int num_invalid_l2; +        uint64_t offset = table->offsets[i]; + +        if (qed_offset_is_unalloc_cluster(offset)) { +            continue; +        } + +        /* Detect invalid L2 offset */ +        if (!qed_check_table_offset(s, offset)) { +            /* Clear invalid offset */ +            if (check->fix) { +                table->offsets[i] = 0; +                check->result->corruptions_fixed++; +            } else { +                check->result->corruptions++; +            } + +            num_invalid_l1++; +            continue; +        } + +        if (!qed_set_used_clusters(check, offset, 
s->header.table_size)) { +            continue; /* skip an invalid table */ +        } + +        ret = qed_read_l2_table_sync(s, &check->request, offset); +        if (ret) { +            check->result->check_errors++; +            last_error = ret; +            continue; +        } + +        num_invalid_l2 = qed_check_l2_table(check, +                                            check->request.l2_table->table); + +        /* Write out fixed L2 table */ +        if (num_invalid_l2 > 0 && check->fix) { +            ret = qed_write_l2_table_sync(s, &check->request, 0, +                                          s->table_nelems, false); +            if (ret) { +                check->result->check_errors++; +                last_error = ret; +                continue; +            } +        } +    } + +    /* Drop reference to final table */ +    qed_unref_l2_cache_entry(check->request.l2_table); +    check->request.l2_table = NULL; + +    /* Write out fixed L1 table */ +    if (num_invalid_l1 > 0 && check->fix) { +        ret = qed_write_l1_table_sync(s, 0, s->table_nelems); +        if (ret) { +            check->result->check_errors++; +            last_error = ret; +        } +    } + +    return last_error; +} + +/** + * Check for unreferenced (leaked) clusters + */ +static void qed_check_for_leaks(QEDCheck *check) +{ +    BDRVQEDState *s = check->s; +    uint64_t i; + +    for (i = s->header.header_size; i < check->nclusters; i++) { +        if (!qed_test_bit(check->used_clusters, i)) { +            check->result->leaks++; +        } +    } +} + +/** + * Mark an image clean once it passes check or has been repaired + */ +static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) +{ +    /* Skip if there were unfixable corruptions or I/O errors */ +    if (result->corruptions > 0 || result->check_errors > 0) { +        return; +    } + +    /* Skip if image is already marked clean */ +    if (!(s->header.features & QED_F_NEED_CHECK)) { +        return; +    } + +    /* Ensure fixes reach storage before clearing check bit */ +    bdrv_flush(s->bs); + +    s->header.features &= ~QED_F_NEED_CHECK; +    qed_write_header_sync(s); +} + +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) +{ +    QEDCheck check = { +        .s = s, +        .result = result, +        .nclusters = qed_bytes_to_clusters(s, s->file_size), +        .request = { .l2_table = NULL }, +        .fix = fix, +    }; +    int ret; + +    check.used_clusters = g_try_new0(uint32_t, (check.nclusters + 31) / 32); +    if (check.nclusters && check.used_clusters == NULL) { +        return -ENOMEM; +    } + +    check.result->bfi.total_clusters = +        (s->header.image_size + s->header.cluster_size - 1) / +            s->header.cluster_size; +    ret = qed_check_l1_table(&check, s->l1_table); +    if (ret == 0) { +        /* Only check for leaks if entire image was scanned successfully */ +        qed_check_for_leaks(&check); + +        if (fix) { +            qed_check_mark_clean(s, result); +        } +    } + +    g_free(check.used_clusters); +    return ret; +} diff --git a/block/qed-cluster.c b/block/qed-cluster.c new file mode 100644 index 00000000..f64b2af8 --- /dev/null +++ b/block/qed-cluster.c @@ -0,0 +1,165 @@ +/* + * QEMU Enhanced Disk Format Cluster functions + * + * Copyright IBM, Corp. 
2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +/** + * Count the number of contiguous data clusters + * + * @s:              QED state + * @table:          L2 table + * @index:          First cluster index + * @n:              Maximum number of clusters + * @offset:         Set to first cluster offset + * + * This function scans tables for contiguous clusters.  A contiguous run of + * clusters may be allocated, unallocated, or zero. + */ +static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, +                                                  QEDTable *table, +                                                  unsigned int index, +                                                  unsigned int n, +                                                  uint64_t *offset) +{ +    unsigned int end = MIN(index + n, s->table_nelems); +    uint64_t last = table->offsets[index]; +    unsigned int i; + +    *offset = last; + +    for (i = index + 1; i < end; i++) { +        if (qed_offset_is_unalloc_cluster(last)) { +            /* Counting unallocated clusters */ +            if (!qed_offset_is_unalloc_cluster(table->offsets[i])) { +                break; +            } +        } else if (qed_offset_is_zero_cluster(last)) { +            /* Counting zero clusters */ +            if (!qed_offset_is_zero_cluster(table->offsets[i])) { +                break; +            } +        } else { +            /* Counting allocated clusters */ +            if (table->offsets[i] != last + s->header.cluster_size) { +                break; +            } +            last = table->offsets[i]; +        } +    } +    return i - index; +} + +typedef struct { +    BDRVQEDState *s; +    uint64_t pos; +    size_t len; + +    QEDRequest *request; + +    /* User callback */ +    QEDFindClusterFunc *cb; +    void *opaque; +} QEDFindClusterCB; + +static void qed_find_cluster_cb(void *opaque, int ret) +{ +    QEDFindClusterCB *find_cluster_cb = opaque; +    BDRVQEDState *s = find_cluster_cb->s; +    QEDRequest *request = find_cluster_cb->request; +    uint64_t offset = 0; +    size_t len = 0; +    unsigned int index; +    unsigned int n; + +    if (ret) { +        goto out; +    } + +    index = qed_l2_index(s, find_cluster_cb->pos); +    n = qed_bytes_to_clusters(s, +                              qed_offset_into_cluster(s, find_cluster_cb->pos) + +                              find_cluster_cb->len); +    n = qed_count_contiguous_clusters(s, request->l2_table->table, +                                      index, n, &offset); + +    if (qed_offset_is_unalloc_cluster(offset)) { +        ret = QED_CLUSTER_L2; +    } else if (qed_offset_is_zero_cluster(offset)) { +        ret = QED_CLUSTER_ZERO; +    } else if (qed_check_cluster_offset(s, offset)) { +        ret = QED_CLUSTER_FOUND; +    } else { +        ret = -EINVAL; +    } + +    len = MIN(find_cluster_cb->len, n * s->header.cluster_size - +              qed_offset_into_cluster(s, find_cluster_cb->pos)); + +out: +    find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); +    g_free(find_cluster_cb); +} + +/** + * Find the offset of a data cluster + * + * @s:          QED state + * @request:    L2 cache entry + * @pos:        Byte position in device + * @len:        Number of bytes + * @cb:         Completion 
function + * @opaque:     User data for completion function + * + * This function translates a position in the block device to an offset in the + * image file.  It invokes the cb completion callback to report back the + * translated offset or unallocated range in the image file. + * + * If the L2 table exists, request->l2_table points to the L2 table cache entry + * and the caller must free the reference when they are finished.  The cache + * entry is exposed in this way to avoid callers having to read the L2 table + * again later during request processing.  If request->l2_table is non-NULL it + * will be unreferenced before taking on the new cache entry. + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, +                      size_t len, QEDFindClusterFunc *cb, void *opaque) +{ +    QEDFindClusterCB *find_cluster_cb; +    uint64_t l2_offset; + +    /* Limit length to L2 boundary.  Requests are broken up at the L2 boundary +     * so that a request acts on one L2 table at a time. +     */ +    len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); + +    l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; +    if (qed_offset_is_unalloc_cluster(l2_offset)) { +        cb(opaque, QED_CLUSTER_L1, 0, len); +        return; +    } +    if (!qed_check_table_offset(s, l2_offset)) { +        cb(opaque, -EINVAL, 0, 0); +        return; +    } + +    find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); +    find_cluster_cb->s = s; +    find_cluster_cb->pos = pos; +    find_cluster_cb->len = len; +    find_cluster_cb->cb = cb; +    find_cluster_cb->opaque = opaque; +    find_cluster_cb->request = request; + +    qed_read_l2_table(s, request, l2_offset, +                      qed_find_cluster_cb, find_cluster_cb); +} diff --git a/block/qed-gencb.c b/block/qed-gencb.c new file mode 100644 index 00000000..b817a8bf --- /dev/null +++ b/block/qed-gencb.c @@ -0,0 +1,32 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque) +{ +    GenericCB *gencb = g_malloc(len); +    gencb->cb = cb; +    gencb->opaque = opaque; +    return gencb; +} + +void gencb_complete(void *opaque, int ret) +{ +    GenericCB *gencb = opaque; +    BlockCompletionFunc *cb = gencb->cb; +    void *user_opaque = gencb->opaque; + +    g_free(gencb); +    cb(user_opaque, ret); +} diff --git a/block/qed-l2-cache.c b/block/qed-l2-cache.c new file mode 100644 index 00000000..e9b2aae4 --- /dev/null +++ b/block/qed-l2-cache.c @@ -0,0 +1,187 @@ +/* + * QEMU Enhanced Disk Format L2 Cache + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +/* + * L2 table cache usage is as follows: + * + * An open image has one L2 table cache that is used to avoid accessing the + * image file for recently referenced L2 tables. + * + * Cluster offset lookup translates the logical offset within the block device + * to a cluster offset within the image file.  This is done by indexing into + * the L1 and L2 tables which store cluster offsets.  
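The gencb_alloc()/gencb_complete() helpers above carry a user's completion callback through a chain of internal callbacks: a per-request struct embeds GenericCB as its first member, is allocated with gencb_alloc(), and is freed exactly once by gencb_complete(), which then invokes the user's callback. A minimal sketch of a hypothetical user (names invented for illustration):

typedef struct {
    GenericCB gencb;            /* must be the first member */
    BDRVQEDState *s;
    uint64_t offset;            /* example per-request state */
} ExampleCB;

static void example_done(void *opaque, int ret)
{
    ExampleCB *example_cb = opaque;

    /* ... release any per-request resources here ... */

    /* Frees example_cb, then calls the original cb(user_opaque, ret) */
    gencb_complete(&example_cb->gencb, ret);
}

static void example_start(BDRVQEDState *s, uint64_t offset,
                          BlockCompletionFunc *cb, void *opaque)
{
    ExampleCB *example_cb = gencb_alloc(sizeof(*example_cb), cb, opaque);

    example_cb->s = s;
    example_cb->offset = offset;

    /* Kick off asynchronous work that eventually ends in example_done() */
    example_done(example_cb, 0);
}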
It is here where the L2 + * table cache serves up recently referenced L2 tables. + * + * If there is a cache miss, that L2 table is read from the image file and + * committed to the cache.  Subsequent accesses to that L2 table will be served + * from the cache until the table is evicted from the cache. + * + * L2 tables are also committed to the cache when new L2 tables are allocated + * in the image file.  Since the L2 table cache is write-through, the new L2 + * table is first written out to the image file and then committed to the + * cache. + * + * Multiple I/O requests may be using an L2 table cache entry at any given + * time.  That means an entry may be in use across several requests and + * reference counting is needed to free the entry at the correct time.  In + * particular, an entry evicted from the cache will only be freed once all + * references are dropped. + * + * An in-flight I/O request will hold a reference to a L2 table cache entry for + * the period during which it needs to access the L2 table.  This includes + * cluster offset lookup, L2 table allocation, and L2 table update when a new + * data cluster has been allocated. + * + * An interesting case occurs when two requests need to access an L2 table that + * is not in the cache.  Since the operation to read the table from the image + * file takes some time to complete, both requests may see a cache miss and + * start reading the L2 table from the image file.  The first to finish will + * commit its L2 table into the cache.  When the second tries to commit its + * table will be deleted in favor of the existing cache entry. + */ + +#include "trace.h" +#include "qed.h" + +/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ +#define MAX_L2_CACHE_SIZE 50 + +/** + * Initialize the L2 cache + */ +void qed_init_l2_cache(L2TableCache *l2_cache) +{ +    QTAILQ_INIT(&l2_cache->entries); +    l2_cache->n_entries = 0; +} + +/** + * Free the L2 cache + */ +void qed_free_l2_cache(L2TableCache *l2_cache) +{ +    CachedL2Table *entry, *next_entry; + +    QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { +        qemu_vfree(entry->table); +        g_free(entry); +    } +} + +/** + * Allocate an uninitialized entry from the cache + * + * The returned entry has a reference count of 1 and is owned by the caller. + * The caller must allocate the actual table field for this entry and it must + * be freeable using qemu_vfree(). + */ +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) +{ +    CachedL2Table *entry; + +    entry = g_malloc0(sizeof(*entry)); +    entry->ref++; + +    trace_qed_alloc_l2_cache_entry(l2_cache, entry); + +    return entry; +} + +/** + * Decrease an entry's reference count and free if necessary when the reference + * count drops to zero. + */ +void qed_unref_l2_cache_entry(CachedL2Table *entry) +{ +    if (!entry) { +        return; +    } + +    entry->ref--; +    trace_qed_unref_l2_cache_entry(entry, entry->ref); +    if (entry->ref == 0) { +        qemu_vfree(entry->table); +        g_free(entry); +    } +} + +/** + * Find an entry in the L2 cache.  This may return NULL and it's up to the + * caller to satisfy the cache miss. + * + * For a cached entry, this function increases the reference count and returns + * the entry. 
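The sizing comment on MAX_L2_CACHE_SIZE above ("each L2 holds 2GB") can be checked against the default QED geometry, 64 KiB clusters with a table_size of 4 clusters per table (defaults defined in qed.h and only assumed here):

/* Illustrative arithmetic, assuming the defaults from qed.h:            */
uint32_t cluster_size = 65536;                        /* 64 KiB          */
uint32_t table_size   = 4;                            /* clusters/table  */

uint64_t table_nelems = (uint64_t)table_size * cluster_size / sizeof(uint64_t);
                                                      /* = 32768 entries */
uint64_t bytes_per_l2 = table_nelems * cluster_size;  /* = 2 GiB         */
uint64_t cache_covers = (uint64_t)MAX_L2_CACHE_SIZE * bytes_per_l2;
                                                      /* = 50 * 2 GiB = 100 GiB */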
+ */ +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) +{ +    CachedL2Table *entry; + +    QTAILQ_FOREACH(entry, &l2_cache->entries, node) { +        if (entry->offset == offset) { +            trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref); +            entry->ref++; +            return entry; +        } +    } +    return NULL; +} + +/** + * Commit an L2 cache entry into the cache.  This is meant to be used as part of + * the process to satisfy a cache miss.  A caller would allocate an entry which + * is not actually in the L2 cache and then once the entry was valid and + * present on disk, the entry can be committed into the cache. + * + * Since the cache is write-through, it's important that this function is not + * called until the entry is present on disk and the L1 has been updated to + * point to the entry. + * + * N.B. This function steals a reference to the l2_table from the caller so the + * caller must obtain a new reference by issuing a call to + * qed_find_l2_cache_entry(). + */ +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) +{ +    CachedL2Table *entry; + +    entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); +    if (entry) { +        qed_unref_l2_cache_entry(entry); +        qed_unref_l2_cache_entry(l2_table); +        return; +    } + +    /* Evict an unused cache entry so we have space.  If all entries are in use +     * we can grow the cache temporarily and we try to shrink back down later. +     */ +    if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { +        CachedL2Table *next; +        QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) { +            if (entry->ref > 1) { +                continue; +            } + +            QTAILQ_REMOVE(&l2_cache->entries, entry, node); +            l2_cache->n_entries--; +            qed_unref_l2_cache_entry(entry); + +            /* Stop evicting when we've shrunk back to max size */ +            if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) { +                break; +            } +        } +    } + +    l2_cache->n_entries++; +    QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); +} diff --git a/block/qed-table.c b/block/qed-table.c new file mode 100644 index 00000000..513aa872 --- /dev/null +++ b/block/qed-table.c @@ -0,0 +1,296 @@ +/* + * QEMU Enhanced Disk Format Table I/O + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
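Tying the cache functions above together, the cache-miss path allocates a fresh entry, populates it from the image file, commits it, and then re-acquires it through the cache because the commit steals the caller's reference; qed_read_l2_table_cb() in qed-table.c below does exactly this. A stripped-down sketch of the sequence (error handling omitted):

CachedL2Table *l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
if (!l2_table) {
    l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);     /* ref == 1 */
    l2_table->table = qed_alloc_table(s);
    /* ... read the table contents at 'offset' from the image file ... */
    l2_table->offset = offset;

    /* The cache takes over our reference ... */
    qed_commit_l2_cache_entry(&s->l2_cache, l2_table);

    /* ... so take a new one for our own use. */
    l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
}

/* ... use l2_table->table->offsets[] ... */

qed_unref_l2_cache_entry(l2_table);     /* drop the reference when done */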
+ * + */ + +#include "trace.h" +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "qed.h" + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    QEDTable *table; + +    struct iovec iov; +    QEMUIOVector qiov; +} QEDReadTableCB; + +static void qed_read_table_cb(void *opaque, int ret) +{ +    QEDReadTableCB *read_table_cb = opaque; +    QEDTable *table = read_table_cb->table; +    int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); +    int i; + +    /* Handle I/O error */ +    if (ret) { +        goto out; +    } + +    /* Byteswap offsets */ +    for (i = 0; i < noffsets; i++) { +        table->offsets[i] = le64_to_cpu(table->offsets[i]); +    } + +out: +    /* Completion */ +    trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); +    gencb_complete(&read_table_cb->gencb, ret); +} + +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, +                           BlockCompletionFunc *cb, void *opaque) +{ +    QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), +                                                cb, opaque); +    QEMUIOVector *qiov = &read_table_cb->qiov; + +    trace_qed_read_table(s, offset, table); + +    read_table_cb->s = s; +    read_table_cb->table = table; +    read_table_cb->iov.iov_base = table->offsets, +    read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, + +    qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); +    bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, +                   qiov->size / BDRV_SECTOR_SIZE, +                   qed_read_table_cb, read_table_cb); +} + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    QEDTable *orig_table; +    QEDTable *table; +    bool flush;             /* flush after write? 
*/ + +    struct iovec iov; +    QEMUIOVector qiov; +} QEDWriteTableCB; + +static void qed_write_table_cb(void *opaque, int ret) +{ +    QEDWriteTableCB *write_table_cb = opaque; + +    trace_qed_write_table_cb(write_table_cb->s, +                             write_table_cb->orig_table, +                             write_table_cb->flush, +                             ret); + +    if (ret) { +        goto out; +    } + +    if (write_table_cb->flush) { +        /* We still need to flush first */ +        write_table_cb->flush = false; +        bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, +                       write_table_cb); +        return; +    } + +out: +    qemu_vfree(write_table_cb->table); +    gencb_complete(&write_table_cb->gencb, ret); +} + +/** + * Write out an updated part or all of a table + * + * @s:          QED state + * @offset:     Offset of table in image file, in bytes + * @table:      Table + * @index:      Index of first element + * @n:          Number of elements + * @flush:      Whether or not to sync to disk + * @cb:         Completion function + * @opaque:     Argument for completion function + */ +static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, +                            unsigned int index, unsigned int n, bool flush, +                            BlockCompletionFunc *cb, void *opaque) +{ +    QEDWriteTableCB *write_table_cb; +    unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; +    unsigned int start, end, i; +    size_t len_bytes; + +    trace_qed_write_table(s, offset, table, index, n); + +    /* Calculate indices of the first and one after last elements */ +    start = index & ~sector_mask; +    end = (index + n + sector_mask) & ~sector_mask; + +    len_bytes = (end - start) * sizeof(uint64_t); + +    write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); +    write_table_cb->s = s; +    write_table_cb->orig_table = table; +    write_table_cb->flush = flush; +    write_table_cb->table = qemu_blockalign(s->bs, len_bytes); +    write_table_cb->iov.iov_base = write_table_cb->table->offsets; +    write_table_cb->iov.iov_len = len_bytes; +    qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); + +    /* Byteswap table */ +    for (i = start; i < end; i++) { +        uint64_t le_offset = cpu_to_le64(table->offsets[i]); +        write_table_cb->table->offsets[i - start] = le_offset; +    } + +    /* Adjust for offset into table */ +    offset += start * sizeof(uint64_t); + +    bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, +                    &write_table_cb->qiov, +                    write_table_cb->qiov.size / BDRV_SECTOR_SIZE, +                    qed_write_table_cb, write_table_cb); +} + +/** + * Propagate return value from async callback + */ +static void qed_sync_cb(void *opaque, int ret) +{ +    *(int *)opaque = ret; +} + +int qed_read_l1_table_sync(BDRVQEDState *s) +{ +    int ret = -EINPROGRESS; + +    qed_read_table(s, s->header.l1_table_offset, +                   s->l1_table, qed_sync_cb, &ret); +    while (ret == -EINPROGRESS) { +        aio_poll(bdrv_get_aio_context(s->bs), true); +    } + +    return ret; +} + +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, +                        BlockCompletionFunc *cb, void *opaque) +{ +    BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); +    qed_write_table(s, s->header.l1_table_offset, +                    s->l1_table, index, n, false, cb, opaque); +} + +int 
qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, +                            unsigned int n) +{ +    int ret = -EINPROGRESS; + +    qed_write_l1_table(s, index, n, qed_sync_cb, &ret); +    while (ret == -EINPROGRESS) { +        aio_poll(bdrv_get_aio_context(s->bs), true); +    } + +    return ret; +} + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    uint64_t l2_offset; +    QEDRequest *request; +} QEDReadL2TableCB; + +static void qed_read_l2_table_cb(void *opaque, int ret) +{ +    QEDReadL2TableCB *read_l2_table_cb = opaque; +    QEDRequest *request = read_l2_table_cb->request; +    BDRVQEDState *s = read_l2_table_cb->s; +    CachedL2Table *l2_table = request->l2_table; +    uint64_t l2_offset = read_l2_table_cb->l2_offset; + +    if (ret) { +        /* can't trust loaded L2 table anymore */ +        qed_unref_l2_cache_entry(l2_table); +        request->l2_table = NULL; +    } else { +        l2_table->offset = l2_offset; + +        qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + +        /* This is guaranteed to succeed because we just committed the entry +         * to the cache. +         */ +        request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); +        assert(request->l2_table != NULL); +    } + +    gencb_complete(&read_l2_table_cb->gencb, ret); +} + +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, +                       BlockCompletionFunc *cb, void *opaque) +{ +    QEDReadL2TableCB *read_l2_table_cb; + +    qed_unref_l2_cache_entry(request->l2_table); + +    /* Check for cached L2 entry */ +    request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); +    if (request->l2_table) { +        cb(opaque, 0); +        return; +    } + +    request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); +    request->l2_table->table = qed_alloc_table(s); + +    read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); +    read_l2_table_cb->s = s; +    read_l2_table_cb->l2_offset = offset; +    read_l2_table_cb->request = request; + +    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); +    qed_read_table(s, offset, request->l2_table->table, +                   qed_read_l2_table_cb, read_l2_table_cb); +} + +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) +{ +    int ret = -EINPROGRESS; + +    qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); +    while (ret == -EINPROGRESS) { +        aio_poll(bdrv_get_aio_context(s->bs), true); +    } + +    return ret; +} + +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, +                        unsigned int index, unsigned int n, bool flush, +                        BlockCompletionFunc *cb, void *opaque) +{ +    BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); +    qed_write_table(s, request->l2_table->offset, +                    request->l2_table->table, index, n, flush, cb, opaque); +} + +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, +                            unsigned int index, unsigned int n, bool flush) +{ +    int ret = -EINPROGRESS; + +    qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); +    while (ret == -EINPROGRESS) { +        aio_poll(bdrv_get_aio_context(s->bs), true); +    } + +    return ret; +} diff --git a/block/qed.c b/block/qed.c new file mode 100644 index 00000000..954ed007 --- /dev/null +++ b/block/qed.c @@ -0,0 +1,1693 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 
2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/timer.h" +#include "trace.h" +#include "qed.h" +#include "qapi/qmp/qerror.h" +#include "migration/migration.h" + +static const AIOCBInfo qed_aiocb_info = { +    .aiocb_size         = sizeof(QEDAIOCB), +}; + +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, +                          const char *filename) +{ +    const QEDHeader *header = (const QEDHeader *)buf; + +    if (buf_size < sizeof(*header)) { +        return 0; +    } +    if (le32_to_cpu(header->magic) != QED_MAGIC) { +        return 0; +    } +    return 100; +} + +/** + * Check whether an image format is raw + * + * @fmt:    Backing file format, may be NULL + */ +static bool qed_fmt_is_raw(const char *fmt) +{ +    return fmt && strcmp(fmt, "raw") == 0; +} + +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) +{ +    cpu->magic = le32_to_cpu(le->magic); +    cpu->cluster_size = le32_to_cpu(le->cluster_size); +    cpu->table_size = le32_to_cpu(le->table_size); +    cpu->header_size = le32_to_cpu(le->header_size); +    cpu->features = le64_to_cpu(le->features); +    cpu->compat_features = le64_to_cpu(le->compat_features); +    cpu->autoclear_features = le64_to_cpu(le->autoclear_features); +    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); +    cpu->image_size = le64_to_cpu(le->image_size); +    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); +    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); +} + +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) +{ +    le->magic = cpu_to_le32(cpu->magic); +    le->cluster_size = cpu_to_le32(cpu->cluster_size); +    le->table_size = cpu_to_le32(cpu->table_size); +    le->header_size = cpu_to_le32(cpu->header_size); +    le->features = cpu_to_le64(cpu->features); +    le->compat_features = cpu_to_le64(cpu->compat_features); +    le->autoclear_features = cpu_to_le64(cpu->autoclear_features); +    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); +    le->image_size = cpu_to_le64(cpu->image_size); +    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); +    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); +} + +int qed_write_header_sync(BDRVQEDState *s) +{ +    QEDHeader le; +    int ret; + +    qed_header_cpu_to_le(&s->header, &le); +    ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); +    if (ret != sizeof(le)) { +        return ret; +    } +    return 0; +} + +typedef struct { +    GenericCB gencb; +    BDRVQEDState *s; +    struct iovec iov; +    QEMUIOVector qiov; +    int nsectors; +    uint8_t *buf; +} QEDWriteHeaderCB; + +static void qed_write_header_cb(void *opaque, int ret) +{ +    QEDWriteHeaderCB *write_header_cb = opaque; + +    qemu_vfree(write_header_cb->buf); +    gencb_complete(write_header_cb, ret); +} + +static void qed_write_header_read_cb(void *opaque, int ret) +{ +    QEDWriteHeaderCB *write_header_cb = opaque; +    BDRVQEDState *s = write_header_cb->s; + +    if (ret) { +        qed_write_header_cb(write_header_cb, ret); +        return; +    } + +    /* Update header */ +    qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); + +    bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, +                    
write_header_cb->nsectors, qed_write_header_cb, +                    write_header_cb); +} + +/** + * Update header in-place (does not rewrite backing filename or other strings) + * + * This function only updates known header fields in-place and does not affect + * extra data after the QED header. + */ +static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, +                             void *opaque) +{ +    /* We must write full sectors for O_DIRECT but cannot necessarily generate +     * the data following the header if an unrecognized compat feature is +     * active.  Therefore, first read the sectors containing the header, update +     * them, and write back. +     */ + +    int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / +                   BDRV_SECTOR_SIZE; +    size_t len = nsectors * BDRV_SECTOR_SIZE; +    QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), +                                                    cb, opaque); + +    write_header_cb->s = s; +    write_header_cb->nsectors = nsectors; +    write_header_cb->buf = qemu_blockalign(s->bs, len); +    write_header_cb->iov.iov_base = write_header_cb->buf; +    write_header_cb->iov.iov_len = len; +    qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); + +    bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, +                   qed_write_header_read_cb, write_header_cb); +} + +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) +{ +    uint64_t table_entries; +    uint64_t l2_size; + +    table_entries = (table_size * cluster_size) / sizeof(uint64_t); +    l2_size = table_entries * cluster_size; + +    return l2_size * table_entries; +} + +static bool qed_is_cluster_size_valid(uint32_t cluster_size) +{ +    if (cluster_size < QED_MIN_CLUSTER_SIZE || +        cluster_size > QED_MAX_CLUSTER_SIZE) { +        return false; +    } +    if (cluster_size & (cluster_size - 1)) { +        return false; /* not power of 2 */ +    } +    return true; +} + +static bool qed_is_table_size_valid(uint32_t table_size) +{ +    if (table_size < QED_MIN_TABLE_SIZE || +        table_size > QED_MAX_TABLE_SIZE) { +        return false; +    } +    if (table_size & (table_size - 1)) { +        return false; /* not power of 2 */ +    } +    return true; +} + +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, +                                    uint32_t table_size) +{ +    if (image_size % BDRV_SECTOR_SIZE != 0) { +        return false; /* not multiple of sector size */ +    } +    if (image_size > qed_max_image_size(cluster_size, table_size)) { +        return false; /* image is too large */ +    } +    return true; +} + +/** + * Read a string of known length from the image file + * + * @file:       Image file + * @offset:     File offset to start of string, in bytes + * @n:          String length in bytes + * @buf:        Destination buffer + * @buflen:     Destination buffer length in bytes + * @ret:        0 on success, -errno on failure + * + * The string is NUL-terminated. 
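To put a number on the limit computed by qed_max_image_size() above: with the defaults passed by bdrv_qed_create() later in this file (64 KiB clusters, table_size of 4; the constants live in qed.h and are assumed here), the maximum image size works out to 64 TiB:

/* table_entries = (4 * 65536) / 8   = 32768                          */
/* l2_size       = 32768 * 65536     = 2 GiB mapped per L2 table      */
/* maximum image = 2 GiB * 32768     = 64 TiB                         */
uint64_t max_size = qed_max_image_size(65536, 4);   /* 0x400000000000 */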
+ */ +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, +                           char *buf, size_t buflen) +{ +    int ret; +    if (n >= buflen) { +        return -EINVAL; +    } +    ret = bdrv_pread(file, offset, buf, n); +    if (ret < 0) { +        return ret; +    } +    buf[n] = '\0'; +    return 0; +} + +/** + * Allocate new clusters + * + * @s:          QED state + * @n:          Number of contiguous clusters to allocate + * @ret:        Offset of first allocated cluster + * + * This function only produces the offset where the new clusters should be + * written.  It updates BDRVQEDState but does not make any changes to the image + * file. + */ +static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) +{ +    uint64_t offset = s->file_size; +    s->file_size += n * s->header.cluster_size; +    return offset; +} + +QEDTable *qed_alloc_table(BDRVQEDState *s) +{ +    /* Honor O_DIRECT memory alignment requirements */ +    return qemu_blockalign(s->bs, +                           s->header.cluster_size * s->header.table_size); +} + +/** + * Allocate a new zeroed L2 table + */ +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) +{ +    CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + +    l2_table->table = qed_alloc_table(s); +    l2_table->offset = qed_alloc_clusters(s, s->header.table_size); + +    memset(l2_table->table->offsets, 0, +           s->header.cluster_size * s->header.table_size); +    return l2_table; +} + +static void qed_aio_next_io(void *opaque, int ret); + +static void qed_plug_allocating_write_reqs(BDRVQEDState *s) +{ +    assert(!s->allocating_write_reqs_plugged); + +    s->allocating_write_reqs_plugged = true; +} + +static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) +{ +    QEDAIOCB *acb; + +    assert(s->allocating_write_reqs_plugged); + +    s->allocating_write_reqs_plugged = false; + +    acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); +    if (acb) { +        qed_aio_next_io(acb, 0); +    } +} + +static void qed_finish_clear_need_check(void *opaque, int ret) +{ +    /* Do nothing */ +} + +static void qed_flush_after_clear_need_check(void *opaque, int ret) +{ +    BDRVQEDState *s = opaque; + +    bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); + +    /* No need to wait until flush completes */ +    qed_unplug_allocating_write_reqs(s); +} + +static void qed_clear_need_check(void *opaque, int ret) +{ +    BDRVQEDState *s = opaque; + +    if (ret) { +        qed_unplug_allocating_write_reqs(s); +        return; +    } + +    s->header.features &= ~QED_F_NEED_CHECK; +    qed_write_header(s, qed_flush_after_clear_need_check, s); +} + +static void qed_need_check_timer_cb(void *opaque) +{ +    BDRVQEDState *s = opaque; + +    /* The timer should only fire when allocating writes have drained */ +    assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); + +    trace_qed_need_check_timer_cb(s); + +    qed_plug_allocating_write_reqs(s); + +    /* Ensure writes are on disk before clearing flag */ +    bdrv_aio_flush(s->bs, qed_clear_need_check, s); +} + +static void qed_start_need_check_timer(BDRVQEDState *s) +{ +    trace_qed_start_need_check_timer(s); + +    /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for +     * migration. 
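As the comment on qed_alloc_clusters() above points out, allocation is pure bookkeeping: it returns the current end of file and advances s->file_size without touching the image. A sketch of what qed_new_l2_table() therefore does to the in-memory state (illustrative values, assuming 64 KiB clusters and table_size of 4):

uint64_t before = s->file_size;                   /* say 0x100000 (1 MiB) */

CachedL2Table *l2_table = qed_new_l2_table(s);    /* no image I/O happens */

assert(l2_table->offset == before);               /* placed at the old EOF */
assert(s->file_size == before + 4 * 65536);       /* grew by 256 KiB       */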
+     */ +    timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + +                   get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); +} + +/* It's okay to call this multiple times or when no timer is started */ +static void qed_cancel_need_check_timer(BDRVQEDState *s) +{ +    trace_qed_cancel_need_check_timer(s); +    timer_del(s->need_check_timer); +} + +static void bdrv_qed_rebind(BlockDriverState *bs) +{ +    BDRVQEDState *s = bs->opaque; +    s->bs = bs; +} + +static void bdrv_qed_detach_aio_context(BlockDriverState *bs) +{ +    BDRVQEDState *s = bs->opaque; + +    qed_cancel_need_check_timer(s); +    timer_free(s->need_check_timer); +} + +static void bdrv_qed_attach_aio_context(BlockDriverState *bs, +                                        AioContext *new_context) +{ +    BDRVQEDState *s = bs->opaque; + +    s->need_check_timer = aio_timer_new(new_context, +                                        QEMU_CLOCK_VIRTUAL, SCALE_NS, +                                        qed_need_check_timer_cb, s); +    if (s->header.features & QED_F_NEED_CHECK) { +        qed_start_need_check_timer(s); +    } +} + +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, +                         Error **errp) +{ +    BDRVQEDState *s = bs->opaque; +    QEDHeader le_header; +    int64_t file_size; +    int ret; + +    s->bs = bs; +    QSIMPLEQ_INIT(&s->allocating_write_reqs); + +    ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); +    if (ret < 0) { +        return ret; +    } +    qed_header_le_to_cpu(&le_header, &s->header); + +    if (s->header.magic != QED_MAGIC) { +        error_setg(errp, "Image not in QED format"); +        return -EINVAL; +    } +    if (s->header.features & ~QED_FEATURE_MASK) { +        /* image uses unsupported feature bits */ +        char buf[64]; +        snprintf(buf, sizeof(buf), "%" PRIx64, +            s->header.features & ~QED_FEATURE_MASK); +        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, +                   bdrv_get_device_or_node_name(bs), "QED", buf); +        return -ENOTSUP; +    } +    if (!qed_is_cluster_size_valid(s->header.cluster_size)) { +        return -EINVAL; +    } + +    /* Round down file size to the last cluster */ +    file_size = bdrv_getlength(bs->file); +    if (file_size < 0) { +        return file_size; +    } +    s->file_size = qed_start_of_cluster(s, file_size); + +    if (!qed_is_table_size_valid(s->header.table_size)) { +        return -EINVAL; +    } +    if (!qed_is_image_size_valid(s->header.image_size, +                                 s->header.cluster_size, +                                 s->header.table_size)) { +        return -EINVAL; +    } +    if (!qed_check_table_offset(s, s->header.l1_table_offset)) { +        return -EINVAL; +    } + +    s->table_nelems = (s->header.cluster_size * s->header.table_size) / +                      sizeof(uint64_t); +    s->l2_shift = ctz32(s->header.cluster_size); +    s->l2_mask = s->table_nelems - 1; +    s->l1_shift = s->l2_shift + ctz32(s->table_nelems); + +    /* Header size calculation must not overflow uint32_t */ +    if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { +        return -EINVAL; +    } + +    if ((s->header.features & QED_F_BACKING_FILE)) { +        if ((uint64_t)s->header.backing_filename_offset + +            s->header.backing_filename_size > +            s->header.cluster_size * s->header.header_size) { +            return -EINVAL; +        } + +        ret = qed_read_string(bs->file, 
s->header.backing_filename_offset, +                              s->header.backing_filename_size, bs->backing_file, +                              sizeof(bs->backing_file)); +        if (ret < 0) { +            return ret; +        } + +        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { +            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); +        } +    } + +    /* Reset unknown autoclear feature bits.  This is a backwards +     * compatibility mechanism that allows images to be opened by older +     * programs, which "knock out" unknown feature bits.  When an image is +     * opened by a newer program again it can detect that the autoclear +     * feature is no longer valid. +     */ +    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && +        !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) { +        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; + +        ret = qed_write_header_sync(s); +        if (ret) { +            return ret; +        } + +        /* From here on only known autoclear feature bits are valid */ +        bdrv_flush(bs->file); +    } + +    s->l1_table = qed_alloc_table(s); +    qed_init_l2_cache(&s->l2_cache); + +    ret = qed_read_l1_table_sync(s); +    if (ret) { +        goto out; +    } + +    /* If image was not closed cleanly, check consistency */ +    if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) { +        /* Read-only images cannot be fixed.  There is no risk of corruption +         * since write operations are not possible.  Therefore, allow +         * potentially inconsistent images to be opened read-only.  This can +         * aid data recovery from an otherwise inconsistent image. +         */ +        if (!bdrv_is_read_only(bs->file) && +            !(flags & BDRV_O_INCOMING)) { +            BdrvCheckResult result = {0}; + +            ret = qed_check(s, &result, true); +            if (ret) { +                goto out; +            } +        } +    } + +    bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs)); + +out: +    if (ret) { +        qed_free_l2_cache(&s->l2_cache); +        qemu_vfree(s->l1_table); +    } +    return ret; +} + +static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp) +{ +    BDRVQEDState *s = bs->opaque; + +    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; +} + +/* We have nothing to do for QED reopen, stubs just return + * success */ +static int bdrv_qed_reopen_prepare(BDRVReopenState *state, +                                   BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +static void bdrv_qed_close(BlockDriverState *bs) +{ +    BDRVQEDState *s = bs->opaque; + +    bdrv_qed_detach_aio_context(bs); + +    /* Ensure writes reach stable storage */ +    bdrv_flush(bs->file); + +    /* Clean shutdown, no check required on next open */ +    if (s->header.features & QED_F_NEED_CHECK) { +        s->header.features &= ~QED_F_NEED_CHECK; +        qed_write_header_sync(s); +    } + +    qed_free_l2_cache(&s->l2_cache); +    qemu_vfree(s->l1_table); +} + +static int qed_create(const char *filename, uint32_t cluster_size, +                      uint64_t image_size, uint32_t table_size, +                      const char *backing_file, const char *backing_fmt, +                      QemuOpts *opts, Error **errp) +{ +    QEDHeader header = { +        .magic = QED_MAGIC, +        .cluster_size = cluster_size, +        .table_size = table_size, +        
.header_size = 1, +        .features = 0, +        .compat_features = 0, +        .l1_table_offset = cluster_size, +        .image_size = image_size, +    }; +    QEDHeader le_header; +    uint8_t *l1_table = NULL; +    size_t l1_size = header.cluster_size * header.table_size; +    Error *local_err = NULL; +    int ret = 0; +    BlockDriverState *bs; + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        return ret; +    } + +    bs = NULL; +    ret = bdrv_open(&bs, filename, NULL, NULL, +                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, NULL, +                    &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        return ret; +    } + +    /* File must start empty and grow, check truncate is supported */ +    ret = bdrv_truncate(bs, 0); +    if (ret < 0) { +        goto out; +    } + +    if (backing_file) { +        header.features |= QED_F_BACKING_FILE; +        header.backing_filename_offset = sizeof(le_header); +        header.backing_filename_size = strlen(backing_file); + +        if (qed_fmt_is_raw(backing_fmt)) { +            header.features |= QED_F_BACKING_FORMAT_NO_PROBE; +        } +    } + +    qed_header_cpu_to_le(&header, &le_header); +    ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); +    if (ret < 0) { +        goto out; +    } +    ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, +                      header.backing_filename_size); +    if (ret < 0) { +        goto out; +    } + +    l1_table = g_malloc0(l1_size); +    ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); +    if (ret < 0) { +        goto out; +    } + +    ret = 0; /* success */ +out: +    g_free(l1_table); +    bdrv_unref(bs); +    return ret; +} + +static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    uint64_t image_size = 0; +    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; +    uint32_t table_size = QED_DEFAULT_TABLE_SIZE; +    char *backing_file = NULL; +    char *backing_fmt = NULL; +    int ret; + +    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); +    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); +    backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); +    cluster_size = qemu_opt_get_size_del(opts, +                                         BLOCK_OPT_CLUSTER_SIZE, +                                         QED_DEFAULT_CLUSTER_SIZE); +    table_size = qemu_opt_get_size_del(opts, BLOCK_OPT_TABLE_SIZE, +                                       QED_DEFAULT_TABLE_SIZE); + +    if (!qed_is_cluster_size_valid(cluster_size)) { +        error_setg(errp, "QED cluster size must be within range [%u, %u] " +                         "and power of 2", +                   QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); +        ret = -EINVAL; +        goto finish; +    } +    if (!qed_is_table_size_valid(table_size)) { +        error_setg(errp, "QED table size must be within range [%u, %u] " +                         "and power of 2", +                   QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); +        ret = -EINVAL; +        goto finish; +    } +    if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { +        error_setg(errp, "QED image size must be a non-zero multiple of " +                         "cluster size and less than %" PRIu64 " bytes", +                   qed_max_image_size(cluster_size, 
table_size)); +        ret = -EINVAL; +        goto finish; +    } + +    ret = qed_create(filename, cluster_size, image_size, table_size, +                     backing_file, backing_fmt, opts, errp); + +finish: +    g_free(backing_file); +    g_free(backing_fmt); +    return ret; +} + +typedef struct { +    BlockDriverState *bs; +    Coroutine *co; +    uint64_t pos; +    int64_t status; +    int *pnum; +} QEDIsAllocatedCB; + +static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) +{ +    QEDIsAllocatedCB *cb = opaque; +    BDRVQEDState *s = cb->bs->opaque; +    *cb->pnum = len / BDRV_SECTOR_SIZE; +    switch (ret) { +    case QED_CLUSTER_FOUND: +        offset |= qed_offset_into_cluster(s, cb->pos); +        cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; +        break; +    case QED_CLUSTER_ZERO: +        cb->status = BDRV_BLOCK_ZERO; +        break; +    case QED_CLUSTER_L2: +    case QED_CLUSTER_L1: +        cb->status = 0; +        break; +    default: +        assert(ret < 0); +        cb->status = ret; +        break; +    } + +    if (cb->co) { +        qemu_coroutine_enter(cb->co, NULL); +    } +} + +static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, +                                                 int64_t sector_num, +                                                 int nb_sectors, int *pnum) +{ +    BDRVQEDState *s = bs->opaque; +    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; +    QEDIsAllocatedCB cb = { +        .bs = bs, +        .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE, +        .status = BDRV_BLOCK_OFFSET_MASK, +        .pnum = pnum, +    }; +    QEDRequest request = { .l2_table = NULL }; + +    qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb); + +    /* Now sleep if the callback wasn't invoked immediately */ +    while (cb.status == BDRV_BLOCK_OFFSET_MASK) { +        cb.co = qemu_coroutine_self(); +        qemu_coroutine_yield(); +    } + +    qed_unref_l2_cache_entry(request.l2_table); + +    return cb.status; +} + +static BDRVQEDState *acb_to_s(QEDAIOCB *acb) +{ +    return acb->common.bs->opaque; +} + +/** + * Read from the backing file or zero-fill if no backing file + * + * @s:              QED state + * @pos:            Byte position in device + * @qiov:           Destination I/O vector + * @backing_qiov:   Possibly shortened copy of qiov, to be allocated here + * @cb:             Completion function + * @opaque:         User data for completion function + * + * This function reads qiov->size bytes starting at pos from the backing file. + * If there is no backing file then zeroes are read. + */ +static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, +                                  QEMUIOVector *qiov, +                                  QEMUIOVector **backing_qiov, +                                  BlockCompletionFunc *cb, void *opaque) +{ +    uint64_t backing_length = 0; +    size_t size; + +    /* If there is a backing file, get its length.  Treat the absence of a +     * backing file like a zero length backing file. 
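A worked example of the boundary handling described above (and implemented just below), using illustrative numbers:

/* Backing file is 1 MiB long; the guest reads 64 KiB starting 16 KiB
 * before that end. */
uint64_t backing_length = 1024 * 1024;                 /* 1 MiB           */
uint64_t pos            = backing_length - 16 * 1024;  /* 16 KiB from EOF */
size_t   request_size   = 64 * 1024;                   /* 64 KiB          */

/* pos + request_size > backing_length, so the destination is zero-filled
 * first, then only the in-range head is read from the backing file: */
size_t from_backing = MIN(backing_length - pos, request_size);  /* 16 KiB */

/* A read that starts at or past backing_length skips the backing file
 * entirely and completes with the buffer already zeroed. */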
+     */
+    if (s->bs->backing_hd) {
+        int64_t l = bdrv_getlength(s->bs->backing_hd);
+        if (l < 0) {
+            cb(opaque, l);
+            return;
+        }
+        backing_length = l;
+    }
+
+    /* Zero all sectors if reading beyond the end of the backing file */
+    if (pos >= backing_length ||
+        pos + qiov->size > backing_length) {
+        qemu_iovec_memset(qiov, 0, 0, qiov->size);
+    }
+
+    /* Complete now if there are no backing file sectors to read */
+    if (pos >= backing_length) {
+        cb(opaque, 0);
+        return;
+    }
+
+    /* If the read straddles the end of the backing file, shorten it */
+    size = MIN((uint64_t)backing_length - pos, qiov->size);
+
+    assert(*backing_qiov == NULL);
+    *backing_qiov = g_new(QEMUIOVector, 1);
+    qemu_iovec_init(*backing_qiov, qiov->niov);
+    qemu_iovec_concat(*backing_qiov, qiov, 0, size);
+
+    BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
+    bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
+                   *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
+}
+
+typedef struct {
+    GenericCB gencb;
+    BDRVQEDState *s;
+    QEMUIOVector qiov;
+    QEMUIOVector *backing_qiov;
+    struct iovec iov;
+    uint64_t offset;
+} CopyFromBackingFileCB;
+
+static void qed_copy_from_backing_file_cb(void *opaque, int ret)
+{
+    CopyFromBackingFileCB *copy_cb = opaque;
+    qemu_vfree(copy_cb->iov.iov_base);
+    gencb_complete(&copy_cb->gencb, ret);
+}
+
+static void qed_copy_from_backing_file_write(void *opaque, int ret)
+{
+    CopyFromBackingFileCB *copy_cb = opaque;
+    BDRVQEDState *s = copy_cb->s;
+
+    if (copy_cb->backing_qiov) {
+        qemu_iovec_destroy(copy_cb->backing_qiov);
+        g_free(copy_cb->backing_qiov);
+        copy_cb->backing_qiov = NULL;
+    }
+
+    if (ret) {
+        qed_copy_from_backing_file_cb(copy_cb, ret);
+        return;
+    }
+
+    BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
+    bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
+                    &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
+                    qed_copy_from_backing_file_cb, copy_cb);
+}
+
+/**
+ * Copy data from backing file into the image
+ *
+ * @s:          QED state
+ * @pos:        Byte position in device
+ * @len:        Number of bytes
+ * @offset:     Byte offset in image file
+ * @cb:         Completion function
+ * @opaque:     User data for completion function
+ */
+static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
+                                       uint64_t len, uint64_t offset,
+                                       BlockCompletionFunc *cb,
+                                       void *opaque)
+{
+    CopyFromBackingFileCB *copy_cb;
+
+    /* Skip copy entirely if there is no work to do */
+    if (len == 0) {
+        cb(opaque, 0);
+        return;
+    }
+
+    copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
+    copy_cb->s = s;
+    copy_cb->offset = offset;
+    copy_cb->backing_qiov = NULL;
+    copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
+    copy_cb->iov.iov_len = len;
+    qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);
+
+    qed_read_backing_file(s, pos, &copy_cb->qiov, &copy_cb->backing_qiov,
+                          qed_copy_from_backing_file_write, copy_cb);
+}
+
+/**
+ * Link one or more contiguous clusters into a table
+ *
+ * @s:              QED state
+ * @table:          L2 table
+ * @index:          First cluster index
+ * @n:              Number of contiguous clusters
+ * @cluster:        First cluster offset + * + * The cluster offset may be an allocated byte offset in the image file, the + * zero cluster marker, or the unallocated cluster marker. + */ +static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, +                                unsigned int n, uint64_t cluster) +{ +    int i; +    for (i = index; i < index + n; i++) { +        table->offsets[i] = cluster; +        if (!qed_offset_is_unalloc_cluster(cluster) && +            !qed_offset_is_zero_cluster(cluster)) { +            cluster += s->header.cluster_size; +        } +    } +} + +static void qed_aio_complete_bh(void *opaque) +{ +    QEDAIOCB *acb = opaque; +    BlockCompletionFunc *cb = acb->common.cb; +    void *user_opaque = acb->common.opaque; +    int ret = acb->bh_ret; + +    qemu_bh_delete(acb->bh); +    qemu_aio_unref(acb); + +    /* Invoke callback */ +    cb(user_opaque, ret); +} + +static void qed_aio_complete(QEDAIOCB *acb, int ret) +{ +    BDRVQEDState *s = acb_to_s(acb); + +    trace_qed_aio_complete(s, acb, ret); + +    /* Free resources */ +    qemu_iovec_destroy(&acb->cur_qiov); +    qed_unref_l2_cache_entry(acb->request.l2_table); + +    /* Free the buffer we may have allocated for zero writes */ +    if (acb->flags & QED_AIOCB_ZERO) { +        qemu_vfree(acb->qiov->iov[0].iov_base); +        acb->qiov->iov[0].iov_base = NULL; +    } + +    /* Arrange for a bh to invoke the completion function */ +    acb->bh_ret = ret; +    acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), +                         qed_aio_complete_bh, acb); +    qemu_bh_schedule(acb->bh); + +    /* Start next allocating write request waiting behind this one.  Note that +     * requests enqueue themselves when they first hit an unallocated cluster +     * but they wait until the entire request is finished before waking up the +     * next request in the queue.  This ensures that we don't cycle through +     * requests multiple times but rather finish one at a time completely. +     */ +    if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { +        QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); +        acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); +        if (acb) { +            qed_aio_next_io(acb, 0); +        } else if (s->header.features & QED_F_NEED_CHECK) { +            qed_start_need_check_timer(s); +        } +    } +} + +/** + * Commit the current L2 table to the cache + */ +static void qed_commit_l2_update(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    CachedL2Table *l2_table = acb->request.l2_table; +    uint64_t l2_offset = l2_table->offset; + +    qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + +    /* This is guaranteed to succeed because we just committed the entry to the +     * cache. 
+     */ +    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); +    assert(acb->request.l2_table != NULL); + +    qed_aio_next_io(opaque, ret); +} + +/** + * Update L1 table with new L2 table offset and write it out + */ +static void qed_aio_write_l1_update(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    int index; + +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    index = qed_l1_index(s, acb->cur_pos); +    s->l1_table->offsets[index] = acb->request.l2_table->offset; + +    qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); +} + +/** + * Update L2 table with new cluster offsets and write them out + */ +static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) +{ +    BDRVQEDState *s = acb_to_s(acb); +    bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; +    int index; + +    if (ret) { +        goto err; +    } + +    if (need_alloc) { +        qed_unref_l2_cache_entry(acb->request.l2_table); +        acb->request.l2_table = qed_new_l2_table(s); +    } + +    index = qed_l2_index(s, acb->cur_pos); +    qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, +                         offset); + +    if (need_alloc) { +        /* Write out the whole new L2 table */ +        qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, +                            qed_aio_write_l1_update, acb); +    } else { +        /* Write out only the updated part of the L2 table */ +        qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, +                            qed_aio_next_io, acb); +    } +    return; + +err: +    qed_aio_complete(acb, ret); +} + +static void qed_aio_write_l2_update_cb(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    qed_aio_write_l2_update(acb, ret, acb->cur_cluster); +} + +/** + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use.  A crash during an + * allocating write could result in empty clusters in the image.  If the write + * only touched a subregion of the cluster, then backing image sectors have + * been lost in the untouched region.  The solution is to flush after writing a + * new data cluster and before updating the L2 table. 
+ */ +static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); + +    if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) { +        qed_aio_complete(acb, -EIO); +    } +} + +/** + * Write data to the image file + */ +static void qed_aio_write_main(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    uint64_t offset = acb->cur_cluster + +                      qed_offset_into_cluster(s, acb->cur_pos); +    BlockCompletionFunc *next_fn; + +    trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); + +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { +        next_fn = qed_aio_next_io; +    } else { +        if (s->bs->backing_hd) { +            next_fn = qed_aio_write_flush_before_l2_update; +        } else { +            next_fn = qed_aio_write_l2_update_cb; +        } +    } + +    BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); +    bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, +                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, +                    next_fn, acb); +} + +/** + * Populate back untouched region of new data cluster + */ +static void qed_aio_write_postfill(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    uint64_t start = acb->cur_pos + acb->cur_qiov.size; +    uint64_t len = +        qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; +    uint64_t offset = acb->cur_cluster + +                      qed_offset_into_cluster(s, acb->cur_pos) + +                      acb->cur_qiov.size; + +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    trace_qed_aio_write_postfill(s, acb, start, len, offset); +    qed_copy_from_backing_file(s, start, len, offset, +                                qed_aio_write_main, acb); +} + +/** + * Populate front untouched region of new data cluster + */ +static void qed_aio_write_prefill(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    uint64_t start = qed_start_of_cluster(s, acb->cur_pos); +    uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); + +    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); +    qed_copy_from_backing_file(s, start, len, acb->cur_cluster, +                                qed_aio_write_postfill, acb); +} + +/** + * Check if the QED_F_NEED_CHECK bit should be set during allocating write + */ +static bool qed_should_set_need_check(BDRVQEDState *s) +{ +    /* The flush before L2 update path ensures consistency */ +    if (s->bs->backing_hd) { +        return false; +    } + +    return !(s->header.features & QED_F_NEED_CHECK); +} + +static void qed_aio_write_zero_cluster(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; + +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    qed_aio_write_l2_update(acb, 0, 1); +} + +/** + * Write new data cluster + * + * @acb:        Write request + * @len:        Length in bytes + * + * This path is taken when writing to previously unallocated clusters. 
+ */ +static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +{ +    BDRVQEDState *s = acb_to_s(acb); +    BlockCompletionFunc *cb; + +    /* Cancel timer when the first allocating request comes in */ +    if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { +        qed_cancel_need_check_timer(s); +    } + +    /* Freeze this request if another allocating write is in progress */ +    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { +        QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); +    } +    if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || +        s->allocating_write_reqs_plugged) { +        return; /* wait for existing request to finish */ +    } + +    acb->cur_nclusters = qed_bytes_to_clusters(s, +            qed_offset_into_cluster(s, acb->cur_pos) + len); +    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + +    if (acb->flags & QED_AIOCB_ZERO) { +        /* Skip ahead if the clusters are already zero */ +        if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { +            qed_aio_next_io(acb, 0); +            return; +        } + +        cb = qed_aio_write_zero_cluster; +    } else { +        cb = qed_aio_write_prefill; +        acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); +    } + +    if (qed_should_set_need_check(s)) { +        s->header.features |= QED_F_NEED_CHECK; +        qed_write_header(s, cb, acb); +    } else { +        cb(acb, 0); +    } +} + +/** + * Write data cluster in place + * + * @acb:        Write request + * @offset:     Cluster offset in bytes + * @len:        Length in bytes + * + * This path is taken when writing to already allocated clusters. + */ +static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +{ +    /* Allocate buffer for zero writes */ +    if (acb->flags & QED_AIOCB_ZERO) { +        struct iovec *iov = acb->qiov->iov; + +        if (!iov->iov_base) { +            iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len); +            if (iov->iov_base == NULL) { +                qed_aio_complete(acb, -ENOMEM); +                return; +            } +            memset(iov->iov_base, 0, iov->iov_len); +        } +    } + +    /* Calculate the I/O vector */ +    acb->cur_cluster = offset; +    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + +    /* Do the actual write */ +    qed_aio_write_main(acb, 0); +} + +/** + * Write data cluster + * + * @opaque:     Write request + * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + *              or -errno + * @offset:     Cluster offset in bytes + * @len:        Length in bytes + * + * Callback from qed_find_cluster(). 
+ */ +static void qed_aio_write_data(void *opaque, int ret, +                               uint64_t offset, size_t len) +{ +    QEDAIOCB *acb = opaque; + +    trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); + +    acb->find_cluster_ret = ret; + +    switch (ret) { +    case QED_CLUSTER_FOUND: +        qed_aio_write_inplace(acb, offset, len); +        break; + +    case QED_CLUSTER_L2: +    case QED_CLUSTER_L1: +    case QED_CLUSTER_ZERO: +        qed_aio_write_alloc(acb, len); +        break; + +    default: +        qed_aio_complete(acb, ret); +        break; +    } +} + +/** + * Read data cluster + * + * @opaque:     Read request + * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + *              or -errno + * @offset:     Cluster offset in bytes + * @len:        Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_read_data(void *opaque, int ret, +                              uint64_t offset, size_t len) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    BlockDriverState *bs = acb->common.bs; + +    /* Adjust offset into cluster */ +    offset += qed_offset_into_cluster(s, acb->cur_pos); + +    trace_qed_aio_read_data(s, acb, ret, offset, len); + +    if (ret < 0) { +        goto err; +    } + +    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + +    /* Handle zero cluster and backing file reads */ +    if (ret == QED_CLUSTER_ZERO) { +        qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); +        qed_aio_next_io(acb, 0); +        return; +    } else if (ret != QED_CLUSTER_FOUND) { +        qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, +                              &acb->backing_qiov, qed_aio_next_io, acb); +        return; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); +    bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, +                   &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, +                   qed_aio_next_io, acb); +    return; + +err: +    qed_aio_complete(acb, ret); +} + +/** + * Begin next I/O or complete the request + */ +static void qed_aio_next_io(void *opaque, int ret) +{ +    QEDAIOCB *acb = opaque; +    BDRVQEDState *s = acb_to_s(acb); +    QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? 
+                                qed_aio_write_data : qed_aio_read_data; + +    trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + +    if (acb->backing_qiov) { +        qemu_iovec_destroy(acb->backing_qiov); +        g_free(acb->backing_qiov); +        acb->backing_qiov = NULL; +    } + +    /* Handle I/O error */ +    if (ret) { +        qed_aio_complete(acb, ret); +        return; +    } + +    acb->qiov_offset += acb->cur_qiov.size; +    acb->cur_pos += acb->cur_qiov.size; +    qemu_iovec_reset(&acb->cur_qiov); + +    /* Complete request */ +    if (acb->cur_pos >= acb->end_pos) { +        qed_aio_complete(acb, 0); +        return; +    } + +    /* Find next cluster and start I/O */ +    qed_find_cluster(s, &acb->request, +                      acb->cur_pos, acb->end_pos - acb->cur_pos, +                      io_fn, acb); +} + +static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, +                                 int64_t sector_num, +                                 QEMUIOVector *qiov, int nb_sectors, +                                 BlockCompletionFunc *cb, +                                 void *opaque, int flags) +{ +    QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque); + +    trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, +                        opaque, flags); + +    acb->flags = flags; +    acb->qiov = qiov; +    acb->qiov_offset = 0; +    acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; +    acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; +    acb->backing_qiov = NULL; +    acb->request.l2_table = NULL; +    qemu_iovec_init(&acb->cur_qiov, qiov->niov); + +    /* Start request */ +    qed_aio_next_io(acb, 0); +    return &acb->common; +} + +static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, +                                      int64_t sector_num, +                                      QEMUIOVector *qiov, int nb_sectors, +                                      BlockCompletionFunc *cb, +                                      void *opaque) +{ +    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, +                                       int64_t sector_num, +                                       QEMUIOVector *qiov, int nb_sectors, +                                       BlockCompletionFunc *cb, +                                       void *opaque) +{ +    return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, +                         opaque, QED_AIOCB_WRITE); +} + +typedef struct { +    Coroutine *co; +    int ret; +    bool done; +} QEDWriteZeroesCB; + +static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +{ +    QEDWriteZeroesCB *cb = opaque; + +    cb->done = true; +    cb->ret = ret; +    if (cb->co) { +        qemu_coroutine_enter(cb->co, NULL); +    } +} + +static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, +                                                 int64_t sector_num, +                                                 int nb_sectors, +                                                 BdrvRequestFlags flags) +{ +    BlockAIOCB *blockacb; +    BDRVQEDState *s = bs->opaque; +    QEDWriteZeroesCB cb = { .done = false }; +    QEMUIOVector qiov; +    struct iovec iov; + +    /* Refuse if there are untouched backing file sectors */ +    if (bs->backing_hd) { +        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { +            return -ENOTSUP; +    
    } +        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { +            return -ENOTSUP; +        } +    } + +    /* Zero writes start without an I/O buffer.  If a buffer becomes necessary +     * then it will be allocated during request processing. +     */ +    iov.iov_base = NULL, +    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE, + +    qemu_iovec_init_external(&qiov, &iov, 1); +    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, +                             qed_co_write_zeroes_cb, &cb, +                             QED_AIOCB_WRITE | QED_AIOCB_ZERO); +    if (!blockacb) { +        return -EIO; +    } +    if (!cb.done) { +        cb.co = qemu_coroutine_self(); +        qemu_coroutine_yield(); +    } +    assert(cb.done); +    return cb.ret; +} + +static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) +{ +    BDRVQEDState *s = bs->opaque; +    uint64_t old_image_size; +    int ret; + +    if (!qed_is_image_size_valid(offset, s->header.cluster_size, +                                 s->header.table_size)) { +        return -EINVAL; +    } + +    /* Shrinking is currently not supported */ +    if ((uint64_t)offset < s->header.image_size) { +        return -ENOTSUP; +    } + +    old_image_size = s->header.image_size; +    s->header.image_size = offset; +    ret = qed_write_header_sync(s); +    if (ret < 0) { +        s->header.image_size = old_image_size; +    } +    return ret; +} + +static int64_t bdrv_qed_getlength(BlockDriverState *bs) +{ +    BDRVQEDState *s = bs->opaque; +    return s->header.image_size; +} + +static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVQEDState *s = bs->opaque; + +    memset(bdi, 0, sizeof(*bdi)); +    bdi->cluster_size = s->header.cluster_size; +    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; +    bdi->unallocated_blocks_are_zero = true; +    bdi->can_write_zeroes_with_unmap = true; +    return 0; +} + +static int bdrv_qed_change_backing_file(BlockDriverState *bs, +                                        const char *backing_file, +                                        const char *backing_fmt) +{ +    BDRVQEDState *s = bs->opaque; +    QEDHeader new_header, le_header; +    void *buffer; +    size_t buffer_len, backing_file_len; +    int ret; + +    /* Refuse to set backing filename if unknown compat feature bits are +     * active.  If the image uses an unknown compat feature then we may not +     * know the layout of data following the header structure and cannot safely +     * add a new string. 
+     */ +    if (backing_file && (s->header.compat_features & +                         ~QED_COMPAT_FEATURE_MASK)) { +        return -ENOTSUP; +    } + +    memcpy(&new_header, &s->header, sizeof(new_header)); + +    new_header.features &= ~(QED_F_BACKING_FILE | +                             QED_F_BACKING_FORMAT_NO_PROBE); + +    /* Adjust feature flags */ +    if (backing_file) { +        new_header.features |= QED_F_BACKING_FILE; + +        if (qed_fmt_is_raw(backing_fmt)) { +            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE; +        } +    } + +    /* Calculate new header size */ +    backing_file_len = 0; + +    if (backing_file) { +        backing_file_len = strlen(backing_file); +    } + +    buffer_len = sizeof(new_header); +    new_header.backing_filename_offset = buffer_len; +    new_header.backing_filename_size = backing_file_len; +    buffer_len += backing_file_len; + +    /* Make sure we can rewrite header without failing */ +    if (buffer_len > new_header.header_size * new_header.cluster_size) { +        return -ENOSPC; +    } + +    /* Prepare new header */ +    buffer = g_malloc(buffer_len); + +    qed_header_cpu_to_le(&new_header, &le_header); +    memcpy(buffer, &le_header, sizeof(le_header)); +    buffer_len = sizeof(le_header); + +    if (backing_file) { +        memcpy(buffer + buffer_len, backing_file, backing_file_len); +        buffer_len += backing_file_len; +    } + +    /* Write new header */ +    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); +    g_free(buffer); +    if (ret == 0) { +        memcpy(&s->header, &new_header, sizeof(new_header)); +    } +    return ret; +} + +static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) +{ +    BDRVQEDState *s = bs->opaque; +    Error *local_err = NULL; +    int ret; + +    bdrv_qed_close(bs); + +    bdrv_invalidate_cache(bs->file, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        return; +    } + +    memset(s, 0, sizeof(BDRVQEDState)); +    ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); +    if (local_err) { +        error_setg(errp, "Could not reopen qed layer: %s", +                   error_get_pretty(local_err)); +        error_free(local_err); +        return; +    } else if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not reopen qed layer"); +        return; +    } +} + +static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, +                          BdrvCheckMode fix) +{ +    BDRVQEDState *s = bs->opaque; + +    return qed_check(s, result, !!fix); +} + +static QemuOptsList qed_create_opts = { +    .name = "qed-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_BACKING_FILE, +            .type = QEMU_OPT_STRING, +            .help = "File name of a base image" +        }, +        { +            .name = BLOCK_OPT_BACKING_FMT, +            .type = QEMU_OPT_STRING, +            .help = "Image format of the base image" +        }, +        { +            .name = BLOCK_OPT_CLUSTER_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Cluster size (in bytes)", +            .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE) +        }, +        { +            .name = BLOCK_OPT_TABLE_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "L1/L2 table size (in 
clusters)" +        }, +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_qed = { +    .format_name              = "qed", +    .instance_size            = sizeof(BDRVQEDState), +    .create_opts              = &qed_create_opts, +    .supports_backing         = true, + +    .bdrv_probe               = bdrv_qed_probe, +    .bdrv_rebind              = bdrv_qed_rebind, +    .bdrv_open                = bdrv_qed_open, +    .bdrv_close               = bdrv_qed_close, +    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare, +    .bdrv_create              = bdrv_qed_create, +    .bdrv_has_zero_init       = bdrv_has_zero_init_1, +    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, +    .bdrv_aio_readv           = bdrv_qed_aio_readv, +    .bdrv_aio_writev          = bdrv_qed_aio_writev, +    .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes, +    .bdrv_truncate            = bdrv_qed_truncate, +    .bdrv_getlength           = bdrv_qed_getlength, +    .bdrv_get_info            = bdrv_qed_get_info, +    .bdrv_refresh_limits      = bdrv_qed_refresh_limits, +    .bdrv_change_backing_file = bdrv_qed_change_backing_file, +    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache, +    .bdrv_check               = bdrv_qed_check, +    .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context, +    .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context, +}; + +static void bdrv_qed_init(void) +{ +    bdrv_register(&bdrv_qed); +} + +block_init(bdrv_qed_init); diff --git a/block/qed.h b/block/qed.h new file mode 100644 index 00000000..615e676f --- /dev/null +++ b/block/qed.h @@ -0,0 +1,343 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_QED_H +#define BLOCK_QED_H + +#include "block/block_int.h" + +/* The layout of a QED file is as follows: + * + * +--------+----------+----------+----------+-----+ + * | header | L1 table | cluster0 | cluster1 | ... | + * +--------+----------+----------+----------+-----+ + * + * There is a 2-level pagetable for cluster allocation: + * + *                     +----------+ + *                     | L1 table | + *                     +----------+ + *                ,------'  |  '------. + *           +----------+   |    +----------+ + *           | L2 table |  ...   | L2 table | + *           +----------+        +----------+ + *       ,------'  |  '------. + *  +----------+   |    +----------+ + *  |   Data   |  ...   |   Data   | + *  +----------+        +----------+ + * + * The L1 table is fixed size and always present.  L2 tables are allocated on + * demand.  The L1 table size determines the maximum possible image size; it + * can be influenced using the cluster_size and table_size values. + * + * All fields are little-endian on disk. 
+ */ +#define  QED_DEFAULT_CLUSTER_SIZE  65536 +enum { +    QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, + +    /* The image supports a backing file */ +    QED_F_BACKING_FILE = 0x01, + +    /* The image needs a consistency check before use */ +    QED_F_NEED_CHECK = 0x02, + +    /* The backing file format must not be probed, treat as raw image */ +    QED_F_BACKING_FORMAT_NO_PROBE = 0x04, + +    /* Feature bits must be used when the on-disk format changes */ +    QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ +                       QED_F_NEED_CHECK | +                       QED_F_BACKING_FORMAT_NO_PROBE, +    QED_COMPAT_FEATURE_MASK = 0,            /* supported compat feature bits */ +    QED_AUTOCLEAR_FEATURE_MASK = 0,         /* supported autoclear feature bits */ + +    /* Data is stored in groups of sectors called clusters.  Cluster size must +     * be large to avoid keeping too much metadata.  I/O requests that have +     * sub-cluster size will require read-modify-write. +     */ +    QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ +    QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, + +    /* Allocated clusters are tracked using a 2-level pagetable.  Table size is +     * a multiple of clusters so large maximum image sizes can be supported +     * without jacking up the cluster size too much. +     */ +    QED_MIN_TABLE_SIZE = 1,        /* in clusters */ +    QED_MAX_TABLE_SIZE = 16, +    QED_DEFAULT_TABLE_SIZE = 4, + +    /* Delay to flush and clean image after last allocating write completes */ +    QED_NEED_CHECK_TIMEOUT = 5,    /* in seconds */ +}; + +typedef struct { +    uint32_t magic;                 /* QED\0 */ + +    uint32_t cluster_size;          /* in bytes */ +    uint32_t table_size;            /* for L1 and L2 tables, in clusters */ +    uint32_t header_size;           /* in clusters */ + +    uint64_t features;              /* format feature bits */ +    uint64_t compat_features;       /* compatible feature bits */ +    uint64_t autoclear_features;    /* self-resetting feature bits */ + +    uint64_t l1_table_offset;       /* in bytes */ +    uint64_t image_size;            /* total logical image size, in bytes */ + +    /* if (features & QED_F_BACKING_FILE) */ +    uint32_t backing_filename_offset; /* in bytes from start of header */ +    uint32_t backing_filename_size;   /* in bytes */ +} QEMU_PACKED QEDHeader; + +typedef struct { +    uint64_t offsets[0];            /* in bytes */ +} QEDTable; + +/* The L2 cache is a simple write-through cache for L2 structures */ +typedef struct CachedL2Table { +    QEDTable *table; +    uint64_t offset;    /* offset=0 indicates an invalidate entry */ +    QTAILQ_ENTRY(CachedL2Table) node; +    int ref; +} CachedL2Table; + +typedef struct { +    QTAILQ_HEAD(, CachedL2Table) entries; +    unsigned int n_entries; +} L2TableCache; + +typedef struct QEDRequest { +    CachedL2Table *l2_table; +} QEDRequest; + +enum { +    QED_AIOCB_WRITE = 0x0001,       /* read or write? 
*/ +    QED_AIOCB_ZERO  = 0x0002,       /* zero write, used with QED_AIOCB_WRITE */ +}; + +typedef struct QEDAIOCB { +    BlockAIOCB common; +    QEMUBH *bh; +    int bh_ret;                     /* final return status for completion bh */ +    QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */ +    int flags;                      /* QED_AIOCB_* bits ORed together */ +    uint64_t end_pos;               /* request end on block device, in bytes */ + +    /* User scatter-gather list */ +    QEMUIOVector *qiov; +    size_t qiov_offset;             /* byte count already processed */ + +    /* Current cluster scatter-gather list */ +    QEMUIOVector cur_qiov; +    QEMUIOVector *backing_qiov; +    uint64_t cur_pos;               /* position on block device, in bytes */ +    uint64_t cur_cluster;           /* cluster offset in image file */ +    unsigned int cur_nclusters;     /* number of clusters being accessed */ +    int find_cluster_ret;           /* used for L1/L2 update */ + +    QEDRequest request; +} QEDAIOCB; + +typedef struct { +    BlockDriverState *bs;           /* device */ +    uint64_t file_size;             /* length of image file, in bytes */ + +    QEDHeader header;               /* always cpu-endian */ +    QEDTable *l1_table; +    L2TableCache l2_cache;          /* l2 table cache */ +    uint32_t table_nelems; +    uint32_t l1_shift; +    uint32_t l2_shift; +    uint32_t l2_mask; + +    /* Allocating write request queue */ +    QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; +    bool allocating_write_reqs_plugged; + +    /* Periodic flush and clear need check flag */ +    QEMUTimer *need_check_timer; +} BDRVQEDState; + +enum { +    QED_CLUSTER_FOUND,         /* cluster found */ +    QED_CLUSTER_ZERO,          /* zero cluster found */ +    QED_CLUSTER_L2,            /* cluster missing in L2 */ +    QED_CLUSTER_L1,            /* cluster missing in L1 */ +}; + +/** + * qed_find_cluster() completion callback + * + * @opaque:     User data for completion callback + * @ret:        QED_CLUSTER_FOUND   Success + *              QED_CLUSTER_L2      Data cluster unallocated in L2 + *              QED_CLUSTER_L1      L2 unallocated in L1 + *              -errno              POSIX error occurred + * @offset:     Data cluster offset + * @len:        Contiguous bytes starting from cluster offset + * + * This function is invoked when qed_find_cluster() completes. + * + * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range + * in the image file. + * + * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 + * table offset, respectively.  len is number of contiguous unallocated bytes. 
+ */ +typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); + +/** + * Generic callback for chaining async callbacks + */ +typedef struct { +    BlockCompletionFunc *cb; +    void *opaque; +} GenericCB; + +void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque); +void gencb_complete(void *opaque, int ret); + +/** + * Header functions + */ +int qed_write_header_sync(BDRVQEDState *s); + +/** + * L2 cache functions + */ +void qed_init_l2_cache(L2TableCache *l2_cache); +void qed_free_l2_cache(L2TableCache *l2_cache); +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); +void qed_unref_l2_cache_entry(CachedL2Table *entry); +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); + +/** + * Table I/O functions + */ +int qed_read_l1_table_sync(BDRVQEDState *s); +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, +                        BlockCompletionFunc *cb, void *opaque); +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, +                            unsigned int n); +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, +                           uint64_t offset); +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, +                       BlockCompletionFunc *cb, void *opaque); +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, +                        unsigned int index, unsigned int n, bool flush, +                        BlockCompletionFunc *cb, void *opaque); +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, +                            unsigned int index, unsigned int n, bool flush); + +/** + * Cluster functions + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, +                      size_t len, QEDFindClusterFunc *cb, void *opaque); + +/** + * Consistency check + */ +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); + +QEDTable *qed_alloc_table(BDRVQEDState *s); + +/** + * Round down to the start of a cluster + */ +static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) +{ +    return offset & ~(uint64_t)(s->header.cluster_size - 1); +} + +static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) +{ +    return offset & (s->header.cluster_size - 1); +} + +static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes) +{ +    return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / +           (s->header.cluster_size - 1); +} + +static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) +{ +    return pos >> s->l1_shift; +} + +static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) +{ +    return (pos >> s->l2_shift) & s->l2_mask; +} + +/** + * Test if a cluster offset is valid + */ +static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset) +{ +    uint64_t header_size = (uint64_t)s->header.header_size * +                           s->header.cluster_size; + +    if (offset & (s->header.cluster_size - 1)) { +        return false; +    } +    return offset >= header_size && offset < s->file_size; +} + +/** + * Test if a table offset is valid + */ +static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset) +{ +    uint64_t end_offset = offset + (s->header.table_size - 1) * +                          s->header.cluster_size; + +    /* 
Overflow check */ +    if (end_offset <= offset) { +        return false; +    } + +    return qed_check_cluster_offset(s, offset) && +           qed_check_cluster_offset(s, end_offset); +} + +static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s, +                                                 uint64_t offset) +{ +    if (qed_offset_into_cluster(s, offset)) { +        return false; +    } +    return true; +} + +static inline bool qed_offset_is_unalloc_cluster(uint64_t offset) +{ +    if (offset == 0) { +        return true; +    } +    return false; +} + +static inline bool qed_offset_is_zero_cluster(uint64_t offset) +{ +    if (offset == 1) { +        return true; +    } +    return false; +} + +#endif /* BLOCK_QED_H */ diff --git a/block/quorum.c b/block/quorum.c new file mode 100644 index 00000000..2f6c45f7 --- /dev/null +++ b/block/quorum.c @@ -0,0 +1,1065 @@ +/* + * Quorum Block filter + * + * Copyright (C) 2012-2014 Nodalink, EURL. + * + * Author: + *   Benoît Canet <benoit.canet@irqsave.net> + * + * Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp) + * and blkmirror.c (Copyright (C) 2011 Red Hat, Inc). + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "block/block_int.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qjson.h" +#include "qapi/qmp/qlist.h" +#include "qapi/qmp/qstring.h" +#include "qapi-event.h" +#include "crypto/hash.h" + +#define HASH_LENGTH 32 + +#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" +#define QUORUM_OPT_BLKVERIFY      "blkverify" +#define QUORUM_OPT_REWRITE        "rewrite-corrupted" +#define QUORUM_OPT_READ_PATTERN   "read-pattern" + +/* This union holds a vote hash value */ +typedef union QuorumVoteValue { +    uint8_t h[HASH_LENGTH];    /* SHA-256 hash */ +    int64_t l;                 /* simpler 64 bits hash */ +} QuorumVoteValue; + +/* A vote item */ +typedef struct QuorumVoteItem { +    int index; +    QLIST_ENTRY(QuorumVoteItem) next; +} QuorumVoteItem; + +/* this structure is a vote version. A version is the set of votes sharing the + * same vote value. + * The set of votes will be tracked with the items field and its cardinality is + * vote_count. + */ +typedef struct QuorumVoteVersion { +    QuorumVoteValue value; +    int index; +    int vote_count; +    QLIST_HEAD(, QuorumVoteItem) items; +    QLIST_ENTRY(QuorumVoteVersion) next; +} QuorumVoteVersion; + +/* this structure holds a group of vote versions together */ +typedef struct QuorumVotes { +    QLIST_HEAD(, QuorumVoteVersion) vote_list; +    bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); +} QuorumVotes; + +/* the following structure holds the state of one quorum instance */ +typedef struct BDRVQuorumState { +    BlockDriverState **bs; /* children BlockDriverStates */ +    int num_children;      /* children count */ +    int threshold;         /* if less than threshold children reads gave the +                            * same result a quorum error occurs. +                            */ +    bool is_blkverify;     /* true if the driver is in blkverify mode +                            * Writes are mirrored on two children devices. 
+                            * On reads the two children devices' contents are +                            * compared and if a difference is spotted its +                            * location is printed and the code aborts. +                            * It is useful to debug other block drivers by +                            * comparing them with a reference one. +                            */ +    bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted +                            * block if Quorum is reached. +                            */ + +    QuorumReadPattern read_pattern; +} BDRVQuorumState; + +typedef struct QuorumAIOCB QuorumAIOCB; + +/* Quorum will create one instance of the following structure per operation it + * performs on its children. + * So for each read/write operation coming from the upper layer there will be + * $children_count QuorumChildRequest. + */ +typedef struct QuorumChildRequest { +    BlockAIOCB *aiocb; +    QEMUIOVector qiov; +    uint8_t *buf; +    int ret; +    QuorumAIOCB *parent; +} QuorumChildRequest; + +/* Quorum will use the following structure to track progress of each read/write + * operation received by the upper layer. + * This structure hold pointers to the QuorumChildRequest structures instances + * used to do operations on each children and track overall progress. + */ +struct QuorumAIOCB { +    BlockAIOCB common; + +    /* Request metadata */ +    uint64_t sector_num; +    int nb_sectors; + +    QEMUIOVector *qiov;         /* calling IOV */ + +    QuorumChildRequest *qcrs;   /* individual child requests */ +    int count;                  /* number of completed AIOCB */ +    int success_count;          /* number of successfully completed AIOCB */ + +    int rewrite_count;          /* number of replica to rewrite: count down to +                                 * zero once writes are fired +                                 */ + +    QuorumVotes votes; + +    bool is_read; +    int vote_ret; +    int child_iter;             /* which child to read in fifo pattern */ +}; + +static bool quorum_vote(QuorumAIOCB *acb); + +static void quorum_aio_cancel(BlockAIOCB *blockacb) +{ +    QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); +    BDRVQuorumState *s = acb->common.bs->opaque; +    int i; + +    /* cancel all callbacks */ +    for (i = 0; i < s->num_children; i++) { +        if (acb->qcrs[i].aiocb) { +            bdrv_aio_cancel_async(acb->qcrs[i].aiocb); +        } +    } +} + +static AIOCBInfo quorum_aiocb_info = { +    .aiocb_size         = sizeof(QuorumAIOCB), +    .cancel_async       = quorum_aio_cancel, +}; + +static void quorum_aio_finalize(QuorumAIOCB *acb) +{ +    int i, ret = 0; + +    if (acb->vote_ret) { +        ret = acb->vote_ret; +    } + +    acb->common.cb(acb->common.opaque, ret); + +    if (acb->is_read) { +        /* on the quorum case acb->child_iter == s->num_children - 1 */ +        for (i = 0; i <= acb->child_iter; i++) { +            qemu_vfree(acb->qcrs[i].buf); +            qemu_iovec_destroy(&acb->qcrs[i].qiov); +        } +    } + +    g_free(acb->qcrs); +    qemu_aio_unref(acb); +} + +static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ +    return !memcmp(a->h, b->h, HASH_LENGTH); +} + +static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ +    return a->l == b->l; +} + +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, +                                   BlockDriverState *bs, +                                   QEMUIOVector 
*qiov, +                                   uint64_t sector_num, +                                   int nb_sectors, +                                   BlockCompletionFunc *cb, +                                   void *opaque) +{ +    QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); +    int i; + +    acb->common.bs->opaque = s; +    acb->sector_num = sector_num; +    acb->nb_sectors = nb_sectors; +    acb->qiov = qiov; +    acb->qcrs = g_new0(QuorumChildRequest, s->num_children); +    acb->count = 0; +    acb->success_count = 0; +    acb->rewrite_count = 0; +    acb->votes.compare = quorum_sha256_compare; +    QLIST_INIT(&acb->votes.vote_list); +    acb->is_read = false; +    acb->vote_ret = 0; + +    for (i = 0; i < s->num_children; i++) { +        acb->qcrs[i].buf = NULL; +        acb->qcrs[i].ret = 0; +        acb->qcrs[i].parent = acb; +    } + +    return acb; +} + +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +{ +    const char *msg = NULL; +    if (ret < 0) { +        msg = strerror(-ret); +    } +    qapi_event_send_quorum_report_bad(!!msg, msg, node_name, +                                      acb->sector_num, acb->nb_sectors, &error_abort); +} + +static void quorum_report_failure(QuorumAIOCB *acb) +{ +    const char *reference = bdrv_get_device_or_node_name(acb->common.bs); +    qapi_event_send_quorum_failure(reference, acb->sector_num, +                                   acb->nb_sectors, &error_abort); +} + +static int quorum_vote_error(QuorumAIOCB *acb); + +static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) +{ +    BDRVQuorumState *s = acb->common.bs->opaque; + +    if (acb->success_count < s->threshold) { +        acb->vote_ret = quorum_vote_error(acb); +        quorum_report_failure(acb); +        return true; +    } + +    return false; +} + +static void quorum_rewrite_aio_cb(void *opaque, int ret) +{ +    QuorumAIOCB *acb = opaque; + +    /* one less rewrite to do */ +    acb->rewrite_count--; + +    /* wait until all rewrite callbacks have completed */ +    if (acb->rewrite_count) { +        return; +    } + +    quorum_aio_finalize(acb); +} + +static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb); + +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) +{ +    int i; +    assert(dest->niov == source->niov); +    assert(dest->size == source->size); +    for (i = 0; i < source->niov; i++) { +        assert(dest->iov[i].iov_len == source->iov[i].iov_len); +        memcpy(dest->iov[i].iov_base, +               source->iov[i].iov_base, +               source->iov[i].iov_len); +    } +} + +static void quorum_aio_cb(void *opaque, int ret) +{ +    QuorumChildRequest *sacb = opaque; +    QuorumAIOCB *acb = sacb->parent; +    BDRVQuorumState *s = acb->common.bs->opaque; +    bool rewrite = false; + +    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) { +        /* We try to read next child in FIFO order if we fail to read */ +        if (ret < 0 && ++acb->child_iter < s->num_children) { +            read_fifo_child(acb); +            return; +        } + +        if (ret == 0) { +            quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov); +        } +        acb->vote_ret = ret; +        quorum_aio_finalize(acb); +        return; +    } + +    sacb->ret = ret; +    acb->count++; +    if (ret == 0) { +        acb->success_count++; +    } else { +        quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); +    } +    assert(acb->count <= s->num_children); +    
assert(acb->success_count <= s->num_children); +    if (acb->count < s->num_children) { +        return; +    } + +    /* Do the vote on read */ +    if (acb->is_read) { +        rewrite = quorum_vote(acb); +    } else { +        quorum_has_too_much_io_failed(acb); +    } + +    /* if no rewrite is done the code will finish right away */ +    if (!rewrite) { +        quorum_aio_finalize(acb); +    } +} + +static void quorum_report_bad_versions(BDRVQuorumState *s, +                                       QuorumAIOCB *acb, +                                       QuorumVoteValue *value) +{ +    QuorumVoteVersion *version; +    QuorumVoteItem *item; + +    QLIST_FOREACH(version, &acb->votes.vote_list, next) { +        if (acb->votes.compare(&version->value, value)) { +            continue; +        } +        QLIST_FOREACH(item, &version->items, next) { +            quorum_report_bad(acb, s->bs[item->index]->node_name, 0); +        } +    } +} + +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, +                                        QuorumVoteValue *value) +{ +    QuorumVoteVersion *version; +    QuorumVoteItem *item; +    int count = 0; + +    /* first count the number of bad versions: done first to avoid concurrency +     * issues. +     */ +    QLIST_FOREACH(version, &acb->votes.vote_list, next) { +        if (acb->votes.compare(&version->value, value)) { +            continue; +        } +        QLIST_FOREACH(item, &version->items, next) { +            count++; +        } +    } + +    /* quorum_rewrite_aio_cb will count down this to zero */ +    acb->rewrite_count = count; + +    /* now fire the correcting rewrites */ +    QLIST_FOREACH(version, &acb->votes.vote_list, next) { +        if (acb->votes.compare(&version->value, value)) { +            continue; +        } +        QLIST_FOREACH(item, &version->items, next) { +            bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov, +                            acb->nb_sectors, quorum_rewrite_aio_cb, acb); +        } +    } + +    /* return true if any rewrite is done else false */ +    return count; +} + +static void quorum_count_vote(QuorumVotes *votes, +                              QuorumVoteValue *value, +                              int index) +{ +    QuorumVoteVersion *v = NULL, *version = NULL; +    QuorumVoteItem *item; + +    /* look if we have something with this hash */ +    QLIST_FOREACH(v, &votes->vote_list, next) { +        if (votes->compare(&v->value, value)) { +            version = v; +            break; +        } +    } + +    /* It's a version not yet in the list add it */ +    if (!version) { +        version = g_new0(QuorumVoteVersion, 1); +        QLIST_INIT(&version->items); +        memcpy(&version->value, value, sizeof(version->value)); +        version->index = index; +        version->vote_count = 0; +        QLIST_INSERT_HEAD(&votes->vote_list, version, next); +    } + +    version->vote_count++; + +    item = g_new0(QuorumVoteItem, 1); +    item->index = index; +    QLIST_INSERT_HEAD(&version->items, item, next); +} + +static void quorum_free_vote_list(QuorumVotes *votes) +{ +    QuorumVoteVersion *version, *next_version; +    QuorumVoteItem *item, *next_item; + +    QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { +        QLIST_REMOVE(version, next); +        QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { +            QLIST_REMOVE(item, next); +            g_free(item); +        } +        g_free(version); +    } +} + +static 
int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) +{ +    QEMUIOVector *qiov = &acb->qcrs[i].qiov; +    size_t len = sizeof(hash->h); +    uint8_t *data = hash->h; + +    /* XXX - would be nice if we could pass in the Error ** +     * and propagate that back, but this quorum code is +     * restricted to just errno values currently */ +    if (qcrypto_hash_bytesv(QCRYPTO_HASH_ALG_SHA256, +                            qiov->iov, qiov->niov, +                            &data, &len, +                            NULL) < 0) { +        return -EINVAL; +    } + +    return 0; +} + +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) +{ +    int max = 0; +    QuorumVoteVersion *candidate, *winner = NULL; + +    QLIST_FOREACH(candidate, &votes->vote_list, next) { +        if (candidate->vote_count > max) { +            max = candidate->vote_count; +            winner = candidate; +        } +    } + +    return winner; +} + +/* qemu_iovec_compare is handy for blkverify mode because it returns the first + * differing byte location. Yet it is handcoded to compare vectors one byte + * after another so it does not benefit from the libc SIMD optimizations. + * quorum_iovec_compare is written for speed and should be used in the non + * blkverify mode of quorum. + */ +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ +    int i; +    int result; + +    assert(a->niov == b->niov); +    for (i = 0; i < a->niov; i++) { +        assert(a->iov[i].iov_len == b->iov[i].iov_len); +        result = memcmp(a->iov[i].iov_base, +                        b->iov[i].iov_base, +                        a->iov[i].iov_len); +        if (result) { +            return false; +        } +    } + +    return true; +} + +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, +                                          const char *fmt, ...) 
+{ +    va_list ap; + +    va_start(ap, fmt); +    fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", +            acb->sector_num, acb->nb_sectors); +    vfprintf(stderr, fmt, ap); +    fprintf(stderr, "\n"); +    va_end(ap); +    exit(1); +} + +static bool quorum_compare(QuorumAIOCB *acb, +                           QEMUIOVector *a, +                           QEMUIOVector *b) +{ +    BDRVQuorumState *s = acb->common.bs->opaque; +    ssize_t offset; + +    /* This driver will replace blkverify in this particular case */ +    if (s->is_blkverify) { +        offset = qemu_iovec_compare(a, b); +        if (offset != -1) { +            quorum_err(acb, "contents mismatch in sector %" PRId64, +                       acb->sector_num + +                       (uint64_t)(offset / BDRV_SECTOR_SIZE)); +        } +        return true; +    } + +    return quorum_iovec_compare(a, b); +} + +/* Do a vote to get the error code */ +static int quorum_vote_error(QuorumAIOCB *acb) +{ +    BDRVQuorumState *s = acb->common.bs->opaque; +    QuorumVoteVersion *winner = NULL; +    QuorumVotes error_votes; +    QuorumVoteValue result_value; +    int i, ret = 0; +    bool error = false; + +    QLIST_INIT(&error_votes.vote_list); +    error_votes.compare = quorum_64bits_compare; + +    for (i = 0; i < s->num_children; i++) { +        ret = acb->qcrs[i].ret; +        if (ret) { +            error = true; +            result_value.l = ret; +            quorum_count_vote(&error_votes, &result_value, i); +        } +    } + +    if (error) { +        winner = quorum_get_vote_winner(&error_votes); +        ret = winner->value.l; +    } + +    quorum_free_vote_list(&error_votes); + +    return ret; +} + +static bool quorum_vote(QuorumAIOCB *acb) +{ +    bool quorum = true; +    bool rewrite = false; +    int i, j, ret; +    QuorumVoteValue hash; +    BDRVQuorumState *s = acb->common.bs->opaque; +    QuorumVoteVersion *winner; + +    if (quorum_has_too_much_io_failed(acb)) { +        return false; +    } + +    /* get the index of the first successful read */ +    for (i = 0; i < s->num_children; i++) { +        if (!acb->qcrs[i].ret) { +            break; +        } +    } + +    assert(i < s->num_children); + +    /* compare this read with all other successful reads stopping at quorum +     * failure +     */ +    for (j = i + 1; j < s->num_children; j++) { +        if (acb->qcrs[j].ret) { +            continue; +        } +        quorum = quorum_compare(acb, &acb->qcrs[i].qiov, &acb->qcrs[j].qiov); +        if (!quorum) { +            break; +       } +    } + +    /* Every successful read agrees */ +    if (quorum) { +        quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); +        return false; +    } + +    /* compute hashes for each successful read, also store indexes */ +    for (i = 0; i < s->num_children; i++) { +        if (acb->qcrs[i].ret) { +            continue; +        } +        ret = quorum_compute_hash(acb, i, &hash); +        /* if ever the hash computation failed */ +        if (ret < 0) { +            acb->vote_ret = ret; +            goto free_exit; +        } +        quorum_count_vote(&acb->votes, &hash, i); +    } + +    /* vote to select the most represented version */ +    winner = quorum_get_vote_winner(&acb->votes); + +    /* if the winner count is smaller than threshold the read fails */ +    if (winner->vote_count < s->threshold) { +        quorum_report_failure(acb); +        acb->vote_ret = -EIO; +        goto free_exit; +    } + +    /* we have a winner: copy it */ +    
quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov); + +    /* some versions are bad print them */ +    quorum_report_bad_versions(s, acb, &winner->value); + +    /* corruption correction is enabled */ +    if (s->rewrite_corrupted) { +        rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); +    } + +free_exit: +    /* free lists */ +    quorum_free_vote_list(&acb->votes); +    return rewrite; +} + +static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) +{ +    BDRVQuorumState *s = acb->common.bs->opaque; +    int i; + +    for (i = 0; i < s->num_children; i++) { +        acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size); +        qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov); +        qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf); +    } + +    for (i = 0; i < s->num_children; i++) { +        bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov, +                       acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); +    } + +    return &acb->common; +} + +static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) +{ +    BDRVQuorumState *s = acb->common.bs->opaque; + +    acb->qcrs[acb->child_iter].buf = qemu_blockalign(s->bs[acb->child_iter], +                                                     acb->qiov->size); +    qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov); +    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, +                     acb->qcrs[acb->child_iter].buf); +    bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num, +                   &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, +                   quorum_aio_cb, &acb->qcrs[acb->child_iter]); + +    return &acb->common; +} + +static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs, +                                    int64_t sector_num, +                                    QEMUIOVector *qiov, +                                    int nb_sectors, +                                    BlockCompletionFunc *cb, +                                    void *opaque) +{ +    BDRVQuorumState *s = bs->opaque; +    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, +                                      nb_sectors, cb, opaque); +    acb->is_read = true; + +    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { +        acb->child_iter = s->num_children - 1; +        return read_quorum_children(acb); +    } + +    acb->child_iter = 0; +    return read_fifo_child(acb); +} + +static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs, +                                     int64_t sector_num, +                                     QEMUIOVector *qiov, +                                     int nb_sectors, +                                     BlockCompletionFunc *cb, +                                     void *opaque) +{ +    BDRVQuorumState *s = bs->opaque; +    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, +                                      cb, opaque); +    int i; + +    for (i = 0; i < s->num_children; i++) { +        acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov, +                                             nb_sectors, &quorum_aio_cb, +                                             &acb->qcrs[i]); +    } + +    return &acb->common; +} + +static int64_t quorum_getlength(BlockDriverState *bs) +{ +    BDRVQuorumState *s = bs->opaque; +    int64_t result; +    int i; + +    /* check that all file have the same length */ +    result = bdrv_getlength(s->bs[0]); +    if (result < 0) 
{ +        return result; +    } +    for (i = 1; i < s->num_children; i++) { +        int64_t value = bdrv_getlength(s->bs[i]); +        if (value < 0) { +            return value; +        } +        if (value != result) { +            return -EIO; +        } +    } + +    return result; +} + +static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) +{ +    BDRVQuorumState *s = bs->opaque; +    Error *local_err = NULL; +    int i; + +    for (i = 0; i < s->num_children; i++) { +        bdrv_invalidate_cache(s->bs[i], &local_err); +        if (local_err) { +            error_propagate(errp, local_err); +            return; +        } +    } +} + +static coroutine_fn int quorum_co_flush(BlockDriverState *bs) +{ +    BDRVQuorumState *s = bs->opaque; +    QuorumVoteVersion *winner = NULL; +    QuorumVotes error_votes; +    QuorumVoteValue result_value; +    int i; +    int result = 0; + +    QLIST_INIT(&error_votes.vote_list); +    error_votes.compare = quorum_64bits_compare; + +    for (i = 0; i < s->num_children; i++) { +        result = bdrv_co_flush(s->bs[i]); +        result_value.l = result; +        quorum_count_vote(&error_votes, &result_value, i); +    } + +    winner = quorum_get_vote_winner(&error_votes); +    result = winner->value.l; + +    quorum_free_vote_list(&error_votes); + +    return result; +} + +static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, +                                               BlockDriverState *candidate) +{ +    BDRVQuorumState *s = bs->opaque; +    int i; + +    for (i = 0; i < s->num_children; i++) { +        bool perm = bdrv_recurse_is_first_non_filter(s->bs[i], +                                                     candidate); +        if (perm) { +            return true; +        } +    } + +    return false; +} + +static int quorum_valid_threshold(int threshold, int num_children, Error **errp) +{ + +    if (threshold < 1) { +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, +                   "vote-threshold", "value >= 1"); +        return -ERANGE; +    } + +    if (threshold > num_children) { +        error_setg(errp, "threshold may not exceed children count"); +        return -ERANGE; +    } + +    return 0; +} + +static QemuOptsList quorum_runtime_opts = { +    .name = "quorum", +    .head = QTAILQ_HEAD_INITIALIZER(quorum_runtime_opts.head), +    .desc = { +        { +            .name = QUORUM_OPT_VOTE_THRESHOLD, +            .type = QEMU_OPT_NUMBER, +            .help = "The number of vote needed for reaching quorum", +        }, +        { +            .name = QUORUM_OPT_BLKVERIFY, +            .type = QEMU_OPT_BOOL, +            .help = "Trigger block verify mode if set", +        }, +        { +            .name = QUORUM_OPT_REWRITE, +            .type = QEMU_OPT_BOOL, +            .help = "Rewrite corrupted block on read quorum", +        }, +        { +            .name = QUORUM_OPT_READ_PATTERN, +            .type = QEMU_OPT_STRING, +            .help = "Allowed pattern: quorum, fifo. 
Quorum is default", +        }, +        { /* end of list */ } +    }, +}; + +static int parse_read_pattern(const char *opt) +{ +    int i; + +    if (!opt) { +        /* Set quorum as default */ +        return QUORUM_READ_PATTERN_QUORUM; +    } + +    for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) { +        if (!strcmp(opt, QuorumReadPattern_lookup[i])) { +            return i; +        } +    } + +    return -EINVAL; +} + +static int quorum_open(BlockDriverState *bs, QDict *options, int flags, +                       Error **errp) +{ +    BDRVQuorumState *s = bs->opaque; +    Error *local_err = NULL; +    QemuOpts *opts = NULL; +    bool *opened; +    int i; +    int ret = 0; + +    qdict_flatten(options); + +    /* count how many different children are present */ +    s->num_children = qdict_array_entries(options, "children."); +    if (s->num_children < 0) { +        error_setg(&local_err, "Option children is not a valid array"); +        ret = -EINVAL; +        goto exit; +    } +    if (s->num_children < 2) { +        error_setg(&local_err, +                   "Number of provided children must be greater than 1"); +        ret = -EINVAL; +        goto exit; +    } + +    opts = qemu_opts_create(&quorum_runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        ret = -EINVAL; +        goto exit; +    } + +    s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); +    ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN)); +    if (ret < 0) { +        error_setg(&local_err, "Please set read-pattern as fifo or quorum"); +        goto exit; +    } +    s->read_pattern = ret; + +    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { +        /* and validate it against s->num_children */ +        ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); +        if (ret < 0) { +            goto exit; +        } + +        /* is the driver in blkverify mode */ +        if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && +            s->num_children == 2 && s->threshold == 2) { +            s->is_blkverify = true; +        } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { +            fprintf(stderr, "blkverify mode is set by setting blkverify=on " +                    "and using two files with vote_threshold=2\n"); +        } + +        s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, +                                                 false); +        if (s->rewrite_corrupted && s->is_blkverify) { +            error_setg(&local_err, +                       "rewrite-corrupted=on cannot be used with blkverify=on"); +            ret = -EINVAL; +            goto exit; +        } +    } + +    /* allocate the children BlockDriverState array */ +    s->bs = g_new0(BlockDriverState *, s->num_children); +    opened = g_new0(bool, s->num_children); + +    for (i = 0; i < s->num_children; i++) { +        char indexstr[32]; +        ret = snprintf(indexstr, 32, "children.%d", i); +        assert(ret < 32); + +        ret = bdrv_open_image(&s->bs[i], NULL, options, indexstr, bs, +                              &child_format, false, &local_err); +        if (ret < 0) { +            goto close_exit; +        } + +        opened[i] = true; +    } + +    g_free(opened); +    goto exit; + +close_exit: +    /* cleanup on error */ +    for (i = 0; i < s->num_children; i++) { +        if (!opened[i]) { +            continue; +        } +        
bdrv_unref(s->bs[i]); +    } +    g_free(s->bs); +    g_free(opened); +exit: +    qemu_opts_del(opts); +    /* propagate error */ +    if (local_err) { +        error_propagate(errp, local_err); +    } +    return ret; +} + +static void quorum_close(BlockDriverState *bs) +{ +    BDRVQuorumState *s = bs->opaque; +    int i; + +    for (i = 0; i < s->num_children; i++) { +        bdrv_unref(s->bs[i]); +    } + +    g_free(s->bs); +} + +static void quorum_detach_aio_context(BlockDriverState *bs) +{ +    BDRVQuorumState *s = bs->opaque; +    int i; + +    for (i = 0; i < s->num_children; i++) { +        bdrv_detach_aio_context(s->bs[i]); +    } +} + +static void quorum_attach_aio_context(BlockDriverState *bs, +                                      AioContext *new_context) +{ +    BDRVQuorumState *s = bs->opaque; +    int i; + +    for (i = 0; i < s->num_children; i++) { +        bdrv_attach_aio_context(s->bs[i], new_context); +    } +} + +static void quorum_refresh_filename(BlockDriverState *bs) +{ +    BDRVQuorumState *s = bs->opaque; +    QDict *opts; +    QList *children; +    int i; + +    for (i = 0; i < s->num_children; i++) { +        bdrv_refresh_filename(s->bs[i]); +        if (!s->bs[i]->full_open_options) { +            return; +        } +    } + +    children = qlist_new(); +    for (i = 0; i < s->num_children; i++) { +        QINCREF(s->bs[i]->full_open_options); +        qlist_append_obj(children, QOBJECT(s->bs[i]->full_open_options)); +    } + +    opts = qdict_new(); +    qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("quorum"))); +    qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD, +                  QOBJECT(qint_from_int(s->threshold))); +    qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY, +                  QOBJECT(qbool_from_bool(s->is_blkverify))); +    qdict_put_obj(opts, QUORUM_OPT_REWRITE, +                  QOBJECT(qbool_from_bool(s->rewrite_corrupted))); +    qdict_put_obj(opts, "children", QOBJECT(children)); + +    bs->full_open_options = opts; +} + +static BlockDriver bdrv_quorum = { +    .format_name                        = "quorum", +    .protocol_name                      = "quorum", + +    .instance_size                      = sizeof(BDRVQuorumState), + +    .bdrv_file_open                     = quorum_open, +    .bdrv_close                         = quorum_close, +    .bdrv_refresh_filename              = quorum_refresh_filename, + +    .bdrv_co_flush_to_disk              = quorum_co_flush, + +    .bdrv_getlength                     = quorum_getlength, + +    .bdrv_aio_readv                     = quorum_aio_readv, +    .bdrv_aio_writev                    = quorum_aio_writev, +    .bdrv_invalidate_cache              = quorum_invalidate_cache, + +    .bdrv_detach_aio_context            = quorum_detach_aio_context, +    .bdrv_attach_aio_context            = quorum_attach_aio_context, + +    .is_filter                          = true, +    .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter, +}; + +static void bdrv_quorum_init(void) +{ +    if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA256)) { +        /* SHA256 hash support is required for quorum device */ +        return; +    } +    bdrv_register(&bdrv_quorum); +} + +block_init(bdrv_quorum_init); diff --git a/block/raw-aio.h b/block/raw-aio.h new file mode 100644 index 00000000..31d791fe --- /dev/null +++ b/block/raw-aio.h @@ -0,0 +1,62 @@ +/* + * Declarations for AIO in the raw protocol + * + * Copyright IBM, Corp. 
2008 + * + * Authors: + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2.  See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ +#ifndef QEMU_RAW_AIO_H +#define QEMU_RAW_AIO_H + +/* AIO request types */ +#define QEMU_AIO_READ         0x0001 +#define QEMU_AIO_WRITE        0x0002 +#define QEMU_AIO_IOCTL        0x0004 +#define QEMU_AIO_FLUSH        0x0008 +#define QEMU_AIO_DISCARD      0x0010 +#define QEMU_AIO_WRITE_ZEROES 0x0020 +#define QEMU_AIO_TYPE_MASK \ +        (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ +         QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) + +/* AIO flags */ +#define QEMU_AIO_MISALIGNED   0x1000 +#define QEMU_AIO_BLKDEV       0x2000 + + +/* linux-aio.c - Linux native implementation */ +#ifdef CONFIG_LINUX_AIO +void *laio_init(void); +void laio_cleanup(void *s); +BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque, int type); +void laio_detach_aio_context(void *s, AioContext *old_context); +void laio_attach_aio_context(void *s, AioContext *new_context); +void laio_io_plug(BlockDriverState *bs, void *aio_ctx); +void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug); +#endif + +#ifdef _WIN32 +typedef struct QEMUWin32AIOState QEMUWin32AIOState; +QEMUWin32AIOState *win32_aio_init(void); +void win32_aio_cleanup(QEMUWin32AIOState *aio); +int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile); +BlockAIOCB *win32_aio_submit(BlockDriverState *bs, +        QEMUWin32AIOState *aio, HANDLE hfile, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque, int type); +void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, +                                  AioContext *old_context); +void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, +                                  AioContext *new_context); +#endif + +#endif /* QEMU_RAW_AIO_H */ diff --git a/block/raw-posix.c b/block/raw-posix.c new file mode 100644 index 00000000..855febed --- /dev/null +++ b/block/raw-posix.c @@ -0,0 +1,2821 @@ +/* + * Block driver for RAW files (posix) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/timer.h" +#include "qemu/log.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "trace.h" +#include "block/thread-pool.h" +#include "qemu/iov.h" +#include "raw-aio.h" +#include "qapi/util.h" +#include "qapi/qmp/qstring.h" + +#if defined(__APPLE__) && (__MACH__) +#include <paths.h> +#include <sys/param.h> +#include <IOKit/IOKitLib.h> +#include <IOKit/IOBSD.h> +#include <IOKit/storage/IOMediaBSDClient.h> +#include <IOKit/storage/IOMedia.h> +#include <IOKit/storage/IOCDMedia.h> +//#include <IOKit/storage/IOCDTypes.h> +#include <CoreFoundation/CoreFoundation.h> +#endif + +#ifdef __sun__ +#define _POSIX_PTHREAD_SEMANTICS 1 +#include <sys/dkio.h> +#endif +#ifdef __linux__ +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/param.h> +#include <linux/cdrom.h> +#include <linux/fd.h> +#include <linux/fs.h> +#include <linux/hdreg.h> +#include <scsi/sg.h> +#ifdef __s390__ +#include <asm/dasd.h> +#endif +#ifndef FS_NOCOW_FL +#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */ +#endif +#endif +#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) +#include <linux/falloc.h> +#endif +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) +#include <sys/disk.h> +#include <sys/cdio.h> +#endif + +#ifdef __OpenBSD__ +#include <sys/ioctl.h> +#include <sys/disklabel.h> +#include <sys/dkio.h> +#endif + +#ifdef __NetBSD__ +#include <sys/ioctl.h> +#include <sys/disklabel.h> +#include <sys/dkio.h> +#include <sys/disk.h> +#endif + +#ifdef __DragonFly__ +#include <sys/ioctl.h> +#include <sys/diskslice.h> +#endif + +#ifdef CONFIG_XFS +#include <xfs/xfs.h> +#endif + +//#define DEBUG_BLOCK + +#ifdef DEBUG_BLOCK +# define DEBUG_BLOCK_PRINT 1 +#else +# define DEBUG_BLOCK_PRINT 0 +#endif +#define DPRINTF(fmt, ...) 
\ +do { \ +    if (DEBUG_BLOCK_PRINT) { \ +        printf(fmt, ## __VA_ARGS__); \ +    } \ +} while (0) + +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + +#define FTYPE_FILE   0 +#define FTYPE_CD     1 +#define FTYPE_FD     2 + +/* if the FD is not accessed during that time (in ns), we try to +   reopen it to see if the disk has been changed */ +#define FD_OPEN_TIMEOUT (1000000000) + +#define MAX_BLOCKSIZE	4096 + +typedef struct BDRVRawState { +    int fd; +    int type; +    int open_flags; +    size_t buf_align; + +#if defined(__linux__) +    /* linux floppy specific */ +    int64_t fd_open_time; +    int64_t fd_error_time; +    int fd_got_error; +    int fd_media_changed; +#endif +#ifdef CONFIG_LINUX_AIO +    int use_aio; +    void *aio_ctx; +#endif +#ifdef CONFIG_XFS +    bool is_xfs:1; +#endif +    bool has_discard:1; +    bool has_write_zeroes:1; +    bool discard_zeroes:1; +    bool has_fallocate; +    bool needs_alignment; +} BDRVRawState; + +typedef struct BDRVRawReopenState { +    int fd; +    int open_flags; +#ifdef CONFIG_LINUX_AIO +    int use_aio; +#endif +} BDRVRawReopenState; + +static int fd_open(BlockDriverState *bs); +static int64_t raw_getlength(BlockDriverState *bs); + +typedef struct RawPosixAIOData { +    BlockDriverState *bs; +    int aio_fildes; +    union { +        struct iovec *aio_iov; +        void *aio_ioctl_buf; +    }; +    int aio_niov; +    uint64_t aio_nbytes; +#define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */ +    off_t aio_offset; +    int aio_type; +} RawPosixAIOData; + +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static int cdrom_reopen(BlockDriverState *bs); +#endif + +#if defined(__NetBSD__) +static int raw_normalize_devicepath(const char **filename) +{ +    static char namebuf[PATH_MAX]; +    const char *dp, *fname; +    struct stat sb; + +    fname = *filename; +    dp = strrchr(fname, '/'); +    if (lstat(fname, &sb) < 0) { +        fprintf(stderr, "%s: stat failed: %s\n", +            fname, strerror(errno)); +        return -errno; +    } + +    if (!S_ISBLK(sb.st_mode)) { +        return 0; +    } + +    if (dp == NULL) { +        snprintf(namebuf, PATH_MAX, "r%s", fname); +    } else { +        snprintf(namebuf, PATH_MAX, "%.*s/r%s", +            (int)(dp - fname), fname, dp + 1); +    } +    fprintf(stderr, "%s is a block device", fname); +    *filename = namebuf; +    fprintf(stderr, ", using %s\n", *filename); + +    return 0; +} +#else +static int raw_normalize_devicepath(const char **filename) +{ +    return 0; +} +#endif + +/* + * Get logical block size via ioctl. On success store it in @sector_size_p. 
+ */
+static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
+{
+    unsigned int sector_size;
+    bool success = false;
+
+    errno = ENOTSUP;
+
+    /* Try a few ioctls to get the right size */
+#ifdef BLKSSZGET
+    if (ioctl(fd, BLKSSZGET, &sector_size) >= 0) {
+        *sector_size_p = sector_size;
+        success = true;
+    }
+#endif
+#ifdef DKIOCGETBLOCKSIZE
+    if (ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
+        *sector_size_p = sector_size;
+        success = true;
+    }
+#endif
+#ifdef DIOCGSECTORSIZE
+    if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
+        *sector_size_p = sector_size;
+        success = true;
+    }
+#endif
+
+    return success ? 0 : -errno;
+}
+
+/**
+ * Get physical block size of @fd.
+ * On success, store it in @blk_size and return 0.
+ * On failure, return -errno.
+ */
+static int probe_physical_blocksize(int fd, unsigned int *blk_size)
+{
+#ifdef BLKPBSZGET
+    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
+        return -errno;
+    }
+    return 0;
+#else
+    return -ENOTSUP;
+#endif
+}
+
+/* Check if read is allowed with given memory buffer and length.
+ *
+ * This function is used to check O_DIRECT memory buffer and request alignment.
+ */
+static bool raw_is_io_aligned(int fd, void *buf, size_t len)
+{
+    ssize_t ret = pread(fd, buf, len, 0);
+
+    if (ret >= 0) {
+        return true;
+    }
+
+#ifdef __linux__
+    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
+     * other errors (e.g. real I/O error), which could happen on a failed
+     * drive, since we only care about probing alignment.
+     */
+    if (errno != EINVAL) {
+        return true;
+    }
+#endif
+
+    return false;
+}
+
+static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
+{
+    BDRVRawState *s = bs->opaque;
+    char *buf;
+    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
+
+    /* For SCSI generic devices the alignment is not really used.
+       With buffered I/O, we don't have any restrictions. */
+    if (bdrv_is_sg(bs) || !s->needs_alignment) {
+        bs->request_alignment = 1;
+        s->buf_align = 1;
+        return;
+    }
+
+    bs->request_alignment = 0;
+    s->buf_align = 0;
+    /* Let's try to use the logical blocksize for the alignment.
*/ +    if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) { +        bs->request_alignment = 0; +    } +#ifdef CONFIG_XFS +    if (s->is_xfs) { +        struct dioattr da; +        if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { +            bs->request_alignment = da.d_miniosz; +            /* The kernel returns wrong information for d_mem */ +            /* s->buf_align = da.d_mem; */ +        } +    } +#endif + +    /* If we could not get the sizes so far, we can only guess them */ +    if (!s->buf_align) { +        size_t align; +        buf = qemu_memalign(max_align, 2 * max_align); +        for (align = 512; align <= max_align; align <<= 1) { +            if (raw_is_io_aligned(fd, buf + align, max_align)) { +                s->buf_align = align; +                break; +            } +        } +        qemu_vfree(buf); +    } + +    if (!bs->request_alignment) { +        size_t align; +        buf = qemu_memalign(s->buf_align, max_align); +        for (align = 512; align <= max_align; align <<= 1) { +            if (raw_is_io_aligned(fd, buf, align)) { +                bs->request_alignment = align; +                break; +            } +        } +        qemu_vfree(buf); +    } + +    if (!s->buf_align || !bs->request_alignment) { +        error_setg(errp, "Could not find working O_DIRECT alignment. " +                         "Try cache.direct=off."); +    } +} + +static void raw_parse_flags(int bdrv_flags, int *open_flags) +{ +    assert(open_flags != NULL); + +    *open_flags |= O_BINARY; +    *open_flags &= ~O_ACCMODE; +    if (bdrv_flags & BDRV_O_RDWR) { +        *open_flags |= O_RDWR; +    } else { +        *open_flags |= O_RDONLY; +    } + +    /* Use O_DSYNC for write-through caching, no flags for write-back caching, +     * and O_DIRECT for no caching. */ +    if ((bdrv_flags & BDRV_O_NOCACHE)) { +        *open_flags |= O_DIRECT; +    } +} + +static void raw_detach_aio_context(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO +    BDRVRawState *s = bs->opaque; + +    if (s->use_aio) { +        laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs)); +    } +#endif +} + +static void raw_attach_aio_context(BlockDriverState *bs, +                                   AioContext *new_context) +{ +#ifdef CONFIG_LINUX_AIO +    BDRVRawState *s = bs->opaque; + +    if (s->use_aio) { +        laio_attach_aio_context(s->aio_ctx, new_context); +    } +#endif +} + +#ifdef CONFIG_LINUX_AIO +static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags) +{ +    int ret = -1; +    assert(aio_ctx != NULL); +    assert(use_aio != NULL); +    /* +     * Currently Linux do AIO only for files opened with O_DIRECT +     * specified so check NOCACHE flag too +     */ +    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == +                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { + +        /* if non-NULL, laio_init() has already been run */ +        if (*aio_ctx == NULL) { +            *aio_ctx = laio_init(); +            if (!*aio_ctx) { +                goto error; +            } +        } +        *use_aio = 1; +    } else { +        *use_aio = 0; +    } + +    ret = 0; + +error: +    return ret; +} +#endif + +static void raw_parse_filename(const char *filename, QDict *options, +                               Error **errp) +{ +    /* The filename does not have to be prefixed by the protocol name, since +     * "file" is the default protocol; therefore, the return value of this +     * function call can be ignored. 
*/ +    strstart(filename, "file:", &filename); + +    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static QemuOptsList raw_runtime_opts = { +    .name = "raw", +    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), +    .desc = { +        { +            .name = "filename", +            .type = QEMU_OPT_STRING, +            .help = "File name of the image", +        }, +        { /* end of list */ } +    }, +}; + +static int raw_open_common(BlockDriverState *bs, QDict *options, +                           int bdrv_flags, int open_flags, Error **errp) +{ +    BDRVRawState *s = bs->opaque; +    QemuOpts *opts; +    Error *local_err = NULL; +    const char *filename = NULL; +    int fd, ret; +    struct stat st; + +    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    filename = qemu_opt_get(opts, "filename"); + +    ret = raw_normalize_devicepath(&filename); +    if (ret != 0) { +        error_setg_errno(errp, -ret, "Could not normalize device path"); +        goto fail; +    } + +    s->open_flags = open_flags; +    raw_parse_flags(bdrv_flags, &s->open_flags); + +    s->fd = -1; +    fd = qemu_open(filename, s->open_flags, 0644); +    if (fd < 0) { +        ret = -errno; +        if (ret == -EROFS) { +            ret = -EACCES; +        } +        goto fail; +    } +    s->fd = fd; + +#ifdef CONFIG_LINUX_AIO +    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { +        qemu_close(fd); +        ret = -errno; +        error_setg_errno(errp, -ret, "Could not set AIO state"); +        goto fail; +    } +    if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) { +        error_printf("WARNING: aio=native was specified for '%s', but " +                     "it requires cache.direct=on, which was not " +                     "specified. Falling back to aio=threads.\n" +                     "         This will become an error condition in " +                     "future QEMU versions.\n", +                     bs->filename); +    } +#endif + +    s->has_discard = true; +    s->has_write_zeroes = true; +    if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { +        s->needs_alignment = true; +    } + +    if (fstat(s->fd, &st) < 0) { +        ret = -errno; +        error_setg_errno(errp, errno, "Could not stat file"); +        goto fail; +    } +    if (S_ISREG(st.st_mode)) { +        s->discard_zeroes = true; +        s->has_fallocate = true; +    } +    if (S_ISBLK(st.st_mode)) { +#ifdef BLKDISCARDZEROES +        unsigned int arg; +        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { +            s->discard_zeroes = true; +        } +#endif +#ifdef __linux__ +        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do +         * not rely on the contents of discarded blocks unless using O_DIRECT. +         * Same for BLKZEROOUT. +         */ +        if (!(bs->open_flags & BDRV_O_NOCACHE)) { +            s->discard_zeroes = false; +            s->has_write_zeroes = false; +        } +#endif +    } +#ifdef __FreeBSD__ +    if (S_ISCHR(st.st_mode)) { +        /* +         * The file is a char device (disk), which on FreeBSD isn't behind +         * a pager, so force all requests to be aligned. 
This is needed +         * so QEMU makes sure all IO operations on the device are aligned +         * to sector size, or else FreeBSD will reject them with EINVAL. +         */ +        s->needs_alignment = true; +    } +#endif + +#ifdef CONFIG_XFS +    if (platform_test_xfs_fd(s->fd)) { +        s->is_xfs = true; +    } +#endif + +    raw_attach_aio_context(bs, bdrv_get_aio_context(bs)); + +    ret = 0; +fail: +    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { +        unlink(filename); +    } +    qemu_opts_del(opts); +    return ret; +} + +static int raw_open(BlockDriverState *bs, QDict *options, int flags, +                    Error **errp) +{ +    BDRVRawState *s = bs->opaque; +    Error *local_err = NULL; +    int ret; + +    s->type = FTYPE_FILE; +    ret = raw_open_common(bs, options, flags, 0, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +    } +    return ret; +} + +static int raw_reopen_prepare(BDRVReopenState *state, +                              BlockReopenQueue *queue, Error **errp) +{ +    BDRVRawState *s; +    BDRVRawReopenState *raw_s; +    int ret = 0; +    Error *local_err = NULL; + +    assert(state != NULL); +    assert(state->bs != NULL); + +    s = state->bs->opaque; + +    state->opaque = g_new0(BDRVRawReopenState, 1); +    raw_s = state->opaque; + +#ifdef CONFIG_LINUX_AIO +    raw_s->use_aio = s->use_aio; + +    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is +     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio() +     * won't override aio_ctx if aio_ctx is non-NULL */ +    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) { +        error_setg(errp, "Could not set AIO state"); +        return -1; +    } +#endif + +    if (s->type == FTYPE_FD || s->type == FTYPE_CD) { +        raw_s->open_flags |= O_NONBLOCK; +    } + +    raw_parse_flags(state->flags, &raw_s->open_flags); + +    raw_s->fd = -1; + +    int fcntl_flags = O_APPEND | O_NONBLOCK; +#ifdef O_NOATIME +    fcntl_flags |= O_NOATIME; +#endif + +#ifdef O_ASYNC +    /* Not all operating systems have O_ASYNC, and those that don't +     * will not let us track the state into raw_s->open_flags (typically +     * you achieve the same effect with an ioctl, for example I_SETSIG +     * on Solaris). But we do not use O_ASYNC, so that's fine. +     */ +    assert((s->open_flags & O_ASYNC) == 0); +#endif + +    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { +        /* dup the original fd */ +        /* TODO: use qemu fcntl wrapper */ +#ifdef F_DUPFD_CLOEXEC +        raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0); +#else +        raw_s->fd = dup(s->fd); +        if (raw_s->fd != -1) { +            qemu_set_cloexec(raw_s->fd); +        } +#endif +        if (raw_s->fd >= 0) { +            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags); +            if (ret) { +                qemu_close(raw_s->fd); +                raw_s->fd = -1; +            } +        } +    } + +    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ +    if (raw_s->fd == -1) { +        assert(!(raw_s->open_flags & O_CREAT)); +        raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags); +        if (raw_s->fd == -1) { +            error_setg_errno(errp, errno, "Could not reopen file"); +            ret = -1; +        } +    } + +    /* Fail already reopen_prepare() if we can't get a working O_DIRECT +     * alignment with the new fd. 
*/ +    if (raw_s->fd != -1) { +        raw_probe_alignment(state->bs, raw_s->fd, &local_err); +        if (local_err) { +            qemu_close(raw_s->fd); +            raw_s->fd = -1; +            error_propagate(errp, local_err); +            ret = -EINVAL; +        } +    } + +    return ret; +} + +static void raw_reopen_commit(BDRVReopenState *state) +{ +    BDRVRawReopenState *raw_s = state->opaque; +    BDRVRawState *s = state->bs->opaque; + +    s->open_flags = raw_s->open_flags; + +    qemu_close(s->fd); +    s->fd = raw_s->fd; +#ifdef CONFIG_LINUX_AIO +    s->use_aio = raw_s->use_aio; +#endif + +    g_free(state->opaque); +    state->opaque = NULL; +} + + +static void raw_reopen_abort(BDRVReopenState *state) +{ +    BDRVRawReopenState *raw_s = state->opaque; + +     /* nothing to do if NULL, we didn't get far enough */ +    if (raw_s == NULL) { +        return; +    } + +    if (raw_s->fd >= 0) { +        qemu_close(raw_s->fd); +        raw_s->fd = -1; +    } +    g_free(state->opaque); +    state->opaque = NULL; +} + +static void raw_refresh_limits(BlockDriverState *bs, Error **errp) +{ +    BDRVRawState *s = bs->opaque; + +    raw_probe_alignment(bs, s->fd, errp); +    bs->bl.min_mem_alignment = s->buf_align; +    bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); +} + +static int check_for_dasd(int fd) +{ +#ifdef BIODASDINFO2 +    struct dasd_information2_t info = {0}; + +    return ioctl(fd, BIODASDINFO2, &info); +#else +    return -1; +#endif +} + +/** + * Try to get @bs's logical and physical block size. + * On success, store them in @bsz and return zero. + * On failure, return negative errno. + */ +static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ +    BDRVRawState *s = bs->opaque; +    int ret; + +    /* If DASD, get blocksizes */ +    if (check_for_dasd(s->fd) < 0) { +        return -ENOTSUP; +    } +    ret = probe_logical_blocksize(s->fd, &bsz->log); +    if (ret < 0) { +        return ret; +    } +    return probe_physical_blocksize(s->fd, &bsz->phys); +} + +/** + * Try to get @bs's geometry: cyls, heads, sectors. + * On success, store them in @geo and return 0. + * On failure return -errno. + * (Allows block driver to assign default geometry values that guest sees) + */ +#ifdef __linux__ +static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ +    BDRVRawState *s = bs->opaque; +    struct hd_geometry ioctl_geo = {0}; +    uint32_t blksize; + +    /* If DASD, get its geometry */ +    if (check_for_dasd(s->fd) < 0) { +        return -ENOTSUP; +    } +    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { +        return -errno; +    } +    /* HDIO_GETGEO may return success even though geo contains zeros +       (e.g. 
certain multipath setups) */ +    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { +        return -ENOTSUP; +    } +    /* Do not return a geometry for partition */ +    if (ioctl_geo.start != 0) { +        return -ENOTSUP; +    } +    geo->heads = ioctl_geo.heads; +    geo->sectors = ioctl_geo.sectors; +    if (!probe_physical_blocksize(s->fd, &blksize)) { +        /* overwrite cyls: HDIO_GETGEO result is incorrect for big drives */ +        geo->cylinders = bdrv_nb_sectors(bs) / (blksize / BDRV_SECTOR_SIZE) +                                             / (geo->heads * geo->sectors); +        return 0; +    } +    geo->cylinders = ioctl_geo.cylinders; + +    return 0; +} +#else /* __linux__ */ +static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ +    return -ENOTSUP; +} +#endif + +static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) +{ +    int ret; + +    ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); +    if (ret == -1) { +        return -errno; +    } + +    return 0; +} + +static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) +{ +    int ret; + +    ret = qemu_fdatasync(aiocb->aio_fildes); +    if (ret == -1) { +        return -errno; +    } +    return 0; +} + +#ifdef CONFIG_PREADV + +static bool preadv_present = true; + +static ssize_t +qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ +    return preadv(fd, iov, nr_iov, offset); +} + +static ssize_t +qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ +    return pwritev(fd, iov, nr_iov, offset); +} + +#else + +static bool preadv_present = false; + +static ssize_t +qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ +    return -ENOSYS; +} + +static ssize_t +qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ +    return -ENOSYS; +} + +#endif + +static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) +{ +    ssize_t len; + +    do { +        if (aiocb->aio_type & QEMU_AIO_WRITE) +            len = qemu_pwritev(aiocb->aio_fildes, +                               aiocb->aio_iov, +                               aiocb->aio_niov, +                               aiocb->aio_offset); +         else +            len = qemu_preadv(aiocb->aio_fildes, +                              aiocb->aio_iov, +                              aiocb->aio_niov, +                              aiocb->aio_offset); +    } while (len == -1 && errno == EINTR); + +    if (len == -1) { +        return -errno; +    } +    return len; +} + +/* + * Read/writes the data to/from a given linear buffer. + * + * Returns the number of bytes handles or -errno in case of an error. Short + * reads are only returned if the end of the file is reached. 
+ */ +static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) +{ +    ssize_t offset = 0; +    ssize_t len; + +    while (offset < aiocb->aio_nbytes) { +        if (aiocb->aio_type & QEMU_AIO_WRITE) { +            len = pwrite(aiocb->aio_fildes, +                         (const char *)buf + offset, +                         aiocb->aio_nbytes - offset, +                         aiocb->aio_offset + offset); +        } else { +            len = pread(aiocb->aio_fildes, +                        buf + offset, +                        aiocb->aio_nbytes - offset, +                        aiocb->aio_offset + offset); +        } +        if (len == -1 && errno == EINTR) { +            continue; +        } else if (len == -1 && errno == EINVAL && +                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) && +                   !(aiocb->aio_type & QEMU_AIO_WRITE) && +                   offset > 0) { +            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned +             * after a short read.  Assume that O_DIRECT short reads only occur +             * at EOF.  Therefore this is a short read, not an I/O error. +             */ +            break; +        } else if (len == -1) { +            offset = -errno; +            break; +        } else if (len == 0) { +            break; +        } +        offset += len; +    } + +    return offset; +} + +static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) +{ +    ssize_t nbytes; +    char *buf; + +    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { +        /* +         * If there is just a single buffer, and it is properly aligned +         * we can just use plain pread/pwrite without any problems. +         */ +        if (aiocb->aio_niov == 1) { +             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); +        } +        /* +         * We have more than one iovec, and all are properly aligned. +         * +         * Try preadv/pwritev first and fall back to linearizing the +         * buffer if it's not supported. +         */ +        if (preadv_present) { +            nbytes = handle_aiocb_rw_vector(aiocb); +            if (nbytes == aiocb->aio_nbytes || +                (nbytes < 0 && nbytes != -ENOSYS)) { +                return nbytes; +            } +            preadv_present = false; +        } + +        /* +         * XXX(hch): short read/write.  no easy way to handle the reminder +         * using these interfaces.  For now retry using plain +         * pread/pwrite? +         */ +    } + +    /* +     * Ok, we have to do it the hard way, copy all segments into +     * a single aligned buffer. 
+     */ +    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); +    if (buf == NULL) { +        return -ENOMEM; +    } + +    if (aiocb->aio_type & QEMU_AIO_WRITE) { +        char *p = buf; +        int i; + +        for (i = 0; i < aiocb->aio_niov; ++i) { +            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); +            p += aiocb->aio_iov[i].iov_len; +        } +        assert(p - buf == aiocb->aio_nbytes); +    } + +    nbytes = handle_aiocb_rw_linear(aiocb, buf); +    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { +        char *p = buf; +        size_t count = aiocb->aio_nbytes, copy; +        int i; + +        for (i = 0; i < aiocb->aio_niov && count; ++i) { +            copy = count; +            if (copy > aiocb->aio_iov[i].iov_len) { +                copy = aiocb->aio_iov[i].iov_len; +            } +            memcpy(aiocb->aio_iov[i].iov_base, p, copy); +            assert(count >= copy); +            p     += copy; +            count -= copy; +        } +        assert(count == 0); +    } +    qemu_vfree(buf); + +    return nbytes; +} + +#ifdef CONFIG_XFS +static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ +    struct xfs_flock64 fl; +    int err; + +    memset(&fl, 0, sizeof(fl)); +    fl.l_whence = SEEK_SET; +    fl.l_start = offset; +    fl.l_len = bytes; + +    if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { +        err = errno; +        DPRINTF("cannot write zero range (%s)\n", strerror(errno)); +        return -err; +    } + +    return 0; +} + +static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ +    struct xfs_flock64 fl; +    int err; + +    memset(&fl, 0, sizeof(fl)); +    fl.l_whence = SEEK_SET; +    fl.l_start = offset; +    fl.l_len = bytes; + +    if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { +        err = errno; +        DPRINTF("cannot punch hole (%s)\n", strerror(errno)); +        return -err; +    } + +    return 0; +} +#endif + +static int translate_err(int err) +{ +    if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || +        err == -ENOTTY) { +        err = -ENOTSUP; +    } +    return err; +} + +#ifdef CONFIG_FALLOCATE +static int do_fallocate(int fd, int mode, off_t offset, off_t len) +{ +    do { +        if (fallocate(fd, mode, offset, len) == 0) { +            return 0; +        } +    } while (errno == EINTR); +    return translate_err(-errno); +} +#endif + +static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) +{ +    int ret = -ENOTSUP; +    BDRVRawState *s = aiocb->bs->opaque; + +    if (!s->has_write_zeroes) { +        return -ENOTSUP; +    } + +#ifdef BLKZEROOUT +    do { +        uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; +        if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { +            return 0; +        } +    } while (errno == EINTR); + +    ret = translate_err(-errno); +#endif + +    if (ret == -ENOTSUP) { +        s->has_write_zeroes = false; +    } +    return ret; +} + +static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +{ +#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) +    BDRVRawState *s = aiocb->bs->opaque; +#endif + +    if (aiocb->aio_type & QEMU_AIO_BLKDEV) { +        return handle_aiocb_write_zeroes_block(aiocb); +    } + +#ifdef CONFIG_XFS +    if (s->is_xfs) { +        return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); +    } +#endif + +#ifdef CONFIG_FALLOCATE_ZERO_RANGE +    if (s->has_write_zeroes) { +        int ret = 
do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, +                               aiocb->aio_offset, aiocb->aio_nbytes); +        if (ret == 0 || ret != -ENOTSUP) { +            return ret; +        } +        s->has_write_zeroes = false; +    } +#endif + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE +    if (s->has_discard && s->has_fallocate) { +        int ret = do_fallocate(s->fd, +                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, +                               aiocb->aio_offset, aiocb->aio_nbytes); +        if (ret == 0) { +            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); +            if (ret == 0 || ret != -ENOTSUP) { +                return ret; +            } +            s->has_fallocate = false; +        } else if (ret != -ENOTSUP) { +            return ret; +        } else { +            s->has_discard = false; +        } +    } +#endif + +#ifdef CONFIG_FALLOCATE +    if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) { +        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); +        if (ret == 0 || ret != -ENOTSUP) { +            return ret; +        } +        s->has_fallocate = false; +    } +#endif + +    return -ENOTSUP; +} + +static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) +{ +    int ret = -EOPNOTSUPP; +    BDRVRawState *s = aiocb->bs->opaque; + +    if (!s->has_discard) { +        return -ENOTSUP; +    } + +    if (aiocb->aio_type & QEMU_AIO_BLKDEV) { +#ifdef BLKDISCARD +        do { +            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; +            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { +                return 0; +            } +        } while (errno == EINTR); + +        ret = -errno; +#endif +    } else { +#ifdef CONFIG_XFS +        if (s->is_xfs) { +            return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); +        } +#endif + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE +        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, +                           aiocb->aio_offset, aiocb->aio_nbytes); +#endif +    } + +    ret = translate_err(ret); +    if (ret == -ENOTSUP) { +        s->has_discard = false; +    } +    return ret; +} + +static int aio_worker(void *arg) +{ +    RawPosixAIOData *aiocb = arg; +    ssize_t ret = 0; + +    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { +    case QEMU_AIO_READ: +        ret = handle_aiocb_rw(aiocb); +        if (ret >= 0 && ret < aiocb->aio_nbytes) { +            iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, +                      0, aiocb->aio_nbytes - ret); + +            ret = aiocb->aio_nbytes; +        } +        if (ret == aiocb->aio_nbytes) { +            ret = 0; +        } else if (ret >= 0 && ret < aiocb->aio_nbytes) { +            ret = -EINVAL; +        } +        break; +    case QEMU_AIO_WRITE: +        ret = handle_aiocb_rw(aiocb); +        if (ret == aiocb->aio_nbytes) { +            ret = 0; +        } else if (ret >= 0 && ret < aiocb->aio_nbytes) { +            ret = -EINVAL; +        } +        break; +    case QEMU_AIO_FLUSH: +        ret = handle_aiocb_flush(aiocb); +        break; +    case QEMU_AIO_IOCTL: +        ret = handle_aiocb_ioctl(aiocb); +        break; +    case QEMU_AIO_DISCARD: +        ret = handle_aiocb_discard(aiocb); +        break; +    case QEMU_AIO_WRITE_ZEROES: +        ret = handle_aiocb_write_zeroes(aiocb); +        break; +    default: +        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); +        
ret = -EINVAL; +        break; +    } + +    g_slice_free(RawPosixAIOData, aiocb); +    return ret; +} + +static int paio_submit_co(BlockDriverState *bs, int fd, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        int type) +{ +    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); +    ThreadPool *pool; + +    acb->bs = bs; +    acb->aio_type = type; +    acb->aio_fildes = fd; + +    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; +    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + +    if (qiov) { +        acb->aio_iov = qiov->iov; +        acb->aio_niov = qiov->niov; +        assert(qiov->size == acb->aio_nbytes); +    } + +    trace_paio_submit_co(sector_num, nb_sectors, type); +    pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); +    return thread_pool_submit_co(pool, aio_worker, acb); +} + +static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque, int type) +{ +    RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); +    ThreadPool *pool; + +    acb->bs = bs; +    acb->aio_type = type; +    acb->aio_fildes = fd; + +    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; +    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + +    if (qiov) { +        acb->aio_iov = qiov->iov; +        acb->aio_niov = qiov->niov; +        assert(qiov->size == acb->aio_nbytes); +    } + +    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); +    pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); +    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); +} + +static BlockAIOCB *raw_aio_submit(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque, int type) +{ +    BDRVRawState *s = bs->opaque; + +    if (fd_open(bs) < 0) +        return NULL; + +    /* +     * Check if the underlying device requires requests to be aligned, +     * and if the request we are trying to submit is aligned or not. +     * If this is the case tell the low-level driver that it needs +     * to copy the buffer. 
+     */ +    if (s->needs_alignment) { +        if (!bdrv_qiov_is_aligned(bs, qiov)) { +            type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_AIO +        } else if (s->use_aio) { +            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, +                               nb_sectors, cb, opaque, type); +#endif +        } +    } + +    return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors, +                       cb, opaque, type); +} + +static void raw_aio_plug(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO +    BDRVRawState *s = bs->opaque; +    if (s->use_aio) { +        laio_io_plug(bs, s->aio_ctx); +    } +#endif +} + +static void raw_aio_unplug(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO +    BDRVRawState *s = bs->opaque; +    if (s->use_aio) { +        laio_io_unplug(bs, s->aio_ctx, true); +    } +#endif +} + +static void raw_aio_flush_io_queue(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO +    BDRVRawState *s = bs->opaque; +    if (s->use_aio) { +        laio_io_unplug(bs, s->aio_ctx, false); +    } +#endif +} + +static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    return raw_aio_submit(bs, sector_num, qiov, nb_sectors, +                          cb, opaque, QEMU_AIO_READ); +} + +static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque) +{ +    return raw_aio_submit(bs, sector_num, qiov, nb_sectors, +                          cb, opaque, QEMU_AIO_WRITE); +} + +static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, +        BlockCompletionFunc *cb, void *opaque) +{ +    BDRVRawState *s = bs->opaque; + +    if (fd_open(bs) < 0) +        return NULL; + +    return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); +} + +static void raw_close(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; + +    raw_detach_aio_context(bs); + +#ifdef CONFIG_LINUX_AIO +    if (s->use_aio) { +        laio_cleanup(s->aio_ctx); +    } +#endif +    if (s->fd >= 0) { +        qemu_close(s->fd); +        s->fd = -1; +    } +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ +    BDRVRawState *s = bs->opaque; +    struct stat st; + +    if (fstat(s->fd, &st)) { +        return -errno; +    } + +    if (S_ISREG(st.st_mode)) { +        if (ftruncate(s->fd, offset) < 0) { +            return -errno; +        } +    } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { +       if (offset > raw_getlength(bs)) { +           return -EINVAL; +       } +    } else { +        return -ENOTSUP; +    } + +    return 0; +} + +#ifdef __OpenBSD__ +static int64_t raw_getlength(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; +    int fd = s->fd; +    struct stat st; + +    if (fstat(fd, &st)) +        return -errno; +    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { +        struct disklabel dl; + +        if (ioctl(fd, DIOCGDINFO, &dl)) +            return -errno; +        return (uint64_t)dl.d_secsize * +            dl.d_partitions[DISKPART(st.st_rdev)].p_size; +    } else +        return st.st_size; +} +#elif defined(__NetBSD__) +static int64_t raw_getlength(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; +    int fd = s->fd; +    struct stat st; + +    if (fstat(fd, &st)) +        return -errno; +    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { +        struct dkwedge_info dkw; 
+
+        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
+            return dkw.dkw_size * 512;
+        } else {
+            struct disklabel dl;
+
+            if (ioctl(fd, DIOCGDINFO, &dl))
+                return -errno;
+            return (uint64_t)dl.d_secsize *
+                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
+        }
+    } else
+        return st.st_size;
+}
+#elif defined(__sun__)
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    struct dk_minfo minfo;
+    int ret;
+    int64_t size;
+
+    ret = fd_open(bs);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /*
+     * Use the DKIOCGMEDIAINFO ioctl to read the size.
+     */
+    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
+    if (ret != -1) {
+        return minfo.dki_lbsize * minfo.dki_capacity;
+    }
+
+    /*
+     * There are reports that lseek on some devices fails, but
+     * irc discussion said that contingency on contingency was overkill.
+     */
+    size = lseek(s->fd, 0, SEEK_END);
+    if (size < 0) {
+        return -errno;
+    }
+    return size;
+}
+#elif defined(CONFIG_BSD)
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int fd = s->fd;
+    int64_t size;
+    struct stat sb;
+#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
+    int reopened = 0;
+#endif
+    int ret;
+
+    ret = fd_open(bs);
+    if (ret < 0)
+        return ret;
+
+#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
+again:
+#endif
+    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
+#ifdef DIOCGMEDIASIZE
+	if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
+#elif defined(DIOCGPART)
+        {
+                struct partinfo pi;
+                if (ioctl(fd, DIOCGPART, &pi) == 0)
+                        size = pi.media_size;
+                else
+                        size = 0;
+        }
+        if (size == 0)
+#endif
+#if defined(__APPLE__) && defined(__MACH__)
+        {
+            uint64_t sectors = 0;
+            uint32_t sector_size = 0;
+
+            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
+               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
+                size = sectors * sector_size;
+            } else {
+                size = lseek(fd, 0LL, SEEK_END);
+                if (size < 0) {
+                    return -errno;
+                }
+            }
+        }
+#else
+        size = lseek(fd, 0LL, SEEK_END);
+        if (size < 0) {
+            return -errno;
+        }
+#endif
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+        switch(s->type) {
+        case FTYPE_CD:
+            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
+            if (size == 2048LL * (unsigned)-1)
+                size = 0;
+            /* XXX no disc?  maybe we need to reopen...
*/ +            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) { +                reopened = 1; +                goto again; +            } +        } +#endif +    } else { +        size = lseek(fd, 0, SEEK_END); +        if (size < 0) { +            return -errno; +        } +    } +    return size; +} +#else +static int64_t raw_getlength(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; +    int ret; +    int64_t size; + +    ret = fd_open(bs); +    if (ret < 0) { +        return ret; +    } + +    size = lseek(s->fd, 0, SEEK_END); +    if (size < 0) { +        return -errno; +    } +    return size; +} +#endif + +static int64_t raw_get_allocated_file_size(BlockDriverState *bs) +{ +    struct stat st; +    BDRVRawState *s = bs->opaque; + +    if (fstat(s->fd, &st) < 0) { +        return -errno; +    } +    return (int64_t)st.st_blocks * 512; +} + +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int fd; +    int result = 0; +    int64_t total_size = 0; +    bool nocow = false; +    PreallocMode prealloc; +    char *buf = NULL; +    Error *local_err = NULL; + +    strstart(filename, "file:", &filename); + +    /* Read out options */ +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); +    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); +    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); +    prealloc = qapi_enum_parse(PreallocMode_lookup, buf, +                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, +                               &local_err); +    g_free(buf); +    if (local_err) { +        error_propagate(errp, local_err); +        result = -EINVAL; +        goto out; +    } + +    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, +                   0644); +    if (fd < 0) { +        result = -errno; +        error_setg_errno(errp, -result, "Could not create file"); +        goto out; +    } + +    if (nocow) { +#ifdef __linux__ +        /* Set NOCOW flag to solve performance issue on fs like btrfs. +         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value +         * will be ignored since any failure of this operation should not +         * block the left work. +         */ +        int attr; +        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { +            attr |= FS_NOCOW_FL; +            ioctl(fd, FS_IOC_SETFLAGS, &attr); +        } +#endif +    } + +    if (ftruncate(fd, total_size) != 0) { +        result = -errno; +        error_setg_errno(errp, -result, "Could not resize file"); +        goto out_close; +    } + +    switch (prealloc) { +#ifdef CONFIG_POSIX_FALLOCATE +    case PREALLOC_MODE_FALLOC: +        /* posix_fallocate() doesn't set errno. 
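(A brief aside; the preallocation switch resumes just below.) The NOCOW branch of raw_create() above marks the freshly created image with FS_NOCOW_FL so that btrfs stops doing copy-on-write for it. A self-contained version of that best-effort marking might look like the sketch below; it is Linux-only, and on btrfs the flag only takes effect while the file is still empty, which is why the driver sets it immediately after creation:

    #include <sys/ioctl.h>
    #ifdef __linux__
    #include <linux/fs.h>

    /* Best-effort, mirroring the driver above: failures are deliberately
     * ignored because a missing NOCOW flag is only a performance issue. */
    static void mark_nocow(int fd)
    {
        int attr;

        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
            attr |= FS_NOCOW_FL;
            ioctl(fd, FS_IOC_SETFLAGS, &attr);
        }
    }
    #endif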
*/ +        result = -posix_fallocate(fd, 0, total_size); +        if (result != 0) { +            error_setg_errno(errp, -result, +                             "Could not preallocate data for the new file"); +        } +        break; +#endif +    case PREALLOC_MODE_FULL: +    { +        int64_t num = 0, left = total_size; +        buf = g_malloc0(65536); + +        while (left > 0) { +            num = MIN(left, 65536); +            result = write(fd, buf, num); +            if (result < 0) { +                result = -errno; +                error_setg_errno(errp, -result, +                                 "Could not write to the new file"); +                break; +            } +            left -= result; +        } +        if (result >= 0) { +            result = fsync(fd); +            if (result < 0) { +                result = -errno; +                error_setg_errno(errp, -result, +                                 "Could not flush new file to disk"); +            } +        } +        g_free(buf); +        break; +    } +    case PREALLOC_MODE_OFF: +        break; +    default: +        result = -EINVAL; +        error_setg(errp, "Unsupported preallocation mode: %s", +                   PreallocMode_lookup[prealloc]); +        break; +    } + +out_close: +    if (qemu_close(fd) != 0 && result == 0) { +        result = -errno; +        error_setg_errno(errp, -result, "Could not close the new file"); +    } +out: +    return result; +} + +/* + * Find allocation range in @bs around offset @start. + * May change underlying file descriptor's file offset. + * If @start is not in a hole, store @start in @data, and the + * beginning of the next hole in @hole, and return 0. + * If @start is in a non-trailing hole, store @start in @hole and the + * beginning of the next non-hole in @data, and return 0. + * If @start is in a trailing hole or beyond EOF, return -ENXIO. + * If we can't find out, return a negative errno other than -ENXIO. + */ +static int find_allocation(BlockDriverState *bs, off_t start, +                           off_t *data, off_t *hole) +{ +#if defined SEEK_HOLE && defined SEEK_DATA +    BDRVRawState *s = bs->opaque; +    off_t offs; + +    /* +     * SEEK_DATA cases: +     * D1. offs == start: start is in data +     * D2. offs > start: start is in a hole, next data at offs +     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole +     *                              or start is beyond EOF +     *     If the latter happens, the file has been truncated behind +     *     our back since we opened it.  All bets are off then. +     *     Treating like a trailing hole is simplest. +     * D4. offs < 0, errno != ENXIO: we learned nothing +     */ +    offs = lseek(s->fd, start, SEEK_DATA); +    if (offs < 0) { +        return -errno;          /* D3 or D4 */ +    } +    assert(offs >= start); + +    if (offs > start) { +        /* D2: in hole, next data at offs */ +        *hole = start; +        *data = offs; +        return 0; +    } + +    /* D1: in data, end not yet known */ + +    /* +     * SEEK_HOLE cases: +     * H1. offs == start: start is in a hole +     *     If this happens here, a hole has been dug behind our back +     *     since the previous lseek(). +     * H2. offs > start: either start is in data, next hole at offs, +     *                   or start is in trailing hole, EOF at offs +     *     Linux treats trailing holes like any other hole: offs == +     *     start.  Solaris seeks to EOF instead: offs > start (blech). 
+     *     If that happens here, a hole has been dug behind our back +     *     since the previous lseek(). +     * H3. offs < 0, errno = ENXIO: start is beyond EOF +     *     If this happens, the file has been truncated behind our +     *     back since we opened it.  Treat it like a trailing hole. +     * H4. offs < 0, errno != ENXIO: we learned nothing +     *     Pretend we know nothing at all, i.e. "forget" about D1. +     */ +    offs = lseek(s->fd, start, SEEK_HOLE); +    if (offs < 0) { +        return -errno;          /* D1 and (H3 or H4) */ +    } +    assert(offs >= start); + +    if (offs > start) { +        /* +         * D1 and H2: either in data, next hole at offs, or it was in +         * data but is now in a trailing hole.  In the latter case, +         * all bets are off.  Treating it as if it there was data all +         * the way to EOF is safe, so simply do that. +         */ +        *data = start; +        *hole = offs; +        return 0; +    } + +    /* D1 and H1 */ +    return -EBUSY; +#else +    return -ENOTSUP; +#endif +} + +/* + * Returns the allocation status of the specified sectors. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes + * beyond the end of the disk image it will be clamped. + */ +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, +                                                    int64_t sector_num, +                                                    int nb_sectors, int *pnum) +{ +    off_t start, data = 0, hole = 0; +    int64_t total_size; +    int ret; + +    ret = fd_open(bs); +    if (ret < 0) { +        return ret; +    } + +    start = sector_num * BDRV_SECTOR_SIZE; +    total_size = bdrv_getlength(bs); +    if (total_size < 0) { +        return total_size; +    } else if (start >= total_size) { +        *pnum = 0; +        return 0; +    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { +        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); +    } + +    ret = find_allocation(bs, start, &data, &hole); +    if (ret == -ENXIO) { +        /* Trailing hole */ +        *pnum = nb_sectors; +        ret = BDRV_BLOCK_ZERO; +    } else if (ret < 0) { +        /* No info available, so pretend there are no holes */ +        *pnum = nb_sectors; +        ret = BDRV_BLOCK_DATA; +    } else if (data == start) { +        /* On a data extent, compute sectors to the end of the extent, +         * possibly including a partial sector at EOF. */ +        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); +        ret = BDRV_BLOCK_DATA; +    } else { +        /* On a hole, compute sectors to the beginning of the next extent.  
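(Editorial aside; the tail of raw_co_get_block_status() continues right below.) The D1..D4 and H1..H4 case analysis above is easier to follow with the primitives in isolation. The standalone sketch below, an illustration rather than part of the patch, walks a file's data extents with the same lseek() calls find_allocation() relies on; filesystems without hole support simply fail the first SEEK_DATA:

    #define _GNU_SOURCE         /* SEEK_DATA / SEEK_HOLE on glibc */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static int map_extents(int fd)
    {
        off_t pos = 0;

        for (;;) {
            off_t data = lseek(fd, pos, SEEK_DATA);
            if (data < 0) {
                /* ENXIO: pos is in a trailing hole or past EOF, we are done. */
                return errno == ENXIO ? 0 : -errno;
            }
            off_t hole = lseek(fd, data, SEEK_HOLE);
            if (hole < 0) {
                return -errno;
            }
            printf("data %lld..%lld\n", (long long)data, (long long)hole);
            if (hole <= data) {
                return 0;   /* file changed under us; stop rather than loop */
            }
            pos = hole;
        }
    }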
*/ +        assert(hole == start); +        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); +        ret = BDRV_BLOCK_ZERO; +    } +    return ret | BDRV_BLOCK_OFFSET_VALID | start; +} + +static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, +    BlockCompletionFunc *cb, void *opaque) +{ +    BDRVRawState *s = bs->opaque; + +    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, +                       cb, opaque, QEMU_AIO_DISCARD); +} + +static int coroutine_fn raw_co_write_zeroes( +    BlockDriverState *bs, int64_t sector_num, +    int nb_sectors, BdrvRequestFlags flags) +{ +    BDRVRawState *s = bs->opaque; + +    if (!(flags & BDRV_REQ_MAY_UNMAP)) { +        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, +                              QEMU_AIO_WRITE_ZEROES); +    } else if (s->discard_zeroes) { +        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, +                              QEMU_AIO_DISCARD); +    } +    return -ENOTSUP; +} + +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVRawState *s = bs->opaque; + +    bdi->unallocated_blocks_are_zero = s->discard_zeroes; +    bdi->can_write_zeroes_with_unmap = s->discard_zeroes; +    return 0; +} + +static QemuOptsList raw_create_opts = { +    .name = "raw-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_NOCOW, +            .type = QEMU_OPT_BOOL, +            .help = "Turn off copy-on-write (valid only on btrfs)" +        }, +        { +            .name = BLOCK_OPT_PREALLOC, +            .type = QEMU_OPT_STRING, +            .help = "Preallocation mode (allowed values: off, falloc, full)" +        }, +        { /* end of list */ } +    } +}; + +BlockDriver bdrv_file = { +    .format_name = "file", +    .protocol_name = "file", +    .instance_size = sizeof(BDRVRawState), +    .bdrv_needs_filename = true, +    .bdrv_probe = NULL, /* no probe for protocols */ +    .bdrv_parse_filename = raw_parse_filename, +    .bdrv_file_open = raw_open, +    .bdrv_reopen_prepare = raw_reopen_prepare, +    .bdrv_reopen_commit = raw_reopen_commit, +    .bdrv_reopen_abort = raw_reopen_abort, +    .bdrv_close = raw_close, +    .bdrv_create = raw_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, +    .bdrv_co_get_block_status = raw_co_get_block_status, +    .bdrv_co_write_zeroes = raw_co_write_zeroes, + +    .bdrv_aio_readv = raw_aio_readv, +    .bdrv_aio_writev = raw_aio_writev, +    .bdrv_aio_flush = raw_aio_flush, +    .bdrv_aio_discard = raw_aio_discard, +    .bdrv_refresh_limits = raw_refresh_limits, +    .bdrv_io_plug = raw_aio_plug, +    .bdrv_io_unplug = raw_aio_unplug, +    .bdrv_flush_io_queue = raw_aio_flush_io_queue, + +    .bdrv_truncate = raw_truncate, +    .bdrv_getlength = raw_getlength, +    .bdrv_get_info = raw_get_info, +    .bdrv_get_allocated_file_size +                        = raw_get_allocated_file_size, + +    .bdrv_detach_aio_context = raw_detach_aio_context, +    .bdrv_attach_aio_context = raw_attach_aio_context, + +    .create_opts = &raw_create_opts, +}; + +/***********************************************/ +/* host device */ + +#if defined(__APPLE__) && defined(__MACH__) +static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ); +static kern_return_t 
GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize ); + +kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ) +{ +    kern_return_t       kernResult; +    mach_port_t     masterPort; +    CFMutableDictionaryRef  classesToMatch; + +    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); +    if ( KERN_SUCCESS != kernResult ) { +        printf( "IOMasterPort returned %d\n", kernResult ); +    } + +    classesToMatch = IOServiceMatching( kIOCDMediaClass ); +    if ( classesToMatch == NULL ) { +        printf( "IOServiceMatching returned a NULL dictionary.\n" ); +    } else { +    CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue ); +    } +    kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator ); +    if ( KERN_SUCCESS != kernResult ) +    { +        printf( "IOServiceGetMatchingServices returned %d\n", kernResult ); +    } + +    return kernResult; +} + +kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize ) +{ +    io_object_t     nextMedia; +    kern_return_t   kernResult = KERN_FAILURE; +    *bsdPath = '\0'; +    nextMedia = IOIteratorNext( mediaIterator ); +    if ( nextMedia ) +    { +        CFTypeRef   bsdPathAsCFString; +    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); +        if ( bsdPathAsCFString ) { +            size_t devPathLength; +            strcpy( bsdPath, _PATH_DEV ); +            strcat( bsdPath, "r" ); +            devPathLength = strlen( bsdPath ); +            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { +                kernResult = KERN_SUCCESS; +            } +            CFRelease( bsdPathAsCFString ); +        } +        IOObjectRelease( nextMedia ); +    } + +    return kernResult; +} + +#endif + +static int hdev_probe_device(const char *filename) +{ +    struct stat st; + +    /* allow a dedicated CD-ROM driver to match with a higher priority */ +    if (strstart(filename, "/dev/cdrom", NULL)) +        return 50; + +    if (stat(filename, &st) >= 0 && +            (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { +        return 100; +    } + +    return 0; +} + +static int check_hdev_writable(BDRVRawState *s) +{ +#if defined(BLKROGET) +    /* Linux block devices can be configured "read-only" using blockdev(8). +     * This is independent of device node permissions and therefore open(2) +     * with O_RDWR succeeds.  Actual writes fail with EPERM. +     * +     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly +     * check for read-only block devices so that Linux block devices behave +     * properly. +     */ +    struct stat st; +    int readonly = 0; + +    if (fstat(s->fd, &st)) { +        return -errno; +    } + +    if (!S_ISBLK(st.st_mode)) { +        return 0; +    } + +    if (ioctl(s->fd, BLKROGET, &readonly) < 0) { +        return -errno; +    } + +    if (readonly) { +        return -EACCES; +    } +#endif /* defined(BLKROGET) */ +    return 0; +} + +static void hdev_parse_filename(const char *filename, QDict *options, +                                Error **errp) +{ +    /* The prefix is optional, just as for "file". 
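(Aside on check_hdev_writable() above; hdev_parse_filename() continues just below.) The check exists because "blockdev --setro" makes a Linux block device reject writes with EPERM even though open(O_RDWR) succeeds, so the read-only state has to be queried explicitly. A minimal standalone query of that state, as an illustrative sketch rather than driver code:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(int argc, char **argv)
    {
        int ro = 0;
        int fd;

        if (argc < 2) {
            fprintf(stderr, "usage: %s /dev/sdX\n", argv[0]);
            return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, BLKROGET, &ro) < 0) {
            perror(argv[1]);
            return 1;
        }
        printf("%s is %s\n", argv[1], ro ? "read-only" : "writable");
        close(fd);
        return 0;
    }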
*/ +    strstart(filename, "host_device:", &filename); + +    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static bool hdev_is_sg(BlockDriverState *bs) +{ + +#if defined(__linux__) + +    struct stat st; +    struct sg_scsi_id scsiid; +    int sg_version; + +    if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) && +        !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) && +        !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) { +        DPRINTF("SG device found: type=%d, version=%d\n", +            scsiid.scsi_type, sg_version); +        return true; +    } + +#endif + +    return false; +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, +                     Error **errp) +{ +    BDRVRawState *s = bs->opaque; +    Error *local_err = NULL; +    int ret; + +#if defined(__APPLE__) && defined(__MACH__) +    const char *filename = qdict_get_str(options, "filename"); + +    if (strstart(filename, "/dev/cdrom", NULL)) { +        kern_return_t kernResult; +        io_iterator_t mediaIterator; +        char bsdPath[ MAXPATHLEN ]; +        int fd; + +        kernResult = FindEjectableCDMedia( &mediaIterator ); +        kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) ); + +        if ( bsdPath[ 0 ] != '\0' ) { +            strcat(bsdPath,"s0"); +            /* some CDs don't have a partition 0 */ +            fd = qemu_open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE); +            if (fd < 0) { +                bsdPath[strlen(bsdPath)-1] = '1'; +            } else { +                qemu_close(fd); +            } +            filename = bsdPath; +            qdict_put(options, "filename", qstring_from_str(filename)); +        } + +        if ( mediaIterator ) +            IOObjectRelease( mediaIterator ); +    } +#endif + +    s->type = FTYPE_FILE; + +    ret = raw_open_common(bs, options, flags, 0, &local_err); +    if (ret < 0) { +        if (local_err) { +            error_propagate(errp, local_err); +        } +        return ret; +    } + +    /* Since this does ioctl the device must be already opened */ +    bs->sg = hdev_is_sg(bs); + +    if (flags & BDRV_O_RDWR) { +        ret = check_hdev_writable(s); +        if (ret < 0) { +            raw_close(bs); +            error_setg_errno(errp, -ret, "The device is not writable"); +            return ret; +        } +    } + +    return ret; +} + +#if defined(__linux__) +/* Note: we do not have a reliable method to detect if the floppy is +   present. The current method is to try to open the floppy at every +   I/O and to keep it opened during a few hundreds of ms. 
*/ +static int fd_open(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; +    int last_media_present; + +    if (s->type != FTYPE_FD) +        return 0; +    last_media_present = (s->fd >= 0); +    if (s->fd >= 0 && +        (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { +        qemu_close(s->fd); +        s->fd = -1; +        DPRINTF("Floppy closed\n"); +    } +    if (s->fd < 0) { +        if (s->fd_got_error && +            (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_error_time) < FD_OPEN_TIMEOUT) { +            DPRINTF("No floppy (open delayed)\n"); +            return -EIO; +        } +        s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK); +        if (s->fd < 0) { +            s->fd_error_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +            s->fd_got_error = 1; +            if (last_media_present) +                s->fd_media_changed = 1; +            DPRINTF("No floppy\n"); +            return -EIO; +        } +        DPRINTF("Floppy opened\n"); +    } +    if (!last_media_present) +        s->fd_media_changed = 1; +    s->fd_open_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +    s->fd_got_error = 0; +    return 0; +} + +static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ +    BDRVRawState *s = bs->opaque; + +    return ioctl(s->fd, req, buf); +} + +static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, +        unsigned long int req, void *buf, +        BlockCompletionFunc *cb, void *opaque) +{ +    BDRVRawState *s = bs->opaque; +    RawPosixAIOData *acb; +    ThreadPool *pool; + +    if (fd_open(bs) < 0) +        return NULL; + +    acb = g_slice_new(RawPosixAIOData); +    acb->bs = bs; +    acb->aio_type = QEMU_AIO_IOCTL; +    acb->aio_fildes = s->fd; +    acb->aio_offset = 0; +    acb->aio_ioctl_buf = buf; +    acb->aio_ioctl_cmd = req; +    pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); +    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); +} + +#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static int fd_open(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; + +    /* this is just to ensure s->fd is sane (its called by io ops) */ +    if (s->fd >= 0) +        return 0; +    return -EIO; +} +#else /* !linux && !FreeBSD */ + +static int fd_open(BlockDriverState *bs) +{ +    return 0; +} + +#endif /* !linux && !FreeBSD */ + +static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, +    BlockCompletionFunc *cb, void *opaque) +{ +    BDRVRawState *s = bs->opaque; + +    if (fd_open(bs) < 0) { +        return NULL; +    } +    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, +                       cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); +} + +static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, +    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ +    BDRVRawState *s = bs->opaque; +    int rc; + +    rc = fd_open(bs); +    if (rc < 0) { +        return rc; +    } +    if (!(flags & BDRV_REQ_MAY_UNMAP)) { +        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, +                              QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); +    } else if (s->discard_zeroes) { +        return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, +                              QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); +    } +    return -ENOTSUP; +} + +static int hdev_create(const char *filename, QemuOpts *opts, +                       
Error **errp) +{ +    int fd; +    int ret = 0; +    struct stat stat_buf; +    int64_t total_size = 0; +    bool has_prefix; + +    /* This function is used by all three protocol block drivers and therefore +     * any of these three prefixes may be given. +     * The return value has to be stored somewhere, otherwise this is an error +     * due to -Werror=unused-value. */ +    has_prefix = +        strstart(filename, "host_device:", &filename) || +        strstart(filename, "host_cdrom:" , &filename) || +        strstart(filename, "host_floppy:", &filename); + +    (void)has_prefix; + +    /* Read out options */ +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); + +    fd = qemu_open(filename, O_WRONLY | O_BINARY); +    if (fd < 0) { +        ret = -errno; +        error_setg_errno(errp, -ret, "Could not open device"); +        return ret; +    } + +    if (fstat(fd, &stat_buf) < 0) { +        ret = -errno; +        error_setg_errno(errp, -ret, "Could not stat device"); +    } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { +        error_setg(errp, +                   "The given file is neither a block nor a character device"); +        ret = -ENODEV; +    } else if (lseek(fd, 0, SEEK_END) < total_size) { +        error_setg(errp, "Device is too small"); +        ret = -ENOSPC; +    } + +    qemu_close(fd); +    return ret; +} + +static BlockDriver bdrv_host_device = { +    .format_name        = "host_device", +    .protocol_name        = "host_device", +    .instance_size      = sizeof(BDRVRawState), +    .bdrv_needs_filename = true, +    .bdrv_probe_device  = hdev_probe_device, +    .bdrv_parse_filename = hdev_parse_filename, +    .bdrv_file_open     = hdev_open, +    .bdrv_close         = raw_close, +    .bdrv_reopen_prepare = raw_reopen_prepare, +    .bdrv_reopen_commit  = raw_reopen_commit, +    .bdrv_reopen_abort   = raw_reopen_abort, +    .bdrv_create         = hdev_create, +    .create_opts         = &raw_create_opts, +    .bdrv_co_write_zeroes = hdev_co_write_zeroes, + +    .bdrv_aio_readv	= raw_aio_readv, +    .bdrv_aio_writev	= raw_aio_writev, +    .bdrv_aio_flush	= raw_aio_flush, +    .bdrv_aio_discard   = hdev_aio_discard, +    .bdrv_refresh_limits = raw_refresh_limits, +    .bdrv_io_plug = raw_aio_plug, +    .bdrv_io_unplug = raw_aio_unplug, +    .bdrv_flush_io_queue = raw_aio_flush_io_queue, + +    .bdrv_truncate      = raw_truncate, +    .bdrv_getlength	= raw_getlength, +    .bdrv_get_info = raw_get_info, +    .bdrv_get_allocated_file_size +                        = raw_get_allocated_file_size, +    .bdrv_probe_blocksizes = hdev_probe_blocksizes, +    .bdrv_probe_geometry = hdev_probe_geometry, + +    .bdrv_detach_aio_context = raw_detach_aio_context, +    .bdrv_attach_aio_context = raw_attach_aio_context, + +    /* generic scsi device */ +#ifdef __linux__ +    .bdrv_ioctl         = hdev_ioctl, +    .bdrv_aio_ioctl     = hdev_aio_ioctl, +#endif +}; + +#ifdef __linux__ +static void floppy_parse_filename(const char *filename, QDict *options, +                                  Error **errp) +{ +    /* The prefix is optional, just as for "file". 
*/ +    strstart(filename, "host_floppy:", &filename); + +    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int floppy_open(BlockDriverState *bs, QDict *options, int flags, +                       Error **errp) +{ +    BDRVRawState *s = bs->opaque; +    Error *local_err = NULL; +    int ret; + +    s->type = FTYPE_FD; + +    /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */ +    ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); +    if (ret) { +        if (local_err) { +            error_propagate(errp, local_err); +        } +        return ret; +    } + +    /* close fd so that we can reopen it as needed */ +    qemu_close(s->fd); +    s->fd = -1; +    s->fd_media_changed = 1; + +    error_report("Host floppy pass-through is deprecated"); +    error_printf("Support for it will be removed in a future release.\n"); +    return 0; +} + +static int floppy_probe_device(const char *filename) +{ +    int fd, ret; +    int prio = 0; +    struct floppy_struct fdparam; +    struct stat st; + +    if (strstart(filename, "/dev/fd", NULL) && +        !strstart(filename, "/dev/fdset/", NULL) && +        !strstart(filename, "/dev/fd/", NULL)) { +        prio = 50; +    } + +    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); +    if (fd < 0) { +        goto out; +    } +    ret = fstat(fd, &st); +    if (ret == -1 || !S_ISBLK(st.st_mode)) { +        goto outc; +    } + +    /* Attempt to detect via a floppy specific ioctl */ +    ret = ioctl(fd, FDGETPRM, &fdparam); +    if (ret >= 0) +        prio = 100; + +outc: +    qemu_close(fd); +out: +    return prio; +} + + +static int floppy_is_inserted(BlockDriverState *bs) +{ +    return fd_open(bs) >= 0; +} + +static int floppy_media_changed(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; +    int ret; + +    /* +     * XXX: we do not have a true media changed indication. +     * It does not work if the floppy is changed without trying to read it. 
+     */ +    fd_open(bs); +    ret = s->fd_media_changed; +    s->fd_media_changed = 0; +    DPRINTF("Floppy changed=%d\n", ret); +    return ret; +} + +static void floppy_eject(BlockDriverState *bs, bool eject_flag) +{ +    BDRVRawState *s = bs->opaque; +    int fd; + +    if (s->fd >= 0) { +        qemu_close(s->fd); +        s->fd = -1; +    } +    fd = qemu_open(bs->filename, s->open_flags | O_NONBLOCK); +    if (fd >= 0) { +        if (ioctl(fd, FDEJECT, 0) < 0) +            perror("FDEJECT"); +        qemu_close(fd); +    } +} + +static BlockDriver bdrv_host_floppy = { +    .format_name        = "host_floppy", +    .protocol_name      = "host_floppy", +    .instance_size      = sizeof(BDRVRawState), +    .bdrv_needs_filename = true, +    .bdrv_probe_device	= floppy_probe_device, +    .bdrv_parse_filename = floppy_parse_filename, +    .bdrv_file_open     = floppy_open, +    .bdrv_close         = raw_close, +    .bdrv_reopen_prepare = raw_reopen_prepare, +    .bdrv_reopen_commit  = raw_reopen_commit, +    .bdrv_reopen_abort   = raw_reopen_abort, +    .bdrv_create         = hdev_create, +    .create_opts         = &raw_create_opts, + +    .bdrv_aio_readv     = raw_aio_readv, +    .bdrv_aio_writev    = raw_aio_writev, +    .bdrv_aio_flush	= raw_aio_flush, +    .bdrv_refresh_limits = raw_refresh_limits, +    .bdrv_io_plug = raw_aio_plug, +    .bdrv_io_unplug = raw_aio_unplug, +    .bdrv_flush_io_queue = raw_aio_flush_io_queue, + +    .bdrv_truncate      = raw_truncate, +    .bdrv_getlength      = raw_getlength, +    .has_variable_length = true, +    .bdrv_get_allocated_file_size +                        = raw_get_allocated_file_size, + +    .bdrv_detach_aio_context = raw_detach_aio_context, +    .bdrv_attach_aio_context = raw_attach_aio_context, + +    /* removable device support */ +    .bdrv_is_inserted   = floppy_is_inserted, +    .bdrv_media_changed = floppy_media_changed, +    .bdrv_eject         = floppy_eject, +}; +#endif + +#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static void cdrom_parse_filename(const char *filename, QDict *options, +                                 Error **errp) +{ +    /* The prefix is optional, just as for "file". 
*/ +    strstart(filename, "host_cdrom:", &filename); + +    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} +#endif + +#ifdef __linux__ +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, +                      Error **errp) +{ +    BDRVRawState *s = bs->opaque; +    Error *local_err = NULL; +    int ret; + +    s->type = FTYPE_CD; + +    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ +    ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +    } +    return ret; +} + +static int cdrom_probe_device(const char *filename) +{ +    int fd, ret; +    int prio = 0; +    struct stat st; + +    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); +    if (fd < 0) { +        goto out; +    } +    ret = fstat(fd, &st); +    if (ret == -1 || !S_ISBLK(st.st_mode)) { +        goto outc; +    } + +    /* Attempt to detect via a CDROM specific ioctl */ +    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); +    if (ret >= 0) +        prio = 100; + +outc: +    qemu_close(fd); +out: +    return prio; +} + +static int cdrom_is_inserted(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; +    int ret; + +    ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); +    if (ret == CDS_DISC_OK) +        return 1; +    return 0; +} + +static void cdrom_eject(BlockDriverState *bs, bool eject_flag) +{ +    BDRVRawState *s = bs->opaque; + +    if (eject_flag) { +        if (ioctl(s->fd, CDROMEJECT, NULL) < 0) +            perror("CDROMEJECT"); +    } else { +        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) +            perror("CDROMEJECT"); +    } +} + +static void cdrom_lock_medium(BlockDriverState *bs, bool locked) +{ +    BDRVRawState *s = bs->opaque; + +    if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { +        /* +         * Note: an error can happen if the distribution automatically +         * mounts the CD-ROM +         */ +        /* perror("CDROM_LOCKDOOR"); */ +    } +} + +static BlockDriver bdrv_host_cdrom = { +    .format_name        = "host_cdrom", +    .protocol_name      = "host_cdrom", +    .instance_size      = sizeof(BDRVRawState), +    .bdrv_needs_filename = true, +    .bdrv_probe_device	= cdrom_probe_device, +    .bdrv_parse_filename = cdrom_parse_filename, +    .bdrv_file_open     = cdrom_open, +    .bdrv_close         = raw_close, +    .bdrv_reopen_prepare = raw_reopen_prepare, +    .bdrv_reopen_commit  = raw_reopen_commit, +    .bdrv_reopen_abort   = raw_reopen_abort, +    .bdrv_create         = hdev_create, +    .create_opts         = &raw_create_opts, + +    .bdrv_aio_readv     = raw_aio_readv, +    .bdrv_aio_writev    = raw_aio_writev, +    .bdrv_aio_flush	= raw_aio_flush, +    .bdrv_refresh_limits = raw_refresh_limits, +    .bdrv_io_plug = raw_aio_plug, +    .bdrv_io_unplug = raw_aio_unplug, +    .bdrv_flush_io_queue = raw_aio_flush_io_queue, + +    .bdrv_truncate      = raw_truncate, +    .bdrv_getlength      = raw_getlength, +    .has_variable_length = true, +    .bdrv_get_allocated_file_size +                        = raw_get_allocated_file_size, + +    .bdrv_detach_aio_context = raw_detach_aio_context, +    .bdrv_attach_aio_context = raw_attach_aio_context, + +    /* removable device support */ +    .bdrv_is_inserted   = cdrom_is_inserted, +    .bdrv_eject         = cdrom_eject, +    .bdrv_lock_medium   = cdrom_lock_medium, + +    /* generic scsi device */ +    .bdrv_ioctl         = hdev_ioctl, +    .bdrv_aio_ioctl     = 
hdev_aio_ioctl, +}; +#endif /* __linux__ */ + +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, +                      Error **errp) +{ +    BDRVRawState *s = bs->opaque; +    Error *local_err = NULL; +    int ret; + +    s->type = FTYPE_CD; + +    ret = raw_open_common(bs, options, flags, 0, &local_err); +    if (ret) { +        if (local_err) { +            error_propagate(errp, local_err); +        } +        return ret; +    } + +    /* make sure the door isn't locked at this time */ +    ioctl(s->fd, CDIOCALLOW); +    return 0; +} + +static int cdrom_probe_device(const char *filename) +{ +    if (strstart(filename, "/dev/cd", NULL) || +            strstart(filename, "/dev/acd", NULL)) +        return 100; +    return 0; +} + +static int cdrom_reopen(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; +    int fd; + +    /* +     * Force reread of possibly changed/newly loaded disc, +     * FreeBSD seems to not notice sometimes... +     */ +    if (s->fd >= 0) +        qemu_close(s->fd); +    fd = qemu_open(bs->filename, s->open_flags, 0644); +    if (fd < 0) { +        s->fd = -1; +        return -EIO; +    } +    s->fd = fd; + +    /* make sure the door isn't locked at this time */ +    ioctl(s->fd, CDIOCALLOW); +    return 0; +} + +static int cdrom_is_inserted(BlockDriverState *bs) +{ +    return raw_getlength(bs) > 0; +} + +static void cdrom_eject(BlockDriverState *bs, bool eject_flag) +{ +    BDRVRawState *s = bs->opaque; + +    if (s->fd < 0) +        return; + +    (void) ioctl(s->fd, CDIOCALLOW); + +    if (eject_flag) { +        if (ioctl(s->fd, CDIOCEJECT) < 0) +            perror("CDIOCEJECT"); +    } else { +        if (ioctl(s->fd, CDIOCCLOSE) < 0) +            perror("CDIOCCLOSE"); +    } + +    cdrom_reopen(bs); +} + +static void cdrom_lock_medium(BlockDriverState *bs, bool locked) +{ +    BDRVRawState *s = bs->opaque; + +    if (s->fd < 0) +        return; +    if (ioctl(s->fd, (locked ? 
CDIOCPREVENT : CDIOCALLOW)) < 0) { +        /* +         * Note: an error can happen if the distribution automatically +         * mounts the CD-ROM +         */ +        /* perror("CDROM_LOCKDOOR"); */ +    } +} + +static BlockDriver bdrv_host_cdrom = { +    .format_name        = "host_cdrom", +    .protocol_name      = "host_cdrom", +    .instance_size      = sizeof(BDRVRawState), +    .bdrv_needs_filename = true, +    .bdrv_probe_device	= cdrom_probe_device, +    .bdrv_parse_filename = cdrom_parse_filename, +    .bdrv_file_open     = cdrom_open, +    .bdrv_close         = raw_close, +    .bdrv_reopen_prepare = raw_reopen_prepare, +    .bdrv_reopen_commit  = raw_reopen_commit, +    .bdrv_reopen_abort   = raw_reopen_abort, +    .bdrv_create        = hdev_create, +    .create_opts        = &raw_create_opts, + +    .bdrv_aio_readv     = raw_aio_readv, +    .bdrv_aio_writev    = raw_aio_writev, +    .bdrv_aio_flush	= raw_aio_flush, +    .bdrv_refresh_limits = raw_refresh_limits, +    .bdrv_io_plug = raw_aio_plug, +    .bdrv_io_unplug = raw_aio_unplug, +    .bdrv_flush_io_queue = raw_aio_flush_io_queue, + +    .bdrv_truncate      = raw_truncate, +    .bdrv_getlength      = raw_getlength, +    .has_variable_length = true, +    .bdrv_get_allocated_file_size +                        = raw_get_allocated_file_size, + +    .bdrv_detach_aio_context = raw_detach_aio_context, +    .bdrv_attach_aio_context = raw_attach_aio_context, + +    /* removable device support */ +    .bdrv_is_inserted   = cdrom_is_inserted, +    .bdrv_eject         = cdrom_eject, +    .bdrv_lock_medium   = cdrom_lock_medium, +}; +#endif /* __FreeBSD__ */ + +static void bdrv_file_init(void) +{ +    /* +     * Register all the drivers.  Note that order is important, the driver +     * registered last will get probed first. +     */ +    bdrv_register(&bdrv_file); +    bdrv_register(&bdrv_host_device); +#ifdef __linux__ +    bdrv_register(&bdrv_host_floppy); +    bdrv_register(&bdrv_host_cdrom); +#endif +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +    bdrv_register(&bdrv_host_cdrom); +#endif +} + +block_init(bdrv_file_init); diff --git a/block/raw-win32.c b/block/raw-win32.c new file mode 100644 index 00000000..68f2338a --- /dev/null +++ b/block/raw-win32.c @@ -0,0 +1,729 @@ +/* + * Block driver for RAW files (win32) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "qemu/timer.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "raw-aio.h" +#include "trace.h" +#include "block/thread-pool.h" +#include "qemu/iov.h" +#include "qapi/qmp/qstring.h" +#include <windows.h> +#include <winioctl.h> + +#define FTYPE_FILE 0 +#define FTYPE_CD     1 +#define FTYPE_HARDDISK 2 + +typedef struct RawWin32AIOData { +    BlockDriverState *bs; +    HANDLE hfile; +    struct iovec *aio_iov; +    int aio_niov; +    size_t aio_nbytes; +    off64_t aio_offset; +    int aio_type; +} RawWin32AIOData; + +typedef struct BDRVRawState { +    HANDLE hfile; +    int type; +    char drive_path[16]; /* format: "d:\" */ +    QEMUWin32AIOState *aio; +} BDRVRawState; + +/* + * Read/writes the data to/from a given linear buffer. + * + * Returns the number of bytes handles or -errno in case of an error. Short + * reads are only returned if the end of the file is reached. + */ +static size_t handle_aiocb_rw(RawWin32AIOData *aiocb) +{ +    size_t offset = 0; +    int i; + +    for (i = 0; i < aiocb->aio_niov; i++) { +        OVERLAPPED ov; +        DWORD ret, ret_count, len; + +        memset(&ov, 0, sizeof(ov)); +        ov.Offset = (aiocb->aio_offset + offset); +        ov.OffsetHigh = (aiocb->aio_offset + offset) >> 32; +        len = aiocb->aio_iov[i].iov_len; +        if (aiocb->aio_type & QEMU_AIO_WRITE) { +            ret = WriteFile(aiocb->hfile, aiocb->aio_iov[i].iov_base, +                            len, &ret_count, &ov); +        } else { +            ret = ReadFile(aiocb->hfile, aiocb->aio_iov[i].iov_base, +                           len, &ret_count, &ov); +        } +        if (!ret) { +            ret_count = 0; +        } +        if (ret_count != len) { +            offset += ret_count; +            break; +        } +        offset += len; +    } + +    return offset; +} + +static int aio_worker(void *arg) +{ +    RawWin32AIOData *aiocb = arg; +    ssize_t ret = 0; +    size_t count; + +    switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { +    case QEMU_AIO_READ: +        count = handle_aiocb_rw(aiocb); +        if (count < aiocb->aio_nbytes) { +            /* A short read means that we have reached EOF. Pad the buffer +             * with zeros for bytes after EOF. 
*/ +            iov_memset(aiocb->aio_iov, aiocb->aio_niov, count, +                      0, aiocb->aio_nbytes - count); + +            count = aiocb->aio_nbytes; +        } +        if (count == aiocb->aio_nbytes) { +            ret = 0; +        } else { +            ret = -EINVAL; +        } +        break; +    case QEMU_AIO_WRITE: +        count = handle_aiocb_rw(aiocb); +        if (count == aiocb->aio_nbytes) { +            count = 0; +        } else { +            count = -EINVAL; +        } +        break; +    case QEMU_AIO_FLUSH: +        if (!FlushFileBuffers(aiocb->hfile)) { +            return -EIO; +        } +        break; +    default: +        fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); +        ret = -EINVAL; +        break; +    } + +    g_slice_free(RawWin32AIOData, aiocb); +    return ret; +} + +static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque, int type) +{ +    RawWin32AIOData *acb = g_slice_new(RawWin32AIOData); +    ThreadPool *pool; + +    acb->bs = bs; +    acb->hfile = hfile; +    acb->aio_type = type; + +    if (qiov) { +        acb->aio_iov = qiov->iov; +        acb->aio_niov = qiov->niov; +    } +    acb->aio_nbytes = nb_sectors * 512; +    acb->aio_offset = sector_num * 512; + +    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); +    pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); +    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); +} + +int qemu_ftruncate64(int fd, int64_t length) +{ +    LARGE_INTEGER li; +    DWORD dw; +    LONG high; +    HANDLE h; +    BOOL res; + +    if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0) +	return -1; + +    h = (HANDLE)_get_osfhandle(fd); + +    /* get current position, ftruncate do not change position */ +    li.HighPart = 0; +    li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT); +    if (li.LowPart == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { +	return -1; +    } + +    high = length >> 32; +    dw = SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN); +    if (dw == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { +	return -1; +    } +    res = SetEndOfFile(h); + +    /* back to old position */ +    SetFilePointer(h, li.LowPart, &li.HighPart, FILE_BEGIN); +    return res ? 
0 : -1;
+}
+
+static int set_sparse(int fd)
+{
+    DWORD returned;
+    return (int) DeviceIoControl((HANDLE)_get_osfhandle(fd), FSCTL_SET_SPARSE,
+				 NULL, 0, NULL, 0, &returned, NULL);
+}
+
+static void raw_detach_aio_context(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->aio) {
+        win32_aio_detach_aio_context(s->aio, bdrv_get_aio_context(bs));
+    }
+}
+
+static void raw_attach_aio_context(BlockDriverState *bs,
+                                   AioContext *new_context)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->aio) {
+        win32_aio_attach_aio_context(s->aio, new_context);
+    }
+}
+
+static void raw_probe_alignment(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    DWORD sectorsPerCluster, freeClusters, totalClusters, count;
+    DISK_GEOMETRY_EX dg;
+    BOOL status;
+
+    if (s->type == FTYPE_CD) {
+        bs->request_alignment = 2048;
+        return;
+    }
+    if (s->type == FTYPE_HARDDISK) {
+        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
+                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
+        if (status != 0) {
+            bs->request_alignment = dg.Geometry.BytesPerSector;
+            return;
+        }
+        /* try GetDiskFreeSpace too */
+    }
+
+    if (s->drive_path[0]) {
+        GetDiskFreeSpace(s->drive_path, &sectorsPerCluster,
+                         &dg.Geometry.BytesPerSector,
+                         &freeClusters, &totalClusters);
+        bs->request_alignment = dg.Geometry.BytesPerSector;
+    }
+}
+
+static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
+{
+    assert(access_flags != NULL);
+    assert(overlapped != NULL);
+
+    if (flags & BDRV_O_RDWR) {
+        *access_flags = GENERIC_READ | GENERIC_WRITE;
+    } else {
+        *access_flags = GENERIC_READ;
+    }
+
+    *overlapped = FILE_ATTRIBUTE_NORMAL;
+    if (flags & BDRV_O_NATIVE_AIO) {
+        *overlapped |= FILE_FLAG_OVERLAPPED;
+    }
+    if (flags & BDRV_O_NOCACHE) {
+        *overlapped |= FILE_FLAG_NO_BUFFERING;
+    }
+}
+
+static void raw_parse_filename(const char *filename, QDict *options,
+                               Error **errp)
+{
+    /* The filename does not have to be prefixed by the protocol name, since
+     * "file" is the default protocol; therefore, the return value of this
+     * function call can be ignored.
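(Aside; raw_parse_filename() continues just below.) The optional "file:" prefix here, and the "host_device:", "host_cdrom:" and "host_floppy:" prefixes elsewhere in these drivers, are handled by a helper that strips a prefix when present and advances the caller's pointer. The following is a sketch of the semantics being assumed, not QEMU's actual strstart() source:

    #include <string.h>

    /* If str begins with prefix, optionally advance *ptr past it and return 1;
     * otherwise leave *ptr alone and return 0. */
    static int prefix_strip(const char *str, const char *prefix, const char **ptr)
    {
        size_t n = strlen(prefix);

        if (strncmp(str, prefix, n) != 0) {
            return 0;
        }
        if (ptr) {
            *ptr = str + n;
        }
        return 1;
    }

    /* e.g. prefix_strip("host_device:/dev/sdb", "host_device:", &filename)
     * leaves filename pointing at "/dev/sdb"; callers that only test for the
     * prefix pass NULL and may ignore the return value. */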
*/ +    strstart(filename, "file:", &filename); + +    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static QemuOptsList raw_runtime_opts = { +    .name = "raw", +    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), +    .desc = { +        { +            .name = "filename", +            .type = QEMU_OPT_STRING, +            .help = "File name of the image", +        }, +        { /* end of list */ } +    }, +}; + +static int raw_open(BlockDriverState *bs, QDict *options, int flags, +                    Error **errp) +{ +    BDRVRawState *s = bs->opaque; +    int access_flags; +    DWORD overlapped; +    QemuOpts *opts; +    Error *local_err = NULL; +    const char *filename; +    int ret; + +    s->type = FTYPE_FILE; + +    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    filename = qemu_opt_get(opts, "filename"); + +    raw_parse_flags(flags, &access_flags, &overlapped); + +    if (filename[0] && filename[1] == ':') { +        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]); +    } else if (filename[0] == '\\' && filename[1] == '\\') { +        s->drive_path[0] = 0; +    } else { +        /* Relative path.  */ +        char buf[MAX_PATH]; +        GetCurrentDirectory(MAX_PATH, buf); +        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]); +    } + +    s->hfile = CreateFile(filename, access_flags, +                          FILE_SHARE_READ, NULL, +                          OPEN_EXISTING, overlapped, NULL); +    if (s->hfile == INVALID_HANDLE_VALUE) { +        int err = GetLastError(); + +        if (err == ERROR_ACCESS_DENIED) { +            ret = -EACCES; +        } else { +            ret = -EINVAL; +        } +        goto fail; +    } + +    if (flags & BDRV_O_NATIVE_AIO) { +        s->aio = win32_aio_init(); +        if (s->aio == NULL) { +            CloseHandle(s->hfile); +            error_setg(errp, "Could not initialize AIO"); +            ret = -EINVAL; +            goto fail; +        } + +        ret = win32_aio_attach(s->aio, s->hfile); +        if (ret < 0) { +            win32_aio_cleanup(s->aio); +            CloseHandle(s->hfile); +            error_setg_errno(errp, -ret, "Could not enable AIO"); +            goto fail; +        } + +        win32_aio_attach_aio_context(s->aio, bdrv_get_aio_context(bs)); +    } + +    raw_probe_alignment(bs); +    ret = 0; +fail: +    qemu_opts_del(opts); +    return ret; +} + +static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, +                         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +                         BlockCompletionFunc *cb, void *opaque) +{ +    BDRVRawState *s = bs->opaque; +    if (s->aio) { +        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, +                                nb_sectors, cb, opaque, QEMU_AIO_READ);  +    } else { +        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, +                           cb, opaque, QEMU_AIO_READ); +    } +} + +static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, +                          int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +                          BlockCompletionFunc *cb, void *opaque) +{ +    BDRVRawState *s = bs->opaque; +    if (s->aio) { +        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, +           
                     nb_sectors, cb, opaque, QEMU_AIO_WRITE);  +    } else { +        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, +                           cb, opaque, QEMU_AIO_WRITE); +    } +} + +static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, +                         BlockCompletionFunc *cb, void *opaque) +{ +    BDRVRawState *s = bs->opaque; +    return paio_submit(bs, s->hfile, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); +} + +static void raw_close(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; + +    if (s->aio) { +        win32_aio_detach_aio_context(s->aio, bdrv_get_aio_context(bs)); +        win32_aio_cleanup(s->aio); +        s->aio = NULL; +    } + +    CloseHandle(s->hfile); +    if (bs->open_flags & BDRV_O_TEMPORARY) { +        unlink(bs->filename); +    } +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ +    BDRVRawState *s = bs->opaque; +    LONG low, high; +    DWORD dwPtrLow; + +    low = offset; +    high = offset >> 32; + +    /* +     * An error has occurred if the return value is INVALID_SET_FILE_POINTER +     * and GetLastError doesn't return NO_ERROR. +     */ +    dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN); +    if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { +        fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError()); +        return -EIO; +    } +    if (SetEndOfFile(s->hfile) == 0) { +        fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError()); +        return -EIO; +    } +    return 0; +} + +static int64_t raw_getlength(BlockDriverState *bs) +{ +    BDRVRawState *s = bs->opaque; +    LARGE_INTEGER l; +    ULARGE_INTEGER available, total, total_free; +    DISK_GEOMETRY_EX dg; +    DWORD count; +    BOOL status; + +    switch(s->type) { +    case FTYPE_FILE: +        l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart); +        if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR) +            return -EIO; +        break; +    case FTYPE_CD: +        if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free)) +            return -EIO; +        l.QuadPart = total.QuadPart; +        break; +    case FTYPE_HARDDISK: +        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, +                                 NULL, 0, &dg, sizeof(dg), &count, NULL); +        if (status != 0) { +            l = dg.DiskSize; +        } +        break; +    default: +        return -EIO; +    } +    return l.QuadPart; +} + +static int64_t raw_get_allocated_file_size(BlockDriverState *bs) +{ +    typedef DWORD (WINAPI * get_compressed_t)(const char *filename, +                                              DWORD * high); +    get_compressed_t get_compressed; +    struct _stati64 st; +    const char *filename = bs->filename; +    /* WinNT support GetCompressedFileSize to determine allocate size */ +    get_compressed = +        (get_compressed_t) GetProcAddress(GetModuleHandle("kernel32"), +                                            "GetCompressedFileSizeA"); +    if (get_compressed) { +        DWORD high, low; +        low = get_compressed(filename, &high); +        if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR) { +            return (((int64_t) high) << 32) + low; +        } +    } + +    if (_stati64(filename, &st) < 0) { +        return -1; +    } +    return st.st_size; +} + +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int fd; +    int64_t total_size = 0; + +    
strstart(filename, "file:", &filename); + +    /* Read out options */ +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); + +    fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, +                   0644); +    if (fd < 0) { +        error_setg_errno(errp, errno, "Could not create file"); +        return -EIO; +    } +    set_sparse(fd); +    ftruncate(fd, total_size); +    qemu_close(fd); +    return 0; +} + + +static QemuOptsList raw_create_opts = { +    .name = "raw-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { /* end of list */ } +    } +}; + +BlockDriver bdrv_file = { +    .format_name	= "file", +    .protocol_name	= "file", +    .instance_size	= sizeof(BDRVRawState), +    .bdrv_needs_filename = true, +    .bdrv_parse_filename = raw_parse_filename, +    .bdrv_file_open     = raw_open, +    .bdrv_close         = raw_close, +    .bdrv_create        = raw_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, + +    .bdrv_aio_readv     = raw_aio_readv, +    .bdrv_aio_writev    = raw_aio_writev, +    .bdrv_aio_flush     = raw_aio_flush, + +    .bdrv_truncate	= raw_truncate, +    .bdrv_getlength	= raw_getlength, +    .bdrv_get_allocated_file_size +                        = raw_get_allocated_file_size, + +    .create_opts        = &raw_create_opts, +}; + +/***********************************************/ +/* host device */ + +static int find_cdrom(char *cdrom_name, int cdrom_name_size) +{ +    char drives[256], *pdrv = drives; +    UINT type; + +    memset(drives, 0, sizeof(drives)); +    GetLogicalDriveStrings(sizeof(drives), drives); +    while(pdrv[0] != '\0') { +        type = GetDriveType(pdrv); +        switch(type) { +        case DRIVE_CDROM: +            snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", pdrv[0]); +            return 0; +            break; +        } +        pdrv += lstrlen(pdrv) + 1; +    } +    return -1; +} + +static int find_device_type(BlockDriverState *bs, const char *filename) +{ +    BDRVRawState *s = bs->opaque; +    UINT type; +    const char *p; + +    if (strstart(filename, "\\\\.\\", &p) || +        strstart(filename, "//./", &p)) { +        if (stristart(p, "PhysicalDrive", NULL)) +            return FTYPE_HARDDISK; +        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", p[0]); +        type = GetDriveType(s->drive_path); +        switch (type) { +        case DRIVE_REMOVABLE: +        case DRIVE_FIXED: +            return FTYPE_HARDDISK; +        case DRIVE_CDROM: +            return FTYPE_CD; +        default: +            return FTYPE_FILE; +        } +    } else { +        return FTYPE_FILE; +    } +} + +static int hdev_probe_device(const char *filename) +{ +    if (strstart(filename, "/dev/cdrom", NULL)) +        return 100; +    if (is_windows_drive(filename)) +        return 100; +    return 0; +} + +static void hdev_parse_filename(const char *filename, QDict *options, +                                Error **errp) +{ +    /* The prefix is optional, just as for "file". 
*/ +    strstart(filename, "host_device:", &filename); + +    qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, +                     Error **errp) +{ +    BDRVRawState *s = bs->opaque; +    int access_flags, create_flags; +    int ret = 0; +    DWORD overlapped; +    char device_name[64]; + +    Error *local_err = NULL; +    const char *filename; + +    QemuOpts *opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, +                                      &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto done; +    } + +    filename = qemu_opt_get(opts, "filename"); + +    if (strstart(filename, "/dev/cdrom", NULL)) { +        if (find_cdrom(device_name, sizeof(device_name)) < 0) { +            error_setg(errp, "Could not open CD-ROM drive"); +            ret = -ENOENT; +            goto done; +        } +        filename = device_name; +    } else { +        /* transform drive letters into device name */ +        if (((filename[0] >= 'a' && filename[0] <= 'z') || +             (filename[0] >= 'A' && filename[0] <= 'Z')) && +            filename[1] == ':' && filename[2] == '\0') { +            snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]); +            filename = device_name; +        } +    } +    s->type = find_device_type(bs, filename); + +    raw_parse_flags(flags, &access_flags, &overlapped); + +    create_flags = OPEN_EXISTING; + +    s->hfile = CreateFile(filename, access_flags, +                          FILE_SHARE_READ, NULL, +                          create_flags, overlapped, NULL); +    if (s->hfile == INVALID_HANDLE_VALUE) { +        int err = GetLastError(); + +        if (err == ERROR_ACCESS_DENIED) { +            ret = -EACCES; +        } else { +            ret = -EINVAL; +        } +        error_setg_errno(errp, -ret, "Could not open device"); +        goto done; +    } + +done: +    qemu_opts_del(opts); +    return ret; +} + +static BlockDriver bdrv_host_device = { +    .format_name	= "host_device", +    .protocol_name	= "host_device", +    .instance_size	= sizeof(BDRVRawState), +    .bdrv_needs_filename = true, +    .bdrv_parse_filename = hdev_parse_filename, +    .bdrv_probe_device	= hdev_probe_device, +    .bdrv_file_open	= hdev_open, +    .bdrv_close		= raw_close, + +    .bdrv_aio_readv     = raw_aio_readv, +    .bdrv_aio_writev    = raw_aio_writev, +    .bdrv_aio_flush     = raw_aio_flush, + +    .bdrv_detach_aio_context = raw_detach_aio_context, +    .bdrv_attach_aio_context = raw_attach_aio_context, + +    .bdrv_getlength      = raw_getlength, +    .has_variable_length = true, + +    .bdrv_get_allocated_file_size +                        = raw_get_allocated_file_size, +}; + +static void bdrv_file_init(void) +{ +    bdrv_register(&bdrv_file); +    bdrv_register(&bdrv_host_device); +} + +block_init(bdrv_file_init); diff --git a/block/raw_bsd.c b/block/raw_bsd.c new file mode 100644 index 00000000..e3d2d046 --- /dev/null +++ b/block/raw_bsd.c @@ -0,0 +1,282 @@ +/* BlockDriver implementation for "raw" + * + * Copyright (C) 2010, 2013, Red Hat, Inc. 
+ * Copyright (C) 2010, Blue Swirl <blauwirbel@gmail.com> + * Copyright (C) 2009, Anthony Liguori <aliguori@us.ibm.com> + * + * Author: + *   Laszlo Ersek <lersek@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "block/block_int.h" +#include "qemu/option.h" + +static QemuOptsList raw_create_opts = { +    .name = "raw-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { /* end of list */ } +    } +}; + +static int raw_reopen_prepare(BDRVReopenState *reopen_state, +                              BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, +                                     int nb_sectors, QEMUIOVector *qiov) +{ +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); +    return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov); +} + +static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, +                                      int nb_sectors, QEMUIOVector *qiov) +{ +    void *buf = NULL; +    BlockDriver *drv; +    QEMUIOVector local_qiov; +    int ret; + +    if (bs->probed && sector_num == 0) { +        /* As long as these conditions are true, we can't get partial writes to +         * the probe buffer and can just directly check the request. */ +        QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512); +        QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512); + +        if (nb_sectors == 0) { +            /* qemu_iovec_to_buf() would fail, but we want to return success +             * instead of -EINVAL in this case. */ +            return 0; +        } + +        buf = qemu_try_blockalign(bs->file, 512); +        if (!buf) { +            ret = -ENOMEM; +            goto fail; +        } + +        ret = qemu_iovec_to_buf(qiov, 0, buf, 512); +        if (ret != 512) { +            ret = -EINVAL; +            goto fail; +        } + +        drv = bdrv_probe_all(buf, 512, NULL); +        if (drv != bs->drv) { +            ret = -EPERM; +            goto fail; +        } + +        /* Use the checked buffer, a malicious guest might be overwriting its +         * original buffer in the background. 
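The local qiov built below therefore reads the first 512 bytes from the verified bounce buffer and the remainder from the guest-supplied vector. 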
*/ +        qemu_iovec_init(&local_qiov, qiov->niov + 1); +        qemu_iovec_add(&local_qiov, buf, 512); +        qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512); +        qiov = &local_qiov; +    } + +    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); +    ret = bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov); + +fail: +    if (qiov == &local_qiov) { +        qemu_iovec_destroy(&local_qiov); +    } +    qemu_vfree(buf); +    return ret; +} + +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, +                                            int64_t sector_num, +                                            int nb_sectors, int *pnum) +{ +    *pnum = nb_sectors; +    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | +           (sector_num << BDRV_SECTOR_BITS); +} + +static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, +                                            int64_t sector_num, int nb_sectors, +                                            BdrvRequestFlags flags) +{ +    return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors, flags); +} + +static int coroutine_fn raw_co_discard(BlockDriverState *bs, +                                       int64_t sector_num, int nb_sectors) +{ +    return bdrv_co_discard(bs->file, sector_num, nb_sectors); +} + +static int64_t raw_getlength(BlockDriverState *bs) +{ +    return bdrv_getlength(bs->file); +} + +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    return bdrv_get_info(bs->file, bdi); +} + +static void raw_refresh_limits(BlockDriverState *bs, Error **errp) +{ +    bs->bl = bs->file->bl; +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ +    return bdrv_truncate(bs->file, offset); +} + +static int raw_is_inserted(BlockDriverState *bs) +{ +    return bdrv_is_inserted(bs->file); +} + +static int raw_media_changed(BlockDriverState *bs) +{ +    return bdrv_media_changed(bs->file); +} + +static void raw_eject(BlockDriverState *bs, bool eject_flag) +{ +    bdrv_eject(bs->file, eject_flag); +} + +static void raw_lock_medium(BlockDriverState *bs, bool locked) +{ +    bdrv_lock_medium(bs->file, locked); +} + +static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ +    return bdrv_ioctl(bs->file, req, buf); +} + +static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs, +                                 unsigned long int req, void *buf, +                                 BlockCompletionFunc *cb, +                                 void *opaque) +{ +    return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); +} + +static int raw_has_zero_init(BlockDriverState *bs) +{ +    return bdrv_has_zero_init(bs->file); +} + +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    Error *local_err = NULL; +    int ret; + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +    } +    return ret; +} + +static int raw_open(BlockDriverState *bs, QDict *options, int flags, +                    Error **errp) +{ +    bs->sg = bs->file->sg; + +    if (bs->probed && !bdrv_is_read_only(bs)) { +        fprintf(stderr, +                "WARNING: Image format was not specified for '%s' and probing " +                "guessed raw.\n" +                "         Automatically detecting the format is dangerous for " +                "raw images, write operations on block 0 will be restricted.\n" +                "         Specify the 'raw' format explicitly to 
remove the " +                "restrictions.\n", +                bs->file->filename); +    } + +    return 0; +} + +static void raw_close(BlockDriverState *bs) +{ +} + +static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    /* smallest possible positive score so that raw is used if and only if no +     * other block driver works +     */ +    return 1; +} + +static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ +    return bdrv_probe_blocksizes(bs->file, bsz); +} + +static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ +    return bdrv_probe_geometry(bs->file, geo); +} + +BlockDriver bdrv_raw = { +    .format_name          = "raw", +    .bdrv_probe           = &raw_probe, +    .bdrv_reopen_prepare  = &raw_reopen_prepare, +    .bdrv_open            = &raw_open, +    .bdrv_close           = &raw_close, +    .bdrv_create          = &raw_create, +    .bdrv_co_readv        = &raw_co_readv, +    .bdrv_co_writev       = &raw_co_writev, +    .bdrv_co_write_zeroes = &raw_co_write_zeroes, +    .bdrv_co_discard      = &raw_co_discard, +    .bdrv_co_get_block_status = &raw_co_get_block_status, +    .bdrv_truncate        = &raw_truncate, +    .bdrv_getlength       = &raw_getlength, +    .has_variable_length  = true, +    .bdrv_get_info        = &raw_get_info, +    .bdrv_refresh_limits  = &raw_refresh_limits, +    .bdrv_probe_blocksizes = &raw_probe_blocksizes, +    .bdrv_probe_geometry  = &raw_probe_geometry, +    .bdrv_is_inserted     = &raw_is_inserted, +    .bdrv_media_changed   = &raw_media_changed, +    .bdrv_eject           = &raw_eject, +    .bdrv_lock_medium     = &raw_lock_medium, +    .bdrv_ioctl           = &raw_ioctl, +    .bdrv_aio_ioctl       = &raw_aio_ioctl, +    .create_opts          = &raw_create_opts, +    .bdrv_has_zero_init   = &raw_has_zero_init +}; + +static void bdrv_raw_init(void) +{ +    bdrv_register(&bdrv_raw); +} + +block_init(bdrv_raw_init); diff --git a/block/rbd.c b/block/rbd.c new file mode 100644 index 00000000..a60a19d5 --- /dev/null +++ b/block/rbd.c @@ -0,0 +1,967 @@ +/* + * QEMU Block driver for RADOS (Ceph) + * + * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>, + *                         Josh Durgin <josh.durgin@dreamhost.com> + * + * This work is licensed under the terms of the GNU GPL, version 2.  See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include <inttypes.h> + +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "block/block_int.h" + +#include <rbd/librbd.h> + +/* + * When specifying the image filename use: + * + * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]] + * + * poolname must be the name of an existing rados pool. + * + * devicename is the name of the rbd image. + * + * Each option given is used to configure rados, and may be any valid + * Ceph option, "id", or "conf". + * + * The "id" option indicates what user we should authenticate as to + * the Ceph cluster.  If it is excluded we will use the Ceph default + * (normally 'admin'). + * + * The "conf" option specifies a Ceph configuration file to read.  If + * it is not specified, we will read from the default Ceph locations + * (e.g., /etc/ceph/ceph.conf).  To avoid reading _any_ configuration + * file, specify conf=/dev/null. + * + * Configuration values containing :, @, or = can be escaped with a + * leading "\". 
+ */ + +/* rbd_aio_discard added in 0.1.2 */ +#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2) +#define LIBRBD_SUPPORTS_DISCARD +#else +#undef LIBRBD_SUPPORTS_DISCARD +#endif + +#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER) + +#define RBD_MAX_CONF_NAME_SIZE 128 +#define RBD_MAX_CONF_VAL_SIZE 512 +#define RBD_MAX_CONF_SIZE 1024 +#define RBD_MAX_POOL_NAME_SIZE 128 +#define RBD_MAX_SNAP_NAME_SIZE 128 +#define RBD_MAX_SNAPS 100 + +typedef enum { +    RBD_AIO_READ, +    RBD_AIO_WRITE, +    RBD_AIO_DISCARD, +    RBD_AIO_FLUSH +} RBDAIOCmd; + +typedef struct RBDAIOCB { +    BlockAIOCB common; +    QEMUBH *bh; +    int64_t ret; +    QEMUIOVector *qiov; +    char *bounce; +    RBDAIOCmd cmd; +    int error; +    struct BDRVRBDState *s; +} RBDAIOCB; + +typedef struct RADOSCB { +    RBDAIOCB *acb; +    struct BDRVRBDState *s; +    int64_t size; +    char *buf; +    int64_t ret; +} RADOSCB; + +typedef struct BDRVRBDState { +    rados_t cluster; +    rados_ioctx_t io_ctx; +    rbd_image_t image; +    char name[RBD_MAX_IMAGE_NAME_SIZE]; +    char *snap; +} BDRVRBDState; + +static int qemu_rbd_next_tok(char *dst, int dst_len, +                             char *src, char delim, +                             const char *name, +                             char **p, Error **errp) +{ +    int l; +    char *end; + +    *p = NULL; + +    if (delim != '\0') { +        for (end = src; *end; ++end) { +            if (*end == delim) { +                break; +            } +            if (*end == '\\' && end[1] != '\0') { +                end++; +            } +        } +        if (*end == delim) { +            *p = end + 1; +            *end = '\0'; +        } +    } +    l = strlen(src); +    if (l >= dst_len) { +        error_setg(errp, "%s too long", name); +        return -EINVAL; +    } else if (l == 0) { +        error_setg(errp, "%s too short", name); +        return -EINVAL; +    } + +    pstrcpy(dst, dst_len, src); + +    return 0; +} + +static void qemu_rbd_unescape(char *src) +{ +    char *p; + +    for (p = src; *src; ++src, ++p) { +        if (*src == '\\' && src[1] != '\0') { +            src++; +        } +        *p = *src; +    } +    *p = '\0'; +} + +static int qemu_rbd_parsename(const char *filename, +                              char *pool, int pool_len, +                              char *snap, int snap_len, +                              char *name, int name_len, +                              char *conf, int conf_len, +                              Error **errp) +{ +    const char *start; +    char *p, *buf; +    int ret; + +    if (!strstart(filename, "rbd:", &start)) { +        error_setg(errp, "File name must start with 'rbd:'"); +        return -EINVAL; +    } + +    buf = g_strdup(start); +    p = buf; +    *snap = '\0'; +    *conf = '\0'; + +    ret = qemu_rbd_next_tok(pool, pool_len, p, +                            '/', "pool name", &p, errp); +    if (ret < 0 || !p) { +        ret = -EINVAL; +        goto done; +    } +    qemu_rbd_unescape(pool); + +    if (strchr(p, '@')) { +        ret = qemu_rbd_next_tok(name, name_len, p, +                                '@', "object name", &p, errp); +        if (ret < 0) { +            goto done; +        } +        ret = qemu_rbd_next_tok(snap, snap_len, p, +                                ':', "snap name", &p, errp); +        qemu_rbd_unescape(snap); +    } else { +        ret = qemu_rbd_next_tok(name, name_len, p, +                                ':', "object name", &p, errp); +    } +    qemu_rbd_unescape(name); +    if (ret 
< 0 || !p) { +        goto done; +    } + +    ret = qemu_rbd_next_tok(conf, conf_len, p, +                            '\0', "configuration", &p, errp); + +done: +    g_free(buf); +    return ret; +} + +static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) +{ +    const char *p = conf; + +    while (*p) { +        int len; +        const char *end = strchr(p, ':'); + +        if (end) { +            len = end - p; +        } else { +            len = strlen(p); +        } + +        if (strncmp(p, "id=", 3) == 0) { +            len -= 3; +            strncpy(clientname, p + 3, len); +            clientname[len] = '\0'; +            return clientname; +        } +        if (end == NULL) { +            break; +        } +        p = end + 1; +    } +    return NULL; +} + +static int qemu_rbd_set_conf(rados_t cluster, const char *conf, +                             bool only_read_conf_file, +                             Error **errp) +{ +    char *p, *buf; +    char name[RBD_MAX_CONF_NAME_SIZE]; +    char value[RBD_MAX_CONF_VAL_SIZE]; +    int ret = 0; + +    buf = g_strdup(conf); +    p = buf; + +    while (p) { +        ret = qemu_rbd_next_tok(name, sizeof(name), p, +                                '=', "conf option name", &p, errp); +        if (ret < 0) { +            break; +        } +        qemu_rbd_unescape(name); + +        if (!p) { +            error_setg(errp, "conf option %s has no value", name); +            ret = -EINVAL; +            break; +        } + +        ret = qemu_rbd_next_tok(value, sizeof(value), p, +                                ':', "conf option value", &p, errp); +        if (ret < 0) { +            break; +        } +        qemu_rbd_unescape(value); + +        if (strcmp(name, "conf") == 0) { +            /* read the conf file alone, so it doesn't override more +               specific settings for a particular device */ +            if (only_read_conf_file) { +                ret = rados_conf_read_file(cluster, value); +                if (ret < 0) { +                    error_setg(errp, "error reading conf file %s", value); +                    break; +                } +            } +        } else if (strcmp(name, "id") == 0) { +            /* ignore, this is parsed by qemu_rbd_parse_clientname() */ +        } else if (!only_read_conf_file) { +            ret = rados_conf_set(cluster, name, value); +            if (ret < 0) { +                error_setg(errp, "invalid conf option %s", name); +                ret = -EINVAL; +                break; +            } +        } +    } + +    g_free(buf); +    return ret; +} + +static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    Error *local_err = NULL; +    int64_t bytes = 0; +    int64_t objsize; +    int obj_order = 0; +    char pool[RBD_MAX_POOL_NAME_SIZE]; +    char name[RBD_MAX_IMAGE_NAME_SIZE]; +    char snap_buf[RBD_MAX_SNAP_NAME_SIZE]; +    char conf[RBD_MAX_CONF_SIZE]; +    char clientname_buf[RBD_MAX_CONF_SIZE]; +    char *clientname; +    rados_t cluster; +    rados_ioctx_t io_ctx; +    int ret; + +    if (qemu_rbd_parsename(filename, pool, sizeof(pool), +                           snap_buf, sizeof(snap_buf), +                           name, sizeof(name), +                           conf, sizeof(conf), &local_err) < 0) { +        error_propagate(errp, local_err); +        return -EINVAL; +    } + +    /* Read out options */ +    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                     BDRV_SECTOR_SIZE); +    
objsize = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 0); +    if (objsize) { +        if ((objsize - 1) & objsize) {    /* not a power of 2? */ +            error_setg(errp, "obj size needs to be power of 2"); +            return -EINVAL; +        } +        if (objsize < 4096) { +            error_setg(errp, "obj size too small"); +            return -EINVAL; +        } +        obj_order = ctz32(objsize); +    } + +    clientname = qemu_rbd_parse_clientname(conf, clientname_buf); +    if (rados_create(&cluster, clientname) < 0) { +        error_setg(errp, "error initializing"); +        return -EIO; +    } + +    if (strstr(conf, "conf=") == NULL) { +        /* try default location, but ignore failure */ +        rados_conf_read_file(cluster, NULL); +    } else if (conf[0] != '\0' && +               qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) { +        rados_shutdown(cluster); +        error_propagate(errp, local_err); +        return -EIO; +    } + +    if (conf[0] != '\0' && +        qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) { +        rados_shutdown(cluster); +        error_propagate(errp, local_err); +        return -EIO; +    } + +    if (rados_connect(cluster) < 0) { +        error_setg(errp, "error connecting"); +        rados_shutdown(cluster); +        return -EIO; +    } + +    if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) { +        error_setg(errp, "error opening pool %s", pool); +        rados_shutdown(cluster); +        return -EIO; +    } + +    ret = rbd_create(io_ctx, name, bytes, &obj_order); +    rados_ioctx_destroy(io_ctx); +    rados_shutdown(cluster); + +    return ret; +} + +/* + * This aio completion is being called from rbd_finish_bh() and runs in qemu + * BH context. + */ +static void qemu_rbd_complete_aio(RADOSCB *rcb) +{ +    RBDAIOCB *acb = rcb->acb; +    int64_t r; + +    r = rcb->ret; + +    if (acb->cmd != RBD_AIO_READ) { +        if (r < 0) { +            acb->ret = r; +            acb->error = 1; +        } else if (!acb->error) { +            acb->ret = rcb->size; +        } +    } else { +        if (r < 0) { +            memset(rcb->buf, 0, rcb->size); +            acb->ret = r; +            acb->error = 1; +        } else if (r < rcb->size) { +            memset(rcb->buf + r, 0, rcb->size - r); +            if (!acb->error) { +                acb->ret = rcb->size; +            } +        } else if (!acb->error) { +            acb->ret = r; +        } +    } + +    g_free(rcb); + +    if (acb->cmd == RBD_AIO_READ) { +        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); +    } +    qemu_vfree(acb->bounce); +    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 
0 : acb->ret)); + +    qemu_aio_unref(acb); +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { +    .name = "rbd", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "filename", +            .type = QEMU_OPT_STRING, +            .help = "Specification of the rbd image", +        }, +        { /* end of list */ } +    }, +}; + +static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, +                         Error **errp) +{ +    BDRVRBDState *s = bs->opaque; +    char pool[RBD_MAX_POOL_NAME_SIZE]; +    char snap_buf[RBD_MAX_SNAP_NAME_SIZE]; +    char conf[RBD_MAX_CONF_SIZE]; +    char clientname_buf[RBD_MAX_CONF_SIZE]; +    char *clientname; +    QemuOpts *opts; +    Error *local_err = NULL; +    const char *filename; +    int r; + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        qemu_opts_del(opts); +        return -EINVAL; +    } + +    filename = qemu_opt_get(opts, "filename"); + +    if (qemu_rbd_parsename(filename, pool, sizeof(pool), +                           snap_buf, sizeof(snap_buf), +                           s->name, sizeof(s->name), +                           conf, sizeof(conf), errp) < 0) { +        r = -EINVAL; +        goto failed_opts; +    } + +    clientname = qemu_rbd_parse_clientname(conf, clientname_buf); +    r = rados_create(&s->cluster, clientname); +    if (r < 0) { +        error_setg(errp, "error initializing"); +        goto failed_opts; +    } + +    s->snap = NULL; +    if (snap_buf[0] != '\0') { +        s->snap = g_strdup(snap_buf); +    } + +    if (strstr(conf, "conf=") == NULL) { +        /* try default location, but ignore failure */ +        rados_conf_read_file(s->cluster, NULL); +    } else if (conf[0] != '\0') { +        r = qemu_rbd_set_conf(s->cluster, conf, true, errp); +        if (r < 0) { +            goto failed_shutdown; +        } +    } + +    if (conf[0] != '\0') { +        r = qemu_rbd_set_conf(s->cluster, conf, false, errp); +        if (r < 0) { +            goto failed_shutdown; +        } +    } + +    /* +     * Fallback to more conservative semantics if setting cache +     * options fails. Ignore errors from setting rbd_cache because the +     * only possible error is that the option does not exist, and +     * librbd defaults to no caching. If write through caching cannot +     * be set up, fall back to no caching. 
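+     * +     * BDRV_O_NOCACHE (cache=none or cache=directsync) maps to rbd_cache=false +     * below; the other cache modes enable the librbd cache.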
+     */ +    if (flags & BDRV_O_NOCACHE) { +        rados_conf_set(s->cluster, "rbd_cache", "false"); +    } else { +        rados_conf_set(s->cluster, "rbd_cache", "true"); +    } + +    r = rados_connect(s->cluster); +    if (r < 0) { +        error_setg(errp, "error connecting"); +        goto failed_shutdown; +    } + +    r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); +    if (r < 0) { +        error_setg(errp, "error opening pool %s", pool); +        goto failed_shutdown; +    } + +    r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); +    if (r < 0) { +        error_setg(errp, "error reading header from %s", s->name); +        goto failed_open; +    } + +    bs->read_only = (s->snap != NULL); + +    qemu_opts_del(opts); +    return 0; + +failed_open: +    rados_ioctx_destroy(s->io_ctx); +failed_shutdown: +    rados_shutdown(s->cluster); +    g_free(s->snap); +failed_opts: +    qemu_opts_del(opts); +    return r; +} + +static void qemu_rbd_close(BlockDriverState *bs) +{ +    BDRVRBDState *s = bs->opaque; + +    rbd_close(s->image); +    rados_ioctx_destroy(s->io_ctx); +    g_free(s->snap); +    rados_shutdown(s->cluster); +} + +static const AIOCBInfo rbd_aiocb_info = { +    .aiocb_size = sizeof(RBDAIOCB), +}; + +static void rbd_finish_bh(void *opaque) +{ +    RADOSCB *rcb = opaque; +    qemu_bh_delete(rcb->acb->bh); +    qemu_rbd_complete_aio(rcb); +} + +/* + * This is the callback function for rbd_aio_read and _write + * + * Note: this function is being called from a non qemu thread so + * we need to be careful about what we do here. Generally we only + * schedule a BH, and do the rest of the io completion handling + * from rbd_finish_bh() which runs in a qemu context. + */ +static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) +{ +    RBDAIOCB *acb = rcb->acb; + +    rcb->ret = rbd_aio_get_return_value(c); +    rbd_aio_release(c); + +    acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), +                         rbd_finish_bh, rcb); +    qemu_bh_schedule(acb->bh); +} + +static int rbd_aio_discard_wrapper(rbd_image_t image, +                                   uint64_t off, +                                   uint64_t len, +                                   rbd_completion_t comp) +{ +#ifdef LIBRBD_SUPPORTS_DISCARD +    return rbd_aio_discard(image, off, len, comp); +#else +    return -ENOTSUP; +#endif +} + +static int rbd_aio_flush_wrapper(rbd_image_t image, +                                 rbd_completion_t comp) +{ +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH +    return rbd_aio_flush(image, comp); +#else +    return -ENOTSUP; +#endif +} + +static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, +                                 int64_t sector_num, +                                 QEMUIOVector *qiov, +                                 int nb_sectors, +                                 BlockCompletionFunc *cb, +                                 void *opaque, +                                 RBDAIOCmd cmd) +{ +    RBDAIOCB *acb; +    RADOSCB *rcb = NULL; +    rbd_completion_t c; +    int64_t off, size; +    char *buf; +    int r; + +    BDRVRBDState *s = bs->opaque; + +    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); +    acb->cmd = cmd; +    acb->qiov = qiov; +    if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { +        acb->bounce = NULL; +    } else { +        acb->bounce = qemu_try_blockalign(bs, qiov->size); +        if (acb->bounce == NULL) { +            goto failed; +        } +    } +    acb->ret = 0; +    acb->error = 0; +    acb->s = s; +    
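/* The completion BH is created later, in rbd_finish_aiocb(), once librbd +     * signals completion. */ +    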
acb->bh = NULL; + +    if (cmd == RBD_AIO_WRITE) { +        qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); +    } + +    buf = acb->bounce; + +    off = sector_num * BDRV_SECTOR_SIZE; +    size = nb_sectors * BDRV_SECTOR_SIZE; + +    rcb = g_new(RADOSCB, 1); +    rcb->acb = acb; +    rcb->buf = buf; +    rcb->s = acb->s; +    rcb->size = size; +    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c); +    if (r < 0) { +        goto failed; +    } + +    switch (cmd) { +    case RBD_AIO_WRITE: +        r = rbd_aio_write(s->image, off, size, buf, c); +        break; +    case RBD_AIO_READ: +        r = rbd_aio_read(s->image, off, size, buf, c); +        break; +    case RBD_AIO_DISCARD: +        r = rbd_aio_discard_wrapper(s->image, off, size, c); +        break; +    case RBD_AIO_FLUSH: +        r = rbd_aio_flush_wrapper(s->image, c); +        break; +    default: +        r = -EINVAL; +    } + +    if (r < 0) { +        goto failed_completion; +    } + +    return &acb->common; + +failed_completion: +    rbd_aio_release(c); +failed: +    g_free(rcb); +    qemu_vfree(acb->bounce); +    qemu_aio_unref(acb); +    return NULL; +} + +static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs, +                                      int64_t sector_num, +                                      QEMUIOVector *qiov, +                                      int nb_sectors, +                                      BlockCompletionFunc *cb, +                                      void *opaque) +{ +    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, +                         RBD_AIO_READ); +} + +static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, +                                       int64_t sector_num, +                                       QEMUIOVector *qiov, +                                       int nb_sectors, +                                       BlockCompletionFunc *cb, +                                       void *opaque) +{ +    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, +                         RBD_AIO_WRITE); +} + +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH +static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs, +                                      BlockCompletionFunc *cb, +                                      void *opaque) +{ +    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH); +} + +#else + +static int qemu_rbd_co_flush(BlockDriverState *bs) +{ +#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1) +    /* rbd_flush added in 0.1.1 */ +    BDRVRBDState *s = bs->opaque; +    return rbd_flush(s->image); +#else +    return 0; +#endif +} +#endif + +static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVRBDState *s = bs->opaque; +    rbd_image_info_t info; +    int r; + +    r = rbd_stat(s->image, &info, sizeof(info)); +    if (r < 0) { +        return r; +    } + +    bdi->cluster_size = info.obj_size; +    return 0; +} + +static int64_t qemu_rbd_getlength(BlockDriverState *bs) +{ +    BDRVRBDState *s = bs->opaque; +    rbd_image_info_t info; +    int r; + +    r = rbd_stat(s->image, &info, sizeof(info)); +    if (r < 0) { +        return r; +    } + +    return info.size; +} + +static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset) +{ +    BDRVRBDState *s = bs->opaque; +    int r; + +    r = rbd_resize(s->image, offset); +    if (r < 0) { +        return r; +    } + +    return 0; +} + +static int qemu_rbd_snap_create(BlockDriverState *bs, +                     
           QEMUSnapshotInfo *sn_info) +{ +    BDRVRBDState *s = bs->opaque; +    int r; + +    if (sn_info->name[0] == '\0') { +        return -EINVAL; /* we need a name for rbd snapshots */ +    } + +    /* +     * rbd snapshots are using the name as the user controlled unique identifier +     * we can't use the rbd snapid for that purpose, as it can't be set +     */ +    if (sn_info->id_str[0] != '\0' && +        strcmp(sn_info->id_str, sn_info->name) != 0) { +        return -EINVAL; +    } + +    if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) { +        return -ERANGE; +    } + +    r = rbd_snap_create(s->image, sn_info->name); +    if (r < 0) { +        error_report("failed to create snap: %s", strerror(-r)); +        return r; +    } + +    return 0; +} + +static int qemu_rbd_snap_remove(BlockDriverState *bs, +                                const char *snapshot_id, +                                const char *snapshot_name, +                                Error **errp) +{ +    BDRVRBDState *s = bs->opaque; +    int r; + +    if (!snapshot_name) { +        error_setg(errp, "rbd need a valid snapshot name"); +        return -EINVAL; +    } + +    /* If snapshot_id is specified, it must be equal to name, see +       qemu_rbd_snap_list() */ +    if (snapshot_id && strcmp(snapshot_id, snapshot_name)) { +        error_setg(errp, +                   "rbd do not support snapshot id, it should be NULL or " +                   "equal to snapshot name"); +        return -EINVAL; +    } + +    r = rbd_snap_remove(s->image, snapshot_name); +    if (r < 0) { +        error_setg_errno(errp, -r, "Failed to remove the snapshot"); +    } +    return r; +} + +static int qemu_rbd_snap_rollback(BlockDriverState *bs, +                                  const char *snapshot_name) +{ +    BDRVRBDState *s = bs->opaque; +    int r; + +    r = rbd_snap_rollback(s->image, snapshot_name); +    return r; +} + +static int qemu_rbd_snap_list(BlockDriverState *bs, +                              QEMUSnapshotInfo **psn_tab) +{ +    BDRVRBDState *s = bs->opaque; +    QEMUSnapshotInfo *sn_info, *sn_tab = NULL; +    int i, snap_count; +    rbd_snap_info_t *snaps; +    int max_snaps = RBD_MAX_SNAPS; + +    do { +        snaps = g_new(rbd_snap_info_t, max_snaps); +        snap_count = rbd_snap_list(s->image, snaps, &max_snaps); +        if (snap_count <= 0) { +            g_free(snaps); +        } +    } while (snap_count == -ERANGE); + +    if (snap_count <= 0) { +        goto done; +    } + +    sn_tab = g_new0(QEMUSnapshotInfo, snap_count); + +    for (i = 0; i < snap_count; i++) { +        const char *snap_name = snaps[i].name; + +        sn_info = sn_tab + i; +        pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name); +        pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name); + +        sn_info->vm_state_size = snaps[i].size; +        sn_info->date_sec = 0; +        sn_info->date_nsec = 0; +        sn_info->vm_clock_nsec = 0; +    } +    rbd_snap_list_end(snaps); +    g_free(snaps); + + done: +    *psn_tab = sn_tab; +    return snap_count; +} + +#ifdef LIBRBD_SUPPORTS_DISCARD +static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, +                                        int64_t sector_num, +                                        int nb_sectors, +                                        BlockCompletionFunc *cb, +                                        void *opaque) +{ +    return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, +                         RBD_AIO_DISCARD); +} 
+#endif + +#ifdef LIBRBD_SUPPORTS_INVALIDATE +static void qemu_rbd_invalidate_cache(BlockDriverState *bs, +                                      Error **errp) +{ +    BDRVRBDState *s = bs->opaque; +    int r = rbd_invalidate_cache(s->image); +    if (r < 0) { +        error_setg_errno(errp, -r, "Failed to invalidate the cache"); +    } +} +#endif + +static QemuOptsList qemu_rbd_create_opts = { +    .name = "rbd-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_CLUSTER_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "RBD object size" +        }, +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_rbd = { +    .format_name        = "rbd", +    .instance_size      = sizeof(BDRVRBDState), +    .bdrv_needs_filename = true, +    .bdrv_file_open     = qemu_rbd_open, +    .bdrv_close         = qemu_rbd_close, +    .bdrv_create        = qemu_rbd_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, +    .bdrv_get_info      = qemu_rbd_getinfo, +    .create_opts        = &qemu_rbd_create_opts, +    .bdrv_getlength     = qemu_rbd_getlength, +    .bdrv_truncate      = qemu_rbd_truncate, +    .protocol_name      = "rbd", + +    .bdrv_aio_readv         = qemu_rbd_aio_readv, +    .bdrv_aio_writev        = qemu_rbd_aio_writev, + +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH +    .bdrv_aio_flush         = qemu_rbd_aio_flush, +#else +    .bdrv_co_flush_to_disk  = qemu_rbd_co_flush, +#endif + +#ifdef LIBRBD_SUPPORTS_DISCARD +    .bdrv_aio_discard       = qemu_rbd_aio_discard, +#endif + +    .bdrv_snapshot_create   = qemu_rbd_snap_create, +    .bdrv_snapshot_delete   = qemu_rbd_snap_remove, +    .bdrv_snapshot_list     = qemu_rbd_snap_list, +    .bdrv_snapshot_goto     = qemu_rbd_snap_rollback, +#ifdef LIBRBD_SUPPORTS_INVALIDATE +    .bdrv_invalidate_cache  = qemu_rbd_invalidate_cache, +#endif +}; + +static void bdrv_rbd_init(void) +{ +    bdrv_register(&bdrv_rbd); +} + +block_init(bdrv_rbd_init); diff --git a/block/sheepdog.c b/block/sheepdog.c new file mode 100644 index 00000000..9585beb7 --- /dev/null +++ b/block/sheepdog.c @@ -0,0 +1,2805 @@ +/* + * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu-common.h" +#include "qemu/uri.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "block/block_int.h" +#include "qemu/bitops.h" + +#define SD_PROTO_VER 0x01 + +#define SD_DEFAULT_ADDR "localhost" +#define SD_DEFAULT_PORT 7000 + +#define SD_OP_CREATE_AND_WRITE_OBJ  0x01 +#define SD_OP_READ_OBJ       0x02 +#define SD_OP_WRITE_OBJ      0x03 +/* 0x04 is used internally by Sheepdog */ +#define SD_OP_DISCARD_OBJ    0x05 + +#define SD_OP_NEW_VDI        0x11 +#define SD_OP_LOCK_VDI       0x12 +#define SD_OP_RELEASE_VDI    0x13 +#define SD_OP_GET_VDI_INFO   0x14 +#define SD_OP_READ_VDIS      0x15 +#define SD_OP_FLUSH_VDI      0x16 +#define SD_OP_DEL_VDI        0x17 +#define SD_OP_GET_CLUSTER_DEFAULT   0x18 + +#define SD_FLAG_CMD_WRITE    0x01 +#define SD_FLAG_CMD_COW      0x02 +#define SD_FLAG_CMD_CACHE    0x04 /* Writeback mode for cache */ +#define SD_FLAG_CMD_DIRECT   0x08 /* Don't use cache */ + +#define SD_RES_SUCCESS       0x00 /* Success */ +#define SD_RES_UNKNOWN       0x01 /* Unknown error */ +#define SD_RES_NO_OBJ        0x02 /* No object found */ +#define SD_RES_EIO           0x03 /* I/O error */ +#define SD_RES_VDI_EXIST     0x04 /* Vdi exists already */ +#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ +#define SD_RES_SYSTEM_ERROR  0x06 /* System error */ +#define SD_RES_VDI_LOCKED    0x07 /* Vdi is locked */ +#define SD_RES_NO_VDI        0x08 /* No vdi found */ +#define SD_RES_NO_BASE_VDI   0x09 /* No base vdi found */ +#define SD_RES_VDI_READ      0x0A /* Cannot read requested vdi */ +#define SD_RES_VDI_WRITE     0x0B /* Cannot write requested vdi */ +#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */ +#define SD_RES_BASE_VDI_WRITE   0x0D /* Cannot write base vdi */ +#define SD_RES_NO_TAG        0x0E /* Requested tag is not found */ +#define SD_RES_STARTUP       0x0F /* Sheepdog is on starting up */ +#define SD_RES_VDI_NOT_LOCKED   0x10 /* Vdi is not locked */ +#define SD_RES_SHUTDOWN      0x11 /* Sheepdog is shutting down */ +#define SD_RES_NO_MEM        0x12 /* Cannot allocate memory */ +#define SD_RES_FULL_VDI      0x13 /* we already have the maximum vdis */ +#define SD_RES_VER_MISMATCH  0x14 /* Protocol version mismatch */ +#define SD_RES_NO_SPACE      0x15 /* Server has no room for new objects */ +#define SD_RES_WAIT_FOR_FORMAT  0x16 /* Waiting for a format operation */ +#define SD_RES_WAIT_FOR_JOIN    0x17 /* Waiting for other nodes joining */ +#define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog */ +#define SD_RES_HALT          0x19 /* Sheepdog is stopped serving IO request */ +#define SD_RES_READONLY      0x1A /* Object is read-only */ + +/* + * Object ID rules + * + *  0 - 19 (20 bits): data object space + * 20 - 31 (12 bits): reserved data object space + * 32 - 55 (24 bits): vdi object space + * 56 - 59 ( 4 bits): reserved vdi object space + * 60 - 63 ( 4 bits): object type identifier space + */ + +#define VDI_SPACE_SHIFT   32 +#define VDI_BIT (UINT64_C(1) << 63) +#define VMSTATE_BIT (UINT64_C(1) << 62) +#define MAX_DATA_OBJS (UINT64_C(1) << 20) +#define MAX_CHILDREN 1024 +#define SD_MAX_VDI_LEN 256 +#define SD_MAX_VDI_TAG_LEN 256 +#define SD_NR_VDIS   (1U << 24) +#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) +#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) +#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22 +/* + * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and + * (SD_EC_MAX_STRIP - 1) for parity strips + * + * SD_MAX_COPIES is sum of number of data strips and 
parity strips. + */ +#define SD_EC_MAX_STRIP 16 +#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1) + +#define SD_INODE_SIZE (sizeof(SheepdogInode)) +#define CURRENT_VDI_ID 0 + +#define LOCK_TYPE_NORMAL 0 +#define LOCK_TYPE_SHARED 1      /* for iSCSI multipath */ + +typedef struct SheepdogReq { +    uint8_t proto_ver; +    uint8_t opcode; +    uint16_t flags; +    uint32_t epoch; +    uint32_t id; +    uint32_t data_length; +    uint32_t opcode_specific[8]; +} SheepdogReq; + +typedef struct SheepdogRsp { +    uint8_t proto_ver; +    uint8_t opcode; +    uint16_t flags; +    uint32_t epoch; +    uint32_t id; +    uint32_t data_length; +    uint32_t result; +    uint32_t opcode_specific[7]; +} SheepdogRsp; + +typedef struct SheepdogObjReq { +    uint8_t proto_ver; +    uint8_t opcode; +    uint16_t flags; +    uint32_t epoch; +    uint32_t id; +    uint32_t data_length; +    uint64_t oid; +    uint64_t cow_oid; +    uint8_t copies; +    uint8_t copy_policy; +    uint8_t reserved[6]; +    uint64_t offset; +} SheepdogObjReq; + +typedef struct SheepdogObjRsp { +    uint8_t proto_ver; +    uint8_t opcode; +    uint16_t flags; +    uint32_t epoch; +    uint32_t id; +    uint32_t data_length; +    uint32_t result; +    uint8_t copies; +    uint8_t copy_policy; +    uint8_t reserved[2]; +    uint32_t pad[6]; +} SheepdogObjRsp; + +typedef struct SheepdogVdiReq { +    uint8_t proto_ver; +    uint8_t opcode; +    uint16_t flags; +    uint32_t epoch; +    uint32_t id; +    uint32_t data_length; +    uint64_t vdi_size; +    uint32_t base_vdi_id; +    uint8_t copies; +    uint8_t copy_policy; +    uint8_t store_policy; +    uint8_t block_size_shift; +    uint32_t snapid; +    uint32_t type; +    uint32_t pad[2]; +} SheepdogVdiReq; + +typedef struct SheepdogVdiRsp { +    uint8_t proto_ver; +    uint8_t opcode; +    uint16_t flags; +    uint32_t epoch; +    uint32_t id; +    uint32_t data_length; +    uint32_t result; +    uint32_t rsvd; +    uint32_t vdi_id; +    uint32_t pad[5]; +} SheepdogVdiRsp; + +typedef struct SheepdogClusterRsp { +    uint8_t proto_ver; +    uint8_t opcode; +    uint16_t flags; +    uint32_t epoch; +    uint32_t id; +    uint32_t data_length; +    uint32_t result; +    uint8_t nr_copies; +    uint8_t copy_policy; +    uint8_t block_size_shift; +    uint8_t __pad1; +    uint32_t __pad2[6]; +} SheepdogClusterRsp; + +typedef struct SheepdogInode { +    char name[SD_MAX_VDI_LEN]; +    char tag[SD_MAX_VDI_TAG_LEN]; +    uint64_t ctime; +    uint64_t snap_ctime; +    uint64_t vm_clock_nsec; +    uint64_t vdi_size; +    uint64_t vm_state_size; +    uint16_t copy_policy; +    uint8_t nr_copies; +    uint8_t block_size_shift; +    uint32_t snap_id; +    uint32_t vdi_id; +    uint32_t parent_vdi_id; +    uint32_t child_vdi_id[MAX_CHILDREN]; +    uint32_t data_vdi_id[MAX_DATA_OBJS]; +} SheepdogInode; + +#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id) + +/* + * 64 bit FNV-1a non-zero initial basis + */ +#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) + +/* + * 64 bit Fowler/Noll/Vo FNV-1a hash code + */ +static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval) +{ +    unsigned char *bp = buf; +    unsigned char *be = bp + len; +    while (bp < be) { +        hval ^= (uint64_t) *bp++; +        hval += (hval << 1) + (hval << 4) + (hval << 5) + +            (hval << 7) + (hval << 8) + (hval << 40); +    } +    return hval; +} + +static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx) +{ +    return inode->vdi_id == 
inode->data_vdi_id[idx]; +} + +static inline bool is_data_obj(uint64_t oid) +{ +    return !(VDI_BIT & oid); +} + +static inline uint64_t data_oid_to_idx(uint64_t oid) +{ +    return oid & (MAX_DATA_OBJS - 1); +} + +static inline uint32_t oid_to_vid(uint64_t oid) +{ +    return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; +} + +static inline uint64_t vid_to_vdi_oid(uint32_t vid) +{ +    return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); +} + +static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) +{ +    return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; +} + +static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx) +{ +    return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; +} + +static inline bool is_snapshot(struct SheepdogInode *inode) +{ +    return !!inode->snap_ctime; +} + +#undef DPRINTF +#ifdef DEBUG_SDOG +#define DPRINTF(fmt, args...)                                       \ +    do {                                                            \ +        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ +    } while (0) +#else +#define DPRINTF(fmt, args...) +#endif + +typedef struct SheepdogAIOCB SheepdogAIOCB; + +typedef struct AIOReq { +    SheepdogAIOCB *aiocb; +    unsigned int iov_offset; + +    uint64_t oid; +    uint64_t base_oid; +    uint64_t offset; +    unsigned int data_len; +    uint8_t flags; +    uint32_t id; +    bool create; + +    QLIST_ENTRY(AIOReq) aio_siblings; +} AIOReq; + +enum AIOCBState { +    AIOCB_WRITE_UDATA, +    AIOCB_READ_UDATA, +    AIOCB_FLUSH_CACHE, +    AIOCB_DISCARD_OBJ, +}; + +#define AIOCBOverwrapping(x, y)                                 \ +    (!(x->max_affect_data_idx < y->min_affect_data_idx          \ +       || y->max_affect_data_idx < x->min_affect_data_idx)) + +struct SheepdogAIOCB { +    BlockAIOCB common; + +    QEMUIOVector *qiov; + +    int64_t sector_num; +    int nb_sectors; + +    int ret; +    enum AIOCBState aiocb_type; + +    Coroutine *coroutine; +    void (*aio_done_func)(SheepdogAIOCB *); + +    bool cancelable; +    int nr_pending; + +    uint32_t min_affect_data_idx; +    uint32_t max_affect_data_idx; + +    QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings; +}; + +typedef struct BDRVSheepdogState { +    BlockDriverState *bs; +    AioContext *aio_context; + +    SheepdogInode inode; + +    uint32_t min_dirty_data_idx; +    uint32_t max_dirty_data_idx; + +    char name[SD_MAX_VDI_LEN]; +    bool is_snapshot; +    uint32_t cache_flags; +    bool discard_supported; + +    char *host_spec; +    bool is_unix; +    int fd; + +    CoMutex lock; +    Coroutine *co_send; +    Coroutine *co_recv; + +    uint32_t aioreq_seq_num; + +    /* Every aio request must be linked to either of these queues. 
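Requests sit on inflight_aio_head while a response is outstanding and are moved to failed_aio_head while the connection is being re-established. 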
*/ +    QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; +    QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; + +    CoQueue overwrapping_queue; +    QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head; +} BDRVSheepdogState; + +static const char * sd_strerror(int err) +{ +    int i; + +    static const struct { +        int err; +        const char *desc; +    } errors[] = { +        {SD_RES_SUCCESS, "Success"}, +        {SD_RES_UNKNOWN, "Unknown error"}, +        {SD_RES_NO_OBJ, "No object found"}, +        {SD_RES_EIO, "I/O error"}, +        {SD_RES_VDI_EXIST, "VDI exists already"}, +        {SD_RES_INVALID_PARMS, "Invalid parameters"}, +        {SD_RES_SYSTEM_ERROR, "System error"}, +        {SD_RES_VDI_LOCKED, "VDI is already locked"}, +        {SD_RES_NO_VDI, "No vdi found"}, +        {SD_RES_NO_BASE_VDI, "No base VDI found"}, +        {SD_RES_VDI_READ, "Failed read the requested VDI"}, +        {SD_RES_VDI_WRITE, "Failed to write the requested VDI"}, +        {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"}, +        {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"}, +        {SD_RES_NO_TAG, "Failed to find the requested tag"}, +        {SD_RES_STARTUP, "The system is still booting"}, +        {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"}, +        {SD_RES_SHUTDOWN, "The system is shutting down"}, +        {SD_RES_NO_MEM, "Out of memory on the server"}, +        {SD_RES_FULL_VDI, "We already have the maximum vdis"}, +        {SD_RES_VER_MISMATCH, "Protocol version mismatch"}, +        {SD_RES_NO_SPACE, "Server has no space for new objects"}, +        {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"}, +        {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"}, +        {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"}, +        {SD_RES_HALT, "Sheepdog is stopped serving IO request"}, +        {SD_RES_READONLY, "Object is read-only"}, +    }; + +    for (i = 0; i < ARRAY_SIZE(errors); ++i) { +        if (errors[i].err == err) { +            return errors[i].desc; +        } +    } + +    return "Invalid error code"; +} + +/* + * Sheepdog I/O handling: + * + * 1. In sd_co_rw_vector, we send the I/O requests to the server and + *    link the requests to the inflight_list in the + *    BDRVSheepdogState.  The function exits without waiting for + *    receiving the response. + * + * 2. We receive the response in aio_read_response, the fd handler to + *    the sheepdog connection.  If metadata update is needed, we send + *    the write request to the vdi object in sd_write_done, the write + *    completion function.  We switch back to sd_co_readv/writev after + *    all the requests belonging to the AIOCB are finished. 
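+ * + * 3. If the connection to the server is lost, reconnect_to_sdog() moves all + *    inflight requests to the failed queue and resends them once the + *    connection has been re-established.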
+ */ + +static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, +                                    uint64_t oid, unsigned int data_len, +                                    uint64_t offset, uint8_t flags, bool create, +                                    uint64_t base_oid, unsigned int iov_offset) +{ +    AIOReq *aio_req; + +    aio_req = g_malloc(sizeof(*aio_req)); +    aio_req->aiocb = acb; +    aio_req->iov_offset = iov_offset; +    aio_req->oid = oid; +    aio_req->base_oid = base_oid; +    aio_req->offset = offset; +    aio_req->data_len = data_len; +    aio_req->flags = flags; +    aio_req->id = s->aioreq_seq_num++; +    aio_req->create = create; + +    acb->nr_pending++; +    return aio_req; +} + +static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) +{ +    SheepdogAIOCB *acb = aio_req->aiocb; + +    acb->cancelable = false; +    QLIST_REMOVE(aio_req, aio_siblings); +    g_free(aio_req); + +    acb->nr_pending--; +} + +static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) +{ +    qemu_coroutine_enter(acb->coroutine, NULL); +    qemu_aio_unref(acb); +} + +/* + * Check whether the specified acb can be canceled + * + * We can cancel aio when any request belonging to the acb is: + *  - Not processed by the sheepdog server. + *  - Not linked to the inflight queue. + */ +static bool sd_acb_cancelable(const SheepdogAIOCB *acb) +{ +    BDRVSheepdogState *s = acb->common.bs->opaque; +    AIOReq *aioreq; + +    if (!acb->cancelable) { +        return false; +    } + +    QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) { +        if (aioreq->aiocb == acb) { +            return false; +        } +    } + +    return true; +} + +static void sd_aio_cancel(BlockAIOCB *blockacb) +{ +    SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb; +    BDRVSheepdogState *s = acb->common.bs->opaque; +    AIOReq *aioreq, *next; + +    if (sd_acb_cancelable(acb)) { +        /* Remove outstanding requests from failed queue.  
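Requests that are still inflight make the acb non-cancelable (see sd_acb_cancelable() above), so only not-yet-resent failed requests can remain here. 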
*/ +        QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, +                           next) { +            if (aioreq->aiocb == acb) { +                free_aio_req(s, aioreq); +            } +        } + +        assert(acb->nr_pending == 0); +        if (acb->common.cb) { +            acb->common.cb(acb->common.opaque, -ECANCELED); +        } +        sd_finish_aiocb(acb); +    } +} + +static const AIOCBInfo sd_aiocb_info = { +    .aiocb_size     = sizeof(SheepdogAIOCB), +    .cancel_async   = sd_aio_cancel, +}; + +static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, +                                   int64_t sector_num, int nb_sectors) +{ +    SheepdogAIOCB *acb; +    uint32_t object_size; +    BDRVSheepdogState *s = bs->opaque; + +    object_size = (UINT32_C(1) << s->inode.block_size_shift); + +    acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL); + +    acb->qiov = qiov; + +    acb->sector_num = sector_num; +    acb->nb_sectors = nb_sectors; + +    acb->aio_done_func = NULL; +    acb->cancelable = true; +    acb->coroutine = qemu_coroutine_self(); +    acb->ret = 0; +    acb->nr_pending = 0; + +    acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; +    acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE + +                              acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size; + +    return acb; +} + +/* Return -EIO in case of error, file descriptor on success */ +static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) +{ +    int fd; + +    if (s->is_unix) { +        fd = unix_connect(s->host_spec, errp); +    } else { +        fd = inet_connect(s->host_spec, errp); + +        if (fd >= 0) { +            int ret = socket_set_nodelay(fd); +            if (ret < 0) { +                error_report("%s", strerror(errno)); +            } +        } +    } + +    if (fd >= 0) { +        qemu_set_nonblock(fd); +    } else { +        fd = -EIO; +    } + +    return fd; +} + +/* Return 0 on success and -errno in case of error */ +static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, +                                    unsigned int *wlen) +{ +    int ret; + +    ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); +    if (ret != sizeof(*hdr)) { +        error_report("failed to send a req, %s", strerror(errno)); +        ret = -socket_error(); +        return ret; +    } + +    ret = qemu_co_send(sockfd, data, *wlen); +    if (ret != *wlen) { +        ret = -socket_error(); +        error_report("failed to send a req, %s", strerror(errno)); +    } + +    return ret; +} + +static void restart_co_req(void *opaque) +{ +    Coroutine *co = opaque; + +    qemu_coroutine_enter(co, NULL); +} + +typedef struct SheepdogReqCo { +    int sockfd; +    AioContext *aio_context; +    SheepdogReq *hdr; +    void *data; +    unsigned int *wlen; +    unsigned int *rlen; +    int ret; +    bool finished; +} SheepdogReqCo; + +static coroutine_fn void do_co_req(void *opaque) +{ +    int ret; +    Coroutine *co; +    SheepdogReqCo *srco = opaque; +    int sockfd = srco->sockfd; +    SheepdogReq *hdr = srco->hdr; +    void *data = srco->data; +    unsigned int *wlen = srco->wlen; +    unsigned int *rlen = srco->rlen; + +    co = qemu_coroutine_self(); +    aio_set_fd_handler(srco->aio_context, sockfd, NULL, restart_co_req, co); + +    ret = send_co_req(sockfd, hdr, data, wlen); +    if (ret < 0) { +        goto out; +    } + +    aio_set_fd_handler(srco->aio_context, sockfd, restart_co_req, NULL, co); 
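+    /* The read handler installed above re-enters this coroutine once the +     * response becomes readable. */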
+ +    ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); +    if (ret != sizeof(*hdr)) { +        error_report("failed to get a rsp, %s", strerror(errno)); +        ret = -errno; +        goto out; +    } + +    if (*rlen > hdr->data_length) { +        *rlen = hdr->data_length; +    } + +    if (*rlen) { +        ret = qemu_co_recv(sockfd, data, *rlen); +        if (ret != *rlen) { +            error_report("failed to get the data, %s", strerror(errno)); +            ret = -errno; +            goto out; +        } +    } +    ret = 0; +out: +    /* there is at most one request for this sockfd, so it is safe to +     * set each handler to NULL. */ +    aio_set_fd_handler(srco->aio_context, sockfd, NULL, NULL, NULL); + +    srco->ret = ret; +    srco->finished = true; +} + +/* + * Send the request to the sheep in a synchronous manner. + * + * Return 0 on success, -errno in case of error. + */ +static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, +                  void *data, unsigned int *wlen, unsigned int *rlen) +{ +    Coroutine *co; +    SheepdogReqCo srco = { +        .sockfd = sockfd, +        .aio_context = aio_context, +        .hdr = hdr, +        .data = data, +        .wlen = wlen, +        .rlen = rlen, +        .ret = 0, +        .finished = false, +    }; + +    if (qemu_in_coroutine()) { +        do_co_req(&srco); +    } else { +        co = qemu_coroutine_create(do_co_req); +        qemu_coroutine_enter(co, &srco); +        while (!srco.finished) { +            aio_poll(aio_context, true); +        } +    } + +    return srco.ret; +} + +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, +                                         struct iovec *iov, int niov, +                                         enum AIOCBState aiocb_type); +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); +static int get_sheep_fd(BDRVSheepdogState *s, Error **errp); +static void co_write_request(void *opaque); + +static coroutine_fn void reconnect_to_sdog(void *opaque) +{ +    BDRVSheepdogState *s = opaque; +    AIOReq *aio_req, *next; + +    aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL); +    close(s->fd); +    s->fd = -1; + +    /* Wait for outstanding write requests to be completed. */ +    while (s->co_send != NULL) { +        co_write_request(opaque); +    } + +    /* Try to reconnect the sheepdog server every one second. */ +    while (s->fd < 0) { +        Error *local_err = NULL; +        s->fd = get_sheep_fd(s, &local_err); +        if (s->fd < 0) { +            DPRINTF("Wait for connection to be established\n"); +            error_report_err(local_err); +            co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME, +                            1000000000ULL); +        } +    }; + +    /* +     * Now we have to resend all the request in the inflight queue.  However, +     * resend_aioreq() can yield and newly created requests can be added to the +     * inflight queue before the coroutine is resumed.  To avoid mixing them, we +     * have to move all the inflight requests to the failed queue before +     * resend_aioreq() is called. +     */ +    QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) { +        QLIST_REMOVE(aio_req, aio_siblings); +        QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings); +    } + +    /* Resend all the failed aio requests. 
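Each one is moved back to the inflight queue before resend_aioreq() is called on it. 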
*/ +    while (!QLIST_EMPTY(&s->failed_aio_head)) { +        aio_req = QLIST_FIRST(&s->failed_aio_head); +        QLIST_REMOVE(aio_req, aio_siblings); +        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); +        resend_aioreq(s, aio_req); +    } +} + +/* + * Receive responses of the I/O requests. + * + * This function is registered as a fd handler, and called from the + * main loop when s->fd is ready for reading responses. + */ +static void coroutine_fn aio_read_response(void *opaque) +{ +    SheepdogObjRsp rsp; +    BDRVSheepdogState *s = opaque; +    int fd = s->fd; +    int ret; +    AIOReq *aio_req = NULL; +    SheepdogAIOCB *acb; +    uint64_t idx; + +    /* read a header */ +    ret = qemu_co_recv(fd, &rsp, sizeof(rsp)); +    if (ret != sizeof(rsp)) { +        error_report("failed to get the header, %s", strerror(errno)); +        goto err; +    } + +    /* find the right aio_req from the inflight aio list */ +    QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) { +        if (aio_req->id == rsp.id) { +            break; +        } +    } +    if (!aio_req) { +        error_report("cannot find aio_req %x", rsp.id); +        goto err; +    } + +    acb = aio_req->aiocb; + +    switch (acb->aiocb_type) { +    case AIOCB_WRITE_UDATA: +        /* this coroutine context is no longer suitable for co_recv +         * because we may send data to update vdi objects */ +        s->co_recv = NULL; +        if (!is_data_obj(aio_req->oid)) { +            break; +        } +        idx = data_oid_to_idx(aio_req->oid); + +        if (aio_req->create) { +            /* +             * If the object is newly created one, we need to update +             * the vdi object (metadata object).  min_dirty_data_idx +             * and max_dirty_data_idx are changed to include updated +             * index between them. 
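+             * For example, if the dirty range was [3, 5] and the object at
+             * index 7 has just been created, the assignments below widen the
+             * range to [3, 7].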
+             */ +            if (rsp.result == SD_RES_SUCCESS) { +                s->inode.data_vdi_id[idx] = s->inode.vdi_id; +                s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx); +                s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx); +            } +        } +        break; +    case AIOCB_READ_UDATA: +        ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov, +                            aio_req->iov_offset, rsp.data_length); +        if (ret != rsp.data_length) { +            error_report("failed to get the data, %s", strerror(errno)); +            goto err; +        } +        break; +    case AIOCB_FLUSH_CACHE: +        if (rsp.result == SD_RES_INVALID_PARMS) { +            DPRINTF("disable cache since the server doesn't support it\n"); +            s->cache_flags = SD_FLAG_CMD_DIRECT; +            rsp.result = SD_RES_SUCCESS; +        } +        break; +    case AIOCB_DISCARD_OBJ: +        switch (rsp.result) { +        case SD_RES_INVALID_PARMS: +            error_report("sheep(%s) doesn't support discard command", +                         s->host_spec); +            rsp.result = SD_RES_SUCCESS; +            s->discard_supported = false; +            break; +        case SD_RES_SUCCESS: +            idx = data_oid_to_idx(aio_req->oid); +            s->inode.data_vdi_id[idx] = 0; +            break; +        default: +            break; +        } +    } + +    switch (rsp.result) { +    case SD_RES_SUCCESS: +        break; +    case SD_RES_READONLY: +        if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) { +            ret = reload_inode(s, 0, ""); +            if (ret < 0) { +                goto err; +            } +        } +        if (is_data_obj(aio_req->oid)) { +            aio_req->oid = vid_to_data_oid(s->inode.vdi_id, +                                           data_oid_to_idx(aio_req->oid)); +        } else { +            aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id); +        } +        resend_aioreq(s, aio_req); +        goto out; +    default: +        acb->ret = -EIO; +        error_report("%s", sd_strerror(rsp.result)); +        break; +    } + +    free_aio_req(s, aio_req); +    if (!acb->nr_pending) { +        /* +         * We've finished all requests which belong to the AIOCB, so +         * we can switch back to sd_co_readv/writev now. +         */ +        acb->aio_done_func(acb); +    } +out: +    s->co_recv = NULL; +    return; +err: +    s->co_recv = NULL; +    reconnect_to_sdog(opaque); +} + +static void co_read_response(void *opaque) +{ +    BDRVSheepdogState *s = opaque; + +    if (!s->co_recv) { +        s->co_recv = qemu_coroutine_create(aio_read_response); +    } + +    qemu_coroutine_enter(s->co_recv, opaque); +} + +static void co_write_request(void *opaque) +{ +    BDRVSheepdogState *s = opaque; + +    qemu_coroutine_enter(s->co_send, NULL); +} + +/* + * Return a socket descriptor to read/write objects. + * + * We cannot use this descriptor for other operations because + * the block driver may be on waiting response from the server. 
+ */ +static int get_sheep_fd(BDRVSheepdogState *s, Error **errp) +{ +    int fd; + +    fd = connect_to_sdog(s, errp); +    if (fd < 0) { +        return fd; +    } + +    aio_set_fd_handler(s->aio_context, fd, co_read_response, NULL, s); +    return fd; +} + +static int sd_parse_uri(BDRVSheepdogState *s, const char *filename, +                        char *vdi, uint32_t *snapid, char *tag) +{ +    URI *uri; +    QueryParams *qp = NULL; +    int ret = 0; + +    uri = uri_parse(filename); +    if (!uri) { +        return -EINVAL; +    } + +    /* transport */ +    if (!strcmp(uri->scheme, "sheepdog")) { +        s->is_unix = false; +    } else if (!strcmp(uri->scheme, "sheepdog+tcp")) { +        s->is_unix = false; +    } else if (!strcmp(uri->scheme, "sheepdog+unix")) { +        s->is_unix = true; +    } else { +        ret = -EINVAL; +        goto out; +    } + +    if (uri->path == NULL || !strcmp(uri->path, "/")) { +        ret = -EINVAL; +        goto out; +    } +    pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1); + +    qp = query_params_parse(uri->query); +    if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) { +        ret = -EINVAL; +        goto out; +    } + +    if (s->is_unix) { +        /* sheepdog+unix:///vdiname?socket=path */ +        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) { +            ret = -EINVAL; +            goto out; +        } +        s->host_spec = g_strdup(qp->p[0].value); +    } else { +        /* sheepdog[+tcp]://[host:port]/vdiname */ +        s->host_spec = g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR, +                                       uri->port ?: SD_DEFAULT_PORT); +    } + +    /* snapshot tag */ +    if (uri->fragment) { +        *snapid = strtoul(uri->fragment, NULL, 10); +        if (*snapid == 0) { +            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment); +        } +    } else { +        *snapid = CURRENT_VDI_ID; /* search current vdi */ +    } + +out: +    if (qp) { +        query_params_free(qp); +    } +    uri_free(uri); +    return ret; +} + +/* + * Parse a filename (old syntax) + * + * filename must be one of the following formats: + *   1. [vdiname] + *   2. [vdiname]:[snapid] + *   3. [vdiname]:[tag] + *   4. [hostname]:[port]:[vdiname] + *   5. [hostname]:[port]:[vdiname]:[snapid] + *   6. [hostname]:[port]:[vdiname]:[tag] + * + * You can boot from the snapshot images by specifying `snapid` or + * `tag'. + * + * You can run VMs outside the Sheepdog cluster by specifying + * `hostname' and `port' (experimental). + */ +static int parse_vdiname(BDRVSheepdogState *s, const char *filename, +                         char *vdi, uint32_t *snapid, char *tag) +{ +    char *p, *q, *uri; +    const char *host_spec, *vdi_spec; +    int nr_sep, ret; + +    strstart(filename, "sheepdog:", (const char **)&filename); +    p = q = g_strdup(filename); + +    /* count the number of separators */ +    nr_sep = 0; +    while (*p) { +        if (*p == ':') { +            nr_sep++; +        } +        p++; +    } +    p = q; + +    /* use the first two tokens as host_spec. 
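+     * For example, "sheepdog:192.0.2.1:7000:vdi:snap" yields host_spec
+     * "192.0.2.1:7000" and vdi_spec "vdi:snap"; the latter's ':' is turned
+     * into '#' and the whole thing is rewritten below into the URI
+     * "sheepdog://192.0.2.1:7000/vdi#snap" for sd_parse_uri().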
*/ +    if (nr_sep >= 2) { +        host_spec = p; +        p = strchr(p, ':'); +        p++; +        p = strchr(p, ':'); +        *p++ = '\0'; +    } else { +        host_spec = ""; +    } + +    vdi_spec = p; + +    p = strchr(vdi_spec, ':'); +    if (p) { +        *p++ = '#'; +    } + +    uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec); + +    ret = sd_parse_uri(s, uri, vdi, snapid, tag); + +    g_free(q); +    g_free(uri); + +    return ret; +} + +static int find_vdi_name(BDRVSheepdogState *s, const char *filename, +                         uint32_t snapid, const char *tag, uint32_t *vid, +                         bool lock, Error **errp) +{ +    int ret, fd; +    SheepdogVdiReq hdr; +    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; +    unsigned int wlen, rlen = 0; +    char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; + +    fd = connect_to_sdog(s, errp); +    if (fd < 0) { +        return fd; +    } + +    /* This pair of strncpy calls ensures that the buffer is zero-filled, +     * which is desirable since we'll soon be sending those bytes, and +     * don't want the send_req to read uninitialized data. +     */ +    strncpy(buf, filename, SD_MAX_VDI_LEN); +    strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); + +    memset(&hdr, 0, sizeof(hdr)); +    if (lock) { +        hdr.opcode = SD_OP_LOCK_VDI; +        hdr.type = LOCK_TYPE_NORMAL; +    } else { +        hdr.opcode = SD_OP_GET_VDI_INFO; +    } +    wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; +    hdr.proto_ver = SD_PROTO_VER; +    hdr.data_length = wlen; +    hdr.snapid = snapid; +    hdr.flags = SD_FLAG_CMD_WRITE; + +    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); +    if (ret) { +        error_setg_errno(errp, -ret, "cannot get vdi info"); +        goto out; +    } + +    if (rsp->result != SD_RES_SUCCESS) { +        error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s", +                   sd_strerror(rsp->result), filename, snapid, tag); +        if (rsp->result == SD_RES_NO_VDI) { +            ret = -ENOENT; +        } else if (rsp->result == SD_RES_VDI_LOCKED) { +            ret = -EBUSY; +        } else { +            ret = -EIO; +        } +        goto out; +    } +    *vid = rsp->vdi_id; + +    ret = 0; +out: +    closesocket(fd); +    return ret; +} + +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, +                                         struct iovec *iov, int niov, +                                         enum AIOCBState aiocb_type) +{ +    int nr_copies = s->inode.nr_copies; +    SheepdogObjReq hdr; +    unsigned int wlen = 0; +    int ret; +    uint64_t oid = aio_req->oid; +    unsigned int datalen = aio_req->data_len; +    uint64_t offset = aio_req->offset; +    uint8_t flags = aio_req->flags; +    uint64_t old_oid = aio_req->base_oid; +    bool create = aio_req->create; + +    if (!nr_copies) { +        error_report("bug"); +    } + +    memset(&hdr, 0, sizeof(hdr)); + +    switch (aiocb_type) { +    case AIOCB_FLUSH_CACHE: +        hdr.opcode = SD_OP_FLUSH_VDI; +        break; +    case AIOCB_READ_UDATA: +        hdr.opcode = SD_OP_READ_OBJ; +        hdr.flags = flags; +        break; +    case AIOCB_WRITE_UDATA: +        if (create) { +            hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; +        } else { +            hdr.opcode = SD_OP_WRITE_OBJ; +        } +        wlen = datalen; +        hdr.flags = SD_FLAG_CMD_WRITE | flags; +        break; +    case AIOCB_DISCARD_OBJ: +        hdr.opcode = SD_OP_DISCARD_OBJ; 
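+        /* A discard request carries no payload: wlen stays 0, so only the
+         * request header (whose data_length/offset describe the range to
+         * drop) is sent below. */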
+        break; +    } + +    if (s->cache_flags) { +        hdr.flags |= s->cache_flags; +    } + +    hdr.oid = oid; +    hdr.cow_oid = old_oid; +    hdr.copies = s->inode.nr_copies; + +    hdr.data_length = datalen; +    hdr.offset = offset; + +    hdr.id = aio_req->id; + +    qemu_co_mutex_lock(&s->lock); +    s->co_send = qemu_coroutine_self(); +    aio_set_fd_handler(s->aio_context, s->fd, +                       co_read_response, co_write_request, s); +    socket_set_cork(s->fd, 1); + +    /* send a header */ +    ret = qemu_co_send(s->fd, &hdr, sizeof(hdr)); +    if (ret != sizeof(hdr)) { +        error_report("failed to send a req, %s", strerror(errno)); +        goto out; +    } + +    if (wlen) { +        ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen); +        if (ret != wlen) { +            error_report("failed to send a data, %s", strerror(errno)); +        } +    } +out: +    socket_set_cork(s->fd, 0); +    aio_set_fd_handler(s->aio_context, s->fd, co_read_response, NULL, s); +    s->co_send = NULL; +    qemu_co_mutex_unlock(&s->lock); +} + +static int read_write_object(int fd, AioContext *aio_context, char *buf, +                             uint64_t oid, uint8_t copies, +                             unsigned int datalen, uint64_t offset, +                             bool write, bool create, uint32_t cache_flags) +{ +    SheepdogObjReq hdr; +    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr; +    unsigned int wlen, rlen; +    int ret; + +    memset(&hdr, 0, sizeof(hdr)); + +    if (write) { +        wlen = datalen; +        rlen = 0; +        hdr.flags = SD_FLAG_CMD_WRITE; +        if (create) { +            hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; +        } else { +            hdr.opcode = SD_OP_WRITE_OBJ; +        } +    } else { +        wlen = 0; +        rlen = datalen; +        hdr.opcode = SD_OP_READ_OBJ; +    } + +    hdr.flags |= cache_flags; + +    hdr.oid = oid; +    hdr.data_length = datalen; +    hdr.offset = offset; +    hdr.copies = copies; + +    ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); +    if (ret) { +        error_report("failed to send a request to the sheep"); +        return ret; +    } + +    switch (rsp->result) { +    case SD_RES_SUCCESS: +        return 0; +    default: +        error_report("%s", sd_strerror(rsp->result)); +        return -EIO; +    } +} + +static int read_object(int fd, AioContext *aio_context, char *buf, +                       uint64_t oid, uint8_t copies, +                       unsigned int datalen, uint64_t offset, +                       uint32_t cache_flags) +{ +    return read_write_object(fd, aio_context, buf, oid, copies, +                             datalen, offset, false, +                             false, cache_flags); +} + +static int write_object(int fd, AioContext *aio_context, char *buf, +                        uint64_t oid, uint8_t copies, +                        unsigned int datalen, uint64_t offset, bool create, +                        uint32_t cache_flags) +{ +    return read_write_object(fd, aio_context, buf, oid, copies, +                             datalen, offset, true, +                             create, cache_flags); +} + +/* update inode with the latest state */ +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) +{ +    Error *local_err = NULL; +    SheepdogInode *inode; +    int ret = 0, fd; +    uint32_t vid = 0; + +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); + 
       return -EIO; +    } + +    inode = g_malloc(SD_INODE_HEADER_SIZE); + +    ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err); +    if (ret) { +        error_report_err(local_err); +        goto out; +    } + +    ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid), +                      s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0, +                      s->cache_flags); +    if (ret < 0) { +        goto out; +    } + +    if (inode->vdi_id != s->inode.vdi_id) { +        memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE); +    } + +out: +    g_free(inode); +    closesocket(fd); + +    return ret; +} + +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +{ +    SheepdogAIOCB *acb = aio_req->aiocb; + +    aio_req->create = false; + +    /* check whether this request becomes a CoW one */ +    if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) { +        int idx = data_oid_to_idx(aio_req->oid); + +        if (is_data_obj_writable(&s->inode, idx)) { +            goto out; +        } + +        if (s->inode.data_vdi_id[idx]) { +            aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); +            aio_req->flags |= SD_FLAG_CMD_COW; +        } +        aio_req->create = true; +    } +out: +    if (is_data_obj(aio_req->oid)) { +        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, +                        acb->aiocb_type); +    } else { +        struct iovec iov; +        iov.iov_base = &s->inode; +        iov.iov_len = sizeof(s->inode); +        add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); +    } +} + +static void sd_detach_aio_context(BlockDriverState *bs) +{ +    BDRVSheepdogState *s = bs->opaque; + +    aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL); +} + +static void sd_attach_aio_context(BlockDriverState *bs, +                                  AioContext *new_context) +{ +    BDRVSheepdogState *s = bs->opaque; + +    s->aio_context = new_context; +    aio_set_fd_handler(new_context, s->fd, co_read_response, NULL, s); +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { +    .name = "sheepdog", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "filename", +            .type = QEMU_OPT_STRING, +            .help = "URL to the sheepdog image", +        }, +        { /* end of list */ } +    }, +}; + +static int sd_open(BlockDriverState *bs, QDict *options, int flags, +                   Error **errp) +{ +    int ret, fd; +    uint32_t vid = 0; +    BDRVSheepdogState *s = bs->opaque; +    char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; +    uint32_t snapid; +    char *buf = NULL; +    QemuOpts *opts; +    Error *local_err = NULL; +    const char *filename; + +    s->bs = bs; +    s->aio_context = bdrv_get_aio_context(bs); + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto out; +    } + +    filename = qemu_opt_get(opts, "filename"); + +    QLIST_INIT(&s->inflight_aio_head); +    QLIST_INIT(&s->failed_aio_head); +    QLIST_INIT(&s->inflight_aiocb_head); +    s->fd = -1; + +    memset(vdi, 0, sizeof(vdi)); +    memset(tag, 0, sizeof(tag)); + +    if (strstr(filename, "://")) { +        ret = sd_parse_uri(s, filename, vdi, &snapid, tag); +    } else { +        ret = parse_vdiname(s, 
filename, vdi, &snapid, tag); +    } +    if (ret < 0) { +        error_setg(errp, "Can't parse filename"); +        goto out; +    } +    s->fd = get_sheep_fd(s, errp); +    if (s->fd < 0) { +        ret = s->fd; +        goto out; +    } + +    ret = find_vdi_name(s, vdi, snapid, tag, &vid, true, errp); +    if (ret) { +        goto out; +    } + +    /* +     * QEMU block layer emulates writethrough cache as 'writeback + flush', so +     * we always set SD_FLAG_CMD_CACHE (writeback cache) as default. +     */ +    s->cache_flags = SD_FLAG_CMD_CACHE; +    if (flags & BDRV_O_NOCACHE) { +        s->cache_flags = SD_FLAG_CMD_DIRECT; +    } +    s->discard_supported = true; + +    if (snapid || tag[0] != '\0') { +        DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid); +        s->is_snapshot = true; +    } + +    fd = connect_to_sdog(s, errp); +    if (fd < 0) { +        ret = fd; +        goto out; +    } + +    buf = g_malloc(SD_INODE_SIZE); +    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), +                      0, SD_INODE_SIZE, 0, s->cache_flags); + +    closesocket(fd); + +    if (ret) { +        error_setg(errp, "Can't read snapshot inode"); +        goto out; +    } + +    memcpy(&s->inode, buf, sizeof(s->inode)); +    s->min_dirty_data_idx = UINT32_MAX; +    s->max_dirty_data_idx = 0; + +    bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; +    pstrcpy(s->name, sizeof(s->name), vdi); +    qemu_co_mutex_init(&s->lock); +    qemu_co_queue_init(&s->overwrapping_queue); +    qemu_opts_del(opts); +    g_free(buf); +    return 0; +out: +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL); +    if (s->fd >= 0) { +        closesocket(s->fd); +    } +    qemu_opts_del(opts); +    g_free(buf); +    return ret; +} + +static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, +                        Error **errp) +{ +    SheepdogVdiReq hdr; +    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; +    int fd, ret; +    unsigned int wlen, rlen = 0; +    char buf[SD_MAX_VDI_LEN]; + +    fd = connect_to_sdog(s, errp); +    if (fd < 0) { +        return fd; +    } + +    /* FIXME: would it be better to fail (e.g., return -EIO) when filename +     * does not fit in buf?  For now, just truncate and avoid buffer overrun. 
+     */ +    memset(buf, 0, sizeof(buf)); +    pstrcpy(buf, sizeof(buf), s->name); + +    memset(&hdr, 0, sizeof(hdr)); +    hdr.opcode = SD_OP_NEW_VDI; +    hdr.base_vdi_id = s->inode.vdi_id; + +    wlen = SD_MAX_VDI_LEN; + +    hdr.flags = SD_FLAG_CMD_WRITE; +    hdr.snapid = snapshot; + +    hdr.data_length = wlen; +    hdr.vdi_size = s->inode.vdi_size; +    hdr.copy_policy = s->inode.copy_policy; +    hdr.copies = s->inode.nr_copies; +    hdr.block_size_shift = s->inode.block_size_shift; + +    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + +    closesocket(fd); + +    if (ret) { +        error_setg_errno(errp, -ret, "create failed"); +        return ret; +    } + +    if (rsp->result != SD_RES_SUCCESS) { +        error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name); +        return -EIO; +    } + +    if (vdi_id) { +        *vdi_id = rsp->vdi_id; +    } + +    return 0; +} + +static int sd_prealloc(const char *filename, Error **errp) +{ +    BlockDriverState *bs = NULL; +    BDRVSheepdogState *base = NULL; +    unsigned long buf_size; +    uint32_t idx, max_idx; +    uint32_t object_size; +    int64_t vdi_size; +    void *buf = NULL; +    int ret; + +    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, +                    NULL, errp); +    if (ret < 0) { +        goto out_with_err_set; +    } + +    vdi_size = bdrv_getlength(bs); +    if (vdi_size < 0) { +        ret = vdi_size; +        goto out; +    } + +    base = bs->opaque; +    object_size = (UINT32_C(1) << base->inode.block_size_shift); +    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); +    buf = g_malloc0(buf_size); + +    max_idx = DIV_ROUND_UP(vdi_size, buf_size); + +    for (idx = 0; idx < max_idx; idx++) { +        /* +         * The created image can be a cloned image, so we need to read +         * a data from the source image. +         */ +        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); +        if (ret < 0) { +            goto out; +        } +        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); +        if (ret < 0) { +            goto out; +        } +    } + +out: +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Can't pre-allocate"); +    } +out_with_err_set: +    if (bs) { +        bdrv_unref(bs); +    } +    g_free(buf); + +    return ret; +} + +/* + * Sheepdog support two kinds of redundancy, full replication and erasure + * coding. + * + * # create a fully replicated vdi with x copies + * -o redundancy=x (1 <= x <= SD_MAX_COPIES) + * + * # create a erasure coded vdi with x data strips and y parity strips + * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP) + */ +static int parse_redundancy(BDRVSheepdogState *s, const char *opt) +{ +    struct SheepdogInode *inode = &s->inode; +    const char *n1, *n2; +    long copy, parity; +    char p[10]; + +    pstrcpy(p, sizeof(p), opt); +    n1 = strtok(p, ":"); +    n2 = strtok(NULL, ":"); + +    if (!n1) { +        return -EINVAL; +    } + +    copy = strtol(n1, NULL, 10); +    if (copy > SD_MAX_COPIES || copy < 1) { +        return -EINVAL; +    } +    if (!n2) { +        inode->copy_policy = 0; +        inode->nr_copies = copy; +        return 0; +    } + +    if (copy != 2 && copy != 4 && copy != 8 && copy != 16) { +        return -EINVAL; +    } + +    parity = strtol(n2, NULL, 10); +    if (parity >= SD_EC_MAX_STRIP || parity < 1) { +        return -EINVAL; +    } + +    /* +     * 4 bits for parity and 4 bits for data. 
+     * We have to compress upper data bits because it can't represent 16 +     */ +    inode->copy_policy = ((copy / 2) << 4) + parity; +    inode->nr_copies = copy + parity; + +    return 0; +} + +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) +{ +    struct SheepdogInode *inode = &s->inode; +    uint64_t object_size; +    int obj_order; + +    object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0); +    if (object_size) { +        if ((object_size - 1) & object_size) {    /* not a power of 2? */ +            return -EINVAL; +        } +        obj_order = ctz32(object_size); +        if (obj_order < 20 || obj_order > 31) { +            return -EINVAL; +        } +        inode->block_size_shift = (uint8_t)obj_order; +    } + +    return 0; +} + +static int sd_create(const char *filename, QemuOpts *opts, +                     Error **errp) +{ +    int ret = 0; +    uint32_t vid = 0; +    char *backing_file = NULL; +    char *buf = NULL; +    BDRVSheepdogState *s; +    char tag[SD_MAX_VDI_TAG_LEN]; +    uint32_t snapid; +    uint64_t max_vdi_size; +    bool prealloc = false; + +    s = g_new0(BDRVSheepdogState, 1); + +    memset(tag, 0, sizeof(tag)); +    if (strstr(filename, "://")) { +        ret = sd_parse_uri(s, filename, s->name, &snapid, tag); +    } else { +        ret = parse_vdiname(s, filename, s->name, &snapid, tag); +    } +    if (ret < 0) { +        error_setg(errp, "Can't parse filename"); +        goto out; +    } + +    s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                                 BDRV_SECTOR_SIZE); +    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); +    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); +    if (!buf || !strcmp(buf, "off")) { +        prealloc = false; +    } else if (!strcmp(buf, "full")) { +        prealloc = true; +    } else { +        error_setg(errp, "Invalid preallocation mode: '%s'", buf); +        ret = -EINVAL; +        goto out; +    } + +    g_free(buf); +    buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY); +    if (buf) { +        ret = parse_redundancy(s, buf); +        if (ret < 0) { +            error_setg(errp, "Invalid redundancy mode: '%s'", buf); +            goto out; +        } +    } +    ret = parse_block_size_shift(s, opts); +    if (ret < 0) { +        error_setg(errp, "Invalid object_size." +                         " obect_size needs to be power of 2" +                         " and be limited from 2^20 to 2^31"); +        goto out; +    } + +    if (backing_file) { +        BlockDriverState *bs; +        BDRVSheepdogState *base; +        BlockDriver *drv; + +        /* Currently, only Sheepdog backing image is supported. 
*/ +        drv = bdrv_find_protocol(backing_file, true, NULL); +        if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) { +            error_setg(errp, "backing_file must be a sheepdog image"); +            ret = -EINVAL; +            goto out; +        } + +        bs = NULL; +        ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, NULL, +                        errp); +        if (ret < 0) { +            goto out; +        } + +        base = bs->opaque; + +        if (!is_snapshot(&base->inode)) { +            error_setg(errp, "cannot clone from a non snapshot vdi"); +            bdrv_unref(bs); +            ret = -EINVAL; +            goto out; +        } +        s->inode.vdi_id = base->inode.vdi_id; +        bdrv_unref(bs); +    } + +    s->aio_context = qemu_get_aio_context(); + +    /* if block_size_shift is not specified, get cluster default value */ +    if (s->inode.block_size_shift == 0) { +        SheepdogVdiReq hdr; +        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr; +        Error *local_err = NULL; +        int fd; +        unsigned int wlen = 0, rlen = 0; + +        fd = connect_to_sdog(s, &local_err); +        if (fd < 0) { +            error_report("%s", error_get_pretty(local_err)); +            error_free(local_err); +            ret = -EIO; +            goto out; +        } + +        memset(&hdr, 0, sizeof(hdr)); +        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT; +        hdr.proto_ver = SD_PROTO_VER; + +        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, +                     NULL, &wlen, &rlen); +        closesocket(fd); +        if (ret) { +            error_setg_errno(errp, -ret, "failed to get cluster default"); +            goto out; +        } +        if (rsp->result == SD_RES_SUCCESS) { +            s->inode.block_size_shift = rsp->block_size_shift; +        } else { +            s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT; +        } +    } + +    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; + +    if (s->inode.vdi_size > max_vdi_size) { +        error_setg(errp, "An image is too large." 
+                         " The maximum image size is %"PRIu64 "GB", +                         max_vdi_size / 1024 / 1024 / 1024); +        ret = -EINVAL; +        goto out; +    } + +    ret = do_sd_create(s, &vid, 0, errp); +    if (ret) { +        goto out; +    } + +    if (prealloc) { +        ret = sd_prealloc(filename, errp); +    } +out: +    g_free(backing_file); +    g_free(buf); +    g_free(s); +    return ret; +} + +static void sd_close(BlockDriverState *bs) +{ +    Error *local_err = NULL; +    BDRVSheepdogState *s = bs->opaque; +    SheepdogVdiReq hdr; +    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; +    unsigned int wlen, rlen = 0; +    int fd, ret; + +    DPRINTF("%s\n", s->name); + +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); +        return; +    } + +    memset(&hdr, 0, sizeof(hdr)); + +    hdr.opcode = SD_OP_RELEASE_VDI; +    hdr.type = LOCK_TYPE_NORMAL; +    hdr.base_vdi_id = s->inode.vdi_id; +    wlen = strlen(s->name) + 1; +    hdr.data_length = wlen; +    hdr.flags = SD_FLAG_CMD_WRITE; + +    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, +                 s->name, &wlen, &rlen); + +    closesocket(fd); + +    if (!ret && rsp->result != SD_RES_SUCCESS && +        rsp->result != SD_RES_VDI_NOT_LOCKED) { +        error_report("%s, %s", sd_strerror(rsp->result), s->name); +    } + +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL); +    closesocket(s->fd); +    g_free(s->host_spec); +} + +static int64_t sd_getlength(BlockDriverState *bs) +{ +    BDRVSheepdogState *s = bs->opaque; + +    return s->inode.vdi_size; +} + +static int sd_truncate(BlockDriverState *bs, int64_t offset) +{ +    Error *local_err = NULL; +    BDRVSheepdogState *s = bs->opaque; +    int ret, fd; +    unsigned int datalen; +    uint64_t max_vdi_size; + +    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; +    if (offset < s->inode.vdi_size) { +        error_report("shrinking is not supported"); +        return -EINVAL; +    } else if (offset > max_vdi_size) { +        error_report("too big image size"); +        return -EINVAL; +    } + +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); +        return fd; +    } + +    /* we don't need to update entire object */ +    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); +    s->inode.vdi_size = offset; +    ret = write_object(fd, s->aio_context, (char *)&s->inode, +                       vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, +                       datalen, 0, false, s->cache_flags); +    close(fd); + +    if (ret < 0) { +        error_report("failed to update an inode."); +    } + +    return ret; +} + +/* + * This function is called after writing data objects.  If we need to + * update metadata, this sends a write request to the vdi object. + * Otherwise, this switches back to sd_co_readv/writev. + */ +static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) +{ +    BDRVSheepdogState *s = acb->common.bs->opaque; +    struct iovec iov; +    AIOReq *aio_req; +    uint32_t offset, data_len, mn, mx; + +    mn = s->min_dirty_data_idx; +    mx = s->max_dirty_data_idx; +    if (mn <= mx) { +        /* we need to update the vdi object. 
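+         * Only the dirty slice of data_vdi_id[] is written back; for
+         * example, with mn == 3 and mx == 5 the request below covers
+         * entries 3..5 only, i.e. data_len == 3 * sizeof(data_vdi_id[0]).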
*/ +        offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) + +            mn * sizeof(s->inode.data_vdi_id[0]); +        data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]); + +        s->min_dirty_data_idx = UINT32_MAX; +        s->max_dirty_data_idx = 0; + +        iov.iov_base = &s->inode; +        iov.iov_len = sizeof(s->inode); +        aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), +                                data_len, offset, 0, false, 0, offset); +        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); +        add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); + +        acb->aio_done_func = sd_finish_aiocb; +        acb->aiocb_type = AIOCB_WRITE_UDATA; +        return; +    } + +    sd_finish_aiocb(acb); +} + +/* Delete current working VDI on the snapshot chain */ +static bool sd_delete(BDRVSheepdogState *s) +{ +    Error *local_err = NULL; +    unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; +    SheepdogVdiReq hdr = { +        .opcode = SD_OP_DEL_VDI, +        .base_vdi_id = s->inode.vdi_id, +        .data_length = wlen, +        .flags = SD_FLAG_CMD_WRITE, +    }; +    SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; +    int fd, ret; + +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); +        return false; +    } + +    ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, +                 s->name, &wlen, &rlen); +    closesocket(fd); +    if (ret) { +        return false; +    } +    switch (rsp->result) { +    case SD_RES_NO_VDI: +        error_report("%s was already deleted", s->name); +        /* fall through */ +    case SD_RES_SUCCESS: +        break; +    default: +        error_report("%s, %s", sd_strerror(rsp->result), s->name); +        return false; +    } + +    return true; +} + +/* + * Create a writable VDI from a snapshot + */ +static int sd_create_branch(BDRVSheepdogState *s) +{ +    Error *local_err = NULL; +    int ret, fd; +    uint32_t vid; +    char *buf; +    bool deleted; + +    DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id); + +    buf = g_malloc(SD_INODE_SIZE); + +    /* +     * Even If deletion fails, we will just create extra snapshot based on +     * the working VDI which was supposed to be deleted. So no need to +     * false bail out. +     */ +    deleted = sd_delete(s); +    ret = do_sd_create(s, &vid, !deleted, &local_err); +    if (ret) { +        error_report_err(local_err); +        goto out; +    } + +    DPRINTF("%" PRIx32 " is created.\n", vid); + +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); +        ret = fd; +        goto out; +    } + +    ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), +                      s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags); + +    closesocket(fd); + +    if (ret < 0) { +        goto out; +    } + +    memcpy(&s->inode, buf, sizeof(s->inode)); + +    s->is_snapshot = false; +    ret = 0; +    DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id); + +out: +    g_free(buf); + +    return ret; +} + +/* + * Send I/O requests to the server. + * + * This function sends requests to the server, links the requests to + * the inflight_list in BDRVSheepdogState, and exits without + * waiting the response.  The responses are received in the + * `aio_read_response' function which is called from the main loop as + * a fd handler. 
+ * + * Returns 1 when we need to wait a response, 0 when there is no sent + * request and -errno in error cases. + */ +static int coroutine_fn sd_co_rw_vector(void *p) +{ +    SheepdogAIOCB *acb = p; +    int ret = 0; +    unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE; +    unsigned long idx; +    uint32_t object_size; +    uint64_t oid; +    uint64_t offset; +    BDRVSheepdogState *s = acb->common.bs->opaque; +    SheepdogInode *inode = &s->inode; +    AIOReq *aio_req; + +    if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) { +        /* +         * In the case we open the snapshot VDI, Sheepdog creates the +         * writable VDI when we do a write operation first. +         */ +        ret = sd_create_branch(s); +        if (ret) { +            acb->ret = -EIO; +            goto out; +        } +    } + +    object_size = (UINT32_C(1) << inode->block_size_shift); +    idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; +    offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size; + +    /* +     * Make sure we don't free the aiocb before we are done with all requests. +     * This additional reference is dropped at the end of this function. +     */ +    acb->nr_pending++; + +    while (done != total) { +        uint8_t flags = 0; +        uint64_t old_oid = 0; +        bool create = false; + +        oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); + +        len = MIN(total - done, object_size - offset); + +        switch (acb->aiocb_type) { +        case AIOCB_READ_UDATA: +            if (!inode->data_vdi_id[idx]) { +                qemu_iovec_memset(acb->qiov, done, 0, len); +                goto done; +            } +            break; +        case AIOCB_WRITE_UDATA: +            if (!inode->data_vdi_id[idx]) { +                create = true; +            } else if (!is_data_obj_writable(inode, idx)) { +                /* Copy-On-Write */ +                create = true; +                old_oid = oid; +                flags = SD_FLAG_CMD_COW; +            } +            break; +        case AIOCB_DISCARD_OBJ: +            /* +             * We discard the object only when the whole object is +             * 1) allocated 2) trimmed. Otherwise, simply skip it. 
+             */ +            if (len != object_size || inode->data_vdi_id[idx] == 0) { +                goto done; +            } +            break; +        default: +            break; +        } + +        if (create) { +            DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n", +                    inode->vdi_id, oid, +                    vid_to_data_oid(inode->data_vdi_id[idx], idx), idx); +            oid = vid_to_data_oid(inode->vdi_id, idx); +            DPRINTF("new oid %" PRIx64 "\n", oid); +        } + +        aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create, +                                old_oid, done); +        QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + +        add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, +                        acb->aiocb_type); +    done: +        offset = 0; +        idx++; +        done += len; +    } +out: +    if (!--acb->nr_pending) { +        return acb->ret; +    } +    return 1; +} + +static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb) +{ +    SheepdogAIOCB *cb; + +    QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { +        if (AIOCBOverwrapping(aiocb, cb)) { +            return true; +        } +    } + +    QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings); +    return false; +} + +static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, +                        int nb_sectors, QEMUIOVector *qiov) +{ +    SheepdogAIOCB *acb; +    int ret; +    int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE; +    BDRVSheepdogState *s = bs->opaque; + +    if (offset > s->inode.vdi_size) { +        ret = sd_truncate(bs, offset); +        if (ret < 0) { +            return ret; +        } +    } + +    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); +    acb->aio_done_func = sd_write_done; +    acb->aiocb_type = AIOCB_WRITE_UDATA; + +retry: +    if (check_overwrapping_aiocb(s, acb)) { +        qemu_co_queue_wait(&s->overwrapping_queue); +        goto retry; +    } + +    ret = sd_co_rw_vector(acb); +    if (ret <= 0) { +        QLIST_REMOVE(acb, aiocb_siblings); +        qemu_co_queue_restart_all(&s->overwrapping_queue); +        qemu_aio_unref(acb); +        return ret; +    } + +    qemu_coroutine_yield(); + +    QLIST_REMOVE(acb, aiocb_siblings); +    qemu_co_queue_restart_all(&s->overwrapping_queue); + +    return acb->ret; +} + +static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, +                       int nb_sectors, QEMUIOVector *qiov) +{ +    SheepdogAIOCB *acb; +    int ret; +    BDRVSheepdogState *s = bs->opaque; + +    acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); +    acb->aiocb_type = AIOCB_READ_UDATA; +    acb->aio_done_func = sd_finish_aiocb; + +retry: +    if (check_overwrapping_aiocb(s, acb)) { +        qemu_co_queue_wait(&s->overwrapping_queue); +        goto retry; +    } + +    ret = sd_co_rw_vector(acb); +    if (ret <= 0) { +        QLIST_REMOVE(acb, aiocb_siblings); +        qemu_co_queue_restart_all(&s->overwrapping_queue); +        qemu_aio_unref(acb); +        return ret; +    } + +    qemu_coroutine_yield(); + +    QLIST_REMOVE(acb, aiocb_siblings); +    qemu_co_queue_restart_all(&s->overwrapping_queue); +    return acb->ret; +} + +static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) +{ +    BDRVSheepdogState *s = bs->opaque; +    SheepdogAIOCB *acb; +    AIOReq *aio_req; + +    if (s->cache_flags != 
SD_FLAG_CMD_CACHE) { +        return 0; +    } + +    acb = sd_aio_setup(bs, NULL, 0, 0); +    acb->aiocb_type = AIOCB_FLUSH_CACHE; +    acb->aio_done_func = sd_finish_aiocb; + +    aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), +                            0, 0, 0, false, 0, 0); +    QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); +    add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type); + +    qemu_coroutine_yield(); +    return acb->ret; +} + +static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ +    Error *local_err = NULL; +    BDRVSheepdogState *s = bs->opaque; +    int ret, fd; +    uint32_t new_vid; +    SheepdogInode *inode; +    unsigned int datalen; + +    DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " " +            "is_snapshot %d\n", sn_info->name, sn_info->id_str, +            s->name, sn_info->vm_state_size, s->is_snapshot); + +    if (s->is_snapshot) { +        error_report("You can't create a snapshot of a snapshot VDI, " +                     "%s (%" PRIu32 ").", s->name, s->inode.vdi_id); + +        return -EINVAL; +    } + +    DPRINTF("%s %s\n", sn_info->name, sn_info->id_str); + +    s->inode.vm_state_size = sn_info->vm_state_size; +    s->inode.vm_clock_nsec = sn_info->vm_clock_nsec; +    /* It appears that inode.tag does not require a NUL terminator, +     * which means this use of strncpy is ok. +     */ +    strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag)); +    /* we don't need to update entire object */ +    datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); +    inode = g_malloc(datalen); + +    /* refresh inode. */ +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); +        ret = fd; +        goto cleanup; +    } + +    ret = write_object(fd, s->aio_context, (char *)&s->inode, +                       vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, +                       datalen, 0, false, s->cache_flags); +    if (ret < 0) { +        error_report("failed to write snapshot's inode."); +        goto cleanup; +    } + +    ret = do_sd_create(s, &new_vid, 1, &local_err); +    if (ret < 0) { +        error_report("failed to create inode for snapshot: %s", +                     error_get_pretty(local_err)); +        error_free(local_err); +        goto cleanup; +    } + +    ret = read_object(fd, s->aio_context, (char *)inode, +                      vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0, +                      s->cache_flags); + +    if (ret < 0) { +        error_report("failed to read new inode info. 
%s", strerror(errno)); +        goto cleanup; +    } + +    memcpy(&s->inode, inode, datalen); +    DPRINTF("s->inode: name %s snap_id %x oid %x\n", +            s->inode.name, s->inode.snap_id, s->inode.vdi_id); + +cleanup: +    g_free(inode); +    closesocket(fd); +    return ret; +} + +/* + * We implement rollback(loadvm) operation to the specified snapshot by + * 1) switch to the snapshot + * 2) rely on sd_create_branch to delete working VDI and + * 3) create a new working VDI based on the specified snapshot + */ +static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ +    BDRVSheepdogState *s = bs->opaque; +    BDRVSheepdogState *old_s; +    char tag[SD_MAX_VDI_TAG_LEN]; +    uint32_t snapid = 0; +    int ret = 0; + +    old_s = g_new(BDRVSheepdogState, 1); + +    memcpy(old_s, s, sizeof(BDRVSheepdogState)); + +    snapid = strtoul(snapshot_id, NULL, 10); +    if (snapid) { +        tag[0] = 0; +    } else { +        pstrcpy(tag, sizeof(tag), snapshot_id); +    } + +    ret = reload_inode(s, snapid, tag); +    if (ret) { +        goto out; +    } + +    ret = sd_create_branch(s); +    if (ret) { +        goto out; +    } + +    g_free(old_s); + +    return 0; +out: +    /* recover bdrv_sd_state */ +    memcpy(s, old_s, sizeof(BDRVSheepdogState)); +    g_free(old_s); + +    error_report("failed to open. recover old bdrv_sd_state."); + +    return ret; +} + +static int sd_snapshot_delete(BlockDriverState *bs, +                              const char *snapshot_id, +                              const char *name, +                              Error **errp) +{ +    /* FIXME: Delete specified snapshot id.  */ +    return 0; +} + +static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ +    Error *local_err = NULL; +    BDRVSheepdogState *s = bs->opaque; +    SheepdogReq req; +    int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long); +    QEMUSnapshotInfo *sn_tab = NULL; +    unsigned wlen, rlen; +    int found = 0; +    static SheepdogInode inode; +    unsigned long *vdi_inuse; +    unsigned int start_nr; +    uint64_t hval; +    uint32_t vid; + +    vdi_inuse = g_malloc(max); + +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); +        ret = fd; +        goto out; +    } + +    rlen = max; +    wlen = 0; + +    memset(&req, 0, sizeof(req)); + +    req.opcode = SD_OP_READ_VDIS; +    req.data_length = max; + +    ret = do_req(fd, s->aio_context, (SheepdogReq *)&req, +                 vdi_inuse, &wlen, &rlen); + +    closesocket(fd); +    if (ret) { +        goto out; +    } + +    sn_tab = g_new0(QEMUSnapshotInfo, nr); + +    /* calculate a vdi id with hash function */ +    hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT); +    start_nr = hval & (SD_NR_VDIS - 1); + +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); +        ret = fd; +        goto out; +    } + +    for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) { +        if (!test_bit(vid, vdi_inuse)) { +            break; +        } + +        /* we don't need to read entire object */ +        ret = read_object(fd, s->aio_context, (char *)&inode, +                          vid_to_vdi_oid(vid), +                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0, +                          s->cache_flags); + +        if (ret) { +            continue; +        } + +        if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) { +            
sn_tab[found].date_sec = inode.snap_ctime >> 32; +            sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff; +            sn_tab[found].vm_state_size = inode.vm_state_size; +            sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec; + +            snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), +                     "%" PRIu32, inode.snap_id); +            pstrcpy(sn_tab[found].name, +                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)), +                    inode.tag); +            found++; +        } +    } + +    closesocket(fd); +out: +    *psn_tab = sn_tab; + +    g_free(vdi_inuse); + +    if (ret < 0) { +        return ret; +    } + +    return found; +} + +static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, +                                int64_t pos, int size, int load) +{ +    Error *local_err = NULL; +    bool create; +    int fd, ret = 0, remaining = size; +    unsigned int data_len; +    uint64_t vmstate_oid; +    uint64_t offset; +    uint32_t vdi_index; +    uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id; +    uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift); + +    fd = connect_to_sdog(s, &local_err); +    if (fd < 0) { +        error_report_err(local_err); +        return fd; +    } + +    while (remaining) { +        vdi_index = pos / object_size; +        offset = pos % object_size; + +        data_len = MIN(remaining, object_size - offset); + +        vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index); + +        create = (offset == 0); +        if (load) { +            ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid, +                              s->inode.nr_copies, data_len, offset, +                              s->cache_flags); +        } else { +            ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid, +                               s->inode.nr_copies, data_len, offset, create, +                               s->cache_flags); +        } + +        if (ret < 0) { +            error_report("failed to save vmstate %s", strerror(errno)); +            goto cleanup; +        } + +        pos += data_len; +        data += data_len; +        remaining -= data_len; +    } +    ret = size; +cleanup: +    closesocket(fd); +    return ret; +} + +static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, +                           int64_t pos) +{ +    BDRVSheepdogState *s = bs->opaque; +    void *buf; +    int ret; + +    buf = qemu_blockalign(bs, qiov->size); +    qemu_iovec_to_buf(qiov, 0, buf, qiov->size); +    ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0); +    qemu_vfree(buf); + +    return ret; +} + +static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, +                           int64_t pos, int size) +{ +    BDRVSheepdogState *s = bs->opaque; + +    return do_load_save_vmstate(s, data, pos, size, 1); +} + + +static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, +                                      int nb_sectors) +{ +    SheepdogAIOCB *acb; +    QEMUIOVector dummy; +    BDRVSheepdogState *s = bs->opaque; +    int ret; + +    if (!s->discard_supported) { +            return 0; +    } + +    acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors); +    acb->aiocb_type = AIOCB_DISCARD_OBJ; +    acb->aio_done_func = sd_finish_aiocb; + +retry: +    if (check_overwrapping_aiocb(s, acb)) { +        qemu_co_queue_wait(&s->overwrapping_queue); +        goto retry; +    } + +  
  ret = sd_co_rw_vector(acb); +    if (ret <= 0) { +        QLIST_REMOVE(acb, aiocb_siblings); +        qemu_co_queue_restart_all(&s->overwrapping_queue); +        qemu_aio_unref(acb); +        return ret; +    } + +    qemu_coroutine_yield(); + +    QLIST_REMOVE(acb, aiocb_siblings); +    qemu_co_queue_restart_all(&s->overwrapping_queue); + +    return acb->ret; +} + +static coroutine_fn int64_t +sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, +                       int *pnum) +{ +    BDRVSheepdogState *s = bs->opaque; +    SheepdogInode *inode = &s->inode; +    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); +    uint64_t offset = sector_num * BDRV_SECTOR_SIZE; +    unsigned long start = offset / object_size, +                  end = DIV_ROUND_UP((sector_num + nb_sectors) * +                                     BDRV_SECTOR_SIZE, object_size); +    unsigned long idx; +    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + +    for (idx = start; idx < end; idx++) { +        if (inode->data_vdi_id[idx] == 0) { +            break; +        } +    } +    if (idx == start) { +        /* Get the longest length of unallocated sectors */ +        ret = 0; +        for (idx = start + 1; idx < end; idx++) { +            if (inode->data_vdi_id[idx] != 0) { +                break; +            } +        } +    } + +    *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE; +    if (*pnum > nb_sectors) { +        *pnum = nb_sectors; +    } +    return ret; +} + +static int64_t sd_get_allocated_file_size(BlockDriverState *bs) +{ +    BDRVSheepdogState *s = bs->opaque; +    SheepdogInode *inode = &s->inode; +    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); +    unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size); +    uint64_t size = 0; + +    for (i = 0; i < last; i++) { +        if (inode->data_vdi_id[i] == 0) { +            continue; +        } +        size += object_size; +    } +    return size; +} + +static QemuOptsList sd_create_opts = { +    .name = "sheepdog-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_BACKING_FILE, +            .type = QEMU_OPT_STRING, +            .help = "File name of a base image" +        }, +        { +            .name = BLOCK_OPT_PREALLOC, +            .type = QEMU_OPT_STRING, +            .help = "Preallocation mode (allowed values: off, full)" +        }, +        { +            .name = BLOCK_OPT_REDUNDANCY, +            .type = QEMU_OPT_STRING, +            .help = "Redundancy of the image" +        }, +        { +            .name = BLOCK_OPT_OBJECT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Object size of the image" +        }, +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_sheepdog = { +    .format_name    = "sheepdog", +    .protocol_name  = "sheepdog", +    .instance_size  = sizeof(BDRVSheepdogState), +    .bdrv_needs_filename = true, +    .bdrv_file_open = sd_open, +    .bdrv_close     = sd_close, +    .bdrv_create    = sd_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, +    .bdrv_getlength = sd_getlength, +    .bdrv_get_allocated_file_size = sd_get_allocated_file_size, +    .bdrv_truncate  = sd_truncate, + +    .bdrv_co_readv  = sd_co_readv, +    .bdrv_co_writev = sd_co_writev, +    
.bdrv_co_flush_to_disk  = sd_co_flush_to_disk, +    .bdrv_co_discard = sd_co_discard, +    .bdrv_co_get_block_status = sd_co_get_block_status, + +    .bdrv_snapshot_create   = sd_snapshot_create, +    .bdrv_snapshot_goto     = sd_snapshot_goto, +    .bdrv_snapshot_delete   = sd_snapshot_delete, +    .bdrv_snapshot_list     = sd_snapshot_list, + +    .bdrv_save_vmstate  = sd_save_vmstate, +    .bdrv_load_vmstate  = sd_load_vmstate, + +    .bdrv_detach_aio_context = sd_detach_aio_context, +    .bdrv_attach_aio_context = sd_attach_aio_context, + +    .create_opts    = &sd_create_opts, +}; + +static BlockDriver bdrv_sheepdog_tcp = { +    .format_name    = "sheepdog", +    .protocol_name  = "sheepdog+tcp", +    .instance_size  = sizeof(BDRVSheepdogState), +    .bdrv_needs_filename = true, +    .bdrv_file_open = sd_open, +    .bdrv_close     = sd_close, +    .bdrv_create    = sd_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, +    .bdrv_getlength = sd_getlength, +    .bdrv_get_allocated_file_size = sd_get_allocated_file_size, +    .bdrv_truncate  = sd_truncate, + +    .bdrv_co_readv  = sd_co_readv, +    .bdrv_co_writev = sd_co_writev, +    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk, +    .bdrv_co_discard = sd_co_discard, +    .bdrv_co_get_block_status = sd_co_get_block_status, + +    .bdrv_snapshot_create   = sd_snapshot_create, +    .bdrv_snapshot_goto     = sd_snapshot_goto, +    .bdrv_snapshot_delete   = sd_snapshot_delete, +    .bdrv_snapshot_list     = sd_snapshot_list, + +    .bdrv_save_vmstate  = sd_save_vmstate, +    .bdrv_load_vmstate  = sd_load_vmstate, + +    .bdrv_detach_aio_context = sd_detach_aio_context, +    .bdrv_attach_aio_context = sd_attach_aio_context, + +    .create_opts    = &sd_create_opts, +}; + +static BlockDriver bdrv_sheepdog_unix = { +    .format_name    = "sheepdog", +    .protocol_name  = "sheepdog+unix", +    .instance_size  = sizeof(BDRVSheepdogState), +    .bdrv_needs_filename = true, +    .bdrv_file_open = sd_open, +    .bdrv_close     = sd_close, +    .bdrv_create    = sd_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, +    .bdrv_getlength = sd_getlength, +    .bdrv_get_allocated_file_size = sd_get_allocated_file_size, +    .bdrv_truncate  = sd_truncate, + +    .bdrv_co_readv  = sd_co_readv, +    .bdrv_co_writev = sd_co_writev, +    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk, +    .bdrv_co_discard = sd_co_discard, +    .bdrv_co_get_block_status = sd_co_get_block_status, + +    .bdrv_snapshot_create   = sd_snapshot_create, +    .bdrv_snapshot_goto     = sd_snapshot_goto, +    .bdrv_snapshot_delete   = sd_snapshot_delete, +    .bdrv_snapshot_list     = sd_snapshot_list, + +    .bdrv_save_vmstate  = sd_save_vmstate, +    .bdrv_load_vmstate  = sd_load_vmstate, + +    .bdrv_detach_aio_context = sd_detach_aio_context, +    .bdrv_attach_aio_context = sd_attach_aio_context, + +    .create_opts    = &sd_create_opts, +}; + +static void bdrv_sheepdog_init(void) +{ +    bdrv_register(&bdrv_sheepdog); +    bdrv_register(&bdrv_sheepdog_tcp); +    bdrv_register(&bdrv_sheepdog_unix); +} +block_init(bdrv_sheepdog_init); diff --git a/block/snapshot.c b/block/snapshot.c new file mode 100644 index 00000000..49e143e9 --- /dev/null +++ b/block/snapshot.c @@ -0,0 +1,358 @@ +/* + * Block layer snapshot related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without 
restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block/snapshot.h"
+#include "block/block_int.h"
+#include "qapi/qmp/qerror.h"
+
+QemuOptsList internal_snapshot_opts = {
+    .name = "snapshot",
+    .head = QTAILQ_HEAD_INITIALIZER(internal_snapshot_opts.head),
+    .desc = {
+        {
+            .name = SNAPSHOT_OPT_ID,
+            .type = QEMU_OPT_STRING,
+            .help = "snapshot id"
+        },{
+            .name = SNAPSHOT_OPT_NAME,
+            .type = QEMU_OPT_STRING,
+            .help = "snapshot name"
+        },{
+            /* end of list */
+        }
+    },
+};
+
+int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
+                       const char *name)
+{
+    QEMUSnapshotInfo *sn_tab, *sn;
+    int nb_sns, i, ret;
+
+    ret = -ENOENT;
+    nb_sns = bdrv_snapshot_list(bs, &sn_tab);
+    if (nb_sns < 0) {
+        return ret;
+    }
+    for (i = 0; i < nb_sns; i++) {
+        sn = &sn_tab[i];
+        if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) {
+            *sn_info = *sn;
+            ret = 0;
+            break;
+        }
+    }
+    g_free(sn_tab);
+    return ret;
+}
+
+/**
+ * Look up an internal snapshot by @id and @name.
+ * @bs: block device to search
+ * @id: unique snapshot ID, or NULL
+ * @name: snapshot name, or NULL
+ * @sn_info: location to store information on the snapshot found
+ * @errp: location to store error; only set on real failures, not when no
+ *        snapshot matches
+ *
+ * This function traverses the snapshot list of @bs looking for a match;
+ * @id and @name are the matching conditions:
+ * If both @id and @name are specified, find the first one with id @id and
+ * name @name.
+ * If only @id is specified, find the first one with id @id.
+ * If only @name is specified, find the first one with name @name.
+ * If neither is specified, abort().
+ *
+ * Returns: true when a snapshot is found and @sn_info has been filled, false
+ * on error or when nothing matches. If the lookup itself succeeds but no
+ * matching snapshot exists, @errp will NOT be set.
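+ *
+ * A purely illustrative caller sketch (the snapshot name, variable names and
+ * error handling below are invented for this example, not taken from QEMU):
+ *
+ *     QEMUSnapshotInfo sn;
+ *     Error *local_err = NULL;
+ *     if (bdrv_snapshot_find_by_id_and_name(bs, NULL, "before-upgrade",
+ *                                           &sn, &local_err)) {
+ *         ... sn.id_str and sn.name now describe the match ...
+ *     } else if (local_err) {
+ *         ... a real failure occurred, not just "not found" ...
+ *     }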
+ */ +bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, +                                       const char *id, +                                       const char *name, +                                       QEMUSnapshotInfo *sn_info, +                                       Error **errp) +{ +    QEMUSnapshotInfo *sn_tab, *sn; +    int nb_sns, i; +    bool ret = false; + +    assert(id || name); + +    nb_sns = bdrv_snapshot_list(bs, &sn_tab); +    if (nb_sns < 0) { +        error_setg_errno(errp, -nb_sns, "Failed to get a snapshot list"); +        return false; +    } else if (nb_sns == 0) { +        return false; +    } + +    if (id && name) { +        for (i = 0; i < nb_sns; i++) { +            sn = &sn_tab[i]; +            if (!strcmp(sn->id_str, id) && !strcmp(sn->name, name)) { +                *sn_info = *sn; +                ret = true; +                break; +            } +        } +    } else if (id) { +        for (i = 0; i < nb_sns; i++) { +            sn = &sn_tab[i]; +            if (!strcmp(sn->id_str, id)) { +                *sn_info = *sn; +                ret = true; +                break; +            } +        } +    } else if (name) { +        for (i = 0; i < nb_sns; i++) { +            sn = &sn_tab[i]; +            if (!strcmp(sn->name, name)) { +                *sn_info = *sn; +                ret = true; +                break; +            } +        } +    } + +    g_free(sn_tab); +    return ret; +} + +int bdrv_can_snapshot(BlockDriverState *bs) +{ +    BlockDriver *drv = bs->drv; +    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { +        return 0; +    } + +    if (!drv->bdrv_snapshot_create) { +        if (bs->file != NULL) { +            return bdrv_can_snapshot(bs->file); +        } +        return 0; +    } + +    return 1; +} + +int bdrv_snapshot_create(BlockDriverState *bs, +                         QEMUSnapshotInfo *sn_info) +{ +    BlockDriver *drv = bs->drv; +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (drv->bdrv_snapshot_create) { +        return drv->bdrv_snapshot_create(bs, sn_info); +    } +    if (bs->file) { +        return bdrv_snapshot_create(bs->file, sn_info); +    } +    return -ENOTSUP; +} + +int bdrv_snapshot_goto(BlockDriverState *bs, +                       const char *snapshot_id) +{ +    BlockDriver *drv = bs->drv; +    int ret, open_ret; + +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (drv->bdrv_snapshot_goto) { +        return drv->bdrv_snapshot_goto(bs, snapshot_id); +    } + +    if (bs->file) { +        drv->bdrv_close(bs); +        ret = bdrv_snapshot_goto(bs->file, snapshot_id); +        open_ret = drv->bdrv_open(bs, NULL, bs->open_flags, NULL); +        if (open_ret < 0) { +            bdrv_unref(bs->file); +            bs->drv = NULL; +            return open_ret; +        } +        return ret; +    } + +    return -ENOTSUP; +} + +/** + * Delete an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, delete the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, delete the first one with id + * @snapshot_id. + * If only @name is specified, delete the first one with name @name. + * if none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on failure. If @bs is not inserted, return + * -ENOMEDIUM. 
If @snapshot_id and @name are both NULL, return -EINVAL. If @bs + * does not support internal snapshot deletion, return -ENOTSUP. If @bs does + * not support parameter @snapshot_id or @name, or one of them is not correctly + * specified, return -EINVAL. If @bs can't find one matching @id and @name, + * return -ENOENT. If @errp != NULL, it will always be filled with error + * message on failure. + */ +int bdrv_snapshot_delete(BlockDriverState *bs, +                         const char *snapshot_id, +                         const char *name, +                         Error **errp) +{ +    BlockDriver *drv = bs->drv; +    if (!drv) { +        error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); +        return -ENOMEDIUM; +    } +    if (!snapshot_id && !name) { +        error_setg(errp, "snapshot_id and name are both NULL"); +        return -EINVAL; +    } + +    /* drain all pending i/o before deleting snapshot */ +    bdrv_drain(bs); + +    if (drv->bdrv_snapshot_delete) { +        return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); +    } +    if (bs->file) { +        return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); +    } +    error_setg(errp, "Block format '%s' used by device '%s' " +               "does not support internal snapshot deletion", +               drv->format_name, bdrv_get_device_name(bs)); +    return -ENOTSUP; +} + +void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, +                                        const char *id_or_name, +                                        Error **errp) +{ +    int ret; +    Error *local_err = NULL; + +    ret = bdrv_snapshot_delete(bs, id_or_name, NULL, &local_err); +    if (ret == -ENOENT || ret == -EINVAL) { +        error_free(local_err); +        local_err = NULL; +        ret = bdrv_snapshot_delete(bs, NULL, id_or_name, &local_err); +    } + +    if (ret < 0) { +        error_propagate(errp, local_err); +    } +} + +int bdrv_snapshot_list(BlockDriverState *bs, +                       QEMUSnapshotInfo **psn_info) +{ +    BlockDriver *drv = bs->drv; +    if (!drv) { +        return -ENOMEDIUM; +    } +    if (drv->bdrv_snapshot_list) { +        return drv->bdrv_snapshot_list(bs, psn_info); +    } +    if (bs->file) { +        return bdrv_snapshot_list(bs->file, psn_info); +    } +    return -ENOTSUP; +} + +/** + * Temporarily load an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, load the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, load the first one with id + * @snapshot_id. + * If only @name is specified, load the first one with name @name. + * if none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on fail. If @bs is not inserted, return + * -ENOMEDIUM. If @bs is not readonly, return -EINVAL. If @bs did not support + * internal snapshot, return -ENOTSUP. If qemu can't find a matching @id and + * @name, return -ENOENT. If @errp != NULL, it will always be filled on + * failure. 
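+ *
+ * A minimal caller sketch (the snapshot name and the error handling are
+ * illustrative only; as stated above, @bs must already be read-only):
+ *
+ *     Error *local_err = NULL;
+ *     if (bdrv_snapshot_load_tmp(bs, NULL, "clean-state", &local_err) < 0) {
+ *         ... report and free local_err ...
+ *     }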
+ */ +int bdrv_snapshot_load_tmp(BlockDriverState *bs, +                           const char *snapshot_id, +                           const char *name, +                           Error **errp) +{ +    BlockDriver *drv = bs->drv; + +    if (!drv) { +        error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); +        return -ENOMEDIUM; +    } +    if (!snapshot_id && !name) { +        error_setg(errp, "snapshot_id and name are both NULL"); +        return -EINVAL; +    } +    if (!bs->read_only) { +        error_setg(errp, "Device is not readonly"); +        return -EINVAL; +    } +    if (drv->bdrv_snapshot_load_tmp) { +        return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); +    } +    error_setg(errp, "Block format '%s' used by device '%s' " +               "does not support temporarily loading internal snapshots", +               drv->format_name, bdrv_get_device_name(bs)); +    return -ENOTSUP; +} + +int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, +                                         const char *id_or_name, +                                         Error **errp) +{ +    int ret; +    Error *local_err = NULL; + +    ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err); +    if (ret == -ENOENT || ret == -EINVAL) { +        error_free(local_err); +        local_err = NULL; +        ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err); +    } + +    if (local_err) { +        error_propagate(errp, local_err); +    } + +    return ret; +} diff --git a/block/ssh.c b/block/ssh.c new file mode 100644 index 00000000..8d067390 --- /dev/null +++ b/block/ssh.c @@ -0,0 +1,1114 @@ +/* + * Secure Shell (ssh) backend for QEMU. + * + * Copyright (C) 2013 Red Hat Inc., Richard W.M. Jones <rjones@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> + +#include <libssh2.h> +#include <libssh2_sftp.h> + +#include "block/block_int.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "qemu/uri.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" + +/* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in + * this block driver code. + * + * TRACE_LIBSSH2=<bitmask> enables tracing in libssh2 itself.  Note + * that this requires that libssh2 was specially compiled with the + * `./configure --enable-debug' option, so most likely you will have + * to compile it yourself.  
The meaning of <bitmask> is described + * here: http://www.libssh2.org/libssh2_trace.html + */ +#define DEBUG_SSH     0 +#define TRACE_LIBSSH2 0 /* or try: LIBSSH2_TRACE_SFTP */ + +#define DPRINTF(fmt, ...)                           \ +    do {                                            \ +        if (DEBUG_SSH) {                            \ +            fprintf(stderr, "ssh: %-15s " fmt "\n", \ +                    __func__, ##__VA_ARGS__);       \ +        }                                           \ +    } while (0) + +typedef struct BDRVSSHState { +    /* Coroutine. */ +    CoMutex lock; + +    /* SSH connection. */ +    int sock;                         /* socket */ +    LIBSSH2_SESSION *session;         /* ssh session */ +    LIBSSH2_SFTP *sftp;               /* sftp session */ +    LIBSSH2_SFTP_HANDLE *sftp_handle; /* sftp remote file handle */ + +    /* See ssh_seek() function below. */ +    int64_t offset; +    bool offset_op_read; + +    /* File attributes at open.  We try to keep the .filesize field +     * updated if it changes (eg by writing at the end of the file). +     */ +    LIBSSH2_SFTP_ATTRIBUTES attrs; + +    /* Used to warn if 'flush' is not supported. */ +    char *hostport; +    bool unsafe_flush_warning; +} BDRVSSHState; + +static void ssh_state_init(BDRVSSHState *s) +{ +    memset(s, 0, sizeof *s); +    s->sock = -1; +    s->offset = -1; +    qemu_co_mutex_init(&s->lock); +} + +static void ssh_state_free(BDRVSSHState *s) +{ +    g_free(s->hostport); +    if (s->sftp_handle) { +        libssh2_sftp_close(s->sftp_handle); +    } +    if (s->sftp) { +        libssh2_sftp_shutdown(s->sftp); +    } +    if (s->session) { +        libssh2_session_disconnect(s->session, +                                   "from qemu ssh client: " +                                   "user closed the connection"); +        libssh2_session_free(s->session); +    } +    if (s->sock >= 0) { +        close(s->sock); +    } +} + +static void GCC_FMT_ATTR(3, 4) +session_error_setg(Error **errp, BDRVSSHState *s, const char *fs, ...) +{ +    va_list args; +    char *msg; + +    va_start(args, fs); +    msg = g_strdup_vprintf(fs, args); +    va_end(args); + +    if (s->session) { +        char *ssh_err; +        int ssh_err_code; + +        /* This is not an errno.  See <libssh2.h>. */ +        ssh_err_code = libssh2_session_last_error(s->session, +                                                  &ssh_err, NULL, 0); +        error_setg(errp, "%s: %s (libssh2 error code: %d)", +                   msg, ssh_err, ssh_err_code); +    } else { +        error_setg(errp, "%s", msg); +    } +    g_free(msg); +} + +static void GCC_FMT_ATTR(3, 4) +sftp_error_setg(Error **errp, BDRVSSHState *s, const char *fs, ...) +{ +    va_list args; +    char *msg; + +    va_start(args, fs); +    msg = g_strdup_vprintf(fs, args); +    va_end(args); + +    if (s->sftp) { +        char *ssh_err; +        int ssh_err_code; +        unsigned long sftp_err_code; + +        /* This is not an errno.  See <libssh2.h>. */ +        ssh_err_code = libssh2_session_last_error(s->session, +                                                  &ssh_err, NULL, 0); +        /* See <libssh2_sftp.h>. 
*/ +        sftp_err_code = libssh2_sftp_last_error((s)->sftp); + +        error_setg(errp, +                   "%s: %s (libssh2 error code: %d, sftp error code: %lu)", +                   msg, ssh_err, ssh_err_code, sftp_err_code); +    } else { +        error_setg(errp, "%s", msg); +    } +    g_free(msg); +} + +static void GCC_FMT_ATTR(2, 3) +sftp_error_report(BDRVSSHState *s, const char *fs, ...) +{ +    va_list args; + +    va_start(args, fs); +    error_vprintf(fs, args); + +    if ((s)->sftp) { +        char *ssh_err; +        int ssh_err_code; +        unsigned long sftp_err_code; + +        /* This is not an errno.  See <libssh2.h>. */ +        ssh_err_code = libssh2_session_last_error(s->session, +                                                  &ssh_err, NULL, 0); +        /* See <libssh2_sftp.h>. */ +        sftp_err_code = libssh2_sftp_last_error((s)->sftp); + +        error_printf(": %s (libssh2 error code: %d, sftp error code: %lu)", +                     ssh_err, ssh_err_code, sftp_err_code); +    } + +    va_end(args); +    error_printf("\n"); +} + +static int parse_uri(const char *filename, QDict *options, Error **errp) +{ +    URI *uri = NULL; +    QueryParams *qp = NULL; +    int i; + +    uri = uri_parse(filename); +    if (!uri) { +        return -EINVAL; +    } + +    if (strcmp(uri->scheme, "ssh") != 0) { +        error_setg(errp, "URI scheme must be 'ssh'"); +        goto err; +    } + +    if (!uri->server || strcmp(uri->server, "") == 0) { +        error_setg(errp, "missing hostname in URI"); +        goto err; +    } + +    if (!uri->path || strcmp(uri->path, "") == 0) { +        error_setg(errp, "missing remote path in URI"); +        goto err; +    } + +    qp = query_params_parse(uri->query); +    if (!qp) { +        error_setg(errp, "could not parse query parameters"); +        goto err; +    } + +    if(uri->user && strcmp(uri->user, "") != 0) { +        qdict_put(options, "user", qstring_from_str(uri->user)); +    } + +    qdict_put(options, "host", qstring_from_str(uri->server)); + +    if (uri->port) { +        qdict_put(options, "port", qint_from_int(uri->port)); +    } + +    qdict_put(options, "path", qstring_from_str(uri->path)); + +    /* Pick out any query parameters that we understand, and ignore +     * the rest. 
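+     *
+     * For instance, a (hypothetical) filename such as
+     *
+     *     ssh://user@host:2222/vm/disk.img?host_key_check=no
+     *
+     * results in only the host_key_check value being copied into @options;
+     * any other query parameter after the '?' is dropped.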
+     */ +    for (i = 0; i < qp->n; ++i) { +        if (strcmp(qp->p[i].name, "host_key_check") == 0) { +            qdict_put(options, "host_key_check", +                      qstring_from_str(qp->p[i].value)); +        } +    } + +    query_params_free(qp); +    uri_free(uri); +    return 0; + + err: +    if (qp) { +      query_params_free(qp); +    } +    if (uri) { +      uri_free(uri); +    } +    return -EINVAL; +} + +static void ssh_parse_filename(const char *filename, QDict *options, +                               Error **errp) +{ +    if (qdict_haskey(options, "user") || +        qdict_haskey(options, "host") || +        qdict_haskey(options, "port") || +        qdict_haskey(options, "path") || +        qdict_haskey(options, "host_key_check")) { +        error_setg(errp, "user, host, port, path, host_key_check cannot be used at the same time as a file option"); +        return; +    } + +    parse_uri(filename, options, errp); +} + +static int check_host_key_knownhosts(BDRVSSHState *s, +                                     const char *host, int port, Error **errp) +{ +    const char *home; +    char *knh_file = NULL; +    LIBSSH2_KNOWNHOSTS *knh = NULL; +    struct libssh2_knownhost *found; +    int ret, r; +    const char *hostkey; +    size_t len; +    int type; + +    hostkey = libssh2_session_hostkey(s->session, &len, &type); +    if (!hostkey) { +        ret = -EINVAL; +        session_error_setg(errp, s, "failed to read remote host key"); +        goto out; +    } + +    knh = libssh2_knownhost_init(s->session); +    if (!knh) { +        ret = -EINVAL; +        session_error_setg(errp, s, +                           "failed to initialize known hosts support"); +        goto out; +    } + +    home = getenv("HOME"); +    if (home) { +        knh_file = g_strdup_printf("%s/.ssh/known_hosts", home); +    } else { +        knh_file = g_strdup_printf("/root/.ssh/known_hosts"); +    } + +    /* Read all known hosts from OpenSSH-style known_hosts file. */ +    libssh2_knownhost_readfile(knh, knh_file, LIBSSH2_KNOWNHOST_FILE_OPENSSH); + +    r = libssh2_knownhost_checkp(knh, host, port, hostkey, len, +                                 LIBSSH2_KNOWNHOST_TYPE_PLAIN| +                                 LIBSSH2_KNOWNHOST_KEYENC_RAW, +                                 &found); +    switch (r) { +    case LIBSSH2_KNOWNHOST_CHECK_MATCH: +        /* OK */ +        DPRINTF("host key OK: %s", found->key); +        break; +    case LIBSSH2_KNOWNHOST_CHECK_MISMATCH: +        ret = -EINVAL; +        session_error_setg(errp, s, +                      "host key does not match the one in known_hosts" +                      " (found key %s)", found->key); +        goto out; +    case LIBSSH2_KNOWNHOST_CHECK_NOTFOUND: +        ret = -EINVAL; +        session_error_setg(errp, s, "no host key was found in known_hosts"); +        goto out; +    case LIBSSH2_KNOWNHOST_CHECK_FAILURE: +        ret = -EINVAL; +        session_error_setg(errp, s, +                      "failure matching the host key with known_hosts"); +        goto out; +    default: +        ret = -EINVAL; +        session_error_setg(errp, s, "unknown error matching the host key" +                      " with known_hosts (%d)", r); +        goto out; +    } + +    /* known_hosts checking successful. 
*/ +    ret = 0; + + out: +    if (knh != NULL) { +        libssh2_knownhost_free(knh); +    } +    g_free(knh_file); +    return ret; +} + +static unsigned hex2decimal(char ch) +{ +    if (ch >= '0' && ch <= '9') { +        return (ch - '0'); +    } else if (ch >= 'a' && ch <= 'f') { +        return 10 + (ch - 'a'); +    } else if (ch >= 'A' && ch <= 'F') { +        return 10 + (ch - 'A'); +    } + +    return -1; +} + +/* Compare the binary fingerprint (hash of host key) with the + * host_key_check parameter. + */ +static int compare_fingerprint(const unsigned char *fingerprint, size_t len, +                               const char *host_key_check) +{ +    unsigned c; + +    while (len > 0) { +        while (*host_key_check == ':') +            host_key_check++; +        if (!qemu_isxdigit(host_key_check[0]) || +            !qemu_isxdigit(host_key_check[1])) +            return 1; +        c = hex2decimal(host_key_check[0]) * 16 + +            hex2decimal(host_key_check[1]); +        if (c - *fingerprint != 0) +            return c - *fingerprint; +        fingerprint++; +        len--; +        host_key_check += 2; +    } +    return *host_key_check - '\0'; +} + +static int +check_host_key_hash(BDRVSSHState *s, const char *hash, +                    int hash_type, size_t fingerprint_len, Error **errp) +{ +    const char *fingerprint; + +    fingerprint = libssh2_hostkey_hash(s->session, hash_type); +    if (!fingerprint) { +        session_error_setg(errp, s, "failed to read remote host key"); +        return -EINVAL; +    } + +    if(compare_fingerprint((unsigned char *) fingerprint, fingerprint_len, +                           hash) != 0) { +        error_setg(errp, "remote host key does not match host_key_check '%s'", +                   hash); +        return -EPERM; +    } + +    return 0; +} + +static int check_host_key(BDRVSSHState *s, const char *host, int port, +                          const char *host_key_check, Error **errp) +{ +    /* host_key_check=no */ +    if (strcmp(host_key_check, "no") == 0) { +        return 0; +    } + +    /* host_key_check=md5:xx:yy:zz:... */ +    if (strncmp(host_key_check, "md5:", 4) == 0) { +        return check_host_key_hash(s, &host_key_check[4], +                                   LIBSSH2_HOSTKEY_HASH_MD5, 16, errp); +    } + +    /* host_key_check=sha1:xx:yy:zz:... */ +    if (strncmp(host_key_check, "sha1:", 5) == 0) { +        return check_host_key_hash(s, &host_key_check[5], +                                   LIBSSH2_HOSTKEY_HASH_SHA1, 20, errp); +    } + +    /* host_key_check=yes */ +    if (strcmp(host_key_check, "yes") == 0) { +        return check_host_key_knownhosts(s, host, port, errp); +    } + +    error_setg(errp, "unknown host_key_check setting (%s)", host_key_check); +    return -EINVAL; +} + +static int authenticate(BDRVSSHState *s, const char *user, Error **errp) +{ +    int r, ret; +    const char *userauthlist; +    LIBSSH2_AGENT *agent = NULL; +    struct libssh2_agent_publickey *identity; +    struct libssh2_agent_publickey *prev_identity = NULL; + +    userauthlist = libssh2_userauth_list(s->session, user, strlen(user)); +    if (strstr(userauthlist, "publickey") == NULL) { +        ret = -EPERM; +        error_setg(errp, +                "remote server does not support \"publickey\" authentication"); +        goto out; +    } + +    /* Connect to ssh-agent and try each identity in turn. 
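+     * Each public key held by the agent is offered to the server in turn
+     * until one is accepted or the list is exhausted, much like the
+     * OpenSSH client does when an agent is running.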
*/ +    agent = libssh2_agent_init(s->session); +    if (!agent) { +        ret = -EINVAL; +        session_error_setg(errp, s, "failed to initialize ssh-agent support"); +        goto out; +    } +    if (libssh2_agent_connect(agent)) { +        ret = -ECONNREFUSED; +        session_error_setg(errp, s, "failed to connect to ssh-agent"); +        goto out; +    } +    if (libssh2_agent_list_identities(agent)) { +        ret = -EINVAL; +        session_error_setg(errp, s, +                           "failed requesting identities from ssh-agent"); +        goto out; +    } + +    for(;;) { +        r = libssh2_agent_get_identity(agent, &identity, prev_identity); +        if (r == 1) {           /* end of list */ +            break; +        } +        if (r < 0) { +            ret = -EINVAL; +            session_error_setg(errp, s, +                               "failed to obtain identity from ssh-agent"); +            goto out; +        } +        r = libssh2_agent_userauth(agent, user, identity); +        if (r == 0) { +            /* Authenticated! */ +            ret = 0; +            goto out; +        } +        /* Failed to authenticate with this identity, try the next one. */ +        prev_identity = identity; +    } + +    ret = -EPERM; +    error_setg(errp, "failed to authenticate using publickey authentication " +               "and the identities held by your ssh-agent"); + + out: +    if (agent != NULL) { +        /* Note: libssh2 implementation implicitly calls +         * libssh2_agent_disconnect if necessary. +         */ +        libssh2_agent_free(agent); +    } + +    return ret; +} + +static int connect_to_ssh(BDRVSSHState *s, QDict *options, +                          int ssh_flags, int creat_mode, Error **errp) +{ +    int r, ret; +    const char *host, *user, *path, *host_key_check; +    int port; + +    if (!qdict_haskey(options, "host")) { +        ret = -EINVAL; +        error_setg(errp, "No hostname was specified"); +        goto err; +    } +    host = qdict_get_str(options, "host"); + +    if (qdict_haskey(options, "port")) { +        port = qdict_get_int(options, "port"); +    } else { +        port = 22; +    } + +    if (!qdict_haskey(options, "path")) { +        ret = -EINVAL; +        error_setg(errp, "No path was specified"); +        goto err; +    } +    path = qdict_get_str(options, "path"); + +    if (qdict_haskey(options, "user")) { +        user = qdict_get_str(options, "user"); +    } else { +        user = g_get_user_name(); +        if (!user) { +            error_setg_errno(errp, errno, "Can't get user name"); +            ret = -errno; +            goto err; +        } +    } + +    if (qdict_haskey(options, "host_key_check")) { +        host_key_check = qdict_get_str(options, "host_key_check"); +    } else { +        host_key_check = "yes"; +    } + +    /* Construct the host:port name for inet_connect. */ +    g_free(s->hostport); +    s->hostport = g_strdup_printf("%s:%d", host, port); + +    /* Open the socket and connect. */ +    s->sock = inet_connect(s->hostport, errp); +    if (s->sock < 0) { +        ret = -EIO; +        goto err; +    } + +    /* Create SSH session. 
*/ +    s->session = libssh2_session_init(); +    if (!s->session) { +        ret = -EINVAL; +        session_error_setg(errp, s, "failed to initialize libssh2 session"); +        goto err; +    } + +#if TRACE_LIBSSH2 != 0 +    libssh2_trace(s->session, TRACE_LIBSSH2); +#endif + +    r = libssh2_session_handshake(s->session, s->sock); +    if (r != 0) { +        ret = -EINVAL; +        session_error_setg(errp, s, "failed to establish SSH session"); +        goto err; +    } + +    /* Check the remote host's key against known_hosts. */ +    ret = check_host_key(s, host, port, host_key_check, errp); +    if (ret < 0) { +        goto err; +    } + +    /* Authenticate. */ +    ret = authenticate(s, user, errp); +    if (ret < 0) { +        goto err; +    } + +    /* Start SFTP. */ +    s->sftp = libssh2_sftp_init(s->session); +    if (!s->sftp) { +        session_error_setg(errp, s, "failed to initialize sftp handle"); +        ret = -EINVAL; +        goto err; +    } + +    /* Open the remote file. */ +    DPRINTF("opening file %s flags=0x%x creat_mode=0%o", +            path, ssh_flags, creat_mode); +    s->sftp_handle = libssh2_sftp_open(s->sftp, path, ssh_flags, creat_mode); +    if (!s->sftp_handle) { +        session_error_setg(errp, s, "failed to open remote file '%s'", path); +        ret = -EINVAL; +        goto err; +    } + +    r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs); +    if (r < 0) { +        sftp_error_setg(errp, s, "failed to read file attributes"); +        return -EINVAL; +    } + +    /* Delete the options we've used; any not deleted will cause the +     * block layer to give an error about unused options. +     */ +    qdict_del(options, "host"); +    qdict_del(options, "port"); +    qdict_del(options, "user"); +    qdict_del(options, "path"); +    qdict_del(options, "host_key_check"); + +    return 0; + + err: +    if (s->sftp_handle) { +        libssh2_sftp_close(s->sftp_handle); +    } +    s->sftp_handle = NULL; +    if (s->sftp) { +        libssh2_sftp_shutdown(s->sftp); +    } +    s->sftp = NULL; +    if (s->session) { +        libssh2_session_disconnect(s->session, +                                   "from qemu ssh client: " +                                   "error opening connection"); +        libssh2_session_free(s->session); +    } +    s->session = NULL; + +    return ret; +} + +static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, +                         Error **errp) +{ +    BDRVSSHState *s = bs->opaque; +    int ret; +    int ssh_flags; + +    ssh_state_init(s); + +    ssh_flags = LIBSSH2_FXF_READ; +    if (bdrv_flags & BDRV_O_RDWR) { +        ssh_flags |= LIBSSH2_FXF_WRITE; +    } + +    /* Start up SSH. */ +    ret = connect_to_ssh(s, options, ssh_flags, 0, errp); +    if (ret < 0) { +        goto err; +    } + +    /* Go non-blocking. 
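+     * From this point on any libssh2 call may return LIBSSH2_ERROR_EAGAIN;
+     * the read, write and flush paths below handle that by yielding the
+     * coroutine (see co_yield()) until the socket is ready again.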
*/ +    libssh2_session_set_blocking(s->session, 0); + +    return 0; + + err: +    if (s->sock >= 0) { +        close(s->sock); +    } +    s->sock = -1; + +    return ret; +} + +static QemuOptsList ssh_create_opts = { +    .name = "ssh-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(ssh_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { /* end of list */ } +    } +}; + +static int ssh_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int r, ret; +    int64_t total_size = 0; +    QDict *uri_options = NULL; +    BDRVSSHState s; +    ssize_t r2; +    char c[1] = { '\0' }; + +    ssh_state_init(&s); + +    /* Get desired file size. */ +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); +    DPRINTF("total_size=%" PRIi64, total_size); + +    uri_options = qdict_new(); +    r = parse_uri(filename, uri_options, errp); +    if (r < 0) { +        ret = r; +        goto out; +    } + +    r = connect_to_ssh(&s, uri_options, +                       LIBSSH2_FXF_READ|LIBSSH2_FXF_WRITE| +                       LIBSSH2_FXF_CREAT|LIBSSH2_FXF_TRUNC, +                       0644, errp); +    if (r < 0) { +        ret = r; +        goto out; +    } + +    if (total_size > 0) { +        libssh2_sftp_seek64(s.sftp_handle, total_size-1); +        r2 = libssh2_sftp_write(s.sftp_handle, c, 1); +        if (r2 < 0) { +            sftp_error_setg(errp, &s, "truncate failed"); +            ret = -EINVAL; +            goto out; +        } +        s.attrs.filesize = total_size; +    } + +    ret = 0; + + out: +    ssh_state_free(&s); +    if (uri_options != NULL) { +        QDECREF(uri_options); +    } +    return ret; +} + +static void ssh_close(BlockDriverState *bs) +{ +    BDRVSSHState *s = bs->opaque; + +    ssh_state_free(s); +} + +static int ssh_has_zero_init(BlockDriverState *bs) +{ +    BDRVSSHState *s = bs->opaque; +    /* Assume false, unless we can positively prove it's true. 
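+     * The check below treats a regular file (S_IFREG) as sufficient proof,
+     * the assumption being that ranges of a regular file that have never
+     * been written read back as zeroes on the SFTP server.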
*/ +    int has_zero_init = 0; + +    if (s->attrs.flags & LIBSSH2_SFTP_ATTR_PERMISSIONS) { +        if (s->attrs.permissions & LIBSSH2_SFTP_S_IFREG) { +            has_zero_init = 1; +        } +    } + +    return has_zero_init; +} + +static void restart_coroutine(void *opaque) +{ +    Coroutine *co = opaque; + +    DPRINTF("co=%p", co); + +    qemu_coroutine_enter(co, NULL); +} + +static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) +{ +    int r; +    IOHandler *rd_handler = NULL, *wr_handler = NULL; +    Coroutine *co = qemu_coroutine_self(); + +    r = libssh2_session_block_directions(s->session); + +    if (r & LIBSSH2_SESSION_BLOCK_INBOUND) { +        rd_handler = restart_coroutine; +    } +    if (r & LIBSSH2_SESSION_BLOCK_OUTBOUND) { +        wr_handler = restart_coroutine; +    } + +    DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock, +            rd_handler, wr_handler); + +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, +                       rd_handler, wr_handler, co); +} + +static coroutine_fn void clear_fd_handler(BDRVSSHState *s, +                                          BlockDriverState *bs) +{ +    DPRINTF("s->sock=%d", s->sock); +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, NULL, NULL, NULL); +} + +/* A non-blocking call returned EAGAIN, so yield, ensuring the + * handlers are set up so that we'll be rescheduled when there is an + * interesting event on the socket. + */ +static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs) +{ +    set_fd_handler(s, bs); +    qemu_coroutine_yield(); +    clear_fd_handler(s, bs); +} + +/* SFTP has a function `libssh2_sftp_seek64' which seeks to a position + * in the remote file.  Notice that it just updates a field in the + * sftp_handle structure, so there is no network traffic and it cannot + * fail. + * + * However, `libssh2_sftp_seek64' does have a catastrophic effect on + * performance since it causes the handle to throw away all in-flight + * reads and buffered readahead data.  Therefore this function tries + * to be intelligent about when to call the underlying libssh2 function. + */ +#define SSH_SEEK_WRITE 0 +#define SSH_SEEK_READ  1 +#define SSH_SEEK_FORCE 2 + +static void ssh_seek(BDRVSSHState *s, int64_t offset, int flags) +{ +    bool op_read = (flags & SSH_SEEK_READ) != 0; +    bool force = (flags & SSH_SEEK_FORCE) != 0; + +    if (force || op_read != s->offset_op_read || offset != s->offset) { +        DPRINTF("seeking to offset=%" PRIi64, offset); +        libssh2_sftp_seek64(s->sftp_handle, offset); +        s->offset = offset; +        s->offset_op_read = op_read; +    } +} + +static coroutine_fn int ssh_read(BDRVSSHState *s, BlockDriverState *bs, +                                 int64_t offset, size_t size, +                                 QEMUIOVector *qiov) +{ +    ssize_t r; +    size_t got; +    char *buf, *end_of_vec; +    struct iovec *i; + +    DPRINTF("offset=%" PRIi64 " size=%zu", offset, size); + +    ssh_seek(s, offset, SSH_SEEK_READ); + +    /* This keeps track of the current iovec element ('i'), where we +     * will write to next ('buf'), and the end of the current iovec +     * ('end_of_vec'). +     */ +    i = &qiov->iov[0]; +    buf = i->iov_base; +    end_of_vec = i->iov_base + i->iov_len; + +    /* libssh2 has a hard-coded limit of 2000 bytes per request, +     * although it will also do readahead behind our backs.  Therefore +     * we may have to do repeated reads here until we have read 'size' +     * bytes. 
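+     * (As a rough illustration, a single 8 KiB request may be satisfied by
+     * four or five short reads of about 2000 bytes each, any of which can
+     * also hit EAGAIN first and yield via co_yield().)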
+     */ +    for (got = 0; got < size; ) { +    again: +        DPRINTF("sftp_read buf=%p size=%zu", buf, end_of_vec - buf); +        r = libssh2_sftp_read(s->sftp_handle, buf, end_of_vec - buf); +        DPRINTF("sftp_read returned %zd", r); + +        if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { +            co_yield(s, bs); +            goto again; +        } +        if (r < 0) { +            sftp_error_report(s, "read failed"); +            s->offset = -1; +            return -EIO; +        } +        if (r == 0) { +            /* EOF: Short read so pad the buffer with zeroes and return it. */ +            qemu_iovec_memset(qiov, got, 0, size - got); +            return 0; +        } + +        got += r; +        buf += r; +        s->offset += r; +        if (buf >= end_of_vec && got < size) { +            i++; +            buf = i->iov_base; +            end_of_vec = i->iov_base + i->iov_len; +        } +    } + +    return 0; +} + +static coroutine_fn int ssh_co_readv(BlockDriverState *bs, +                                     int64_t sector_num, +                                     int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVSSHState *s = bs->opaque; +    int ret; + +    qemu_co_mutex_lock(&s->lock); +    ret = ssh_read(s, bs, sector_num * BDRV_SECTOR_SIZE, +                   nb_sectors * BDRV_SECTOR_SIZE, qiov); +    qemu_co_mutex_unlock(&s->lock); + +    return ret; +} + +static int ssh_write(BDRVSSHState *s, BlockDriverState *bs, +                     int64_t offset, size_t size, +                     QEMUIOVector *qiov) +{ +    ssize_t r; +    size_t written; +    char *buf, *end_of_vec; +    struct iovec *i; + +    DPRINTF("offset=%" PRIi64 " size=%zu", offset, size); + +    ssh_seek(s, offset, SSH_SEEK_WRITE); + +    /* This keeps track of the current iovec element ('i'), where we +     * will read from next ('buf'), and the end of the current iovec +     * ('end_of_vec'). +     */ +    i = &qiov->iov[0]; +    buf = i->iov_base; +    end_of_vec = i->iov_base + i->iov_len; + +    for (written = 0; written < size; ) { +    again: +        DPRINTF("sftp_write buf=%p size=%zu", buf, end_of_vec - buf); +        r = libssh2_sftp_write(s->sftp_handle, buf, end_of_vec - buf); +        DPRINTF("sftp_write returned %zd", r); + +        if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { +            co_yield(s, bs); +            goto again; +        } +        if (r < 0) { +            sftp_error_report(s, "write failed"); +            s->offset = -1; +            return -EIO; +        } +        /* The libssh2 API is very unclear about this.  A comment in +         * the code says "nothing was acked, and no EAGAIN was +         * received!" which apparently means that no data got sent +         * out, and the underlying channel didn't return any EAGAIN +         * indication.  I think this is a bug in either libssh2 or +         * OpenSSH (server-side).  In any case, forcing a seek (to +         * discard libssh2 internal buffers), and then trying again +         * works for me. 
+         */ +        if (r == 0) { +            ssh_seek(s, offset + written, SSH_SEEK_WRITE|SSH_SEEK_FORCE); +            co_yield(s, bs); +            goto again; +        } + +        written += r; +        buf += r; +        s->offset += r; +        if (buf >= end_of_vec && written < size) { +            i++; +            buf = i->iov_base; +            end_of_vec = i->iov_base + i->iov_len; +        } + +        if (offset + written > s->attrs.filesize) +            s->attrs.filesize = offset + written; +    } + +    return 0; +} + +static coroutine_fn int ssh_co_writev(BlockDriverState *bs, +                                      int64_t sector_num, +                                      int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVSSHState *s = bs->opaque; +    int ret; + +    qemu_co_mutex_lock(&s->lock); +    ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE, +                    nb_sectors * BDRV_SECTOR_SIZE, qiov); +    qemu_co_mutex_unlock(&s->lock); + +    return ret; +} + +static void unsafe_flush_warning(BDRVSSHState *s, const char *what) +{ +    if (!s->unsafe_flush_warning) { +        error_report("warning: ssh server %s does not support fsync", +                     s->hostport); +        if (what) { +            error_report("to support fsync, you need %s", what); +        } +        s->unsafe_flush_warning = true; +    } +} + +#ifdef HAS_LIBSSH2_SFTP_FSYNC + +static coroutine_fn int ssh_flush(BDRVSSHState *s, BlockDriverState *bs) +{ +    int r; + +    DPRINTF("fsync"); + again: +    r = libssh2_sftp_fsync(s->sftp_handle); +    if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { +        co_yield(s, bs); +        goto again; +    } +    if (r == LIBSSH2_ERROR_SFTP_PROTOCOL && +        libssh2_sftp_last_error(s->sftp) == LIBSSH2_FX_OP_UNSUPPORTED) { +        unsafe_flush_warning(s, "OpenSSH >= 6.3"); +        return 0; +    } +    if (r < 0) { +        sftp_error_report(s, "fsync failed"); +        return -EIO; +    } + +    return 0; +} + +static coroutine_fn int ssh_co_flush(BlockDriverState *bs) +{ +    BDRVSSHState *s = bs->opaque; +    int ret; + +    qemu_co_mutex_lock(&s->lock); +    ret = ssh_flush(s, bs); +    qemu_co_mutex_unlock(&s->lock); + +    return ret; +} + +#else /* !HAS_LIBSSH2_SFTP_FSYNC */ + +static coroutine_fn int ssh_co_flush(BlockDriverState *bs) +{ +    BDRVSSHState *s = bs->opaque; + +    unsafe_flush_warning(s, "libssh2 >= 1.4.4"); +    return 0; +} + +#endif /* !HAS_LIBSSH2_SFTP_FSYNC */ + +static int64_t ssh_getlength(BlockDriverState *bs) +{ +    BDRVSSHState *s = bs->opaque; +    int64_t length; + +    /* Note we cannot make a libssh2 call here. 
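+     * The length therefore comes from the attrs cached at open time;
+     * ssh_write() keeps attrs.filesize up to date when a write extends
+     * the file.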
*/ +    length = (int64_t) s->attrs.filesize; +    DPRINTF("length=%" PRIi64, length); + +    return length; +} + +static BlockDriver bdrv_ssh = { +    .format_name                  = "ssh", +    .protocol_name                = "ssh", +    .instance_size                = sizeof(BDRVSSHState), +    .bdrv_parse_filename          = ssh_parse_filename, +    .bdrv_file_open               = ssh_file_open, +    .bdrv_create                  = ssh_create, +    .bdrv_close                   = ssh_close, +    .bdrv_has_zero_init           = ssh_has_zero_init, +    .bdrv_co_readv                = ssh_co_readv, +    .bdrv_co_writev               = ssh_co_writev, +    .bdrv_getlength               = ssh_getlength, +    .bdrv_co_flush_to_disk        = ssh_co_flush, +    .create_opts                  = &ssh_create_opts, +}; + +static void bdrv_ssh_init(void) +{ +    int r; + +    r = libssh2_init(0); +    if (r != 0) { +        fprintf(stderr, "libssh2 initialization failed, %d\n", r); +        exit(EXIT_FAILURE); +    } + +    bdrv_register(&bdrv_ssh); +} + +block_init(bdrv_ssh_init); diff --git a/block/stream.c b/block/stream.c new file mode 100644 index 00000000..ab0bd057 --- /dev/null +++ b/block/stream.c @@ -0,0 +1,270 @@ +/* + * Image streaming + * + * Copyright IBM, Corp. 2011 + * + * Authors: + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "qapi/qmp/qerror.h" +#include "qemu/ratelimit.h" + +enum { +    /* +     * Size of data buffer for populating the image file.  This should be large +     * enough to process multiple clusters in a single call, so that populating +     * contiguous regions of the image is efficient. +     */ +    STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */ +}; + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct StreamBlockJob { +    BlockJob common; +    RateLimit limit; +    BlockDriverState *base; +    BlockdevOnError on_error; +    char *backing_file_str; +} StreamBlockJob; + +static int coroutine_fn stream_populate(BlockDriverState *bs, +                                        int64_t sector_num, int nb_sectors, +                                        void *buf) +{ +    struct iovec iov = { +        .iov_base = buf, +        .iov_len  = nb_sectors * BDRV_SECTOR_SIZE, +    }; +    QEMUIOVector qiov; + +    qemu_iovec_init_external(&qiov, &iov, 1); + +    /* Copy-on-read the unallocated clusters */ +    return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); +} + +static void close_unused_images(BlockDriverState *top, BlockDriverState *base, +                                const char *base_id) +{ +    BlockDriverState *intermediate; +    intermediate = top->backing_hd; + +    /* Must assign before bdrv_delete() to prevent traversing dangling pointer +     * while we delete backing image instances. 
+     */ +    bdrv_set_backing_hd(top, base); + +    while (intermediate) { +        BlockDriverState *unused; + +        /* reached base */ +        if (intermediate == base) { +            break; +        } + +        unused = intermediate; +        intermediate = intermediate->backing_hd; +        bdrv_set_backing_hd(unused, NULL); +        bdrv_unref(unused); +    } + +    bdrv_refresh_limits(top, NULL); +} + +typedef struct { +    int ret; +    bool reached_end; +} StreamCompleteData; + +static void stream_complete(BlockJob *job, void *opaque) +{ +    StreamBlockJob *s = container_of(job, StreamBlockJob, common); +    StreamCompleteData *data = opaque; +    BlockDriverState *base = s->base; + +    if (!block_job_is_cancelled(&s->common) && data->reached_end && +        data->ret == 0) { +        const char *base_id = NULL, *base_fmt = NULL; +        if (base) { +            base_id = s->backing_file_str; +            if (base->drv) { +                base_fmt = base->drv->format_name; +            } +        } +        data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); +        close_unused_images(job->bs, base, base_id); +    } + +    g_free(s->backing_file_str); +    block_job_completed(&s->common, data->ret); +    g_free(data); +} + +static void coroutine_fn stream_run(void *opaque) +{ +    StreamBlockJob *s = opaque; +    StreamCompleteData *data; +    BlockDriverState *bs = s->common.bs; +    BlockDriverState *base = s->base; +    int64_t sector_num, end; +    int error = 0; +    int ret = 0; +    int n = 0; +    void *buf; + +    if (!bs->backing_hd) { +        block_job_completed(&s->common, 0); +        return; +    } + +    s->common.len = bdrv_getlength(bs); +    if (s->common.len < 0) { +        block_job_completed(&s->common, s->common.len); +        return; +    } + +    end = s->common.len >> BDRV_SECTOR_BITS; +    buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE); + +    /* Turn on copy-on-read for the whole block device so that guest read +     * requests help us make progress.  Only do this when copying the entire +     * backing chain since the copy-on-read operation does not take base into +     * account. +     */ +    if (!base) { +        bdrv_enable_copy_on_read(bs); +    } + +    for (sector_num = 0; sector_num < end; sector_num += n) { +        uint64_t delay_ns = 0; +        bool copy; + +wait: +        /* Note that even when no rate limit is applied we need to yield +         * with no pending I/O here so that bdrv_drain_all() returns. +         */ +        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); +        if (block_job_is_cancelled(&s->common)) { +            break; +        } + +        copy = false; + +        ret = bdrv_is_allocated(bs, sector_num, +                                STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); +        if (ret == 1) { +            /* Allocated in the top, no need to copy.  */ +        } else if (ret >= 0) { +            /* Copy if allocated in the intermediate images.  Limit to the +             * known-unallocated area [sector_num, sector_num+n).  
*/ +            ret = bdrv_is_allocated_above(bs->backing_hd, base, +                                          sector_num, n, &n); + +            /* Finish early if end of backing file has been reached */ +            if (ret == 0 && n == 0) { +                n = end - sector_num; +            } + +            copy = (ret == 1); +        } +        trace_stream_one_iteration(s, sector_num, n, ret); +        if (copy) { +            if (s->common.speed) { +                delay_ns = ratelimit_calculate_delay(&s->limit, n); +                if (delay_ns > 0) { +                    goto wait; +                } +            } +            ret = stream_populate(bs, sector_num, n, buf); +        } +        if (ret < 0) { +            BlockErrorAction action = +                block_job_error_action(&s->common, s->common.bs, s->on_error, +                                       true, -ret); +            if (action == BLOCK_ERROR_ACTION_STOP) { +                n = 0; +                continue; +            } +            if (error == 0) { +                error = ret; +            } +            if (action == BLOCK_ERROR_ACTION_REPORT) { +                break; +            } +        } +        ret = 0; + +        /* Publish progress */ +        s->common.offset += n * BDRV_SECTOR_SIZE; +    } + +    if (!base) { +        bdrv_disable_copy_on_read(bs); +    } + +    /* Do not remove the backing file if an error was there but ignored.  */ +    ret = error; + +    qemu_vfree(buf); + +    /* Modify backing chain and close BDSes in main loop */ +    data = g_malloc(sizeof(*data)); +    data->ret = ret; +    data->reached_end = sector_num == end; +    block_job_defer_to_main_loop(&s->common, stream_complete, data); +} + +static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ +    StreamBlockJob *s = container_of(job, StreamBlockJob, common); + +    if (speed < 0) { +        error_setg(errp, QERR_INVALID_PARAMETER, "speed"); +        return; +    } +    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static const BlockJobDriver stream_job_driver = { +    .instance_size = sizeof(StreamBlockJob), +    .job_type      = BLOCK_JOB_TYPE_STREAM, +    .set_speed     = stream_set_speed, +}; + +void stream_start(BlockDriverState *bs, BlockDriverState *base, +                  const char *backing_file_str, int64_t speed, +                  BlockdevOnError on_error, +                  BlockCompletionFunc *cb, +                  void *opaque, Error **errp) +{ +    StreamBlockJob *s; + +    if ((on_error == BLOCKDEV_ON_ERROR_STOP || +         on_error == BLOCKDEV_ON_ERROR_ENOSPC) && +        !bdrv_iostatus_is_enabled(bs)) { +        error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); +        return; +    } + +    s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); +    if (!s) { +        return; +    } + +    s->base = base; +    s->backing_file_str = g_strdup(backing_file_str); + +    s->on_error = on_error; +    s->common.co = qemu_coroutine_create(stream_run); +    trace_stream_start(bs, base, s, s->common.co, opaque); +    qemu_coroutine_enter(s->common.co, s); +} diff --git a/block/throttle-groups.c b/block/throttle-groups.c new file mode 100644 index 00000000..1abc6fca --- /dev/null +++ b/block/throttle-groups.c @@ -0,0 +1,501 @@ +/* + * QEMU block throttling group infrastructure + * + * Copyright (C) Nodalink, EURL. 2014 + * Copyright (C) Igalia, S.L. 
2015
+ *
+ * Authors:
+ *   Benoît Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "block/throttle-groups.h"
+#include "qemu/queue.h"
+#include "qemu/thread.h"
+#include "sysemu/qtest.h"
+
+/* The ThrottleGroup structure (with its ThrottleState) is shared
+ * among different BlockDriverStates and is independent of any
+ * AioContext, so it needs its own locking in order to be used from
+ * different threads.
+ *
+ * This locking is however handled internally in this file, so it's
+ * mostly transparent to outside users (but see the
+ * throttle_groups_lock mutex below).
+ *
+ * The whole ThrottleGroup structure is private and invisible to
+ * outside users, which only use it through its ThrottleState.
+ *
+ * In addition to the ThrottleGroup structure, BlockDriverState has
+ * fields that need to be accessed by other members of the group and
+ * therefore also need to be protected by this lock. Once a BDS is
+ * registered in a group, those fields can be accessed by other
+ * threads at any time.
+ *
+ * Again, all this is handled internally and is mostly transparent to
+ * the outside. The 'throttle_timers' field, however, has an additional
+ * constraint because it may be temporarily invalid (see for example
+ * bdrv_set_aio_context()). Therefore in this file a thread will
+ * access some other BDS's timers only after verifying that that BDS
+ * has throttled requests in the queue.
+ */
+typedef struct ThrottleGroup {
+    char *name; /* This is constant during the lifetime of the group */
+
+    QemuMutex lock; /* This lock protects the following four fields */
+    ThrottleState ts;
+    QLIST_HEAD(, BlockDriverState) head;
+    BlockDriverState *tokens[2];
+    bool any_timer_armed[2];
+
+    /* These two are protected by the global throttle_groups_lock */
+    unsigned refcount;
+    QTAILQ_ENTRY(ThrottleGroup) list;
+} ThrottleGroup;
+
+static QemuMutex throttle_groups_lock;
+static QTAILQ_HEAD(, ThrottleGroup) throttle_groups =
+    QTAILQ_HEAD_INITIALIZER(throttle_groups);
+
+/* Increments the reference count of a ThrottleGroup given its name.
+ *
+ * If no ThrottleGroup with the given name is found, a new one is
+ * created.
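+ *
+ * (In practice this is what makes BlockDriverStates that are configured
+ * with the same group name end up sharing one ThrottleState, and thus
+ * one set of I/O limits.)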
+ *
+ * @name: the name of the ThrottleGroup
+ * @ret:  the ThrottleGroup
+ */
+static ThrottleGroup *throttle_group_incref(const char *name)
+{
+    ThrottleGroup *tg = NULL;
+    ThrottleGroup *iter;
+
+    qemu_mutex_lock(&throttle_groups_lock);
+
+    /* Look for an existing group with that name */
+    QTAILQ_FOREACH(iter, &throttle_groups, list) {
+        if (!strcmp(name, iter->name)) {
+            tg = iter;
+            break;
+        }
+    }
+
+    /* Create a new one if not found */
+    if (!tg) {
+        tg = g_new0(ThrottleGroup, 1);
+        tg->name = g_strdup(name);
+        qemu_mutex_init(&tg->lock);
+        throttle_init(&tg->ts);
+        QLIST_INIT(&tg->head);
+
+        QTAILQ_INSERT_TAIL(&throttle_groups, tg, list);
+    }
+
+    tg->refcount++;
+
+    qemu_mutex_unlock(&throttle_groups_lock);
+
+    return tg;
+}
+
+/* Decrements the reference count of a ThrottleGroup.
+ *
+ * When the reference count reaches zero the ThrottleGroup is
+ * destroyed.
+ *
+ * @tg:  The ThrottleGroup to unref
+ */
+static void throttle_group_unref(ThrottleGroup *tg)
+{
+    qemu_mutex_lock(&throttle_groups_lock);
+    if (--tg->refcount == 0) {
+        QTAILQ_REMOVE(&throttle_groups, tg, list);
+        qemu_mutex_destroy(&tg->lock);
+        g_free(tg->name);
+        g_free(tg);
+    }
+    qemu_mutex_unlock(&throttle_groups_lock);
+}
+
+/* Get the name from a BlockDriverState's ThrottleGroup. The name (and
+ * the pointer) is guaranteed to remain constant during the lifetime
+ * of the group.
+ *
+ * @bs:   a BlockDriverState that is a member of a throttling group
+ * @ret:  the name of the group.
+ */
+const char *throttle_group_get_name(BlockDriverState *bs)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    return tg->name;
+}
+
+/* Return the next BlockDriverState in the round-robin sequence,
+ * simulating a circular list.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:  the current BlockDriverState
+ * @ret: the next BlockDriverState in the sequence
+ */
+static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    BlockDriverState *next = QLIST_NEXT(bs, round_robin);
+
+    if (!next) {
+        return QLIST_FIRST(&tg->head);
+    }
+
+    return next;
+}
+
+/* Return the next BlockDriverState in the round-robin sequence with
+ * pending I/O requests.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:        the current BlockDriverState
+ * @is_write:  the type of operation (read/write)
+ * @ret:       the next BlockDriverState with pending requests, or bs
+ *             if there is none.
+ */
+static BlockDriverState *next_throttle_token(BlockDriverState *bs,
+                                             bool is_write)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockDriverState *token, *start;
+
+    start = token = tg->tokens[is_write];
+
+    /* Get the next bs in round-robin order */
+    token = throttle_group_next_bs(token);
+    while (token != start && !token->pending_reqs[is_write]) {
+        token = throttle_group_next_bs(token);
+    }
+
+    /* If no I/O is queued for scheduling on the next round-robin token,
+     * make the current bs the token, because chances are that the current
+     * bs is the one that will get the current request queued.
+     */ +    if (token == start && !token->pending_reqs[is_write]) { +        token = bs; +    } + +    return token; +} + +/* Check if the next I/O request for a BlockDriverState needs to be + * throttled or not. If there's no timer set in this group, set one + * and update the token accordingly. + * + * This assumes that tg->lock is held. + * + * @bs:         the current BlockDriverState + * @is_write:   the type of operation (read/write) + * @ret:        whether the I/O request needs to be throttled or not + */ +static bool throttle_group_schedule_timer(BlockDriverState *bs, +                                          bool is_write) +{ +    ThrottleState *ts = bs->throttle_state; +    ThrottleTimers *tt = &bs->throttle_timers; +    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); +    bool must_wait; + +    /* Check if any of the timers in this group is already armed */ +    if (tg->any_timer_armed[is_write]) { +        return true; +    } + +    must_wait = throttle_schedule_timer(ts, tt, is_write); + +    /* If a timer just got armed, set bs as the current token */ +    if (must_wait) { +        tg->tokens[is_write] = bs; +        tg->any_timer_armed[is_write] = true; +    } + +    return must_wait; +} + +/* Look for the next pending I/O request and schedule it. + * + * This assumes that tg->lock is held. + * + * @bs:        the current BlockDriverState + * @is_write:  the type of operation (read/write) + */ +static void schedule_next_request(BlockDriverState *bs, bool is_write) +{ +    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); +    bool must_wait; +    BlockDriverState *token; + +    /* Check if there's any pending request to schedule next */ +    token = next_throttle_token(bs, is_write); +    if (!token->pending_reqs[is_write]) { +        return; +    } + +    /* Set a timer for the request if it needs to be throttled */ +    must_wait = throttle_group_schedule_timer(token, is_write); + +    /* If it doesn't have to wait, queue it for immediate execution */ +    if (!must_wait) { +        /* Give preference to requests from the current bs */ +        if (qemu_in_coroutine() && +            qemu_co_queue_next(&bs->throttled_reqs[is_write])) { +            token = bs; +        } else { +            ThrottleTimers *tt = &token->throttle_timers; +            int64_t now = qemu_clock_get_ns(tt->clock_type); +            timer_mod(tt->timers[is_write], now + 1); +            tg->any_timer_armed[is_write] = true; +        } +        tg->tokens[is_write] = token; +    } +} + +/* Check if an I/O request needs to be throttled, wait and set a timer + * if necessary, and schedule the next request using a round robin + * algorithm. + * + * @bs:        the current BlockDriverState + * @bytes:     the number of bytes for this I/O + * @is_write:  the type of operation (read/write) + */ +void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, +                                                        unsigned int bytes, +                                                        bool is_write) +{ +    bool must_wait; +    BlockDriverState *token; + +    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); +    qemu_mutex_lock(&tg->lock); + +    /* First we check if this I/O has to be throttled. 
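The token walk in next_throttle_token() can be hard to picture from the prose alone. The following standalone sketch (plain C, independent of QEMU's types, all names hypothetical) shows the same idea on a small fixed array: starting from the current token, advance circularly to the first member with pending requests, and fall back to the caller if nobody else has queued work.

#include <stdio.h>

#define MEMBERS 4

/* pending[i] = number of queued requests for member i */
static int pending[MEMBERS] = {0, 0, 3, 0};

static int next_member(int i)
{
    return (i + 1) % MEMBERS;   /* circular list, like throttle_group_next_bs() */
}

/* Mirror of next_throttle_token(): walk from the current token until a
 * member with pending requests is found; if none, prefer the caller. */
static int next_token(int token, int caller)
{
    int start = token;

    token = next_member(token);
    while (token != start && pending[token] == 0) {
        token = next_member(token);
    }
    if (token == start && pending[token] == 0) {
        token = caller;
    }
    return token;
}

int main(void)
{
    /* Token currently at member 0, caller is member 1: member 2 is the
     * only one with queued work, so it gets picked. */
    printf("next token: %d\n", next_token(0, 1)); /* prints 2 */
    return 0;
}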
*/ +    token = next_throttle_token(bs, is_write); +    must_wait = throttle_group_schedule_timer(token, is_write); + +    /* Wait if there's a timer set or queued requests of this type */ +    if (must_wait || bs->pending_reqs[is_write]) { +        bs->pending_reqs[is_write]++; +        qemu_mutex_unlock(&tg->lock); +        qemu_co_queue_wait(&bs->throttled_reqs[is_write]); +        qemu_mutex_lock(&tg->lock); +        bs->pending_reqs[is_write]--; +    } + +    /* The I/O will be executed, so do the accounting */ +    throttle_account(bs->throttle_state, is_write, bytes); + +    /* Schedule the next request */ +    schedule_next_request(bs, is_write); + +    qemu_mutex_unlock(&tg->lock); +} + +/* Update the throttle configuration for a particular group. Similar + * to throttle_config(), but guarantees atomicity within the + * throttling group. + * + * @bs:  a BlockDriverState that is member of the group + * @cfg: the configuration to set + */ +void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) +{ +    ThrottleTimers *tt = &bs->throttle_timers; +    ThrottleState *ts = bs->throttle_state; +    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); +    qemu_mutex_lock(&tg->lock); +    /* throttle_config() cancels the timers */ +    if (timer_pending(tt->timers[0])) { +        tg->any_timer_armed[0] = false; +    } +    if (timer_pending(tt->timers[1])) { +        tg->any_timer_armed[1] = false; +    } +    throttle_config(ts, tt, cfg); +    qemu_mutex_unlock(&tg->lock); +} + +/* Get the throttle configuration from a particular group. Similar to + * throttle_get_config(), but guarantees atomicity within the + * throttling group. + * + * @bs:  a BlockDriverState that is member of the group + * @cfg: the configuration will be written here + */ +void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) +{ +    ThrottleState *ts = bs->throttle_state; +    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); +    qemu_mutex_lock(&tg->lock); +    throttle_get_config(ts, cfg); +    qemu_mutex_unlock(&tg->lock); +} + +/* ThrottleTimers callback. This wakes up a request that was waiting + * because it had been throttled. + * + * @bs:        the BlockDriverState whose request had been throttled + * @is_write:  the type of operation (read/write) + */ +static void timer_cb(BlockDriverState *bs, bool is_write) +{ +    ThrottleState *ts = bs->throttle_state; +    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); +    bool empty_queue; + +    /* The timer has just been fired, so we can update the flag */ +    qemu_mutex_lock(&tg->lock); +    tg->any_timer_armed[is_write] = false; +    qemu_mutex_unlock(&tg->lock); + +    /* Run the request that was waiting for this timer */ +    empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]); + +    /* If the request queue was empty then we have to take care of +     * scheduling the next one */ +    if (empty_queue) { +        qemu_mutex_lock(&tg->lock); +        schedule_next_request(bs, is_write); +        qemu_mutex_unlock(&tg->lock); +    } +} + +static void read_timer_cb(void *opaque) +{ +    timer_cb(opaque, false); +} + +static void write_timer_cb(void *opaque) +{ +    timer_cb(opaque, true); +} + +/* Register a BlockDriverState in the throttling group, also + * initializing its timers and updating its throttle_state pointer to + * point to it. If a throttling group with that name does not exist + * yet, it will be created. 
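As a usage illustration for the two configuration helpers above, a hypothetical helper might cap a whole group's total IOPS and bandwidth as below. ThrottleConfig, its buckets array and the THROTTLE_* bucket indexes are assumed here to come from QEMU's qemu/throttle.h rather than from this file, so treat this as a sketch under those assumptions, not a verbatim recipe.

#include <string.h>
#include "block/throttle-groups.h"
#include "qemu/throttle.h"

/* Hypothetical helper: cap every member of bs's group at 100 IOPS and
 * 10 MiB/s total.  throttle_group_config() takes the group lock, so the
 * new limits become visible to all members atomically. */
static void example_set_group_limits(BlockDriverState *bs)
{
    ThrottleConfig cfg;

    memset(&cfg, 0, sizeof(cfg));
    cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;               /* ops/s   */
    cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024;  /* bytes/s */

    throttle_group_config(bs, &cfg);
}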
+ * + * @bs:        the BlockDriverState to insert + * @groupname: the name of the group + */ +void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) +{ +    int i; +    ThrottleGroup *tg = throttle_group_incref(groupname); +    int clock_type = QEMU_CLOCK_REALTIME; + +    if (qtest_enabled()) { +        /* For testing block IO throttling only */ +        clock_type = QEMU_CLOCK_VIRTUAL; +    } + +    bs->throttle_state = &tg->ts; + +    qemu_mutex_lock(&tg->lock); +    /* If the ThrottleGroup is new set this BlockDriverState as the token */ +    for (i = 0; i < 2; i++) { +        if (!tg->tokens[i]) { +            tg->tokens[i] = bs; +        } +    } + +    QLIST_INSERT_HEAD(&tg->head, bs, round_robin); + +    throttle_timers_init(&bs->throttle_timers, +                         bdrv_get_aio_context(bs), +                         clock_type, +                         read_timer_cb, +                         write_timer_cb, +                         bs); + +    qemu_mutex_unlock(&tg->lock); +} + +/* Unregister a BlockDriverState from its group, removing it from the + * list, destroying the timers and setting the throttle_state pointer + * to NULL. + * + * The group will be destroyed if it's empty after this operation. + * + * @bs: the BlockDriverState to remove + */ +void throttle_group_unregister_bs(BlockDriverState *bs) +{ +    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); +    int i; + +    qemu_mutex_lock(&tg->lock); +    for (i = 0; i < 2; i++) { +        if (tg->tokens[i] == bs) { +            BlockDriverState *token = throttle_group_next_bs(bs); +            /* Take care of the case where this is the last bs in the group */ +            if (token == bs) { +                token = NULL; +            } +            tg->tokens[i] = token; +        } +    } + +    /* remove the current bs from the list */ +    QLIST_REMOVE(bs, round_robin); +    throttle_timers_destroy(&bs->throttle_timers); +    qemu_mutex_unlock(&tg->lock); + +    throttle_group_unref(tg); +    bs->throttle_state = NULL; +} + +/* Acquire the lock of this throttling group. + * + * You won't normally need to use this. None of the functions from the + * ThrottleGroup API require you to acquire the lock since all of them + * deal with it internally. + * + * This should only be used in exceptional cases when you want to + * access the protected fields of a BlockDriverState directly + * (e.g. bdrv_swap()). + * + * @bs: a BlockDriverState that is member of the group + */ +void throttle_group_lock(BlockDriverState *bs) +{ +    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); +    qemu_mutex_lock(&tg->lock); +} + +/* Release the lock of this throttling group. + * + * See the comments in throttle_group_lock(). 
+ */ +void throttle_group_unlock(BlockDriverState *bs) +{ +    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); +    qemu_mutex_unlock(&tg->lock); +} + +static void throttle_groups_init(void) +{ +    qemu_mutex_init(&throttle_groups_lock); +} + +block_init(throttle_groups_init); diff --git a/block/vdi.c b/block/vdi.c new file mode 100644 index 00000000..7642ef35 --- /dev/null +++ b/block/vdi.c @@ -0,0 +1,914 @@ +/* + * Block driver for the Virtual Disk Image (VDI) format + * + * Copyright (c) 2009, 2012 Stefan Weil + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) version 3 or any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + * + * Reference: + * http://forums.virtualbox.org/viewtopic.php?t=8046 + * + * This driver supports create / read / write operations on VDI images. + * + * Todo (see also TODO in code): + * + * Some features like snapshots are still missing. + * + * Deallocation of zero-filled blocks and shrinking images are missing, too + * (might be added to common block layer). + * + * Allocation of blocks could be optimized (less writes to block map and + * header). + * + * Read and write of adjacent blocks could be done in one operation + * (current code uses one operation per block (1 MiB). + * + * The code is not thread safe (missing locks for changes in header and + * block table, no problem with current QEMU). + * + * Hints: + * + * Blocks (VDI documentation) correspond to clusters (QEMU). + * QEMU's backing files could be implemented using VDI snapshot files (TODO). + * VDI snapshot files may also contain the complete machine state. + * Maybe this machine state can be converted to QEMU PC machine snapshot data. + * + * The driver keeps a block cache (little endian entries) in memory. + * For the standard block size (1 MiB), a 1 TiB disk will use 4 MiB RAM, + * so this seems to be reasonable. + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" +#include "block/coroutine.h" + +#if defined(CONFIG_UUID) +#include <uuid/uuid.h> +#else +/* TODO: move uuid emulation to some central place in QEMU. */ +#include "sysemu/sysemu.h"     /* UUID_FMT */ +typedef unsigned char uuid_t[16]; +#endif + +/* Code configuration options. */ + +/* Enable debug messages. */ +//~ #define CONFIG_VDI_DEBUG + +/* Support write operations on VDI images. */ +#define CONFIG_VDI_WRITE + +/* Support non-standard block (cluster) size. This is untested. + * Maybe it will be needed for very large images. + */ +//~ #define CONFIG_VDI_BLOCK_SIZE + +/* Support static (fixed, pre-allocated) images. */ +#define CONFIG_VDI_STATIC_IMAGE + +/* Command line option for static images. */ +#define BLOCK_OPT_STATIC "static" + +#define KiB     1024 +#define MiB     (KiB * KiB) + +#define SECTOR_SIZE 512 +#define DEFAULT_CLUSTER_SIZE (1 * MiB) + +#if defined(CONFIG_VDI_DEBUG) +#define logout(fmt, ...) 
\ +                fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +/* Image signature. */ +#define VDI_SIGNATURE 0xbeda107f + +/* Image version. */ +#define VDI_VERSION_1_1 0x00010001 + +/* Image type. */ +#define VDI_TYPE_DYNAMIC 1 +#define VDI_TYPE_STATIC  2 + +/* Innotek / SUN images use these strings in header.text: + * "<<< innotek VirtualBox Disk Image >>>\n" + * "<<< Sun xVM VirtualBox Disk Image >>>\n" + * "<<< Sun VirtualBox Disk Image >>>\n" + * The value does not matter, so QEMU created images use a different text. + */ +#define VDI_TEXT "<<< QEMU VM Virtual Disk Image >>>\n" + +/* A never-allocated block; semantically arbitrary content. */ +#define VDI_UNALLOCATED 0xffffffffU + +/* A discarded (no longer allocated) block; semantically zero-filled. */ +#define VDI_DISCARDED   0xfffffffeU + +#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED) + +/* The bmap will take up VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) bytes; since + * the bmap is read and written in a single operation, its size needs to be + * limited to INT_MAX; furthermore, when opening an image, the bmap size is + * rounded up to be aligned on BDRV_SECTOR_SIZE. + * Therefore this should satisfy the following: + * VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) + BDRV_SECTOR_SIZE == INT_MAX + 1 + * (INT_MAX + 1 is the first value not representable as an int) + * This guarantees that any value below or equal to the constant will, when + * multiplied by sizeof(uint32_t) and rounded up to a BDRV_SECTOR_SIZE boundary, + * still be below or equal to INT_MAX. */ +#define VDI_BLOCKS_IN_IMAGE_MAX \ +    ((unsigned)((INT_MAX + 1u - BDRV_SECTOR_SIZE) / sizeof(uint32_t))) +#define VDI_DISK_SIZE_MAX        ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \ +                                  (uint64_t)DEFAULT_CLUSTER_SIZE) + +#if !defined(CONFIG_UUID) +static inline void uuid_generate(uuid_t out) +{ +    memset(out, 0, sizeof(uuid_t)); +} + +static inline int uuid_is_null(const uuid_t uu) +{ +    uuid_t null_uuid = { 0 }; +    return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0; +} + +# if defined(CONFIG_VDI_DEBUG) +static inline void uuid_unparse(const uuid_t uu, char *out) +{ +    snprintf(out, 37, UUID_FMT, +            uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7], +            uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]); +} +# endif +#endif + +typedef struct { +    char text[0x40]; +    uint32_t signature; +    uint32_t version; +    uint32_t header_size; +    uint32_t image_type; +    uint32_t image_flags; +    char description[256]; +    uint32_t offset_bmap; +    uint32_t offset_data; +    uint32_t cylinders;         /* disk geometry, unused here */ +    uint32_t heads;             /* disk geometry, unused here */ +    uint32_t sectors;           /* disk geometry, unused here */ +    uint32_t sector_size; +    uint32_t unused1; +    uint64_t disk_size; +    uint32_t block_size; +    uint32_t block_extra;       /* unused here */ +    uint32_t blocks_in_image; +    uint32_t blocks_allocated; +    uuid_t uuid_image; +    uuid_t uuid_last_snap; +    uuid_t uuid_link; +    uuid_t uuid_parent; +    uint64_t unused2[7]; +} QEMU_PACKED VdiHeader; + +typedef struct { +    /* The block map entries are little endian (even in memory). */ +    uint32_t *bmap; +    /* Size of block (bytes). */ +    uint32_t block_size; +    /* Size of block (sectors). */ +    uint32_t block_sectors; +    /* First sector of block map. 
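The size limits above follow from simple arithmetic; this standalone check (a hypothetical program mirroring the constants in this file) works through the numbers: with the default 1 MiB block size a 1 TiB disk needs 2^20 block-map entries of 4 bytes each (the 4 MiB of RAM mentioned in the header comment), and VDI_BLOCKS_IN_IMAGE_MAX is chosen so that the sector-aligned bmap never exceeds INT_MAX bytes.

#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE            512u
#define DEFAULT_CLUSTER_SIZE   (1024u * 1024u)
#define VDI_BLOCKS_IN_IMAGE_MAX \
    ((unsigned)((INT_MAX + 1u - SECTOR_SIZE) / sizeof(uint32_t)))

int main(void)
{
    /* 1 TiB disk, 1 MiB blocks: 2^20 bmap entries of 4 bytes = 4 MiB RAM */
    uint64_t disk_size = 1ULL << 40;
    uint64_t entries   = disk_size / DEFAULT_CLUSTER_SIZE;
    printf("bmap for 1 TiB: %" PRIu64 " entries, %" PRIu64 " MiB\n",
           entries, (entries * sizeof(uint32_t)) >> 20);

    /* The largest allowed bmap, rounded up to a sector boundary, still
     * fits in an int, which the single read/write of the bmap relies on. */
    uint64_t max_bmap = (uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t);
    max_bmap = (max_bmap + SECTOR_SIZE - 1) / SECTOR_SIZE * SECTOR_SIZE;
    printf("max bmap bytes: %" PRIu64 " <= INT_MAX (%d)\n", max_bmap, INT_MAX);
    return 0;
}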
*/ +    uint32_t bmap_sector; +    /* VDI header (converted to host endianness). */ +    VdiHeader header; + +    CoMutex write_lock; + +    Error *migration_blocker; +} BDRVVdiState; + +/* Change UUID from little endian (IPRT = VirtualBox format) to big endian + * format (network byte order, standard, see RFC 4122) and vice versa. + */ +static void uuid_convert(uuid_t uuid) +{ +    bswap32s((uint32_t *)&uuid[0]); +    bswap16s((uint16_t *)&uuid[4]); +    bswap16s((uint16_t *)&uuid[6]); +} + +static void vdi_header_to_cpu(VdiHeader *header) +{ +    le32_to_cpus(&header->signature); +    le32_to_cpus(&header->version); +    le32_to_cpus(&header->header_size); +    le32_to_cpus(&header->image_type); +    le32_to_cpus(&header->image_flags); +    le32_to_cpus(&header->offset_bmap); +    le32_to_cpus(&header->offset_data); +    le32_to_cpus(&header->cylinders); +    le32_to_cpus(&header->heads); +    le32_to_cpus(&header->sectors); +    le32_to_cpus(&header->sector_size); +    le64_to_cpus(&header->disk_size); +    le32_to_cpus(&header->block_size); +    le32_to_cpus(&header->block_extra); +    le32_to_cpus(&header->blocks_in_image); +    le32_to_cpus(&header->blocks_allocated); +    uuid_convert(header->uuid_image); +    uuid_convert(header->uuid_last_snap); +    uuid_convert(header->uuid_link); +    uuid_convert(header->uuid_parent); +} + +static void vdi_header_to_le(VdiHeader *header) +{ +    cpu_to_le32s(&header->signature); +    cpu_to_le32s(&header->version); +    cpu_to_le32s(&header->header_size); +    cpu_to_le32s(&header->image_type); +    cpu_to_le32s(&header->image_flags); +    cpu_to_le32s(&header->offset_bmap); +    cpu_to_le32s(&header->offset_data); +    cpu_to_le32s(&header->cylinders); +    cpu_to_le32s(&header->heads); +    cpu_to_le32s(&header->sectors); +    cpu_to_le32s(&header->sector_size); +    cpu_to_le64s(&header->disk_size); +    cpu_to_le32s(&header->block_size); +    cpu_to_le32s(&header->block_extra); +    cpu_to_le32s(&header->blocks_in_image); +    cpu_to_le32s(&header->blocks_allocated); +    uuid_convert(header->uuid_image); +    uuid_convert(header->uuid_last_snap); +    uuid_convert(header->uuid_link); +    uuid_convert(header->uuid_parent); +} + +#if defined(CONFIG_VDI_DEBUG) +static void vdi_header_print(VdiHeader *header) +{ +    char uuid[37]; +    logout("text        %s", header->text); +    logout("signature   0x%08x\n", header->signature); +    logout("header size 0x%04x\n", header->header_size); +    logout("image type  0x%04x\n", header->image_type); +    logout("image flags 0x%04x\n", header->image_flags); +    logout("description %s\n", header->description); +    logout("offset bmap 0x%04x\n", header->offset_bmap); +    logout("offset data 0x%04x\n", header->offset_data); +    logout("cylinders   0x%04x\n", header->cylinders); +    logout("heads       0x%04x\n", header->heads); +    logout("sectors     0x%04x\n", header->sectors); +    logout("sector size 0x%04x\n", header->sector_size); +    logout("image size  0x%" PRIx64 " B (%" PRIu64 " MiB)\n", +           header->disk_size, header->disk_size / MiB); +    logout("block size  0x%04x\n", header->block_size); +    logout("block extra 0x%04x\n", header->block_extra); +    logout("blocks tot. 0x%04x\n", header->blocks_in_image); +    logout("blocks all. 
0x%04x\n", header->blocks_allocated); +    uuid_unparse(header->uuid_image, uuid); +    logout("uuid image  %s\n", uuid); +    uuid_unparse(header->uuid_last_snap, uuid); +    logout("uuid snap   %s\n", uuid); +    uuid_unparse(header->uuid_link, uuid); +    logout("uuid link   %s\n", uuid); +    uuid_unparse(header->uuid_parent, uuid); +    logout("uuid parent %s\n", uuid); +} +#endif + +static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res, +                     BdrvCheckMode fix) +{ +    /* TODO: additional checks possible. */ +    BDRVVdiState *s = (BDRVVdiState *)bs->opaque; +    uint32_t blocks_allocated = 0; +    uint32_t block; +    uint32_t *bmap; +    logout("\n"); + +    if (fix) { +        return -ENOTSUP; +    } + +    bmap = g_try_new(uint32_t, s->header.blocks_in_image); +    if (s->header.blocks_in_image && bmap == NULL) { +        res->check_errors++; +        return -ENOMEM; +    } + +    memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t)); + +    /* Check block map and value of blocks_allocated. */ +    for (block = 0; block < s->header.blocks_in_image; block++) { +        uint32_t bmap_entry = le32_to_cpu(s->bmap[block]); +        if (VDI_IS_ALLOCATED(bmap_entry)) { +            if (bmap_entry < s->header.blocks_in_image) { +                blocks_allocated++; +                if (!VDI_IS_ALLOCATED(bmap[bmap_entry])) { +                    bmap[bmap_entry] = bmap_entry; +                } else { +                    fprintf(stderr, "ERROR: block index %" PRIu32 +                            " also used by %" PRIu32 "\n", bmap[bmap_entry], bmap_entry); +                    res->corruptions++; +                } +            } else { +                fprintf(stderr, "ERROR: block index %" PRIu32 +                        " too large, is %" PRIu32 "\n", block, bmap_entry); +                res->corruptions++; +            } +        } +    } +    if (blocks_allocated != s->header.blocks_allocated) { +        fprintf(stderr, "ERROR: allocated blocks mismatch, is %" PRIu32 +               ", should be %" PRIu32 "\n", +               blocks_allocated, s->header.blocks_allocated); +        res->corruptions++; +    } + +    g_free(bmap); + +    return 0; +} + +static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    /* TODO: vdi_get_info would be needed for machine snapshots. +       vm_state_offset is still missing. */ +    BDRVVdiState *s = (BDRVVdiState *)bs->opaque; +    logout("\n"); +    bdi->cluster_size = s->block_size; +    bdi->vm_state_offset = 0; +    bdi->unallocated_blocks_are_zero = true; +    return 0; +} + +static int vdi_make_empty(BlockDriverState *bs) +{ +    /* TODO: missing code. */ +    logout("\n"); +    /* The return value for missing code must be 0, see block.c. */ +    return 0; +} + +static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    const VdiHeader *header = (const VdiHeader *)buf; +    int ret = 0; + +    logout("\n"); + +    if (buf_size < sizeof(*header)) { +        /* Header too small, no VDI. 
*/ +    } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) { +        ret = 100; +    } + +    if (ret == 0) { +        logout("no vdi image\n"); +    } else { +        logout("%s", header->text); +    } + +    return ret; +} + +static int vdi_open(BlockDriverState *bs, QDict *options, int flags, +                    Error **errp) +{ +    BDRVVdiState *s = bs->opaque; +    VdiHeader header; +    size_t bmap_size; +    int ret; + +    logout("\n"); + +    ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1); +    if (ret < 0) { +        goto fail; +    } + +    vdi_header_to_cpu(&header); +#if defined(CONFIG_VDI_DEBUG) +    vdi_header_print(&header); +#endif + +    if (header.disk_size > VDI_DISK_SIZE_MAX) { +        error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 +                          ", max supported is 0x%" PRIx64 ")", +                          header.disk_size, VDI_DISK_SIZE_MAX); +        ret = -ENOTSUP; +        goto fail; +    } + +    if (header.disk_size % SECTOR_SIZE != 0) { +        /* 'VBoxManage convertfromraw' can create images with odd disk sizes. +           We accept them but round the disk size to the next multiple of +           SECTOR_SIZE. */ +        logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size); +        header.disk_size = ROUND_UP(header.disk_size, SECTOR_SIZE); +    } + +    if (header.signature != VDI_SIGNATURE) { +        error_setg(errp, "Image not in VDI format (bad signature %08" PRIx32 +                   ")", header.signature); +        ret = -EINVAL; +        goto fail; +    } else if (header.version != VDI_VERSION_1_1) { +        error_setg(errp, "unsupported VDI image (version %" PRIu32 ".%" PRIu32 +                   ")", header.version >> 16, header.version & 0xffff); +        ret = -ENOTSUP; +        goto fail; +    } else if (header.offset_bmap % SECTOR_SIZE != 0) { +        /* We only support block maps which start on a sector boundary. */ +        error_setg(errp, "unsupported VDI image (unaligned block map offset " +                   "0x%" PRIx32 ")", header.offset_bmap); +        ret = -ENOTSUP; +        goto fail; +    } else if (header.offset_data % SECTOR_SIZE != 0) { +        /* We only support data blocks which start on a sector boundary. 
*/ +        error_setg(errp, "unsupported VDI image (unaligned data offset 0x%" +                   PRIx32 ")", header.offset_data); +        ret = -ENOTSUP; +        goto fail; +    } else if (header.sector_size != SECTOR_SIZE) { +        error_setg(errp, "unsupported VDI image (sector size %" PRIu32 +                   " is not %u)", header.sector_size, SECTOR_SIZE); +        ret = -ENOTSUP; +        goto fail; +    } else if (header.block_size != DEFAULT_CLUSTER_SIZE) { +        error_setg(errp, "unsupported VDI image (block size %" PRIu32 +                   " is not %u)", header.block_size, DEFAULT_CLUSTER_SIZE); +        ret = -ENOTSUP; +        goto fail; +    } else if (header.disk_size > +               (uint64_t)header.blocks_in_image * header.block_size) { +        error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", " +                   "image bitmap has room for %" PRIu64 ")", +                   header.disk_size, +                   (uint64_t)header.blocks_in_image * header.block_size); +        ret = -ENOTSUP; +        goto fail; +    } else if (!uuid_is_null(header.uuid_link)) { +        error_setg(errp, "unsupported VDI image (non-NULL link UUID)"); +        ret = -ENOTSUP; +        goto fail; +    } else if (!uuid_is_null(header.uuid_parent)) { +        error_setg(errp, "unsupported VDI image (non-NULL parent UUID)"); +        ret = -ENOTSUP; +        goto fail; +    } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) { +        error_setg(errp, "unsupported VDI image " +                         "(too many blocks %u, max is %u)", +                          header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX); +        ret = -ENOTSUP; +        goto fail; +    } + +    bs->total_sectors = header.disk_size / SECTOR_SIZE; + +    s->block_size = header.block_size; +    s->block_sectors = header.block_size / SECTOR_SIZE; +    s->bmap_sector = header.offset_bmap / SECTOR_SIZE; +    s->header = header; + +    bmap_size = header.blocks_in_image * sizeof(uint32_t); +    bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE); +    s->bmap = qemu_try_blockalign(bs->file, bmap_size * SECTOR_SIZE); +    if (s->bmap == NULL) { +        ret = -ENOMEM; +        goto fail; +    } + +    ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size); +    if (ret < 0) { +        goto fail_free_bmap; +    } + +    /* Disable migration when vdi images are used */ +    error_setg(&s->migration_blocker, "The vdi format used by node '%s' " +               "does not support live migration", +               bdrv_get_device_or_node_name(bs)); +    migrate_add_blocker(s->migration_blocker); + +    qemu_co_mutex_init(&s->write_lock); + +    return 0; + + fail_free_bmap: +    qemu_vfree(s->bmap); + + fail: +    return ret; +} + +static int vdi_reopen_prepare(BDRVReopenState *state, +                              BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, int *pnum) +{ +    /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). 
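The block-status lookup and the read path that follow both translate a guest sector into (block index, sector within block) and, for allocated blocks, into a location in the image file. A standalone sketch of that mapping, using the default 1 MiB block (2048 sectors) and hypothetical numbers for the header offset and bmap entry:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE     512u
#define BLOCK_SECTORS   2048u     /* 1 MiB block / 512-byte sectors */

int main(void)
{
    uint64_t sector_num      = 5000;      /* hypothetical guest sector */
    uint64_t offset_data     = 0x1200;    /* hypothetical header.offset_data */
    uint64_t bmap_entry      = 7;         /* hypothetical allocated entry */

    uint64_t block_index     = sector_num / BLOCK_SECTORS;   /* 2 */
    uint64_t sector_in_block = sector_num % BLOCK_SECTORS;   /* 904 */

    /* Byte offset of the data, as in vdi_co_get_block_status(); the read
     * path computes the same location in sector units. */
    uint64_t file_offset = offset_data +
                           bmap_entry * BLOCK_SECTORS * SECTOR_SIZE +
                           sector_in_block * SECTOR_SIZE;

    printf("block %" PRIu64 ", sector in block %" PRIu64
           ", file offset %" PRIu64 "\n",
           block_index, sector_in_block, file_offset);
    return 0;
}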
*/ +    BDRVVdiState *s = (BDRVVdiState *)bs->opaque; +    size_t bmap_index = sector_num / s->block_sectors; +    size_t sector_in_block = sector_num % s->block_sectors; +    int n_sectors = s->block_sectors - sector_in_block; +    uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]); +    uint64_t offset; +    int result; + +    logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum); +    if (n_sectors > nb_sectors) { +        n_sectors = nb_sectors; +    } +    *pnum = n_sectors; +    result = VDI_IS_ALLOCATED(bmap_entry); +    if (!result) { +        return 0; +    } + +    offset = s->header.offset_data + +                              (uint64_t)bmap_entry * s->block_size + +                              sector_in_block * SECTOR_SIZE; +    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; +} + +static int vdi_co_read(BlockDriverState *bs, +        int64_t sector_num, uint8_t *buf, int nb_sectors) +{ +    BDRVVdiState *s = bs->opaque; +    uint32_t bmap_entry; +    uint32_t block_index; +    uint32_t sector_in_block; +    uint32_t n_sectors; +    int ret = 0; + +    logout("\n"); + +    while (ret >= 0 && nb_sectors > 0) { +        block_index = sector_num / s->block_sectors; +        sector_in_block = sector_num % s->block_sectors; +        n_sectors = s->block_sectors - sector_in_block; +        if (n_sectors > nb_sectors) { +            n_sectors = nb_sectors; +        } + +        logout("will read %u sectors starting at sector %" PRIu64 "\n", +               n_sectors, sector_num); + +        /* prepare next AIO request */ +        bmap_entry = le32_to_cpu(s->bmap[block_index]); +        if (!VDI_IS_ALLOCATED(bmap_entry)) { +            /* Block not allocated, return zeros, no need to wait. */ +            memset(buf, 0, n_sectors * SECTOR_SIZE); +            ret = 0; +        } else { +            uint64_t offset = s->header.offset_data / SECTOR_SIZE + +                              (uint64_t)bmap_entry * s->block_sectors + +                              sector_in_block; +            ret = bdrv_read(bs->file, offset, buf, n_sectors); +        } +        logout("%u sectors read\n", n_sectors); + +        nb_sectors -= n_sectors; +        sector_num += n_sectors; +        buf += n_sectors * SECTOR_SIZE; +    } + +    return ret; +} + +static int vdi_co_write(BlockDriverState *bs, +        int64_t sector_num, const uint8_t *buf, int nb_sectors) +{ +    BDRVVdiState *s = bs->opaque; +    uint32_t bmap_entry; +    uint32_t block_index; +    uint32_t sector_in_block; +    uint32_t n_sectors; +    uint32_t bmap_first = VDI_UNALLOCATED; +    uint32_t bmap_last = VDI_UNALLOCATED; +    uint8_t *block = NULL; +    int ret = 0; + +    logout("\n"); + +    while (ret >= 0 && nb_sectors > 0) { +        block_index = sector_num / s->block_sectors; +        sector_in_block = sector_num % s->block_sectors; +        n_sectors = s->block_sectors - sector_in_block; +        if (n_sectors > nb_sectors) { +            n_sectors = nb_sectors; +        } + +        logout("will write %u sectors starting at sector %" PRIu64 "\n", +               n_sectors, sector_num); + +        /* prepare next AIO request */ +        bmap_entry = le32_to_cpu(s->bmap[block_index]); +        if (!VDI_IS_ALLOCATED(bmap_entry)) { +            /* Allocate new block and write to it. 
*/ +            uint64_t offset; +            bmap_entry = s->header.blocks_allocated; +            s->bmap[block_index] = cpu_to_le32(bmap_entry); +            s->header.blocks_allocated++; +            offset = s->header.offset_data / SECTOR_SIZE + +                     (uint64_t)bmap_entry * s->block_sectors; +            if (block == NULL) { +                block = g_malloc(s->block_size); +                bmap_first = block_index; +            } +            bmap_last = block_index; +            /* Copy data to be written to new block and zero unused parts. */ +            memset(block, 0, sector_in_block * SECTOR_SIZE); +            memcpy(block + sector_in_block * SECTOR_SIZE, +                   buf, n_sectors * SECTOR_SIZE); +            memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, +                   (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); + +            /* Note that this coroutine does not yield anywhere from reading the +             * bmap entry until here, so in regards to all the coroutines trying +             * to write to this cluster, the one doing the allocation will +             * always be the first to try to acquire the lock. +             * Therefore, it is also the first that will actually be able to +             * acquire the lock and thus the padded cluster is written before +             * the other coroutines can write to the affected area. */ +            qemu_co_mutex_lock(&s->write_lock); +            ret = bdrv_write(bs->file, offset, block, s->block_sectors); +            qemu_co_mutex_unlock(&s->write_lock); +        } else { +            uint64_t offset = s->header.offset_data / SECTOR_SIZE + +                              (uint64_t)bmap_entry * s->block_sectors + +                              sector_in_block; +            qemu_co_mutex_lock(&s->write_lock); +            /* This lock is only used to make sure the following write operation +             * is executed after the write issued by the coroutine allocating +             * this cluster, therefore we do not need to keep it locked. +             * As stated above, the allocating coroutine will always try to lock +             * the mutex before all the other concurrent accesses to that +             * cluster, therefore at this point we can be absolutely certain +             * that that write operation has returned (there may be other writes +             * in flight, but they do not concern this very operation). */ +            qemu_co_mutex_unlock(&s->write_lock); +            ret = bdrv_write(bs->file, offset, buf, n_sectors); +        } + +        nb_sectors -= n_sectors; +        sector_num += n_sectors; +        buf += n_sectors * SECTOR_SIZE; + +        logout("%u sectors written\n", n_sectors); +    } + +    logout("finished data write\n"); +    if (ret < 0) { +        return ret; +    } + +    if (block) { +        /* One or more new blocks were allocated. */ +        VdiHeader *header = (VdiHeader *) block; +        uint8_t *base; +        uint64_t offset; + +        logout("now writing modified header\n"); +        assert(VDI_IS_ALLOCATED(bmap_first)); +        *header = s->header; +        vdi_header_to_le(header); +        ret = bdrv_write(bs->file, 0, block, 1); +        g_free(block); +        block = NULL; + +        if (ret < 0) { +            return ret; +        } + +        logout("now writing modified block map entry %u...%u\n", +               bmap_first, bmap_last); +        /* Write modified sectors from block map. 
*/ +        bmap_first /= (SECTOR_SIZE / sizeof(uint32_t)); +        bmap_last /= (SECTOR_SIZE / sizeof(uint32_t)); +        n_sectors = bmap_last - bmap_first + 1; +        offset = s->bmap_sector + bmap_first; +        base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE; +        logout("will write %u block map sectors starting from entry %u\n", +               n_sectors, bmap_first); +        ret = bdrv_write(bs->file, offset, base, n_sectors); +    } + +    return ret; +} + +static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int ret = 0; +    uint64_t bytes = 0; +    uint32_t blocks; +    size_t block_size = DEFAULT_CLUSTER_SIZE; +    uint32_t image_type = VDI_TYPE_DYNAMIC; +    VdiHeader header; +    size_t i; +    size_t bmap_size; +    int64_t offset = 0; +    Error *local_err = NULL; +    BlockDriverState *bs = NULL; +    uint32_t *bmap = NULL; + +    logout("\n"); + +    /* Read out options. */ +    bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                     BDRV_SECTOR_SIZE); +#if defined(CONFIG_VDI_BLOCK_SIZE) +    /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). */ +    block_size = qemu_opt_get_size_del(opts, +                                       BLOCK_OPT_CLUSTER_SIZE, +                                       DEFAULT_CLUSTER_SIZE); +#endif +#if defined(CONFIG_VDI_STATIC_IMAGE) +    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_STATIC, false)) { +        image_type = VDI_TYPE_STATIC; +    } +#endif + +    if (bytes > VDI_DISK_SIZE_MAX) { +        ret = -ENOTSUP; +        error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 +                          ", max supported is 0x%" PRIx64 ")", +                          bytes, VDI_DISK_SIZE_MAX); +        goto exit; +    } + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto exit; +    } +    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, +                    NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto exit; +    } + +    /* We need enough blocks to store the given disk size, +       so always round up. */ +    blocks = DIV_ROUND_UP(bytes, block_size); + +    bmap_size = blocks * sizeof(uint32_t); +    bmap_size = ROUND_UP(bmap_size, SECTOR_SIZE); + +    memset(&header, 0, sizeof(header)); +    pstrcpy(header.text, sizeof(header.text), VDI_TEXT); +    header.signature = VDI_SIGNATURE; +    header.version = VDI_VERSION_1_1; +    header.header_size = 0x180; +    header.image_type = image_type; +    header.offset_bmap = 0x200; +    header.offset_data = 0x200 + bmap_size; +    header.sector_size = SECTOR_SIZE; +    header.disk_size = bytes; +    header.block_size = block_size; +    header.blocks_in_image = blocks; +    if (image_type == VDI_TYPE_STATIC) { +        header.blocks_allocated = blocks; +    } +    uuid_generate(header.uuid_image); +    uuid_generate(header.uuid_last_snap); +    /* There is no need to set header.uuid_link or header.uuid_parent here. 
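For a concrete sense of the layout this header describes, here is a standalone calculation (a hypothetical 1 GiB dynamic image, mirroring the arithmetic in vdi_create()): header at offset 0, block map at the fixed offset 0x200, and data starting right after the sector-aligned block map.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE   512u
#define BLOCK_SIZE    (1024u * 1024u)

int main(void)
{
    uint64_t bytes     = 1ULL << 30;                            /* 1 GiB */
    uint64_t blocks    = (bytes + BLOCK_SIZE - 1) / BLOCK_SIZE; /* 1024 */
    uint64_t bmap_size = blocks * sizeof(uint32_t);             /* 4096 */
    bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE * SECTOR_SIZE;

    uint64_t offset_bmap = 0x200;            /* fixed, as in vdi_create() */
    uint64_t offset_data = offset_bmap + bmap_size;

    printf("blocks=%" PRIu64 ", bmap=%" PRIu64 " bytes, data at 0x%" PRIx64 "\n",
           blocks, bmap_size, offset_data);  /* data at 0x1200 */
    return 0;
}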
*/ +#if defined(CONFIG_VDI_DEBUG) +    vdi_header_print(&header); +#endif +    vdi_header_to_le(&header); +    ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); +    if (ret < 0) { +        error_setg(errp, "Error writing header to %s", filename); +        goto exit; +    } +    offset += sizeof(header); + +    if (bmap_size > 0) { +        bmap = g_try_malloc0(bmap_size); +        if (bmap == NULL) { +            ret = -ENOMEM; +            error_setg(errp, "Could not allocate bmap"); +            goto exit; +        } +        for (i = 0; i < blocks; i++) { +            if (image_type == VDI_TYPE_STATIC) { +                bmap[i] = i; +            } else { +                bmap[i] = VDI_UNALLOCATED; +            } +        } +        ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); +        if (ret < 0) { +            error_setg(errp, "Error writing bmap to %s", filename); +            goto exit; +        } +        offset += bmap_size; +    } + +    if (image_type == VDI_TYPE_STATIC) { +        ret = bdrv_truncate(bs, offset + blocks * block_size); +        if (ret < 0) { +            error_setg(errp, "Failed to statically allocate %s", filename); +            goto exit; +        } +    } + +exit: +    bdrv_unref(bs); +    g_free(bmap); +    return ret; +} + +static void vdi_close(BlockDriverState *bs) +{ +    BDRVVdiState *s = bs->opaque; + +    qemu_vfree(s->bmap); + +    migrate_del_blocker(s->migration_blocker); +    error_free(s->migration_blocker); +} + +static QemuOptsList vdi_create_opts = { +    .name = "vdi-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(vdi_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +#if defined(CONFIG_VDI_BLOCK_SIZE) +        { +            .name = BLOCK_OPT_CLUSTER_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "VDI cluster (block) size", +            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) +        }, +#endif +#if defined(CONFIG_VDI_STATIC_IMAGE) +        { +            .name = BLOCK_OPT_STATIC, +            .type = QEMU_OPT_BOOL, +            .help = "VDI static (pre-allocated) image", +            .def_value_str = "off" +        }, +#endif +        /* TODO: An additional option to set UUID values might be useful. 
*/ +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_vdi = { +    .format_name = "vdi", +    .instance_size = sizeof(BDRVVdiState), +    .bdrv_probe = vdi_probe, +    .bdrv_open = vdi_open, +    .bdrv_close = vdi_close, +    .bdrv_reopen_prepare = vdi_reopen_prepare, +    .bdrv_create = vdi_create, +    .bdrv_has_zero_init = bdrv_has_zero_init_1, +    .bdrv_co_get_block_status = vdi_co_get_block_status, +    .bdrv_make_empty = vdi_make_empty, + +    .bdrv_read = vdi_co_read, +#if defined(CONFIG_VDI_WRITE) +    .bdrv_write = vdi_co_write, +#endif + +    .bdrv_get_info = vdi_get_info, + +    .create_opts = &vdi_create_opts, +    .bdrv_check = vdi_check, +}; + +static void bdrv_vdi_init(void) +{ +    logout("\n"); +    bdrv_register(&bdrv_vdi); +} + +block_init(bdrv_vdi_init); diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c new file mode 100644 index 00000000..0640d3f4 --- /dev/null +++ b/block/vhdx-endian.c @@ -0,0 +1,223 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + *  Jeff Cody <jcody@redhat.com> + * + *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + *  by Microsoft: + *      https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/vhdx.h" + +#include <uuid/uuid.h> + + +/* + * All the VHDX formats on disk are little endian - the following + * are helper import/export functions to correctly convert + * endianness from disk read to native cpu format, and back again. + */ + + +/* VHDX File Header */ + + +void vhdx_header_le_import(VHDXHeader *h) +{ +    assert(h != NULL); + +    le32_to_cpus(&h->signature); +    le32_to_cpus(&h->checksum); +    le64_to_cpus(&h->sequence_number); + +    leguid_to_cpus(&h->file_write_guid); +    leguid_to_cpus(&h->data_write_guid); +    leguid_to_cpus(&h->log_guid); + +    le16_to_cpus(&h->log_version); +    le16_to_cpus(&h->version); +    le32_to_cpus(&h->log_length); +    le64_to_cpus(&h->log_offset); +} + +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h) +{ +    assert(orig_h != NULL); +    assert(new_h != NULL); + +    new_h->signature       = cpu_to_le32(orig_h->signature); +    new_h->checksum        = cpu_to_le32(orig_h->checksum); +    new_h->sequence_number = cpu_to_le64(orig_h->sequence_number); + +    new_h->file_write_guid = orig_h->file_write_guid; +    new_h->data_write_guid = orig_h->data_write_guid; +    new_h->log_guid        = orig_h->log_guid; + +    cpu_to_leguids(&new_h->file_write_guid); +    cpu_to_leguids(&new_h->data_write_guid); +    cpu_to_leguids(&new_h->log_guid); + +    new_h->log_version     = cpu_to_le16(orig_h->log_version); +    new_h->version         = cpu_to_le16(orig_h->version); +    new_h->log_length      = cpu_to_le32(orig_h->log_length); +    new_h->log_offset      = cpu_to_le64(orig_h->log_offset); +} + + +/* VHDX Log Headers */ + + +void vhdx_log_desc_le_import(VHDXLogDescriptor *d) +{ +    assert(d != NULL); + +    le32_to_cpus(&d->signature); +    le64_to_cpus(&d->file_offset); +    le64_to_cpus(&d->sequence_number); +} + +void vhdx_log_desc_le_export(VHDXLogDescriptor *d) +{ +    assert(d != NULL); + +    cpu_to_le32s(&d->signature); +    cpu_to_le32s(&d->trailing_bytes); +    cpu_to_le64s(&d->leading_bytes); +    cpu_to_le64s(&d->file_offset); +    
cpu_to_le64s(&d->sequence_number); +} + +void vhdx_log_data_le_import(VHDXLogDataSector *d) +{ +    assert(d != NULL); + +    le32_to_cpus(&d->data_signature); +    le32_to_cpus(&d->sequence_high); +    le32_to_cpus(&d->sequence_low); +} + +void vhdx_log_data_le_export(VHDXLogDataSector *d) +{ +    assert(d != NULL); + +    cpu_to_le32s(&d->data_signature); +    cpu_to_le32s(&d->sequence_high); +    cpu_to_le32s(&d->sequence_low); +} + +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr) +{ +    assert(hdr != NULL); + +    le32_to_cpus(&hdr->signature); +    le32_to_cpus(&hdr->checksum); +    le32_to_cpus(&hdr->entry_length); +    le32_to_cpus(&hdr->tail); +    le64_to_cpus(&hdr->sequence_number); +    le32_to_cpus(&hdr->descriptor_count); +    leguid_to_cpus(&hdr->log_guid); +    le64_to_cpus(&hdr->flushed_file_offset); +    le64_to_cpus(&hdr->last_file_offset); +} + +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr) +{ +    assert(hdr != NULL); + +    cpu_to_le32s(&hdr->signature); +    cpu_to_le32s(&hdr->checksum); +    cpu_to_le32s(&hdr->entry_length); +    cpu_to_le32s(&hdr->tail); +    cpu_to_le64s(&hdr->sequence_number); +    cpu_to_le32s(&hdr->descriptor_count); +    cpu_to_leguids(&hdr->log_guid); +    cpu_to_le64s(&hdr->flushed_file_offset); +    cpu_to_le64s(&hdr->last_file_offset); +} + + +/* Region table entries */ +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr) +{ +    assert(hdr != NULL); + +    le32_to_cpus(&hdr->signature); +    le32_to_cpus(&hdr->checksum); +    le32_to_cpus(&hdr->entry_count); +} + +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr) +{ +    assert(hdr != NULL); + +    cpu_to_le32s(&hdr->signature); +    cpu_to_le32s(&hdr->checksum); +    cpu_to_le32s(&hdr->entry_count); +} + +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e) +{ +    assert(e != NULL); + +    leguid_to_cpus(&e->guid); +    le64_to_cpus(&e->file_offset); +    le32_to_cpus(&e->length); +    le32_to_cpus(&e->data_bits); +} + +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e) +{ +    assert(e != NULL); + +    cpu_to_leguids(&e->guid); +    cpu_to_le64s(&e->file_offset); +    cpu_to_le32s(&e->length); +    cpu_to_le32s(&e->data_bits); +} + + +/* Metadata headers & table */ +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr) +{ +    assert(hdr != NULL); + +    le64_to_cpus(&hdr->signature); +    le16_to_cpus(&hdr->entry_count); +} + +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr) +{ +    assert(hdr != NULL); + +    cpu_to_le64s(&hdr->signature); +    cpu_to_le16s(&hdr->entry_count); +} + +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e) +{ +    assert(e != NULL); + +    leguid_to_cpus(&e->item_id); +    le32_to_cpus(&e->offset); +    le32_to_cpus(&e->length); +    le32_to_cpus(&e->data_bits); +} +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e) +{ +    assert(e != NULL); + +    cpu_to_leguids(&e->item_id); +    cpu_to_le32s(&e->offset); +    cpu_to_le32s(&e->length); +    cpu_to_le32s(&e->data_bits); +} diff --git a/block/vhdx-log.c b/block/vhdx-log.c new file mode 100644 index 00000000..47fec63c --- /dev/null +++ b/block/vhdx-log.c @@ -0,0 +1,1039 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + *  Jeff Cody <jcody@redhat.com> + * + *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + *  by Microsoft: + *      https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This file 
covers the functionality of the metadata log writing, parsing, and + * replay. + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "block/vhdx.h" + + +typedef struct VHDXLogSequence { +    bool valid; +    uint32_t count; +    VHDXLogEntries log; +    VHDXLogEntryHeader hdr; +} VHDXLogSequence; + +typedef struct VHDXLogDescEntries { +    VHDXLogEntryHeader hdr; +    VHDXLogDescriptor desc[]; +} VHDXLogDescEntries; + +static const MSGUID zero_guid = { 0 }; + +/* The log located on the disk is circular buffer containing + * sectors of 4096 bytes each. + * + * It is assumed for the read/write functions below that the + * circular buffer scheme uses a 'one sector open' to indicate + * the buffer is full.  Given the validation methods used for each + * sector, this method should be compatible with other methods that + * do not waste a sector. + */ + + +/* Allow peeking at the hdr entry at the beginning of the current + * read index, without advancing the read index */ +static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, +                             VHDXLogEntryHeader *hdr) +{ +    int ret = 0; +    uint64_t offset; +    uint32_t read; + +    assert(hdr != NULL); + +    /* peek is only supported on sector boundaries */ +    if (log->read % VHDX_LOG_SECTOR_SIZE) { +        ret = -EFAULT; +        goto exit; +    } + +    read = log->read; +    /* we are guaranteed that a) log sectors are 4096 bytes, +     * and b) the log length is a multiple of 1MB. So, there +     * is always a round number of sectors in the buffer */ +    if ((read + sizeof(VHDXLogEntryHeader)) > log->length) { +        read = 0; +    } + +    if (read == log->write) { +        ret = -EINVAL; +        goto exit; +    } + +    offset = log->offset + read; + +    ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader)); +    if (ret < 0) { +        goto exit; +    } +    vhdx_log_entry_hdr_le_import(hdr); + +exit: +    return ret; +} + +/* Index increment for log, based on sector boundaries */ +static int vhdx_log_inc_idx(uint32_t idx, uint64_t length) +{ +    idx += VHDX_LOG_SECTOR_SIZE; +    /* we are guaranteed that a) log sectors are 4096 bytes, +     * and b) the log length is a multiple of 1MB. So, there +     * is always a round number of sectors in the buffer */ +    return idx >= length ? 0 : idx; +} + + +/* Reset the log to empty */ +static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s) +{ +    MSGUID guid = { 0 }; +    s->log.read = s->log.write = 0; +    /* a log guid of 0 indicates an empty log to any parser of v0 +     * VHDX logs */ +    vhdx_update_headers(bs, s, false, &guid); +} + +/* Reads num_sectors from the log (all log sectors are 4096 bytes), + * into buffer 'buffer'.  Upon return, *sectors_read will contain + * the number of sectors successfully read. + * + * It is assumed that 'buffer' is already allocated, and of sufficient + * size (i.e. >= 4096*num_sectors). + * + * If 'peek' is true, then the tail (read) pointer for the circular buffer is + * not modified. + * + * 0 is returned on success, -errno otherwise.  
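A standalone sketch of the "one sector open" circular-buffer discipline described above (hypothetical constants, independent of the VHDX structures): read == write means the log is empty, and the writer stops one sector short of the reader rather than catching up to it, so a full buffer remains distinguishable from an empty one.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LOG_SECTOR   4096u
#define LOG_LENGTH   (1024u * 1024u)   /* log length is a multiple of 1 MB */

static uint32_t inc_idx(uint32_t idx)
{
    idx += LOG_SECTOR;                 /* same step as vhdx_log_inc_idx() */
    return idx >= LOG_LENGTH ? 0 : idx;
}

static bool log_empty(uint32_t read, uint32_t write)
{
    return read == write;
}

static bool log_full(uint32_t read, uint32_t write)
{
    /* The writer never advances onto the reader: one sector stays open. */
    return inc_idx(write) == read;
}

int main(void)
{
    uint32_t read = 0, write = 0;
    printf("empty: %d, full: %d\n", log_empty(read, write), log_full(read, write));

    /* Fill all but the open sector. */
    while (!log_full(read, write)) {
        write = inc_idx(write);
    }
    printf("after filling: empty: %d, full: %d\n",
           log_empty(read, write), log_full(read, write));
    return 0;
}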
*/ +static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, +                                 uint32_t *sectors_read, void *buffer, +                                 uint32_t num_sectors, bool peek) +{ +    int ret = 0; +    uint64_t offset; +    uint32_t read; + +    read = log->read; + +    *sectors_read = 0; +    while (num_sectors) { +        if (read == log->write) { +            /* empty */ +            break; +        } +        offset = log->offset + read; + +        ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE); +        if (ret < 0) { +            goto exit; +        } +        read = vhdx_log_inc_idx(read, log->length); + +        *sectors_read = *sectors_read + 1; +        num_sectors--; +    } + +exit: +    if (!peek) { +        log->read = read; +    } +    return ret; +} + +/* Writes num_sectors to the log (all log sectors are 4096 bytes), + * from buffer 'buffer'.  Upon return, *sectors_written will contain + * the number of sectors successfully written. + * + * It is assumed that 'buffer' is at least 4096*num_sectors large. + * + * 0 is returned on success, -errno otherwise */ +static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, +                                  uint32_t *sectors_written, void *buffer, +                                  uint32_t num_sectors) +{ +    int ret = 0; +    uint64_t offset; +    uint32_t write; +    void *buffer_tmp; +    BDRVVHDXState *s = bs->opaque; + +    ret = vhdx_user_visible_write(bs, s); +    if (ret < 0) { +        goto exit; +    } + +    write = log->write; + +    buffer_tmp = buffer; +    while (num_sectors) { + +        offset = log->offset + write; +        write = vhdx_log_inc_idx(write, log->length); +        if (write == log->read) { +            /* full */ +            break; +        } +        ret = bdrv_pwrite(bs->file, offset, buffer_tmp, VHDX_LOG_SECTOR_SIZE); +        if (ret < 0) { +            goto exit; +        } +        buffer_tmp += VHDX_LOG_SECTOR_SIZE; + +        log->write = write; +        *sectors_written = *sectors_written + 1; +        num_sectors--; +    } + +exit: +    return ret; +} + + +/* Validates a log entry header */ +static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr, +                                  BDRVVHDXState *s) +{ +    int valid = false; + +    if (hdr->signature != VHDX_LOG_SIGNATURE) { +        goto exit; +    } + +    /* if the individual entry length is larger than the whole log +     * buffer, that is obviously invalid */ +    if (log->length < hdr->entry_length) { +        goto exit; +    } + +    /* length of entire entry must be in units of 4KB (log sector size) */ +    if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) { +        goto exit; +    } + +    /* per spec, sequence # must be > 0 */ +    if (hdr->sequence_number == 0) { +        goto exit; +    } + +    /* log entries are only valid if they match the file-wide log guid +     * found in the active header */ +    if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) { +        goto exit; +    } + +    if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) { +        goto exit; +    } + +    valid = true; + +exit: +    return valid; +} + +/* + * Given a log header, this will validate that the descriptors and the + * corresponding data sectors (if applicable) + * + * Validation consists of: + *      1. Making sure the sequence numbers matches the entry header + *      2. 
Verifying a valid signature ('zero' or 'desc' for descriptors) + *      3. File offset field is a multiple of 4KB + *      4. If a data descriptor, the corresponding data sector + *         has its signature ('data') and matching sequence number + * + * @desc: the data buffer containing the descriptor + * @hdr:  the log entry header + * + * Returns true if valid + */ +static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc, +                                   VHDXLogEntryHeader *hdr) +{ +    bool ret = false; + +    if (desc->sequence_number != hdr->sequence_number) { +        goto exit; +    } +    if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) { +        goto exit; +    } + +    if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { +        if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) { +            /* valid */ +            ret = true; +        } +    } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { +            /* valid */ +            ret = true; +    } + +exit: +    return ret; +} + + +/* Prior to sector data for a log entry, there is the header + * and the descriptors referenced in the header: + * + * [] = 4KB sector + * + * [ hdr, desc ][   desc   ][ ... ][ data ][ ... ] + * + * The first sector in a log entry has a 64 byte header, and + * up to 126 32-byte descriptors.  If more descriptors than + * 126 are required, then subsequent sectors can have up to 128 + * descriptors.  Each sector is 4KB.  Data follows the descriptor + * sectors. + * + * This will return the number of sectors needed to encompass + * the passed number of descriptors in desc_cnt. + * + * This will never return 0, even if desc_cnt is 0. + */ +static int vhdx_compute_desc_sectors(uint32_t desc_cnt) +{ +    uint32_t desc_sectors; + +    desc_cnt += 2; /* account for header in first sector */ +    desc_sectors = desc_cnt / 128; +    if (desc_cnt % 128) { +        desc_sectors++; +    } + +    return desc_sectors; +} + + +/* Reads the log header, and subsequent descriptors (if any).  This + * will allocate all the space for buffer, which must be NULL when + * passed into this function. Each descriptor will also be validated, + * and error returned if any are invalid. 
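To make the descriptor-sector math concrete, here is a standalone version of the same computation with a worked, hypothetical case: the first 4 KiB sector holds the 64-byte header plus up to 126 descriptors, each further sector up to 128, so 200 descriptors need 2 sectors.

#include <stdint.h>
#include <stdio.h>

/* Same computation as vhdx_compute_desc_sectors(): the "+ 2" reserves the
 * 64-byte header's worth of descriptor slots in the first sector. */
static uint32_t compute_desc_sectors(uint32_t desc_cnt)
{
    uint32_t desc_sectors;

    desc_cnt += 2;
    desc_sectors = desc_cnt / 128;
    if (desc_cnt % 128) {
        desc_sectors++;
    }
    return desc_sectors;
}

int main(void)
{
    /* 200 descriptors: 126 fit in the first sector, the remaining 74 in a
     * second one.  (200 + 2) = 202 -> one full group of 128 plus a remainder. */
    printf("%u\n", compute_desc_sectors(200));   /* 2 */
    printf("%u\n", compute_desc_sectors(0));     /* never 0: still 1 sector */
    return 0;
}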
*/
+static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
+                              VHDXLogEntries *log, VHDXLogDescEntries **buffer,
+                              bool convert_endian)
+{
+    int ret = 0;
+    uint32_t desc_sectors;
+    uint32_t sectors_read;
+    VHDXLogEntryHeader hdr;
+    VHDXLogDescEntries *desc_entries = NULL;
+    VHDXLogDescriptor desc;
+    int i;
+
+    assert(*buffer == NULL);
+
+    ret = vhdx_log_peek_hdr(bs, log, &hdr);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
+    desc_entries = qemu_try_blockalign(bs->file,
+                                       desc_sectors * VHDX_LOG_SECTOR_SIZE);
+    if (desc_entries == NULL) {
+        ret = -ENOMEM;
+        goto exit;
+    }
+
+    ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
+                                desc_sectors, false);
+    if (ret < 0) {
+        goto free_and_exit;
+    }
+    if (sectors_read != desc_sectors) {
+        ret = -EINVAL;
+        goto free_and_exit;
+    }
+
+    /* put in proper endianness, and validate each desc */
+    for (i = 0; i < hdr.descriptor_count; i++) {
+        desc = desc_entries->desc[i];
+        vhdx_log_desc_le_import(&desc);
+        if (convert_endian) {
+            desc_entries->desc[i] = desc;
+        }
+        if (vhdx_log_desc_is_valid(&desc, &hdr) == false) {
+            ret = -EINVAL;
+            goto free_and_exit;
+        }
+    }
+    if (convert_endian) {
+        desc_entries->hdr = hdr;
+    }
+
+    *buffer = desc_entries;
+    goto exit;
+
+free_and_exit:
+    qemu_vfree(desc_entries);
+exit:
+    return ret;
+}
+
+
+/* Flushes the descriptor described by desc to the VHDX image file.
+ * If the descriptor is a data descriptor, then 'data' must be non-NULL,
+ * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be
+ * written.
+ *
+ * Verification is performed to make sure the sequence numbers of a data
+ * descriptor match the sequence number in the desc.
+ *
+ * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
+ * In this case, it should be noted that zeroes are written to disk, and the
+ * image file is not extended as a sparse file.
*/ +static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, +                               VHDXLogDataSector *data) +{ +    int ret = 0; +    uint64_t seq, file_offset; +    uint32_t offset = 0; +    void *buffer = NULL; +    uint64_t count = 1; +    int i; + +    buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + +    if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { +        /* data sector */ +        if (data == NULL) { +            ret = -EFAULT; +            goto exit; +        } + +        /* The sequence number of the data sector must match that +         * in the descriptor */ +        seq = data->sequence_high; +        seq <<= 32; +        seq |= data->sequence_low & 0xffffffff; + +        if (seq != desc->sequence_number) { +            ret = -EINVAL; +            goto exit; +        } + +        /* Each data sector is in total 4096 bytes, however the first +         * 8 bytes, and last 4 bytes, are located in the descriptor */ +        memcpy(buffer, &desc->leading_bytes, 8); +        offset += 8; + +        memcpy(buffer+offset, data->data, 4084); +        offset += 4084; + +        memcpy(buffer+offset, &desc->trailing_bytes, 4); + +    } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { +        /* write 'count' sectors of sector */ +        memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); +        count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; +    } else { +        error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32, +                      desc->signature); +        ret = -EINVAL; +        goto exit; +    } + +    file_offset = desc->file_offset; + +    /* count is only > 1 if we are writing zeroes */ +    for (i = 0; i < count; i++) { +        ret = bdrv_pwrite_sync(bs->file, file_offset, buffer, +                               VHDX_LOG_SECTOR_SIZE); +        if (ret < 0) { +            goto exit; +        } +        file_offset += VHDX_LOG_SECTOR_SIZE; +    } + +exit: +    qemu_vfree(buffer); +    return ret; +} + +/* Flush the entire log (as described by 'logs') to the VHDX image + * file, and then set the log to 'empty' status once complete. 
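+ * If an entry's last_file_offset lies beyond the current end of the image, + * the file may be grown (rounded up to the next 1 MB boundary) as part of + * the flush.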
+ * + * The log entries should be validated prior to flushing */ +static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, +                          VHDXLogSequence *logs) +{ +    int ret = 0; +    int i; +    uint32_t cnt, sectors_read; +    uint64_t new_file_size; +    void *data = NULL; +    VHDXLogDescEntries *desc_entries = NULL; +    VHDXLogEntryHeader hdr_tmp = { 0 }; + +    cnt = logs->count; + +    data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + +    ret = vhdx_user_visible_write(bs, s); +    if (ret < 0) { +        goto exit; +    } + +    /* each iteration represents one log sequence, which may span multiple +     * sectors */ +    while (cnt--) { +        ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp); +        if (ret < 0) { +            goto exit; +        } +        /* if the log shows a FlushedFileOffset larger than our current file +         * size, then that means the file has been truncated / corrupted, and +         * we must refuse to open it / use it */ +        if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file)) { +            ret = -EINVAL; +            goto exit; +        } + +        ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true); +        if (ret < 0) { +            goto exit; +        } + +        for (i = 0; i < desc_entries->hdr.descriptor_count; i++) { +            if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) { +                /* data sector, so read a sector to flush */ +                ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read, +                                            data, 1, false); +                if (ret < 0) { +                    goto exit; +                } +                if (sectors_read != 1) { +                    ret = -EINVAL; +                    goto exit; +                } +                vhdx_log_data_le_import(data); +            } + +            ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data); +            if (ret < 0) { +                goto exit; +            } +        } +        if (bdrv_getlength(bs->file) < desc_entries->hdr.last_file_offset) { +            new_file_size = desc_entries->hdr.last_file_offset; +            if (new_file_size % (1024*1024)) { +                /* round up to nearest 1MB boundary */ +                new_file_size = ((new_file_size >> 20) + 1) << 20; +                bdrv_truncate(bs->file, new_file_size); +            } +        } +        qemu_vfree(desc_entries); +        desc_entries = NULL; +    } + +    bdrv_flush(bs); +    /* once the log is fully flushed, indicate that we have an empty log +     * now.
This also sets the log guid to 0, to indicate an empty log */ +    vhdx_log_reset(bs, s); + +exit: +    qemu_vfree(data); +    qemu_vfree(desc_entries); +    return ret; +} + +static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, +                                   VHDXLogEntries *log, uint64_t seq, +                                   bool *valid, VHDXLogEntryHeader *entry) +{ +    int ret = 0; +    VHDXLogEntryHeader hdr; +    void *buffer = NULL; +    uint32_t i, desc_sectors, total_sectors, crc; +    uint32_t sectors_read = 0; +    VHDXLogDescEntries *desc_buffer = NULL; + +    *valid = false; + +    ret = vhdx_log_peek_hdr(bs, log, &hdr); +    if (ret < 0) { +        goto inc_and_exit; +    } + +    if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { +        goto inc_and_exit; +    } + +    if (seq > 0) { +        if (hdr.sequence_number != seq + 1) { +            goto inc_and_exit; +        } +    } + +    desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + +    /* Read all log sectors, and calculate log checksum */ + +    total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE; + + +    /* read_desc() will increment the read idx */ +    ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false); +    if (ret < 0) { +        goto free_and_exit; +    } + +    crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer, +                            desc_sectors * VHDX_LOG_SECTOR_SIZE, 4); +    crc ^= 0xffffffff; + +    buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); +    if (total_sectors > desc_sectors) { +        for (i = 0; i < total_sectors - desc_sectors; i++) { +            sectors_read = 0; +            ret = vhdx_log_read_sectors(bs, log, &sectors_read, buffer, +                                        1, false); +            if (ret < 0 || sectors_read != 1) { +                goto free_and_exit; +            } +            crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1); +            crc ^= 0xffffffff; +        } +    } +    crc ^= 0xffffffff; +    if (crc != hdr.checksum) { +        goto free_and_exit; +    } + +    *valid = true; +    *entry = hdr; +    goto free_and_exit; + +inc_and_exit: +    log->read = vhdx_log_inc_idx(log->read, log->length); + +free_and_exit: +    qemu_vfree(buffer); +    qemu_vfree(desc_buffer); +    return ret; +} + +/* Search through the log circular buffer, and find the valid, active + * log sequence, if any exists + * */ +static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s, +                           VHDXLogSequence *logs) +{ +    int ret = 0; +    uint32_t tail; +    bool seq_valid = false; +    VHDXLogSequence candidate = { 0 }; +    VHDXLogEntryHeader hdr = { 0 }; +    VHDXLogEntries curr_log; + +    memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries)); +    curr_log.write = curr_log.length;   /* assume log is full */ +    curr_log.read = 0; + + +    /* now we will go through the whole log sector by sector, until +     * we find a valid, active log sequence, or reach the end of the +     * log buffer */ +    for (;;) { +        uint64_t curr_seq = 0; +        VHDXLogSequence current = { 0 }; + +        tail = curr_log.read; + +        ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, +                                      &seq_valid, &hdr); +        if (ret < 0) { +            goto exit; +        } + +        if (seq_valid) { +            current.valid     = true; +            current.log       = curr_log; +            current.log.read  = tail; +            current.log.write =
curr_log.read; +            current.count     = 1; +            current.hdr       = hdr; + + +            for (;;) { +                ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, +                                              &seq_valid, &hdr); +                if (ret < 0) { +                    goto exit; +                } +                if (seq_valid == false) { +                    break; +                } +                current.log.write = curr_log.read; +                current.count++; + +                curr_seq = hdr.sequence_number; +            } +        } + +        if (current.valid) { +            if (candidate.valid == false || +                current.hdr.sequence_number > candidate.hdr.sequence_number) { +                candidate = current; +            } +        } + +        if (curr_log.read < tail) { +            break; +        } +    } + +    *logs = candidate; + +    if (candidate.valid) { +        /* this is the next sequence number, for writes */ +        s->log.sequence = candidate.hdr.sequence_number + 1; +    } + + +exit: +    return ret; +} + +/* Parse the replay log.  Per the VHDX spec, if the log is present + * it must be replayed prior to opening the file, even read-only. + * + * If read-only, we must replay the log in RAM (or refuse to open + * a dirty VHDX file read-only) */ +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, +                   Error **errp) +{ +    int ret = 0; +    VHDXHeader *hdr; +    VHDXLogSequence logs = { 0 }; + +    hdr = s->headers[s->curr_header]; + +    *flushed = false; + +    /* s->log.hdr is freed in vhdx_close() */ +    if (s->log.hdr == NULL) { +        s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader)); +    } + +    s->log.offset = hdr->log_offset; +    s->log.length = hdr->log_length; + +    if (s->log.offset < VHDX_LOG_MIN_SIZE || +        s->log.offset % VHDX_LOG_MIN_SIZE) { +        ret = -EINVAL; +        goto exit; +    } + +    /* per spec, only log version of 0 is supported */ +    if (hdr->log_version != 0) { +        ret = -EINVAL; +        goto exit; +    } + +    /* If either the log guid, or log length is zero, +     * then a replay log is not present */ +    if (guid_eq(hdr->log_guid, zero_guid)) { +        goto exit; +    } + +    if (hdr->log_length == 0) { +        goto exit; +    } + +    if (hdr->log_length % VHDX_LOG_MIN_SIZE) { +        ret = -EINVAL; +        goto exit; +    } + + +    /* The log is present, we need to find if and where there is an active +     * sequence of valid entries present in the log.  */ + +    ret = vhdx_log_search(bs, s, &logs); +    if (ret < 0) { +        goto exit; +    } + +    if (logs.valid) { +        if (bs->read_only) { +            ret = -EPERM; +            error_setg_errno(errp, EPERM, +                             "VHDX image file '%s' opened read-only, but " +                             "contains a log that needs to be replayed.  
To " +                             "replay the log, execute:\n qemu-img check -r " +                             "all '%s'", +                             bs->filename, bs->filename); +            goto exit; +        } +        /* now flush the log */ +        ret = vhdx_log_flush(bs, s, &logs); +        if (ret < 0) { +            goto exit; +        } +        *flushed = true; +    } + + +exit: +    return ret; +} + + + +static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc, +                                      VHDXLogDataSector *sector, void *data, +                                      uint64_t seq) +{ +    /* 8 + 4084 + 4 = 4096, 1 log sector */ +    memcpy(&desc->leading_bytes, data, 8); +    data += 8; +    cpu_to_le64s(&desc->leading_bytes); +    memcpy(sector->data, data, 4084); +    data += 4084; +    memcpy(&desc->trailing_bytes, data, 4); +    cpu_to_le32s(&desc->trailing_bytes); +    data += 4; + +    sector->sequence_high  = (uint32_t) (seq >> 32); +    sector->sequence_low   = (uint32_t) (seq & 0xffffffff); +    sector->data_signature = VHDX_LOG_DATA_SIGNATURE; + +    vhdx_log_desc_le_export(desc); +    vhdx_log_data_le_export(sector); +} + + +static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, +                          void *data, uint32_t length, uint64_t offset) +{ +    int ret = 0; +    void *buffer = NULL; +    void *merged_sector = NULL; +    void *data_tmp, *sector_write; +    unsigned int i; +    int sector_offset; +    uint32_t desc_sectors, sectors, total_length; +    uint32_t sectors_written = 0; +    uint32_t aligned_length; +    uint32_t leading_length = 0; +    uint32_t trailing_length = 0; +    uint32_t partial_sectors = 0; +    uint32_t bytes_written = 0; +    uint64_t file_offset; +    VHDXHeader *header; +    VHDXLogEntryHeader new_hdr; +    VHDXLogDescriptor *new_desc = NULL; +    VHDXLogDataSector *data_sector = NULL; +    MSGUID new_guid = { 0 }; + +    header = s->headers[s->curr_header]; + +    /* need to have offset read data, and be on 4096 byte boundary */ + +    if (length > header->log_length) { +        /* no log present.  we could create a log here instead of failing */ +        ret = -EINVAL; +        goto exit; +    } + +    if (guid_eq(header->log_guid, zero_guid)) { +        vhdx_guid_generate(&new_guid); +        vhdx_update_headers(bs, s, false, &new_guid); +    } else { +        /* currently, we require that the log be flushed after +         * every write. */ +        ret = -ENOTSUP; +        goto exit; +    } + +    /* 0 is an invalid sequence number, but may also represent the first +     * log write (or a wrapped seq) */ +    if (s->log.sequence == 0) { +        s->log.sequence = 1; +    } + +    sector_offset = offset % VHDX_LOG_SECTOR_SIZE; +    file_offset = (offset / VHDX_LOG_SECTOR_SIZE) * VHDX_LOG_SECTOR_SIZE; + +    aligned_length = length; + +    /* add in the unaligned head and tail bytes */ +    if (sector_offset) { +        leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset); +        leading_length = leading_length > length ? 
length : leading_length; +        aligned_length -= leading_length; +        partial_sectors++; +    } + +    sectors = aligned_length / VHDX_LOG_SECTOR_SIZE; +    trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE); +    if (trailing_length) { +        partial_sectors++; +    } + +    sectors += partial_sectors; + +    /* sectors is now how many sectors the data itself takes, not +     * including the header and descriptor metadata */ + +    new_hdr = (VHDXLogEntryHeader) { +                .signature           = VHDX_LOG_SIGNATURE, +                .tail                = s->log.tail, +                .sequence_number     = s->log.sequence, +                .descriptor_count    = sectors, +                .reserved            = 0, +                .flushed_file_offset = bdrv_getlength(bs->file), +                .last_file_offset    = bdrv_getlength(bs->file), +              }; + +    new_hdr.log_guid = header->log_guid; + +    desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count); + +    total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE; +    new_hdr.entry_length = total_length; + +    vhdx_log_entry_hdr_le_export(&new_hdr); + +    buffer = qemu_blockalign(bs, total_length); +    memcpy(buffer, &new_hdr, sizeof(new_hdr)); + +    new_desc = buffer + sizeof(new_hdr); +    data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE); +    data_tmp = data; + +    /* All log sectors are 4KB, so for any partial sectors we must +     * merge the data with preexisting data from the final file +     * destination */ +    merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + +    for (i = 0; i < sectors; i++) { +        new_desc->signature       = VHDX_LOG_DESC_SIGNATURE; +        new_desc->sequence_number = s->log.sequence; +        new_desc->file_offset     = file_offset; + +        if (i == 0 && leading_length) { +            /* partial sector at the front of the buffer */ +            ret = bdrv_pread(bs->file, file_offset, merged_sector, +                             VHDX_LOG_SECTOR_SIZE); +            if (ret < 0) { +                goto exit; +            } +            memcpy(merged_sector + sector_offset, data_tmp, leading_length); +            bytes_written = leading_length; +            sector_write = merged_sector; +        } else if (i == sectors - 1 && trailing_length) { +            /* partial sector at the end of the buffer */ +            ret = bdrv_pread(bs->file, +                            file_offset, +                            merged_sector + trailing_length, +                            VHDX_LOG_SECTOR_SIZE - trailing_length); +            if (ret < 0) { +                goto exit; +            } +            memcpy(merged_sector, data_tmp, trailing_length); +            bytes_written = trailing_length; +            sector_write = merged_sector; +        } else { +            bytes_written = VHDX_LOG_SECTOR_SIZE; +            sector_write = data_tmp; +        } + +        /* populate the raw sector data into the proper structures, +         * as well as update the descriptor, and convert to proper +         * endianness */ +        vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write, +                                  s->log.sequence); + +        data_tmp += bytes_written; +        data_sector++; +        new_desc++; +        file_offset += VHDX_LOG_SECTOR_SIZE; +    } + +    /* checksum covers entire entry, from the log header through the +     * last data sector */ +    vhdx_update_checksum(buffer, total_length, 
+                         offsetof(VHDXLogEntryHeader, checksum)); + +    /* now write to the log */ +    ret = vhdx_log_write_sectors(bs, &s->log, &sectors_written, buffer, +                                 desc_sectors + sectors); +    if (ret < 0) { +        goto exit; +    } + +    if (sectors_written != desc_sectors + sectors) { +        /* instead of failing, we could flush the log here */ +        ret = -EINVAL; +        goto exit; +    } + +    s->log.sequence++; +    /* write new tail */ +    s->log.tail = s->log.write; + +exit: +    qemu_vfree(buffer); +    qemu_vfree(merged_sector); +    return ret; +} + +/* Perform a log write, and then immediately flush the entire log */ +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, +                             void *data, uint32_t length, uint64_t offset) +{ +    int ret = 0; +    VHDXLogSequence logs = { .valid = true, +                             .count = 1, +                             .hdr = { 0 } }; + + +    /* Make sure data written (new and/or changed blocks) is stable +     * on disk, before creating log entry */ +    bdrv_flush(bs); +    ret = vhdx_log_write(bs, s, data, length, offset); +    if (ret < 0) { +        goto exit; +    } +    logs.log = s->log; + +    /* Make sure log is stable on disk */ +    bdrv_flush(bs); +    ret = vhdx_log_flush(bs, s, &logs); +    if (ret < 0) { +        goto exit; +    } + +    s->log = logs.log; + +exit: +    return ret; +} + diff --git a/block/vhdx.c b/block/vhdx.c new file mode 100644 index 00000000..0776de71 --- /dev/null +++ b/block/vhdx.c @@ -0,0 +1,1980 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + *  Jeff Cody <jcody@redhat.com> + * + *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + *  by Microsoft: + *      https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qemu/crc32c.h" +#include "block/vhdx.h" +#include "migration/migration.h" + +#include <uuid/uuid.h> +#include <glib.h> + +/* Options for VHDX creation */ + +#define VHDX_BLOCK_OPT_LOG_SIZE   "log_size" +#define VHDX_BLOCK_OPT_BLOCK_SIZE "block_size" +#define VHDX_BLOCK_OPT_ZERO "block_state_zero" + +typedef enum VHDXImageType { +    VHDX_TYPE_DYNAMIC = 0, +    VHDX_TYPE_FIXED, +    VHDX_TYPE_DIFFERENCING,   /* Currently unsupported */ +} VHDXImageType; + +/* Several metadata and region table data entries are identified by + * guids in a MS-specific GUID format.
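+ * As an example of the layout, the BAT region GUID defined below corresponds + * to 2DC27766-F623-4200-9D64-115E9BFD4A08 in its usual string form.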
*/ + + +/* ------- Known Region Table GUIDs ---------------------- */ +static const MSGUID bat_guid =      { .data1 = 0x2dc27766, +                                      .data2 = 0xf623, +                                      .data3 = 0x4200, +                                      .data4 = { 0x9d, 0x64, 0x11, 0x5e, +                                                 0x9b, 0xfd, 0x4a, 0x08} }; + +static const MSGUID metadata_guid = { .data1 = 0x8b7ca206, +                                      .data2 = 0x4790, +                                      .data3 = 0x4b9a, +                                      .data4 = { 0xb8, 0xfe, 0x57, 0x5f, +                                                 0x05, 0x0f, 0x88, 0x6e} }; + + + +/* ------- Known Metadata Entry GUIDs ---------------------- */ +static const MSGUID file_param_guid =   { .data1 = 0xcaa16737, +                                          .data2 = 0xfa36, +                                          .data3 = 0x4d43, +                                          .data4 = { 0xb3, 0xb6, 0x33, 0xf0, +                                                     0xaa, 0x44, 0xe7, 0x6b} }; + +static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224, +                                          .data2 = 0xcd1b, +                                          .data3 = 0x4876, +                                          .data4 = { 0xb2, 0x11, 0x5d, 0xbe, +                                                     0xd8, 0x3b, 0xf4, 0xb8} }; + +static const MSGUID page83_guid =       { .data1 = 0xbeca12ab, +                                          .data2 = 0xb2e6, +                                          .data3 = 0x4523, +                                          .data4 = { 0x93, 0xef, 0xc3, 0x09, +                                                     0xe0, 0x00, 0xc7, 0x46} }; + + +static const MSGUID phys_sector_guid =  { .data1 = 0xcda348c7, +                                          .data2 = 0x445d, +                                          .data3 = 0x4471, +                                          .data4 = { 0x9c, 0xc9, 0xe9, 0x88, +                                                     0x52, 0x51, 0xc5, 0x56} }; + +static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d, +                                            .data2 = 0xb30b, +                                            .data3 = 0x454d, +                                            .data4 = { 0xab, 0xf7, 0xd3, +                                                       0xd8, 0x48, 0x34, +                                                       0xab, 0x0c} }; + +static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d, +                                            .data2 = 0xa96f, +                                            .data3 = 0x4709, +                                            .data4 = { 0xba, 0x47, 0xf2, +                                                       0x33, 0xa8, 0xfa, +                                                       0xab, 0x5f} }; + +/* Each parent type must have a valid GUID; this is for parent images + * of type 'VHDX'.  If we were to allow e.g. 
a QCOW2 parent, we would + * need to make up our own QCOW2 GUID type */ +static const MSGUID parent_vhdx_guid __attribute__((unused)) +                                     = { .data1 = 0xb04aefb7, +                                         .data2 = 0xd19e, +                                         .data3 = 0x4a81, +                                         .data4 = { 0xb7, 0x89, 0x25, 0xb8, +                                                    0xe9, 0x44, 0x59, 0x13} }; + + +#define META_FILE_PARAMETER_PRESENT      0x01 +#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02 +#define META_PAGE_83_PRESENT             0x04 +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08 +#define META_PHYS_SECTOR_SIZE_PRESENT    0x10 +#define META_PARENT_LOCATOR_PRESENT      0x20 + +#define META_ALL_PRESENT    \ +    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \ +     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \ +     META_PHYS_SECTOR_SIZE_PRESENT) + + +typedef struct VHDXSectorInfo { +    uint32_t bat_idx;       /* BAT entry index */ +    uint32_t sectors_avail; /* sectors available in payload block */ +    uint32_t bytes_left;    /* bytes left in the block after data to r/w */ +    uint32_t bytes_avail;   /* bytes available in payload block */ +    uint64_t file_offset;   /* absolute offset in bytes, in file */ +    uint64_t block_offset;  /* block offset, in bytes */ +} VHDXSectorInfo; + +/* Calculates new checksum. + * + * Zero is substituted during crc calculation for the original crc field + * crc_offset: byte offset in buf of the buffer crc + * buf: buffer pointer + * size: size of buffer (must be > crc_offset+4) + * + * Note: The buffer should have all multi-byte data in little-endian format, + *       and the resulting checksum is in little endian format. + */ +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) +{ +    uint32_t crc; + +    assert(buf != NULL); +    assert(size > (crc_offset + sizeof(crc))); + +    memset(buf + crc_offset, 0, sizeof(crc)); +    crc =  crc32c(0xffffffff, buf, size); +    cpu_to_le32s(&crc); +    memcpy(buf + crc_offset, &crc, sizeof(crc)); + +    return crc; +} + +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, +                            int crc_offset) +{ +    uint32_t crc_new; +    uint32_t crc_orig; +    assert(buf != NULL); + +    if (crc_offset > 0) { +        memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig)); +        memset(buf + crc_offset, 0, sizeof(crc_orig)); +    } + +    crc_new = crc32c(crc, buf, size); +    if (crc_offset > 0) { +        memcpy(buf + crc_offset, &crc_orig, sizeof(crc_orig)); +    } + +    return crc_new; +} + +/* Validates the checksum of the buffer, with an in-place CRC. + * + * Zero is substituted during crc calculation for the original crc field, + * and the crc field is restored afterwards.  But the buffer will be modifed + * during the calculation, so this may not be not suitable for multi-threaded + * use. 
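+ * (The VHDX header and region table blocks are validated this way with a + * crc_offset of 4, which is assumed here to be the offset of their checksum + * fields.)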
+ * + * crc_offset: byte offset in buf of the buffer crc + * buf: buffer pointer + * size: size of buffer (must be > crc_offset+4) + * + * returns true if checksum is valid, false otherwise + */ +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset) +{ +    uint32_t crc_orig; +    uint32_t crc; + +    assert(buf != NULL); +    assert(size > (crc_offset + 4)); + +    memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig)); +    crc_orig = le32_to_cpu(crc_orig); + +    crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset); + +    return crc == crc_orig; +} + + +/* + * This generates a UUID that is compliant with the MS GUIDs used + * in the VHDX spec (and elsewhere). + */ +void vhdx_guid_generate(MSGUID *guid) +{ +    uuid_t uuid; +    assert(guid != NULL); + +    uuid_generate(uuid); +    memcpy(guid, uuid, sizeof(MSGUID)); +} + +/* Check for region overlaps inside the VHDX image */ +static int vhdx_region_check(BDRVVHDXState *s, uint64_t start, uint64_t length) +{ +    int ret = 0; +    uint64_t end; +    VHDXRegionEntry *r; + +    end = start + length; +    QLIST_FOREACH(r, &s->regions, entries) { +        if (!((start >= r->end) || (end <= r->start))) { +            ret = -EINVAL; +            goto exit; +        } +    } + +exit: +    return ret; +} + +/* Register a region for future checks */ +static void vhdx_region_register(BDRVVHDXState *s, +                                 uint64_t start, uint64_t length) +{ +    VHDXRegionEntry *r; + +    r = g_malloc0(sizeof(*r)); + +    r->start = start; +    r->end = start + length; + +    QLIST_INSERT_HEAD(&s->regions, r, entries); +} + +/* Free all registered regions */ +static void vhdx_region_unregister_all(BDRVVHDXState *s) +{ +    VHDXRegionEntry *r, *r_next; + +    QLIST_FOREACH_SAFE(r, &s->regions, entries, r_next) { +        QLIST_REMOVE(r, entries); +        g_free(r); +    } +} + +static void vhdx_set_shift_bits(BDRVVHDXState *s) +{ +    s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); +    s->sectors_per_block_bits =   31 - clz32(s->sectors_per_block); +    s->chunk_ratio_bits =         63 - clz64(s->chunk_ratio); +    s->block_size_bits =          31 - clz32(s->block_size); +} + +/* + * Per the MS VHDX Specification, for every VHDX file: + *      - The header section is fixed size - 1 MB + *      - The header section is always the first "object" + *      - The first 64KB of the header is the File Identifier + *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile") + *      - The following 512 bytes constitute a UTF-16 string identifiying the + *        software that created the file, and is optional and diagnostic only. + * + *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile" + */ +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) { +        return 100; +    } +    return 0; +} + +/* + * Writes the header to the specified offset. + * + * This will optionally read in buffer data from disk (otherwise zero-fill), + * and then update the header checksum.  
Header is converted to proper + * endianness before being written to the specified file offset + */ +static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, +                             uint64_t offset, bool read) +{ +    uint8_t *buffer = NULL; +    int ret; +    VHDXHeader *header_le; + +    assert(bs_file != NULL); +    assert(hdr != NULL); + +    /* the header checksum is not over just the packed size of VHDXHeader, +     * but rather over the entire 'reserved' range for the header, which is +     * 4KB (VHDX_HEADER_SIZE). */ + +    buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE); +    if (read) { +        /* if true, we can't assume the extra reserved bytes are 0 */ +        ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE); +        if (ret < 0) { +            goto exit; +        } +    } else { +        memset(buffer, 0, VHDX_HEADER_SIZE); +    } + +    /* overwrite the actual VHDXHeader portion */ +    header_le = (VHDXHeader *)buffer; +    memcpy(header_le, hdr, sizeof(VHDXHeader)); +    vhdx_header_le_export(hdr, header_le); +    vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, +                         offsetof(VHDXHeader, checksum)); +    ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader)); + +exit: +    qemu_vfree(buffer); +    return ret; +} + +/* Update the VHDX headers + * + * This follows the VHDX spec procedures for header updates. + * + *  - non-current header is updated with largest sequence number + */ +static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, +                              bool generate_data_write_guid, MSGUID *log_guid) +{ +    int ret = 0; +    int hdr_idx = 0; +    uint64_t header_offset = VHDX_HEADER1_OFFSET; + +    VHDXHeader *active_header; +    VHDXHeader *inactive_header; + +    /* operate on the non-current header */ +    if (s->curr_header == 0) { +        hdr_idx = 1; +        header_offset = VHDX_HEADER2_OFFSET; +    } + +    active_header   = s->headers[s->curr_header]; +    inactive_header = s->headers[hdr_idx]; + +    inactive_header->sequence_number = active_header->sequence_number + 1; + +    /* a new file guid must be generated before any file write, including +     * headers */ +    inactive_header->file_write_guid = s->session_guid; + +    /* a new data guid only needs to be generated before any guest-visible +     * writes (i.e. 
something observable via virtual disk read) */ +    if (generate_data_write_guid) { +        vhdx_guid_generate(&inactive_header->data_write_guid); +    } + +    /* update the log guid if present */ +    if (log_guid) { +        inactive_header->log_guid = *log_guid; +    } + +    ret = vhdx_write_header(bs->file, inactive_header, header_offset, true); +    if (ret < 0) { +        goto exit; +    } +    s->curr_header = hdr_idx; + +exit: +    return ret; +} + +/* + * The VHDX spec calls for header updates to be performed twice, so that both + * the current and non-current header have valid info + */ +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, +                        bool generate_data_write_guid, MSGUID *log_guid) +{ +    int ret; + +    ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); +    if (ret < 0) { +        return ret; +    } +    ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); +    return ret; +} + +/* opens the specified header block from the VHDX file header section */ +static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, +                              Error **errp) +{ +    int ret; +    VHDXHeader *header1; +    VHDXHeader *header2; +    bool h1_valid = false; +    bool h2_valid = false; +    uint64_t h1_seq = 0; +    uint64_t h2_seq = 0; +    uint8_t *buffer; + +    /* header1 & header2 are freed in vhdx_close() */ +    header1 = qemu_blockalign(bs, sizeof(VHDXHeader)); +    header2 = qemu_blockalign(bs, sizeof(VHDXHeader)); + +    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE); + +    s->headers[0] = header1; +    s->headers[1] = header2; + +    /* We have to read the whole VHDX_HEADER_SIZE instead of +     * sizeof(VHDXHeader), because the checksum is over the whole +     * region */ +    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE); +    if (ret < 0) { +        goto fail; +    } +    /* copy over just the relevant portion that we need */ +    memcpy(header1, buffer, sizeof(VHDXHeader)); + +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { +        vhdx_header_le_import(header1); +        if (header1->signature == VHDX_HEADER_SIGNATURE && +            header1->version == 1) { +            h1_seq = header1->sequence_number; +            h1_valid = true; +        } +    } + +    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE); +    if (ret < 0) { +        goto fail; +    } +    /* copy over just the relevant portion that we need */ +    memcpy(header2, buffer, sizeof(VHDXHeader)); + +    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { +        vhdx_header_le_import(header2); +        if (header2->signature == VHDX_HEADER_SIGNATURE && +            header2->version == 1) { +            h2_seq = header2->sequence_number; +            h2_valid = true; +        } +    } + +    /* If there is only 1 valid header (or no valid headers), we +     * don't care what the sequence numbers are */ +    if (h1_valid && !h2_valid) { +        s->curr_header = 0; +    } else if (!h1_valid && h2_valid) { +        s->curr_header = 1; +    } else if (!h1_valid && !h2_valid) { +        goto fail; +    } else { +        /* If both headers are valid, then we choose the active one by the +         * highest sequence number.  
If the sequence numbers are equal, that is +         * invalid */ +        if (h1_seq > h2_seq) { +            s->curr_header = 0; +        } else if (h2_seq > h1_seq) { +            s->curr_header = 1; +        } else { +            /* The Microsoft Disk2VHD tool will create 2 identical +             * headers, with identical sequence numbers.  If the headers are +             * identical, don't consider the file corrupt */ +            if (!memcmp(header1, header2, sizeof(VHDXHeader))) { +                s->curr_header = 0; +            } else { +                goto fail; +            } +        } +    } + +    vhdx_region_register(s, s->headers[s->curr_header]->log_offset, +                            s->headers[s->curr_header]->log_length); +    goto exit; + +fail: +    error_setg_errno(errp, -ret, "No valid VHDX header found"); +    qemu_vfree(header1); +    qemu_vfree(header2); +    s->headers[0] = NULL; +    s->headers[1] = NULL; +exit: +    qemu_vfree(buffer); +} + + +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) +{ +    int ret = 0; +    uint8_t *buffer; +    int offset = 0; +    VHDXRegionTableEntry rt_entry; +    uint32_t i; +    bool bat_rt_found = false; +    bool metadata_rt_found = false; + +    /* We have to read the whole 64KB block, because the crc32 is over the +     * whole block */ +    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE); + +    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer, +                     VHDX_HEADER_BLOCK_SIZE); +    if (ret < 0) { +        goto fail; +    } +    memcpy(&s->rt, buffer, sizeof(s->rt)); +    offset += sizeof(s->rt); + +    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) { +        ret = -EINVAL; +        goto fail; +    } + +    vhdx_region_header_le_import(&s->rt); + +    if (s->rt.signature != VHDX_REGION_SIGNATURE) { +        ret = -EINVAL; +        goto fail; +    } + + +    /* Per spec, maximum region table entry count is 2047 */ +    if (s->rt.entry_count > 2047) { +        ret = -EINVAL; +        goto fail; +    } + +    for (i = 0; i < s->rt.entry_count; i++) { +        memcpy(&rt_entry, buffer + offset, sizeof(rt_entry)); +        offset += sizeof(rt_entry); + +        vhdx_region_entry_le_import(&rt_entry); + +        /* check for region overlap between these entries, and any +         * other memory regions in the file */ +        ret = vhdx_region_check(s, rt_entry.file_offset, rt_entry.length); +        if (ret < 0) { +            goto fail; +        } + +        vhdx_region_register(s, rt_entry.file_offset, rt_entry.length); + +        /* see if we recognize the entry */ +        if (guid_eq(rt_entry.guid, bat_guid)) { +            /* must be unique; if we have already found it this is invalid */ +            if (bat_rt_found) { +                ret = -EINVAL; +                goto fail; +            } +            bat_rt_found = true; +            s->bat_rt = rt_entry; +            continue; +        } + +        if (guid_eq(rt_entry.guid, metadata_guid)) { +            /* must be unique; if we have already found it this is invalid */ +            if (metadata_rt_found) { +                ret = -EINVAL; +                goto fail; +            } +            metadata_rt_found = true; +            s->metadata_rt = rt_entry; +            continue; +        } + +        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) { +            /* cannot read vhdx file - required region table entry that +             * we do not understand.  
per spec, we must fail to open */ +            ret = -ENOTSUP; +            goto fail; +        } +    } + +    if (!bat_rt_found || !metadata_rt_found) { +        ret = -EINVAL; +        goto fail; +    } + +    ret = 0; + +fail: +    qemu_vfree(buffer); +    return ret; +} + + + +/* Metadata initial parser + * + * This loads all the metadata entry fields.  This may cause additional + * fields to be processed (e.g. parent locator, etc..). + * + * There are 5 Metadata items that are always required: + *      - File Parameters (block size, has a parent) + *      - Virtual Disk Size (size, in bytes, of the virtual drive) + *      - Page 83 Data (scsi page 83 guid) + *      - Logical Sector Size (logical sector size in bytes, either 512 or + *                             4096.  We only support 512 currently) + *      - Physical Sector Size (512 or 4096) + * + * Also, if the File Parameters indicate this is a differencing file, + * we must also look for the Parent Locator metadata item. + */ +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) +{ +    int ret = 0; +    uint8_t *buffer; +    int offset = 0; +    uint32_t i = 0; +    VHDXMetadataTableEntry md_entry; + +    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE); + +    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer, +                     VHDX_METADATA_TABLE_MAX_SIZE); +    if (ret < 0) { +        goto exit; +    } +    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr)); +    offset += sizeof(s->metadata_hdr); + +    vhdx_metadata_header_le_import(&s->metadata_hdr); + +    if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) { +        ret = -EINVAL; +        goto exit; +    } + +    s->metadata_entries.present = 0; + +    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) > +        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) { +        ret = -EINVAL; +        goto exit; +    } + +    for (i = 0; i < s->metadata_hdr.entry_count; i++) { +        memcpy(&md_entry, buffer + offset, sizeof(md_entry)); +        offset += sizeof(md_entry); + +        vhdx_metadata_entry_le_import(&md_entry); + +        if (guid_eq(md_entry.item_id, file_param_guid)) { +            if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) { +                ret = -EINVAL; +                goto exit; +            } +            s->metadata_entries.file_parameters_entry = md_entry; +            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT; +            continue; +        } + +        if (guid_eq(md_entry.item_id, virtual_size_guid)) { +            if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) { +                ret = -EINVAL; +                goto exit; +            } +            s->metadata_entries.virtual_disk_size_entry = md_entry; +            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT; +            continue; +        } + +        if (guid_eq(md_entry.item_id, page83_guid)) { +            if (s->metadata_entries.present & META_PAGE_83_PRESENT) { +                ret = -EINVAL; +                goto exit; +            } +            s->metadata_entries.page83_data_entry = md_entry; +            s->metadata_entries.present |= META_PAGE_83_PRESENT; +            continue; +        } + +        if (guid_eq(md_entry.item_id, logical_sector_guid)) { +            if (s->metadata_entries.present & +                META_LOGICAL_SECTOR_SIZE_PRESENT) { +                ret = -EINVAL; +                goto exit; +            } +            
s->metadata_entries.logical_sector_size_entry = md_entry; +            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT; +            continue; +        } + +        if (guid_eq(md_entry.item_id, phys_sector_guid)) { +            if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) { +                ret = -EINVAL; +                goto exit; +            } +            s->metadata_entries.phys_sector_size_entry = md_entry; +            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT; +            continue; +        } + +        if (guid_eq(md_entry.item_id, parent_locator_guid)) { +            if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) { +                ret = -EINVAL; +                goto exit; +            } +            s->metadata_entries.parent_locator_entry = md_entry; +            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT; +            continue; +        } + +        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) { +            /* cannot read vhdx file - required region table entry that +             * we do not understand.  per spec, we must fail to open */ +            ret = -ENOTSUP; +            goto exit; +        } +    } + +    if (s->metadata_entries.present != META_ALL_PRESENT) { +        ret = -ENOTSUP; +        goto exit; +    } + +    ret = bdrv_pread(bs->file, +                     s->metadata_entries.file_parameters_entry.offset +                                         + s->metadata_rt.file_offset, +                     &s->params, +                     sizeof(s->params)); + +    if (ret < 0) { +        goto exit; +    } + +    le32_to_cpus(&s->params.block_size); +    le32_to_cpus(&s->params.data_bits); + + +    /* We now have the file parameters, so we can tell if this is a +     * differencing file (i.e.. 
has_parent), is dynamic or fixed +     * sized (leave_blocks_allocated), and the block size */ + +    /* The parent locator required iff the file parameters has_parent set */ +    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { +        if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) { +            /* TODO: parse  parent locator fields */ +            ret = -ENOTSUP; /* temp, until differencing files are supported */ +            goto exit; +        } else { +            /* if has_parent is set, but there is not parent locator present, +             * then that is an invalid combination */ +            ret = -EINVAL; +            goto exit; +        } +    } + +    /* determine virtual disk size, logical sector size, +     * and phys sector size */ + +    ret = bdrv_pread(bs->file, +                     s->metadata_entries.virtual_disk_size_entry.offset +                                           + s->metadata_rt.file_offset, +                     &s->virtual_disk_size, +                     sizeof(uint64_t)); +    if (ret < 0) { +        goto exit; +    } +    ret = bdrv_pread(bs->file, +                     s->metadata_entries.logical_sector_size_entry.offset +                                             + s->metadata_rt.file_offset, +                     &s->logical_sector_size, +                     sizeof(uint32_t)); +    if (ret < 0) { +        goto exit; +    } +    ret = bdrv_pread(bs->file, +                     s->metadata_entries.phys_sector_size_entry.offset +                                          + s->metadata_rt.file_offset, +                     &s->physical_sector_size, +                     sizeof(uint32_t)); +    if (ret < 0) { +        goto exit; +    } + +    le64_to_cpus(&s->virtual_disk_size); +    le32_to_cpus(&s->logical_sector_size); +    le32_to_cpus(&s->physical_sector_size); + +    if (s->params.block_size < VHDX_BLOCK_SIZE_MIN || +        s->params.block_size > VHDX_BLOCK_SIZE_MAX) { +        ret = -EINVAL; +        goto exit; +    } + +    /* only 2 supported sector sizes */ +    if (s->logical_sector_size != 512 && s->logical_sector_size != 4096) { +        ret = -EINVAL; +        goto exit; +    } + +    /* Both block_size and sector_size are guaranteed powers of 2, below. +       Due to range checks above, s->sectors_per_block can never be < 256 */ +    s->sectors_per_block = s->params.block_size / s->logical_sector_size; +    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * +                     (uint64_t)s->logical_sector_size / +                     (uint64_t)s->params.block_size; + +    /* These values are ones we will want to use for division / multiplication +     * later on, and they are all guaranteed (per the spec) to be powers of 2, +     * so we can take advantage of that for shift operations during +     * reads/writes */ +    if (s->logical_sector_size & (s->logical_sector_size - 1)) { +        ret = -EINVAL; +        goto exit; +    } +    if (s->sectors_per_block & (s->sectors_per_block - 1)) { +        ret = -EINVAL; +        goto exit; +    } +    if (s->chunk_ratio & (s->chunk_ratio - 1)) { +        ret = -EINVAL; +        goto exit; +    } +    s->block_size = s->params.block_size; +    if (s->block_size & (s->block_size - 1)) { +        ret = -EINVAL; +        goto exit; +    } + +    vhdx_set_shift_bits(s); + +    ret = 0; + +exit: +    qemu_vfree(buffer); +    return ret; +} + +/* + * Calculate the number of BAT entries, including sector + * bitmap entries. 
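+ * + * One sector-bitmap entry is interleaved after every chunk_ratio payload + * entries, which the calculation below accounts for; e.g. a 1 GiB disk with + * a 32 MiB block size needs 32 payload entries plus any interleaved bitmap + * entries.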
+ */ +static void vhdx_calc_bat_entries(BDRVVHDXState *s) +{ +    uint32_t data_blocks_cnt, bitmap_blocks_cnt; + +    data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; +    if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { +        data_blocks_cnt++; +    } +    bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; +    if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { +        bitmap_blocks_cnt++; +    } + +    if (s->parent_entries) { +        s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); +    } else { +        s->bat_entries = data_blocks_cnt + +                         ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); +    } + +} + +static void vhdx_close(BlockDriverState *bs) +{ +    BDRVVHDXState *s = bs->opaque; +    qemu_vfree(s->headers[0]); +    s->headers[0] = NULL; +    qemu_vfree(s->headers[1]); +    s->headers[1] = NULL; +    qemu_vfree(s->bat); +    s->bat = NULL; +    qemu_vfree(s->parent_entries); +    s->parent_entries = NULL; +    migrate_del_blocker(s->migration_blocker); +    error_free(s->migration_blocker); +    qemu_vfree(s->log.hdr); +    s->log.hdr = NULL; +    vhdx_region_unregister_all(s); +} + +static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, +                     Error **errp) +{ +    BDRVVHDXState *s = bs->opaque; +    int ret = 0; +    uint32_t i; +    uint64_t signature; +    Error *local_err = NULL; + +    s->bat = NULL; +    s->first_visible_write = true; + +    qemu_co_mutex_init(&s->lock); +    QLIST_INIT(&s->regions); + +    /* validate the file signature */ +    ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t)); +    if (ret < 0) { +        goto fail; +    } +    if (memcmp(&signature, "vhdxfile", 8)) { +        ret = -EINVAL; +        goto fail; +    } + +    /* This is used for any header updates, for the file_write_guid. 
+     * The spec dictates that a new value should be used for the first +     * header update */ +    vhdx_guid_generate(&s->session_guid); + +    vhdx_parse_header(bs, s, &local_err); +    if (local_err != NULL) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp); +    if (ret < 0) { +        goto fail; +    } + +    ret = vhdx_open_region_tables(bs, s); +    if (ret < 0) { +        goto fail; +    } + +    ret = vhdx_parse_metadata(bs, s); +    if (ret < 0) { +        goto fail; +    } + +    s->block_size = s->params.block_size; + +    /* the VHDX spec dictates that virtual_disk_size is always a multiple of +     * logical_sector_size */ +    bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits; + +    vhdx_calc_bat_entries(s); + +    s->bat_offset = s->bat_rt.file_offset; + +    if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) { +        /* BAT allocation is not large enough for all entries */ +        ret = -EINVAL; +        goto fail; +    } + +    /* s->bat is freed in vhdx_close() */ +    s->bat = qemu_try_blockalign(bs->file, s->bat_rt.length); +    if (s->bat == NULL) { +        ret = -ENOMEM; +        goto fail; +    } + +    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); +    if (ret < 0) { +        goto fail; +    } + +    uint64_t payblocks = s->chunk_ratio; +    /* endian convert, and verify populated BAT field file offsets against +     * region table and log entries */ +    for (i = 0; i < s->bat_entries; i++) { +        le64_to_cpus(&s->bat[i]); +        if (payblocks--) { +            /* payload bat entries */ +            if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) == +                    PAYLOAD_BLOCK_FULLY_PRESENT) { +                ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK, +                                        s->block_size); +                if (ret < 0) { +                    goto fail; +                } +            } +        } else { +            payblocks = s->chunk_ratio; +            /* Once differencing files are supported, verify sector bitmap +             * blocks here */ +        } +    } + +    if (flags & BDRV_O_RDWR) { +        ret = vhdx_update_headers(bs, s, false, NULL); +        if (ret < 0) { +            goto fail; +        } +    } + +    /* TODO: differencing files */ + +    /* Disable migration when VHDX images are used */ +    error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " +               "does not support live migration", +               bdrv_get_device_or_node_name(bs)); +    migrate_add_blocker(s->migration_blocker); + +    return 0; +fail: +    vhdx_close(bs); +    return ret; +} + +static int vhdx_reopen_prepare(BDRVReopenState *state, +                               BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + + +/* + * Perform sector to block offset translations, to get various + * sector and file offsets into the image.  
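+ * The BAT index is derived from the sector number and then advanced past the + * sector-bitmap entries interleaved every chunk_ratio payload blocks; a + * resulting file offset of 0 means the block is not allocated in the file.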
See VHDXSectorInfo + */ +static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, +                                 int nb_sectors, VHDXSectorInfo *sinfo) +{ +    uint32_t block_offset; + +    sinfo->bat_idx = sector_num >> s->sectors_per_block_bits; +    /* effectively a modulo - this gives us the offset into the block +     * (in sector sizes) for our sector number */ +    block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits); +    /* the chunk ratio gives us the interleaving of the sector +     * bitmaps, so we need to advance our page block index by the +     * sector bitmaps entry number */ +    sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits; + +    /* the number of sectors we can read/write in this cycle */ +    sinfo->sectors_avail = s->sectors_per_block - block_offset; + +    sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits; + +    if (sinfo->sectors_avail > nb_sectors) { +        sinfo->sectors_avail = nb_sectors; +    } + +    sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits; + +    sinfo->file_offset = s->bat[sinfo->bat_idx] & VHDX_BAT_FILE_OFF_MASK; + +    sinfo->block_offset = block_offset << s->logical_sector_size_bits; + +    /* The file offset must be past the header section, so must be > 0 */ +    if (sinfo->file_offset == 0) { +        return; +    } + +    /* block offset is the offset in vhdx logical sectors, in +     * the payload data block. Convert that to a byte offset +     * in the block, and add in the payload data block offset +     * in the file, in bytes, to get the final read address */ + +    sinfo->file_offset += sinfo->block_offset; +} + + +static int vhdx_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVVHDXState *s = bs->opaque; + +    bdi->cluster_size = s->block_size; + +    bdi->unallocated_blocks_are_zero = +        (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) == 0; + +    return 0; +} + + +static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, +                                      int nb_sectors, QEMUIOVector *qiov) +{ +    BDRVVHDXState *s = bs->opaque; +    int ret = 0; +    VHDXSectorInfo sinfo; +    uint64_t bytes_done = 0; +    QEMUIOVector hd_qiov; + +    qemu_iovec_init(&hd_qiov, qiov->niov); + +    qemu_co_mutex_lock(&s->lock); + +    while (nb_sectors > 0) { +        /* We are a differencing file, so we need to inspect the sector bitmap +         * to see if we have the data or not */ +        if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { +            /* not supported yet */ +            ret = -ENOTSUP; +            goto exit; +        } else { +            vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); + +            qemu_iovec_reset(&hd_qiov); +            qemu_iovec_concat(&hd_qiov, qiov,  bytes_done, sinfo.bytes_avail); + +            /* check the payload block state */ +            switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) { +            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ +            case PAYLOAD_BLOCK_UNDEFINED: +            case PAYLOAD_BLOCK_UNMAPPED: +            case PAYLOAD_BLOCK_UNMAPPED_v095: +            case PAYLOAD_BLOCK_ZERO: +                /* return zero */ +                qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail); +                break; +            case PAYLOAD_BLOCK_FULLY_PRESENT: +                qemu_co_mutex_unlock(&s->lock); +                ret = bdrv_co_readv(bs->file, +                                    
sinfo.file_offset >> BDRV_SECTOR_BITS, +                                    sinfo.sectors_avail, &hd_qiov); +                qemu_co_mutex_lock(&s->lock); +                if (ret < 0) { +                    goto exit; +                } +                break; +            case PAYLOAD_BLOCK_PARTIALLY_PRESENT: +                /* we don't yet support difference files, fall through +                 * to error */ +            default: +                ret = -EIO; +                goto exit; +                break; +            } +            nb_sectors -= sinfo.sectors_avail; +            sector_num += sinfo.sectors_avail; +            bytes_done += sinfo.bytes_avail; +        } +    } +    ret = 0; +exit: +    qemu_co_mutex_unlock(&s->lock); +    qemu_iovec_destroy(&hd_qiov); +    return ret; +} + +/* + * Allocate a new payload block at the end of the file. + * + * Allocation will happen at 1MB alignment inside the file + * + * Returns the file offset start of the new payload block + */ +static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, +                                    uint64_t *new_offset) +{ +    *new_offset = bdrv_getlength(bs->file); + +    /* per the spec, the address for a block is in units of 1MB */ +    *new_offset = ROUND_UP(*new_offset, 1024 * 1024); + +    return bdrv_truncate(bs->file, *new_offset + s->block_size); +} + +/* + * Update the BAT table entry with the new file offset, and the new entry + * state */ +static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s, +                                       VHDXSectorInfo *sinfo, +                                       uint64_t *bat_entry_le, +                                       uint64_t *bat_offset, int state) +{ +    /* The BAT entry is a uint64, with 44 bits for the file offset in units of +     * 1MB, and 3 bits for the block state. */ +    if ((state == PAYLOAD_BLOCK_ZERO)        || +        (state == PAYLOAD_BLOCK_UNDEFINED)   || +        (state == PAYLOAD_BLOCK_NOT_PRESENT) || +        (state == PAYLOAD_BLOCK_UNMAPPED)) { +        s->bat[sinfo->bat_idx]  = 0;  /* For PAYLOAD_BLOCK_ZERO, the +                                         FileOffsetMB field is denoted as +                                         'reserved' in the v1.0 spec.  
If it is +                                         non-zero, MS Hyper-V will fail to read +                                         the disk image */ +    } else { +        s->bat[sinfo->bat_idx]  = sinfo->file_offset; +    } + +    s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK; + +    *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]); +    *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry); + +} + +/* Per the spec, on the first write of guest-visible data to the file the + * data write guid must be updated in the header */ +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s) +{ +    int ret = 0; +    if (s->first_visible_write) { +        s->first_visible_write = false; +        ret = vhdx_update_headers(bs, s, true, NULL); +    } +    return ret; +} + +static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, +                                      int nb_sectors, QEMUIOVector *qiov) +{ +    int ret = -ENOTSUP; +    BDRVVHDXState *s = bs->opaque; +    VHDXSectorInfo sinfo; +    uint64_t bytes_done = 0; +    uint64_t bat_entry = 0; +    uint64_t bat_entry_offset = 0; +    QEMUIOVector hd_qiov; +    struct iovec iov1 = { 0 }; +    struct iovec iov2 = { 0 }; +    int sectors_to_write; +    int bat_state; +    uint64_t bat_prior_offset = 0; +    bool bat_update = false; + +    qemu_iovec_init(&hd_qiov, qiov->niov); + +    qemu_co_mutex_lock(&s->lock); + +    ret = vhdx_user_visible_write(bs, s); +    if (ret < 0) { +        goto exit; +    } + +    while (nb_sectors > 0) { +        bool use_zero_buffers = false; +        bat_update = false; +        if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { +            /* not supported yet */ +            ret = -ENOTSUP; +            goto exit; +        } else { +            vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); +            sectors_to_write = sinfo.sectors_avail; + +            qemu_iovec_reset(&hd_qiov); +            /* check the payload block state */ +            bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK; +            switch (bat_state) { +            case PAYLOAD_BLOCK_ZERO: +                /* in this case, we need to preserve zero writes for +                 * data that is not part of this write, so we must pad +                 * the rest of the buffer to zeroes */ + +                /* if we are on a posix system with ftruncate() that extends +                 * a file, then it is zero-filled for us.  
On Win32, the raw +                 * layer uses SetFilePointer and SetFileEnd, which does not +                 * zero fill AFAIK */ + +                /* Queue another write of zero buffers if the underlying file +                 * does not zero-fill on file extension */ + +                if (bdrv_has_zero_init(bs->file) == 0) { +                    use_zero_buffers = true; + +                    /* zero fill the front, if any */ +                    if (sinfo.block_offset) { +                        iov1.iov_len = sinfo.block_offset; +                        iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); +                        memset(iov1.iov_base, 0, iov1.iov_len); +                        qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, +                                              iov1.iov_len); +                        sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; +                    } + +                    /* our actual data */ +                    qemu_iovec_concat(&hd_qiov, qiov,  bytes_done, +                                      sinfo.bytes_avail); + +                    /* zero fill the back, if any */ +                    if ((sinfo.bytes_avail - sinfo.block_offset) < +                         s->block_size) { +                        iov2.iov_len = s->block_size - +                                      (sinfo.bytes_avail + sinfo.block_offset); +                        iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); +                        memset(iov2.iov_base, 0, iov2.iov_len); +                        qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, +                                              iov2.iov_len); +                        sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; +                    } +                } +                /* fall through */ +            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ +            case PAYLOAD_BLOCK_UNMAPPED: +            case PAYLOAD_BLOCK_UNMAPPED_v095: +            case PAYLOAD_BLOCK_UNDEFINED: +                bat_prior_offset = sinfo.file_offset; +                ret = vhdx_allocate_block(bs, s, &sinfo.file_offset); +                if (ret < 0) { +                    goto exit; +                } +                /* once we support differencing files, this may also be +                 * partially present */ +                /* update block state to the newly specified state */ +                vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, +                                            &bat_entry_offset, +                                            PAYLOAD_BLOCK_FULLY_PRESENT); +                bat_update = true; +                /* since we just allocated a block, file_offset is the +                 * beginning of the payload block. 
It needs to be the +                 * write address, which includes the offset into the block */ +                if (!use_zero_buffers) { +                    sinfo.file_offset += sinfo.block_offset; +                } +                /* fall through */ +            case PAYLOAD_BLOCK_FULLY_PRESENT: +                /* if the file offset address is in the header zone, +                 * there is a problem */ +                if (sinfo.file_offset < (1024 * 1024)) { +                    ret = -EFAULT; +                    goto error_bat_restore; +                } + +                if (!use_zero_buffers) { +                    qemu_iovec_concat(&hd_qiov, qiov,  bytes_done, +                                      sinfo.bytes_avail); +                } +                /* block exists, so we can just overwrite it */ +                qemu_co_mutex_unlock(&s->lock); +                ret = bdrv_co_writev(bs->file, +                                    sinfo.file_offset >> BDRV_SECTOR_BITS, +                                    sectors_to_write, &hd_qiov); +                qemu_co_mutex_lock(&s->lock); +                if (ret < 0) { +                    goto error_bat_restore; +                } +                break; +            case PAYLOAD_BLOCK_PARTIALLY_PRESENT: +                /* we don't yet support difference files, fall through +                 * to error */ +            default: +                ret = -EIO; +                goto exit; +                break; +            } + +            if (bat_update) { +                /* this will update the BAT entry into the log journal, and +                 * then flush the log journal out to disk */ +                ret =  vhdx_log_write_and_flush(bs, s, &bat_entry, +                                                sizeof(VHDXBatEntry), +                                                bat_entry_offset); +                if (ret < 0) { +                    goto exit; +                } +            } + +            nb_sectors -= sinfo.sectors_avail; +            sector_num += sinfo.sectors_avail; +            bytes_done += sinfo.bytes_avail; + +        } +    } + +    goto exit; + +error_bat_restore: +    if (bat_update) { +        /* keep metadata in sync, and restore the bat entry state +         * if error. 
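The on-disk BAT is only
+         * touched by the log write further down, which this error path never
+         * reached, so restoring the in-memory entry to its prior file offset
+         * and state is sufficient.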
*/ +        sinfo.file_offset = bat_prior_offset; +        vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, +                                    &bat_entry_offset, bat_state); +    } +exit: +    qemu_vfree(iov1.iov_base); +    qemu_vfree(iov2.iov_base); +    qemu_co_mutex_unlock(&s->lock); +    qemu_iovec_destroy(&hd_qiov); +    return ret; +} + + + +/* + * Create VHDX Headers + * + * There are 2 headers, and the highest sequence number will represent + * the active header + */ +static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size, +                                   uint32_t log_size) +{ +    int ret = 0; +    VHDXHeader *hdr = NULL; + +    hdr = g_new0(VHDXHeader, 1); + +    hdr->signature       = VHDX_HEADER_SIGNATURE; +    hdr->sequence_number = g_random_int(); +    hdr->log_version     = 0; +    hdr->version         = 1; +    hdr->log_length      = log_size; +    hdr->log_offset      = VHDX_HEADER_SECTION_END; +    vhdx_guid_generate(&hdr->file_write_guid); +    vhdx_guid_generate(&hdr->data_write_guid); + +    ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false); +    if (ret < 0) { +        goto exit; +    } +    hdr->sequence_number++; +    ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false); +    if (ret < 0) { +        goto exit; +    } + +exit: +    g_free(hdr); +    return ret; +} + +#define VHDX_METADATA_ENTRY_BUFFER_SIZE \ +                                    (sizeof(VHDXFileParameters)               +\ +                                     sizeof(VHDXVirtualDiskSize)              +\ +                                     sizeof(VHDXPage83Data)                   +\ +                                     sizeof(VHDXVirtualDiskLogicalSectorSize) +\ +                                     sizeof(VHDXVirtualDiskPhysicalSectorSize)) + +/* + * Create the Metadata entries. + * + * For more details on the entries, see section 3.5 (pg 29) in the + * VHDX 1.00 specification. + * + * We support 5 metadata entries (all required by spec): + *          File Parameters, + *          Virtual Disk Size, + *          Page 83 Data, + *          Logical Sector Size, + *          Physical Sector Size + * + * The first 64KB of the Metadata section is reserved for the metadata + * header and entries; beyond that, the metadata items themselves reside. 
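+ *
+ * As a rough sketch of what this function writes (offsets are relative to
+ * metadata_offset; item sizes are the sizeof()s used below):
+ *
+ *      0    -- metadata table header plus entry table (64KB reserved)
+ *      64KB -- VHDXFileParameters, VHDXVirtualDiskSize, VHDXPage83Data,
+ *              VHDXVirtualDiskLogicalSectorSize and
+ *              VHDXVirtualDiskPhysicalSectorSize, packed back-to-back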
+ */ +static int vhdx_create_new_metadata(BlockDriverState *bs, +                                    uint64_t image_size, +                                    uint32_t block_size, +                                    uint32_t sector_size, +                                    uint64_t metadata_offset, +                                    VHDXImageType type) +{ +    int ret = 0; +    uint32_t offset = 0; +    void *buffer = NULL; +    void *entry_buffer; +    VHDXMetadataTableHeader *md_table;; +    VHDXMetadataTableEntry  *md_table_entry; + +    /* Metadata entries */ +    VHDXFileParameters     *mt_file_params; +    VHDXVirtualDiskSize    *mt_virtual_size; +    VHDXPage83Data         *mt_page83; +    VHDXVirtualDiskLogicalSectorSize  *mt_log_sector_size; +    VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size; + +    entry_buffer = g_malloc0(VHDX_METADATA_ENTRY_BUFFER_SIZE); + +    mt_file_params = entry_buffer; +    offset += sizeof(VHDXFileParameters); +    mt_virtual_size = entry_buffer + offset; +    offset += sizeof(VHDXVirtualDiskSize); +    mt_page83 = entry_buffer + offset; +    offset += sizeof(VHDXPage83Data); +    mt_log_sector_size = entry_buffer + offset; +    offset += sizeof(VHDXVirtualDiskLogicalSectorSize); +    mt_phys_sector_size = entry_buffer + offset; + +    mt_file_params->block_size = cpu_to_le32(block_size); +    if (type == VHDX_TYPE_FIXED) { +        mt_file_params->data_bits |= VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED; +        cpu_to_le32s(&mt_file_params->data_bits); +    } + +    vhdx_guid_generate(&mt_page83->page_83_data); +    cpu_to_leguids(&mt_page83->page_83_data); +    mt_virtual_size->virtual_disk_size        = cpu_to_le64(image_size); +    mt_log_sector_size->logical_sector_size   = cpu_to_le32(sector_size); +    mt_phys_sector_size->physical_sector_size = cpu_to_le32(sector_size); + +    buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE); +    md_table = buffer; + +    md_table->signature   = VHDX_METADATA_SIGNATURE; +    md_table->entry_count = 5; +    vhdx_metadata_header_le_export(md_table); + + +    /* This will reference beyond the reserved table portion */ +    offset = 64 * KiB; + +    md_table_entry = buffer + sizeof(VHDXMetadataTableHeader); + +    md_table_entry[0].item_id = file_param_guid; +    md_table_entry[0].offset  = offset; +    md_table_entry[0].length  = sizeof(VHDXFileParameters); +    md_table_entry[0].data_bits |= VHDX_META_FLAGS_IS_REQUIRED; +    offset += md_table_entry[0].length; +    vhdx_metadata_entry_le_export(&md_table_entry[0]); + +    md_table_entry[1].item_id = virtual_size_guid; +    md_table_entry[1].offset  = offset; +    md_table_entry[1].length  = sizeof(VHDXVirtualDiskSize); +    md_table_entry[1].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | +                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK; +    offset += md_table_entry[1].length; +    vhdx_metadata_entry_le_export(&md_table_entry[1]); + +    md_table_entry[2].item_id = page83_guid; +    md_table_entry[2].offset  = offset; +    md_table_entry[2].length  = sizeof(VHDXPage83Data); +    md_table_entry[2].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | +                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK; +    offset += md_table_entry[2].length; +    vhdx_metadata_entry_le_export(&md_table_entry[2]); + +    md_table_entry[3].item_id = logical_sector_guid; +    md_table_entry[3].offset  = offset; +    md_table_entry[3].length  = sizeof(VHDXVirtualDiskLogicalSectorSize); +    md_table_entry[3].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | +               
                    VHDX_META_FLAGS_IS_VIRTUAL_DISK; +    offset += md_table_entry[3].length; +    vhdx_metadata_entry_le_export(&md_table_entry[3]); + +    md_table_entry[4].item_id = phys_sector_guid; +    md_table_entry[4].offset  = offset; +    md_table_entry[4].length  = sizeof(VHDXVirtualDiskPhysicalSectorSize); +    md_table_entry[4].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | +                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK; +    vhdx_metadata_entry_le_export(&md_table_entry[4]); + +    ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE); +    if (ret < 0) { +        goto exit; +    } + +    ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer, +                      VHDX_METADATA_ENTRY_BUFFER_SIZE); +    if (ret < 0) { +        goto exit; +    } + + +exit: +    g_free(buffer); +    g_free(entry_buffer); +    return ret; +} + +/* This create the actual BAT itself.  We currently only support + * 'Dynamic' and 'Fixed' image types. + * + *  Dynamic images: default state of the BAT is all zeroes. + * + *  Fixed images: default state of the BAT is fully populated, with + *                file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT. + */ +static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, +                           uint64_t image_size, VHDXImageType type, +                           bool use_zero_blocks, uint64_t file_offset, +                           uint32_t length) +{ +    int ret = 0; +    uint64_t data_file_offset; +    uint64_t total_sectors = 0; +    uint64_t sector_num = 0; +    uint64_t unused; +    int block_state; +    VHDXSectorInfo sinfo; + +    assert(s->bat == NULL); + +    /* this gives a data start after BAT/bitmap entries, and well +     * past any metadata entries (with a 4 MB buffer for future +     * expansion */ +    data_file_offset = file_offset + length + 5 * MiB; +    total_sectors = image_size >> s->logical_sector_size_bits; + +    if (type == VHDX_TYPE_DYNAMIC) { +        /* All zeroes, so we can just extend the file - the end of the BAT +         * is the furthest thing we have written yet */ +        ret = bdrv_truncate(bs, data_file_offset); +        if (ret < 0) { +            goto exit; +        } +    } else if (type == VHDX_TYPE_FIXED) { +        ret = bdrv_truncate(bs, data_file_offset + image_size); +        if (ret < 0) { +            goto exit; +        } +    } else { +        ret = -ENOTSUP; +        goto exit; +    } + +    if (type == VHDX_TYPE_FIXED || +                use_zero_blocks || +                bdrv_has_zero_init(bs) == 0) { +        /* for a fixed file, the default BAT entry is not zero */ +        s->bat = g_try_malloc0(length); +        if (length && s->bat == NULL) { +            ret = -ENOMEM; +            goto exit; +        } +        block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT : +                                                PAYLOAD_BLOCK_NOT_PRESENT; +        block_state = use_zero_blocks ? 
PAYLOAD_BLOCK_ZERO : block_state; +        /* fill the BAT by emulating sector writes of sectors_per_block size */ +        while (sector_num < total_sectors) { +            vhdx_block_translate(s, sector_num, s->sectors_per_block, &sinfo); +            sinfo.file_offset = data_file_offset + +                                (sector_num << s->logical_sector_size_bits); +            sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB); +            vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused, +                                        block_state); +            cpu_to_le64s(&s->bat[sinfo.bat_idx]); +            sector_num += s->sectors_per_block; +        } +        ret = bdrv_pwrite(bs, file_offset, s->bat, length); +        if (ret < 0) { +            goto exit; +        } +    } + + + +exit: +    g_free(s->bat); +    return ret; +} + +/* Creates the region table header, and region table entries. + * There are 2 supported region table entries: BAT, and Metadata/ + * + * As the calculations for the BAT region table are also needed + * to create the BAT itself, we will also cause the BAT to be + * created. + */ +static int vhdx_create_new_region_table(BlockDriverState *bs, +                                        uint64_t image_size, +                                        uint32_t block_size, +                                        uint32_t sector_size, +                                        uint32_t log_size, +                                        bool use_zero_blocks, +                                        VHDXImageType type, +                                        uint64_t *metadata_offset) +{ +    int ret = 0; +    uint32_t offset = 0; +    void *buffer = NULL; +    uint64_t bat_file_offset; +    uint32_t bat_length; +    BDRVVHDXState *s = NULL; +    VHDXRegionTableHeader *region_table; +    VHDXRegionTableEntry *rt_bat; +    VHDXRegionTableEntry *rt_metadata; + +    assert(metadata_offset != NULL); + +    /* Populate enough of the BDRVVHDXState to be able to use the +     * pre-existing BAT calculation, translation, and update functions */ +    s = g_new0(BDRVVHDXState, 1); + +    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * +                     (uint64_t) sector_size / (uint64_t) block_size; + +    s->sectors_per_block = block_size / sector_size; +    s->virtual_disk_size = image_size; +    s->block_size = block_size; +    s->logical_sector_size = sector_size; + +    vhdx_set_shift_bits(s); + +    vhdx_calc_bat_entries(s); + +    /* At this point the VHDX state is populated enough for creation */ + +    /* a single buffer is used so we can calculate the checksum over the +     * entire 64KB block */ +    buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE); +    region_table = buffer; +    offset += sizeof(VHDXRegionTableHeader); +    rt_bat = buffer + offset; +    offset += sizeof(VHDXRegionTableEntry); +    rt_metadata  = buffer + offset; + +    region_table->signature = VHDX_REGION_SIGNATURE; +    region_table->entry_count = 2;   /* BAT and Metadata */ + +    rt_bat->guid        = bat_guid; +    rt_bat->length      = ROUND_UP(s->bat_entries * sizeof(VHDXBatEntry), MiB); +    rt_bat->file_offset = ROUND_UP(VHDX_HEADER_SECTION_END + log_size, MiB); +    s->bat_offset = rt_bat->file_offset; + +    rt_metadata->guid        = metadata_guid; +    rt_metadata->file_offset = ROUND_UP(rt_bat->file_offset + rt_bat->length, +                                        MiB); +    rt_metadata->length      = 1 * MiB; /* min size, and more than enough */ +    *metadata_offset = 
rt_metadata->file_offset; + +    bat_file_offset = rt_bat->file_offset; +    bat_length = rt_bat->length; + +    vhdx_region_header_le_export(region_table); +    vhdx_region_entry_le_export(rt_bat); +    vhdx_region_entry_le_export(rt_metadata); + +    vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, +                         offsetof(VHDXRegionTableHeader, checksum)); + + +    /* The region table gives us the data we need to create the BAT, +     * so do that now */ +    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, +                          bat_file_offset, bat_length); +    if (ret < 0) { +        goto exit; +    } + +    /* Now write out the region headers to disk */ +    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer, +                      VHDX_HEADER_BLOCK_SIZE); +    if (ret < 0) { +        goto exit; +    } + +    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer, +                      VHDX_HEADER_BLOCK_SIZE); +    if (ret < 0) { +        goto exit; +    } + +exit: +    g_free(s); +    g_free(buffer); +    return ret; +} + +/* We need to create the following elements: + * + *    .-----------------------------------------------------------------. + *    |   (A)    |   (B)    |    (C)    |     (D)       |     (E)       | + *    |  File ID |  Header1 |  Header 2 |  Region Tbl 1 |  Region Tbl 2 | + *    |          |          |           |               |               | + *    .-----------------------------------------------------------------. + *    0         64KB      128KB       192KB           256KB           320KB + * + * + *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. + *    |     (F)     |     (G)       |    (H)    |                        | + *    | Journal Log |  BAT / Bitmap |  Metadata |  .... data ......      | + *    |             |               |           |                        | + *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. 
+ *   1MB + */ +static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int ret = 0; +    uint64_t image_size = (uint64_t) 2 * GiB; +    uint32_t log_size   = 1 * MiB; +    uint32_t block_size = 0; +    uint64_t signature; +    uint64_t metadata_offset; +    bool use_zero_blocks = false; + +    gunichar2 *creator = NULL; +    glong creator_items; +    BlockDriverState *bs; +    char *type = NULL; +    VHDXImageType image_type; +    Error *local_err = NULL; + +    image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); +    log_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_LOG_SIZE, 0); +    block_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_BLOCK_SIZE, 0); +    type = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); +    use_zero_blocks = qemu_opt_get_bool_del(opts, VHDX_BLOCK_OPT_ZERO, true); + +    if (image_size > VHDX_MAX_IMAGE_SIZE) { +        error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB"); +        ret = -EINVAL; +        goto exit; +    } + +    if (type == NULL) { +        type = g_strdup("dynamic"); +    } + +    if (!strcmp(type, "dynamic")) { +        image_type = VHDX_TYPE_DYNAMIC; +    } else if (!strcmp(type, "fixed")) { +        image_type = VHDX_TYPE_FIXED; +    } else if (!strcmp(type, "differencing")) { +        error_setg_errno(errp, ENOTSUP, +                         "Differencing files not yet supported"); +        ret = -ENOTSUP; +        goto exit; +    } else { +        ret = -EINVAL; +        goto exit; +    } + +    /* These are pretty arbitrary, and mainly designed to keep the BAT +     * size reasonable to load into RAM */ +    if (block_size == 0) { +        if (image_size > 32 * TiB) { +            block_size = 64 * MiB; +        } else if (image_size > (uint64_t) 100 * GiB) { +            block_size = 32 * MiB; +        } else if (image_size > 1 * GiB) { +            block_size = 16 * MiB; +        } else { +            block_size = 8 * MiB; +        } +    } + + +    /* make the log size close to what was specified, but must be +     * min 1MB, and multiple of 1MB */ +    log_size = ROUND_UP(log_size, MiB); + +    block_size = ROUND_UP(block_size, MiB); +    block_size = block_size > VHDX_BLOCK_SIZE_MAX ? 
VHDX_BLOCK_SIZE_MAX : +                                                    block_size; + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto exit; +    } + +    bs = NULL; +    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, +                    NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto exit; +    } + +    /* Create (A) */ + +    /* The creator field is optional, but may be useful for +     * debugging / diagnostics */ +    creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, +                              &creator_items, NULL); +    signature = cpu_to_le64(VHDX_FILE_SIGNATURE); +    ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); +    if (ret < 0) { +        goto delete_and_exit; +    } +    if (creator) { +        ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature), +                          creator, creator_items * sizeof(gunichar2)); +        if (ret < 0) { +            goto delete_and_exit; +        } +    } + + +    /* Creates (B),(C) */ +    ret = vhdx_create_new_headers(bs, image_size, log_size); +    if (ret < 0) { +        goto delete_and_exit; +    } + +    /* Creates (D),(E),(G) explicitly. (F) created as by-product */ +    ret = vhdx_create_new_region_table(bs, image_size, block_size, 512, +                                       log_size, use_zero_blocks, image_type, +                                       &metadata_offset); +    if (ret < 0) { +        goto delete_and_exit; +    } + +    /* Creates (H) */ +    ret = vhdx_create_new_metadata(bs, image_size, block_size, 512, +                                   metadata_offset, image_type); +    if (ret < 0) { +        goto delete_and_exit; +    } + + +delete_and_exit: +    bdrv_unref(bs); +exit: +    g_free(type); +    g_free(creator); +    return ret; +} + +/* If opened r/w, the VHDX driver will automatically replay the log, + * if one is present, inside the vhdx_open() call. + * + * If qemu-img check -r all is called, the image is automatically opened + * r/w and any log has already been replayed, so there is nothing (currently) + * for us to do here + */ +static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result, +                       BdrvCheckMode fix) +{ +    BDRVVHDXState *s = bs->opaque; + +    if (s->log_replayed_on_open) { +        result->corruptions_fixed++; +    } +    return 0; +} + +static QemuOptsList vhdx_create_opts = { +    .name = "vhdx-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(vhdx_create_opts.head), +    .desc = { +        { +           .name = BLOCK_OPT_SIZE, +           .type = QEMU_OPT_SIZE, +           .help = "Virtual disk size; max of 64TB." +       }, +       { +           .name = VHDX_BLOCK_OPT_LOG_SIZE, +           .type = QEMU_OPT_SIZE, +           .def_value_str = stringify(DEFAULT_LOG_SIZE), +           .help = "Log size; min 1MB." +       }, +       { +           .name = VHDX_BLOCK_OPT_BLOCK_SIZE, +           .type = QEMU_OPT_SIZE, +           .def_value_str = stringify(0), +           .help = "Block Size; min 1MB, max 256MB. " \ +                   "0 means auto-calculate based on image size." +       }, +       { +           .name = BLOCK_OPT_SUBFMT, +           .type = QEMU_OPT_STRING, +           .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "\ +                   "Default is 'dynamic'." 
+       }, +       { +           .name = VHDX_BLOCK_OPT_ZERO, +           .type = QEMU_OPT_BOOL, +           .help = "Force use of payload blocks of type 'ZERO'. "\ +                   "Non-standard, but default.  Do not set to 'off' when "\ +                   "using 'qemu-img convert' with subformat=dynamic." +       }, +       { NULL } +    } +}; + +static BlockDriver bdrv_vhdx = { +    .format_name            = "vhdx", +    .instance_size          = sizeof(BDRVVHDXState), +    .bdrv_probe             = vhdx_probe, +    .bdrv_open              = vhdx_open, +    .bdrv_close             = vhdx_close, +    .bdrv_reopen_prepare    = vhdx_reopen_prepare, +    .bdrv_co_readv          = vhdx_co_readv, +    .bdrv_co_writev         = vhdx_co_writev, +    .bdrv_create            = vhdx_create, +    .bdrv_get_info          = vhdx_get_info, +    .bdrv_check             = vhdx_check, +    .bdrv_has_zero_init     = bdrv_has_zero_init_1, + +    .create_opts            = &vhdx_create_opts, +}; + +static void bdrv_vhdx_init(void) +{ +    bdrv_register(&bdrv_vhdx); +} + +block_init(bdrv_vhdx_init); diff --git a/block/vhdx.h b/block/vhdx.h new file mode 100644 index 00000000..7003ab7a --- /dev/null +++ b/block/vhdx.h @@ -0,0 +1,453 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + *  Jeff Cody <jcody@redhat.com> + * + *  This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + *  by Microsoft: + *      https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_VHDX_H +#define BLOCK_VHDX_H + +#define KiB              (1 * 1024) +#define MiB            (KiB * 1024) +#define GiB            (MiB * 1024) +#define TiB ((uint64_t) GiB * 1024) + +#define DEFAULT_LOG_SIZE 1048576 /* 1MiB */ +/* Structures and fields present in the VHDX file */ + +/* The header section has the following blocks, + * each block is 64KB: + * + * _____________________________________________________________________________ + * | File Id. |   Header 1    | Header 2   | Region Table |  Reserved (768KB)  | + * |----------|---------------|------------|--------------|--------------------| + * |          |               |            |              |                    | + * 0.........64KB...........128KB........192KB..........256KB................1MB + */ + +#define VHDX_HEADER_BLOCK_SIZE      (64 * 1024) + +#define VHDX_FILE_ID_OFFSET         0 +#define VHDX_HEADER1_OFFSET         (VHDX_HEADER_BLOCK_SIZE * 1) +#define VHDX_HEADER2_OFFSET         (VHDX_HEADER_BLOCK_SIZE * 2) +#define VHDX_REGION_TABLE_OFFSET    (VHDX_HEADER_BLOCK_SIZE * 3) +#define VHDX_REGION_TABLE2_OFFSET   (VHDX_HEADER_BLOCK_SIZE * 4) + +#define VHDX_HEADER_SECTION_END     (1 * MiB) +/* + * A note on the use of MS-GUID fields.  For more details on the GUID, + * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier. + * + * The VHDX specification only states that these are MS GUIDs, and which + * bytes are data1-data4. It makes no mention of what algorithm should be used + * to generate the GUID, nor what standard.  
However, looking at the specified + * known GUID fields, it appears the GUIDs are: + *  Standard/DCE GUID type  (noted by 10b in the MSB of byte 0 of .data4) + *  Random algorithm        (noted by 0x4XXX for .data3) + */ + +/* ---- HEADER SECTION STRUCTURES ---- */ + +/* These structures are ones that are defined in the VHDX specification + * document */ + +#define VHDX_FILE_SIGNATURE 0x656C696678646876ULL  /* "vhdxfile" in ASCII */ +typedef struct VHDXFileIdentifier { +    uint64_t    signature;              /* "vhdxfile" in ASCII */ +    uint16_t    creator[256];           /* optional; utf-16 string to identify +                                           the vhdx file creator.  Diagnostic +                                           only */ +} VHDXFileIdentifier; + + +/* the guid is a 16 byte unique ID - the definition for this used by + * Microsoft is not just 16 bytes though - it is a structure that is defined, + * so we need to follow it here so that endianness does not trip us up */ + +typedef struct QEMU_PACKED MSGUID { +    uint32_t  data1; +    uint16_t  data2; +    uint16_t  data3; +    uint8_t   data4[8]; +} MSGUID; + +#define guid_eq(a, b) \ +    (memcmp(&(a), &(b), sizeof(MSGUID)) == 0) + +#define VHDX_HEADER_SIZE (4 * 1024)   /* although the vhdx_header struct in disk +                                         is only 582 bytes, for purposes of crc +                                         the header is the first 4KB of the 64KB +                                         block */ + +/* The full header is 4KB, although the actual header data is much smaller. + * But for the checksum calculation, it is over the entire 4KB structure, + * not just the defined portion of it */ +#define VHDX_HEADER_SIGNATURE 0x64616568 +typedef struct QEMU_PACKED VHDXHeader { +    uint32_t    signature;              /* "head" in ASCII */ +    uint32_t    checksum;               /* CRC-32C hash of the whole header */ +    uint64_t    sequence_number;        /* Seq number of this header.  Each +                                           VHDX file has 2 of these headers, +                                           and only the header with the highest +                                           sequence number is valid */ +    MSGUID      file_write_guid;        /* 128 bit unique identifier. Must be +                                           updated to new, unique value before +                                           the first modification is made to +                                           file */ +    MSGUID      data_write_guid;        /* 128 bit unique identifier. Must be +                                           updated to new, unique value before +                                           the first modification is made to +                                           visible data.   
Visbile data is +                                           defined as: +                                                    - system & user metadata +                                                    - raw block data +                                                    - disk size +                                                    - any change that will +                                                      cause the virtual disk +                                                      sector read to differ + +                                           This does not need to change if +                                           blocks are re-arranged */ +    MSGUID      log_guid;               /* 128 bit unique identifier. If zero, +                                           there is no valid log. If non-zero, +                                           log entries with this guid are +                                           valid. */ +    uint16_t    log_version;            /* version of the log format. Must be +                                           set to zero */ +    uint16_t    version;                /* version of the vhdx file.  Currently, +                                           only supported version is "1" */ +    uint32_t    log_length;             /* length of the log.  Must be multiple +                                           of 1MB */ +    uint64_t    log_offset;             /* byte offset in the file of the log. +                                           Must also be a multiple of 1MB */ +} VHDXHeader; + +/* Header for the region table block */ +#define VHDX_REGION_SIGNATURE  0x69676572  /* "regi" in ASCII */ +typedef struct QEMU_PACKED VHDXRegionTableHeader { +    uint32_t    signature;              /* "regi" in ASCII */ +    uint32_t    checksum;               /* CRC-32C hash of the 64KB table */ +    uint32_t    entry_count;            /* number of valid entries */ +    uint32_t    reserved; +} VHDXRegionTableHeader; + +/* Individual region table entry.  There may be a maximum of 2047 of these + * + *  There are two known region table properties.  Both are required. + *  BAT (block allocation table):  2DC27766F62342009D64115E9BFD4A08 + *  Metadata:                      8B7CA20647904B9AB8FE575F050F886E + */ +#define VHDX_REGION_ENTRY_REQUIRED  0x01    /* if set, parser must understand +                                               this entry in order to open +                                               file */ +typedef struct QEMU_PACKED VHDXRegionTableEntry { +    MSGUID      guid;                   /* 128-bit unique identifier */ +    uint64_t    file_offset;            /* offset of the object in the file. 
+                                           Must be multiple of 1MB */ +    uint32_t    length;                 /* length, in bytes, of the object */ +    uint32_t    data_bits; +} VHDXRegionTableEntry; + + +/* ---- LOG ENTRY STRUCTURES ---- */ +#define VHDX_LOG_MIN_SIZE (1024 * 1024) +#define VHDX_LOG_SECTOR_SIZE 4096 +#define VHDX_LOG_HDR_SIZE 64 +#define VHDX_LOG_SIGNATURE 0x65676f6c +typedef struct QEMU_PACKED VHDXLogEntryHeader { +    uint32_t    signature;              /* "loge" in ASCII */ +    uint32_t    checksum;               /* CRC-32C hash of the 64KB table */ +    uint32_t    entry_length;           /* length in bytes, multiple of 1MB */ +    uint32_t    tail;                   /* byte offset of first log entry of a +                                           seq, where this entry is the last +                                           entry */ +    uint64_t    sequence_number;        /* incremented with each log entry. +                                           May not be zero. */ +    uint32_t    descriptor_count;       /* number of descriptors in this log +                                           entry, must be >= 0 */ +    uint32_t    reserved; +    MSGUID      log_guid;               /* value of the log_guid from +                                           vhdx_header.  If not found in +                                           vhdx_header, it is invalid */ +    uint64_t    flushed_file_offset;    /* see spec for full details - this +                                           should be vhdx file size in bytes */ +    uint64_t    last_file_offset;       /* size in bytes that all allocated +                                           file structures fit into */ +} VHDXLogEntryHeader; + +#define VHDX_LOG_DESC_SIZE 32 +#define VHDX_LOG_DESC_SIGNATURE 0x63736564 +#define VHDX_LOG_ZERO_SIGNATURE 0x6f72657a +typedef struct QEMU_PACKED VHDXLogDescriptor { +    uint32_t    signature;              /* "zero" or "desc" in ASCII */ +    union  { +        uint32_t    reserved;           /* zero desc */ +        uint32_t    trailing_bytes;     /* data desc: bytes 4092-4096 of the +                                           data sector */ +    }; +    union { +        uint64_t    zero_length;        /* zero desc: length of the section to +                                           zero */ +        uint64_t    leading_bytes;      /* data desc: bytes 0-7 of the data +                                           sector */ +    }; +    uint64_t    file_offset;            /* file offset to write zeros - multiple +                                           of 4kB */ +    uint64_t    sequence_number;        /* must match same field in +                                           vhdx_log_entry_header */ +} VHDXLogDescriptor; + +#define VHDX_LOG_DATA_SIGNATURE 0x61746164 +typedef struct QEMU_PACKED VHDXLogDataSector { +    uint32_t    data_signature;         /* "data" in ASCII */ +    uint32_t    sequence_high;          /* 4 MSB of 8 byte sequence_number */ +    uint8_t     data[4084];             /* raw data, bytes 8-4091 (inclusive). +                                           see the data descriptor field for the +                                           other mising bytes */ +    uint32_t    sequence_low;           /* 4 LSB of 8 byte sequence_number */ +} VHDXLogDataSector; + + + +/* block states - different state values depending on whether it is a + * payload block, or a sector block. 
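With the masks defined
+ * further down, a payload BAT entry of e.g. 0x0000000000300006 decodes to
+ * state 6 (PAYLOAD_BLOCK_FULLY_PRESENT) with its data block starting at file
+ * offset 0x300000 (3MB), since the offset occupies the upper 44 bits and the
+ * state the lower 3 bits.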
*/ + +#define PAYLOAD_BLOCK_NOT_PRESENT       0 +#define PAYLOAD_BLOCK_UNDEFINED         1 +#define PAYLOAD_BLOCK_ZERO              2 +#define PAYLOAD_BLOCK_UNMAPPED          3 +#define PAYLOAD_BLOCK_UNMAPPED_v095     5 +#define PAYLOAD_BLOCK_FULLY_PRESENT     6 +#define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7 + +#define SB_BLOCK_NOT_PRESENT    0 +#define SB_BLOCK_PRESENT        6 + +/* per the spec */ +#define VHDX_MAX_SECTORS_PER_BLOCK  (1 << 23) + +/* upper 44 bits are the file offset in 1MB units lower 3 bits are the state +   other bits are reserved */ +#define VHDX_BAT_STATE_BIT_MASK 0x07 +#define VHDX_BAT_FILE_OFF_MASK  0xFFFFFFFFFFF00000ULL /* upper 44 bits */ +typedef uint64_t VHDXBatEntry; + +/* ---- METADATA REGION STRUCTURES ---- */ + +#define VHDX_METADATA_ENTRY_SIZE 32 +#define VHDX_METADATA_MAX_ENTRIES 2047  /* not including the header */ +#define VHDX_METADATA_TABLE_MAX_SIZE \ +    (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1)) +#define VHDX_METADATA_SIGNATURE 0x617461646174656DULL  /* "metadata" in ASCII */ +typedef struct QEMU_PACKED VHDXMetadataTableHeader { +    uint64_t    signature;              /* "metadata" in ASCII */ +    uint16_t    reserved; +    uint16_t    entry_count;            /* number table entries. <= 2047 */ +    uint32_t    reserved2[5]; +} VHDXMetadataTableHeader; + +#define VHDX_META_FLAGS_IS_USER         0x01    /* max 1024 entries */ +#define VHDX_META_FLAGS_IS_VIRTUAL_DISK 0x02    /* virtual disk metadata if set, +                                                   otherwise file metdata */ +#define VHDX_META_FLAGS_IS_REQUIRED     0x04    /* parse must understand this +                                                   entry to open the file */ +typedef struct QEMU_PACKED VHDXMetadataTableEntry { +    MSGUID      item_id;                /* 128-bit identifier for metadata */ +    uint32_t    offset;                 /* byte offset of the metadata.  At +                                           least 64kB.  Relative to start of +                                           metadata region */ +                                        /* note: if length = 0, so is offset */ +    uint32_t    length;                 /* length of metadata. <= 1MB. */ +    uint32_t    data_bits;              /* least-significant 3 bits are flags, +                                           the rest are reserved (see above) */ +    uint32_t    reserved2; +} VHDXMetadataTableEntry; + +#define VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED 0x01   /* Do not change any blocks to +                                                   be BLOCK_NOT_PRESENT. +                                                   If set indicates a fixed +                                                   size VHDX file */ +#define VHDX_PARAMS_HAS_PARENT           0x02    /* has parent / backing file */ +#define VHDX_BLOCK_SIZE_MIN             (1   * MiB) +#define VHDX_BLOCK_SIZE_MAX             (256 * MiB) +typedef struct QEMU_PACKED VHDXFileParameters { +    uint32_t    block_size;             /* size of each payload block, always +                                           power of 2, <= 256MB and >= 1MB. */ +    uint32_t data_bits;                 /* least-significant 2 bits are flags, +                                           the rest are reserved (see above) */ +} VHDXFileParameters; + +#define VHDX_MAX_IMAGE_SIZE  ((uint64_t) 64 * TiB) +typedef struct QEMU_PACKED VHDXVirtualDiskSize { +    uint64_t    virtual_disk_size;      /* Size of the virtual disk, in bytes. 
+                                           Must be multiple of the sector size, +                                           max of 64TB */ +} VHDXVirtualDiskSize; + +typedef struct QEMU_PACKED VHDXPage83Data { +    MSGUID      page_83_data;           /* unique id for scsi devices that +                                           support page 0x83 */ +} VHDXPage83Data; + +typedef struct QEMU_PACKED VHDXVirtualDiskLogicalSectorSize { +    uint32_t    logical_sector_size;    /* virtual disk sector size (in bytes). +                                           Can only be 512 or 4096 bytes */ +} VHDXVirtualDiskLogicalSectorSize; + +typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize { +    uint32_t    physical_sector_size;   /* physical sector size (in bytes). +                                           Can only be 512 or 4096 bytes */ +} VHDXVirtualDiskPhysicalSectorSize; + +typedef struct QEMU_PACKED VHDXParentLocatorHeader { +    MSGUID      locator_type;           /* type of the parent virtual disk. */ +    uint16_t    reserved; +    uint16_t    key_value_count;        /* number of key/value pairs for this +                                           locator */ +} VHDXParentLocatorHeader; + +/* key and value strings are UNICODE strings, UTF-16 LE encoding, no NULs */ +typedef struct QEMU_PACKED VHDXParentLocatorEntry { +    uint32_t    key_offset;             /* offset in metadata for key, > 0 */ +    uint32_t    value_offset;           /* offset in metadata for value, >0 */ +    uint16_t    key_length;             /* length of entry key, > 0 */ +    uint16_t    value_length;           /* length of entry value, > 0 */ +} VHDXParentLocatorEntry; + + +/* ----- END VHDX SPECIFICATION STRUCTURES ---- */ + +typedef struct VHDXMetadataEntries { +    VHDXMetadataTableEntry file_parameters_entry; +    VHDXMetadataTableEntry virtual_disk_size_entry; +    VHDXMetadataTableEntry page83_data_entry; +    VHDXMetadataTableEntry logical_sector_size_entry; +    VHDXMetadataTableEntry phys_sector_size_entry; +    VHDXMetadataTableEntry parent_locator_entry; +    uint16_t present; +} VHDXMetadataEntries; + +typedef struct VHDXLogEntries { +    uint64_t offset; +    uint64_t length; +    uint32_t write; +    uint32_t read; +    VHDXLogEntryHeader *hdr; +    void *desc_buffer; +    uint64_t sequence; +    uint32_t tail; +} VHDXLogEntries; + +typedef struct VHDXRegionEntry { +    uint64_t start; +    uint64_t end; +    QLIST_ENTRY(VHDXRegionEntry) entries; +} VHDXRegionEntry; + +typedef struct BDRVVHDXState { +    CoMutex lock; + +    int curr_header; +    VHDXHeader *headers[2]; + +    VHDXRegionTableHeader rt; +    VHDXRegionTableEntry bat_rt;         /* region table for the BAT */ +    VHDXRegionTableEntry metadata_rt;    /* region table for the metadata */ + +    VHDXMetadataTableHeader metadata_hdr; +    VHDXMetadataEntries metadata_entries; + +    VHDXFileParameters params; +    uint32_t block_size; +    uint32_t block_size_bits; +    uint32_t sectors_per_block; +    uint32_t sectors_per_block_bits; + +    uint64_t virtual_disk_size; +    uint32_t logical_sector_size; +    uint32_t physical_sector_size; + +    uint64_t chunk_ratio; +    uint32_t chunk_ratio_bits; +    uint32_t logical_sector_size_bits; + +    uint32_t bat_entries; +    VHDXBatEntry *bat; +    uint64_t bat_offset; + +    bool first_visible_write; +    MSGUID session_guid; + +    VHDXLogEntries log; + +    VHDXParentLocatorHeader parent_header; +    VHDXParentLocatorEntry *parent_entries; + +    Error *migration_blocker; + +    bool 
log_replayed_on_open; + +    QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions; +} BDRVVHDXState; + +void vhdx_guid_generate(MSGUID *guid); + +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, bool rw, +                        MSGUID *log_guid); + +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset); +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, +                            int crc_offset); + +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); + +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, +                   Error **errp); + +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, +                             void *data, uint32_t length, uint64_t offset); + +static inline void leguid_to_cpus(MSGUID *guid) +{ +    le32_to_cpus(&guid->data1); +    le16_to_cpus(&guid->data2); +    le16_to_cpus(&guid->data3); +} + +static inline void cpu_to_leguids(MSGUID *guid) +{ +    cpu_to_le32s(&guid->data1); +    cpu_to_le16s(&guid->data2); +    cpu_to_le16s(&guid->data3); +} + +void vhdx_header_le_import(VHDXHeader *h); +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h); +void vhdx_log_desc_le_import(VHDXLogDescriptor *d); +void vhdx_log_desc_le_export(VHDXLogDescriptor *d); +void vhdx_log_data_le_import(VHDXLogDataSector *d); +void vhdx_log_data_le_export(VHDXLogDataSector *d); +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr); +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr); +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr); +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr); +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e); +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e); +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e); +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e); +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s); + +#endif diff --git a/block/vmdk.c b/block/vmdk.c new file mode 100644 index 00000000..fbaab67c --- /dev/null +++ b/block/vmdk.c @@ -0,0 +1,2301 @@ +/* + * Block driver for the VMDK format + * + * Copyright (c) 2004 Fabrice Bellard + * Copyright (c) 2005 Filip Navara + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "migration/migration.h" +#include <zlib.h> +#include <glib.h> + +#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') +#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') +#define VMDK4_COMPRESSION_DEFLATE 1 +#define VMDK4_FLAG_NL_DETECT (1 << 0) +#define VMDK4_FLAG_RGD (1 << 1) +/* Zeroed-grain enable bit */ +#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2) +#define VMDK4_FLAG_COMPRESS (1 << 16) +#define VMDK4_FLAG_MARKER (1 << 17) +#define VMDK4_GD_AT_END 0xffffffffffffffffULL + +#define VMDK_GTE_ZEROED 0x1 + +/* VMDK internal error codes */ +#define VMDK_OK      0 +#define VMDK_ERROR   (-1) +/* Cluster not allocated */ +#define VMDK_UNALLOC (-2) +#define VMDK_ZEROED  (-3) + +#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain" + +typedef struct { +    uint32_t version; +    uint32_t flags; +    uint32_t disk_sectors; +    uint32_t granularity; +    uint32_t l1dir_offset; +    uint32_t l1dir_size; +    uint32_t file_sectors; +    uint32_t cylinders; +    uint32_t heads; +    uint32_t sectors_per_track; +} QEMU_PACKED VMDK3Header; + +typedef struct { +    uint32_t version; +    uint32_t flags; +    uint64_t capacity; +    uint64_t granularity; +    uint64_t desc_offset; +    uint64_t desc_size; +    /* Number of GrainTableEntries per GrainTable */ +    uint32_t num_gtes_per_gt; +    uint64_t rgd_offset; +    uint64_t gd_offset; +    uint64_t grain_offset; +    char filler[1]; +    char check_bytes[4]; +    uint16_t compressAlgorithm; +} QEMU_PACKED VMDK4Header; + +#define L2_CACHE_SIZE 16 + +typedef struct VmdkExtent { +    BlockDriverState *file; +    bool flat; +    bool compressed; +    bool has_marker; +    bool has_zero_grain; +    int version; +    int64_t sectors; +    int64_t end_sector; +    int64_t flat_start_offset; +    int64_t l1_table_offset; +    int64_t l1_backup_table_offset; +    uint32_t *l1_table; +    uint32_t *l1_backup_table; +    unsigned int l1_size; +    uint32_t l1_entry_sectors; + +    unsigned int l2_size; +    uint32_t *l2_cache; +    uint32_t l2_cache_offsets[L2_CACHE_SIZE]; +    uint32_t l2_cache_counts[L2_CACHE_SIZE]; + +    int64_t cluster_sectors; +    int64_t next_cluster_sector; +    char *type; +} VmdkExtent; + +typedef struct BDRVVmdkState { +    CoMutex lock; +    uint64_t desc_offset; +    bool cid_updated; +    bool cid_checked; +    uint32_t cid; +    uint32_t parent_cid; +    int num_extents; +    /* Extent array with num_extents entries, ascend ordered by address */ +    VmdkExtent *extents; +    Error *migration_blocker; +    char *create_type; +} BDRVVmdkState; + +typedef struct VmdkMetaData { +    unsigned int l1_index; +    unsigned int l2_index; +    unsigned int l2_offset; +    int valid; +    uint32_t *l2_cache_entry; +} VmdkMetaData; + +typedef struct VmdkGrainMarker { +    uint64_t lba; +    uint32_t size; +    uint8_t  data[0]; +} QEMU_PACKED VmdkGrainMarker; + +enum { +    MARKER_END_OF_STREAM    = 0, +    MARKER_GRAIN_TABLE      = 1, +    MARKER_GRAIN_DIRECTORY  = 2, +    MARKER_FOOTER           = 3, +}; + +static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    uint32_t magic; + +    if (buf_size < 4) { +        return 0; +    } +    magic = be32_to_cpu(*(uint32_t *)buf); +    if (magic == VMDK3_MAGIC || +        magic == VMDK4_MAGIC) { +        return 100; +    } else { +        const char *p = (const char *)buf; +        const 
char *end = p + buf_size; +        while (p < end) { +            if (*p == '#') { +                /* skip comment line */ +                while (p < end && *p != '\n') { +                    p++; +                } +                p++; +                continue; +            } +            if (*p == ' ') { +                while (p < end && *p == ' ') { +                    p++; +                } +                /* skip '\r' if windows line endings used. */ +                if (p < end && *p == '\r') { +                    p++; +                } +                /* only accept blank lines before 'version=' line */ +                if (p == end || *p != '\n') { +                    return 0; +                } +                p++; +                continue; +            } +            if (end - p >= strlen("version=X\n")) { +                if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 || +                    strncmp("version=2\n", p, strlen("version=2\n")) == 0) { +                    return 100; +                } +            } +            if (end - p >= strlen("version=X\r\n")) { +                if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 || +                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) { +                    return 100; +                } +            } +            return 0; +        } +        return 0; +    } +} + +#define SECTOR_SIZE 512 +#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */ +#define BUF_SIZE 4096 +#define HEADER_SIZE 512                 /* first sector of 512 bytes */ + +static void vmdk_free_extents(BlockDriverState *bs) +{ +    int i; +    BDRVVmdkState *s = bs->opaque; +    VmdkExtent *e; + +    for (i = 0; i < s->num_extents; i++) { +        e = &s->extents[i]; +        g_free(e->l1_table); +        g_free(e->l2_cache); +        g_free(e->l1_backup_table); +        g_free(e->type); +        if (e->file != bs->file) { +            bdrv_unref(e->file); +        } +    } +    g_free(s->extents); +} + +static void vmdk_free_last_extent(BlockDriverState *bs) +{ +    BDRVVmdkState *s = bs->opaque; + +    if (s->num_extents == 0) { +        return; +    } +    s->num_extents--; +    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); +} + +static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) +{ +    char desc[DESC_SIZE]; +    uint32_t cid = 0xffffffff; +    const char *p_name, *cid_str; +    size_t cid_str_size; +    BDRVVmdkState *s = bs->opaque; +    int ret; + +    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); +    if (ret < 0) { +        return 0; +    } + +    if (parent) { +        cid_str = "parentCID"; +        cid_str_size = sizeof("parentCID"); +    } else { +        cid_str = "CID"; +        cid_str_size = sizeof("CID"); +    } + +    desc[DESC_SIZE - 1] = '\0'; +    p_name = strstr(desc, cid_str); +    if (p_name != NULL) { +        p_name += cid_str_size; +        sscanf(p_name, "%" SCNx32, &cid); +    } + +    return cid; +} + +static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) +{ +    char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; +    char *p_name, *tmp_str; +    BDRVVmdkState *s = bs->opaque; +    int ret; + +    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); +    if (ret < 0) { +        return ret; +    } + +    desc[DESC_SIZE - 1] = '\0'; +    tmp_str = strstr(desc, "parentCID"); +    if (tmp_str == NULL) { +        return -EINVAL; +    } + +    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); +    
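/* tmp_desc now holds the descriptor text from "parentCID" onwards; the
+     * snprintf() below NUL-terminates desc just after the rewritten CID
+     * line, so that saved tail is re-appended with pstrcat(). */
+    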
p_name = strstr(desc, "CID"); +    if (p_name != NULL) { +        p_name += sizeof("CID"); +        snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid); +        pstrcat(desc, sizeof(desc), tmp_desc); +    } + +    ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE); +    if (ret < 0) { +        return ret; +    } + +    return 0; +} + +static int vmdk_is_cid_valid(BlockDriverState *bs) +{ +    BDRVVmdkState *s = bs->opaque; +    BlockDriverState *p_bs = bs->backing_hd; +    uint32_t cur_pcid; + +    if (!s->cid_checked && p_bs) { +        cur_pcid = vmdk_read_cid(p_bs, 0); +        if (s->parent_cid != cur_pcid) { +            /* CID not valid */ +            return 0; +        } +    } +    s->cid_checked = true; +    /* CID valid */ +    return 1; +} + +/* We have nothing to do for VMDK reopen, stubs just return success */ +static int vmdk_reopen_prepare(BDRVReopenState *state, +                               BlockReopenQueue *queue, Error **errp) +{ +    assert(state != NULL); +    assert(state->bs != NULL); +    return 0; +} + +static int vmdk_parent_open(BlockDriverState *bs) +{ +    char *p_name; +    char desc[DESC_SIZE + 1]; +    BDRVVmdkState *s = bs->opaque; +    int ret; + +    desc[DESC_SIZE] = '\0'; +    ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); +    if (ret < 0) { +        return ret; +    } + +    p_name = strstr(desc, "parentFileNameHint"); +    if (p_name != NULL) { +        char *end_name; + +        p_name += sizeof("parentFileNameHint") + 1; +        end_name = strchr(p_name, '\"'); +        if (end_name == NULL) { +            return -EINVAL; +        } +        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { +            return -EINVAL; +        } + +        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); +    } + +    return 0; +} + +/* Create and append extent to the extent array. Return the added VmdkExtent + * address. return NULL if allocation failed. */ +static int vmdk_add_extent(BlockDriverState *bs, +                           BlockDriverState *file, bool flat, int64_t sectors, +                           int64_t l1_offset, int64_t l1_backup_offset, +                           uint32_t l1_size, +                           int l2_size, uint64_t cluster_sectors, +                           VmdkExtent **new_extent, +                           Error **errp) +{ +    VmdkExtent *extent; +    BDRVVmdkState *s = bs->opaque; +    int64_t nb_sectors; + +    if (cluster_sectors > 0x200000) { +        /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ +        error_setg(errp, "Invalid granularity, image may be corrupt"); +        return -EFBIG; +    } +    if (l1_size > 512 * 1024 * 1024) { +        /* Although with big capacity and small l1_entry_sectors, we can get a +         * big l1_size, we don't want unbounded value to allocate the table. 
+         * Limit it to 512M, which is 16PB for default cluster and L2 table +         * size */ +        error_setg(errp, "L1 size too big"); +        return -EFBIG; +    } + +    nb_sectors = bdrv_nb_sectors(file); +    if (nb_sectors < 0) { +        return nb_sectors; +    } + +    s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1); +    extent = &s->extents[s->num_extents]; +    s->num_extents++; + +    memset(extent, 0, sizeof(VmdkExtent)); +    extent->file = file; +    extent->flat = flat; +    extent->sectors = sectors; +    extent->l1_table_offset = l1_offset; +    extent->l1_backup_table_offset = l1_backup_offset; +    extent->l1_size = l1_size; +    extent->l1_entry_sectors = l2_size * cluster_sectors; +    extent->l2_size = l2_size; +    extent->cluster_sectors = flat ? sectors : cluster_sectors; +    extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors); + +    if (s->num_extents > 1) { +        extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; +    } else { +        extent->end_sector = extent->sectors; +    } +    bs->total_sectors = extent->end_sector; +    if (new_extent) { +        *new_extent = extent; +    } +    return 0; +} + +static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, +                            Error **errp) +{ +    int ret; +    size_t l1_size; +    int i; + +    /* read the L1 table */ +    l1_size = extent->l1_size * sizeof(uint32_t); +    extent->l1_table = g_try_malloc(l1_size); +    if (l1_size && extent->l1_table == NULL) { +        return -ENOMEM; +    } + +    ret = bdrv_pread(extent->file, +                     extent->l1_table_offset, +                     extent->l1_table, +                     l1_size); +    if (ret < 0) { +        error_setg_errno(errp, -ret, +                         "Could not read l1 table from extent '%s'", +                         extent->file->filename); +        goto fail_l1; +    } +    for (i = 0; i < extent->l1_size; i++) { +        le32_to_cpus(&extent->l1_table[i]); +    } + +    if (extent->l1_backup_table_offset) { +        extent->l1_backup_table = g_try_malloc(l1_size); +        if (l1_size && extent->l1_backup_table == NULL) { +            ret = -ENOMEM; +            goto fail_l1; +        } +        ret = bdrv_pread(extent->file, +                         extent->l1_backup_table_offset, +                         extent->l1_backup_table, +                         l1_size); +        if (ret < 0) { +            error_setg_errno(errp, -ret, +                             "Could not read l1 backup table from extent '%s'", +                             extent->file->filename); +            goto fail_l1b; +        } +        for (i = 0; i < extent->l1_size; i++) { +            le32_to_cpus(&extent->l1_backup_table[i]); +        } +    } + +    extent->l2_cache = +        g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE); +    return 0; + fail_l1b: +    g_free(extent->l1_backup_table); + fail_l1: +    g_free(extent->l1_table); +    return ret; +} + +static int vmdk_open_vmfs_sparse(BlockDriverState *bs, +                                 BlockDriverState *file, +                                 int flags, Error **errp) +{ +    int ret; +    uint32_t magic; +    VMDK3Header header; +    VmdkExtent *extent; + +    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); +    if (ret < 0) { +        error_setg_errno(errp, -ret, +                         "Could not read header from file '%s'", +                         file->filename); +        return 
ret; +    } +    ret = vmdk_add_extent(bs, file, false, +                          le32_to_cpu(header.disk_sectors), +                          (int64_t)le32_to_cpu(header.l1dir_offset) << 9, +                          0, +                          le32_to_cpu(header.l1dir_size), +                          4096, +                          le32_to_cpu(header.granularity), +                          &extent, +                          errp); +    if (ret < 0) { +        return ret; +    } +    ret = vmdk_init_tables(bs, extent, errp); +    if (ret) { +        /* free extent allocated by vmdk_add_extent */ +        vmdk_free_last_extent(bs); +    } +    return ret; +} + +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, +                               QDict *options, Error **errp); + +static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, +                            Error **errp) +{ +    int64_t size; +    char *buf; +    int ret; + +    size = bdrv_getlength(file); +    if (size < 0) { +        error_setg_errno(errp, -size, "Could not access file"); +        return NULL; +    } + +    if (size < 4) { +        /* Both descriptor file and sparse image must be much larger than 4 +         * bytes, also callers of vmdk_read_desc want to compare the first 4 +         * bytes with VMDK4_MAGIC, let's error out if less is read. */ +        error_setg(errp, "File is too small, not a valid image"); +        return NULL; +    } + +    size = MIN(size, (1 << 20) - 1);  /* avoid unbounded allocation */ +    buf = g_malloc(size + 1); + +    ret = bdrv_pread(file, desc_offset, buf, size); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not read from file"); +        g_free(buf); +        return NULL; +    } +    buf[ret] = 0; + +    return buf; +} + +static int vmdk_open_vmdk4(BlockDriverState *bs, +                           BlockDriverState *file, +                           int flags, QDict *options, Error **errp) +{ +    int ret; +    uint32_t magic; +    uint32_t l1_size, l1_entry_sectors; +    VMDK4Header header; +    VmdkExtent *extent; +    BDRVVmdkState *s = bs->opaque; +    int64_t l1_backup_offset = 0; + +    ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); +    if (ret < 0) { +        error_setg_errno(errp, -ret, +                         "Could not read header from file '%s'", +                         file->filename); +        return -EINVAL; +    } +    if (header.capacity == 0) { +        uint64_t desc_offset = le64_to_cpu(header.desc_offset); +        if (desc_offset) { +            char *buf = vmdk_read_desc(file, desc_offset << 9, errp); +            if (!buf) { +                return -EINVAL; +            } +            ret = vmdk_open_desc_file(bs, flags, buf, options, errp); +            g_free(buf); +            return ret; +        } +    } + +    if (!s->create_type) { +        s->create_type = g_strdup("monolithicSparse"); +    } + +    if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { +        /* +         * The footer takes precedence over the header, so read it in. The +         * footer starts at offset -1024 from the end: One sector for the +         * footer, and another one for the end-of-stream marker. 
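+         * (The bdrv_pread() below starts one sector earlier, at -1536, so
+         * that the footer marker preceding the footer is read and checked
+         * as well.)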
+         */ +        struct { +            struct { +                uint64_t val; +                uint32_t size; +                uint32_t type; +                uint8_t pad[512 - 16]; +            } QEMU_PACKED footer_marker; + +            uint32_t magic; +            VMDK4Header header; +            uint8_t pad[512 - 4 - sizeof(VMDK4Header)]; + +            struct { +                uint64_t val; +                uint32_t size; +                uint32_t type; +                uint8_t pad[512 - 16]; +            } QEMU_PACKED eos_marker; +        } QEMU_PACKED footer; + +        ret = bdrv_pread(file, +            bs->file->total_sectors * 512 - 1536, +            &footer, sizeof(footer)); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Failed to read footer"); +            return ret; +        } + +        /* Some sanity checks for the footer */ +        if (be32_to_cpu(footer.magic) != VMDK4_MAGIC || +            le32_to_cpu(footer.footer_marker.size) != 0  || +            le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER || +            le64_to_cpu(footer.eos_marker.val) != 0  || +            le32_to_cpu(footer.eos_marker.size) != 0  || +            le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM) +        { +            error_setg(errp, "Invalid footer"); +            return -EINVAL; +        } + +        header = footer.header; +    } + +    if (le32_to_cpu(header.version) > 3) { +        char buf[64]; +        snprintf(buf, sizeof(buf), "VMDK version %" PRId32, +                 le32_to_cpu(header.version)); +        error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, +                   bdrv_get_device_or_node_name(bs), "vmdk", buf); +        return -ENOTSUP; +    } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { +        /* VMware KB 2064959 explains that version 3 added support for +         * persistent changed block tracking (CBT), and backup software can +         * read it as version=1 if it doesn't care about the changed area +         * information. So we are safe to enable read only. 
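+         * Consequently a read-write open of a version 3 image is rejected
+         * just below, while read-only access is allowed.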
*/ +        error_setg(errp, "VMDK version 3 must be read only"); +        return -EINVAL; +    } + +    if (le32_to_cpu(header.num_gtes_per_gt) > 512) { +        error_setg(errp, "L2 table size too big"); +        return -EINVAL; +    } + +    l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt) +                        * le64_to_cpu(header.granularity); +    if (l1_entry_sectors == 0) { +        error_setg(errp, "L1 entry size is invalid"); +        return -EINVAL; +    } +    l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) +                / l1_entry_sectors; +    if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { +        l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; +    } +    if (bdrv_nb_sectors(file) < le64_to_cpu(header.grain_offset)) { +        error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", +                   (int64_t)(le64_to_cpu(header.grain_offset) +                             * BDRV_SECTOR_SIZE)); +        return -EINVAL; +    } + +    ret = vmdk_add_extent(bs, file, false, +                          le64_to_cpu(header.capacity), +                          le64_to_cpu(header.gd_offset) << 9, +                          l1_backup_offset, +                          l1_size, +                          le32_to_cpu(header.num_gtes_per_gt), +                          le64_to_cpu(header.granularity), +                          &extent, +                          errp); +    if (ret < 0) { +        return ret; +    } +    extent->compressed = +        le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; +    if (extent->compressed) { +        g_free(s->create_type); +        s->create_type = g_strdup("streamOptimized"); +    } +    extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; +    extent->version = le32_to_cpu(header.version); +    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; +    ret = vmdk_init_tables(bs, extent, errp); +    if (ret) { +        /* free extent allocated by vmdk_add_extent */ +        vmdk_free_last_extent(bs); +    } +    return ret; +} + +/* find an option value out of descriptor file */ +static int vmdk_parse_description(const char *desc, const char *opt_name, +        char *buf, int buf_size) +{ +    char *opt_pos, *opt_end; +    const char *end = desc + strlen(desc); + +    opt_pos = strstr(desc, opt_name); +    if (!opt_pos) { +        return VMDK_ERROR; +    } +    /* Skip "=\"" following opt_name */ +    opt_pos += strlen(opt_name) + 2; +    if (opt_pos >= end) { +        return VMDK_ERROR; +    } +    opt_end = opt_pos; +    while (opt_end < end && *opt_end != '"') { +        opt_end++; +    } +    if (opt_end == end || buf_size < opt_end - opt_pos + 1) { +        return VMDK_ERROR; +    } +    pstrcpy(buf, opt_end - opt_pos + 1, opt_pos); +    return VMDK_OK; +} + +/* Open an extent file and append to bs array */ +static int vmdk_open_sparse(BlockDriverState *bs, +                            BlockDriverState *file, int flags, +                            char *buf, QDict *options, Error **errp) +{ +    uint32_t magic; + +    magic = ldl_be_p(buf); +    switch (magic) { +        case VMDK3_MAGIC: +            return vmdk_open_vmfs_sparse(bs, file, flags, errp); +            break; +        case VMDK4_MAGIC: +            return vmdk_open_vmdk4(bs, file, flags, options, errp); +            break; +        default: +            error_setg(errp, "Image not in VMDK format"); +            return -EINVAL; +            break; +    } +} + +static 
int vmdk_parse_extents(const char *desc, BlockDriverState *bs, +                              const char *desc_file_path, QDict *options, +                              Error **errp) +{ +    int ret; +    int matches; +    char access[11]; +    char type[11]; +    char fname[512]; +    const char *p = desc; +    int64_t sectors = 0; +    int64_t flat_offset; +    char *extent_path; +    BlockDriverState *extent_file; +    BDRVVmdkState *s = bs->opaque; +    VmdkExtent *extent; +    char extent_opt_prefix[32]; + +    while (*p) { +        /* parse extent line in one of below formats: +         * +         * RW [size in sectors] FLAT "file-name.vmdk" OFFSET +         * RW [size in sectors] SPARSE "file-name.vmdk" +         * RW [size in sectors] VMFS "file-name.vmdk" +         * RW [size in sectors] VMFSSPARSE "file-name.vmdk" +         */ +        flat_offset = -1; +        matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64, +                         access, &sectors, type, fname, &flat_offset); +        if (matches < 4 || strcmp(access, "RW")) { +            goto next_line; +        } else if (!strcmp(type, "FLAT")) { +            if (matches != 5 || flat_offset < 0) { +                error_setg(errp, "Invalid extent lines: \n%s", p); +                return -EINVAL; +            } +        } else if (!strcmp(type, "VMFS")) { +            if (matches == 4) { +                flat_offset = 0; +            } else { +                error_setg(errp, "Invalid extent lines:\n%s", p); +                return -EINVAL; +            } +        } else if (matches != 4) { +            error_setg(errp, "Invalid extent lines:\n%s", p); +            return -EINVAL; +        } + +        if (sectors <= 0 || +            (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && +             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || +            (strcmp(access, "RW"))) { +            goto next_line; +        } + +        if (!path_is_absolute(fname) && !path_has_protocol(fname) && +            !desc_file_path[0]) +        { +            error_setg(errp, "Cannot use relative extent paths with VMDK " +                       "descriptor file '%s'", bs->file->filename); +            return -EINVAL; +        } + +        extent_path = g_malloc0(PATH_MAX); +        path_combine(extent_path, PATH_MAX, desc_file_path, fname); +        extent_file = NULL; + +        ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents); +        assert(ret < 32); + +        ret = bdrv_open_image(&extent_file, extent_path, options, +                              extent_opt_prefix, bs, &child_file, false, errp); +        g_free(extent_path); +        if (ret) { +            return ret; +        } + +        /* save to extents array */ +        if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) { +            /* FLAT extent */ + +            ret = vmdk_add_extent(bs, extent_file, true, sectors, +                            0, 0, 0, 0, 0, &extent, errp); +            if (ret < 0) { +                bdrv_unref(extent_file); +                return ret; +            } +            extent->flat_start_offset = flat_offset << 9; +        } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { +            /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ +            char *buf = vmdk_read_desc(extent_file, 0, errp); +            if (!buf) { +                ret = -EINVAL; +            } else { +                ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, 
+                                       options, errp); +            } +            g_free(buf); +            if (ret) { +                bdrv_unref(extent_file); +                return ret; +            } +            extent = &s->extents[s->num_extents - 1]; +        } else { +            error_setg(errp, "Unsupported extent type '%s'", type); +            bdrv_unref(extent_file); +            return -ENOTSUP; +        } +        extent->type = g_strdup(type); +next_line: +        /* move to next line */ +        while (*p) { +            if (*p == '\n') { +                p++; +                break; +            } +            p++; +        } +    } +    return 0; +} + +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, +                               QDict *options, Error **errp) +{ +    int ret; +    char ct[128]; +    BDRVVmdkState *s = bs->opaque; + +    if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { +        error_setg(errp, "invalid VMDK image descriptor"); +        ret = -EINVAL; +        goto exit; +    } +    if (strcmp(ct, "monolithicFlat") && +        strcmp(ct, "vmfs") && +        strcmp(ct, "vmfsSparse") && +        strcmp(ct, "twoGbMaxExtentSparse") && +        strcmp(ct, "twoGbMaxExtentFlat")) { +        error_setg(errp, "Unsupported image type '%s'", ct); +        ret = -ENOTSUP; +        goto exit; +    } +    s->create_type = g_strdup(ct); +    s->desc_offset = 0; +    ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, options, errp); +exit: +    return ret; +} + +static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, +                     Error **errp) +{ +    char *buf; +    int ret; +    BDRVVmdkState *s = bs->opaque; +    uint32_t magic; + +    buf = vmdk_read_desc(bs->file, 0, errp); +    if (!buf) { +        return -EINVAL; +    } + +    magic = ldl_be_p(buf); +    switch (magic) { +        case VMDK3_MAGIC: +        case VMDK4_MAGIC: +            ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, errp); +            s->desc_offset = 0x200; +            break; +        default: +            ret = vmdk_open_desc_file(bs, flags, buf, options, errp); +            break; +    } +    if (ret) { +        goto fail; +    } + +    /* try to open parent images, if exist */ +    ret = vmdk_parent_open(bs); +    if (ret) { +        goto fail; +    } +    s->cid = vmdk_read_cid(bs, 0); +    s->parent_cid = vmdk_read_cid(bs, 1); +    qemu_co_mutex_init(&s->lock); + +    /* Disable migration when VMDK images are used */ +    error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " +               "does not support live migration", +               bdrv_get_device_or_node_name(bs)); +    migrate_add_blocker(s->migration_blocker); +    g_free(buf); +    return 0; + +fail: +    g_free(buf); +    g_free(s->create_type); +    s->create_type = NULL; +    vmdk_free_extents(bs); +    return ret; +} + + +static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) +{ +    BDRVVmdkState *s = bs->opaque; +    int i; + +    for (i = 0; i < s->num_extents; i++) { +        if (!s->extents[i].flat) { +            bs->bl.write_zeroes_alignment = +                MAX(bs->bl.write_zeroes_alignment, +                    s->extents[i].cluster_sectors); +        } +    } +} + +/** + * get_whole_cluster + * + * Copy backing file's cluster that covers @sector_num, otherwise write zero, + * to the cluster at @cluster_sector_num. 
+ * + * If @skip_start_sector < @skip_end_sector, the relative range + * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave + * it for call to write user data in the request. + */ +static int get_whole_cluster(BlockDriverState *bs, +                             VmdkExtent *extent, +                             uint64_t cluster_sector_num, +                             uint64_t sector_num, +                             uint64_t skip_start_sector, +                             uint64_t skip_end_sector) +{ +    int ret = VMDK_OK; +    int64_t cluster_bytes; +    uint8_t *whole_grain; + +    /* For COW, align request sector_num to cluster start */ +    sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); +    cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; +    whole_grain = qemu_blockalign(bs, cluster_bytes); + +    if (!bs->backing_hd) { +        memset(whole_grain, 0,  skip_start_sector << BDRV_SECTOR_BITS); +        memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, +               cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); +    } + +    assert(skip_end_sector <= extent->cluster_sectors); +    /* we will be here if it's first write on non-exist grain(cluster). +     * try to read from parent image, if exist */ +    if (bs->backing_hd && !vmdk_is_cid_valid(bs)) { +        ret = VMDK_ERROR; +        goto exit; +    } + +    /* Read backing data before skip range */ +    if (skip_start_sector > 0) { +        if (bs->backing_hd) { +            ret = bdrv_read(bs->backing_hd, sector_num, +                            whole_grain, skip_start_sector); +            if (ret < 0) { +                ret = VMDK_ERROR; +                goto exit; +            } +        } +        ret = bdrv_write(extent->file, cluster_sector_num, whole_grain, +                         skip_start_sector); +        if (ret < 0) { +            ret = VMDK_ERROR; +            goto exit; +        } +    } +    /* Read backing data after skip range */ +    if (skip_end_sector < extent->cluster_sectors) { +        if (bs->backing_hd) { +            ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector, +                            whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), +                            extent->cluster_sectors - skip_end_sector); +            if (ret < 0) { +                ret = VMDK_ERROR; +                goto exit; +            } +        } +        ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector, +                         whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), +                         extent->cluster_sectors - skip_end_sector); +        if (ret < 0) { +            ret = VMDK_ERROR; +            goto exit; +        } +    } + +exit: +    qemu_vfree(whole_grain); +    return ret; +} + +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, +                         uint32_t offset) +{ +    offset = cpu_to_le32(offset); +    /* update L2 table */ +    if (bdrv_pwrite_sync( +                extent->file, +                ((int64_t)m_data->l2_offset * 512) +                    + (m_data->l2_index * sizeof(offset)), +                &offset, sizeof(offset)) < 0) { +        return VMDK_ERROR; +    } +    /* update backup L2 table */ +    if (extent->l1_backup_table_offset != 0) { +        m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; +        if (bdrv_pwrite_sync( +                    extent->file, +                    ((int64_t)m_data->l2_offset * 512) +      
                  + (m_data->l2_index * sizeof(offset)), +                    &offset, sizeof(offset)) < 0) { +            return VMDK_ERROR; +        } +    } +    if (m_data->l2_cache_entry) { +        *m_data->l2_cache_entry = offset; +    } + +    return VMDK_OK; +} + +/** + * get_cluster_offset + * + * Look up cluster offset in extent file by sector number, and store in + * @cluster_offset. + * + * For flat extents, the start offset as parsed from the description file is + * returned. + * + * For sparse extents, look up in L1, L2 table. If allocate is true, return an + * offset for a new cluster and update L2 cache. If there is a backing file, + * COW is done before returning; otherwise, zeroes are written to the allocated + * cluster. Both COW and zero writing skips the sector range + * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller + * has new data to write there. + * + * Returns: VMDK_OK if cluster exists and mapped in the image. + *          VMDK_UNALLOC if cluster is not mapped and @allocate is false. + *          VMDK_ERROR if failed. + */ +static int get_cluster_offset(BlockDriverState *bs, +                              VmdkExtent *extent, +                              VmdkMetaData *m_data, +                              uint64_t offset, +                              bool allocate, +                              uint64_t *cluster_offset, +                              uint64_t skip_start_sector, +                              uint64_t skip_end_sector) +{ +    unsigned int l1_index, l2_offset, l2_index; +    int min_index, i, j; +    uint32_t min_count, *l2_table; +    bool zeroed = false; +    int64_t ret; +    int64_t cluster_sector; + +    if (m_data) { +        m_data->valid = 0; +    } +    if (extent->flat) { +        *cluster_offset = extent->flat_start_offset; +        return VMDK_OK; +    } + +    offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +    l1_index = (offset >> 9) / extent->l1_entry_sectors; +    if (l1_index >= extent->l1_size) { +        return VMDK_ERROR; +    } +    l2_offset = extent->l1_table[l1_index]; +    if (!l2_offset) { +        return VMDK_UNALLOC; +    } +    for (i = 0; i < L2_CACHE_SIZE; i++) { +        if (l2_offset == extent->l2_cache_offsets[i]) { +            /* increment the hit count */ +            if (++extent->l2_cache_counts[i] == 0xffffffff) { +                for (j = 0; j < L2_CACHE_SIZE; j++) { +                    extent->l2_cache_counts[j] >>= 1; +                } +            } +            l2_table = extent->l2_cache + (i * extent->l2_size); +            goto found; +        } +    } +    /* not found: load a new entry in the least used one */ +    min_index = 0; +    min_count = 0xffffffff; +    for (i = 0; i < L2_CACHE_SIZE; i++) { +        if (extent->l2_cache_counts[i] < min_count) { +            min_count = extent->l2_cache_counts[i]; +            min_index = i; +        } +    } +    l2_table = extent->l2_cache + (min_index * extent->l2_size); +    if (bdrv_pread( +                extent->file, +                (int64_t)l2_offset * 512, +                l2_table, +                extent->l2_size * sizeof(uint32_t) +            ) != extent->l2_size * sizeof(uint32_t)) { +        return VMDK_ERROR; +    } + +    extent->l2_cache_offsets[min_index] = l2_offset; +    extent->l2_cache_counts[min_index] = 1; + found: +    l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +    cluster_sector = le32_to_cpu(l2_table[l2_index]); + +    if (m_data) { +        
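+        /* Remember which L2 table entry (and its cached copy) maps this
+         * cluster, so vmdk_L2update() can rewrite it after allocation. */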
m_data->valid = 1; +        m_data->l1_index = l1_index; +        m_data->l2_index = l2_index; +        m_data->l2_offset = l2_offset; +        m_data->l2_cache_entry = &l2_table[l2_index]; +    } +    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +        zeroed = true; +    } + +    if (!cluster_sector || zeroed) { +        if (!allocate) { +            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; +        } + +        cluster_sector = extent->next_cluster_sector; +        extent->next_cluster_sector += extent->cluster_sectors; + +        /* First of all we write grain itself, to avoid race condition +         * that may corrupt the image. +         * This problem may occur because of insufficient space on host disk +         * or inappropriate VM shutdown. +         */ +        ret = get_whole_cluster(bs, extent, +                                cluster_sector, +                                offset >> BDRV_SECTOR_BITS, +                                skip_start_sector, skip_end_sector); +        if (ret) { +            return ret; +        } +    } +    *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +    return VMDK_OK; +} + +static VmdkExtent *find_extent(BDRVVmdkState *s, +                                int64_t sector_num, VmdkExtent *start_hint) +{ +    VmdkExtent *extent = start_hint; + +    if (!extent) { +        extent = &s->extents[0]; +    } +    while (extent < &s->extents[s->num_extents]) { +        if (sector_num < extent->end_sector) { +            return extent; +        } +        extent++; +    } +    return NULL; +} + +static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, +                                                  int64_t sector_num) +{ +    uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num; + +    extent_begin_sector = extent->end_sector - extent->sectors; +    extent_relative_sector_num = sector_num - extent_begin_sector; +    index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; +    return index_in_cluster; +} + +static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, int *pnum) +{ +    BDRVVmdkState *s = bs->opaque; +    int64_t index_in_cluster, n, ret; +    uint64_t offset; +    VmdkExtent *extent; + +    extent = find_extent(s, sector_num, NULL); +    if (!extent) { +        return 0; +    } +    qemu_co_mutex_lock(&s->lock); +    ret = get_cluster_offset(bs, extent, NULL, +                             sector_num * 512, false, &offset, +                             0, 0); +    qemu_co_mutex_unlock(&s->lock); + +    switch (ret) { +    case VMDK_ERROR: +        ret = -EIO; +        break; +    case VMDK_UNALLOC: +        ret = 0; +        break; +    case VMDK_ZEROED: +        ret = BDRV_BLOCK_ZERO; +        break; +    case VMDK_OK: +        ret = BDRV_BLOCK_DATA; +        if (extent->file == bs->file && !extent->compressed) { +            ret |= BDRV_BLOCK_OFFSET_VALID | offset; +        } + +        break; +    } + +    index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); +    n = extent->cluster_sectors - index_in_cluster; +    if (n > nb_sectors) { +        n = nb_sectors; +    } +    *pnum = n; +    return ret; +} + +static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, +                            int64_t offset_in_cluster, const uint8_t *buf, +                            int nb_sectors, int64_t sector_num) +{ +    int ret; +    VmdkGrainMarker *data = NULL; +  
  uLongf buf_len; +    const uint8_t *write_buf = buf; +    int write_len = nb_sectors * 512; +    int64_t write_offset; +    int64_t write_end_sector; + +    if (extent->compressed) { +        if (!extent->has_marker) { +            ret = -EINVAL; +            goto out; +        } +        buf_len = (extent->cluster_sectors << 9) * 2; +        data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); +        if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK || +                buf_len == 0) { +            ret = -EINVAL; +            goto out; +        } +        data->lba = sector_num; +        data->size = buf_len; +        write_buf = (uint8_t *)data; +        write_len = buf_len + sizeof(VmdkGrainMarker); +    } +    write_offset = cluster_offset + offset_in_cluster, +    ret = bdrv_pwrite(extent->file, write_offset, write_buf, write_len); + +    write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE); + +    extent->next_cluster_sector = MAX(extent->next_cluster_sector, +                                      write_end_sector); + +    if (ret != write_len) { +        ret = ret < 0 ? ret : -EIO; +        goto out; +    } +    ret = 0; + out: +    g_free(data); +    return ret; +} + +static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, +                            int64_t offset_in_cluster, uint8_t *buf, +                            int nb_sectors) +{ +    int ret; +    int cluster_bytes, buf_bytes; +    uint8_t *cluster_buf, *compressed_data; +    uint8_t *uncomp_buf; +    uint32_t data_len; +    VmdkGrainMarker *marker; +    uLongf buf_len; + + +    if (!extent->compressed) { +        ret = bdrv_pread(extent->file, +                          cluster_offset + offset_in_cluster, +                          buf, nb_sectors * 512); +        if (ret == nb_sectors * 512) { +            return 0; +        } else { +            return -EIO; +        } +    } +    cluster_bytes = extent->cluster_sectors * 512; +    /* Read two clusters in case GrainMarker + compressed data > one cluster */ +    buf_bytes = cluster_bytes * 2; +    cluster_buf = g_malloc(buf_bytes); +    uncomp_buf = g_malloc(cluster_bytes); +    ret = bdrv_pread(extent->file, +                cluster_offset, +                cluster_buf, buf_bytes); +    if (ret < 0) { +        goto out; +    } +    compressed_data = cluster_buf; +    buf_len = cluster_bytes; +    data_len = cluster_bytes; +    if (extent->has_marker) { +        marker = (VmdkGrainMarker *)cluster_buf; +        compressed_data = marker->data; +        data_len = le32_to_cpu(marker->size); +    } +    if (!data_len || data_len > buf_bytes) { +        ret = -EINVAL; +        goto out; +    } +    ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len); +    if (ret != Z_OK) { +        ret = -EINVAL; +        goto out; + +    } +    if (offset_in_cluster < 0 || +            offset_in_cluster + nb_sectors * 512 > buf_len) { +        ret = -EINVAL; +        goto out; +    } +    memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512); +    ret = 0; + + out: +    g_free(uncomp_buf); +    g_free(cluster_buf); +    return ret; +} + +static int vmdk_read(BlockDriverState *bs, int64_t sector_num, +                    uint8_t *buf, int nb_sectors) +{ +    BDRVVmdkState *s = bs->opaque; +    int ret; +    uint64_t n, index_in_cluster; +    VmdkExtent *extent = NULL; +    uint64_t cluster_offset; + +    while (nb_sectors > 0) { +        extent = find_extent(s, sector_num, extent); +        if (!extent) { +       
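+            /* sector_num lies beyond the last extent, nothing can serve it */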
     return -EIO; +        } +        ret = get_cluster_offset(bs, extent, NULL, +                                 sector_num << 9, false, &cluster_offset, +                                 0, 0); +        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); +        n = extent->cluster_sectors - index_in_cluster; +        if (n > nb_sectors) { +            n = nb_sectors; +        } +        if (ret != VMDK_OK) { +            /* if not allocated, try to read from parent image, if exist */ +            if (bs->backing_hd && ret != VMDK_ZEROED) { +                if (!vmdk_is_cid_valid(bs)) { +                    return -EINVAL; +                } +                ret = bdrv_read(bs->backing_hd, sector_num, buf, n); +                if (ret < 0) { +                    return ret; +                } +            } else { +                memset(buf, 0, 512 * n); +            } +        } else { +            ret = vmdk_read_extent(extent, +                            cluster_offset, index_in_cluster * 512, +                            buf, n); +            if (ret) { +                return ret; +            } +        } +        nb_sectors -= n; +        sector_num += n; +        buf += n * 512; +    } +    return 0; +} + +static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, +                                     uint8_t *buf, int nb_sectors) +{ +    int ret; +    BDRVVmdkState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = vmdk_read(bs, sector_num, buf, nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +/** + * vmdk_write: + * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature + *                if possible, otherwise return -ENOTSUP. + * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try + *                with each cluster. By dry run we can find if the zero write + *                is possible without modifying image data. + * + * Returns: error code with 0 for success. 
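+ * For compressed (streamOptimized) extents only freshly allocated clusters
+ * can be written; a write that lands on an already allocated cluster fails.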
+ */ +static int vmdk_write(BlockDriverState *bs, int64_t sector_num, +                      const uint8_t *buf, int nb_sectors, +                      bool zeroed, bool zero_dry_run) +{ +    BDRVVmdkState *s = bs->opaque; +    VmdkExtent *extent = NULL; +    int ret; +    int64_t index_in_cluster, n; +    uint64_t cluster_offset; +    VmdkMetaData m_data; + +    if (sector_num > bs->total_sectors) { +        error_report("Wrong offset: sector_num=0x%" PRIx64 +                " total_sectors=0x%" PRIx64 "\n", +                sector_num, bs->total_sectors); +        return -EIO; +    } + +    while (nb_sectors > 0) { +        extent = find_extent(s, sector_num, extent); +        if (!extent) { +            return -EIO; +        } +        index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); +        n = extent->cluster_sectors - index_in_cluster; +        if (n > nb_sectors) { +            n = nb_sectors; +        } +        ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, +                                 !(extent->compressed || zeroed), +                                 &cluster_offset, +                                 index_in_cluster, index_in_cluster + n); +        if (extent->compressed) { +            if (ret == VMDK_OK) { +                /* Refuse write to allocated cluster for streamOptimized */ +                error_report("Could not write to allocated cluster" +                              " for streamOptimized"); +                return -EIO; +            } else { +                /* allocate */ +                ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, +                                         true, &cluster_offset, 0, 0); +            } +        } +        if (ret == VMDK_ERROR) { +            return -EINVAL; +        } +        if (zeroed) { +            /* Do zeroed write, buf is ignored */ +            if (extent->has_zero_grain && +                    index_in_cluster == 0 && +                    n >= extent->cluster_sectors) { +                n = extent->cluster_sectors; +                if (!zero_dry_run) { +                    /* update L2 tables */ +                    if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) +                            != VMDK_OK) { +                        return -EIO; +                    } +                } +            } else { +                return -ENOTSUP; +            } +        } else { +            ret = vmdk_write_extent(extent, +                            cluster_offset, index_in_cluster * 512, +                            buf, n, sector_num); +            if (ret) { +                return ret; +            } +            if (m_data.valid) { +                /* update L2 tables */ +                if (vmdk_L2update(extent, &m_data, +                                  cluster_offset >> BDRV_SECTOR_BITS) +                        != VMDK_OK) { +                    return -EIO; +                } +            } +        } +        nb_sectors -= n; +        sector_num += n; +        buf += n * 512; + +        /* update CID on the first write every time the virtual disk is +         * opened */ +        if (!s->cid_updated) { +            ret = vmdk_write_cid(bs, g_random_int()); +            if (ret < 0) { +                return ret; +            } +            s->cid_updated = true; +        } +    } +    return 0; +} + +static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, +                                      const uint8_t *buf, int nb_sectors) 
+{ +    int ret; +    BDRVVmdkState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static int vmdk_write_compressed(BlockDriverState *bs, +                                 int64_t sector_num, +                                 const uint8_t *buf, +                                 int nb_sectors) +{ +    BDRVVmdkState *s = bs->opaque; +    if (s->num_extents == 1 && s->extents[0].compressed) { +        return vmdk_write(bs, sector_num, buf, nb_sectors, false, false); +    } else { +        return -ENOTSUP; +    } +} + +static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, +                                             int64_t sector_num, +                                             int nb_sectors, +                                             BdrvRequestFlags flags) +{ +    int ret; +    BDRVVmdkState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    /* write zeroes could fail if sectors not aligned to cluster, test it with +     * dry_run == true before really updating image */ +    ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true); +    if (!ret) { +        ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false); +    } +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static int vmdk_create_extent(const char *filename, int64_t filesize, +                              bool flat, bool compress, bool zeroed_grain, +                              QemuOpts *opts, Error **errp) +{ +    int ret, i; +    BlockDriverState *bs = NULL; +    VMDK4Header header; +    Error *local_err = NULL; +    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; +    uint32_t *gd_buf = NULL; +    int gd_buf_size; + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto exit; +    } + +    assert(bs == NULL); +    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, +                    NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto exit; +    } + +    if (flat) { +        ret = bdrv_truncate(bs, filesize); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not truncate file"); +        } +        goto exit; +    } +    magic = cpu_to_be32(VMDK4_MAGIC); +    memset(&header, 0, sizeof(header)); +    header.version = zeroed_grain ? 2 : 1; +    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT +                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) +                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); +    header.compressAlgorithm = compress ? 
VMDK4_COMPRESSION_DEFLATE : 0; +    header.capacity = filesize / BDRV_SECTOR_SIZE; +    header.granularity = 128; +    header.num_gtes_per_gt = BDRV_SECTOR_SIZE; + +    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); +    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), +                           BDRV_SECTOR_SIZE); +    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); +    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); + +    header.desc_offset = 1; +    header.desc_size = 20; +    header.rgd_offset = header.desc_offset + header.desc_size; +    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); +    header.grain_offset = +        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), +                 header.granularity); +    /* swap endianness for all header fields */ +    header.version = cpu_to_le32(header.version); +    header.flags = cpu_to_le32(header.flags); +    header.capacity = cpu_to_le64(header.capacity); +    header.granularity = cpu_to_le64(header.granularity); +    header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt); +    header.desc_offset = cpu_to_le64(header.desc_offset); +    header.desc_size = cpu_to_le64(header.desc_size); +    header.rgd_offset = cpu_to_le64(header.rgd_offset); +    header.gd_offset = cpu_to_le64(header.gd_offset); +    header.grain_offset = cpu_to_le64(header.grain_offset); +    header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm); + +    header.check_bytes[0] = 0xa; +    header.check_bytes[1] = 0x20; +    header.check_bytes[2] = 0xd; +    header.check_bytes[3] = 0xa; + +    /* write all the data */ +    ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); +    if (ret < 0) { +        error_setg(errp, QERR_IO_ERROR); +        goto exit; +    } +    ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); +    if (ret < 0) { +        error_setg(errp, QERR_IO_ERROR); +        goto exit; +    } + +    ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not truncate file"); +        goto exit; +    } + +    /* write grain directory */ +    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; +    gd_buf = g_malloc0(gd_buf_size); +    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; +         i < gt_count; i++, tmp += gt_size) { +        gd_buf[i] = cpu_to_le32(tmp); +    } +    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, +                      gd_buf, gd_buf_size); +    if (ret < 0) { +        error_setg(errp, QERR_IO_ERROR); +        goto exit; +    } + +    /* write backup grain directory */ +    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; +         i < gt_count; i++, tmp += gt_size) { +        gd_buf[i] = cpu_to_le32(tmp); +    } +    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, +                      gd_buf, gd_buf_size); +    if (ret < 0) { +        error_setg(errp, QERR_IO_ERROR); +        goto exit; +    } + +    ret = 0; +exit: +    if (bs) { +        bdrv_unref(bs); +    } +    g_free(gd_buf); +    return ret; +} + +static int filename_decompose(const char *filename, char *path, char *prefix, +                              char *postfix, size_t buf_len, Error **errp) +{ +    const char *p, *q; + +    if (filename == NULL || !strlen(filename)) { +        error_setg(errp, "No filename provided"); +        return VMDK_ERROR; +    } +    p = strrchr(filename, '/'); 
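+    /* Also try '\\' and ':' below so that Windows-style paths split the same
+     * way, e.g. "C:\vm\disk.vmdk" -> path "C:\vm\", prefix "disk",
+     * postfix ".vmdk". */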
+    if (p == NULL) { +        p = strrchr(filename, '\\'); +    } +    if (p == NULL) { +        p = strrchr(filename, ':'); +    } +    if (p != NULL) { +        p++; +        if (p - filename >= buf_len) { +            return VMDK_ERROR; +        } +        pstrcpy(path, p - filename + 1, filename); +    } else { +        p = filename; +        path[0] = '\0'; +    } +    q = strrchr(p, '.'); +    if (q == NULL) { +        pstrcpy(prefix, buf_len, p); +        postfix[0] = '\0'; +    } else { +        if (q - p >= buf_len) { +            return VMDK_ERROR; +        } +        pstrcpy(prefix, q - p + 1, p); +        pstrcpy(postfix, buf_len, q); +    } +    return VMDK_OK; +} + +static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    int idx = 0; +    BlockDriverState *new_bs = NULL; +    Error *local_err = NULL; +    char *desc = NULL; +    int64_t total_size = 0, filesize; +    char *adapter_type = NULL; +    char *backing_file = NULL; +    char *fmt = NULL; +    int flags = 0; +    int ret = 0; +    bool flat, split, compress; +    GString *ext_desc_lines; +    char *path = g_malloc0(PATH_MAX); +    char *prefix = g_malloc0(PATH_MAX); +    char *postfix = g_malloc0(PATH_MAX); +    char *desc_line = g_malloc0(BUF_SIZE); +    char *ext_filename = g_malloc0(PATH_MAX); +    char *desc_filename = g_malloc0(PATH_MAX); +    const int64_t split_size = 0x80000000;  /* VMDK has constant split size */ +    const char *desc_extent_line; +    char *parent_desc_line = g_malloc0(BUF_SIZE); +    uint32_t parent_cid = 0xffffffff; +    uint32_t number_heads = 16; +    bool zeroed_grain = false; +    uint32_t desc_offset = 0, desc_len; +    const char desc_template[] = +        "# Disk DescriptorFile\n" +        "version=1\n" +        "CID=%" PRIx32 "\n" +        "parentCID=%" PRIx32 "\n" +        "createType=\"%s\"\n" +        "%s" +        "\n" +        "# Extent description\n" +        "%s" +        "\n" +        "# The Disk Data Base\n" +        "#DDB\n" +        "\n" +        "ddb.virtualHWVersion = \"%d\"\n" +        "ddb.geometry.cylinders = \"%" PRId64 "\"\n" +        "ddb.geometry.heads = \"%" PRIu32 "\"\n" +        "ddb.geometry.sectors = \"63\"\n" +        "ddb.adapterType = \"%s\"\n"; + +    ext_desc_lines = g_string_new(NULL); + +    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) { +        ret = -EINVAL; +        goto exit; +    } +    /* Read out options */ +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); +    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE); +    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); +    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) { +        flags |= BLOCK_FLAG_COMPAT6; +    } +    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); +    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) { +        zeroed_grain = true; +    } + +    if (!adapter_type) { +        adapter_type = g_strdup("ide"); +    } else if (strcmp(adapter_type, "ide") && +               strcmp(adapter_type, "buslogic") && +               strcmp(adapter_type, "lsilogic") && +               strcmp(adapter_type, "legacyESX")) { +        error_setg(errp, "Unknown adapter type: '%s'", adapter_type); +        ret = -EINVAL; +        goto exit; +    } +    if (strcmp(adapter_type, "ide") != 0) { +        /* that's the number of heads with which vmware operates when +           creating, exporting, etc. 
vmdk files with a non-ide adapter type */ +        number_heads = 255; +    } +    if (!fmt) { +        /* Default format to monolithicSparse */ +        fmt = g_strdup("monolithicSparse"); +    } else if (strcmp(fmt, "monolithicFlat") && +               strcmp(fmt, "monolithicSparse") && +               strcmp(fmt, "twoGbMaxExtentSparse") && +               strcmp(fmt, "twoGbMaxExtentFlat") && +               strcmp(fmt, "streamOptimized")) { +        error_setg(errp, "Unknown subformat: '%s'", fmt); +        ret = -EINVAL; +        goto exit; +    } +    split = !(strcmp(fmt, "twoGbMaxExtentFlat") && +              strcmp(fmt, "twoGbMaxExtentSparse")); +    flat = !(strcmp(fmt, "monolithicFlat") && +             strcmp(fmt, "twoGbMaxExtentFlat")); +    compress = !strcmp(fmt, "streamOptimized"); +    if (flat) { +        desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n"; +    } else { +        desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n"; +    } +    if (flat && backing_file) { +        error_setg(errp, "Flat image can't have backing file"); +        ret = -ENOTSUP; +        goto exit; +    } +    if (flat && zeroed_grain) { +        error_setg(errp, "Flat image can't enable zeroed grain"); +        ret = -ENOTSUP; +        goto exit; +    } +    if (backing_file) { +        BlockDriverState *bs = NULL; +        char *full_backing = g_new0(char, PATH_MAX); +        bdrv_get_full_backing_filename_from_filename(filename, backing_file, +                                                     full_backing, PATH_MAX, +                                                     &local_err); +        if (local_err) { +            g_free(full_backing); +            error_propagate(errp, local_err); +            ret = -ENOENT; +            goto exit; +        } +        ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, NULL, +                        errp); +        g_free(full_backing); +        if (ret != 0) { +            goto exit; +        } +        if (strcmp(bs->drv->format_name, "vmdk")) { +            bdrv_unref(bs); +            ret = -EINVAL; +            goto exit; +        } +        parent_cid = vmdk_read_cid(bs, 0); +        bdrv_unref(bs); +        snprintf(parent_desc_line, BUF_SIZE, +                "parentFileNameHint=\"%s\"", backing_file); +    } + +    /* Create extents */ +    filesize = total_size; +    while (filesize > 0) { +        int64_t size = filesize; + +        if (split && size > split_size) { +            size = split_size; +        } +        if (split) { +            snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s", +                    prefix, flat ? 
'f' : 's', ++idx, postfix); +        } else if (flat) { +            snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix); +        } else { +            snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix); +        } +        snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename); + +        if (vmdk_create_extent(ext_filename, size, +                               flat, compress, zeroed_grain, opts, errp)) { +            ret = -EINVAL; +            goto exit; +        } +        filesize -= size; + +        /* Format description line */ +        snprintf(desc_line, BUF_SIZE, +                    desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename); +        g_string_append(ext_desc_lines, desc_line); +    } +    /* generate descriptor file */ +    desc = g_strdup_printf(desc_template, +                           g_random_int(), +                           parent_cid, +                           fmt, +                           parent_desc_line, +                           ext_desc_lines->str, +                           (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), +                           total_size / +                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), +                           number_heads, +                           adapter_type); +    desc_len = strlen(desc); +    /* the descriptor offset = 0x200 */ +    if (!split && !flat) { +        desc_offset = 0x200; +    } else { +        ret = bdrv_create_file(filename, opts, &local_err); +        if (ret < 0) { +            error_propagate(errp, local_err); +            goto exit; +        } +    } +    assert(new_bs == NULL); +    ret = bdrv_open(&new_bs, filename, NULL, NULL, +                    BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto exit; +    } +    ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "Could not write description"); +        goto exit; +    } +    /* bdrv_pwrite write padding zeros to align to sector, we don't need that +     * for description file */ +    if (desc_offset == 0) { +        ret = bdrv_truncate(new_bs, desc_len); +        if (ret < 0) { +            error_setg_errno(errp, -ret, "Could not truncate file"); +        } +    } +exit: +    if (new_bs) { +        bdrv_unref(new_bs); +    } +    g_free(adapter_type); +    g_free(backing_file); +    g_free(fmt); +    g_free(desc); +    g_free(path); +    g_free(prefix); +    g_free(postfix); +    g_free(desc_line); +    g_free(ext_filename); +    g_free(desc_filename); +    g_free(parent_desc_line); +    g_string_free(ext_desc_lines, true); +    return ret; +} + +static void vmdk_close(BlockDriverState *bs) +{ +    BDRVVmdkState *s = bs->opaque; + +    vmdk_free_extents(bs); +    g_free(s->create_type); + +    migrate_del_blocker(s->migration_blocker); +    error_free(s->migration_blocker); +} + +static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) +{ +    BDRVVmdkState *s = bs->opaque; +    int i, err; +    int ret = 0; + +    for (i = 0; i < s->num_extents; i++) { +        err = bdrv_co_flush(s->extents[i].file); +        if (err < 0) { +            ret = err; +        } +    } +    return ret; +} + +static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) +{ +    int i; +    int64_t ret = 0; +    int64_t r; +    BDRVVmdkState *s = bs->opaque; + +    ret = bdrv_get_allocated_file_size(bs->file); +    if (ret < 0) { +        return ret; +    } + 
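+    /* bs->file (the descriptor or the single sparse file) was counted above;
+     * now add the on-disk sizes of any separate extent files. */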
   for (i = 0; i < s->num_extents; i++) { +        if (s->extents[i].file == bs->file) { +            continue; +        } +        r = bdrv_get_allocated_file_size(s->extents[i].file); +        if (r < 0) { +            return r; +        } +        ret += r; +    } +    return ret; +} + +static int vmdk_has_zero_init(BlockDriverState *bs) +{ +    int i; +    BDRVVmdkState *s = bs->opaque; + +    /* If has a flat extent and its underlying storage doesn't have zero init, +     * return 0. */ +    for (i = 0; i < s->num_extents; i++) { +        if (s->extents[i].flat) { +            if (!bdrv_has_zero_init(s->extents[i].file)) { +                return 0; +            } +        } +    } +    return 1; +} + +static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent) +{ +    ImageInfo *info = g_new0(ImageInfo, 1); + +    *info = (ImageInfo){ +        .filename         = g_strdup(extent->file->filename), +        .format           = g_strdup(extent->type), +        .virtual_size     = extent->sectors * BDRV_SECTOR_SIZE, +        .compressed       = extent->compressed, +        .has_compressed   = extent->compressed, +        .cluster_size     = extent->cluster_sectors * BDRV_SECTOR_SIZE, +        .has_cluster_size = !extent->flat, +    }; + +    return info; +} + +static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, +                      BdrvCheckMode fix) +{ +    BDRVVmdkState *s = bs->opaque; +    VmdkExtent *extent = NULL; +    int64_t sector_num = 0; +    int64_t total_sectors = bdrv_nb_sectors(bs); +    int ret; +    uint64_t cluster_offset; + +    if (fix) { +        return -ENOTSUP; +    } + +    for (;;) { +        if (sector_num >= total_sectors) { +            return 0; +        } +        extent = find_extent(s, sector_num, extent); +        if (!extent) { +            fprintf(stderr, +                    "ERROR: could not find extent for sector %" PRId64 "\n", +                    sector_num); +            break; +        } +        ret = get_cluster_offset(bs, extent, NULL, +                                 sector_num << BDRV_SECTOR_BITS, +                                 false, &cluster_offset, 0, 0); +        if (ret == VMDK_ERROR) { +            fprintf(stderr, +                    "ERROR: could not get cluster_offset for sector %" +                    PRId64 "\n", sector_num); +            break; +        } +        if (ret == VMDK_OK && cluster_offset >= bdrv_getlength(extent->file)) { +            fprintf(stderr, +                    "ERROR: cluster offset for sector %" +                    PRId64 " points after EOF\n", sector_num); +            break; +        } +        sector_num += extent->cluster_sectors; +    } + +    result->corruptions++; +    return 0; +} + +static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs) +{ +    int i; +    BDRVVmdkState *s = bs->opaque; +    ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1); +    ImageInfoList **next; + +    *spec_info = (ImageInfoSpecific){ +        .kind = IMAGE_INFO_SPECIFIC_KIND_VMDK, +        { +            .vmdk = g_new0(ImageInfoSpecificVmdk, 1), +        }, +    }; + +    *spec_info->vmdk = (ImageInfoSpecificVmdk) { +        .create_type = g_strdup(s->create_type), +        .cid = s->cid, +        .parent_cid = s->parent_cid, +    }; + +    next = &spec_info->vmdk->extents; +    for (i = 0; i < s->num_extents; i++) { +        *next = g_new0(ImageInfoList, 1); +        (*next)->value = vmdk_get_extent_info(&s->extents[i]); +        (*next)->next = NULL; +        next = 
&(*next)->next; +    } + +    return spec_info; +} + +static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b) +{ +    return a->flat == b->flat && +           a->compressed == b->compressed && +           (a->flat || a->cluster_sectors == b->cluster_sectors); +} + +static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    int i; +    BDRVVmdkState *s = bs->opaque; +    assert(s->num_extents); + +    /* See if we have multiple extents but they have different cases */ +    for (i = 1; i < s->num_extents; i++) { +        if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) { +            return -ENOTSUP; +        } +    } +    bdi->needs_compressed_writes = s->extents[0].compressed; +    if (!s->extents[0].flat) { +        bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS; +    } +    return 0; +} + +static void vmdk_detach_aio_context(BlockDriverState *bs) +{ +    BDRVVmdkState *s = bs->opaque; +    int i; + +    for (i = 0; i < s->num_extents; i++) { +        bdrv_detach_aio_context(s->extents[i].file); +    } +} + +static void vmdk_attach_aio_context(BlockDriverState *bs, +                                    AioContext *new_context) +{ +    BDRVVmdkState *s = bs->opaque; +    int i; + +    for (i = 0; i < s->num_extents; i++) { +        bdrv_attach_aio_context(s->extents[i].file, new_context); +    } +} + +static QemuOptsList vmdk_create_opts = { +    .name = "vmdk-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_ADAPTER_TYPE, +            .type = QEMU_OPT_STRING, +            .help = "Virtual adapter type, can be one of " +                    "ide (default), lsilogic, buslogic or legacyESX" +        }, +        { +            .name = BLOCK_OPT_BACKING_FILE, +            .type = QEMU_OPT_STRING, +            .help = "File name of a base image" +        }, +        { +            .name = BLOCK_OPT_COMPAT6, +            .type = QEMU_OPT_BOOL, +            .help = "VMDK version 6 image", +            .def_value_str = "off" +        }, +        { +            .name = BLOCK_OPT_SUBFMT, +            .type = QEMU_OPT_STRING, +            .help = +                "VMDK flat extent format, can be one of " +                "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} " +        }, +        { +            .name = BLOCK_OPT_ZEROED_GRAIN, +            .type = QEMU_OPT_BOOL, +            .help = "Enable efficient zero writes " +                    "using the zeroed-grain GTE feature" +        }, +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_vmdk = { +    .format_name                  = "vmdk", +    .instance_size                = sizeof(BDRVVmdkState), +    .bdrv_probe                   = vmdk_probe, +    .bdrv_open                    = vmdk_open, +    .bdrv_check                   = vmdk_check, +    .bdrv_reopen_prepare          = vmdk_reopen_prepare, +    .bdrv_read                    = vmdk_co_read, +    .bdrv_write                   = vmdk_co_write, +    .bdrv_write_compressed        = vmdk_write_compressed, +    .bdrv_co_write_zeroes         = vmdk_co_write_zeroes, +    .bdrv_close                   = vmdk_close, +    .bdrv_create                  = vmdk_create, +    .bdrv_co_flush_to_disk        = vmdk_co_flush, +    
.bdrv_co_get_block_status     = vmdk_co_get_block_status, +    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, +    .bdrv_has_zero_init           = vmdk_has_zero_init, +    .bdrv_get_specific_info       = vmdk_get_specific_info, +    .bdrv_refresh_limits          = vmdk_refresh_limits, +    .bdrv_get_info                = vmdk_get_info, +    .bdrv_detach_aio_context      = vmdk_detach_aio_context, +    .bdrv_attach_aio_context      = vmdk_attach_aio_context, + +    .supports_backing             = true, +    .create_opts                  = &vmdk_create_opts, +}; + +static void bdrv_vmdk_init(void) +{ +    bdrv_register(&bdrv_vmdk); +} + +block_init(bdrv_vmdk_init); diff --git a/block/vpc.c b/block/vpc.c new file mode 100644 index 00000000..3e385d9f --- /dev/null +++ b/block/vpc.c @@ -0,0 +1,944 @@ +/* + * Block driver for Connectix / Microsoft Virtual PC images + * + * Copyright (c) 2005 Alex Beregszaszi + * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" +#if defined(CONFIG_UUID) +#include <uuid/uuid.h> +#endif + +/**************************************************************/ + +#define HEADER_SIZE 512 + +//#define CACHE + +enum vhd_type { +    VHD_FIXED           = 2, +    VHD_DYNAMIC         = 3, +    VHD_DIFFERENCING    = 4, +}; + +// Seconds since Jan 1, 2000 0:00:00 (UTC) +#define VHD_TIMESTAMP_BASE 946684800 + +#define VHD_MAX_SECTORS       (65535LL * 255 * 255) +#define VHD_MAX_GEOMETRY      (65535LL *  16 * 255) + +// always big-endian +typedef struct vhd_footer { +    char        creator[8]; // "conectix" +    uint32_t    features; +    uint32_t    version; + +    // Offset of next header structure, 0xFFFFFFFF if none +    uint64_t    data_offset; + +    // Seconds since Jan 1, 2000 0:00:00 (UTC) +    uint32_t    timestamp; + +    char        creator_app[4]; // "vpc " +    uint16_t    major; +    uint16_t    minor; +    char        creator_os[4]; // "Wi2k" + +    uint64_t    orig_size; +    uint64_t    current_size; + +    uint16_t    cyls; +    uint8_t     heads; +    uint8_t     secs_per_cyl; + +    uint32_t    type; + +    // Checksum of the Hard Disk Footer ("one's complement of the sum of all +    // the bytes in the footer without the checksum field") +    uint32_t    checksum; + +    // UUID used to identify a parent hard disk (backing file) +    uint8_t     uuid[16]; + +    uint8_t     in_saved_state; +} QEMU_PACKED VHDFooter; + +typedef struct vhd_dyndisk_header { +    char        magic[8]; // "cxsparse" + +    // Offset of next header structure, 0xFFFFFFFF if none +    uint64_t    data_offset; + +    // Offset of the Block Allocation Table (BAT) +    uint64_t    table_offset; + +    uint32_t    version; +    uint32_t    max_table_entries; // 32bit/entry + +    // 2 MB by default, must be a power of two +    uint32_t    block_size; + +    uint32_t    checksum; +    uint8_t     parent_uuid[16]; +    uint32_t    parent_timestamp; +    uint32_t    reserved; + +    // Backing file name (in UTF-16) +    uint8_t     parent_name[512]; + +    struct { +        uint32_t    platform; +        uint32_t    data_space; +        uint32_t    data_length; +        uint32_t    reserved; +        uint64_t    data_offset; +    } parent_locator[8]; +} QEMU_PACKED VHDDynDiskHeader; + +typedef struct BDRVVPCState { +    CoMutex lock; +    uint8_t footer_buf[HEADER_SIZE]; +    uint64_t free_data_block_offset; +    int max_table_entries; +    uint32_t *pagetable; +    uint64_t bat_offset; +    uint64_t last_bitmap_offset; + +    uint32_t block_size; +    uint32_t bitmap_size; + +#ifdef CACHE +    uint8_t *pageentry_u8; +    uint32_t *pageentry_u32; +    uint16_t *pageentry_u16; + +    uint64_t last_bitmap; +#endif + +    Error *migration_blocker; +} BDRVVPCState; + +static uint32_t vpc_checksum(uint8_t* buf, size_t size) +{ +    uint32_t res = 0; +    int i; + +    for (i = 0; i < size; i++) +        res += buf[i]; + +    return ~res; +} + + +static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) +{ +    if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8)) +	return 100; +    return 0; +} + +static int vpc_open(BlockDriverState *bs, QDict *options, int flags, +                    Error **errp) +{ +    BDRVVPCState *s = bs->opaque; +    int i; +    VHDFooter *footer; +    VHDDynDiskHeader *dyndisk_header; +    uint8_t buf[HEADER_SIZE]; +    uint32_t checksum; +    uint64_t computed_size; +    
uint64_t pagetable_size; +    int disk_type = VHD_DYNAMIC; +    int ret; + +    ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE); +    if (ret < 0) { +        goto fail; +    } + +    footer = (VHDFooter *) s->footer_buf; +    if (strncmp(footer->creator, "conectix", 8)) { +        int64_t offset = bdrv_getlength(bs->file); +        if (offset < 0) { +            ret = offset; +            goto fail; +        } else if (offset < HEADER_SIZE) { +            ret = -EINVAL; +            goto fail; +        } + +        /* If a fixed disk, the footer is found only at the end of the file */ +        ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, +                         HEADER_SIZE); +        if (ret < 0) { +            goto fail; +        } +        if (strncmp(footer->creator, "conectix", 8)) { +            error_setg(errp, "invalid VPC image"); +            ret = -EINVAL; +            goto fail; +        } +        disk_type = VHD_FIXED; +    } + +    checksum = be32_to_cpu(footer->checksum); +    footer->checksum = 0; +    if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum) +        fprintf(stderr, "block-vpc: The header checksum of '%s' is " +            "incorrect.\n", bs->filename); + +    /* Write 'checksum' back to footer, or else will leave it with zero. */ +    footer->checksum = cpu_to_be32(checksum); + +    // The visible size of a image in Virtual PC depends on the geometry +    // rather than on the size stored in the footer (the size in the footer +    // is too large usually) +    bs->total_sectors = (int64_t) +        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; + +    /* Images that have exactly the maximum geometry are probably bigger and +     * would be truncated if we adhered to the geometry for them. Rely on +     * footer->current_size for them. 
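 *
 * Illustration (hypothetical image; the numbers follow from the constants
 * defined above): VHD_MAX_GEOMETRY is 65535 * 16 * 255 = 267,382,800
 * sectors, roughly 127 GiB, so e.g. a 200 GiB dynamic VHD cannot be
 * expressed as CHS.  Its footer carries exactly the maximum geometry and
 * the real size must come from footer->current_size, which is what the
 * check below falls back to.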
*/ +    if (bs->total_sectors == VHD_MAX_GEOMETRY) { +        bs->total_sectors = be64_to_cpu(footer->current_size) / +                            BDRV_SECTOR_SIZE; +    } + +    /* Allow a maximum disk size of approximately 2 TB */ +    if (bs->total_sectors >= VHD_MAX_SECTORS) { +        ret = -EFBIG; +        goto fail; +    } + +    if (disk_type == VHD_DYNAMIC) { +        ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, +                         HEADER_SIZE); +        if (ret < 0) { +            goto fail; +        } + +        dyndisk_header = (VHDDynDiskHeader *) buf; + +        if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { +            ret = -EINVAL; +            goto fail; +        } + +        s->block_size = be32_to_cpu(dyndisk_header->block_size); +        if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) { +            error_setg(errp, "Invalid block size %" PRIu32, s->block_size); +            ret = -EINVAL; +            goto fail; +        } +        s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; + +        s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); + +        if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) { +            ret = -EINVAL; +            goto fail; +        } +        if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) { +            ret = -EINVAL; +            goto fail; +        } + +        computed_size = (uint64_t) s->max_table_entries * s->block_size; +        if (computed_size < bs->total_sectors * 512) { +            ret = -EINVAL; +            goto fail; +        } + +        if (s->max_table_entries > SIZE_MAX / 4 || +            s->max_table_entries > (int) INT_MAX / 4) { +            error_setg(errp, "Max Table Entries too large (%" PRId32 ")", +                        s->max_table_entries); +            ret = -EINVAL; +            goto fail; +        } + +        pagetable_size = (uint64_t) s->max_table_entries * 4; + +        s->pagetable = qemu_try_blockalign(bs->file, pagetable_size); +        if (s->pagetable == NULL) { +            ret = -ENOMEM; +            goto fail; +        } + +        s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); + +        ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable, pagetable_size); +        if (ret < 0) { +            goto fail; +        } + +        s->free_data_block_offset = +            ROUND_UP(s->bat_offset + pagetable_size, 512); + +        for (i = 0; i < s->max_table_entries; i++) { +            be32_to_cpus(&s->pagetable[i]); +            if (s->pagetable[i] != 0xFFFFFFFF) { +                int64_t next = (512 * (int64_t) s->pagetable[i]) + +                    s->bitmap_size + s->block_size; + +                if (next > s->free_data_block_offset) { +                    s->free_data_block_offset = next; +                } +            } +        } + +        if (s->free_data_block_offset > bdrv_getlength(bs->file)) { +            error_setg(errp, "block-vpc: free_data_block_offset points after " +                             "the end of file. 
The image has been truncated."); +            ret = -EINVAL; +            goto fail; +        } + +        s->last_bitmap_offset = (int64_t) -1; + +#ifdef CACHE +        s->pageentry_u8 = g_malloc(512); +        s->pageentry_u32 = s->pageentry_u8; +        s->pageentry_u16 = s->pageentry_u8; +        s->last_pagetable = -1; +#endif +    } + +    qemu_co_mutex_init(&s->lock); + +    /* Disable migration when VHD images are used */ +    error_setg(&s->migration_blocker, "The vpc format used by node '%s' " +               "does not support live migration", +               bdrv_get_device_or_node_name(bs)); +    migrate_add_blocker(s->migration_blocker); + +    return 0; + +fail: +    qemu_vfree(s->pagetable); +#ifdef CACHE +    g_free(s->pageentry_u8); +#endif +    return ret; +} + +static int vpc_reopen_prepare(BDRVReopenState *state, +                              BlockReopenQueue *queue, Error **errp) +{ +    return 0; +} + +/* + * Returns the absolute byte offset of the given sector in the image file. + * If the sector is not allocated, -1 is returned instead. + * + * The parameter write must be 1 if the offset will be used for a write + * operation (the block bitmaps is updated then), 0 otherwise. + */ +static inline int64_t get_sector_offset(BlockDriverState *bs, +    int64_t sector_num, int write) +{ +    BDRVVPCState *s = bs->opaque; +    uint64_t offset = sector_num * 512; +    uint64_t bitmap_offset, block_offset; +    uint32_t pagetable_index, pageentry_index; + +    pagetable_index = offset / s->block_size; +    pageentry_index = (offset % s->block_size) / 512; + +    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) +        return -1; // not allocated + +    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; +    block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); + +    // We must ensure that we don't write to any sectors which are marked as +    // unused in the bitmap. We get away with setting all bits in the block +    // bitmap each time we write to a new block. This might cause Virtual PC to +    // miss sparse read optimization, but it's not a problem in terms of +    // correctness. +    if (write && (s->last_bitmap_offset != bitmap_offset)) { +        uint8_t bitmap[s->bitmap_size]; + +        s->last_bitmap_offset = bitmap_offset; +        memset(bitmap, 0xff, s->bitmap_size); +        bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size); +    } + +    return block_offset; +} + +/* + * Writes the footer to the end of the image file. This is needed when the + * file grows as it overwrites the old footer + * + * Returns 0 on success and < 0 on error + */ +static int rewrite_footer(BlockDriverState* bs) +{ +    int ret; +    BDRVVPCState *s = bs->opaque; +    int64_t offset = s->free_data_block_offset; + +    ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE); +    if (ret < 0) +        return ret; + +    return 0; +} + +/* + * Allocates a new block. 
This involves writing a new footer and updating + * the Block Allocation Table to use the space at the old end of the image + * file (overwriting the old footer) + * + * Returns the sectors' offset in the image file on success and < 0 on error + */ +static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) +{ +    BDRVVPCState *s = bs->opaque; +    int64_t bat_offset; +    uint32_t index, bat_value; +    int ret; +    uint8_t bitmap[s->bitmap_size]; + +    // Check if sector_num is valid +    if ((sector_num < 0) || (sector_num > bs->total_sectors)) +        return -1; + +    // Write entry into in-memory BAT +    index = (sector_num * 512) / s->block_size; +    if (s->pagetable[index] != 0xFFFFFFFF) +        return -1; + +    s->pagetable[index] = s->free_data_block_offset / 512; + +    // Initialize the block's bitmap +    memset(bitmap, 0xff, s->bitmap_size); +    ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap, +        s->bitmap_size); +    if (ret < 0) { +        return ret; +    } + +    // Write new footer (the old one will be overwritten) +    s->free_data_block_offset += s->block_size + s->bitmap_size; +    ret = rewrite_footer(bs); +    if (ret < 0) +        goto fail; + +    // Write BAT entry to disk +    bat_offset = s->bat_offset + (4 * index); +    bat_value = cpu_to_be32(s->pagetable[index]); +    ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4); +    if (ret < 0) +        goto fail; + +    return get_sector_offset(bs, sector_num, 0); + +fail: +    s->free_data_block_offset -= (s->block_size + s->bitmap_size); +    return -1; +} + +static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +    BDRVVPCState *s = (BDRVVPCState *)bs->opaque; +    VHDFooter *footer = (VHDFooter *) s->footer_buf; + +    if (be32_to_cpu(footer->type) != VHD_FIXED) { +        bdi->cluster_size = s->block_size; +    } + +    bdi->unallocated_blocks_are_zero = true; +    return 0; +} + +static int vpc_read(BlockDriverState *bs, int64_t sector_num, +                    uint8_t *buf, int nb_sectors) +{ +    BDRVVPCState *s = bs->opaque; +    int ret; +    int64_t offset; +    int64_t sectors, sectors_per_block; +    VHDFooter *footer = (VHDFooter *) s->footer_buf; + +    if (be32_to_cpu(footer->type) == VHD_FIXED) { +        return bdrv_read(bs->file, sector_num, buf, nb_sectors); +    } +    while (nb_sectors > 0) { +        offset = get_sector_offset(bs, sector_num, 0); + +        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; +        sectors = sectors_per_block - (sector_num % sectors_per_block); +        if (sectors > nb_sectors) { +            sectors = nb_sectors; +        } + +        if (offset == -1) { +            memset(buf, 0, sectors * BDRV_SECTOR_SIZE); +        } else { +            ret = bdrv_pread(bs->file, offset, buf, +                sectors * BDRV_SECTOR_SIZE); +            if (ret != sectors * BDRV_SECTOR_SIZE) { +                return -1; +            } +        } + +        nb_sectors -= sectors; +        sector_num += sectors; +        buf += sectors * BDRV_SECTOR_SIZE; +    } +    return 0; +} + +static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num, +                                    uint8_t *buf, int nb_sectors) +{ +    int ret; +    BDRVVPCState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = vpc_read(bs, sector_num, buf, nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static int vpc_write(BlockDriverState *bs, int64_t sector_num, +    const 
uint8_t *buf, int nb_sectors) +{ +    BDRVVPCState *s = bs->opaque; +    int64_t offset; +    int64_t sectors, sectors_per_block; +    int ret; +    VHDFooter *footer =  (VHDFooter *) s->footer_buf; + +    if (be32_to_cpu(footer->type) == VHD_FIXED) { +        return bdrv_write(bs->file, sector_num, buf, nb_sectors); +    } +    while (nb_sectors > 0) { +        offset = get_sector_offset(bs, sector_num, 1); + +        sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; +        sectors = sectors_per_block - (sector_num % sectors_per_block); +        if (sectors > nb_sectors) { +            sectors = nb_sectors; +        } + +        if (offset == -1) { +            offset = alloc_block(bs, sector_num); +            if (offset < 0) +                return -1; +        } + +        ret = bdrv_pwrite(bs->file, offset, buf, sectors * BDRV_SECTOR_SIZE); +        if (ret != sectors * BDRV_SECTOR_SIZE) { +            return -1; +        } + +        nb_sectors -= sectors; +        sector_num += sectors; +        buf += sectors * BDRV_SECTOR_SIZE; +    } + +    return 0; +} + +static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, +                                     const uint8_t *buf, int nb_sectors) +{ +    int ret; +    BDRVVPCState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = vpc_write(bs, sector_num, buf, nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, +        int64_t sector_num, int nb_sectors, int *pnum) +{ +    BDRVVPCState *s = bs->opaque; +    VHDFooter *footer = (VHDFooter*) s->footer_buf; +    int64_t start, offset; +    bool allocated; +    int n; + +    if (be32_to_cpu(footer->type) == VHD_FIXED) { +        *pnum = nb_sectors; +        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | +               (sector_num << BDRV_SECTOR_BITS); +    } + +    offset = get_sector_offset(bs, sector_num, 0); +    start = offset; +    allocated = (offset != -1); +    *pnum = 0; + +    do { +        /* All sectors in a block are contiguous (without using the bitmap) */ +        n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE) +          - sector_num; +        n = MIN(n, nb_sectors); + +        *pnum += n; +        sector_num += n; +        nb_sectors -= n; +        /* *pnum can't be greater than one block for allocated +         * sectors since there is always a bitmap in between. */ +        if (allocated) { +            return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; +        } +        if (nb_sectors == 0) { +            break; +        } +        offset = get_sector_offset(bs, sector_num, 0); +    } while (offset == -1); + +    return 0; +} + +/* + * Calculates the number of cylinders, heads and sectors per cylinder + * based on a given number of sectors. This is the algorithm described + * in the VHD specification. + * + * Note that the geometry doesn't always exactly match total_sectors but + * may round it down. + * + * Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override + * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB) + * and instead allow up to 255 heads. 
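 *
 * Worked example (illustrative figures only, derived from the code below):
 * a 2 GiB image has total_sectors = 4,194,304.  The first pass with
 * 17 sectors per cylinder yields far more than 16 heads, so the algorithm
 * retries with 31 and then 63 sectors per cylinder at 16 heads, giving
 * cyls = 4,194,304 / 63 / 16 = 4161.  The resulting geometry
 * 4161 x 16 x 63 covers 4,194,288 sectors, slightly less than asked for,
 * hence the rounding-down note above.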
+ */ +static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, +    uint8_t* heads, uint8_t* secs_per_cyl) +{ +    uint32_t cyls_times_heads; + +    total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY); + +    if (total_sectors >= 65535LL * 16 * 63) { +        *secs_per_cyl = 255; +        *heads = 16; +        cyls_times_heads = total_sectors / *secs_per_cyl; +    } else { +        *secs_per_cyl = 17; +        cyls_times_heads = total_sectors / *secs_per_cyl; +        *heads = (cyls_times_heads + 1023) / 1024; + +        if (*heads < 4) { +            *heads = 4; +        } + +        if (cyls_times_heads >= (*heads * 1024) || *heads > 16) { +            *secs_per_cyl = 31; +            *heads = 16; +            cyls_times_heads = total_sectors / *secs_per_cyl; +        } + +        if (cyls_times_heads >= (*heads * 1024)) { +            *secs_per_cyl = 63; +            *heads = 16; +            cyls_times_heads = total_sectors / *secs_per_cyl; +        } +    } + +    *cyls = cyls_times_heads / *heads; + +    return 0; +} + +static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, +                               int64_t total_sectors) +{ +    VHDDynDiskHeader *dyndisk_header = +        (VHDDynDiskHeader *) buf; +    size_t block_size, num_bat_entries; +    int i; +    int ret; +    int64_t offset = 0; + +    // Write the footer (twice: at the beginning and at the end) +    block_size = 0x200000; +    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); + +    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); +    if (ret) { +        goto fail; +    } + +    offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); +    ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); +    if (ret < 0) { +        goto fail; +    } + +    // Write the initial BAT +    offset = 3 * 512; + +    memset(buf, 0xFF, 512); +    for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { +        ret = bdrv_pwrite_sync(bs, offset, buf, 512); +        if (ret < 0) { +            goto fail; +        } +        offset += 512; +    } + +    // Prepare the Dynamic Disk Header +    memset(buf, 0, 1024); + +    memcpy(dyndisk_header->magic, "cxsparse", 8); + +    /* +     * Note: The spec is actually wrong here for data_offset, it says +     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set. 
+     */ +    dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL); +    dyndisk_header->table_offset = cpu_to_be64(3 * 512); +    dyndisk_header->version = cpu_to_be32(0x00010000); +    dyndisk_header->block_size = cpu_to_be32(block_size); +    dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries); + +    dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024)); + +    // Write the header +    offset = 512; + +    ret = bdrv_pwrite_sync(bs, offset, buf, 1024); +    if (ret < 0) { +        goto fail; +    } + + fail: +    return ret; +} + +static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, +                             int64_t total_size) +{ +    int ret; + +    /* Add footer to total size */ +    total_size += HEADER_SIZE; + +    ret = bdrv_truncate(bs, total_size); +    if (ret < 0) { +        return ret; +    } + +    ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE); +    if (ret < 0) { +        return ret; +    } + +    return ret; +} + +static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) +{ +    uint8_t buf[1024]; +    VHDFooter *footer = (VHDFooter *) buf; +    char *disk_type_param; +    int i; +    uint16_t cyls = 0; +    uint8_t heads = 0; +    uint8_t secs_per_cyl = 0; +    int64_t total_sectors; +    int64_t total_size; +    int disk_type; +    int ret = -EIO; +    Error *local_err = NULL; +    BlockDriverState *bs = NULL; + +    /* Read out options */ +    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), +                          BDRV_SECTOR_SIZE); +    disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); +    if (disk_type_param) { +        if (!strcmp(disk_type_param, "dynamic")) { +            disk_type = VHD_DYNAMIC; +        } else if (!strcmp(disk_type_param, "fixed")) { +            disk_type = VHD_FIXED; +        } else { +            ret = -EINVAL; +            goto out; +        } +    } else { +        disk_type = VHD_DYNAMIC; +    } + +    ret = bdrv_create_file(filename, opts, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto out; +    } +    ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, +                    NULL, &local_err); +    if (ret < 0) { +        error_propagate(errp, local_err); +        goto out; +    } + +    /* +     * Calculate matching total_size and geometry. Increase the number of +     * sectors requested until we get enough (or fail). This ensures that +     * qemu-img convert doesn't truncate images, but rather rounds up. +     * +     * If the image size can't be represented by a spec conform CHS geometry, +     * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use +     * the image size from the VHD footer to calculate total_sectors. 
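 *
 * Sketch with hypothetical numbers: a request for 2 GiB (4,194,304
 * sectors) first yields the geometry 4161 x 16 x 63 = 4,194,288 sectors,
 * which is too small, so the loop below keeps feeding total_sectors + i
 * into calculate_geometry() until cyls * heads * secs_per_cyl reaches the
 * request.  It settles on 4162 x 16 x 63 = 4,195,296 sectors, and the
 * image is sized to that slightly larger, CHS-aligned value.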
+     */ +    total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); +    for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { +        calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); +    } + +    if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) { +        total_sectors = total_size / BDRV_SECTOR_SIZE; +        /* Allow a maximum disk size of approximately 2 TB */ +        if (total_sectors > VHD_MAX_SECTORS) { +            ret = -EFBIG; +            goto out; +        } +    } else { +        total_sectors = (int64_t)cyls * heads * secs_per_cyl; +        total_size = total_sectors * BDRV_SECTOR_SIZE; +    } + +    /* Prepare the Hard Disk Footer */ +    memset(buf, 0, 1024); + +    memcpy(footer->creator, "conectix", 8); +    /* TODO Check if "qemu" creator_app is ok for VPC */ +    memcpy(footer->creator_app, "qemu", 4); +    memcpy(footer->creator_os, "Wi2k", 4); + +    footer->features = cpu_to_be32(0x02); +    footer->version = cpu_to_be32(0x00010000); +    if (disk_type == VHD_DYNAMIC) { +        footer->data_offset = cpu_to_be64(HEADER_SIZE); +    } else { +        footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL); +    } +    footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE); + +    /* Version of Virtual PC 2007 */ +    footer->major = cpu_to_be16(0x0005); +    footer->minor = cpu_to_be16(0x0003); +    footer->orig_size = cpu_to_be64(total_size); +    footer->current_size = cpu_to_be64(total_size); +    footer->cyls = cpu_to_be16(cyls); +    footer->heads = heads; +    footer->secs_per_cyl = secs_per_cyl; + +    footer->type = cpu_to_be32(disk_type); + +#if defined(CONFIG_UUID) +    uuid_generate(footer->uuid); +#endif + +    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE)); + +    if (disk_type == VHD_DYNAMIC) { +        ret = create_dynamic_disk(bs, buf, total_sectors); +    } else { +        ret = create_fixed_disk(bs, buf, total_size); +    } + +out: +    bdrv_unref(bs); +    g_free(disk_type_param); +    return ret; +} + +static int vpc_has_zero_init(BlockDriverState *bs) +{ +    BDRVVPCState *s = bs->opaque; +    VHDFooter *footer =  (VHDFooter *) s->footer_buf; + +    if (be32_to_cpu(footer->type) == VHD_FIXED) { +        return bdrv_has_zero_init(bs->file); +    } else { +        return 1; +    } +} + +static void vpc_close(BlockDriverState *bs) +{ +    BDRVVPCState *s = bs->opaque; +    qemu_vfree(s->pagetable); +#ifdef CACHE +    g_free(s->pageentry_u8); +#endif + +    migrate_del_blocker(s->migration_blocker); +    error_free(s->migration_blocker); +} + +static QemuOptsList vpc_create_opts = { +    .name = "vpc-create-opts", +    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head), +    .desc = { +        { +            .name = BLOCK_OPT_SIZE, +            .type = QEMU_OPT_SIZE, +            .help = "Virtual disk size" +        }, +        { +            .name = BLOCK_OPT_SUBFMT, +            .type = QEMU_OPT_STRING, +            .help = +                "Type of virtual hard disk format. 
Supported formats are " +                "{dynamic (default) | fixed} " +        }, +        { /* end of list */ } +    } +}; + +static BlockDriver bdrv_vpc = { +    .format_name    = "vpc", +    .instance_size  = sizeof(BDRVVPCState), + +    .bdrv_probe             = vpc_probe, +    .bdrv_open              = vpc_open, +    .bdrv_close             = vpc_close, +    .bdrv_reopen_prepare    = vpc_reopen_prepare, +    .bdrv_create            = vpc_create, + +    .bdrv_read                  = vpc_co_read, +    .bdrv_write                 = vpc_co_write, +    .bdrv_co_get_block_status   = vpc_co_get_block_status, + +    .bdrv_get_info          = vpc_get_info, + +    .create_opts            = &vpc_create_opts, +    .bdrv_has_zero_init     = vpc_has_zero_init, +}; + +static void bdrv_vpc_init(void) +{ +    bdrv_register(&bdrv_vpc); +} + +block_init(bdrv_vpc_init); diff --git a/block/vvfat.c b/block/vvfat.c new file mode 100644 index 00000000..20686971 --- /dev/null +++ b/block/vvfat.c @@ -0,0 +1,3042 @@ +/* vim:set shiftwidth=4 ts=4: */ +/* + * QEMU Block driver for virtual VFAT (shadows a local directory) + * + * Copyright (c) 2004,2005 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <sys/stat.h> +#include <dirent.h> +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qstring.h" + +#ifndef S_IWGRP +#define S_IWGRP 0 +#endif +#ifndef S_IWOTH +#define S_IWOTH 0 +#endif + +/* TODO: add ":bootsector=blabla.img:" */ +/* LATER TODO: add automatic boot sector generation from +    BOOTEASY.ASM and Ranish Partition Manager +    Note that DOS assumes the system files to be the first files in the +    file system (test if the boot sector still relies on that fact)! */ +/* MAYBE TODO: write block-visofs.c */ +/* TODO: call try_commit() only after a timeout */ + +/* #define DEBUG */ + +#ifdef DEBUG + +#define DLOG(a) a + +static void checkpoint(void); + +#ifdef __MINGW32__ +void nonono(const char* file, int line, const char* msg) { +    fprintf(stderr, "Nonono! 
%s:%d %s\n", file, line, msg); +    exit(-5); +} +#undef assert +#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0) +#endif + +#else + +#define DLOG(a) + +#endif + +/* dynamic array functions */ +typedef struct array_t { +    char* pointer; +    unsigned int size,next,item_size; +} array_t; + +static inline void array_init(array_t* array,unsigned int item_size) +{ +    array->pointer = NULL; +    array->size=0; +    array->next=0; +    array->item_size=item_size; +} + +static inline void array_free(array_t* array) +{ +    g_free(array->pointer); +    array->size=array->next=0; +} + +/* does not automatically grow */ +static inline void* array_get(array_t* array,unsigned int index) { +    assert(index < array->next); +    return array->pointer + index * array->item_size; +} + +static inline int array_ensure_allocated(array_t* array, int index) +{ +    if((index + 1) * array->item_size > array->size) { +	int new_size = (index + 32) * array->item_size; +	array->pointer = g_realloc(array->pointer, new_size); +	if (!array->pointer) +	    return -1; +	array->size = new_size; +	array->next = index + 1; +    } + +    return 0; +} + +static inline void* array_get_next(array_t* array) { +    unsigned int next = array->next; +    void* result; + +    if (array_ensure_allocated(array, next) < 0) +	return NULL; + +    array->next = next + 1; +    result = array_get(array, next); + +    return result; +} + +static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) { +    if((array->next+count)*array->item_size>array->size) { +	int increment=count*array->item_size; +	array->pointer=g_realloc(array->pointer,array->size+increment); +	if(!array->pointer) +            return NULL; +	array->size+=increment; +    } +    memmove(array->pointer+(index+count)*array->item_size, +		array->pointer+index*array->item_size, +		(array->next-index)*array->item_size); +    array->next+=count; +    return array->pointer+index*array->item_size; +} + +/* this performs a "roll", so that the element which was at index_from becomes + * index_to, but the order of all other elements is preserved. 
*/ +static inline int array_roll(array_t* array,int index_to,int index_from,int count) +{ +    char* buf; +    char* from; +    char* to; +    int is; + +    if(!array || +	    index_to<0 || index_to>=array->next || +	    index_from<0 || index_from>=array->next) +	return -1; + +    if(index_to==index_from) +	return 0; + +    is=array->item_size; +    from=array->pointer+index_from*is; +    to=array->pointer+index_to*is; +    buf=g_malloc(is*count); +    memcpy(buf,from,is*count); + +    if(index_to<index_from) +	memmove(to+is*count,to,from-to); +    else +	memmove(from,from+is*count,to-from); + +    memcpy(to,buf,is*count); + +    g_free(buf); + +    return 0; +} + +static inline int array_remove_slice(array_t* array,int index, int count) +{ +    assert(index >=0); +    assert(count > 0); +    assert(index + count <= array->next); +    if(array_roll(array,array->next-1,index,count)) +	return -1; +    array->next -= count; +    return 0; +} + +static int array_remove(array_t* array,int index) +{ +    return array_remove_slice(array, index, 1); +} + +/* return the index for a given member */ +static int array_index(array_t* array, void* pointer) +{ +    size_t offset = (char*)pointer - array->pointer; +    assert((offset % array->item_size) == 0); +    assert(offset/array->item_size < array->next); +    return offset/array->item_size; +} + +/* These structures are used to fake a disk and the VFAT filesystem. + * For this reason we need to use QEMU_PACKED. */ + +typedef struct bootsector_t { +    uint8_t jump[3]; +    uint8_t name[8]; +    uint16_t sector_size; +    uint8_t sectors_per_cluster; +    uint16_t reserved_sectors; +    uint8_t number_of_fats; +    uint16_t root_entries; +    uint16_t total_sectors16; +    uint8_t media_type; +    uint16_t sectors_per_fat; +    uint16_t sectors_per_track; +    uint16_t number_of_heads; +    uint32_t hidden_sectors; +    uint32_t total_sectors; +    union { +        struct { +	    uint8_t drive_number; +	    uint8_t current_head; +	    uint8_t signature; +	    uint32_t id; +	    uint8_t volume_label[11]; +	} QEMU_PACKED fat16; +	struct { +	    uint32_t sectors_per_fat; +	    uint16_t flags; +	    uint8_t major,minor; +	    uint32_t first_cluster_of_root_directory; +	    uint16_t info_sector; +	    uint16_t backup_boot_sector; +	    uint16_t ignored; +	} QEMU_PACKED fat32; +    } u; +    uint8_t fat_type[8]; +    uint8_t ignored[0x1c0]; +    uint8_t magic[2]; +} QEMU_PACKED bootsector_t; + +typedef struct { +    uint8_t head; +    uint8_t sector; +    uint8_t cylinder; +} mbr_chs_t; + +typedef struct partition_t { +    uint8_t attributes; /* 0x80 = bootable */ +    mbr_chs_t start_CHS; +    uint8_t   fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */ +    mbr_chs_t end_CHS; +    uint32_t start_sector_long; +    uint32_t length_sector_long; +} QEMU_PACKED partition_t; + +typedef struct mbr_t { +    uint8_t ignored[0x1b8]; +    uint32_t nt_id; +    uint8_t ignored2[2]; +    partition_t partition[4]; +    uint8_t magic[2]; +} QEMU_PACKED mbr_t; + +typedef struct direntry_t { +    uint8_t name[8 + 3]; +    uint8_t attributes; +    uint8_t reserved[2]; +    uint16_t ctime; +    uint16_t cdate; +    uint16_t adate; +    uint16_t begin_hi; +    uint16_t mtime; +    uint16_t mdate; +    uint16_t begin; +    uint32_t size; +} QEMU_PACKED direntry_t; + +/* this structure are used to transparently access the files */ + +typedef struct mapping_t { +    /* begin is the first cluster, end is the last+1 */ +    uint32_t 
begin,end; +    /* as s->directory is growable, no pointer may be used here */ +    unsigned int dir_index; +    /* the clusters of a file may be in any order; this points to the first */ +    int first_mapping_index; +    union { +	/* offset is +	 * - the offset in the file (in clusters) for a file, or +	 * - the next cluster of the directory for a directory, and +	 * - the address of the buffer for a faked entry +	 */ +	struct { +	    uint32_t offset; +	} file; +	struct { +	    int parent_mapping_index; +	    int first_dir_index; +	} dir; +    } info; +    /* path contains the full path, i.e. it always starts with s->path */ +    char* path; + +    enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2, +	MODE_DIRECTORY = 4, MODE_FAKED = 8, +	MODE_DELETED = 16, MODE_RENAMED = 32 } mode; +    int read_only; +} mapping_t; + +#ifdef DEBUG +static void print_direntry(const struct direntry_t*); +static void print_mapping(const struct mapping_t* mapping); +#endif + +/* here begins the real VVFAT driver */ + +typedef struct BDRVVVFATState { +    CoMutex lock; +    BlockDriverState* bs; /* pointer to parent */ +    unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */ +    unsigned char first_sectors[0x40*0x200]; + +    int fat_type; /* 16 or 32 */ +    array_t fat,directory,mapping; +    char volume_label[11]; + +    unsigned int cluster_size; +    unsigned int sectors_per_cluster; +    unsigned int sectors_per_fat; +    unsigned int sectors_of_root_directory; +    uint32_t last_cluster_of_root_directory; +    unsigned int faked_sectors; /* how many sectors are faked before file data */ +    uint32_t sector_count; /* total number of sectors of the partition */ +    uint32_t cluster_count; /* total number of clusters of this partition */ +    uint32_t max_fat_value; + +    int current_fd; +    mapping_t* current_mapping; +    unsigned char* cluster; /* points to current cluster */ +    unsigned char* cluster_buffer; /* points to a buffer to hold temp data */ +    unsigned int current_cluster; + +    /* write support */ +    BlockDriverState* write_target; +    char* qcow_filename; +    BlockDriverState* qcow; +    void* fat2; +    char* used_clusters; +    array_t commits; +    const char* path; +    int downcase_short_names; + +    Error *migration_blocker; +} BDRVVVFATState; + +/* take the sector position spos and convert it to Cylinder/Head/Sector position + * if the position is outside the specified geometry, fill maximum value for CHS + * and return 1 to signal overflow. + */ +static int sector2CHS(mbr_chs_t *chs, int spos, int cyls, int heads, int secs) +{ +    int head,sector; +    sector   = spos % secs;  spos /= secs; +    head     = spos % heads; spos /= heads; +    if (spos >= cyls) { +        /* Overflow, +        it happens if 32bit sector positions are used, while CHS is only 24bit. 
+        Windows/Dos is said to take 1023/255/63 as nonrepresentable CHS */ +        chs->head     = 0xFF; +        chs->sector   = 0xFF; +        chs->cylinder = 0xFF; +        return 1; +    } +    chs->head     = (uint8_t)head; +    chs->sector   = (uint8_t)( (sector+1) | ((spos>>8)<<6) ); +    chs->cylinder = (uint8_t)spos; +    return 0; +} + +static void init_mbr(BDRVVVFATState *s, int cyls, int heads, int secs) +{ +    /* TODO: if the files mbr.img and bootsect.img exist, use them */ +    mbr_t* real_mbr=(mbr_t*)s->first_sectors; +    partition_t* partition = &(real_mbr->partition[0]); +    int lba; + +    memset(s->first_sectors,0,512); + +    /* Win NT Disk Signature */ +    real_mbr->nt_id= cpu_to_le32(0xbe1afdfa); + +    partition->attributes=0x80; /* bootable */ + +    /* LBA is used when partition is outside the CHS geometry */ +    lba  = sector2CHS(&partition->start_CHS, s->first_sectors_number - 1, +                     cyls, heads, secs); +    lba |= sector2CHS(&partition->end_CHS,   s->bs->total_sectors - 1, +                     cyls, heads, secs); + +    /*LBA partitions are identified only by start/length_sector_long not by CHS*/ +    partition->start_sector_long  = cpu_to_le32(s->first_sectors_number - 1); +    partition->length_sector_long = cpu_to_le32(s->bs->total_sectors +                                                - s->first_sectors_number + 1); + +    /* FAT12/FAT16/FAT32 */ +    /* DOS uses different types when partition is LBA, +       probably to prevent older versions from using CHS on them */ +    partition->fs_type= s->fat_type==12 ? 0x1: +                        s->fat_type==16 ? (lba?0xe:0x06): +                         /*fat_tyoe==32*/ (lba?0xc:0x0b); + +    real_mbr->magic[0]=0x55; real_mbr->magic[1]=0xaa; +} + +/* direntry functions */ + +/* dest is assumed to hold 258 bytes, and pads with 0xffff up to next multiple of 26 */ +static inline int short2long_name(char* dest,const char* src) +{ +    int i; +    int len; +    for(i=0;i<129 && src[i];i++) { +        dest[2*i]=src[i]; +	dest[2*i+1]=0; +    } +    len=2*i; +    dest[2*i]=dest[2*i+1]=0; +    for(i=2*i+2;(i%26);i++) +	dest[i]=0xff; +    return len; +} + +static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename) +{ +    char buffer[258]; +    int length=short2long_name(buffer,filename), +        number_of_entries=(length+25)/26,i; +    direntry_t* entry; + +    for(i=0;i<number_of_entries;i++) { +	entry=array_get_next(&(s->directory)); +	entry->attributes=0xf; +	entry->reserved[0]=0; +	entry->begin=0; +	entry->name[0]=(number_of_entries-i)|(i==0?0x40:0); +    } +    for(i=0;i<26*number_of_entries;i++) { +	int offset=(i%26); +	if(offset<10) offset=1+offset; +	else if(offset<22) offset=14+offset-10; +	else offset=28+offset-22; +	entry=array_get(&(s->directory),s->directory.next-1-(i/26)); +	entry->name[offset]=buffer[i]; +    } +    return array_get(&(s->directory),s->directory.next-number_of_entries); +} + +static char is_free(const direntry_t* direntry) +{ +    return direntry->name[0]==0xe5 || direntry->name[0]==0x00; +} + +static char is_volume_label(const direntry_t* direntry) +{ +    return direntry->attributes == 0x28; +} + +static char is_long_name(const direntry_t* direntry) +{ +    return direntry->attributes == 0xf; +} + +static char is_short_name(const direntry_t* direntry) +{ +    return !is_volume_label(direntry) && !is_long_name(direntry) +	&& !is_free(direntry); +} + +static char is_directory(const direntry_t* direntry) +{ +    return 
direntry->attributes & 0x10 && direntry->name[0] != 0xe5; +} + +static inline char is_dot(const direntry_t* direntry) +{ +    return is_short_name(direntry) && direntry->name[0] == '.'; +} + +static char is_file(const direntry_t* direntry) +{ +    return is_short_name(direntry) && !is_directory(direntry); +} + +static inline uint32_t begin_of_direntry(const direntry_t* direntry) +{ +    return le16_to_cpu(direntry->begin)|(le16_to_cpu(direntry->begin_hi)<<16); +} + +static inline uint32_t filesize_of_direntry(const direntry_t* direntry) +{ +    return le32_to_cpu(direntry->size); +} + +static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin) +{ +    direntry->begin = cpu_to_le16(begin & 0xffff); +    direntry->begin_hi = cpu_to_le16((begin >> 16) & 0xffff); +} + +/* fat functions */ + +static inline uint8_t fat_chksum(const direntry_t* entry) +{ +    uint8_t chksum=0; +    int i; + +    for (i = 0; i < ARRAY_SIZE(entry->name); i++) { +        chksum = (((chksum & 0xfe) >> 1) | +                  ((chksum & 0x01) ? 0x80 : 0)) + entry->name[i]; +    } + +    return chksum; +} + +/* if return_time==0, this returns the fat_date, else the fat_time */ +static uint16_t fat_datetime(time_t time,int return_time) { +    struct tm* t; +    struct tm t1; +    t = &t1; +    localtime_r(&time,t); +    if(return_time) +	return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11)); +    return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9)); +} + +static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value) +{ +    if(s->fat_type==32) { +	uint32_t* entry=array_get(&(s->fat),cluster); +	*entry=cpu_to_le32(value); +    } else if(s->fat_type==16) { +	uint16_t* entry=array_get(&(s->fat),cluster); +	*entry=cpu_to_le16(value&0xffff); +    } else { +	int offset = (cluster*3/2); +	unsigned char* p = array_get(&(s->fat), offset); +        switch (cluster&1) { +	case 0: +		p[0] = value&0xff; +		p[1] = (p[1]&0xf0) | ((value>>8)&0xf); +		break; +	case 1: +		p[0] = (p[0]&0xf) | ((value&0xf)<<4); +		p[1] = (value>>4); +		break; +	} +    } +} + +static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster) +{ +    if(s->fat_type==32) { +	uint32_t* entry=array_get(&(s->fat),cluster); +	return le32_to_cpu(*entry); +    } else if(s->fat_type==16) { +	uint16_t* entry=array_get(&(s->fat),cluster); +	return le16_to_cpu(*entry); +    } else { +	const uint8_t* x=(uint8_t*)(s->fat.pointer)+cluster*3/2; +	return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; +    } +} + +static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry) +{ +    if(fat_entry>s->max_fat_value-8) +	return -1; +    return 0; +} + +static inline void init_fat(BDRVVVFATState* s) +{ +    if (s->fat_type == 12) { +	array_init(&(s->fat),1); +	array_ensure_allocated(&(s->fat), +		s->sectors_per_fat * 0x200 * 3 / 2 - 1); +    } else { +	array_init(&(s->fat),(s->fat_type==32?4:2)); +	array_ensure_allocated(&(s->fat), +		s->sectors_per_fat * 0x200 / s->fat.item_size - 1); +    } +    memset(s->fat.pointer,0,s->fat.size); + +    switch(s->fat_type) { +	case 12: s->max_fat_value=0xfff; break; +	case 16: s->max_fat_value=0xffff; break; +	case 32: s->max_fat_value=0x0fffffff; break; +	default: s->max_fat_value=0; /* error... */ +    } + +} + +/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */ +/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled! 
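 *
 * Illustration with a made-up host file name: for "readme.txt" the
 * function below emits one long-name entry (attribute 0x0f, the UTF-16
 * characters spread over its name fields) followed by the 8.3 entry
 * "README  TXT"; fat_chksum() of that short name is stored in
 * reserved[1] (byte 13) of every preceding long-name entry so the two
 * can be paired up again.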
*/ +static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, +	unsigned int directory_start, const char* filename, int is_dot) +{ +    int i,j,long_index=s->directory.next; +    direntry_t* entry = NULL; +    direntry_t* entry_long = NULL; + +    if(is_dot) { +	entry=array_get_next(&(s->directory)); +        memset(entry->name, 0x20, sizeof(entry->name)); +	memcpy(entry->name,filename,strlen(filename)); +	return entry; +    } + +    entry_long=create_long_filename(s,filename); + +    i = strlen(filename); +    for(j = i - 1; j>0  && filename[j]!='.';j--); +    if (j > 0) +	i = (j > 8 ? 8 : j); +    else if (i > 8) +	i = 8; + +    entry=array_get_next(&(s->directory)); +    memset(entry->name, 0x20, sizeof(entry->name)); +    memcpy(entry->name, filename, i); + +    if (j > 0) { +        for (i = 0; i < 3 && filename[j + 1 + i]; i++) { +            entry->name[8 + i] = filename[j + 1 + i]; +        } +    } + +    /* upcase & remove unwanted characters */ +    for(i=10;i>=0;i--) { +	if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--); +	if(entry->name[i]<=' ' || entry->name[i]>0x7f +		|| strchr(".*?<>|\":/\\[];,+='",entry->name[i])) +	    entry->name[i]='_'; +        else if(entry->name[i]>='a' && entry->name[i]<='z') +            entry->name[i]+='A'-'a'; +    } + +    /* mangle duplicates */ +    while(1) { +	direntry_t* entry1=array_get(&(s->directory),directory_start); +	int j; + +	for(;entry1<entry;entry1++) +	    if(!is_long_name(entry1) && !memcmp(entry1->name,entry->name,11)) +		break; /* found dupe */ +	if(entry1==entry) /* no dupe found */ +	    break; + +	/* use all 8 characters of name */ +	if(entry->name[7]==' ') { +	    int j; +	    for(j=6;j>0 && entry->name[j]==' ';j--) +		entry->name[j]='~'; +	} + +	/* increment number */ +	for(j=7;j>0 && entry->name[j]=='9';j--) +	    entry->name[j]='0'; +	if(j>0) { +	    if(entry->name[j]<'0' || entry->name[j]>'9') +	        entry->name[j]='0'; +	    else +	        entry->name[j]++; +	} +    } + +    /* calculate checksum; propagate to long name */ +    if(entry_long) { +        uint8_t chksum=fat_chksum(entry); + +	/* calculate anew, because realloc could have taken place */ +	entry_long=array_get(&(s->directory),long_index); +	while(entry_long<entry && is_long_name(entry_long)) { +	    entry_long->reserved[1]=chksum; +	    entry_long++; +	} +    } + +    return entry; +} + +/* + * Read a directory. (the index of the corresponding mapping must be passed). + */ +static int read_directory(BDRVVVFATState* s, int mapping_index) +{ +    mapping_t* mapping = array_get(&(s->mapping), mapping_index); +    direntry_t* direntry; +    const char* dirname = mapping->path; +    int first_cluster = mapping->begin; +    int parent_index = mapping->info.dir.parent_mapping_index; +    mapping_t* parent_mapping = (mapping_t*) +        (parent_index >= 0 ? array_get(&(s->mapping), parent_index) : NULL); +    int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1; + +    DIR* dir=opendir(dirname); +    struct dirent* entry; +    int i; + +    assert(mapping->mode & MODE_DIRECTORY); + +    if(!dir) { +	mapping->end = mapping->begin; +	return -1; +    } + +    i = mapping->info.dir.first_dir_index = +	    first_cluster == 0 ? 
0 : s->directory.next; + +    /* actually read the directory, and allocate the mappings */ +    while((entry=readdir(dir))) { +	unsigned int length=strlen(dirname)+2+strlen(entry->d_name); +        char* buffer; +	direntry_t* direntry; +        struct stat st; +	int is_dot=!strcmp(entry->d_name,"."); +	int is_dotdot=!strcmp(entry->d_name,".."); + +	if(first_cluster == 0 && (is_dotdot || is_dot)) +	    continue; + +	buffer = g_malloc(length); +	snprintf(buffer,length,"%s/%s",dirname,entry->d_name); + +	if(stat(buffer,&st)<0) { +            g_free(buffer); +            continue; +	} + +	/* create directory entry for this file */ +	direntry=create_short_and_long_name(s, i, entry->d_name, +		is_dot || is_dotdot); +	direntry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20); +	direntry->reserved[0]=direntry->reserved[1]=0; +	direntry->ctime=fat_datetime(st.st_ctime,1); +	direntry->cdate=fat_datetime(st.st_ctime,0); +	direntry->adate=fat_datetime(st.st_atime,0); +	direntry->begin_hi=0; +	direntry->mtime=fat_datetime(st.st_mtime,1); +	direntry->mdate=fat_datetime(st.st_mtime,0); +	if(is_dotdot) +	    set_begin_of_direntry(direntry, first_cluster_of_parent); +	else if(is_dot) +	    set_begin_of_direntry(direntry, first_cluster); +	else +	    direntry->begin=0; /* do that later */ +        if (st.st_size > 0x7fffffff) { +	    fprintf(stderr, "File %s is larger than 2GB\n", buffer); +            g_free(buffer); +            closedir(dir); +	    return -2; +        } +	direntry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size); + +	/* create mapping for this file */ +	if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) { +	    s->current_mapping = array_get_next(&(s->mapping)); +	    s->current_mapping->begin=0; +	    s->current_mapping->end=st.st_size; +	    /* +	     * we get the direntry of the most recent direntry, which +	     * contains the short name and all the relevant information. +	     */ +	    s->current_mapping->dir_index=s->directory.next-1; +	    s->current_mapping->first_mapping_index = -1; +	    if (S_ISDIR(st.st_mode)) { +		s->current_mapping->mode = MODE_DIRECTORY; +		s->current_mapping->info.dir.parent_mapping_index = +		    mapping_index; +	    } else { +		s->current_mapping->mode = MODE_UNDEFINED; +		s->current_mapping->info.file.offset = 0; +	    } +	    s->current_mapping->path=buffer; +	    s->current_mapping->read_only = +		(st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0; +        } else { +            g_free(buffer); +        } +    } +    closedir(dir); + +    /* fill with zeroes up to the end of the cluster */ +    while(s->directory.next%(0x10*s->sectors_per_cluster)) { +	direntry_t* direntry=array_get_next(&(s->directory)); +	memset(direntry,0,sizeof(direntry_t)); +    } + +/* TODO: if there are more entries, bootsector has to be adjusted! 
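+ *
+ * Rough arithmetic (assuming the hard-disk default sectors_per_cluster of
+ * 0x10 set in vvfat_open): ROOT_ENTRIES below is 0x02 * 0x10 * 0x10 = 512
+ * directory entries of 32 bytes each, i.e. two clusters reserved for the
+ * root directory; bootsector->root_entries is later derived from
+ * sectors_of_root_directory * 0x10, so both would have to be adjusted
+ * together if more entries were ever allowed.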
*/ +#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster) +    if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) { +	/* root directory */ +	int cur = s->directory.next; +	array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1); +	s->directory.next = ROOT_ENTRIES; +	memset(array_get(&(s->directory), cur), 0, +		(ROOT_ENTRIES - cur) * sizeof(direntry_t)); +    } + +     /* reget the mapping, since s->mapping was possibly realloc()ed */ +    mapping = array_get(&(s->mapping), mapping_index); +    first_cluster += (s->directory.next - mapping->info.dir.first_dir_index) +	* 0x20 / s->cluster_size; +    mapping->end = first_cluster; + +    direntry = array_get(&(s->directory), mapping->dir_index); +    set_begin_of_direntry(direntry, mapping->begin); + +    return 0; +} + +static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num) +{ +    return (sector_num-s->faked_sectors)/s->sectors_per_cluster; +} + +static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num) +{ +    return s->faked_sectors + s->sectors_per_cluster * cluster_num; +} + +static int init_directories(BDRVVVFATState* s, +                            const char *dirname, int heads, int secs, +                            Error **errp) +{ +    bootsector_t* bootsector; +    mapping_t* mapping; +    unsigned int i; +    unsigned int cluster; + +    memset(&(s->first_sectors[0]),0,0x40*0x200); + +    s->cluster_size=s->sectors_per_cluster*0x200; +    s->cluster_buffer=g_malloc(s->cluster_size); + +    /* +     * The formula: sc = spf+1+spf*spc*(512*8/fat_type), +     * where sc is sector_count, +     * spf is sectors_per_fat, +     * spc is sectors_per_clusters, and +     * fat_type = 12, 16 or 32. +     */ +    i = 1+s->sectors_per_cluster*0x200*8/s->fat_type; +    s->sectors_per_fat=(s->sector_count+i)/i; /* round up */ + +    array_init(&(s->mapping),sizeof(mapping_t)); +    array_init(&(s->directory),sizeof(direntry_t)); + +    /* add volume label */ +    { +	direntry_t* entry=array_get_next(&(s->directory)); +	entry->attributes=0x28; /* archive | volume label */ +        memcpy(entry->name, s->volume_label, sizeof(entry->name)); +    } + +    /* Now build FAT, and write back information into directory */ +    init_fat(s); + +    s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2; +    s->cluster_count=sector2cluster(s, s->sector_count); + +    mapping = array_get_next(&(s->mapping)); +    mapping->begin = 0; +    mapping->dir_index = 0; +    mapping->info.dir.parent_mapping_index = -1; +    mapping->first_mapping_index = -1; +    mapping->path = g_strdup(dirname); +    i = strlen(mapping->path); +    if (i > 0 && mapping->path[i - 1] == '/') +	mapping->path[i - 1] = '\0'; +    mapping->mode = MODE_DIRECTORY; +    mapping->read_only = 0; +    s->path = mapping->path; + +    for (i = 0, cluster = 0; i < s->mapping.next; i++) { +	/* MS-DOS expects the FAT to be 0 for the root directory +	 * (except for the media byte). */ +	/* LATER TODO: still true for FAT32? 
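+	 *
+	 * (Presumably not: on real FAT32 volumes the root directory is an
+	 * ordinary cluster chain whose start cluster is recorded in the boot
+	 * sector, so its FAT entries are real chain links rather than zero;
+	 * the fix_fat exception for mapping 0 below would likely need
+	 * revisiting for FAT32.)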
*/ +	int fix_fat = (i != 0); +	mapping = array_get(&(s->mapping), i); + +        if (mapping->mode & MODE_DIRECTORY) { +	    mapping->begin = cluster; +	    if(read_directory(s, i)) { +                error_setg(errp, "Could not read directory %s", +                           mapping->path); +		return -1; +	    } +	    mapping = array_get(&(s->mapping), i); +	} else { +	    assert(mapping->mode == MODE_UNDEFINED); +	    mapping->mode=MODE_NORMAL; +	    mapping->begin = cluster; +	    if (mapping->end > 0) { +		direntry_t* direntry = array_get(&(s->directory), +			mapping->dir_index); + +		mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size; +		set_begin_of_direntry(direntry, mapping->begin); +	    } else { +		mapping->end = cluster + 1; +		fix_fat = 0; +	    } +	} + +	assert(mapping->begin < mapping->end); + +	/* next free cluster */ +	cluster = mapping->end; + +	if(cluster > s->cluster_count) { +            error_setg(errp, +                       "Directory does not fit in FAT%d (capacity %.2f MB)", +                       s->fat_type, s->sector_count / 2000.0); +            return -1; +	} + +	/* fix fat for entry */ +	if (fix_fat) { +	    int j; +	    for(j = mapping->begin; j < mapping->end - 1; j++) +		fat_set(s, j, j+1); +	    fat_set(s, mapping->end - 1, s->max_fat_value); +	} +    } + +    mapping = array_get(&(s->mapping), 0); +    s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster; +    s->last_cluster_of_root_directory = mapping->end; + +    /* the FAT signature */ +    fat_set(s,0,s->max_fat_value); +    fat_set(s,1,s->max_fat_value); + +    s->current_mapping = NULL; + +    bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200); +    bootsector->jump[0]=0xeb; +    bootsector->jump[1]=0x3e; +    bootsector->jump[2]=0x90; +    memcpy(bootsector->name,"QEMU    ",8); +    bootsector->sector_size=cpu_to_le16(0x200); +    bootsector->sectors_per_cluster=s->sectors_per_cluster; +    bootsector->reserved_sectors=cpu_to_le16(1); +    bootsector->number_of_fats=0x2; /* number of FATs */ +    bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10); +    bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count); +    bootsector->media_type=(s->first_sectors_number>1?0xf8:0xf0); /* media descriptor (f8=hd, f0=3.5 fd)*/ +    s->fat.pointer[0] = bootsector->media_type; +    bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat); +    bootsector->sectors_per_track = cpu_to_le16(secs); +    bootsector->number_of_heads = cpu_to_le16(heads); +    bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f); +    bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0); + +    /* LATER TODO: if FAT32, this is wrong */ +    bootsector->u.fat16.drive_number=s->first_sectors_number==1?0:0x80; /* fda=0, hda=0x80 */ +    bootsector->u.fat16.current_head=0; +    bootsector->u.fat16.signature=0x29; +    bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd); + +    memcpy(bootsector->u.fat16.volume_label, s->volume_label, +           sizeof(bootsector->u.fat16.volume_label)); +    memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12   ":s->fat_type==16?"FAT16   ":"FAT32   "),8); +    bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa; + +    return 0; +} + +#ifdef DEBUG +static BDRVVVFATState *vvv = NULL; +#endif + +static int enable_write_target(BDRVVVFATState *s, Error **errp); +static int is_consistent(BDRVVVFATState *s); + +static void vvfat_rebind(BlockDriverState *bs) +{ +  
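+    /* The driver caches the BlockDriverState in s->bs (set in vvfat_open and
+     * used by the read/write paths); refresh that back-pointer here when the
+     * BDS is rebound. */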
  BDRVVVFATState *s = bs->opaque; +    s->bs = bs; +} + +static QemuOptsList runtime_opts = { +    .name = "vvfat", +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +    .desc = { +        { +            .name = "dir", +            .type = QEMU_OPT_STRING, +            .help = "Host directory to map to the vvfat device", +        }, +        { +            .name = "fat-type", +            .type = QEMU_OPT_NUMBER, +            .help = "FAT type (12, 16 or 32)", +        }, +        { +            .name = "floppy", +            .type = QEMU_OPT_BOOL, +            .help = "Create a floppy rather than a hard disk image", +        }, +        { +            .name = "label", +            .type = QEMU_OPT_STRING, +            .help = "Use a volume label other than QEMU VVFAT", +        }, +        { +            .name = "rw", +            .type = QEMU_OPT_BOOL, +            .help = "Make the image writable", +        }, +        { /* end of list */ } +    }, +}; + +static void vvfat_parse_filename(const char *filename, QDict *options, +                                 Error **errp) +{ +    int fat_type = 0; +    bool floppy = false; +    bool rw = false; +    int i; + +    if (!strstart(filename, "fat:", NULL)) { +        error_setg(errp, "File name string must start with 'fat:'"); +        return; +    } + +    /* Parse options */ +    if (strstr(filename, ":32:")) { +        fat_type = 32; +    } else if (strstr(filename, ":16:")) { +        fat_type = 16; +    } else if (strstr(filename, ":12:")) { +        fat_type = 12; +    } + +    if (strstr(filename, ":floppy:")) { +        floppy = true; +    } + +    if (strstr(filename, ":rw:")) { +        rw = true; +    } + +    /* Get the directory name without options */ +    i = strrchr(filename, ':') - filename; +    assert(i >= 3); +    if (filename[i - 2] == ':' && qemu_isalpha(filename[i - 1])) { +        /* workaround for DOS drive names */ +        filename += i - 1; +    } else { +        filename += i + 1; +    } + +    /* Fill in the options QDict */ +    qdict_put(options, "dir", qstring_from_str(filename)); +    qdict_put(options, "fat-type", qint_from_int(fat_type)); +    qdict_put(options, "floppy", qbool_from_bool(floppy)); +    qdict_put(options, "rw", qbool_from_bool(rw)); +} + +static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, +                      Error **errp) +{ +    BDRVVVFATState *s = bs->opaque; +    int cyls, heads, secs; +    bool floppy; +    const char *dirname, *label; +    QemuOpts *opts; +    Error *local_err = NULL; +    int ret; + +#ifdef DEBUG +    vvv = s; +#endif + +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +    qemu_opts_absorb_qdict(opts, options, &local_err); +    if (local_err) { +        error_propagate(errp, local_err); +        ret = -EINVAL; +        goto fail; +    } + +    dirname = qemu_opt_get(opts, "dir"); +    if (!dirname) { +        error_setg(errp, "vvfat block driver requires a 'dir' option"); +        ret = -EINVAL; +        goto fail; +    } + +    s->fat_type = qemu_opt_get_number(opts, "fat-type", 0); +    floppy = qemu_opt_get_bool(opts, "floppy", false); + +    memset(s->volume_label, ' ', sizeof(s->volume_label)); +    label = qemu_opt_get(opts, "label"); +    if (label) { +        size_t label_length = strlen(label); +        if (label_length > 11) { +            error_setg(errp, "vvfat label cannot be longer than 11 bytes"); +            ret = -EINVAL; +            goto fail; +        } +        memcpy(s->volume_label, label, 
label_length); +    } + +    if (floppy) { +        /* 1.44MB or 2.88MB floppy.  2.88MB can be FAT12 (default) or FAT16. */ +        if (!s->fat_type) { +            s->fat_type = 12; +            secs = 36; +            s->sectors_per_cluster = 2; +        } else { +            secs = s->fat_type == 12 ? 18 : 36; +            s->sectors_per_cluster = 1; +        } +        s->first_sectors_number = 1; +        cyls = 80; +        heads = 2; +    } else { +        /* 32MB or 504MB disk*/ +        if (!s->fat_type) { +            s->fat_type = 16; +        } +        s->first_sectors_number = 0x40; +        cyls = s->fat_type == 12 ? 64 : 1024; +        heads = 16; +        secs = 63; +    } + +    switch (s->fat_type) { +    case 32: +	    fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. " +                "You are welcome to do so!\n"); +        break; +    case 16: +    case 12: +        break; +    default: +        error_setg(errp, "Valid FAT types are only 12, 16 and 32"); +        ret = -EINVAL; +        goto fail; +    } + + +    s->bs = bs; + +    /* LATER TODO: if FAT32, adjust */ +    s->sectors_per_cluster=0x10; + +    s->current_cluster=0xffffffff; + +    /* read only is the default for safety */ +    bs->read_only = 1; +    s->qcow = s->write_target = NULL; +    s->qcow_filename = NULL; +    s->fat2 = NULL; +    s->downcase_short_names = 1; + +    fprintf(stderr, "vvfat %s chs %d,%d,%d\n", +            dirname, cyls, heads, secs); + +    s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); + +    if (qemu_opt_get_bool(opts, "rw", false)) { +        ret = enable_write_target(s, errp); +        if (ret < 0) { +            goto fail; +        } +        bs->read_only = 0; +    } + +    bs->total_sectors = cyls * heads * secs; + +    if (init_directories(s, dirname, heads, secs, errp)) { +        ret = -EIO; +        goto fail; +    } + +    s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count; + +    if (s->first_sectors_number == 0x40) { +        init_mbr(s, cyls, heads, secs); +    } + +    //    assert(is_consistent(s)); +    qemu_co_mutex_init(&s->lock); + +    /* Disable migration when vvfat is used rw */ +    if (s->qcow) { +        error_setg(&s->migration_blocker, +                   "The vvfat (rw) format used by node '%s' " +                   "does not support live migration", +                   bdrv_get_device_or_node_name(bs)); +        migrate_add_blocker(s->migration_blocker); +    } + +    ret = 0; +fail: +    qemu_opts_del(opts); +    return ret; +} + +static inline void vvfat_close_current_file(BDRVVVFATState *s) +{ +    if(s->current_mapping) { +	s->current_mapping = NULL; +	if (s->current_fd) { +		qemu_close(s->current_fd); +		s->current_fd = 0; +	} +    } +    s->current_cluster = -1; +} + +/* mappings between index1 and index2-1 are supposed to be ordered + * return value is the index of the last mapping for which end>cluster_num + */ +static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2) +{ +    while(1) { +        int index3; +	mapping_t* mapping; +	index3=(index1+index2)/2; +	mapping=array_get(&(s->mapping),index3); +	assert(mapping->begin < mapping->end); +	if(mapping->begin>=cluster_num) { +	    assert(index2!=index3 || index2==0); +	    if(index2==index3) +		return index1; +	    index2=index3; +	} else { +	    if(index1==index3) +		return mapping->end<=cluster_num ? 
index2 : index1; +	    index1=index3; +	} +	assert(index1<=index2); +	DLOG(mapping=array_get(&(s->mapping),index1); +	assert(mapping->begin<=cluster_num); +	assert(index2 >= s->mapping.next || +		((mapping = array_get(&(s->mapping),index2)) && +		mapping->end>cluster_num))); +    } +} + +static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num) +{ +    int index=find_mapping_for_cluster_aux(s,cluster_num,0,s->mapping.next); +    mapping_t* mapping; +    if(index>=s->mapping.next) +        return NULL; +    mapping=array_get(&(s->mapping),index); +    if(mapping->begin>cluster_num) +        return NULL; +    assert(mapping->begin<=cluster_num && mapping->end>cluster_num); +    return mapping; +} + +static int open_file(BDRVVVFATState* s,mapping_t* mapping) +{ +    if(!mapping) +	return -1; +    if(!s->current_mapping || +	    strcmp(s->current_mapping->path,mapping->path)) { +	/* open file */ +	int fd = qemu_open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE); +	if(fd<0) +	    return -1; +	vvfat_close_current_file(s); +	s->current_fd = fd; +	s->current_mapping = mapping; +    } +    return 0; +} + +static inline int read_cluster(BDRVVVFATState *s,int cluster_num) +{ +    if(s->current_cluster != cluster_num) { +	int result=0; +	off_t offset; +	assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY)); +	if(!s->current_mapping +		|| s->current_mapping->begin>cluster_num +		|| s->current_mapping->end<=cluster_num) { +	    /* binary search of mappings for file */ +	    mapping_t* mapping=find_mapping_for_cluster(s,cluster_num); + +	    assert(!mapping || (cluster_num>=mapping->begin && cluster_num<mapping->end)); + +	    if (mapping && mapping->mode & MODE_DIRECTORY) { +		vvfat_close_current_file(s); +		s->current_mapping = mapping; +read_cluster_directory: +		offset = s->cluster_size*(cluster_num-s->current_mapping->begin); +		s->cluster = (unsigned char*)s->directory.pointer+offset +			+ 0x20*s->current_mapping->info.dir.first_dir_index; +		assert(((s->cluster-(unsigned char*)s->directory.pointer)%s->cluster_size)==0); +		assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size); +		s->current_cluster = cluster_num; +		return 0; +	    } + +	    if(open_file(s,mapping)) +		return -2; +	} else if (s->current_mapping->mode & MODE_DIRECTORY) +	    goto read_cluster_directory; + +	assert(s->current_fd); + +	offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset; +	if(lseek(s->current_fd, offset, SEEK_SET)!=offset) +	    return -3; +	s->cluster=s->cluster_buffer; +	result=read(s->current_fd,s->cluster,s->cluster_size); +	if(result<0) { +	    s->current_cluster = -1; +	    return -1; +	} +	s->current_cluster = cluster_num; +    } +    return 0; +} + +#ifdef DEBUG +static void print_direntry(const direntry_t* direntry) +{ +    int j = 0; +    char buffer[1024]; + +    fprintf(stderr, "direntry %p: ", direntry); +    if(!direntry) +	return; +    if(is_long_name(direntry)) { +	unsigned char* c=(unsigned char*)direntry; +	int i; +	for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2) +#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;} +	    ADD_CHAR(c[i]); +	for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2) +	    ADD_CHAR(c[i]); +	for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2) +	    ADD_CHAR(c[i]); +	buffer[j] = 0; +	fprintf(stderr, "%s\n", buffer); +    } else { +	int i; +	for(i=0;i<11;i++) +	    ADD_CHAR(direntry->name[i]); +	buffer[j] = 
0; +	fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n", +		buffer, +		direntry->attributes, +		begin_of_direntry(direntry),le32_to_cpu(direntry->size)); +    } +} + +static void print_mapping(const mapping_t* mapping) +{ +    fprintf(stderr, "mapping (%p): begin, end = %d, %d, dir_index = %d, " +        "first_mapping_index = %d, name = %s, mode = 0x%x, " , +        mapping, mapping->begin, mapping->end, mapping->dir_index, +        mapping->first_mapping_index, mapping->path, mapping->mode); + +    if (mapping->mode & MODE_DIRECTORY) +	fprintf(stderr, "parent_mapping_index = %d, first_dir_index = %d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index); +    else +	fprintf(stderr, "offset = %d\n", mapping->info.file.offset); +} +#endif + +static int vvfat_read(BlockDriverState *bs, int64_t sector_num, +                    uint8_t *buf, int nb_sectors) +{ +    BDRVVVFATState *s = bs->opaque; +    int i; + +    for(i=0;i<nb_sectors;i++,sector_num++) { +	if (sector_num >= bs->total_sectors) +	   return -1; +	if (s->qcow) { +	    int n; +            if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) { +DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n)); +                if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) { +                    return -1; +                } +                i += n - 1; +                sector_num += n - 1; +                continue; +            } +DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num)); +	} +	if(sector_num<s->faked_sectors) { +	    if(sector_num<s->first_sectors_number) +		memcpy(buf+i*0x200,&(s->first_sectors[sector_num*0x200]),0x200); +	    else if(sector_num-s->first_sectors_number<s->sectors_per_fat) +		memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number)*0x200]),0x200); +	    else if(sector_num-s->first_sectors_number-s->sectors_per_fat<s->sectors_per_fat) +		memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number-s->sectors_per_fat)*0x200]),0x200); +	} else { +	    uint32_t sector=sector_num-s->faked_sectors, +	    sector_offset_in_cluster=(sector%s->sectors_per_cluster), +	    cluster_num=sector/s->sectors_per_cluster; +	    if(cluster_num > s->cluster_count || read_cluster(s, cluster_num) != 0) { +		/* LATER TODO: strict: return -1; */ +		memset(buf+i*0x200,0,0x200); +		continue; +	    } +	    memcpy(buf+i*0x200,s->cluster+sector_offset_in_cluster*0x200,0x200); +	} +    } +    return 0; +} + +static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num, +                                      uint8_t *buf, int nb_sectors) +{ +    int ret; +    BDRVVVFATState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = vvfat_read(bs, sector_num, buf, nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +/* LATER TODO: statify all functions */ + +/* + * Idea of the write support (use snapshot): + * + * 1. check if all data is consistent, recording renames, modifications, + *    new files and directories (in s->commits). + * + * 2. if the data is not consistent, stop committing + * + * 3. handle renames, and create new files and directories (do not yet + *    write their contents) + * + * 4. walk the directories, fixing the mapping and direntries, and marking + *    the handled mappings as not deleted + * + * 5. commit the contents of the files + * + * 6. 
handle deleted files and directories + * + */ + +typedef struct commit_t { +    char* path; +    union { +	struct { uint32_t cluster; } rename; +	struct { int dir_index; uint32_t modified_offset; } writeout; +	struct { uint32_t first_cluster; } new_file; +	struct { uint32_t cluster; } mkdir; +    } param; +    /* DELETEs and RMDIRs are handled differently: see handle_deletes() */ +    enum { +	ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR +    } action; +} commit_t; + +static void clear_commits(BDRVVVFATState* s) +{ +    int i; +DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next)); +    for (i = 0; i < s->commits.next; i++) { +	commit_t* commit = array_get(&(s->commits), i); +	assert(commit->path || commit->action == ACTION_WRITEOUT); +	if (commit->action != ACTION_WRITEOUT) { +	    assert(commit->path); +            g_free(commit->path); +	} else +	    assert(commit->path == NULL); +    } +    s->commits.next = 0; +} + +static void schedule_rename(BDRVVVFATState* s, +	uint32_t cluster, char* new_path) +{ +    commit_t* commit = array_get_next(&(s->commits)); +    commit->path = new_path; +    commit->param.rename.cluster = cluster; +    commit->action = ACTION_RENAME; +} + +static void schedule_writeout(BDRVVVFATState* s, +	int dir_index, uint32_t modified_offset) +{ +    commit_t* commit = array_get_next(&(s->commits)); +    commit->path = NULL; +    commit->param.writeout.dir_index = dir_index; +    commit->param.writeout.modified_offset = modified_offset; +    commit->action = ACTION_WRITEOUT; +} + +static void schedule_new_file(BDRVVVFATState* s, +	char* path, uint32_t first_cluster) +{ +    commit_t* commit = array_get_next(&(s->commits)); +    commit->path = path; +    commit->param.new_file.first_cluster = first_cluster; +    commit->action = ACTION_NEW_FILE; +} + +static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path) +{ +    commit_t* commit = array_get_next(&(s->commits)); +    commit->path = path; +    commit->param.mkdir.cluster = cluster; +    commit->action = ACTION_MKDIR; +} + +typedef struct { +    /* +     * Since the sequence number is at most 0x3f, and the filename +     * length is at most 13 times the sequence number, the maximal +     * filename length is 0x3f * 13 bytes. 
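+     *
+     * Worked out: 0x3f = 63 VFAT entries, each holding 13 UTF-16 code
+     * units, gives 63 * 13 = 819 characters; name[] below adds one byte
+     * for the terminating NUL, hence 0x3f * 13 + 1 = 820.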
+     */ +    unsigned char name[0x3f * 13 + 1]; +    int checksum, len; +    int sequence_number; +} long_file_name; + +static void lfn_init(long_file_name* lfn) +{ +   lfn->sequence_number = lfn->len = 0; +   lfn->checksum = 0x100; +} + +/* return 0 if parsed successfully, > 0 if no long name, < 0 if error */ +static int parse_long_name(long_file_name* lfn, +	const direntry_t* direntry) +{ +    int i, j, offset; +    const unsigned char* pointer = (const unsigned char*)direntry; + +    if (!is_long_name(direntry)) +	return 1; + +    if (pointer[0] & 0x40) { +	lfn->sequence_number = pointer[0] & 0x3f; +	lfn->checksum = pointer[13]; +	lfn->name[0] = 0; +	lfn->name[lfn->sequence_number * 13] = 0; +    } else if ((pointer[0] & 0x3f) != --lfn->sequence_number) +	return -1; +    else if (pointer[13] != lfn->checksum) +	return -2; +    else if (pointer[12] || pointer[26] || pointer[27]) +	return -3; + +    offset = 13 * (lfn->sequence_number - 1); +    for (i = 0, j = 1; i < 13; i++, j+=2) { +	if (j == 11) +	    j = 14; +	else if (j == 26) +	    j = 28; + +	if (pointer[j+1] == 0) +	    lfn->name[offset + i] = pointer[j]; +	else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0) +	    return -4; +	else +	    lfn->name[offset + i] = 0; +    } + +    if (pointer[0] & 0x40) +	lfn->len = offset + strlen((char*)lfn->name + offset); + +    return 0; +} + +/* returns 0 if successful, >0 if no short_name, and <0 on error */ +static int parse_short_name(BDRVVVFATState* s, +	long_file_name* lfn, direntry_t* direntry) +{ +    int i, j; + +    if (!is_short_name(direntry)) +	return 1; + +    for (j = 7; j >= 0 && direntry->name[j] == ' '; j--); +    for (i = 0; i <= j; i++) { +	if (direntry->name[i] <= ' ' || direntry->name[i] > 0x7f) +	    return -1; +	else if (s->downcase_short_names) +	    lfn->name[i] = qemu_tolower(direntry->name[i]); +	else +	    lfn->name[i] = direntry->name[i]; +    } + +    for (j = 2; j >= 0 && direntry->name[8 + j] == ' '; j--) { +    } +    if (j >= 0) { +	lfn->name[i++] = '.'; +	lfn->name[i + j + 1] = '\0'; +	for (;j >= 0; j--) { +            uint8_t c = direntry->name[8 + j]; +            if (c <= ' ' || c > 0x7f) { +                return -2; +            } else if (s->downcase_short_names) { +                lfn->name[i + j] = qemu_tolower(c); +            } else { +                lfn->name[i + j] = c; +            } +	} +    } else +	lfn->name[i + j + 1] = '\0'; + +    lfn->len = strlen((char*)lfn->name); + +    return 0; +} + +static inline uint32_t modified_fat_get(BDRVVVFATState* s, +	unsigned int cluster) +{ +    if (cluster < s->last_cluster_of_root_directory) { +	if (cluster + 1 == s->last_cluster_of_root_directory) +	    return s->max_fat_value; +	else +	    return cluster + 1; +    } + +    if (s->fat_type==32) { +        uint32_t* entry=((uint32_t*)s->fat2)+cluster; +        return le32_to_cpu(*entry); +    } else if (s->fat_type==16) { +        uint16_t* entry=((uint16_t*)s->fat2)+cluster; +        return le16_to_cpu(*entry); +    } else { +        const uint8_t* x=s->fat2+cluster*3/2; +        return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; +    } +} + +static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num) +{ +    int was_modified = 0; +    int i, dummy; + +    if (s->qcow == NULL) +	return 0; + +    for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) +	was_modified = bdrv_is_allocated(s->qcow, +		cluster2sector(s, cluster_num) + i, 1, &dummy); + +    return was_modified; +} + +static const char* get_basename(const char* 
path) +{ +    char* basename = strrchr(path, '/'); +    if (basename == NULL) +	return path; +    else +	return basename + 1; /* strip '/' */ +} + +/* + * The array s->used_clusters holds the states of the clusters. If it is + * part of a file, it has bit 2 set, in case of a directory, bit 1. If it + * was modified, bit 3 is set. + * If any cluster is allocated, but not part of a file or directory, this + * driver refuses to commit. + */ +typedef enum { +     USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4 +} used_t; + +/* + * get_cluster_count_for_direntry() not only determines how many clusters + * are occupied by direntry, but also if it was renamed or modified. + * + * A file is thought to be renamed *only* if there already was a file with + * exactly the same first cluster, but a different name. + * + * Further, the files/directories handled by this function are + * assumed to be *not* deleted (and *only* those). + */ +static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, +	direntry_t* direntry, const char* path) +{ +    /* +     * This is a little bit tricky: +     * IF the guest OS just inserts a cluster into the file chain, +     * and leaves the rest alone, (i.e. the original file had clusters +     * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens: +     * +     * - do_commit will write the cluster into the file at the given +     *   offset, but +     * +     * - the cluster which is overwritten should be moved to a later +     *   position in the file. +     * +     * I am not aware that any OS does something as braindead, but this +     * situation could happen anyway when not committing for a long time. +     * Just to be sure that this does not bite us, detect it, and copy the +     * contents of the clusters to-be-overwritten into the qcow. 
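+     *
+     * In the 15 -> 32 -> 16 example above, cluster 16 ends up one cluster
+     * further into the file than its mapping expects; the offset check
+     * against mapping->info.file.offset below detects this, and copy_it
+     * makes the old cluster contents be read back (vvfat_read) and saved
+     * into the qcow overlay before they can be overwritten.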
+     */ +    int copy_it = 0; +    int was_modified = 0; +    int32_t ret = 0; + +    uint32_t cluster_num = begin_of_direntry(direntry); +    uint32_t offset = 0; +    int first_mapping_index = -1; +    mapping_t* mapping = NULL; +    const char* basename2 = NULL; + +    vvfat_close_current_file(s); + +    /* the root directory */ +    if (cluster_num == 0) +	return 0; + +    /* write support */ +    if (s->qcow) { +	basename2 = get_basename(path); + +	mapping = find_mapping_for_cluster(s, cluster_num); + +	if (mapping) { +	    const char* basename; + +	    assert(mapping->mode & MODE_DELETED); +	    mapping->mode &= ~MODE_DELETED; + +	    basename = get_basename(mapping->path); + +	    assert(mapping->mode & MODE_NORMAL); + +	    /* rename */ +	    if (strcmp(basename, basename2)) +		schedule_rename(s, cluster_num, g_strdup(path)); +	} else if (is_file(direntry)) +	    /* new file */ +	    schedule_new_file(s, g_strdup(path), cluster_num); +	else { +            abort(); +	    return 0; +	} +    } + +    while(1) { +	if (s->qcow) { +	    if (!copy_it && cluster_was_modified(s, cluster_num)) { +		if (mapping == NULL || +			mapping->begin > cluster_num || +			mapping->end <= cluster_num) +		mapping = find_mapping_for_cluster(s, cluster_num); + + +		if (mapping && +			(mapping->mode & MODE_DIRECTORY) == 0) { + +		    /* was modified in qcow */ +		    if (offset != mapping->info.file.offset + s->cluster_size +			    * (cluster_num - mapping->begin)) { +			/* offset of this cluster in file chain has changed */ +                        abort(); +			copy_it = 1; +		    } else if (offset == 0) { +			const char* basename = get_basename(mapping->path); + +			if (strcmp(basename, basename2)) +			    copy_it = 1; +			first_mapping_index = array_index(&(s->mapping), mapping); +		    } + +		    if (mapping->first_mapping_index != first_mapping_index +			    && mapping->info.file.offset > 0) { +                        abort(); +			copy_it = 1; +		    } + +		    /* need to write out? */ +		    if (!was_modified && is_file(direntry)) { +			was_modified = 1; +			schedule_writeout(s, mapping->dir_index, offset); +		    } +		} +	    } + +	    if (copy_it) { +		int i, dummy; +		/* +		 * This is horribly inefficient, but that is okay, since +		 * it is rarely executed, if at all. +		 */ +		int64_t offset = cluster2sector(s, cluster_num); + +		vvfat_close_current_file(s); +                for (i = 0; i < s->sectors_per_cluster; i++) { +                    if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) { +                        if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) { +                            return -1; +                        } +                        if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) { +                            return -2; +                        } +                    } +                } +	    } +	} + +	ret++; +	if (s->used_clusters[cluster_num] & USED_ANY) +	    return 0; +	s->used_clusters[cluster_num] = USED_FILE; + +	cluster_num = modified_fat_get(s, cluster_num); + +	if (fat_eof(s, cluster_num)) +	    return ret; +	else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16) +	    return -1; + +	offset += s->cluster_size; +    } +} + +/* + * This function looks at the modified data (qcow). + * It returns 0 upon inconsistency or error, and the number of clusters + * used by the directory, its subdirectories and their files. 
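+ *
+ * It is called from is_consistent() with cluster_num 0 and s->path for the
+ * root directory and recurses into every subdirectory; a return value of 0
+ * makes is_consistent() report failure, which aborts the commit.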
+ */ +static int check_directory_consistency(BDRVVVFATState *s, +	int cluster_num, const char* path) +{ +    int ret = 0; +    unsigned char* cluster = g_malloc(s->cluster_size); +    direntry_t* direntries = (direntry_t*)cluster; +    mapping_t* mapping = find_mapping_for_cluster(s, cluster_num); + +    long_file_name lfn; +    int path_len = strlen(path); +    char path2[PATH_MAX + 1]; + +    assert(path_len < PATH_MAX); /* len was tested before! */ +    pstrcpy(path2, sizeof(path2), path); +    path2[path_len] = '/'; +    path2[path_len + 1] = '\0'; + +    if (mapping) { +	const char* basename = get_basename(mapping->path); +	const char* basename2 = get_basename(path); + +	assert(mapping->mode & MODE_DIRECTORY); + +	assert(mapping->mode & MODE_DELETED); +	mapping->mode &= ~MODE_DELETED; + +	if (strcmp(basename, basename2)) +	    schedule_rename(s, cluster_num, g_strdup(path)); +    } else +	/* new directory */ +	schedule_mkdir(s, cluster_num, g_strdup(path)); + +    lfn_init(&lfn); +    do { +	int i; +	int subret = 0; + +	ret++; + +	if (s->used_clusters[cluster_num] & USED_ANY) { +	    fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num); +            goto fail; +	} +	s->used_clusters[cluster_num] = USED_DIRECTORY; + +DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num))); +	subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster, +		s->sectors_per_cluster); +	if (subret) { +	    fprintf(stderr, "Error fetching direntries\n"); +	fail: +            g_free(cluster); +	    return 0; +	} + +	for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) { +	    int cluster_count = 0; + +DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i)); +	    if (is_volume_label(direntries + i) || is_dot(direntries + i) || +		    is_free(direntries + i)) +		continue; + +	    subret = parse_long_name(&lfn, direntries + i); +	    if (subret < 0) { +		fprintf(stderr, "Error in long name\n"); +		goto fail; +	    } +	    if (subret == 0 || is_free(direntries + i)) +		continue; + +	    if (fat_chksum(direntries+i) != lfn.checksum) { +		subret = parse_short_name(s, &lfn, direntries + i); +		if (subret < 0) { +		    fprintf(stderr, "Error in short name (%d)\n", subret); +		    goto fail; +		} +		if (subret > 0 || !strcmp((char*)lfn.name, ".") +			|| !strcmp((char*)lfn.name, "..")) +		    continue; +	    } +	    lfn.checksum = 0x100; /* cannot use long name twice */ + +	    if (path_len + 1 + lfn.len >= PATH_MAX) { +		fprintf(stderr, "Name too long: %s/%s\n", path, lfn.name); +		goto fail; +	    } +            pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1, +                    (char*)lfn.name); + +	    if (is_directory(direntries + i)) { +		if (begin_of_direntry(direntries + i) == 0) { +		    DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i)); +		    goto fail; +		} +		cluster_count = check_directory_consistency(s, +			begin_of_direntry(direntries + i), path2); +		if (cluster_count == 0) { +		    DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i)); +		    goto fail; +		} +	    } else if (is_file(direntries + i)) { +		/* check file size with FAT */ +		cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2); +		if (cluster_count != +			(le32_to_cpu(direntries[i].size) + s->cluster_size +			 - 1) / s->cluster_size) { +		    DLOG(fprintf(stderr, "Cluster count mismatch\n")); +		    goto fail; +		} +	    } 
else +                abort(); /* cluster_count = 0; */ + +	    ret += cluster_count; +	} + +	cluster_num = modified_fat_get(s, cluster_num); +    } while(!fat_eof(s, cluster_num)); + +    g_free(cluster); +    return ret; +} + +/* returns 1 on success */ +static int is_consistent(BDRVVVFATState* s) +{ +    int i, check; +    int used_clusters_count = 0; + +DLOG(checkpoint()); +    /* +     * - get modified FAT +     * - compare the two FATs (TODO) +     * - get buffer for marking used clusters +     * - recurse direntries from root (using bs->bdrv_read to make +     *    sure to get the new data) +     *   - check that the FAT agrees with the size +     *   - count the number of clusters occupied by this directory and +     *     its files +     * - check that the cumulative used cluster count agrees with the +     *   FAT +     * - if all is fine, return number of used clusters +     */ +    if (s->fat2 == NULL) { +	int size = 0x200 * s->sectors_per_fat; +	s->fat2 = g_malloc(size); +	memcpy(s->fat2, s->fat.pointer, size); +    } +    check = vvfat_read(s->bs, +	    s->first_sectors_number, s->fat2, s->sectors_per_fat); +    if (check) { +	fprintf(stderr, "Could not copy fat\n"); +	return 0; +    } +    assert (s->used_clusters); +    for (i = 0; i < sector2cluster(s, s->sector_count); i++) +	s->used_clusters[i] &= ~USED_ANY; + +    clear_commits(s); + +    /* mark every mapped file/directory as deleted. +     * (check_directory_consistency() will unmark those still present). */ +    if (s->qcow) +	for (i = 0; i < s->mapping.next; i++) { +	    mapping_t* mapping = array_get(&(s->mapping), i); +	    if (mapping->first_mapping_index < 0) +		mapping->mode |= MODE_DELETED; +	} + +    used_clusters_count = check_directory_consistency(s, 0, s->path); +    if (used_clusters_count <= 0) { +	DLOG(fprintf(stderr, "problem in directory\n")); +	return 0; +    } + +    check = s->last_cluster_of_root_directory; +    for (i = check; i < sector2cluster(s, s->sector_count); i++) { +	if (modified_fat_get(s, i)) { +	    if(!s->used_clusters[i]) { +		DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", i)); +		return 0; +	    } +	    check++; +	} + +	if (s->used_clusters[i] == USED_ALLOCATED) { +	    /* allocated, but not used... */ +	    DLOG(fprintf(stderr, "unused, modified cluster: %d\n", i)); +	    return 0; +	} +    } + +    if (check != used_clusters_count) +	return 0; + +    return used_clusters_count; +} + +static inline void adjust_mapping_indices(BDRVVVFATState* s, +	int offset, int adjust) +{ +    int i; + +    for (i = 0; i < s->mapping.next; i++) { +	mapping_t* mapping = array_get(&(s->mapping), i); + +#define ADJUST_MAPPING_INDEX(name) \ +	if (mapping->name >= offset) \ +	    mapping->name += adjust + +	ADJUST_MAPPING_INDEX(first_mapping_index); +	if (mapping->mode & MODE_DIRECTORY) +	    ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index); +    } +} + +/* insert or update mapping */ +static mapping_t* insert_mapping(BDRVVVFATState* s, +	uint32_t begin, uint32_t end) +{ +    /* +     * - find mapping where mapping->begin >= begin, +     * - if mapping->begin > begin: insert +     *   - adjust all references to mappings! 
+     * - else: adjust +     * - replace name +     */ +    int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next); +    mapping_t* mapping = NULL; +    mapping_t* first_mapping = array_get(&(s->mapping), 0); + +    if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index)) +	    && mapping->begin < begin) { +	mapping->end = begin; +	index++; +	mapping = array_get(&(s->mapping), index); +    } +    if (index >= s->mapping.next || mapping->begin > begin) { +	mapping = array_insert(&(s->mapping), index, 1); +	mapping->path = NULL; +	adjust_mapping_indices(s, index, +1); +    } + +    mapping->begin = begin; +    mapping->end = end; + +DLOG(mapping_t* next_mapping; +assert(index + 1 >= s->mapping.next || +((next_mapping = array_get(&(s->mapping), index + 1)) && + next_mapping->begin >= end))); + +    if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) +	s->current_mapping = array_get(&(s->mapping), +		s->current_mapping - first_mapping); + +    return mapping; +} + +static int remove_mapping(BDRVVVFATState* s, int mapping_index) +{ +    mapping_t* mapping = array_get(&(s->mapping), mapping_index); +    mapping_t* first_mapping = array_get(&(s->mapping), 0); + +    /* free mapping */ +    if (mapping->first_mapping_index < 0) { +        g_free(mapping->path); +    } + +    /* remove from s->mapping */ +    array_remove(&(s->mapping), mapping_index); + +    /* adjust all references to mappings */ +    adjust_mapping_indices(s, mapping_index, -1); + +    if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) +	s->current_mapping = array_get(&(s->mapping), +		s->current_mapping - first_mapping); + +    return 0; +} + +static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust) +{ +    int i; +    for (i = 0; i < s->mapping.next; i++) { +	mapping_t* mapping = array_get(&(s->mapping), i); +	if (mapping->dir_index >= offset) +	    mapping->dir_index += adjust; +	if ((mapping->mode & MODE_DIRECTORY) && +		mapping->info.dir.first_dir_index >= offset) +	    mapping->info.dir.first_dir_index += adjust; +    } +} + +static direntry_t* insert_direntries(BDRVVVFATState* s, +	int dir_index, int count) +{ +    /* +     * make room in s->directory, +     * adjust_dirindices +     */ +    direntry_t* result = array_insert(&(s->directory), dir_index, count); +    if (result == NULL) +	return NULL; +    adjust_dirindices(s, dir_index, count); +    return result; +} + +static int remove_direntries(BDRVVVFATState* s, int dir_index, int count) +{ +    int ret = array_remove_slice(&(s->directory), dir_index, count); +    if (ret) +	return ret; +    adjust_dirindices(s, dir_index, -count); +    return 0; +} + +/* + * Adapt the mappings of the cluster chain starting at first cluster + * (i.e. if a file starts at first_cluster, the chain is followed according + * to the modified fat, and the corresponding entries in s->mapping are + * adjusted) + */ +static int commit_mappings(BDRVVVFATState* s, +	uint32_t first_cluster, int dir_index) +{ +    mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); +    direntry_t* direntry = array_get(&(s->directory), dir_index); +    uint32_t cluster = first_cluster; + +    vvfat_close_current_file(s); + +    assert(mapping); +    assert(mapping->begin == first_cluster); +    mapping->first_mapping_index = -1; +    mapping->dir_index = dir_index; +    mapping->mode = (dir_index <= 0 || is_directory(direntry)) ? 
+	MODE_DIRECTORY : MODE_NORMAL; + +    while (!fat_eof(s, cluster)) { +	uint32_t c, c1; + +	for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1; +		c = c1, c1 = modified_fat_get(s, c1)); + +	c++; +	if (c > mapping->end) { +	    int index = array_index(&(s->mapping), mapping); +	    int i, max_i = s->mapping.next - index; +	    for (i = 1; i < max_i && mapping[i].begin < c; i++); +	    while (--i > 0) +		remove_mapping(s, index + 1); +	} +	assert(mapping == array_get(&(s->mapping), s->mapping.next - 1) +		|| mapping[1].begin >= c); +	mapping->end = c; + +	if (!fat_eof(s, c1)) { +	    int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next); +	    mapping_t* next_mapping = i >= s->mapping.next ? NULL : +		array_get(&(s->mapping), i); + +	    if (next_mapping == NULL || next_mapping->begin > c1) { +		int i1 = array_index(&(s->mapping), mapping); + +		next_mapping = insert_mapping(s, c1, c1+1); + +		if (c1 < c) +		    i1++; +		mapping = array_get(&(s->mapping), i1); +	    } + +	    next_mapping->dir_index = mapping->dir_index; +	    next_mapping->first_mapping_index = +		mapping->first_mapping_index < 0 ? +		array_index(&(s->mapping), mapping) : +		mapping->first_mapping_index; +	    next_mapping->path = mapping->path; +	    next_mapping->mode = mapping->mode; +	    next_mapping->read_only = mapping->read_only; +	    if (mapping->mode & MODE_DIRECTORY) { +		next_mapping->info.dir.parent_mapping_index = +			mapping->info.dir.parent_mapping_index; +		next_mapping->info.dir.first_dir_index = +			mapping->info.dir.first_dir_index + +			0x10 * s->sectors_per_cluster * +			(mapping->end - mapping->begin); +	    } else +		next_mapping->info.file.offset = mapping->info.file.offset + +			mapping->end - mapping->begin; + +	    mapping = next_mapping; +	} + +	cluster = c1; +    } + +    return 0; +} + +static int commit_direntries(BDRVVVFATState* s, +	int dir_index, int parent_mapping_index) +{ +    direntry_t* direntry = array_get(&(s->directory), dir_index); +    uint32_t first_cluster = dir_index == 0 ? 
0 : begin_of_direntry(direntry); +    mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); + +    int factor = 0x10 * s->sectors_per_cluster; +    int old_cluster_count, new_cluster_count; +    int current_dir_index = mapping->info.dir.first_dir_index; +    int first_dir_index = current_dir_index; +    int ret, i; +    uint32_t c; + +DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index)); + +    assert(direntry); +    assert(mapping); +    assert(mapping->begin == first_cluster); +    assert(mapping->info.dir.first_dir_index < s->directory.next); +    assert(mapping->mode & MODE_DIRECTORY); +    assert(dir_index == 0 || is_directory(direntry)); + +    mapping->info.dir.parent_mapping_index = parent_mapping_index; + +    if (first_cluster == 0) { +	old_cluster_count = new_cluster_count = +	    s->last_cluster_of_root_directory; +    } else { +	for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c); +		c = fat_get(s, c)) +	    old_cluster_count++; + +	for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c); +		c = modified_fat_get(s, c)) +	    new_cluster_count++; +    } + +    if (new_cluster_count > old_cluster_count) { +	if (insert_direntries(s, +		current_dir_index + factor * old_cluster_count, +		factor * (new_cluster_count - old_cluster_count)) == NULL) +	    return -1; +    } else if (new_cluster_count < old_cluster_count) +	remove_direntries(s, +		current_dir_index + factor * new_cluster_count, +		factor * (old_cluster_count - new_cluster_count)); + +    for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) { +	void* direntry = array_get(&(s->directory), current_dir_index); +	int ret = vvfat_read(s->bs, cluster2sector(s, c), direntry, +		s->sectors_per_cluster); +	if (ret) +	    return ret; +	assert(!strncmp(s->directory.pointer, "QEMU", 4)); +	current_dir_index += factor; +    } + +    ret = commit_mappings(s, first_cluster, dir_index); +    if (ret) +	return ret; + +    /* recurse */ +    for (i = 0; i < factor * new_cluster_count; i++) { +	direntry = array_get(&(s->directory), first_dir_index + i); +	if (is_directory(direntry) && !is_dot(direntry)) { +	    mapping = find_mapping_for_cluster(s, first_cluster); +	    assert(mapping->mode & MODE_DIRECTORY); +	    ret = commit_direntries(s, first_dir_index + i, +		array_index(&(s->mapping), mapping)); +	    if (ret) +		return ret; +	} +    } + +    return 0; +} + +/* commit one file (adjust contents, adjust mapping), +   return first_mapping_index */ +static int commit_one_file(BDRVVVFATState* s, +	int dir_index, uint32_t offset) +{ +    direntry_t* direntry = array_get(&(s->directory), dir_index); +    uint32_t c = begin_of_direntry(direntry); +    uint32_t first_cluster = c; +    mapping_t* mapping = find_mapping_for_cluster(s, c); +    uint32_t size = filesize_of_direntry(direntry); +    char* cluster = g_malloc(s->cluster_size); +    uint32_t i; +    int fd = 0; + +    assert(offset < size); +    assert((offset % s->cluster_size) == 0); + +    for (i = s->cluster_size; i < offset; i += s->cluster_size) +	c = modified_fat_get(s, c); + +    fd = qemu_open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666); +    if (fd < 0) { +	fprintf(stderr, "Could not open %s... 
(%s, %d)\n", mapping->path, +		strerror(errno), errno); +        g_free(cluster); +	return fd; +    } +    if (offset > 0) { +        if (lseek(fd, offset, SEEK_SET) != offset) { +            qemu_close(fd); +            g_free(cluster); +            return -3; +        } +    } + +    while (offset < size) { +	uint32_t c1; +	int rest_size = (size - offset > s->cluster_size ? +		s->cluster_size : size - offset); +	int ret; + +	c1 = modified_fat_get(s, c); + +	assert((size - offset == 0 && fat_eof(s, c)) || +		(size > offset && c >=2 && !fat_eof(s, c))); + +	ret = vvfat_read(s->bs, cluster2sector(s, c), +	    (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200); + +        if (ret < 0) { +            qemu_close(fd); +            g_free(cluster); +            return ret; +        } + +        if (write(fd, cluster, rest_size) < 0) { +            qemu_close(fd); +            g_free(cluster); +            return -2; +        } + +	offset += rest_size; +	c = c1; +    } + +    if (ftruncate(fd, size)) { +        perror("ftruncate()"); +        qemu_close(fd); +        g_free(cluster); +        return -4; +    } +    qemu_close(fd); +    g_free(cluster); + +    return commit_mappings(s, first_cluster, dir_index); +} + +#ifdef DEBUG +/* test, if all mappings point to valid direntries */ +static void check1(BDRVVVFATState* s) +{ +    int i; +    for (i = 0; i < s->mapping.next; i++) { +	mapping_t* mapping = array_get(&(s->mapping), i); +	if (mapping->mode & MODE_DELETED) { +	    fprintf(stderr, "deleted\n"); +	    continue; +	} +	assert(mapping->dir_index < s->directory.next); +	direntry_t* direntry = array_get(&(s->directory), mapping->dir_index); +	assert(mapping->begin == begin_of_direntry(direntry) || mapping->first_mapping_index >= 0); +	if (mapping->mode & MODE_DIRECTORY) { +	    assert(mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (mapping->end - mapping->begin) <= s->directory.next); +	    assert((mapping->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0); +	} +    } +} + +/* test, if all direntries have mappings */ +static void check2(BDRVVVFATState* s) +{ +    int i; +    int first_mapping = -1; + +    for (i = 0; i < s->directory.next; i++) { +	direntry_t* direntry = array_get(&(s->directory), i); + +	if (is_short_name(direntry) && begin_of_direntry(direntry)) { +	    mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry)); +	    assert(mapping); +	    assert(mapping->dir_index == i || is_dot(direntry)); +	    assert(mapping->begin == begin_of_direntry(direntry) || is_dot(direntry)); +	} + +	if ((i % (0x10 * s->sectors_per_cluster)) == 0) { +	    /* cluster start */ +	    int j, count = 0; + +	    for (j = 0; j < s->mapping.next; j++) { +		mapping_t* mapping = array_get(&(s->mapping), j); +		if (mapping->mode & MODE_DELETED) +		    continue; +		if (mapping->mode & MODE_DIRECTORY) { +		    if (mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i) { +			assert(++count == 1); +			if (mapping->first_mapping_index == -1) +			    first_mapping = array_index(&(s->mapping), mapping); +			else +			    assert(first_mapping == mapping->first_mapping_index); +			if (mapping->info.dir.parent_mapping_index < 0) +			    assert(j == 0); +			else { +			    mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index); +			    assert(parent->mode & MODE_DIRECTORY); +			    assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index); +			} +		    } +		} +	
    } +	    if (count == 0) +		first_mapping = -1; +	} +    } +} +#endif + +static int handle_renames_and_mkdirs(BDRVVVFATState* s) +{ +    int i; + +#ifdef DEBUG +    fprintf(stderr, "handle_renames\n"); +    for (i = 0; i < s->commits.next; i++) { +	commit_t* commit = array_get(&(s->commits), i); +	fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? commit->path : "(null)", commit->param.rename.cluster, commit->action); +    } +#endif + +    for (i = 0; i < s->commits.next;) { +	commit_t* commit = array_get(&(s->commits), i); +	if (commit->action == ACTION_RENAME) { +	    mapping_t* mapping = find_mapping_for_cluster(s, +		    commit->param.rename.cluster); +	    char* old_path = mapping->path; + +	    assert(commit->path); +	    mapping->path = commit->path; +	    if (rename(old_path, mapping->path)) +		return -2; + +	    if (mapping->mode & MODE_DIRECTORY) { +		int l1 = strlen(mapping->path); +		int l2 = strlen(old_path); +		int diff = l1 - l2; +		direntry_t* direntry = array_get(&(s->directory), +			mapping->info.dir.first_dir_index); +		uint32_t c = mapping->begin; +		int i = 0; + +		/* recurse */ +		while (!fat_eof(s, c)) { +		    do { +			direntry_t* d = direntry + i; + +			if (is_file(d) || (is_directory(d) && !is_dot(d))) { +			    mapping_t* m = find_mapping_for_cluster(s, +				    begin_of_direntry(d)); +			    int l = strlen(m->path); +			    char* new_path = g_malloc(l + diff + 1); + +			    assert(!strncmp(m->path, mapping->path, l2)); + +                            pstrcpy(new_path, l + diff + 1, mapping->path); +                            pstrcpy(new_path + l1, l + diff + 1 - l1, +                                    m->path + l2); + +			    schedule_rename(s, m->begin, new_path); +			} +			i++; +		    } while((i % (0x10 * s->sectors_per_cluster)) != 0); +		    c = fat_get(s, c); +		} +	    } + +            g_free(old_path); +	    array_remove(&(s->commits), i); +	    continue; +	} else if (commit->action == ACTION_MKDIR) { +	    mapping_t* mapping; +	    int j, parent_path_len; + +#ifdef __MINGW32__ +            if (mkdir(commit->path)) +                return -5; +#else +            if (mkdir(commit->path, 0755)) +                return -5; +#endif + +	    mapping = insert_mapping(s, commit->param.mkdir.cluster, +		    commit->param.mkdir.cluster + 1); +	    if (mapping == NULL) +		return -6; + +	    mapping->mode = MODE_DIRECTORY; +	    mapping->read_only = 0; +	    mapping->path = commit->path; +	    j = s->directory.next; +	    assert(j); +	    insert_direntries(s, s->directory.next, +		    0x10 * s->sectors_per_cluster); +	    mapping->info.dir.first_dir_index = j; + +	    parent_path_len = strlen(commit->path) +		- strlen(get_basename(commit->path)) - 1; +	    for (j = 0; j < s->mapping.next; j++) { +		mapping_t* m = array_get(&(s->mapping), j); +		if (m->first_mapping_index < 0 && m != mapping && +			!strncmp(m->path, mapping->path, parent_path_len) && +			strlen(m->path) == parent_path_len) +		    break; +	    } +	    assert(j < s->mapping.next); +	    mapping->info.dir.parent_mapping_index = j; + +	    array_remove(&(s->commits), i); +	    continue; +	} + +	i++; +    } +    return 0; +} + +/* + * TODO: make sure that the short name is not matching *another* file + */ +static int handle_commits(BDRVVVFATState* s) +{ +    int i, fail = 0; + +    vvfat_close_current_file(s); + +    for (i = 0; !fail && i < s->commits.next; i++) { +	commit_t* commit = array_get(&(s->commits), i); +	switch(commit->action) { +	case ACTION_RENAME: case ACTION_MKDIR: +            
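+            /* Renames and mkdirs were already consumed (and removed from
+             * s->commits) by handle_renames_and_mkdirs(), which do_commit()
+             * runs before handle_commits(); hitting one here is a bug. */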
abort(); +	    fail = -2; +	    break; +	case ACTION_WRITEOUT: { +#ifndef NDEBUG +            /* these variables are only used by assert() below */ +	    direntry_t* entry = array_get(&(s->directory), +		    commit->param.writeout.dir_index); +	    uint32_t begin = begin_of_direntry(entry); +	    mapping_t* mapping = find_mapping_for_cluster(s, begin); +#endif + +	    assert(mapping); +	    assert(mapping->begin == begin); +	    assert(commit->path == NULL); + +	    if (commit_one_file(s, commit->param.writeout.dir_index, +			commit->param.writeout.modified_offset)) +		fail = -3; + +	    break; +	} +	case ACTION_NEW_FILE: { +	    int begin = commit->param.new_file.first_cluster; +	    mapping_t* mapping = find_mapping_for_cluster(s, begin); +	    direntry_t* entry; +	    int i; + +	    /* find direntry */ +	    for (i = 0; i < s->directory.next; i++) { +		entry = array_get(&(s->directory), i); +		if (is_file(entry) && begin_of_direntry(entry) == begin) +		    break; +	    } + +	    if (i >= s->directory.next) { +		fail = -6; +		continue; +	    } + +	    /* make sure there exists an initial mapping */ +	    if (mapping && mapping->begin != begin) { +		mapping->end = begin; +		mapping = NULL; +	    } +	    if (mapping == NULL) { +		mapping = insert_mapping(s, begin, begin+1); +	    } +	    /* most members will be fixed in commit_mappings() */ +	    assert(commit->path); +	    mapping->path = commit->path; +	    mapping->read_only = 0; +	    mapping->mode = MODE_NORMAL; +	    mapping->info.file.offset = 0; + +	    if (commit_one_file(s, i, 0)) +		fail = -7; + +	    break; +	} +	default: +            abort(); +	} +    } +    if (i > 0 && array_remove_slice(&(s->commits), 0, i)) +	return -1; +    return fail; +} + +static int handle_deletes(BDRVVVFATState* s) +{ +    int i, deferred = 1, deleted = 1; + +    /* delete files corresponding to mappings marked as deleted */ +    /* handle DELETEs and unused mappings (modified_fat_get(s, mapping->begin) == 0) */ +    while (deferred && deleted) { +	deferred = 0; +	deleted = 0; + +	for (i = 1; i < s->mapping.next; i++) { +	    mapping_t* mapping = array_get(&(s->mapping), i); +	    if (mapping->mode & MODE_DELETED) { +		direntry_t* entry = array_get(&(s->directory), +			mapping->dir_index); + +		if (is_free(entry)) { +		    /* remove file/directory */ +		    if (mapping->mode & MODE_DIRECTORY) { +			int j, next_dir_index = s->directory.next, +			first_dir_index = mapping->info.dir.first_dir_index; + +			if (rmdir(mapping->path) < 0) { +			    if (errno == ENOTEMPTY) { +				deferred++; +				continue; +			    } else +				return -5; +			} + +			for (j = 1; j < s->mapping.next; j++) { +			    mapping_t* m = array_get(&(s->mapping), j); +			    if (m->mode & MODE_DIRECTORY && +				    m->info.dir.first_dir_index > +				    first_dir_index && +				    m->info.dir.first_dir_index < +				    next_dir_index) +				next_dir_index = +				    m->info.dir.first_dir_index; +			} +			remove_direntries(s, first_dir_index, +				next_dir_index - first_dir_index); + +			deleted++; +		    } +		} else { +		    if (unlink(mapping->path)) +			return -4; +		    deleted++; +		} +		DLOG(fprintf(stderr, "DELETE (%d)\n", i); print_mapping(mapping); print_direntry(entry)); +		remove_mapping(s, i); +	    } +	} +    } + +    return 0; +} + +/* + * synchronize mapping with new state: + * + * - copy FAT (with bdrv_read) + * - mark all filenames corresponding to mappings as deleted + * - recurse direntries from root (using bs->bdrv_read) + * - delete files corresponding to mappings 
marked as deleted + */ +static int do_commit(BDRVVVFATState* s) +{ +    int ret = 0; + +    /* the real meat are the commits. Nothing to do? Move along! */ +    if (s->commits.next == 0) +	return 0; + +    vvfat_close_current_file(s); + +    ret = handle_renames_and_mkdirs(s); +    if (ret) { +	fprintf(stderr, "Error handling renames (%d)\n", ret); +        abort(); +	return ret; +    } + +    /* copy FAT (with bdrv_read) */ +    memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat); + +    /* recurse direntries from root (using bs->bdrv_read) */ +    ret = commit_direntries(s, 0, -1); +    if (ret) { +	fprintf(stderr, "Fatal: error while committing (%d)\n", ret); +        abort(); +	return ret; +    } + +    ret = handle_commits(s); +    if (ret) { +	fprintf(stderr, "Error handling commits (%d)\n", ret); +        abort(); +	return ret; +    } + +    ret = handle_deletes(s); +    if (ret) { +	fprintf(stderr, "Error deleting\n"); +        abort(); +	return ret; +    } + +    if (s->qcow->drv->bdrv_make_empty) { +        s->qcow->drv->bdrv_make_empty(s->qcow); +    } + +    memset(s->used_clusters, 0, sector2cluster(s, s->sector_count)); + +DLOG(checkpoint()); +    return 0; +} + +static int try_commit(BDRVVVFATState* s) +{ +    vvfat_close_current_file(s); +DLOG(checkpoint()); +    if(!is_consistent(s)) +	return -1; +    return do_commit(s); +} + +static int vvfat_write(BlockDriverState *bs, int64_t sector_num, +                    const uint8_t *buf, int nb_sectors) +{ +    BDRVVVFATState *s = bs->opaque; +    int i, ret; + +DLOG(checkpoint()); + +    /* Check if we're operating in read-only mode */ +    if (s->qcow == NULL) { +        return -EACCES; +    } + +    vvfat_close_current_file(s); + +    /* +     * Some sanity checks: +     * - do not allow writing to the boot sector +     * - do not allow to write non-ASCII filenames +     */ + +    if (sector_num < s->first_sectors_number) +	return -1; + +    for (i = sector2cluster(s, sector_num); +	    i <= sector2cluster(s, sector_num + nb_sectors - 1);) { +	mapping_t* mapping = find_mapping_for_cluster(s, i); +	if (mapping) { +	    if (mapping->read_only) { +		fprintf(stderr, "Tried to write to write-protected file %s\n", +			mapping->path); +		return -1; +	    } + +	    if (mapping->mode & MODE_DIRECTORY) { +		int begin = cluster2sector(s, i); +		int end = begin + s->sectors_per_cluster, k; +		int dir_index; +		const direntry_t* direntries; +		long_file_name lfn; + +		lfn_init(&lfn); + +		if (begin < sector_num) +		    begin = sector_num; +		if (end > sector_num + nb_sectors) +		    end = sector_num + nb_sectors; +		dir_index  = mapping->dir_index + +		    0x10 * (begin - mapping->begin * s->sectors_per_cluster); +		direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num)); + +		for (k = 0; k < (end - begin) * 0x10; k++) { +		    /* do not allow non-ASCII filenames */ +		    if (parse_long_name(&lfn, direntries + k) < 0) { +			fprintf(stderr, "Warning: non-ASCII filename\n"); +			return -1; +		    } +		    /* no access to the direntry of a read-only file */ +		    else if (is_short_name(direntries+k) && +			    (direntries[k].attributes & 1)) { +			if (memcmp(direntries + k, +				    array_get(&(s->directory), dir_index + k), +				    sizeof(direntry_t))) { +			    fprintf(stderr, "Warning: tried to write to write-protected file\n"); +			    return -1; +			} +		    } +		} +	    } +	    i = mapping->end; +	} else +	    i++; +    } + +    /* +     * Use qcow backend. Commit later. 
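+     *
+     * Guest writes land in the qcow overlay first; try_commit() below then
+     * replays them into the host directory once is_consistent() succeeds.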
+     */ +DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors)); +    ret = bdrv_write(s->qcow, sector_num, buf, nb_sectors); +    if (ret < 0) { +	fprintf(stderr, "Error writing to qcow backend\n"); +	return ret; +    } + +    for (i = sector2cluster(s, sector_num); +	    i <= sector2cluster(s, sector_num + nb_sectors - 1); i++) +	if (i >= 0) +	    s->used_clusters[i] |= USED_ALLOCATED; + +DLOG(checkpoint()); +    /* TODO: add timeout */ +    try_commit(s); + +DLOG(checkpoint()); +    return 0; +} + +static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, +                                       const uint8_t *buf, int nb_sectors) +{ +    int ret; +    BDRVVVFATState *s = bs->opaque; +    qemu_co_mutex_lock(&s->lock); +    ret = vvfat_write(bs, sector_num, buf, nb_sectors); +    qemu_co_mutex_unlock(&s->lock); +    return ret; +} + +static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, +	int64_t sector_num, int nb_sectors, int* n) +{ +    BDRVVVFATState* s = bs->opaque; +    *n = s->sector_count - sector_num; +    if (*n > nb_sectors) { +        *n = nb_sectors; +    } else if (*n < 0) { +        return 0; +    } +    return BDRV_BLOCK_DATA; +} + +static int write_target_commit(BlockDriverState *bs, int64_t sector_num, +	const uint8_t* buffer, int nb_sectors) { +    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); +    return try_commit(s); +} + +static void write_target_close(BlockDriverState *bs) { +    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); +    bdrv_unref(s->qcow); +    g_free(s->qcow_filename); +} + +static BlockDriver vvfat_write_target = { +    .format_name        = "vvfat_write_target", +    .bdrv_write         = write_target_commit, +    .bdrv_close         = write_target_close, +}; + +static int enable_write_target(BDRVVVFATState *s, Error **errp) +{ +    BlockDriver *bdrv_qcow = NULL; +    QemuOpts *opts = NULL; +    int ret; +    int size = sector2cluster(s, s->sector_count); +    s->used_clusters = calloc(size, 1); + +    array_init(&(s->commits), sizeof(commit_t)); + +    s->qcow_filename = g_malloc(PATH_MAX); +    ret = get_tmp_filename(s->qcow_filename, PATH_MAX); +    if (ret < 0) { +        error_setg_errno(errp, -ret, "can't create temporary file"); +        goto err; +    } + +    bdrv_qcow = bdrv_find_format("qcow"); +    if (!bdrv_qcow) { +        error_setg(errp, "Failed to locate qcow driver"); +        ret = -ENOENT; +        goto err; +    } + +    opts = qemu_opts_create(bdrv_qcow->create_opts, NULL, 0, &error_abort); +    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, s->sector_count * 512, +                        &error_abort); +    qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, "fat:", &error_abort); + +    ret = bdrv_create(bdrv_qcow, s->qcow_filename, opts, errp); +    qemu_opts_del(opts); +    if (ret < 0) { +        goto err; +    } + +    s->qcow = NULL; +    ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, NULL, +                    BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, +                    bdrv_qcow, errp); +    if (ret < 0) { +        goto err; +    } + +#ifndef _WIN32 +    unlink(s->qcow_filename); +#endif + +    bdrv_set_backing_hd(s->bs, bdrv_new()); +    s->bs->backing_hd->drv = &vvfat_write_target; +    s->bs->backing_hd->opaque = g_new(void *, 1); +    *(void**)s->bs->backing_hd->opaque = s; + +    return 0; + +err: +    g_free(s->qcow_filename); +    s->qcow_filename = NULL; +    return ret; +} + +static void vvfat_close(BlockDriverState *bs) 
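+/* free the per-drive arrays and, if a qcow overlay is in use, drop its
+ * migration blocker */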
+{ +    BDRVVVFATState *s = bs->opaque; + +    vvfat_close_current_file(s); +    array_free(&(s->fat)); +    array_free(&(s->directory)); +    array_free(&(s->mapping)); +    g_free(s->cluster_buffer); + +    if (s->qcow) { +        migrate_del_blocker(s->migration_blocker); +        error_free(s->migration_blocker); +    } +} + +static BlockDriver bdrv_vvfat = { +    .format_name            = "vvfat", +    .protocol_name          = "fat", +    .instance_size          = sizeof(BDRVVVFATState), + +    .bdrv_parse_filename    = vvfat_parse_filename, +    .bdrv_file_open         = vvfat_open, +    .bdrv_close             = vvfat_close, +    .bdrv_rebind            = vvfat_rebind, + +    .bdrv_read              = vvfat_co_read, +    .bdrv_write             = vvfat_co_write, +    .bdrv_co_get_block_status = vvfat_co_get_block_status, +}; + +static void bdrv_vvfat_init(void) +{ +    bdrv_register(&bdrv_vvfat); +} + +block_init(bdrv_vvfat_init); + +#ifdef DEBUG +static void checkpoint(void) { +    assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2); +    check1(vvv); +    check2(vvv); +    assert(!vvv->current_mapping || vvv->current_fd || (vvv->current_mapping->mode & MODE_DIRECTORY)); +#if 0 +    if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf) +	fprintf(stderr, "Nonono!\n"); +    mapping_t* mapping; +    direntry_t* direntry; +    assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next); +    assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next); +    if (vvv->mapping.next<47) +	return; +    assert((mapping = array_get(&(vvv->mapping), 47))); +    assert(mapping->dir_index < vvv->directory.next); +    direntry = array_get(&(vvv->directory), mapping->dir_index); +    assert(!memcmp(direntry->name, "USB     H  ", 11) || direntry->name[0]==0); +#endif +} +#endif diff --git a/block/win32-aio.c b/block/win32-aio.c new file mode 100644 index 00000000..64e86827 --- /dev/null +++ b/block/win32-aio.c @@ -0,0 +1,217 @@ +/* + * Block driver for RAW files (win32) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "qemu/timer.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "block/aio.h" +#include "raw-aio.h" +#include "qemu/event_notifier.h" +#include "qemu/iov.h" +#include <windows.h> +#include <winioctl.h> + +#define FTYPE_FILE 0 +#define FTYPE_CD     1 +#define FTYPE_HARDDISK 2 + +struct QEMUWin32AIOState { +    HANDLE hIOCP; +    EventNotifier e; +    int count; +    bool is_aio_context_attached; +}; + +typedef struct QEMUWin32AIOCB { +    BlockAIOCB common; +    struct QEMUWin32AIOState *ctx; +    int nbytes; +    OVERLAPPED ov; +    QEMUIOVector *qiov; +    void *buf; +    bool is_read; +    bool is_linear; +} QEMUWin32AIOCB; + +/* + * Completes an AIO request (calls the callback and frees the ACB). + */ +static void win32_aio_process_completion(QEMUWin32AIOState *s, +    QEMUWin32AIOCB *waiocb, DWORD count) +{ +    int ret; +    s->count--; + +    if (waiocb->ov.Internal != 0) { +        ret = -EIO; +    } else { +        ret = 0; +        if (count < waiocb->nbytes) { +            /* Short reads mean EOF, pad with zeros. */ +            if (waiocb->is_read) { +                qemu_iovec_memset(waiocb->qiov, count, 0, +                    waiocb->qiov->size - count); +            } else { +                ret = -EINVAL; +            } +       } +    } + +    if (!waiocb->is_linear) { +        if (ret == 0 && waiocb->is_read) { +            QEMUIOVector *qiov = waiocb->qiov; +            iov_from_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); +        } +        qemu_vfree(waiocb->buf); +    } + + +    waiocb->common.cb(waiocb->common.opaque, ret); +    qemu_aio_unref(waiocb); +} + +static void win32_aio_completion_cb(EventNotifier *e) +{ +    QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e); +    DWORD count; +    ULONG_PTR key; +    OVERLAPPED *ov; + +    event_notifier_test_and_clear(&s->e); +    while (GetQueuedCompletionStatus(s->hIOCP, &count, &key, &ov, 0)) { +        QEMUWin32AIOCB *waiocb = container_of(ov, QEMUWin32AIOCB, ov); + +        win32_aio_process_completion(s, waiocb, count); +    } +} + +static const AIOCBInfo win32_aiocb_info = { +    .aiocb_size         = sizeof(QEMUWin32AIOCB), +}; + +BlockAIOCB *win32_aio_submit(BlockDriverState *bs, +        QEMUWin32AIOState *aio, HANDLE hfile, +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, +        BlockCompletionFunc *cb, void *opaque, int type) +{ +    struct QEMUWin32AIOCB *waiocb; +    uint64_t offset = sector_num * 512; +    DWORD rc; + +    waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque); +    waiocb->nbytes = nb_sectors * 512; +    waiocb->qiov = qiov; +    waiocb->is_read = (type == QEMU_AIO_READ); + +    if (qiov->niov > 1) { +        waiocb->buf = qemu_try_blockalign(bs, qiov->size); +        if (waiocb->buf == NULL) { +            goto out; +        } +        if (type & QEMU_AIO_WRITE) { +            iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); +        } +        waiocb->is_linear = false; +    } else { +        waiocb->buf = qiov->iov[0].iov_base; +        waiocb->is_linear = true; +    } + +    memset(&waiocb->ov, 0, sizeof(waiocb->ov)); +    waiocb->ov.Offset = (DWORD)offset; +    waiocb->ov.OffsetHigh = (DWORD)(offset >> 32); +    waiocb->ov.hEvent = event_notifier_get_handle(&aio->e); + +    aio->count++; + +    if (type & QEMU_AIO_READ) { +        rc = ReadFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov); +    } else { +        rc = WriteFile(hfile, waiocb->buf, waiocb->nbytes, 
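/* the byte-count out-parameter is NULL: for overlapped I/O the
+         * transferred size is reported by GetQueuedCompletionStatus() */
+                       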
NULL, &waiocb->ov); +    } +    if(rc == 0 && GetLastError() != ERROR_IO_PENDING) { +        goto out_dec_count; +    } +    return &waiocb->common; + +out_dec_count: +    aio->count--; +out: +    qemu_aio_unref(waiocb); +    return NULL; +} + +int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile) +{ +    if (CreateIoCompletionPort(hfile, aio->hIOCP, (ULONG_PTR) 0, 0) == NULL) { +        return -EINVAL; +    } else { +        return 0; +    } +} + +void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, +                                  AioContext *old_context) +{ +    aio_set_event_notifier(old_context, &aio->e, NULL); +    aio->is_aio_context_attached = false; +} + +void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, +                                  AioContext *new_context) +{ +    aio->is_aio_context_attached = true; +    aio_set_event_notifier(new_context, &aio->e, win32_aio_completion_cb); +} + +QEMUWin32AIOState *win32_aio_init(void) +{ +    QEMUWin32AIOState *s; + +    s = g_malloc0(sizeof(*s)); +    if (event_notifier_init(&s->e, false) < 0) { +        goto out_free_state; +    } + +    s->hIOCP = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); +    if (s->hIOCP == NULL) { +        goto out_close_efd; +    } + +    return s; + +out_close_efd: +    event_notifier_cleanup(&s->e); +out_free_state: +    g_free(s); +    return NULL; +} + +void win32_aio_cleanup(QEMUWin32AIOState *aio) +{ +    assert(!aio->is_aio_context_attached); +    CloseHandle(aio->hIOCP); +    event_notifier_cleanup(&aio->e); +    g_free(aio); +} diff --git a/block/write-threshold.c b/block/write-threshold.c new file mode 100644 index 00000000..a53c1f5e --- /dev/null +++ b/block/write-threshold.c @@ -0,0 +1,125 @@ +/* + * QEMU System Emulator block write threshold notification + * + * Copyright Red Hat, Inc. 2014 + * + * Authors: + *  Francesco Romani <fromani@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
+ */ + +#include "block/block_int.h" +#include "block/coroutine.h" +#include "block/write-threshold.h" +#include "qemu/notify.h" +#include "qapi-event.h" +#include "qmp-commands.h" + + +uint64_t bdrv_write_threshold_get(const BlockDriverState *bs) +{ +    return bs->write_threshold_offset; +} + +bool bdrv_write_threshold_is_set(const BlockDriverState *bs) +{ +    return bs->write_threshold_offset > 0; +} + +static void write_threshold_disable(BlockDriverState *bs) +{ +    if (bdrv_write_threshold_is_set(bs)) { +        notifier_with_return_remove(&bs->write_threshold_notifier); +        bs->write_threshold_offset = 0; +    } +} + +uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, +                                       const BdrvTrackedRequest *req) +{ +    if (bdrv_write_threshold_is_set(bs)) { +        if (req->offset > bs->write_threshold_offset) { +            return (req->offset - bs->write_threshold_offset) + req->bytes; +        } +        if ((req->offset + req->bytes) > bs->write_threshold_offset) { +            return (req->offset + req->bytes) - bs->write_threshold_offset; +        } +    } +    return 0; +} + +static int coroutine_fn before_write_notify(NotifierWithReturn *notifier, +                                            void *opaque) +{ +    BdrvTrackedRequest *req = opaque; +    BlockDriverState *bs = req->bs; +    uint64_t amount = 0; + +    amount = bdrv_write_threshold_exceeded(bs, req); +    if (amount > 0) { +        qapi_event_send_block_write_threshold( +            bs->node_name, +            amount, +            bs->write_threshold_offset, +            &error_abort); + +        /* autodisable to avoid flooding the monitor */ +        write_threshold_disable(bs); +    } + +    return 0; /* should always let other notifiers run */ +} + +static void write_threshold_register_notifier(BlockDriverState *bs) +{ +    bs->write_threshold_notifier.notify = before_write_notify; +    notifier_with_return_list_add(&bs->before_write_notifiers, +                                  &bs->write_threshold_notifier); +} + +static void write_threshold_update(BlockDriverState *bs, +                                   int64_t threshold_bytes) +{ +    bs->write_threshold_offset = threshold_bytes; +} + +void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes) +{ +    if (bdrv_write_threshold_is_set(bs)) { +        if (threshold_bytes > 0) { +            write_threshold_update(bs, threshold_bytes); +        } else { +            write_threshold_disable(bs); +        } +    } else { +        if (threshold_bytes > 0) { +            /* avoid multiple registration */ +            write_threshold_register_notifier(bs); +            write_threshold_update(bs, threshold_bytes); +        } +        /* discard bogus disable request */ +    } +} + +void qmp_block_set_write_threshold(const char *node_name, +                                   uint64_t threshold_bytes, +                                   Error **errp) +{ +    BlockDriverState *bs; +    AioContext *aio_context; + +    bs = bdrv_find_node(node_name); +    if (!bs) { +        error_setg(errp, "Device '%s' not found", node_name); +        return; +    } + +    aio_context = bdrv_get_aio_context(bs); +    aio_context_acquire(aio_context); + +    bdrv_write_threshold_set(bs, threshold_bytes); + +    aio_context_release(aio_context); +}  | 