Diffstat (limited to 'migration')
-rw-r--r--   migration/Makefile.objs          |   10
-rw-r--r--   migration/block.c                |  895
-rw-r--r--   migration/exec.c                 |   69
-rw-r--r--   migration/fd.c                   |   88
-rw-r--r--   migration/migration.c            | 1054
-rw-r--r--   migration/qemu-file-buf.c        |  462
-rw-r--r--   migration/qemu-file-internal.h   |   53
-rw-r--r--   migration/qemu-file-stdio.c      |  194
-rw-r--r--   migration/qemu-file-unix.c       |  238
-rw-r--r--   migration/qemu-file.c            |  613
-rw-r--r--   migration/ram.c                  | 1670
-rw-r--r--   migration/rdma.c                 | 3516
-rw-r--r--   migration/savevm.c               | 1605
-rw-r--r--   migration/tcp.c                  |  103
-rw-r--r--   migration/unix.c                 |  103
-rw-r--r--   migration/vmstate.c              |  890
-rw-r--r--   migration/xbzrle.c               |  175
17 files changed, 11738 insertions, 0 deletions
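
The table above is cgit's rendering of the commit's diffstat. An equivalent summary can be regenerated from a QEMU checkout with git itself; the commit hash is not shown on this page, so it stays a placeholder below:

    git show --stat <commit> -- migration/
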
diff --git a/migration/Makefile.objs b/migration/Makefile.objs new file mode 100644 index 00000000..d929e969 --- /dev/null +++ b/migration/Makefile.objs @@ -0,0 +1,10 @@ +common-obj-y += migration.o tcp.o +common-obj-y += vmstate.o +common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o +common-obj-y += xbzrle.o + +common-obj-$(CONFIG_RDMA) += rdma.o +common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o + +common-obj-y += block.o + diff --git a/migration/block.c b/migration/block.c new file mode 100644 index 00000000..ed865ed2 --- /dev/null +++ b/migration/block.c @@ -0,0 +1,895 @@ +/* + * QEMU live block migration + * + * Copyright IBM, Corp. 2009 + * + * Authors: + *  Liran Schour   <lirans@il.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2.  See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu-common.h" +#include "block/block.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "hw/hw.h" +#include "qemu/queue.h" +#include "qemu/timer.h" +#include "migration/block.h" +#include "migration/migration.h" +#include "sysemu/blockdev.h" +#include "sysemu/block-backend.h" +#include <assert.h> + +#define BLOCK_SIZE                       (1 << 20) +#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS) + +#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01 +#define BLK_MIG_FLAG_EOS                0x02 +#define BLK_MIG_FLAG_PROGRESS           0x04 +#define BLK_MIG_FLAG_ZERO_BLOCK         0x08 + +#define MAX_IS_ALLOCATED_SEARCH 65536 + +//#define DEBUG_BLK_MIGRATION + +#ifdef DEBUG_BLK_MIGRATION +#define DPRINTF(fmt, ...) \ +    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ +    do { } while (0) +#endif + +typedef struct BlkMigDevState { +    /* Written during setup phase.  Can be read without a lock.  */ +    BlockDriverState *bs; +    int shared_base; +    int64_t total_sectors; +    QSIMPLEQ_ENTRY(BlkMigDevState) entry; + +    /* Only used by migration thread.  Does not need a lock.  */ +    int bulk_completed; +    int64_t cur_sector; +    int64_t cur_dirty; + +    /* Protected by block migration lock.  */ +    unsigned long *aio_bitmap; +    int64_t completed_sectors; +    BdrvDirtyBitmap *dirty_bitmap; +    Error *blocker; +} BlkMigDevState; + +typedef struct BlkMigBlock { +    /* Only used by migration thread.  */ +    uint8_t *buf; +    BlkMigDevState *bmds; +    int64_t sector; +    int nr_sectors; +    struct iovec iov; +    QEMUIOVector qiov; +    BlockAIOCB *aiocb; + +    /* Protected by block migration lock.  */ +    int ret; +    QSIMPLEQ_ENTRY(BlkMigBlock) entry; +} BlkMigBlock; + +typedef struct BlkMigState { +    /* Written during setup phase.  Can be read without a lock.  */ +    int blk_enable; +    int shared_base; +    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list; +    int64_t total_sector_sum; +    bool zero_blocks; + +    /* Protected by lock.  */ +    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list; +    int submitted; +    int read_done; + +    /* Only used by migration thread.  Does not need a lock.  */ +    int transferred; +    int prev_progress; +    int bulk_completed; + +    /* Lock must be taken _inside_ the iothread lock.  
*/ +    QemuMutex lock; +} BlkMigState; + +static BlkMigState block_mig_state; + +static void blk_mig_lock(void) +{ +    qemu_mutex_lock(&block_mig_state.lock); +} + +static void blk_mig_unlock(void) +{ +    qemu_mutex_unlock(&block_mig_state.lock); +} + +/* Must run outside of the iothread lock during the bulk phase, + * or the VM will stall. + */ + +static void blk_send(QEMUFile *f, BlkMigBlock * blk) +{ +    int len; +    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK; + +    if (block_mig_state.zero_blocks && +        buffer_is_zero(blk->buf, BLOCK_SIZE)) { +        flags |= BLK_MIG_FLAG_ZERO_BLOCK; +    } + +    /* sector number and flags */ +    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS) +                     | flags); + +    /* device name */ +    len = strlen(bdrv_get_device_name(blk->bmds->bs)); +    qemu_put_byte(f, len); +    qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len); + +    /* if a block is zero we need to flush here since the network +     * bandwidth is now a lot higher than the storage device bandwidth. +     * thus if we queue zero blocks we slow down the migration */ +    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { +        qemu_fflush(f); +        return; +    } + +    qemu_put_buffer(f, blk->buf, BLOCK_SIZE); +} + +int blk_mig_active(void) +{ +    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list); +} + +uint64_t blk_mig_bytes_transferred(void) +{ +    BlkMigDevState *bmds; +    uint64_t sum = 0; + +    blk_mig_lock(); +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        sum += bmds->completed_sectors; +    } +    blk_mig_unlock(); +    return sum << BDRV_SECTOR_BITS; +} + +uint64_t blk_mig_bytes_remaining(void) +{ +    return blk_mig_bytes_total() - blk_mig_bytes_transferred(); +} + +uint64_t blk_mig_bytes_total(void) +{ +    BlkMigDevState *bmds; +    uint64_t sum = 0; + +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        sum += bmds->total_sectors; +    } +    return sum << BDRV_SECTOR_BITS; +} + + +/* Called with migration lock held.  */ + +static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) +{ +    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; + +    if (sector < bdrv_nb_sectors(bmds->bs)) { +        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & +            (1UL << (chunk % (sizeof(unsigned long) * 8)))); +    } else { +        return 0; +    } +} + +/* Called with migration lock held.  */ + +static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num, +                             int nb_sectors, int set) +{ +    int64_t start, end; +    unsigned long val, idx, bit; + +    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; +    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; + +    for (; start <= end; start++) { +        idx = start / (sizeof(unsigned long) * 8); +        bit = start % (sizeof(unsigned long) * 8); +        val = bmds->aio_bitmap[idx]; +        if (set) { +            val |= 1UL << bit; +        } else { +            val &= ~(1UL << bit); +        } +        bmds->aio_bitmap[idx] = val; +    } +} + +static void alloc_aio_bitmap(BlkMigDevState *bmds) +{ +    BlockDriverState *bs = bmds->bs; +    int64_t bitmap_size; + +    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; +    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; + +    bmds->aio_bitmap = g_malloc0(bitmap_size); +} + +/* Never hold migration lock when yielding to the main loop!  
*/ + +static void blk_mig_read_cb(void *opaque, int ret) +{ +    BlkMigBlock *blk = opaque; + +    blk_mig_lock(); +    blk->ret = ret; + +    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry); +    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0); + +    block_mig_state.submitted--; +    block_mig_state.read_done++; +    assert(block_mig_state.submitted >= 0); +    blk_mig_unlock(); +} + +/* Called with no lock taken.  */ + +static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) +{ +    int64_t total_sectors = bmds->total_sectors; +    int64_t cur_sector = bmds->cur_sector; +    BlockDriverState *bs = bmds->bs; +    BlkMigBlock *blk; +    int nr_sectors; + +    if (bmds->shared_base) { +        qemu_mutex_lock_iothread(); +        while (cur_sector < total_sectors && +               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH, +                                  &nr_sectors)) { +            cur_sector += nr_sectors; +        } +        qemu_mutex_unlock_iothread(); +    } + +    if (cur_sector >= total_sectors) { +        bmds->cur_sector = bmds->completed_sectors = total_sectors; +        return 1; +    } + +    bmds->completed_sectors = cur_sector; + +    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1); + +    /* we are going to transfer a full block even if it is not allocated */ +    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; + +    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { +        nr_sectors = total_sectors - cur_sector; +    } + +    blk = g_new(BlkMigBlock, 1); +    blk->buf = g_malloc(BLOCK_SIZE); +    blk->bmds = bmds; +    blk->sector = cur_sector; +    blk->nr_sectors = nr_sectors; + +    blk->iov.iov_base = blk->buf; +    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; +    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); + +    blk_mig_lock(); +    block_mig_state.submitted++; +    blk_mig_unlock(); + +    qemu_mutex_lock_iothread(); +    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, +                                nr_sectors, blk_mig_read_cb, blk); + +    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors); +    qemu_mutex_unlock_iothread(); + +    bmds->cur_sector = cur_sector + nr_sectors; +    return (bmds->cur_sector >= total_sectors); +} + +/* Called with iothread lock taken.  
*/ + +static int set_dirty_tracking(void) +{ +    BlkMigDevState *bmds; +    int ret; + +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE, +                                                      NULL, NULL); +        if (!bmds->dirty_bitmap) { +            ret = -errno; +            goto fail; +        } +    } +    return 0; + +fail: +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        if (bmds->dirty_bitmap) { +            bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); +        } +    } +    return ret; +} + +static void unset_dirty_tracking(void) +{ +    BlkMigDevState *bmds; + +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); +    } +} + +static void init_blk_migration(QEMUFile *f) +{ +    BlockDriverState *bs; +    BlkMigDevState *bmds; +    int64_t sectors; + +    block_mig_state.submitted = 0; +    block_mig_state.read_done = 0; +    block_mig_state.transferred = 0; +    block_mig_state.total_sector_sum = 0; +    block_mig_state.prev_progress = -1; +    block_mig_state.bulk_completed = 0; +    block_mig_state.zero_blocks = migrate_zero_blocks(); + +    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) { +        if (bdrv_is_read_only(bs)) { +            continue; +        } + +        sectors = bdrv_nb_sectors(bs); +        if (sectors <= 0) { +            return; +        } + +        bmds = g_new0(BlkMigDevState, 1); +        bmds->bs = bs; +        bmds->bulk_completed = 0; +        bmds->total_sectors = sectors; +        bmds->completed_sectors = 0; +        bmds->shared_base = block_mig_state.shared_base; +        alloc_aio_bitmap(bmds); +        error_setg(&bmds->blocker, "block device is in use by migration"); +        bdrv_op_block_all(bs, bmds->blocker); +        bdrv_ref(bs); + +        block_mig_state.total_sector_sum += sectors; + +        if (bmds->shared_base) { +            DPRINTF("Start migration for %s with shared base image\n", +                    bdrv_get_device_name(bs)); +        } else { +            DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs)); +        } + +        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry); +    } +} + +/* Called with no lock taken.  
*/ + +static int blk_mig_save_bulked_block(QEMUFile *f) +{ +    int64_t completed_sector_sum = 0; +    BlkMigDevState *bmds; +    int progress; +    int ret = 0; + +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        if (bmds->bulk_completed == 0) { +            if (mig_save_device_bulk(f, bmds) == 1) { +                /* completed bulk section for this device */ +                bmds->bulk_completed = 1; +            } +            completed_sector_sum += bmds->completed_sectors; +            ret = 1; +            break; +        } else { +            completed_sector_sum += bmds->completed_sectors; +        } +    } + +    if (block_mig_state.total_sector_sum != 0) { +        progress = completed_sector_sum * 100 / +                   block_mig_state.total_sector_sum; +    } else { +        progress = 100; +    } +    if (progress != block_mig_state.prev_progress) { +        block_mig_state.prev_progress = progress; +        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS) +                         | BLK_MIG_FLAG_PROGRESS); +        DPRINTF("Completed %d %%\r", progress); +    } + +    return ret; +} + +static void blk_mig_reset_dirty_cursor(void) +{ +    BlkMigDevState *bmds; + +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        bmds->cur_dirty = 0; +    } +} + +/* Called with iothread lock taken.  */ + +static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, +                                 int is_async) +{ +    BlkMigBlock *blk; +    int64_t total_sectors = bmds->total_sectors; +    int64_t sector; +    int nr_sectors; +    int ret = -EIO; + +    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { +        blk_mig_lock(); +        if (bmds_aio_inflight(bmds, sector)) { +            blk_mig_unlock(); +            bdrv_drain(bmds->bs); +        } else { +            blk_mig_unlock(); +        } +        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) { + +            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { +                nr_sectors = total_sectors - sector; +            } else { +                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; +            } +            blk = g_new(BlkMigBlock, 1); +            blk->buf = g_malloc(BLOCK_SIZE); +            blk->bmds = bmds; +            blk->sector = sector; +            blk->nr_sectors = nr_sectors; + +            if (is_async) { +                blk->iov.iov_base = blk->buf; +                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; +                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); + +                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov, +                                            nr_sectors, blk_mig_read_cb, blk); + +                blk_mig_lock(); +                block_mig_state.submitted++; +                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1); +                blk_mig_unlock(); +            } else { +                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors); +                if (ret < 0) { +                    goto error; +                } +                blk_send(f, blk); + +                g_free(blk->buf); +                g_free(blk); +            } + +            bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors); +            break; +        } +        sector += BDRV_SECTORS_PER_DIRTY_CHUNK; +        bmds->cur_dirty = sector; +    } + +    return (bmds->cur_dirty >= bmds->total_sectors); + +error: +    DPRINTF("Error reading sector %" 
PRId64 "\n", sector); +    g_free(blk->buf); +    g_free(blk); +    return ret; +} + +/* Called with iothread lock taken. + * + * return value: + * 0: too much data for max_downtime + * 1: few enough data for max_downtime +*/ +static int blk_mig_save_dirty_block(QEMUFile *f, int is_async) +{ +    BlkMigDevState *bmds; +    int ret = 1; + +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        ret = mig_save_device_dirty(f, bmds, is_async); +        if (ret <= 0) { +            break; +        } +    } + +    return ret; +} + +/* Called with no locks taken.  */ + +static int flush_blks(QEMUFile *f) +{ +    BlkMigBlock *blk; +    int ret = 0; + +    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n", +            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done, +            block_mig_state.transferred); + +    blk_mig_lock(); +    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) { +        if (qemu_file_rate_limit(f)) { +            break; +        } +        if (blk->ret < 0) { +            ret = blk->ret; +            break; +        } + +        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry); +        blk_mig_unlock(); +        blk_send(f, blk); +        blk_mig_lock(); + +        g_free(blk->buf); +        g_free(blk); + +        block_mig_state.read_done--; +        block_mig_state.transferred++; +        assert(block_mig_state.read_done >= 0); +    } +    blk_mig_unlock(); + +    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__, +            block_mig_state.submitted, block_mig_state.read_done, +            block_mig_state.transferred); +    return ret; +} + +/* Called with iothread lock taken.  */ + +static int64_t get_remaining_dirty(void) +{ +    BlkMigDevState *bmds; +    int64_t dirty = 0; + +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { +        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); +    } + +    return dirty << BDRV_SECTOR_BITS; +} + +/* Called with iothread lock taken.  
*/ + +static void blk_mig_cleanup(void) +{ +    BlkMigDevState *bmds; +    BlkMigBlock *blk; + +    bdrv_drain_all(); + +    unset_dirty_tracking(); + +    blk_mig_lock(); +    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { +        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); +        bdrv_op_unblock_all(bmds->bs, bmds->blocker); +        error_free(bmds->blocker); +        bdrv_unref(bmds->bs); +        g_free(bmds->aio_bitmap); +        g_free(bmds); +    } + +    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) { +        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry); +        g_free(blk->buf); +        g_free(blk); +    } +    blk_mig_unlock(); +} + +static void block_migration_cancel(void *opaque) +{ +    blk_mig_cleanup(); +} + +static int block_save_setup(QEMUFile *f, void *opaque) +{ +    int ret; + +    DPRINTF("Enter save live setup submitted %d transferred %d\n", +            block_mig_state.submitted, block_mig_state.transferred); + +    qemu_mutex_lock_iothread(); +    init_blk_migration(f); + +    /* start track dirty blocks */ +    ret = set_dirty_tracking(); + +    if (ret) { +        qemu_mutex_unlock_iothread(); +        return ret; +    } + +    qemu_mutex_unlock_iothread(); + +    ret = flush_blks(f); +    blk_mig_reset_dirty_cursor(); +    qemu_put_be64(f, BLK_MIG_FLAG_EOS); + +    return ret; +} + +static int block_save_iterate(QEMUFile *f, void *opaque) +{ +    int ret; +    int64_t last_ftell = qemu_ftell(f); +    int64_t delta_ftell; + +    DPRINTF("Enter save live iterate submitted %d transferred %d\n", +            block_mig_state.submitted, block_mig_state.transferred); + +    ret = flush_blks(f); +    if (ret) { +        return ret; +    } + +    blk_mig_reset_dirty_cursor(); + +    /* control the rate of transfer */ +    blk_mig_lock(); +    while ((block_mig_state.submitted + +            block_mig_state.read_done) * BLOCK_SIZE < +           qemu_file_get_rate_limit(f)) { +        blk_mig_unlock(); +        if (block_mig_state.bulk_completed == 0) { +            /* first finish the bulk phase */ +            if (blk_mig_save_bulked_block(f) == 0) { +                /* finished saving bulk on all devices */ +                block_mig_state.bulk_completed = 1; +            } +            ret = 0; +        } else { +            /* Always called with iothread lock taken for +             * simplicity, block_save_complete also calls it. +             */ +            qemu_mutex_lock_iothread(); +            ret = blk_mig_save_dirty_block(f, 1); +            qemu_mutex_unlock_iothread(); +        } +        if (ret < 0) { +            return ret; +        } +        blk_mig_lock(); +        if (ret != 0) { +            /* no more dirty blocks */ +            break; +        } +    } +    blk_mig_unlock(); + +    ret = flush_blks(f); +    if (ret) { +        return ret; +    } + +    qemu_put_be64(f, BLK_MIG_FLAG_EOS); +    delta_ftell = qemu_ftell(f) - last_ftell; +    if (delta_ftell > 0) { +        return 1; +    } else if (delta_ftell < 0) { +        return -1; +    } else { +        return 0; +    } +} + +/* Called with iothread lock taken.  
*/ + +static int block_save_complete(QEMUFile *f, void *opaque) +{ +    int ret; + +    DPRINTF("Enter save live complete submitted %d transferred %d\n", +            block_mig_state.submitted, block_mig_state.transferred); + +    ret = flush_blks(f); +    if (ret) { +        return ret; +    } + +    blk_mig_reset_dirty_cursor(); + +    /* we know for sure that save bulk is completed and +       all async read completed */ +    blk_mig_lock(); +    assert(block_mig_state.submitted == 0); +    blk_mig_unlock(); + +    do { +        ret = blk_mig_save_dirty_block(f, 0); +        if (ret < 0) { +            return ret; +        } +    } while (ret == 0); + +    /* report completion */ +    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS); + +    DPRINTF("Block migration completed\n"); + +    qemu_put_be64(f, BLK_MIG_FLAG_EOS); + +    blk_mig_cleanup(); +    return 0; +} + +static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) +{ +    /* Estimate pending number of bytes to send */ +    uint64_t pending; + +    qemu_mutex_lock_iothread(); +    blk_mig_lock(); +    pending = get_remaining_dirty() + +                       block_mig_state.submitted * BLOCK_SIZE + +                       block_mig_state.read_done * BLOCK_SIZE; + +    /* Report at least one block pending during bulk phase */ +    if (pending <= max_size && !block_mig_state.bulk_completed) { +        pending = max_size + BLOCK_SIZE; +    } +    blk_mig_unlock(); +    qemu_mutex_unlock_iothread(); + +    DPRINTF("Enter save live pending  %" PRIu64 "\n", pending); +    return pending; +} + +static int block_load(QEMUFile *f, void *opaque, int version_id) +{ +    static int banner_printed; +    int len, flags; +    char device_name[256]; +    int64_t addr; +    BlockDriverState *bs, *bs_prev = NULL; +    BlockBackend *blk; +    uint8_t *buf; +    int64_t total_sectors = 0; +    int nr_sectors; +    int ret; + +    do { +        addr = qemu_get_be64(f); + +        flags = addr & ~BDRV_SECTOR_MASK; +        addr >>= BDRV_SECTOR_BITS; + +        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) { +            /* get device name */ +            len = qemu_get_byte(f); +            qemu_get_buffer(f, (uint8_t *)device_name, len); +            device_name[len] = '\0'; + +            blk = blk_by_name(device_name); +            if (!blk) { +                fprintf(stderr, "Error unknown block device %s\n", +                        device_name); +                return -EINVAL; +            } +            bs = blk_bs(blk); + +            if (bs != bs_prev) { +                bs_prev = bs; +                total_sectors = bdrv_nb_sectors(bs); +                if (total_sectors <= 0) { +                    error_report("Error getting length of block device %s", +                                 device_name); +                    return -EINVAL; +                } +            } + +            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) { +                nr_sectors = total_sectors - addr; +            } else { +                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; +            } + +            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { +                ret = bdrv_write_zeroes(bs, addr, nr_sectors, +                                        BDRV_REQ_MAY_UNMAP); +            } else { +                buf = g_malloc(BLOCK_SIZE); +                qemu_get_buffer(f, buf, BLOCK_SIZE); +                ret = bdrv_write(bs, addr, buf, nr_sectors); +                g_free(buf); +            } + +            
if (ret < 0) { +                return ret; +            } +        } else if (flags & BLK_MIG_FLAG_PROGRESS) { +            if (!banner_printed) { +                printf("Receiving block device images\n"); +                banner_printed = 1; +            } +            printf("Completed %d %%%c", (int)addr, +                   (addr == 100) ? '\n' : '\r'); +            fflush(stdout); +        } else if (!(flags & BLK_MIG_FLAG_EOS)) { +            fprintf(stderr, "Unknown block migration flags: %#x\n", flags); +            return -EINVAL; +        } +        ret = qemu_file_get_error(f); +        if (ret != 0) { +            return ret; +        } +    } while (!(flags & BLK_MIG_FLAG_EOS)); + +    return 0; +} + +static void block_set_params(const MigrationParams *params, void *opaque) +{ +    block_mig_state.blk_enable = params->blk; +    block_mig_state.shared_base = params->shared; + +    /* shared base means that blk_enable = 1 */ +    block_mig_state.blk_enable |= params->shared; +} + +static bool block_is_active(void *opaque) +{ +    return block_mig_state.blk_enable == 1; +} + +static SaveVMHandlers savevm_block_handlers = { +    .set_params = block_set_params, +    .save_live_setup = block_save_setup, +    .save_live_iterate = block_save_iterate, +    .save_live_complete = block_save_complete, +    .save_live_pending = block_save_pending, +    .load_state = block_load, +    .cancel = block_migration_cancel, +    .is_active = block_is_active, +}; + +void blk_mig_init(void) +{ +    QSIMPLEQ_INIT(&block_mig_state.bmds_list); +    QSIMPLEQ_INIT(&block_mig_state.blk_list); +    qemu_mutex_init(&block_mig_state.lock); + +    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers, +                         &block_mig_state); +} diff --git a/migration/exec.c b/migration/exec.c new file mode 100644 index 00000000..8406d2bb --- /dev/null +++ b/migration/exec.c @@ -0,0 +1,69 @@ +/* + * QEMU live migration + * + * Copyright IBM, Corp. 2008 + * Copyright Dell MessageOne 2008 + * + * Authors: + *  Anthony Liguori   <aliguori@us.ibm.com> + *  Charles Duffy     <charles_duffy@messageone.com> + * + * This work is licensed under the terms of the GNU GPL, version 2.  See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu-common.h" +#include "qemu/sockets.h" +#include "qemu/main-loop.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "block/block.h" +#include <sys/types.h> +#include <sys/wait.h> + +//#define DEBUG_MIGRATION_EXEC + +#ifdef DEBUG_MIGRATION_EXEC +#define DPRINTF(fmt, ...) \ +    do { printf("migration-exec: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) 
\ +    do { } while (0) +#endif + +void exec_start_outgoing_migration(MigrationState *s, const char *command, Error **errp) +{ +    s->file = qemu_popen_cmd(command, "w"); +    if (s->file == NULL) { +        error_setg_errno(errp, errno, "failed to popen the migration target"); +        return; +    } + +    migrate_fd_connect(s); +} + +static void exec_accept_incoming_migration(void *opaque) +{ +    QEMUFile *f = opaque; + +    qemu_set_fd_handler(qemu_get_fd(f), NULL, NULL, NULL); +    process_incoming_migration(f); +} + +void exec_start_incoming_migration(const char *command, Error **errp) +{ +    QEMUFile *f; + +    DPRINTF("Attempting to start an incoming migration\n"); +    f = qemu_popen_cmd(command, "r"); +    if(f == NULL) { +        error_setg_errno(errp, errno, "failed to popen the migration source"); +        return; +    } + +    qemu_set_fd_handler(qemu_get_fd(f), exec_accept_incoming_migration, NULL, +                        f); +} diff --git a/migration/fd.c b/migration/fd.c new file mode 100644 index 00000000..3e4bed0e --- /dev/null +++ b/migration/fd.c @@ -0,0 +1,88 @@ +/* + * QEMU live migration via generic fd + * + * Copyright Red Hat, Inc. 2009 + * + * Authors: + *  Chris Lalancette <clalance@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2.  See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu-common.h" +#include "qemu/main-loop.h" +#include "qemu/sockets.h" +#include "migration/migration.h" +#include "monitor/monitor.h" +#include "migration/qemu-file.h" +#include "block/block.h" + +//#define DEBUG_MIGRATION_FD + +#ifdef DEBUG_MIGRATION_FD +#define DPRINTF(fmt, ...) \ +    do { printf("migration-fd: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ +    do { } while (0) +#endif + +static bool fd_is_socket(int fd) +{ +    struct stat stat; +    int ret = fstat(fd, &stat); +    if (ret == -1) { +        /* When in doubt say no */ +        return false; +    } +    return S_ISSOCK(stat.st_mode); +} + +void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) +{ +    int fd = monitor_get_fd(cur_mon, fdname, errp); +    if (fd == -1) { +        return; +    } + +    if (fd_is_socket(fd)) { +        s->file = qemu_fopen_socket(fd, "wb"); +    } else { +        s->file = qemu_fdopen(fd, "wb"); +    } + +    migrate_fd_connect(s); +} + +static void fd_accept_incoming_migration(void *opaque) +{ +    QEMUFile *f = opaque; + +    qemu_set_fd_handler(qemu_get_fd(f), NULL, NULL, NULL); +    process_incoming_migration(f); +} + +void fd_start_incoming_migration(const char *infd, Error **errp) +{ +    int fd; +    QEMUFile *f; + +    DPRINTF("Attempting to start an incoming migration via fd\n"); + +    fd = strtol(infd, NULL, 0); +    if (fd_is_socket(fd)) { +        f = qemu_fopen_socket(fd, "rb"); +    } else { +        f = qemu_fdopen(fd, "rb"); +    } +    if(f == NULL) { +        error_setg_errno(errp, errno, "failed to open the source descriptor"); +        return; +    } + +    qemu_set_fd_handler(fd, fd_accept_incoming_migration, NULL, f); +} diff --git a/migration/migration.c b/migration/migration.c new file mode 100644 index 00000000..c4a7d0b7 --- /dev/null +++ b/migration/migration.c @@ -0,0 +1,1054 @@ +/* + * QEMU live migration + * + * Copyright IBM, Corp. 
2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include "sysemu/sysemu.h"
+#include "block/block.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/sockets.h"
+#include "qemu/rcu.h"
+#include "migration/block.h"
+#include "qemu/thread.h"
+#include "qmp-commands.h"
+#include "trace.h"
+#include "qapi/util.h"
+#include "qapi-event.h"
+
+#define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */
+
+/* Amount of time to allocate to each "chunk" of bandwidth-throttled
+ * data. */
+#define BUFFER_DELAY     100
+#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
+
+/* Default compression thread count */
+#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
+/* Default decompression thread count, usually decompression is at
+ * least 4 times as fast as compression.*/
+#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
+/*0: means nocompress, 1: best speed, ... 9: best compress ratio */
+#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
+
+/* Migration XBZRLE default cache size */
+#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)
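
MAX_THROTTLE, BUFFER_DELAY and XFER_LIMIT_RATIO together define the bandwidth throttle: the rate limit handed to qemu_file_set_rate_limit() further down (in qmp_migrate_set_speed() and migrate_fd_connect()) is bandwidth_limit / XFER_LIMIT_RATIO, i.e. the byte budget for one 100 ms window. A standalone sketch of that arithmetic using this file's defaults (the example program is illustrative, not part of the patch):

    /* Illustrative only: the per-window budget implied by the defaults above. */
    #include <inttypes.h>
    #include <stdio.h>

    #define MAX_THROTTLE     (32 << 20)            /* default bandwidth_limit, bytes/s */
    #define BUFFER_DELAY     100                   /* ms per throttle window */
    #define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY) /* windows per second */

    int main(void)
    {
        int64_t per_window = (int64_t)MAX_THROTTLE / XFER_LIMIT_RATIO;

        /* 32 MiB/s split into ten 100 ms windows -> 3355443 bytes (~3.2 MiB)
         * may be queued before qemu_file_rate_limit() reports the limit hit. */
        printf("budget per %d ms window: %" PRId64 " bytes\n",
               BUFFER_DELAY, per_window);
        return 0;
    }

When the budget is exhausted, migration_thread() (later in this file) sleeps with g_usleep() until the window ends and the counter is reset.
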
+
+static NotifierList migration_state_notifiers =
+    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
+
+static bool deferred_incoming;
+
+/* When we add fault tolerance, we could have several
+   migrations at once.  For now we don't need to add
+   dynamic creation of migration */
+
+/* For outgoing */
+MigrationState *migrate_get_current(void)
+{
+    static MigrationState current_migration = {
+        .state = MIGRATION_STATUS_NONE,
+        .bandwidth_limit = MAX_THROTTLE,
+        .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
+        .mbps = -1,
+        .parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] =
+                DEFAULT_MIGRATE_COMPRESS_LEVEL,
+        .parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] =
+                DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT,
+        .parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
+                DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT,
+    };
+
+    return &current_migration;
+}
+
+/* For incoming */
+static MigrationIncomingState *mis_current;
+
+MigrationIncomingState *migration_incoming_get_current(void)
+{
+    return mis_current;
+}
+
+MigrationIncomingState *migration_incoming_state_new(QEMUFile* f)
+{
+    mis_current = g_malloc0(sizeof(MigrationIncomingState));
+    mis_current->file = f;
+    QLIST_INIT(&mis_current->loadvm_handlers);
+
+    return mis_current;
+}
+
+void migration_incoming_state_destroy(void)
+{
+    loadvm_free_handlers(mis_current);
+    g_free(mis_current);
+    mis_current = NULL;
+}
+
+
+typedef struct {
+    bool optional;
+    uint32_t size;
+    uint8_t runstate[100];
+    RunState state;
+    bool received;
+} GlobalState;
+
+static GlobalState global_state;
+
+int global_state_store(void)
+{
+    if (!runstate_store((char *)global_state.runstate,
+                        sizeof(global_state.runstate))) {
+        error_report("runstate name too big: %s", global_state.runstate);
+        trace_migrate_state_too_big();
+        return -EINVAL;
+    }
+    return 0;
+}
+
+void global_state_store_running(void)
+{
+    const char *state = RunState_lookup[RUN_STATE_RUNNING];
+    strncpy((char *)global_state.runstate,
+           state, sizeof(global_state.runstate));
+}
+
+static bool global_state_received(void)
+{
+    return global_state.received;
+}
+
+static RunState global_state_get_runstate(void)
+{
+    return global_state.state;
+}
+
+void global_state_set_optional(void)
+{
+    global_state.optional = true;
+}
+
+static bool global_state_needed(void *opaque)
+{
+    GlobalState *s = opaque;
+    char *runstate = (char *)s->runstate;
+
+    /* If it is not optional, it is mandatory */
+
+    if (s->optional == false) {
+        return true;
+    }
+
+    /* If state is running or paused, it is not needed */
+
+    if (strcmp(runstate, "running") == 0 ||
+        strcmp(runstate, "paused") == 0) {
+        return false;
+    }
+
+    /* for any other state it is needed */
+    return true;
+}
+
+static int global_state_post_load(void *opaque, int version_id)
+{
+    GlobalState *s = opaque;
+    Error *local_err = NULL;
+    int r;
+    char *runstate = (char *)s->runstate;
+
+    s->received = true;
+    trace_migrate_global_state_post_load(runstate);
+
+    r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX,
+                                -1, &local_err);
+
+    if (r == -1) {
+        if (local_err) {
+            error_report_err(local_err);
+        }
+        return -EINVAL;
+    }
+    s->state = r;
+
+    return 0;
+}
+
+static void global_state_pre_save(void *opaque)
+{
+    GlobalState *s = opaque;
+
+    trace_migrate_global_state_pre_save((char *)s->runstate);
+    s->size = strlen((char *)s->runstate) + 1;
+}
+
+static const VMStateDescription vmstate_globalstate = {
+    .name = "globalstate",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .post_load = global_state_post_load,
+    .pre_save = global_state_pre_save,
+    .needed = global_state_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(size, GlobalState),
+        VMSTATE_BUFFER(runstate, GlobalState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+void register_global_state(void)
+{
+    /* We would use it independently that we receive it */
+    strcpy((char *)&global_state.runstate, "");
+    global_state.received = false;
+    vmstate_register(NULL, 0, &vmstate_globalstate, &global_state);
+}
+
+static void migrate_generate_event(int new_state)
+{
+    if (migrate_use_events()) {
+        qapi_event_send_migration(new_state, &error_abort);
+    }
+}
+
+/*
+ * Called on -incoming with a defer: uri.
+ * The migration can be started later after any parameters have been
+ * changed.
+ */ +static void deferred_incoming_migration(Error **errp) +{ +    if (deferred_incoming) { +        error_setg(errp, "Incoming migration already deferred"); +    } +    deferred_incoming = true; +} + +void qemu_start_incoming_migration(const char *uri, Error **errp) +{ +    const char *p; + +    qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort); +    if (!strcmp(uri, "defer")) { +        deferred_incoming_migration(errp); +    } else if (strstart(uri, "tcp:", &p)) { +        tcp_start_incoming_migration(p, errp); +#ifdef CONFIG_RDMA +    } else if (strstart(uri, "rdma:", &p)) { +        rdma_start_incoming_migration(p, errp); +#endif +#if !defined(WIN32) +    } else if (strstart(uri, "exec:", &p)) { +        exec_start_incoming_migration(p, errp); +    } else if (strstart(uri, "unix:", &p)) { +        unix_start_incoming_migration(p, errp); +    } else if (strstart(uri, "fd:", &p)) { +        fd_start_incoming_migration(p, errp); +#endif +    } else { +        error_setg(errp, "unknown migration protocol: %s", uri); +    } +} + +static void process_incoming_migration_co(void *opaque) +{ +    QEMUFile *f = opaque; +    Error *local_err = NULL; +    int ret; + +    migration_incoming_state_new(f); +    migrate_generate_event(MIGRATION_STATUS_ACTIVE); +    ret = qemu_loadvm_state(f); + +    qemu_fclose(f); +    free_xbzrle_decoded_buf(); +    migration_incoming_state_destroy(); + +    if (ret < 0) { +        migrate_generate_event(MIGRATION_STATUS_FAILED); +        error_report("load of migration failed: %s", strerror(-ret)); +        migrate_decompress_threads_join(); +        exit(EXIT_FAILURE); +    } +    qemu_announce_self(); + +    /* Make sure all file formats flush their mutable metadata */ +    bdrv_invalidate_cache_all(&local_err); +    if (local_err) { +        migrate_generate_event(MIGRATION_STATUS_FAILED); +        error_report_err(local_err); +        migrate_decompress_threads_join(); +        exit(EXIT_FAILURE); +    } + +    /* If global state section was not received or we are in running +       state, we need to obey autostart. Any other state is set with +       runstate_set. */ + +    if (!global_state_received() || +        global_state_get_runstate() == RUN_STATE_RUNNING) { +        if (autostart) { +            vm_start(); +        } else { +            runstate_set(RUN_STATE_PAUSED); +        } +    } else { +        runstate_set(global_state_get_runstate()); +    } +    migrate_decompress_threads_join(); +    /* +     * This must happen after any state changes since as soon as an external +     * observer sees this event they might start to prod at the VM assuming +     * it's ready to use. +     */ +    migrate_generate_event(MIGRATION_STATUS_COMPLETED); +} + +void process_incoming_migration(QEMUFile *f) +{ +    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co); +    int fd = qemu_get_fd(f); + +    assert(fd != -1); +    migrate_decompress_threads_create(); +    qemu_set_nonblock(fd); +    qemu_coroutine_enter(co, f); +} + +/* amount of nanoseconds we are willing to wait for migration to be down. + * the choice of nanoseconds is because it is the maximum resolution that + * get_clock() can achieve. It is an internal measure. 
All user-visible + * units must be in seconds */ +static uint64_t max_downtime = 300000000; + +uint64_t migrate_max_downtime(void) +{ +    return max_downtime; +} + +MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp) +{ +    MigrationCapabilityStatusList *head = NULL; +    MigrationCapabilityStatusList *caps; +    MigrationState *s = migrate_get_current(); +    int i; + +    caps = NULL; /* silence compiler warning */ +    for (i = 0; i < MIGRATION_CAPABILITY_MAX; i++) { +        if (head == NULL) { +            head = g_malloc0(sizeof(*caps)); +            caps = head; +        } else { +            caps->next = g_malloc0(sizeof(*caps)); +            caps = caps->next; +        } +        caps->value = +            g_malloc(sizeof(*caps->value)); +        caps->value->capability = i; +        caps->value->state = s->enabled_capabilities[i]; +    } + +    return head; +} + +MigrationParameters *qmp_query_migrate_parameters(Error **errp) +{ +    MigrationParameters *params; +    MigrationState *s = migrate_get_current(); + +    params = g_malloc0(sizeof(*params)); +    params->compress_level = s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL]; +    params->compress_threads = +            s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS]; +    params->decompress_threads = +            s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; + +    return params; +} + +static void get_xbzrle_cache_stats(MigrationInfo *info) +{ +    if (migrate_use_xbzrle()) { +        info->has_xbzrle_cache = true; +        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); +        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size(); +        info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred(); +        info->xbzrle_cache->pages = xbzrle_mig_pages_transferred(); +        info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss(); +        info->xbzrle_cache->cache_miss_rate = xbzrle_mig_cache_miss_rate(); +        info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow(); +    } +} + +MigrationInfo *qmp_query_migrate(Error **errp) +{ +    MigrationInfo *info = g_malloc0(sizeof(*info)); +    MigrationState *s = migrate_get_current(); + +    switch (s->state) { +    case MIGRATION_STATUS_NONE: +        /* no migration has happened ever */ +        break; +    case MIGRATION_STATUS_SETUP: +        info->has_status = true; +        info->has_total_time = false; +        break; +    case MIGRATION_STATUS_ACTIVE: +    case MIGRATION_STATUS_CANCELLING: +        info->has_status = true; +        info->has_total_time = true; +        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +            - s->total_time; +        info->has_expected_downtime = true; +        info->expected_downtime = s->expected_downtime; +        info->has_setup_time = true; +        info->setup_time = s->setup_time; + +        info->has_ram = true; +        info->ram = g_malloc0(sizeof(*info->ram)); +        info->ram->transferred = ram_bytes_transferred(); +        info->ram->remaining = ram_bytes_remaining(); +        info->ram->total = ram_bytes_total(); +        info->ram->duplicate = dup_mig_pages_transferred(); +        info->ram->skipped = skipped_mig_pages_transferred(); +        info->ram->normal = norm_mig_pages_transferred(); +        info->ram->normal_bytes = norm_mig_bytes_transferred(); +        info->ram->dirty_pages_rate = s->dirty_pages_rate; +        info->ram->mbps = s->mbps; +        info->ram->dirty_sync_count = s->dirty_sync_count; + +        if 
(blk_mig_active()) { +            info->has_disk = true; +            info->disk = g_malloc0(sizeof(*info->disk)); +            info->disk->transferred = blk_mig_bytes_transferred(); +            info->disk->remaining = blk_mig_bytes_remaining(); +            info->disk->total = blk_mig_bytes_total(); +        } + +        get_xbzrle_cache_stats(info); +        break; +    case MIGRATION_STATUS_COMPLETED: +        get_xbzrle_cache_stats(info); + +        info->has_status = true; +        info->has_total_time = true; +        info->total_time = s->total_time; +        info->has_downtime = true; +        info->downtime = s->downtime; +        info->has_setup_time = true; +        info->setup_time = s->setup_time; + +        info->has_ram = true; +        info->ram = g_malloc0(sizeof(*info->ram)); +        info->ram->transferred = ram_bytes_transferred(); +        info->ram->remaining = 0; +        info->ram->total = ram_bytes_total(); +        info->ram->duplicate = dup_mig_pages_transferred(); +        info->ram->skipped = skipped_mig_pages_transferred(); +        info->ram->normal = norm_mig_pages_transferred(); +        info->ram->normal_bytes = norm_mig_bytes_transferred(); +        info->ram->mbps = s->mbps; +        info->ram->dirty_sync_count = s->dirty_sync_count; +        break; +    case MIGRATION_STATUS_FAILED: +        info->has_status = true; +        break; +    case MIGRATION_STATUS_CANCELLED: +        info->has_status = true; +        break; +    } +    info->status = s->state; + +    return info; +} + +void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, +                                  Error **errp) +{ +    MigrationState *s = migrate_get_current(); +    MigrationCapabilityStatusList *cap; + +    if (s->state == MIGRATION_STATUS_ACTIVE || +        s->state == MIGRATION_STATUS_SETUP) { +        error_setg(errp, QERR_MIGRATION_ACTIVE); +        return; +    } + +    for (cap = params; cap; cap = cap->next) { +        s->enabled_capabilities[cap->value->capability] = cap->value->state; +    } +} + +void qmp_migrate_set_parameters(bool has_compress_level, +                                int64_t compress_level, +                                bool has_compress_threads, +                                int64_t compress_threads, +                                bool has_decompress_threads, +                                int64_t decompress_threads, Error **errp) +{ +    MigrationState *s = migrate_get_current(); + +    if (has_compress_level && (compress_level < 0 || compress_level > 9)) { +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", +                   "is invalid, it should be in the range of 0 to 9"); +        return; +    } +    if (has_compress_threads && +            (compress_threads < 1 || compress_threads > 255)) { +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, +                   "compress_threads", +                   "is invalid, it should be in the range of 1 to 255"); +        return; +    } +    if (has_decompress_threads && +            (decompress_threads < 1 || decompress_threads > 255)) { +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, +                   "decompress_threads", +                   "is invalid, it should be in the range of 1 to 255"); +        return; +    } + +    if (has_compress_level) { +        s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level; +    } +    if (has_compress_threads) { +        s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] = 
compress_threads; +    } +    if (has_decompress_threads) { +        s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = +                                                    decompress_threads; +    } +} + +/* shared migration helpers */ + +static void migrate_set_state(MigrationState *s, int old_state, int new_state) +{ +    if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) { +        trace_migrate_set_state(new_state); +        migrate_generate_event(new_state); +    } +} + +static void migrate_fd_cleanup(void *opaque) +{ +    MigrationState *s = opaque; + +    qemu_bh_delete(s->cleanup_bh); +    s->cleanup_bh = NULL; + +    if (s->file) { +        trace_migrate_fd_cleanup(); +        qemu_mutex_unlock_iothread(); +        qemu_thread_join(&s->thread); +        qemu_mutex_lock_iothread(); + +        migrate_compress_threads_join(); +        qemu_fclose(s->file); +        s->file = NULL; +    } + +    assert(s->state != MIGRATION_STATUS_ACTIVE); + +    if (s->state != MIGRATION_STATUS_COMPLETED) { +        qemu_savevm_state_cancel(); +        if (s->state == MIGRATION_STATUS_CANCELLING) { +            migrate_set_state(s, MIGRATION_STATUS_CANCELLING, +                              MIGRATION_STATUS_CANCELLED); +        } +    } + +    notifier_list_notify(&migration_state_notifiers, s); +} + +void migrate_fd_error(MigrationState *s) +{ +    trace_migrate_fd_error(); +    assert(s->file == NULL); +    migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); +    notifier_list_notify(&migration_state_notifiers, s); +} + +static void migrate_fd_cancel(MigrationState *s) +{ +    int old_state ; +    QEMUFile *f = migrate_get_current()->file; +    trace_migrate_fd_cancel(); + +    do { +        old_state = s->state; +        if (old_state != MIGRATION_STATUS_SETUP && +            old_state != MIGRATION_STATUS_ACTIVE) { +            break; +        } +        migrate_set_state(s, old_state, MIGRATION_STATUS_CANCELLING); +    } while (s->state != MIGRATION_STATUS_CANCELLING); + +    /* +     * If we're unlucky the migration code might be stuck somewhere in a +     * send/write while the network has failed and is waiting to timeout; +     * if we've got shutdown(2) available then we can force it to quit. +     * The outgoing qemu file gets closed in migrate_fd_cleanup that is +     * called in a bh, so there is no race against this cancel. 
+     */
+    if (s->state == MIGRATION_STATUS_CANCELLING && f) {
+        qemu_file_shutdown(f);
+    }
+}
+
+void add_migration_state_change_notifier(Notifier *notify)
+{
+    notifier_list_add(&migration_state_notifiers, notify);
+}
+
+void remove_migration_state_change_notifier(Notifier *notify)
+{
+    notifier_remove(notify);
+}
+
+bool migration_in_setup(MigrationState *s)
+{
+    return s->state == MIGRATION_STATUS_SETUP;
+}
+
+bool migration_has_finished(MigrationState *s)
+{
+    return s->state == MIGRATION_STATUS_COMPLETED;
+}
+
+bool migration_has_failed(MigrationState *s)
+{
+    return (s->state == MIGRATION_STATUS_CANCELLED ||
+            s->state == MIGRATION_STATUS_FAILED);
+}
+
+static MigrationState *migrate_init(const MigrationParams *params)
+{
+    MigrationState *s = migrate_get_current();
+    int64_t bandwidth_limit = s->bandwidth_limit;
+    bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
+    int64_t xbzrle_cache_size = s->xbzrle_cache_size;
+    int compress_level = s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL];
+    int compress_thread_count =
+            s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS];
+    int decompress_thread_count =
+            s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS];
+
+    memcpy(enabled_capabilities, s->enabled_capabilities,
+           sizeof(enabled_capabilities));
+
+    memset(s, 0, sizeof(*s));
+    s->params = *params;
+    memcpy(s->enabled_capabilities, enabled_capabilities,
+           sizeof(enabled_capabilities));
+    s->xbzrle_cache_size = xbzrle_cache_size;
+
+    s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level;
+    s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] =
+               compress_thread_count;
+    s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
+               decompress_thread_count;
+    s->bandwidth_limit = bandwidth_limit;
+    migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
+
+    s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    return s;
+}
+
+static GSList *migration_blockers;
+
+void migrate_add_blocker(Error *reason)
+{
+    migration_blockers = g_slist_prepend(migration_blockers, reason);
+}
+
+void migrate_del_blocker(Error *reason)
+{
+    migration_blockers = g_slist_remove(migration_blockers, reason);
+}
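
migrate_add_blocker()/migrate_del_blocker() are the hooks other subsystems use to veto an outgoing migration while they hold state that cannot be transferred. A usage sketch under assumed names (the "mydev" device and its functions are invented for illustration; error_setg(), error_free() and the blocker calls are the real APIs above):

    /* Hypothetical device code showing the intended blocker lifecycle. */
    static Error *mydev_migration_blocker;

    static void mydev_enable_passthrough(void)
    {
        /* This message is what the user sees if they try to migrate. */
        error_setg(&mydev_migration_blocker,
                   "mydev: migration is not supported while passthrough is active");
        migrate_add_blocker(mydev_migration_blocker);
    }

    static void mydev_disable_passthrough(void)
    {
        migrate_del_blocker(mydev_migration_blocker);
        error_free(mydev_migration_blocker);
        mydev_migration_blocker = NULL;
    }

qmp_migrate() below checks migration_blockers before doing anything else and hands the caller error_copy() of the first entry, so the error_setg() text above is exactly what a blocked "migrate" command reports.
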
migration"); +        return; +    } + +    if (qemu_savevm_state_blocked(errp)) { +        return; +    } + +    if (migration_blockers) { +        *errp = error_copy(migration_blockers->data); +        return; +    } + +    /* We are starting a new migration, so we want to start in a clean +       state.  This change is only needed if previous migration +       failed/was cancelled.  We don't use migrate_set_state() because +       we are setting the initial state, not changing it. */ +    s->state = MIGRATION_STATUS_NONE; + +    s = migrate_init(¶ms); + +    if (strstart(uri, "tcp:", &p)) { +        tcp_start_outgoing_migration(s, p, &local_err); +#ifdef CONFIG_RDMA +    } else if (strstart(uri, "rdma:", &p)) { +        rdma_start_outgoing_migration(s, p, &local_err); +#endif +#if !defined(WIN32) +    } else if (strstart(uri, "exec:", &p)) { +        exec_start_outgoing_migration(s, p, &local_err); +    } else if (strstart(uri, "unix:", &p)) { +        unix_start_outgoing_migration(s, p, &local_err); +    } else if (strstart(uri, "fd:", &p)) { +        fd_start_outgoing_migration(s, p, &local_err); +#endif +    } else { +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri", +                   "a valid migration protocol"); +        migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); +        return; +    } + +    if (local_err) { +        migrate_fd_error(s); +        error_propagate(errp, local_err); +        return; +    } +} + +void qmp_migrate_cancel(Error **errp) +{ +    migrate_fd_cancel(migrate_get_current()); +} + +void qmp_migrate_set_cache_size(int64_t value, Error **errp) +{ +    MigrationState *s = migrate_get_current(); +    int64_t new_size; + +    /* Check for truncation */ +    if (value != (size_t)value) { +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", +                   "exceeding address space"); +        return; +    } + +    /* Cache should not be larger than guest ram size */ +    if (value > ram_bytes_total()) { +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", +                   "exceeds guest ram size "); +        return; +    } + +    new_size = xbzrle_cache_resize(value); +    if (new_size < 0) { +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", +                   "is smaller than page size"); +        return; +    } + +    s->xbzrle_cache_size = new_size; +} + +int64_t qmp_query_migrate_cache_size(Error **errp) +{ +    return migrate_xbzrle_cache_size(); +} + +void qmp_migrate_set_speed(int64_t value, Error **errp) +{ +    MigrationState *s; + +    if (value < 0) { +        value = 0; +    } +    if (value > SIZE_MAX) { +        value = SIZE_MAX; +    } + +    s = migrate_get_current(); +    s->bandwidth_limit = value; +    if (s->file) { +        qemu_file_set_rate_limit(s->file, s->bandwidth_limit / XFER_LIMIT_RATIO); +    } +} + +void qmp_migrate_set_downtime(double value, Error **errp) +{ +    value *= 1e9; +    value = MAX(0, MIN(UINT64_MAX, value)); +    max_downtime = (uint64_t)value; +} + +bool migrate_auto_converge(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; +} + +bool migrate_zero_blocks(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; +} + +bool migrate_use_compression(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return 
s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS]; +} + +int migrate_compress_level(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL]; +} + +int migrate_compress_threads(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS]; +} + +int migrate_decompress_threads(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; +} + +bool migrate_use_events(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS]; +} + +int migrate_use_xbzrle(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; +} + +int64_t migrate_xbzrle_cache_size(void) +{ +    MigrationState *s; + +    s = migrate_get_current(); + +    return s->xbzrle_cache_size; +} + +/* migration thread support */ + +static void *migration_thread(void *opaque) +{ +    MigrationState *s = opaque; +    int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); +    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); +    int64_t initial_bytes = 0; +    int64_t max_size = 0; +    int64_t start_time = initial_time; +    bool old_vm_running = false; + +    rcu_register_thread(); + +    qemu_savevm_state_header(s->file); +    qemu_savevm_state_begin(s->file, &s->params); + +    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; +    migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE); + +    while (s->state == MIGRATION_STATUS_ACTIVE) { +        int64_t current_time; +        uint64_t pending_size; + +        if (!qemu_file_rate_limit(s->file)) { +            pending_size = qemu_savevm_state_pending(s->file, max_size); +            trace_migrate_pending(pending_size, max_size); +            if (pending_size && pending_size >= max_size) { +                qemu_savevm_state_iterate(s->file); +            } else { +                int ret; + +                qemu_mutex_lock_iothread(); +                start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); +                qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); +                old_vm_running = runstate_is_running(); + +                ret = global_state_store(); +                if (!ret) { +                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); +                    if (ret >= 0) { +                        qemu_file_set_rate_limit(s->file, INT64_MAX); +                        qemu_savevm_state_complete(s->file); +                    } +                } +                qemu_mutex_unlock_iothread(); + +                if (ret < 0) { +                    migrate_set_state(s, MIGRATION_STATUS_ACTIVE, +                                      MIGRATION_STATUS_FAILED); +                    break; +                } + +                if (!qemu_file_get_error(s->file)) { +                    migrate_set_state(s, MIGRATION_STATUS_ACTIVE, +                                      MIGRATION_STATUS_COMPLETED); +                    break; +                } +            } +        } + +        if (qemu_file_get_error(s->file)) { +            migrate_set_state(s, MIGRATION_STATUS_ACTIVE, +                              MIGRATION_STATUS_FAILED); +            break; +        } +        current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); +        if 
(current_time >= initial_time + BUFFER_DELAY) { +            uint64_t transferred_bytes = qemu_ftell(s->file) - initial_bytes; +            uint64_t time_spent = current_time - initial_time; +            double bandwidth = transferred_bytes / time_spent; +            max_size = bandwidth * migrate_max_downtime() / 1000000; + +            s->mbps = time_spent ? (((double) transferred_bytes * 8.0) / +                    ((double) time_spent / 1000.0)) / 1000.0 / 1000.0 : -1; + +            trace_migrate_transferred(transferred_bytes, time_spent, +                                      bandwidth, max_size); +            /* if we haven't sent anything, we don't want to recalculate +               10000 is a small enough number for our purposes */ +            if (s->dirty_bytes_rate && transferred_bytes > 10000) { +                s->expected_downtime = s->dirty_bytes_rate / bandwidth; +            } + +            qemu_file_reset_rate_limit(s->file); +            initial_time = current_time; +            initial_bytes = qemu_ftell(s->file); +        } +        if (qemu_file_rate_limit(s->file)) { +            /* usleep expects microseconds */ +            g_usleep((initial_time + BUFFER_DELAY - current_time)*1000); +        } +    } + +    qemu_mutex_lock_iothread(); +    if (s->state == MIGRATION_STATUS_COMPLETED) { +        int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); +        uint64_t transferred_bytes = qemu_ftell(s->file); +        s->total_time = end_time - s->total_time; +        s->downtime = end_time - start_time; +        if (s->total_time) { +            s->mbps = (((double) transferred_bytes * 8.0) / +                       ((double) s->total_time)) / 1000; +        } +        runstate_set(RUN_STATE_POSTMIGRATE); +    } else { +        if (old_vm_running) { +            vm_start(); +        } +    } +    qemu_bh_schedule(s->cleanup_bh); +    qemu_mutex_unlock_iothread(); + +    rcu_unregister_thread(); +    return NULL; +} + +void migrate_fd_connect(MigrationState *s) +{ +    /* This is a best 1st approximation. ns to ms */ +    s->expected_downtime = max_downtime/1000000; +    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); + +    qemu_file_set_rate_limit(s->file, +                             s->bandwidth_limit / XFER_LIMIT_RATIO); + +    /* Notify before starting migration thread */ +    notifier_list_notify(&migration_state_notifiers, s); + +    migrate_compress_threads_create(); +    qemu_thread_create(&s->thread, "migration", migration_thread, s, +                       QEMU_THREAD_JOINABLE); +} diff --git a/migration/qemu-file-buf.c b/migration/qemu-file-buf.c new file mode 100644 index 00000000..2de9330c --- /dev/null +++ b/migration/qemu-file-buf.c @@ -0,0 +1,462 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2014 IBM Corp. + * + * Authors: + *  Stefan Berger <stefanb@linux.vnet.ibm.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/iov.h" +#include "qemu/sockets.h" +#include "block/coroutine.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "migration/qemu-file-internal.h" +#include "trace.h" + +#define QSB_CHUNK_SIZE      (1 << 10) +#define QSB_MAX_CHUNK_SIZE  (16 * QSB_CHUNK_SIZE) + +/** + * Create a QEMUSizedBuffer + * This type of buffer uses scatter-gather lists internally and + * can grow to any size. Any data array in the scatter-gather list + * can hold different amount of bytes. + * + * @buffer: Optional buffer to copy into the QSB + * @len: size of initial buffer; if @buffer is given, buffer must + *       hold at least len bytes + * + * Returns a pointer to a QEMUSizedBuffer or NULL on allocation failure + */ +QEMUSizedBuffer *qsb_create(const uint8_t *buffer, size_t len) +{ +    QEMUSizedBuffer *qsb; +    size_t alloc_len, num_chunks, i, to_copy; +    size_t chunk_size = (len > QSB_MAX_CHUNK_SIZE) +                        ? QSB_MAX_CHUNK_SIZE +                        : QSB_CHUNK_SIZE; + +    num_chunks = DIV_ROUND_UP(len ? len : QSB_CHUNK_SIZE, chunk_size); +    alloc_len = num_chunks * chunk_size; + +    qsb = g_try_new0(QEMUSizedBuffer, 1); +    if (!qsb) { +        return NULL; +    } + +    qsb->iov = g_try_new0(struct iovec, num_chunks); +    if (!qsb->iov) { +        g_free(qsb); +        return NULL; +    } + +    qsb->n_iov = num_chunks; + +    for (i = 0; i < num_chunks; i++) { +        qsb->iov[i].iov_base = g_try_malloc0(chunk_size); +        if (!qsb->iov[i].iov_base) { +            /* qsb_free is safe since g_free can cope with NULL */ +            qsb_free(qsb); +            return NULL; +        } + +        qsb->iov[i].iov_len = chunk_size; +        if (buffer) { +            to_copy = (len - qsb->used) > chunk_size +                      ? chunk_size : (len - qsb->used); +            memcpy(qsb->iov[i].iov_base, &buffer[qsb->used], to_copy); +            qsb->used += to_copy; +        } +    } + +    qsb->size = alloc_len; + +    return qsb; +} + +/** + * Free the QEMUSizedBuffer + * + * @qsb: The QEMUSizedBuffer to free + */ +void qsb_free(QEMUSizedBuffer *qsb) +{ +    size_t i; + +    if (!qsb) { +        return; +    } + +    for (i = 0; i < qsb->n_iov; i++) { +        g_free(qsb->iov[i].iov_base); +    } +    g_free(qsb->iov); +    g_free(qsb); +} + +/** + * Get the number of used bytes in the QEMUSizedBuffer + * + * @qsb: A QEMUSizedBuffer + * + * Returns the number of bytes currently used in this buffer + */ +size_t qsb_get_length(const QEMUSizedBuffer *qsb) +{ +    return qsb->used; +} + +/** + * Set the length of the buffer; the primary usage of this + * function is to truncate the number of used bytes in the buffer. + * The size will not be extended beyond the current number of + * allocated bytes in the QEMUSizedBuffer. + * + * @qsb: A QEMUSizedBuffer + * @new_len: The new length of bytes in the buffer + * + * Returns the number of bytes the buffer was truncated or extended + * to. 
+ */ +size_t qsb_set_length(QEMUSizedBuffer *qsb, size_t new_len) +{ +    if (new_len <= qsb->size) { +        qsb->used = new_len; +    } else { +        qsb->used = qsb->size; +    } +    return qsb->used; +} + +/** + * Get the iovec that holds the data for a given position @pos. + * + * @qsb: A QEMUSizedBuffer + * @pos: The index of a byte in the buffer + * @d_off: Pointer to an offset that this function will indicate + *         at what position within the returned iovec the byte + *         is to be found + * + * Returns the index of the iovec that holds the byte at the given + * index @pos in the byte stream; a negative number if the iovec + * for the given position @pos does not exist. + */ +static ssize_t qsb_get_iovec(const QEMUSizedBuffer *qsb, +                             off_t pos, off_t *d_off) +{ +    ssize_t i; +    off_t curr = 0; + +    if (pos > qsb->used) { +        return -1; +    } + +    for (i = 0; i < qsb->n_iov; i++) { +        if (curr + qsb->iov[i].iov_len > pos) { +            *d_off = pos - curr; +            return i; +        } +        curr += qsb->iov[i].iov_len; +    } +    return -1; +} + +/* + * Convert the QEMUSizedBuffer into a flat buffer. + * + * Note: If at all possible, try to avoid this function since it + *       may unnecessarily copy memory around. + * + * @qsb: pointer to QEMUSizedBuffer + * @start: offset to start at + * @count: number of bytes to copy + * @buf: a pointer to a buffer to write into (at least @count bytes) + * + * Returns the number of bytes copied into the output buffer + */ +ssize_t qsb_get_buffer(const QEMUSizedBuffer *qsb, off_t start, +                       size_t count, uint8_t *buffer) +{ +    const struct iovec *iov; +    size_t to_copy, all_copy; +    ssize_t index; +    off_t s_off; +    off_t d_off = 0; +    char *s; + +    if (start > qsb->used) { +        return 0; +    } + +    all_copy = qsb->used - start; +    if (all_copy > count) { +        all_copy = count; +    } else { +        count = all_copy; +    } + +    index = qsb_get_iovec(qsb, start, &s_off); +    if (index < 0) { +        return 0; +    } + +    while (all_copy > 0) { +        iov = &qsb->iov[index]; + +        s = iov->iov_base; + +        to_copy = iov->iov_len - s_off; +        if (to_copy > all_copy) { +            to_copy = all_copy; +        } +        memcpy(&buffer[d_off], &s[s_off], to_copy); + +        d_off += to_copy; +        all_copy -= to_copy; + +        s_off = 0; +        index++; +    } + +    return count; +} + +/** + * Grow the QEMUSizedBuffer to the given size and allocate + * memory for it. + * + * @qsb: A QEMUSizedBuffer + * @new_size: The new size of the buffer + * + * Return: + *    a negative error code in case of memory allocation failure + * or + *    the new size of the buffer. The returned size may be greater or equal + *    to @new_size. + */ +static ssize_t qsb_grow(QEMUSizedBuffer *qsb, size_t new_size) +{ +    size_t needed_chunks, i; + +    if (qsb->size < new_size) { +        struct iovec *new_iov; +        size_t size_diff = new_size - qsb->size; +        size_t chunk_size = (size_diff > QSB_MAX_CHUNK_SIZE) +                             ? 
QSB_MAX_CHUNK_SIZE : QSB_CHUNK_SIZE; + +        needed_chunks = DIV_ROUND_UP(size_diff, chunk_size); + +        new_iov = g_try_new(struct iovec, qsb->n_iov + needed_chunks); +        if (new_iov == NULL) { +            return -ENOMEM; +        } + +        /* Allocate new chunks as needed into new_iov */ +        for (i = qsb->n_iov; i < qsb->n_iov + needed_chunks; i++) { +            new_iov[i].iov_base = g_try_malloc0(chunk_size); +            new_iov[i].iov_len = chunk_size; +            if (!new_iov[i].iov_base) { +                size_t j; + +                /* Free previously allocated new chunks */ +                for (j = qsb->n_iov; j < i; j++) { +                    g_free(new_iov[j].iov_base); +                } +                g_free(new_iov); + +                return -ENOMEM; +            } +        } + +        /* +         * Now we can't get any allocation errors, copy over to new iov +         * and switch. +         */ +        for (i = 0; i < qsb->n_iov; i++) { +            new_iov[i] = qsb->iov[i]; +        } + +        qsb->n_iov += needed_chunks; +        g_free(qsb->iov); +        qsb->iov = new_iov; +        qsb->size += (needed_chunks * chunk_size); +    } + +    return qsb->size; +} + +/** + * Write into the QEMUSizedBuffer at a given position and a given + * number of bytes. This function will automatically grow the + * QEMUSizedBuffer. + * + * @qsb: A QEMUSizedBuffer + * @source: A byte array to copy data from + * @pos: The position within the @qsb to write data to + * @size: The number of bytes to copy into the @qsb + * + * Returns @size or a negative error code in case of memory allocation failure, + *           or with an invalid 'pos' + */ +ssize_t qsb_write_at(QEMUSizedBuffer *qsb, const uint8_t *source, +                     off_t pos, size_t count) +{ +    ssize_t rc = qsb_grow(qsb, pos + count); +    size_t to_copy; +    size_t all_copy = count; +    const struct iovec *iov; +    ssize_t index; +    char *dest; +    off_t d_off, s_off = 0; + +    if (rc < 0) { +        return rc; +    } + +    if (pos + count > qsb->used) { +        qsb->used = pos + count; +    } + +    index = qsb_get_iovec(qsb, pos, &d_off); +    if (index < 0) { +        return -EINVAL; +    } + +    while (all_copy > 0) { +        iov = &qsb->iov[index]; + +        dest = iov->iov_base; + +        to_copy = iov->iov_len - d_off; +        if (to_copy > all_copy) { +            to_copy = all_copy; +        } + +        memcpy(&dest[d_off], &source[s_off], to_copy); + +        s_off += to_copy; +        all_copy -= to_copy; + +        d_off = 0; +        index++; +    } + +    return count; +} + +typedef struct QEMUBuffer { +    QEMUSizedBuffer *qsb; +    QEMUFile *file; +    bool qsb_allocated; +} QEMUBuffer; + +static int buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ +    QEMUBuffer *s = opaque; +    ssize_t len = qsb_get_length(s->qsb) - pos; + +    if (len <= 0) { +        return 0; +    } + +    if (len > size) { +        len = size; +    } +    return qsb_get_buffer(s->qsb, pos, len, buf); +} + +static int buf_put_buffer(void *opaque, const uint8_t *buf, +                          int64_t pos, int size) +{ +    QEMUBuffer *s = opaque; + +    return qsb_write_at(s->qsb, buf, pos, size); +} + +static int buf_close(void *opaque) +{ +    QEMUBuffer *s = opaque; + +    if (s->qsb_allocated) { +        qsb_free(s->qsb); +    } + +    g_free(s); + +    return 0; +} + +const QEMUSizedBuffer *qemu_buf_get(QEMUFile *f) +{ +    QEMUBuffer *p; + +    qemu_fflush(f); 
+ +    p = f->opaque; + +    return p->qsb; +} + +static const QEMUFileOps buf_read_ops = { +    .get_buffer = buf_get_buffer, +    .close =      buf_close, +}; + +static const QEMUFileOps buf_write_ops = { +    .put_buffer = buf_put_buffer, +    .close =      buf_close, +}; + +QEMUFile *qemu_bufopen(const char *mode, QEMUSizedBuffer *input) +{ +    QEMUBuffer *s; + +    if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || +        mode[1] != '\0') { +        error_report("qemu_bufopen: Argument validity check failed"); +        return NULL; +    } + +    s = g_malloc0(sizeof(QEMUBuffer)); +    s->qsb = input; + +    if (s->qsb == NULL) { +        s->qsb = qsb_create(NULL, 0); +        s->qsb_allocated = true; +    } +    if (!s->qsb) { +        g_free(s); +        error_report("qemu_bufopen: qsb_create failed"); +        return NULL; +    } + + +    if (mode[0] == 'r') { +        s->file = qemu_fopen_ops(s, &buf_read_ops); +    } else { +        s->file = qemu_fopen_ops(s, &buf_write_ops); +    } +    return s->file; +} diff --git a/migration/qemu-file-internal.h b/migration/qemu-file-internal.h new file mode 100644 index 00000000..d95e8538 --- /dev/null +++ b/migration/qemu-file-internal.h @@ -0,0 +1,53 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef QEMU_FILE_INTERNAL_H +#define QEMU_FILE_INTERNAL_H 1 + +#include "qemu-common.h" +#include "qemu/iov.h" + +#define IO_BUF_SIZE 32768 +#define MAX_IOV_SIZE MIN(IOV_MAX, 64) + +struct QEMUFile { +    const QEMUFileOps *ops; +    void *opaque; + +    int64_t bytes_xfer; +    int64_t xfer_limit; + +    int64_t pos; /* start of buffer when writing, end of buffer +                    when reading */ +    int buf_index; +    int buf_size; /* 0 when writing */ +    uint8_t buf[IO_BUF_SIZE]; + +    struct iovec iov[MAX_IOV_SIZE]; +    unsigned int iovcnt; + +    int last_error; +}; + +#endif diff --git a/migration/qemu-file-stdio.c b/migration/qemu-file-stdio.c new file mode 100644 index 00000000..285068b3 --- /dev/null +++ b/migration/qemu-file-stdio.c @@ -0,0 +1,194 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "block/coroutine.h" +#include "migration/qemu-file.h" + +typedef struct QEMUFileStdio { +    FILE *stdio_file; +    QEMUFile *file; +} QEMUFileStdio; + +static int stdio_get_fd(void *opaque) +{ +    QEMUFileStdio *s = opaque; + +    return fileno(s->stdio_file); +} + +static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, +                            int size) +{ +    QEMUFileStdio *s = opaque; +    int res; + +    res = fwrite(buf, 1, size, s->stdio_file); + +    if (res != size) { +        return -errno; +    } +    return res; +} + +static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ +    QEMUFileStdio *s = opaque; +    FILE *fp = s->stdio_file; +    int bytes; + +    for (;;) { +        clearerr(fp); +        bytes = fread(buf, 1, size, fp); +        if (bytes != 0 || !ferror(fp)) { +            break; +        } +        if (errno == EAGAIN) { +            yield_until_fd_readable(fileno(fp)); +        } else if (errno != EINTR) { +            break; +        } +    } +    return bytes; +} + +static int stdio_pclose(void *opaque) +{ +    QEMUFileStdio *s = opaque; +    int ret; +    ret = pclose(s->stdio_file); +    if (ret == -1) { +        ret = -errno; +    } else if (!WIFEXITED(ret) || WEXITSTATUS(ret) != 0) { +        /* close succeeded, but non-zero exit code: */ +        ret = -EIO; /* fake errno value */ +    } +    g_free(s); +    return ret; +} + +static int stdio_fclose(void *opaque) +{ +    QEMUFileStdio *s = opaque; +    int ret = 0; + +    if (qemu_file_is_writable(s->file)) { +        int fd = fileno(s->stdio_file); +        struct stat st; + +        ret = fstat(fd, &st); +        if (ret == 0 && S_ISREG(st.st_mode)) { +            /* +             * If the file handle is a regular file make sure the +             * data is flushed to disk before signaling success. 
+             */ +            ret = fsync(fd); +            if (ret != 0) { +                ret = -errno; +                return ret; +            } +        } +    } +    if (fclose(s->stdio_file) == EOF) { +        ret = -errno; +    } +    g_free(s); +    return ret; +} + +static const QEMUFileOps stdio_pipe_read_ops = { +    .get_fd =     stdio_get_fd, +    .get_buffer = stdio_get_buffer, +    .close =      stdio_pclose +}; + +static const QEMUFileOps stdio_pipe_write_ops = { +    .get_fd =     stdio_get_fd, +    .put_buffer = stdio_put_buffer, +    .close =      stdio_pclose +}; + +QEMUFile *qemu_popen_cmd(const char *command, const char *mode) +{ +    FILE *stdio_file; +    QEMUFileStdio *s; + +    if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || mode[1] != 0) { +        fprintf(stderr, "qemu_popen: Argument validity check failed\n"); +        return NULL; +    } + +    stdio_file = popen(command, mode); +    if (stdio_file == NULL) { +        return NULL; +    } + +    s = g_malloc0(sizeof(QEMUFileStdio)); + +    s->stdio_file = stdio_file; + +    if (mode[0] == 'r') { +        s->file = qemu_fopen_ops(s, &stdio_pipe_read_ops); +    } else { +        s->file = qemu_fopen_ops(s, &stdio_pipe_write_ops); +    } +    return s->file; +} + +static const QEMUFileOps stdio_file_read_ops = { +    .get_fd =     stdio_get_fd, +    .get_buffer = stdio_get_buffer, +    .close =      stdio_fclose +}; + +static const QEMUFileOps stdio_file_write_ops = { +    .get_fd =     stdio_get_fd, +    .put_buffer = stdio_put_buffer, +    .close =      stdio_fclose +}; + +QEMUFile *qemu_fopen(const char *filename, const char *mode) +{ +    QEMUFileStdio *s; + +    if (qemu_file_mode_is_not_valid(mode)) { +        return NULL; +    } + +    s = g_malloc0(sizeof(QEMUFileStdio)); + +    s->stdio_file = fopen(filename, mode); +    if (!s->stdio_file) { +        goto fail; +    } + +    if (mode[0] == 'w') { +        s->file = qemu_fopen_ops(s, &stdio_file_write_ops); +    } else { +        s->file = qemu_fopen_ops(s, &stdio_file_read_ops); +    } +    return s->file; +fail: +    g_free(s); +    return NULL; +} diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c new file mode 100644 index 00000000..bfbc0861 --- /dev/null +++ b/migration/qemu-file-unix.c @@ -0,0 +1,238 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu-common.h" +#include "qemu/iov.h" +#include "qemu/sockets.h" +#include "block/coroutine.h" +#include "migration/qemu-file.h" +#include "migration/qemu-file-internal.h" + +typedef struct QEMUFileSocket { +    int fd; +    QEMUFile *file; +} QEMUFileSocket; + +static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, +                                    int64_t pos) +{ +    QEMUFileSocket *s = opaque; +    ssize_t len; +    ssize_t size = iov_size(iov, iovcnt); + +    len = iov_send(s->fd, iov, iovcnt, 0, size); +    if (len < size) { +        len = -socket_error(); +    } +    return len; +} + +static int socket_get_fd(void *opaque) +{ +    QEMUFileSocket *s = opaque; + +    return s->fd; +} + +static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ +    QEMUFileSocket *s = opaque; +    ssize_t len; + +    for (;;) { +        len = qemu_recv(s->fd, buf, size, 0); +        if (len != -1) { +            break; +        } +        if (socket_error() == EAGAIN) { +            yield_until_fd_readable(s->fd); +        } else if (socket_error() != EINTR) { +            break; +        } +    } + +    if (len == -1) { +        len = -socket_error(); +    } +    return len; +} + +static int socket_close(void *opaque) +{ +    QEMUFileSocket *s = opaque; +    closesocket(s->fd); +    g_free(s); +    return 0; +} + +static int socket_shutdown(void *opaque, bool rd, bool wr) +{ +    QEMUFileSocket *s = opaque; + +    if (shutdown(s->fd, rd ? (wr ? SHUT_RDWR : SHUT_RD) : SHUT_WR)) { +        return -errno; +    } else { +        return 0; +    } +} + +static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, +                                  int64_t pos) +{ +    QEMUFileSocket *s = opaque; +    ssize_t len, offset; +    ssize_t size = iov_size(iov, iovcnt); +    ssize_t total = 0; + +    assert(iovcnt > 0); +    offset = 0; +    while (size > 0) { +        /* Find the next start position; skip all full-sized vector elements  */ +        while (offset >= iov[0].iov_len) { +            offset -= iov[0].iov_len; +            iov++, iovcnt--; +        } + +        /* skip `offset' bytes from the (now) first element, undo it on exit */ +        assert(iovcnt > 0); +        iov[0].iov_base += offset; +        iov[0].iov_len -= offset; + +        do { +            len = writev(s->fd, iov, iovcnt); +        } while (len == -1 && errno == EINTR); +        if (len == -1) { +            return -errno; +        } + +        /* Undo the changes above */ +        iov[0].iov_base -= offset; +        iov[0].iov_len += offset; + +        /* Prepare for the next iteration */ +        offset += len; +        total += len; +        size -= len; +    } + +    return total; +} + +static int unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ +    QEMUFileSocket *s = opaque; +    ssize_t len; + +    for (;;) { +        len = read(s->fd, buf, size); +        if (len != -1) { +            break; +        } +        if (errno == EAGAIN) { +            yield_until_fd_readable(s->fd); +        } else if (errno != EINTR) { +            break; +        } +    } + +    if (len == -1) { +        len = -errno; +    } +    return len; +} + +static int unix_close(void *opaque) +{ +    QEMUFileSocket *s = opaque; +    close(s->fd); +    g_free(s); +    return 0; +} + +static const QEMUFileOps unix_read_ops = { +    .get_fd =     socket_get_fd, +    .get_buffer = unix_get_buffer, +    .close =      unix_close +}; + +static const 
QEMUFileOps unix_write_ops = { +    .get_fd =     socket_get_fd, +    .writev_buffer = unix_writev_buffer, +    .close =      unix_close +}; + +QEMUFile *qemu_fdopen(int fd, const char *mode) +{ +    QEMUFileSocket *s; + +    if (mode == NULL || +        (mode[0] != 'r' && mode[0] != 'w') || +        mode[1] != 'b' || mode[2] != 0) { +        fprintf(stderr, "qemu_fdopen: Argument validity check failed\n"); +        return NULL; +    } + +    s = g_malloc0(sizeof(QEMUFileSocket)); +    s->fd = fd; + +    if (mode[0] == 'r') { +        s->file = qemu_fopen_ops(s, &unix_read_ops); +    } else { +        s->file = qemu_fopen_ops(s, &unix_write_ops); +    } +    return s->file; +} + +static const QEMUFileOps socket_read_ops = { +    .get_fd     = socket_get_fd, +    .get_buffer = socket_get_buffer, +    .close      = socket_close, +    .shut_down  = socket_shutdown + +}; + +static const QEMUFileOps socket_write_ops = { +    .get_fd        = socket_get_fd, +    .writev_buffer = socket_writev_buffer, +    .close         = socket_close, +    .shut_down     = socket_shutdown +}; + +QEMUFile *qemu_fopen_socket(int fd, const char *mode) +{ +    QEMUFileSocket *s; + +    if (qemu_file_mode_is_not_valid(mode)) { +        return NULL; +    } + +    s = g_malloc0(sizeof(QEMUFileSocket)); +    s->fd = fd; +    if (mode[0] == 'w') { +        qemu_set_block(s->fd); +        s->file = qemu_fopen_ops(s, &socket_write_ops); +    } else { +        s->file = qemu_fopen_ops(s, &socket_read_ops); +    } +    return s->file; +} diff --git a/migration/qemu-file.c b/migration/qemu-file.c new file mode 100644 index 00000000..6bb3dc15 --- /dev/null +++ b/migration/qemu-file.c @@ -0,0 +1,613 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <zlib.h> +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/iov.h" +#include "qemu/sockets.h" +#include "block/coroutine.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "migration/qemu-file-internal.h" +#include "trace.h" + +/* + * Stop a file from being read/written - not all backing files can do this + * typically only sockets can. 
+ */ +int qemu_file_shutdown(QEMUFile *f) +{ +    if (!f->ops->shut_down) { +        return -ENOSYS; +    } +    return f->ops->shut_down(f->opaque, true, true); +} + +bool qemu_file_mode_is_not_valid(const char *mode) +{ +    if (mode == NULL || +        (mode[0] != 'r' && mode[0] != 'w') || +        mode[1] != 'b' || mode[2] != 0) { +        fprintf(stderr, "qemu_fopen: Argument validity check failed\n"); +        return true; +    } + +    return false; +} + +QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops) +{ +    QEMUFile *f; + +    f = g_malloc0(sizeof(QEMUFile)); + +    f->opaque = opaque; +    f->ops = ops; +    return f; +} + +/* + * Get last error for stream f + * + * Return negative error value if there has been an error on previous + * operations, return 0 if no error happened. + * + */ +int qemu_file_get_error(QEMUFile *f) +{ +    return f->last_error; +} + +void qemu_file_set_error(QEMUFile *f, int ret) +{ +    if (f->last_error == 0) { +        f->last_error = ret; +    } +} + +bool qemu_file_is_writable(QEMUFile *f) +{ +    return f->ops->writev_buffer || f->ops->put_buffer; +} + +/** + * Flushes QEMUFile buffer + * + * If there is writev_buffer QEMUFileOps it uses it otherwise uses + * put_buffer ops. + */ +void qemu_fflush(QEMUFile *f) +{ +    ssize_t ret = 0; + +    if (!qemu_file_is_writable(f)) { +        return; +    } + +    if (f->ops->writev_buffer) { +        if (f->iovcnt > 0) { +            ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos); +        } +    } else { +        if (f->buf_index > 0) { +            ret = f->ops->put_buffer(f->opaque, f->buf, f->pos, f->buf_index); +        } +    } +    if (ret >= 0) { +        f->pos += ret; +    } +    f->buf_index = 0; +    f->iovcnt = 0; +    if (ret < 0) { +        qemu_file_set_error(f, ret); +    } +} + +void ram_control_before_iterate(QEMUFile *f, uint64_t flags) +{ +    int ret = 0; + +    if (f->ops->before_ram_iterate) { +        ret = f->ops->before_ram_iterate(f, f->opaque, flags, NULL); +        if (ret < 0) { +            qemu_file_set_error(f, ret); +        } +    } +} + +void ram_control_after_iterate(QEMUFile *f, uint64_t flags) +{ +    int ret = 0; + +    if (f->ops->after_ram_iterate) { +        ret = f->ops->after_ram_iterate(f, f->opaque, flags, NULL); +        if (ret < 0) { +            qemu_file_set_error(f, ret); +        } +    } +} + +void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data) +{ +    int ret = -EINVAL; + +    if (f->ops->hook_ram_load) { +        ret = f->ops->hook_ram_load(f, f->opaque, flags, data); +        if (ret < 0) { +            qemu_file_set_error(f, ret); +        } +    } else { +        /* +         * Hook is a hook specifically requested by the source sending a flag +         * that expects there to be a hook on the destination. 
+         */ +        if (flags == RAM_CONTROL_HOOK) { +            qemu_file_set_error(f, ret); +        } +    } +} + +size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, +                             ram_addr_t offset, size_t size, +                             uint64_t *bytes_sent) +{ +    if (f->ops->save_page) { +        int ret = f->ops->save_page(f, f->opaque, block_offset, +                                    offset, size, bytes_sent); + +        if (ret != RAM_SAVE_CONTROL_DELAYED) { +            if (bytes_sent && *bytes_sent > 0) { +                qemu_update_position(f, *bytes_sent); +            } else if (ret < 0) { +                qemu_file_set_error(f, ret); +            } +        } + +        return ret; +    } + +    return RAM_SAVE_CONTROL_NOT_SUPP; +} + +/* + * Attempt to fill the buffer from the underlying file + * Returns the number of bytes read, or negative value for an error. + * + * Note that it can return a partially full buffer even in a not error/not EOF + * case if the underlying file descriptor gives a short read, and that can + * happen even on a blocking fd. + */ +static ssize_t qemu_fill_buffer(QEMUFile *f) +{ +    int len; +    int pending; + +    assert(!qemu_file_is_writable(f)); + +    pending = f->buf_size - f->buf_index; +    if (pending > 0) { +        memmove(f->buf, f->buf + f->buf_index, pending); +    } +    f->buf_index = 0; +    f->buf_size = pending; + +    len = f->ops->get_buffer(f->opaque, f->buf + pending, f->pos, +                        IO_BUF_SIZE - pending); +    if (len > 0) { +        f->buf_size += len; +        f->pos += len; +    } else if (len == 0) { +        qemu_file_set_error(f, -EIO); +    } else if (len != -EAGAIN) { +        qemu_file_set_error(f, len); +    } + +    return len; +} + +int qemu_get_fd(QEMUFile *f) +{ +    if (f->ops->get_fd) { +        return f->ops->get_fd(f->opaque); +    } +    return -1; +} + +void qemu_update_position(QEMUFile *f, size_t size) +{ +    f->pos += size; +} + +/** Closes the file + * + * Returns negative error value if any error happened on previous operations or + * while closing the file. Returns 0 or positive number on success. + * + * The meaning of return value on success depends on the specific backend + * being used. + */ +int qemu_fclose(QEMUFile *f) +{ +    int ret; +    qemu_fflush(f); +    ret = qemu_file_get_error(f); + +    if (f->ops->close) { +        int ret2 = f->ops->close(f->opaque); +        if (ret >= 0) { +            ret = ret2; +        } +    } +    /* If any error was spotted before closing, we should report it +     * instead of the close() return value. 
+     */ +    if (f->last_error) { +        ret = f->last_error; +    } +    g_free(f); +    trace_qemu_file_fclose(); +    return ret; +} + +static void add_to_iovec(QEMUFile *f, const uint8_t *buf, int size) +{ +    /* check for adjacent buffer and coalesce them */ +    if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base + +        f->iov[f->iovcnt - 1].iov_len) { +        f->iov[f->iovcnt - 1].iov_len += size; +    } else { +        f->iov[f->iovcnt].iov_base = (uint8_t *)buf; +        f->iov[f->iovcnt++].iov_len = size; +    } + +    if (f->iovcnt >= MAX_IOV_SIZE) { +        qemu_fflush(f); +    } +} + +void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, int size) +{ +    if (!f->ops->writev_buffer) { +        qemu_put_buffer(f, buf, size); +        return; +    } + +    if (f->last_error) { +        return; +    } + +    f->bytes_xfer += size; +    add_to_iovec(f, buf, size); +} + +void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size) +{ +    int l; + +    if (f->last_error) { +        return; +    } + +    while (size > 0) { +        l = IO_BUF_SIZE - f->buf_index; +        if (l > size) { +            l = size; +        } +        memcpy(f->buf + f->buf_index, buf, l); +        f->bytes_xfer += l; +        if (f->ops->writev_buffer) { +            add_to_iovec(f, f->buf + f->buf_index, l); +        } +        f->buf_index += l; +        if (f->buf_index == IO_BUF_SIZE) { +            qemu_fflush(f); +        } +        if (qemu_file_get_error(f)) { +            break; +        } +        buf += l; +        size -= l; +    } +} + +void qemu_put_byte(QEMUFile *f, int v) +{ +    if (f->last_error) { +        return; +    } + +    f->buf[f->buf_index] = v; +    f->bytes_xfer++; +    if (f->ops->writev_buffer) { +        add_to_iovec(f, f->buf + f->buf_index, 1); +    } +    f->buf_index++; +    if (f->buf_index == IO_BUF_SIZE) { +        qemu_fflush(f); +    } +} + +void qemu_file_skip(QEMUFile *f, int size) +{ +    if (f->buf_index + size <= f->buf_size) { +        f->buf_index += size; +    } +} + +/* + * Read 'size' bytes from file (at 'offset') without moving the + * pointer and set 'buf' to point to that data. + * + * It will return size bytes unless there was an error, in which case it will + * return as many as it managed to read (assuming blocking fd's which + * all current QEMUFile are) + */ +int qemu_peek_buffer(QEMUFile *f, uint8_t **buf, int size, size_t offset) +{ +    int pending; +    int index; + +    assert(!qemu_file_is_writable(f)); +    assert(offset < IO_BUF_SIZE); +    assert(size <= IO_BUF_SIZE - offset); + +    /* The 1st byte to read from */ +    index = f->buf_index + offset; +    /* The number of available bytes starting at index */ +    pending = f->buf_size - index; + +    /* +     * qemu_fill_buffer might return just a few bytes, even when there isn't +     * an error, so loop collecting them until we get enough. +     */ +    while (pending < size) { +        int received = qemu_fill_buffer(f); + +        if (received <= 0) { +            break; +        } + +        index = f->buf_index + offset; +        pending = f->buf_size - index; +    } + +    if (pending <= 0) { +        return 0; +    } +    if (size > pending) { +        size = pending; +    } + +    *buf = f->buf + index; +    return size; +} + +/* + * Read 'size' bytes of data from the file into buf. + * 'size' can be larger than the internal buffer. 
+ * + * It will return size bytes unless there was an error, in which case it will + * return as many as it managed to read (assuming blocking fd's which + * all current QEMUFile are) + */ +int qemu_get_buffer(QEMUFile *f, uint8_t *buf, int size) +{ +    int pending = size; +    int done = 0; + +    while (pending > 0) { +        int res; +        uint8_t *src; + +        res = qemu_peek_buffer(f, &src, MIN(pending, IO_BUF_SIZE), 0); +        if (res == 0) { +            return done; +        } +        memcpy(buf, src, res); +        qemu_file_skip(f, res); +        buf += res; +        pending -= res; +        done += res; +    } +    return done; +} + +/* + * Peeks a single byte from the buffer; this isn't guaranteed to work if + * offset leaves a gap after the previous read/peeked data. + */ +int qemu_peek_byte(QEMUFile *f, int offset) +{ +    int index = f->buf_index + offset; + +    assert(!qemu_file_is_writable(f)); +    assert(offset < IO_BUF_SIZE); + +    if (index >= f->buf_size) { +        qemu_fill_buffer(f); +        index = f->buf_index + offset; +        if (index >= f->buf_size) { +            return 0; +        } +    } +    return f->buf[index]; +} + +int qemu_get_byte(QEMUFile *f) +{ +    int result; + +    result = qemu_peek_byte(f, 0); +    qemu_file_skip(f, 1); +    return result; +} + +int64_t qemu_ftell_fast(QEMUFile *f) +{ +    int64_t ret = f->pos; +    int i; + +    if (f->ops->writev_buffer) { +        for (i = 0; i < f->iovcnt; i++) { +            ret += f->iov[i].iov_len; +        } +    } else { +        ret += f->buf_index; +    } + +    return ret; +} + +int64_t qemu_ftell(QEMUFile *f) +{ +    qemu_fflush(f); +    return f->pos; +} + +int qemu_file_rate_limit(QEMUFile *f) +{ +    if (qemu_file_get_error(f)) { +        return 1; +    } +    if (f->xfer_limit > 0 && f->bytes_xfer > f->xfer_limit) { +        return 1; +    } +    return 0; +} + +int64_t qemu_file_get_rate_limit(QEMUFile *f) +{ +    return f->xfer_limit; +} + +void qemu_file_set_rate_limit(QEMUFile *f, int64_t limit) +{ +    f->xfer_limit = limit; +} + +void qemu_file_reset_rate_limit(QEMUFile *f) +{ +    f->bytes_xfer = 0; +} + +void qemu_put_be16(QEMUFile *f, unsigned int v) +{ +    qemu_put_byte(f, v >> 8); +    qemu_put_byte(f, v); +} + +void qemu_put_be32(QEMUFile *f, unsigned int v) +{ +    qemu_put_byte(f, v >> 24); +    qemu_put_byte(f, v >> 16); +    qemu_put_byte(f, v >> 8); +    qemu_put_byte(f, v); +} + +void qemu_put_be64(QEMUFile *f, uint64_t v) +{ +    qemu_put_be32(f, v >> 32); +    qemu_put_be32(f, v); +} + +unsigned int qemu_get_be16(QEMUFile *f) +{ +    unsigned int v; +    v = qemu_get_byte(f) << 8; +    v |= qemu_get_byte(f); +    return v; +} + +unsigned int qemu_get_be32(QEMUFile *f) +{ +    unsigned int v; +    v = (unsigned int)qemu_get_byte(f) << 24; +    v |= qemu_get_byte(f) << 16; +    v |= qemu_get_byte(f) << 8; +    v |= qemu_get_byte(f); +    return v; +} + +uint64_t qemu_get_be64(QEMUFile *f) +{ +    uint64_t v; +    v = (uint64_t)qemu_get_be32(f) << 32; +    v |= qemu_get_be32(f); +    return v; +} + +/* compress size bytes of data start at p with specific compression + * level and store the compressed data to the buffer of f. 
+ */
+
+ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
+                                  int level)
+{
+    ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t);
+
+    if (blen < compressBound(size)) {
+        return 0;
+    }
+    if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen,
+                  (Bytef *)p, size, level) != Z_OK) {
+        error_report("Compress Failed!");
+        return 0;
+    }
+    qemu_put_be32(f, blen);
+    f->buf_index += blen;
+    return blen + sizeof(int32_t);
+}
+
+/* Copy the data in the buffer of f_src into the buffer of f_des, and
+ * then reset the buf_index of f_src to 0.
+ */
+
+int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src)
+{
+    int len = 0;
+
+    if (f_src->buf_index > 0) {
+        len = f_src->buf_index;
+        qemu_put_buffer(f_des, f_src->buf, f_src->buf_index);
+        f_src->buf_index = 0;
+    }
+    return len;
+}
+
+/*
+ * Get a string whose length is determined by a single preceding byte.
+ * A preallocated 256 byte buffer must be passed in.
+ * Returns: len on success and a 0 terminated string in the buffer
+ *          else 0
+ *          (Note a 0 length string will return 0 either way)
+ */
+size_t qemu_get_counted_string(QEMUFile *f, char buf[256])
+{
+    size_t len = qemu_get_byte(f);
+    size_t res = qemu_get_buffer(f, (uint8_t *)buf, len);
+
+    buf[res] = 0;
+
+    return res == len ? res : 0;
+}
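qemu_put_compression_data() above defines a small wire format: a big-endian 32-bit length followed by that many bytes of zlib-compressed page data, written only when the worst-case compressBound() output still fits in the current I/O buffer. Below is a minimal read-side sketch, assuming plain zlib uncompress() and the qemu_get_*() helpers from this file; load_compressed_page() is an illustrative name and not part of the patch (the patch's real readers are the decompression threads in ram.c).

/* Illustrative sketch only, not part of this patch. */
static int load_compressed_page(QEMUFile *f, uint8_t *page)
{
    /* Wire format written by qemu_put_compression_data():
     * be32 compressed length, then the zlib stream itself. */
    uint32_t blen = qemu_get_be32(f);
    uint8_t *compbuf = g_malloc(blen);
    uLongf pagelen = TARGET_PAGE_SIZE;
    int ret = 0;

    if (qemu_get_buffer(f, compbuf, blen) != blen) {
        ret = -EIO;                 /* short read from the stream */
    } else if (uncompress((Bytef *)page, &pagelen,
                          (const Bytef *)compbuf, blen) != Z_OK) {
        ret = -EINVAL;              /* corrupt zlib data */
    }
    g_free(compbuf);
    return ret;
}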
diff --git a/migration/ram.c b/migration/ram.c
new file mode 100644
index 00000000..7f007e64
--- /dev/null
+++ b/migration/ram.c
@@ -0,0 +1,1670 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2011-2015 Red Hat Inc
+ *
+ * Authors:
+ *  Juan Quintela <quintela@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <stdint.h>
+#include <zlib.h>
+#include "qemu/bitops.h"
+#include "qemu/bitmap.h"
+#include "qemu/timer.h"
+#include "qemu/main-loop.h"
+#include "migration/migration.h"
+#include "exec/address-spaces.h"
+#include "migration/page_cache.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "exec/ram_addr.h"
+#include "qemu/rcu_queue.h"
+
+#ifdef DEBUG_MIGRATION_RAM
+#define DPRINTF(fmt, ...) \
+    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+static bool mig_throttle_on;
+static int dirty_rate_high_cnt;
+static void check_guest_throttling(void);
+
+static uint64_t bitmap_sync_count;
+
+/***********************************************************/
+/* ram save/restore */
+
+#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
+#define RAM_SAVE_FLAG_COMPRESS 0x02
+#define RAM_SAVE_FLAG_MEM_SIZE 0x04
+#define RAM_SAVE_FLAG_PAGE     0x08
+#define RAM_SAVE_FLAG_EOS      0x10
+#define RAM_SAVE_FLAG_CONTINUE 0x20
+#define RAM_SAVE_FLAG_XBZRLE   0x40
+/* 0x80 is reserved in migration.h; start with 0x100 next */
+#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
+
+static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
+
+static inline bool is_zero_range(uint8_t *p, uint64_t size)
+{
+    return buffer_find_nonzero_offset(p, size) == size;
+}
+
+/* struct containing the XBZRLE cache and a static page
+   used by the compression */
+static struct {
+    /* buffer used for XBZRLE encoding */
+    uint8_t *encoded_buf;
+    /* buffer for storing page content */
+    uint8_t *current_buf;
+    /* Cache for XBZRLE, protected by lock. */
+    PageCache *cache;
+    QemuMutex lock;
+} XBZRLE;
+
+/* buffer used for XBZRLE decoding */
+static uint8_t *xbzrle_decoded_buf;
+
+static void XBZRLE_cache_lock(void)
+{
+    if (migrate_use_xbzrle())
+        qemu_mutex_lock(&XBZRLE.lock);
+}
+
+static void XBZRLE_cache_unlock(void)
+{
+    if (migrate_use_xbzrle())
+        qemu_mutex_unlock(&XBZRLE.lock);
+}
+
+/*
+ * Called from qmp_migrate_set_cache_size() in the main thread, possibly
+ * while a migration is in progress.
+ * A running migration may be using the cache and might finish during this
+ * call, hence changes to the cache are protected by XBZRLE.lock.
+ */
+int64_t xbzrle_cache_resize(int64_t new_size)
+{
+    PageCache *new_cache;
+    int64_t ret;
+
+    if (new_size < TARGET_PAGE_SIZE) {
+        return -1;
+    }
+
+    XBZRLE_cache_lock();
+
+    if (XBZRLE.cache != NULL) {
+        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
+            goto out_new_size;
+        }
+        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
+                                        TARGET_PAGE_SIZE);
+        if (!new_cache) {
+            error_report("Error creating cache");
+            ret = -1;
+            goto out;
+        }
+
+        cache_fini(XBZRLE.cache);
+        XBZRLE.cache = new_cache;
+    }
+
+out_new_size:
+    ret = pow2floor(new_size);
+out:
+    XBZRLE_cache_unlock();
+    return ret;
+}
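The return convention is easy to misread: on success xbzrle_cache_resize() returns pow2floor(new_size), i.e. the size actually in use, which may be smaller than what was requested; -1 is returned only when the request is below TARGET_PAGE_SIZE or a new cache cannot be allocated. A minimal caller sketch, where apply_cache_size() is an illustrative name (the patch's real caller is qmp_migrate_set_cache_size() in migration.c):

/* Illustrative sketch, not part of the patch. */
static int apply_cache_size(MigrationState *s, int64_t requested)
{
    int64_t actual = xbzrle_cache_resize(requested);

    if (actual < 0) {
        return -1;              /* below TARGET_PAGE_SIZE, or no memory */
    }
    /* e.g. a 3 MiB request comes back as 2 MiB (pow2floor) */
    s->xbzrle_cache_size = actual;
    return 0;
}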
+
+/* accounting for migration statistics */
+typedef struct AccountingInfo {
+    uint64_t dup_pages;
+    uint64_t skipped_pages;
+    uint64_t norm_pages;
+    uint64_t iterations;
+    uint64_t xbzrle_bytes;
+    uint64_t xbzrle_pages;
+    uint64_t xbzrle_cache_miss;
+    double xbzrle_cache_miss_rate;
+    uint64_t xbzrle_overflows;
+} AccountingInfo;
+
+static AccountingInfo acct_info;
+
+static void acct_clear(void)
+{
+    memset(&acct_info, 0, sizeof(acct_info));
+}
+
+uint64_t dup_mig_bytes_transferred(void)
+{
+    return acct_info.dup_pages * TARGET_PAGE_SIZE;
+}
+
+uint64_t dup_mig_pages_transferred(void)
+{
+    return acct_info.dup_pages;
+}
+
+uint64_t skipped_mig_bytes_transferred(void)
+{
+    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
+}
+
+uint64_t skipped_mig_pages_transferred(void)
+{
+    return acct_info.skipped_pages;
+}
+
+uint64_t norm_mig_bytes_transferred(void)
+{
+    return acct_info.norm_pages * TARGET_PAGE_SIZE;
+}
+
+uint64_t norm_mig_pages_transferred(void)
+{
+    return acct_info.norm_pages;
+}
+
+uint64_t xbzrle_mig_bytes_transferred(void)
+{
+    return acct_info.xbzrle_bytes;
+}
+
+uint64_t xbzrle_mig_pages_transferred(void)
+{
+    return acct_info.xbzrle_pages;
+}
+
+uint64_t xbzrle_mig_pages_cache_miss(void)
+{
+    return acct_info.xbzrle_cache_miss;
+}
+
+double xbzrle_mig_cache_miss_rate(void)
+{
+    return acct_info.xbzrle_cache_miss_rate;
+}
+
+uint64_t xbzrle_mig_pages_overflow(void)
+{
+    return acct_info.xbzrle_overflows;
+}
+
+/* This is the last block that we have visited searching for dirty pages
+ */
+static RAMBlock *last_seen_block;
+/* This is the last block from where we have sent data */
+static RAMBlock *last_sent_block;
+static ram_addr_t last_offset;
+static unsigned long *migration_bitmap;
+static QemuMutex migration_bitmap_mutex;
+static uint64_t migration_dirty_pages;
+static uint32_t last_version;
+static bool ram_bulk_stage;
+
+struct CompressParam {
+    bool start;
+    bool done;
+    QEMUFile *file;
+    QemuMutex mutex;
+    QemuCond cond;
+    RAMBlock *block;
+    ram_addr_t offset;
+};
+typedef struct CompressParam CompressParam;
+
+struct DecompressParam {
+    bool start;
+    QemuMutex mutex;
+    QemuCond cond;
+    void *des;
+    uint8_t *compbuf;
+    int len;
+};
+typedef struct DecompressParam DecompressParam;
+
+static CompressParam *comp_param;
+static QemuThread *compress_threads;
+/* comp_done_cond is used to wake up the migration thread when
+ * one of the compression threads has finished the compression.
+ * comp_done_lock is used together with comp_done_cond.
+ */
+static QemuMutex *comp_done_lock;
+static QemuCond *comp_done_cond;
+/* The empty QEMUFileOps will be used by file in CompressParam */
+static const QEMUFileOps empty_ops = { };
+
+static bool compression_switch;
+static bool quit_comp_thread;
+static bool quit_decomp_thread;
+static DecompressParam *decomp_param;
+static QemuThread *decompress_threads;
+static uint8_t *compressed_data_buf;
+
+static int do_compress_ram_page(CompressParam *param);
+
+static void *do_data_compress(void *opaque)
+{
+    CompressParam *param = opaque;
+
+    while (!quit_comp_thread) {
+        qemu_mutex_lock(&param->mutex);
+        /* Re-check quit_comp_thread in case terminate_compression_threads
+         * was called just after the while (!quit_comp_thread) check and
+         * just before qemu_mutex_lock(&param->mutex); re-checking here
+         * makes sure the compression thread terminates as expected.
+         */
+        while (!param->start && !quit_comp_thread) {
+            qemu_cond_wait(&param->cond, &param->mutex);
+        }
+        if (!quit_comp_thread) {
+            do_compress_ram_page(param);
+        }
+        param->start = false;
+        qemu_mutex_unlock(&param->mutex);
+
+        qemu_mutex_lock(comp_done_lock);
+        param->done = true;
+        qemu_cond_signal(comp_done_cond);
+        qemu_mutex_unlock(comp_done_lock);
+    }
+
+    return NULL;
+}
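do_data_compress() is the consumer half of a condition-variable handshake: a dispatcher sets param->start under param->mutex and signals param->cond, and when the page is compressed the worker flips param->done under comp_done_lock and signals comp_done_cond. A minimal sketch of the dispatcher half, assuming the structures above; kick_compress_thread() is an illustrative name, and the patch's real dispatcher (later in this file) also moves the compressed bytes from param->file into the migration stream.

/* Illustrative sketch, not part of the patch. */
static void kick_compress_thread(RAMBlock *block, ram_addr_t offset)
{
    int idx, thread_count = migrate_compress_threads();

    qemu_mutex_lock(comp_done_lock);
    for (;;) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                comp_param[idx].done = false;
                qemu_mutex_lock(&comp_param[idx].mutex);
                comp_param[idx].block = block;
                comp_param[idx].offset = offset;
                comp_param[idx].start = true;
                qemu_cond_signal(&comp_param[idx].cond);
                qemu_mutex_unlock(&comp_param[idx].mutex);
                qemu_mutex_unlock(comp_done_lock);
                return;
            }
        }
        /* all workers busy: sleep until one signals comp_done_cond */
        qemu_cond_wait(comp_done_cond, comp_done_lock);
    }
}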
+
+static inline void terminate_compression_threads(void)
+{
+    int idx, thread_count;
+
+    thread_count = migrate_compress_threads();
+    quit_comp_thread = true;
+    for (idx = 0; idx < thread_count; idx++) {
+        qemu_mutex_lock(&comp_param[idx].mutex);
+        qemu_cond_signal(&comp_param[idx].cond);
+        qemu_mutex_unlock(&comp_param[idx].mutex);
+    }
+}
+
+void migrate_compress_threads_join(void)
+{
+    int i, thread_count;
+
+    if (!migrate_use_compression()) {
+        return;
+    }
+    terminate_compression_threads();
+    thread_count = migrate_compress_threads();
+    for (i = 0; i < thread_count; i++) {
+        qemu_thread_join(compress_threads + i);
+        qemu_fclose(comp_param[i].file);
+        qemu_mutex_destroy(&comp_param[i].mutex);
+        qemu_cond_destroy(&comp_param[i].cond);
+    }
+    qemu_mutex_destroy(comp_done_lock);
+    qemu_cond_destroy(comp_done_cond);
+    g_free(compress_threads);
+    g_free(comp_param);
+    g_free(comp_done_cond);
+    g_free(comp_done_lock);
+    compress_threads = NULL;
+    comp_param = NULL;
+    comp_done_cond = NULL;
+    comp_done_lock = NULL;
+}
+
+void migrate_compress_threads_create(void)
+{
+    int i, thread_count;
+
+    if (!migrate_use_compression()) {
+        return;
+    }
+    quit_comp_thread = false;
+    compression_switch = true;
+    thread_count = migrate_compress_threads();
+    compress_threads = g_new0(QemuThread, thread_count);
+    comp_param = g_new0(CompressParam, thread_count);
+    comp_done_cond = g_new0(QemuCond, 1);
+    comp_done_lock = g_new0(QemuMutex, 1);
+    qemu_cond_init(comp_done_cond);
+    qemu_mutex_init(comp_done_lock);
+    for (i = 0; i < thread_count; i++) {
+        /* comp_param[i].file is just used as a dummy buffer to save data;
+         * set its ops to empty.
+         */
+        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
+        comp_param[i].done = true;
+        qemu_mutex_init(&comp_param[i].mutex);
+        qemu_cond_init(&comp_param[i].cond);
+        qemu_thread_create(compress_threads + i, "compress",
+                           do_data_compress, comp_param + i,
+                           QEMU_THREAD_JOINABLE);
+    }
+}
+
+/**
+ * save_page_header: Write page header to wire
+ *
+ * If this is the 1st block, it also writes the block identification
+ *
+ * Returns: Number of bytes written
+ *
+ * @f: QEMUFile where to send the data
+ * @block: block that contains the page we want to send
+ * @offset: offset inside the block for the page
+ *          in the lower bits, it contains flags
+ */
+static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
+{
+    size_t size, len;
+
+    qemu_put_be64(f, offset);
+    size = 8;
+
+    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
+        len = strlen(block->idstr);
+        qemu_put_byte(f, len);
+        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
+        size += 1 + len;
+    }
+    return size;
+}
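save_page_header() fixes the per-page record layout: a big-endian 64-bit word holding the page offset with RAM_SAVE_FLAG_* bits in the low bits, followed, when RAM_SAVE_FLAG_CONTINUE is clear, by a one-byte length and the block idstr. A minimal parse sketch for the receiving side, reusing qemu_get_counted_string() from qemu-file.c; load_page_header() is an illustrative name, since the patch's real parsing is inlined in ram_load():

/* Illustrative sketch, not part of the patch. */
static int load_page_header(QEMUFile *f, ram_addr_t *offset, int *flags,
                            char idstr[256])
{
    uint64_t addr = qemu_get_be64(f);

    *flags = addr & ~TARGET_PAGE_MASK;   /* RAM_SAVE_FLAG_* bits */
    *offset = addr & TARGET_PAGE_MASK;   /* page address within the block */

    if (!(*flags & RAM_SAVE_FLAG_CONTINUE) &&
        !qemu_get_counted_string(f, idstr)) {
        return -EINVAL;                  /* truncated block id */
    }
    return 0;
}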
+ * The important thing is that a stale (not-yet-0'd) page be replaced + * by the new data. + * As a bonus, if the page wasn't in the cache it gets added so that + * when a small write is made into the 0'd page it gets XBZRLE sent + */ +static void xbzrle_cache_zero_page(ram_addr_t current_addr) +{ +    if (ram_bulk_stage || !migrate_use_xbzrle()) { +        return; +    } + +    /* We don't care if this fails to allocate a new cache page +     * as long as it updated an old one */ +    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE, +                 bitmap_sync_count); +} + +#define ENCODING_FLAG_XBZRLE 0x1 + +/** + * save_xbzrle_page: compress and send current page + * + * Returns: 1 means that we wrote the page + *          0 means that page is identical to the one already sent + *          -1 means that xbzrle would be longer than normal + * + * @f: QEMUFile where to send the data + * @current_data: + * @current_addr: + * @block: block that contains the page we want to send + * @offset: offset inside the block for the page + * @last_stage: if we are at the completion stage + * @bytes_transferred: increase it with the number of transferred bytes + */ +static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, +                            ram_addr_t current_addr, RAMBlock *block, +                            ram_addr_t offset, bool last_stage, +                            uint64_t *bytes_transferred) +{ +    int encoded_len = 0, bytes_xbzrle; +    uint8_t *prev_cached_page; + +    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) { +        acct_info.xbzrle_cache_miss++; +        if (!last_stage) { +            if (cache_insert(XBZRLE.cache, current_addr, *current_data, +                             bitmap_sync_count) == -1) { +                return -1; +            } else { +                /* update *current_data when the page has been +                   inserted into cache */ +                *current_data = get_cached_data(XBZRLE.cache, current_addr); +            } +        } +        return -1; +    } + +    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); + +    /* save current buffer into memory */ +    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); + +    /* XBZRLE encoding (if there is no overflow) */ +    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, +                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf, +                                       TARGET_PAGE_SIZE); +    if (encoded_len == 0) { +        DPRINTF("Skipping unmodified page\n"); +        return 0; +    } else if (encoded_len == -1) { +        DPRINTF("Overflow\n"); +        acct_info.xbzrle_overflows++; +        /* update data in the cache */ +        if (!last_stage) { +            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE); +            *current_data = prev_cached_page; +        } +        return -1; +    } + +    /* we need to update the data in the cache, in order to get the same data */ +    if (!last_stage) { +        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); +    } + +    /* Send XBZRLE based compressed page */ +    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE); +    qemu_put_byte(f, ENCODING_FLAG_XBZRLE); +    qemu_put_be16(f, encoded_len); +    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len); +    bytes_xbzrle += encoded_len + 1 + 2; +    acct_info.xbzrle_pages++; +    acct_info.xbzrle_bytes += bytes_xbzrle; +    
*bytes_transferred += bytes_xbzrle;
+
+    return 1;
+}
+
+/* Called with rcu_read_lock() to protect migration_bitmap */
+static inline
+ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
+                                                 ram_addr_t start)
+{
+    unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
+    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
+    uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr));
+    unsigned long size = base + (mr_size >> TARGET_PAGE_BITS);
+    unsigned long *bitmap;
+
+    unsigned long next;
+
+    bitmap = atomic_rcu_read(&migration_bitmap);
+    if (ram_bulk_stage && nr > base) {
+        next = nr + 1;
+    } else {
+        next = find_next_bit(bitmap, size, nr);
+    }
+
+    if (next < size) {
+        clear_bit(next, bitmap);
+        migration_dirty_pages--;
+    }
+    return (next - base) << TARGET_PAGE_BITS;
+}
+
+/* Called with rcu_read_lock() to protect migration_bitmap */
+static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
+{
+    unsigned long *bitmap;
+    bitmap = atomic_rcu_read(&migration_bitmap);
+    migration_dirty_pages +=
+        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
+}
+
+
+/* Fix me: there are too many global variables used in the migration process. */
+static int64_t start_time;
+static int64_t bytes_xfer_prev;
+static int64_t num_dirty_pages_period;
+static uint64_t xbzrle_cache_miss_prev;
+static uint64_t iterations_prev;
+
+static void migration_bitmap_sync_init(void)
+{
+    start_time = 0;
+    bytes_xfer_prev = 0;
+    num_dirty_pages_period = 0;
+    xbzrle_cache_miss_prev = 0;
+    iterations_prev = 0;
+}
+
+/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
+static void migration_bitmap_sync(void)
+{
+    RAMBlock *block;
+    uint64_t num_dirty_pages_init = migration_dirty_pages;
+    MigrationState *s = migrate_get_current();
+    int64_t end_time;
+    int64_t bytes_xfer_now;
+
+    bitmap_sync_count++;
+
+    if (!bytes_xfer_prev) {
+        bytes_xfer_prev = ram_bytes_transferred();
+    }
+
+    if (!start_time) {
+        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    }
+
+    trace_migration_bitmap_sync_start();
+    address_space_sync_dirty_bitmap(&address_space_memory);
+
+    qemu_mutex_lock(&migration_bitmap_mutex);
+    rcu_read_lock();
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
+    }
+    rcu_read_unlock();
+    qemu_mutex_unlock(&migration_bitmap_mutex);
+
+    trace_migration_bitmap_sync_end(migration_dirty_pages
+                                    - num_dirty_pages_init);
+    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
+    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+    /* more than 1 second = 1000 milliseconds */
+    if (end_time > start_time + 1000) {
+        if (migrate_auto_converge()) {
+            /* The following detection logic can be refined later. For now:
+               Check to see if the dirtied bytes exceed half of the approx.
+               amount of bytes that just got transferred since the last time
+               we were in this routine. If that happens >N times (for now
+               N==4) we turn on the throttle down logic */
+            bytes_xfer_now = ram_bytes_transferred();
+            if (s->dirty_pages_rate &&
+                (num_dirty_pages_period * TARGET_PAGE_SIZE >
+                 (bytes_xfer_now - bytes_xfer_prev) / 2) &&
+                (dirty_rate_high_cnt++ > 4)) {
+                trace_migration_throttle();
+                mig_throttle_on = true;
+                dirty_rate_high_cnt = 0;
+            }
+            bytes_xfer_prev = bytes_xfer_now;
+        } else {
+            mig_throttle_on = false;
+        }
+        if (migrate_use_xbzrle()) {
+            if (iterations_prev != acct_info.iterations) {
+                acct_info.xbzrle_cache_miss_rate =
+                   (double)(acct_info.xbzrle_cache_miss -
+                            xbzrle_cache_miss_prev) /
+                   (acct_info.iterations - iterations_prev);
+            }
+            iterations_prev = acct_info.iterations;
+            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
+        }
+        s->dirty_pages_rate = num_dirty_pages_period * 1000
+            / (end_time - start_time);
+        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
+        start_time = end_time;
+        num_dirty_pages_period = 0;
+    }
+    s->dirty_sync_count = bitmap_sync_count;
+}
+
+/**
+ * save_zero_page: Send the zero page to the stream
+ *
+ * Returns: Number of pages written.
+ *
+ * @f: QEMUFile where to send the data
+ * @block: block that contains the page we want to send
+ * @offset: offset inside the block for the page
+ * @p: pointer to the page
+ * @bytes_transferred: increase it with the number of transferred bytes
+ */
+static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
+                          uint8_t *p, uint64_t *bytes_transferred)
+{
+    int pages = -1;
+
+    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
+        acct_info.dup_pages++;
+        *bytes_transferred += save_page_header(f, block,
+                                               offset | RAM_SAVE_FLAG_COMPRESS);
+        qemu_put_byte(f, 0);
+        *bytes_transferred += 1;
+        pages = 1;
+    }
+
+    return pages;
+}
+
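To see why the zero-page path is worth the scan: on the wire a zero page costs the 8-byte header word from save_page_header() (plus the idstr once per block) and a single fill byte, instead of a full TARGET_PAGE_SIZE payload. A hypothetical serializer for the record save_zero_page() emits, following the byte layout of the code above (encode_zero_page() is an invented name; the real writer is the qemu_put_* call sequence itself):

    #include <stdint.h>
    #include <string.h>

    static size_t encode_zero_page(uint8_t *buf, uint64_t offset_and_flags,
                                   const char *idstr /* NULL if CONTINUE */)
    {
        size_t n = 0;
        int i;

        for (i = 7; i >= 0; i--) {      /* big-endian u64, as qemu_put_be64() */
            buf[n++] = offset_and_flags >> (i * 8);
        }
        if (idstr) {                    /* first page of a block only */
            uint8_t len = strlen(idstr);
            buf[n++] = len;
            memcpy(buf + n, idstr, len);
            n += len;
        }
        buf[n++] = 0;                   /* the single zero fill byte */
        return n;                       /* 9 bytes in the CONTINUE case */
    }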
+/**
+ * ram_save_page: Send the given page to the stream
+ *
+ * Returns: Number of pages written.
+ *
+ * @f: QEMUFile where to send the data
+ * @block: block that contains the page we want to send
+ * @offset: offset inside the block for the page
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ */
+static int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
+                         bool last_stage, uint64_t *bytes_transferred)
+{
+    int pages = -1;
+    uint64_t bytes_xmit;
+    ram_addr_t current_addr;
+    MemoryRegion *mr = block->mr;
+    uint8_t *p;
+    int ret;
+    bool send_async = true;
+
+    p = memory_region_get_ram_ptr(mr) + offset;
+
+    /* When in doubt, send the page as a normal page */
+    bytes_xmit = 0;
+    ret = ram_control_save_page(f, block->offset,
+                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
+    if (bytes_xmit) {
+        *bytes_transferred += bytes_xmit;
+        pages = 1;
+    }
+
+    XBZRLE_cache_lock();
+
+    current_addr = block->offset + offset;
+
+    if (block == last_sent_block) {
+        offset |= RAM_SAVE_FLAG_CONTINUE;
+    }
+    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
+        if (ret != RAM_SAVE_CONTROL_DELAYED) {
+            if (bytes_xmit > 0) {
+                acct_info.norm_pages++;
+            } else if (bytes_xmit == 0) {
+                acct_info.dup_pages++;
+            }
+        }
+    } else {
+        pages = save_zero_page(f, block, offset, p, bytes_transferred);
+        if (pages > 0) {
+            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
+             * page would be stale
+             */
+            xbzrle_cache_zero_page(current_addr);
+        } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
+            pages = save_xbzrle_page(f, &p, current_addr, block,
+                                     offset, last_stage, bytes_transferred);
+            if (!last_stage) {
+                /* Can't send this cached data async, since the cache page
+                 * might get updated before it gets to the wire
+                 */
+                send_async = false;
+            }
+        }
+    }
+
+    /* XBZRLE overflow or normal page */
+    if (pages == -1) {
+        *bytes_transferred += save_page_header(f, block,
+                                               offset | RAM_SAVE_FLAG_PAGE);
+        if (send_async) {
+            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+        } else {
+            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+        }
+        *bytes_transferred += TARGET_PAGE_SIZE;
+        pages = 1;
+        acct_info.norm_pages++;
+    }
+
+    XBZRLE_cache_unlock();
+
+    return pages;
+}
+
+static int do_compress_ram_page(CompressParam *param)
+{
+    int bytes_sent, blen;
+    uint8_t *p;
+    RAMBlock *block = param->block;
+    ram_addr_t offset = param->offset;
+
+    p = memory_region_get_ram_ptr(block->mr) + (offset & TARGET_PAGE_MASK);
+
+    bytes_sent = save_page_header(param->file, block, offset |
+                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
+    blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
+                                     migrate_compress_level());
+    bytes_sent += blen;
+
+    return bytes_sent;
+}
+
+static inline void start_compression(CompressParam *param)
+{
+    param->done = false;
+    qemu_mutex_lock(&param->mutex);
+    param->start = true;
+    qemu_cond_signal(&param->cond);
+    qemu_mutex_unlock(&param->mutex);
+}
+
+static inline void start_decompression(DecompressParam *param)
+{
+    qemu_mutex_lock(&param->mutex);
+    param->start = true;
+    qemu_cond_signal(&param->cond);
+    qemu_mutex_unlock(&param->mutex);
+}
+
+static uint64_t bytes_transferred;
+
+static void flush_compressed_data(QEMUFile *f)
+{
+    int idx, len, thread_count;
+
+    if (!migrate_use_compression()) {
+        return;
+    }
+    thread_count = migrate_compress_threads();
+    for (idx = 0; idx < thread_count; idx++) {
+        if (!comp_param[idx].done) {
+            qemu_mutex_lock(comp_done_lock);
+            while (!comp_param[idx].done && !quit_comp_thread) {
+                qemu_cond_wait(comp_done_cond, comp_done_lock);
+            }
+            qemu_mutex_unlock(comp_done_lock);
+        }
+        if (!quit_comp_thread) {
+            len = qemu_put_qemu_file(f, comp_param[idx].file);
+            bytes_transferred += len;
+        }
+    }
+}
+
+static inline void set_compress_params(CompressParam *param, RAMBlock *block,
+                                       ram_addr_t offset)
+{
+    param->block = block;
+    param->offset = offset;
+}
+
+static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
+                                           ram_addr_t offset,
+                                           uint64_t *bytes_transferred)
+{
+    int idx, thread_count, bytes_xmit = -1, pages = -1;
+
+    thread_count = migrate_compress_threads();
+    qemu_mutex_lock(comp_done_lock);
+    while (true) {
+        for (idx = 0; idx < thread_count; idx++) {
+            if (comp_param[idx].done) {
+                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
+                set_compress_params(&comp_param[idx], block, offset);
+                start_compression(&comp_param[idx]);
+                pages = 1;
+                acct_info.norm_pages++;
+                *bytes_transferred += bytes_xmit;
+                break;
+            }
+        }
+        if (pages > 0) {
+            break;
+        } else {
+            qemu_cond_wait(comp_done_cond, comp_done_lock);
+        }
+    }
+    qemu_mutex_unlock(comp_done_lock);
+
+    return pages;
+}
+
+/**
+ * ram_save_compressed_page: compress the given page and send it to the stream
+ *
+ * Returns: Number of pages written.
+ * + * @f: QEMUFile where to send the data + * @block: block that contains the page we want to send + * @offset: offset inside the block for the page + * @last_stage: if we are at the completion stage + * @bytes_transferred: increase it with the number of transferred bytes + */ +static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, +                                    ram_addr_t offset, bool last_stage, +                                    uint64_t *bytes_transferred) +{ +    int pages = -1; +    uint64_t bytes_xmit; +    MemoryRegion *mr = block->mr; +    uint8_t *p; +    int ret; + +    p = memory_region_get_ram_ptr(mr) + offset; + +    bytes_xmit = 0; +    ret = ram_control_save_page(f, block->offset, +                                offset, TARGET_PAGE_SIZE, &bytes_xmit); +    if (bytes_xmit) { +        *bytes_transferred += bytes_xmit; +        pages = 1; +    } +    if (block == last_sent_block) { +        offset |= RAM_SAVE_FLAG_CONTINUE; +    } +    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { +        if (ret != RAM_SAVE_CONTROL_DELAYED) { +            if (bytes_xmit > 0) { +                acct_info.norm_pages++; +            } else if (bytes_xmit == 0) { +                acct_info.dup_pages++; +            } +        } +    } else { +        /* When starting the process of a new block, the first page of +         * the block should be sent out before other pages in the same +         * block, and all the pages in last block should have been sent +         * out, keeping this order is important, because the 'cont' flag +         * is used to avoid resending the block name. +         */ +        if (block != last_sent_block) { +            flush_compressed_data(f); +            pages = save_zero_page(f, block, offset, p, bytes_transferred); +            if (pages == -1) { +                set_compress_params(&comp_param[0], block, offset); +                /* Use the qemu thread to compress the data to make sure the +                 * first page is sent out before other pages +                 */ +                bytes_xmit = do_compress_ram_page(&comp_param[0]); +                acct_info.norm_pages++; +                qemu_put_qemu_file(f, comp_param[0].file); +                *bytes_transferred += bytes_xmit; +                pages = 1; +            } +        } else { +            pages = save_zero_page(f, block, offset, p, bytes_transferred); +            if (pages == -1) { +                pages = compress_page_with_multi_thread(f, block, offset, +                                                        bytes_transferred); +            } +        } +    } + +    return pages; +} + +/** + * ram_find_and_save_block: Finds a dirty page and sends it to f + * + * Called within an RCU critical section. 
+ *
+ * Returns:  The number of pages written
+ *           0 means no dirty pages
+ *
+ * @f: QEMUFile where to send the data
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ */
+
+static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
+                                   uint64_t *bytes_transferred)
+{
+    RAMBlock *block = last_seen_block;
+    ram_addr_t offset = last_offset;
+    bool complete_round = false;
+    int pages = 0;
+    MemoryRegion *mr;
+
+    if (!block) {
+        block = QLIST_FIRST_RCU(&ram_list.blocks);
+    }
+
+    while (true) {
+        mr = block->mr;
+        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
+        if (complete_round && block == last_seen_block &&
+            offset >= last_offset) {
+            break;
+        }
+        if (offset >= block->used_length) {
+            offset = 0;
+            block = QLIST_NEXT_RCU(block, next);
+            if (!block) {
+                block = QLIST_FIRST_RCU(&ram_list.blocks);
+                complete_round = true;
+                ram_bulk_stage = false;
+                if (migrate_use_xbzrle()) {
+                    /* If xbzrle is on, stop using the data compression at this
+                     * point. In theory, xbzrle can do better than compression.
+                     */
+                    flush_compressed_data(f);
+                    compression_switch = false;
+                }
+            }
+        } else {
+            if (compression_switch && migrate_use_compression()) {
+                pages = ram_save_compressed_page(f, block, offset, last_stage,
+                                                 bytes_transferred);
+            } else {
+                pages = ram_save_page(f, block, offset, last_stage,
+                                      bytes_transferred);
+            }
+
+            /* if page is unmodified, continue to the next */
+            if (pages > 0) {
+                last_sent_block = block;
+                break;
+            }
+        }
+    }
+
+    last_seen_block = block;
+    last_offset = offset;
+
+    return pages;
+}
+
+void acct_update_position(QEMUFile *f, size_t size, bool zero)
+{
+    uint64_t pages = size / TARGET_PAGE_SIZE;
+    if (zero) {
+        acct_info.dup_pages += pages;
+    } else {
+        acct_info.norm_pages += pages;
+        bytes_transferred += size;
+        qemu_update_position(f, size);
+    }
+}
+
+static ram_addr_t ram_save_remaining(void)
+{
+    return migration_dirty_pages;
+}
+
+uint64_t ram_bytes_remaining(void)
+{
+    return ram_save_remaining() * TARGET_PAGE_SIZE;
+}
+
+uint64_t ram_bytes_transferred(void)
+{
+    return bytes_transferred;
+}
+
+uint64_t ram_bytes_total(void)
+{
+    RAMBlock *block;
+    uint64_t total = 0;
+
+    rcu_read_lock();
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
+        total += block->used_length;
+    rcu_read_unlock();
+    return total;
+}
+
+void free_xbzrle_decoded_buf(void)
+{
+    g_free(xbzrle_decoded_buf);
+    xbzrle_decoded_buf = NULL;
+}
+
+static void migration_end(void)
+{
+    /* The caller must hold the iothread lock or be in a bh, so there is
+     * no writing race against this migration_bitmap.
+     */
+    unsigned long *bitmap = migration_bitmap;
+    atomic_rcu_set(&migration_bitmap, NULL);
+    if (bitmap) {
+        memory_global_dirty_log_stop();
+        synchronize_rcu();
+        g_free(bitmap);
+    }
+
+    XBZRLE_cache_lock();
+    if (XBZRLE.cache) {
+        cache_fini(XBZRLE.cache);
+        g_free(XBZRLE.encoded_buf);
+        g_free(XBZRLE.current_buf);
+        XBZRLE.cache = NULL;
+        XBZRLE.encoded_buf = NULL;
+        XBZRLE.current_buf = NULL;
+    }
+    XBZRLE_cache_unlock();
+}
+
+static void ram_migration_cancel(void *opaque)
+{
+    migration_end();
+}
+
+static void reset_ram_globals(void)
+{
+    last_seen_block = NULL;
+    last_sent_block = NULL;
+    last_offset = 0;
+    last_version = ram_list.version;
+    ram_bulk_stage = true;
+}
+
+#define MAX_WAIT 50 /* ms, half buffered_file limit */
+
+void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
+{
+    /* called in qemu main thread, so there is
+     * no writing race against this migration_bitmap
+     */
+    if (migration_bitmap) {
+        unsigned long *old_bitmap = migration_bitmap, *bitmap;
+        bitmap = bitmap_new(new);
+
+        /* prevent migration_bitmap from having bits set by
+         * migration_bitmap_sync_range() at the same time; it is safe
+         * for migration if a migration_bitmap bit is cleared
+         * concurrently.
+         */
+        qemu_mutex_lock(&migration_bitmap_mutex);
+        bitmap_copy(bitmap, old_bitmap, old);
+        bitmap_set(bitmap, old, new - old);
+        atomic_rcu_set(&migration_bitmap, bitmap);
+        qemu_mutex_unlock(&migration_bitmap_mutex);
+        migration_dirty_pages += new - old;
+        synchronize_rcu();
+        g_free(old_bitmap);
+    }
+}
+
+/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
+ * a long-running RCU critical section.  When rcu-reclaims in the code
+ * start to become numerous it will be necessary to reduce the
+ * granularity of these critical sections.
+ */
+
+static int ram_save_setup(QEMUFile *f, void *opaque)
+{
+    RAMBlock *block;
+    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
+
+    mig_throttle_on = false;
+    dirty_rate_high_cnt = 0;
+    bitmap_sync_count = 0;
+    migration_bitmap_sync_init();
+    qemu_mutex_init(&migration_bitmap_mutex);
+
+    if (migrate_use_xbzrle()) {
+        XBZRLE_cache_lock();
+        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
+                                  TARGET_PAGE_SIZE,
+                                  TARGET_PAGE_SIZE);
+        if (!XBZRLE.cache) {
+            XBZRLE_cache_unlock();
+            error_report("Error creating cache");
+            return -1;
+        }
+        XBZRLE_cache_unlock();
+
+        /* We prefer not to abort if there is no memory */
+        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
+        if (!XBZRLE.encoded_buf) {
+            error_report("Error allocating encoded_buf");
+            return -1;
+        }
+
+        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
+        if (!XBZRLE.current_buf) {
+            error_report("Error allocating current_buf");
+            g_free(XBZRLE.encoded_buf);
+            XBZRLE.encoded_buf = NULL;
+            return -1;
+        }
+
+        acct_clear();
+    }
+
+    /* iothread lock needed for ram_list.dirty_memory[] */
+    qemu_mutex_lock_iothread();
+    qemu_mutex_lock_ramlist();
+    rcu_read_lock();
+    bytes_transferred = 0;
+    reset_ram_globals();
+
+    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+    migration_bitmap = bitmap_new(ram_bitmap_pages);
+    bitmap_set(migration_bitmap, 0, ram_bitmap_pages);
+
+    /*
+     * Count the total number of pages used by ram blocks not including any
+     * gaps due to alignment or unplugs.
+     */
+    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
+
+    memory_global_dirty_log_start();
+    migration_bitmap_sync();
+    qemu_mutex_unlock_ramlist();
+    qemu_mutex_unlock_iothread();
+
+    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        qemu_put_byte(f, strlen(block->idstr));
+        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
+        qemu_put_be64(f, block->used_length);
+    }
+
+    rcu_read_unlock();
+
+    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
+    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
+
+    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+
+    return 0;
+}
+
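The setup section just written has a simple shape: one be64 word carrying the total RAM size with RAM_SAVE_FLAG_MEM_SIZE in its low bits, then a (length byte, idstr, be64 used_length) triple per RAM block. A hypothetical standalone reader for that shape, assuming the caller passes in a page mask to strip the flag bits (the authoritative decoder is the RAM_SAVE_FLAG_MEM_SIZE case of ram_load() further down):

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t get_be64(FILE *f)
    {
        uint64_t v = 0;
        int i;

        for (i = 0; i < 8; i++) {
            v = (v << 8) | (uint8_t)fgetc(f);
        }
        return v;
    }

    static void read_block_list(FILE *f, uint64_t page_mask)
    {
        uint64_t remaining = get_be64(f) & page_mask; /* flags in low bits */

        while (remaining > 0) {
            char idstr[257];
            uint8_t len = (uint8_t)fgetc(f);

            if (fread(idstr, 1, len, f) != len) {
                return;                               /* truncated stream */
            }
            idstr[len] = '\0';
            remaining -= get_be64(f);                 /* block used_length */
        }
    }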
+static int ram_save_iterate(QEMUFile *f, void *opaque)
+{
+    int ret;
+    int i;
+    int64_t t0;
+    int pages_sent = 0;
+
+    rcu_read_lock();
+    if (ram_list.version != last_version) {
+        reset_ram_globals();
+    }
+
+    /* Read version before ram_list.blocks */
+    smp_rmb();
+
+    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
+
+    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    i = 0;
+    while ((ret = qemu_file_rate_limit(f)) == 0) {
+        int pages;
+
+        pages = ram_find_and_save_block(f, false, &bytes_transferred);
+        /* no more pages to send */
+        if (pages == 0) {
+            break;
+        }
+        pages_sent += pages;
+        acct_info.iterations++;
+        check_guest_throttling();
+        /* we want to check in the 1st loop, just in case it was the 1st time
+           and we had to sync the dirty bitmap.
+           qemu_get_clock_ns() is a bit expensive, so we only check once
+           every few iterations
+        */
+        if ((i & 63) == 0) {
+            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
+            if (t1 > MAX_WAIT) {
+                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
+                        t1, i);
+                break;
+            }
+        }
+        i++;
+    }
+    flush_compressed_data(f);
+    rcu_read_unlock();
+
+    /*
+     * Must occur before EOS (or any QEMUFile operation)
+     * because of RDMA protocol.
+     */
+    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
+
+    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+    bytes_transferred += 8;
+
+    ret = qemu_file_get_error(f);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return pages_sent;
+}
+
+/* Called with iothread lock */
+static int ram_save_complete(QEMUFile *f, void *opaque)
+{
+    rcu_read_lock();
+
+    migration_bitmap_sync();
+
+    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
+
+    /* try transferring iterative blocks of memory */
+
+    /* flush all remaining blocks regardless of rate limiting */
+    while (true) {
+        int pages;
+
+        pages = ram_find_and_save_block(f, true, &bytes_transferred);
+        /* no more blocks to send */
+        if (pages == 0) {
+            break;
+        }
+    }
+
+    flush_compressed_data(f);
+    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
+
+    rcu_read_unlock();
+
+    migration_end();
+    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+
+    return 0;
+}
+
+static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    uint64_t remaining_size;
+
+    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+
+    if (remaining_size < max_size) {
+        qemu_mutex_lock_iothread();
+        rcu_read_lock();
+        migration_bitmap_sync();
+        rcu_read_unlock();
+        qemu_mutex_unlock_iothread();
+        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+    }
+    return remaining_size;
+}
+
+static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
+{
+    unsigned int xh_len;
+    int xh_flags;
+
+    if (!xbzrle_decoded_buf) {
+        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
+    }
+
+    /* extract RLE header */
+    xh_flags = qemu_get_byte(f);
+    xh_len = qemu_get_be16(f);
+
+    if (xh_flags != ENCODING_FLAG_XBZRLE) {
+        error_report("Failed to load XBZRLE page - wrong compression!");
+        return -1;
+    }
+
+    if (xh_len > TARGET_PAGE_SIZE) {
+        error_report("Failed to load XBZRLE page - len overflow!");
+        return -1;
+    }
+    /* load data and decode */
+    qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);
+
+    /* decode RLE */
+    if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
+                             TARGET_PAGE_SIZE) == -1) {
+        error_report("Failed to load XBZRLE page - decode error!");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Must be called from within an RCU critical section.
+ * Returns a pointer from within the RCU-protected ram_list.
+ */
+static inline void *host_from_stream_offset(QEMUFile *f,
+                                            ram_addr_t offset,
+                                            int flags)
+{
+    static RAMBlock *block = NULL;
+    char id[256];
+    uint8_t len;
+
+    if (flags & RAM_SAVE_FLAG_CONTINUE) {
+        if (!block || block->max_length <= offset) {
+            error_report("Ack, bad migration stream!");
+            return NULL;
+        }
+
+        return memory_region_get_ram_ptr(block->mr) + offset;
+    }
+
+    len = qemu_get_byte(f);
+    qemu_get_buffer(f, (uint8_t *)id, len);
+    id[len] = 0;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        if (!strncmp(id, block->idstr, sizeof(id)) &&
+            block->max_length > offset) {
+            return memory_region_get_ram_ptr(block->mr) + offset;
+        }
+    }
+
+    error_report("Can't find block %s!", id);
+    return NULL;
+}
+
+/*
+ * If a page (or a whole RDMA chunk) has been
+ * determined to be zero, then zap it.
+ */
+void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
+{
+    if (ch != 0 || !is_zero_range(host, size)) {
+        memset(host, ch, size);
+    }
+}
+
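The guard in ram_handle_compressed() matters on the destination: most guest RAM starts out as untouched (all-zero, often not yet allocated) pages, and blindly memset()ing them to zero would fault them in and inflate the host's resident set. A minimal standalone restatement of the idea (range_is_zero() is a naive stand-in for is_zero_range(), which in QEMU uses a much faster word-wide scan):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static bool range_is_zero(const uint8_t *p, size_t len)
    {
        size_t i;

        for (i = 0; i < len; i++) {
            if (p[i]) {
                return false;
            }
        }
        return true;
    }

    /* Only touch the page when it actually needs the fill value, so an
     * untouched destination page stays untouched. */
    static void handle_fill(uint8_t *host, uint8_t ch, size_t size)
    {
        if (ch != 0 || !range_is_zero(host, size)) {
            memset(host, ch, size);
        }
    }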
+static void *do_data_decompress(void *opaque)
+{
+    DecompressParam *param = opaque;
+    unsigned long pagesize;
+
+    while (!quit_decomp_thread) {
+        qemu_mutex_lock(&param->mutex);
+        while (!param->start && !quit_decomp_thread) {
+            qemu_cond_wait(&param->cond, &param->mutex);
+            pagesize = TARGET_PAGE_SIZE;
+            if (!quit_decomp_thread) {
+                /* uncompress() can fail, especially when the page was
+                 * dirtied while it was being compressed; that is not a
+                 * problem, because the dirty page will be retransferred
+                 * and uncompress() won't break the data in other pages.
+                 */
+                uncompress((Bytef *)param->des, &pagesize,
+                           (const Bytef *)param->compbuf, param->len);
+            }
+            param->start = false;
+        }
+        qemu_mutex_unlock(&param->mutex);
+    }
+
+    return NULL;
+}
+
+void migrate_decompress_threads_create(void)
+{
+    int i, thread_count;
+
+    thread_count = migrate_decompress_threads();
+    decompress_threads = g_new0(QemuThread, thread_count);
+    decomp_param = g_new0(DecompressParam, thread_count);
+    compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
+    quit_decomp_thread = false;
+    for (i = 0; i < thread_count; i++) {
+        qemu_mutex_init(&decomp_param[i].mutex);
+        qemu_cond_init(&decomp_param[i].cond);
+        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
+        qemu_thread_create(decompress_threads + i, "decompress",
+                           do_data_decompress, decomp_param + i,
+                           QEMU_THREAD_JOINABLE);
+    }
+}
+
+void migrate_decompress_threads_join(void)
+{
+    int i, thread_count;
+
+    quit_decomp_thread = true;
+    thread_count = migrate_decompress_threads();
+    for (i = 0; i < thread_count; i++) {
+        qemu_mutex_lock(&decomp_param[i].mutex);
+        qemu_cond_signal(&decomp_param[i].cond);
+        qemu_mutex_unlock(&decomp_param[i].mutex);
+    }
+    for (i = 0; i < thread_count; i++) {
+        qemu_thread_join(decompress_threads + i);
+        qemu_mutex_destroy(&decomp_param[i].mutex);
+        qemu_cond_destroy(&decomp_param[i].cond);
+        g_free(decomp_param[i].compbuf);
+    }
+    g_free(decompress_threads);
+    g_free(decomp_param);
+    g_free(compressed_data_buf);
+    decompress_threads = NULL;
+    decomp_param = NULL;
+    compressed_data_buf = NULL;
+}
+
+static void decompress_data_with_multi_threads(uint8_t *compbuf,
+                                               void *host, int len)
+{
+    int idx, thread_count;
+
+    thread_count = migrate_decompress_threads();
+    while (true) {
+        for (idx = 0; idx < thread_count; idx++) {
+            if (!decomp_param[idx].start) {
+                memcpy(decomp_param[idx].compbuf, compbuf, len);
+                decomp_param[idx].des = host;
+                decomp_param[idx].len = len;
+                start_decompression(&decomp_param[idx]);
+                break;
+            }
+        }
+        if (idx < thread_count) {
+            break;
+        }
+    }
+}
+
+static int ram_load(QEMUFile *f, void *opaque, int version_id)
+{
+    int flags = 0, ret = 0;
+    static uint64_t seq_iter;
+    int len = 0;
+
+    seq_iter++;
+
+    if (version_id != 4) {
+        ret = -EINVAL;
+    }
+
+    /* This RCU critical section can be very long running.
+     * When RCU reclaims in the code start to become numerous,
+     * it will be necessary to reduce the granularity of this
+     * critical section.
+     */ +    rcu_read_lock(); +    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { +        ram_addr_t addr, total_ram_bytes; +        void *host; +        uint8_t ch; + +        addr = qemu_get_be64(f); +        flags = addr & ~TARGET_PAGE_MASK; +        addr &= TARGET_PAGE_MASK; + +        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { +        case RAM_SAVE_FLAG_MEM_SIZE: +            /* Synchronize RAM block list */ +            total_ram_bytes = addr; +            while (!ret && total_ram_bytes) { +                RAMBlock *block; +                char id[256]; +                ram_addr_t length; + +                len = qemu_get_byte(f); +                qemu_get_buffer(f, (uint8_t *)id, len); +                id[len] = 0; +                length = qemu_get_be64(f); + +                QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { +                    if (!strncmp(id, block->idstr, sizeof(id))) { +                        if (length != block->used_length) { +                            Error *local_err = NULL; + +                            ret = qemu_ram_resize(block->offset, length, &local_err); +                            if (local_err) { +                                error_report_err(local_err); +                            } +                        } +                        ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, +                                              block->idstr); +                        break; +                    } +                } + +                if (!block) { +                    error_report("Unknown ramblock \"%s\", cannot " +                                 "accept migration", id); +                    ret = -EINVAL; +                } + +                total_ram_bytes -= length; +            } +            break; +        case RAM_SAVE_FLAG_COMPRESS: +            host = host_from_stream_offset(f, addr, flags); +            if (!host) { +                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); +                ret = -EINVAL; +                break; +            } +            ch = qemu_get_byte(f); +            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); +            break; +        case RAM_SAVE_FLAG_PAGE: +            host = host_from_stream_offset(f, addr, flags); +            if (!host) { +                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); +                ret = -EINVAL; +                break; +            } +            qemu_get_buffer(f, host, TARGET_PAGE_SIZE); +            break; +        case RAM_SAVE_FLAG_COMPRESS_PAGE: +            host = host_from_stream_offset(f, addr, flags); +            if (!host) { +                error_report("Invalid RAM offset " RAM_ADDR_FMT, addr); +                ret = -EINVAL; +                break; +            } + +            len = qemu_get_be32(f); +            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { +                error_report("Invalid compressed data length: %d", len); +                ret = -EINVAL; +                break; +            } +            qemu_get_buffer(f, compressed_data_buf, len); +            decompress_data_with_multi_threads(compressed_data_buf, host, len); +            break; +        case RAM_SAVE_FLAG_XBZRLE: +            host = host_from_stream_offset(f, addr, flags); +            if (!host) { +                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); +                ret = -EINVAL; +                break; +            } +            if (load_xbzrle(f, addr, host) < 0) { +                
error_report("Failed to decompress XBZRLE page at " +                             RAM_ADDR_FMT, addr); +                ret = -EINVAL; +                break; +            } +            break; +        case RAM_SAVE_FLAG_EOS: +            /* normal exit */ +            break; +        default: +            if (flags & RAM_SAVE_FLAG_HOOK) { +                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); +            } else { +                error_report("Unknown combination of migration flags: %#x", +                             flags); +                ret = -EINVAL; +            } +        } +        if (!ret) { +            ret = qemu_file_get_error(f); +        } +    } + +    rcu_read_unlock(); +    DPRINTF("Completed load of VM with exit code %d seq iteration " +            "%" PRIu64 "\n", ret, seq_iter); +    return ret; +} + +static SaveVMHandlers savevm_ram_handlers = { +    .save_live_setup = ram_save_setup, +    .save_live_iterate = ram_save_iterate, +    .save_live_complete = ram_save_complete, +    .save_live_pending = ram_save_pending, +    .load_state = ram_load, +    .cancel = ram_migration_cancel, +}; + +void ram_mig_init(void) +{ +    qemu_mutex_init(&XBZRLE.lock); +    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL); +} +/* Stub function that's gets run on the vcpu when its brought out of the +   VM to run inside qemu via async_run_on_cpu()*/ + +static void mig_sleep_cpu(void *opq) +{ +    qemu_mutex_unlock_iothread(); +    g_usleep(30*1000); +    qemu_mutex_lock_iothread(); +} + +/* To reduce the dirty rate explicitly disallow the VCPUs from spending +   much time in the VM. The migration thread will try to catchup. +   Workload will experience a performance drop. +*/ +static void mig_throttle_guest_down(void) +{ +    CPUState *cpu; + +    qemu_mutex_lock_iothread(); +    CPU_FOREACH(cpu) { +        async_run_on_cpu(cpu, mig_sleep_cpu, NULL); +    } +    qemu_mutex_unlock_iothread(); +} + +static void check_guest_throttling(void) +{ +    static int64_t t0; +    int64_t        t1; + +    if (!mig_throttle_on) { +        return; +    } + +    if (!t0)  { +        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +        return; +    } + +    t1 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + +    /* If it has been more than 40 ms since the last time the guest +     * was throttled then do it again. +     */ +    if (40 < (t1-t0)/1000000) { +        mig_throttle_guest_down(); +        t0 = t1; +    } +} diff --git a/migration/rdma.c b/migration/rdma.c new file mode 100644 index 00000000..74876fd7 --- /dev/null +++ b/migration/rdma.c @@ -0,0 +1,3516 @@ +/* + * RDMA protocol and interfaces + * + * Copyright IBM, Corp. 2010-2013 + * + * Authors: + *  Michael R. Hines <mrhines@us.ibm.com> + *  Jiuxing Liu <jl@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later.  See the COPYING file in the top-level directory. + * + */ +#include "qemu-common.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "exec/cpu-common.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/sockets.h" +#include "qemu/bitmap.h" +#include "block/coroutine.h" +#include <stdio.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <arpa/inet.h> +#include <string.h> +#include <rdma/rdma_cma.h> +#include "trace.h" + +/* + * Print and error on both the Monitor and the Log file. + */ +#define ERROR(errp, fmt, ...) 
+/*
+ * RDMA migration protocol:
+ * 1. RDMA Writes (data messages, i.e. RAM)
+ * 2. IB Send/Recv (control channel messages)
+ */
+enum {
+    RDMA_WRID_NONE = 0,
+    RDMA_WRID_RDMA_WRITE = 1,
+    RDMA_WRID_SEND_CONTROL = 2000,
+    RDMA_WRID_RECV_CONTROL = 4000,
+};
+
+static const char *wrid_desc[] = {
+    [RDMA_WRID_NONE] = "NONE",
+    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
+    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
+    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
+};
+
+/*
+ * Work request IDs for IB SEND messages only (not RDMA writes).
+ * This is used by the migration protocol to transmit
+ * control messages (such as device state and registration commands)
+ *
+ * We could use more WRs, but we have enough for now.
+ */
+enum {
+    RDMA_WRID_READY = 0,
+    RDMA_WRID_DATA,
+    RDMA_WRID_CONTROL,
+    RDMA_WRID_MAX,
+};
+
+/*
+ * SEND/RECV IB Control Messages.
+ */ +enum { +    RDMA_CONTROL_NONE = 0, +    RDMA_CONTROL_ERROR, +    RDMA_CONTROL_READY,               /* ready to receive */ +    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */ +    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */ +    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */ +    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */ +    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */ +    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */ +    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */ +    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */ +    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */ +}; + +static const char *control_desc[] = { +    [RDMA_CONTROL_NONE] = "NONE", +    [RDMA_CONTROL_ERROR] = "ERROR", +    [RDMA_CONTROL_READY] = "READY", +    [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE", +    [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST", +    [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT", +    [RDMA_CONTROL_COMPRESS] = "COMPRESS", +    [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST", +    [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT", +    [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED", +    [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST", +    [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED", +}; + +/* + * Memory and MR structures used to represent an IB Send/Recv work request. + * This is *not* used for RDMA writes, only IB Send/Recv. + */ +typedef struct { +    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */ +    struct   ibv_mr *control_mr;               /* registration metadata */ +    size_t   control_len;                      /* length of the message */ +    uint8_t *control_curr;                     /* start of unconsumed bytes */ +} RDMAWorkRequestData; + +/* + * Negotiate RDMA capabilities during connection-setup time. + */ +typedef struct { +    uint32_t version; +    uint32_t flags; +} RDMACapabilities; + +static void caps_to_network(RDMACapabilities *cap) +{ +    cap->version = htonl(cap->version); +    cap->flags = htonl(cap->flags); +} + +static void network_to_caps(RDMACapabilities *cap) +{ +    cap->version = ntohl(cap->version); +    cap->flags = ntohl(cap->flags); +} + +/* + * Representation of a RAMBlock from an RDMA perspective. + * This is not transmitted, only local. + * This and subsequent structures cannot be linked lists + * because we're using a single IB message to transmit + * the information. It's small anyway, so a list is overkill. 
+ */ +typedef struct RDMALocalBlock { +    char          *block_name; +    uint8_t       *local_host_addr; /* local virtual address */ +    uint64_t       remote_host_addr; /* remote virtual address */ +    uint64_t       offset; +    uint64_t       length; +    struct         ibv_mr **pmr;    /* MRs for chunk-level registration */ +    struct         ibv_mr *mr;      /* MR for non-chunk-level registration */ +    uint32_t      *remote_keys;     /* rkeys for chunk-level registration */ +    uint32_t       remote_rkey;     /* rkeys for non-chunk-level registration */ +    int            index;           /* which block are we */ +    unsigned int   src_index;       /* (Only used on dest) */ +    bool           is_ram_block; +    int            nb_chunks; +    unsigned long *transit_bitmap; +    unsigned long *unregister_bitmap; +} RDMALocalBlock; + +/* + * Also represents a RAMblock, but only on the dest. + * This gets transmitted by the dest during connection-time + * to the source VM and then is used to populate the + * corresponding RDMALocalBlock with + * the information needed to perform the actual RDMA. + */ +typedef struct QEMU_PACKED RDMADestBlock { +    uint64_t remote_host_addr; +    uint64_t offset; +    uint64_t length; +    uint32_t remote_rkey; +    uint32_t padding; +} RDMADestBlock; + +static uint64_t htonll(uint64_t v) +{ +    union { uint32_t lv[2]; uint64_t llv; } u; +    u.lv[0] = htonl(v >> 32); +    u.lv[1] = htonl(v & 0xFFFFFFFFULL); +    return u.llv; +} + +static uint64_t ntohll(uint64_t v) { +    union { uint32_t lv[2]; uint64_t llv; } u; +    u.llv = v; +    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]); +} + +static void dest_block_to_network(RDMADestBlock *db) +{ +    db->remote_host_addr = htonll(db->remote_host_addr); +    db->offset = htonll(db->offset); +    db->length = htonll(db->length); +    db->remote_rkey = htonl(db->remote_rkey); +} + +static void network_to_dest_block(RDMADestBlock *db) +{ +    db->remote_host_addr = ntohll(db->remote_host_addr); +    db->offset = ntohll(db->offset); +    db->length = ntohll(db->length); +    db->remote_rkey = ntohl(db->remote_rkey); +} + +/* + * Virtual address of the above structures used for transmitting + * the RAMBlock descriptions at connection-time. + * This structure is *not* transmitted. + */ +typedef struct RDMALocalBlocks { +    int nb_blocks; +    bool     init;             /* main memory init complete */ +    RDMALocalBlock *block; +} RDMALocalBlocks; + +/* + * Main data structure for RDMA state. + * While there is only one copy of this structure being allocated right now, + * this is the place where one would start if you wanted to consider + * having more than one RDMA connection open at the same time. + */ +typedef struct RDMAContext { +    char *host; +    int port; + +    RDMAWorkRequestData wr_data[RDMA_WRID_MAX]; + +    /* +     * This is used by *_exchange_send() to figure out whether or not +     * the initial "READY" message has already been received or not. +     * This is because other functions may potentially poll() and detect +     * the READY message before send() does, in which case we need to +     * know if it completed. 
+     */
+    int control_ready_expected;
+
+    /* number of outstanding writes */
+    int nb_sent;
+
+    /* store info about current buffer so that we can
+       merge it with future sends */
+    uint64_t current_addr;
+    uint64_t current_length;
+    /* index of ram block the current buffer belongs to */
+    int current_index;
+    /* index of the chunk in the current ram block */
+    int current_chunk;
+
+    bool pin_all;
+
+    /*
+     * infiniband-specific variables for opening the device
+     * and maintaining connection state and so forth.
+     *
+     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
+     * cm_id->verbs, cm_id->channel, and cm_id->qp.
+     */
+    struct rdma_cm_id *cm_id;               /* connection manager ID */
+    struct rdma_cm_id *listen_id;
+    bool connected;
+
+    struct ibv_context          *verbs;
+    struct rdma_event_channel   *channel;
+    struct ibv_qp *qp;                      /* queue pair */
+    struct ibv_comp_channel *comp_channel;  /* completion channel */
+    struct ibv_pd *pd;                      /* protection domain */
+    struct ibv_cq *cq;                      /* completion queue */
+
+    /*
+     * If a previous write failed (perhaps because of a failed
+     * memory registration), then do not attempt any future work
+     * and remember the error state.
+     */
+    int error_state;
+    int error_reported;
+
+    /*
+     * Description of ram blocks used throughout the code.
+     */
+    RDMALocalBlocks local_ram_blocks;
+    RDMADestBlock  *dest_blocks;
+
+    /* Index of the next RAMBlock received during block registration */
+    unsigned int    next_src_index;
+
+    /*
+     * Migration on the *destination* has started; then we use the
+     * coroutine yield function. The source runs in a thread, so we
+     * don't care.
+     */
+    int migration_started_on_destination;
+
+    int total_registrations;
+    int total_writes;
+
+    int unregister_current, unregister_next;
+    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
+
+    GHashTable *blockmap;
+} RDMAContext;
+
+/*
+ * Interface to the rest of the migration call stack.
+ */
+typedef struct QEMUFileRDMA {
+    RDMAContext *rdma;
+    size_t len;
+    void *file;
+} QEMUFileRDMA;
+
+/*
+ * Main structure for IB Send/Recv control messages.
+ * This gets prepended at the beginning of every Send/Recv.
+ */
+typedef struct QEMU_PACKED {
+    uint32_t len;     /* Total length of data portion */
+    uint32_t type;    /* which control command to perform */
+    uint32_t repeat;  /* number of commands in data portion of same type */
+    uint32_t padding;
+} RDMAControlHeader;
+
+static void control_to_network(RDMAControlHeader *control)
+{
+    control->type = htonl(control->type);
+    control->len = htonl(control->len);
+    control->repeat = htonl(control->repeat);
+}
+
+static void network_to_control(RDMAControlHeader *control)
+{
+    control->type = ntohl(control->type);
+    control->len = ntohl(control->len);
+    control->repeat = ntohl(control->repeat);
+}
+
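Every control message travels as this fixed 16-byte header, byte-swapped to network order, immediately followed by its payload in the same registered buffer. A hypothetical framing helper showing that layout (frame_control() is an invented name; in the patch the framing happens inside the qemu_rdma_post_send_control() path, and buffer-size checks are omitted here for brevity):

    #include <stdint.h>
    #include <string.h>

    static size_t frame_control(uint8_t *buf, uint32_t type,
                                const uint8_t *data, uint32_t len)
    {
        RDMAControlHeader head = { .len = len, .type = type, .repeat = 1 };

        control_to_network(&head);          /* defined above */
        memcpy(buf, &head, sizeof(head));
        if (len) {
            memcpy(buf + sizeof(head), data, len);
        }
        return sizeof(head) + len;
    }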
+/*
+ * Register a single Chunk.
+ * Information sent by the source VM to inform the dest
+ * to register a single chunk of memory before we can perform
+ * the actual RDMA operation.
+ */
+typedef struct QEMU_PACKED {
+    union QEMU_PACKED {
+        uint64_t current_addr;  /* offset into the ram_addr_t space */
+        uint64_t chunk;         /* chunk to lookup if unregistering */
+    } key;
+    uint32_t current_index; /* which ramblock the chunk belongs to */
+    uint32_t padding;
+    uint64_t chunks;            /* how many sequential chunks to register */
+} RDMARegister;
+
+static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
+{
+    RDMALocalBlock *local_block;
+    local_block  = &rdma->local_ram_blocks.block[reg->current_index];
+
+    if (local_block->is_ram_block) {
+        /*
+         * current_addr as passed in is an address in the local ram_addr_t
+         * space, we need to translate this for the destination
+         */
+        reg->key.current_addr -= local_block->offset;
+        reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
+    }
+    reg->key.current_addr = htonll(reg->key.current_addr);
+    reg->current_index = htonl(reg->current_index);
+    reg->chunks = htonll(reg->chunks);
+}
+
+static void network_to_register(RDMARegister *reg)
+{
+    reg->key.current_addr = ntohll(reg->key.current_addr);
+    reg->current_index = ntohl(reg->current_index);
+    reg->chunks = ntohll(reg->chunks);
+}
+
+typedef struct QEMU_PACKED {
+    uint32_t value;     /* if zero, we will madvise() */
+    uint32_t block_idx; /* which ram block index */
+    uint64_t offset;    /* Address in remote ram_addr_t space */
+    uint64_t length;    /* length of the chunk */
+} RDMACompress;
+
+static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
+{
+    comp->value = htonl(comp->value);
+    /*
+     * comp->offset as passed in is an address in the local ram_addr_t
+     * space, we need to translate this for the destination
+     */
+    comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
+    comp->offset += rdma->dest_blocks[comp->block_idx].offset;
+    comp->block_idx = htonl(comp->block_idx);
+    comp->offset = htonll(comp->offset);
+    comp->length = htonll(comp->length);
+}
+
+static void network_to_compress(RDMACompress *comp)
+{
+    comp->value = ntohl(comp->value);
+    comp->block_idx = ntohl(comp->block_idx);
+    comp->offset = ntohll(comp->offset);
+    comp->length = ntohll(comp->length);
+}
+
+/*
+ * The result of the dest's memory registration produces an "rkey"
+ * which the source VM must reference in order to perform
+ * the RDMA operation.
+ */ +typedef struct QEMU_PACKED { +    uint32_t rkey; +    uint32_t padding; +    uint64_t host_addr; +} RDMARegisterResult; + +static void result_to_network(RDMARegisterResult *result) +{ +    result->rkey = htonl(result->rkey); +    result->host_addr = htonll(result->host_addr); +}; + +static void network_to_result(RDMARegisterResult *result) +{ +    result->rkey = ntohl(result->rkey); +    result->host_addr = ntohll(result->host_addr); +}; + +const char *print_wrid(int wrid); +static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head, +                                   uint8_t *data, RDMAControlHeader *resp, +                                   int *resp_idx, +                                   int (*callback)(RDMAContext *rdma)); + +static inline uint64_t ram_chunk_index(const uint8_t *start, +                                       const uint8_t *host) +{ +    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT; +} + +static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block, +                                       uint64_t i) +{ +    return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr + +                                  (i << RDMA_REG_CHUNK_SHIFT)); +} + +static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block, +                                     uint64_t i) +{ +    uint8_t *result = ram_chunk_start(rdma_ram_block, i) + +                                         (1UL << RDMA_REG_CHUNK_SHIFT); + +    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) { +        result = rdma_ram_block->local_host_addr + rdma_ram_block->length; +    } + +    return result; +} + +static int rdma_add_block(RDMAContext *rdma, const char *block_name, +                         void *host_addr, +                         ram_addr_t block_offset, uint64_t length) +{ +    RDMALocalBlocks *local = &rdma->local_ram_blocks; +    RDMALocalBlock *block; +    RDMALocalBlock *old = local->block; + +    local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1)); + +    if (local->nb_blocks) { +        int x; + +        if (rdma->blockmap) { +            for (x = 0; x < local->nb_blocks; x++) { +                g_hash_table_remove(rdma->blockmap, +                                    (void *)(uintptr_t)old[x].offset); +                g_hash_table_insert(rdma->blockmap, +                                    (void *)(uintptr_t)old[x].offset, +                                    &local->block[x]); +            } +        } +        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks); +        g_free(old); +    } + +    block = &local->block[local->nb_blocks]; + +    block->block_name = g_strdup(block_name); +    block->local_host_addr = host_addr; +    block->offset = block_offset; +    block->length = length; +    block->index = local->nb_blocks; +    block->src_index = ~0U; /* Filled in by the receipt of the block list */ +    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL; +    block->transit_bitmap = bitmap_new(block->nb_chunks); +    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks); +    block->unregister_bitmap = bitmap_new(block->nb_chunks); +    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks); +    block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t)); + +    block->is_ram_block = local->init ? 
false : true;
+
+    if (rdma->blockmap) {
+        g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
+    }
+
+    trace_rdma_add_block(block_name, local->nb_blocks,
+                         (uintptr_t) block->local_host_addr,
+                         block->offset, block->length,
+                         (uintptr_t) (block->local_host_addr + block->length),
+                         BITS_TO_LONGS(block->nb_chunks) *
+                             sizeof(unsigned long) * 8,
+                         block->nb_chunks);
+
+    local->nb_blocks++;
+
+    return 0;
+}
+
+/*
+ * Memory regions need to be registered with the device and queue pairs set
+ * up in advance, before the migration starts. This tells us where the RAM
+ * blocks are so that we can register them individually.
+ */
+static int qemu_rdma_init_one_block(const char *block_name, void *host_addr,
+    ram_addr_t block_offset, ram_addr_t length, void *opaque)
+{
+    return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
+}
+
+/*
+ * Identify the RAMBlocks and their quantity. They will be referenced to
+ * identify chunk boundaries inside each RAMBlock and also during dynamic
+ * page registration.
+ */
+static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
+{
+    RDMALocalBlocks *local = &rdma->local_ram_blocks;
+
+    assert(rdma->blockmap == NULL);
+    memset(local, 0, sizeof *local);
+    qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
+    trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
+    rdma->dest_blocks = (RDMADestBlock *) g_malloc0(sizeof(RDMADestBlock) *
+                        rdma->local_ram_blocks.nb_blocks);
+    local->init = true;
+    return 0;
+}
+
+/*
+ * Note: If used outside of cleanup, the caller must ensure that the
+ * destination block structures are also updated.
+ */
+static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
+{
+    RDMALocalBlocks *local = &rdma->local_ram_blocks;
+    RDMALocalBlock *old = local->block;
+    int x;
+
+    if (rdma->blockmap) {
+        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
+    }
+    if (block->pmr) {
+        int j;
+
+        for (j = 0; j < block->nb_chunks; j++) {
+            if (!block->pmr[j]) {
+                continue;
+            }
+            ibv_dereg_mr(block->pmr[j]);
+            rdma->total_registrations--;
+        }
+        g_free(block->pmr);
+        block->pmr = NULL;
+    }
+
+    if (block->mr) {
+        ibv_dereg_mr(block->mr);
+        rdma->total_registrations--;
+        block->mr = NULL;
+    }
+
+    g_free(block->transit_bitmap);
+    block->transit_bitmap = NULL;
+
+    g_free(block->unregister_bitmap);
+    block->unregister_bitmap = NULL;
+
+    g_free(block->remote_keys);
+    block->remote_keys = NULL;
+
+    g_free(block->block_name);
+    block->block_name = NULL;
+
+    if (rdma->blockmap) {
+        for (x = 0; x < local->nb_blocks; x++) {
+            g_hash_table_remove(rdma->blockmap,
+                                (void *)(uintptr_t)old[x].offset);
+        }
+    }
+
+    if (local->nb_blocks > 1) {
+
+        local->block = g_malloc0(sizeof(RDMALocalBlock) *
+                                    (local->nb_blocks - 1));
+
+        if (block->index) {
+            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
+        }
+
+        if (block->index < (local->nb_blocks - 1)) {
+            memcpy(local->block + block->index, old + (block->index + 1),
+                
sizeof(RDMALocalBlock) *
+                    (local->nb_blocks - (block->index + 1)));
+        }
+    } else {
+        assert(block == local->block);
+        local->block = NULL;
+    }
+
+    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
+                           block->offset, block->length,
+                            (uintptr_t)(block->local_host_addr + block->length),
+                           BITS_TO_LONGS(block->nb_chunks) *
+                               sizeof(unsigned long) * 8, block->nb_chunks);
+
+    g_free(old);
+
+    local->nb_blocks--;
+
+    if (local->nb_blocks && rdma->blockmap) {
+        for (x = 0; x < local->nb_blocks; x++) {
+            g_hash_table_insert(rdma->blockmap,
+                                (void *)(uintptr_t)local->block[x].offset,
+                                &local->block[x]);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Put in the log file which RDMA device was opened and the details
+ * associated with that device.
+ */
+static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
+{
+    struct ibv_port_attr port;
+
+    if (ibv_query_port(verbs, 1, &port)) {
+        error_report("Failed to query port information");
+        return;
+    }
+
+    printf("%s RDMA Device opened: kernel name %s "
+           "uverbs device name %s, "
+           "infiniband_verbs class device path %s, "
+           "infiniband class device path %s, "
+           "transport: (%d) %s\n",
+                who,
+                verbs->device->name,
+                verbs->device->dev_name,
+                verbs->device->dev_path,
+                verbs->device->ibdev_path,
+                port.link_layer,
+                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
+                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
+                    ? "Ethernet" : "Unknown"));
+}
+
+/*
+ * Put in the log file the RDMA gid addressing information,
+ * useful for folks who have trouble understanding the
+ * RDMA device hierarchy in the kernel.
+ */
+static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
+{
+    char sgid[33];
+    char dgid[33];
+    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
+    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
+    trace_qemu_rdma_dump_gid(who, sgid, dgid);
+}
+
+/*
+ * As of now, IPv6 over RoCE / iWARP is not supported by linux.
+ * We will try the next addrinfo struct, and fail if there are
+ * no other valid addresses to bind against.
+ *
+ * If the user is listening on '[::]', then we will not have opened a device
+ * yet and have no way of verifying if the device is RoCE or not.
+ *
+ * In this case, the source VM will throw an error for ALL types of
+ * connections (both IPv4 and IPv6) if the destination machine does not have
+ * a regular infiniband network available for use.
+ *
+ * The only way to guarantee that an error is thrown for broken kernels is
+ * for the management software to choose a *specific* interface at bind time
+ * and validate what type of hardware it is.
+ *
+ * Unfortunately, this puts the user in a fix:
+ *
+ *  If the source VM connects with an IPv4 address without knowing that the
+ *  destination has bound to '[::]' the migration will unconditionally fail
+ *  unless the management software is explicitly listening on the IPv4
+ *  address while using a RoCE-based device.
+ *
+ *  If the source VM connects with an IPv6 address, then we're OK because we can
+ *  throw an error on the source (and similarly on the destination).
+ *
+ *  But in mixed environments, this will be broken for a while until it is fixed
+ *  inside linux.
+ *
+ * We do provide a *tiny* bit of help in this function: We can list all of the
+ * devices in the system and check to see if all the devices are RoCE or
+ * Infiniband.
+ *
+ * If we detect that we have a *pure* RoCE environment, then we can safely
+ * throw an error even if the management software has specified '[::]' as the
+ * bind address.
+ *
+ * However, if there are multiple heterogeneous devices, then we cannot make
+ * this assumption and the user just has to be sure they know what they are
+ * doing.
+ *
+ * Patches are being reviewed on linux-rdma.
+ */
+static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
+{
+    struct ibv_port_attr port_attr;
+
+    /* This bug only exists in linux, to our knowledge. */
+#ifdef CONFIG_LINUX
+
+    /*
+     * Verbs are only NULL if management has bound to '[::]'.
+     *
+     * Let's iterate through all the devices and see if there are any pure IB
+     * devices (non-ethernet).
+     *
+     * If not, then we can safely proceed with the migration.
+     * Otherwise, there are no guarantees until the bug is fixed in linux.
+     */
+    if (!verbs) {
+        int num_devices, x;
+        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
+        bool roce_found = false;
+        bool ib_found = false;
+
+        for (x = 0; x < num_devices; x++) {
+            verbs = ibv_open_device(dev_list[x]);
+            if (!verbs) {
+                if (errno == EPERM) {
+                    continue;
+                } else {
+                    ibv_free_device_list(dev_list);
+                    return -EINVAL;
+                }
+            }
+
+            if (ibv_query_port(verbs, 1, &port_attr)) {
+                ibv_close_device(verbs);
+                ibv_free_device_list(dev_list);
+                ERROR(errp, "Could not query initial IB port");
+                return -EINVAL;
+            }
+
+            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
+                ib_found = true;
+            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+                roce_found = true;
+            }
+
+            ibv_close_device(verbs);
+
+        }
+
+        /* done probing; the device list is no longer needed */
+        ibv_free_device_list(dev_list);
+
+        if (roce_found) {
+            if (ib_found) {
+                fprintf(stderr, "WARN: migrations may fail:"
+                                " IPv6 over RoCE / iWARP in linux"
+                                " is broken. But since you appear to have a"
+                                " mixed RoCE / IB environment, be sure to only"
+                                " migrate over the IB fabric until the kernel"
+                                " fixes the bug.\n");
+            } else {
+                ERROR(errp, "You only have RoCE / iWARP devices in your system"
+                            " and your management software has specified '[::]'"
+                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
+                return -ENONET;
+            }
+        }
+
+        return 0;
+    }
+
+    /*
+     * If we have a verbs context, that means that something other than '[::]'
+     * was used by the management software for binding, in which case we can
+     * actually warn the user about a potentially broken kernel.
+     */ + +    /* IB ports start with 1, not 0 */ +    if (ibv_query_port(verbs, 1, &port_attr)) { +        ERROR(errp, "Could not query initial IB port"); +        return -EINVAL; +    } + +    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { +        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 " +                    "(but patches on linux-rdma in progress)"); +        return -ENONET; +    } + +#endif + +    return 0; +} + +/* + * Figure out which RDMA device corresponds to the requested IP hostname + * Also create the initial connection manager identifiers for opening + * the connection. + */ +static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) +{ +    int ret; +    struct rdma_addrinfo *res; +    char port_str[16]; +    struct rdma_cm_event *cm_event; +    char ip[40] = "unknown"; +    struct rdma_addrinfo *e; + +    if (rdma->host == NULL || !strcmp(rdma->host, "")) { +        ERROR(errp, "RDMA hostname has not been set"); +        return -EINVAL; +    } + +    /* create CM channel */ +    rdma->channel = rdma_create_event_channel(); +    if (!rdma->channel) { +        ERROR(errp, "could not create CM channel"); +        return -EINVAL; +    } + +    /* create CM id */ +    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP); +    if (ret) { +        ERROR(errp, "could not create channel id"); +        goto err_resolve_create_id; +    } + +    snprintf(port_str, 16, "%d", rdma->port); +    port_str[15] = '\0'; + +    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res); +    if (ret < 0) { +        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host); +        goto err_resolve_get_addr; +    } + +    for (e = res; e != NULL; e = e->ai_next) { +        inet_ntop(e->ai_family, +            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); +        trace_qemu_rdma_resolve_host_trying(rdma->host, ip); + +        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, +                RDMA_RESOLVE_TIMEOUT_MS); +        if (!ret) { +            if (e->ai_family == AF_INET6) { +                ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs); +                if (ret) { +                    continue; +                } +            } +            goto route; +        } +    } + +    ERROR(errp, "could not resolve address %s", rdma->host); +    goto err_resolve_get_addr; + +route: +    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id); + +    ret = rdma_get_cm_event(rdma->channel, &cm_event); +    if (ret) { +        ERROR(errp, "could not perform event_addr_resolved"); +        goto err_resolve_get_addr; +    } + +    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { +        ERROR(errp, "result not equal to event_addr_resolved %s", +                rdma_event_str(cm_event->event)); +        perror("rdma_resolve_addr"); +        rdma_ack_cm_event(cm_event); +        ret = -EINVAL; +        goto err_resolve_get_addr; +    } +    rdma_ack_cm_event(cm_event); + +    /* resolve route */ +    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS); +    if (ret) { +        ERROR(errp, "could not resolve rdma route"); +        goto err_resolve_get_addr; +    } + +    ret = rdma_get_cm_event(rdma->channel, &cm_event); +    if (ret) { +        ERROR(errp, "could not perform event_route_resolved"); +        goto err_resolve_get_addr; +    } +    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { +        ERROR(errp, "result not equal to event_route_resolved: %s", +             
           rdma_event_str(cm_event->event)); +        rdma_ack_cm_event(cm_event); +        ret = -EINVAL; +        goto err_resolve_get_addr; +    } +    rdma_ack_cm_event(cm_event); +    rdma->verbs = rdma->cm_id->verbs; +    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs); +    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id); +    return 0; + +err_resolve_get_addr: +    rdma_destroy_id(rdma->cm_id); +    rdma->cm_id = NULL; +err_resolve_create_id: +    rdma_destroy_event_channel(rdma->channel); +    rdma->channel = NULL; +    return ret; +} + +/* + * Create protection domain and completion queues + */ +static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma) +{ +    /* allocate pd */ +    rdma->pd = ibv_alloc_pd(rdma->verbs); +    if (!rdma->pd) { +        error_report("failed to allocate protection domain"); +        return -1; +    } + +    /* create completion channel */ +    rdma->comp_channel = ibv_create_comp_channel(rdma->verbs); +    if (!rdma->comp_channel) { +        error_report("failed to allocate completion channel"); +        goto err_alloc_pd_cq; +    } + +    /* +     * Completion queue can be filled by both read and write work requests, +     * so must reflect the sum of both possible queue sizes. +     */ +    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3), +            NULL, rdma->comp_channel, 0); +    if (!rdma->cq) { +        error_report("failed to allocate completion queue"); +        goto err_alloc_pd_cq; +    } + +    return 0; + +err_alloc_pd_cq: +    if (rdma->pd) { +        ibv_dealloc_pd(rdma->pd); +    } +    if (rdma->comp_channel) { +        ibv_destroy_comp_channel(rdma->comp_channel); +    } +    rdma->pd = NULL; +    rdma->comp_channel = NULL; +    return -1; + +} + +/* + * Create queue pairs. + */ +static int qemu_rdma_alloc_qp(RDMAContext *rdma) +{ +    struct ibv_qp_init_attr attr = { 0 }; +    int ret; + +    attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX; +    attr.cap.max_recv_wr = 3; +    attr.cap.max_send_sge = 1; +    attr.cap.max_recv_sge = 1; +    attr.send_cq = rdma->cq; +    attr.recv_cq = rdma->cq; +    attr.qp_type = IBV_QPT_RC; + +    ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr); +    if (ret) { +        return -1; +    } + +    rdma->qp = rdma->cm_id->qp; +    return 0; +} + +static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma) +{ +    int i; +    RDMALocalBlocks *local = &rdma->local_ram_blocks; + +    for (i = 0; i < local->nb_blocks; i++) { +        local->block[i].mr = +            ibv_reg_mr(rdma->pd, +                    local->block[i].local_host_addr, +                    local->block[i].length, +                    IBV_ACCESS_LOCAL_WRITE | +                    IBV_ACCESS_REMOTE_WRITE +                    ); +        if (!local->block[i].mr) { +            perror("Failed to register local dest ram block!\n"); +            break; +        } +        rdma->total_registrations++; +    } + +    if (i >= local->nb_blocks) { +        return 0; +    } + +    for (i--; i >= 0; i--) { +        ibv_dereg_mr(local->block[i].mr); +        rdma->total_registrations--; +    } + +    return -1; + +} + +/* + * Find the ram block that corresponds to the page requested to be + * transmitted by QEMU. + * + * Once the block is found, also identify which 'chunk' within that + * block that the page belongs to. + * + * This search cannot fail or the migration will fail. 
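+ *
+ * [Editor's note] A worked example of the chunk arithmetic below, assuming
+ * the 1 MB chunk size implied elsewhere in this file (an
+ * RDMA_REG_CHUNK_SHIFT of 20): a page 0x250000 bytes into its block gives
+ *
+ *     chunk_index = 0x250000 >> 20 = 2
+ *
+ * i.e. the page falls in the third chunk, covering [2MB, 3MB) of the block.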
+ */ +static int qemu_rdma_search_ram_block(RDMAContext *rdma, +                                      uintptr_t block_offset, +                                      uint64_t offset, +                                      uint64_t length, +                                      uint64_t *block_index, +                                      uint64_t *chunk_index) +{ +    uint64_t current_addr = block_offset + offset; +    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, +                                                (void *) block_offset); +    assert(block); +    assert(current_addr >= block->offset); +    assert((current_addr + length) <= (block->offset + block->length)); + +    *block_index = block->index; +    *chunk_index = ram_chunk_index(block->local_host_addr, +                block->local_host_addr + (current_addr - block->offset)); + +    return 0; +} + +/* + * Register a chunk with IB. If the chunk was already registered + * previously, then skip. + * + * Also return the keys associated with the registration needed + * to perform the actual RDMA operation. + */ +static int qemu_rdma_register_and_get_keys(RDMAContext *rdma, +        RDMALocalBlock *block, uintptr_t host_addr, +        uint32_t *lkey, uint32_t *rkey, int chunk, +        uint8_t *chunk_start, uint8_t *chunk_end) +{ +    if (block->mr) { +        if (lkey) { +            *lkey = block->mr->lkey; +        } +        if (rkey) { +            *rkey = block->mr->rkey; +        } +        return 0; +    } + +    /* allocate memory to store chunk MRs */ +    if (!block->pmr) { +        block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *)); +    } + +    /* +     * If 'rkey', then we're the destination, so grant access to the source. +     * +     * If 'lkey', then we're the source VM, so grant access only to ourselves. +     */ +    if (!block->pmr[chunk]) { +        uint64_t len = chunk_end - chunk_start; + +        trace_qemu_rdma_register_and_get_keys(len, chunk_start); + +        block->pmr[chunk] = ibv_reg_mr(rdma->pd, +                chunk_start, len, +                (rkey ? (IBV_ACCESS_LOCAL_WRITE | +                        IBV_ACCESS_REMOTE_WRITE) : 0)); + +        if (!block->pmr[chunk]) { +            perror("Failed to register chunk!"); +            fprintf(stderr, "Chunk details: block: %d chunk index %d" +                            " start %" PRIuPTR " end %" PRIuPTR +                            " host %" PRIuPTR +                            " local %" PRIuPTR " registrations: %d\n", +                            block->index, chunk, (uintptr_t)chunk_start, +                            (uintptr_t)chunk_end, host_addr, +                            (uintptr_t)block->local_host_addr, +                            rdma->total_registrations); +            return -1; +        } +        rdma->total_registrations++; +    } + +    if (lkey) { +        *lkey = block->pmr[chunk]->lkey; +    } +    if (rkey) { +        *rkey = block->pmr[chunk]->rkey; +    } +    return 0; +} + +/* + * Register (at connection time) the memory used for control + * channel messages. 
+ */
+static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
+{
+    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
+            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
+            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+    if (rdma->wr_data[idx].control_mr) {
+        rdma->total_registrations++;
+        return 0;
+    }
+    error_report("qemu_rdma_reg_control failed");
+    return -1;
+}
+
+const char *print_wrid(int wrid)
+{
+    if (wrid >= RDMA_WRID_RECV_CONTROL) {
+        return wrid_desc[RDMA_WRID_RECV_CONTROL];
+    }
+    return wrid_desc[wrid];
+}
+
+/*
+ * RDMA requires memory registration (mlock/pinning), but this is not good for
+ * overcommitment.
+ *
+ * In preparation for the future where LRU information or workload-specific
+ * writable working set memory access behavior is available to QEMU, it would
+ * be nice to have in place the ability to UN-register/UN-pin particular
+ * memory regions from the RDMA hardware when it is determined that those
+ * regions of memory will likely not be accessed again in the near future.
+ *
+ * While we do not have such information right now, the following
+ * compile-time option allows us to perform a non-optimized version of this
+ * behavior.
+ *
+ * By uncommenting this option, you will cause *all* RDMA transfers to be
+ * unregistered immediately after the transfer completes on both sides of the
+ * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
+ *
+ * This will have a terrible impact on migration performance, so until future
+ * workload information or LRU information is available, do not attempt to use
+ * this feature except for basic testing.
+ */
+//#define RDMA_UNREGISTRATION_EXAMPLE
+
+/*
+ * Perform a non-optimized memory unregistration after every transfer
+ * for demonstration purposes, only if pin-all is not requested.
+ *
+ * Potential optimizations:
+ * 1. Start a new thread to run this function continuously
+ *      - for bit clearing
+ *      - and for receipt of unregister messages
+ * 2. Use an LRU.
+ * 3. Use workload hints.
+ */
+static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
+{
+    while (rdma->unregistrations[rdma->unregister_current]) {
+        int ret;
+        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
+        uint64_t chunk =
+            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
+        uint64_t index =
+            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
+        RDMALocalBlock *block =
+            &(rdma->local_ram_blocks.block[index]);
+        RDMARegister reg = { .current_index = index };
+        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
+                                 };
+        RDMAControlHeader head = { .len = sizeof(RDMARegister),
+                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
+                                   .repeat = 1,
+                                 };
+
+        trace_qemu_rdma_unregister_waiting_proc(chunk,
+                                                rdma->unregister_current);
+
+        rdma->unregistrations[rdma->unregister_current] = 0;
+        rdma->unregister_current++;
+
+        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
+            rdma->unregister_current = 0;
+        }
+
+
+        /*
+         * Unregistration is speculative (because migration is single-threaded
+         * and we cannot break the protocol's infiniband message ordering).
+         * Thus, if the memory is currently being used for transmission,
+         * then abort the attempt to unregister and try again
+         * later, the next time a completion is received for this memory.
+         */
+        clear_bit(chunk, block->unregister_bitmap);
+
+        if (test_bit(chunk, block->transit_bitmap)) {
+            trace_qemu_rdma_unregister_waiting_inflight(chunk);
+            continue;
+        }
+
+        trace_qemu_rdma_unregister_waiting_send(chunk);
+
+        ret = ibv_dereg_mr(block->pmr[chunk]);
+        block->pmr[chunk] = NULL;
+        block->remote_keys[chunk] = 0;
+
+        if (ret != 0) {
+            perror("chunk unregistration failed");
+            return -ret;
+        }
+        rdma->total_registrations--;
+
+        reg.key.chunk = chunk;
+        register_to_network(rdma, &reg);
+        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
+                                &resp, NULL, NULL);
+        if (ret < 0) {
+            return ret;
+        }
+
+        trace_qemu_rdma_unregister_waiting_complete(chunk);
+    }
+
+    return 0;
+}
+
+static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
+                                         uint64_t chunk)
+{
+    uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
+
+    result |= (index << RDMA_WRID_BLOCK_SHIFT);
+    result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
+
+    return result;
+}
+
+/*
+ * Set bit for unregistration in the next iteration.
+ * We cannot transmit right here, but will unpin later.
+ */
+static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
+                                        uint64_t chunk, uint64_t wr_id)
+{
+    if (rdma->unregistrations[rdma->unregister_next] != 0) {
+        error_report("rdma migration: queue is full");
+    } else {
+        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
+
+        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
+            trace_qemu_rdma_signal_unregister_append(chunk,
+                                                     rdma->unregister_next);
+
+            rdma->unregistrations[rdma->unregister_next++] =
+                    qemu_rdma_make_wrid(wr_id, index, chunk);
+
+            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
+                rdma->unregister_next = 0;
+            }
+        } else {
+            trace_qemu_rdma_signal_unregister_already(chunk);
+        }
+    }
+}
+
+/*
+ * Consult the completion queue to see if a work request
+ * (of any kind) has completed.
+ * Return the work request ID that completed.
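+ *
+ * [Editor's note] The returned wr_id is the packed value produced by
+ * qemu_rdma_make_wrid() above; a sketch of the decode, using the masks
+ * and shifts defined earlier in this file:
+ *
+ *     type  = wr_id & RDMA_WRID_TYPE_MASK;
+ *     index = (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
+ *     chunk = (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;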
+ */ +static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out, +                               uint32_t *byte_len) +{ +    int ret; +    struct ibv_wc wc; +    uint64_t wr_id; + +    ret = ibv_poll_cq(rdma->cq, 1, &wc); + +    if (!ret) { +        *wr_id_out = RDMA_WRID_NONE; +        return 0; +    } + +    if (ret < 0) { +        error_report("ibv_poll_cq return %d", ret); +        return ret; +    } + +    wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK; + +    if (wc.status != IBV_WC_SUCCESS) { +        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n", +                        wc.status, ibv_wc_status_str(wc.status)); +        fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]); + +        return -1; +    } + +    if (rdma->control_ready_expected && +        (wr_id >= RDMA_WRID_RECV_CONTROL)) { +        trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL], +                  wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent); +        rdma->control_ready_expected = 0; +    } + +    if (wr_id == RDMA_WRID_RDMA_WRITE) { +        uint64_t chunk = +            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT; +        uint64_t index = +            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT; +        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]); + +        trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent, +                                   index, chunk, block->local_host_addr, +                                   (void *)(uintptr_t)block->remote_host_addr); + +        clear_bit(chunk, block->transit_bitmap); + +        if (rdma->nb_sent > 0) { +            rdma->nb_sent--; +        } + +        if (!rdma->pin_all) { +            /* +             * FYI: If one wanted to signal a specific chunk to be unregistered +             * using LRU or workload-specific information, this is the function +             * you would call to do so. That chunk would then get asynchronously +             * unregistered later. +             */ +#ifdef RDMA_UNREGISTRATION_EXAMPLE +            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id); +#endif +        } +    } else { +        trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent); +    } + +    *wr_id_out = wc.wr_id; +    if (byte_len) { +        *byte_len = wc.byte_len; +    } + +    return  0; +} + +/* + * Block until the next work request has completed. + * + * First poll to see if a work request has already completed, + * otherwise block. + * + * If we encounter completed work requests for IDs other than + * the one we're interested in, then that's generally an error. + * + * The only exception is actual RDMA Write completions. These + * completions only need to be recorded, but do not actually + * need further processing. 
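+ *
+ * [Editor's note] The function below follows the canonical ibverbs
+ * completion-channel pattern, condensed here as a sketch:
+ *
+ *     ibv_req_notify_cq(cq, 0);                 // arm the channel
+ *     for (;;) {
+ *         while (ibv_poll_cq(cq, 1, &wc) > 0)   // drain what is ready
+ *             handle(&wc);
+ *         ibv_get_cq_event(ch, &cq, &ctx);      // sleep until more arrives
+ *         num_cq_events++;
+ *         ibv_req_notify_cq(cq, 0);             // re-arm before re-draining
+ *     }
+ *     ibv_ack_cq_events(cq, num_cq_events);     // acknowledge on exit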
+ */ +static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested, +                                    uint32_t *byte_len) +{ +    int num_cq_events = 0, ret = 0; +    struct ibv_cq *cq; +    void *cq_ctx; +    uint64_t wr_id = RDMA_WRID_NONE, wr_id_in; + +    if (ibv_req_notify_cq(rdma->cq, 0)) { +        return -1; +    } +    /* poll cq first */ +    while (wr_id != wrid_requested) { +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len); +        if (ret < 0) { +            return ret; +        } + +        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; + +        if (wr_id == RDMA_WRID_NONE) { +            break; +        } +        if (wr_id != wrid_requested) { +            trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested), +                       wrid_requested, print_wrid(wr_id), wr_id); +        } +    } + +    if (wr_id == wrid_requested) { +        return 0; +    } + +    while (1) { +        /* +         * Coroutine doesn't start until process_incoming_migration() +         * so don't yield unless we know we're running inside of a coroutine. +         */ +        if (rdma->migration_started_on_destination) { +            yield_until_fd_readable(rdma->comp_channel->fd); +        } + +        if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) { +            perror("ibv_get_cq_event"); +            goto err_block_for_wrid; +        } + +        num_cq_events++; + +        if (ibv_req_notify_cq(cq, 0)) { +            goto err_block_for_wrid; +        } + +        while (wr_id != wrid_requested) { +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len); +            if (ret < 0) { +                goto err_block_for_wrid; +            } + +            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; + +            if (wr_id == RDMA_WRID_NONE) { +                break; +            } +            if (wr_id != wrid_requested) { +                trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested), +                                   wrid_requested, print_wrid(wr_id), wr_id); +            } +        } + +        if (wr_id == wrid_requested) { +            goto success_block_for_wrid; +        } +    } + +success_block_for_wrid: +    if (num_cq_events) { +        ibv_ack_cq_events(cq, num_cq_events); +    } +    return 0; + +err_block_for_wrid: +    if (num_cq_events) { +        ibv_ack_cq_events(cq, num_cq_events); +    } +    return ret; +} + +/* + * Post a SEND message work request for the control channel + * containing some data and block until the post completes. 
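+ *
+ * [Editor's note] The staging buffer wr->control used below carries the
+ * header and payload back to back, in network byte order:
+ *
+ *     +--------------------+-------------------------+
+ *     | RDMAControlHeader  | head->len bytes of data |
+ *     +--------------------+-------------------------+
+ *
+ * which is why the SGE length is head->len + sizeof(RDMAControlHeader).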
+ */
+static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
+                                       RDMAControlHeader *head)
+{
+    int ret = 0;
+    RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
+    struct ibv_send_wr *bad_wr;
+    struct ibv_sge sge = {
+                           .addr = (uintptr_t)(wr->control),
+                           .length = head->len + sizeof(RDMAControlHeader),
+                           .lkey = wr->control_mr->lkey,
+                         };
+    struct ibv_send_wr send_wr = {
+                                   .wr_id = RDMA_WRID_SEND_CONTROL,
+                                   .opcode = IBV_WR_SEND,
+                                   .send_flags = IBV_SEND_SIGNALED,
+                                   .sg_list = &sge,
+                                   .num_sge = 1,
+                                };
+
+    trace_qemu_rdma_post_send_control(control_desc[head->type]);
+
+    /*
+     * We don't actually need to do a memcpy() in here if we used
+     * the "sge" properly, but since we're only sending control messages
+     * (not RAM in a performance-critical path), then it's OK for now.
+     *
+     * The copy makes the RDMAControlHeader simpler to manipulate
+     * for the time being.
+     */
+    assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
+    memcpy(wr->control, head, sizeof(RDMAControlHeader));
+    control_to_network((void *) wr->control);
+
+    if (buf) {
+        memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
+    }
+
+
+    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
+
+    if (ret > 0) {
+        error_report("Failed to post IB SEND for control");
+        return -ret;
+    }
+
+    ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
+    if (ret < 0) {
+        error_report("rdma migration: send polling control error");
+    }
+
+    return ret;
+}
+
+/*
+ * Post a RECV work request in anticipation of some future receipt
+ * of data on the control channel.
+ */
+static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
+{
+    struct ibv_recv_wr *bad_wr;
+    struct ibv_sge sge = {
+                            .addr = (uintptr_t)(rdma->wr_data[idx].control),
+                            .length = RDMA_CONTROL_MAX_BUFFER,
+                            .lkey = rdma->wr_data[idx].control_mr->lkey,
+                         };
+
+    struct ibv_recv_wr recv_wr = {
+                                    .wr_id = RDMA_WRID_RECV_CONTROL + idx,
+                                    .sg_list = &sge,
+                                    .num_sge = 1,
+                                 };
+
+
+    if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Block and wait for a RECV control channel message to arrive.
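+ *
+ * [Editor's note] On a reliable-connection QP a RECV work request must
+ * already be posted when the peer's SEND arrives (otherwise the sender
+ * gets an RNR NAK), which is why the exchange functions below always
+ * re-post a RECV, e.g.
+ *
+ *     qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
+ *
+ * before telling the peer that another message may be sent.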
+ */
+static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
+                RDMAControlHeader *head, int expecting, int idx)
+{
+    uint32_t byte_len;
+    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
+                                       &byte_len);
+
+    if (ret < 0) {
+        error_report("rdma migration: recv polling control error!");
+        return ret;
+    }
+
+    network_to_control((void *) rdma->wr_data[idx].control);
+    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
+
+    trace_qemu_rdma_exchange_get_response_start(control_desc[expecting]);
+
+    if (expecting == RDMA_CONTROL_NONE) {
+        trace_qemu_rdma_exchange_get_response_none(control_desc[head->type],
+                                             head->type);
+    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
+        error_report("Was expecting a %s (%d) control message"
+                ", but got: %s (%d), length: %d",
+                control_desc[expecting], expecting,
+                control_desc[head->type], head->type, head->len);
+        return -EIO;
+    }
+    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
+        error_report("length too long: %d", head->len);
+        return -EINVAL;
+    }
+    if (sizeof(*head) + head->len != byte_len) {
+        error_report("Malformed length: %d byte_len %d", head->len, byte_len);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+/*
+ * When a RECV work request has completed, the work request's
+ * buffer is pointed at the header.
+ *
+ * This advances the pointer into the work request's buffer to the
+ * data portion of the control message, which was populated after
+ * the work request finished.
+ */
+static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
+                                  RDMAControlHeader *head)
+{
+    rdma->wr_data[idx].control_len = head->len;
+    rdma->wr_data[idx].control_curr =
+        rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
+}
+
+/*
+ * This is an 'atomic' high-level operation to deliver a single, unified
+ * control-channel message.
+ *
+ * Additionally, if the user is expecting some kind of reply to this message,
+ * they can request a 'resp' response message be filled in by posting an
+ * additional work request on behalf of the user and waiting for an additional
+ * completion.
+ *
+ * The extra (optional) response saves us from having to perform an
+ * *additional* exchange of messages during registration just to provide a
+ * response: it piggy-backs on the acknowledgement instead.
+ */
+static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
+                                   uint8_t *data, RDMAControlHeader *resp,
+                                   int *resp_idx,
+                                   int (*callback)(RDMAContext *rdma))
+{
+    int ret = 0;
+
+    /*
+     * Before attempting to deliver the message, wait for the destination
+     * to announce that it is ready, by waiting for a READY message.
+     */
+    if (rdma->control_ready_expected) {
+        RDMAControlHeader resp;
+        ret = qemu_rdma_exchange_get_response(rdma,
+                                    &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    /*
+     * If the user is expecting a response, post a WR in anticipation of it.
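+     *
+     * [Editor's note] The whole exchange, sketched as a message sequence
+     * (source on the left, destination on the right):
+     *
+     *     <--- READY       destination's RECV is posted; go ahead
+     *     ---> request     this header plus optional data
+     *     <--- response    only if 'resp' was requested
+     *
+     * Every arrow is an ibv SEND landing in a pre-posted RECV.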
+     */ +    if (resp) { +        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA); +        if (ret) { +            error_report("rdma migration: error posting" +                    " extra control recv for anticipated result!"); +            return ret; +        } +    } + +    /* +     * Post a WR to replace the one we just consumed for the READY message. +     */ +    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY); +    if (ret) { +        error_report("rdma migration: error posting first control recv!"); +        return ret; +    } + +    /* +     * Deliver the control message that was requested. +     */ +    ret = qemu_rdma_post_send_control(rdma, data, head); + +    if (ret < 0) { +        error_report("Failed to send control buffer!"); +        return ret; +    } + +    /* +     * If we're expecting a response, block and wait for it. +     */ +    if (resp) { +        if (callback) { +            trace_qemu_rdma_exchange_send_issue_callback(); +            ret = callback(rdma); +            if (ret < 0) { +                return ret; +            } +        } + +        trace_qemu_rdma_exchange_send_waiting(control_desc[resp->type]); +        ret = qemu_rdma_exchange_get_response(rdma, resp, +                                              resp->type, RDMA_WRID_DATA); + +        if (ret < 0) { +            return ret; +        } + +        qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp); +        if (resp_idx) { +            *resp_idx = RDMA_WRID_DATA; +        } +        trace_qemu_rdma_exchange_send_received(control_desc[resp->type]); +    } + +    rdma->control_ready_expected = 1; + +    return 0; +} + +/* + * This is an 'atomic' high-level operation to receive a single, unified + * control-channel message. + */ +static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head, +                                int expecting) +{ +    RDMAControlHeader ready = { +                                .len = 0, +                                .type = RDMA_CONTROL_READY, +                                .repeat = 1, +                              }; +    int ret; + +    /* +     * Inform the source that we're ready to receive a message. +     */ +    ret = qemu_rdma_post_send_control(rdma, NULL, &ready); + +    if (ret < 0) { +        error_report("Failed to send control buffer!"); +        return ret; +    } + +    /* +     * Block and wait for the message. +     */ +    ret = qemu_rdma_exchange_get_response(rdma, head, +                                          expecting, RDMA_WRID_READY); + +    if (ret < 0) { +        return ret; +    } + +    qemu_rdma_move_header(rdma, RDMA_WRID_READY, head); + +    /* +     * Post a new RECV work request to replace the one we just consumed. +     */ +    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY); +    if (ret) { +        error_report("rdma migration: error posting second control recv!"); +        return ret; +    } + +    return 0; +} + +/* + * Write an actual chunk of memory using RDMA. + * + * If we're using dynamic registration on the dest-side, we have to + * send a registration command first. 
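+ *
+ * [Editor's note] Note that the 'chunks' variable computed below counts the
+ * chunks *beyond* the first, which is why an exact multiple of the chunk
+ * size is decremented: with 1 MB chunks, a 2 MB write yields 2 - 1 = 1,
+ * i.e. the write spans chunks N and N+1 (two chunks in total).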
+ */
+static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
+                               int current_index, uint64_t current_addr,
+                               uint64_t length)
+{
+    struct ibv_sge sge;
+    struct ibv_send_wr send_wr = { 0 };
+    struct ibv_send_wr *bad_wr;
+    int reg_result_idx, ret, count = 0;
+    uint64_t chunk, chunks;
+    uint8_t *chunk_start, *chunk_end;
+    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
+    RDMARegister reg;
+    RDMARegisterResult *reg_result;
+    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
+    RDMAControlHeader head = { .len = sizeof(RDMARegister),
+                               .type = RDMA_CONTROL_REGISTER_REQUEST,
+                               .repeat = 1,
+                             };
+
+retry:
+    sge.addr = (uintptr_t)(block->local_host_addr +
+                            (current_addr - block->offset));
+    sge.length = length;
+
+    chunk = ram_chunk_index(block->local_host_addr,
+                            (uint8_t *)(uintptr_t)sge.addr);
+    chunk_start = ram_chunk_start(block, chunk);
+
+    if (block->is_ram_block) {
+        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
+
+        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
+            chunks--;
+        }
+    } else {
+        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
+
+        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
+            chunks--;
+        }
+    }
+
+    trace_qemu_rdma_write_one_top(chunks + 1,
+                                  (chunks + 1) *
+                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
+
+    chunk_end = ram_chunk_end(block, chunk + chunks);
+
+    if (!rdma->pin_all) {
+#ifdef RDMA_UNREGISTRATION_EXAMPLE
+        qemu_rdma_unregister_waiting(rdma);
+#endif
+    }
+
+    while (test_bit(chunk, block->transit_bitmap)) {
+        (void)count;
+        trace_qemu_rdma_write_one_block(count++, current_index, chunk,
+                sge.addr, length, rdma->nb_sent, block->nb_chunks);
+
+        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
+
+        if (ret < 0) {
+            error_report("Failed to wait for previous write to complete "
+                    "block %d chunk %" PRIu64
+                    " current %" PRIu64 " len %" PRIu64 " %d",
+                    current_index, chunk, sge.addr, length, rdma->nb_sent);
+            return ret;
+        }
+    }
+
+    if (!rdma->pin_all || !block->is_ram_block) {
+        if (!block->remote_keys[chunk]) {
+            /*
+             * This chunk has not yet been registered, so first check to see
+             * if the entire chunk is zero. If so, tell the other side to
+             * memset() + madvise() the entire chunk without RDMA.
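+             *
+             * [Editor's note] In other words, the scan below asks whether
+             * the offset of the first nonzero byte equals 'length'; if it
+             * does, the whole range is zero and one small
+             * RDMA_CONTROL_COMPRESS message (value == 0) stands in for a
+             * full chunk's worth of RDMA traffic.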
+             */
+
+            if (can_use_buffer_find_nonzero_offset((void *)(uintptr_t)sge.addr,
+                                                   length)
+                   && buffer_find_nonzero_offset((void *)(uintptr_t)sge.addr,
+                                                    length) == length) {
+                RDMACompress comp = {
+                                        .offset = current_addr,
+                                        .value = 0,
+                                        .block_idx = current_index,
+                                        .length = length,
+                                    };
+
+                head.len = sizeof(comp);
+                head.type = RDMA_CONTROL_COMPRESS;
+
+                trace_qemu_rdma_write_one_zero(chunk, sge.length,
+                                               current_index, current_addr);
+
+                compress_to_network(rdma, &comp);
+                ret = qemu_rdma_exchange_send(rdma, &head,
+                                (uint8_t *) &comp, NULL, NULL, NULL);
+
+                if (ret < 0) {
+                    return -EIO;
+                }
+
+                acct_update_position(f, sge.length, true);
+
+                return 1;
+            }
+
+            /*
+             * Otherwise, tell the other side to register.
+             */
+            reg.current_index = current_index;
+            if (block->is_ram_block) {
+                reg.key.current_addr = current_addr;
+            } else {
+                reg.key.chunk = chunk;
+            }
+            reg.chunks = chunks;
+
+            trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
+                                              current_addr);
+
+            register_to_network(rdma, &reg);
+            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
+                                    &resp, &reg_result_idx, NULL);
+            if (ret < 0) {
+                return ret;
+            }
+
+            /* try to overlap this single registration with the one we sent.
*/ +            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, +                                                &sge.lkey, NULL, chunk, +                                                chunk_start, chunk_end)) { +                error_report("cannot get lkey"); +                return -EINVAL; +            } + +            reg_result = (RDMARegisterResult *) +                    rdma->wr_data[reg_result_idx].control_curr; + +            network_to_result(reg_result); + +            trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk], +                                                 reg_result->rkey, chunk); + +            block->remote_keys[chunk] = reg_result->rkey; +            block->remote_host_addr = reg_result->host_addr; +        } else { +            /* already registered before */ +            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, +                                                &sge.lkey, NULL, chunk, +                                                chunk_start, chunk_end)) { +                error_report("cannot get lkey!"); +                return -EINVAL; +            } +        } + +        send_wr.wr.rdma.rkey = block->remote_keys[chunk]; +    } else { +        send_wr.wr.rdma.rkey = block->remote_rkey; + +        if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, +                                                     &sge.lkey, NULL, chunk, +                                                     chunk_start, chunk_end)) { +            error_report("cannot get lkey!"); +            return -EINVAL; +        } +    } + +    /* +     * Encode the ram block index and chunk within this wrid. +     * We will use this information at the time of completion +     * to figure out which bitmap to check against and then which +     * chunk in the bitmap to look for. +     */ +    send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE, +                                        current_index, chunk); + +    send_wr.opcode = IBV_WR_RDMA_WRITE; +    send_wr.send_flags = IBV_SEND_SIGNALED; +    send_wr.sg_list = &sge; +    send_wr.num_sge = 1; +    send_wr.wr.rdma.remote_addr = block->remote_host_addr + +                                (current_addr - block->offset); + +    trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr, +                                   sge.length); + +    /* +     * ibv_post_send() does not return negative error numbers, +     * per the specification they are positive - no idea why. +     */ +    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr); + +    if (ret == ENOMEM) { +        trace_qemu_rdma_write_one_queue_full(); +        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL); +        if (ret < 0) { +            error_report("rdma migration: failed to make " +                         "room in full send queue! %d", ret); +            return ret; +        } + +        goto retry; + +    } else if (ret > 0) { +        perror("rdma migration: post rdma write failed"); +        return -ret; +    } + +    set_bit(chunk, block->transit_bitmap); +    acct_update_position(f, sge.length, false); +    rdma->total_writes++; + +    return 0; +} + +/* + * Push out any unwritten RDMA operations. + * + * We support sending out multiple chunks at the same time. + * Not all of them need to get signaled in the completion queue. 
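+ *
+ * [Editor's note] A sketch of the merging done by qemu_rdma_write() below:
+ * sequential writes for pages at offsets 0x0000, 0x1000 and 0x2000 of the
+ * same chunk each just grow rdma->current_length; a single RDMA write
+ * covering all three is only posted once the run breaks (non-contiguous
+ * offset, chunk/block boundary, RDMA_MERGE_MAX) or a flush is forced.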
+ */
+static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
+{
+    int ret;
+
+    if (!rdma->current_length) {
+        return 0;
+    }
+
+    ret = qemu_rdma_write_one(f, rdma,
+            rdma->current_index, rdma->current_addr, rdma->current_length);
+
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (ret == 0) {
+        rdma->nb_sent++;
+        trace_qemu_rdma_write_flush(rdma->nb_sent);
+    }
+
+    rdma->current_length = 0;
+    rdma->current_addr = 0;
+
+    return 0;
+}
+
+static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
+                    uint64_t offset, uint64_t len)
+{
+    RDMALocalBlock *block;
+    uint8_t *host_addr;
+    uint8_t *chunk_end;
+
+    if (rdma->current_index < 0) {
+        return 0;
+    }
+
+    if (rdma->current_chunk < 0) {
+        return 0;
+    }
+
+    block = &(rdma->local_ram_blocks.block[rdma->current_index]);
+    host_addr = block->local_host_addr + (offset - block->offset);
+    chunk_end = ram_chunk_end(block, rdma->current_chunk);
+
+    if (rdma->current_length == 0) {
+        return 0;
+    }
+
+    /*
+     * Only merge into the current chunk sequentially.
+     */
+    if (offset != (rdma->current_addr + rdma->current_length)) {
+        return 0;
+    }
+
+    if (offset < block->offset) {
+        return 0;
+    }
+
+    if ((offset + len) > (block->offset + block->length)) {
+        return 0;
+    }
+
+    if ((host_addr + len) > chunk_end) {
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * We're not actually writing here, but doing three things:
+ *
+ * 1. Identify the chunk the buffer belongs to.
+ * 2. If the chunk is full or the buffer doesn't belong to the current
+ *    chunk, then start a new chunk and flush() the old chunk.
+ * 3. To keep the hardware busy, we also group chunks into batches
+ *    and only require that a batch gets acknowledged in the completion
+ *    queue instead of each individual chunk.
+ */
+static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
+                           uint64_t block_offset, uint64_t offset,
+                           uint64_t len)
+{
+    uint64_t current_addr = block_offset + offset;
+    uint64_t index = rdma->current_index;
+    uint64_t chunk = rdma->current_chunk;
+    int ret;
+
+    /* If we cannot merge it, we flush the current buffer first.
*/ +    if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) { +        ret = qemu_rdma_write_flush(f, rdma); +        if (ret) { +            return ret; +        } +        rdma->current_length = 0; +        rdma->current_addr = current_addr; + +        ret = qemu_rdma_search_ram_block(rdma, block_offset, +                                         offset, len, &index, &chunk); +        if (ret) { +            error_report("ram block search failed"); +            return ret; +        } +        rdma->current_index = index; +        rdma->current_chunk = chunk; +    } + +    /* merge it */ +    rdma->current_length += len; + +    /* flush it if buffer is too large */ +    if (rdma->current_length >= RDMA_MERGE_MAX) { +        return qemu_rdma_write_flush(f, rdma); +    } + +    return 0; +} + +static void qemu_rdma_cleanup(RDMAContext *rdma) +{ +    struct rdma_cm_event *cm_event; +    int ret, idx; + +    if (rdma->cm_id && rdma->connected) { +        if (rdma->error_state) { +            RDMAControlHeader head = { .len = 0, +                                       .type = RDMA_CONTROL_ERROR, +                                       .repeat = 1, +                                     }; +            error_report("Early error. Sending error."); +            qemu_rdma_post_send_control(rdma, NULL, &head); +        } + +        ret = rdma_disconnect(rdma->cm_id); +        if (!ret) { +            trace_qemu_rdma_cleanup_waiting_for_disconnect(); +            ret = rdma_get_cm_event(rdma->channel, &cm_event); +            if (!ret) { +                rdma_ack_cm_event(cm_event); +            } +        } +        trace_qemu_rdma_cleanup_disconnect(); +        rdma->connected = false; +    } + +    g_free(rdma->dest_blocks); +    rdma->dest_blocks = NULL; + +    for (idx = 0; idx < RDMA_WRID_MAX; idx++) { +        if (rdma->wr_data[idx].control_mr) { +            rdma->total_registrations--; +            ibv_dereg_mr(rdma->wr_data[idx].control_mr); +        } +        rdma->wr_data[idx].control_mr = NULL; +    } + +    if (rdma->local_ram_blocks.block) { +        while (rdma->local_ram_blocks.nb_blocks) { +            rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]); +        } +    } + +    if (rdma->qp) { +        rdma_destroy_qp(rdma->cm_id); +        rdma->qp = NULL; +    } +    if (rdma->cq) { +        ibv_destroy_cq(rdma->cq); +        rdma->cq = NULL; +    } +    if (rdma->comp_channel) { +        ibv_destroy_comp_channel(rdma->comp_channel); +        rdma->comp_channel = NULL; +    } +    if (rdma->pd) { +        ibv_dealloc_pd(rdma->pd); +        rdma->pd = NULL; +    } +    if (rdma->cm_id) { +        rdma_destroy_id(rdma->cm_id); +        rdma->cm_id = NULL; +    } +    if (rdma->listen_id) { +        rdma_destroy_id(rdma->listen_id); +        rdma->listen_id = NULL; +    } +    if (rdma->channel) { +        rdma_destroy_event_channel(rdma->channel); +        rdma->channel = NULL; +    } +    g_free(rdma->host); +    rdma->host = NULL; +} + + +static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all) +{ +    int ret, idx; +    Error *local_err = NULL, **temp = &local_err; + +    /* +     * Will be validated against destination's actual capabilities +     * after the connect() completes. +     */ +    rdma->pin_all = pin_all; + +    ret = qemu_rdma_resolve_host(rdma, temp); +    if (ret) { +        goto err_rdma_source_init; +    } + +    ret = qemu_rdma_alloc_pd_cq(rdma); +    if (ret) { +        ERROR(temp, "rdma migration: error allocating pd and cq! 
Your mlock()"
+                    " limits may be too low. Please check with 'ulimit -a' and"
+                    " raise the locked memory ('ulimit -l') limit");
+        goto err_rdma_source_init;
+    }
+
+    ret = qemu_rdma_alloc_qp(rdma);
+    if (ret) {
+        ERROR(temp, "rdma migration: error allocating qp!");
+        goto err_rdma_source_init;
+    }
+
+    ret = qemu_rdma_init_ram_blocks(rdma);
+    if (ret) {
+        ERROR(temp, "rdma migration: error initializing ram blocks!");
+        goto err_rdma_source_init;
+    }
+
+    /* Build the hash that maps from offset to RAMBlock */
+    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
+    for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
+        g_hash_table_insert(rdma->blockmap,
+                (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
+                &rdma->local_ram_blocks.block[idx]);
+    }
+
+    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
+        ret = qemu_rdma_reg_control(rdma, idx);
+        if (ret) {
+            ERROR(temp, "rdma migration: error registering %d control!",
+                                                            idx);
+            goto err_rdma_source_init;
+        }
+    }
+
+    return 0;
+
+err_rdma_source_init:
+    error_propagate(errp, local_err);
+    qemu_rdma_cleanup(rdma);
+    return -1;
+}
+
+static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
+{
+    RDMACapabilities cap = {
+                                .version = RDMA_CONTROL_VERSION_CURRENT,
+                                .flags = 0,
+                           };
+    struct rdma_conn_param conn_param = { .initiator_depth = 2,
+                                          .retry_count = 5,
+                                          .private_data = &cap,
+                                          .private_data_len = sizeof(cap),
+                                        };
+    struct rdma_cm_event *cm_event;
+    int ret;
+
+    /*
+     * Only negotiate the capability with destination if the user
+     * on the source first requested the capability.
+     */
+    if (rdma->pin_all) {
+        trace_qemu_rdma_connect_pin_all_requested();
+        cap.flags |= RDMA_CAPABILITY_PIN_ALL;
+    }
+
+    caps_to_network(&cap);
+
+    ret = rdma_connect(rdma->cm_id, &conn_param);
+    if (ret) {
+        perror("rdma_connect");
+        ERROR(errp, "connecting to destination!");
+        goto err_rdma_source_connect;
+    }
+
+    ret = rdma_get_cm_event(rdma->channel, &cm_event);
+    if (ret) {
+        perror("rdma_get_cm_event after rdma_connect");
+        ERROR(errp, "connecting to destination!");
+        /* no event was returned, so there is nothing to ack */
+        goto err_rdma_source_connect;
+    }
+
+    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
+        perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
+        ERROR(errp, "connecting to destination!");
+        rdma_ack_cm_event(cm_event);
+        goto err_rdma_source_connect;
+    }
+    rdma->connected = true;
+
+    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
+    network_to_caps(&cap);
+
+    /*
+     * Verify that the *requested* capabilities are supported by the destination
+     * and disable them otherwise.
+     */
+    if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
+        ERROR(errp, "Server cannot support pinning all memory. 
" +                        "Will register memory dynamically."); +        rdma->pin_all = false; +    } + +    trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all); + +    rdma_ack_cm_event(cm_event); + +    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY); +    if (ret) { +        ERROR(errp, "posting second control recv!"); +        goto err_rdma_source_connect; +    } + +    rdma->control_ready_expected = 1; +    rdma->nb_sent = 0; +    return 0; + +err_rdma_source_connect: +    qemu_rdma_cleanup(rdma); +    return -1; +} + +static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) +{ +    int ret, idx; +    struct rdma_cm_id *listen_id; +    char ip[40] = "unknown"; +    struct rdma_addrinfo *res, *e; +    char port_str[16]; + +    for (idx = 0; idx < RDMA_WRID_MAX; idx++) { +        rdma->wr_data[idx].control_len = 0; +        rdma->wr_data[idx].control_curr = NULL; +    } + +    if (!rdma->host || !rdma->host[0]) { +        ERROR(errp, "RDMA host is not set!"); +        rdma->error_state = -EINVAL; +        return -1; +    } +    /* create CM channel */ +    rdma->channel = rdma_create_event_channel(); +    if (!rdma->channel) { +        ERROR(errp, "could not create rdma event channel"); +        rdma->error_state = -EINVAL; +        return -1; +    } + +    /* create CM id */ +    ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP); +    if (ret) { +        ERROR(errp, "could not create cm_id!"); +        goto err_dest_init_create_listen_id; +    } + +    snprintf(port_str, 16, "%d", rdma->port); +    port_str[15] = '\0'; + +    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res); +    if (ret < 0) { +        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host); +        goto err_dest_init_bind_addr; +    } + +    for (e = res; e != NULL; e = e->ai_next) { +        inet_ntop(e->ai_family, +            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); +        trace_qemu_rdma_dest_init_trying(rdma->host, ip); +        ret = rdma_bind_addr(listen_id, e->ai_dst_addr); +        if (ret) { +            continue; +        } +        if (e->ai_family == AF_INET6) { +            ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs); +            if (ret) { +                continue; +            } +        } +        break; +    } + +    if (!e) { +        ERROR(errp, "Error: could not rdma_bind_addr!"); +        goto err_dest_init_bind_addr; +    } + +    rdma->listen_id = listen_id; +    qemu_rdma_dump_gid("dest_init", listen_id); +    return 0; + +err_dest_init_bind_addr: +    rdma_destroy_id(listen_id); +err_dest_init_create_listen_id: +    rdma_destroy_event_channel(rdma->channel); +    rdma->channel = NULL; +    rdma->error_state = ret; +    return ret; + +} + +static void *qemu_rdma_data_init(const char *host_port, Error **errp) +{ +    RDMAContext *rdma = NULL; +    InetSocketAddress *addr; + +    if (host_port) { +        rdma = g_malloc0(sizeof(RDMAContext)); +        rdma->current_index = -1; +        rdma->current_chunk = -1; + +        addr = inet_parse(host_port, NULL); +        if (addr != NULL) { +            rdma->port = atoi(addr->port); +            rdma->host = g_strdup(addr->host); +        } else { +            ERROR(errp, "bad RDMA migration address '%s'", host_port); +            g_free(rdma); +            rdma = NULL; +        } + +        qapi_free_InetSocketAddress(addr); +    } + +    return rdma; +} + +/* + * QEMUFile interface to the control channel. + * SEND messages for control only. 
+ * VM's ram is handled with regular RDMA messages.
+ */
+static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
+                                int64_t pos, int size)
+{
+    QEMUFileRDMA *r = opaque;
+    QEMUFile *f = r->file;
+    RDMAContext *rdma = r->rdma;
+    size_t remaining = size;
+    uint8_t *data = (void *) buf;
+    int ret;
+
+    CHECK_ERROR_STATE();
+
+    /*
+     * Push out any writes that we have
+     * queued up for VM's ram.
+     */
+    ret = qemu_rdma_write_flush(f, rdma);
+    if (ret < 0) {
+        rdma->error_state = ret;
+        return ret;
+    }
+
+    while (remaining) {
+        RDMAControlHeader head;
+
+        r->len = MIN(remaining, RDMA_SEND_INCREMENT);
+        remaining -= r->len;
+
+        head.len = r->len;
+        head.type = RDMA_CONTROL_QEMU_FILE;
+
+        ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
+
+        if (ret < 0) {
+            rdma->error_state = ret;
+            return ret;
+        }
+
+        data += r->len;
+    }
+
+    return size;
+}
+
+static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
+                             int size, int idx)
+{
+    size_t len = 0;
+
+    if (rdma->wr_data[idx].control_len) {
+        trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
+
+        len = MIN(size, rdma->wr_data[idx].control_len);
+        memcpy(buf, rdma->wr_data[idx].control_curr, len);
+        rdma->wr_data[idx].control_curr += len;
+        rdma->wr_data[idx].control_len -= len;
+    }
+
+    return len;
+}
+
+/*
+ * QEMUFile interface to the control channel.
+ * RDMA links don't use bytestreams, so we have to
+ * return bytes to QEMUFile opportunistically.
+ */
+static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
+                                int64_t pos, int size)
+{
+    QEMUFileRDMA *r = opaque;
+    RDMAContext *rdma = r->rdma;
+    RDMAControlHeader head;
+    int ret = 0;
+
+    CHECK_ERROR_STATE();
+
+    /*
+     * First, we hold on to the last SEND message we
+     * were given and dish out the bytes until we run
+     * out of bytes.
+     */
+    r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
+    if (r->len) {
+        return r->len;
+    }
+
+    /*
+     * Once we run out, we block and wait for another
+     * SEND message to arrive.
+     */
+    ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
+
+    if (ret < 0) {
+        rdma->error_state = ret;
+        return ret;
+    }
+
+    /*
+     * SEND was received with new bytes, now try again.
+     */
+    return qemu_rdma_fill(r->rdma, buf, size, 0);
+}
+
+/*
+ * Block until all the outstanding chunks have been delivered by the hardware.
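+ *
+ * A sketch of the caller's pattern, as used by qemu_rdma_registration_stop()
+ * later in this file:
+ *
+ *     qemu_fflush(f);                     // push out queued control data
+ *     ret = qemu_rdma_drain_cq(f, rdma);  // returns once nb_sent reaches 0
+ *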
+ */ +static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma) +{ +    int ret; + +    if (qemu_rdma_write_flush(f, rdma) < 0) { +        return -EIO; +    } + +    while (rdma->nb_sent) { +        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL); +        if (ret < 0) { +            error_report("rdma migration: complete polling error!"); +            return -EIO; +        } +    } + +    qemu_rdma_unregister_waiting(rdma); + +    return 0; +} + +static int qemu_rdma_close(void *opaque) +{ +    trace_qemu_rdma_close(); +    QEMUFileRDMA *r = opaque; +    if (r->rdma) { +        qemu_rdma_cleanup(r->rdma); +        g_free(r->rdma); +    } +    g_free(r); +    return 0; +} + +/* + * Parameters: + *    @offset == 0 : + *        This means that 'block_offset' is a full virtual address that does not + *        belong to a RAMBlock of the virtual machine and instead + *        represents a private malloc'd memory area that the caller wishes to + *        transfer. + * + *    @offset != 0 : + *        Offset is an offset to be added to block_offset and used + *        to also lookup the corresponding RAMBlock. + * + *    @size > 0 : + *        Initiate an transfer this size. + * + *    @size == 0 : + *        A 'hint' or 'advice' that means that we wish to speculatively + *        and asynchronously unregister this memory. In this case, there is no + *        guarantee that the unregister will actually happen, for example, + *        if the memory is being actively transmitted. Additionally, the memory + *        may be re-registered at any future time if a write within the same + *        chunk was requested again, even if you attempted to unregister it + *        here. + * + *    @size < 0 : TODO, not yet supported + *        Unregister the memory NOW. This means that the caller does not + *        expect there to be any future RDMA transfers and we just want to clean + *        things up. This is used in case the upper layer owns the memory and + *        cannot wait for qemu_fclose() to occur. + * + *    @bytes_sent : User-specificed pointer to indicate how many bytes were + *                  sent. Usually, this will not be more than a few bytes of + *                  the protocol because most transfers are sent asynchronously. + */ +static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque, +                                  ram_addr_t block_offset, ram_addr_t offset, +                                  size_t size, uint64_t *bytes_sent) +{ +    QEMUFileRDMA *rfile = opaque; +    RDMAContext *rdma = rfile->rdma; +    int ret; + +    CHECK_ERROR_STATE(); + +    qemu_fflush(f); + +    if (size > 0) { +        /* +         * Add this page to the current 'chunk'. If the chunk +         * is full, or the page doen't belong to the current chunk, +         * an actual RDMA write will occur and a new chunk will be formed. +         */ +        ret = qemu_rdma_write(f, rdma, block_offset, offset, size); +        if (ret < 0) { +            error_report("rdma migration: write error! %d", ret); +            goto err; +        } + +        /* +         * We always return 1 bytes because the RDMA +         * protocol is completely asynchronous. We do not yet know +         * whether an  identified chunk is zero or not because we're +         * waiting for other pages to potentially be merged with +         * the current chunk. So, we have to call qemu_update_position() +         * later on when the actual write occurs. 
+         */ +        if (bytes_sent) { +            *bytes_sent = 1; +        } +    } else { +        uint64_t index, chunk; + +        /* TODO: Change QEMUFileOps prototype to be signed: size_t => long +        if (size < 0) { +            ret = qemu_rdma_drain_cq(f, rdma); +            if (ret < 0) { +                fprintf(stderr, "rdma: failed to synchronously drain" +                                " completion queue before unregistration.\n"); +                goto err; +            } +        } +        */ + +        ret = qemu_rdma_search_ram_block(rdma, block_offset, +                                         offset, size, &index, &chunk); + +        if (ret) { +            error_report("ram block search failed"); +            goto err; +        } + +        qemu_rdma_signal_unregister(rdma, index, chunk, 0); + +        /* +         * TODO: Synchronous, guaranteed unregistration (should not occur during +         * fast-path). Otherwise, unregisters will process on the next call to +         * qemu_rdma_drain_cq() +        if (size < 0) { +            qemu_rdma_unregister_waiting(rdma); +        } +        */ +    } + +    /* +     * Drain the Completion Queue if possible, but do not block, +     * just poll. +     * +     * If nothing to poll, the end of the iteration will do this +     * again to make sure we don't overflow the request queue. +     */ +    while (1) { +        uint64_t wr_id, wr_id_in; +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL); +        if (ret < 0) { +            error_report("rdma migration: polling error! %d", ret); +            goto err; +        } + +        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; + +        if (wr_id == RDMA_WRID_NONE) { +            break; +        } +    } + +    return RAM_SAVE_CONTROL_DELAYED; +err: +    rdma->error_state = ret; +    return ret; +} + +static int qemu_rdma_accept(RDMAContext *rdma) +{ +    RDMACapabilities cap; +    struct rdma_conn_param conn_param = { +                                            .responder_resources = 2, +                                            .private_data = &cap, +                                            .private_data_len = sizeof(cap), +                                         }; +    struct rdma_cm_event *cm_event; +    struct ibv_context *verbs; +    int ret = -EINVAL; +    int idx; + +    ret = rdma_get_cm_event(rdma->channel, &cm_event); +    if (ret) { +        goto err_rdma_dest_wait; +    } + +    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { +        rdma_ack_cm_event(cm_event); +        goto err_rdma_dest_wait; +    } + +    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap)); + +    network_to_caps(&cap); + +    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) { +            error_report("Unknown source RDMA version: %d, bailing...", +                            cap.version); +            rdma_ack_cm_event(cm_event); +            goto err_rdma_dest_wait; +    } + +    /* +     * Respond with only the capabilities this version of QEMU knows about. +     */ +    cap.flags &= known_capabilities; + +    /* +     * Enable the ones that we do know about. +     * Add other checks here as new ones are introduced. 
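+     *
+     * A hypothetical future capability would follow the same pattern
+     * (a sketch only; RDMA_CAPABILITY_FOO is not a real flag):
+     *
+     *     if (cap.flags & RDMA_CAPABILITY_FOO) {
+     *         rdma->foo = true;
+     *     }
+     *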
+     */
+    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
+        rdma->pin_all = true;
+    }
+
+    rdma->cm_id = cm_event->id;
+    verbs = cm_event->id->verbs;
+
+    rdma_ack_cm_event(cm_event);
+
+    trace_qemu_rdma_accept_pin_state(rdma->pin_all);
+
+    caps_to_network(&cap);
+
+    trace_qemu_rdma_accept_pin_verbsc(verbs);
+
+    if (!rdma->verbs) {
+        rdma->verbs = verbs;
+    } else if (rdma->verbs != verbs) {
+        error_report("ibv context not matching %p, %p!", rdma->verbs,
+                     verbs);
+        goto err_rdma_dest_wait;
+    }
+
+    qemu_rdma_dump_id("dest_init", verbs);
+
+    ret = qemu_rdma_alloc_pd_cq(rdma);
+    if (ret) {
+        error_report("rdma migration: error allocating pd and cq!");
+        goto err_rdma_dest_wait;
+    }
+
+    ret = qemu_rdma_alloc_qp(rdma);
+    if (ret) {
+        error_report("rdma migration: error allocating qp!");
+        goto err_rdma_dest_wait;
+    }
+
+    ret = qemu_rdma_init_ram_blocks(rdma);
+    if (ret) {
+        error_report("rdma migration: error initializing ram blocks!");
+        goto err_rdma_dest_wait;
+    }
+
+    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
+        ret = qemu_rdma_reg_control(rdma, idx);
+        if (ret) {
+            error_report("rdma: error registering %d control", idx);
+            goto err_rdma_dest_wait;
+        }
+    }
+
+    qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
+
+    ret = rdma_accept(rdma->cm_id, &conn_param);
+    if (ret) {
+        error_report("rdma_accept returns %d", ret);
+        goto err_rdma_dest_wait;
+    }
+
+    ret = rdma_get_cm_event(rdma->channel, &cm_event);
+    if (ret) {
+        error_report("rdma_accept get_cm_event failed %d", ret);
+        goto err_rdma_dest_wait;
+    }
+
+    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
+        error_report("rdma_accept: unexpected event, expected ESTABLISHED");
+        rdma_ack_cm_event(cm_event);
+        goto err_rdma_dest_wait;
+    }
+
+    rdma_ack_cm_event(cm_event);
+    rdma->connected = true;
+
+    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
+    if (ret) {
+        error_report("rdma migration: error posting second control recv");
+        goto err_rdma_dest_wait;
+    }
+
+    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
+
+    return 0;
+
+err_rdma_dest_wait:
+    rdma->error_state = ret;
+    qemu_rdma_cleanup(rdma);
+    return ret;
+}
+
+static int dest_ram_sort_func(const void *a, const void *b)
+{
+    unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
+    unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
+
+    return (a_index < b_index) ? -1 : (a_index != b_index);
+}
+
+/*
+ * During each iteration of the migration, we listen for instructions
+ * from the source VM to perform dynamic page registrations before it
+ * can perform RDMA operations.
+ *
+ * We respond with the 'rkey'.
+ *
+ * Keep doing this until the source tells us to stop.
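+ *
+ * A typical round for one chunk under dynamic registration, as a sketch:
+ *
+ *   source                               destination (this handler)
+ *   ------                               ---------------------------
+ *   RDMA_CONTROL_REGISTER_REQUEST  -->   register the chunk locally
+ *                                  <--   RDMA_CONTROL_REGISTER_RESULT (rkey)
+ *   RDMA write using that rkey     -->   (completed by the HCA, no CPU)
+ *   RDMA_CONTROL_REGISTER_FINISHED -->   return from this handler
+ *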
+ */ +static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) +{ +    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), +                               .type = RDMA_CONTROL_REGISTER_RESULT, +                               .repeat = 0, +                             }; +    RDMAControlHeader unreg_resp = { .len = 0, +                               .type = RDMA_CONTROL_UNREGISTER_FINISHED, +                               .repeat = 0, +                             }; +    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT, +                                 .repeat = 1 }; +    QEMUFileRDMA *rfile = opaque; +    RDMAContext *rdma = rfile->rdma; +    RDMALocalBlocks *local = &rdma->local_ram_blocks; +    RDMAControlHeader head; +    RDMARegister *reg, *registers; +    RDMACompress *comp; +    RDMARegisterResult *reg_result; +    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE]; +    RDMALocalBlock *block; +    void *host_addr; +    int ret = 0; +    int idx = 0; +    int count = 0; +    int i = 0; + +    CHECK_ERROR_STATE(); + +    do { +        trace_qemu_rdma_registration_handle_wait(); + +        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE); + +        if (ret < 0) { +            break; +        } + +        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) { +            error_report("rdma: Too many requests in this message (%d)." +                            "Bailing.", head.repeat); +            ret = -EIO; +            break; +        } + +        switch (head.type) { +        case RDMA_CONTROL_COMPRESS: +            comp = (RDMACompress *) rdma->wr_data[idx].control_curr; +            network_to_compress(comp); + +            trace_qemu_rdma_registration_handle_compress(comp->length, +                                                         comp->block_idx, +                                                         comp->offset); +            if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) { +                error_report("rdma: 'compress' bad block index %u (vs %d)", +                             (unsigned int)comp->block_idx, +                             rdma->local_ram_blocks.nb_blocks); +                ret = -EIO; +                goto out; +            } +            block = &(rdma->local_ram_blocks.block[comp->block_idx]); + +            host_addr = block->local_host_addr + +                            (comp->offset - block->offset); + +            ram_handle_compressed(host_addr, comp->value, comp->length); +            break; + +        case RDMA_CONTROL_REGISTER_FINISHED: +            trace_qemu_rdma_registration_handle_finished(); +            goto out; + +        case RDMA_CONTROL_RAM_BLOCKS_REQUEST: +            trace_qemu_rdma_registration_handle_ram_blocks(); + +            /* Sort our local RAM Block list so it's the same as the source, +             * we can do this since we've filled in a src_index in the list +             * as we received the RAMBlock list earlier. 
+             */
+            qsort(rdma->local_ram_blocks.block,
+                  rdma->local_ram_blocks.nb_blocks,
+                  sizeof(RDMALocalBlock), dest_ram_sort_func);
+            if (rdma->pin_all) {
+                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
+                if (ret) {
+                    error_report("rdma migration: error dest "
+                                    "registering ram blocks");
+                    goto out;
+                }
+            }
+
+            /*
+             * Dest uses this to prepare to transmit the RAMBlock descriptions
+             * to the source VM after connection setup.
+             * Both sides use the "remote" structure to communicate and update
+             * their "local" descriptions with what was sent.
+             */
+            for (i = 0; i < local->nb_blocks; i++) {
+                rdma->dest_blocks[i].remote_host_addr =
+                    (uintptr_t)(local->block[i].local_host_addr);
+
+                if (rdma->pin_all) {
+                    rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
+                }
+
+                rdma->dest_blocks[i].offset = local->block[i].offset;
+                rdma->dest_blocks[i].length = local->block[i].length;
+
+                dest_block_to_network(&rdma->dest_blocks[i]);
+                trace_qemu_rdma_registration_handle_ram_blocks_loop(
+                    local->block[i].block_name,
+                    local->block[i].offset,
+                    local->block[i].length,
+                    local->block[i].local_host_addr,
+                    local->block[i].src_index);
+            }
+
+            blocks.len = rdma->local_ram_blocks.nb_blocks
+                                                * sizeof(RDMADestBlock);
+
+
+            ret = qemu_rdma_post_send_control(rdma,
+                                        (uint8_t *) rdma->dest_blocks, &blocks);
+
+            if (ret < 0) {
+                error_report("rdma migration: error sending remote info");
+                goto out;
+            }
+
+            break;
+        case RDMA_CONTROL_REGISTER_REQUEST:
+            trace_qemu_rdma_registration_handle_register(head.repeat);
+
+            reg_resp.repeat = head.repeat;
+            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
+
+            for (count = 0; count < head.repeat; count++) {
+                uint64_t chunk;
+                uint8_t *chunk_start, *chunk_end;
+
+                reg = &registers[count];
+                network_to_register(reg);
+
+                reg_result = &results[count];
+
+                trace_qemu_rdma_registration_handle_register_loop(count,
+                         reg->current_index, reg->key.current_addr, reg->chunks);
+
+                if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
+                    error_report("rdma: 'register' bad block index %u (vs %d)",
+                                 (unsigned int)reg->current_index,
+                                 rdma->local_ram_blocks.nb_blocks);
+                    ret = -ENOENT;
+                    goto out;
+                }
+                block = &(rdma->local_ram_blocks.block[reg->current_index]);
+                if (block->is_ram_block) {
+                    if (block->offset > reg->key.current_addr) {
+                        error_report("rdma: bad register address for block %s"
+                            " offset: %" PRIx64 " current_addr: %" PRIx64,
+                            block->block_name, block->offset,
+                            reg->key.current_addr);
+                        ret = -ERANGE;
+                        goto out;
+                    }
+                    host_addr = (block->local_host_addr +
+                                (reg->key.current_addr - block->offset));
+                    chunk = ram_chunk_index(block->local_host_addr,
+                                            (uint8_t *) host_addr);
+                } else {
+                    chunk = reg->key.chunk;
+                    host_addr = block->local_host_addr +
+                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
+                    /* Check for particularly bad chunk value */
+                    if (host_addr < (void *)block->local_host_addr) {
+                        error_report("rdma: bad chunk for block %s"
+                            " chunk: %" PRIx64,
+                            block->block_name, reg->key.chunk);
+                        ret = -ERANGE;
+                        goto out;
+                    }
+                }
+                chunk_start = ram_chunk_start(block, chunk);
+                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
+                if (qemu_rdma_register_and_get_keys(rdma, block,
+                            (uintptr_t)host_addr, NULL, &reg_result->rkey,
+                            chunk, chunk_start, chunk_end)) {
+                    error_report("cannot get rkey");
+                    ret = -EINVAL;
+                    goto out;
+                }
+
+                reg_result->host_addr = (uintptr_t)block->local_host_addr;
+
+                trace_qemu_rdma_registration_handle_register_rkey(
+                                                           reg_result->rkey);
+
+                result_to_network(reg_result);
+            }
+
+            ret = qemu_rdma_post_send_control(rdma,
+                            (uint8_t *) results, &reg_resp);
+
+            if (ret < 0) {
+                error_report("Failed to send control buffer");
+                goto out;
+            }
+            break;
+        case RDMA_CONTROL_UNREGISTER_REQUEST:
+            trace_qemu_rdma_registration_handle_unregister(head.repeat);
+            unreg_resp.repeat = head.repeat;
+            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
+
+            for (count = 0; count < head.repeat; count++) {
+                reg = &registers[count];
+                network_to_register(reg);
+
+                trace_qemu_rdma_registration_handle_unregister_loop(count,
+                           reg->current_index, reg->key.chunk);
+
+                block = &(rdma->local_ram_blocks.block[reg->current_index]);
+
+                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
+                block->pmr[reg->key.chunk] = NULL;
+
+                if (ret != 0) {
+                    perror("rdma unregistration chunk failed");
+                    ret = -ret;
+                    goto out;
+                }
+
+                rdma->total_registrations--;
+
+                trace_qemu_rdma_registration_handle_unregister_success(
+                                                       reg->key.chunk);
+            }
+
+            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
+
+            if (ret < 0) {
+                error_report("Failed to send control buffer");
+                goto out;
+            }
+            break;
+        case RDMA_CONTROL_REGISTER_RESULT:
+            
error_report("Invalid RESULT message at dest."); +            ret = -EIO; +            goto out; +        default: +            error_report("Unknown control message %s", control_desc[head.type]); +            ret = -EIO; +            goto out; +        } +    } while (1); +out: +    if (ret < 0) { +        rdma->error_state = ret; +    } +    return ret; +} + +/* Destination: + * Called via a ram_control_load_hook during the initial RAM load section which + * lists the RAMBlocks by name.  This lets us know the order of the RAMBlocks + * on the source. + * We've already built our local RAMBlock list, but not yet sent the list to + * the source. + */ +static int rdma_block_notification_handle(QEMUFileRDMA *rfile, const char *name) +{ +    RDMAContext *rdma = rfile->rdma; +    int curr; +    int found = -1; + +    /* Find the matching RAMBlock in our local list */ +    for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) { +        if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) { +            found = curr; +            break; +        } +    } + +    if (found == -1) { +        error_report("RAMBlock '%s' not found on destination", name); +        return -ENOENT; +    } + +    rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index; +    trace_rdma_block_notification_handle(name, rdma->next_src_index); +    rdma->next_src_index++; + +    return 0; +} + +static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) +{ +    switch (flags) { +    case RAM_CONTROL_BLOCK_REG: +        return rdma_block_notification_handle(opaque, data); + +    case RAM_CONTROL_HOOK: +        return qemu_rdma_registration_handle(f, opaque); + +    default: +        /* Shouldn't be called with any other values */ +        abort(); +    } +} + +static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, +                                        uint64_t flags, void *data) +{ +    QEMUFileRDMA *rfile = opaque; +    RDMAContext *rdma = rfile->rdma; + +    CHECK_ERROR_STATE(); + +    trace_qemu_rdma_registration_start(flags); +    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK); +    qemu_fflush(f); + +    return 0; +} + +/* + * Inform dest that dynamic registrations are done for now. + * First, flush writes, if any. + */ +static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, +                                       uint64_t flags, void *data) +{ +    Error *local_err = NULL, **errp = &local_err; +    QEMUFileRDMA *rfile = opaque; +    RDMAContext *rdma = rfile->rdma; +    RDMAControlHeader head = { .len = 0, .repeat = 1 }; +    int ret = 0; + +    CHECK_ERROR_STATE(); + +    qemu_fflush(f); +    ret = qemu_rdma_drain_cq(f, rdma); + +    if (ret < 0) { +        goto err; +    } + +    if (flags == RAM_CONTROL_SETUP) { +        RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT }; +        RDMALocalBlocks *local = &rdma->local_ram_blocks; +        int reg_result_idx, i, nb_dest_blocks; + +        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST; +        trace_qemu_rdma_registration_stop_ram(); + +        /* +         * Make sure that we parallelize the pinning on both sides. +         * For very large guests, doing this serially takes a really +         * long time, so we have to 'interleave' the pinning locally +         * with the control messages by performing the pinning on this +         * side before we receive the control response from the other +         * side that the pinning has completed. 
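+         *
+         * That is what the callback argument to the exchange below
+         * achieves, as a sketch:
+         *
+         *     qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
+         *                             &reg_result_idx,
+         *                             qemu_rdma_reg_whole_ram_blocks);
+         *     // the callback pins local RAM while the request is in flight
+         *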
+         */
+        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
+                    &reg_result_idx, rdma->pin_all ?
+                    qemu_rdma_reg_whole_ram_blocks : NULL);
+        if (ret < 0) {
+            ERROR(errp, "receiving remote info!");
+            return ret;
+        }
+
+        nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
+
+        /*
+         * The protocol uses two different sets of rkeys (mutually exclusive):
+         * 1. One key to represent the virtual address of the entire ram block.
+         *    (dynamic chunk registration disabled - pin everything with one rkey.)
+         * 2. One to represent individual chunks within a ram block.
+         *    (dynamic chunk registration enabled - pin individual chunks.)
+         *
+         * Once the capability is successfully negotiated, the destination transmits
+         * the keys to use (or sends them later) including the virtual addresses
+         * and then propagates the remote ram block descriptions to its local copy.
+         */
+
+        if (local->nb_blocks != nb_dest_blocks) {
+            ERROR(errp, "ram blocks mismatch (number of blocks %d vs %d); "
+                        "your QEMU command line parameters are probably "
+                        "not identical on both the source and destination.",
+                        local->nb_blocks, nb_dest_blocks);
+            rdma->error_state = -EINVAL;
+            return -EINVAL;
+        }
+
+        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
+        memcpy(rdma->dest_blocks,
+            rdma->wr_data[reg_result_idx].control_curr, resp.len);
+        for (i = 0; i < nb_dest_blocks; i++) {
+            network_to_dest_block(&rdma->dest_blocks[i]);
+
+            /* We require that the blocks are in the same order */
+            if (rdma->dest_blocks[i].length != local->block[i].length) {
+                ERROR(errp, "Block %s/%d has a different length %" PRIu64
+                            " vs %" PRIu64, local->block[i].block_name, i,
+                            local->block[i].length,
+                            rdma->dest_blocks[i].length);
+                rdma->error_state = -EINVAL;
+                return -EINVAL;
+            }
+            local->block[i].remote_host_addr =
+                    rdma->dest_blocks[i].remote_host_addr;
+            local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
+        }
+    }
+
+    trace_qemu_rdma_registration_stop(flags);
+
+    head.type = RDMA_CONTROL_REGISTER_FINISHED;
+    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
+
+    if (ret < 0) {
+        goto err;
+    }
+
+    return 0;
+err:
+    rdma->error_state = ret;
+    return ret;
+}
+
+static int qemu_rdma_get_fd(void *opaque)
+{
+    QEMUFileRDMA *rfile = opaque;
+    RDMAContext *rdma = rfile->rdma;
+
+    return rdma->comp_channel->fd;
+}
+
+static const QEMUFileOps rdma_read_ops = {
+    .get_buffer    = qemu_rdma_get_buffer,
+    .get_fd        = qemu_rdma_get_fd,
+    .close         = qemu_rdma_close,
+    .hook_ram_load = rdma_load_hook,
+};
+
+static const QEMUFileOps rdma_write_ops = {
+    .put_buffer         = qemu_rdma_put_buffer,
+    .close              = qemu_rdma_close,
+    .before_ram_iterate = qemu_rdma_registration_start,
+    .after_ram_iterate  = qemu_rdma_registration_stop,
+    .save_page          = qemu_rdma_save_page,
+};
+
+static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
+{
+    QEMUFileRDMA *r;
+
+    if 
(qemu_file_mode_is_not_valid(mode)) { +        return NULL; +    } + +    r = g_malloc0(sizeof(QEMUFileRDMA)); +    r->rdma = rdma; + +    if (mode[0] == 'w') { +        r->file = qemu_fopen_ops(r, &rdma_write_ops); +    } else { +        r->file = qemu_fopen_ops(r, &rdma_read_ops); +    } + +    return r->file; +} + +static void rdma_accept_incoming_migration(void *opaque) +{ +    RDMAContext *rdma = opaque; +    int ret; +    QEMUFile *f; +    Error *local_err = NULL, **errp = &local_err; + +    trace_qemu_rdma_accept_incoming_migration(); +    ret = qemu_rdma_accept(rdma); + +    if (ret) { +        ERROR(errp, "RDMA Migration initialization failed!"); +        return; +    } + +    trace_qemu_rdma_accept_incoming_migration_accepted(); + +    f = qemu_fopen_rdma(rdma, "rb"); +    if (f == NULL) { +        ERROR(errp, "could not qemu_fopen_rdma!"); +        qemu_rdma_cleanup(rdma); +        return; +    } + +    rdma->migration_started_on_destination = 1; +    process_incoming_migration(f); +} + +void rdma_start_incoming_migration(const char *host_port, Error **errp) +{ +    int ret; +    RDMAContext *rdma; +    Error *local_err = NULL; + +    trace_rdma_start_incoming_migration(); +    rdma = qemu_rdma_data_init(host_port, &local_err); + +    if (rdma == NULL) { +        goto err; +    } + +    ret = qemu_rdma_dest_init(rdma, &local_err); + +    if (ret) { +        goto err; +    } + +    trace_rdma_start_incoming_migration_after_dest_init(); + +    ret = rdma_listen(rdma->listen_id, 5); + +    if (ret) { +        ERROR(errp, "listening on socket!"); +        goto err; +    } + +    trace_rdma_start_incoming_migration_after_rdma_listen(); + +    qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, +                        NULL, (void *)(intptr_t)rdma); +    return; +err: +    error_propagate(errp, local_err); +    g_free(rdma); +} + +void rdma_start_outgoing_migration(void *opaque, +                            const char *host_port, Error **errp) +{ +    MigrationState *s = opaque; +    Error *local_err = NULL, **temp = &local_err; +    RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err); +    int ret = 0; + +    if (rdma == NULL) { +        ERROR(temp, "Failed to initialize RDMA data structures! 
%d", ret); +        goto err; +    } + +    ret = qemu_rdma_source_init(rdma, &local_err, +        s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]); + +    if (ret) { +        goto err; +    } + +    trace_rdma_start_outgoing_migration_after_rdma_source_init(); +    ret = qemu_rdma_connect(rdma, &local_err); + +    if (ret) { +        goto err; +    } + +    trace_rdma_start_outgoing_migration_after_rdma_connect(); + +    s->file = qemu_fopen_rdma(rdma, "wb"); +    migrate_fd_connect(s); +    return; +err: +    error_propagate(errp, local_err); +    g_free(rdma); +    migrate_fd_error(s); +} diff --git a/migration/savevm.c b/migration/savevm.c new file mode 100644 index 00000000..60712153 --- /dev/null +++ b/migration/savevm.c @@ -0,0 +1,1605 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2009-2015 Red Hat Inc + * + * Authors: + *  Juan Quintela <quintela@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config-host.h" +#include "qemu-common.h" +#include "hw/boards.h" +#include "hw/hw.h" +#include "hw/qdev.h" +#include "net/net.h" +#include "monitor/monitor.h" +#include "sysemu/sysemu.h" +#include "qemu/timer.h" +#include "audio/audio.h" +#include "migration/migration.h" +#include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "qemu/queue.h" +#include "sysemu/cpus.h" +#include "exec/memory.h" +#include "qmp-commands.h" +#include "trace.h" +#include "qemu/iov.h" +#include "block/snapshot.h" +#include "block/qapi.h" + + +#ifndef ETH_P_RARP +#define ETH_P_RARP 0x8035 +#endif +#define ARP_HTYPE_ETH 0x0001 +#define ARP_PTYPE_IP 0x0800 +#define ARP_OP_REQUEST_REV 0x3 + +static bool skip_section_footers; + +static int announce_self_create(uint8_t *buf, +                                uint8_t *mac_addr) +{ +    /* Ethernet header. */ +    memset(buf, 0xff, 6);         /* destination MAC addr */ +    memcpy(buf + 6, mac_addr, 6); /* source MAC addr */ +    *(uint16_t *)(buf + 12) = htons(ETH_P_RARP); /* ethertype */ + +    /* RARP header. 
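+     * Byte layout written below, as a summary sketch (offsets in bytes):
+     *   14-15 hardware addr space    16-17 protocol addr space
+     *   18    hw addr length (6)     19    protocol addr length (4)
+     *   20-21 opcode (request-rev)   22-27 source hw addr
+     *   28-31 source protocol addr   32-37 target hw addr
+     *   38-41 target protocol addr   42-59 padding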
*/ +    *(uint16_t *)(buf + 14) = htons(ARP_HTYPE_ETH); /* hardware addr space */ +    *(uint16_t *)(buf + 16) = htons(ARP_PTYPE_IP); /* protocol addr space */ +    *(buf + 18) = 6; /* hardware addr length (ethernet) */ +    *(buf + 19) = 4; /* protocol addr length (IPv4) */ +    *(uint16_t *)(buf + 20) = htons(ARP_OP_REQUEST_REV); /* opcode */ +    memcpy(buf + 22, mac_addr, 6); /* source hw addr */ +    memset(buf + 28, 0x00, 4);     /* source protocol addr */ +    memcpy(buf + 32, mac_addr, 6); /* target hw addr */ +    memset(buf + 38, 0x00, 4);     /* target protocol addr */ + +    /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */ +    memset(buf + 42, 0x00, 18); + +    return 60; /* len (FCS will be added by hardware) */ +} + +static void qemu_announce_self_iter(NICState *nic, void *opaque) +{ +    uint8_t buf[60]; +    int len; + +    trace_qemu_announce_self_iter(qemu_ether_ntoa(&nic->conf->macaddr)); +    len = announce_self_create(buf, nic->conf->macaddr.a); + +    qemu_send_packet_raw(qemu_get_queue(nic), buf, len); +} + + +static void qemu_announce_self_once(void *opaque) +{ +    static int count = SELF_ANNOUNCE_ROUNDS; +    QEMUTimer *timer = *(QEMUTimer **)opaque; + +    qemu_foreach_nic(qemu_announce_self_iter, NULL); + +    if (--count) { +        /* delay 50ms, 150ms, 250ms, ... */ +        timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + +                  self_announce_delay(count)); +    } else { +            timer_del(timer); +            timer_free(timer); +    } +} + +void qemu_announce_self(void) +{ +    static QEMUTimer *timer; +    timer = timer_new_ms(QEMU_CLOCK_REALTIME, qemu_announce_self_once, &timer); +    qemu_announce_self_once(&timer); +} + +/***********************************************************/ +/* savevm/loadvm support */ + +static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, +                                   int64_t pos) +{ +    int ret; +    QEMUIOVector qiov; + +    qemu_iovec_init_external(&qiov, iov, iovcnt); +    ret = bdrv_writev_vmstate(opaque, &qiov, pos); +    if (ret < 0) { +        return ret; +    } + +    return qiov.size; +} + +static int block_put_buffer(void *opaque, const uint8_t *buf, +                           int64_t pos, int size) +{ +    bdrv_save_vmstate(opaque, buf, pos, size); +    return size; +} + +static int block_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) +{ +    return bdrv_load_vmstate(opaque, buf, pos, size); +} + +static int bdrv_fclose(void *opaque) +{ +    return bdrv_flush(opaque); +} + +static const QEMUFileOps bdrv_read_ops = { +    .get_buffer = block_get_buffer, +    .close =      bdrv_fclose +}; + +static const QEMUFileOps bdrv_write_ops = { +    .put_buffer     = block_put_buffer, +    .writev_buffer  = block_writev_buffer, +    .close          = bdrv_fclose +}; + +static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable) +{ +    if (is_writable) { +        return qemu_fopen_ops(bs, &bdrv_write_ops); +    } +    return qemu_fopen_ops(bs, &bdrv_read_ops); +} + + +/* QEMUFile timer support. 
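+ * On the wire a timer is a single big-endian 64-bit expire time, where
+ * (uint64_t)-1 means "not pending"; e.g., as a sketch:
+ *
+ *     timer_put(f, ts);  // qemu_put_be64(f, timer_expire_time_ns(ts))
+ *     timer_get(f, ts);  // -1 -> timer_del(ts); else timer_mod_ns(ts, t)
+ *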
+ * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c + */ + +void timer_put(QEMUFile *f, QEMUTimer *ts) +{ +    uint64_t expire_time; + +    expire_time = timer_expire_time_ns(ts); +    qemu_put_be64(f, expire_time); +} + +void timer_get(QEMUFile *f, QEMUTimer *ts) +{ +    uint64_t expire_time; + +    expire_time = qemu_get_be64(f); +    if (expire_time != -1) { +        timer_mod_ns(ts, expire_time); +    } else { +        timer_del(ts); +    } +} + + +/* VMState timer support. + * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c + */ + +static int get_timer(QEMUFile *f, void *pv, size_t size) +{ +    QEMUTimer *v = pv; +    timer_get(f, v); +    return 0; +} + +static void put_timer(QEMUFile *f, void *pv, size_t size) +{ +    QEMUTimer *v = pv; +    timer_put(f, v); +} + +const VMStateInfo vmstate_info_timer = { +    .name = "timer", +    .get  = get_timer, +    .put  = put_timer, +}; + + +typedef struct CompatEntry { +    char idstr[256]; +    int instance_id; +} CompatEntry; + +typedef struct SaveStateEntry { +    QTAILQ_ENTRY(SaveStateEntry) entry; +    char idstr[256]; +    int instance_id; +    int alias_id; +    int version_id; +    int section_id; +    SaveVMHandlers *ops; +    const VMStateDescription *vmsd; +    void *opaque; +    CompatEntry *compat; +    int is_ram; +} SaveStateEntry; + +typedef struct SaveState { +    QTAILQ_HEAD(, SaveStateEntry) handlers; +    int global_section_id; +    bool skip_configuration; +    uint32_t len; +    const char *name; +} SaveState; + +static SaveState savevm_state = { +    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), +    .global_section_id = 0, +    .skip_configuration = false, +}; + +void savevm_skip_configuration(void) +{ +    savevm_state.skip_configuration = true; +} + + +static void configuration_pre_save(void *opaque) +{ +    SaveState *state = opaque; +    const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + +    state->len = strlen(current_name); +    state->name = current_name; +} + +static int configuration_post_load(void *opaque, int version_id) +{ +    SaveState *state = opaque; +    const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + +    if (strncmp(state->name, current_name, state->len) != 0) { +        error_report("Machine type received is '%s' and local is '%s'", +                     state->name, current_name); +        return -EINVAL; +    } +    return 0; +} + +static const VMStateDescription vmstate_configuration = { +    .name = "configuration", +    .version_id = 1, +    .post_load = configuration_post_load, +    .pre_save = configuration_pre_save, +    .fields = (VMStateField[]) { +        VMSTATE_UINT32(len, SaveState), +        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len), +        VMSTATE_END_OF_LIST() +    }, +}; + +static void dump_vmstate_vmsd(FILE *out_file, +                              const VMStateDescription *vmsd, int indent, +                              bool is_subsection); + +static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field, +                              int indent) +{ +    fprintf(out_file, "%*s{\n", indent, ""); +    indent += 2; +    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name); +    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "", +            field->version_id); +    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "", +            field->field_exists ? 
"true" : "false"); +    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size); +    if (field->vmsd != NULL) { +        fprintf(out_file, ",\n"); +        dump_vmstate_vmsd(out_file, field->vmsd, indent, false); +    } +    fprintf(out_file, "\n%*s}", indent - 2, ""); +} + +static void dump_vmstate_vmss(FILE *out_file, +                              const VMStateDescription **subsection, +                              int indent) +{ +    if (*subsection != NULL) { +        dump_vmstate_vmsd(out_file, *subsection, indent, true); +    } +} + +static void dump_vmstate_vmsd(FILE *out_file, +                              const VMStateDescription *vmsd, int indent, +                              bool is_subsection) +{ +    if (is_subsection) { +        fprintf(out_file, "%*s{\n", indent, ""); +    } else { +        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description"); +    } +    indent += 2; +    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name); +    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "", +            vmsd->version_id); +    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "", +            vmsd->minimum_version_id); +    if (vmsd->fields != NULL) { +        const VMStateField *field = vmsd->fields; +        bool first; + +        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, ""); +        first = true; +        while (field->name != NULL) { +            if (field->flags & VMS_MUST_EXIST) { +                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */ +                field++; +                continue; +            } +            if (!first) { +                fprintf(out_file, ",\n"); +            } +            dump_vmstate_vmsf(out_file, field, indent + 2); +            field++; +            first = false; +        } +        fprintf(out_file, "\n%*s]", indent, ""); +    } +    if (vmsd->subsections != NULL) { +        const VMStateDescription **subsection = vmsd->subsections; +        bool first; + +        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, ""); +        first = true; +        while (*subsection != NULL) { +            if (!first) { +                fprintf(out_file, ",\n"); +            } +            dump_vmstate_vmss(out_file, subsection, indent + 2); +            subsection++; +            first = false; +        } +        fprintf(out_file, "\n%*s]", indent, ""); +    } +    fprintf(out_file, "\n%*s}", indent - 2, ""); +} + +static void dump_machine_type(FILE *out_file) +{ +    MachineClass *mc; + +    mc = MACHINE_GET_CLASS(current_machine); + +    fprintf(out_file, "  \"vmschkmachine\": {\n"); +    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name); +    fprintf(out_file, "  },\n"); +} + +void dump_vmstate_json_to_file(FILE *out_file) +{ +    GSList *list, *elt; +    bool first; + +    fprintf(out_file, "{\n"); +    dump_machine_type(out_file); + +    first = true; +    list = object_class_get_list(TYPE_DEVICE, true); +    for (elt = list; elt; elt = elt->next) { +        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data, +                                             TYPE_DEVICE); +        const char *name; +        int indent = 2; + +        if (!dc->vmsd) { +            continue; +        } + +        if (!first) { +            fprintf(out_file, ",\n"); +        } +        name = object_class_get_name(OBJECT_CLASS(dc)); +        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name); +        indent += 2; +        fprintf(out_file, "%*s\"Name\": \"%s\",\n", 
indent, "", name);
+        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
+                dc->vmsd->version_id);
+        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
+                dc->vmsd->minimum_version_id);
+
+        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
+
+        fprintf(out_file, "\n%*s}", indent - 2, "");
+        first = false;
+    }
+    fprintf(out_file, "\n}\n");
+    fclose(out_file);
+}
+
+static int calculate_new_instance_id(const char *idstr)
+{
+    SaveStateEntry *se;
+    int instance_id = 0;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (strcmp(idstr, se->idstr) == 0
+            && instance_id <= se->instance_id) {
+            instance_id = se->instance_id + 1;
+        }
+    }
+    return instance_id;
+}
+
+static int calculate_compat_instance_id(const char *idstr)
+{
+    SaveStateEntry *se;
+    int instance_id = 0;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->compat) {
+            continue;
+        }
+
+        if (strcmp(idstr, se->compat->idstr) == 0
+            && instance_id <= se->compat->instance_id) {
+            instance_id = se->compat->instance_id + 1;
+        }
+    }
+    return instance_id;
+}
+
+/* TODO: Individual devices generally have very little idea about the rest
+   of the system, so instance_id should be removed/replaced.
+   Meanwhile pass -1 as instance_id if you do not already have a clearly
+   distinguishing id for all instances of your device class. */
+int register_savevm_live(DeviceState *dev,
+                         const char *idstr,
+                         int instance_id,
+                         int version_id,
+                         SaveVMHandlers *ops,
+                         void *opaque)
+{
+    SaveStateEntry *se;
+
+    se = g_malloc0(sizeof(SaveStateEntry));
+    se->version_id = version_id;
+    se->section_id = savevm_state.global_section_id++;
+    se->ops = ops;
+    se->opaque = opaque;
+    se->vmsd = NULL;
+    /* if this is a live savevm handler then set is_ram */
+    if (ops->save_live_setup != NULL) {
+        se->is_ram = 1;
+    }
+
+    if (dev) {
+        char *id = qdev_get_dev_path(dev);
+        if (id) {
+            pstrcpy(se->idstr, sizeof(se->idstr), id);
+            pstrcat(se->idstr, sizeof(se->idstr), "/");
+            g_free(id);
+
+            se->compat = g_malloc0(sizeof(CompatEntry));
+            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr);
+            se->compat->instance_id = instance_id == -1 ?
+                         calculate_compat_instance_id(idstr) : instance_id; +            instance_id = -1; +        } +    } +    pstrcat(se->idstr, sizeof(se->idstr), idstr); + +    if (instance_id == -1) { +        se->instance_id = calculate_new_instance_id(se->idstr); +    } else { +        se->instance_id = instance_id; +    } +    assert(!se->compat || se->instance_id == 0); +    /* add at the end of list */ +    QTAILQ_INSERT_TAIL(&savevm_state.handlers, se, entry); +    return 0; +} + +int register_savevm(DeviceState *dev, +                    const char *idstr, +                    int instance_id, +                    int version_id, +                    SaveStateHandler *save_state, +                    LoadStateHandler *load_state, +                    void *opaque) +{ +    SaveVMHandlers *ops = g_malloc0(sizeof(SaveVMHandlers)); +    ops->save_state = save_state; +    ops->load_state = load_state; +    return register_savevm_live(dev, idstr, instance_id, version_id, +                                ops, opaque); +} + +void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque) +{ +    SaveStateEntry *se, *new_se; +    char id[256] = ""; + +    if (dev) { +        char *path = qdev_get_dev_path(dev); +        if (path) { +            pstrcpy(id, sizeof(id), path); +            pstrcat(id, sizeof(id), "/"); +            g_free(path); +        } +    } +    pstrcat(id, sizeof(id), idstr); + +    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) { +        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) { +            QTAILQ_REMOVE(&savevm_state.handlers, se, entry); +            if (se->compat) { +                g_free(se->compat); +            } +            g_free(se->ops); +            g_free(se); +        } +    } +} + +int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, +                                   const VMStateDescription *vmsd, +                                   void *opaque, int alias_id, +                                   int required_for_version) +{ +    SaveStateEntry *se; + +    /* If this triggers, alias support can be dropped for the vmsd. */ +    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id); + +    se = g_malloc0(sizeof(SaveStateEntry)); +    se->version_id = vmsd->version_id; +    se->section_id = savevm_state.global_section_id++; +    se->opaque = opaque; +    se->vmsd = vmsd; +    se->alias_id = alias_id; + +    if (dev) { +        char *id = qdev_get_dev_path(dev); +        if (id) { +            pstrcpy(se->idstr, sizeof(se->idstr), id); +            pstrcat(se->idstr, sizeof(se->idstr), "/"); +            g_free(id); + +            se->compat = g_malloc0(sizeof(CompatEntry)); +            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name); +            se->compat->instance_id = instance_id == -1 ? 
+                         calculate_compat_instance_id(vmsd->name) : instance_id;
+            instance_id = -1;
+        }
+    }
+    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
+
+    if (instance_id == -1) {
+        se->instance_id = calculate_new_instance_id(se->idstr);
+    } else {
+        se->instance_id = instance_id;
+    }
+    assert(!se->compat || se->instance_id == 0);
+    /* add at the end of list */
+    QTAILQ_INSERT_TAIL(&savevm_state.handlers, se, entry);
+    return 0;
+}
+
+void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd,
+                        void *opaque)
+{
+    SaveStateEntry *se, *new_se;
+
+    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
+        if (se->vmsd == vmsd && se->opaque == opaque) {
+            QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
+            if (se->compat) {
+                g_free(se->compat);
+            }
+            g_free(se);
+        }
+    }
+}
+
+static int vmstate_load(QEMUFile *f, SaveStateEntry *se, int version_id)
+{
+    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
+    if (!se->vmsd) {         /* Old style */
+        return se->ops->load_state(f, se->opaque, version_id);
+    }
+    return vmstate_load_state(f, se->vmsd, se->opaque, version_id);
+}
+
+static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
+{
+    int64_t old_offset, size;
+
+    old_offset = qemu_ftell_fast(f);
+    se->ops->save_state(f, se->opaque);
+    size = qemu_ftell_fast(f) - old_offset;
+
+    if (vmdesc) {
+        json_prop_int(vmdesc, "size", size);
+        json_start_array(vmdesc, "fields");
+        json_start_object(vmdesc, NULL);
+        json_prop_str(vmdesc, "name", "data");
+        json_prop_int(vmdesc, "size", size);
+        json_prop_str(vmdesc, "type", "buffer");
+        json_end_object(vmdesc);
+        json_end_array(vmdesc);
+    }
+}
+
+static void vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
+{
+    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
+    if (!se->vmsd) {
+        vmstate_save_old_style(f, se, vmdesc);
+        return;
+    }
+    vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
+}
+
+void savevm_skip_section_footers(void)
+{
+    skip_section_footers = true;
+}
+
+/*
+ * Write the header for a device section (QEMU_VM_SECTION START/END/PART/FULL)
+ */
+static void save_section_header(QEMUFile *f, SaveStateEntry *se,
+                                uint8_t section_type)
+{
+    qemu_put_byte(f, section_type);
+    qemu_put_be32(f, se->section_id);
+
+    if (section_type == QEMU_VM_SECTION_FULL ||
+        section_type == QEMU_VM_SECTION_START) {
+        /* ID string */
+        size_t len = strlen(se->idstr);
+        qemu_put_byte(f, len);
+        qemu_put_buffer(f, (uint8_t *)se->idstr, len);
+
+        qemu_put_be32(f, se->instance_id);
+        qemu_put_be32(f, se->version_id);
+    }
+}
+
+/*
+ * Write a footer onto device sections that catches cases of misformatted
+ * device sections.
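+ *
+ * On the wire the footer is simply (a sketch of the layout):
+ *
+ *     u8  QEMU_VM_SECTION_FOOTER
+ *     u32 section_id   (big-endian, must match the section's header)
+ *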
+ */
+static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
+{
+    if (!skip_section_footers) {
+        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
+        qemu_put_be32(f, se->section_id);
+    }
+}
+
+bool qemu_savevm_state_blocked(Error **errp)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (se->vmsd && se->vmsd->unmigratable) {
+            error_setg(errp, "State blocked by non-migratable device '%s'",
+                       se->idstr);
+            return true;
+        }
+    }
+    return false;
+}
+
+void qemu_savevm_state_header(QEMUFile *f)
+{
+    trace_savevm_state_header();
+    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
+    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
+}
+
+void qemu_savevm_state_begin(QEMUFile *f,
+                             const MigrationParams *params)
+{
+    SaveStateEntry *se;
+    int ret;
+
+    trace_savevm_state_begin();
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || !se->ops->set_params) {
+            continue;
+        }
+        se->ops->set_params(params, se->opaque);
+    }
+
+    if (!savevm_state.skip_configuration) {
+        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
+        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
+    }
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || !se->ops->save_live_setup) {
+            continue;
+        }
+        if (se->ops && se->ops->is_active) {
+            if (!se->ops->is_active(se->opaque)) {
+                continue;
+            }
+        }
+        save_section_header(f, se, QEMU_VM_SECTION_START);
+
+        ret = se->ops->save_live_setup(f, se->opaque);
+        save_section_footer(f, se);
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+            break;
+        }
+    }
+}
+
+/*
+ * this function has three return values:
+ *   negative: there was an error, and we have -errno.
+ *   0 : We haven't finished, the caller has to go again
+ *   1 : We have finished, we can go to the complete phase
+ */
+int qemu_savevm_state_iterate(QEMUFile *f)
+{
+    SaveStateEntry *se;
+    int ret = 1;
+
+    trace_savevm_state_iterate();
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || !se->ops->save_live_iterate) {
+            continue;
+        }
+        if (se->ops && se->ops->is_active) {
+            if (!se->ops->is_active(se->opaque)) {
+                continue;
+            }
+        }
+        if (qemu_file_rate_limit(f)) {
+            return 0;
+        }
+        trace_savevm_section_start(se->idstr, se->section_id);
+
+        save_section_header(f, se, QEMU_VM_SECTION_PART);
+
+        ret = se->ops->save_live_iterate(f, se->opaque);
+        trace_savevm_section_end(se->idstr, se->section_id, ret);
+        save_section_footer(f, se);
+
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+        }
+        if (ret <= 0) {
+            /* Do not proceed to the next vmstate before this one reported
+               completion of the current stage. This serializes the migration
+               and reduces the probability that a faster changing state is
+               synchronized over and over again.
*/ +            break; +        } +    } +    return ret; +} + +static bool should_send_vmdesc(void) +{ +    MachineState *machine = MACHINE(qdev_get_machine()); +    return !machine->suppress_vmdesc; +} + +void qemu_savevm_state_complete(QEMUFile *f) +{ +    QJSON *vmdesc; +    int vmdesc_len; +    SaveStateEntry *se; +    int ret; + +    trace_savevm_state_complete(); + +    cpu_synchronize_all_states(); + +    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { +        if (!se->ops || !se->ops->save_live_complete) { +            continue; +        } +        if (se->ops && se->ops->is_active) { +            if (!se->ops->is_active(se->opaque)) { +                continue; +            } +        } +        trace_savevm_section_start(se->idstr, se->section_id); + +        save_section_header(f, se, QEMU_VM_SECTION_END); + +        ret = se->ops->save_live_complete(f, se->opaque); +        trace_savevm_section_end(se->idstr, se->section_id, ret); +        save_section_footer(f, se); +        if (ret < 0) { +            qemu_file_set_error(f, ret); +            return; +        } +    } + +    vmdesc = qjson_new(); +    json_prop_int(vmdesc, "page_size", TARGET_PAGE_SIZE); +    json_start_array(vmdesc, "devices"); +    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + +        if ((!se->ops || !se->ops->save_state) && !se->vmsd) { +            continue; +        } +        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { +            trace_savevm_section_skip(se->idstr, se->section_id); +            continue; +        } + +        trace_savevm_section_start(se->idstr, se->section_id); + +        json_start_object(vmdesc, NULL); +        json_prop_str(vmdesc, "name", se->idstr); +        json_prop_int(vmdesc, "instance_id", se->instance_id); + +        save_section_header(f, se, QEMU_VM_SECTION_FULL); + +        vmstate_save(f, se, vmdesc); + +        json_end_object(vmdesc); +        trace_savevm_section_end(se->idstr, se->section_id, 0); +        save_section_footer(f, se); +    } + +    qemu_put_byte(f, QEMU_VM_EOF); + +    json_end_array(vmdesc); +    qjson_finish(vmdesc); +    vmdesc_len = strlen(qjson_get_str(vmdesc)); + +    if (should_send_vmdesc()) { +        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION); +        qemu_put_be32(f, vmdesc_len); +        qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len); +    } +    object_unref(OBJECT(vmdesc)); + +    qemu_fflush(f); +} + +uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size) +{ +    SaveStateEntry *se; +    uint64_t ret = 0; + +    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { +        if (!se->ops || !se->ops->save_live_pending) { +            continue; +        } +        if (se->ops && se->ops->is_active) { +            if (!se->ops->is_active(se->opaque)) { +                continue; +            } +        } +        ret += se->ops->save_live_pending(f, se->opaque, max_size); +    } +    return ret; +} + +void qemu_savevm_state_cancel(void) +{ +    SaveStateEntry *se; + +    trace_savevm_state_cancel(); +    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { +        if (se->ops && se->ops->cancel) { +            se->ops->cancel(se->opaque); +        } +    } +} + +static int qemu_savevm_state(QEMUFile *f, Error **errp) +{ +    int ret; +    MigrationParams params = { +        .blk = 0, +        .shared = 0 +    }; + +    if (qemu_savevm_state_blocked(errp)) { +        return -EINVAL; +    } + +    qemu_mutex_unlock_iothread(); +    qemu_savevm_state_header(f); +    
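+    /* Overall save sequence, as wired up just below: write the stream
+     * header, emit one SECTION_START round via _begin(), loop on
+     * SECTION_PART rounds via _iterate() until every handler reports
+     * completion, then emit the SECTION_END/FULL records in _complete(). */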
qemu_savevm_state_begin(f, &params);
+    qemu_mutex_lock_iothread();
+
+    while (qemu_file_get_error(f) == 0) {
+        if (qemu_savevm_state_iterate(f) > 0) {
+            break;
+        }
+    }
+
+    ret = qemu_file_get_error(f);
+    if (ret == 0) {
+        qemu_savevm_state_complete(f);
+        ret = qemu_file_get_error(f);
+    }
+    if (ret != 0) {
+        qemu_savevm_state_cancel();
+        error_setg_errno(errp, -ret, "Error while writing VM state");
+    }
+    return ret;
+}
+
+static int qemu_save_device_state(QEMUFile *f)
+{
+    SaveStateEntry *se;
+
+    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
+    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
+
+    cpu_synchronize_all_states();
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (se->is_ram) {
+            continue;
+        }
+        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
+            continue;
+        }
+        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
+            continue;
+        }
+
+        save_section_header(f, se, QEMU_VM_SECTION_FULL);
+
+        vmstate_save(f, se, NULL);
+
+        save_section_footer(f, se);
+    }
+
+    qemu_put_byte(f, QEMU_VM_EOF);
+
+    return qemu_file_get_error(f);
+}
+
+static SaveStateEntry *find_se(const char *idstr, int instance_id)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!strcmp(se->idstr, idstr) &&
+            (instance_id == se->instance_id ||
+             instance_id == se->alias_id))
+            return se;
+        /* Migrating from an older version? */
+        if (strstr(se->idstr, idstr) && se->compat) {
+            if (!strcmp(se->compat->idstr, idstr) &&
+                (instance_id == se->compat->instance_id ||
+                 instance_id == se->alias_id))
+                return se;
+        }
+    }
+    return NULL;
+}
+
+struct LoadStateEntry {
+    QLIST_ENTRY(LoadStateEntry) entry;
+    SaveStateEntry *se;
+    int section_id;
+    int version_id;
+};
+
+/*
+ * Read a footer off the wire and check that it matches the expected section
+ *
+ * Returns: true if the footer was good
+ *          false if there is a problem (and calls error_report to say why)
+ */
+static bool check_section_footer(QEMUFile *f, LoadStateEntry *le)
+{
+    uint8_t read_mark;
+    uint32_t read_section_id;
+
+    if (skip_section_footers) {
+        /* No footer to check */
+        return true;
+    }
+
+    read_mark = qemu_get_byte(f);
+
+    if (read_mark != QEMU_VM_SECTION_FOOTER) {
+        error_report("Missing section footer for %s", le->se->idstr);
+        return false;
+    }
+
+    read_section_id = qemu_get_be32(f);
+    if (read_section_id != le->section_id) {
+        error_report("Mismatched section id in footer for %s -"
+                     " read 0x%x expected 0x%x",
+                     le->se->idstr, read_section_id, le->section_id);
+        return false;
+    }
+
+    /* All good */
+    return true;
+}
+
+void loadvm_free_handlers(MigrationIncomingState *mis)
+{
+    LoadStateEntry *le, *new_le;
+
+    QLIST_FOREACH_SAFE(le, &mis->loadvm_handlers, entry, new_le) {
+        QLIST_REMOVE(le, entry);
+        g_free(le);
+    }
+}
+
+int qemu_loadvm_state(QEMUFile *f)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    Error *local_err = NULL;
+    uint8_t section_type;
+    unsigned int v;
+    int ret;
+    int file_error_after_eof = -1;
+
+    if (qemu_savevm_state_blocked(&local_err)) {
+        error_report_err(local_err);
+        return -EINVAL;
+    }
+
+    v = qemu_get_be32(f);
+    if (v != QEMU_VM_FILE_MAGIC) {
+        error_report("Not a migration stream");
+        return -EINVAL;
+    }
+
+    v = qemu_get_be32(f);
+    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
+        error_report("SaveVM v2 format is obsolete and no longer works");
+        return -ENOTSUP;
+    }
+    if (v != QEMU_VM_FILE_VERSION) {
+        error_report("Unsupported migration stream version");
+        return -ENOTSUP;
+    }
+
+    if (!savevm_state.skip_configuration) {
+        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
+            error_report("Configuration section missing");
+            return -EINVAL;
+        }
+        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
+
+        if (ret) {
+            return ret;
+        }
+    }
+
+    while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
+        uint32_t instance_id, version_id, section_id;
+        SaveStateEntry *se;
+        LoadStateEntry *le;
+        char idstr[256];
+
+        trace_qemu_loadvm_state_section(section_type);
+        switch (section_type) {
+        case QEMU_VM_SECTION_START:
+        case QEMU_VM_SECTION_FULL:
+            /* Read section start */
+            section_id = qemu_get_be32(f);
+            if (!qemu_get_counted_string(f, idstr)) {
+                error_report("Unable to read ID string for section %u",
+                            section_id);
+                return -EINVAL;
+            }
+            instance_id = qemu_get_be32(f);
+            version_id = qemu_get_be32(f);
+
+            trace_qemu_loadvm_state_section_startfull(section_id, idstr,
+                                                      instance_id, version_id);
+            /* Find savevm section */
+            se = find_se(idstr, instance_id);
+            if (se == NULL) {
+                error_report("Unknown savevm section or instance '%s' %d",
+                             idstr, instance_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            /* Validate version */
+            if (version_id > se->version_id) {
+                error_report("savevm: unsupported version %d for '%s' v%d",
+                             version_id, idstr, se->version_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            /* Add entry */
+            le = g_malloc0(sizeof(*le));
+
+            le->se = se;
+            le->section_id = section_id;
+            le->version_id = version_id;
+            QLIST_INSERT_HEAD(&mis->loadvm_handlers, le, entry);
+
+            ret = vmstate_load(f, le->se, le->version_id);
+            if (ret < 0) {
+                error_report("error while loading state for instance 0x%x of"
+                             " device '%s'", instance_id, idstr);
+                goto out;
+            }
+            if (!check_section_footer(f, le)) {
+                ret = -EINVAL;
+                goto out;
+            }
+            break;
+        case QEMU_VM_SECTION_PART:
+        case QEMU_VM_SECTION_END:
+            section_id = qemu_get_be32(f);
+
+            trace_qemu_loadvm_state_section_partend(section_id);
+            QLIST_FOREACH(le, &mis->loadvm_handlers, entry) {
+                if (le->section_id == section_id) {
+                    break;
+                }
+            }
+            if (le == NULL) {
+                error_report("Unknown savevm section %d", section_id);
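+                /* no SECTION_START/FULL was seen for this section id, so
+                 * there is no LoadStateEntry to dispatch to */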
          ret = -EINVAL; +                goto out; +            } + +            ret = vmstate_load(f, le->se, le->version_id); +            if (ret < 0) { +                error_report("error while loading state section id %d(%s)", +                             section_id, le->se->idstr); +                goto out; +            } +            if (!check_section_footer(f, le)) { +                ret = -EINVAL; +                goto out; +            } +            break; +        default: +            error_report("Unknown savevm section type %d", section_type); +            ret = -EINVAL; +            goto out; +        } +    } + +    file_error_after_eof = qemu_file_get_error(f); + +    /* +     * Try to read in the VMDESC section as well, so that dumping tools that +     * intercept our migration stream have the chance to see it. +     */ + +    /* We've got to be careful; if we don't read the data and just shut the fd +     * then the sender can error if we close while it's still sending. +     * We also mustn't read data that isn't there; some transports (RDMA) +     * will stall waiting for that data when the source has already closed. +     */ +    if (should_send_vmdesc()) { +        uint8_t *buf; +        uint32_t size; +        section_type = qemu_get_byte(f); + +        if (section_type != QEMU_VM_VMDESCRIPTION) { +            error_report("Expected vmdescription section, but got %d", +                         section_type); +            /* +             * It doesn't seem worth failing at this point since +             * we apparently have an otherwise valid VM state +             */ +        } else { +            buf = g_malloc(0x1000); +            size = qemu_get_be32(f); + +            while (size > 0) { +                uint32_t read_chunk = MIN(size, 0x1000); +                qemu_get_buffer(f, buf, read_chunk); +                size -= read_chunk; +            } +            g_free(buf); +        } +    } + +    cpu_synchronize_all_post_init(); + +    ret = 0; + +out: +    if (ret == 0) { +        /* We may not have a VMDESC section, so ignore relative errors */ +        ret = file_error_after_eof; +    } + +    return ret; +} + +static BlockDriverState *find_vmstate_bs(void) +{ +    BlockDriverState *bs = NULL; +    while ((bs = bdrv_next(bs))) { +        if (bdrv_can_snapshot(bs)) { +            return bs; +        } +    } +    return NULL; +} + +/* + * Deletes snapshots of a given name in all opened images. 
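+ *
+ * Returns 0 on success, or -1 if deletion failed on any device.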
+ */ +static int del_existing_snapshots(Monitor *mon, const char *name) +{ +    BlockDriverState *bs; +    QEMUSnapshotInfo sn1, *snapshot = &sn1; +    Error *err = NULL; + +    bs = NULL; +    while ((bs = bdrv_next(bs))) { +        if (bdrv_can_snapshot(bs) && +            bdrv_snapshot_find(bs, snapshot, name) >= 0) { +            bdrv_snapshot_delete_by_id_or_name(bs, name, &err); +            if (err) { +                monitor_printf(mon, +                               "Error while deleting snapshot on device '%s':" +                               " %s\n", +                               bdrv_get_device_name(bs), +                               error_get_pretty(err)); +                error_free(err); +                return -1; +            } +        } +    } + +    return 0; +} + +void hmp_savevm(Monitor *mon, const QDict *qdict) +{ +    BlockDriverState *bs, *bs1; +    QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1; +    int ret; +    QEMUFile *f; +    int saved_vm_running; +    uint64_t vm_state_size; +    qemu_timeval tv; +    struct tm tm; +    const char *name = qdict_get_try_str(qdict, "name"); +    Error *local_err = NULL; + +    /* Verify if there is a device that doesn't support snapshots and is writable */ +    bs = NULL; +    while ((bs = bdrv_next(bs))) { + +        if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { +            continue; +        } + +        if (!bdrv_can_snapshot(bs)) { +            monitor_printf(mon, "Device '%s' is writable but does not support snapshots.\n", +                               bdrv_get_device_name(bs)); +            return; +        } +    } + +    bs = find_vmstate_bs(); +    if (!bs) { +        monitor_printf(mon, "No block device can accept snapshots\n"); +        return; +    } + +    saved_vm_running = runstate_is_running(); + +    ret = global_state_store(); +    if (ret) { +        monitor_printf(mon, "Error saving global state\n"); +        return; +    } +    vm_stop(RUN_STATE_SAVE_VM); + +    memset(sn, 0, sizeof(*sn)); + +    /* fill auxiliary fields */ +    qemu_gettimeofday(&tv); +    sn->date_sec = tv.tv_sec; +    sn->date_nsec = tv.tv_usec * 1000; +    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + +    if (name) { +        ret = bdrv_snapshot_find(bs, old_sn, name); +        if (ret >= 0) { +            pstrcpy(sn->name, sizeof(sn->name), old_sn->name); +            pstrcpy(sn->id_str, sizeof(sn->id_str), old_sn->id_str); +        } else { +            pstrcpy(sn->name, sizeof(sn->name), name); +        } +    } else { +        /* cast below needed for OpenBSD where tv_sec is still 'long' */ +        localtime_r((const time_t *)&tv.tv_sec, &tm); +        strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm); +    } + +    /* Delete old snapshots of the same name */ +    if (name && del_existing_snapshots(mon, name) < 0) { +        goto the_end; +    } + +    /* save the VM state */ +    f = qemu_fopen_bdrv(bs, 1); +    if (!f) { +        monitor_printf(mon, "Could not open VM state file\n"); +        goto the_end; +    } +    ret = qemu_savevm_state(f, &local_err); +    vm_state_size = qemu_ftell(f); +    qemu_fclose(f); +    if (ret < 0) { +        monitor_printf(mon, "%s\n", error_get_pretty(local_err)); +        error_free(local_err); +        goto the_end; +    } + +    /* create the snapshots */ + +    bs1 = NULL; +    while ((bs1 = bdrv_next(bs1))) { +        if (bdrv_can_snapshot(bs1)) { +            /* Write VM state size only to the image that contains the state */ +   
         sn->vm_state_size = (bs == bs1 ? vm_state_size : 0); +            ret = bdrv_snapshot_create(bs1, sn); +            if (ret < 0) { +                monitor_printf(mon, "Error while creating snapshot on '%s'\n", +                               bdrv_get_device_name(bs1)); +            } +        } +    } + + the_end: +    if (saved_vm_running) { +        vm_start(); +    } +} + +void qmp_xen_save_devices_state(const char *filename, Error **errp) +{ +    QEMUFile *f; +    int saved_vm_running; +    int ret; + +    saved_vm_running = runstate_is_running(); +    vm_stop(RUN_STATE_SAVE_VM); +    global_state_store_running(); + +    f = qemu_fopen(filename, "wb"); +    if (!f) { +        error_setg_file_open(errp, errno, filename); +        goto the_end; +    } +    ret = qemu_save_device_state(f); +    qemu_fclose(f); +    if (ret < 0) { +        error_setg(errp, QERR_IO_ERROR); +    } + + the_end: +    if (saved_vm_running) { +        vm_start(); +    } +} + +int load_vmstate(const char *name) +{ +    BlockDriverState *bs, *bs_vm_state; +    QEMUSnapshotInfo sn; +    QEMUFile *f; +    int ret; + +    bs_vm_state = find_vmstate_bs(); +    if (!bs_vm_state) { +        error_report("No block device supports snapshots"); +        return -ENOTSUP; +    } + +    /* Don't even try to load empty VM states */ +    ret = bdrv_snapshot_find(bs_vm_state, &sn, name); +    if (ret < 0) { +        return ret; +    } else if (sn.vm_state_size == 0) { +        error_report("This is a disk-only snapshot. Revert to it offline " +            "using qemu-img."); +        return -EINVAL; +    } + +    /* Verify if there is any device that doesn't support snapshots and is +    writable and check if the requested snapshot is available too. */ +    bs = NULL; +    while ((bs = bdrv_next(bs))) { + +        if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { +            continue; +        } + +        if (!bdrv_can_snapshot(bs)) { +            error_report("Device '%s' is writable but does not support snapshots.", +                               bdrv_get_device_name(bs)); +            return -ENOTSUP; +        } + +        ret = bdrv_snapshot_find(bs, &sn, name); +        if (ret < 0) { +            error_report("Device '%s' does not have the requested snapshot '%s'", +                           bdrv_get_device_name(bs), name); +            return ret; +        } +    } + +    /* Flush all IO requests so they don't interfere with the new state.  
*/ +    bdrv_drain_all(); + +    bs = NULL; +    while ((bs = bdrv_next(bs))) { +        if (bdrv_can_snapshot(bs)) { +            ret = bdrv_snapshot_goto(bs, name); +            if (ret < 0) { +                error_report("Error %d while activating snapshot '%s' on '%s'", +                             ret, name, bdrv_get_device_name(bs)); +                return ret; +            } +        } +    } + +    /* restore the VM state */ +    f = qemu_fopen_bdrv(bs_vm_state, 0); +    if (!f) { +        error_report("Could not open VM state file"); +        return -EINVAL; +    } + +    qemu_system_reset(VMRESET_SILENT); +    migration_incoming_state_new(f); +    ret = qemu_loadvm_state(f); + +    qemu_fclose(f); +    migration_incoming_state_destroy(); +    if (ret < 0) { +        error_report("Error %d while loading VM state", ret); +        return ret; +    } + +    return 0; +} + +void hmp_delvm(Monitor *mon, const QDict *qdict) +{ +    BlockDriverState *bs; +    Error *err; +    const char *name = qdict_get_str(qdict, "name"); + +    if (!find_vmstate_bs()) { +        monitor_printf(mon, "No block device supports snapshots\n"); +        return; +    } + +    bs = NULL; +    while ((bs = bdrv_next(bs))) { +        if (bdrv_can_snapshot(bs)) { +            err = NULL; +            bdrv_snapshot_delete_by_id_or_name(bs, name, &err); +            if (err) { +                monitor_printf(mon, +                               "Error while deleting snapshot on device '%s':" +                               " %s\n", +                               bdrv_get_device_name(bs), +                               error_get_pretty(err)); +                error_free(err); +            } +        } +    } +} + +void hmp_info_snapshots(Monitor *mon, const QDict *qdict) +{ +    BlockDriverState *bs, *bs1; +    QEMUSnapshotInfo *sn_tab, *sn, s, *sn_info = &s; +    int nb_sns, i, ret, available; +    int total; +    int *available_snapshots; + +    bs = find_vmstate_bs(); +    if (!bs) { +        monitor_printf(mon, "No available block device supports snapshots\n"); +        return; +    } + +    nb_sns = bdrv_snapshot_list(bs, &sn_tab); +    if (nb_sns < 0) { +        monitor_printf(mon, "bdrv_snapshot_list: error %d\n", nb_sns); +        return; +    } + +    if (nb_sns == 0) { +        monitor_printf(mon, "There is no snapshot available.\n"); +        return; +    } + +    available_snapshots = g_malloc0(sizeof(int) * nb_sns); +    total = 0; +    for (i = 0; i < nb_sns; i++) { +        sn = &sn_tab[i]; +        available = 1; +        bs1 = NULL; + +        while ((bs1 = bdrv_next(bs1))) { +            if (bdrv_can_snapshot(bs1) && bs1 != bs) { +                ret = bdrv_snapshot_find(bs1, sn_info, sn->id_str); +                if (ret < 0) { +                    available = 0; +                    break; +                } +            } +        } + +        if (available) { +            available_snapshots[total] = i; +            total++; +        } +    } + +    if (total > 0) { +        bdrv_snapshot_dump((fprintf_function)monitor_printf, mon, NULL); +        monitor_printf(mon, "\n"); +        for (i = 0; i < total; i++) { +            sn = &sn_tab[available_snapshots[i]]; +            bdrv_snapshot_dump((fprintf_function)monitor_printf, mon, sn); +            monitor_printf(mon, "\n"); +        } +    } else { +        monitor_printf(mon, "There is no suitable snapshot available\n"); +    } + +    g_free(sn_tab); +    g_free(available_snapshots); + +} + +void vmstate_register_ram(MemoryRegion *mr, 
DeviceState *dev) +{ +    qemu_ram_set_idstr(memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK, +                       memory_region_name(mr), dev); +} + +void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev) +{ +    qemu_ram_unset_idstr(memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK); +} + +void vmstate_register_ram_global(MemoryRegion *mr) +{ +    vmstate_register_ram(mr, NULL); +} diff --git a/migration/tcp.c b/migration/tcp.c new file mode 100644 index 00000000..ae891728 --- /dev/null +++ b/migration/tcp.c @@ -0,0 +1,103 @@ +/* + * QEMU live migration + * + * Copyright IBM, Corp. 2008 + * + * Authors: + *  Anthony Liguori   <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2.  See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include <string.h> + +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "block/block.h" +#include "qemu/main-loop.h" + +//#define DEBUG_MIGRATION_TCP + +#ifdef DEBUG_MIGRATION_TCP +#define DPRINTF(fmt, ...) \ +    do { printf("migration-tcp: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ +    do { } while (0) +#endif + +static void tcp_wait_for_connect(int fd, Error *err, void *opaque) +{ +    MigrationState *s = opaque; + +    if (fd < 0) { +        DPRINTF("migrate connect error: %s\n", error_get_pretty(err)); +        s->file = NULL; +        migrate_fd_error(s); +    } else { +        DPRINTF("migrate connect success\n"); +        s->file = qemu_fopen_socket(fd, "wb"); +        migrate_fd_connect(s); +    } +} + +void tcp_start_outgoing_migration(MigrationState *s, const char *host_port, Error **errp) +{ +    inet_nonblocking_connect(host_port, tcp_wait_for_connect, s, errp); +} + +static void tcp_accept_incoming_migration(void *opaque) +{ +    struct sockaddr_in addr; +    socklen_t addrlen = sizeof(addr); +    int s = (intptr_t)opaque; +    QEMUFile *f; +    int c, err; + +    do { +        c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen); +        err = socket_error(); +    } while (c < 0 && err == EINTR); +    qemu_set_fd_handler(s, NULL, NULL, NULL); +    closesocket(s); + +    DPRINTF("accepted migration\n"); + +    if (c < 0) { +        error_report("could not accept migration connection (%s)", +                     strerror(err)); +        return; +    } + +    f = qemu_fopen_socket(c, "rb"); +    if (f == NULL) { +        error_report("could not qemu_fopen socket"); +        goto out; +    } + +    process_incoming_migration(f); +    return; + +out: +    closesocket(c); +} + +void tcp_start_incoming_migration(const char *host_port, Error **errp) +{ +    int s; + +    s = inet_listen(host_port, NULL, 256, SOCK_STREAM, 0, errp); +    if (s < 0) { +        return; +    } + +    qemu_set_fd_handler(s, tcp_accept_incoming_migration, NULL, +                        (void *)(intptr_t)s); +} diff --git a/migration/unix.c b/migration/unix.c new file mode 100644 index 00000000..b591813e --- /dev/null +++ b/migration/unix.c @@ -0,0 +1,103 @@ +/* + * QEMU live migration via Unix Domain Sockets + * + * Copyright Red Hat, Inc. 2009 + * + * Authors: + *  Chris Lalancette <clalance@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2.  See + * the COPYING file in the top-level directory. 
+ * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include <string.h> + +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "qemu/main-loop.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "block/block.h" + +//#define DEBUG_MIGRATION_UNIX + +#ifdef DEBUG_MIGRATION_UNIX +#define DPRINTF(fmt, ...) \ +    do { printf("migration-unix: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ +    do { } while (0) +#endif + +static void unix_wait_for_connect(int fd, Error *err, void *opaque) +{ +    MigrationState *s = opaque; + +    if (fd < 0) { +        DPRINTF("migrate connect error: %s\n", error_get_pretty(err)); +        s->file = NULL; +        migrate_fd_error(s); +    } else { +        DPRINTF("migrate connect success\n"); +        s->file = qemu_fopen_socket(fd, "wb"); +        migrate_fd_connect(s); +    } +} + +void unix_start_outgoing_migration(MigrationState *s, const char *path, Error **errp) +{ +    unix_nonblocking_connect(path, unix_wait_for_connect, s, errp); +} + +static void unix_accept_incoming_migration(void *opaque) +{ +    struct sockaddr_un addr; +    socklen_t addrlen = sizeof(addr); +    int s = (intptr_t)opaque; +    QEMUFile *f; +    int c, err; + +    do { +        c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen); +        err = errno; +    } while (c < 0 && err == EINTR); +    qemu_set_fd_handler(s, NULL, NULL, NULL); +    close(s); + +    DPRINTF("accepted migration\n"); + +    if (c < 0) { +        error_report("could not accept migration connection (%s)", +                     strerror(err)); +        return; +    } + +    f = qemu_fopen_socket(c, "rb"); +    if (f == NULL) { +        error_report("could not qemu_fopen socket"); +        goto out; +    } + +    process_incoming_migration(f); +    return; + +out: +    close(c); +} + +void unix_start_incoming_migration(const char *path, Error **errp) +{ +    int s; + +    s = unix_listen(path, NULL, 0, errp); +    if (s < 0) { +        return; +    } + +    qemu_set_fd_handler(s, unix_accept_incoming_migration, NULL, +                        (void *)(intptr_t)s); +} diff --git a/migration/vmstate.c b/migration/vmstate.c new file mode 100644 index 00000000..e8ccf22f --- /dev/null +++ b/migration/vmstate.c @@ -0,0 +1,890 @@ +#include "qemu-common.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "migration/vmstate.h" +#include "qemu/bitops.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "qjson.h" + +static void vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, +                                    void *opaque, QJSON *vmdesc); +static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, +                                   void *opaque); + +static int vmstate_n_elems(void *opaque, VMStateField *field) +{ +    int n_elems = 1; + +    if (field->flags & VMS_ARRAY) { +        n_elems = field->num; +    } else if (field->flags & VMS_VARRAY_INT32) { +        n_elems = *(int32_t *)(opaque+field->num_offset); +    } else if (field->flags & VMS_VARRAY_UINT32) { +        n_elems = *(uint32_t *)(opaque+field->num_offset); +    } else if (field->flags & VMS_VARRAY_UINT16) { +        n_elems = *(uint16_t *)(opaque+field->num_offset); +    } else if (field->flags & VMS_VARRAY_UINT8) { +        n_elems = *(uint8_t *)(opaque+field->num_offset); +    } + +    
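+    /* For VMS_VARRAY_* fields the element count is not fixed at build
+     * time; it was just read from a sibling struct member found at
+     * field->num_offset. */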
return n_elems; +} + +static int vmstate_size(void *opaque, VMStateField *field) +{ +    int size = field->size; + +    if (field->flags & VMS_VBUFFER) { +        size = *(int32_t *)(opaque+field->size_offset); +        if (field->flags & VMS_MULTIPLY) { +            size *= field->size; +        } +    } + +    return size; +} + +static void *vmstate_base_addr(void *opaque, VMStateField *field, bool alloc) +{ +    void *base_addr = opaque + field->offset; + +    if (field->flags & VMS_POINTER) { +        if (alloc && (field->flags & VMS_ALLOC)) { +            gsize size = 0; +            if (field->flags & VMS_VBUFFER) { +                size = vmstate_size(opaque, field); +            } else { +                int n_elems = vmstate_n_elems(opaque, field); +                if (n_elems) { +                    size = n_elems * field->size; +                } +            } +            if (size) { +                *((void **)base_addr + field->start) = g_malloc(size); +            } +        } +        base_addr = *(void **)base_addr + field->start; +    } + +    return base_addr; +} + +int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, +                       void *opaque, int version_id) +{ +    VMStateField *field = vmsd->fields; +    int ret = 0; + +    trace_vmstate_load_state(vmsd->name, version_id); +    if (version_id > vmsd->version_id) { +        trace_vmstate_load_state_end(vmsd->name, "too new", -EINVAL); +        return -EINVAL; +    } +    if  (version_id < vmsd->minimum_version_id) { +        if (vmsd->load_state_old && +            version_id >= vmsd->minimum_version_id_old) { +            ret = vmsd->load_state_old(f, opaque, version_id); +            trace_vmstate_load_state_end(vmsd->name, "old path", ret); +            return ret; +        } +        trace_vmstate_load_state_end(vmsd->name, "too old", -EINVAL); +        return -EINVAL; +    } +    if (vmsd->pre_load) { +        int ret = vmsd->pre_load(opaque); +        if (ret) { +            return ret; +        } +    } +    while (field->name) { +        trace_vmstate_load_state_field(vmsd->name, field->name); +        if ((field->field_exists && +             field->field_exists(opaque, version_id)) || +            (!field->field_exists && +             field->version_id <= version_id)) { +            void *base_addr = vmstate_base_addr(opaque, field, true); +            int i, n_elems = vmstate_n_elems(opaque, field); +            int size = vmstate_size(opaque, field); + +            for (i = 0; i < n_elems; i++) { +                void *addr = base_addr + size * i; + +                if (field->flags & VMS_ARRAY_OF_POINTER) { +                    addr = *(void **)addr; +                } +                if (field->flags & VMS_STRUCT) { +                    ret = vmstate_load_state(f, field->vmsd, addr, +                                             field->vmsd->version_id); +                } else { +                    ret = field->info->get(f, addr, size); + +                } +                if (ret >= 0) { +                    ret = qemu_file_get_error(f); +                } +                if (ret < 0) { +                    qemu_file_set_error(f, ret); +                    trace_vmstate_load_field_error(field->name, ret); +                    return ret; +                } +            } +        } else if (field->flags & VMS_MUST_EXIST) { +            error_report("Input validation failed: %s/%s", +                         vmsd->name, field->name); +            return -1; +        } +    
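+        /* the field array is terminated by an entry with a NULL name */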
+        field++;
+    }
+    ret = vmstate_subsection_load(f, vmsd, opaque);
+    if (ret != 0) {
+        return ret;
+    }
+    if (vmsd->post_load) {
+        ret = vmsd->post_load(opaque, version_id);
+    }
+    trace_vmstate_load_state_end(vmsd->name, "end", ret);
+    return ret;
+}
+
+static int vmfield_name_num(VMStateField *start, VMStateField *search)
+{
+    VMStateField *field;
+    int found = 0;
+
+    for (field = start; field->name; field++) {
+        if (!strcmp(field->name, search->name)) {
+            if (field == search) {
+                return found;
+            }
+            found++;
+        }
+    }
+
+    return -1;
+}
+
+static bool vmfield_name_is_unique(VMStateField *start, VMStateField *search)
+{
+    VMStateField *field;
+    int found = 0;
+
+    for (field = start; field->name; field++) {
+        if (!strcmp(field->name, search->name)) {
+            found++;
+            /* name found more than once, so it's not unique */
+            if (found > 1) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+static const char *vmfield_get_type_name(VMStateField *field)
+{
+    const char *type = "unknown";
+
+    if (field->flags & VMS_STRUCT) {
+        type = "struct";
+    } else if (field->info->name) {
+        type = field->info->name;
+    }
+
+    return type;
+}
+
+static bool vmsd_can_compress(VMStateField *field)
+{
+    if (field->field_exists) {
+        /* Dynamically existing fields mess up compression */
+        return false;
+    }
+
+    if (field->flags & VMS_STRUCT) {
+        VMStateField *sfield = field->vmsd->fields;
+        while (sfield->name) {
+            if (!vmsd_can_compress(sfield)) {
+                /* Child elements can't compress, so neither can we */
+                return false;
+            }
+            sfield++;
+        }
+
+        if (field->vmsd->subsections) {
+            /* Subsections may come and go, so better not to compress */
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static void vmsd_desc_field_start(const VMStateDescription *vmsd, QJSON *vmdesc,
+                                  VMStateField *field, int i, int max)
+{
+    char *name, *old_name;
+    bool is_array = max > 1;
+    bool can_compress = vmsd_can_compress(field);
+
+    if (!vmdesc) {
+        return;
+    }
+
+    name = g_strdup(field->name);
+
+    /* Field name is not unique, need to make it unique */
+    if (!vmfield_name_is_unique(vmsd->fields, field)) {
+        int num = vmfield_name_num(vmsd->fields, field);
+        old_name = name;
+        name = g_strdup_printf("%s[%d]", name, num);
+        g_free(old_name);
+    }
+
+    json_start_object(vmdesc, NULL);
+    json_prop_str(vmdesc, "name", name);
+    if (is_array) {
+        if (can_compress) {
+            json_prop_int(vmdesc, "array_len", max);
+        } else {
+            json_prop_int(vmdesc, "index", i);
+        }
+    }
+    json_prop_str(vmdesc, "type", vmfield_get_type_name(field));
+
+    if (field->flags & VMS_STRUCT) {
+        json_start_object(vmdesc, "struct");
+    }
+
+    g_free(name);
+}
+
+static void vmsd_desc_field_end(const VMStateDescription *vmsd, QJSON *vmdesc,
+                                VMStateField *field, size_t size, int i)
+{
+    if (!vmdesc) {
+        return;
+    }
+
+    if (field->flags & VMS_STRUCT) {
+        /* We printed a struct in between, close its child object */
+        json_end_object(vmdesc);
+    }
+
+    json_prop_int(vmdesc, "size", size);
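+    /* close the per-field JSON object opened in vmsd_desc_field_start() */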
+    json_end_object(vmdesc);
+}
+
+
+bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque)
+{
+    if (vmsd->needed && !vmsd->needed(opaque)) {
+        /* optional section not needed */
+        return false;
+    }
+    return true;
+}
+
+
+void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
+                        void *opaque, QJSON *vmdesc)
+{
+    VMStateField *field = vmsd->fields;
+
+    if (vmsd->pre_save) {
+        vmsd->pre_save(opaque);
+    }
+
+    if (vmdesc) {
+        json_prop_str(vmdesc, "vmsd_name", vmsd->name);
+        json_prop_int(vmdesc, "version", vmsd->version_id);
+        json_start_array(vmdesc, "fields");
+    }
+
+    while (field->name) {
+        if (!field->field_exists ||
+            field->field_exists(opaque, vmsd->version_id)) {
+            void *base_addr = vmstate_base_addr(opaque, field, false);
+            int i, n_elems = vmstate_n_elems(opaque, field);
+            int size = vmstate_size(opaque, field);
+            int64_t old_offset, written_bytes;
+            QJSON *vmdesc_loop = vmdesc;
+
+            for (i = 0; i < n_elems; i++) {
+                void *addr = base_addr + size * i;
+
+                vmsd_desc_field_start(vmsd, vmdesc_loop, field, i, n_elems);
+                old_offset = qemu_ftell_fast(f);
+
+                if (field->flags & VMS_ARRAY_OF_POINTER) {
+                    addr = *(void **)addr;
+                }
+                if (field->flags & VMS_STRUCT) {
+                    vmstate_save_state(f, field->vmsd, addr, vmdesc_loop);
+                } else {
+                    field->info->put(f, addr, size);
+                }
+
+                written_bytes = qemu_ftell_fast(f) - old_offset;
+                vmsd_desc_field_end(vmsd, vmdesc_loop, field, written_bytes, i);
+
+                /* Compressed arrays only care about the first element */
+                if (vmdesc_loop && vmsd_can_compress(field)) {
+                    vmdesc_loop = NULL;
+                }
+            }
+        } else {
+            if (field->flags & VMS_MUST_EXIST) {
+                error_report("Output state validation failed: %s/%s",
+                        vmsd->name, field->name);
+                assert(!(field->flags & VMS_MUST_EXIST));
+            }
+        }
+        field++;
+    }
+
+    if (vmdesc) {
+        json_end_array(vmdesc);
+    }
+
+    vmstate_subsection_save(f, vmsd, opaque, vmdesc);
+}
+
+static const VMStateDescription *
+vmstate_get_subsection(const VMStateDescription **sub, char *idstr)
+{
+    while (sub && *sub && (*sub)->needed) {
+        if (strcmp(idstr, (*sub)->name) == 0) {
+            return *sub;
+        }
+        sub++;
+    }
+    return NULL;
+}
+
+static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
+                                   void *opaque)
+{
+    trace_vmstate_subsection_load(vmsd->name);
+
+    while (qemu_peek_byte(f, 0) == QEMU_VM_SUBSECTION) {
+        char idstr[256], *idstr_ret;
+        int ret;
+        uint8_t version_id, len, size;
+        const VMStateDescription *sub_vmsd;
+
+        len = qemu_peek_byte(f, 1);
+        if (len < strlen(vmsd->name) + 1) {
+            /* the subsection name has to be "section_name/a" */
+            trace_vmstate_subsection_load_bad(vmsd->name, "(short)");
+            return 0;
+        }
+        size = qemu_peek_buffer(f, (uint8_t **)&idstr_ret, len, 2);
+        if (size != len) {
+            trace_vmstate_subsection_load_bad(vmsd->name, "(peek fail)");
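+            /* a short peek means a complete subsection header is not
+             * buffered yet; treat it as "no more subsections" rather than
+             * as an error */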
+            return 0;
+        }
+        memcpy(idstr, idstr_ret, size);
+        idstr[size] = 0;
+
+        if (strncmp(vmsd->name, idstr, strlen(vmsd->name)) != 0) {
+            trace_vmstate_subsection_load_bad(vmsd->name, idstr);
+            /* it doesn't have a valid subsection name */
+            return 0;
+        }
+        sub_vmsd = vmstate_get_subsection(vmsd->subsections, idstr);
+        if (sub_vmsd == NULL) {
+            trace_vmstate_subsection_load_bad(vmsd->name, "(lookup)");
+            return -ENOENT;
+        }
+        qemu_file_skip(f, 1); /* subsection */
+        qemu_file_skip(f, 1); /* len */
+        qemu_file_skip(f, len); /* idstr */
+        version_id = qemu_get_be32(f);
+
+        ret = vmstate_load_state(f, sub_vmsd, opaque, version_id);
+        if (ret) {
+            trace_vmstate_subsection_load_bad(vmsd->name, "(child)");
+            return ret;
+        }
+    }
+
+    trace_vmstate_subsection_load_good(vmsd->name);
+    return 0;
+}
+
+static void vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
+                                    void *opaque, QJSON *vmdesc)
+{
+    const VMStateDescription **sub = vmsd->subsections;
+    bool subsection_found = false;
+
+    while (sub && *sub && (*sub)->needed) {
+        if ((*sub)->needed(opaque)) {
+            const VMStateDescription *vmsd = *sub;
+            uint8_t len;
+
+            if (vmdesc) {
+                /* Only create subsection array when we have any */
+                if (!subsection_found) {
+                    json_start_array(vmdesc, "subsections");
+                    subsection_found = true;
+                }
+
+                json_start_object(vmdesc, NULL);
+            }
+
+            qemu_put_byte(f, QEMU_VM_SUBSECTION);
+            len = strlen(vmsd->name);
+            qemu_put_byte(f, len);
+            qemu_put_buffer(f, (uint8_t *)vmsd->name, len);
+            qemu_put_be32(f, vmsd->version_id);
+            vmstate_save_state(f, vmsd, opaque, vmdesc);
+
+            if (vmdesc) {
+                json_end_object(vmdesc);
+            }
+        }
+        sub++;
+    }
+
+    if (vmdesc && subsection_found) {
+        json_end_array(vmdesc);
+    }
+}
+
+/* bool */
+
+static int get_bool(QEMUFile *f, void *pv, size_t size)
+{
+    bool *v = pv;
+    *v = qemu_get_byte(f);
+    return 0;
+}
+
+static void put_bool(QEMUFile *f, void *pv, size_t size)
+{
+    bool *v = pv;
+    qemu_put_byte(f, *v);
+}
+
+const VMStateInfo vmstate_info_bool = {
+    .name = "bool",
+    .get  = get_bool,
+    .put  = put_bool,
+};
+
+/* 8 bit int */
+
+static int get_int8(QEMUFile *f, void *pv, size_t size)
+{
+    int8_t *v = pv;
+    qemu_get_s8s(f, v);
+    return 0;
+}
+
+static void put_int8(QEMUFile *f, void *pv, size_t size)
+{
+    int8_t *v = pv;
+    qemu_put_s8s(f, v);
+}
+
+const VMStateInfo vmstate_info_int8 = {
+    .name = "int8",
+    .get  = get_int8,
+    .put  = put_int8,
+};
+
+/* 16 bit int */
+
+static int get_int16(QEMUFile *f, void *pv, size_t size)
+{
+    int16_t *v = pv;
+    qemu_get_sbe16s(f, v);
+    return 0;
+}
+
+static void put_int16(QEMUFile *f, void *pv, size_t size)
+{
+    int16_t *v = pv;
+    qemu_put_sbe16s(f, v);
+}
+
+const VMStateInfo vmstate_info_int16 = {
+    .name = "int16",
+    .get  = get_int16,
+    .put  = put_int16,
+};
+
+/* 32 bit int */
+
+static int get_int32(QEMUFile *f, void *pv, size_t size)
+{
+    int32_t *v = pv;
+    qemu_get_sbe32s(f, v);
+    return 0;
+}
+
+static void put_int32(QEMUFile *f, void *pv, size_t size)
+{
+    int32_t *v = pv;
+    qemu_put_sbe32s(f, v);
+}
+
+const VMStateInfo vmstate_info_int32 = {
+    .name = "int32",
+    .get  = get_int32,
+    .put  = put_int32,
+};
+
+/* 32 bit int. Check that the received value is the same as the one
+   in the field */
+
+static int get_int32_equal(QEMUFile *f, void *pv, size_t size)
+{
+    int32_t *v = pv;
+    int32_t v2;
+    qemu_get_sbe32s(f, &v2);
+
+    if (*v == v2) {
+        return 0;
+    }
+    return -EINVAL;
+}
+
+const VMStateInfo vmstate_info_int32_equal = {
+    .name = "int32 equal",
+    .get  = get_int32_equal,
+    .put  = put_int32,
+};
+
+/* 32 bit int. Check that the received value is non-negative
+ * and less than or equal to the one in the field.
+ */
+
+static int get_int32_le(QEMUFile *f, void *pv, size_t size)
+{
+    int32_t *cur = pv;
+    int32_t loaded;
+    qemu_get_sbe32s(f, &loaded);
+
+    if (loaded >= 0 && loaded <= *cur) {
+        *cur = loaded;
+        return 0;
+    }
+    return -EINVAL;
+}
+
+const VMStateInfo vmstate_info_int32_le = {
+    .name = "int32 le",
+    .get  = get_int32_le,
+    .put  = put_int32,
+};
+
+/* 64 bit int */
+
+static int get_int64(QEMUFile *f, void *pv, size_t size)
+{
+    int64_t *v = pv;
+    qemu_get_sbe64s(f, v);
+    return 0;
+}
+
+static void put_int64(QEMUFile *f, void *pv, size_t size)
+{
+    int64_t *v = pv;
+    qemu_put_sbe64s(f, v);
+}
+
+const VMStateInfo vmstate_info_int64 = {
+    .name = "int64",
+    .get  = get_int64,
+    .put  = put_int64,
+};
+
+/* 8 bit unsigned int */
+
+static int get_uint8(QEMUFile *f, void *pv, size_t size)
+{
+    uint8_t *v = pv;
+    qemu_get_8s(f, v);
+    return 0;
+}
+
+static void put_uint8(QEMUFile *f, void *pv, size_t size)
+{
+    uint8_t *v = pv;
+    qemu_put_8s(f, v);
+}
+
+const VMStateInfo vmstate_info_uint8 = {
+    .name = "uint8",
+    .get  = get_uint8,
+    .put  = put_uint8,
+};
+
+/* 16 bit unsigned int */
+
+static int get_uint16(QEMUFile *f, void *pv, size_t size)
+{
+    uint16_t *v = pv;
+    qemu_get_be16s(f, v);
+    return 0;
+}
+
+static void put_uint16(QEMUFile *f, void *pv, size_t size)
+{
+    uint16_t *v = pv;
+    qemu_put_be16s(f, v);
+}
+
+const VMStateInfo vmstate_info_uint16 = {
+    .name = "uint16",
+    .get  = get_uint16,
+    .put  = put_uint16,
+};
+
+/* 32 bit unsigned int */
+
+static int get_uint32(QEMUFile *f, void *pv, size_t size)
+{
+    uint32_t *v = pv;
+    qemu_get_be32s(f, v);
+    return 0;
+}
+
+static void put_uint32(QEMUFile *f, void *pv, size_t size)
+{
+    uint32_t *v = pv;
+    qemu_put_be32s(f, v);
+}
+
+const VMStateInfo vmstate_info_uint32 = {
+    .name = "uint32",
+    .get  = get_uint32,
+    .put  = put_uint32,
+};
+
+/* 32 bit uint. Check that the received value is the same as the one
+   in the field */
+
+static int get_uint32_equal(QEMUFile *f, void *pv, size_t size)
+{
+    uint32_t *v = pv;
+    uint32_t v2;
+    qemu_get_be32s(f, &v2);
+
+    if (*v == v2) {
+        return 0;
+    }
+    return -EINVAL;
+}
+
+const VMStateInfo vmstate_info_uint32_equal = {
+    .name = "uint32 equal",
+    .get  = get_uint32_equal,
+    .put  = put_uint32,
+};
+
+/* 64 bit unsigned int */
+
+static int get_uint64(QEMUFile *f, void *pv, size_t size)
+{
+    uint64_t *v = pv;
+    qemu_get_be64s(f, v);
+    return 0;
+}
+
+static void put_uint64(QEMUFile *f, void *pv, size_t size)
+{
+    uint64_t *v = pv;
+    qemu_put_be64s(f, v);
+}
+
+const VMStateInfo vmstate_info_uint64 = {
+    .name = "uint64",
+    .get  = get_uint64,
+    .put  = put_uint64,
+};
+
+/* 64 bit unsigned int. Check that the received value is the same as the one
+   in the field */
+
+static int get_uint64_equal(QEMUFile *f, void *pv, size_t size)
+{
+    uint64_t *v = pv;
+    uint64_t v2;
+    qemu_get_be64s(f, &v2);
+
+    if (*v == v2) {
+        return 0;
+    }
+    return -EINVAL;
+}
+
+const VMStateInfo vmstate_info_uint64_equal = {
+    .name = "int64 equal",
+    .get  = get_uint64_equal,
+    .put  = put_uint64,
+};
+
+/* 8 bit unsigned int. Check that the received value is the same as the one
+   in the field */
+
+static int get_uint8_equal(QEMUFile *f, void *pv, size_t size)
+{
+    uint8_t *v = pv;
+    uint8_t v2;
+    qemu_get_8s(f, &v2);
+
+    if (*v == v2) {
+        return 0;
+    }
+    return -EINVAL;
+}
+
+const VMStateInfo vmstate_info_uint8_equal = {
+    .name = "uint8 equal",
+    .get  = get_uint8_equal,
+    .put  = put_uint8,
+};
+
+/* 16 bit unsigned int. Check that the received value is the same as the one
+   in the field */
+
+static int get_uint16_equal(QEMUFile *f, void *pv, size_t size)
+{
+    uint16_t *v = pv;
+    uint16_t v2;
+    qemu_get_be16s(f, &v2);
+
+    if (*v == v2) {
+        return 0;
+    }
+    return -EINVAL;
+}
+
+const VMStateInfo vmstate_info_uint16_equal = {
+    .name = "uint16 equal",
+    .get  = get_uint16_equal,
+    .put  = put_uint16,
+};
+
+/* floating point */
+
+static int get_float64(QEMUFile *f, void *pv, size_t size)
+{
+    float64 *v = pv;
+
+    *v = make_float64(qemu_get_be64(f));
+    return 0;
+}
+
+static void put_float64(QEMUFile *f, void *pv, size_t size)
+{
+    uint64_t *v = pv;
+
+    qemu_put_be64(f, float64_val(*v));
+}
+
+const VMStateInfo vmstate_info_float64 = {
+    .name = "float64",
+    .get  = get_float64,
+    .put  = put_float64,
+};
+
+/* uint8_t buffers */
+
+static int get_buffer(QEMUFile *f, void *pv, size_t size)
+{
+    uint8_t *v = pv;
+    qemu_get_buffer(f, v, size);
+    return 0;
+}
+
+static void put_buffer(QEMUFile *f, void *pv, size_t size)
+{
+    uint8_t *v = pv;
+    qemu_put_buffer(f, v, size);
+}
+
+const VMStateInfo vmstate_info_buffer = {
+    .name = "buffer",
+    .get  = get_buffer,
+    .put  = put_buffer,
+};
+
+/* unused buffers: space that was used for some fields that are
+   not useful anymore */
+
+static int get_unused_buffer(QEMUFile *f, void *pv, size_t size)
+{
+    uint8_t buf[1024];
+    int block_len;
+
+    while (size > 0) {
+        block_len = MIN(sizeof(buf), size);
+        size -= block_len;
+        qemu_get_buffer(f, buf, block_len);
+    }
+    return 0;
+}
+
+static void put_unused_buffer(QEMUFile *f, void *pv, size_t size)
+{
+    static const uint8_t buf[1024];
+    int block_len;
+
+    while (size > 0) {
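+        /* buf is static and therefore zero-initialized: the unused space
+         * is written out as zeros, up to 1 KiB per iteration */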
block_len = MIN(sizeof(buf), size); +        size -= block_len; +        qemu_put_buffer(f, buf, block_len); +    } +} + +const VMStateInfo vmstate_info_unused_buffer = { +    .name = "unused_buffer", +    .get  = get_unused_buffer, +    .put  = put_unused_buffer, +}; + +/* bitmaps (as defined by bitmap.h). Note that size here is the size + * of the bitmap in bits. The on-the-wire format of a bitmap is 64 + * bit words with the bits in big endian order. The in-memory format + * is an array of 'unsigned long', which may be either 32 or 64 bits. + */ +/* This is the number of 64 bit words sent over the wire */ +#define BITS_TO_U64S(nr) DIV_ROUND_UP(nr, 64) +static int get_bitmap(QEMUFile *f, void *pv, size_t size) +{ +    unsigned long *bmp = pv; +    int i, idx = 0; +    for (i = 0; i < BITS_TO_U64S(size); i++) { +        uint64_t w = qemu_get_be64(f); +        bmp[idx++] = w; +        if (sizeof(unsigned long) == 4 && idx < BITS_TO_LONGS(size)) { +            bmp[idx++] = w >> 32; +        } +    } +    return 0; +} + +static void put_bitmap(QEMUFile *f, void *pv, size_t size) +{ +    unsigned long *bmp = pv; +    int i, idx = 0; +    for (i = 0; i < BITS_TO_U64S(size); i++) { +        uint64_t w = bmp[idx++]; +        if (sizeof(unsigned long) == 4 && idx < BITS_TO_LONGS(size)) { +            w |= ((uint64_t)bmp[idx++]) << 32; +        } +        qemu_put_be64(f, w); +    } +} + +const VMStateInfo vmstate_info_bitmap = { +    .name = "bitmap", +    .get = get_bitmap, +    .put = put_bitmap, +}; diff --git a/migration/xbzrle.c b/migration/xbzrle.c new file mode 100644 index 00000000..8e220bf2 --- /dev/null +++ b/migration/xbzrle.c @@ -0,0 +1,175 @@ +/* + * Xor Based Zero Run Length Encoding + * + * Copyright 2013 Red Hat, Inc. and/or its affiliates + * + * Authors: + *  Orit Wasserman  <owasserm@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#include "qemu-common.h" +#include "include/migration/migration.h" + +/* +  page = zrun nzrun +       | zrun nzrun page + +  zrun = length + +  nzrun = length byte... 
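+
+  A worked (hypothetical) example with 8-byte, long-aligned buffers:
+  old = "AAACAAAA", new = "AAABAAAA" encodes as zrun 3, nzrun 1 'B';
+  the trailing zero run is skipped, so the output is the three bytes
+  0x03, 0x01, 'B'.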
+ +  length = uleb128 encoded integer + */ +int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, +                         uint8_t *dst, int dlen) +{ +    uint32_t zrun_len = 0, nzrun_len = 0; +    int d = 0, i = 0; +    long res; +    uint8_t *nzrun_start = NULL; + +    g_assert(!(((uintptr_t)old_buf | (uintptr_t)new_buf | slen) % +               sizeof(long))); + +    while (i < slen) { +        /* overflow */ +        if (d + 2 > dlen) { +            return -1; +        } + +        /* not aligned to sizeof(long) */ +        res = (slen - i) % sizeof(long); +        while (res && old_buf[i] == new_buf[i]) { +            zrun_len++; +            i++; +            res--; +        } + +        /* word at a time for speed */ +        if (!res) { +            while (i < slen && +                   (*(long *)(old_buf + i)) == (*(long *)(new_buf + i))) { +                i += sizeof(long); +                zrun_len += sizeof(long); +            } + +            /* go over the rest */ +            while (i < slen && old_buf[i] == new_buf[i]) { +                zrun_len++; +                i++; +            } +        } + +        /* buffer unchanged */ +        if (zrun_len == slen) { +            return 0; +        } + +        /* skip last zero run */ +        if (i == slen) { +            return d; +        } + +        d += uleb128_encode_small(dst + d, zrun_len); + +        zrun_len = 0; +        nzrun_start = new_buf + i; + +        /* overflow */ +        if (d + 2 > dlen) { +            return -1; +        } +        /* not aligned to sizeof(long) */ +        res = (slen - i) % sizeof(long); +        while (res && old_buf[i] != new_buf[i]) { +            i++; +            nzrun_len++; +            res--; +        } + +        /* word at a time for speed, use of 32-bit long okay */ +        if (!res) { +            /* truncation to 32-bit long okay */ +            unsigned long mask = (unsigned long)0x0101010101010101ULL; +            while (i < slen) { +                unsigned long xor; +                xor = *(unsigned long *)(old_buf + i) +                    ^ *(unsigned long *)(new_buf + i); +                if ((xor - mask) & ~xor & (mask << 7)) { +                    /* found the end of an nzrun within the current long */ +                    while (old_buf[i] != new_buf[i]) { +                        nzrun_len++; +                        i++; +                    } +                    break; +                } else { +                    i += sizeof(long); +                    nzrun_len += sizeof(long); +                } +            } +        } + +        d += uleb128_encode_small(dst + d, nzrun_len); +        /* overflow */ +        if (d + nzrun_len > dlen) { +            return -1; +        } +        memcpy(dst + d, nzrun_start, nzrun_len); +        d += nzrun_len; +        nzrun_len = 0; +    } + +    return d; +} + +int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen) +{ +    int i = 0, d = 0; +    int ret; +    uint32_t count = 0; + +    while (i < slen) { + +        /* zrun */ +        if ((slen - i) < 2) { +            return -1; +        } + +        ret = uleb128_decode_small(src + i, &count); +        if (ret < 0 || (i && !count)) { +            return -1; +        } +        i += ret; +        d += count; + +        /* overflow */ +        if (d > dlen) { +            return -1; +        } + +        /* nzrun */ +        if ((slen - i) < 2) { +            return -1; +        } + +        ret = uleb128_decode_small(src + i, 
&count);
+        if (ret < 0 || !count) {
+            return -1;
+        }
+        i += ret;
+
+        /* overflow */
+        if (d + count > dlen || i + count > slen) {
+            return -1;
+        }
+
+        memcpy(dst + d, src + i, count);
+        d += count;
+        i += count;
+    }
+
+    return d;
+}
