aboutsummaryrefslogtreecommitdiffstats
path: root/tools/libxc/xg_save_restore.h
blob: f859621f625f0d25ae26da223c975f2910f5ed67 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
/*
 * Definitions and utilities for save / restore.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * version 2.1 of the License.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "xc_private.h"

#include <xen/foreign/x86_32.h>
#include <xen/foreign/x86_64.h>

/*
 * SAVE/RESTORE/MIGRATE PROTOCOL
 * =============================
 *
 * The general form of a stream of chunks is a header followed by a
 * body consisting of a variable number of chunks (terminated by a
 * chunk with type 0) followed by a trailer.
 *
 * For a rolling/checkpoint (e.g. Remus) migration, the body and
 * trailer phases can be repeated until an external event
 * (e.g. failure) causes the process to terminate and commit to the
 * most recent complete checkpoint.
 *
 * HEADER
 * ------
 *
 * unsigned long        : p2m_size
 *
 * extended-info (PV-only, optional):
 *
 *   If first unsigned long == ~0UL then extended info is present,
 *   otherwise unsigned long is part of p2m. Note that p2m_size above
 *   does not include the length of the extended info.
 *
 *   extended-info:
 *
 *     unsigned long    : signature == ~0UL
 *     uint32_t	        : number of bytes remaining in extended-info
 *
 *     1 or more extended-info blocks of form:
 *     char[4]          : block identifier
 *     uint32_t         : block data size
 *     bytes            : block data
 *
 *     defined extended-info blocks:
 *     "vcpu"		: VCPU context info containing vcpu_guest_context_t.
 *                        The precise variant of the context structure
 *                        (e.g. 32 vs 64 bit) is distinguished by
 *                        the block size.
 *     "extv"           : Presence indicates use of extended VCPU context in
 *                        tail, data size is 0.
 *
 * p2m (PV-only):
 *
 *   consists of p2m_size bytes comprising an array of xen_pfn_t sized entries.
 *
 * BODY PHASE - Format A (for live migration or Remus without compression)
 * ----------
 *
 * A series of chunks with a common header:
 *   int              : chunk type
 *
 * If the chunk type is +ve then chunk contains guest memory data, and the
 * type contains the number of pages in the batch:
 *
 *     unsigned long[]  : PFN array, length == number of pages in batch
 *                        Each entry consists of XEN_DOMCTL_PFINFO_*
 *                        in bits 31-28 and the PFN number in bits 27-0.
 *     page data        : PAGE_SIZE bytes for each page marked present in PFN
 *                        array
 *
 * If the chunk type is -ve then chunk consists of one of a number of
 * metadata types.  See definitions of XC_SAVE_ID_* below.
 *
 * If chunk type is 0 then body phase is complete.
 *
 *
 * BODY PHASE - Format B (for Remus with compression)
 * ----------
 *
 * A series of chunks with a common header:
 *   int              : chunk type
 *
 * If the chunk type is +ve then chunk contains array of PFNs corresponding
 * to guest memory and type contains the number of PFNs in the batch:
 *
 *     unsigned long[]  : PFN array, length == number of pages in batch
 *                        Each entry consists of XEN_DOMCTL_PFINFO_*
 *                        in bits 31-28 and the PFN number in bits 27-0.
 *
 * If the chunk type is -ve then chunk consists of one of a number of
 * metadata types.  See definitions of XC_SAVE_ID_* below.
 *
 * If the chunk type is -ve and equals XC_SAVE_ID_COMPRESSED_DATA, then the
 * chunk consists of compressed page data, in the following format:
 *
 *     unsigned long        : Size of the compressed chunk to follow
 *     compressed data :      variable length data of size indicated above.
 *                            This chunk consists of compressed page data.
 *                            The number of pages in one chunk depends on
 *                            the amount of space available in the sender's
 *                            output buffer.
 *
 * Format of compressed data:
 *   compressed_data = <deltas>*
 *   delta           = <marker, run*>
 *   marker          = (RUNFLAG|SKIPFLAG) bitwise-or RUNLEN [1 byte marker]
 *   RUNFLAG         = 0
 *   SKIPFLAG        = 1 << 7
 *   RUNLEN          = 7-bit unsigned value indicating number of WORDS in the run
 *   run             = string of bytes of length sizeof(WORD) * RUNLEN
 *
 *    If marker contains RUNFLAG, then RUNLEN * sizeof(WORD) bytes of data following
 *   the marker is copied into the target page at the appropriate offset indicated by
 *   the offset_ptr
 *    If marker contains SKIPFLAG, then the offset_ptr is advanced
 *   by RUNLEN * sizeof(WORD).
 *
 * If chunk type is 0 then body phase is complete.
 *
 * There can be one or more chunks with type XC_SAVE_ID_COMPRESSED_DATA,
 * containing compressed pages. The compressed chunks are collated to form
 * one single compressed chunk for the entire iteration. The number of pages
 * present in this final compressed chunk will be equal to the total number
 * of valid PFNs specified by the +ve chunks.
 *
 * At the sender side, compressed pages are inserted into the output stream
 * in the same order as they would have been if compression logic was absent.
 *
 * Until last iteration, the BODY is sent in Format A, to maintain live
 * migration compatibility with receivers of older Xen versions.
 * At the last iteration, if Remus compression was enabled, the sender sends
 * a trigger, XC_SAVE_ID_ENABLE_COMPRESSION to tell the receiver to parse the
 * BODY in Format B from the next iteration onwards.
 *
 * An example sequence of chunks received in Format B:
 *     +16                              +ve chunk
 *     unsigned long[16]                PFN array
 *     +100                             +ve chunk
 *     unsigned long[100]               PFN array
 *     +50                              +ve chunk
 *     unsigned long[50]                PFN array
 *
 *     XC_SAVE_ID_COMPRESSED_DATA       TAG
 *       N                              Length of compressed data
 *       N bytes of DATA                Decompresses to 166 pages
 *
 *     XC_SAVE_ID_*                     other xc save chunks
 *     0                                END BODY TAG
 *
 * Corner case with checkpoint compression:
 *     At sender side, after pausing the domain, dirty pages are usually
 *   copied out to a temporary buffer. After the domain is resumed,
 *   compression is done and the compressed chunk(s) are sent, followed by
 *   other XC_SAVE_ID_* chunks.
 *     If the temporary buffer gets full while scanning for dirty pages,
 *   the sender stops buffering of dirty pages, compresses the temporary
 *   buffer and sends the compressed data with XC_SAVE_ID_COMPRESSED_DATA.
 *   The sender then resumes the buffering of dirty pages and continues
 *   scanning for the dirty pages.
 *     For e.g., assume that the temporary buffer can hold 4096 pages and
 *   there are 4100 dirty pages. The following is the sequence of chunks
 *   that the receiver will see:
 *
 *     +1024                       +ve chunk
 *     unsigned long[1024]         PFN array
 *     +1024                       +ve chunk
 *     unsigned long[1024]         PFN array
 *     +1024                       +ve chunk
 *     unsigned long[1024]         PFN array
 *     +1024                       +ve chunk
 *     unsigned long[1024]         PFN array
 *
 *     XC_SAVE_ID_COMPRESSED_DATA  TAG
 *      N                          Length of compressed data
 *      N bytes of DATA            Decompresses to 4096 pages
 *
 *     +4                          +ve chunk
 *     unsigned long[4]            PFN array
 *
 *     XC_SAVE_ID_COMPRESSED_DATA  TAG
 *      M                          Length of compressed data
 *      M bytes of DATA            Decompresses to 4 pages
 *
 *     XC_SAVE_ID_*                other xc save chunks
 *     0                           END BODY TAG
 *
 *     In other words, XC_SAVE_ID_COMPRESSED_DATA can be interleaved with
 *   +ve chunks arbitrarily. But at the receiver end, the following condition
 *   always holds true until the end of BODY PHASE:
 *    num(PFN entries +ve chunks) >= num(pages received in compressed form)
 *
 * TAIL PHASE
 * ----------
 *
 * Content differs for PV and HVM guests.
 *
 * HVM TAIL:
 *
 *  "Magic" pages:
 *     uint64_t         : I/O req PFN
 *     uint64_t         : Buffered I/O req PFN
 *     uint64_t         : Store PFN
 *  Xen HVM Context:
 *     uint32_t         : Length of context in bytes
 *     bytes            : Context data
 *  Qemu context:
 *     char[21]         : Signature:
 *       "QemuDeviceModelRecord" : Read Qemu save data until EOF
 *       "DeviceModelRecord0002" : uint32_t length field followed by that many
 *                                 bytes of Qemu save data
 *       "RemusDeviceModelState" : Currently the same as "DeviceModelRecord0002".
 *
 * PV TAIL:
 *
 *  Unmapped PFN list   : list of all the PFNs that were not in map at the close
 *     unsigned int     : Number of unmapped pages
 *     unsigned long[]  : PFNs of unmapped pages
 *
 *  VCPU context data   : A series of VCPU records, one per present VCPU
 *                        Maximum and present map supplied in XC_SAVE_ID_VCPU_INFO
 *     bytes            : VCPU context structure. Size is determined by size
 *                        provided in extended-info header
 *     bytes[128]       : Extended VCPU context (present IFF "extv" block
 *                        present in extended-info header)
 *
 *  Shared Info Page    : 4096 bytes of shared info page
 */

/*
 * Negative ("-ve") chunk types that can appear in the body phase of the
 * stream.  Every value is wrapped in parentheses so that the macro always
 * expands to a single primary expression, whatever operators surround it
 * at the point of use.  The numeric values are part of the stream format
 * and must not change.
 */
#define XC_SAVE_ID_ENABLE_VERIFY_MODE (-1) /* Switch to validation phase. */
#define XC_SAVE_ID_VCPU_INFO          (-2) /* Additional VCPU info */
#define XC_SAVE_ID_HVM_IDENT_PT       (-3) /* (HVM-only) */
#define XC_SAVE_ID_HVM_VM86_TSS       (-4) /* (HVM-only) */
#define XC_SAVE_ID_TMEM               (-5)
#define XC_SAVE_ID_TMEM_EXTRA         (-6)
#define XC_SAVE_ID_TSC_INFO           (-7)
#define XC_SAVE_ID_HVM_CONSOLE_PFN    (-8) /* (HVM-only) */
#define XC_SAVE_ID_LAST_CHECKPOINT    (-9) /* Commit to restoring after completion of current iteration. */
#define XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION (-10)
#define XC_SAVE_ID_HVM_VIRIDIAN       (-11)
#define XC_SAVE_ID_COMPRESSED_DATA    (-12) /* Marker to indicate arrival of compressed data */
#define XC_SAVE_ID_ENABLE_COMPRESSION (-13) /* Marker to enable compression logic at receiver side */
#define XC_SAVE_ID_HVM_GENERATION_ID_ADDR (-14)
/* Markers for the pfn's hosting these mem event rings */
#define XC_SAVE_ID_HVM_PAGING_RING_PFN  (-15)
#define XC_SAVE_ID_HVM_ACCESS_RING_PFN  (-16)
#define XC_SAVE_ID_HVM_SHARING_RING_PFN (-17)
#define XC_SAVE_ID_TOOLSTACK          (-18) /* Optional toolstack specific info */

/*
** Save/restore/migrate handle guest memory in batches of pages.
** MAX_BATCH_SIZE is the largest number of pages dealt with in a single
** batch.
*/
#define MAX_BATCH_SIZE 1024   /* at most 1024 pages per batch (4MB, i.e. assuming 4KiB pages) */

/*
** Pinning of page tables at the end of restore is batched as well;
** MAX_PIN_BATCH is the limit used for that batching.
*/
#define MAX_PIN_BATCH  1024

/* Upper limit on the number of VCPUs that save/restore currently supports. */
#define XC_SR_MAX_VCPUS 4096

/*
 * Size in bytes of a VCPU bitmap built from uint64_t words that is able to
 * hold bit number max_id: (max_id / 64 + 1) words.
 */
#define vcpumap_sz(max_id) (sizeof(uint64_t) * ((max_id) / 64 + 1))


/*
** Collect the platform parameters that save/restore depends on:
**
**    - max_mfn:     the maximum MFN on this machine, used to compute the
**                   size of the M2P table;
**
**    - hvirt_start: the starting virtual address of the hypervisor, used
**                   to determine which parts of guest address space(s) do
**                   and don't require canonicalization during save/restore;
**
**    - pt_levels:   the number of page-table levels for save/restore.
**                   This should be a property of the domain, but for the
**                   moment it is derived from the hypervisor capabilities
**                   string;
**
**    - guest_width: the width of a guest word (unsigned long), in bytes.
**
** Returns 1 on success, 0 on failure.  On failure, output parameters that
** were stored before the failing step keep the values written to them.
*/
static inline int get_platform_info(xc_interface *xch, uint32_t dom,
                                    /* OUT */ unsigned long *max_mfn,
                                    /* OUT */ unsigned long *hvirt_start,
                                    /* OUT */ unsigned int *pt_levels,
                                    /* OUT */ unsigned int *guest_width)
{
    xen_platform_parameters_t plat_params;
    xen_capabilities_info_t caps = "";
    int hv_is_64bit;

    /* Both queries must succeed; the second is skipped if the first fails. */
    if ( xc_version(xch, XENVER_platform_parameters, &plat_params) != 0 ||
         xc_version(xch, XENVER_capabilities, &caps) != 0 )
        return 0;

    *max_mfn = xc_maximum_ram_page(xch);
    *hvirt_start = plat_params.virt_start;

    if ( xc_domain_get_guest_width(xch, dom, guest_width) != 0 )
        return 0;

    /* 64-bit tools will see the 64-bit hvirt_start, but a guest whose word
     * is narrower than the tools' unsigned long uses the compat one. */
    if ( *guest_width < sizeof(unsigned long) )
    {
        /* XXX need to fix up a way of extracting this value from Xen if
         * XXX it becomes variable for domU */
        *hvirt_start = 0xf5800000;
    }

    /* Anything other than these two capability strings is unsupported. */
    hv_is_64bit = (strstr(caps, "xen-3.0-x86_64") != 0);
    if ( !hv_is_64bit && !strstr(caps, "xen-3.0-x86_32p") )
        return 0;

    /* Only an 8-byte-wide guest on an x86_64 hypervisor gets 4 levels; a
     * compat 32-on-64 guest and an x86_32p hypervisor both get 3. */
    *pt_levels = (hv_is_64bit && *guest_width == 8) ? 4 : 3;

    return 1;
}


/*
** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables.
** The M2P simply holds the corresponding PFN, while the top bit of a P2M
** entry tells us whether or not the PFN is currently mapped.
*/

/*
 * Convert a count of page frames into kilobytes by shifting it left by
 * (PAGE_SHIFT - 10); PAGE_SHIFT is defined elsewhere and is presumably
 * log2 of the page size in bytes.  The shift is carried out in the type
 * of _pfn, so callers must pass a type wide enough for the result.
 */
#define PFN_TO_KB(_pfn) ((_pfn) << (PAGE_SHIFT - 10))


/*
** The M2P is made up of some number of 'chunks' of at least 2MB in size.
** The below definitions and utility function(s) deal with mapping the M2P
** regardless of the underlying machine memory size or architecture.
**
**   M2P_SHIFT      - log2 of the chunk size in bytes (taken from
**                    L2_PAGETABLE_SHIFT_PAE, which is defined elsewhere).
**   M2P_CHUNK_SIZE - size of one chunk in bytes.
**   M2P_SIZE(_m)   - bytes occupied by _m xen_pfn_t entries, passed through
**                    ROUNDUP with M2P_SHIFT (ROUNDUP is defined elsewhere;
**                    presumably it rounds up to a whole number of chunks).
**   M2P_CHUNKS(_m) - M2P_SIZE(_m) expressed as a number of chunks.
*/
#define M2P_SHIFT       L2_PAGETABLE_SHIFT_PAE
#define M2P_CHUNK_SIZE  (1 << M2P_SHIFT)
#define M2P_SIZE(_m)    ROUNDUP(((_m) * sizeof(xen_pfn_t)), M2P_SHIFT)
#define M2P_CHUNKS(_m)  (M2P_SIZE((_m)) >> M2P_SHIFT)

/*
 * Evaluates to non-zero (TRUE) when bit 31 (mask 0x80000000) of pfn_type
 * is clear, which marks the PFN as currently mapped.
 */
#define is_mapped(pfn_type) (((pfn_type) & 0x80000000UL) == 0)


/*
 * Width-aware field accessors for an object that carries the same field
 * in both an 'x64' and an 'x32' member.  The 'x64' member is used when
 * dinfo->guest_width is 8, the 'x32' member otherwise.
 *
 * NOTE: both macros rely on a variable named 'dinfo' (with a
 * 'guest_width' member) being in scope at the point of use.
 */
#define GET_FIELD(_p, _f) \
    ((dinfo->guest_width != 8) ? ((_p)->x32._f) : ((_p)->x64._f))

#define SET_FIELD(_p, _f, _v) do {              \
    if (dinfo->guest_width != 8)                \
        (_p)->x32._f = (_v);                    \
    else                                        \
        (_p)->x64._f = (_v);                    \
} while (0)

/*
 * Conversions applied to a CR3 value, selected by dinfo->guest_width.
 *
 *   UNFOLD_CR3: for a guest width of 8 the value is shifted right by 12;
 *               otherwise the low 32 bits are rotated right by 12.
 *   FOLD_CR3:   the inverse - shift left by 12 (performed in 64 bits) for
 *               a guest width of 8, otherwise rotate the low 32 bits left
 *               by 12.
 *
 * The result is always a uint64_t.  The argument may be evaluated more
 * than once, and a variable named 'dinfo' must be in scope.
 */
#define UNFOLD_CR3(_c)                                                  \
  ((uint64_t)((dinfo->guest_width != 8)                                 \
              ? (((uint32_t)(_c) << 20) | ((uint32_t)(_c) >> 12))       \
              : ((_c) >> 12)))

#define FOLD_CR3(_c)                                                    \
  ((uint64_t)((dinfo->guest_width != 8)                                 \
              ? (((uint32_t)(_c) >> 20) | ((uint32_t)(_c) << 12))       \
              : (((uint64_t)(_c)) << 12)))

/*
 * Width-aware bulk operations on a field that exists in both the 'x64'
 * and the 'x32' member of an object.  The 'x64' variant is used when
 * dinfo->guest_width is 8, the 'x32' variant otherwise; a variable named
 * 'dinfo' must be in scope at the point of use.
 *
 *   MEMCPY_FIELD:       copy field _f from *_s to *_d; the byte count is
 *                       the size of the destination field.
 *   MEMSET_ARRAY_FIELD: fill the whole array field _f of *_p with byte _v.
 */
#define MEMCPY_FIELD(_d, _s, _f) do {                                   \
    if (dinfo->guest_width != 8)                                        \
        memcpy(&(_d)->x32._f, &(_s)->x32._f, sizeof((_d)->x32._f));     \
    else                                                                \
        memcpy(&(_d)->x64._f, &(_s)->x64._f, sizeof((_d)->x64._f));     \
} while (0)

#define MEMSET_ARRAY_FIELD(_p, _f, _v) do {                             \
    if (dinfo->guest_width != 8)                                        \
        memset((_p)->x32._f, (_v), sizeof((_p)->x32._f));               \
    else                                                                \
        memset((_p)->x64._f, (_v), sizeof((_p)->x64._f));               \
} while (0)

/*
 * Fallback MAX/MIN, defined only when no earlier definition exists.
 * Each macro evaluates both arguments for the comparison and then the
 * selected argument a second time, so arguments with side effects must
 * be avoided.  When the arguments compare equal, _a is selected.
 */
#ifndef MAX
#define MAX(_a, _b) ((_a) >= (_b) ? (_a) : (_b))
#endif
#ifndef MIN
#define MIN(_a, _b) ((_a) <= (_b) ? (_a) : (_b))
#endif