path: root/package/libs/libnl-tiny/src/attr.c
blob: e0f50611cff9e94b5110a34ebfae32c86d47aba9
/*
 * lib/attr.c		Netlink Attributes
 *
 *	This library is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU Lesser General Public
 *	License as published by the Free Software Foundation version 2.1
 *	of the License.
 *
 * Copyright (c) 2003-2008 Thomas Graf <tgraf@suug.ch>
 */

#include <netlink-local.h>
#include <netlink/netlink.h>
#include <netlink/utils.h>
#include <netlink/addr.h>
#include <netlink/attr.h>
#include <netlink/msg.h>
#include <linux/socket.h>

/**
 * @ingroup msg
 * @defgroup attr Attributes
 * Netlink Attributes Construction/Parsing Interface
 *
 * \section attr_sec Netlink Attributes
 * Netlink attributes allow for data chunks of arbitrary length to be
 * attached to a netlink message. Each attribute is encoded with a
 * type and length field, both 16 bits, stored in the attribute header
 * preceding the attribute data. The main advantage of using attributes
 * over packing everything into the family header is that the interface
 * stays extendable as new attributes can supersede old attributes while
 * remaining backwards compatible. Also, attributes can be defined as optional,
 * thus avoiding the transmission of unnecessary empty data blocks.
 * Special nested attributes allow for more complex data structures to
 * be transmitted, e.g. trees, lists, etc.
 *
 * While not required, netlink attributes typically follow the family
 * header of a netlink message and must be properly aligned to NLA_ALIGNTO:
 * @code
 *   +----------------+- - -+---------------+- - -+------------+- - -+
 *   | Netlink Header | Pad | Family Header | Pad | Attributes | Pad |
 *   +----------------+- - -+---------------+- - -+------------+- - -+
 * @endcode
 *
 * The actual attributes are chained together, each separately aligned to
 * NLA_ALIGNTO. The position of an attribute is defined based on the
 * length field of the preceding attributes:
 * @code
 *   +-------------+- - -+-------------+- - -+------
 *   | Attribute 1 | Pad | Attribute 2 | Pad | ...
 *   +-------------+- - -+-------------+- - -+------
 *   nla_next(attr1)------^
 * @endcode
 *
 * The attribute itself consists of the attribute header followed by
 * the actual payload also aligned to NLA_ALIGNTO. The function nla_data()
 * returns a pointer to the start of the payload while nla_len() returns
 * the length of the payload in bytes.
 *
 * \b Note: Be aware that NLA_ALIGNTO equals 4 bytes; therefore it is not
 * safe to dereference 64 bit data types in the payload directly.
 *
 * @code
 *    <----------- nla_total_size(payload) ----------->
 *    <-------- nla_attr_size(payload) --------->
 *   +------------------+- - -+- - - - - - - - - +- - -+
 *   | Attribute Header | Pad |     Payload      | Pad |
 *   +------------------+- - -+- - - - - - - - - +- - -+
 *   nla_data(nla)-------------^
 *                             <- nla_len(nla) ->
 * @endcode
 *
 * @subsection attr_datatypes Attribute Data Types
 * A number of basic data types are supported to simplify access and
 * validation of netlink attributes. This data type information is
 * not encoded in the attribute itself; both the kernel and userspace side
 * are required to share this information on their own.
 *
 * One of the major advantages of these basic types is the automatic
 * validation of each attribute based on an attribute policy. The
 * validation covers most of the checks required to safely use
 * attributes and thus keeps the individual sanity checks to a minimum.
 *
 * Never access attribute payload without ensuring basic validation
 * first; attributes may:
 * - not be present even though required
 * - contain less actual payload than expected
 * - fake an attribute length which exceeds the end of the message
 * - contain unterminated character strings
 *
 * Policies are defined as an array of struct nla_policy. The array is
 * indexed with the attribute type, therefore the array must be sized
 * accordingly.
 * @code
 * static struct nla_policy my_policy[ATTR_MAX+1] = {
 * 	[ATTR_FOO] = { .type = ..., .minlen = ..., .maxlen = ... },
 * };
 *
 * err = nla_validate(attrs, attrlen, ATTR_MAX, my_policy);
 * @endcode
 *
 * Some basic validations are performed on every attribute, regardless of type.
 * - If the attribute type exceeds the maximum attribute type specified or
 *   the attribute type is less than or equal to zero, the attribute will
 *   be silently ignored.
 * - If the payload length falls below the \a minlen value the attribute
 *   will be rejected.
 * - If \a maxlen is non-zero and the payload length exceeds the \a maxlen
 *   value the attribute will be rejected.
 *
 *
 * @par Unspecific Attribute (NLA_UNSPEC)
 * This is the standard type if no type is specified. It is used for
 * binary data of arbitrary length. Typically this attribute carries
 * a binary structure or a stream of bytes.
 * @par
 * @code
 * // In this example, we will assume a binary structure needs to
 * // be transmitted. The definition of the structure will typically
 * // go into a header file available to both the kernel and userspace
 * // side.
 * //
 * // Note: Be careful when putting 64 bit data types into a structure.
 * // The attribute payload is only aligned to 4 bytes, dereferencing
 * // the member may fail.
 * struct my_struct {
 *     int a;
 *     int b;
 * };
 *
 * // The validation function will not enforce an exact length match to
 * // allow structures to grow as required. Note: While it is allowed
 * // to add members to the end of the structure, changing the order or
 * // inserting members in the middle of the structure will break your
 * // binary interface.
 * static struct nla_policy my_policy[ATTR_MAX+1] = {
 *     [ATTR_MY_STRUCT] = { .type = NLA_UNSPEC,
 *                          .minlen = sizeof(struct my_struct) },
 * };
 *
 * // The binary structure is appended to the message using nla_put().
 * struct my_struct foo = { .a = 1, .b = 2 };
 * nla_put(msg, ATTR_MY_STRUCT, sizeof(foo), &foo);
 *
 * // On the receiving side, a pointer to the structure inside the
 * // message payload is returned by nla_data().
 * if (attrs[ATTR_MY_STRUCT])
 *     struct my_struct *foo = nla_data(attrs[ATTR_MY_STRUCT]);
 * @endcode
 *
 * @par Integers (NLA_U8, NLA_U16, NLA_U32, NLA_U64)
 * Integers come in different sizes from 8 bit to 64 bit. However, since the
 * payload length is aligned to 4 bytes, integers smaller than 32 bit are
 * only useful to enforce the maximum range of values.
 * @par
 * \b Note: There is no difference made between signed and unsigned integers.
 * The validation only enforces the minimal payload length required to store
 * an integer of specified type.
 * @par
 * @code
 * // Even though possible, it does not make sense to specify .minlen or
 * // .maxlen for integer types. The data type implies the corresponding
 * // minimal payload length.
 * static struct nla_policy my_policy[ATTR_MAX+1] = {
 *     [ATTR_FOO] = { .type = NLA_U32 },
 * };
 *
 * // Numeric values can be appended directly using the respective
 * // nla_put_uxxx() function
 * nla_put_u32(msg, ATTR_FOO, 123);
 *
 * // Same for the receiving side.
 * if (attrs[ATTR_FOO])
 *     uint32_t foo = nla_get_u32(attrs[ATTR_FOO]);
 * @endcode
 *
 * @par Character string (NLA_STRING)
 * This data type represents a NUL terminated character string of variable
 * length. For binary data streams the type NLA_UNSPEC is recommended.
 * @par
 * @code
 * // Enforce a NUL terminated character string of at most 4 characters
 * // including the NUL termination.
 * static struct nla_policy my_policy[ATTR_MAX+1] = {
 *     [ATTR_BAR] = { .type = NLA_STRING, .maxlen = 4 },
 * };
 *
 * // nla_put_string() creates a string attribute of the necessary length
 * // and appends it to the message including the NUL termination.
 * nla_put_string(msg, ATTR_BAR, "some text");
 *
 * // It is safe to use the returned character string directly if the
 * // attribute has been validated as the validation enforces the proper
 * // termination of the string.
 * if (attrs[ATTR_BAR])
 *     char *text = nla_get_string(attrs[ATTR_BAR]);
 * @endcode
 *
 * @par Flag (NLA_FLAG)
 * This attribute type may be used to indicate the presence of a flag. The
 * attribute is only valid if the payload length is zero. The presence of
 * the attribute header indicates the presence of the flag.
 * @par
 * @code
 * // This attribute type is special as .minlen and .maxlen have no effect.
 * static struct nla_policy my_policy[ATTR_MAX+1] = {
 *     [ATTR_FLAG] = { .type = NLA_FLAG },
 * };
 *
 * // nla_put_flag() appends a zero sized attribute to the message.
 * nla_put_flag(msg, ATTR_FLAG);
 *
 * // There is no need for a retrieval function; the presence itself is the value.
 * if (attrs[ATTR_FLAG])
 *     // flag is present
 * @endcode
 *
 * @par Micro Seconds (NLA_MSECS)
 *
 * @par Nested Attribute (NLA_NESTED)
 * Attributes can be nested and put into a container to create groups, lists
 * or to construct trees of attributes. Nested attributes are often used to
 * pass attributes to a subsystem where the top layer has no knowledge of the
 * configuration possibilities of each subsystem.
 * @par
 * \b Note: When validating the attributes using nlmsg_validate() or
 * nlmsg_parse(), only the top level attributes are affected. Each
 * level of nested attributes must be validated separately using
 * nla_parse_nested() or nla_validate().
 * @par
 * @code
 * // The minimal length policy may be used to enforce the presence of at
 * // least one attribute.
 * static struct nla_policy my_policy[ATTR_MAX+1] = {
 *     [ATTR_OPTS] = { .type = NLA_NESTED, .minlen = NLA_HDRLEN },
 * };
 *
 * // Nested attributes are constructed by enclosing the attributes
 * // to be nested with calls to nla_nest_start() and nla_nest_end().
 * struct nlattr *opts = nla_nest_start(msg, ATTR_OPTS);
 * nla_put_u32(msg, ATTR_FOO, 123);
 * nla_put_string(msg, ATTR_BAR, "some text");
 * nla_nest_end(msg, opts);
 *
 * // Various methods exist to parse nested attributes, the easiest being
 * // nla_parse_nested() which also allows validation in the same step.
 * if (attrs[ATTR_OPTS]) {
 *     struct nlattr *nested[ATTR_MAX+1];
 *
 *     nla_parse_nested(nested, ATTR_MAX, attrs[ATTR_OPTS], my_policy);
 *
 *     if (nested[ATTR_FOO])
 *         uint32_t foo = nla_get_u32(nested[ATTR_FOO]);
 * }
 * @endcode
 *
 * @subsection attr_exceptions Exception Based Attribute Construction
 * Often a large number of attributes are added to a message in a single
 * function. In order to simplify error handling, a second set of
 * construction functions exists; these jump to an error label when they
 * fail instead of returning an error code. This second set consists
 * of macros which are named after their error code based counterpart
 * except that the name is written all uppercase.
 *
 * All of the macros jump to the target \c nla_put_failure if they fail.
 * @code
 * int my_func(struct nl_msg *msg)
 * {
 *     NLA_PUT_U32(msg, ATTR_FOO, 10);
 *     NLA_PUT_STRING(msg, ATTR_BAR, "bar");
 *
 *     return 0;
 *
 * nla_put_failure:
 *     return -NLE_NOMEM;
 * }
 * @endcode
 *
 * @subsection attr_examples Examples
 * @par Example 1.1 Constructing a netlink message with attributes.
 * @code
 * struct nl_msg *build_msg(int ifindex, struct nl_addr *lladdr, int mtu)
 * {
 *     struct nl_msg *msg;
 *     struct nlattr *info, *vlan;
 *     struct ifinfomsg ifi = {
 *         .ifi_family = AF_INET,
 *         .ifi_index = ifindex,
 *     };
 *
 *     // Allocate a new netlink message, type=RTM_SETLINK, flags=NLM_F_ECHO
 *     if (!(msg = nlmsg_alloc_simple(RTM_SETLINK, NLM_F_ECHO)))
 *         return NULL;
 *
 *     // Append the family specific header (struct ifinfomsg)
 *     if (nlmsg_append(msg, &ifi, sizeof(ifi), NLMSG_ALIGNTO) < 0)
 *         goto nla_put_failure;
 *
 *     // Append a 32 bit integer attribute to carry the MTU
 *     NLA_PUT_U32(msg, IFLA_MTU, mtu);
 *
 *     // Append an unspecific attribute to carry the link layer address
 *     NLA_PUT_ADDR(msg, IFLA_ADDRESS, lladdr);
 *
 *     // Append a container for nested attributes to carry link information
 *     if (!(info = nla_nest_start(msg, IFLA_LINKINFO)))
 *         goto nla_put_failure;
 *
 *     // Put a string attribute into the container
 *     NLA_PUT_STRING(msg, IFLA_INFO_KIND, "vlan");
 *
 *     // Append another container inside the open container to carry
 *     // vlan specific attributes
 *     if (!(vlan = nla_nest_start(msg, IFLA_INFO_DATA)))
 *         goto nla_put_failure;
 *
 *     // add vlan specific info attributes here...
 *
 *     // Finish nesting the vlan attributes and close the second container.
 *     nla_nest_end(msg, vlan);
 *
 *     // Finish nesting the link info attribute and close the first container.
 *     nla_nest_end(msg, info);
 *
 *     return msg;
 *
 * // If any of the construction macros fails, we end up here.
 * nla_put_failure:
 *     nlmsg_free(msg);
 *     return NULL;
 * }
 * @endcode
 *
 * @par Example 2.1 Parsing a netlink message with attributes.
 * @code
 * int parse_message(struct nl_msg *msg)
 * {
 *     // The policy defines two attributes: a 32 bit integer and a container
 *     // for nested attributes.
 *     struct nla_policy attr_policy[ATTR_MAX+1] = {
 *         [ATTR_FOO] = { .type = NLA_U32 },
 *         [ATTR_BAR] = { .type = NLA_NESTED },
 *     };
 *     struct nlattr *attrs[ATTR_MAX+1];
 *     int err;
 *
 *     // The nlmsg_parse() function will make sure that the message contains
 *     // enough payload to hold the header (struct my_hdr), validates any
 *     // attributes attached to the message and stores a pointer to each
 *     // attribute in the attrs[] array, accessible by attribute type.
 *     if ((err = nlmsg_parse(nlmsg_hdr(msg), sizeof(struct my_hdr), attrs,
 *                            ATTR_MAX, attr_policy)) < 0)
 *         goto errout;
 *
 *     if (attrs[ATTR_FOO]) {
 *         // It is safe to directly access the attribute payload without
 *         // any further checks since nlmsg_parse() enforced the policy.
 *         uint32_t foo = nla_get_u32(attrs[ATTR_FOO]);
 *     }
 *
 *     if (attrs[ATTR_BAR]) {
 *         struct nlattr *nested[NESTED_MAX+1];
 *
 *         // Attributes nested in a container can be parsed the same way
 *         // as top level attributes.
 *         if ((err = nla_parse_nested(nested, NESTED_MAX, attrs[ATTR_BAR],
 *                                     nested_policy)) < 0)
 *             goto errout;
 *
 *         // Process nested attributes here.
 *     }
 *
 *     err = 0;
 * errout:
 *     return err;
 * }
 * @endcode
 *
 * @{
 */

/**
 * @name Attribute Size Calculation
 * @{
 */

/** @} */

/**
 * @name Parsing Attributes
 * @{
 */

/**
 * Check if the attribute header and payload can be accessed safely.
 * @arg nla		Attribute of any kind.
 * @arg remaining	Number of bytes remaining in attribute stream.
 *
 * Verifies that the header and payload do not exceed the number of
 * bytes left in the attribute stream. This function must be called
 * before accessing the attribute header or payload when iterating over
 * the attribute stream using nla_next().
 *
 * @return True if the attribute can be accessed safely, false otherwise.
 */
int nla_ok(const struct nlattr *nla, int remaining)
{
	return remaining >= sizeof(*nla) &&
	       nla->nla_len >= sizeof(*nla) &&
	       nla->nla_len <= remaining;
}

/**
 * Return next attribute in a stream of attributes.
 * @arg nla		Attribute of any kind.
 * @arg remaining	Variable to count remaining bytes in stream.
 *
 * Calculates the offset to the next attribute based on the attribute
 * given. The attribute provided is assumed to be accessible; the
 * caller is responsible for calling nla_ok() beforehand. The offset (length
 * of specified attribute including padding) is then subtracted from
 * the remaining bytes variable and a pointer to the next attribute is
 * returned.
 *
 * nla_next() can be called as long as remaining is > 0.
 *
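 * A minimal iteration sketch; it assumes \c head and \c len describe an
 * attribute stream (e.g. obtained via nlmsg_attrdata()/nlmsg_attrlen())
 * and is essentially what the nla_for_each_attr() macro expands to.
 * @code
 * struct nlattr *pos;
 * int rem;
 *
 * for (pos = head, rem = len; nla_ok(pos, rem); pos = nla_next(pos, &rem))
 *     printf("type=%d len=%d\n", nla_type(pos), nla_len(pos));
 * @endcode
 *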
 * @return Pointer to next attribute.
 */
struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{
	int totlen = NLA_ALIGN(nla->nla_len);

	*remaining -= totlen;
	return (struct nlattr *) ((char *) nla + totlen);
}

static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = {
	[NLA_U8]	= sizeof(uint8_t),
	[NLA_U16]	= sizeof(uint16_t),
	[NLA_U32]	= sizeof(uint32_t),
	[NLA_U64]	= sizeof(uint64_t),
	[NLA_STRING]	= 1,
};

static int validate_nla(struct nlattr *nla, int maxtype,
			struct nla_policy *policy)
{
	struct nla_policy *pt;
	int minlen = 0, type = nla_type(nla);

	if (type <= 0 || type > maxtype)
		return 0;

	pt = &policy[type];

	if (pt->type > NLA_TYPE_MAX)
		BUG();

	if (pt->minlen)
		minlen = pt->minlen;
	else if (pt->type != NLA_UNSPEC)
		minlen = nla_attr_minlen[pt->type];

	if (pt->type == NLA_FLAG && nla_len(nla) > 0)
		return -NLE_RANGE;

	if (nla_len(nla) < minlen)
		return -NLE_RANGE;

	if (pt->maxlen && nla_len(nla) > pt->maxlen)
		return -NLE_RANGE;

	if (pt->type == NLA_STRING) {
		char *data = nla_data(nla);
		if (data[nla_len(nla) - 1] != '\0')
			return -NLE_INVAL;
	}

	return 0;
}


/**
 * Create attribute index based on a stream of attributes.
 * @arg tb		Index array to be filled (maxtype+1 elements).
 * @arg maxtype		Maximum attribute type expected and accepted.
 * @arg head		Head of attribute stream.
 * @arg len		Length of attribute stream.
 * @arg policy		Attribute validation policy.
 *
 * Iterates over the stream of attributes and stores a pointer to each
 * attribute in the index array using the attribute type as index into
 * the array. Attributes with a type greater than the maximum type
 * specified will be silently ignored in order to maintain backwards
 * compatibility. If \a policy is not NULL, the attribute will be
 * validated using the specified policy.
 *
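 * A usage sketch, assuming ATTR_MAX, my_policy and struct my_hdr from the
 * examples above, with \c nlh pointing to a received message header and
 * nlmsg_attrdata()/nlmsg_attrlen() used to locate the attribute stream:
 * @code
 * struct nlattr *attrs[ATTR_MAX+1];
 * int err;
 *
 * err = nla_parse(attrs, ATTR_MAX, nlmsg_attrdata(nlh, sizeof(struct my_hdr)),
 *                 nlmsg_attrlen(nlh, sizeof(struct my_hdr)), my_policy);
 * if (err < 0)
 *     return err;
 * @endcode
 *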
 * @see nla_validate
 * @return 0 on success or a negative error code.
 */
int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
	      struct nla_policy *policy)
{
	struct nlattr *nla;
	int rem, err;

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));

	nla_for_each_attr(nla, head, len, rem) {
		int type = nla_type(nla);

		if (type == 0) {
			fprintf(stderr, "Illegal nla->nla_type == 0\n");
			continue;
		}

		if (type <= maxtype) {
			if (policy) {
				err = validate_nla(nla, maxtype, policy);
				if (err < 0)
					goto errout;
			}

			tb[type] = nla;
		}
	}

	if (rem > 0)
		fprintf(stderr, "netlink: %d bytes leftover after parsing "
		       "attributes.\n", rem);

	err = 0;
errout:
	return err;
}

/**
 * Validate a stream of attributes.
 * @arg head		Head of attributes stream.
 * @arg len		Length of attributes stream.
 * @arg maxtype		Maximum attribute type expected and accepted.
 * @arg policy		Validation policy.
 *
 * Iterates over the stream of attributes and validates each attribute
 * one by one using the specified policy. Attributes with a type greater
 * than the maximum type specified will be silently ignored in order to
 * maintain backwards compatibility.
 *
 * See \ref attr_datatypes for more details on what kind of validation
 * checks are performed on each attribute data type.
 *
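 * A brief sketch, reusing the variables from the nla_parse() example above:
 * @code
 * // Validate the whole stream before walking it manually.
 * err = nla_validate(nlmsg_attrdata(nlh, sizeof(struct my_hdr)),
 *                    nlmsg_attrlen(nlh, sizeof(struct my_hdr)),
 *                    ATTR_MAX, my_policy);
 * if (err < 0)
 *     return err;
 * @endcode
 *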
 * @return 0 on success or a negative error code.
 */
int nla_validate(struct nlattr *head, int len, int maxtype,
		 struct nla_policy *policy)
{
	struct nlattr *nla;
	int rem, err;

	nla_for_each_attr(nla, head, len, rem) {
		err = validate_nla(nla, maxtype, policy);
		if (err < 0)
			goto errout;
	}

	err = 0;
errout:
	return err;
}

/**
 * Find a single attribute in a stream of attributes.
 * @arg head		Head of attributes stream.
 * @arg len		Length of attributes stream.
 * @arg attrtype	Attribute type to look for.
 *
 * Iterates over the stream of attributes and compares each type with
 * the type specified. Returns the first attribute which matches the
 * type.
 *
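 * Lookup sketch, with ATTR_FOO and the stream location as in the examples
 * above (the attribute is assumed to be a validated NLA_U32):
 * @code
 * struct nlattr *nla;
 *
 * nla = nla_find(nlmsg_attrdata(nlh, sizeof(struct my_hdr)),
 *                nlmsg_attrlen(nlh, sizeof(struct my_hdr)), ATTR_FOO);
 * if (nla) {
 *     uint32_t foo = nla_get_u32(nla);
 *     // use foo ...
 * }
 * @endcode
 *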
 * @return Pointer to attribute found or NULL.
 */
struct nlattr *nla_find(struct nlattr *head, int len, int attrtype)
{
	struct nlattr *nla;
	int rem;

	nla_for_each_attr(nla, head, len, rem)
		if (nla_type(nla) == attrtype)
			return nla;

	return NULL;
}

/** @} */

/**
 * @name Unspecific Attribute
 * @{
 */

/**
 * Reserve space for an attribute.
 * @arg msg		Netlink Message.
 * @arg attrtype	Attribute Type.
 * @arg attrlen		Length of payload.
 *
 * Reserves room for an attribute in the specified netlink message and
 * fills in the attribute header (type, length). Returns NULL if there
 * is insufficient space for the attribute.
 *
 * Any padding between payload and the start of the next attribute is
 * zeroed out.
 *
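 * A construction sketch based on the ATTR_MY_STRUCT example above; nla_put()
 * below performs the reserve and copy in a single step:
 * @code
 * struct my_struct foo = { .a = 1, .b = 2 };
 * struct nlattr *nla;
 *
 * if (!(nla = nla_reserve(msg, ATTR_MY_STRUCT, sizeof(foo))))
 *     return -NLE_NOMEM;
 * memcpy(nla_data(nla), &foo, sizeof(foo));
 * @endcode
 *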
 * @return Pointer to start of attribute or NULL on failure.
 */
struct nlattr *nla_reserve(struct nl_msg *msg, int attrtype, int attrlen)
{
	struct nlattr *nla;
	int tlen;
	
	tlen = NLMSG_ALIGN(msg->nm_nlh->nlmsg_len) + nla_total_size(attrlen);

	if ((tlen + msg->nm_nlh->nlmsg_len) > msg->nm_size)
		return NULL;

	nla = (struct nlattr *) nlmsg_tail(msg->nm_nlh);
	nla->nla_type = attrtype;
	nla->nla_len = nla_attr_size(attrlen);

	memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen));
	msg->nm_nlh->nlmsg_len = tlen;

	NL_DBG(2, "msg %p: Reserved %d bytes at offset +%td for attr %d "
		  "nlmsg_len=%d\n", msg, attrlen,
		  (void *) nla - nlmsg_data(msg->nm_nlh),
		  attrtype, msg->nm_nlh->nlmsg_len);

	return nla;
}

/**
 * Add an unspecific attribute to a netlink message.
 * @arg msg		Netlink message.
 * @arg attrtype	Attribute type.
 * @arg datalen		Length of data to be used as payload.
 * @arg data		Pointer to data to be used as attribute payload.
 *
 * Reserves room for an unspecific attribute and copies the provided data
 * into the message as payload of the attribute. Returns an error if there
 * is insufficient space for the attribute.
 *
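 * Usage sketch, mirroring the NLA_UNSPEC example in the overview above:
 * @code
 * struct my_struct foo = { .a = 1, .b = 2 };
 *
 * if (nla_put(msg, ATTR_MY_STRUCT, sizeof(foo), &foo) < 0)
 *     return -NLE_NOMEM;
 * @endcode
 *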
 * @see nla_reserve
 * @return 0 on success or a negative error code.
 */
int nla_put(struct nl_msg *msg, int attrtype, int datalen, const void *data)
{
	struct nlattr *nla;

	nla = nla_reserve(msg, attrtype, datalen);
	if (!nla)
		return -NLE_NOMEM;

	memcpy(nla_data(nla), data, datalen);
	NL_DBG(2, "msg %p: Wrote %d bytes at offset +%td for attr %d\n",
	       msg, datalen, (void *) nla - nlmsg_data(msg->nm_nlh), attrtype);

	return 0;
}



/** @} */