/****************************************************************************
 * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
 ****************************************************************************
 *
 *        File: common/csched_credit.c
 *      Author: Emmanuel Ackaouy
 *
 * Description: Credit-based SMP CPU scheduler
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>

/*
 * CSCHED_STATS
 *
 * Manage very basic counters and stats.
 *
 * Useful for debugging live systems. The stats are displayed
 * with runq dumps ('r' on the Xen console).
 */
#define CSCHED_STATS


/*
 * Basic constants
 */
#define CSCHED_TICK             10      /* milliseconds */
#define CSCHED_TSLICE           30      /* milliseconds */
#define CSCHED_ACCT_NTICKS      3
#define CSCHED_ACCT_PERIOD      (CSCHED_ACCT_NTICKS * CSCHED_TICK)
#define CSCHED_DEFAULT_WEIGHT   256


/*
 * Priorities
 */
#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
#define CSCHED_PRI_IDLE         -64     /* idle */
#define CSCHED_PRI_TS_PARKED    -65     /* time-share w/ capped credits */


/*
 * Useful macros
 */
#define CSCHED_PCPU(_c)     \
    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))


/*
 * Stats
 */
#ifdef CSCHED_STATS

#define CSCHED_STAT(_X)         (csched_priv.stats._X)
#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
#define CSCHED_STAT_PRINTK(_X)                                  \
    do                                                          \
    {                                                           \
        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));         \
    } while ( 0 );

#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
    _MACRO(vcpu_init)                       \
    _MACRO(vcpu_sleep)                      \
    _MACRO(vcpu_wake_running)               \
    _MACRO(vcpu_wake_onrunq)                \
    _MACRO(vcpu_wake_runnable)              \
    _MACRO(vcpu_wake_not_runnable)          \
    _MACRO(dom_destroy)                     \
    _MACRO(schedule)                        \
    _MACRO(tickle_local_idler)              \
    _MACRO(tickle_local_over)               \
    _MACRO(tickle_local_under)              \
    _MACRO(tickle_local_other)              \
    _MACRO(acct_run)                        \
    _MACRO(acct_no_work)                    \
    _MACRO(acct_balance)                    \
    _MACRO(acct_reorder)                    \
    _MACRO(acct_min_credit)                 \
    _MACRO(acct_vcpu_active)                \
    _MACRO(acct_vcpu_idle)                  \
    _MACRO(acct_vcpu_credit_min)

#define CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
    _MACRO(vcpu_migrate)                                \
    _MACRO(load_balance_idle)                           \
    _MACRO(load_balance_over)                           \
    _MACRO(load_balance_other)                          \
    _MACRO(steal_trylock_failed)                        \
    _MACRO(steal_peer_down)                             \
    _MACRO(steal_peer_idle)                             \
    _MACRO(steal_peer_running)                          \
    _MACRO(steal_peer_pinned)                           \
    _MACRO(tickle_idlers_none)                          \
    _MACRO(tickle_idlers_some)

#ifndef NDEBUG
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
    _MACRO(vcpu_check)
#else
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
#endif

#define CSCHED_STATS_EXPAND(_MACRO)                 \
    CSCHED_STATS_EXPAND_SCHED(_MACRO)               \
    CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
    CSCHED_STATS_EXPAND_CHECKS(_MACRO)

#define CSCHED_STATS_RESET()                                        \
    do                                                              \
    {                                                               \
        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));  \
    } while ( 0 )

#define CSCHED_STATS_DEFINE()                       \
    struct                                          \
    {                                               \
        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE)     \
    } stats

#define CSCHED_STATS_PRINTK()                       \
    do                                              \
    {                                               \
        printk("stats:\n");                         \
        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK)     \
    } while ( 0 )

#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)

#else /* CSCHED_STATS */

#define CSCHED_STATS_RESET()    do {} while ( 0 )
#define CSCHED_STATS_DEFINE()   do {} while ( 0 )
#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )

#endif /* CSCHED_STATS */


/*
 * Physical CPU
 */
struct csched_pcpu {
    struct list_head runq;
    uint32_t runq_sort_last;
};
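/*
 * Illustration (not from the original source): the CSCHED_STATS_EXPAND
 * X-macro list above is applied once with CSCHED_STAT_DEFINE to declare
 * the counters and once with CSCHED_STAT_PRINTK to dump them, so the two
 * can never drift apart. For example, CSCHED_STATS_DEFINE() expands
 * roughly to:
 *
 *     struct
 *     {
 *         uint32_t vcpu_init;
 *         uint32_t vcpu_sleep;
 *         ...
 *         uint32_t tickle_idlers_some;
 *     } stats;
 *
 * while CSCHED_STATS_PRINTK() expands to one
 * printk("\t%-30s = %u\n", "vcpu_init", csched_priv.stats.vcpu_init);
 * style statement per counter, and CSCHED_STAT_CRANK(vcpu_init) simply
 * increments csched_priv.stats.vcpu_init.
 */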
/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head runq_elem;
    struct list_head active_vcpu_elem;
    struct csched_dom *sdom;
    struct vcpu *vcpu;
    atomic_t credit;
    int credit_last;
    uint32_t credit_incr;
    uint32_t state_active;
    uint32_t state_idle;
    int16_t pri;
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head active_vcpu;
    struct list_head active_sdom_elem;
    struct domain *dom;
    uint16_t active_vcpu_count;
    uint16_t weight;
    uint16_t cap;
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    struct list_head active_sdom;
    uint32_t ncpus;
    unsigned int master;
    cpumask_t idlers;
    uint32_t weight;
    uint32_t credit;
    int credit_balance;
    uint32_t runq_sort;
    CSCHED_STATS_DEFINE();
};

/*
 * Global variables
 */
static struct csched_private csched_priv;


static inline int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static inline struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static inline void
__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
{
    const struct list_head * const runq = RUNQ(cpu);
    struct list_head *iter;

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( cpu != svc->vcpu->processor );

    list_for_each( iter, runq )
    {
        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
        if ( svc->pri > iter_svc->pri )
            break;
    }

    list_add_tail(&svc->runq_elem, iter);
}

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
    struct csched_vcpu * const cur =
        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    cpumask_t mask;

    ASSERT(cur);
    cpus_clear(mask);

    /* If strictly higher priority than current VCPU, signal the CPU */
    if ( new->pri > cur->pri )
    {
        if ( cur->pri == CSCHED_PRI_IDLE )
            CSCHED_STAT_CRANK(tickle_local_idler);
        else if ( cur->pri == CSCHED_PRI_TS_OVER )
            CSCHED_STAT_CRANK(tickle_local_over);
        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
            CSCHED_STAT_CRANK(tickle_local_under);
        else
            CSCHED_STAT_CRANK(tickle_local_other);

        cpu_set(cpu, mask);
    }

    /*
     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
     * let them know there is runnable work in the system...
     */
    if ( cur->pri > CSCHED_PRI_IDLE )
    {
        if ( cpus_empty(csched_priv.idlers) )
        {
            CSCHED_STAT_CRANK(tickle_idlers_none);
        }
        else
        {
            CSCHED_STAT_CRANK(tickle_idlers_some);
            cpus_or(mask, mask, csched_priv.idlers);
        }
    }

    /* Send scheduler interrupts to designated CPUs */
    if ( !cpus_empty(mask) )
        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
}

static int
csched_pcpu_init(int cpu)
{
    struct csched_pcpu *spc;
    unsigned long flags;

    /* Allocate per-PCPU info */
    spc = xmalloc(struct csched_pcpu);
    if ( spc == NULL )
        return -1;

    spin_lock_irqsave(&csched_priv.lock, flags);

    /* Initialize/update system-wide config */
    csched_priv.credit += CSCHED_ACCT_PERIOD;
    if ( csched_priv.ncpus <= cpu )
        csched_priv.ncpus = cpu + 1;
    if ( csched_priv.master >= csched_priv.ncpus )
        csched_priv.master = cpu;

    INIT_LIST_HEAD(&spc->runq);
    spc->runq_sort_last = csched_priv.runq_sort;
    per_cpu(schedule_data, cpu).sched_priv = spc;

    /* Start off idling... */
    BUG_ON( !is_idle_vcpu(per_cpu(schedule_data, cpu).curr) );
    cpu_set(cpu, csched_priv.idlers);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    return 0;
}

#ifndef NDEBUG
static inline void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }

    CSCHED_STAT_CRANK(vcpu_check);
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

static inline int
__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
{
    /*
     * Don't pick up work that's in the peer's scheduling tail. Also only pick
     * up work that's allowed to run on our CPU.
     */
    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
    {
        CSCHED_STAT_CRANK(steal_peer_running);
        return 0;
    }

    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
    {
        CSCHED_STAT_CRANK(steal_peer_pinned);
        return 0;
    }

    return 1;
}

static void
csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
{
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    /* Update credits */
    atomic_sub(credit_dec, &svc->credit);

    /* Put this VCPU and domain back on the active list if it was idling */
    if ( list_empty(&svc->active_vcpu_elem) )
    {
        spin_lock_irqsave(&csched_priv.lock, flags);

        if ( list_empty(&svc->active_vcpu_elem) )
        {
            CSCHED_STAT_CRANK(acct_vcpu_active);
            svc->state_active++;

            sdom->active_vcpu_count++;
            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
            if ( list_empty(&sdom->active_sdom_elem) )
            {
                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
                csched_priv.weight += sdom->weight;
            }
        }

        spin_unlock_irqrestore(&csched_priv.lock, flags);
    }
}

static inline void
__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
{
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( list_empty(&svc->active_vcpu_elem) );

    CSCHED_STAT_CRANK(acct_vcpu_idle);
    svc->state_idle++;

    sdom->active_vcpu_count--;
    list_del_init(&svc->active_vcpu_elem);
    if ( list_empty(&sdom->active_vcpu) )
    {
        BUG_ON( csched_priv.weight < sdom->weight );
        list_del_init(&sdom->active_sdom_elem);
        csched_priv.weight -= sdom->weight;
    }

    atomic_set(&svc->credit, 0);
}

static int
csched_vcpu_init(struct vcpu *vc)
{
    struct domain * const dom = vc->domain;
    struct csched_dom *sdom;
    struct csched_vcpu *svc;
    int16_t pri;

    CSCHED_STAT_CRANK(vcpu_init);

    /* Allocate, if appropriate, per-domain info */
    if ( is_idle_vcpu(vc) )
    {
        sdom = NULL;
        pri = CSCHED_PRI_IDLE;
    }
    else if ( CSCHED_DOM(dom) )
    {
        sdom = CSCHED_DOM(dom);
        pri = CSCHED_PRI_TS_UNDER;
    }
    else
    {
        sdom = xmalloc(struct csched_dom);
        if ( !sdom )
            return -1;

        /* Initialize credit and weight */
        INIT_LIST_HEAD(&sdom->active_vcpu);
        sdom->active_vcpu_count = 0;
        INIT_LIST_HEAD(&sdom->active_sdom_elem);
        sdom->dom = dom;
        sdom->weight = CSCHED_DEFAULT_WEIGHT;
        sdom->cap = 0U;
        dom->sched_priv = sdom;

        pri = CSCHED_PRI_TS_UNDER;
    }

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( !svc )
        return -1;

    INIT_LIST_HEAD(&svc->runq_elem);
    INIT_LIST_HEAD(&svc->active_vcpu_elem);
    svc->sdom = sdom;
    svc->vcpu = vc;
    atomic_set(&svc->credit, 0);
    svc->credit_last = 0;
    svc->credit_incr = 0U;
    svc->state_active = 0U;
    svc->state_idle = 0U;
    svc->pri = pri;
    vc->sched_priv = svc;

    CSCHED_VCPU_CHECK(vc);

    /* Attach fair-share VCPUs to the accounting list */
    if ( likely(sdom != NULL) )
        csched_vcpu_acct(svc, 0);

    /* Allocate per-PCPU info */
    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
    {
        if ( csched_pcpu_init(vc->processor) != 0 )
            return -1;
    }

    CSCHED_VCPU_CHECK(vc);

    return 0;
}

static void
csched_vcpu_free(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;
    unsigned long flags;

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    spin_lock_irqsave(&csched_priv.lock, flags);

    if ( !list_empty(&svc->active_vcpu_elem) )
        __csched_vcpu_acct_idle_locked(svc);

    spin_unlock_irqrestore(&csched_priv.lock, flags);

    xfree(svc);
}

static void
csched_vcpu_sleep(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    CSCHED_STAT_CRANK(vcpu_sleep);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
        __runq_remove(svc);
}

static void
csched_vcpu_wake(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    const unsigned int cpu = vc->processor;

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }

    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        CSCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /* Put the VCPU on this PCPU's runq and tickle appropriate CPUs */
    __runq_insert(cpu, svc);
    __runq_tickle(cpu, svc);
}
/*
 * netlink/msg.h		Netlink Messages Interface
 *
 *	This library is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU Lesser General Public
 *	License as published by the Free Software Foundation version 2.1
 *	of the License.
 *
 * Copyright (c) 2003-2006 Thomas Graf <tgraf@suug.ch>
 */

#ifndef NETLINK_MSG_H_
#define NETLINK_MSG_H_

#include <netlink/netlink.h>
#include <netlink/object.h>

#ifdef __cplusplus
extern "C" {
#endif

struct nla_policy;

#define NL_DONTPAD	0

/**
 * @ingroup msg
 * @brief
 * Will cause the netlink pid to be set to the pid assigned to
 * the netlink handle (socket) just before sending the message off.
 * @note Requires the use of nl_send_auto_complete()!
 */
#define NL_AUTO_PID	0

/**
 * @ingroup msg
 * @brief
 * May be used to refer to a sequence number which should be
 * automatically set just before sending the message off.
 * @note Requires the use of nl_send_auto_complete()!
 */
#define NL_AUTO_SEQ	0
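/*
 * Usage sketch (illustrative; RTM_GETLINK and the nl_handle variable
 * `handle' are assumptions for the example). NL_AUTO_PID and NL_AUTO_SEQ
 * are left in the header and resolved by nl_send_auto_complete() at
 * send time:
 *
 *     struct nl_msg *msg = nlmsg_alloc();
 *     if (msg == NULL)
 *         return -NLE_NOMEM;
 *
 *     nlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, RTM_GETLINK,
 *               0, NLM_F_REQUEST | NLM_F_DUMP);
 *     nl_send_auto_complete(handle, msg);
 *     nlmsg_free(msg);
 */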

#define NL_MSG_CRED_PRESENT 1

struct nl_msg
{
	int			nm_protocol;
	int			nm_flags;
	struct sockaddr_nl	nm_src;
	struct sockaddr_nl	nm_dst;
	struct ucred		nm_creds;
	struct nlmsghdr *	nm_nlh;
	size_t			nm_size;
	int			nm_refcnt;
};


struct nl_tree;
struct ucred;

/* message parsing */
extern int		  nlmsg_ok(const struct nlmsghdr *, int);
extern struct nlmsghdr *  nlmsg_next(struct nlmsghdr *, int *);
extern int		  nlmsg_parse(struct nlmsghdr *, int, struct nlattr **,
				      int, struct nla_policy *);
extern int		  nlmsg_validate(struct nlmsghdr *, int, int,
					 struct nla_policy *);

extern struct nl_msg *	  nlmsg_alloc(void);
extern struct nl_msg *	  nlmsg_alloc_size(size_t);
extern struct nl_msg *	  nlmsg_alloc_simple(int, int);
extern void		  nlmsg_set_default_size(size_t);
extern struct nl_msg *	  nlmsg_inherit(struct nlmsghdr *);
extern struct nl_msg *	  nlmsg_convert(struct nlmsghdr *);
extern void *		  nlmsg_reserve(struct nl_msg *, size_t, int);
extern int		  nlmsg_append(struct nl_msg *, void *, size_t, int);

extern struct nlmsghdr *  nlmsg_put(struct nl_msg *, uint32_t, uint32_t,
				    int, int, int);
extern void		  nlmsg_free(struct nl_msg *);

extern int		  nl_msg_parse(struct nl_msg *,
				       void (*cb)(struct nl_object *, void *),
				       void *);

extern void		nl_msg_dump(struct nl_msg *, FILE *);

/**
 * length of netlink message not including padding
 * @arg payload		length of message payload
 */
static inline int nlmsg_msg_size(int payload)
{
	return NLMSG_HDRLEN + payload;
}

/**
 * length of netlink message including padding
 * @arg payload		length of message payload
 */
static inline int nlmsg_total_size(int payload)
{
	return NLMSG_ALIGN(nlmsg_msg_size(payload));
}

/**
 * length of padding at the message's tail
 * @arg payload		length of message payload
 */
static inline int nlmsg_padlen(int payload)
{
	return nlmsg_total_size(payload) - nlmsg_msg_size(payload);
}
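/*
 * Worked example, assuming the usual 16 byte NLMSG_HDRLEN and 4 byte
 * NLMSG_ALIGNTO: for a 13 byte payload,
 *
 *     nlmsg_msg_size(13)   = 16 + 13         = 29
 *     nlmsg_total_size(13) = NLMSG_ALIGN(29) = 32
 *     nlmsg_padlen(13)     = 32 - 29         = 3
 */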

/**
 * head of message payload
 * @arg nlh		netlink message header
 */
static inline void *nlmsg_data(const struct nlmsghdr *nlh)
{
	return (unsigned char *) nlh + NLMSG_HDRLEN;
}

static inline void *nlmsg_tail(const struct nlmsghdr *nlh)
{
	return (unsigned char *) nlh + NLMSG_ALIGN(nlh->nlmsg_len);
}

/**
 * length of message payload
 * @arg nlh		netlink message header
 */
static inline int nlmsg_len(const struct nlmsghdr *nlh)
{
	return nlh->nlmsg_len - NLMSG_HDRLEN;
}

/**
 * head of attributes data
 * @arg nlh		netlink message header
 * @arg hdrlen		length of family specific header
 */
static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh, int hdrlen)
{
	unsigned char *data = (unsigned char*)nlmsg_data(nlh);
	return (struct nlattr *) (data + NLMSG_ALIGN(hdrlen));
}

/**
 * length of attributes data
 * @arg nlh		netlink message header
 * @arg hdrlen		length of family specific header
 */
static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen)
{
	return nlmsg_len(nlh) - NLMSG_ALIGN(hdrlen);
}

static inline int nlmsg_valid_hdr(const struct nlmsghdr *nlh, int hdrlen)
{
	if (nlh->nlmsg_len < (unsigned int) nlmsg_msg_size(hdrlen))
		return 0;

	return 1;
}


static inline void nlmsg_set_proto(struct nl_msg *msg, int protocol)
{
	msg->nm_protocol = protocol;
}

static inline int nlmsg_get_proto(struct nl_msg *msg)
{
	return msg->nm_protocol;
}

static inline size_t nlmsg_get_max_size(struct nl_msg *msg)
{
	return msg->nm_size;
}

static inline void nlmsg_set_src(struct nl_msg *msg, struct sockaddr_nl *addr)
{
	memcpy(&msg->nm_src, addr, sizeof(*addr));
}

static inline struct sockaddr_nl *nlmsg_get_src(struct nl_msg *msg)
{
	return &msg->nm_src;
}

static inline void nlmsg_set_dst(struct nl_msg *msg, struct sockaddr_nl *addr)
{
	memcpy(&msg->nm_dst, addr, sizeof(*addr));
}

static inline struct sockaddr_nl *nlmsg_get_dst(struct nl_msg *msg)
{
	return &msg->nm_dst;
}

static inline void nlmsg_set_creds(struct nl_msg *msg, struct ucred *creds)
{
	memcpy(&msg->nm_creds, creds, sizeof(*creds));
	msg->nm_flags |= NL_MSG_CRED_PRESENT;
}

static inline struct ucred *nlmsg_get_creds(struct nl_msg *msg)
{
	if (msg->nm_flags & NL_MSG_CRED_PRESENT)
		return &msg->nm_creds;
	return NULL;
}

/**
 * Return actual netlink message
 * @arg n		netlink message
 * 
 * Returns the actual netlink message cast to the type of the netlink
 * message header.
 * 
 * @return A pointer to the netlink message.
 */
static inline struct nlmsghdr *nlmsg_hdr(struct nl_msg *n)
{
	return n->nm_nlh;
}

/**
 * Acquire a reference on a netlink message
 * @arg msg		message to acquire reference from
 */
static inline void nlmsg_get(struct nl_msg *msg)
{
	msg->nm_refcnt++;
}

/**
 * Expand maximum payload size of a netlink message
 * @arg n		Netlink message.
 * @arg newlen		New maximum payload size.
 *
 * Reallocates the payload section of a netlink message and increases
 * the maximum payload size of the message.
 *
 * @note Any pointers into the old payload block will be stale and
 *       need to be refetched. Therefore, do not expand while constructing
 *       nested attributes or while reserved data blocks are held.
 *
 * @return 0 on success or a negative error code.
 */
static inline int nlmsg_expand(struct nl_msg *n, size_t newlen)
{
	void *tmp;

	if (newlen <= n->nm_size)
		return -NLE_INVAL;

	tmp = realloc(n->nm_nlh, newlen);
	if (tmp == NULL)
		return -NLE_NOMEM;

	n->nm_nlh = (struct nlmsghdr*)tmp;
	n->nm_size = newlen;

	return 0;
}
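/*
 * Usage sketch (illustrative): grow a message before appending a large
 * block. Any previously fetched header or payload pointer must be
 * refetched afterwards, since realloc() may have moved the buffer:
 *
 *     int err = nlmsg_expand(msg, nlmsg_get_max_size(msg) + 1024);
 *     if (err < 0)
 *         return err;
 *     nlh = nlmsg_hdr(msg);
 */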


/**
 * @name Iterators
 * @{
 */

/**
 * @ingroup msg
 * Iterate over a stream of attributes in a message
 * @arg pos	loop counter, set to current attribute
 * @arg nlh	netlink message header
 * @arg hdrlen	length of family header
 * @arg rem	initialized to the length of the attribute stream, holds bytes currently remaining
 */
#define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
	nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \
			  nlmsg_attrlen(nlh, hdrlen), rem)
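/*
 * Example (illustrative; assumes an rtnetlink link message with a
 * struct ifinfomsg family header, and the attribute helpers nla_type()
 * and nla_len() from <netlink/attr.h>):
 *
 *     struct nlattr *nla;
 *     int rem;
 *
 *     nlmsg_for_each_attr(nla, nlh, sizeof(struct ifinfomsg), rem)
 *         printf("type=%d len=%d\n", nla_type(nla), nla_len(nla));
 */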

/**
 * Iterate over a stream of messages
 * @arg pos	loop counter, set to current message
 * @arg head	head of message stream
 * @arg len	length of message stream
 * @arg rem	initialized to len, holds bytes currently remaining in stream
 */
#define nlmsg_for_each_msg(pos, head, len, rem) \
	for (pos = head, rem = len; \
	     nlmsg_ok(pos, rem); \
	     pos = nlmsg_next(pos, &(rem)))
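/*
 * Example (illustrative; buf and n stand for a buffer filled by a raw
 * read from a netlink socket and the number of bytes received, and
 * handle_one_message() is a hypothetical handler):
 *
 *     struct nlmsghdr *hdr;
 *     int rem;
 *
 *     nlmsg_for_each_msg(hdr, (struct nlmsghdr *) buf, n, rem) {
 *         if (hdr->nlmsg_type == NLMSG_DONE)
 *             break;
 *         handle_one_message(hdr);
 *     }
 */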

/** @} */

#ifdef __cplusplus
}
#endif

#endif