/**************************************************************************** * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd **************************************************************************** * * File: common/csched_credit2.c * Author: George Dunlap * * Description: Credit-based SMP CPU scheduler * Based on an earlier verson by Emmanuel Ackaouy. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define d2printk(x...) //#define d2printk printk /* * Credit2 tracing events ("only" 512 available!). Check * include/public/trace.h for more details. */ #define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1) #define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2) #define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3) #define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4) #define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5) #define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6) #define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7) #define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8) #define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9) #define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10) #define TRC_CSCHED2_UPDATE_VCPU_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11) #define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12) /* * WARNING: This is still in an experimental phase. Status and work can be found at the * credit2 wiki page: * http://wiki.xen.org/wiki/Credit2_Scheduler_Development * TODO: * + Immediate bug-fixes * - Do per-runqueue, grab proper lock for dump debugkey * + Multiple sockets * - Detect cpu layout and make runqueue map, one per L2 (make_runq_map()) * - Simple load balancer / runqueue assignment * - Runqueue load measurement * - Load-based load balancer * + Hyperthreading * - Look for non-busy core if possible * - "Discount" time run on a thread with busy siblings * + Algorithm: * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g., * a flash animation in the background) can we schedule it with low enough latency * so that audio doesn't skip? * - Cap and reservation: How to implement with the current system? * + Optimizing * - Profiling, making new algorithms, making math more efficient (no long division) */ /* * Design: * * VMs "burn" credits based on their weight; higher weight means * credits burn more slowly. The highest weight vcpu burns credits at * a rate of 1 credit per nanosecond. Others burn proportionally * more. * * vcpus are inserted into the runqueue by credit order. * * Credits are "reset" when the next vcpu in the runqueue is less than * or equal to zero. At that point, everyone's credits are "clipped" * to a small value, and a fixed credit is added to everyone. * * The plan is for all cores that share an L2 will share the same * runqueue. At the moment, there is one global runqueue for all * cores. */ /* * Locking: * - Schedule-lock is per-runqueue * + Protects runqueue data, runqueue insertion, &c * + Also protects updates to private sched vcpu structure * + Must be grabbed using vcpu_schedule_lock_irq() to make sure vcpu->processr * doesn't change under our feet. * - Private data lock * + Protects access to global domain list * + All other private data is written at init and only read afterwards. * Ordering: * - We grab private->schedule when updating domain weight; so we * must never grab private if a schedule lock is held. 
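 *
 * Illustrative example of the credit/weight design described above
 * (numbers assume the defaults defined below, and are only a sketch):
 * with CSCHED_DEFAULT_WEIGHT (256) as the highest weight on a
 * runqueue, a weight-256 vcpu burns roughly 1 credit per nanosecond of
 * runtime, a weight-128 vcpu burns ~2 credits per nanosecond, and a
 * weight-64 vcpu burns ~4.  Since everyone is refilled to
 * CSCHED_CREDIT_INIT (10ms worth of credit) at a reset, the heaviest
 * vcpu can run for at most about 10ms between resets, a weight-128
 * vcpu for about 5ms, and so on.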
*/ /* * Basic constants */ /* Default weight: How much a new domain starts with */ #define CSCHED_DEFAULT_WEIGHT 256 /* Min timer: Minimum length a timer will be set, to * achieve efficiency */ #define CSCHED_MIN_TIMER MICROSECS(500) /* Amount of credit VMs begin with, and are reset to. * ATM, set so that highest-weight VMs can only run for 10ms * before a reset event. */ #define CSCHED_CREDIT_INIT MILLISECS(10) /* Carryover: How much "extra" credit may be carried over after * a reset. */ #define CSCHED_CARRYOVER_MAX CSCHED_MIN_TIMER /* Stickiness: Cross-L2 migration resistance. Should be less than * MIN_TIMER. */ #define CSCHED_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1)) /* How much to "compensate" a vcpu for L2 migration */ #define CSCHED_MIGRATE_COMPENSATION MICROSECS(50) /* Reset: Value below which credit will be reset. */ #define CSCHED_CREDIT_RESET 0 /* Max timer: Maximum time a guest can be run for. */ #define CSCHED_MAX_TIMER MILLISECS(2) #define CSCHED_IDLE_CREDIT (-(1<<30)) /* * Flags */ /* CSFLAG_scheduled: Is this vcpu either running on, or context-switching off, * a physical cpu? * + Accessed only with runqueue lock held * + Set when chosen as next in csched_schedule(). * + Cleared after context switch has been saved in csched_context_saved() * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should * set CSFLAG_delayed_runq_add * + Checked to be false in runq_insert. */ #define __CSFLAG_scheduled 1 #define CSFLAG_scheduled (1<<__CSFLAG_scheduled) /* CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done * being context switched out? * + Set when scheduling out in csched_schedule() if prev is runnable * + Set in csched_vcpu_wake if it finds CSFLAG_scheduled set * + Read in csched_context_saved(). If set, it adds prev to the runqueue and * clears the bit. */ #define __CSFLAG_delayed_runq_add 2 #define CSFLAG_delayed_runq_add (1<<__CSFLAG_delayed_runq_add) /* CSFLAG_runq_migrate_request: This vcpu is being migrated as a result of a * credit2-initiated runq migrate request; migrate it to the runqueue indicated * in the svc struct. */ #define __CSFLAG_runq_migrate_request 3 #define CSFLAG_runq_migrate_request (1<<__CSFLAG_runq_migrate_request) int opt_migrate_resist=500; integer_param("sched_credit2_migrate_resist", opt_migrate_resist); /* * Useful macros */ #define CSCHED_PRIV(_ops) \ ((struct csched_private *)((_ops)->sched_data)) #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) /* CPU to runq_id macro */ #define c2r(_ops, _cpu) (CSCHED_PRIV(_ops)->runq_map[(_cpu)]) /* CPU to runqueue struct macro */ #define RQD(_ops, _cpu) (&CSCHED_PRIV(_ops)->rqd[c2r(_ops, _cpu)]) /* * Shifts for load average. * - granularity: Reduce granularity of time by a factor of 1000, so we can use 32-bit maths * - window shift: Given granularity shift, make the window about 1 second * - scale shift: Shift up load by this amount rather than using fractions; 128 corresponds * to a load of 1. 
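 *
 * As a worked example of the scaling actually used by the code below
 * (a sketch, assuming the default credit2_load_window_shift of 18): a
 * load of 1.0 is stored as (1ULL << load_window_shift) == 262144, and
 * new vcpus start at 50% load, i.e. 1ULL << 17 == 131072.  Converting
 * back to a percentage for display is then just:
 *
 *   pct = avgload * 100 / (1ULL << load_window_shift);
 *
 * which is exactly what csched_dump() does for each runqueue.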
*/ #define LOADAVG_GRANULARITY_SHIFT (10) int opt_load_window_shift=18; #define LOADAVG_WINDOW_SHIFT_MIN 4 integer_param("credit2_load_window_shift", opt_load_window_shift); int opt_underload_balance_tolerance=0; integer_param("credit2_balance_under", opt_underload_balance_tolerance); int opt_overload_balance_tolerance=-3; integer_param("credit2_balance_over", opt_overload_balance_tolerance); /* * Per-runqueue data */ struct csched_runqueue_data { int id; spinlock_t lock; /* Lock for this runqueue. */ cpumask_t active; /* CPUs enabled for this runqueue */ struct list_head runq; /* Ordered list of runnable vms */ struct list_head svc; /* List of all vcpus assigned to this runqueue */ unsigned int max_weight; cpumask_t idle, /* Currently idle */ tickled; /* Another cpu in the queue is already targeted for this one */ int load; /* Instantaneous load: Length of queue + num non-idle threads */ s_time_t load_last_update; /* Last time average was updated */ s_time_t avgload; /* Decaying queue load */ s_time_t b_avgload; /* Decaying queue load modified by balancing */ }; /* * System-wide private data */ struct csched_private { spinlock_t lock; cpumask_t initialized; /* CPU is initialized for this pool */ struct list_head sdom; /* Used mostly for dump keyhandler. */ int runq_map[NR_CPUS]; cpumask_t active_queues; /* Queues which may have active cpus */ struct csched_runqueue_data rqd[NR_CPUS]; int load_window_shift; }; /* * Virtual CPU */ struct csched_vcpu { struct list_head rqd_elem; /* On the runqueue data list */ struct list_head sdom_elem; /* On the domain vcpu list */ struct list_head runq_elem; /* On the runqueue */ struct csched_runqueue_data *rqd; /* Up-pointer to the runqueue */ /* Up-pointers */ struct csched_dom *sdom; struct vcpu *vcpu; unsigned int weight; unsigned int residual; int credit; s_time_t start_time; /* When we were scheduled (used for credit) */ unsigned flags; /* 16 bits doesn't seem to play well with clear_bit() */ /* Individual contribution to load */ s_time_t load_last_update; /* Last time average was updated */ s_time_t avgload; /* Decaying queue load */ struct csched_runqueue_data *migrate_rqd; /* Pre-determined rqd to which to migrate */ }; /* * Domain */ struct csched_dom { struct list_head vcpu; struct list_head sdom_elem; struct domain *dom; uint16_t weight; uint16_t nr_vcpus; }; /* * Time-to-credit, credit-to-time. * * We keep track of the "residual" time to make sure that frequent short * schedules still get accounted for in the end. * * FIXME: Do pre-calculated division? 
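 *
 * A rough standalone sketch of the conversion implemented below
 * (simplified: plain division/modulo instead of do_div(), and no
 * overflow handling):
 *
 *   uint64_t val = time * max_weight + residual;
 *   credit  -= val / weight;   // credits burned by 'time' ns of runtime
 *   residual = val % weight;   // remainder carried to the next update,
 *                              // so many short runs still add up
 *
 * c2t() is the inverse direction, turning a credit delta back into a
 * timeslice length: time = credit * weight / max_weight.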
 */
static void t2c_update(struct csched_runqueue_data *rqd, s_time_t time, struct csched_vcpu *svc)
{
    uint64_t val = time * rqd->max_weight + svc->residual;

    svc->residual = do_div(val, svc->weight);
    svc->credit -= val;
}

static s_time_t c2t(struct csched_runqueue_data *rqd, s_time_t credit, struct csched_vcpu *svc)
{
    return credit * svc->weight / rqd->max_weight;
}

/*
 * Runqueue related code
 */

static /*inline*/ int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static /*inline*/ struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static void
__update_runq_load(const struct scheduler *ops,
                   struct csched_runqueue_data *rqd, int change, s_time_t now)
{
    struct csched_private *prv = CSCHED_PRIV(ops);
    s_time_t delta=-1;

    now >>= LOADAVG_GRANULARITY_SHIFT;

    if ( rqd->load_last_update + (1ULL<<prv->load_window_shift) < now )
    {
        rqd->avgload = (unsigned long long)rqd->load << prv->load_window_shift;
        rqd->b_avgload = (unsigned long long)rqd->load << prv->load_window_shift;
    }
    else
    {
        delta = now - rqd->load_last_update;

        rqd->avgload =
            ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) )
              + ( ((1ULL<<prv->load_window_shift) - delta) * rqd->avgload ) ) >> prv->load_window_shift;

        rqd->b_avgload =
            ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) )
              + ( ((1ULL<<prv->load_window_shift) - delta) * rqd->b_avgload ) ) >> prv->load_window_shift;
    }
    rqd->load += change;
    rqd->load_last_update = now;

    {
        struct {
            unsigned rq_load:4, rq_avgload:28;
            unsigned rq_id:4, b_avgload:28;
        } d;
        d.rq_id=rqd->id;
        d.rq_load = rqd->load;
        d.rq_avgload = rqd->avgload;
        d.b_avgload = rqd->b_avgload;
        trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }
}

static void
__update_svc_load(const struct scheduler *ops,
                  struct csched_vcpu *svc, int change, s_time_t now)
{
    struct csched_private *prv = CSCHED_PRIV(ops);
    s_time_t delta=-1;
    int vcpu_load;

    if ( change == -1 )
        vcpu_load = 1;
    else if ( change == 1 )
        vcpu_load = 0;
    else
        vcpu_load = vcpu_runnable(svc->vcpu);

    now >>= LOADAVG_GRANULARITY_SHIFT;

    if ( svc->load_last_update + (1ULL<<prv->load_window_shift) < now )
    {
        svc->avgload = (unsigned long long)vcpu_load << prv->load_window_shift;
    }
    else
    {
        delta = now - svc->load_last_update;

        svc->avgload =
            ( ( delta * ( (unsigned long long)vcpu_load << prv->load_window_shift ) )
              + ( ((1ULL<<prv->load_window_shift) - delta) * svc->avgload ) ) >> prv->load_window_shift;
    }
    svc->load_last_update = now;

    {
        struct {
            unsigned dom:16,vcpu:16;
            unsigned v_avgload:32;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.v_avgload = svc->avgload;
        trace_var(TRC_CSCHED2_UPDATE_VCPU_LOAD, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }
}

static void
update_load(const struct scheduler *ops,
            struct csched_runqueue_data *rqd,
            struct csched_vcpu *svc, int change, s_time_t now)
{
    __update_runq_load(ops, rqd, change, now);
    if ( svc )
        __update_svc_load(ops, svc, change, now);
}

static int
__runq_insert(struct list_head *runq, struct csched_vcpu *svc)
{
    struct list_head *iter;
    int pos = 0;

    d2printk("rqi d%dv%d\n",
             svc->vcpu->domain->domain_id,
             svc->vcpu->vcpu_id);

    BUG_ON(&svc->rqd->runq != runq);
    /* Idle vcpus not allowed on the runqueue anymore */
    BUG_ON(is_idle_vcpu(svc->vcpu));
    BUG_ON(svc->vcpu->is_running);
    BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags));

    list_for_each( iter, runq )
    {
        struct csched_vcpu * iter_svc = __runq_elem(iter);

        if ( svc->credit > iter_svc->credit )
        {
            d2printk(" p%d d%dv%d\n", pos,
iter_svc->vcpu->domain->domain_id, iter_svc->vcpu->vcpu_id); break; } pos++; } list_add_tail(&svc->runq_elem, iter); return pos; } static void runq_insert(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *svc) { struct list_head * runq = &RQD(ops, cpu)->runq; int pos = 0; ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) ); BUG_ON( __vcpu_on_runq(svc) ); BUG_ON( c2r(ops, cpu) != c2r(ops, svc->vcpu->processor) ); pos = __runq_insert(runq, svc); { struct { unsigned dom:16,vcpu:16; unsigned pos; } d; d.dom = svc->vcpu->domain->domain_id; d.vcpu = svc->vcpu->vcpu_id; d.pos = pos; trace_var(TRC_CSCHED2_RUNQ_POS, 0, sizeof(d), (unsigned char *)&d); } return; } static inline void __runq_remove(struct csched_vcpu *svc) { BUG_ON( !__vcpu_on_runq(svc) ); list_del_init(&svc->runq_elem); } void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *, s_time_t); /* Check to see if the item on the runqueue is higher priority than what's * currently running; if so, wake up the processor */ static /*inline*/ void runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *new, s_time_t now) { int i, ipid=-1; s_time_t lowest=(1<<30); struct csched_runqueue_data *rqd = RQD(ops, cpu); cpumask_t mask; struct csched_vcpu * cur; d2printk("rqt d%dv%d cd%dv%d\n", new->vcpu->domain->domain_id, new->vcpu->vcpu_id, current->domain->domain_id, current->vcpu_id); BUG_ON(new->vcpu->processor != cpu); BUG_ON(new->rqd != rqd); /* Look at the cpu it's running on first */ cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); burn_credits(rqd, cur, now); if ( cur->credit < new->credit ) { ipid = cpu; goto tickle; } /* Get a mask of idle, but not tickled */ cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); /* If it's not empty, choose one */ i = cpumask_cycle(cpu, &mask); if ( i < nr_cpu_ids ) { ipid = i; goto tickle; } /* Otherwise, look for the non-idle cpu with the lowest credit, * skipping cpus which have been tickled but not scheduled yet */ cpumask_andnot(&mask, &rqd->active, &rqd->idle); cpumask_andnot(&mask, &mask, &rqd->tickled); for_each_cpu(i, &mask) { struct csched_vcpu * cur; /* Already looked at this one above */ if ( i == cpu ) continue; cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr); BUG_ON(is_idle_vcpu(cur->vcpu)); /* Update credits for current to see if we want to preempt */ burn_credits(rqd, cur, now); if ( cur->credit < lowest ) { ipid = i; lowest = cur->credit; } /* TRACE */ { struct { unsigned dom:16,vcpu:16; unsigned credit; } d; d.dom = cur->vcpu->domain->domain_id; d.vcpu = cur->vcpu->vcpu_id; d.credit = cur->credit; trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, sizeof(d), (unsigned char *)&d); } } /* Only switch to another processor if the credit difference is greater * than the migrate resistance */ if ( ipid == -1 || lowest + CSCHED_MIGRATE_RESIST > new->credit ) goto no_tickle; tickle: BUG_ON(ipid == -1); /* TRACE */ { struct { unsigned cpu:8; } d; d.cpu = ipid; trace_var(TRC_CSCHED2_TICKLE, 0, sizeof(d), (unsigned char *)&d); } cpumask_set_cpu(ipid, &rqd->tickled); cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ); no_tickle: return; } /* * Credit-related code */ static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now, struct csched_vcpu *snext) { struct csched_runqueue_data *rqd = RQD(ops, cpu); struct list_head *iter; int m; /* * Under normal circumstances, snext->credit should never be less * than -CSCHED_MIN_TIMER. 
 * However, under some circumstances, a vcpu with low credits may be
 * allowed to run long enough that its credits are actually less than
 * -CSCHED_CREDIT_INIT.  (Instances have been observed, for example,
 * where a vcpu with 200us of credit was allowed to run for 11ms,
 * giving it -10.8ms of credit.  Thus it was still negative even
 * after the reset.)
 *
 * If this is the case for snext, we simply want to keep moving
 * everyone up until it is in the black again.  This is fair because
 * none of the other vcpus want to run at the moment.
 *
 * Rather than looping, however, we just calculate a multiplier,
 * avoiding an integer division and multiplication in the common
 * case.
 */
    m = 1;
    if ( snext->credit < -CSCHED_CREDIT_INIT )
        m += (-snext->credit) / CSCHED_CREDIT_INIT;

    list_for_each( iter, &rqd->svc )
    {
        struct csched_vcpu * svc;
        int start_credit;

        svc = list_entry(iter, struct csched_vcpu, rqd_elem);

        BUG_ON( is_idle_vcpu(svc->vcpu) );
        BUG_ON( svc->rqd != rqd );

        start_credit = svc->credit;

        /* And add INIT * m, avoiding integer multiplication in the
         * common case. */
        if ( likely(m==1) )
            svc->credit += CSCHED_CREDIT_INIT;
        else
            svc->credit += m * CSCHED_CREDIT_INIT;

        /* "Clip" credits to max carryover */
        if ( svc->credit > CSCHED_CREDIT_INIT + CSCHED_CARRYOVER_MAX )
            svc->credit = CSCHED_CREDIT_INIT + CSCHED_CARRYOVER_MAX;

        svc->start_time = now;

        /* TRACE */
        {
            struct {
                unsigned dom:16,vcpu:16;
                unsigned credit_start, credit_end;
                unsigned multiplier;
            } d;
            d.dom = svc->vcpu->domain->domain_id;
            d.vcpu = svc->vcpu->vcpu_id;
            d.credit_start = start_credit;
            d.credit_end = svc->credit;
            d.multiplier = m;
            trace_var(TRC_CSCHED2_CREDIT_RESET, 1,
                      sizeof(d),
                      (unsigned char *)&d);
        }
    }

    /* No need to resort runqueue, as everyone's order should be the same. */
}

void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *svc, s_time_t now)
{
    s_time_t delta;

    /* Assert svc is current */
    ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));

    if ( is_idle_vcpu(svc->vcpu) )
    {
        BUG_ON(svc->credit != CSCHED_IDLE_CREDIT);
        return;
    }

    delta = now - svc->start_time;

    if ( delta > 0 )
    {
        t2c_update(rqd, delta, svc);
        svc->start_time = now;

        d2printk("b d%dv%d c%d\n",
                 svc->vcpu->domain->domain_id,
                 svc->vcpu->vcpu_id,
                 svc->credit);
    }
    else
    {
        d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n",
                 __func__, now, svc->start_time);
    }

    /* TRACE */
    {
        struct {
            unsigned dom:16,vcpu:16;
            unsigned credit;
            int delta;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.credit = svc->credit;
        d.delta = delta;
        trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }
}

/*
 * Find the domain with the highest weight.
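 *
 * Purely illustrative example of how the max weight is maintained: a
 * runqueue holding vcpus of weights {256, 128} has max_weight 256.  If
 * a weight-512 vcpu is assigned, max_weight is bumped to 512 with no
 * search.  If that vcpu's weight is later lowered to 64, old_weight ==
 * max_weight, so the list is rescanned and max_weight falls back to
 * 256.  If instead only the weight-128 vcpu changes, neither condition
 * triggers and max_weight is left alone.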
*/ void update_max_weight(struct csched_runqueue_data *rqd, int new_weight, int old_weight) { /* Try to avoid brute-force search: * - If new_weight is larger, max_weigth <- new_weight * - If old_weight != max_weight, someone else is still max_weight * (No action required) * - If old_weight == max_weight, brute-force search for max weight */ if ( new_weight > rqd->max_weight ) { rqd->max_weight = new_weight; d2printk("%s: Runqueue id %d max weight %d\n", __func__, rqd->id, rqd->max_weight); } else if ( old_weight == rqd->max_weight ) { struct list_head *iter; int max_weight = 1; list_for_each( iter, &rqd->svc ) { struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem); if ( svc->weight > max_weight ) max_weight = svc->weight; } rqd->max_weight = max_weight; d2printk("%s: Runqueue %d max weight %d\n", __func__, rqd->id, rqd->max_weight); } } #ifndef NDEBUG static /*inline*/ void __csched_vcpu_check(struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_dom * const sdom = svc->sdom; BUG_ON( svc->vcpu != vc ); BUG_ON( sdom != CSCHED_DOM(vc->domain) ); if ( sdom ) { BUG_ON( is_idle_vcpu(vc) ); BUG_ON( sdom->dom != vc->domain ); } else { BUG_ON( !is_idle_vcpu(vc) ); } } #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc)) #else #define CSCHED_VCPU_CHECK(_vc) #endif static void * csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) { struct csched_vcpu *svc; /* Allocate per-VCPU info */ svc = xzalloc(struct csched_vcpu); if ( svc == NULL ) return NULL; INIT_LIST_HEAD(&svc->rqd_elem); INIT_LIST_HEAD(&svc->sdom_elem); INIT_LIST_HEAD(&svc->runq_elem); svc->sdom = dd; svc->vcpu = vc; svc->flags = 0U; if ( ! is_idle_vcpu(vc) ) { BUG_ON( svc->sdom == NULL ); svc->credit = CSCHED_CREDIT_INIT; svc->weight = svc->sdom->weight; /* Starting load of 50% */ svc->avgload = 1ULL << (CSCHED_PRIV(ops)->load_window_shift - 1); svc->load_last_update = NOW(); } else { BUG_ON( svc->sdom != NULL ); svc->credit = CSCHED_IDLE_CREDIT; svc->weight = 0; } SCHED_STAT_CRANK(vcpu_init); return svc; } /* Add and remove from runqueue assignment (not active run queue) */ static void __runq_assign(struct csched_vcpu *svc, struct csched_runqueue_data *rqd) { svc->rqd = rqd; list_add_tail(&svc->rqd_elem, &svc->rqd->svc); update_max_weight(svc->rqd, svc->weight, 0); /* Expected new load based on adding this vcpu */ rqd->b_avgload += svc->avgload; /* TRACE */ { struct { unsigned dom:16,vcpu:16; unsigned rqi:16; } d; d.dom = svc->vcpu->domain->domain_id; d.vcpu = svc->vcpu->vcpu_id; d.rqi=rqd->id; trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1, sizeof(d), (unsigned char *)&d); } } static void runq_assign(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu *svc = vc->sched_priv; BUG_ON(svc->rqd != NULL); __runq_assign(svc, RQD(ops, vc->processor)); } static void __runq_deassign(struct csched_vcpu *svc) { BUG_ON(__vcpu_on_runq(svc)); BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags)); list_del_init(&svc->rqd_elem); update_max_weight(svc->rqd, 0, svc->weight); /* Expected new load based on removing this vcpu */ svc->rqd->b_avgload -= svc->avgload; svc->rqd = NULL; } static void runq_deassign(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu *svc = vc->sched_priv; BUG_ON(svc->rqd != RQD(ops, vc->processor)); __runq_deassign(svc); } static void csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu *svc = vc->sched_priv; struct domain * const dom = vc->domain; struct csched_dom * const sdom = svc->sdom; printk("%s: 
Inserting d%dv%d\n", __func__, dom->domain_id, vc->vcpu_id); /* NB: On boot, idle vcpus are inserted before alloc_pdata() has * been called for that cpu. */ if ( ! is_idle_vcpu(vc) ) { spinlock_t *lock; /* FIXME: Do we need the private lock here? */ list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); /* Add vcpu to runqueue of initial processor */ lock = vcpu_schedule_lock_irq(vc); runq_assign(ops, vc); vcpu_schedule_unlock_irq(lock, vc); sdom->nr_vcpus++; } CSCHED_VCPU_CHECK(vc); } static void csched_free_vdata(const struct scheduler *ops, void *priv) { struct csched_vcpu *svc = priv; xfree(svc); } static void csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_dom * const sdom = svc->sdom; BUG_ON( sdom == NULL ); BUG_ON( !list_empty(&svc->runq_elem) ); if ( ! is_idle_vcpu(vc) ) { spinlock_t *lock; SCHED_STAT_CRANK(vcpu_destroy); /* Remove from runqueue */ lock = vcpu_schedule_lock_irq(vc); runq_deassign(ops, vc); vcpu_schedule_unlock_irq(lock, vc); /* Remove from sdom list. Don't need a lock for this, as it's called * syncronously when nothing else can happen. */ list_del_init(&svc->sdom_elem); svc->sdom->nr_vcpus--; } } static void csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); BUG_ON( is_idle_vcpu(vc) ); if ( per_cpu(schedule_data, vc->processor).curr == vc ) cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ); else if ( __vcpu_on_runq(svc) ) { BUG_ON(svc->rqd != RQD(ops, vc->processor)); update_load(ops, svc->rqd, svc, -1, NOW()); __runq_remove(svc); } else if ( test_bit(__CSFLAG_delayed_runq_add, &svc->flags) ) clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); } static void csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); s_time_t now = 0; /* Schedule lock should be held at this point. */ d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id); BUG_ON( is_idle_vcpu(vc) ); /* Make sure svc priority mod happens before runq check */ if ( unlikely(per_cpu(schedule_data, vc->processor).curr == vc) ) { goto out; } if ( unlikely(__vcpu_on_runq(svc)) ) { /* If we've boosted someone that's already on a runqueue, prioritize * it and inform the cpu in question. */ goto out; } /* If the context hasn't been saved for this vcpu yet, we can't put it on * another runqueue. Instead, we set a flag so that it will be put on the runqueue * after the context has been saved. */ if ( unlikely (test_bit(__CSFLAG_scheduled, &svc->flags) ) ) { set_bit(__CSFLAG_delayed_runq_add, &svc->flags); goto out; } /* Add into the new runqueue if necessary */ if ( svc->rqd == NULL ) runq_assign(ops, vc); else BUG_ON(RQD(ops, vc->processor) != svc->rqd ); now = NOW(); update_load(ops, svc->rqd, svc, 1, now); /* Put the VCPU on the runq */ runq_insert(ops, vc->processor, svc); runq_tickle(ops, vc->processor, svc, now); out: d2printk("w-\n"); return; } static void csched_context_saved(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); s_time_t now = NOW(); spinlock_t *lock = vcpu_schedule_lock_irq(vc); BUG_ON( !is_idle_vcpu(vc) && svc->rqd != RQD(ops, vc->processor)); /* This vcpu is now eligible to be put on the runqueue again */ clear_bit(__CSFLAG_scheduled, &svc->flags); /* If someone wants it on the runqueue, put it there. */ /* * NB: We can get rid of CSFLAG_scheduled by checking for * vc->is_running and __vcpu_on_runq(svc) here. 
However, * since we're accessing the flags cacheline anyway, * it seems a bit pointless; especially as we have plenty of * bits free. */ if ( test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags) && likely(vcpu_runnable(vc)) ) { BUG_ON(__vcpu_on_runq(svc)); runq_insert(ops, vc->processor, svc); runq_tickle(ops, vc->processor, svc, now); } else if ( !is_idle_vcpu(vc) ) update_load(ops, svc->rqd, svc, -1, now); vcpu_schedule_unlock_irq(lock, vc); } #define MAX_LOAD (1ULL<<60); static int choose_cpu(const struct scheduler *ops, struct vcpu *vc) { struct csched_private *prv = CSCHED_PRIV(ops); int i, min_rqi = -1, new_cpu; struct csched_vcpu *svc = CSCHED_VCPU(vc); s_time_t min_avgload; BUG_ON(cpumask_empty(&prv->active_queues)); /* Locking: * - vc->processor is already locked * - Need to grab prv lock to make sure active runqueues don't * change * - Need to grab locks for other runqueues while checking * avgload * Locking constraint is: * - Lock prv before runqueue locks * - Trylock between runqueue locks (no ordering) * * Since one of the runqueue locks is already held, we can't * just grab the prv lock. Instead, we'll have to trylock, and * do something else reasonable if we fail. */ if ( !spin_trylock(&prv->lock) ) { if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) { d2printk("d%dv%d -\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id); clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); } /* Leave it where it is for now. When we actually pay attention * to affinity we'll have to figure something out... */ return vc->processor; } /* First check to see if we're here because someone else suggested a place * for us to move. */ if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) { if ( unlikely(svc->migrate_rqd->id < 0) ) { printk("%s: Runqueue migrate aborted because target runqueue disappeared!\n", __func__); /* Fall-through to normal cpu pick */ } else { d2printk("d%dv%d +\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id); new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active); goto out_up; } } /* FIXME: Pay attention to cpu affinity */ min_avgload = MAX_LOAD; /* Find the runqueue with the lowest instantaneous load */ for_each_cpu(i, &prv->active_queues) { struct csched_runqueue_data *rqd; s_time_t rqd_avgload; rqd = prv->rqd + i; /* If checking a different runqueue, grab the lock, * read the avg, and then release the lock. 
* * If on our own runqueue, don't grab or release the lock; * but subtract our own load from the runqueue load to simulate * impartiality */ if ( rqd == svc->rqd ) { rqd_avgload = rqd->b_avgload - svc->avgload; } else if ( spin_trylock(&rqd->lock) ) { rqd_avgload = rqd->b_avgload; spin_unlock(&rqd->lock); } else continue; if ( rqd_avgload < min_avgload ) { min_avgload = rqd_avgload; min_rqi=i; } } /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */ if ( min_rqi == -1 ) new_cpu = vc->processor; else { new_cpu = cpumask_cycle(vc->processor, &prv->rqd[min_rqi].active); BUG_ON(new_cpu >= nr_cpu_ids); } out_up: spin_unlock(&prv->lock); return new_cpu; } /* Working state of the load-balancing algorithm */ typedef struct { /* NB: Modified by consider() */ s_time_t load_delta; struct csched_vcpu * best_push_svc, *best_pull_svc; /* NB: Read by consider() */ struct csched_runqueue_data *lrqd; struct csched_runqueue_data *orqd; } balance_state_t; static void consider(balance_state_t *st, struct csched_vcpu *push_svc, struct csched_vcpu *pull_svc) { s_time_t l_load, o_load, delta; l_load = st->lrqd->b_avgload; o_load = st->orqd->b_avgload; if ( push_svc ) { /* What happens to the load on both if we push? */ l_load -= push_svc->avgload; o_load += push_svc->avgload; } if ( pull_svc ) { /* What happens to the load on both if we pull? */ l_load += pull_svc->avgload; o_load -= pull_svc->avgload; } delta = l_load - o_load; if ( delta < 0 ) delta = -delta; if ( delta < st->load_delta ) { st->load_delta = delta; st->best_push_svc=push_svc; st->best_pull_svc=pull_svc; } } void migrate(const struct scheduler *ops, struct csched_vcpu *svc, struct csched_runqueue_data *trqd, s_time_t now) { if ( test_bit(__CSFLAG_scheduled, &svc->flags) ) { d2printk("d%dv%d %d-%d a\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, svc->rqd->id, trqd->id); /* It's running; mark it to migrate. */ svc->migrate_rqd = trqd; set_bit(_VPF_migrating, &svc->vcpu->pause_flags); set_bit(__CSFLAG_runq_migrate_request, &svc->flags); } else { int on_runq=0; /* It's not running; just move it */ d2printk("d%dv%d %d-%d i\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, svc->rqd->id, trqd->id); if ( __vcpu_on_runq(svc) ) { __runq_remove(svc); update_load(ops, svc->rqd, svc, -1, now); on_runq=1; } __runq_deassign(svc); svc->vcpu->processor = cpumask_any(&trqd->active); __runq_assign(svc, trqd); if ( on_runq ) { update_load(ops, svc->rqd, svc, 1, now); runq_insert(ops, svc->vcpu->processor, svc); runq_tickle(ops, svc->vcpu->processor, svc, now); } } } static void balance_load(const struct scheduler *ops, int cpu, s_time_t now) { struct csched_private *prv = CSCHED_PRIV(ops); int i, max_delta_rqi = -1; struct list_head *push_iter, *pull_iter; balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL }; /* * Basic algorithm: Push, pull, or swap. * - Find the runqueue with the furthest load distance * - Find a pair that makes the difference the least (where one * on either side may be empty). 
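 *
 * A minimal sketch of the "consider a push/pull pair" step used below
 * (illustrative only; the real consider() also remembers the best
 * candidates found so far):
 *
 *   // Hypothetically move push_load from the local runqueue to the
 *   // other, and pull_load the other way, then see how far apart the
 *   // two runqueues would end up.
 *   s_time_t l = lrqd_avgload - push_load + pull_load;
 *   s_time_t o = orqd_avgload + push_load - pull_load;
 *   s_time_t delta = ( l > o ) ? l - o : o - l;
 *   // Keep whichever combination of candidates minimizes delta.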
*/ /* Locking: * - pcpu schedule lock should be already locked */ st.lrqd = RQD(ops, cpu); __update_runq_load(ops, st.lrqd, 0, now); retry: if ( !spin_trylock(&prv->lock) ) return; st.load_delta = 0; for_each_cpu(i, &prv->active_queues) { s_time_t delta; st.orqd = prv->rqd + i; if ( st.orqd == st.lrqd || !spin_trylock(&st.orqd->lock) ) continue; __update_runq_load(ops, st.orqd, 0, now); delta = st.lrqd->b_avgload - st.orqd->b_avgload; if ( delta < 0 ) delta = -delta; if ( delta > st.load_delta ) { st.load_delta = delta; max_delta_rqi = i; } spin_unlock(&st.orqd->lock); } /* Minimize holding the big lock */ spin_unlock(&prv->lock); if ( max_delta_rqi == -1 ) goto out; { s_time_t load_max; int cpus_max; load_max = st.lrqd->b_avgload; if ( st.orqd->b_avgload > load_max ) load_max = st.orqd->b_avgload; cpus_max = cpumask_weight(&st.lrqd->active); i = cpumask_weight(&st.orqd->active); if ( i > cpus_max ) cpus_max = i; /* If we're under 100% capacaty, only shift if load difference * is > 1. otherwise, shift if under 12.5% */ if ( load_max < (1ULL<<(prv->load_window_shift))*cpus_max ) { if ( st.load_delta < (1ULL<<(prv->load_window_shift+opt_underload_balance_tolerance) ) ) goto out; } else if ( st.load_delta < (1ULL<<(prv->load_window_shift+opt_overload_balance_tolerance)) ) goto out; } /* Try to grab the other runqueue lock; if it's been taken in the * meantime, try the process over again. This can't deadlock * because if it doesn't get any other rqd locks, it will simply * give up and return. */ st.orqd = prv->rqd + max_delta_rqi; if ( !spin_trylock(&st.orqd->lock) ) goto retry; /* Make sure the runqueue hasn't been deactivated since we released prv->lock */ if ( unlikely(st.orqd->id < 0) ) goto out_up; /* Look for "swap" which gives the best load average * FIXME: O(n^2)! */ /* Reuse load delta (as we're trying to minimize it) */ list_for_each( push_iter, &st.lrqd->svc ) { int inner_load_updated = 0; struct csched_vcpu * push_svc = list_entry(push_iter, struct csched_vcpu, rqd_elem); __update_svc_load(ops, push_svc, 0, now); /* Skip this one if it's already been flagged to migrate */ if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) ) continue; list_for_each( pull_iter, &st.orqd->svc ) { struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem); if ( ! 
inner_load_updated ) { __update_svc_load(ops, pull_svc, 0, now); } /* Skip this one if it's already been flagged to migrate */ if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) ) continue; consider(&st, push_svc, pull_svc); } inner_load_updated = 1; /* Consider push only */ consider(&st, push_svc, NULL); } list_for_each( pull_iter, &st.orqd->svc ) { struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem); /* Skip this one if it's already been flagged to migrate */ if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) ) continue; /* Consider pull only */ consider(&st, NULL, pull_svc); } /* OK, now we have some candidates; do the moving */ if ( st.best_push_svc ) migrate(ops, st.best_push_svc, st.orqd, now); if ( st.best_pull_svc ) migrate(ops, st.best_pull_svc, st.lrqd, now); out_up: spin_unlock(&st.orqd->lock); out: return; } static int csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc) { int new_cpu; new_cpu = choose_cpu(ops, vc); return new_cpu; } static void csched_vcpu_migrate( const struct scheduler *ops, struct vcpu *vc, unsigned int new_cpu) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_runqueue_data *trqd; /* Check if new_cpu is valid */ BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED_PRIV(ops)->initialized)); trqd = RQD(ops, new_cpu); if ( trqd != svc->rqd ) migrate(ops, svc, trqd, NOW()); } static int csched_dom_cntl( const struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op) { struct csched_dom * const sdom = CSCHED_DOM(d); struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; /* Must hold csched_priv lock to read and update sdom, * runq lock to update csvcs. */ spin_lock_irqsave(&prv->lock, flags); if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) { op->u.credit2.weight = sdom->weight; } else { ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo); if ( op->u.credit2.weight != 0 ) { struct list_head *iter; int old_weight; old_weight = sdom->weight; sdom->weight = op->u.credit2.weight; /* Update weights for vcpus, and max_weight for runqueues on which they reside */ list_for_each ( iter, &sdom->vcpu ) { struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem); /* NB: Locking order is important here. Because we grab this lock here, we * must never lock csched_priv.lock if we're holding a runqueue lock. * Also, calling vcpu_schedule_lock() is enough, since IRQs have already * been disabled. 
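 *
 * (For reference: this putinfo path is what a toolstack weight change
 * ends up calling; with xl that is typically something along the lines
 * of "xl sched-credit2 -d <domain> -w 512", though the exact toolstack
 * syntax is outside the scope of this file.)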
*/ spinlock_t *lock = vcpu_schedule_lock(svc->vcpu); BUG_ON(svc->rqd != RQD(ops, svc->vcpu->processor)); svc->weight = sdom->weight; update_max_weight(svc->rqd, svc->weight, old_weight); vcpu_schedule_unlock(lock, svc->vcpu); } } } spin_unlock_irqrestore(&prv->lock, flags); return 0; } static void * csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; unsigned long flags; sdom = xzalloc(struct csched_dom); if ( sdom == NULL ) return NULL; /* Initialize credit and weight */ INIT_LIST_HEAD(&sdom->vcpu); INIT_LIST_HEAD(&sdom->sdom_elem); sdom->dom = dom; sdom->weight = CSCHED_DEFAULT_WEIGHT; sdom->nr_vcpus = 0; spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags); list_add_tail(&sdom->sdom_elem, &CSCHED_PRIV(ops)->sdom); spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags); return (void *)sdom; } static int csched_dom_init(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; printk("%s: Initializing domain %d\n", __func__, dom->domain_id); if ( is_idle_domain(dom) ) return 0; sdom = csched_alloc_domdata(ops, dom); if ( sdom == NULL ) return -ENOMEM; dom->sched_priv = sdom; return 0; } static void csched_free_domdata(const struct scheduler *ops, void *data) { unsigned long flags; struct csched_dom *sdom = data; spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags); list_del_init(&sdom->sdom_elem); spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags); xfree(data); } static void csched_dom_destroy(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom = CSCHED_DOM(dom); BUG_ON(!list_empty(&sdom->vcpu)); csched_free_domdata(ops, CSCHED_DOM(dom)); } /* How long should we let this vcpu run for? */ static s_time_t csched_runtime(const struct scheduler *ops, int cpu, struct csched_vcpu *snext) { s_time_t time; int rt_credit; /* Proposed runtime measured in credits */ struct csched_runqueue_data *rqd = RQD(ops, cpu); struct list_head *runq = &rqd->runq; if ( is_idle_vcpu(snext->vcpu) ) return CSCHED_MAX_TIMER; /* General algorithm: * 1) Run until snext's credit will be 0 * 2) But if someone is waiting, run until snext's credit is equal * to his * 3) But never run longer than MAX_TIMER or shorter than MIN_TIMER. */ /* 1) Basic time: Run until credit is 0. */ rt_credit = snext->credit; /* 2) If there's someone waiting whose credit is positive, * run until your credit ~= his */ if ( ! list_empty(runq) ) { struct csched_vcpu *swait = __runq_elem(runq->next); if ( ! is_idle_vcpu(swait->vcpu) && swait->credit > 0 ) { rt_credit = snext->credit - swait->credit; } } /* The next guy may actually have a higher credit, if we've tried to * avoid migrating him from a different cpu. DTRT. */ if ( rt_credit <= 0 ) time = CSCHED_MIN_TIMER; else { /* FIXME: See if we can eliminate this conversion if we know time * will be outside (MIN,MAX). Probably requires pre-calculating * credit values of MIN,MAX per vcpu, since each vcpu burns credit * at a different rate. */ time = c2t(rqd, rt_credit, snext); /* Check limits */ if ( time < CSCHED_MIN_TIMER ) time = CSCHED_MIN_TIMER; else if ( time > CSCHED_MAX_TIMER ) time = CSCHED_MAX_TIMER; } return time; } void __dump_execstate(void *unused); /* * Find a candidate. 
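 *
 * Illustrative example of the migration-resistance check below,
 * assuming the default sched_credit2_migrate_resist of 500us: if snext
 * (the vcpu we would otherwise keep) has 1,000,000ns worth of credit
 * and the head of the runqueue last ran on a different pcpu with
 * 1,300,000ns of credit, then 1,000,000 + 500,000 > 1,300,000 and the
 * queued vcpu is skipped; had it held 1,600,000ns of credit it would
 * be taken despite the loss of cache warmth.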
*/ static struct csched_vcpu * runq_candidate(struct csched_runqueue_data *rqd, struct csched_vcpu *scurr, int cpu, s_time_t now) { struct list_head *iter; struct csched_vcpu *snext = NULL; /* Default to current if runnable, idle otherwise */ if ( vcpu_runnable(scurr->vcpu) ) snext = scurr; else snext = CSCHED_VCPU(idle_vcpu[cpu]); list_for_each( iter, &rqd->runq ) { struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, runq_elem); /* If this is on a different processor, don't pull it unless * its credit is at least CSCHED_MIGRATE_RESIST higher. */ if ( svc->vcpu->processor != cpu && snext->credit + CSCHED_MIGRATE_RESIST > svc->credit ) continue; /* If the next one on the list has more credit than current * (or idle, if current is not runnable), choose it. */ if ( svc->credit > snext->credit ) snext = svc; /* In any case, if we got this far, break. */ break; } return snext; } /* * This function is in the critical path. It is designed to be simple and * fast for the common case. */ static struct task_slice csched_schedule( const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled) { const int cpu = smp_processor_id(); struct csched_runqueue_data *rqd; struct csched_vcpu * const scurr = CSCHED_VCPU(current); struct csched_vcpu *snext = NULL; struct task_slice ret; SCHED_STAT_CRANK(schedule); CSCHED_VCPU_CHECK(current); d2printk("sc p%d c d%dv%d now %"PRI_stime"\n", cpu, scurr->vcpu->domain->domain_id, scurr->vcpu->vcpu_id, now); BUG_ON(!cpumask_test_cpu(cpu, &CSCHED_PRIV(ops)->initialized)); rqd = RQD(ops, cpu); BUG_ON(!cpumask_test_cpu(cpu, &rqd->active)); /* Protected by runqueue lock */ /* DEBUG */ if ( !is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd) { int other_rqi = -1, this_rqi = c2r(ops, cpu); if ( scurr->rqd ) { int rq; other_rqi = -2; for_each_cpu ( rq, &CSCHED_PRIV(ops)->active_queues ) { if ( scurr->rqd == &CSCHED_PRIV(ops)->rqd[rq] ) { other_rqi = rq; break; } } } printk("%s: pcpu %d rq %d, but scurr d%dv%d assigned to " "pcpu %d rq %d!\n", __func__, cpu, this_rqi, scurr->vcpu->domain->domain_id, scurr->vcpu->vcpu_id, scurr->vcpu->processor, other_rqi); } BUG_ON(!is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd); /* Clear "tickled" bit now that we've been scheduled */ if ( cpumask_test_cpu(cpu, &rqd->tickled) ) cpumask_clear_cpu(cpu, &rqd->tickled); /* Update credits */ burn_credits(rqd, scurr, now); /* * Select next runnable local VCPU (ie top of local runq). * * If the current vcpu is runnable, and has higher credit than * the next guy on the queue (or there is noone else), we want to * run him again. * * If there's tasklet work to do, we want to chose the idle vcpu * for this processor, and mark the current for delayed runqueue * add. * * If the current vcpu is runnable, and there's another runnable * candidate, we want to mark current for delayed runqueue add, * and remove the next guy from the queue. * * If the current vcpu is not runnable, we want to chose the idle * vcpu for this processor. */ if ( tasklet_work_scheduled ) { trace_var(TRC_CSCHED2_SCHED_TASKLET, 0, 0, NULL); snext = CSCHED_VCPU(idle_vcpu[cpu]); } else snext=runq_candidate(rqd, scurr, cpu, now); /* If switching from a non-idle runnable vcpu, put it * back on the runqueue. 
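 *
 * A rough timeline of the hand-off this implements (see also the flag
 * descriptions near the top of the file):
 *
 *   csched_schedule():       set __CSFLAG_delayed_runq_add on scurr,
 *                            set __CSFLAG_scheduled on snext
 *   ... context switch ...
 *   csched_context_saved():  clear __CSFLAG_scheduled; if
 *                            __CSFLAG_delayed_runq_add was set and the
 *                            vcpu is still runnable, insert it on the
 *                            runqueue and tickle a cpu.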
*/ if ( snext != scurr && !is_idle_vcpu(scurr->vcpu) && vcpu_runnable(current) ) set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); ret.migrated = 0; /* Accounting for non-idle tasks */ if ( !is_idle_vcpu(snext->vcpu) ) { /* If switching, remove this from the runqueue and mark it scheduled */ if ( snext != scurr ) { BUG_ON(snext->rqd != rqd); __runq_remove(snext); if ( snext->vcpu->is_running ) { printk("p%d: snext d%dv%d running on p%d! scurr d%dv%d\n", cpu, snext->vcpu->domain->domain_id, snext->vcpu->vcpu_id, snext->vcpu->processor, scurr->vcpu->domain->domain_id, scurr->vcpu->vcpu_id); BUG(); } set_bit(__CSFLAG_scheduled, &snext->flags); } /* Check for the reset condition */ if ( snext->credit <= CSCHED_CREDIT_RESET ) { reset_credit(ops, cpu, now, snext); balance_load(ops, cpu, now); } /* Clear the idle mask if necessary */ if ( cpumask_test_cpu(cpu, &rqd->idle) ) cpumask_clear_cpu(cpu, &rqd->idle); snext->start_time = now; /* Safe because lock for old processor is held */ if ( snext->vcpu->processor != cpu ) { snext->credit += CSCHED_MIGRATE_COMPENSATION; snext->vcpu->processor = cpu; ret.migrated = 1; } } else { /* Update the idle mask if necessary */ if ( !cpumask_test_cpu(cpu, &rqd->idle) ) cpumask_set_cpu(cpu, &rqd->idle); /* Make sure avgload gets updated periodically even * if there's no activity */ update_load(ops, rqd, NULL, 0, now); } /* * Return task to run next... */ ret.time = csched_runtime(ops, cpu, snext); ret.task = snext->vcpu; CSCHED_VCPU_CHECK(ret.task); return ret; } static void csched_dump_vcpu(struct csched_vcpu *svc) { printk("[%i.%i] flags=%x cpu=%i", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, svc->flags, svc->vcpu->processor); printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); printk("\n"); } static void csched_dump_pcpu(const struct scheduler *ops, int cpu) { struct list_head *runq, *iter; struct csched_vcpu *svc; int loop; char cpustr[100]; /* FIXME: Do locking properly for access to runqueue structures */ runq = &RQD(ops, cpu)->runq; cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu)); printk(" sibling=%s, ", cpustr); cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu)); printk("core=%s\n", cpustr); /* current VCPU */ svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); if ( svc ) { printk("\trun: "); csched_dump_vcpu(svc); } loop = 0; list_for_each( iter, runq ) { svc = __runq_elem(iter); if ( svc ) { printk("\t%3d: ", ++loop); csched_dump_vcpu(svc); } } } static void csched_dump(const struct scheduler *ops) { struct list_head *iter_sdom, *iter_svc; struct csched_private *prv = CSCHED_PRIV(ops); int i, loop; printk("Active queues: %d\n" "\tdefault-weight = %d\n", cpumask_weight(&prv->active_queues), CSCHED_DEFAULT_WEIGHT); for_each_cpu(i, &prv->active_queues) { s_time_t fraction; fraction = prv->rqd[i].avgload * 100 / (1ULL<load_window_shift); printk("Runqueue %d:\n" "\tncpus = %u\n" "\tmax_weight = %d\n" "\tinstload = %d\n" "\taveload = %3"PRI_stime"\n", i, cpumask_weight(&prv->rqd[i].active), prv->rqd[i].max_weight, prv->rqd[i].load, fraction); } /* FIXME: Locking! 
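 *
 * For orientation only, the output produced below looks roughly like
 * the following (the numbers depend on the system; this sample is
 * illustrative, not captured output):
 *
 *   Domain info:
 *       Domain: 1 w 256 v 2
 *         1: [1.0] flags=0 cpu=3 credit=9557132 [w=256]
 *         2: [1.1] flags=0 cpu=1 credit=8213004 [w=256]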
*/ printk("Domain info:\n"); loop = 0; list_for_each( iter_sdom, &prv->sdom ) { struct csched_dom *sdom; sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem); printk("\tDomain: %d w %d v %d\n\t", sdom->dom->domain_id, sdom->weight, sdom->nr_vcpus); list_for_each( iter_svc, &sdom->vcpu ) { struct csched_vcpu *svc; svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem); printk("\t%3d: ", ++loop); csched_dump_vcpu(svc); } } } static void activate_runqueue(struct csched_private *prv, int rqi) { struct csched_runqueue_data *rqd; rqd = prv->rqd + rqi; BUG_ON(!cpumask_empty(&rqd->active)); rqd->max_weight = 1; rqd->id = rqi; INIT_LIST_HEAD(&rqd->svc); INIT_LIST_HEAD(&rqd->runq); spin_lock_init(&rqd->lock); cpumask_set_cpu(rqi, &prv->active_queues); } static void deactivate_runqueue(struct csched_private *prv, int rqi) { struct csched_runqueue_data *rqd; rqd = prv->rqd + rqi; BUG_ON(!cpumask_empty(&rqd->active)); rqd->id = -1; cpumask_clear_cpu(rqi, &prv->active_queues); } static void init_pcpu(const struct scheduler *ops, int cpu) { int rqi; unsigned long flags; struct csched_private *prv = CSCHED_PRIV(ops); struct csched_runqueue_data *rqd; spinlock_t *old_lock; spin_lock_irqsave(&prv->lock, flags); if ( cpumask_test_cpu(cpu, &prv->initialized) ) { printk("%s: Strange, cpu %d already initialized!\n", __func__, cpu); spin_unlock_irqrestore(&prv->lock, flags); return; } /* Figure out which runqueue to put it in */ rqi = 0; /* Figure out which runqueue to put it in */ /* NB: cpu 0 doesn't get a STARTING callback, so we hard-code it to runqueue 0. */ if ( cpu == 0 ) rqi = 0; else rqi = cpu_to_socket(cpu); if ( rqi < 0 ) { printk("%s: cpu_to_socket(%d) returned %d!\n", __func__, cpu, rqi); BUG(); } rqd=prv->rqd + rqi; printk("Adding cpu %d to runqueue %d\n", cpu, rqi); if ( ! cpumask_test_cpu(rqi, &prv->active_queues) ) { printk(" First cpu on runqueue, activating\n"); activate_runqueue(prv, rqi); } /* IRQs already disabled */ old_lock=pcpu_schedule_lock(cpu); /* Move spinlock to new runq lock. */ per_cpu(schedule_data, cpu).schedule_lock = &rqd->lock; /* Set the runqueue map */ prv->runq_map[cpu]=rqi; cpumask_set_cpu(cpu, &rqd->idle); cpumask_set_cpu(cpu, &rqd->active); /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! 
*/ spin_unlock(old_lock); cpumask_set_cpu(cpu, &prv->initialized); spin_unlock_irqrestore(&prv->lock, flags); return; } static void * csched_alloc_pdata(const struct scheduler *ops, int cpu) { /* Check to see if the cpu is online yet */ /* Note: cpu 0 doesn't get a STARTING callback */ if ( cpu == 0 || cpu_to_socket(cpu) >= 0 ) init_pcpu(ops, cpu); else printk("%s: cpu %d not online yet, deferring initializatgion\n", __func__, cpu); return (void *)1; } static void csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) { unsigned long flags; struct csched_private *prv = CSCHED_PRIV(ops); struct csched_runqueue_data *rqd; struct schedule_data *sd = &per_cpu(schedule_data, cpu); int rqi; spin_lock_irqsave(&prv->lock, flags); BUG_ON(!cpumask_test_cpu(cpu, &prv->initialized)); /* Find the old runqueue and remove this cpu from it */ rqi = prv->runq_map[cpu]; rqd = prv->rqd + rqi; /* No need to save IRQs here, they're already disabled */ spin_lock(&rqd->lock); BUG_ON(!cpumask_test_cpu(cpu, &rqd->idle)); printk("Removing cpu %d from runqueue %d\n", cpu, rqi); cpumask_clear_cpu(cpu, &rqd->idle); cpumask_clear_cpu(cpu, &rqd->active); if ( cpumask_empty(&rqd->active) ) { printk(" No cpus left on runqueue, disabling\n"); deactivate_runqueue(prv, rqi); } /* Move spinlock to the original lock. */ ASSERT(sd->schedule_lock == &rqd->lock); ASSERT(!spin_is_locked(&sd->_lock)); sd->schedule_lock = &sd->_lock; spin_unlock(&rqd->lock); cpumask_clear_cpu(cpu, &prv->initialized); spin_unlock_irqrestore(&prv->lock, flags); return; } static int csched_cpu_starting(int cpu) { struct scheduler *ops; /* Hope this is safe from cpupools switching things around. :-) */ ops = per_cpu(scheduler, cpu); if ( ops->alloc_pdata == csched_alloc_pdata ) init_pcpu(ops, cpu); return NOTIFY_DONE; } static int cpu_credit2_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_STARTING: csched_cpu_starting(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_credit2_nfb = { .notifier_call = cpu_credit2_callback }; static int csched_global_init(void) { register_cpu_notifier(&cpu_credit2_nfb); return 0; } static int csched_init(struct scheduler *ops) { int i; struct csched_private *prv; printk("Initializing Credit2 scheduler\n" \ " WARNING: This is experimental software in development.\n" \ " Use at your own risk.\n"); printk(" load_window_shift: %d\n", opt_load_window_shift); printk(" underload_balance_tolerance: %d\n", opt_underload_balance_tolerance); printk(" overload_balance_tolerance: %d\n", opt_overload_balance_tolerance); if ( opt_load_window_shift < LOADAVG_WINDOW_SHIFT_MIN ) { printk("%s: opt_load_window_shift %d below min %d, resetting\n", __func__, opt_load_window_shift, LOADAVG_WINDOW_SHIFT_MIN); opt_load_window_shift = LOADAVG_WINDOW_SHIFT_MIN; } /* Basically no CPU information is available at this point; just * set up basic structures, and a callback when the CPU info is * available. 
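 *
 * (For reference, the tunables printed here are all Xen boot command
 * line parameters declared earlier in this file, so an experimental
 * setup might boot with something along the lines of:
 *
 *   sched=credit2 credit2_load_window_shift=20 credit2_balance_under=0
 *       credit2_balance_over=-3 sched_credit2_migrate_resist=500
 *
 * "sched=credit2" matches .opt_name below; the rest correspond to the
 * integer_param() declarations near the top of the file.)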
*/ prv = xzalloc(struct csched_private); if ( prv == NULL ) return -ENOMEM; ops->sched_data = prv; spin_lock_init(&prv->lock); INIT_LIST_HEAD(&prv->sdom); /* But un-initialize all runqueues */ for ( i = 0; i < nr_cpu_ids; i++ ) { prv->runq_map[i] = -1; prv->rqd[i].id = -1; } prv->load_window_shift = opt_load_window_shift; return 0; } static void csched_deinit(const struct scheduler *ops) { struct csched_private *prv; prv = CSCHED_PRIV(ops); if ( prv != NULL ) xfree(prv); } static struct csched_private _csched_priv; const struct scheduler sched_credit2_def = { .name = "SMP Credit Scheduler rev2", .opt_name = "credit2", .sched_id = XEN_SCHEDULER_CREDIT2, .sched_data = &_csched_priv, .init_domain = csched_dom_init, .destroy_domain = csched_dom_destroy, .insert_vcpu = csched_vcpu_insert, .remove_vcpu = csched_vcpu_remove, .sleep = csched_vcpu_sleep, .wake = csched_vcpu_wake, .adjust = csched_dom_cntl, .pick_cpu = csched_cpu_pick, .migrate = csched_vcpu_migrate, .do_schedule = csched_schedule, .context_saved = csched_context_saved, .dump_cpu_state = csched_dump_pcpu, .dump_settings = csched_dump, .global_init = csched_global_init, .init = csched_init, .deinit = csched_deinit, .alloc_vdata = csched_alloc_vdata, .free_vdata = csched_free_vdata, .alloc_pdata = csched_alloc_pdata, .free_pdata = csched_free_pdata, .alloc_domdata = csched_alloc_domdata, .free_domdata = csched_free_domdata, };
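
/*
 * For illustration only (not part of the scheduler itself): the generic
 * scheduler layer in schedule.c drives the ops table above roughly like
 * the fragment below, with hypothetical local names for the pieces it
 * already owns.  csched_schedule() hands back a task_slice, and the
 * returned time is used to program the next scheduling decision.
 *
 *   struct task_slice slice;
 *
 *   slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
 *   // slice.task     -> vcpu to run next (possibly the idle vcpu)
 *   // slice.time     -> requested timeslice, clamped by csched_runtime()
 *   //                   to [CSCHED_MIN_TIMER, CSCHED_MAX_TIMER] for
 *   //                   non-idle vcpus
 *   // slice.migrated -> whether the chosen vcpu was just moved here
 */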