/******************************************************************************
 * Simple EDF scheduler for xen
 *
 * by Stephan Diestelhorst (C) 2004 Cambridge University
 * based on code by Mark Williamson (C) 2004 Intel Research Cambridge
 */

#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/timer.h>
#include <xen/softirq.h>
#include <xen/time.h>
#include <xen/errno.h>

#ifndef NDEBUG
#define SEDF_STATS
#define CHECK(_p)                                           \
    do {                                                    \
        if ( !(_p) )                                        \
            printk("Check '%s' failed, line %d, file %s\n", \
                   #_p , __LINE__, __FILE__);               \
    } while ( 0 )
#else
#define CHECK(_p) ((void)0)
#endif

#define EXTRA_NONE (0)
#define EXTRA_AWARE (1)
#define EXTRA_RUN_PEN (2)
#define EXTRA_RUN_UTIL (4)
#define EXTRA_WANT_PEN_Q (8)
#define EXTRA_PEN_Q (0)
#define EXTRA_UTIL_Q (1)
#define SEDF_ASLEEP (16)

#define EXTRA_QUANTUM (MICROSECS(500))
#define WEIGHT_PERIOD (MILLISECS(100))
#define WEIGHT_SAFETY (MILLISECS(5))

#define PERIOD_MAX MILLISECS(10000) /* 10s  */
#define PERIOD_MIN (MICROSECS(10))  /* 10us */
#define SLICE_MIN (MICROSECS(5))    /*  5us */

#define IMPLY(a, b) (!(a) || (b))
#define EQ(a, b) ((!!(a)) == (!!(b)))

struct sedf_dom_info {
    struct domain *domain;
};

struct sedf_priv_info {
    /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
    spinlock_t lock;
};

struct sedf_vcpu_info {
    struct vcpu *vcpu;
    struct list_head list;
    struct list_head extralist[2];

    /* Parameters for EDF */
    s_time_t period;  /* = relative deadline */
    s_time_t slice;   /* = worst case execution time */

    /* Advanced Parameters */

    /* Latency Scaling */
    s_time_t period_orig;
    s_time_t slice_orig;
    s_time_t latency;

    /* Status of domain */
    int status;

    /* Weights for "Scheduling for beginners/ lazy/ etc." ;) */
    short weight;
    short extraweight;

    /* Bookkeeping */
    s_time_t deadl_abs;
    s_time_t sched_start_abs;
    s_time_t cputime;

    /* Times the domain un-/blocked */
    s_time_t block_abs;
    s_time_t unblock_abs;

    /* Scores for {util, block penalty}-weighted extratime distribution */
    int score[2];
    s_time_t short_block_lost_tot;

    /* Statistics */
    s_time_t extra_time_tot;

#ifdef SEDF_STATS
    s_time_t block_time_tot;
    s_time_t penalty_time_tot;
    int block_tot;
    int short_block_tot;
    int long_block_tot;
    int pen_extra_blocks;
    int pen_extra_slices;
#endif
};

struct sedf_cpu_info {
    struct list_head runnableq;
    struct list_head waitq;
    struct list_head extraq[2];
    s_time_t current_slice_expires;
};

#define SEDF_PRIV(_ops) \
    ((struct sedf_priv_info *)((_ops)->sched_data))
#define EDOM_INFO(d)   ((struct sedf_vcpu_info *)((d)->sched_priv))
#define CPU_INFO(cpu)  \
    ((struct sedf_cpu_info *)per_cpu(schedule_data, cpu).sched_priv)
#define LIST(d)        (&EDOM_INFO(d)->list)
#define EXTRALIST(d,i) (&(EDOM_INFO(d)->extralist[i]))
#define RUNQ(cpu)      (&CPU_INFO(cpu)->runnableq)
#define WAITQ(cpu)     (&CPU_INFO(cpu)->waitq)
#define EXTRAQ(cpu,i)  (&(CPU_INFO(cpu)->extraq[i]))
#define IDLETASK(cpu)  (idle_vcpu[cpu])

#define PERIOD_BEGIN(inf) ((inf)->deadl_abs - (inf)->period)

#define DIV_UP(x,y) (((x) + (y) - 1) / y)

#define extra_runs(inf)      ((inf->status) & 6)
#define extra_get_cur_q(inf) (((inf->status & 6) >> 1)-1)
#define sedf_runnable(edom)  (!(EDOM_INFO(edom)->status & SEDF_ASLEEP))

static void sedf_dump_cpu_state(const struct scheduler *ops, int i);

static inline int extraq_on(struct vcpu *d, int i)
{
    return ((EXTRALIST(d,i)->next != NULL) &&
            (EXTRALIST(d,i)->next != EXTRALIST(d,i)));
}

static inline void extraq_add_head(struct vcpu *d, int i)
{
    list_add(EXTRALIST(d,i), EXTRAQ(d->processor,i));
    ASSERT(extraq_on(d, i));
}

static inline void extraq_add_tail(struct vcpu *d, int i)
{
    list_add_tail(EXTRALIST(d,i), EXTRAQ(d->processor,i));
    ASSERT(extraq_on(d, i));
}

static inline void extraq_del(struct vcpu *d, int i)
{
    struct list_head *list = EXTRALIST(d,i);
    ASSERT(extraq_on(d,i));
    list_del(list);
    list->next = NULL;
    ASSERT(!extraq_on(d, i));
}

/*
 * Adds a domain to the queue of processes which are aware of extra time. List
 * is sorted by score, where a lower score means higher priority for an extra
 * slice. It also updates the score, by simply subtracting a fixed value from
 * each entry, in order to avoid overflow. The algorithm works by simply
 * charging each domain that received extratime with an inverse of its weight.
 */
static inline void extraq_add_sort_update(struct vcpu *d, int i, int sub)
{
    struct list_head      *cur;
    struct sedf_vcpu_info *curinf;

    ASSERT(!extraq_on(d,i));

    /*
     * Iterate through all elements to find our "hole" and on our way
     * update all the other scores.
     */
    list_for_each ( cur, EXTRAQ(d->processor, i) )
    {
        curinf = list_entry(cur,struct sedf_vcpu_info,extralist[i]);
        curinf->score[i] -= sub;
        if ( EDOM_INFO(d)->score[i] < curinf->score[i] )
            break;
    }

    /* cur now contains the element, before which we'll enqueue */
    list_add(EXTRALIST(d,i),cur->prev);

    /* Continue updating the extraq */
    if ( (cur != EXTRAQ(d->processor,i)) && sub )
    {
        for ( cur = cur->next; cur != EXTRAQ(d->processor,i); cur = cur->next )
        {
            curinf = list_entry(cur,struct sedf_vcpu_info, extralist[i]);
            curinf->score[i] -= sub;
        }
    }

    ASSERT(extraq_on(d,i));
}

static inline void extraq_check(struct vcpu *d)
{
    if ( extraq_on(d, EXTRA_UTIL_Q) )
    {
        if ( !(EDOM_INFO(d)->status & EXTRA_AWARE) &&
             !extra_runs(EDOM_INFO(d)) )
            extraq_del(d, EXTRA_UTIL_Q);
    }
    else
    {
        if ( (EDOM_INFO(d)->status & EXTRA_AWARE) && sedf_runnable(d) )
            extraq_add_sort_update(d, EXTRA_UTIL_Q, 0);
    }
}

static inline void extraq_check_add_unblocked(struct vcpu *d, int priority)
{
    struct sedf_vcpu_info *inf = EDOM_INFO(d);

    if ( inf->status & EXTRA_AWARE )
        /* Put on the weighted extraq without updating any scores */
        extraq_add_sort_update(d, EXTRA_UTIL_Q, 0);
}

static inline int __task_on_queue(struct vcpu *d)
{
    return (((LIST(d))->next != NULL) && (LIST(d)->next != LIST(d)));
}

static inline void __del_from_queue(struct vcpu *d)
{
    struct list_head *list = LIST(d);
    ASSERT(__task_on_queue(d));
    list_del(list);
    list->next = NULL;
    ASSERT(!__task_on_queue(d));
}

typedef int(*list_comparer)(struct list_head* el1, struct list_head* el2);

static inline void list_insert_sort(
    struct list_head *list, struct list_head *element, list_comparer comp)
{
    struct list_head *cur;

    /* Iterate through all elements to find our "hole" */
    list_for_each( cur, list )
        if ( comp(element, cur) < 0 )
            break;

    /* cur now contains the element, before which we'll enqueue */
    list_add(element, cur->prev);
}

#define DOMAIN_COMPARER(name, field, comp1, comp2)                      \
static int name##_comp(struct list_head* el1, struct list_head* el2)    \
{                                                                       \
    struct sedf_vcpu_info *d1, *d2;                                     \
    d1 = list_entry(el1,struct sedf_vcpu_info, field);                  \
    d2 = list_entry(el2,struct sedf_vcpu_info, field);                  \
    if ( (comp1) == (comp2) )                                           \
        return 0;                                                       \
    if ( (comp1) < (comp2) )                                            \
        return -1;                                                      \
    else                                                                \
        return 1;                                                       \
}

/*
 * Adds a domain to the queue of processes which wait for the beginning of the
 * next period; this list is therefore sorted by this time, which is simply
 * the absolute deadline - period.
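 *
 * As an illustration (hypothetical numbers, not taken from this file): a
 * VCPU with deadl_abs = 150ms and period = 20ms has PERIOD_BEGIN() = 130ms,
 * so it queues ahead of a VCPU whose next period only begins at 140ms and
 * behind one whose period already began at 120ms.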
*/ DOMAIN_COMPARER(waitq, list, PERIOD_BEGIN(d1), PERIOD_BEGIN(d2)); static inline void __add_to_waitqueue_sort(struct vcpu *v) { ASSERT(!__task_on_queue(v)); list_insert_sort(WAITQ(v->processor), LIST(v), waitq_comp); ASSERT(__task_on_queue(v)); } /* * Adds a domain to the queue of processes which have started their current * period and are runnable (i.e. not blocked, dieing,...). The first element * on this list is running on the processor, if the list is empty the idle * task will run. As we are implementing EDF, this list is sorted by deadlines. */ DOMAIN_COMPARER(runq, list, d1->deadl_abs, d2->deadl_abs); static inline void __add_to_runqueue_sort(struct vcpu *v) { list_insert_sort(RUNQ(v->processor), LIST(v), runq_comp); } static void sedf_insert_vcpu(const struct scheduler *ops, struct vcpu *v) { if ( !is_idle_vcpu(v) ) { extraq_check(v); } else { EDOM_INFO(v)->deadl_abs = 0; EDOM_INFO(v)->status &= ~SEDF_ASLEEP; } } static void *sedf_alloc_vdata(const struct scheduler *ops, struct vcpu *v, void *dd) { struct sedf_vcpu_info *inf; inf = xzalloc(struct sedf_vcpu_info); if ( inf == NULL ) return NULL; inf->vcpu = v; /* Every VCPU gets an equal share of extratime by default */ inf->deadl_abs = 0; inf->latency = 0; inf->status = EXTRA_AWARE | SEDF_ASLEEP; inf->extraweight = 1; /* Upon creation all domain are best-effort */ inf->period = WEIGHT_PERIOD; inf->slice = 0; inf->period_orig = inf->period; inf->slice_orig = inf->slice; INIT_LIST_HEAD(&(inf->list)); INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q])); INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q])); SCHED_STAT_CRANK(vcpu_init); return inf; } static void * sedf_alloc_pdata(const struct scheduler *ops, int cpu) { struct sedf_cpu_info *spc; spc = xzalloc(struct sedf_cpu_info); BUG_ON(spc == NULL); INIT_LIST_HEAD(&spc->waitq); INIT_LIST_HEAD(&spc->runnableq); INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]); INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]); return (void *)spc; } static void sedf_free_pdata(const struct scheduler *ops, void *spc, int cpu) { if ( spc == NULL ) return; xfree(spc); } static void sedf_free_vdata(const struct scheduler *ops, void *priv) { xfree(priv); } static void * sedf_alloc_domdata(const struct scheduler *ops, struct domain *d) { return xzalloc(struct sedf_dom_info); } static int sedf_init_domain(const struct scheduler *ops, struct domain *d) { d->sched_priv = sedf_alloc_domdata(ops, d); if ( d->sched_priv == NULL ) return -ENOMEM; return 0; } static void sedf_free_domdata(const struct scheduler *ops, void *data) { xfree(data); } static void sedf_destroy_domain(const struct scheduler *ops, struct domain *d) { sedf_free_domdata(ops, d->sched_priv); } static int sedf_pick_cpu(const struct scheduler *ops, struct vcpu *v) { cpumask_t online_affinity; cpumask_t *online; online = cpupool_scheduler_cpumask(v->domain->cpupool); cpumask_and(&online_affinity, v->cpu_affinity, online); return cpumask_cycle(v->vcpu_id % cpumask_weight(&online_affinity) - 1, &online_affinity); } /* * Handles the rescheduling & bookkeeping of domains running in their * guaranteed timeslice. */ static void desched_edf_dom(s_time_t now, struct vcpu* d) { struct sedf_vcpu_info* inf = EDOM_INFO(d); /* Current domain is running in real time mode */ ASSERT(__task_on_queue(d)); /* Update the domain's cputime */ inf->cputime += now - inf->sched_start_abs; /* Scheduling decisions which don't remove the running domain from * the runq */ if ( (inf->cputime < inf->slice) && sedf_runnable(d) ) return; __del_from_queue(d); /* * Manage bookkeeping (i.e. 
calculate next deadline, memorise * overrun-time of slice) of finished domains. */ if ( inf->cputime >= inf->slice ) { inf->cputime -= inf->slice; if ( inf->period < inf->period_orig ) { /* This domain runs in latency scaling or burst mode */ inf->period *= 2; inf->slice *= 2; if ( (inf->period > inf->period_orig) || (inf->slice > inf->slice_orig) ) { /* Reset slice and period */ inf->period = inf->period_orig; inf->slice = inf->slice_orig; } } /* Set next deadline */ inf->deadl_abs += inf->period; } /* Add a runnable domain to the waitqueue */ if ( sedf_runnable(d) ) { __add_to_waitqueue_sort(d); } else { /* We have a blocked realtime task -> remove it from exqs too */ if ( extraq_on(d, EXTRA_PEN_Q) ) extraq_del(d, EXTRA_PEN_Q); if ( extraq_on(d, EXTRA_UTIL_Q) ) extraq_del(d, EXTRA_UTIL_Q); } ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), sedf_runnable(d))); } /* Update all elements on the queues */ static void update_queues( s_time_t now, struct list_head *runq, struct list_head *waitq) { struct list_head *cur, *tmp; struct sedf_vcpu_info *curinf; /* * Check for the first elements of the waitqueue, whether their * next period has already started. */ list_for_each_safe ( cur, tmp, waitq ) { curinf = list_entry(cur, struct sedf_vcpu_info, list); if ( PERIOD_BEGIN(curinf) > now ) break; __del_from_queue(curinf->vcpu); __add_to_runqueue_sort(curinf->vcpu); } /* Process the runq, find domains that are on the runq that shouldn't */ list_for_each_safe ( cur, tmp, runq ) { curinf = list_entry(cur,struct sedf_vcpu_info,list); if ( unlikely(curinf->slice == 0) ) { /* Ignore domains with empty slice */ __del_from_queue(curinf->vcpu); /* Move them to their next period */ curinf->deadl_abs += curinf->period; /* Ensure that the start of the next period is in the future */ if ( unlikely(PERIOD_BEGIN(curinf) < now) ) curinf->deadl_abs += (DIV_UP(now - PERIOD_BEGIN(curinf), curinf->period)) * curinf->period; /* Put them back into the queue */ __add_to_waitqueue_sort(curinf->vcpu); } else if ( unlikely((curinf->deadl_abs < now) || (curinf->cputime > curinf->slice)) ) { /* * We missed the deadline or the slice was already finished. * Might hapen because of dom_adj. */ printk("\tDomain %i.%i exceeded it's deadline/" "slice (%"PRIu64" / %"PRIu64") now: %"PRIu64 " cputime: %"PRIu64"\n", curinf->vcpu->domain->domain_id, curinf->vcpu->vcpu_id, curinf->deadl_abs, curinf->slice, now, curinf->cputime); __del_from_queue(curinf->vcpu); /* Common case: we miss one period */ curinf->deadl_abs += curinf->period; /* * If we are still behind: modulo arithmetic, force deadline * to be in future and aligned to period borders. */ if ( unlikely(curinf->deadl_abs < now) ) curinf->deadl_abs += DIV_UP(now - curinf->deadl_abs, curinf->period) * curinf->period; ASSERT(curinf->deadl_abs >= now); /* Give a fresh slice */ curinf->cputime = 0; if ( PERIOD_BEGIN(curinf) > now ) __add_to_waitqueue_sort(curinf->vcpu); else __add_to_runqueue_sort(curinf->vcpu); } else break; } } /* * removes a domain from the head of the according extraQ and * requeues it at a specified position: * round-robin extratime: end of extraQ * weighted ext.: insert in sorted list by score * if the domain is blocked / has regained its short-block-loss * time it is not put on any queue. 
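 *
 * Illustration of the score ordering (made-up numbers): with
 * score[EXTRA_UTIL_Q] = (period << 10) / slice, a domain with period 100ms
 * and slice 50ms scores 2048 while one with slice 25ms scores 4096; since a
 * lower score means higher priority, the more heavily reserved domain is
 * offered utilization extratime first.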
*/ static void desched_extra_dom(s_time_t now, struct vcpu *d) { struct sedf_vcpu_info *inf = EDOM_INFO(d); int i = extra_get_cur_q(inf); unsigned long oldscore; ASSERT(extraq_on(d, i)); /* Unset all running flags */ inf->status &= ~(EXTRA_RUN_PEN | EXTRA_RUN_UTIL); /* Fresh slice for the next run */ inf->cputime = 0; /* Accumulate total extratime */ inf->extra_time_tot += now - inf->sched_start_abs; /* Remove extradomain from head of the queue. */ extraq_del(d, i); /* Update the score */ oldscore = inf->score[i]; if ( i == EXTRA_PEN_Q ) { /* Domain was running in L0 extraq */ /* reduce block lost, probably more sophistication here!*/ /*inf->short_block_lost_tot -= EXTRA_QUANTUM;*/ inf->short_block_lost_tot -= now - inf->sched_start_abs; #if 0 /* KAF: If we don't exit short-blocking state at this point * domain0 can steal all CPU for up to 10 seconds before * scheduling settles down (when competing against another * CPU-bound domain). Doing this seems to make things behave * nicely. Noone gets starved by default. */ if ( inf->short_block_lost_tot <= 0 ) #endif { /* We have (over-)compensated our block penalty */ inf->short_block_lost_tot = 0; /* We don't want a place on the penalty queue anymore! */ inf->status &= ~EXTRA_WANT_PEN_Q; goto check_extra_queues; } /* * We have to go again for another try in the block-extraq, * the score is not used incremantally here, as this is * already done by recalculating the block_lost */ inf->score[EXTRA_PEN_Q] = (inf->period << 10) / inf->short_block_lost_tot; oldscore = 0; } else { /* * Domain was running in L1 extraq => score is inverse of * utilization and is used somewhat incremental! */ if ( !inf->extraweight ) { /* NB: use fixed point arithmetic with 10 bits */ inf->score[EXTRA_UTIL_Q] = (inf->period << 10) / inf->slice; } else { /* * Conversion between realtime utilisation and extrawieght: * full (ie 100%) utilization is equivalent to 128 extraweight */ inf->score[EXTRA_UTIL_Q] = (1<<17) / inf->extraweight; } } check_extra_queues: /* Adding a runnable domain to the right queue and removing blocked ones */ if ( sedf_runnable(d) ) { /* Add according to score: weighted round robin */ if (((inf->status & EXTRA_AWARE) && (i == EXTRA_UTIL_Q)) || ((inf->status & EXTRA_WANT_PEN_Q) && (i == EXTRA_PEN_Q))) extraq_add_sort_update(d, i, oldscore); } else { /* Remove this blocked domain from the waitq! */ __del_from_queue(d); /* Make sure that we remove a blocked domain from the other * extraq too. */ if ( i == EXTRA_PEN_Q ) { if ( extraq_on(d, EXTRA_UTIL_Q) ) extraq_del(d, EXTRA_UTIL_Q); } else { if ( extraq_on(d, EXTRA_PEN_Q) ) extraq_del(d, EXTRA_PEN_Q); } } ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), sedf_runnable(d))); } static struct task_slice sedf_do_extra_schedule( s_time_t now, s_time_t end_xt, struct list_head *extraq[], int cpu) { struct task_slice ret = { 0 }; struct sedf_vcpu_info *runinf; ASSERT(end_xt > now); /* Enough time left to use for extratime? */ if ( end_xt - now < EXTRA_QUANTUM ) goto return_idle; if ( !list_empty(extraq[EXTRA_PEN_Q]) ) { /* * We still have elements on the level 0 extraq * => let those run first! 
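         *    (Whichever extraq the VCPU is taken from, it is granted at most
         *    EXTRA_QUANTUM, 500us per the definition above, of extratime
         *    before the scheduler re-evaluates.)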
         */
        runinf   = list_entry(extraq[EXTRA_PEN_Q]->next,
                              struct sedf_vcpu_info, extralist[EXTRA_PEN_Q]);
        runinf->status |= EXTRA_RUN_PEN;
        ret.task = runinf->vcpu;
        ret.time = EXTRA_QUANTUM;
#ifdef SEDF_STATS
        runinf->pen_extra_slices++;
#endif
    }
    else
    {
        if ( !list_empty(extraq[EXTRA_UTIL_Q]) )
        {
            /* Use elements from the normal extraqueue */
            runinf   = list_entry(extraq[EXTRA_UTIL_Q]->next,
                                  struct sedf_vcpu_info,
                                  extralist[EXTRA_UTIL_Q]);
            runinf->status |= EXTRA_RUN_UTIL;
            ret.task = runinf->vcpu;
            ret.time = EXTRA_QUANTUM;
        }
        else
            goto return_idle;
    }

    ASSERT(ret.time > 0);
    ASSERT(sedf_runnable(ret.task));
    return ret;

 return_idle:
    ret.task = IDLETASK(cpu);
    ret.time = end_xt - now;
    ASSERT(ret.time > 0);
    ASSERT(sedf_runnable(ret.task));
    return ret;
}

static int sedf_init(struct scheduler *ops)
{
    struct sedf_priv_info *prv;

    prv = xzalloc(struct sedf_priv_info);
    if ( prv == NULL )
        return -ENOMEM;

    ops->sched_data = prv;
    spin_lock_init(&prv->lock);

    return 0;
}

static void sedf_deinit(const struct scheduler *ops)
{
    struct sedf_priv_info *prv;

    prv = SEDF_PRIV(ops);
    if ( prv != NULL )
        xfree(prv);
}

/*
 * Main scheduling function
 * Reasons for calling this function are:
 * -timeslice for the current period used up
 * -domain on waitqueue has started its period
 * -and various others ;) in general: determine which domain to run next
 */
static struct task_slice sedf_do_schedule(
    const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
{
    int                   cpu      = smp_processor_id();
    struct list_head     *runq     = RUNQ(cpu);
    struct list_head     *waitq    = WAITQ(cpu);
    struct sedf_vcpu_info *inf     = EDOM_INFO(current);
    struct list_head     *extraq[] = {
        EXTRAQ(cpu, EXTRA_PEN_Q), EXTRAQ(cpu, EXTRA_UTIL_Q)};
    struct sedf_vcpu_info *runinf, *waitinf;
    struct task_slice      ret;

    SCHED_STAT_CRANK(schedule);

    /* Idle tasks don't need any of the following stuff */
    if ( is_idle_vcpu(current) )
        goto check_waitq;

    /*
     * Create local state of the status of the domain, in order to avoid
     * inconsistent state during scheduling decisions, because data for
     * vcpu_runnable is not protected by the scheduling lock!
     */
    if ( !vcpu_runnable(current) )
        inf->status |= SEDF_ASLEEP;

    if ( inf->status & SEDF_ASLEEP )
        inf->block_abs = now;

    if ( unlikely(extra_runs(inf)) )
    {
        /* Special treatment of domains running in extra time */
        desched_extra_dom(now, current);
    }
    else
    {
        desched_edf_dom(now, current);
    }
 check_waitq:
    update_queues(now, runq, waitq);

    /*
     * Now simply pick the first domain from the runqueue, which has the
     * earliest deadline, because the list is sorted.
     *
     * Tasklet work (which runs in idle VCPU context) overrides all else.
     */
    if ( tasklet_work_scheduled ||
         (list_empty(runq) && list_empty(waitq)) ||
         unlikely(!cpumask_test_cpu(cpu,
                   cpupool_scheduler_cpumask(per_cpu(cpupool, cpu)))) )
    {
        ret.task = IDLETASK(cpu);
        ret.time = SECONDS(1);
    }
    else if ( !list_empty(runq) )
    {
        runinf   = list_entry(runq->next,struct sedf_vcpu_info,list);
        ret.task = runinf->vcpu;
        if ( !list_empty(waitq) )
        {
            waitinf  = list_entry(waitq->next,
                                  struct sedf_vcpu_info,list);
            /*
             * Rerun scheduler, when scheduled domain reaches its
             * end of slice or the first domain from the waitqueue
             * gets ready.
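             *
             * Illustrative numbers (not from the source): if the running
             * domain still has 3ms of its slice left but the head of the
             * waitqueue begins its next period in 1ms, the MIN() below
             * yields 1ms, so the scheduler fires again as soon as the
             * waiting domain becomes eligible.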
             */
            ret.time = MIN(now + runinf->slice - runinf->cputime,
                           PERIOD_BEGIN(waitinf)) - now;
        }
        else
        {
            ret.time = runinf->slice - runinf->cputime;
        }
    }
    else
    {
        waitinf  = list_entry(waitq->next,struct sedf_vcpu_info, list);

        /*
         * We could not find any suitable domain
         * => look for domains that are aware of extratime
         */
        ret = sedf_do_extra_schedule(now, PERIOD_BEGIN(waitinf),
                                     extraq, cpu);
    }

    /*
     * TODO: Do something USEFUL when this happens and find out why it
     * still can happen!!!
     */
    if ( ret.time < 0)
    {
        printk("Ouch! We are seriously BEHIND schedule! %"PRIi64"\n",
               ret.time);
        ret.time = EXTRA_QUANTUM;
    }

    ret.migrated = 0;

    EDOM_INFO(ret.task)->sched_start_abs = now;
    CHECK(ret.time > 0);
    ASSERT(sedf_runnable(ret.task));
    CPU_INFO(cpu)->current_slice_expires = now + ret.time;
    return ret;
}

static void sedf_sleep(const struct scheduler *ops, struct vcpu *d)
{
    if ( is_idle_vcpu(d) )
        return;

    EDOM_INFO(d)->status |= SEDF_ASLEEP;

    if ( per_cpu(schedule_data, d->processor).curr == d )
    {
        cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
    }
    else
    {
        if ( __task_on_queue(d) )
            __del_from_queue(d);
        if ( extraq_on(d, EXTRA_UTIL_Q) )
            extraq_del(d, EXTRA_UTIL_Q);
        if ( extraq_on(d, EXTRA_PEN_Q) )
            extraq_del(d, EXTRA_PEN_Q);
    }
}

/*
 * This function wakes up a domain, i.e. moves it into the waitqueue.
 * Things to mention are: admission control is taking place nowhere at
 * the moment, so we can't be sure whether it is safe to wake the domain
 * up at all. Anyway, even if it is safe (total cpu usage <= 100%) there are
 * some considerations on when to allow the domain to wake up and have its
 * first deadline...
 * I detected 3 cases, which could describe the possible behaviour of the
 * scheduler, and I'll try to make them clearer:
 *
 * 1. Very conservative
 *     -when a blocked domain unblocks, it is allowed to start execution at
 *      the beginning of the next complete period
 *      (D..deadline, R..running, B..blocking/sleeping, U..unblocking/waking up
 *
 *      DRRB_____D__U_____DRRRRR___D________ ...
 *
 *     -this causes the domain to miss a period (and a deadline)
 *     -doesn't disturb the schedule at all
 *     -deadlines keep occurring isochronously
 *
 * 2. Conservative Part 1: Short Unblocking
 *     -when a domain unblocks in the same period as it was blocked it
 *      unblocks and may consume the rest of its original time-slice minus
 *      the time it was blocked
 *      (assume period=9, slice=5)
 *
 *      DRB_UR___DRRRRR___D...
 *
 *     -this also doesn't disturb scheduling, but might lead to the fact that
 *      the domain can't finish its workload in the period
 *     -in addition to that the domain can be prioritised when
 *      extratime is available
 *     -addition: experiments have shown that this may have a HUGE impact on
 *      performance of other domains, because it can lead to excessive context
 *      switches
 *
 *    Part 2: Long Unblocking
 *    Part 2a
 *     -it is obvious that such accounting of block time, applied when
 *      unblocking happens in later periods, works fine as well
 *     -the domain is treated as if it would have been running since the start
 *      of its new period
 *
 *      DRB______D___UR___D...
 *
 *    Part 2b
 *     -if one needs the full slice in the next period, it is necessary to
 *      treat the unblocking time as the start of the new period, i.e. move
 *      the deadline further back (later)
 *     -this doesn't disturb scheduling either, because for EDF periods can
 *      be treated as minimal inter-release times and scheduling stays
 *      correct when deadlines are kept relative to the time the process
 *      unblocks
 *
 *      DRB______D___URRRR___D...
* (D) <- old deadline was here * -problem: deadlines don't occur isochronous anymore * Part 2c (Improved Atropos design) * -when a domain unblocks it is given a very short period (=latency hint) * and slice length scaled accordingly * -both rise again to the original value (e.g. get doubled every period) * * 3. Unconservative (i.e. incorrect) * -to boost the performance of I/O dependent domains it would be possible * to put the domain into the runnable queue immediately, and let it run * for the remainder of the slice of the current period * (or even worse: allocate a new full slice for the domain) * -either behaviour can lead to missed deadlines in other domains as * opposed to approaches 1,2a,2b */ static void unblock_short_extra_support( struct sedf_vcpu_info* inf, s_time_t now) { /* * This unblocking scheme tries to support the domain, by assigning it * a priority in extratime distribution according to the loss of time * in this slice due to blocking */ s_time_t pen; /* No more realtime execution in this period! */ inf->deadl_abs += inf->period; if ( likely(inf->block_abs) ) { /* Treat blocked time as consumed by the domain */ /*inf->cputime += now - inf->block_abs;*/ /* * Penalty is time the domain would have * had if it continued to run. */ pen = (inf->slice - inf->cputime); if ( pen < 0 ) pen = 0; /* Accumulate all penalties over the periods */ /*inf->short_block_lost_tot += pen;*/ /* Set penalty to the current value */ inf->short_block_lost_tot = pen; /* Not sure which one is better.. but seems to work well... */ if ( inf->short_block_lost_tot ) { inf->score[0] = (inf->period << 10) / inf->short_block_lost_tot; #ifdef SEDF_STATS inf->pen_extra_blocks++; #endif if ( extraq_on(inf->vcpu, EXTRA_PEN_Q) ) /* Remove domain for possible resorting! */ extraq_del(inf->vcpu, EXTRA_PEN_Q); else /* * Remember that we want to be on the penalty q * so that we can continue when we (un-)block * in penalty-extratime */ inf->status |= EXTRA_WANT_PEN_Q; /* (re-)add domain to the penalty extraq */ extraq_add_sort_update(inf->vcpu, EXTRA_PEN_Q, 0); } } /* Give it a fresh slice in the next period! */ inf->cputime = 0; } static void unblock_long_cons_b(struct sedf_vcpu_info* inf,s_time_t now) { /* Conservative 2b */ /* Treat the unblocking time as a start of a new period */ inf->deadl_abs = now + inf->period; inf->cputime = 0; } #define DOMAIN_EDF 1 #define DOMAIN_EXTRA_PEN 2 #define DOMAIN_EXTRA_UTIL 3 #define DOMAIN_IDLE 4 static inline int get_run_type(struct vcpu* d) { struct sedf_vcpu_info* inf = EDOM_INFO(d); if (is_idle_vcpu(d)) return DOMAIN_IDLE; if (inf->status & EXTRA_RUN_PEN) return DOMAIN_EXTRA_PEN; if (inf->status & EXTRA_RUN_UTIL) return DOMAIN_EXTRA_UTIL; return DOMAIN_EDF; } /* * Compares two domains in the relation of whether the one is allowed to * interrupt the others execution. * It returns true (!=0) if a switch to the other domain is good. 
* Current Priority scheme is as follows: * EDF > L0 (penalty based) extra-time > * L1 (utilization) extra-time > idle-domain * In the same class priorities are assigned as following: * EDF: early deadline > late deadline * L0 extra-time: lower score > higher score */ static inline int should_switch(struct vcpu *cur, struct vcpu *other, s_time_t now) { struct sedf_vcpu_info *cur_inf, *other_inf; cur_inf = EDOM_INFO(cur); other_inf = EDOM_INFO(other); /* Check whether we need to make an earlier scheduling decision */ if ( PERIOD_BEGIN(other_inf) < CPU_INFO(other->processor)->current_slice_expires ) return 1; /* No timing-based switches need to be taken into account here */ switch ( get_run_type(cur) ) { case DOMAIN_EDF: /* Do not interrupt a running EDF domain */ return 0; case DOMAIN_EXTRA_PEN: /* Check whether we also want the L0 ex-q with lower score */ return ((other_inf->status & EXTRA_WANT_PEN_Q) && (other_inf->score[EXTRA_PEN_Q] < cur_inf->score[EXTRA_PEN_Q])); case DOMAIN_EXTRA_UTIL: /* Check whether we want the L0 extraq. Don't * switch if both domains want L1 extraq. */ return !!(other_inf->status & EXTRA_WANT_PEN_Q); case DOMAIN_IDLE: return 1; } return 1; } static void sedf_wake(const struct scheduler *ops, struct vcpu *d) { s_time_t now = NOW(); struct sedf_vcpu_info* inf = EDOM_INFO(d); if ( unlikely(is_idle_vcpu(d)) ) return; if ( unlikely(__task_on_queue(d)) ) return; ASSERT(!sedf_runnable(d)); inf->status &= ~SEDF_ASLEEP; ASSERT(!extraq_on(d, EXTRA_UTIL_Q)); ASSERT(!extraq_on(d, EXTRA_PEN_Q)); if ( unlikely(inf->deadl_abs == 0) ) { /* Initial setup of the deadline */ inf->deadl_abs = now + inf->slice; } #ifdef SEDF_STATS inf->block_tot++; #endif if ( unlikely(now < PERIOD_BEGIN(inf)) ) { /* Unblocking in extra-time! */ if ( inf->status & EXTRA_WANT_PEN_Q ) { /* * We have a domain that wants compensation * for block penalty and did just block in * its compensation time. Give it another * chance! */ extraq_add_sort_update(d, EXTRA_PEN_Q, 0); } extraq_check_add_unblocked(d, 0); } else { if ( now < inf->deadl_abs ) { /* Short blocking */ #ifdef SEDF_STATS inf->short_block_tot++; #endif unblock_short_extra_support(inf, now); extraq_check_add_unblocked(d, 1); } else { /* Long unblocking */ #ifdef SEDF_STATS inf->long_block_tot++; #endif unblock_long_cons_b(inf, now); extraq_check_add_unblocked(d, 1); } } if ( PERIOD_BEGIN(inf) > now ) __add_to_waitqueue_sort(d); else __add_to_runqueue_sort(d); #ifdef SEDF_STATS /* Do some statistics here... */ if ( inf->block_abs != 0 ) { inf->block_time_tot += now - inf->block_abs; inf->penalty_time_tot += PERIOD_BEGIN(inf) + inf->cputime - inf->block_abs; } #endif /* Sanity check: make sure each extra-aware domain IS on the util-q! */ ASSERT(IMPLY(inf->status & EXTRA_AWARE, extraq_on(d, EXTRA_UTIL_Q))); ASSERT(__task_on_queue(d)); /* * Check whether the awakened task needs to invoke the do_schedule * routine. Try to avoid unnecessary runs but: * Save approximation: Always switch to scheduler! */ ASSERT(d->processor >= 0); ASSERT(d->processor < nr_cpu_ids); ASSERT(per_cpu(schedule_data, d->processor).curr); if ( should_switch(per_cpu(schedule_data, d->processor).curr, d, now) ) cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); } /* Print a lot of useful information about a domains in the system */ static void sedf_dump_domain(struct vcpu *d) { printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id, d->is_running ? 
'T':'F'); printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu" " sc=%i xtr(%s)=%"PRIu64" ew=%hu", EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs, EDOM_INFO(d)->weight, EDOM_INFO(d)->score[EXTRA_UTIL_Q], (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no", EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight); #ifdef SEDF_STATS if ( EDOM_INFO(d)->block_time_tot != 0 ) printk(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) / EDOM_INFO(d)->block_time_tot); if ( EDOM_INFO(d)->block_tot != 0 ) printk("\n blks=%u sh=%u (%u%%) (shex=%i "\ "shexsl=%i) l=%u (%u%%) avg: b=%"PRIu64" p=%"PRIu64"", EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_block_tot, (EDOM_INFO(d)->short_block_tot * 100) / EDOM_INFO(d)->block_tot, EDOM_INFO(d)->pen_extra_blocks, EDOM_INFO(d)->pen_extra_slices, EDOM_INFO(d)->long_block_tot, (EDOM_INFO(d)->long_block_tot * 100) / EDOM_INFO(d)->block_tot, (EDOM_INFO(d)->block_time_tot) / EDOM_INFO(d)->block_tot, (EDOM_INFO(d)->penalty_time_tot) / EDOM_INFO(d)->block_tot); #endif printk("\n"); } /* Dumps all domains on the specified cpu */ static void sedf_dump_cpu_state(const struct scheduler *ops, int i) { struct list_head *list, *queue, *tmp; struct sedf_vcpu_info *d_inf; struct domain *d; struct vcpu *ed; int loop = 0; printk("now=%"PRIu64"\n",NOW()); queue = RUNQ(i); printk("RUNQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); list_for_each_safe ( list, tmp, queue ) { printk("%3d: ",loop++); d_inf = list_entry(list, struct sedf_vcpu_info, list); sedf_dump_domain(d_inf->vcpu); } queue = WAITQ(i); loop = 0; printk("\nWAITQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); list_for_each_safe ( list, tmp, queue ) { printk("%3d: ",loop++); d_inf = list_entry(list, struct sedf_vcpu_info, list); sedf_dump_domain(d_inf->vcpu); } queue = EXTRAQ(i,EXTRA_PEN_Q); loop = 0; printk("\nEXTRAQ (penalty) rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); list_for_each_safe ( list, tmp, queue ) { d_inf = list_entry(list, struct sedf_vcpu_info, extralist[EXTRA_PEN_Q]); printk("%3d: ",loop++); sedf_dump_domain(d_inf->vcpu); } queue = EXTRAQ(i,EXTRA_UTIL_Q); loop = 0; printk("\nEXTRAQ (utilization) rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); list_for_each_safe ( list, tmp, queue ) { d_inf = list_entry(list, struct sedf_vcpu_info, extralist[EXTRA_UTIL_Q]); printk("%3d: ",loop++); sedf_dump_domain(d_inf->vcpu); } loop = 0; printk("\nnot on Q\n"); rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) { if ( (d->cpupool ? d->cpupool->sched : &sched_sedf_def) != ops ) continue; for_each_vcpu(d, ed) { if ( !__task_on_queue(ed) && (ed->processor == i) ) { printk("%3d: ",loop++); sedf_dump_domain(ed); } } } rcu_read_unlock(&domlist_read_lock); } /* Adjusts periods and slices of the domains accordingly to their weights */ static int sedf_adjust_weights(struct cpupool *c, int nr_cpus, int *sumw, s_time_t *sumt) { struct vcpu *p; struct domain *d; unsigned int cpu; /* * Sum across all weights. Notice that no runq locking is needed * here: the caller holds sedf_priv_info.lock and we're not changing * anything that is accessed during scheduling. 
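     *
     * Worked example with made-up numbers: a CPU hosting one time-driven
     * VCPU (period_orig 20ms, slice_orig 5ms) plus two weighted VCPUs of
     * weight 2 and 1 leaves this pass with sumt[cpu] = WEIGHT_PERIOD * 5 / 20
     * = 25ms and sumw[cpu] = 3; the second loop below then grants the
     * weighted VCPUs slices of 2 * (100ms - 5ms - 25ms) / 3 and
     * 1 * 70ms / 3 out of each WEIGHT_PERIOD.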
*/ rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool( d, c ) { for_each_vcpu( d, p ) { if ( (cpu = p->processor) >= nr_cpus ) continue; if ( EDOM_INFO(p)->weight ) { sumw[cpu] += EDOM_INFO(p)->weight; } else { /* * Don't modify domains who don't have a weight, but sum * up the time they need, projected to a WEIGHT_PERIOD, * so that this time is not given to the weight-driven * domains */ /* Check for overflows */ ASSERT((WEIGHT_PERIOD < ULONG_MAX) && (EDOM_INFO(p)->slice_orig < ULONG_MAX)); sumt[cpu] += (WEIGHT_PERIOD * EDOM_INFO(p)->slice_orig) / EDOM_INFO(p)->period_orig; } } } rcu_read_unlock(&domlist_read_lock); /* * Adjust all slices (and periods) to the new weight. Unlike above, we * need to take thr runq lock for the various VCPUs: we're modyfing * slice and period which are referenced during scheduling. */ rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool( d, c ) { for_each_vcpu ( d, p ) { if ( (cpu = p->processor) >= nr_cpus ) continue; if ( EDOM_INFO(p)->weight ) { /* Interrupts already off */ spinlock_t *lock = vcpu_schedule_lock(p); EDOM_INFO(p)->period_orig = EDOM_INFO(p)->period = WEIGHT_PERIOD; EDOM_INFO(p)->slice_orig = EDOM_INFO(p)->slice = (EDOM_INFO(p)->weight * (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[cpu])) / sumw[cpu]; vcpu_schedule_unlock(lock, p); } } } rcu_read_unlock(&domlist_read_lock); return 0; } /* Set or fetch domain scheduling parameters */ static int sedf_adjust(const struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op) { struct sedf_priv_info *prv = SEDF_PRIV(ops); unsigned long flags; unsigned int nr_cpus = cpumask_last(&cpu_online_map) + 1; int *sumw = xzalloc_array(int, nr_cpus); s_time_t *sumt = xzalloc_array(s_time_t, nr_cpus); struct vcpu *v; int rc = 0; /* * Serialize against the pluggable scheduler lock to protect from * concurrent updates. We need to take the runq lock for the VCPUs * as well, since we are touching extraweight, weight, slice and * period. As in sched_credit2.c, runq locks nest inside the * pluggable scheduler lock. */ spin_lock_irqsave(&prv->lock, flags); if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo ) { /* * These are used in sedf_adjust_weights() but have to be allocated in * this function, as we need to avoid nesting xmem_pool_alloc's lock * within our prv->lock. */ if ( !sumw || !sumt ) { /* Check for errors here, the _getinfo branch doesn't care */ rc = -ENOMEM; goto out; } /* Check for sane parameters */ if ( !op->u.sedf.period && !op->u.sedf.weight ) { rc = -EINVAL; goto out; } if ( op->u.sedf.weight ) { if ( (op->u.sedf.extratime & EXTRA_AWARE) && (!op->u.sedf.period) ) { /* Weight-driven domains with extratime only */ for_each_vcpu ( p, v ) { /* (Here and everywhere in the following) IRQs are already off, * hence vcpu_spin_lock() is the one. */ spinlock_t *lock = vcpu_schedule_lock(v); EDOM_INFO(v)->extraweight = op->u.sedf.weight; EDOM_INFO(v)->weight = 0; EDOM_INFO(v)->slice = 0; EDOM_INFO(v)->period = WEIGHT_PERIOD; vcpu_schedule_unlock(lock, v); } } else { /* Weight-driven domains with real-time execution */ for_each_vcpu ( p, v ) { spinlock_t *lock = vcpu_schedule_lock(v); EDOM_INFO(v)->weight = op->u.sedf.weight; vcpu_schedule_unlock(lock, v); } } } else { /* * Sanity checking: note that disabling extra weight requires * that we set a non-zero slice. 
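             *
             * The checks below accept PERIOD_MIN (10us) <= period <=
             * PERIOD_MAX (10s) and SLICE_MIN (5us) <= slice <= period. A
             * hypothetical time-driven request could therefore pass
             * op->u.sedf.period = MILLISECS(10) together with
             * op->u.sedf.slice = MILLISECS(2) to reserve 20% of a CPU.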
*/ if ( (op->u.sedf.period > PERIOD_MAX) || (op->u.sedf.period < PERIOD_MIN) || (op->u.sedf.slice > op->u.sedf.period) || (op->u.sedf.slice < SLICE_MIN) ) { rc = -EINVAL; goto out; } /* Time-driven domains */ for_each_vcpu ( p, v ) { spinlock_t *lock = vcpu_schedule_lock(v); EDOM_INFO(v)->weight = 0; EDOM_INFO(v)->extraweight = 0; EDOM_INFO(v)->period_orig = EDOM_INFO(v)->period = op->u.sedf.period; EDOM_INFO(v)->slice_orig = EDOM_INFO(v)->slice = op->u.sedf.slice; vcpu_schedule_unlock(lock, v); } } rc = sedf_adjust_weights(p->cpupool, nr_cpus, sumw, sumt); if ( rc ) goto out; for_each_vcpu ( p, v ) { spinlock_t *lock = vcpu_schedule_lock(v); EDOM_INFO(v)->status = (EDOM_INFO(v)->status & ~EXTRA_AWARE) | (op->u.sedf.extratime & EXTRA_AWARE); EDOM_INFO(v)->latency = op->u.sedf.latency; extraq_check(v); vcpu_schedule_unlock(lock, v); } } else if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) { if ( p->vcpu[0] == NULL ) { rc = -EINVAL; goto out; } op->u.sedf.period = EDOM_INFO(p->vcpu[0])->period; op->u.sedf.slice = EDOM_INFO(p->vcpu[0])->slice; op->u.sedf.extratime = EDOM_INFO(p->vcpu[0])->status & EXTRA_AWARE; op->u.sedf.latency = EDOM_INFO(p->vcpu[0])->latency; op->u.sedf.weight = EDOM_INFO(p->vcpu[0])->weight; } out: spin_unlock_irqrestore(&prv->lock, flags); xfree(sumt); xfree(sumw); return rc; } static struct sedf_priv_info _sedf_priv; const struct scheduler sched_sedf_def = { .name = "Simple EDF Scheduler", .opt_name = "sedf", .sched_id = XEN_SCHEDULER_SEDF, .sched_data = &_sedf_priv, .init_domain = sedf_init_domain, .destroy_domain = sedf_destroy_domain, .insert_vcpu = sedf_insert_vcpu, .alloc_vdata = sedf_alloc_vdata, .free_vdata = sedf_free_vdata, .alloc_pdata = sedf_alloc_pdata, .free_pdata = sedf_free_pdata, .alloc_domdata = sedf_alloc_domdata, .free_domdata = sedf_free_domdata, .init = sedf_init, .deinit = sedf_deinit, .do_schedule = sedf_do_schedule, .pick_cpu = sedf_pick_cpu, .dump_cpu_state = sedf_dump_cpu_state, .sleep = sedf_sleep, .wake = sedf_wake, .adjust = sedf_adjust, }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */