/*
 * xen/common/sched_null.c
 *
 * Copyright (c) 2017, Dario Faggioli, Citrix Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * The 'null' scheduler always chooses to run, on each pCPU, either nothing
 * (i.e., the pCPU stays idle) or always the same vCPU.
 *
 * It is aimed at supporting static scenarios, where there always are
 * fewer vCPUs than pCPUs (and the vCPUs don't need to move among pCPUs
 * for any reason), with the least possible overhead.
 *
 * Typical use cases are embedded applications, but also HPC, especially
 * if the scheduler is used inside a cpupool.
 */

#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/keyhandler.h>
#include <xen/trace.h>

/*
 * null tracing events. Check include/public/trace.h for more details.
 */
#define TRC_SNULL_PICKED_CPU    TRC_SCHED_CLASS_EVT(SNULL, 1)
#define TRC_SNULL_VCPU_ASSIGN   TRC_SCHED_CLASS_EVT(SNULL, 2)
#define TRC_SNULL_VCPU_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3)
#define TRC_SNULL_MIGRATE       TRC_SCHED_CLASS_EVT(SNULL, 4)
#define TRC_SNULL_SCHEDULE      TRC_SCHED_CLASS_EVT(SNULL, 5)
#define TRC_SNULL_TASKLET       TRC_SCHED_CLASS_EVT(SNULL, 6)

/*
 * Locking:
 * - Scheduler-lock (a.k.a. runqueue lock):
 *  + is per-pCPU;
 *  + serializes assignment and deassignment of vCPUs to a pCPU.
 * - Private data lock (a.k.a. private scheduler lock):
 *  + is scheduler-wide;
 *  + serializes accesses to the list of domains in this scheduler.
 * - Waitqueue lock:
 *  + is scheduler-wide;
 *  + serializes accesses to the list of vCPUs waiting to be assigned
 *    to pCPUs.
 *
 * Ordering is: private lock, runqueue lock, waitqueue lock. Or, in other
 * words, the waitqueue lock nests inside the runqueue lock, which nests
 * inside the private lock. More specifically:
 *  + if we need both the runqueue and the private lock, we must acquire
 *    the private lock first;
 *  + if we need both the runqueue and the waitqueue lock, we must acquire
 *    the runqueue lock first;
 *  + if we need both the private and the waitqueue lock, we must acquire
 *    the private lock first;
 *  + if we already own a runqueue lock, we must never acquire
 *    the private lock;
 *  + if we already own the waitqueue lock, we must never acquire
 *    the runqueue lock or the private lock.
 */
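
/*
 * Purely illustrative sketch, not used anywhere in this file: the nesting
 * to follow when more than one of the locks described above is needed.
 * The helper name and its three spinlock parameters are hypothetical
 * stand-ins for the private, runqueue and waitqueue locks.
 */
static inline void null_lock_nesting_example(spinlock_t *private_lock,
                                             spinlock_t *runq_lock,
                                             spinlock_t *waitq_lock)
{
    spin_lock(private_lock); /* outermost: scheduler-wide private lock */
    spin_lock(runq_lock);    /* then: the per-pCPU runqueue lock */
    spin_lock(waitq_lock);   /* innermost: the waitqueue lock */

    /* ... work on the domain list, pCPU assignments and the waitqueue ... */

    spin_unlock(waitq_lock); /* release in the reverse order */
    spin_unlock(runq_lock);
    spin_unlock(private_lock);
}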

/*
 * System-wide private data
 */
struct null_private {
    spinlock_t lock;        /* scheduler lock; nests inside cpupool_lock */
    struct list_head ndom;  /* Domains of this scheduler                 */
    struct list_head waitq; /* vCPUs not assigned to any pCPU            */
    spinlock_t waitq_lock;  /* serializes waitq; nests inside runq locks */
    cpumask_t cpus_free;    /* CPUs without a vCPU associated to them    */
};

/*
 * Physical CPU
 */
struct null_pcpu {
    struct vcpu *vcpu;
};
DEFINE_PER_CPU(struct null_pcpu, npc);

/*
 * Virtual CPU
 */
struct null_vcpu {
    struct list_head waitq_elem;
    struct vcpu *vcpu;
};

/*
 * Domain
 */
struct null_dom {
    struct list_head ndom_elem;
    struct domain *dom;
};

/*
 * Accessor helper functions
 */
static inline struct null_private *null_priv(const struct scheduler *ops)
{
    return ops->sched_data;
}

static inline struct null_vcpu *null_vcpu(const struct vcpu *v)
{
    return v->sched_priv;
}

static inline struct null_dom *null_dom(const struct domain *d)
{
    return d->sched_priv;
}

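/*
 * Check whether pCPU cpu is usable by vCPU v for the given affinity
 * balancing step, i.e., whether it is in the (hard or soft) affinity
 * of v, restricted to the CPUs of v's cpupool.
 */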
static inline bool vcpu_check_affinity(struct vcpu *v, unsigned int cpu,
                                       unsigned int balance_step)
{
    affinity_balance_cpumask(v, balance_step, cpumask_scratch_cpu(cpu));
    cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                cpupool_domain_cpumask(v->domain));

    return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu));
}

static int null_init(struct scheduler *ops)
{
    struct null_private *prv;

    printk("Initializing null scheduler\n"
           "WARNING: This is experimental software in development.\n"
           "Use at your own risk.\n");

    prv = xzalloc(struct null_private);
    if ( prv == NULL )
        return -ENOMEM;

    spin_lock_init(&prv->lock);
    spin_lock_init(&prv->waitq_lock);
    INIT_LIST_HEAD(&prv->ndom);
    INIT_LIST_HEAD(&prv->waitq);

    ops->sched_data = prv;

    return 0;
}

static void null_deinit(struct scheduler *ops)
{
    xfree(ops->sched_data);
    ops->sched_data = NULL;
}

static void init_pdata(struct null_private *prv, unsigned int cpu)
{
    /* Mark the pCPU as free, and with no vCPU assigned */
    cpumask_set_cpu(cpu, &prv->cpus_free);
    per_cpu(npc, cpu).vcpu = NULL;
}

static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
{
    struct null_private *prv = null_priv(ops);
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);

    /* alloc_pdata is not implemented, so we want this to be NULL. */
    ASSERT(!pdata);

    /*
     * The scheduler lock points already to the default per-cpu spinlock,
     * so there is no remapping to be done.
     */
    ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock));

    init_pdata(prv, cpu);
}

static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
    struct null_private *prv = null_priv(ops);

    /* alloc_pdata not implemented, so this must have stayed NULL */
    ASSERT(!pcpu);

    cpumask_clear_cpu(cpu, &prv->cpus_free);
    per_cpu(npc, cpu).vcpu = NULL;
}

static void *null_alloc_vdata(const struct scheduler *ops,
                              struct vcpu *v, void *dd)
{
    struct null_vcpu *nvc;

    nvc = xzalloc(struct null_vcpu);
    if ( nvc == NULL )
        return NULL;

    INIT_LIST_HEAD(&nvc->waitq_elem);
    nvc->vcpu = v;

    SCHED_STAT_CRANK(vcpu_alloc);

    return nvc;
}

static void null_free_vdata(const struct scheduler *ops, void *priv)
{
    struct null_vcpu *nvc = priv;

    xfree(nvc);
}

static void * null_alloc_domdata(const struct scheduler *ops,
                                 struct domain *d)
{
    struct null_private *prv = null_priv(ops);
    struct null_dom *ndom;
    unsigned long flags;

    ndom = xzalloc(struct null_dom);
    if ( ndom == NULL )
        return NULL;

    ndom->dom = d;

    spin_lock_irqsave(&prv->lock, flags);
    list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom);
    spin_unlock_irqrestore(&prv->lock, flags);

    return (void*)ndom;
}

static void null_free_domdata(const struct scheduler *ops, void *data)
{
    unsigned long flags;
    struct null_dom *ndom = data;
    struct null_private *prv = null_priv(ops);

    spin_lock_irqsave(&prv->lock, flags);
    list_del_init(&ndom->ndom_elem);
    spin_unlock_irqrestore(&prv->lock, flags);

    xfree(data);
}

static int null_dom_init(const struct scheduler *ops, struct domain *d)
{
    struct null_dom *ndom;

    if ( is_idle_domain(d) )
        return 0;

    ndom = null_alloc_domdata(ops, d);
    if ( ndom == NULL )
        return -ENOMEM;

    d->sched_priv = ndom;

    return 0;
}

static void null_dom_destroy(const struct scheduler *ops, struct domain *d)
{
    null_free_domdata(ops, null_dom(d));
}

/*
 * vCPU to pCPU assignment and placement. This _only_ happens:
 *  - on insert,
 *  - on migrate.
 *
 * Insert occurs when a vCPU joins this scheduler for the first time
 * (e.g., when the domain it's part of is moved to the scheduler's
 * cpupool).
 *
 * Migration may be necessary if a pCPU (with a vCPU assigned to it)
 * is removed from the scheduler's cpupool.
 *
 * So this is not part of any hot path.
 */
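
/*
 * Pick the pCPU where v should run: preferably v's current pCPU, if it is
 * free (or already running v) and within v's affinity; otherwise a free
 * pCPU within v's affinity; as a last resort, any valid pCPU, even if
 * busy (the caller only actually assigns v to the returned pCPU if the
 * latter is free).
 */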
static unsigned int pick_cpu(struct null_private *prv, struct vcpu *v)
{
    unsigned int bs;
    unsigned int cpu = v->processor, new_cpu;
    cpumask_t *cpus = cpupool_domain_cpumask(v->domain);

    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));

    for_each_affinity_balance_step( bs )
    {
        if ( bs == BALANCE_SOFT_AFFINITY &&
             !has_soft_affinity(v, v->cpu_hard_affinity) )
            continue;

        affinity_balance_cpumask(v, bs, cpumask_scratch_cpu(cpu));
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus);

        /*
         * If our processor is free (or we are already assigned to it), and
         * it is also still valid and part of our affinity, just go for it.
         * (Note that we could use vcpu_check_affinity() here, but we
         * deliberately don't, so that the scratch cpumask keeps what we
         * have just put in it.)
         */
        if ( likely((per_cpu(npc, cpu).vcpu == NULL || per_cpu(npc, cpu).vcpu == v)
                    && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
        {
            new_cpu = cpu;
            goto out;
        }

        /* If not, just go for a free pCPU, within our affinity, if any */
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                    &prv->cpus_free);
        new_cpu = cpumask_first(cpumask_scratch_cpu(cpu));

        if ( likely(new_cpu != nr_cpu_ids) )
            goto out;
    }

    /*
     * If we didn't find any free pCPU, just pick any valid pCPU, even if
     * it has another vCPU assigned. This will happen during shutdown and
     * suspend/resume, but it may also happen during "normal operation", if
     * all the pCPUs are busy.
     *
     * In fact, there must always be something sane in v->processor, or
     * vcpu_schedule_lock() and friends won't work. This is not a problem,
     * as we will actually assign the vCPU to the pCPU we return from here
     * only if the pCPU is free.
     */
    cpumask_and(cpumask_scratch_cpu(cpu), cpus, v->cpu_hard_affinity);
    new_cpu = cpumask_any(cpumask_scratch_cpu(cpu));

 out:
    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t vcpu, dom;
            uint32_t new_cpu;
        } d;
        d.dom = v->domain->domain_id;
        d.vcpu = v->vcpu_id;
        d.new_cpu = new_cpu;
        __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d);
    }

    return new_cpu;
}

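/*
 * Assign vCPU v to pCPU cpu: record the assignment in the per-pCPU data,
 * update v->processor, and mark the pCPU as no longer free.
 */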
static void vcpu_assign(struct null_private *prv, struct vcpu *v,
                        unsigned int cpu)
{
    per_cpu(npc, cpu).vcpu = v;
    v->processor = cpu;
    cpumask_clear_cpu(cpu, &prv->cpus_free);

    dprintk(XENLOG_G_INFO, "%d <-- d%dv%d\n", cpu, v->domain->domain_id, v->vcpu_id);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t vcpu, dom;
            uint32_t cpu;
        } d;
        d.dom = v->domain->domain_id;
        d.vcpu = v->vcpu_id;
        d.cpu = cpu;
        __trace_var(TRC_SNULL_VCPU_ASSIGN, 1, sizeof(d), &d);
    }
}

static void vcpu_deassign(struct null_private *prv, struct vcpu *v,
                          unsigned int cpu)
{
    per_cpu(npc, cpu).vcpu = NULL;
    cpumask_set_cpu(cpu, &prv->cpus_free);

    dprintk(XENLOG_G_INFO, "%d <-- NULL (d%dv%d)\n", cpu, v->domain->domain_id, v->vcpu_id);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t vcpu, dom;
            uint32_t cpu;
        } d;
        d.dom = v->domain->domain_id;
        d.vcpu = v->vcpu_id;
        d.cpu = cpu;
        __trace_var(TRC_SNULL_VCPU_DEASSIGN, 1, sizeof(d), &d);
    }
}

/* Change the scheduler of cpu to us (null). */
static void null_switch_sched(struct scheduler *new_ops, unsigned int cpu,
                              void *pdata, void *vdata)
{
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
    struct null_private *prv = null_priv(new_ops);
    struct null_vcpu *nvc = vdata;

    ASSERT(nvc && is_idle_vcpu(nvc->vcpu));

    idle_vcpu[cpu]->sched_priv = vdata;

    /*
     * We are holding the runqueue lock already (it's been taken in
     * schedule_cpu_switch()). It actually may or may not be the 'right'
     * one for this cpu, but that is ok for preventing races.
     */
    ASSERT(!local_irq_is_enabled());

    init_pdata(prv, cpu);

    per_cpu(scheduler, cpu) = new_ops;
    per_cpu(schedule_data, cpu).sched_priv = pdata;

    /*
     * (Re?)route the lock to the per-pCPU lock as /last/ thing. In fact,
     * if it is free (and it can be) we want anyone that manages to take
     * it to find all the initializations we've done above in place.
     */
    smp_mb();
    sd->schedule_lock = &sd->_lock;
}

static void null_vcpu_insert(const struct scheduler *ops, struct vcpu *v)
{
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *nvc = null_vcpu(v);
    unsigned int cpu;
    spinlock_t *lock;

    ASSERT(!is_idle_vcpu(v));

    lock = vcpu_schedule_lock_irq(v);
 retry:

    cpu = v->processor = pick_cpu(prv, v);

    spin_unlock(lock);

    lock = vcpu_schedule_lock(v);

    cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity,
                cpupool_domain_cpumask(v->domain));

    /* If the pCPU is free, we assign v to it */
    if ( likely(per_cpu(npc, cpu).vcpu == NULL) )
    {
        /*
         * Insert is followed by vcpu_wake(), so there's no need to poke
         * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that.
         */
        vcpu_assign(prv, v, cpu);
    }
    else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) )
    {
        /*
         * If the pCPU is not free (e.g., because we raced with another
         * insert or a migrate), but there are other free pCPUs, we can
         * try to pick again.
         */
        goto retry;
    }
    else
    {
        /*
         * If the pCPU is not free, and there aren't any (valid) others,
         * we have no alternative but to put v in the waitqueue.
         */
        spin_lock(&prv->waitq_lock);
        list_add_tail(&nvc->waitq_elem, &prv->waitq);
        dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
                v->domain->domain_id, v->vcpu_id);
        spin_unlock(&prv->waitq_lock);
    }
    spin_unlock_irq(lock);

    SCHED_STAT_CRANK(vcpu_insert);
}

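/*
 * Deassign v from the pCPU it is assigned to and, if there is a suitable
 * vCPU in the waitqueue (vCPUs with soft-affinity to that pCPU are
 * preferred), assign it to the now free pCPU in v's place.
 */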
static void _vcpu_remove(struct null_private *prv, struct vcpu *v)
{
    unsigned int bs;
    unsigned int cpu = v->processor;
    struct null_vcpu *wvc;

    ASSERT(list_empty(&null_vcpu(v)->waitq_elem));

    vcpu_deassign(prv, v, cpu);

    spin_lock(&prv->waitq_lock);

    /*
     * Now that the pCPU v was assigned to is free, let's see if there is
     * someone waiting that is suitable to be assigned to it (prioritizing
     * vcpus that have soft-affinity with cpu).
     */
    for_each_affinity_balance_step( bs )
    {
        list_for_each_entry( wvc, &prv->waitq, waitq_elem )
        {
            if ( bs == BALANCE_SOFT_AFFINITY &&
                 !has_soft_affinity(wvc->vcpu, wvc->vcpu->cpu_hard_affinity) )
                continue;

            if ( vcpu_check_affinity(wvc->vcpu, cpu, bs) )
            {
                list_del_init(&wvc->waitq_elem);
                vcpu_assign(prv, wvc->vcpu, cpu);
                cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
                spin_unlock(&prv->waitq_lock);
                return;
            }
        }
    }
    spin_unlock(&prv->waitq_lock);
}

static void null_vcpu_remove(const struct scheduler *ops, struct vcpu *v)
{
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *nvc = null_vcpu(v);
    spinlock_t *lock;

    ASSERT(!is_idle_vcpu(v));

    lock = vcpu_schedule_lock_irq(v);

    /* If v is in the waitqueue, just get it out of there and bail */
    if ( unlikely(!list_empty(&nvc->waitq_elem)) )
    {
        spin_lock(&prv->waitq_lock);
        list_del_init(&nvc->waitq_elem);
        spin_unlock(&prv->waitq_lock);

        goto out;
    }

    ASSERT(per_cpu(npc, v->processor).vcpu == v);
    ASSERT(!cpumask_test_cpu(v->processor, &prv->cpus_free));

    _vcpu_remove(prv, v);

 out:
    vcpu_schedule_unlock_irq(lock, v);

    SCHED_STAT_CRANK(vcpu_remove);
}

static void null_vcpu_wake(const struct scheduler *ops, struct vcpu *v)
{
    ASSERT(!is_idle_vcpu(v));

    if ( unlikely(curr_on_cpu(v->processor) == v) )
    {
        SCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }

    if ( unlikely(!list_empty(&null_vcpu(v)->waitq_elem)) )
    {
        /* Not exactly "on runq", but close enough for reusing the counter */
        SCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(v)) )
        SCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        SCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /* Note that we get here only for vCPUs assigned to a pCPU */
    cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ);
}

static void null_vcpu_sleep(const struct scheduler *ops, struct vcpu *v)
{
    ASSERT(!is_idle_vcpu(v));

    /* If v is not assigned to a pCPU, or is not running, no need to bother */
    if ( curr_on_cpu(v->processor) == v )
        cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ);

    SCHED_STAT_CRANK(vcpu_sleep);
}

static int null_cpu_pick(const struct scheduler *ops, struct vcpu *v)
{
    ASSERT(!is_idle_vcpu(v));
    return pick_cpu(null_priv(ops), v);
}

static void null_vcpu_migrate(const struct scheduler *ops, struct vcpu *v,
                              unsigned int new_cpu)
{
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *nvc = null_vcpu(v);

    ASSERT(!is_idle_vcpu(v));

    if ( v->processor == new_cpu )
        return;

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t vcpu, dom;
            uint16_t cpu, new_cpu;
        } d;
        d.dom = v->domain->domain_id;
        d.vcpu = v->vcpu_id;
        d.cpu = v->processor;
        d.new_cpu = new_cpu;
        __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d);
    }

    /*
     * v is either assigned to a pCPU, or in the waitqueue.
     *
     * In the former case, the pCPU to which it was assigned would
     * become free, and we, therefore, should check whether there is
     * anyone in the waitqueue that can be assigned to it.
     *
     * In the latter case, there is just nothing to do.
     */
    if ( likely(list_empty(&nvc->waitq_elem)) )
    {
        _vcpu_remove(prv, v);
        SCHED_STAT_CRANK(migrate_running);
    }
    else
        SCHED_STAT_CRANK(migrate_on_runq);

    SCHED_STAT_CRANK(migrated);

    /*
     * Let's now consider new_cpu, which is where v is being sent. It can be
     * either free, or have a vCPU already assigned to it.
     *
     * In the former case, we should assign v to it, and try to get it to run,
     * if possible, according to affinity.
     *
     * In the latter case, all we can do is park v in the waitqueue.
     */
    if ( per_cpu(npc, new_cpu).vcpu == NULL &&
         vcpu_check_affinity(v, new_cpu, BALANCE_HARD_AFFINITY) )
    {
        /* v might have been in the waitqueue, so remove it */
        spin_lock(&prv->waitq_lock);
        list_del_init(&nvc->waitq_elem);
        spin_unlock(&prv->waitq_lock);

        vcpu_assign(prv, v, new_cpu);
    }
    else
    {
        /* Put v in the waitqueue, if it wasn't there already */
        spin_lock(&prv->waitq_lock);
        if ( list_empty(&nvc->waitq_elem) )
        {
            list_add_tail(&nvc->waitq_elem, &prv->waitq);
            dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
                    v->domain->domain_id, v->vcpu_id);
        }
        spin_unlock(&prv->waitq_lock);
    }

    /*
     * Whatever happened above, we always at least update v->processor.
     * This is especially important for shutdown and suspend/resume paths,
     * where it is important to let our caller (cpu_disable_scheduler())
     * know that the migration did happen, to the best of our ability, at
     * least. In case of suspend, any temporary inconsistency caused by
     * this will be fixed up during resume.
     */
    v->processor = new_cpu;
}

#ifndef NDEBUG
static inline void null_vcpu_check(struct vcpu *v)
{
    struct null_vcpu * const nvc = null_vcpu(v);
    struct null_dom * const ndom = null_dom(v->domain);

    BUG_ON(nvc->vcpu != v);

    if ( ndom )
        BUG_ON(is_idle_vcpu(v));
    else
        BUG_ON(!is_idle_vcpu(v));

    SCHED_STAT_CRANK(vcpu_check);
}
#define NULL_VCPU_CHECK(v)  (null_vcpu_check(v))
#else
#define NULL_VCPU_CHECK(v)
#endif


/*
 * The simplest scheduling function of all times! We either return:
 *  - the vCPU assigned to the pCPU, if there's one and it can run;
 *  - the idle vCPU, otherwise.
 */
static struct task_slice null_schedule(const struct scheduler *ops,
                                       s_time_t now,
                                       bool_t tasklet_work_scheduled)
{
    unsigned int bs;
    const unsigned int cpu = smp_processor_id();
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *wvc;
    struct task_slice ret;

    SCHED_STAT_CRANK(schedule);
    NULL_VCPU_CHECK(current);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t tasklet, cpu;
            int16_t vcpu, dom;
        } d;
        d.cpu = cpu;
        d.tasklet = tasklet_work_scheduled;
        if ( per_cpu(npc, cpu).vcpu == NULL )
        {
            d.vcpu = d.dom = -1;
        }
        else
        {
            d.vcpu = per_cpu(npc, cpu).vcpu->vcpu_id;
            d.dom = per_cpu(npc, cpu).vcpu->domain->domain_id;
        }
        __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d);
    }

    if ( tasklet_work_scheduled )
    {
        trace_var(TRC_SNULL_TASKLET, 1, 0, NULL);
        ret.task = idle_vcpu[cpu];
    }
    else
        ret.task = per_cpu(npc, cpu).vcpu;
    ret.migrated = 0;
    ret.time = -1;

    /*
     * We may be new in the cpupool, or just coming back online. In which
     * case, there may be vCPUs in the waitqueue that we can assign to this
     * pCPU and run.
     */
    if ( unlikely(ret.task == NULL) )
    {
        spin_lock(&prv->waitq_lock);

        if ( list_empty(&prv->waitq) )
            goto unlock;

        /*
         * We scan the waitqueue twice, to prioritize vcpus that have
         * soft-affinity with cpu. This may look like something expensive to
         * do here in null_schedule(), but it's actually fine, because we do
         * it only in cases where the pcpu has no vcpu associated (e.g., as
         * said above, the cpu has just joined a cpupool).
         */
        for_each_affinity_balance_step( bs )
        {
            list_for_each_entry( wvc, &prv->waitq, waitq_elem )
            {
                if ( bs == BALANCE_SOFT_AFFINITY &&
                     !has_soft_affinity(wvc->vcpu, wvc->vcpu->cpu_hard_affinity) )
                    continue;

                if ( vcpu_check_affinity(wvc->vcpu, cpu, bs) )
                {
                    vcpu_assign(prv, wvc->vcpu, cpu);
                    list_del_init(&wvc->waitq_elem);
                    ret.task = wvc->vcpu;
                    goto unlock;
                }
            }
        }
 unlock:
        spin_unlock(&prv->waitq_lock);
    }

    if ( unlikely(ret.task == NULL || !vcpu_runnable(ret.task)) )
        ret.task = idle_vcpu[cpu];

    NULL_VCPU_CHECK(ret.task);
    return ret;
}

static inline void dump_vcpu(struct null_private *prv, struct null_vcpu *nvc)
{
    printk("[%i.%i] pcpu=%d", nvc->vcpu->domain->domain_id,
           nvc->vcpu->vcpu_id, list_empty(&nvc->waitq_elem) ?
                               nvc->vcpu->processor : -1);
}

static void null_dump_pcpu(const struct scheduler *ops, int cpu)
{
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *nvc;
    spinlock_t *lock;
    unsigned long flags;
#define cpustr keyhandler_scratch

    lock = pcpu_schedule_lock_irqsave(cpu, &flags);

    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
    printk("CPU[%02d] sibling=%s, ", cpu, cpustr);
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
    printk("core=%s", cpustr);
    if ( per_cpu(npc, cpu).vcpu != NULL )
        printk(", vcpu=d%dv%d", per_cpu(npc, cpu).vcpu->domain->domain_id,
               per_cpu(npc, cpu).vcpu->vcpu_id);
    printk("\n");

    /* current VCPU (nothing to say if that's the idle vcpu) */
    nvc = null_vcpu(curr_on_cpu(cpu));
    if ( nvc && !is_idle_vcpu(nvc->vcpu) )
    {
        printk("\trun: ");
        dump_vcpu(prv, nvc);
        printk("\n");
    }

    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
#undef cpustr
}

static void null_dump(const struct scheduler *ops)
{
    struct null_private *prv = null_priv(ops);
    struct list_head *iter;
    unsigned long flags;
    unsigned int loop;
#define cpustr keyhandler_scratch

    spin_lock_irqsave(&prv->lock, flags);

    cpulist_scnprintf(cpustr, sizeof(cpustr), &prv->cpus_free);
    printk("\tcpus_free = %s\n", cpustr);

    printk("Domain info:\n");
    loop = 0;
    list_for_each( iter, &prv->ndom )
    {
        struct null_dom *ndom;
        struct vcpu *v;

        ndom = list_entry(iter, struct null_dom, ndom_elem);

        printk("\tDomain: %d\n", ndom->dom->domain_id);
        for_each_vcpu( ndom->dom, v )
        {
            struct null_vcpu * const nvc = null_vcpu(v);
            spinlock_t *lock;

            lock = vcpu_schedule_lock(nvc->vcpu);

            printk("\t%3d: ", ++loop);
            dump_vcpu(prv, nvc);
            printk("\n");

            vcpu_schedule_unlock(lock, nvc->vcpu);
        }
    }

    printk("Waitqueue: ");
    loop = 0;
    spin_lock(&prv->waitq_lock);
    list_for_each( iter, &prv->waitq )
    {
        struct null_vcpu *nvc = list_entry(iter, struct null_vcpu, waitq_elem);

        if ( loop++ != 0 )
            printk(", ");
        if ( loop % 24 == 0 )
            printk("\n\t");
        printk("d%dv%d", nvc->vcpu->domain->domain_id, nvc->vcpu->vcpu_id);
    }
    printk("\n");
    spin_unlock(&prv->waitq_lock);

    spin_unlock_irqrestore(&prv->lock, flags);
#undef cpustr
}

const struct scheduler sched_null_def = {
    .name           = "null Scheduler",
    .opt_name       = "null",
    .sched_id       = XEN_SCHEDULER_NULL,
    .sched_data     = NULL,

    .init           = null_init,
    .deinit         = null_deinit,
    .init_pdata     = null_init_pdata,
    .switch_sched   = null_switch_sched,
    .deinit_pdata   = null_deinit_pdata,

    .alloc_vdata    = null_alloc_vdata,
    .free_vdata     = null_free_vdata,
    .alloc_domdata  = null_alloc_domdata,
    .free_domdata   = null_free_domdata,

    .init_domain    = null_dom_init,
    .destroy_domain = null_dom_destroy,

    .insert_vcpu    = null_vcpu_insert,
    .remove_vcpu    = null_vcpu_remove,

    .wake           = null_vcpu_wake,
    .sleep          = null_vcpu_sleep,
    .pick_cpu       = null_cpu_pick,
    .migrate        = null_vcpu_migrate,
    .do_schedule    = null_schedule,

    .dump_cpu_state = null_dump_pcpu,
    .dump_settings  = null_dump,
};

REGISTER_SCHEDULER(sched_null_def);