Scheduler statistics (schedstats) for linux-2.6.0-test4.

Adds a CONFIG_SCHEDSTATS option to the i386 Kconfig.  When it is enabled,
counters in a per-cpu "struct schedstat" array are bumped from
sys_sched_yield(), schedule(), load_balance(), pull_task(), balance_node()
and rebalance_tick(), and are exported through a "schedstat" entry that
fs/proc/proc_misc.c registers next to the other simple read_proc files.
struct runqueue gains a "cpu" member (set in sched_init()) so pull_task()
can charge the source cpu from a runqueue pointer.

Output format (SCHEDSTAT_VERSION 2): one "version" line, one "cpuN" line
per online cpu, and a final "totals" line summing the online cpus.  Every
cpuN/totals line carries 20 counters in this order:

    yld_both_empty yld_act_empty yld_exp_empty yld_cnt
    sched_noswitch sched_switch sched_cnt
    lb_idle lb_busy lb_resched lb_cnt lb_imbalance lb_nobusy lb_bnode
    pt_gained pt_lost pt_node_gained pt_node_lost
    bn_cnt bn_idle

Review fixes folded into this version of the patch:

  * rebalance_tick(): the brace opened around the idle balance_node()
    call was never closed, leaving unbalanced braces under CONFIG_NUMA.
  * schedule(): sched_noswitch is now counted only when the active and
    expired arrays were not switched, so it no longer overlaps
    sched_switch.
  * Kconfig help text: missing full stop added.

diff -rupN linux-2.6.0-test4/arch/i386/Kconfig linux-2.6.0-test4-ss/arch/i386/Kconfig
--- linux-2.6.0-test4/arch/i386/Kconfig	Fri Aug 22 16:52:28 2003
+++ linux-2.6.0-test4-ss/arch/i386/Kconfig	Thu Sep 4 21:33:37 2003
@@ -1343,6 +1343,18 @@ config FRAME_POINTER
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame pointers.
 
+config SCHEDSTATS
+	bool "Collect scheduler statistics"
+	help
+	  If you say Y here, additional code will be inserted into the
+	  scheduler and related routines to collect statistics about
+	  scheduler behavior and provide them in /proc/schedstat. These
+	  stats may be useful for both tuning and debugging the scheduler.
+	  If you aren't debugging the scheduler or trying to tune a specific
+	  application, you can say N to avoid the very slight overhead
+	  this adds.
+	default n
+
 config X86_EXTRA_IRQS
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
diff -rupN linux-2.6.0-test4/fs/proc/proc_misc.c linux-2.6.0-test4-ss/fs/proc/proc_misc.c
--- linux-2.6.0-test4/fs/proc/proc_misc.c	Fri Aug 22 16:52:23 2003
+++ linux-2.6.0-test4-ss/fs/proc/proc_misc.c	Thu Sep 4 21:33:37 2003
@@ -286,6 +286,11 @@ static struct file_operations proc_vmsta
 	.release = seq_release,
 };
 
+#ifdef CONFIG_SCHEDSTATS
+extern int schedstats_read_proc(char *page, char **start, off_t off,
+				 int count, int *eof, void *data);
+#endif
+
 #ifdef CONFIG_PROC_HARDWARE
 static int hardware_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
@@ -635,6 +640,9 @@ void __init proc_misc_init(void)
 #endif
 		{"locks", locks_read_proc},
 		{"execdomains", execdomains_read_proc},
+#ifdef CONFIG_SCHEDSTATS
+		{"schedstat", schedstats_read_proc},
+#endif
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
diff -rupN linux-2.6.0-test4/kernel/sched.c linux-2.6.0-test4-ss/kernel/sched.c
--- linux-2.6.0-test4/kernel/sched.c	Fri Aug 22 16:58:43 2003
+++ linux-2.6.0-test4-ss/kernel/sched.c	Thu Sep 4 21:58:07 2003
@@ -160,6 +160,7 @@ struct runqueue {
 	unsigned long nr_running, nr_switches, expired_timestamp,
 			nr_uninterruptible;
 	task_t *curr, *idle;
+	int cpu;	/* to make easy reverse-lookups with per-cpu runqueues */
 	struct mm_struct *prev_mm;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_cpu_load[NR_CPUS];
@@ -235,6 +236,113 @@ __init void node_nr_running_init(void)
 
 #endif /* CONFIG_NUMA */
 
+
+#ifdef CONFIG_SCHEDSTATS
+struct schedstat {
+	/* sys_sched_yield stats */
+	unsigned long yld_exp_empty;
+	unsigned long yld_act_empty;
+	unsigned long yld_both_empty;
+	unsigned long yld_cnt;
+
+	/* schedule stats */
+	unsigned long sched_noswitch;
+	unsigned long sched_switch;
+	unsigned long sched_cnt;
+
+	/* load_balance stats */
+	unsigned long lb_imbalance;
+	unsigned long lb_idle;
+	unsigned long lb_busy;
+	unsigned long lb_resched;
+	unsigned long lb_cnt;
+	unsigned long lb_nobusy;
+	unsigned long lb_bnode;
+
+	/* pull_task stats */
+	unsigned long pt_gained;
+	unsigned long pt_lost;
+	unsigned long pt_node_gained;
+	unsigned long pt_node_lost;
+
+	/* balance_node stats */
+	unsigned long bn_cnt;
+	unsigned long bn_idle;
+} ____cacheline_aligned;
+
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 2
+
+struct schedstat schedstats[NR_CPUS];
+
+/*
+ * This could conceivably exceed a page's worth of output on machines with
+ * large number of cpus, where large == about 4096/100 or 40ish. Start
+ * worrying when we pass 32, probably. Then this has to stop being a
+ * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file.
+ */
+int schedstats_read_proc(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{
+	struct schedstat sums;
+	int i, len;
+
+	memset(&sums, 0, sizeof(sums));
+	len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION);
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_online(i)) continue;
+		sums.yld_exp_empty += schedstats[i].yld_exp_empty;
+		sums.yld_act_empty += schedstats[i].yld_act_empty;
+		sums.yld_both_empty += schedstats[i].yld_both_empty;
+		sums.yld_cnt += schedstats[i].yld_cnt;
+		sums.sched_noswitch += schedstats[i].sched_noswitch;
+		sums.sched_switch += schedstats[i].sched_switch;
+		sums.sched_cnt += schedstats[i].sched_cnt;
+		sums.lb_idle += schedstats[i].lb_idle;
+		sums.lb_busy += schedstats[i].lb_busy;
+		sums.lb_resched += schedstats[i].lb_resched;
+		sums.lb_cnt += schedstats[i].lb_cnt;
+		sums.lb_imbalance += schedstats[i].lb_imbalance;
+		sums.lb_nobusy += schedstats[i].lb_nobusy;
+		sums.lb_bnode += schedstats[i].lb_bnode;
+		sums.pt_node_gained += schedstats[i].pt_node_gained;
+		sums.pt_node_lost += schedstats[i].pt_node_lost;
+		sums.pt_gained += schedstats[i].pt_gained;
+		sums.pt_lost += schedstats[i].pt_lost;
+		sums.bn_cnt += schedstats[i].bn_cnt;
+		sums.bn_idle += schedstats[i].bn_idle;
+		len += sprintf(page + len,
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+		    "%lu %lu %lu %lu %lu %lu %lu %lu\n",
+		    i, schedstats[i].yld_both_empty,
+		    schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty,
+		    schedstats[i].yld_cnt, schedstats[i].sched_noswitch,
+		    schedstats[i].sched_switch, schedstats[i].sched_cnt,
+		    schedstats[i].lb_idle, schedstats[i].lb_busy,
+		    schedstats[i].lb_resched,
+		    schedstats[i].lb_cnt, schedstats[i].lb_imbalance,
+		    schedstats[i].lb_nobusy, schedstats[i].lb_bnode,
+		    schedstats[i].pt_gained, schedstats[i].pt_lost,
+		    schedstats[i].pt_node_gained, schedstats[i].pt_node_lost,
+		    schedstats[i].bn_cnt, schedstats[i].bn_idle);
+	}
+	len += sprintf(page + len,
+	    "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+	    "%lu %lu %lu %lu %lu %lu %lu\n",
+	    sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty,
+	    sums.yld_cnt, sums.sched_noswitch, sums.sched_switch,
+	    sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched,
+	    sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusy, sums.lb_bnode,
+	    sums.pt_gained, sums.pt_lost, sums.pt_node_gained,
+	    sums.pt_node_lost, sums.bn_cnt, sums.bn_idle);
+
+	return len;
+}
+#endif
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -986,6 +1094,14 @@ out:
  */
 static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 {
+#ifdef CONFIG_SCHEDSTATS
+	if (cpu_to_node(this_cpu) != cpu_to_node(src_rq->cpu)) {
+		schedstats[this_cpu].pt_node_gained++;
+		schedstats[src_rq->cpu].pt_node_lost++;
+	}
+	schedstats[this_cpu].pt_gained++;
+	schedstats[src_rq->cpu].pt_lost++;
+#endif
 	dequeue_task(p, src_array);
 	nr_running_dec(src_rq);
 	set_task_cpu(p, this_cpu);
@@ -1020,9 +1136,20 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
+#ifdef CONFIG_SCHEDSTATS
+	schedstats[this_cpu].lb_cnt++;
+#endif
 	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
-	if (!busiest)
-		goto out;
+	if (!busiest) {
+#ifdef CONFIG_SCHEDSTATS
+		schedstats[this_cpu].lb_nobusy++;
+#endif
+		goto out;
+	}
+
+#ifdef CONFIG_SCHEDSTATS
+	schedstats[this_cpu].lb_imbalance += imbalance;
+#endif
 
 	/*
 	 * We first consider expired tasks. Those will likely not be
@@ -1110,8 +1237,14 @@ static void balance_node(runqueue_t *thi
 {
 	int node = find_busiest_node(cpu_to_node(this_cpu));
 
+#ifdef CONFIG_SCHEDSTATS
+	schedstats[this_cpu].bn_cnt++;
+#endif
 	if (node >= 0) {
 		cpumask_t cpumask = node_to_cpumask(node);
+#ifdef CONFIG_SCHEDSTATS
+		schedstats[this_cpu].lb_bnode++;
+#endif
 		cpu_set(this_cpu, cpumask);
 		spin_lock(&this_rq->lock);
 		load_balance(this_rq, idle, cpumask);
@@ -1122,9 +1255,7 @@ static void balance_node(runqueue_t *thi
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#ifdef CONFIG_NUMA
 	int this_cpu = smp_processor_id();
-#endif
 	unsigned long j = jiffies;
 
 	/*
@@ -1137,11 +1268,18 @@ static void rebalance_tick(runqueue_t *t
 	 */
 	if (idle) {
 #ifdef CONFIG_NUMA
-		if (!(j % IDLE_NODE_REBALANCE_TICK))
+		if (!(j % IDLE_NODE_REBALANCE_TICK)) {
+#ifdef CONFIG_SCHEDSTATS
+			schedstats[this_cpu].bn_idle++;
+#endif
 			balance_node(this_rq, idle, this_cpu);
+		}
 #endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
+#ifdef CONFIG_SCHEDSTATS
+			schedstats[this_cpu].lb_idle++;
+#endif
 			load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
 			spin_unlock(&this_rq->lock);
 		}
@@ -1153,6 +1291,9 @@ static void rebalance_tick(runqueue_t *t
 #endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
+#ifdef CONFIG_SCHEDSTATS
+		schedstats[this_cpu].lb_busy++;
+#endif
 		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
 		spin_unlock(&this_rq->lock);
 	}
@@ -1287,13 +1428,17 @@ asmlinkage void schedule(void)
 	runqueue_t *rq;
 	prio_array_t *array;
 	struct list_head *queue;
-	int idx;
+	int idx, this_cpu = smp_processor_id();
+
 
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
+#ifdef CONFIG_SCHEDSTATS
+	schedstats[this_cpu].sched_cnt++;
+#endif
 	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
 		if (unlikely(in_atomic())) {
 			printk(KERN_ERR "bad: scheduling while atomic!\n");
@@ -1331,6 +1476,9 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
+#ifdef CONFIG_SCHEDSTATS
+		schedstats[this_cpu].lb_resched++;
+#endif
 		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
 		if (rq->nr_running)
 			goto pick_next_task;
@@ -1345,11 +1493,18 @@ pick_next_task:
 		/*
 		 * Switch the active and expired arrays.
 		 */
+#ifdef CONFIG_SCHEDSTATS
+		schedstats[this_cpu].sched_switch++;
+#endif
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
 		rq->expired_timestamp = 0;
 	}
+#ifdef CONFIG_SCHEDSTATS
+	else
+		schedstats[this_cpu].sched_noswitch++;
+#endif
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
@@ -2009,6 +2164,7 @@ asmlinkage long sys_sched_yield(void)
 {
 	runqueue_t *rq = this_rq_lock();
 	prio_array_t *array = current->array;
+	int this_cpu = smp_processor_id();
 
 	/*
 	 * We implement yielding by moving the task into the expired
@@ -2017,7 +2173,19 @@ asmlinkage long sys_sched_yield(void)
 	 * (special rule: RT tasks will just roundrobin in the active
 	 * array.)
 	 */
+#ifdef CONFIG_SCHEDSTATS
+	schedstats[this_cpu].yld_cnt++;
+#endif
 	if (likely(!rt_task(current))) {
+#ifdef CONFIG_SCHEDSTATS
+		if (current->array->nr_active == 1) {
+			schedstats[this_cpu].yld_act_empty++;
+			if (!rq->expired->nr_active)
+				schedstats[this_cpu].yld_both_empty++;
+		} else if (!rq->expired->nr_active) {
+			schedstats[this_cpu].yld_exp_empty++;
+		}
+#endif
 		dequeue_task(current, array);
 		enqueue_task(current, rq->expired);
 	} else {
@@ -2522,6 +2690,7 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
+		rq->cpu = i;
 		spin_lock_init(&rq->lock);
 		INIT_LIST_HEAD(&rq->migration_queue);
 		atomic_set(&rq->nr_iowait, 0);