/* * 'schedule()' is the scheduler function. It's a very simple and nice * scheduler: it's not perfect, but certainly works for most things. * * The goto is "interesting". * * NOTE!! Task 0 is the 'idle' task, which gets called when no other * tasks can run. It can not be killed, and it cannot sleep. The 'state' * information in task[0] is never used. */ asmlinkage void schedule(void) { struct schedule_data * sched_data; struct task_struct *prev, *next, *p; struct list_head *tmp; int this_cpu, c; spin_lock_prefetch(&runqueue_lock); if (!current->active_mm) BUG(); need_resched_back: prev = current; this_cpu = prev->processor; if (unlikely(in_interrupt())) { printk("Scheduling in interrupt\n"); BUG(); } release_kernel_lock(prev, this_cpu); /* * 'sched_data' is protected by the fact that we can run * only one process per CPU. */ sched_data = & aligned_data[this_cpu].schedule_data; spin_lock_irq(&runqueue_lock); /* move an exhausted RR process to be last.. */ if (unlikely(prev->policy == SCHED_RR)) if (!prev->counter) { prev->counter = NICE_TO_TICKS(prev->nice); move_last_runqueue(prev); } switch (prev->state) { case TASK_INTERRUPTIBLE: if (signal_pending(prev)) { prev->state = TASK_RUNNING; break; } default: del_from_runqueue(prev); case TASK_RUNNING:; } prev->need_resched = 0; /* * this is the scheduler proper: */ repeat_schedule: /* * Default process to select.. */ next = idle_task(this_cpu); c = -1000; list_for_each(tmp, &runqueue_head) { p = list_entry(tmp, struct task_struct, run_list); if (can_schedule(p, this_cpu)) { int weight = goodness(p, this_cpu, prev->active_mm); if (weight > c) c = weight, next = p; } } /* Do we need to re-calculate counters? */ if (unlikely(!c)) { struct task_struct *p; spin_unlock_irq(&runqueue_lock); read_lock(&tasklist_lock); for_each_task(p) p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); read_unlock(&tasklist_lock); spin_lock_irq(&runqueue_lock); goto repeat_schedule; } /* * from this point on nothing can prevent us from * switching to the next task, save this fact in * sched_data. */ sched_data->curr = next; task_set_cpu(next, this_cpu); spin_unlock_irq(&runqueue_lock); if (unlikely(prev == next)) { /* We won't go through the normal tail, so do this by hand */ prev->policy &= ~SCHED_YIELD; goto same_process; } kstat.context_swtch++; /* * there are 3 processes which are affected by a context switch: * * prev == .... ==> (last => next) * * It's the 'much more previous' 'prev' that is on next's stack, * but prev is set to (the just run) 'last' process by switch_to(). * This might sound slightly confusing but makes tons of sense. */ prepare_to_switch(); { struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; if (!mm) { if (next->active_mm) BUG(); next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next, this_cpu); } else { if (next->active_mm != mm) BUG(); switch_mm(oldmm, mm, next, this_cpu); } if (!prev->mm) { prev->active_mm = NULL; mmdrop(oldmm); } } /* * This just switches the register state and the * stack. */ switch_to(prev, next, prev); __schedule_tail(prev); same_process: reacquire_kernel_lock(current); if (current->need_resched) goto need_resched_back; return; } /* * schedule_tail() is getting called from the fork return path. This * cleans up all remaining scheduler things, without impacting the * common case. */ static inline void __schedule_tail(struct task_struct *prev) { #ifdef CONFIG_SMP int policy; /* * prev->policy can be written from here only before `prev' * can be scheduled (before setting prev->cpus_runnable to ~0UL). * Of course it must also be read before allowing prev * to be rescheduled, but since the write depends on the read * to complete, wmb() is enough. (the spin_lock() acquired * before setting cpus_runnable is not enough because the spin_lock() * common code semantics allows code outside the critical section * to enter inside the critical section) */ policy = prev->policy; prev->policy = policy & ~SCHED_YIELD; wmb(); /* * fast path falls through. We have to clear cpus_runnable before * checking prev->state to avoid a wakeup race. Protect against * the task exiting early. */ task_lock(prev); task_release_cpu(prev); mb(); if (prev->state == TASK_RUNNING) goto needs_resched; out_unlock: task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ return; /* * Slow path - we 'push' the previous process and * reschedule_idle() will attempt to find a new * processor for it. (but it might preempt the * current process as well.) We must take the runqueue * lock and re-check prev->state to be correct. It might * still happen that this process has a preemption * 'in progress' already - but this is not a problem and * might happen in other circumstances as well. */ needs_resched: { unsigned long flags; /* * Avoid taking the runqueue lock in cases where * no preemption-check is necessery: */ if ((prev == idle_task(smp_processor_id())) || (policy & SCHED_YIELD)) goto out_unlock; spin_lock_irqsave(&runqueue_lock, flags); if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) reschedule_idle(prev); spin_unlock_irqrestore(&runqueue_lock, flags); goto out_unlock; } #else prev->policy &= ~SCHED_YIELD; #endif /* CONFIG_SMP */ } #define switch_to(prev,next,last) do { \ asm volatile("pushl %%esi\n\t" \ "pushl %%edi\n\t" \ "pushl %%ebp\n\t" \ "movl %%esp,%0\n\t" /* save ESP */ \ "movl %3,%%esp\n\t" /* restore ESP */ \ "movl $1f,%1\n\t" /* save EIP */ \ "pushl %4\n\t" /* restore EIP */ \ "jmp __switch_to\n" \ "1:\t" \ "popl %%ebp\n\t" \ "popl %%edi\n\t" \ "popl %%esi\n\t" \ :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ "=b" (last) \ :"m" (next->thread.esp),"m" (next->thread.eip), \ "a" (prev), "d" (next), \ "b" (prev)); \ } while (0) /* * switch_to(x,yn) should switch tasks from x to y. * * We fsave/fwait so that an exception goes off at the right time * (as a call from the fsave or fwait in effect) rather than to * the wrong process. Lazy FP saving no longer makes any sense * with modern CPU's, and this simplifies a lot of things (SMP * and UP become the same). * * NOTE! We used to use the x86 hardware context switching. The * reason for not using it any more becomes apparent when you * try to recover gracefully from saved state that is no longer * valid (stale segment register values in particular). With the * hardware task-switch, there is no way to fix up bad state in * a reasonable manner. * * The fact that Intel documents the hardware task-switching to * be slow is a fairly red herring - this code is not noticeably * faster. However, there _is_ some room for improvement here, * so the performance issues may eventually be a valid point. * More important, however, is the fact that this allows us much * more flexibility. */ void __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; struct tss_struct *tss = init_tss + smp_processor_id(); unlazy_fpu(prev_p); /* * Reload esp0, LDT and the page table pointer: */ tss->esp0 = next->esp0; /* * Save away %fs and %gs. No need to save %es and %ds, as * those are always kernel segments while inside the kernel. */ asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs)); asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); /* * Restore %fs and %gs. */ loadsegment(fs, next->fs); loadsegment(gs, next->gs); /* * Now maybe reload the debug registers */ if (next->debugreg[7]){ loaddebug(next, 0); loaddebug(next, 1); loaddebug(next, 2); loaddebug(next, 3); /* no 4 and 5 */ loaddebug(next, 6); loaddebug(next, 7); } if (prev->ioperm || next->ioperm) { if (next->ioperm) { /* * 4 cachelines copy ... not good, but not that * bad either. Anyone got something better? * This only affects processes which use ioperm(). * [Putting the TSSs into 4k-tlb mapped regions * and playing VM tricks to switch the IO bitmap * is not really acceptable.] */ memcpy(tss->io_bitmap, next->io_bitmap, IO_BITMAP_SIZE*sizeof(unsigned long)); tss->bitmap = IO_BITMAP_OFFSET; } else /* * a bitmap offset pointing outside of the TSS limit * causes a nicely controllable SIGSEGV if a process * tries to use a port IO instruction. The first * sys_ioperm() call sets up the bitmap properly. */ tss->bitmap = INVALID_IO_BITMAP_OFFSET; } } /* * This is the function that decides how desirable a process is.. * You can weigh different processes against each other depending * on what CPU they've run on lately etc to try to handle cache * and TLB miss penalties. * * Return values: * -1000: never select this * 0: out of time, recalculate counters (but it might still be * selected) * +ve: "goodness" value (the larger, the better) * +1000: realtime process, select this. */ static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) { int weight; /* * select the current process after every other * runnable process, but before the idle thread. * Also, dont trigger a counter recalculation. */ weight = -1; if (p->policy & SCHED_YIELD) goto out; /* * Non-RT process - normal case first. */ if (p->policy == SCHED_OTHER) { /* * Give the process a first-approximation goodness value * according to the number of clock-ticks it has left. * * Don't do any other calculations if the time slice is * over.. */ weight = p->counter; if (!weight) goto out; #ifdef CONFIG_SMP /* Give a largish advantage to the same processor... */ /* (this is equivalent to penalizing other processors) */ if (p->processor == this_cpu) weight += PROC_CHANGE_PENALTY; #endif /* .. and a slight advantage to the current MM */ if (p->mm == this_mm || !p->mm) weight += 1; weight += 20 - p->nice; goto out; } /* * Realtime process, select the first one on the * runqueue (taking priorities within processes * into account). */ weight = 1000 + p->rt_priority; out: return weight; }