diff options
author | kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk> | 2004-01-31 19:45:13 +0000 |
---|---|---|
committer | kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk> | 2004-01-31 19:45:13 +0000 |
commit | dea8eefd6214e3ad5b54795fa958ab721d58710c (patch) | |
tree | f210aa8b10d18345ea90980eef13ea0c6a2665ee /xenolinux-2.4.24-sparse | |
parent | f21e4de207c89ca1f1d18122c43db30b2ff41e7b (diff) | |
download | xen-dea8eefd6214e3ad5b54795fa958ab721d58710c.tar.gz xen-dea8eefd6214e3ad5b54795fa958ab721d58710c.tar.bz2 xen-dea8eefd6214e3ad5b54795fa958ab721d58710c.zip |
bitkeeper revision 1.699 (401c05c9TV2zsaZ_e3zpy-zaKxCetw)
timer.c, timer.h, sched.h:
new file
Many files:
Rolf's new timer interface, plus various cleanups.
Diffstat (limited to 'xenolinux-2.4.24-sparse')
-rw-r--r-- | xenolinux-2.4.24-sparse/arch/xeno/config.in | 4 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/arch/xeno/defconfig | 5 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c | 13 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c | 30 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c | 155 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h | 24 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/include/linux/sched.h | 966 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/include/linux/timer.h | 77 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/kernel/panic.c | 3 | ||||
-rw-r--r-- | xenolinux-2.4.24-sparse/kernel/timer.c | 968 |
10 files changed, 2216 insertions, 29 deletions
diff --git a/xenolinux-2.4.24-sparse/arch/xeno/config.in b/xenolinux-2.4.24-sparse/arch/xeno/config.in index 445b574a71..3f4736fd1f 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/config.in +++ b/xenolinux-2.4.24-sparse/arch/xeno/config.in @@ -13,9 +13,11 @@ define_bool CONFIG_SBUS n define_bool CONFIG_UID16 y mainmenu_option next_comment -comment 'Privileged guest OS' +comment 'XenoLinux' bool 'Support for privileged operations (domain 0)' CONFIG_XENO_PRIV endmenu +# the IBM S/390 patch needs this. +define_bool CONFIG_NO_IDLE_HZ y mainmenu_option next_comment comment 'Code maturity level options' diff --git a/xenolinux-2.4.24-sparse/arch/xeno/defconfig b/xenolinux-2.4.24-sparse/arch/xeno/defconfig index abef573aa7..3ba185a19b 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/defconfig +++ b/xenolinux-2.4.24-sparse/arch/xeno/defconfig @@ -8,9 +8,12 @@ CONFIG_ISA=y CONFIG_UID16=y # -# Privileged guest OS +# XenoLinux Options # +# support for priviledged domains CONFIG_XENO_PRIV=y +# On demand timer setting (taken from s390 patch set) +CONFIG_NO_IDLE_HZ=y # # Code maturity level options diff --git a/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c b/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c index ac557a3c11..075acdf5af 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c +++ b/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c @@ -81,15 +81,15 @@ static void _dbg_network_int(struct net_device *dev) if ( np->state == STATE_CLOSED ) return; - printk(KERN_ALERT "tx_full = %d, tx_resp_cons = 0x%08x," - " tx_req_prod = 0x%08x, tx_resp_prod = 0x%08x," - " tx_event = 0x%08x, state=%d\n", + printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x," + " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x," + " tx_event=0x%08x, state=%d\n", np->tx_full, np->tx_resp_cons, np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, np->net_idx->tx_event, test_bit(__LINK_STATE_XOFF, &dev->state)); - printk(KERN_ALERT "rx_resp_cons = 0x%08x," - " rx_req_prod = 0x%08x, rx_resp_prod = 0x%08x, rx_event = 0x%08x\n", + printk(KERN_ALERT "net: rx_resp_cons=0x%08x," + " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n", np->rx_resp_cons, np->net_idx->rx_req_prod, np->net_idx->rx_resp_prod, np->net_idx->rx_event); } @@ -550,7 +550,8 @@ int __init init_module(void) goto fail; } - err = request_irq(_EVENT_DEBUG, dbg_network_int, 0, "debug", NULL); + err = request_irq(_EVENT_DEBUG, dbg_network_int, SA_SHIRQ, "net_dbg", + &dbg_network_int); if ( err ) printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n"); diff --git a/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c b/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c index 3b17c7326c..ff64bccd4c 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c +++ b/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c @@ -80,14 +80,36 @@ void enable_hlt(void) */ void cpu_idle (void) { - /* endless idle loop with no priority at all */ + extern int set_timeout_timer(void); + + /* Endless idle loop with no priority at all. */ init_idle(); current->nice = 20; current->counter = -100; - while (1) { - while (!current->need_resched) - HYPERVISOR_yield(); + for ( ; ; ) + { + while ( !current->need_resched ) + { + __cli(); + if ( current->need_resched ) + { + /* The race-free check for events failed. */ + __sti(); + break; + } + else if ( set_timeout_timer() == 0 ) + { + /* NB. Blocking reenable events in a race-free manner. */ + HYPERVISOR_block(); + } + else + { + /* No race here: yielding will get us the CPU again anyway. */ + __sti(); + HYPERVISOR_yield(); + } + } schedule(); check_pgt_cache(); } diff --git a/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c b/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c index 1944e63c1c..bf43b6a99b 100644 --- a/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c +++ b/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c @@ -75,7 +75,7 @@ static u32 st_scale_i; /* convert ticks -> usecs */ /* These are peridically updated in shared_info, and then copied here. */ static u32 shadow_tsc_stamp; -static s64 shadow_system_time; +static u64 shadow_system_time; static u32 shadow_time_version; static struct timeval shadow_tv; @@ -91,9 +91,12 @@ static long last_update_to_rtc, last_update_to_xen; #endif /* Periodically take synchronised time base from Xen, if we need it. */ -static long last_update_from_xen; +static long last_update_from_xen; /* UTC seconds when last read Xen clock. */ -static u64 processed_system_time; +/* Keep track of last time we did processing/updating of jiffies and xtime. */ +static u64 processed_system_time; /* System time (ns) at last processing. */ + +#define NS_PER_TICK (1000000000ULL/HZ) #define HANDLE_USEC_UNDERFLOW(_tv) \ do { \ @@ -197,8 +200,11 @@ static int set_rtc_mmss(unsigned long nowtime) #endif -/* Must be called with the xtime_lock held for writing. */ -static void get_time_values_from_xen(void) +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called with the xtime_lock held for writing. + */ +static void __get_time_values_from_xen(void) { do { shadow_time_version = HYPERVISOR_shared_info->time_version2; @@ -216,7 +222,11 @@ static void get_time_values_from_xen(void) (shadow_time_version == HYPERVISOR_shared_info->time_version2) -static inline unsigned long get_time_delta_usecs(void) +/* + * Returns the system time elapsed, in ns, since the current shadow_timestamp + * was calculated. Must be called with the xtime_lock held for reading. + */ +static inline unsigned long __get_time_delta_usecs(void) { s32 delta_tsc; u32 low; @@ -234,6 +244,9 @@ static inline unsigned long get_time_delta_usecs(void) } +/* + * Returns the current time-of-day in UTC timeval format. + */ void do_gettimeofday(struct timeval *tv) { unsigned long flags, lost; @@ -242,7 +255,7 @@ void do_gettimeofday(struct timeval *tv) again: read_lock_irqsave(&xtime_lock, flags); - _tv.tv_usec = get_time_delta_usecs(); + _tv.tv_usec = __get_time_delta_usecs(); if ( (lost = (jiffies - wall_jiffies)) != 0 ) _tv.tv_usec += lost * (1000000 / HZ); _tv.tv_sec = xtime.tv_sec; @@ -257,7 +270,7 @@ void do_gettimeofday(struct timeval *tv) */ read_unlock_irqrestore(&xtime_lock, flags); write_lock_irqsave(&xtime_lock, flags); - get_time_values_from_xen(); + __get_time_values_from_xen(); write_unlock_irqrestore(&xtime_lock, flags); goto again; } @@ -276,6 +289,10 @@ void do_gettimeofday(struct timeval *tv) *tv = _tv; } + +/* + * Sets the current time-of-day based on passed-in UTC timeval parameter. + */ void do_settimeofday(struct timeval *tv) { struct timeval newtv; @@ -291,10 +308,10 @@ void do_settimeofday(struct timeval *tv) * be stale, so we can retry with fresh ones. */ again: - tv->tv_usec -= get_time_delta_usecs(); + tv->tv_usec -= __get_time_delta_usecs(); if ( unlikely(!TIME_VALUES_UP_TO_DATE) ) { - get_time_values_from_xen(); + __get_time_values_from_xen(); goto again; } @@ -334,6 +351,7 @@ void do_settimeofday(struct timeval *tv) } } + asmlinkage long sys_stime(int *tptr) { int value; @@ -353,14 +371,22 @@ asmlinkage long sys_stime(int *tptr) return 0; } -#define NS_PER_TICK (1000000000ULL/HZ) + +/* Convert jiffies to system time. Call with xtime_lock held for reading. */ +static inline u64 __jiffies_to_st(unsigned long j) +{ + return processed_system_time + ((j - jiffies) * NS_PER_TICK); +} + + static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { s64 delta; + unsigned long ticks = 0; long sec_diff; - get_time_values_from_xen(); + __get_time_values_from_xen(); if ( (delta = (s64)(shadow_system_time - processed_system_time)) < 0 ) { @@ -368,13 +394,24 @@ static inline void do_timer_interrupt(int irq, void *dev_id, return; } + /* Process elapsed jiffies since last call. */ while ( delta >= NS_PER_TICK ) { - do_timer(regs); + ticks++; delta -= NS_PER_TICK; processed_system_time += NS_PER_TICK; } - + + if ( ticks != 0 ) + { + do_timer_ticks(ticks); + + if ( user_mode(regs) ) + update_process_times_us(ticks, 0); + else + update_process_times_us(0, ticks); + } + /* * Take synchronised time from Xen once a minute if we're not * synchronised ourselves, and we haven't chosen to keep an independent @@ -446,6 +483,7 @@ static inline void do_timer_interrupt(int irq, void *dev_id, #endif } + static void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { write_lock(&xtime_lock); @@ -463,6 +501,89 @@ static struct irqaction irq_timer = { NULL }; + +/* + * This function works out when the the next timer function has to be + * executed (by looking at the timer list) and sets the Xen one-shot + * domain timer to the appropriate value. This is typically called in + * cpu_idle() before the domain blocks. + * + * The function returns a non-0 value on error conditions. + * + * It must be called with interrupts disabled. + */ +extern spinlock_t timerlist_lock; +int set_timeout_timer(void) +{ + struct timer_list *timer; + u64 alarm = 0; + int ret = 0; + + spin_lock(&timerlist_lock); + + /* + * This is safe against long blocking (since calculations are not based on + * TSC deltas). It is also safe against warped system time since + * suspend-resume is cooperative and we would first get locked out. It is + * safe against normal updates of jiffies since interrupts are off. + */ + if ( (timer = next_timer_event()) != NULL ) + alarm = __jiffies_to_st(timer->expires); + + /* Failure is pretty bad, but we'd best soldier on. */ + if ( HYPERVISOR_set_dom_timer(alarm) != 0 ) + ret = -1; + + spin_unlock(&timerlist_lock); + + return ret; +} + + +/* Time debugging. */ +static void dbg_time_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + unsigned long flags, j; + u64 s_now, j_st; + struct timeval s_tv, tv; + + struct timer_list *timer; + u64 t_st; + + read_lock_irqsave(&xtime_lock, flags); + s_tv.tv_sec = shadow_tv.tv_sec; + s_tv.tv_usec = shadow_tv.tv_usec; + s_now = shadow_system_time; + read_unlock_irqrestore(&xtime_lock, flags); + + do_gettimeofday(&tv); + + j = jiffies; + j_st = __jiffies_to_st(j); + + timer = next_timer_event(); + t_st = __jiffies_to_st(timer->expires); + + printk(KERN_ALERT "time: shadow_st=0x%X:%08X\n", + (u32)(s_now>>32), (u32)s_now); + printk(KERN_ALERT "time: wct=%lds %ldus shadow_wct=%lds %ldus\n", + tv.tv_sec, tv.tv_usec, s_tv.tv_sec, s_tv.tv_usec); + printk(KERN_ALERT "time: jiffies=%lu(0x%X:%08X) timeout=%lu(0x%X:%08X)\n", + jiffies,(u32)(j_st>>32), (u32)j_st, + timer->expires,(u32)(t_st>>32), (u32)t_st); + printk(KERN_ALERT "time: processed_system_time=0x%X:%08X\n", + (u32)(processed_system_time>>32), (u32)processed_system_time); +} + +static struct irqaction dbg_time = { + dbg_time_int, + SA_SHIRQ, + 0, + "timer_dbg", + &dbg_time_int, + NULL +}; + void __init time_init(void) { unsigned long long alarm; @@ -494,10 +615,12 @@ void __init time_init(void) st_scale_f = scale & 0xffffffff; st_scale_i = scale >> 32; - get_time_values_from_xen(); + __get_time_values_from_xen(); processed_system_time = shadow_system_time; - setup_irq(TIMER_IRQ, &irq_timer); + (void)setup_irq(TIMER_IRQ, &irq_timer); + + (void)setup_irq(_EVENT_DEBUG, &dbg_time); rdtscll(alarm); diff --git a/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h b/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h index 064088ff6f..34272a624f 100644 --- a/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h +++ b/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h @@ -256,6 +256,17 @@ static inline int HYPERVISOR_yield(void) return ret; } +static inline int HYPERVISOR_block(void) +{ + int ret; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_sched_op), + "b" (SCHEDOP_block) ); + + return ret; +} + static inline int HYPERVISOR_exit(void) { int ret; @@ -279,6 +290,19 @@ static inline int HYPERVISOR_stop(unsigned long srec) return ret; } +static inline long HYPERVISOR_set_dom_timer(u64 timeout) +{ + int ret; + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) : "0" (__HYPERVISOR_set_dom_timer), + "b" (timeout_hi), "c" (timeout_lo) : "memory" ); + + return ret; +} + static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op) { int ret; diff --git a/xenolinux-2.4.24-sparse/include/linux/sched.h b/xenolinux-2.4.24-sparse/include/linux/sched.h new file mode 100644 index 0000000000..ed42340517 --- /dev/null +++ b/xenolinux-2.4.24-sparse/include/linux/sched.h @@ -0,0 +1,966 @@ +#ifndef _LINUX_SCHED_H +#define _LINUX_SCHED_H + +#include <asm/param.h> /* for HZ */ + +extern unsigned long event; + +#include <linux/config.h> +#include <linux/binfmts.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/times.h> +#include <linux/timex.h> +#include <linux/rbtree.h> + +#include <asm/system.h> +#include <asm/semaphore.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/mmu.h> + +#include <linux/smp.h> +#include <linux/tty.h> +#include <linux/sem.h> +#include <linux/signal.h> +#include <linux/securebits.h> +#include <linux/fs_struct.h> + +struct exec_domain; + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_PID 0x00001000 /* set if pid shared */ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? */ +#define CLONE_NEWNS 0x00020000 /* New namespace group? */ + +#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) + +/* + * These are the constant used to fake the fixed-point load-average + * counting. Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. + */ +extern unsigned long avenrun[]; /* Load averages */ + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ +#define LOAD_FREQ (5*HZ) /* 5 sec intervals */ +#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ +#define EXP_5 2014 /* 1/exp(5sec/5min) */ +#define EXP_15 2037 /* 1/exp(5sec/15min) */ + +#define CALC_LOAD(load,exp,n) \ + load *= exp; \ + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + +#define CT_TO_SECS(x) ((x) / HZ) +#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) + +extern int nr_running, nr_threads; +extern int last_pid; + +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/param.h> +#include <linux/resource.h> +#ifdef __KERNEL__ +#include <linux/timer.h> +#endif + +#include <asm/processor.h> + +#define TASK_RUNNING 0 +#define TASK_INTERRUPTIBLE 1 +#define TASK_UNINTERRUPTIBLE 2 +#define TASK_ZOMBIE 4 +#define TASK_STOPPED 8 + +#define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +#define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) + +#define __set_current_state(state_value) \ + do { current->state = (state_value); } while (0) +#define set_current_state(state_value) \ + set_mb(current->state, (state_value)) + +/* + * Scheduling policies + */ +#define SCHED_OTHER 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 + +/* + * This is an additional bit set when we want to + * yield the CPU for one re-schedule.. + */ +#define SCHED_YIELD 0x10 + +struct sched_param { + int sched_priority; +}; + +struct completion; + +#ifdef __KERNEL__ + +#include <linux/spinlock.h> + +/* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). + */ +extern rwlock_t tasklist_lock; +extern spinlock_t runqueue_lock; +extern spinlock_t mmlist_lock; + +extern void sched_init(void); +extern void init_idle(void); +extern void show_state(void); +extern void cpu_init (void); +extern void trap_init(void); +extern void update_process_times(int user); +#ifdef CONFIG_NO_IDLE_HZ +extern void update_process_times_us(int user, int system); +#endif +extern void update_one_process(struct task_struct *p, unsigned long user, + unsigned long system, int cpu); + +#define MAX_SCHEDULE_TIMEOUT LONG_MAX +extern signed long FASTCALL(schedule_timeout(signed long timeout)); +asmlinkage void schedule(void); + +extern int schedule_task(struct tq_struct *task); +extern void flush_scheduled_tasks(void); +extern int start_context_thread(void); +extern int current_is_keventd(void); + +#if CONFIG_SMP +extern void set_cpus_allowed(struct task_struct *p, unsigned long new_mask); +#else +# define set_cpus_allowed(p, new_mask) do { } while (0) +#endif + +/* + * The default fd array needs to be at least BITS_PER_LONG, + * as this is the granularity returned by copy_fdset(). + */ +#define NR_OPEN_DEFAULT BITS_PER_LONG + +struct namespace; +/* + * Open file table structure + */ +struct files_struct { + atomic_t count; + rwlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */ + int max_fds; + int max_fdset; + int next_fd; + struct file ** fd; /* current fd array */ + fd_set *close_on_exec; + fd_set *open_fds; + fd_set close_on_exec_init; + fd_set open_fds_init; + struct file * fd_array[NR_OPEN_DEFAULT]; +}; + +#define INIT_FILES \ +{ \ + count: ATOMIC_INIT(1), \ + file_lock: RW_LOCK_UNLOCKED, \ + max_fds: NR_OPEN_DEFAULT, \ + max_fdset: __FD_SETSIZE, \ + next_fd: 0, \ + fd: &init_files.fd_array[0], \ + close_on_exec: &init_files.close_on_exec_init, \ + open_fds: &init_files.open_fds_init, \ + close_on_exec_init: { { 0, } }, \ + open_fds_init: { { 0, } }, \ + fd_array: { NULL, } \ +} + +/* Maximum number of active map areas.. This is a random (large) number */ +#define DEFAULT_MAX_MAP_COUNT (65536) + +extern int max_map_count; + +struct mm_struct { + struct vm_area_struct * mmap; /* list of VMAs */ + rb_root_t mm_rb; + struct vm_area_struct * mmap_cache; /* last find_vma result */ + pgd_t * pgd; + atomic_t mm_users; /* How many users with user space? */ + atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + int map_count; /* number of VMAs */ + struct rw_semaphore mmap_sem; + spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ + + struct list_head mmlist; /* List of all active mm's. These are globally strung + * together off init_mm.mmlist, and are protected + * by mmlist_lock + */ + + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long rss, total_vm, locked_vm; + unsigned long def_flags; + unsigned long cpu_vm_mask; + unsigned long swap_address; + + unsigned dumpable:1; + + /* Architecture-specific MM context */ + mm_context_t context; +}; + +extern int mmlist_nr; + +#define INIT_MM(name) \ +{ \ + mm_rb: RB_ROOT, \ + pgd: swapper_pg_dir, \ + mm_users: ATOMIC_INIT(2), \ + mm_count: ATOMIC_INIT(1), \ + mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ + page_table_lock: SPIN_LOCK_UNLOCKED, \ + mmlist: LIST_HEAD_INIT(name.mmlist), \ +} + +struct signal_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; +}; + + +#define INIT_SIGNALS { \ + count: ATOMIC_INIT(1), \ + action: { {{0,}}, }, \ + siglock: SPIN_LOCK_UNLOCKED \ +} + +/* + * Some day this will be a full-fledged user tracking system.. + */ +struct user_struct { + atomic_t __count; /* reference count */ + atomic_t processes; /* How many processes does this user have? */ + atomic_t files; /* How many open files does this user have? */ + + /* Hash table maintenance information */ + struct user_struct *next, **pprev; + uid_t uid; +}; + +#define get_current_user() ({ \ + struct user_struct *__user = current->user; \ + atomic_inc(&__user->__count); \ + __user; }) + +extern struct user_struct root_user; +#define INIT_USER (&root_user) + +struct task_struct { + /* + * offsets of these are hardcoded elsewhere - touch with care + */ + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + unsigned long flags; /* per process flags, defined below */ + int sigpending; + mm_segment_t addr_limit; /* thread address space: + 0-0xBFFFFFFF for user-thead + 0-0xFFFFFFFF for kernel-thread + */ + struct exec_domain *exec_domain; + volatile long need_resched; + unsigned long ptrace; + + int lock_depth; /* Lock depth */ + +/* + * offset 32 begins here on 32-bit platforms. We keep + * all fields in a single cacheline that are needed for + * the goodness() loop in schedule(). + */ + long counter; + long nice; + unsigned long policy; + struct mm_struct *mm; + int processor; + /* + * cpus_runnable is ~0 if the process is not running on any + * CPU. It's (1 << cpu) if it's running on a CPU. This mask + * is updated under the runqueue lock. + * + * To determine whether a process might run on a CPU, this + * mask is AND-ed with cpus_allowed. + */ + unsigned long cpus_runnable, cpus_allowed; + /* + * (only the 'next' pointer fits into the cacheline, but + * that's just fine.) + */ + struct list_head run_list; + unsigned long sleep_time; + + struct task_struct *next_task, *prev_task; + struct mm_struct *active_mm; + struct list_head local_pages; + unsigned int allocation_order, nr_local_pages; + +/* task state */ + struct linux_binfmt *binfmt; + int exit_code, exit_signal; + int pdeath_signal; /* The signal sent when the parent dies */ + /* ??? */ + unsigned long personality; + int did_exec:1; + unsigned task_dumpable:1; + pid_t pid; + pid_t pgrp; + pid_t tty_old_pgrp; + pid_t session; + pid_t tgid; + /* boolean value for session group leader */ + int leader; + /* + * pointers to (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->p_pptr->pid) + */ + struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; + struct list_head thread_group; + + /* PID hash table linkage. */ + struct task_struct *pidhash_next; + struct task_struct **pidhash_pprev; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + struct completion *vfork_done; /* for vfork() */ + unsigned long rt_priority; + unsigned long it_real_value, it_prof_value, it_virt_value; + unsigned long it_real_incr, it_prof_incr, it_virt_incr; + struct timer_list real_timer; + struct tms times; + unsigned long start_time; + long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ + unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; + int swappable:1; +/* process credentials */ + uid_t uid,euid,suid,fsuid; + gid_t gid,egid,sgid,fsgid; + int ngroups; + gid_t groups[NGROUPS]; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + int keep_capabilities:1; + struct user_struct *user; +/* limits */ + struct rlimit rlim[RLIM_NLIMITS]; + unsigned short used_math; + char comm[16]; +/* file system info */ + int link_count, total_link_count; + struct tty_struct *tty; /* NULL if no tty */ + unsigned int locks; /* How many file locks are being held */ +/* ipc stuff */ + struct sem_undo *semundo; + struct sem_queue *semsleeping; +/* CPU-specific state of this task */ + struct thread_struct thread; +/* filesystem information */ + struct fs_struct *fs; +/* open file information */ + struct files_struct *files; +/* namespace */ + struct namespace *namespace; +/* signal handlers */ + spinlock_t sigmask_lock; /* Protects signal and blocked */ + struct signal_struct *sig; + + sigset_t blocked; + struct sigpending pending; + + unsigned long sas_ss_sp; + size_t sas_ss_size; + int (*notifier)(void *priv); + void *notifier_data; + sigset_t *notifier_mask; + +/* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +/* Protection of (de-)allocation: mm, files, fs, tty */ + spinlock_t alloc_lock; + +/* journalling filesystem info */ + void *journal_info; +}; + +/* + * Per process flags + */ +#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ + /* Not implemented yet, only for 486*/ +#define PF_STARTING 0x00000002 /* being created */ +#define PF_EXITING 0x00000004 /* getting shut down */ +#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ +#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ +#define PF_DUMPCORE 0x00000200 /* dumped core */ +#define PF_SIGNALED 0x00000400 /* killed by a signal */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_FREE_PAGES 0x00002000 /* per process page freeing */ +#define PF_NOIO 0x00004000 /* avoid generating further I/O */ + +#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ + +/* + * Ptrace flags + */ + +#define PT_PTRACED 0x00000001 +#define PT_TRACESYS 0x00000002 +#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */ +#define PT_TRACESYSGOOD 0x00000008 +#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */ + +#define is_dumpable(tsk) ((tsk)->task_dumpable && (tsk)->mm && (tsk)->mm->dumpable) + +/* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. + */ +#define _STK_LIM (8*1024*1024) + +#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ +#define MAX_COUNTER (20*HZ/100) +#define DEF_NICE (0) + +extern void yield(void); + +/* + * The default (Linux) execution domain. + */ +extern struct exec_domain default_exec_domain; + +/* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. Base=0, limit=0x1fffff (=2MB) + */ +#define INIT_TASK(tsk) \ +{ \ + state: 0, \ + flags: 0, \ + sigpending: 0, \ + addr_limit: KERNEL_DS, \ + exec_domain: &default_exec_domain, \ + lock_depth: -1, \ + counter: DEF_COUNTER, \ + nice: DEF_NICE, \ + policy: SCHED_OTHER, \ + mm: NULL, \ + active_mm: &init_mm, \ + cpus_runnable: ~0UL, \ + cpus_allowed: ~0UL, \ + run_list: LIST_HEAD_INIT(tsk.run_list), \ + next_task: &tsk, \ + prev_task: &tsk, \ + p_opptr: &tsk, \ + p_pptr: &tsk, \ + thread_group: LIST_HEAD_INIT(tsk.thread_group), \ + wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ + real_timer: { \ + function: it_real_fn \ + }, \ + cap_effective: CAP_INIT_EFF_SET, \ + cap_inheritable: CAP_INIT_INH_SET, \ + cap_permitted: CAP_FULL_SET, \ + keep_capabilities: 0, \ + rlim: INIT_RLIMITS, \ + user: INIT_USER, \ + comm: "swapper", \ + thread: INIT_THREAD, \ + fs: &init_fs, \ + files: &init_files, \ + sigmask_lock: SPIN_LOCK_UNLOCKED, \ + sig: &init_signals, \ + pending: { NULL, &tsk.pending.head, {{0}}}, \ + blocked: {{0}}, \ + alloc_lock: SPIN_LOCK_UNLOCKED, \ + journal_info: NULL, \ +} + + +#ifndef INIT_TASK_SIZE +# define INIT_TASK_SIZE 2048*sizeof(long) +#endif + +union task_union { + struct task_struct task; + unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; +}; + +extern union task_union init_task_union; + +extern struct mm_struct init_mm; +extern struct task_struct *init_tasks[NR_CPUS]; + +/* PID hashing. (shouldnt this be dynamic?) */ +#define PIDHASH_SZ (4096 >> 2) +extern struct task_struct *pidhash[PIDHASH_SZ]; + +#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) + +static inline void hash_pid(struct task_struct *p) +{ + struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; + + if((p->pidhash_next = *htable) != NULL) + (*htable)->pidhash_pprev = &p->pidhash_next; + *htable = p; + p->pidhash_pprev = htable; +} + +static inline void unhash_pid(struct task_struct *p) +{ + if(p->pidhash_next) + p->pidhash_next->pidhash_pprev = p->pidhash_pprev; + *p->pidhash_pprev = p->pidhash_next; +} + +static inline struct task_struct *find_task_by_pid(int pid) +{ + struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; + + for(p = *htable; p && p->pid != pid; p = p->pidhash_next) + ; + + return p; +} + +#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) + +static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) +{ + tsk->processor = cpu; + tsk->cpus_runnable = 1UL << cpu; +} + +static inline void task_release_cpu(struct task_struct *tsk) +{ + tsk->cpus_runnable = ~0UL; +} + +/* per-UID process charging. */ +extern struct user_struct * alloc_uid(uid_t); +extern void free_uid(struct user_struct *); +extern void switch_uid(struct user_struct *); + +#include <asm/current.h> + +extern unsigned long volatile jiffies; +extern unsigned long itimer_ticks; +extern unsigned long itimer_next; +extern struct timeval xtime; +extern void do_timer(struct pt_regs *); +#ifdef CONFIG_NO_IDLE_HZ +extern void do_timer_ticks(int ticks); +#endif + +extern unsigned int * prof_buffer; +extern unsigned long prof_len; +extern unsigned long prof_shift; + +#define CURRENT_TIME (xtime.tv_sec) + +extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); +extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); +extern void FASTCALL(sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); +extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); +extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); +extern int FASTCALL(wake_up_process(struct task_struct * tsk)); + +#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) +#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) +#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) +#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) +#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) +#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) +#define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) +asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +extern void proc_caches_init(void); +extern void flush_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *); +extern void sig_exit(int, int, struct siginfo *); +extern int dequeue_signal(sigset_t *, siginfo_t *); +extern void block_all_signals(int (*notifier)(void *priv), void *priv, + sigset_t *mask); +extern void unblock_all_signals(void); +extern int send_sig_info(int, struct siginfo *, struct task_struct *); +extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int kill_pg_info(int, struct siginfo *, pid_t); +extern int kill_sl_info(int, struct siginfo *, pid_t); +extern int kill_proc_info(int, struct siginfo *, pid_t); +extern void notify_parent(struct task_struct *, int); +extern void do_notify_parent(struct task_struct *, int); +extern void force_sig(int, struct task_struct *); +extern int send_sig(int, struct task_struct *, int); +extern int kill_pg(pid_t, int, int); +extern int kill_sl(pid_t, int, int); +extern int kill_proc(pid_t, int, int); +extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); +extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); + +static inline int signal_pending(struct task_struct *p) +{ + return (p->sigpending != 0); +} + +/* + * Re-calculate pending state from the set of locally pending + * signals, globally pending signals, and blocked signals. + */ +static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +{ + unsigned long ready; + long i; + + switch (_NSIG_WORDS) { + default: + for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) + ready |= signal->sig[i] &~ blocked->sig[i]; + break; + + case 4: ready = signal->sig[3] &~ blocked->sig[3]; + ready |= signal->sig[2] &~ blocked->sig[2]; + ready |= signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 2: ready = signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 1: ready = signal->sig[0] &~ blocked->sig[0]; + } + return ready != 0; +} + +/* Reevaluate whether the task has signals pending delivery. + This is required every time the blocked sigset_t changes. + All callers should have t->sigmask_lock. */ + +static inline void recalc_sigpending(struct task_struct *t) +{ + t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); +} + +/* True if we are on the alternate signal stack. */ + +static inline int on_sig_stack(unsigned long sp) +{ + return (sp - current->sas_ss_sp < current->sas_ss_size); +} + +static inline int sas_ss_flags(unsigned long sp) +{ + return (current->sas_ss_size == 0 ? SS_DISABLE + : on_sig_stack(sp) ? SS_ONSTACK : 0); +} + +extern int request_irq(unsigned int, + void (*handler)(int, void *, struct pt_regs *), + unsigned long, const char *, void *); +extern void free_irq(unsigned int, void *); + +/* + * This has now become a routine instead of a macro, it sets a flag if + * it returns true (to do BSD-style accounting where the process is flagged + * if it uses root privs). The implication of this is that you should do + * normal permissions checks first, and check suser() last. + * + * [Dec 1997 -- Chris Evans] + * For correctness, the above considerations need to be extended to + * fsuser(). This is done, along with moving fsuser() checks to be + * last. + * + * These will be removed, but in the mean time, when the SECURE_NOROOT + * flag is set, uids don't grant privilege. + */ +static inline int suser(void) +{ + if (!issecure(SECURE_NOROOT) && current->euid == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +static inline int fsuser(void) +{ + if (!issecure(SECURE_NOROOT) && current->fsuid == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +/* + * capable() checks for a particular capability. + * New privilege checks should use this interface, rather than suser() or + * fsuser(). See include/linux/capability.h for defined capabilities. + */ + +static inline int capable(int cap) +{ +#if 1 /* ok now */ + if (cap_raised(current->cap_effective, cap)) +#else + if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0) +#endif + { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} + +/* + * Routines for handling mm_structs + */ +extern struct mm_struct * mm_alloc(void); + +extern struct mm_struct * start_lazy_tlb(void); +extern void end_lazy_tlb(struct mm_struct *mm); + +/* mmdrop drops the mm and the page tables */ +extern inline void FASTCALL(__mmdrop(struct mm_struct *)); +static inline void mmdrop(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop(mm); +} + +/* mmput gets rid of the mappings and all user-space */ +extern void mmput(struct mm_struct *); +/* Remove the current tasks stale references to the old mm_struct */ +extern void mm_release(void); + +/* + * Routines for handling the fd arrays + */ +extern struct file ** alloc_fd_array(int); +extern int expand_fd_array(struct files_struct *, int nr); +extern void free_fd_array(struct file **, int); + +extern fd_set *alloc_fdset(int); +extern int expand_fdset(struct files_struct *, int nr); +extern void free_fdset(fd_set *, int); + +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern void flush_thread(void); +extern void exit_thread(void); + +extern void exit_mm(struct task_struct *); +extern void exit_files(struct task_struct *); +extern void exit_sighand(struct task_struct *); + +extern void reparent_to_init(void); +extern void daemonize(void); + +extern int do_execve(char *, char **, char **, struct pt_regs *); +extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); + +extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); +extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); + +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); + +#define __wait_event(wq, condition) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + schedule(); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __wait_event(wq, condition); \ +} while (0) + +#define __wait_event_interruptible(wq, condition, ret) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (!signal_pending(current)) { \ + schedule(); \ + continue; \ + } \ + ret = -ERESTARTSYS; \ + break; \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __wait_event_interruptible(wq, condition, __ret); \ + __ret; \ +}) + +#define REMOVE_LINKS(p) do { \ + (p)->next_task->prev_task = (p)->prev_task; \ + (p)->prev_task->next_task = (p)->next_task; \ + if ((p)->p_osptr) \ + (p)->p_osptr->p_ysptr = (p)->p_ysptr; \ + if ((p)->p_ysptr) \ + (p)->p_ysptr->p_osptr = (p)->p_osptr; \ + else \ + (p)->p_pptr->p_cptr = (p)->p_osptr; \ + } while (0) + +#define SET_LINKS(p) do { \ + (p)->next_task = &init_task; \ + (p)->prev_task = init_task.prev_task; \ + init_task.prev_task->next_task = (p); \ + init_task.prev_task = (p); \ + (p)->p_ysptr = NULL; \ + if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \ + (p)->p_osptr->p_ysptr = p; \ + (p)->p_pptr->p_cptr = p; \ + } while (0) + +#define for_each_task(p) \ + for (p = &init_task ; (p = p->next_task) != &init_task ; ) + +#define for_each_thread(task) \ + for (task = next_thread(current) ; task != current ; task = next_thread(task)) + +#define next_thread(p) \ + list_entry((p)->thread_group.next, struct task_struct, thread_group) + +#define thread_group_leader(p) (p->pid == p->tgid) + +static inline void del_from_runqueue(struct task_struct * p) +{ + nr_running--; + p->sleep_time = jiffies; + list_del(&p->run_list); + p->run_list.next = NULL; +} + +static inline int task_on_runqueue(struct task_struct *p) +{ + return (p->run_list.next != NULL); +} + +static inline void unhash_process(struct task_struct *p) +{ + if (task_on_runqueue(p)) + out_of_line_bug(); + write_lock_irq(&tasklist_lock); + nr_threads--; + unhash_pid(p); + REMOVE_LINKS(p); + list_del(&p->thread_group); + write_unlock_irq(&tasklist_lock); +} + +/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + +/* write full pathname into buffer and return start of pathname */ +static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, + char *buf, int buflen) +{ + char *res; + struct vfsmount *rootmnt; + struct dentry *root; + read_lock(¤t->fs->lock); + rootmnt = mntget(current->fs->rootmnt); + root = dget(current->fs->root); + read_unlock(¤t->fs->lock); + spin_lock(&dcache_lock); + res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen); + spin_unlock(&dcache_lock); + dput(root); + mntput(rootmnt); + return res; +} + +static inline int need_resched(void) +{ + return (unlikely(current->need_resched)); +} + +extern void __cond_resched(void); +static inline void cond_resched(void) +{ + if (need_resched()) + __cond_resched(); +} + +#endif /* __KERNEL__ */ +#endif diff --git a/xenolinux-2.4.24-sparse/include/linux/timer.h b/xenolinux-2.4.24-sparse/include/linux/timer.h new file mode 100644 index 0000000000..238083218f --- /dev/null +++ b/xenolinux-2.4.24-sparse/include/linux/timer.h @@ -0,0 +1,77 @@ +#ifndef _LINUX_TIMER_H +#define _LINUX_TIMER_H + +#include <linux/config.h> +#include <linux/list.h> + +/* + * In Linux 2.4, static timers have been removed from the kernel. + * Timers may be dynamically created and destroyed, and should be initialized + * by a call to init_timer() upon creation. + * + * The "data" field enables use of a common timeout function for several + * timeouts. You can use this field to distinguish between the different + * invocations. + */ +struct timer_list { + struct list_head list; + unsigned long expires; + unsigned long data; + void (*function)(unsigned long); +}; + +extern void add_timer(struct timer_list * timer); +extern int del_timer(struct timer_list * timer); +#ifdef CONFIG_NO_IDLE_HZ +extern struct timer_list *next_timer_event(void); +#endif + +#ifdef CONFIG_SMP +extern int del_timer_sync(struct timer_list * timer); +extern void sync_timers(void); +#else +#define del_timer_sync(t) del_timer(t) +#define sync_timers() do { } while (0) +#endif + +/* + * mod_timer is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * mod_timer(a,b) is equivalent to del_timer(a); a->expires = b; add_timer(a). + * If the timer is known to be not pending (ie, in the handler), mod_timer + * is less efficient than a->expires = b; add_timer(a). + */ +int mod_timer(struct timer_list *timer, unsigned long expires); + +extern void it_real_fn(unsigned long); + +static inline void init_timer(struct timer_list * timer) +{ + timer->list.next = timer->list.prev = NULL; +} + +static inline int timer_pending (const struct timer_list * timer) +{ + return timer->list.next != NULL; +} + +/* + * These inlines deal with timer wrapping correctly. You are + * strongly encouraged to use them + * 1. Because people otherwise forget + * 2. Because if the timer wrap changes in future you wont have to + * alter your driver code. + * + * time_after(a,b) returns true if the time a is after time b. + * + * Do this with "<0" and ">=0" to only test the sign of the result. A + * good compiler would generate better code (and a really good compiler + * wouldn't care). Gcc is currently neither. + */ +#define time_after(a,b) ((long)(b) - (long)(a) < 0) +#define time_before(a,b) time_after(b,a) + +#define time_after_eq(a,b) ((long)(a) - (long)(b) >= 0) +#define time_before_eq(a,b) time_after_eq(b,a) + +#endif diff --git a/xenolinux-2.4.24-sparse/kernel/panic.c b/xenolinux-2.4.24-sparse/kernel/panic.c index 871ea67fee..6ab619a607 100644 --- a/xenolinux-2.4.24-sparse/kernel/panic.c +++ b/xenolinux-2.4.24-sparse/kernel/panic.c @@ -110,7 +110,8 @@ NORET_TYPE void panic(const char * fmt, ...) #endif CHECK_EMERGENCY_SYNC #if defined(CONFIG_XENO) - HYPERVISOR_exit(); + HYPERVISOR_console_write(buf, strlen(buf)); + HYPERVISOR_exit(); #endif } } diff --git a/xenolinux-2.4.24-sparse/kernel/timer.c b/xenolinux-2.4.24-sparse/kernel/timer.c new file mode 100644 index 0000000000..567794ab26 --- /dev/null +++ b/xenolinux-2.4.24-sparse/kernel/timer.c @@ -0,0 +1,968 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers, kernel timekeeping, basic process system calls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/timex.h> +#include <linux/delay.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +#include <asm/uaccess.h> + +/* + * Timekeeping variables + */ + +long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */ + +/* The current time */ +struct timeval xtime __attribute__ ((aligned (16))); + +/* Don't completely fail for HZ > 500. */ +int tickadj = 500/HZ ? : 1; /* microsecs */ + +DECLARE_TASK_QUEUE(tq_timer); +DECLARE_TASK_QUEUE(tq_immediate); + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +long time_offset; /* time adjustment (us) */ +long time_constant = 2; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +long time_phase; /* phase offset (scaled us) */ +long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; + /* frequency offset (scaled ppm)*/ +long time_adj; /* tick adjust (scaled 1 / HZ) */ +long time_reftime; /* time at last adjustment (s) */ + +long time_adjust; +long time_adjust_step; + +unsigned long event; + +extern int do_setitimer(int, struct itimerval *, struct itimerval *); + +unsigned long volatile jiffies; + +unsigned int * prof_buffer; +unsigned long prof_len; +unsigned long prof_shift; + +/* + * Event timer code + */ +#define TVN_BITS 6 +#define TVR_BITS 8 +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +struct timer_vec { + int index; + struct list_head vec[TVN_SIZE]; +}; + +struct timer_vec_root { + int index; + struct list_head vec[TVR_SIZE]; +}; + +static struct timer_vec tv5; +static struct timer_vec tv4; +static struct timer_vec tv3; +static struct timer_vec tv2; +static struct timer_vec_root tv1; + +static struct timer_vec * const tvecs[] = { + (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 +}; + +static struct list_head * run_timer_list_running; + +#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) + +void init_timervecs (void) +{ + int i; + + for (i = 0; i < TVN_SIZE; i++) { + INIT_LIST_HEAD(tv5.vec + i); + INIT_LIST_HEAD(tv4.vec + i); + INIT_LIST_HEAD(tv3.vec + i); + INIT_LIST_HEAD(tv2.vec + i); + } + for (i = 0; i < TVR_SIZE; i++) + INIT_LIST_HEAD(tv1.vec + i); +} + +static unsigned long timer_jiffies; + +static inline void internal_add_timer(struct timer_list *timer) +{ + /* + * must be cli-ed when calling this + */ + unsigned long expires = timer->expires; + unsigned long idx = expires - timer_jiffies; + struct list_head * vec; + + if (run_timer_list_running) + vec = run_timer_list_running; + else if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + vec = tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + vec = tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + vec = tv4.vec + i; + } else if ((signed long) idx < 0) { + /* can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + vec = tv1.vec + tv1.index; + } else if (idx <= 0xffffffffUL) { + int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = tv5.vec + i; + } else { + /* Can only get here on architectures with 64-bit jiffies */ + INIT_LIST_HEAD(&timer->list); + return; + } + /* + * Timers are FIFO! + */ + list_add(&timer->list, vec->prev); +} + +/* Initialize both explicitly - let's try to have them in the same cache line */ +spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_SMP +volatile struct timer_list * volatile running_timer; +#define timer_enter(t) do { running_timer = t; mb(); } while (0) +#define timer_exit() do { running_timer = NULL; } while (0) +#define timer_is_running(t) (running_timer == t) +#define timer_synchronize(t) while (timer_is_running(t)) barrier() +#else +#define timer_enter(t) do { } while (0) +#define timer_exit() do { } while (0) +#endif + +void add_timer(struct timer_list *timer) +{ + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + if (timer_pending(timer)) + goto bug; + internal_add_timer(timer); + spin_unlock_irqrestore(&timerlist_lock, flags); + return; +bug: + spin_unlock_irqrestore(&timerlist_lock, flags); + printk("bug: kernel timer added twice at %p.\n", + __builtin_return_address(0)); +} + +static inline int detach_timer (struct timer_list *timer) +{ + if (!timer_pending(timer)) + return 0; + list_del(&timer->list); + return 1; +} + +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + timer->expires = expires; + ret = detach_timer(timer); + internal_add_timer(timer); + spin_unlock_irqrestore(&timerlist_lock, flags); + return ret; +} + +int del_timer(struct timer_list * timer) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&timerlist_lock, flags); + ret = detach_timer(timer); + timer->list.next = timer->list.prev = NULL; + spin_unlock_irqrestore(&timerlist_lock, flags); + return ret; +} + +#ifdef CONFIG_SMP +void sync_timers(void) +{ + spin_unlock_wait(&global_bh_lock); +} + +/* + * SMP specific function to delete periodic timer. + * Caller must disable by some means restarting the timer + * for new. Upon exit the timer is not queued and handler is not running + * on any CPU. It returns number of times, which timer was deleted + * (for reference counting). + */ + +int del_timer_sync(struct timer_list * timer) +{ + int ret = 0; + + for (;;) { + unsigned long flags; + int running; + + spin_lock_irqsave(&timerlist_lock, flags); + ret += detach_timer(timer); + timer->list.next = timer->list.prev = 0; + running = timer_is_running(timer); + spin_unlock_irqrestore(&timerlist_lock, flags); + + if (!running) + break; + + timer_synchronize(timer); + } + + return ret; +} +#endif + + +static inline void cascade_timers(struct timer_vec *tv) +{ + /* cascade all the timers from tv up one level */ + struct list_head *head, *curr, *next; + + head = tv->vec + tv->index; + curr = head->next; + /* + * We are removing _all_ timers from the list, so we don't have to + * detach them individually, just clear the list afterwards. + */ + while (curr != head) { + struct timer_list *tmp; + + tmp = list_entry(curr, struct timer_list, list); + next = curr->next; + list_del(curr); // not needed + internal_add_timer(tmp); + curr = next; + } + INIT_LIST_HEAD(head); + tv->index = (tv->index + 1) & TVN_MASK; +} + +static inline void run_timer_list(void) +{ + spin_lock_irq(&timerlist_lock); + while ((long)(jiffies - timer_jiffies) >= 0) { + LIST_HEAD(queued); + struct list_head *head, *curr; + if (!tv1.index) { + int n = 1; + do { + cascade_timers(tvecs[n]); + } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); + } + run_timer_list_running = &queued; +repeat: + head = tv1.vec + tv1.index; + curr = head->next; + if (curr != head) { + struct timer_list *timer; + void (*fn)(unsigned long); + unsigned long data; + + timer = list_entry(curr, struct timer_list, list); + fn = timer->function; + data= timer->data; + + detach_timer(timer); + timer->list.next = timer->list.prev = NULL; + timer_enter(timer); + spin_unlock_irq(&timerlist_lock); + fn(data); + spin_lock_irq(&timerlist_lock); + timer_exit(); + goto repeat; + } + run_timer_list_running = NULL; + ++timer_jiffies; + tv1.index = (tv1.index + 1) & TVR_MASK; + + curr = queued.next; + while (curr != &queued) { + struct timer_list *timer; + + timer = list_entry(curr, struct timer_list, list); + curr = curr->next; + internal_add_timer(timer); + } + } + spin_unlock_irq(&timerlist_lock); +} + +#ifdef CONFIG_NO_IDLE_HZ +/* + * Find out when the next timer event is due to happen. This + * is used on S/390 to stop all activity when all cpus are idle. + * And in XenoLinux to achieve the same. + * The timerlist_lock must be acquired before calling this function. + */ +struct timer_list *next_timer_event(void) +{ + struct timer_list *nte, *tmp; + struct list_head *lst; + int i, j; + + /* Look for the next timer event in tv1. */ + i = 0; + j = tvecs[0]->index; + do { + struct list_head *head = tvecs[0]->vec + j; + if (!list_empty(head)) { + nte = list_entry(head->next, struct timer_list, list); + goto found; + } + j = (j + 1) & TVR_MASK; + } while (j != tv1.index); + + /* No event found in tv1. Check tv2-tv5. */ + for (i = 1; i < NOOF_TVECS; i++) { + j = tvecs[i]->index; + do { + nte = NULL; + list_for_each(lst, tvecs[i]->vec + j) { + tmp = list_entry(lst, struct timer_list, list); + if (nte == NULL || + time_before(tmp->expires, nte->expires)) + nte = tmp; + } + if (nte) + goto found; + j = (j + 1) & TVN_MASK; + } while (j != tvecs[i]->index); + } + return NULL; +found: + /* Found timer event in tvecs[i]->vec[j] */ + if (j < tvecs[i]->index && i < NOOF_TVECS-1) { + /* + * The search wrapped. We need to look at the next list + * from tvecs[i+1] that would cascade into tvecs[i]. + */ + list_for_each(lst, tvecs[i+1]->vec+tvecs[i+1]->index) { + tmp = list_entry(lst, struct timer_list, list); + if (time_before(tmp->expires, nte->expires)) + nte = tmp; + } + } + return nte; +} +#endif + +spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED; + +void tqueue_bh(void) +{ + run_task_queue(&tq_timer); +} + +void immediate_bh(void) +{ + run_task_queue(&tq_immediate); +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + */ +static void second_overflow(void) +{ + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if ( time_maxerror > NTP_PHASE_LIMIT ) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. + */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + time_state = TIME_OOP; + printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + } + break; + + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + time_state = TIME_WAIT; + printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. + */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if HZ == 100 + /* Compensate for (HZ==100) != (1 << SHIFT_HZ). + * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) + */ + if (time_adj < 0) + time_adj -= (-time_adj >> 2) + (-time_adj >> 5); + else + time_adj += (time_adj >> 2) + (time_adj >> 5); +#endif +} + +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + if ( (time_adjust_step = time_adjust) != 0 ) { + /* We are doing an adjtime thing. + * + * Prepare time_adjust_step to be within bounds. + * Note that a positive time_adjust means we want the clock + * to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + if (time_adjust > tickadj) + time_adjust_step = tickadj; + else if (time_adjust < -tickadj) + time_adjust_step = -tickadj; + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; + } + xtime.tv_usec += tick + time_adjust_step; + /* + * Advance the phase, once it gets to one microsecond, then + * advance the tick more. + */ + time_phase += time_adj; + if (time_phase <= -FINEUSEC) { + long ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + xtime.tv_usec -= ltemp; + } + else if (time_phase >= FINEUSEC) { + long ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + xtime.tv_usec += ltemp; + } +} + +/* + * Using a loop looks inefficient, but "ticks" is + * usually just one (we shouldn't be losing ticks, + * we're doing this this way mainly for interrupt + * latency reasons, not because we think we'll + * have lots of lost timer ticks + */ +static void update_wall_time(unsigned long ticks) +{ + do { + ticks--; + update_wall_time_one_tick(); + } while (ticks); + + while (xtime.tv_usec >= 1000000) { + xtime.tv_usec -= 1000000; + xtime.tv_sec++; + second_overflow(); + } +} + +static inline void do_process_times(struct task_struct *p, + unsigned long user, unsigned long system) +{ + unsigned long psecs; + + psecs = (p->times.tms_utime += user); + psecs += (p->times.tms_stime += system); + if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) { + /* Send SIGXCPU every second.. */ + if (!(psecs % HZ)) + send_sig(SIGXCPU, p, 1); + /* and SIGKILL when we go over max.. */ + if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max) + send_sig(SIGKILL, p, 1); + } +} + +static inline void do_it_virt(struct task_struct * p, unsigned long ticks) +{ + unsigned long it_virt = p->it_virt_value; + + if (it_virt) { + it_virt -= ticks; + if (!it_virt) { + it_virt = p->it_virt_incr; + send_sig(SIGVTALRM, p, 1); + } + p->it_virt_value = it_virt; + } +} + +static inline void do_it_prof(struct task_struct *p) +{ + unsigned long it_prof = p->it_prof_value; + + if (it_prof) { + if (--it_prof == 0) { + it_prof = p->it_prof_incr; + send_sig(SIGPROF, p, 1); + } + p->it_prof_value = it_prof; + } +} + +void update_one_process(struct task_struct *p, unsigned long user, + unsigned long system, int cpu) +{ + p->per_cpu_utime[cpu] += user; + p->per_cpu_stime[cpu] += system; + do_process_times(p, user, system); + do_it_virt(p, user); + do_it_prof(p); +} + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(), system = user_tick ^ 1; + + update_one_process(p, user_tick, system, cpu); + if (p->pid) { + if (--p->counter <= 0) { + p->counter = 0; + /* + * SCHED_FIFO is priority preemption, so this is + * not the place to decide whether to reschedule a + * SCHED_FIFO task or not - Bhavesh Davda + */ + if (p->policy != SCHED_FIFO) { + p->need_resched = 1; + } + } + if (p->nice > 0) + kstat.per_cpu_nice[cpu] += user_tick; + else + kstat.per_cpu_user[cpu] += user_tick; + kstat.per_cpu_system[cpu] += system; + } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += system; +} + +/* + * Called from the timer interrupt handler to charge a couple of ticks + * to the current process. + */ +void update_process_times_us(int user_ticks, int system_ticks) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(); + + update_one_process(p, user_ticks, system_ticks, cpu); + if (p->pid) { + p->counter -= user_ticks + system_ticks; + if (p->counter <= 0) { + p->counter = 0; + p->need_resched = 1; + } + if (p->nice > 0) + kstat.per_cpu_nice[cpu] += user_ticks; + else + kstat.per_cpu_user[cpu] += user_ticks; + kstat.per_cpu_system[cpu] += system_ticks; + } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += system_ticks; +} + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_tasks(void) +{ + struct task_struct *p; + unsigned long nr = 0; + + read_lock(&tasklist_lock); + for_each_task(p) { + if ((p->state == TASK_RUNNING || + (p->state & TASK_UNINTERRUPTIBLE))) + nr += FIXED_1; + } + read_unlock(&tasklist_lock); + return nr; +} + +/* + * Hmm.. Changed this, as the GNU make sources (load.c) seems to + * imply that avenrun[] is the standard name for this kind of thing. + * Nothing else seems to be standardized: the fractional size etc + * all seem to differ on different machines. + */ +unsigned long avenrun[3]; + +static inline void calc_load(unsigned long ticks) +{ + unsigned long active_tasks; /* fixed-point */ + static int count = LOAD_FREQ; + + count -= ticks; + while (count < 0) { + count += LOAD_FREQ; + active_tasks = count_active_tasks(); + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + } +} + +/* jiffies at the most recent update of wall time */ +unsigned long wall_jiffies; + +/* + * This spinlock protect us from races in SMP while playing with xtime. -arca + */ +rwlock_t xtime_lock = RW_LOCK_UNLOCKED; + +static inline void update_times(void) +{ + unsigned long ticks; + + /* + * update_times() is run from the raw timer_bh handler so we + * just know that the irqs are locally enabled and so we don't + * need to save/restore the flags of the local CPU here. -arca + */ + write_lock_irq(&xtime_lock); + vxtime_lock(); + + ticks = jiffies - wall_jiffies; + if (ticks) { + wall_jiffies += ticks; + update_wall_time(ticks); + } + vxtime_unlock(); + write_unlock_irq(&xtime_lock); + calc_load(ticks); +} + +void timer_bh(void) +{ + update_times(); + run_timer_list(); +} + +void do_timer(struct pt_regs *regs) +{ + (*(unsigned long *)&jiffies)++; +#ifndef CONFIG_SMP + /* SMP process accounting uses the local APIC timer */ + + update_process_times(user_mode(regs)); +#endif + mark_bh(TIMER_BH); + if (TQ_ACTIVE(tq_timer)) + mark_bh(TQUEUE_BH); +} + +void do_timer_ticks(int ticks) +{ + (*(unsigned long *)&jiffies) += ticks; + mark_bh(TIMER_BH); + if (TQ_ACTIVE(tq_timer)) + mark_bh(TQUEUE_BH); +} + +#if !defined(__alpha__) && !defined(__ia64__) + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +asmlinkage unsigned long sys_alarm(unsigned int seconds) +{ + struct itimerval it_new, it_old; + unsigned int oldalarm; + + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + do_setitimer(ITIMER_REAL, &it_new, &it_old); + oldalarm = it_old.it_value.tv_sec; + /* ehhh.. We can't return 0 if we have an alarm pending.. */ + /* And we'd better return too much than too little anyway */ + if (it_old.it_value.tv_usec) + oldalarm++; + return oldalarm; +} + +#endif + +#ifndef __alpha__ + +/* + * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this + * should be moved into arch/i386 instead? + */ + +/** + * sys_getpid - return the thread group id of the current process + * + * Note, despite the name, this returns the tgid not the pid. The tgid and + * the pid are identical unless CLONE_THREAD was specified on clone() in + * which case the tgid is the same in all threads of the same group. + * + * This is SMP safe as current->tgid does not change. + */ +asmlinkage long sys_getpid(void) +{ + return current->tgid; +} + +/* + * This is not strictly SMP safe: p_opptr could change + * from under us. However, rather than getting any lock + * we can use an optimistic algorithm: get the parent + * pid, and go back and check that the parent is still + * the same. If it has changed (which is extremely unlikely + * indeed), we just try again.. + * + * NOTE! This depends on the fact that even if we _do_ + * get an old value of "parent", we can happily dereference + * the pointer: we just can't necessarily trust the result + * until we know that the parent pointer is valid. + * + * The "mb()" macro is a memory barrier - a synchronizing + * event. It also makes sure that gcc doesn't optimize + * away the necessary memory references.. The barrier doesn't + * have to have all that strong semantics: on x86 we don't + * really require a synchronizing instruction, for example. + * The barrier is more important for code generation than + * for any real memory ordering semantics (even if there is + * a small window for a race, using the old pointer is + * harmless for a while). + */ +asmlinkage long sys_getppid(void) +{ + int pid; + struct task_struct * me = current; + struct task_struct * parent; + + parent = me->p_opptr; + for (;;) { + pid = parent->pid; +#if CONFIG_SMP +{ + struct task_struct *old = parent; + mb(); + parent = me->p_opptr; + if (old != parent) + continue; +} +#endif + break; + } + return pid; +} + +asmlinkage long sys_getuid(void) +{ + /* Only we change this so SMP safe */ + return current->uid; +} + +asmlinkage long sys_geteuid(void) +{ + /* Only we change this so SMP safe */ + return current->euid; +} + +asmlinkage long sys_getgid(void) +{ + /* Only we change this so SMP safe */ + return current->gid; +} + +asmlinkage long sys_getegid(void) +{ + /* Only we change this so SMP safe */ + return current->egid; +} + +#endif + +/* Thread ID - the internal kernel "pid" */ +asmlinkage long sys_gettid(void) +{ + return current->pid; +} + +asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) +{ + struct timespec t; + unsigned long expire; + + if(copy_from_user(&t, rqtp, sizeof(struct timespec))) + return -EFAULT; + + if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0) + return -EINVAL; + + + if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && + current->policy != SCHED_OTHER) + { + /* + * Short delay requests up to 2 ms will be handled with + * high precision by a busy wait for all real-time processes. + * + * Its important on SMP not to do this holding locks. + */ + udelay((t.tv_nsec + 999) / 1000); + return 0; + } + + expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); + + current->state = TASK_INTERRUPTIBLE; + expire = schedule_timeout(expire); + + if (expire) { + if (rmtp) { + jiffies_to_timespec(expire, &t); + if (copy_to_user(rmtp, &t, sizeof(struct timespec))) + return -EFAULT; + } + return -EINTR; + } + return 0; +} + |