
Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.

Let it rip!
commit 1da177e4c3
Author: Linus Torvalds
Date:   2005-04-16 15:20:36 -07:00

17291 changed files with 6718755 additions and 0 deletions

kernel/Makefile (new file, 53 lines)

@@ -0,0 +1,53 @@
#
# Makefile for the linux kernel.
#
obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o intermodule.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKCONFIG_PROC) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
# me. I suspect most platforms don't need this, but until we know that for sure
# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
# to get a correct value for the wait-channel (WCHAN in ps). --davidm
CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
endif
$(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
$(obj)/config_data.gz: .config FORCE
$(call if_changed,gzip)
quiet_cmd_ikconfiggz = IKCFG $@
cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call if_changed,ikconfiggz)
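
The ikconfig rules above embed the gzipped .config into the kernel image as a C string bracketed by MAGIC_START and MAGIC_END, with scripts/bin2c doing the byte-to-string-literal conversion. A minimal sketch of such a filter (the real scripts/bin2c may differ in formatting details):

	#include <stdio.h>

	/* Read raw bytes on stdin and emit them as C string-literal lines;
	 * octal escapes avoid the "\x eats trailing hex digits" pitfall,
	 * and adjacent literals concatenate into one initializer. */
	int main(void)
	{
		int ch, col = 0;

		printf("\"");
		while ((ch = getchar()) != EOF) {
			printf("\\%03o", ch);
			if (++col == 16) {
				printf("\"\n\"");
				col = 0;
			}
		}
		printf("\"\n");
		return 0;
	}

Data embedded this way can then be recovered at runtime from /proc/config*, as the comment above notes.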

kernel/acct.c (new file, 561 lines)

@@ -0,0 +1,561 @@
/*
* linux/kernel/acct.c
*
* BSD Process Accounting for Linux
*
* Author: Marco van Wieringen <mvw@planets.elm.net>
*
* Some code based on ideas and code from:
* Thomas K. Dyas <tdyas@eden.rutgers.edu>
*
* This file implements BSD-style process accounting. Whenever any
* process exits, an accounting record of type "struct acct" is
* written to the file specified with the acct() system call. It is
* up to user-level programs to do useful things with the accounting
* log. The kernel just provides the raw accounting information.
*
* (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
*
* Plugged two leaks. 1) It didn't return acct_file into the free_filps if
* the file happened to be read-only. 2) If the accounting was suspended
* due to the lack of space it happily allowed it to be reopened and completely
* lost the old acct_file. 3/10/98, Al Viro.
*
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
* Fixed a nasty interaction with sys_umount(). If the accounting
* was suspended we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
* CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
* unless we are messing with the root. In that case we are getting a
* real mess with do_remount_sb(). 9/11/98, AV.
*
* Fixed a bunch of races (and pair of leaks). Probably not the best way,
* but this one obviously doesn't introduce deadlocks. Later. BTW, found
* one race (and leak) in BSD implementation.
* OK, that's better. ANOTHER race and leak in BSD variant. There always
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
* ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/file.h>
#include <linux/tty.h>
#include <linux/security.h>
#include <linux/vfs.h>
#include <linux/jiffies.h>
#include <linux/times.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
/*
* These constants control the amount of free space that suspends and
* resumes the process accounting system, and the time delay between
* each check.
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
/*
* External references and all of the globals.
*/
static void do_acct_process(long, struct file *);
/*
* This structure is used so that all the data protected by lock
* can be placed in the same cache line as the lock. This primes
* the cache line to have the data after getting the lock.
*/
struct acct_glbs {
spinlock_t lock;
volatile int active;
volatile int needcheck;
struct file *file;
struct timer_list timer;
};
static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
/*
* Called whenever the timer says to check the free space.
*/
static void acct_timeout(unsigned long unused)
{
acct_globals.needcheck = 1;
}
/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct file *file)
{
struct kstatfs sbuf;
int res;
int act;
sector_t resume;
sector_t suspend;
spin_lock(&acct_globals.lock);
res = acct_globals.active;
if (!file || !acct_globals.needcheck)
goto out;
spin_unlock(&acct_globals.lock);
/* May block */
if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
sector_div(suspend, 100);
sector_div(resume, 100);
if (sbuf.f_bavail <= suspend)
act = -1;
else if (sbuf.f_bavail >= resume)
act = 1;
else
act = 0;
/*
* If some joker switched acct_globals.file under us we'd better be
* silent and _not_ touch anything.
*/
spin_lock(&acct_globals.lock);
if (file != acct_globals.file) {
if (act)
res = act>0;
goto out;
}
if (acct_globals.active) {
if (act < 0) {
acct_globals.active = 0;
printk(KERN_INFO "Process accounting paused\n");
}
} else {
if (act > 0) {
acct_globals.active = 1;
printk(KERN_INFO "Process accounting resumed\n");
}
}
del_timer(&acct_globals.timer);
acct_globals.needcheck = 0;
acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
add_timer(&acct_globals.timer);
res = acct_globals.active;
out:
spin_unlock(&acct_globals.lock);
return res;
}
/*
* Close the old accounting file (if currently open) and then replace
* it with file (if non-NULL).
*
* NOTE: acct_globals.lock MUST be held on entry and exit.
*/
static void acct_file_reopen(struct file *file)
{
struct file *old_acct = NULL;
if (acct_globals.file) {
old_acct = acct_globals.file;
del_timer(&acct_globals.timer);
acct_globals.active = 0;
acct_globals.needcheck = 0;
acct_globals.file = NULL;
}
if (file) {
acct_globals.file = file;
acct_globals.needcheck = 0;
acct_globals.active = 1;
/* It's been deleted if it was used before so this is safe */
init_timer(&acct_globals.timer);
acct_globals.timer.function = acct_timeout;
acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
add_timer(&acct_globals.timer);
}
if (old_acct) {
spin_unlock(&acct_globals.lock);
do_acct_process(0, old_acct);
filp_close(old_acct, NULL);
spin_lock(&acct_globals.lock);
}
}
/*
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
*/
asmlinkage long sys_acct(const char __user *name)
{
struct file *file = NULL;
char *tmp;
int error;
if (!capable(CAP_SYS_PACCT))
return -EPERM;
if (name) {
tmp = getname(name);
if (IS_ERR(tmp)) {
return (PTR_ERR(tmp));
}
/* Difference from BSD - they don't do O_APPEND */
file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
putname(tmp);
if (IS_ERR(file)) {
return (PTR_ERR(file));
}
if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
filp_close(file, NULL);
return (-EACCES);
}
if (!file->f_op->write) {
filp_close(file, NULL);
return (-EIO);
}
}
error = security_acct(file);
if (error) {
if (file)
filp_close(file, NULL);
return error;
}
spin_lock(&acct_globals.lock);
acct_file_reopen(file);
spin_unlock(&acct_globals.lock);
return (0);
}
/*
* If accounting is turned on for a file in the filesystem pointed
* to by sb, turn accounting off.
*/
void acct_auto_close(struct super_block *sb)
{
spin_lock(&acct_globals.lock);
if (acct_globals.file &&
acct_globals.file->f_dentry->d_inode->i_sb == sb) {
acct_file_reopen((struct file *)NULL);
}
spin_unlock(&acct_globals.lock);
}
/*
* encode an unsigned long into a comp_t
*
* This routine has been adapted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
* is a 13-bit fraction with a 3-bit (base 8) exponent.
*/
#define MANTSIZE 13 /* 13 bit mantissa. */
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
static comp_t encode_comp_t(unsigned long value)
{
int exp, rnd;
exp = rnd = 0;
while (value > MAXFRACT) {
rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
exp++;
}
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT)) {
value >>= EXPSIZE;
exp++;
}
/*
* Clean it up and polish it off.
*/
exp <<= MANTSIZE; /* Shift the exponent into place */
exp += value; /* and add on the mantissa. */
return exp;
}
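/*
 * For reference, decoding reverses the steps above: the low MANTSIZE bits
 * hold the mantissa and the top EXPSIZE bits count how many base-8 shifts
 * were applied. A decoder (sketch only, nothing in this file needs it):
 *
 *	static unsigned long decode_comp_t(comp_t c)
 *	{
 *		return (unsigned long)(c & MAXFRACT) << ((c >> MANTSIZE) * EXPSIZE);
 *	}
 */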
#if ACCT_VERSION==1 || ACCT_VERSION==2
/*
* encode a u64 into a comp2_t (24 bits)
*
* Format: 5 bit base 2 exponent, 20 bits mantissa.
* The leading bit of the mantissa is not stored, but implied for
* non-zero exponents.
* Largest encodable value is 50 bits.
*/
#define MANTSIZE2 20 /* 20 bit mantissa. */
#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
{
int exp, rnd;
exp = (value > (MAXFRACT2>>1));
rnd = 0;
while (value > MAXFRACT2) {
rnd = value & 1;
value >>= 1;
exp++;
}
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT2)) {
value >>= 1;
exp++;
}
if (exp > MAXEXP2) {
/* Overflow. Return largest representable number instead. */
return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
} else {
return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
}
#endif
#if ACCT_VERSION==3
/*
* encode a u64 into a 32-bit IEEE float
*/
static u32 encode_float(u64 value)
{
unsigned exp = 190;
unsigned u;
if (value==0) return 0;
while ((s64)value > 0){
value <<= 1;
exp--;
}
u = (u32)(value >> 40) & 0x7fffffu;
return u | (exp << 23);
}
#endif
/*
* Write an accounting entry for an exiting process
*
* The acct_process() call is the workhorse of the process
* accounting system. The struct acct is built here and then written
* into the accounting file. This function should only be called from
* do_exit().
*/
/*
* do_acct_process does all the actual work. Caller holds the reference to file.
*/
static void do_acct_process(long exitcode, struct file *file)
{
acct_t ac;
mm_segment_t fs;
unsigned long vsize;
unsigned long flim;
u64 elapsed;
u64 run_time;
struct timespec uptime;
/*
* First check to see if there is enough free space to continue
* the process accounting system.
*/
if (!check_free_space(file))
return;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
memset((caddr_t)&ac, 0, sizeof(acct_t));
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
/* calculate run_time in nsec */
do_posix_clock_monotonic_gettime(&uptime);
run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
+ current->start_time.tv_nsec;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION==3
ac.ac_etime = encode_float(elapsed);
#else
ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
(unsigned long) elapsed : (unsigned long) -1l);
#endif
#if ACCT_VERSION==1 || ACCT_VERSION==2
{
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
ac.ac_etime_hi = etime >> 16;
ac.ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
ac.ac_btime = xtime.tv_sec - elapsed;
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
current->signal->utime +
current->group_leader->utime));
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
current->signal->stime +
current->group_leader->stime));
/* we really need to bite the bullet and change layout */
ac.ac_uid = current->uid;
ac.ac_gid = current->gid;
#if ACCT_VERSION==2
ac.ac_ahz = AHZ;
#endif
#if ACCT_VERSION==1 || ACCT_VERSION==2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = current->uid;
ac.ac_gid16 = current->gid;
#endif
#if ACCT_VERSION==3
ac.ac_pid = current->tgid;
ac.ac_ppid = current->parent->tgid;
#endif
read_lock(&tasklist_lock); /* pin current->signal */
ac.ac_tty = current->signal->tty ?
old_encode_dev(tty_devnum(current->signal->tty)) : 0;
read_unlock(&tasklist_lock);
ac.ac_flag = 0;
if (current->flags & PF_FORKNOEXEC)
ac.ac_flag |= AFORK;
if (current->flags & PF_SUPERPRIV)
ac.ac_flag |= ASU;
if (current->flags & PF_DUMPCORE)
ac.ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
ac.ac_flag |= AXSIG;
vsize = 0;
if (current->mm) {
struct vm_area_struct *vma;
down_read(&current->mm->mmap_sem);
vma = current->mm->mmap;
while (vma) {
vsize += vma->vm_end - vma->vm_start;
vma = vma->vm_next;
}
up_read(&current->mm->mmap_sem);
}
vsize = vsize / 1024;
ac.ac_mem = encode_comp_t(vsize);
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
ac.ac_minflt = encode_comp_t(current->signal->min_flt +
current->group_leader->min_flt);
ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
current->group_leader->maj_flt);
ac.ac_swaps = encode_comp_t(0);
ac.ac_exitcode = exitcode;
/*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
fs = get_fs();
set_fs(KERNEL_DS);
/*
* Accounting records are not subject to resource limits.
*/
flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
file->f_op->write(file, (char *)&ac,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
}
/*
* acct_process - now just a wrapper around do_acct_process
*/
void acct_process(long exitcode)
{
struct file *file = NULL;
/*
* accelerate the common fastpath:
*/
if (!acct_globals.file)
return;
spin_lock(&acct_globals.lock);
file = acct_globals.file;
if (unlikely(!file)) {
spin_unlock(&acct_globals.lock);
return;
}
get_file(file);
spin_unlock(&acct_globals.lock);
do_acct_process(exitcode, file);
fput(file);
}
/*
* acct_update_integrals
* - update mm integral fields in task_struct
*/
void acct_update_integrals(struct task_struct *tsk)
{
if (likely(tsk->mm)) {
long delta = tsk->stime - tsk->acct_stimexpd;
if (delta == 0)
return;
tsk->acct_stimexpd = tsk->stime;
tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
}
}
/*
* acct_clear_integrals
* - clear the mm integral fields in task_struct
*/
void acct_clear_integrals(struct task_struct *tsk)
{
if (tsk) {
tsk->acct_stimexpd = 0;
tsk->acct_rss_mem1 = 0;
tsk->acct_vm_mem1 = 0;
}
}
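
From userspace, the whole interface is the single acct(2) call implemented above; a minimal usage sketch (assuming the glibc wrapper from <unistd.h>, an existing log file, and CAP_SYS_PACCT):

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Start appending a struct acct record to this (pre-existing,
		 * regular, writable) file each time a process exits. */
		if (acct("/var/log/pacct") != 0) {
			perror("acct on");
			return 1;
		}

		/* ... exiting processes are recorded here ... */

		/* A NULL filename shuts accounting down again. */
		if (acct(NULL) != 0) {
			perror("acct off");
			return 1;
		}
		return 0;
	}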

kernel/audit.c (new file, 839 lines)

@@ -0,0 +1,839 @@
/* audit.c -- Auditing support -*- linux-c -*-
* Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
* System-call specific features have moved to auditsc.c
*
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Goals: 1) Integrate fully with SELinux.
* 2) Minimal run-time overhead:
* a) Minimal when syscall auditing is disabled (audit_enable=0).
* b) Small when syscall auditing is enabled and no audit record
* is generated (defer as much work as possible to record
* generation time):
* i) context is allocated,
* ii) names from getname are stored without a copy, and
* iii) inode information stored from path_lookup.
* 3) Ability to disable syscall auditing at boot time (audit=0).
* 4) Usable by other parts of the kernel (if audit_log* is called,
* then a syscall record will be generated automatically for the
* current syscall).
* 5) Netlink interface to user-space.
* 6) Support low-overhead kernel-based filtering to minimize the
* information that must be passed to user-space.
*
* Example user-space utilities: http://people.redhat.com/faith/audit/
*/
#include <linux/init.h>
#include <asm/atomic.h>
#include <asm/types.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/audit.h>
#include <net/sock.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
/* No auditing will take place until audit_initialized != 0.
* (Initialization happens after skb_init is called.) */
static int audit_initialized;
/* No syscall auditing will take place unless audit_enabled != 0. */
int audit_enabled;
/* Default state when kernel boots without any parameters. */
static int audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
static int audit_failure = AUDIT_FAIL_PRINTK;
/* If audit records are to be written to the netlink socket, audit_pid
* contains the (non-zero) pid. */
static int audit_pid;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
* audit records being dropped. */
static int audit_rate_limit;
/* Number of outstanding audit_buffers allowed. */
static int audit_backlog_limit = 64;
static atomic_t audit_backlog = ATOMIC_INIT(0);
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
2) out of memory in audit_log_move [alloc_skb]
3) suppressed due to audit_rate_limit
4) suppressed due to audit_backlog_limit
*/
static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
/* There are two lists of audit buffers. The txlist contains audit
* buffers that cannot be sent immediately to the netlink device because
* we are in an irq context (these are sent later in a tasklet).
*
* The second list is a list of pre-allocated audit buffers (if more
* than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
* being placed on the freelist). */
static DEFINE_SPINLOCK(audit_txlist_lock);
static DEFINE_SPINLOCK(audit_freelist_lock);
static int audit_freelist_count = 0;
static LIST_HEAD(audit_txlist);
static LIST_HEAD(audit_freelist);
/* There are three lists of rules -- one to search at task creation
* time, one to search at syscall entry time, and another to search at
* syscall exit time. */
static LIST_HEAD(audit_tsklist);
static LIST_HEAD(audit_entlist);
static LIST_HEAD(audit_extlist);
/* The netlink socket is only to be read by 1 CPU, which lets us assume
* that list additions and deletions never happen simultaneously in
* auditsc.c */
static DECLARE_MUTEX(audit_netlink_sem);
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
* should be at least that large. */
#define AUDIT_BUFSIZ 1024
/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
* audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
#define AUDIT_MAXFREE (2*NR_CPUS)
/* The audit_buffer is used when formatting an audit record. The caller
* locks briefly to get the record off the freelist or to allocate the
* buffer, and locks briefly to send the buffer to the netlink layer or
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
struct list_head list;
struct sk_buff_head sklist; /* formatted skbs ready to send */
struct audit_context *ctx; /* NULL or associated context */
int len; /* used area of tmp */
char tmp[AUDIT_BUFSIZ];
/* Pointer to header and contents */
struct nlmsghdr *nlh;
int total;
int type;
int pid;
int count; /* Times requeued */
};
void audit_set_type(struct audit_buffer *ab, int type)
{
ab->type = type;
}
struct audit_entry {
struct list_head list;
struct audit_rule rule;
};
static void audit_log_end_irq(struct audit_buffer *ab);
static void audit_log_end_fast(struct audit_buffer *ab);
static void audit_panic(const char *message)
{
switch (audit_failure)
{
case AUDIT_FAIL_SILENT:
break;
case AUDIT_FAIL_PRINTK:
printk(KERN_ERR "audit: %s\n", message);
break;
case AUDIT_FAIL_PANIC:
panic("audit: %s\n", message);
break;
}
}
static inline int audit_rate_check(void)
{
static unsigned long last_check = 0;
static int messages = 0;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
unsigned long elapsed;
int retval = 0;
if (!audit_rate_limit) return 1;
spin_lock_irqsave(&lock, flags);
if (++messages < audit_rate_limit) {
retval = 1;
} else {
now = jiffies;
elapsed = now - last_check;
if (elapsed > HZ) {
last_check = now;
messages = 0;
retval = 1;
}
}
spin_unlock_irqrestore(&lock, flags);
return retval;
}
/* Emit at least 1 message per second, even if audit_rate_check is
* throttling. */
void audit_log_lost(const char *message)
{
static unsigned long last_msg = 0;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
int print;
atomic_inc(&audit_lost);
print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
if (now - last_msg > HZ) {
print = 1;
last_msg = now;
}
spin_unlock_irqrestore(&lock, flags);
}
if (print) {
printk(KERN_WARNING
"audit: audit_lost=%d audit_backlog=%d"
" audit_rate_limit=%d audit_backlog_limit=%d\n",
atomic_read(&audit_lost),
atomic_read(&audit_backlog),
audit_rate_limit,
audit_backlog_limit);
audit_panic(message);
}
}
static int audit_set_rate_limit(int limit)
{
int old = audit_rate_limit;
audit_rate_limit = limit;
audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
audit_rate_limit, old);
return old;
}
static int audit_set_backlog_limit(int limit)
{
int old = audit_backlog_limit;
audit_backlog_limit = limit;
audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
audit_backlog_limit, old);
return old;
}
static int audit_set_enabled(int state)
{
int old = audit_enabled;
if (state != 0 && state != 1)
return -EINVAL;
audit_enabled = state;
audit_log(current->audit_context, "audit_enabled=%d old=%d",
audit_enabled, old);
return old;
}
static int audit_set_failure(int state)
{
int old = audit_failure;
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
audit_failure = state;
audit_log(current->audit_context, "audit_failure=%d old=%d",
audit_failure, old);
return old;
}
#ifdef CONFIG_NET
void audit_send_reply(int pid, int seq, int type, int done, int multi,
void *payload, int size)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
int len = NLMSG_SPACE(size);
void *data;
int flags = multi ? NLM_F_MULTI : 0;
int t = done ? NLMSG_DONE : type;
skb = alloc_skb(len, GFP_KERNEL);
if (!skb)
goto nlmsg_failure;
nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
nlh->nlmsg_flags = flags;
data = NLMSG_DATA(nlh);
memcpy(data, payload, size);
netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
return;
nlmsg_failure: /* Used by NLMSG_PUT */
if (skb)
kfree_skb(skb);
}
/*
* Check for appropriate CAP_AUDIT_ capabilities on incoming audit
* control messages.
*/
static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
{
int err = 0;
switch (msg_type) {
case AUDIT_GET:
case AUDIT_LIST:
case AUDIT_SET:
case AUDIT_ADD:
case AUDIT_DEL:
if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
err = -EINVAL;
}
return err;
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 uid, pid, seq;
void *data;
struct audit_status *status_get, status_set;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
if (err)
return err;
pid = NETLINK_CREDS(skb)->pid;
uid = NETLINK_CREDS(skb)->uid;
seq = nlh->nlmsg_seq;
data = NLMSG_DATA(nlh);
switch (msg_type) {
case AUDIT_GET:
status_set.enabled = audit_enabled;
status_set.failure = audit_failure;
status_set.pid = audit_pid;
status_set.rate_limit = audit_rate_limit;
status_set.backlog_limit = audit_backlog_limit;
status_set.lost = atomic_read(&audit_lost);
status_set.backlog = atomic_read(&audit_backlog);
audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
&status_set, sizeof(status_set));
break;
case AUDIT_SET:
if (nlh->nlmsg_len < sizeof(struct audit_status))
return -EINVAL;
status_get = (struct audit_status *)data;
if (status_get->mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(status_get->enabled);
if (err < 0) return err;
}
if (status_get->mask & AUDIT_STATUS_FAILURE) {
err = audit_set_failure(status_get->failure);
if (err < 0) return err;
}
if (status_get->mask & AUDIT_STATUS_PID) {
int old = audit_pid;
audit_pid = status_get->pid;
audit_log(current->audit_context,
"audit_pid=%d old=%d", audit_pid, old);
}
if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
audit_set_rate_limit(status_get->rate_limit);
if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
audit_set_backlog_limit(status_get->backlog_limit);
break;
case AUDIT_USER:
ab = audit_log_start(NULL);
if (!ab)
break; /* audit_panic has been called */
audit_log_format(ab,
"user pid=%d uid=%d length=%d msg='%.1024s'",
pid, uid,
(int)(nlh->nlmsg_len
- ((char *)data - (char *)nlh)),
(char *)data);
ab->type = AUDIT_USER;
ab->pid = pid;
audit_log_end(ab);
break;
case AUDIT_ADD:
case AUDIT_DEL:
if (nlh->nlmsg_len < sizeof(struct audit_rule))
return -EINVAL;
/* fallthrough */
case AUDIT_LIST:
#ifdef CONFIG_AUDITSYSCALL
err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
uid, seq, data);
#else
err = -EOPNOTSUPP;
#endif
break;
default:
err = -EINVAL;
break;
}
return err < 0 ? err : 0;
}
/* Get message from skb (based on rtnetlink_rcv_skb). Each message is
* processed by audit_receive_msg. Malformed skbs with wrong length are
* discarded silently. */
static int audit_receive_skb(struct sk_buff *skb)
{
int err;
struct nlmsghdr *nlh;
u32 rlen;
while (skb->len >= NLMSG_SPACE(0)) {
nlh = (struct nlmsghdr *)skb->data;
if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
return 0;
rlen = NLMSG_ALIGN(nlh->nlmsg_len);
if (rlen > skb->len)
rlen = skb->len;
if ((err = audit_receive_msg(skb, nlh))) {
netlink_ack(skb, nlh, err);
} else if (nlh->nlmsg_flags & NLM_F_ACK)
netlink_ack(skb, nlh, 0);
skb_pull(skb, rlen);
}
return 0;
}
/* Receive messages from netlink socket. */
static void audit_receive(struct sock *sk, int length)
{
struct sk_buff *skb;
if (down_trylock(&audit_netlink_sem))
return;
/* FIXME: this must not cause starvation */
while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
if (audit_receive_skb(skb) && skb->len)
skb_queue_head(&sk->sk_receive_queue, skb);
else
kfree_skb(skb);
}
up(&audit_netlink_sem);
}
/* Move data from tmp buffer into an skb. This is an extra copy, and
* that is unfortunate. However, the copy will only occur when a record
* is being written to user space, which is already a high-overhead
* operation. (Elimination of the copy is possible, for example, by
* writing directly into a pre-allocated skb, at the cost of wasting
* memory.) */
static void audit_log_move(struct audit_buffer *ab)
{
struct sk_buff *skb;
char *start;
int extra = ab->nlh ? 0 : NLMSG_SPACE(0);
/* possible resubmission */
if (ab->len == 0)
return;
skb = skb_peek(&ab->sklist);
if (!skb || skb_tailroom(skb) <= ab->len + extra) {
skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
if (!skb) {
ab->len = 0; /* Lose information in ab->tmp */
audit_log_lost("out of memory in audit_log_move");
return;
}
__skb_queue_tail(&ab->sklist, skb);
if (!ab->nlh)
ab->nlh = (struct nlmsghdr *)skb_put(skb,
NLMSG_SPACE(0));
}
start = skb_put(skb, ab->len);
memcpy(start, ab->tmp, ab->len);
ab->len = 0;
}
/* Iterate over the skbuff in the audit_buffer, sending their contents
* to user space. */
static inline int audit_log_drain(struct audit_buffer *ab)
{
struct sk_buff *skb;
while ((skb = skb_dequeue(&ab->sklist))) {
int retval = 0;
if (audit_pid) {
if (ab->nlh) {
ab->nlh->nlmsg_len = ab->total;
ab->nlh->nlmsg_type = ab->type;
ab->nlh->nlmsg_flags = 0;
ab->nlh->nlmsg_seq = 0;
ab->nlh->nlmsg_pid = ab->pid;
}
skb_get(skb); /* because netlink_* frees */
retval = netlink_unicast(audit_sock, skb, audit_pid,
MSG_DONTWAIT);
}
if (retval == -EAGAIN && ab->count < 5) {
++ab->count;
skb_queue_tail(&ab->sklist, skb);
audit_log_end_irq(ab);
return 1;
}
if (retval < 0) {
if (retval == -ECONNREFUSED) {
printk(KERN_ERR
"audit: *NO* daemon at audit_pid=%d\n",
audit_pid);
audit_pid = 0;
} else
audit_log_lost("netlink socket too busy");
}
if (!audit_pid) { /* No daemon */
int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
int len = skb->len - offset;
printk(KERN_ERR "%*.*s\n",
len, len, skb->data + offset);
}
kfree_skb(skb);
ab->nlh = NULL;
}
return 0;
}
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
audit_initialized = 1;
audit_enabled = audit_default;
audit_log(NULL, "initialized");
return 0;
}
#else
/* Without CONFIG_NET, we have no skbuffs. For now, print what we have
* in the buffer. */
static void audit_log_move(struct audit_buffer *ab)
{
printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
ab->len = 0;
}
static inline int audit_log_drain(struct audit_buffer *ab)
{
return 0;
}
/* Initialize audit support at boot time. */
int __init audit_init(void)
{
printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
audit_sock = NULL;
audit_pid = 0;
audit_initialized = 1;
audit_enabled = audit_default;
audit_log(NULL, "initialized");
return 0;
}
#endif
__initcall(audit_init);
/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
static int __init audit_enable(char *str)
{
audit_default = !!simple_strtol(str, NULL, 0);
printk(KERN_INFO "audit: %s%s\n",
audit_default ? "enabled" : "disabled",
audit_initialized ? "" : " (after initialization)");
if (audit_initialized)
audit_enabled = audit_default;
return 0;
}
__setup("audit=", audit_enable);
/* Obtain an audit buffer. This routine does locking to obtain the
* audit buffer, but then no locking is required for calls to
* audit_log_*format. If ctx is the context of a task currently in a
* syscall, then the syscall is marked as auditable and an audit record
* will be written at syscall exit. If there is no associated task, ctx
* should be NULL. */
struct audit_buffer *audit_log_start(struct audit_context *ctx)
{
struct audit_buffer *ab = NULL;
unsigned long flags;
struct timespec t;
int serial = 0;
if (!audit_initialized)
return NULL;
if (audit_backlog_limit
&& atomic_read(&audit_backlog) > audit_backlog_limit) {
if (audit_rate_check())
printk(KERN_WARNING
"audit: audit_backlog=%d > "
"audit_backlog_limit=%d\n",
atomic_read(&audit_backlog),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
return NULL;
}
spin_lock_irqsave(&audit_freelist_lock, flags);
if (!list_empty(&audit_freelist)) {
ab = list_entry(audit_freelist.next,
struct audit_buffer, list);
list_del(&ab->list);
--audit_freelist_count;
}
spin_unlock_irqrestore(&audit_freelist_lock, flags);
if (!ab)
ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
return NULL;
}
atomic_inc(&audit_backlog);
skb_queue_head_init(&ab->sklist);
ab->ctx = ctx;
ab->len = 0;
ab->nlh = NULL;
ab->total = 0;
ab->type = AUDIT_KERNEL;
ab->pid = 0;
ab->count = 0;
#ifdef CONFIG_AUDITSYSCALL
if (ab->ctx)
audit_get_stamp(ab->ctx, &t, &serial);
else
#endif
t = CURRENT_TIME;
audit_log_format(ab, "audit(%lu.%03lu:%u): ",
t.tv_sec, t.tv_nsec/1000000, serial);
return ab;
}
/* Format an audit message into the audit buffer. If there isn't enough
* room in the audit buffer, more room will be allocated and vsnprintf
* will be called a second time. Currently, we assume that a printk
* can't format a message larger than 1024 bytes, so we don't either. */
static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
va_list args)
{
int len, avail;
if (!ab)
return;
avail = sizeof(ab->tmp) - ab->len;
if (avail <= 0) {
audit_log_move(ab);
avail = sizeof(ab->tmp) - ab->len;
}
len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
if (len >= avail) {
/* The printk buffer is 1024 bytes long, so if we get
* here and AUDIT_BUFSIZ is at least 1024, then we can
* log everything that printk could have logged. */
audit_log_move(ab);
avail = sizeof(ab->tmp) - ab->len;
len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
}
ab->len += (len < avail) ? len : avail;
ab->total += (len < avail) ? len : avail;
}
/* Format a message into the audit buffer. All the work is done in
* audit_log_vformat. */
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{
va_list args;
if (!ab)
return;
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
va_end(args);
}
/* This is a helper function to print the d_path without using a static
* buffer or allocating another buffer in addition to the one in
* audit_buffer. */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
struct dentry *dentry, struct vfsmount *vfsmnt)
{
char *p;
int len, avail;
if (prefix) audit_log_format(ab, " %s", prefix);
if (ab->len > 128)
audit_log_move(ab);
avail = sizeof(ab->tmp) - ab->len;
p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
if (IS_ERR(p)) {
/* FIXME: can we save some information here? */
audit_log_format(ab, "<toolong>");
} else {
/* path isn't at start of buffer */
len = (ab->tmp + sizeof(ab->tmp) - 1) - p;
memmove(ab->tmp + ab->len, p, len);
ab->len += len;
ab->total += len;
}
}
/* Remove queued messages from the audit_txlist and send them to userspace. */
static void audit_tasklet_handler(unsigned long arg)
{
LIST_HEAD(list);
struct audit_buffer *ab;
unsigned long flags;
spin_lock_irqsave(&audit_txlist_lock, flags);
list_splice_init(&audit_txlist, &list);
spin_unlock_irqrestore(&audit_txlist_lock, flags);
while (!list_empty(&list)) {
ab = list_entry(list.next, struct audit_buffer, list);
list_del(&ab->list);
audit_log_end_fast(ab);
}
}
static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
/* The netlink_* functions cannot be called inside an irq context, so
* audit buffers are placed on a queue and a tasklet is scheduled to
* remove them from the queue outside the irq context. May be called in
* any context. */
static void audit_log_end_irq(struct audit_buffer *ab)
{
unsigned long flags;
if (!ab)
return;
spin_lock_irqsave(&audit_txlist_lock, flags);
list_add_tail(&ab->list, &audit_txlist);
spin_unlock_irqrestore(&audit_txlist_lock, flags);
tasklet_schedule(&audit_tasklet);
}
/* Send the message in the audit buffer directly to user space. May not
* be called in an irq context. */
static void audit_log_end_fast(struct audit_buffer *ab)
{
unsigned long flags;
BUG_ON(in_irq());
if (!ab)
return;
if (!audit_rate_check()) {
audit_log_lost("rate limit exceeded");
} else {
audit_log_move(ab);
if (audit_log_drain(ab))
return;
}
atomic_dec(&audit_backlog);
spin_lock_irqsave(&audit_freelist_lock, flags);
if (++audit_freelist_count > AUDIT_MAXFREE)
kfree(ab);
else
list_add(&ab->list, &audit_freelist);
spin_unlock_irqrestore(&audit_freelist_lock, flags);
}
/* Send or queue the message in the audit buffer, depending on the
* current context. (A convenience function that may be called in any
* context.) */
void audit_log_end(struct audit_buffer *ab)
{
if (in_irq())
audit_log_end_irq(ab);
else
audit_log_end_fast(ab);
}
/* Log an audit record. This is a convenience function that calls
* audit_log_start, audit_log_vformat, and audit_log_end. It may be
* called in any context. */
void audit_log(struct audit_context *ctx, const char *fmt, ...)
{
struct audit_buffer *ab;
va_list args;
ab = audit_log_start(ctx);
if (ab) {
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
va_end(args);
audit_log_end(ab);
}
}
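
Putting the pieces together, an in-kernel caller of this interface (the subsystem here is hypothetical; only the audit_log* calls and current->audit_context come from the code above) would look roughly like:

	struct audit_buffer *ab;

	/* Multi-part record: format incrementally, then send or queue. */
	ab = audit_log_start(current->audit_context);
	if (ab) {	/* NULL means lost to backlog or memory pressure */
		audit_log_format(ab, "example op=%s", "demo");
		audit_log_format(ab, " res=%d", 0);
		audit_log_end(ab);
	}

	/* Or the one-shot wrapper that performs the same three steps. */
	audit_log(current->audit_context, "example op=%s res=%d", "demo", 0);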

kernel/auditsc.c (new file, 1015 lines; diff suppressed because it is too large)

kernel/capability.c (new file, 220 lines)

@@ -0,0 +1,220 @@
/*
* linux/kernel/capability.c
*
* Copyright (C) 1997 Andrew Main <zefram@fysh.org>
*
* Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com>
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
EXPORT_SYMBOL(securebits);
EXPORT_SYMBOL(cap_bset);
/*
* This lock protects task->cap_* for all tasks including current.
* Locking rule: acquire this prior to tasklist_lock.
*/
static DEFINE_SPINLOCK(task_capability_lock);
/*
* For sys_capget() and sys_capset(), any of the three
* capability set pointers may be NULL -- indicating that that set is
* uninteresting and/or not to be changed.
*/
/*
* sys_capget - get the capabilities of a given process.
*/
asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
{
int ret = 0;
pid_t pid;
__u32 version;
task_t *target;
struct __user_cap_data_struct data;
if (get_user(version, &header->version))
return -EFAULT;
if (version != _LINUX_CAPABILITY_VERSION) {
if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
return -EFAULT;
return -EINVAL;
}
if (get_user(pid, &header->pid))
return -EFAULT;
if (pid < 0)
return -EINVAL;
spin_lock(&task_capability_lock);
read_lock(&tasklist_lock);
if (pid && pid != current->pid) {
target = find_task_by_pid(pid);
if (!target) {
ret = -ESRCH;
goto out;
}
} else
target = current;
ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
out:
read_unlock(&tasklist_lock);
spin_unlock(&task_capability_lock);
if (!ret && copy_to_user(dataptr, &data, sizeof data))
return -EFAULT;
return ret;
}
/*
* cap_set_pg - set capabilities for all processes in a given process
* group. We call this holding task_capability_lock and tasklist_lock.
*/
static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
kernel_cap_t *inheritable,
kernel_cap_t *permitted)
{
task_t *g, *target;
int ret = -EPERM;
int found = 0;
do_each_task_pid(pgrp, PIDTYPE_PGID, g) {
target = g;
while_each_thread(g, target) {
if (!security_capset_check(target, effective,
inheritable,
permitted)) {
security_capset_set(target, effective,
inheritable,
permitted);
ret = 0;
}
found = 1;
}
} while_each_task_pid(pgrp, PIDTYPE_PGID, g);
if (!found)
ret = 0;
return ret;
}
/*
* cap_set_all - set capabilities for all processes other than init
* and self. We call this holding task_capability_lock and tasklist_lock.
*/
static inline int cap_set_all(kernel_cap_t *effective,
kernel_cap_t *inheritable,
kernel_cap_t *permitted)
{
task_t *g, *target;
int ret = -EPERM;
int found = 0;
do_each_thread(g, target) {
if (target == current || target->pid == 1)
continue;
found = 1;
if (security_capset_check(target, effective, inheritable,
permitted))
continue;
ret = 0;
security_capset_set(target, effective, inheritable, permitted);
} while_each_thread(g, target);
if (!found)
ret = 0;
return ret;
}
/*
* sys_capset - set capabilities for a given process, all processes, or all
* processes in a given process group.
*
* The restrictions on setting capabilities are specified as:
*
* [pid is for the 'target' task. 'current' is the calling task.]
*
* I: any raised capabilities must be a subset of the (old current) permitted
* P: any raised capabilities must be a subset of the (old current) permitted
* E: must be set to a subset of (new target) permitted
*/
asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
{
kernel_cap_t inheritable, permitted, effective;
__u32 version;
task_t *target;
int ret;
pid_t pid;
if (get_user(version, &header->version))
return -EFAULT;
if (version != _LINUX_CAPABILITY_VERSION) {
if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
return -EFAULT;
return -EINVAL;
}
if (get_user(pid, &header->pid))
return -EFAULT;
if (pid && pid != current->pid && !capable(CAP_SETPCAP))
return -EPERM;
if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
return -EFAULT;
spin_lock(&task_capability_lock);
read_lock(&tasklist_lock);
if (pid > 0 && pid != current->pid) {
target = find_task_by_pid(pid);
if (!target) {
ret = -ESRCH;
goto out;
}
} else
target = current;
ret = 0;
/* having verified that the proposed changes are legal,
we now put them into effect. */
if (pid < 0) {
if (pid == -1) /* all procs other than current and init */
ret = cap_set_all(&effective, &inheritable, &permitted);
else /* all procs in process group */
ret = cap_set_pg(-pid, &effective, &inheritable,
&permitted);
} else {
ret = security_capset_check(target, &effective, &inheritable,
&permitted);
if (!ret)
security_capset_set(target, &effective, &inheritable,
&permitted);
}
out:
read_unlock(&tasklist_lock);
spin_unlock(&task_capability_lock);
return ret;
}
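
From userspace, these calls are driven with a header/data pair; a minimal capget sketch using syscall(2) (struct and version names as used by the code above; error handling trimmed):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/capability.h>

	int main(void)
	{
		struct __user_cap_header_struct hdr = {
			.version = _LINUX_CAPABILITY_VERSION,
			.pid = 0,	/* 0 queries the calling process */
		};
		struct __user_cap_data_struct data;

		if (syscall(SYS_capget, &hdr, &data) != 0) {
			perror("capget");
			return 1;
		}
		printf("effective=%#x permitted=%#x inheritable=%#x\n",
		       data.effective, data.permitted, data.inheritable);
		return 0;
	}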

kernel/compat.c (new file, 860 lines)

@@ -0,0 +1,860 @@
/*
* linux/kernel/compat.c
*
* Kernel compatibility routines for e.g. 32 bit syscall support
* on 64 bit kernels.
*
* Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/signal.h>
#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
#include <linux/futex.h> /* for FUTEX_WAIT */
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/security.h>
#include <asm/uaccess.h>
#include <asm/bug.h>
int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
static long compat_nanosleep_restart(struct restart_block *restart)
{
unsigned long expire = restart->arg0, now = jiffies;
struct compat_timespec __user *rmtp;
/* Did it expire while we handled signals? */
if (!time_after(expire, now))
return 0;
current->state = TASK_INTERRUPTIBLE;
expire = schedule_timeout(expire - now);
if (expire == 0)
return 0;
rmtp = (struct compat_timespec __user *)restart->arg1;
if (rmtp) {
struct compat_timespec ct;
struct timespec t;
jiffies_to_timespec(expire, &t);
ct.tv_sec = t.tv_sec;
ct.tv_nsec = t.tv_nsec;
if (copy_to_user(rmtp, &ct, sizeof(ct)))
return -EFAULT;
}
/* The 'restart' block is already filled in */
return -ERESTART_RESTARTBLOCK;
}
asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
struct compat_timespec __user *rmtp)
{
struct timespec t;
struct restart_block *restart;
unsigned long expire;
if (get_compat_timespec(&t, rqtp))
return -EFAULT;
if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
return -EINVAL;
expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
current->state = TASK_INTERRUPTIBLE;
expire = schedule_timeout(expire);
if (expire == 0)
return 0;
if (rmtp) {
jiffies_to_timespec(expire, &t);
if (put_compat_timespec(&t, rmtp))
return -EFAULT;
}
restart = &current_thread_info()->restart_block;
restart->fn = compat_nanosleep_restart;
restart->arg0 = jiffies + expire;
restart->arg1 = (unsigned long) rmtp;
return -ERESTART_RESTARTBLOCK;
}
static inline long get_compat_itimerval(struct itimerval *o,
struct compat_itimerval __user *i)
{
return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
(__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
__get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
__get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
__get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
}
static inline long put_compat_itimerval(struct compat_itimerval __user *o,
struct itimerval *i)
{
return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
(__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
__put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
__put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
asmlinkage long compat_sys_getitimer(int which,
struct compat_itimerval __user *it)
{
struct itimerval kit;
int error;
error = do_getitimer(which, &kit);
if (!error && put_compat_itimerval(it, &kit))
error = -EFAULT;
return error;
}
asmlinkage long compat_sys_setitimer(int which,
struct compat_itimerval __user *in,
struct compat_itimerval __user *out)
{
struct itimerval kin, kout;
int error;
if (in) {
if (get_compat_itimerval(&kin, in))
return -EFAULT;
} else
memset(&kin, 0, sizeof(kin));
error = do_setitimer(which, &kin, out ? &kout : NULL);
if (error || !out)
return error;
if (put_compat_itimerval(out, &kout))
return -EFAULT;
return 0;
}
asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
{
/*
* In the SMP world we might just be unlucky and have one of
* the times increment as we use it. Since the value is an
* atomically safe type this is just fine. Conceptually it's
* as if the syscall took an instant longer to occur.
*/
if (tbuf) {
struct compat_tms tmp;
struct task_struct *tsk = current;
struct task_struct *t;
cputime_t utime, stime, cutime, cstime;
read_lock(&tasklist_lock);
utime = tsk->signal->utime;
stime = tsk->signal->stime;
t = tsk;
do {
utime = cputime_add(utime, t->utime);
stime = cputime_add(stime, t->stime);
t = next_thread(t);
} while (t != tsk);
/*
* While we have tasklist_lock read-locked, no dying thread
* can be updating current->signal->[us]time. Instead,
* we got their counts included in the live thread loop.
* However, another thread can come in right now and
* do a wait call that updates current->signal->c[us]time.
* To make sure we always see that pair updated atomically,
* we take the siglock around fetching them.
*/
spin_lock_irq(&tsk->sighand->siglock);
cutime = tsk->signal->cutime;
cstime = tsk->signal->cstime;
spin_unlock_irq(&tsk->sighand->siglock);
read_unlock(&tasklist_lock);
tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
return -EFAULT;
}
return compat_jiffies_to_clock_t(jiffies);
}
/*
* Assumption: old_sigset_t and compat_old_sigset_t are both
* types that can be passed to put_user()/get_user().
*/
asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
{
old_sigset_t s;
long ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
ret = sys_sigpending((old_sigset_t __user *) &s);
set_fs(old_fs);
if (ret == 0)
ret = put_user(s, set);
return ret;
}
asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
compat_old_sigset_t __user *oset)
{
old_sigset_t s;
long ret;
mm_segment_t old_fs;
if (set && get_user(s, set))
return -EFAULT;
old_fs = get_fs();
set_fs(KERNEL_DS);
ret = sys_sigprocmask(how,
set ? (old_sigset_t __user *) &s : NULL,
oset ? (old_sigset_t __user *) &s : NULL);
set_fs(old_fs);
if (ret == 0)
if (oset)
ret = put_user(s, oset);
return ret;
}
#ifdef CONFIG_FUTEX
asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
struct compat_timespec __user *utime, u32 __user *uaddr2,
int val3)
{
struct timespec t;
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
int val2 = 0;
if ((op == FUTEX_WAIT) && utime) {
if (get_compat_timespec(&t, utime))
return -EFAULT;
timeout = timespec_to_jiffies(&t) + 1;
}
if (op >= FUTEX_REQUEUE)
val2 = (int) (unsigned long) utime;
return do_futex((unsigned long)uaddr, op, val, timeout,
(unsigned long)uaddr2, val2, val3);
}
#endif
asmlinkage long compat_sys_setrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
struct rlimit r;
int ret;
mm_segment_t old_fs = get_fs ();
if (resource >= RLIM_NLIMITS)
return -EINVAL;
if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
__get_user(r.rlim_cur, &rlim->rlim_cur) ||
__get_user(r.rlim_max, &rlim->rlim_max))
return -EFAULT;
if (r.rlim_cur == COMPAT_RLIM_INFINITY)
r.rlim_cur = RLIM_INFINITY;
if (r.rlim_max == COMPAT_RLIM_INFINITY)
r.rlim_max = RLIM_INFINITY;
set_fs(KERNEL_DS);
ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
set_fs(old_fs);
return ret;
}
#ifdef COMPAT_RLIM_OLD_INFINITY
asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
struct rlimit r;
int ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
ret = sys_old_getrlimit(resource, &r);
set_fs(old_fs);
if (!ret) {
if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
r.rlim_cur = COMPAT_RLIM_INFINITY;
if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
r.rlim_max = COMPAT_RLIM_INFINITY;
if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
__put_user(r.rlim_cur, &rlim->rlim_cur) ||
__put_user(r.rlim_max, &rlim->rlim_max))
return -EFAULT;
}
return ret;
}
#endif
asmlinkage long compat_sys_getrlimit (unsigned int resource,
struct compat_rlimit __user *rlim)
{
struct rlimit r;
int ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
set_fs(old_fs);
if (!ret) {
if (r.rlim_cur > COMPAT_RLIM_INFINITY)
r.rlim_cur = COMPAT_RLIM_INFINITY;
if (r.rlim_max > COMPAT_RLIM_INFINITY)
r.rlim_max = COMPAT_RLIM_INFINITY;
if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
__put_user(r.rlim_cur, &rlim->rlim_cur) ||
__put_user(r.rlim_max, &rlim->rlim_max))
return -EFAULT;
}
return ret;
}
int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
{
if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
__put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
__put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
__put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
__put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
__put_user(r->ru_maxrss, &ru->ru_maxrss) ||
__put_user(r->ru_ixrss, &ru->ru_ixrss) ||
__put_user(r->ru_idrss, &ru->ru_idrss) ||
__put_user(r->ru_isrss, &ru->ru_isrss) ||
__put_user(r->ru_minflt, &ru->ru_minflt) ||
__put_user(r->ru_majflt, &ru->ru_majflt) ||
__put_user(r->ru_nswap, &ru->ru_nswap) ||
__put_user(r->ru_inblock, &ru->ru_inblock) ||
__put_user(r->ru_oublock, &ru->ru_oublock) ||
__put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
__put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
__put_user(r->ru_nsignals, &ru->ru_nsignals) ||
__put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
__put_user(r->ru_nivcsw, &ru->ru_nivcsw))
return -EFAULT;
return 0;
}
asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
{
struct rusage r;
int ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
ret = sys_getrusage(who, (struct rusage __user *) &r);
set_fs(old_fs);
if (ret)
return ret;
if (put_compat_rusage(&r, ru))
return -EFAULT;
return 0;
}
asmlinkage long
compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
struct compat_rusage __user *ru)
{
if (!ru) {
return sys_wait4(pid, stat_addr, options, NULL);
} else {
struct rusage r;
int ret;
unsigned int status;
mm_segment_t old_fs = get_fs();
set_fs (KERNEL_DS);
ret = sys_wait4(pid,
(stat_addr ?
(unsigned int __user *) &status : NULL),
options, (struct rusage __user *) &r);
set_fs (old_fs);
if (ret > 0) {
if (put_compat_rusage(&r, ru))
return -EFAULT;
if (stat_addr && put_user(status, stat_addr))
return -EFAULT;
}
return ret;
}
}
asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
struct compat_siginfo __user *uinfo, int options,
struct compat_rusage __user *uru)
{
siginfo_t info;
struct rusage ru;
long ret;
mm_segment_t old_fs = get_fs();
memset(&info, 0, sizeof(info));
set_fs(KERNEL_DS);
ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
uru ? (struct rusage __user *)&ru : NULL);
set_fs(old_fs);
if ((ret < 0) || (info.si_signo == 0))
return ret;
if (uru) {
ret = put_compat_rusage(&ru, uru);
if (ret)
return ret;
}
BUG_ON(info.si_code & __SI_MASK);
info.si_code |= __SI_CHLD;
return copy_siginfo_to_user32(uinfo, &info);
}
static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
unsigned len, cpumask_t *new_mask)
{
unsigned long *k;
if (len < sizeof(cpumask_t))
memset(new_mask, 0, sizeof(cpumask_t));
else if (len > sizeof(cpumask_t))
len = sizeof(cpumask_t);
k = cpus_addr(*new_mask);
return compat_get_bitmap(k, user_mask_ptr, len * 8);
}
asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
unsigned int len,
compat_ulong_t __user *user_mask_ptr)
{
cpumask_t new_mask;
int retval;
retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask);
if (retval)
return retval;
return sched_setaffinity(pid, new_mask);
}
asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
compat_ulong_t __user *user_mask_ptr)
{
int ret;
cpumask_t mask;
unsigned long *k;
unsigned int min_length = sizeof(cpumask_t);
if (NR_CPUS <= BITS_PER_COMPAT_LONG)
min_length = sizeof(compat_ulong_t);
if (len < min_length)
return -EINVAL;
ret = sched_getaffinity(pid, &mask);
if (ret < 0)
return ret;
k = cpus_addr(mask);
ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
if (ret)
return ret;
return min_length;
}
static int get_compat_itimerspec(struct itimerspec *dst,
struct compat_itimerspec __user *src)
{
if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
get_compat_timespec(&dst->it_value, &src->it_value))
return -EFAULT;
return 0;
}
static int put_compat_itimerspec(struct compat_itimerspec __user *dst,
struct itimerspec *src)
{
if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
put_compat_timespec(&src->it_value, &dst->it_value))
return -EFAULT;
return 0;
}
long compat_sys_timer_settime(timer_t timer_id, int flags,
struct compat_itimerspec __user *new,
struct compat_itimerspec __user *old)
{
long err;
mm_segment_t oldfs;
struct itimerspec newts, oldts;
if (!new)
return -EINVAL;
if (get_compat_itimerspec(&newts, new))
return -EFAULT;
oldfs = get_fs();
set_fs(KERNEL_DS);
err = sys_timer_settime(timer_id, flags,
(struct itimerspec __user *) &newts,
(struct itimerspec __user *) &oldts);
set_fs(oldfs);
if (!err && old && put_compat_itimerspec(old, &oldts))
return -EFAULT;
return err;
}
long compat_sys_timer_gettime(timer_t timer_id,
struct compat_itimerspec __user *setting)
{
long err;
mm_segment_t oldfs;
struct itimerspec ts;
oldfs = get_fs();
set_fs(KERNEL_DS);
err = sys_timer_gettime(timer_id,
(struct itimerspec __user *) &ts);
set_fs(oldfs);
if (!err && put_compat_itimerspec(setting, &ts))
return -EFAULT;
return err;
}
long compat_sys_clock_settime(clockid_t which_clock,
struct compat_timespec __user *tp)
{
long err;
mm_segment_t oldfs;
struct timespec ts;
if (get_compat_timespec(&ts, tp))
return -EFAULT;
oldfs = get_fs();
set_fs(KERNEL_DS);
err = sys_clock_settime(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
return err;
}
long compat_sys_clock_gettime(clockid_t which_clock,
struct compat_timespec __user *tp)
{
long err;
mm_segment_t oldfs;
struct timespec ts;
oldfs = get_fs();
set_fs(KERNEL_DS);
err = sys_clock_gettime(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
if (!err && put_compat_timespec(&ts, tp))
return -EFAULT;
return err;
}
long compat_sys_clock_getres(clockid_t which_clock,
struct compat_timespec __user *tp)
{
long err;
mm_segment_t oldfs;
struct timespec ts;
oldfs = get_fs();
set_fs(KERNEL_DS);
err = sys_clock_getres(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
if (!err && tp && put_compat_timespec(&ts, tp))
return -EFAULT;
return err;
}
long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
struct compat_timespec __user *rqtp,
struct compat_timespec __user *rmtp)
{
long err;
mm_segment_t oldfs;
struct timespec in, out;
if (get_compat_timespec(&in, rqtp))
return -EFAULT;
oldfs = get_fs();
set_fs(KERNEL_DS);
err = sys_clock_nanosleep(which_clock, flags,
(struct timespec __user *) &in,
(struct timespec __user *) &out);
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
put_compat_timespec(&out, rmtp))
return -EFAULT;
return err;
}
/*
* We currently only need the following fields from the sigevent
 * structure: sigev_value, sigev_signo, sigev_notify and (sometimes
* sigev_notify_thread_id). The others are handled in user mode.
* We also assume that copying sigev_value.sival_int is sufficient
* to keep all the bits of sigev_value.sival_ptr intact.
*/
int get_compat_sigevent(struct sigevent *event,
const struct compat_sigevent __user *u_event)
{
	memset(event, 0, sizeof(*event));
return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
__get_user(event->sigev_value.sival_int,
&u_event->sigev_value.sival_int) ||
__get_user(event->sigev_signo, &u_event->sigev_signo) ||
__get_user(event->sigev_notify, &u_event->sigev_notify) ||
__get_user(event->sigev_notify_thread_id,
&u_event->sigev_notify_thread_id))
? -EFAULT : 0;
}
/* timer_create is architecture specific because it needs sigevent conversion */
long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
unsigned long bitmap_size)
{
int i, j;
unsigned long m;
compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
return -EFAULT;
nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
m = 0;
for (j = 0; j < sizeof(m)/sizeof(um); j++) {
/*
			 * We don't want to read past the end of the userspace
* bitmap. We must however ensure the end of the
* kernel bitmap is zeroed.
*/
if (nr_compat_longs-- > 0) {
if (__get_user(um, umask))
return -EFAULT;
} else {
um = 0;
}
umask++;
m |= (long)um << (j * BITS_PER_COMPAT_LONG);
}
*mask++ = m;
}
return 0;
}
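/*
 * Standalone illustration (not part of this file; assumes a 64-bit
 * kernel where BITS_PER_COMPAT_LONG == 32): the j-loop above packs two
 * consecutive 32-bit words, in userspace order, into one native long.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t um[2] = { 0xdeadbeef, 0x00c0ffee };	/* userspace order */
	uint64_t m = 0;
	int j;

	for (j = 0; j < 2; j++)
		m |= (uint64_t)um[j] << (j * 32);
	/* prints 0xc0ffeedeadbeef: um[0] is the low half, um[1] the high */
	printf("%#llx\n", (unsigned long long)m);
	return 0;
}
#endif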
long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
unsigned long bitmap_size)
{
int i, j;
unsigned long m;
compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
return -EFAULT;
nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
m = *mask++;
for (j = 0; j < sizeof(m)/sizeof(um); j++) {
um = m;
/*
			 * We don't want to write past the end of the userspace
* bitmap.
*/
if (nr_compat_longs-- > 0) {
if (__put_user(um, umask))
return -EFAULT;
}
umask++;
			/*
			 * Shift in two halves: a single shift by the full
			 * width would be undefined when BITS_PER_COMPAT_LONG
			 * equals BITS_PER_LONG.
			 */
			m >>= 4*sizeof(um);
			m >>= 4*sizeof(um);
}
}
return 0;
}
void
sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
#if defined (__COMPAT_ENDIAN_SWAP__)
case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
#else
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
#endif
}
}
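/*
 * Standalone illustration (not part of this file): on a 64-bit kernel
 * each sigset word is rebuilt from two 32-bit compat words. Without
 * __COMPAT_ENDIAN_SWAP__ the first compat word is the low half; with
 * it, the halves arrive swapped and the second word is the low half.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t compat_sig[2] = { 0x11111111, 0x22222222 };
	uint64_t no_swap = compat_sig[0] | ((uint64_t)compat_sig[1] << 32);
	uint64_t swapped = compat_sig[1] | ((uint64_t)compat_sig[0] << 32);

	printf("no swap: %#llx\n", (unsigned long long)no_swap);	/* 0x2222222211111111 */
	printf("swapped: %#llx\n", (unsigned long long)swapped);	/* 0x1111111122222222 */
	return 0;
}
#endif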
asmlinkage long
compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
struct compat_siginfo __user *uinfo,
struct compat_timespec __user *uts, compat_size_t sigsetsize)
{
compat_sigset_t s32;
sigset_t s;
int sig;
struct timespec t;
siginfo_t info;
long ret, timeout = 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
return -EFAULT;
sigset_from_compat(&s, &s32);
	sigdelsetmask(&s, sigmask(SIGKILL) | sigmask(SIGSTOP));
	signotset(&s);
	if (uts) {
		if (get_compat_timespec(&t, uts))
return -EFAULT;
if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
|| t.tv_sec < 0)
return -EINVAL;
}
spin_lock_irq(&current->sighand->siglock);
sig = dequeue_signal(current, &s, &info);
if (!sig) {
timeout = MAX_SCHEDULE_TIMEOUT;
if (uts)
			timeout = timespec_to_jiffies(&t) +
				  (t.tv_sec || t.tv_nsec);
if (timeout) {
current->real_blocked = current->blocked;
sigandsets(&current->blocked, &current->blocked, &s);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
current->state = TASK_INTERRUPTIBLE;
timeout = schedule_timeout(timeout);
spin_lock_irq(&current->sighand->siglock);
sig = dequeue_signal(current, &s, &info);
current->blocked = current->real_blocked;
siginitset(&current->real_blocked, 0);
recalc_sigpending();
}
}
spin_unlock_irq(&current->sighand->siglock);
if (sig) {
ret = sig;
if (uinfo) {
if (copy_siginfo_to_user32(uinfo, &info))
ret = -EFAULT;
}
	} else {
		ret = timeout ? -EINTR : -EAGAIN;
}
return ret;
}
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
/* compat_time_t is a 32 bit "long" and needs to get converted. */
asmlinkage long compat_sys_time(compat_time_t __user * tloc)
{
compat_time_t i;
struct timeval tv;
do_gettimeofday(&tv);
i = tv.tv_sec;
if (tloc) {
if (put_user(i,tloc))
i = -EFAULT;
}
return i;
}
asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
{
struct timespec tv;
int err;
if (get_user(tv.tv_sec, tptr))
return -EFAULT;
tv.tv_nsec = 0;
err = security_settime(&tv, NULL);
if (err)
return err;
do_settimeofday(&tv);
return 0;
}
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */

118
kernel/configs.c Normal file

@@ -0,0 +1,118 @@
/*
* kernel/configs.c
* Echo the kernel .config file used to build the kernel
*
* Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
* Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
* Copyright (C) 2002 Hewlett-Packard Company
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <asm/uaccess.h>
/**************************************************/
/* the actual current config file */
/*
 * Define kernel_config_data and kernel_config_data_size, which together
 * hold the wrapped and compressed configuration file. The file is first
* with gzip and then bounded by two eight byte magic numbers to allow
* extraction from a binary kernel image:
*
* IKCFG_ST
* <image>
* IKCFG_ED
*/
#define MAGIC_START "IKCFG_ST"
#define MAGIC_END "IKCFG_ED"
#include "config_data.h"
#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
#define kernel_config_data_size \
(sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
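/*
 * Standalone illustration (not part of this file): the magic markers
 * make it possible to recover the gzipped .config from any kernel
 * image, much as the tree's scripts/extract-ikconfig helper does. A
 * simplified user-space sketch, assuming the whole image fits in
 * memory; pipe its output to gunzip.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long find(const char *buf, long len, const char *pat)
{
	long n = strlen(pat), i;

	for (i = 0; i + n <= len; i++)
		if (memcmp(buf + i, pat, n) == 0)
			return i;
	return -1;
}

int main(int argc, char **argv)
{
	FILE *f;
	char *buf;
	long len, st, ed;

	if (argc != 2 || !(f = fopen(argv[1], "rb")))
		return 1;
	fseek(f, 0, SEEK_END);
	len = ftell(f);
	rewind(f);
	buf = malloc(len);
	if (!buf || fread(buf, 1, len, f) != (size_t)len)
		return 1;
	st = find(buf, len, "IKCFG_ST");
	ed = find(buf, len, "IKCFG_ED");
	if (st < 0 || ed < st)
		return 1;
	/* skip the 8-byte start marker, stop before the end marker */
	fwrite(buf + st + 8, 1, ed - st - 8, stdout);
	return 0;
}
#endif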
#ifdef CONFIG_IKCONFIG_PROC
/**************************************************/
/* globals and useful constants */
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
loff_t pos = *offset;
ssize_t count;
if (pos >= kernel_config_data_size)
return 0;
count = min(len, (size_t)(kernel_config_data_size - pos));
if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
return -EFAULT;
*offset += count;
return count;
}
static struct file_operations ikconfig_file_ops = {
.owner = THIS_MODULE,
.read = ikconfig_read_current,
};
/***************************************************/
/* ikconfig_init: start up everything we need to */
static int __init ikconfig_init(void)
{
struct proc_dir_entry *entry;
/* create the current config file */
entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
&proc_root);
if (!entry)
return -ENOMEM;
entry->proc_fops = &ikconfig_file_ops;
entry->size = kernel_config_data_size;
return 0;
}
/***************************************************/
/* ikconfig_cleanup: clean up our mess */
static void __exit ikconfig_cleanup(void)
{
remove_proc_entry("config.gz", &proc_root);
}
module_init(ikconfig_init);
module_exit(ikconfig_cleanup);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Randy Dunlap");
MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
#endif /* CONFIG_IKCONFIG_PROC */

193
kernel/cpu.c Normal file

@@ -0,0 +1,193 @@
/* CPU control.
* (C) 2001, 2002, 2003, 2004 Rusty Russell
*
* This code is licenced under the GPL.
*/
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <asm/semaphore.h>
/* This protects CPUs going up and down... */
DECLARE_MUTEX(cpucontrol);
static struct notifier_block *cpu_chain;
/* Need to know about CPUs going up/down? */
int register_cpu_notifier(struct notifier_block *nb)
{
int ret;
if ((ret = down_interruptible(&cpucontrol)) != 0)
return ret;
ret = notifier_chain_register(&cpu_chain, nb);
up(&cpucontrol);
return ret;
}
EXPORT_SYMBOL(register_cpu_notifier);
void unregister_cpu_notifier(struct notifier_block *nb)
{
down(&cpucontrol);
notifier_chain_unregister(&cpu_chain, nb);
up(&cpucontrol);
}
EXPORT_SYMBOL(unregister_cpu_notifier);
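/*
 * Illustrative only: a minimal client of the notifier chain above. The
 * callback and notifier_block names are made up for the sketch; a real
 * user registers from its init path with register_cpu_notifier().
 */
#if 0
static int example_cpu_callback(struct notifier_block *nb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		/* allocate per-cpu state; NOTIFY_BAD would veto the bring-up */
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "cpu %u is now online\n", cpu);
		break;
	case CPU_DEAD:
		/* cpu is gone, free its per-cpu state */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_nb = {
	.notifier_call = example_cpu_callback,
};
#endif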
#ifdef CONFIG_HOTPLUG_CPU
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
if (task_cpu(p) == cpu &&
(!cputime_eq(p->utime, cputime_zero) ||
!cputime_eq(p->stime, cputime_zero)))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
(state = %ld, flags = %lx) \n",
p->comm, p->pid, cpu, p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
}
/* Take this CPU down. */
static int take_cpu_down(void *unused)
{
int err;
/* Take offline: makes arch_cpu_down somewhat easier. */
cpu_clear(smp_processor_id(), cpu_online_map);
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
cpu_set(smp_processor_id(), cpu_online_map);
else
/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */
sched_idle_next();
return err;
}
int cpu_down(unsigned int cpu)
{
int err;
struct task_struct *p;
cpumask_t old_allowed, tmp;
if ((err = lock_cpu_hotplug_interruptible()) != 0)
return err;
if (num_online_cpus() == 1) {
err = -EBUSY;
goto out;
}
if (!cpu_online(cpu)) {
err = -EINVAL;
goto out;
}
err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
(void *)(long)cpu);
if (err == NOTIFY_BAD) {
printk("%s: attempt to take down CPU %u failed\n",
__FUNCTION__, cpu);
err = -EINVAL;
goto out;
}
/* Ensure that we are not runnable on dying cpu */
old_allowed = current->cpus_allowed;
tmp = CPU_MASK_ALL;
cpu_clear(cpu, tmp);
set_cpus_allowed(current, tmp);
p = __stop_machine_run(take_cpu_down, NULL, cpu);
if (IS_ERR(p)) {
/* CPU didn't die: tell everyone. Can't complain. */
if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
(void *)(long)cpu) == NOTIFY_BAD)
BUG();
err = PTR_ERR(p);
goto out_allowed;
}
if (cpu_online(cpu))
goto out_thread;
/* Wait for it to sleep (leaving idle task). */
while (!idle_cpu(cpu))
yield();
/* This actually kills the CPU. */
__cpu_die(cpu);
/* Move it here so it can run. */
kthread_bind(p, get_cpu());
put_cpu();
/* CPU is completely dead: tell everyone. Too late to complain. */
if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
== NOTIFY_BAD)
BUG();
check_for_tasks(cpu);
out_thread:
err = kthread_stop(p);
out_allowed:
set_cpus_allowed(current, old_allowed);
out:
unlock_cpu_hotplug();
return err;
}
#endif /* CONFIG_HOTPLUG_CPU */
int __devinit cpu_up(unsigned int cpu)
{
int ret;
void *hcpu = (void *)(long)cpu;
if ((ret = down_interruptible(&cpucontrol)) != 0)
return ret;
if (cpu_online(cpu) || !cpu_present(cpu)) {
ret = -EINVAL;
goto out;
}
ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
if (ret == NOTIFY_BAD) {
printk("%s: attempt to bring up CPU %u failed\n",
__FUNCTION__, cpu);
ret = -EINVAL;
goto out_notify;
}
/* Arch-specific enabling code. */
ret = __cpu_up(cpu);
if (ret != 0)
goto out_notify;
if (!cpu_online(cpu))
BUG();
/* Now call notifier in preparation. */
notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
out_notify:
if (ret != 0)
notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
out:
up(&cpucontrol);
return ret;
}

1564
kernel/cpuset.c Normal file

File diff suppressed because it is too large

158
kernel/dma.c Normal file

@@ -0,0 +1,158 @@
/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $
* linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
*
* Written by Hennus Bergman, 1992.
*
* 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
* In the previous version the reported device could end up being wrong,
* if a device requested a DMA channel that was already in use.
* [It also happened to remove the sizeof(char *) == sizeof(int)
* assumption introduced because of those /proc/dma patches. -- Hennus]
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <asm/dma.h>
#include <asm/system.h>
/* A note on resource allocation:
*
* All drivers needing DMA channels, should allocate and release them
* through the public routines `request_dma()' and `free_dma()'.
*
* In order to avoid problems, all processes should allocate resources in
* the same sequence and release them in the reverse order.
*
* So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
* When releasing them, first release the DMA, then release the IRQ.
* If you don't, you may cause allocation requests to fail unnecessarily.
* This doesn't really matter now, but it will once we get real semaphores
* in the kernel.
*/
DEFINE_SPINLOCK(dma_spin_lock);
/*
* If our port doesn't define this it has no PC like DMA
*/
#ifdef MAX_DMA_CHANNELS
/* Channel n is busy iff dma_chan_busy[n].lock != 0.
* DMA0 used to be reserved for DRAM refresh, but apparently not any more...
* DMA4 is reserved for cascading.
*/
struct dma_chan {
int lock;
const char *device_id;
};
static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
[4] = { 1, "cascade" },
};
int request_dma(unsigned int dmanr, const char * device_id)
{
if (dmanr >= MAX_DMA_CHANNELS)
return -EINVAL;
if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
return -EBUSY;
dma_chan_busy[dmanr].device_id = device_id;
/* old flag was 0, now contains 1 to indicate busy */
return 0;
} /* request_dma */
void free_dma(unsigned int dmanr)
{
if (dmanr >= MAX_DMA_CHANNELS) {
printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
return;
}
if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
return;
}
} /* free_dma */
#else
int request_dma(unsigned int dmanr, const char *device_id)
{
return -EINVAL;
}
void free_dma(unsigned int dmanr)
{
}
#endif
#ifdef CONFIG_PROC_FS
#ifdef MAX_DMA_CHANNELS
static int proc_dma_show(struct seq_file *m, void *v)
{
int i;
for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
if (dma_chan_busy[i].lock) {
seq_printf(m, "%2d: %s\n", i,
dma_chan_busy[i].device_id);
}
}
return 0;
}
#else
static int proc_dma_show(struct seq_file *m, void *v)
{
seq_puts(m, "No DMA\n");
return 0;
}
#endif /* MAX_DMA_CHANNELS */
static int proc_dma_open(struct inode *inode, struct file *file)
{
return single_open(file, proc_dma_show, NULL);
}
static struct file_operations proc_dma_operations = {
.open = proc_dma_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init proc_dma_init(void)
{
struct proc_dir_entry *e;
e = create_proc_entry("dma", 0, NULL);
if (e)
e->proc_fops = &proc_dma_operations;
return 0;
}
__initcall(proc_dma_init);
#endif
EXPORT_SYMBOL(request_dma);
EXPORT_SYMBOL(free_dma);
EXPORT_SYMBOL(dma_spin_lock);
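/*
 * Illustrative only: driver-side use of the allocator above, following
 * the IRQ-before-DMA ordering described in the comment at the top of
 * this file. The device name, IRQ and channel numbers are made up.
 */
#if 0
static irqreturn_t example_interrupt(int irq, void *dev_id,
				     struct pt_regs *regs)
{
	return IRQ_HANDLED;
}

static int example_attach(void)
{
	int err;

	err = request_irq(7, example_interrupt, 0, "example", NULL);
	if (err)
		return err;
	err = request_dma(1, "example");	/* -EBUSY if channel taken */
	if (err) {
		free_irq(7, NULL);
		return err;
	}
	return 0;
}

static void example_detach(void)
{
	free_dma(1);		/* release the DMA channel first... */
	free_irq(7, NULL);	/* ...then the IRQ, in reverse order */
}
#endif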

209
kernel/exec_domain.c Normal file

@@ -0,0 +1,209 @@
/*
* Handling of different ABIs (personalities).
*
* We group personalities into execution domains which have their
* own handlers for kernel entry points, signal mapping, etc...
*
* 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
*/
#include <linux/config.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
static void default_handler(int, struct pt_regs *);
static struct exec_domain *exec_domains = &default_exec_domain;
static DEFINE_RWLOCK(exec_domains_lock);
static u_long ident_map[32] = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31
};
struct exec_domain default_exec_domain = {
.name = "Linux", /* name */
.handler = default_handler, /* lcall7 causes a seg fault. */
.pers_low = 0, /* PER_LINUX personality. */
.pers_high = 0, /* PER_LINUX personality. */
.signal_map = ident_map, /* Identity map signals. */
.signal_invmap = ident_map, /* - both ways. */
};
static void
default_handler(int segment, struct pt_regs *regp)
{
set_personality(0);
if (current_thread_info()->exec_domain->handler != default_handler)
current_thread_info()->exec_domain->handler(segment, regp);
else
send_sig(SIGSEGV, current, 1);
}
static struct exec_domain *
lookup_exec_domain(u_long personality)
{
struct exec_domain * ep;
u_long pers = personality(personality);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
if (pers >= ep->pers_low && pers <= ep->pers_high)
if (try_module_get(ep->module))
goto out;
}
#ifdef CONFIG_KMOD
read_unlock(&exec_domains_lock);
request_module("personality-%ld", pers);
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep; ep = ep->next) {
if (pers >= ep->pers_low && pers <= ep->pers_high)
if (try_module_get(ep->module))
goto out;
}
#endif
ep = &default_exec_domain;
out:
read_unlock(&exec_domains_lock);
return (ep);
}
int
register_exec_domain(struct exec_domain *ep)
{
struct exec_domain *tmp;
int err = -EBUSY;
if (ep == NULL)
return -EINVAL;
if (ep->next != NULL)
return -EBUSY;
write_lock(&exec_domains_lock);
for (tmp = exec_domains; tmp; tmp = tmp->next) {
if (tmp == ep)
goto out;
}
ep->next = exec_domains;
exec_domains = ep;
err = 0;
out:
write_unlock(&exec_domains_lock);
return (err);
}
int
unregister_exec_domain(struct exec_domain *ep)
{
struct exec_domain **epp;
epp = &exec_domains;
write_lock(&exec_domains_lock);
for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
if (ep == *epp)
goto unregister;
}
write_unlock(&exec_domains_lock);
return -EINVAL;
unregister:
*epp = ep->next;
ep->next = NULL;
write_unlock(&exec_domains_lock);
return 0;
}
int
__set_personality(u_long personality)
{
struct exec_domain *ep, *oep;
ep = lookup_exec_domain(personality);
if (ep == current_thread_info()->exec_domain) {
current->personality = personality;
return 0;
}
if (atomic_read(&current->fs->count) != 1) {
struct fs_struct *fsp, *ofsp;
fsp = copy_fs_struct(current->fs);
if (fsp == NULL) {
module_put(ep->module);
return -ENOMEM;
}
task_lock(current);
ofsp = current->fs;
current->fs = fsp;
task_unlock(current);
put_fs_struct(ofsp);
}
/*
* At that point we are guaranteed to be the sole owner of
* current->fs.
*/
current->personality = personality;
oep = current_thread_info()->exec_domain;
current_thread_info()->exec_domain = ep;
set_fs_altroot();
module_put(oep->module);
return 0;
}
int
get_exec_domain_list(char *page)
{
struct exec_domain *ep;
int len = 0;
read_lock(&exec_domains_lock);
for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next)
len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n",
ep->pers_low, ep->pers_high, ep->name,
module_name(ep->module));
read_unlock(&exec_domains_lock);
return (len);
}
asmlinkage long
sys_personality(u_long personality)
{
u_long old = current->personality;
if (personality != 0xffffffff) {
set_personality(personality);
if (current->personality != personality)
return -EINVAL;
}
return (long)old;
}
EXPORT_SYMBOL(register_exec_domain);
EXPORT_SYMBOL(unregister_exec_domain);
EXPORT_SYMBOL(__set_personality);
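/*
 * Standalone illustration (not part of this file): the user-space view
 * of sys_personality() above. Passing 0xffffffff queries the current
 * personality word without changing it.
 */
#if 0
#include <stdio.h>
#include <sys/personality.h>

int main(void)
{
	long cur = personality(0xffffffff);	/* query only */

	printf("personality word: %#lx\n", cur);
	return 0;
}
#endif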

1527
kernel/exit.c Normal file

File diff suppressed because it is too large

67
kernel/extable.c Normal file

@@ -0,0 +1,67 @@
/* Rewritten by Rusty Russell, on the backs of many others...
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/module.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <asm/sections.h>
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
sort_extable(__start___ex_table, __stop___ex_table);
}
/* Given an address, look for it in the exception tables. */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
if (!e)
e = search_module_extables(addr);
return e;
}
static int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr <= (unsigned long)_etext)
return 1;
if (addr >= (unsigned long)_sinittext &&
addr <= (unsigned long)_einittext)
return 1;
return 0;
}
int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return __module_text_address(addr) != NULL;
}
int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return module_text_address(addr) != NULL;
}

1274
kernel/fork.c Normal file

File diff suppressed because it is too large

798
kernel/futex.c Normal file

@@ -0,0 +1,798 @@
/*
* Fast Userspace Mutexes (which I call "Futexes!").
* (C) Rusty Russell, IBM 2002
*
* Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
* (C) Copyright 2003 Red Hat Inc, All Rights Reserved
*
* Removed page pinning, fix privately mapped COW pages and other cleanups
* (C) Copyright 2003, 2004 Jamie Lokier
*
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
*
* "The futexes are also cursed."
* "But they come in a choice of three flavours!"
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
* Futexes are matched on equal values of this key.
* The key type depends on whether it's a shared or private mapping.
* Don't rearrange members without looking at hash_futex().
*
* offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
* We set bit 0 to indicate if it's an inode-based key.
*/
union futex_key {
struct {
unsigned long pgoff;
struct inode *inode;
int offset;
} shared;
struct {
unsigned long uaddr;
struct mm_struct *mm;
int offset;
} private;
struct {
unsigned long word;
void *ptr;
int offset;
} both;
};
/*
* We use this hashed waitqueue instead of a normal wait_queue_t, so
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
* wake up q->waiters, then make the second condition true.
*/
struct futex_q {
struct list_head list;
wait_queue_head_t waiters;
/* Which hash list lock to use. */
spinlock_t *lock_ptr;
/* Key which the futex is hashed on. */
union futex_key key;
/* For fd, sigio sent using these. */
int fd;
struct file *filp;
};
/*
* Split the global futex_lock into every hash list lock.
*/
struct futex_hash_bucket {
spinlock_t lock;
struct list_head chain;
};
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;
/*
* We hash on the keys returned from get_futex_key (see below).
*/
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
u32 hash = jhash2((u32*)&key->both.word,
(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
key->both.offset);
return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
}
/*
* Return 1 if two futex_keys are equal, 0 otherwise.
*/
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
return (key1->both.word == key2->both.word
&& key1->both.ptr == key2->both.ptr
&& key1->both.offset == key2->both.offset);
}
/*
* Get parameters which are the keys for a futex.
*
* For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
* Returns: 0, or negative error code.
* The key words are stored in *key on success.
*
 * Should be called with &current->mm->mmap_sem held, but not any spinlocks.
*/
static int get_futex_key(unsigned long uaddr, union futex_key *key)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct page *page;
int err;
/*
* The futex address must be "naturally" aligned.
*/
key->both.offset = uaddr % PAGE_SIZE;
if (unlikely((key->both.offset % sizeof(u32)) != 0))
return -EINVAL;
uaddr -= key->both.offset;
/*
* The futex is hashed differently depending on whether
* it's in a shared or private mapping. So check vma first.
*/
vma = find_extend_vma(mm, uaddr);
if (unlikely(!vma))
return -EFAULT;
/*
* Permissions.
*/
if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
/*
* Private mappings are handled in a simple way.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process. Therefore we use
* VM_MAYSHARE here, not VM_SHARED which is restricted to shared
* mappings of _writable_ handles.
*/
if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
key->private.mm = mm;
key->private.uaddr = uaddr;
return 0;
}
/*
* Linear file mappings are also simple.
*/
key->shared.inode = vma->vm_file->f_dentry->d_inode;
key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
return 0;
}
/*
* We could walk the page table to read the non-linear
* pte, and get the page index without fetching the page
* from swap. But that's a lot of code to duplicate here
* for a rare case, so we simply fetch the page.
*/
/*
* Do a quick atomic lookup first - this is the fastpath.
*/
spin_lock(&current->mm->page_table_lock);
page = follow_page(mm, uaddr, 0);
if (likely(page != NULL)) {
key->shared.pgoff =
page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
spin_unlock(&current->mm->page_table_lock);
return 0;
}
spin_unlock(&current->mm->page_table_lock);
/*
* Do it the general way.
*/
err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
if (err >= 0) {
key->shared.pgoff =
page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
put_page(page);
return 0;
}
return err;
}
/*
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
* NOTE: mmap_sem MUST be held between get_futex_key() and calling this
* function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
static inline void get_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
atomic_inc(&key->shared.inode->i_count);
else
atomic_inc(&key->private.mm->mm_count);
}
}
/*
* Drop a reference to the resource addressed by a key.
* The hash bucket spinlock must not be held.
*/
static void drop_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
iput(key->shared.inode);
else
mmdrop(key->private.mm);
}
}
static inline int get_futex_value_locked(int *dest, int __user *from)
{
int ret;
inc_preempt_count();
ret = __copy_from_user_inatomic(dest, from, sizeof(int));
dec_preempt_count();
return ret ? -EFAULT : 0;
}
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
*/
static void wake_futex(struct futex_q *q)
{
list_del_init(&q->list);
if (q->filp)
send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
* list_del_init() and also before assigning to q->lock_ptr.
*/
wake_up_all(&q->waiters);
/*
* The waiting task can free the futex_q as soon as this is written,
* without taking any locks. This must come last.
*/
q->lock_ptr = NULL;
}
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
static int futex_wake(unsigned long uaddr, int nr_wake)
{
union futex_key key;
struct futex_hash_bucket *bh;
struct list_head *head;
struct futex_q *this, *next;
int ret;
down_read(&current->mm->mmap_sem);
ret = get_futex_key(uaddr, &key);
if (unlikely(ret != 0))
goto out;
bh = hash_futex(&key);
spin_lock(&bh->lock);
head = &bh->chain;
list_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
wake_futex(this);
if (++ret >= nr_wake)
break;
}
}
spin_unlock(&bh->lock);
out:
up_read(&current->mm->mmap_sem);
return ret;
}
/*
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
int nr_wake, int nr_requeue, int *valp)
{
union futex_key key1, key2;
struct futex_hash_bucket *bh1, *bh2;
struct list_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
retry:
down_read(&current->mm->mmap_sem);
ret = get_futex_key(uaddr1, &key1);
if (unlikely(ret != 0))
goto out;
ret = get_futex_key(uaddr2, &key2);
if (unlikely(ret != 0))
goto out;
bh1 = hash_futex(&key1);
bh2 = hash_futex(&key2);
	/*
	 * Lock both buckets in address order to avoid an AB-BA deadlock;
	 * when bh1 == bh2 the same lock is taken only once.
	 */
	if (bh1 < bh2)
		spin_lock(&bh1->lock);
	spin_lock(&bh2->lock);
	if (bh1 > bh2)
		spin_lock(&bh1->lock);
if (likely(valp != NULL)) {
int curval;
ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
if (unlikely(ret)) {
spin_unlock(&bh1->lock);
if (bh1 != bh2)
spin_unlock(&bh2->lock);
/* If we would have faulted, release mmap_sem, fault
* it in and start all over again.
*/
up_read(&current->mm->mmap_sem);
ret = get_user(curval, (int __user *)uaddr1);
if (!ret)
goto retry;
return ret;
}
if (curval != *valp) {
ret = -EAGAIN;
goto out_unlock;
}
}
head1 = &bh1->chain;
list_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) {
wake_futex(this);
} else {
list_move_tail(&this->list, &bh2->chain);
this->lock_ptr = &bh2->lock;
this->key = key2;
get_key_refs(&key2);
drop_count++;
if (ret - nr_wake >= nr_requeue)
break;
/* Make sure to stop if key1 == key2 */
if (head1 == &bh2->chain && head1 != &next->list)
head1 = &this->list;
}
}
out_unlock:
spin_unlock(&bh1->lock);
if (bh1 != bh2)
spin_unlock(&bh2->lock);
/* drop_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
drop_key_refs(&key1);
out:
up_read(&current->mm->mmap_sem);
return ret;
}
/* The key must be already stored in q->key. */
static inline struct futex_hash_bucket *
queue_lock(struct futex_q *q, int fd, struct file *filp)
{
struct futex_hash_bucket *bh;
q->fd = fd;
q->filp = filp;
init_waitqueue_head(&q->waiters);
get_key_refs(&q->key);
bh = hash_futex(&q->key);
q->lock_ptr = &bh->lock;
spin_lock(&bh->lock);
return bh;
}
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
{
list_add_tail(&q->list, &bh->chain);
spin_unlock(&bh->lock);
}
static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
{
spin_unlock(&bh->lock);
drop_key_refs(&q->key);
}
/*
* queue_me and unqueue_me must be called as a pair, each
* exactly once. They are called with the hashed spinlock held.
*/
/* The key must be already stored in q->key. */
static void queue_me(struct futex_q *q, int fd, struct file *filp)
{
struct futex_hash_bucket *bh;
bh = queue_lock(q, fd, filp);
__queue_me(q, bh);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
int ret = 0;
spinlock_t *lock_ptr;
/* In the common case we don't take the spinlock, which is nice. */
retry:
lock_ptr = q->lock_ptr;
if (lock_ptr != 0) {
spin_lock(lock_ptr);
/*
* q->lock_ptr can change between reading it and
* spin_lock(), causing us to take the wrong lock. This
* corrects the race condition.
*
* Reasoning goes like this: if we have the wrong lock,
* q->lock_ptr must have changed (maybe several times)
* between reading it and the spin_lock(). It can
* change again after the spin_lock() but only if it was
* already changed before the spin_lock(). It cannot,
* however, change back to the original value. Therefore
* we can detect whether we acquired the correct lock.
*/
if (unlikely(lock_ptr != q->lock_ptr)) {
spin_unlock(lock_ptr);
goto retry;
}
WARN_ON(list_empty(&q->list));
list_del(&q->list);
spin_unlock(lock_ptr);
ret = 1;
}
drop_key_refs(&q->key);
return ret;
}
static int futex_wait(unsigned long uaddr, int val, unsigned long time)
{
DECLARE_WAITQUEUE(wait, current);
int ret, curval;
struct futex_q q;
struct futex_hash_bucket *bh;
retry:
down_read(&current->mm->mmap_sem);
ret = get_futex_key(uaddr, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
bh = queue_lock(&q, -1, NULL);
/*
* Access the page AFTER the futex is queued.
* Order is important:
*
* Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
* Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
*
* The basic logical guarantee of a futex is that it blocks ONLY
* if cond(var) is known to be true at the time of blocking, for
* any cond. If we queued after testing *uaddr, that would open
* a race condition where we could block indefinitely with
* cond(var) false, which would violate the guarantee.
*
* A consequence is that futex_wait() can return zero and absorb
* a wakeup when *uaddr != val on entry to the syscall. This is
* rare, but normal.
*
* We hold the mmap semaphore, so the mapping cannot have changed
* since we looked it up in get_futex_key.
*/
ret = get_futex_value_locked(&curval, (int __user *)uaddr);
if (unlikely(ret)) {
queue_unlock(&q, bh);
/* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
*/
up_read(&current->mm->mmap_sem);
ret = get_user(curval, (int __user *)uaddr);
if (!ret)
goto retry;
return ret;
}
if (curval != val) {
ret = -EWOULDBLOCK;
queue_unlock(&q, bh);
goto out_release_sem;
}
/* Only actually queue if *uaddr contained val. */
__queue_me(&q, bh);
/*
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
up_read(&current->mm->mmap_sem);
/*
* There might have been scheduling since the queue_me(), as we
* cannot hold a spinlock across the get_user() in case it
* faults, and we cannot just set TASK_INTERRUPTIBLE state when
* queueing ourselves into the futex hash. This code thus has to
* rely on the futex_wake() code removing us from hash when it
* wakes us up.
*/
/* add_wait_queue is the barrier after __set_current_state. */
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&q.waiters, &wait);
/*
* !list_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
if (likely(!list_empty(&q.list)))
time = schedule_timeout(time);
__set_current_state(TASK_RUNNING);
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
*/
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
if (time == 0)
return -ETIMEDOUT;
/* We expect signal_pending(current), but another thread may
* have handled it for us already. */
return -EINTR;
out_release_sem:
up_read(&current->mm->mmap_sem);
return ret;
}
static int futex_close(struct inode *inode, struct file *filp)
{
struct futex_q *q = filp->private_data;
unqueue_me(q);
kfree(q);
return 0;
}
/* This is one-shot: once it's gone off you need a new fd */
static unsigned int futex_poll(struct file *filp,
struct poll_table_struct *wait)
{
struct futex_q *q = filp->private_data;
int ret = 0;
poll_wait(filp, &q->waiters, wait);
/*
* list_empty() is safe here without any lock.
* q->lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
if (list_empty(&q->list))
ret = POLLIN | POLLRDNORM;
return ret;
}
static struct file_operations futex_fops = {
.release = futex_close,
.poll = futex_poll,
};
/*
* Signal allows caller to avoid the race which would occur if they
* set the sigio stuff up afterwards.
*/
static int futex_fd(unsigned long uaddr, int signal)
{
struct futex_q *q;
struct file *filp;
int ret, err;
ret = -EINVAL;
if (signal < 0 || signal > _NSIG)
goto out;
ret = get_unused_fd();
if (ret < 0)
goto out;
filp = get_empty_filp();
if (!filp) {
put_unused_fd(ret);
ret = -ENFILE;
goto out;
}
filp->f_op = &futex_fops;
filp->f_vfsmnt = mntget(futex_mnt);
filp->f_dentry = dget(futex_mnt->mnt_root);
filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
if (signal) {
int err;
err = f_setown(filp, current->pid, 1);
if (err < 0) {
put_unused_fd(ret);
put_filp(filp);
ret = err;
goto out;
}
filp->f_owner.signum = signal;
}
q = kmalloc(sizeof(*q), GFP_KERNEL);
if (!q) {
put_unused_fd(ret);
put_filp(filp);
ret = -ENOMEM;
goto out;
}
down_read(&current->mm->mmap_sem);
err = get_futex_key(uaddr, &q->key);
if (unlikely(err != 0)) {
up_read(&current->mm->mmap_sem);
put_unused_fd(ret);
put_filp(filp);
kfree(q);
return err;
}
/*
* queue_me() must be called before releasing mmap_sem, because
* key->shared.inode needs to be referenced while holding it.
*/
filp->private_data = q;
queue_me(q, ret, filp);
up_read(&current->mm->mmap_sem);
/* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp);
out:
return ret;
}
long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
unsigned long uaddr2, int val2, int val3)
{
int ret;
switch (op) {
case FUTEX_WAIT:
ret = futex_wait(uaddr, val, timeout);
break;
case FUTEX_WAKE:
ret = futex_wake(uaddr, val);
break;
case FUTEX_FD:
/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
ret = futex_fd(uaddr, val);
break;
case FUTEX_REQUEUE:
ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
break;
case FUTEX_CMP_REQUEUE:
ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
break;
default:
ret = -ENOSYS;
}
return ret;
}
asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
struct timespec __user *utime, u32 __user *uaddr2,
int val3)
{
struct timespec t;
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
int val2 = 0;
if ((op == FUTEX_WAIT) && utime) {
if (copy_from_user(&t, utime, sizeof(t)) != 0)
return -EFAULT;
timeout = timespec_to_jiffies(&t) + 1;
}
/*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE.
*/
if (op >= FUTEX_REQUEUE)
val2 = (int) (unsigned long) utime;
return do_futex((unsigned long)uaddr, op, val, timeout,
(unsigned long)uaddr2, val2, val3);
}
static struct super_block *
futexfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
}
static struct file_system_type futex_fs_type = {
.name = "futexfs",
.get_sb = futexfs_get_sb,
.kill_sb = kill_anon_super,
};
static int __init init(void)
{
unsigned int i;
register_filesystem(&futex_fs_type);
futex_mnt = kern_mount(&futex_fs_type);
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
INIT_LIST_HEAD(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
return 0;
}
__initcall(init);
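/*
 * Standalone illustration (not part of this file): the user-space
 * wait/wake protocol described in the futex_wait() comment above. The
 * value passed to FUTEX_WAIT guards against a lost wakeup: the kernel
 * only blocks if the word still holds that value. A real implementation
 * would manipulate futex_word with atomic operations.
 */
#if 0
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static int futex_word;

static void wait_until_woken(void)
{
	/* Loops in case of a spurious return; blocks only while 0. */
	while (futex_word == 0)
		syscall(SYS_futex, &futex_word, FUTEX_WAIT, 0, NULL);
}

static void wake_one(void)
{
	futex_word = 1;		/* make the condition true first... */
	syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1);	/* ...then wake */
}
#endif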

182
kernel/intermodule.c Normal file

@@ -0,0 +1,182 @@
/* Deprecated, do not use. Moved from module.c to here. --RR */
/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
#include <linux/module.h>
#include <linux/kmod.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/slab.h>
/* inter_module functions are always available, even when the kernel is
* compiled without modules. Consumers of inter_module_xxx routines
 * will always work, even when both producer and consumer are built into
 * the kernel; this approach removes lots of #ifdefs in mainline code.
*/
static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
static DEFINE_SPINLOCK(ime_lock);
static int kmalloc_failed;
struct inter_module_entry {
struct list_head list;
const char *im_name;
struct module *owner;
const void *userdata;
};
/**
* inter_module_register - register a new set of inter module data.
* @im_name: an arbitrary string to identify the data, must be unique
* @owner: module that is registering the data, always use THIS_MODULE
* @userdata: pointer to arbitrary userdata to be registered
*
* Description: Check that the im_name has not already been registered,
* complain if it has. For new data, add it to the inter_module_entry
* list.
*/
void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
{
struct list_head *tmp;
struct inter_module_entry *ime, *ime_new;
if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
/* Overloaded kernel, not fatal */
printk(KERN_ERR
"Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
im_name);
kmalloc_failed = 1;
return;
}
memset(ime_new, 0, sizeof(*ime_new));
ime_new->im_name = im_name;
ime_new->owner = owner;
ime_new->userdata = userdata;
spin_lock(&ime_lock);
list_for_each(tmp, &ime_list) {
ime = list_entry(tmp, struct inter_module_entry, list);
if (strcmp(ime->im_name, im_name) == 0) {
spin_unlock(&ime_lock);
kfree(ime_new);
/* Program logic error, fatal */
printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
BUG();
}
}
list_add(&(ime_new->list), &ime_list);
spin_unlock(&ime_lock);
}
/**
* inter_module_unregister - unregister a set of inter module data.
* @im_name: an arbitrary string to identify the data, must be unique
*
* Description: Check that the im_name has been registered, complain if
* it has not. For existing data, remove it from the
* inter_module_entry list.
*/
void inter_module_unregister(const char *im_name)
{
struct list_head *tmp;
struct inter_module_entry *ime;
spin_lock(&ime_lock);
list_for_each(tmp, &ime_list) {
ime = list_entry(tmp, struct inter_module_entry, list);
if (strcmp(ime->im_name, im_name) == 0) {
list_del(&(ime->list));
spin_unlock(&ime_lock);
kfree(ime);
return;
}
}
spin_unlock(&ime_lock);
if (kmalloc_failed) {
printk(KERN_ERR
"inter_module_unregister: no entry for '%s', "
"probably caused by previous kmalloc failure\n",
im_name);
return;
}
else {
/* Program logic error, fatal */
printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
BUG();
}
}
/**
* inter_module_get - return arbitrary userdata from another module.
* @im_name: an arbitrary string to identify the data, must be unique
*
* Description: If the im_name has not been registered, return NULL.
* Try to increment the use count on the owning module, if that fails
* then return NULL. Otherwise return the userdata.
*/
static const void *inter_module_get(const char *im_name)
{
struct list_head *tmp;
struct inter_module_entry *ime;
const void *result = NULL;
spin_lock(&ime_lock);
list_for_each(tmp, &ime_list) {
ime = list_entry(tmp, struct inter_module_entry, list);
if (strcmp(ime->im_name, im_name) == 0) {
if (try_module_get(ime->owner))
result = ime->userdata;
break;
}
}
spin_unlock(&ime_lock);
return(result);
}
/**
* inter_module_get_request - im get with automatic request_module.
* @im_name: an arbitrary string to identify the data, must be unique
* @modname: module that is expected to register im_name
*
* Description: If inter_module_get fails, do request_module then retry.
*/
const void *inter_module_get_request(const char *im_name, const char *modname)
{
const void *result = inter_module_get(im_name);
if (!result) {
request_module("%s", modname);
result = inter_module_get(im_name);
}
return(result);
}
/**
* inter_module_put - release use of data from another module.
* @im_name: an arbitrary string to identify the data, must be unique
*
* Description: If the im_name has not been registered, complain,
* otherwise decrement the use count on the owning module.
*/
void inter_module_put(const char *im_name)
{
struct list_head *tmp;
struct inter_module_entry *ime;
spin_lock(&ime_lock);
list_for_each(tmp, &ime_list) {
ime = list_entry(tmp, struct inter_module_entry, list);
if (strcmp(ime->im_name, im_name) == 0) {
if (ime->owner)
module_put(ime->owner);
spin_unlock(&ime_lock);
return;
}
}
spin_unlock(&ime_lock);
printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
BUG();
}
EXPORT_SYMBOL(inter_module_register);
EXPORT_SYMBOL(inter_module_unregister);
EXPORT_SYMBOL(inter_module_get_request);
EXPORT_SYMBOL(inter_module_put);
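/*
 * Illustrative only: a producer/consumer pair for the registry above
 * (all names made up; note the interface is already marked deprecated
 * at the top of this file).
 */
#if 0
/* producer module */
static struct example_ops { int (*do_it)(void); } example_ops;

static int __init producer_init(void)
{
	inter_module_register("example_ops", THIS_MODULE, &example_ops);
	return 0;
}

static void __exit producer_exit(void)
{
	inter_module_unregister("example_ops");
}

/* consumer, possibly in another module */
static int consumer_use(void)
{
	const struct example_ops *ops =
		inter_module_get_request("example_ops", "producer");

	if (!ops)
		return -ENODEV;
	ops->do_it();
	inter_module_put("example_ops");	/* drops the module ref */
	return 0;
}
#endif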

5
kernel/irq/Makefile Normal file

@@ -0,0 +1,5 @@
obj-y := handle.o manage.o spurious.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o

189
kernel/irq/autoprobe.c Normal file

@@ -0,0 +1,189 @@
/*
* linux/kernel/irq/autoprobe.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the interrupt probing code and driver APIs.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
/*
* Autodetection depends on the fact that any interrupt that
* comes in on to an unassigned handler will get stuck with
* "IRQ_WAITING" cleared and the interrupt disabled.
*/
static DECLARE_MUTEX(probe_sem);
/**
* probe_irq_on - begin an interrupt autodetect
*
* Commence probing for an interrupt. The interrupts are scanned
* and a mask of potential interrupt lines is returned.
*
*/
unsigned long probe_irq_on(void)
{
unsigned long val, delay;
irq_desc_t *desc;
unsigned int i;
down(&probe_sem);
/*
* something may have generated an irq long ago and we want to
* flush such a longstanding irq before considering it as spurious.
*/
for (i = NR_IRQS-1; i > 0; i--) {
desc = irq_desc + i;
spin_lock_irq(&desc->lock);
if (!irq_desc[i].action)
irq_desc[i].handler->startup(i);
spin_unlock_irq(&desc->lock);
}
/* Wait for longstanding interrupts to trigger. */
for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
/* about 20ms delay */ barrier();
/*
* enable any unassigned irqs
* (we must startup again here because if a longstanding irq
* happened in the previous stage, it may have masked itself)
*/
for (i = NR_IRQS-1; i > 0; i--) {
desc = irq_desc + i;
spin_lock_irq(&desc->lock);
if (!desc->action) {
desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
if (desc->handler->startup(i))
desc->status |= IRQ_PENDING;
}
spin_unlock_irq(&desc->lock);
}
/*
* Wait for spurious interrupts to trigger
*/
for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
/* about 100ms delay */ barrier();
/*
* Now filter out any obviously spurious interrupts
*/
val = 0;
for (i = 0; i < NR_IRQS; i++) {
irq_desc_t *desc = irq_desc + i;
unsigned int status;
spin_lock_irq(&desc->lock);
status = desc->status;
if (status & IRQ_AUTODETECT) {
/* It triggered already - consider it spurious. */
if (!(status & IRQ_WAITING)) {
desc->status = status & ~IRQ_AUTODETECT;
desc->handler->shutdown(i);
} else
if (i < 32)
val |= 1 << i;
}
spin_unlock_irq(&desc->lock);
}
return val;
}
EXPORT_SYMBOL(probe_irq_on);
/**
* probe_irq_mask - scan a bitmap of interrupt lines
* @val: mask of interrupts to consider
*
* Scan the interrupt lines and return a bitmap of active
* autodetect interrupts. The interrupt probe logic state
* is then returned to its previous value.
*
* Note: we need to scan all the irq's even though we will
* only return autodetect irq numbers - just so that we reset
* them all to a known state.
*/
unsigned int probe_irq_mask(unsigned long val)
{
unsigned int mask;
int i;
mask = 0;
for (i = 0; i < NR_IRQS; i++) {
irq_desc_t *desc = irq_desc + i;
unsigned int status;
spin_lock_irq(&desc->lock);
status = desc->status;
if (status & IRQ_AUTODETECT) {
if (i < 16 && !(status & IRQ_WAITING))
mask |= 1 << i;
desc->status = status & ~IRQ_AUTODETECT;
desc->handler->shutdown(i);
}
spin_unlock_irq(&desc->lock);
}
up(&probe_sem);
return mask & val;
}
EXPORT_SYMBOL(probe_irq_mask);
/**
* probe_irq_off - end an interrupt autodetect
* @val: mask of potential interrupts (unused)
*
* Scans the unused interrupt lines and returns the line which
 * appears to have triggered the interrupt. If no interrupt was
 * found then zero is returned. If more than one interrupt is
 * found then the negative of the first candidate is returned to
 * indicate that there is doubt.
*
* The interrupt probe logic state is returned to its previous
* value.
*
* BUGS: When used in a module (which arguably shouldn't happen)
* nothing prevents two IRQ probe callers from overlapping. The
* results of this are non-optimal.
*/
int probe_irq_off(unsigned long val)
{
int i, irq_found = 0, nr_irqs = 0;
for (i = 0; i < NR_IRQS; i++) {
irq_desc_t *desc = irq_desc + i;
unsigned int status;
spin_lock_irq(&desc->lock);
status = desc->status;
if (status & IRQ_AUTODETECT) {
if (!(status & IRQ_WAITING)) {
if (!nr_irqs)
irq_found = i;
nr_irqs++;
}
desc->status = status & ~IRQ_AUTODETECT;
desc->handler->shutdown(i);
}
spin_unlock_irq(&desc->lock);
}
up(&probe_sem);
if (nr_irqs > 1)
irq_found = -irq_found;
return irq_found;
}
EXPORT_SYMBOL(probe_irq_off);
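/*
 * Illustrative only: the classic driver-side autoprobe sequence built
 * from the calls above. The step that makes the device raise its
 * interrupt is hardware-specific and only sketched here.
 */
#if 0
static int example_find_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();
	/* ... program the device so that it asserts its interrupt ... */
	udelay(100);			/* give the line time to trigger */
	irq = probe_irq_off(mask);	/* 0: none seen, <0: more than one */

	return (irq > 0) ? irq : -ENODEV;
}
#endif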

193
kernel/irq/handle.c Normal file

@@ -0,0 +1,193 @@
/*
* linux/kernel/irq/handle.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the core interrupt handling code.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include "internals.h"
/*
* Linux has a controller-independent interrupt architecture.
* Every controller has a 'controller-template', that is used
* by the main code to do the right thing. Each driver-visible
 * interrupt source is transparently wired to the appropriate
* controller. Thus drivers need not be aware of the
* interrupt-controller.
*
* The code is designed to be easily extended with new/different
* interrupt controllers, without having to do assembly magic or
* having to touch the generic code.
*
* Controller mappings for all interrupt sources:
*/
irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
[0 ... NR_IRQS-1] = {
.handler = &no_irq_type,
.lock = SPIN_LOCK_UNLOCKED
}
};
/*
* Generic 'no controller' code
*/
static void end_none(unsigned int irq) { }
static void enable_none(unsigned int irq) { }
static void disable_none(unsigned int irq) { }
static void shutdown_none(unsigned int irq) { }
static unsigned int startup_none(unsigned int irq) { return 0; }
static void ack_none(unsigned int irq)
{
/*
	 * 'What should we do if we get a hw irq event on an illegal vector?'
	 * Each architecture has to answer this itself.
*/
ack_bad_irq(irq);
}
struct hw_interrupt_type no_irq_type = {
.typename = "none",
.startup = startup_none,
.shutdown = shutdown_none,
.enable = enable_none,
.disable = disable_none,
.ack = ack_none,
.end = end_none,
.set_affinity = NULL
};
/*
* Special, empty irq handler:
*/
irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
{
return IRQ_NONE;
}
/*
* Have got an event to handle:
*/
fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
struct irqaction *action)
{
int ret, retval = 0, status = 0;
if (!(action->flags & SA_INTERRUPT))
local_irq_enable();
do {
ret = action->handler(irq, action->dev_id, regs);
if (ret == IRQ_HANDLED)
status |= action->flags;
retval |= ret;
action = action->next;
} while (action);
if (status & SA_SAMPLE_RANDOM)
add_interrupt_randomness(irq);
local_irq_disable();
return retval;
}
/*
* do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
{
irq_desc_t *desc = irq_desc + irq;
struct irqaction * action;
unsigned int status;
kstat_this_cpu.irqs[irq]++;
if (desc->status & IRQ_PER_CPU) {
irqreturn_t action_ret;
/*
* No locking required for CPU-local interrupts:
*/
desc->handler->ack(irq);
action_ret = handle_IRQ_event(irq, regs, desc->action);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
desc->handler->end(irq);
return 1;
}
spin_lock(&desc->lock);
desc->handler->ack(irq);
/*
* REPLAY is when Linux resends an IRQ that was dropped earlier
* WAITING is used by probe to mark irqs that are being tested
*/
status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
status |= IRQ_PENDING; /* we _want_ to handle it */
/*
* If the IRQ is disabled for whatever reason, we cannot
* use the action we have.
*/
action = NULL;
if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
action = desc->action;
status &= ~IRQ_PENDING; /* we commit to handling */
status |= IRQ_INPROGRESS; /* we are handling it */
}
desc->status = status;
/*
* If there is no IRQ handler or it was disabled, exit early.
* Since we set PENDING, if another processor is handling
* a different instance of this same irq, the other processor
* will take care of it.
*/
if (unlikely(!action))
goto out;
/*
* Edge triggered interrupts need to remember
* pending events.
* This applies to any hw interrupts that allow a second
* instance of the same irq to arrive while we are in do_IRQ
* or in the handler. But the code here only handles the _second_
* instance of the irq, not the third or fourth. So it is mostly
* useful for irq hardware that does not mask cleanly in an
* SMP environment.
*/
for (;;) {
irqreturn_t action_ret;
spin_unlock(&desc->lock);
action_ret = handle_IRQ_event(irq, regs, action);
spin_lock(&desc->lock);
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
if (likely(!(desc->status & IRQ_PENDING)))
break;
desc->status &= ~IRQ_PENDING;
}
desc->status &= ~IRQ_INPROGRESS;
out:
/*
* The ->end() handler has to deal with interrupts which got
* disabled while the handler was running.
*/
desc->handler->end(irq);
spin_unlock(&desc->lock);
return 1;
}
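
Usage sketch (illustrative, not part of the original commit): the IRQ_HANDLED/IRQ_NONE convention that handle_IRQ_event() folds together above is what every driver handler must follow. The device, register offsets and status bit below are assumptions.

#include <linux/interrupt.h>
#include <linux/types.h>
#include <asm/io.h>

struct mydev {
	void __iomem *regs;	/* hypothetical device registers */
};

static irqreturn_t mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	struct mydev *dev = dev_id;
	u32 status = readl(dev->regs + 0x00);	/* assumed status register */

	if (!(status & 0x1))		/* not ours: vital on shared lines */
		return IRQ_NONE;

	writel(0x1, dev->regs + 0x04);	/* assumed interrupt-ack register */
	return IRQ_HANDLED;	/* counted as progress by note_interrupt() */
}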

18
kernel/irq/internals.h Normal file
View File

@@ -0,0 +1,18 @@
/*
* IRQ subsystem internal functions and variables:
*/
extern int noirqdebug;
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq);
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
#else
static inline void register_irq_proc(unsigned int irq) { }
static inline void register_handler_proc(unsigned int irq,
struct irqaction *action) { }
static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif

349
kernel/irq/manage.c Normal file
View File

@@ -0,0 +1,349 @@
/*
* linux/kernel/irq/manage.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains driver APIs to the irq subsystem.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include "internals.h"
#ifdef CONFIG_SMP
cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
*
* This function waits for any pending IRQ handlers for this interrupt
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need, you will deadlock.
*
* This function may be called - with care - from IRQ context.
*/
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_desc + irq;
while (desc->status & IRQ_INPROGRESS)
cpu_relax();
}
EXPORT_SYMBOL(synchronize_irq);
#endif
/**
* disable_irq_nosync - disable an irq without waiting
* @irq: Interrupt to disable
*
* Disable the selected interrupt line. Disables and Enables are
* nested.
* Unlike disable_irq(), this function does not ensure existing
* instances of the IRQ handler have completed before returning.
*
* This function may be called from IRQ context.
*/
void disable_irq_nosync(unsigned int irq)
{
irq_desc_t *desc = irq_desc + irq;
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
if (!desc->depth++) {
desc->status |= IRQ_DISABLED;
desc->handler->disable(irq);
}
spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL(disable_irq_nosync);
/**
* disable_irq - disable an irq and wait for completion
* @irq: Interrupt to disable
*
* Disable the selected interrupt line. Enables and Disables are
* nested.
* This function waits for any pending IRQ handlers for this interrupt
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need, you will deadlock.
*
* This function may be called - with care - from IRQ context.
*/
void disable_irq(unsigned int irq)
{
irq_desc_t *desc = irq_desc + irq;
disable_irq_nosync(irq);
if (desc->action)
synchronize_irq(irq);
}
EXPORT_SYMBOL(disable_irq);
/**
* enable_irq - enable handling of an irq
* @irq: Interrupt to enable
*
* Undoes the effect of one call to disable_irq(). If this
* matches the last disable, processing of interrupts on this
* IRQ line is re-enabled.
*
* This function may be called from IRQ context.
*/
void enable_irq(unsigned int irq)
{
irq_desc_t *desc = irq_desc + irq;
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
switch (desc->depth) {
case 0:
WARN_ON(1);
break;
case 1: {
unsigned int status = desc->status & ~IRQ_DISABLED;
desc->status = status;
if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = status | IRQ_REPLAY;
hw_resend_irq(desc->handler,irq);
}
desc->handler->enable(irq);
/* fall-through */
}
default:
desc->depth--;
}
spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL(enable_irq);
/*
* Internal function that tells the architecture code whether a
* particular irq has been exclusively allocated or is available
* for driver use.
*/
int can_request_irq(unsigned int irq, unsigned long irqflags)
{
struct irqaction *action;
if (irq >= NR_IRQS)
return 0;
action = irq_desc[irq].action;
if (action)
if (irqflags & action->flags & SA_SHIRQ)
action = NULL;
return !action;
}
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
*/
int setup_irq(unsigned int irq, struct irqaction * new)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *old, **p;
unsigned long flags;
int shared = 0;
if (desc->handler == &no_irq_type)
return -ENOSYS;
/*
* Some drivers like serial.c use request_irq() heavily,
* so we have to be careful not to interfere with a
* running system.
*/
if (new->flags & SA_SAMPLE_RANDOM) {
/*
* This function might sleep, so we want to call it first,
* outside of the atomic block.
* Yes, this might clear the entropy pool if the wrong
* driver is attempted to be loaded, without actually
* installing a new handler. But is that really a problem?
* Only the sysadmin is able to do this anyway.
*/
rand_initialize_irq(irq);
}
/*
* The following block of code has to be executed atomically
*/
spin_lock_irqsave(&desc->lock,flags);
p = &desc->action;
if ((old = *p) != NULL) {
/* Can't share interrupts unless both agree to */
if (!(old->flags & new->flags & SA_SHIRQ)) {
spin_unlock_irqrestore(&desc->lock,flags);
return -EBUSY;
}
/* add new interrupt at end of irq queue */
do {
p = &old->next;
old = *p;
} while (old);
shared = 1;
}
*p = new;
if (!shared) {
desc->depth = 0;
desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
IRQ_WAITING | IRQ_INPROGRESS);
if (desc->handler->startup)
desc->handler->startup(irq);
else
desc->handler->enable(irq);
}
spin_unlock_irqrestore(&desc->lock,flags);
new->irq = irq;
register_irq_proc(irq);
new->dir = NULL;
register_handler_proc(irq, new);
return 0;
}
/**
* free_irq - free an interrupt
* @irq: Interrupt line to free
* @dev_id: Device identity to free
*
* Remove an interrupt handler. The handler is removed and if the
* interrupt line is no longer in use by any driver it is disabled.
* On a shared IRQ the caller must ensure the interrupt is disabled
* on the card it drives before calling this function. The function
* does not return until any executing interrupts for this IRQ
* have completed.
*
* This function must not be called from interrupt context.
*/
void free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc;
struct irqaction **p;
unsigned long flags;
if (irq >= NR_IRQS)
return;
desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock,flags);
p = &desc->action;
for (;;) {
struct irqaction * action = *p;
if (action) {
struct irqaction **pp = p;
p = &action->next;
if (action->dev_id != dev_id)
continue;
/* Found it - now remove it from the list of entries */
*pp = action->next;
if (!desc->action) {
desc->status |= IRQ_DISABLED;
if (desc->handler->shutdown)
desc->handler->shutdown(irq);
else
desc->handler->disable(irq);
}
spin_unlock_irqrestore(&desc->lock,flags);
unregister_handler_proc(irq, action);
/* Make sure it's not being used on another CPU */
synchronize_irq(irq);
kfree(action);
return;
}
printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
spin_unlock_irqrestore(&desc->lock,flags);
return;
}
}
EXPORT_SYMBOL(free_irq);
/**
* request_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
*
* This call allocates interrupt resources and enables the
* interrupt line and IRQ handling. From the point this
* call is made your handler function may be invoked. Since
* your handler function must clear any interrupt the board
* raises, you must take care both to initialise your hardware
* and to set up the interrupt handler in the right order.
*
* Dev_id must be globally unique. Normally the address of the
* device data structure is used as the cookie. Since the handler
* receives this value it makes sense to use it.
*
* If your interrupt is shared you must pass a non NULL dev_id
* as this is required when freeing the interrupt.
*
* Flags:
*
* SA_SHIRQ Interrupt is shared
* SA_INTERRUPT Disable local interrupts while processing
* SA_SAMPLE_RANDOM The interrupt can be used for entropy
*
*/
int request_irq(unsigned int irq,
irqreturn_t (*handler)(int, void *, struct pt_regs *),
unsigned long irqflags, const char * devname, void *dev_id)
{
struct irqaction * action;
int retval;
/*
* Sanity-check: shared interrupts must pass in a real dev-ID,
* otherwise we'll have trouble later trying to figure out
* which interrupt is which (messes up the interrupt freeing
* logic etc).
*/
if ((irqflags & SA_SHIRQ) && !dev_id)
return -EINVAL;
if (irq >= NR_IRQS)
return -EINVAL;
if (!handler)
return -EINVAL;
action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
if (!action)
return -ENOMEM;
action->handler = handler;
action->flags = irqflags;
cpus_clear(action->mask);
action->name = devname;
action->next = NULL;
action->dev_id = dev_id;
retval = setup_irq(irq, action);
if (retval)
kfree(action);
return retval;
}
EXPORT_SYMBOL(request_irq);
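
Usage sketch (illustrative, not part of the original commit) of the request_irq()/free_irq() pairing documented above; the IRQ number, name and stub handler are assumptions.

#include <linux/interrupt.h>
#include <linux/errno.h>

#define MYDEV_IRQ 11		/* assumed; normally found by bus probing */

static int mydev_cookie;	/* unique dev_id, mandatory with SA_SHIRQ */

static irqreturn_t mydev_interrupt(int irq, void *dev_id,
				   struct pt_regs *regs)
{
	return IRQ_HANDLED;	/* stub for illustration */
}

static int mydev_attach(void)
{
	int err;

	err = request_irq(MYDEV_IRQ, mydev_interrupt, SA_SHIRQ,
			  "mydev", &mydev_cookie);
	if (err)
		return err;	/* e.g. -EBUSY if sharing was refused */
	/* ... only now program the hardware to raise interrupts ... */
	return 0;
}

static void mydev_detach(void)
{
	/* On a shared line, quiesce the card first; free_irq() waits for
	 * running handlers and disables the line if this was the last user. */
	free_irq(MYDEV_IRQ, &mydev_cookie);
}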

159
kernel/irq/proc.c Normal file
View File

@@ -0,0 +1,159 @@
/*
* linux/kernel/irq/proc.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the /proc/irq/ handling code.
*/
#include <linux/irq.h>
#include <linux/proc_fs.h>
#include <linux/interrupt.h>
static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
#ifdef CONFIG_SMP
/*
* The /proc/irq/<irq>/smp_affinity values:
*/
static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
void __attribute__((weak))
proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
{
irq_affinity[irq] = mask_val;
irq_desc[irq].handler->set_affinity(irq, mask_val);
}
static int irq_affinity_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
if (count - len < 2)
return -EINVAL;
len += sprintf(page + len, "\n");
return len;
}
int no_irq_affinity;
static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
unsigned long count, void *data)
{
unsigned int irq = (int)(long)data, full_count = count, err;
cpumask_t new_value, tmp;
if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
return -EIO;
err = cpumask_parse(buffer, count, new_value);
if (err)
return err;
/*
* Do not allow disabling IRQs completely - it's too easy a
* way to make the system unusable accidentally :-) At least
* one online CPU still has to be targeted.
*/
cpus_and(tmp, new_value, cpu_online_map);
if (cpus_empty(tmp))
return -EINVAL;
proc_set_irq_affinity(irq, new_value);
return full_count;
}
#endif
#define MAX_NAMELEN 128
static int name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *action;
for (action = desc->action ; action; action = action->next)
if ((action != new_action) && action->name &&
!strcmp(new_action->name, action->name))
return 0;
return 1;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
{
char name [MAX_NAMELEN];
if (!irq_dir[irq] || action->dir || !action->name ||
!name_unique(irq, action))
return;
memset(name, 0, MAX_NAMELEN);
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
action->dir = proc_mkdir(name, irq_dir[irq]);
}
#undef MAX_NAMELEN
#define MAX_NAMELEN 10
void register_irq_proc(unsigned int irq)
{
char name [MAX_NAMELEN];
if (!root_irq_dir ||
(irq_desc[irq].handler == &no_irq_type) ||
irq_dir[irq])
return;
memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
irq_dir[irq] = proc_mkdir(name, root_irq_dir);
#ifdef CONFIG_SMP
{
struct proc_dir_entry *entry;
/* create /proc/irq/<irq>/smp_affinity */
entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
if (entry) {
entry->nlink = 1;
entry->data = (void *)(long)irq;
entry->read_proc = irq_affinity_read_proc;
entry->write_proc = irq_affinity_write_proc;
}
smp_affinity_entry[irq] = entry;
}
#endif
}
#undef MAX_NAMELEN
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
{
if (action->dir)
remove_proc_entry(action->dir->name, irq_dir[irq]);
}
void init_irq_proc(void)
{
int i;
/* create /proc/irq */
root_irq_dir = proc_mkdir("irq", NULL);
if (!root_irq_dir)
return;
/*
* Create entries for all existing IRQs.
*/
for (i = 0; i < NR_IRQS; i++)
register_irq_proc(i);
}
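
Userspace sketch (illustrative, not part of the original commit) of the interface this file creates: /proc/irq/<n>/smp_affinity takes a hex CPU mask, and irq_affinity_write_proc() above rejects masks with no online CPU. IRQ 19 is an arbitrary example.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f) {
		perror("open");
		return 1;
	}
	fprintf(f, "3\n");	/* restrict IRQ 19 to CPUs 0 and 1 */
	if (fclose(f) != 0) {
		perror("write");	/* -EIO if no set_affinity handler */
		return 1;
	}
	return 0;
}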

96
kernel/irq/spurious.c Normal file
View File

@@ -0,0 +1,96 @@
/*
* linux/kernel/irq/spurious.c
*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains spurious interrupt handling.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
/*
* If 99,900 of the previous 100,000 interrupts have not been handled
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
* and try to turn the IRQ off.
*
* (The other 100 of 100,000 interrupts may have come from a correctly
* functioning device sharing an IRQ with the failing one.)
*
* Called under desc->lock
*/
static void
__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
{
struct irqaction *action;
if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
printk(KERN_ERR "irq event %d: bogus return value %x\n",
irq, action_ret);
} else {
printk(KERN_ERR "irq %d: nobody cared!\n", irq);
}
dump_stack();
printk(KERN_ERR "handlers:\n");
action = desc->action;
while (action) {
printk(KERN_ERR "[<%p>]", action->handler);
print_symbol(" (%s)",
(unsigned long)action->handler);
printk("\n");
action = action->next;
}
}
void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
{
static int count = 100;
if (count > 0) {
count--;
__report_bad_irq(irq, desc, action_ret);
}
}
void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
{
if (action_ret != IRQ_HANDLED) {
desc->irqs_unhandled++;
if (action_ret != IRQ_NONE)
report_bad_irq(irq, desc, action_ret);
}
desc->irq_count++;
if (desc->irq_count < 100000)
return;
desc->irq_count = 0;
if (desc->irqs_unhandled > 99900) {
/*
* The interrupt is stuck
*/
__report_bad_irq(irq, desc, action_ret);
/*
* Now kill the IRQ
*/
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
desc->status |= IRQ_DISABLED;
desc->handler->disable(irq);
}
desc->irqs_unhandled = 0;
}
int noirqdebug;
int __init noirqdebug_setup(char *str)
{
noirqdebug = 1;
printk(KERN_INFO "IRQ lockup detection disabled\n");
return 1;
}
__setup("noirqdebug", noirqdebug_setup);
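
Standalone sketch (illustrative, not kernel code) of the windowing arithmetic in note_interrupt() above: a line where only one interrupt in two thousand is handled leaves 99,950 of every 100,000 interrupts unhandled, crossing the "more than 99,900" threshold in each window.

#include <stdio.h>

int main(void)
{
	int irq_count = 0, irqs_unhandled = 0, window = 0, i;

	for (i = 0; i < 300000; i++) {
		int handled = (i % 2000) == 0;	/* 1 in 2000: a stuck line */

		if (!handled)
			irqs_unhandled++;
		if (++irq_count < 100000)
			continue;	/* window not yet full */
		irq_count = 0;
		window++;
		if (irqs_unhandled > 99900)
			printf("window %d: would disable the IRQ\n", window);
		irqs_unhandled = 0;
	}
	return 0;
}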

241
kernel/itimer.c Normal file
View File

@@ -0,0 +1,241 @@
/*
* linux/kernel/itimer.c
*
* Copyright (C) 1992 Darren Senn
*/
/* These are all the functions necessary to implement itimers */
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/syscalls.h>
#include <linux/time.h>
#include <linux/posix-timers.h>
#include <asm/uaccess.h>
static unsigned long it_real_value(struct signal_struct *sig)
{
unsigned long val = 0;
if (timer_pending(&sig->real_timer)) {
val = sig->real_timer.expires - jiffies;
/* look out for negative/zero itimer.. */
if ((long) val <= 0)
val = 1;
}
return val;
}
int do_getitimer(int which, struct itimerval *value)
{
struct task_struct *tsk = current;
unsigned long interval, val;
cputime_t cinterval, cval;
switch (which) {
case ITIMER_REAL:
spin_lock_irq(&tsk->sighand->siglock);
interval = tsk->signal->it_real_incr;
val = it_real_value(tsk->signal);
spin_unlock_irq(&tsk->sighand->siglock);
jiffies_to_timeval(val, &value->it_value);
jiffies_to_timeval(interval, &value->it_interval);
break;
case ITIMER_VIRTUAL:
read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_virt_expires;
cinterval = tsk->signal->it_virt_incr;
if (!cputime_eq(cval, cputime_zero)) {
struct task_struct *t = tsk;
cputime_t utime = tsk->signal->utime;
do {
utime = cputime_add(utime, t->utime);
t = next_thread(t);
} while (t != tsk);
if (cputime_le(cval, utime)) { /* about to fire */
cval = jiffies_to_cputime(1);
} else {
cval = cputime_sub(cval, utime);
}
}
spin_unlock_irq(&tsk->sighand->siglock);
read_unlock(&tasklist_lock);
cputime_to_timeval(cval, &value->it_value);
cputime_to_timeval(cinterval, &value->it_interval);
break;
case ITIMER_PROF:
read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_prof_expires;
cinterval = tsk->signal->it_prof_incr;
if (!cputime_eq(cval, cputime_zero)) {
struct task_struct *t = tsk;
cputime_t ptime = cputime_add(tsk->signal->utime,
tsk->signal->stime);
do {
ptime = cputime_add(ptime,
cputime_add(t->utime,
t->stime));
t = next_thread(t);
} while (t != tsk);
if (cputime_le(cval, ptime)) { /* about to fire */
cval = jiffies_to_cputime(1);
} else {
cval = cputime_sub(cval, ptime);
}
}
spin_unlock_irq(&tsk->sighand->siglock);
read_unlock(&tasklist_lock);
cputime_to_timeval(cval, &value->it_value);
cputime_to_timeval(cinterval, &value->it_interval);
break;
default:
return(-EINVAL);
}
return 0;
}
asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
{
int error = -EFAULT;
struct itimerval get_buffer;
if (value) {
error = do_getitimer(which, &get_buffer);
if (!error &&
copy_to_user(value, &get_buffer, sizeof(get_buffer)))
error = -EFAULT;
}
return error;
}
/*
* Called with P->sighand->siglock held and P->signal->real_timer inactive.
* If interval is nonzero, arm the timer for interval ticks from now.
*/
static inline void it_real_arm(struct task_struct *p, unsigned long interval)
{
p->signal->it_real_value = interval; /* XXX unnecessary field?? */
if (interval == 0)
return;
if (interval > (unsigned long) LONG_MAX)
interval = LONG_MAX;
p->signal->real_timer.expires = jiffies + interval;
add_timer(&p->signal->real_timer);
}
void it_real_fn(unsigned long __data)
{
struct task_struct * p = (struct task_struct *) __data;
send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
/*
* Now restart the timer if necessary. We don't need any locking
* here because do_setitimer makes sure we have finished running
* before it touches anything.
*/
it_real_arm(p, p->signal->it_real_incr);
}
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
{
struct task_struct *tsk = current;
unsigned long val, interval;
cputime_t cval, cinterval, nval, ninterval;
switch (which) {
case ITIMER_REAL:
spin_lock_irq(&tsk->sighand->siglock);
interval = tsk->signal->it_real_incr;
val = it_real_value(tsk->signal);
if (val)
del_timer_sync(&tsk->signal->real_timer);
tsk->signal->it_real_incr =
timeval_to_jiffies(&value->it_interval);
it_real_arm(tsk, timeval_to_jiffies(&value->it_value));
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
jiffies_to_timeval(val, &ovalue->it_value);
jiffies_to_timeval(interval,
&ovalue->it_interval);
}
break;
case ITIMER_VIRTUAL:
nval = timeval_to_cputime(&value->it_value);
ninterval = timeval_to_cputime(&value->it_interval);
read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_virt_expires;
cinterval = tsk->signal->it_virt_incr;
if (!cputime_eq(cval, cputime_zero) ||
!cputime_eq(nval, cputime_zero)) {
if (cputime_gt(nval, cputime_zero))
nval = cputime_add(nval,
jiffies_to_cputime(1));
set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
&nval, &cval);
}
tsk->signal->it_virt_expires = nval;
tsk->signal->it_virt_incr = ninterval;
spin_unlock_irq(&tsk->sighand->siglock);
read_unlock(&tasklist_lock);
if (ovalue) {
cputime_to_timeval(cval, &ovalue->it_value);
cputime_to_timeval(cinterval, &ovalue->it_interval);
}
break;
case ITIMER_PROF:
nval = timeval_to_cputime(&value->it_value);
ninterval = timeval_to_cputime(&value->it_interval);
read_lock(&tasklist_lock);
spin_lock_irq(&tsk->sighand->siglock);
cval = tsk->signal->it_prof_expires;
cinterval = tsk->signal->it_prof_incr;
if (!cputime_eq(cval, cputime_zero) ||
!cputime_eq(nval, cputime_zero)) {
if (cputime_gt(nval, cputime_zero))
nval = cputime_add(nval,
jiffies_to_cputime(1));
set_process_cpu_timer(tsk, CPUCLOCK_PROF,
&nval, &cval);
}
tsk->signal->it_prof_expires = nval;
tsk->signal->it_prof_incr = ninterval;
spin_unlock_irq(&tsk->sighand->siglock);
read_unlock(&tasklist_lock);
if (ovalue) {
cputime_to_timeval(cval, &ovalue->it_value);
cputime_to_timeval(cinterval, &ovalue->it_interval);
}
break;
default:
return -EINVAL;
}
return 0;
}
asmlinkage long sys_setitimer(int which,
struct itimerval __user *value,
struct itimerval __user *ovalue)
{
struct itimerval set_buffer, get_buffer;
int error;
if (value) {
if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
return -EFAULT;
} else
memset((char *) &set_buffer, 0, sizeof(set_buffer));
error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
if (error || !ovalue)
return error;
if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
return -EFAULT;
return 0;
}
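
Userspace view (illustrative, not part of the original commit) of the sys_setitimer()/sys_getitimer() pair above: a recurring one-second ITIMER_REAL delivering SIGALRM.

#include <stdio.h>
#include <signal.h>
#include <sys/time.h>
#include <unistd.h>

static void on_alarm(int sig)
{
	write(1, "tick\n", 5);		/* async-signal-safe */
}

int main(void)
{
	struct itimerval it = {
		.it_interval = { .tv_sec = 1, .tv_usec = 0 },	/* reload */
		.it_value    = { .tv_sec = 1, .tv_usec = 0 },	/* first fire */
	};

	signal(SIGALRM, on_alarm);
	if (setitimer(ITIMER_REAL, &it, NULL) < 0) {
		perror("setitimer");
		return 1;
	}
	for (;;)
		pause();		/* woken once per second by SIGALRM */
}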

411
kernel/kallsyms.c Normal file
View File

@@ -0,0 +1,411 @@
/*
* kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
*
* Rewritten and vastly simplified by Rusty Russell for in-kernel
* module loader:
* Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
*
* ChangeLog:
*
* (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
* Changed the compression method from stem compression to "table lookup"
* compression (see scripts/kallsyms.c for a more complete description)
*/
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/err.h>
#include <linux/proc_fs.h>
#include <linux/mm.h>
#include <asm/sections.h>
#ifdef CONFIG_KALLSYMS_ALL
#define all_var 1
#else
#define all_var 0
#endif
/* These will be re-linked against their real values during the second link stage */
extern unsigned long kallsyms_addresses[] __attribute__((weak));
extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
extern u8 kallsyms_names[] __attribute__((weak));
extern u8 kallsyms_token_table[] __attribute__((weak));
extern u16 kallsyms_token_index[] __attribute__((weak));
extern unsigned long kallsyms_markers[] __attribute__((weak));
static inline int is_kernel_inittext(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext
&& addr <= (unsigned long)_einittext)
return 1;
return 0;
}
static inline int is_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
return 1;
return in_gate_area_no_task(addr);
}
static inline int is_kernel(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
return 1;
return in_gate_area_no_task(addr);
}
/* expand the compressed symbol data into the resulting uncompressed string,
given the offset to where the symbol is in the compressed stream */
static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
{
int len, skipped_first = 0;
u8 *tptr, *data;
/* get the compressed symbol length from the first symbol byte */
data = &kallsyms_names[off];
len = *data;
data++;
/* update the offset to return the offset for the next symbol on
* the compressed stream */
off += len + 1;
/* for every byte on the compressed symbol data, copy the table
entry for that byte */
while(len) {
tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
data++;
len--;
while (*tptr) {
if(skipped_first) {
*result = *tptr;
result++;
} else
skipped_first = 1;
tptr++;
}
}
*result = '\0';
/* return the offset to the next symbol */
return off;
}
/* get symbol type information. This is encoded as a single char at the
* beginning of the symbol name */
static char kallsyms_get_symbol_type(unsigned int off)
{
/* get just the first code, look it up in the token table, and return the
* first char from this token */
return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
}
/* find the offset in the compressed stream given an index into the
* kallsyms array */
static unsigned int get_symbol_offset(unsigned long pos)
{
u8 *name;
int i;
/* use the closest marker we have. We have markers every 256 positions,
* so that should be close enough */
name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
/* sequentially scan all the symbols up to the point we're searching for.
* Every symbol is stored in a [<len>][<len> bytes of data] format, so we
* just need to add the len to the current pointer for every symbol we
* wish to skip */
for(i = 0; i < (pos&0xFF); i++)
name = name + (*name) + 1;
return name - kallsyms_names;
}
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
char namebuf[KSYM_NAME_LEN+1];
unsigned long i;
unsigned int off;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf);
if (strcmp(namebuf, name) == 0)
return kallsyms_addresses[i];
}
return module_kallsyms_lookup_name(name);
}
EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
/*
* Lookup an address
* - modname is set to NULL if it's in the kernel
* - we guarantee that the returned name is valid until we reschedule even if
* it resides in a module
* - we also guarantee that modname will be valid until rescheduled
*/
const char *kallsyms_lookup(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset,
char **modname, char *namebuf)
{
unsigned long i, low, high, mid;
const char *msym;
/* This kernel should never have been booted. */
BUG_ON(!kallsyms_addresses);
namebuf[KSYM_NAME_LEN] = 0;
namebuf[0] = 0;
if ((all_var && is_kernel(addr)) ||
(!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) {
unsigned long symbol_end=0;
/* do a binary search on the sorted kallsyms_addresses array */
low = 0;
high = kallsyms_num_syms;
while (high-low > 1) {
mid = (low + high) / 2;
if (kallsyms_addresses[mid] <= addr) low = mid;
else high = mid;
}
/* search for the first aliased symbol. Aliased symbols are
symbols with the same address */
while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low])
--low;
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(low), namebuf);
/* Search for next non-aliased symbol */
for (i = low + 1; i < kallsyms_num_syms; i++) {
if (kallsyms_addresses[i] > kallsyms_addresses[low]) {
symbol_end = kallsyms_addresses[i];
break;
}
}
/* if we found no next symbol, we use the end of the section */
if (!symbol_end) {
if (is_kernel_inittext(addr))
symbol_end = (unsigned long)_einittext;
else
symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext;
}
*symbolsize = symbol_end - kallsyms_addresses[low];
*modname = NULL;
*offset = addr - kallsyms_addresses[low];
return namebuf;
}
/* see if it's in a module */
msym = module_address_lookup(addr, symbolsize, offset, modname);
if (msym)
return strncpy(namebuf, msym, KSYM_NAME_LEN);
return NULL;
}
/* Replace "%s" in fmt with the symbolic name for address and printk it. */
void __print_symbol(const char *fmt, unsigned long address)
{
char *modname;
const char *name;
unsigned long offset, size;
char namebuf[KSYM_NAME_LEN+1];
char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN +
2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1];
name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
if (!name)
sprintf(buffer, "0x%lx", address);
else {
if (modname)
sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
size, modname);
else
sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
}
printk(fmt, buffer);
}
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter
{
loff_t pos;
struct module *owner;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols */
char type;
char name[KSYM_NAME_LEN+1];
};
/* Only label it "global" if it is exported. */
static void upcase_if_global(struct kallsym_iter *iter)
{
if (is_exported(iter->name, iter->owner))
iter->type += 'A' - 'a';
}
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
&iter->value,
&iter->type, iter->name);
if (iter->owner == NULL)
return 0;
upcase_if_global(iter);
return 1;
}
/* Returns space to next name. */
static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
unsigned off = iter->nameoff;
iter->owner = NULL;
iter->value = kallsyms_addresses[iter->pos];
iter->type = kallsyms_get_symbol_type(off);
off = kallsyms_expand_symbol(off, iter->name);
return off - iter->nameoff;
}
static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
{
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
}
/* Returns false if pos at or past end of file. */
static int update_iter(struct kallsym_iter *iter, loff_t pos)
{
/* Module symbols can be accessed randomly. */
if (pos >= kallsyms_num_syms) {
iter->pos = pos;
return get_ksymbol_mod(iter);
}
/* If we're not on the desired position, reset to new position. */
if (pos != iter->pos)
reset_iter(iter, pos);
iter->nameoff += get_ksymbol_core(iter);
iter->pos++;
return 1;
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
(*pos)++;
if (!update_iter(m->private, *pos))
return NULL;
return p;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
if (!update_iter(m->private, *pos))
return NULL;
return m->private;
}
static void s_stop(struct seq_file *m, void *p)
{
}
static int s_show(struct seq_file *m, void *p)
{
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
if (iter->owner)
seq_printf(m, "%0*lx %c %s\t[%s]\n",
(int)(2*sizeof(void*)),
iter->value, iter->type, iter->name,
module_name(iter->owner));
else
seq_printf(m, "%0*lx %c %s\n",
(int)(2*sizeof(void*)),
iter->value, iter->type, iter->name);
return 0;
}
static struct seq_operations kallsyms_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show
};
static int kallsyms_open(struct inode *inode, struct file *file)
{
/* We keep the iterator in m->private, since the normal case is to
* s_start from where we left off, so we avoid calling
* get_symbol_offset for every symbol */
struct kallsym_iter *iter;
int ret;
iter = kmalloc(sizeof(*iter), GFP_KERNEL);
if (!iter)
return -ENOMEM;
reset_iter(iter, 0);
ret = seq_open(file, &kallsyms_op);
if (ret == 0)
((struct seq_file *)file->private_data)->private = iter;
else
kfree(iter);
return ret;
}
static int kallsyms_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
kfree(m->private);
return seq_release(inode, file);
}
static struct file_operations kallsyms_operations = {
.open = kallsyms_open,
.read = seq_read,
.llseek = seq_lseek,
.release = kallsyms_release,
};
static int __init kallsyms_init(void)
{
struct proc_dir_entry *entry;
entry = create_proc_entry("kallsyms", 0444, NULL);
if (entry)
entry->proc_fops = &kallsyms_operations;
return 0;
}
__initcall(kallsyms_init);
EXPORT_SYMBOL(__print_symbol);
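
Userspace sketch (illustrative, not part of the original commit) that consumes the seq_file output produced by s_show() above, one "<address> <type> <name> [module]" line per symbol.

#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	const char *wanted = argc > 1 ? argv[1] : "panic";
	char line[512], name[128], type;
	unsigned long addr;
	FILE *f = fopen("/proc/kallsyms", "r");

	if (!f) {
		perror("/proc/kallsyms");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* the optional [module] column is simply ignored here */
		if (sscanf(line, "%lx %c %127s", &addr, &type, name) == 3 &&
		    strcmp(name, wanted) == 0)
			printf("%s = %#lx (type %c)\n", name, addr, type);
	}
	fclose(f);
	return 0;
}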

168
kernel/kfifo.c Normal file
View File

@@ -0,0 +1,168 @@
/*
* A simple kernel FIFO implementation.
*
* Copyright (C) 2004 Stelian Pop <stelian@popies.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kfifo.h>
/**
* kfifo_init - allocates a new FIFO using a preallocated buffer
* @buffer: the preallocated buffer to be used.
* @size: the size of the internal buffer; this has to be a power of 2.
* @gfp_mask: get_free_pages mask, passed to kmalloc()
* @lock: the lock to be used to protect the fifo buffer
*
* Do NOT pass the kfifo to kfifo_free() after use! Simply free the
* struct kfifo with kfree().
*/
struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
unsigned int __nocast gfp_mask, spinlock_t *lock)
{
struct kfifo *fifo;
/* size must be a power of 2 */
BUG_ON(size & (size - 1));
fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
if (!fifo)
return ERR_PTR(-ENOMEM);
fifo->buffer = buffer;
fifo->size = size;
fifo->in = fifo->out = 0;
fifo->lock = lock;
return fifo;
}
EXPORT_SYMBOL(kfifo_init);
/**
* kfifo_alloc - allocates a new FIFO and its internal buffer
* @size: the size of the internal buffer to be allocated.
* @gfp_mask: get_free_pages mask, passed to kmalloc()
* @lock: the lock to be used to protect the fifo buffer
*
* The size will be rounded-up to a power of 2.
*/
struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock)
{
unsigned char *buffer;
struct kfifo *ret;
/*
* round up to the next power of 2, since our 'let the indices
* wrap' technique works only in this case.
*/
if (size & (size - 1)) {
BUG_ON(size > 0x80000000);
size = roundup_pow_of_two(size);
}
buffer = kmalloc(size, gfp_mask);
if (!buffer)
return ERR_PTR(-ENOMEM);
ret = kfifo_init(buffer, size, gfp_mask, lock);
if (IS_ERR(ret))
kfree(buffer);
return ret;
}
EXPORT_SYMBOL(kfifo_alloc);
/**
* kfifo_free - frees the FIFO
* @fifo: the fifo to be freed.
*/
void kfifo_free(struct kfifo *fifo)
{
kfree(fifo->buffer);
kfree(fifo);
}
EXPORT_SYMBOL(kfifo_free);
/**
* __kfifo_put - puts some data into the FIFO, no locking version
* @fifo: the fifo to be used.
* @buffer: the data to be added.
* @len: the length of the data to be added.
*
* This function copies at most 'len' bytes from the 'buffer' into
* the FIFO depending on the free space, and returns the number of
* bytes copied.
*
* Note that with only one concurrent reader and one concurrent
* writer, you don't need extra locking to use these functions.
*/
unsigned int __kfifo_put(struct kfifo *fifo,
unsigned char *buffer, unsigned int len)
{
unsigned int l;
len = min(len, fifo->size - fifo->in + fifo->out);
/* first put the data starting from fifo->in to buffer end */
l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
/* then put the rest (if any) at the beginning of the buffer */
memcpy(fifo->buffer, buffer + l, len - l);
fifo->in += len;
return len;
}
EXPORT_SYMBOL(__kfifo_put);
/**
* __kfifo_get - gets some data from the FIFO, no locking version
* @fifo: the fifo to be used.
* @buffer: where the data must be copied.
* @len: the size of the destination buffer.
*
* This function copies at most 'len' bytes from the FIFO into the
* 'buffer' and returns the number of copied bytes.
*
* Note that with only one concurrent reader and one concurrent
* writer, you don't need extra locking to use these functions.
*/
unsigned int __kfifo_get(struct kfifo *fifo,
unsigned char *buffer, unsigned int len)
{
unsigned int l;
len = min(len, fifo->in - fifo->out);
/* first get the data from fifo->out until the end of the buffer */
l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
/* then get the rest (if any) from the beginning of the buffer */
memcpy(buffer + l, fifo->buffer, len - l);
fifo->out += len;
return len;
}
EXPORT_SYMBOL(__kfifo_get);
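
Usage sketch (illustrative, not part of the original commit): with a single producer and a single consumer, the __ variants above need no extra locking, as their comments note; the spinlock passed to kfifo_alloc() is only stored for the locked wrappers declared in <linux/kfifo.h>.

#include <linux/kfifo.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/err.h>

static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;
static struct kfifo *my_fifo;

static int my_fifo_demo(void)
{
	unsigned char out[4];
	unsigned int n;

	my_fifo = kfifo_alloc(100, GFP_KERNEL, &my_lock); /* rounded to 128 */
	if (IS_ERR(my_fifo))
		return PTR_ERR(my_fifo);

	__kfifo_put(my_fifo, (unsigned char *)"abcd", 4);  /* producer side */
	n = __kfifo_get(my_fifo, out, sizeof(out));	   /* consumer side */
	/* n == 4 here; an emptier fifo would return fewer bytes */

	kfifo_free(my_fifo);
	return 0;
}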

256
kernel/kmod.c Normal file
View File

@@ -0,0 +1,256 @@
/*
kmod, the new module loader (replaces kerneld)
Kirk Petersen
Reorganized not to be a daemon by Adam Richter, with guidance
from Greg Zornetzer.
Modified to avoid chroot and file sharing problems.
Mikael Pettersson
Limit the concurrent number of kmod modprobes to catch loops from
"modprobe needs a service that is in a module".
Keith Owens <kaos@ocs.com.au> December 1999
Unblock all signals when we exec a usermode process.
Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
call_usermodehelper wait flag, and remove exec_usermodehelper.
Rusty Russell <rusty@rustcorp.com.au> Jan 2003
*/
#define __KERNEL_SYSCALLS__
#include <linux/config.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/namespace.h>
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <asm/uaccess.h>
extern int max_threads;
static struct workqueue_struct *khelper_wq;
#ifdef CONFIG_KMOD
/*
modprobe_path is set via /proc/sys.
*/
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
/**
* request_module - try to load a kernel module
* @fmt: printf style format string for the name of the module
* @varargs: arguments as specified in the format string
*
* Load a module using the user mode module loader. The function returns
* zero on success or a negative errno code on failure. Note that a
* successful module load does not mean the module did not then unload
* and exit on an error of its own. Callers must check that the service
* they requested is now available, not blindly invoke it.
*
* If module auto-loading support is disabled then this function
* becomes a no-operation.
*/
int request_module(const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
unsigned int max_modprobes;
int ret;
char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
static char *envp[] = { "HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL };
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
if (ret >= MODULE_NAME_LEN)
return -ENAMETOOLONG;
/* If modprobe needs a service that is in a module, we get a recursive
* loop. Limit the number of running kmod threads to max_threads/2 or
* MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
* would be to run the parents of this process, counting how many times
* kmod was invoked. That would mean accessing the internals of the
* process tables to get the command line, proc_pid_cmdline is static
* and it is not worth changing the proc code just to handle this case.
* KAO.
*
* "trace the ppid" is simple, but will fail if someone's
* parent exits. I think this is as good as it gets. --RR
*/
max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
atomic_inc(&kmod_concurrent);
if (atomic_read(&kmod_concurrent) > max_modprobes) {
/* We may be blaming an innocent here, but unlikely */
if (kmod_loop_msg++ < 5)
printk(KERN_ERR
"request_module: runaway loop modprobe %s\n",
module_name);
atomic_dec(&kmod_concurrent);
return -ENOMEM;
}
ret = call_usermodehelper(modprobe_path, argv, envp, 1);
atomic_dec(&kmod_concurrent);
return ret;
}
EXPORT_SYMBOL(request_module);
#endif /* CONFIG_KMOD */
struct subprocess_info {
struct completion *complete;
char *path;
char **argv;
char **envp;
int wait;
int retval;
};
/*
* This is the task which runs the usermode application
*/
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
int retval;
/* Unblock all signals. */
flush_signals(current);
spin_lock_irq(&current->sighand->siglock);
flush_signal_handlers(current, 1);
sigemptyset(&current->blocked);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed(current, CPU_MASK_ALL);
retval = -EPERM;
if (current->fs->root)
retval = execve(sub_info->path, sub_info->argv,sub_info->envp);
/* Exec failed? */
sub_info->retval = retval;
do_exit(0);
}
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
struct k_sigaction sa;
/* Install a handler: if SIGCLD isn't handled sys_wait4 won't
* populate the status, but will return -ECHILD. */
sa.sa.sa_handler = SIG_IGN;
sa.sa.sa_flags = 0;
siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
allow_signal(SIGCHLD);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
} else {
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
* But wait_for_helper() always runs as keventd, and put_user()
* to a kernel address works OK for kernel threads, due to their
* having an mm_segment_t which spans the entire address space.
*
* Thus the __user pointer cast is valid here.
*/
sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
}
complete(sub_info->complete);
return 0;
}
/* This is run by khelper thread */
static void __call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
/* CLONE_VFORK: wait until the usermode helper has execve'd
* successfully. We need the data structures to stay around
* until that is done. */
if (sub_info->wait)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
else
pid = kernel_thread(____call_usermodehelper, sub_info,
CLONE_VFORK | SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
complete(sub_info->complete);
} else if (!sub_info->wait)
complete(sub_info->complete);
}
/**
* call_usermodehelper - start a usermode application
* @path: pathname for the application
* @argv: null-terminated argument list
* @envp: null-terminated environment list
* @wait: wait for the application to finish and return status.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of keventd.
* (ie. it runs with full root capabilities).
*
* Must be called from process context. Returns 0 on success, or a
* negative error code if the program could not be exec'd.
*/
int call_usermodehelper(char *path, char **argv, char **envp, int wait)
{
DECLARE_COMPLETION(done);
struct subprocess_info sub_info = {
.complete = &done,
.path = path,
.argv = argv,
.envp = envp,
.wait = wait,
.retval = 0,
};
DECLARE_WORK(work, __call_usermodehelper, &sub_info);
if (!khelper_wq)
return -EBUSY;
if (path[0] == '\0')
return 0;
queue_work(khelper_wq, &work);
wait_for_completion(&done);
return sub_info.retval;
}
EXPORT_SYMBOL(call_usermodehelper);
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
BUG_ON(!khelper_wq);
}
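
Usage sketch (illustrative, not part of the original commit) of call_usermodehelper() as documented above; the helper path and arguments are assumptions. Must be called from process context.

#include <linux/kmod.h>

static int run_helper(void)
{
	char *argv[] = { "/sbin/my-helper", "--event", "demo", NULL };
	static char *envp[] = {
		"HOME=/",
		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
		NULL
	};

	/* wait=1: block until the helper exits and return its status */
	return call_usermodehelper(argv[0], argv, envp, 1);
}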

157
kernel/kprobes.c Normal file
View File

@@ -0,0 +1,157 @@
/*
* Kernel Probes (KProbes)
* kernel/kprobes.c
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2002, 2004
*
* 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
* Probes initial implementation (includes suggestions from
* Rusty Russell).
* 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
* hlists and exceptions notifier as suggested by Andi Kleen.
* 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
* interface to access function arguments.
* 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
* exceptions notifier to be first on the priority list.
*/
#include <linux/kprobes.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <asm/kdebug.h>
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
unsigned int kprobe_cpu = NR_CPUS;
static DEFINE_SPINLOCK(kprobe_lock);
/* Locks kprobe: irqs must be disabled */
void lock_kprobes(void)
{
spin_lock(&kprobe_lock);
kprobe_cpu = smp_processor_id();
}
void unlock_kprobes(void)
{
kprobe_cpu = NR_CPUS;
spin_unlock(&kprobe_lock);
}
/* You have to be holding the kprobe_lock */
struct kprobe *get_kprobe(void *addr)
{
struct hlist_head *head;
struct hlist_node *node;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
hlist_for_each(node, head) {
struct kprobe *p = hlist_entry(node, struct kprobe, hlist);
if (p->addr == addr)
return p;
}
return NULL;
}
int register_kprobe(struct kprobe *p)
{
int ret = 0;
unsigned long flags = 0;
if ((ret = arch_prepare_kprobe(p)) != 0) {
goto rm_kprobe;
}
spin_lock_irqsave(&kprobe_lock, flags);
INIT_HLIST_NODE(&p->hlist);
if (get_kprobe(p->addr)) {
ret = -EEXIST;
goto out;
}
arch_copy_kprobe(p);
hlist_add_head(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
p->opcode = *p->addr;
*p->addr = BREAKPOINT_INSTRUCTION;
flush_icache_range((unsigned long) p->addr,
(unsigned long) p->addr + sizeof(kprobe_opcode_t));
out:
spin_unlock_irqrestore(&kprobe_lock, flags);
rm_kprobe:
if (ret == -EEXIST)
arch_remove_kprobe(p);
return ret;
}
void unregister_kprobe(struct kprobe *p)
{
unsigned long flags;
arch_remove_kprobe(p);
spin_lock_irqsave(&kprobe_lock, flags);
*p->addr = p->opcode;
hlist_del(&p->hlist);
flush_icache_range((unsigned long) p->addr,
(unsigned long) p->addr + sizeof(kprobe_opcode_t));
spin_unlock_irqrestore(&kprobe_lock, flags);
}
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
.priority = 0x7fffffff /* we need to be notified first */
};
int register_jprobe(struct jprobe *jp)
{
/* Todo: Verify probepoint is a function entry point */
jp->kp.pre_handler = setjmp_pre_handler;
jp->kp.break_handler = longjmp_break_handler;
return register_kprobe(&jp->kp);
}
void unregister_jprobe(struct jprobe *jp)
{
unregister_kprobe(&jp->kp);
}
static int __init init_kprobes(void)
{
int i, err = 0;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
for (i = 0; i < KPROBE_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&kprobe_table[i]);
err = register_die_notifier(&kprobe_exceptions_nb);
return err;
}
__initcall(init_kprobes);
EXPORT_SYMBOL_GPL(register_kprobe);
EXPORT_SYMBOL_GPL(unregister_kprobe);
EXPORT_SYMBOL_GPL(register_jprobe);
EXPORT_SYMBOL_GPL(unregister_jprobe);
EXPORT_SYMBOL_GPL(jprobe_return);
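
Usage sketch (illustrative, not part of the original commit) of register_kprobe(); do_fork is an arbitrary probe point, resolved here via kallsyms_lookup_name() from kernel/kallsyms.c above.

#include <linux/kprobes.h>
#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/errno.h>

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;			/* continue with the probed insn */
}

static struct kprobe my_kp = {
	.pre_handler = my_pre,
};

static int my_probe_init(void)
{
	my_kp.addr = (kprobe_opcode_t *)kallsyms_lookup_name("do_fork");
	if (!my_kp.addr)
		return -ENOENT;
	return register_kprobe(&my_kp);
}

static void my_probe_exit(void)
{
	unregister_kprobe(&my_kp);
}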

57
kernel/ksysfs.c Normal file
View File

@@ -0,0 +1,57 @@
/*
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
* are not related to any other subsystem
*
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*
* This file is released under the GPLv2
*
*/
#include <linux/config.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/module.h>
#include <linux/init.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
static struct subsys_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
#ifdef CONFIG_HOTPLUG
static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
{
return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum);
}
KERNEL_ATTR_RO(hotplug_seqnum);
#endif
decl_subsys(kernel, NULL, NULL);
EXPORT_SYMBOL_GPL(kernel_subsys);
static struct attribute * kernel_attrs[] = {
#ifdef CONFIG_HOTPLUG
&hotplug_seqnum_attr.attr,
#endif
NULL
};
static struct attribute_group kernel_attr_group = {
.attrs = kernel_attrs,
};
static int __init ksysfs_init(void)
{
int error = subsystem_register(&kernel_subsys);
if (!error)
error = sysfs_create_group(&kernel_subsys.kset.kobj,
&kernel_attr_group);
return error;
}
core_initcall(ksysfs_init);
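
Sketch (illustrative, not part of the original commit) of what adding another attribute to this file would look like with the KERNEL_ATTR_RO() helper above; the "uptime_hint" name and value are made up.

#include <linux/jiffies.h>

static ssize_t uptime_hint_show(struct subsystem *subsys, char *page)
{
	return sprintf(page, "%lu\n", jiffies / HZ);	/* rough seconds up */
}
KERNEL_ATTR_RO(uptime_hint);

/* ...and &uptime_hint_attr.attr would be added to kernel_attrs[] so that
 * sysfs_create_group() exports it as /sys/kernel/uptime_hint. */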

202
kernel/kthread.c Normal file
View File

@@ -0,0 +1,202 @@
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
* Creation is done via keventd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/module.h>
#include <asm/semaphore.h>
/*
* We don't want to execute off keventd since it might
* hold a semaphore our callers hold too:
*/
static struct workqueue_struct *helper_wq;
struct kthread_create_info
{
/* Information passed to kthread() from keventd. */
int (*threadfn)(void *data);
void *data;
struct completion started;
/* Result passed back to kthread_create() from keventd. */
struct task_struct *result;
struct completion done;
};
struct kthread_stop_info
{
struct task_struct *k;
int err;
struct completion done;
};
/* Thread stopping is done by setting this var; the lock serializes
* multiple kthread_stop calls. */
static DECLARE_MUTEX(kthread_stop_lock);
static struct kthread_stop_info kthread_stop_info;
int kthread_should_stop(void)
{
return (kthread_stop_info.k == current);
}
EXPORT_SYMBOL(kthread_should_stop);
static void kthread_exit_files(void)
{
struct fs_struct *fs;
struct task_struct *tsk = current;
exit_fs(tsk); /* current->fs->count--; */
fs = init_task.fs;
tsk->fs = fs;
atomic_inc(&fs->count);
exit_files(tsk);
current->files = init_task.files;
atomic_inc(&tsk->files->count);
}
static int kthread(void *_create)
{
struct kthread_create_info *create = _create;
int (*threadfn)(void *data);
void *data;
sigset_t blocked;
int ret = -EINTR;
kthread_exit_files();
/* Copy data: it's on keventd's stack */
threadfn = create->threadfn;
data = create->data;
/* Block and flush all signals (in case we're not from keventd). */
sigfillset(&blocked);
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
/* By default we can run anywhere, unlike keventd. */
set_cpus_allowed(current, CPU_MASK_ALL);
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_INTERRUPTIBLE);
complete(&create->started);
schedule();
if (!kthread_should_stop())
ret = threadfn(data);
/* It might have exited on its own, w/o kthread_stop. Check. */
if (kthread_should_stop()) {
kthread_stop_info.err = ret;
complete(&kthread_stop_info.done);
}
return 0;
}
/* We are keventd: create a thread. */
static void keventd_create_kthread(void *_create)
{
struct kthread_create_info *create = _create;
int pid;
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
create->result = ERR_PTR(pid);
} else {
wait_for_completion(&create->started);
create->result = find_task_by_pid(pid);
}
complete(&create->done);
}
struct task_struct *kthread_create(int (*threadfn)(void *data),
void *data,
const char namefmt[],
...)
{
struct kthread_create_info create;
DECLARE_WORK(work, keventd_create_kthread, &create);
create.threadfn = threadfn;
create.data = data;
init_completion(&create.started);
init_completion(&create.done);
/*
* The workqueue needs to start up first:
*/
if (!helper_wq)
work.func(work.data);
else {
queue_work(helper_wq, &work);
wait_for_completion(&create.done);
}
if (!IS_ERR(create.result)) {
va_list args;
va_start(args, namefmt);
vsnprintf(create.result->comm, sizeof(create.result->comm),
namefmt, args);
va_end(args);
}
return create.result;
}
EXPORT_SYMBOL(kthread_create);
void kthread_bind(struct task_struct *k, unsigned int cpu)
{
BUG_ON(k->state != TASK_INTERRUPTIBLE);
/* Must have done schedule() in kthread() before we set_task_cpu */
wait_task_inactive(k);
set_task_cpu(k, cpu);
k->cpus_allowed = cpumask_of_cpu(cpu);
}
EXPORT_SYMBOL(kthread_bind);
int kthread_stop(struct task_struct *k)
{
int ret;
down(&kthread_stop_lock);
/* It could exit after stop_info.k set, but before wake_up_process. */
get_task_struct(k);
/* Must init completion *before* thread sees kthread_stop_info.k */
init_completion(&kthread_stop_info.done);
wmb();
/* Now set kthread_should_stop() to true, and wake it up. */
kthread_stop_info.k = k;
wake_up_process(k);
put_task_struct(k);
/* Once it dies, reset stop ptr, gather result and we're done. */
wait_for_completion(&kthread_stop_info.done);
kthread_stop_info.k = NULL;
ret = kthread_stop_info.err;
up(&kthread_stop_lock);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
static __init int helper_init(void)
{
helper_wq = create_singlethread_workqueue("kthread");
BUG_ON(!helper_wq);
return 0;
}
core_initcall(helper_init);
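
Usage sketch (illustrative, not part of the original commit) of the create/bind/stop lifecycle implemented above; the thread body and its pacing are assumptions.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *my_task;

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* ... do one unit of work ... */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);	/* kthread_stop() wakes us early */
	}
	return 0;			/* becomes kthread_stop()'s result */
}

static int my_start(void)
{
	my_task = kthread_create(my_thread_fn, NULL, "mythread/%d", 0);
	if (IS_ERR(my_task))
		return PTR_ERR(my_task);
	kthread_bind(my_task, 0);	/* optional: pin to CPU 0 */
	wake_up_process(my_task);	/* kthread() parks until woken */
	return 0;
}

static void my_stop(void)
{
	kthread_stop(my_task);
}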

2108
kernel/module.c Normal file

File diff suppressed because it is too large

157
kernel/panic.c Normal file
View File

@@ -0,0 +1,157 @@
/*
* linux/kernel/panic.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* This function is used throughout the kernel (including mm and fs)
* to indicate a major problem.
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/interrupt.h>
#include <linux/nmi.h>
int panic_timeout;
int panic_on_oops;
int tainted;
EXPORT_SYMBOL(panic_timeout);
struct notifier_block *panic_notifier_list;
EXPORT_SYMBOL(panic_notifier_list);
static int __init panic_setup(char *str)
{
panic_timeout = simple_strtoul(str, NULL, 0);
return 1;
}
__setup("panic=", panic_setup);
static long no_blink(long time)
{
return 0;
}
/* Returns how long it waited in ms */
long (*panic_blink)(long time);
EXPORT_SYMBOL(panic_blink);
/**
* panic - halt the system
* @fmt: The text string to print
*
* Display a message, then perform cleanups.
*
* This function never returns.
*/
NORET_TYPE void panic(const char * fmt, ...)
{
long i;
static char buf[1024];
va_list args;
#if defined(CONFIG_ARCH_S390)
unsigned long caller = (unsigned long) __builtin_return_address(0);
#endif
bust_spinlocks(1);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
bust_spinlocks(0);
#ifdef CONFIG_SMP
smp_send_stop();
#endif
notifier_call_chain(&panic_notifier_list, 0, buf);
if (!panic_blink)
panic_blink = no_blink;
if (panic_timeout > 0)
{
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked..
*/
printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
for (i = 0; i < panic_timeout*1000; ) {
touch_nmi_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
/*
* Should we run the reboot notifiers? For the moment I'm
* choosing not to. They might crash, be corrupt or do
* more harm than good for other reasons.
*/
machine_restart(NULL);
}
#ifdef __sparc__
{
extern int stop_a_enabled;
/* Make sure the user can actually press L1-A */
stop_a_enabled = 1;
printk(KERN_EMERG "Press L1-A to return to the boot prom\n");
}
#endif
#if defined(CONFIG_ARCH_S390)
disabled_wait(caller);
#endif
local_irq_enable();
for (i = 0;;) {
i += panic_blink(i);
mdelay(1);
i++;
}
}
EXPORT_SYMBOL(panic);
/**
* print_tainted - return a string to represent the kernel taint state.
*
* 'P' - Proprietary module has been loaded.
* 'F' - Module has been forcibly loaded.
* 'S' - SMP with CPUs not designed for SMP.
* 'R' - User forced a module unload.
* 'M' - Machine had a machine check experience.
* 'B' - System has hit bad_page.
*
* The string is overwritten by the next call to print_tainted().
*/
const char *print_tainted(void)
{
static char buf[20];
if (tainted) {
snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c",
tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
tainted & TAINT_BAD_PAGE ? 'B' : ' ');
}
else
snprintf(buf, sizeof(buf), "Not tainted");
return(buf);
}
void add_taint(unsigned flag)
{
tainted |= flag;
}
EXPORT_SYMBOL(add_taint);
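
Usage sketch (illustrative, not part of the original commit) of hooking the panic_notifier_list that panic() walks above; the callback is hypothetical, and notifier_chain_register() is the registration call of this era (kernel/sys.c).

#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int my_panic_event(struct notifier_block *nb, unsigned long event,
			  void *buf)
{
	/* buf is the formatted panic message; keep this minimal,
	 * the machine is going down */
	printk(KERN_EMERG "my_panic_event: %s\n", (char *)buf);
	return NOTIFY_DONE;
}

static struct notifier_block my_panic_nb = {
	.notifier_call = my_panic_event,
};

static int __init my_panic_init(void)
{
	notifier_chain_register(&panic_notifier_list, &my_panic_nb);
	return 0;
}
__initcall(my_panic_init);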

721
kernel/params.c Normal file
View File

@@ -0,0 +1,721 @@
/* Helpers for initial module or kernel cmdline parsing
Copyright (C) 2001 Rusty Russell.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/config.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/err.h>
#if 0
#define DEBUGP printk
#else
#define DEBUGP(fmt, a...)
#endif
static inline int dash2underscore(char c)
{
if (c == '-')
return '_';
return c;
}
static inline int parameq(const char *input, const char *paramname)
{
unsigned int i;
for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
if (input[i] == '\0')
return 1;
return 0;
}
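/*
* Illustrative example (not part of the original file; the names are
* made up): since a '-' in the input is treated as '_',
* parameq("foo-bar", "foo_bar") returns 1, while
* parameq("foo-bar", "foobar") returns 0.
*/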
static int parse_one(char *param,
char *val,
struct kernel_param *params,
unsigned num_params,
int (*handle_unknown)(char *param, char *val))
{
unsigned int i;
/* Find parameter */
for (i = 0; i < num_params; i++) {
if (parameq(param, params[i].name)) {
DEBUGP("They are equal! Calling %p\n",
params[i].set);
return params[i].set(val, &params[i]);
}
}
if (handle_unknown) {
DEBUGP("Unknown argument: calling %p\n", handle_unknown);
return handle_unknown(param, val);
}
DEBUGP("Unknown argument `%s'\n", param);
return -ENOENT;
}
/* You can use " around spaces, but can't escape ". */
/* Hyphens and underscores equivalent in parameter names. */
static char *next_arg(char *args, char **param, char **val)
{
unsigned int i, equals = 0;
int in_quote = 0, quoted = 0;
char *next;
/* Chew any extra spaces */
while (*args == ' ') args++;
if (*args == '"') {
args++;
in_quote = 1;
quoted = 1;
}
for (i = 0; args[i]; i++) {
if (args[i] == ' ' && !in_quote)
break;
if (equals == 0) {
if (args[i] == '=')
equals = i;
}
if (args[i] == '"')
in_quote = !in_quote;
}
*param = args;
if (!equals)
*val = NULL;
else {
args[equals] = '\0';
*val = args + equals + 1;
/* Don't include quotes in value. */
if (**val == '"') {
(*val)++;
if (args[i-1] == '"')
args[i-1] = '\0';
}
if (quoted && args[i-1] == '"')
args[i-1] = '\0';
}
if (args[i]) {
args[i] = '\0';
next = args + i + 1;
} else
next = args + i;
return next;
}
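/*
* Illustrative walk-through (not part of the original file): given
* args == "foo=\"a b\" baz", the first call sets *param to "foo" and
* *val to "a b" (the quotes are stripped in place) and returns a
* pointer to "baz"; the second call sets *param to "baz" and *val
* to NULL, since "baz" carries no equals sign.
*/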
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
int parse_args(const char *name,
char *args,
struct kernel_param *params,
unsigned num,
int (*unknown)(char *param, char *val))
{
char *param, *val;
DEBUGP("Parsing ARGS: %s\n", args);
while (*args) {
int ret;
args = next_arg(args, &param, &val);
ret = parse_one(param, val, params, num, unknown);
switch (ret) {
case -ENOENT:
printk(KERN_ERR "%s: Unknown parameter `%s'\n",
name, param);
return ret;
case -ENOSPC:
printk(KERN_ERR
"%s: `%s' too large for parameter `%s'\n",
name, val ?: "", param);
return ret;
case 0:
break;
default:
printk(KERN_ERR
"%s: `%s' invalid for parameter `%s'\n",
name, val ?: "", param);
return ret;
}
}
/* All parsed OK. */
return 0;
}
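/*
* Illustrative caller sketch (hypothetical, not part of the original
* file); "example" and "debug" are made-up names. A parameter table
* is wired up roughly like this:
*
* static int debug;
* static struct kernel_param example_params[] = {
*  { .name = "debug", .perm = 0644,
*    .set = param_set_int, .get = param_get_int, .arg = &debug },
* };
*
* parse_args("example", cmdline, example_params,
*     ARRAY_SIZE(example_params), NULL);
*
* With cmdline == "debug=1", param_set_int() stores 1 through &debug.
*/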
/* Lazy bastard, eh? */
#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
int param_set_##name(const char *val, struct kernel_param *kp) \
{ \
char *endp; \
tmptype l; \
\
if (!val) return -EINVAL; \
l = strtolfn(val, &endp, 0); \
if (endp == val || ((type)l != l)) \
return -EINVAL; \
*((type *)kp->arg) = l; \
return 0; \
} \
int param_get_##name(char *buffer, struct kernel_param *kp) \
{ \
return sprintf(buffer, format, *((type *)kp->arg)); \
}
STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul);
STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol);
STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul);
STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol);
STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul);
STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul);
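/*
* Illustrative note (not part of the original file): each
* STANDARD_PARAM_DEF() line above expands into a param_set_TYPE() /
* param_get_TYPE() pair; param_set_int(), for instance, parses the
* string with simple_strtol(), rejects out-of-range values via the
* ((type)l != l) check, and stores the result through kp->arg. These
* are the handlers that module_param() wires into a parameter table.
*/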
int param_set_charp(const char *val, struct kernel_param *kp)
{
if (!val) {
printk(KERN_ERR "%s: string parameter expected\n",
kp->name);
return -EINVAL;
}
if (strlen(val) > 1024) {
printk(KERN_ERR "%s: string parameter too long\n",
kp->name);
return -ENOSPC;
}
*(char **)kp->arg = (char *)val;
return 0;
}
int param_get_charp(char *buffer, struct kernel_param *kp)
{
return sprintf(buffer, "%s", *((char **)kp->arg));
}
int param_set_bool(const char *val, struct kernel_param *kp)
{
/* No equals means "set"... */
if (!val) val = "1";
/* One of =[yYnN01] */
switch (val[0]) {
case 'y': case 'Y': case '1':
*(int *)kp->arg = 1;
return 0;
case 'n': case 'N': case '0':
*(int *)kp->arg = 0;
return 0;
}
return -EINVAL;
}
int param_get_bool(char *buffer, struct kernel_param *kp)
{
/* Y and N chosen as being relatively non-coder friendly */
return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
}
int param_set_invbool(const char *val, struct kernel_param *kp)
{
int boolval, ret;
struct kernel_param dummy = { .arg = &boolval };
ret = param_set_bool(val, &dummy);
if (ret == 0)
*(int *)kp->arg = !boolval;
return ret;
}
int param_get_invbool(char *buffer, struct kernel_param *kp)
{
int val;
struct kernel_param dummy = { .arg = &val };
val = !*(int *)kp->arg;
return param_get_bool(buffer, &dummy);
}
/* We cheat here and temporarily mangle the string. */
int param_array(const char *name,
const char *val,
unsigned int min, unsigned int max,
void *elem, int elemsize,
int (*set)(const char *, struct kernel_param *kp),
int *num)
{
int ret;
struct kernel_param kp;
char save;
/* Get the name right for errors. */
kp.name = name;
kp.arg = elem;
/* No equals sign? */
if (!val) {
printk(KERN_ERR "%s: expects arguments\n", name);
return -EINVAL;
}
*num = 0;
/* We expect a comma-separated list of values. */
do {
int len;
if (*num == max) {
printk(KERN_ERR "%s: can only take %i arguments\n",
name, max);
return -EINVAL;
}
len = strcspn(val, ",");
/* nul-terminate and parse */
save = val[len];
((char *)val)[len] = '\0';
ret = set(val, &kp);
if (ret != 0)
return ret;
kp.arg += elemsize;
val += len+1;
(*num)++;
} while (save == ',');
if (*num < min) {
printk(KERN_ERR "%s: needs at least %i arguments\n",
name, min);
return -EINVAL;
}
return 0;
}
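/*
* Illustrative example (not part of the original file): for a value
* string "1,2,3" the loop above calls set() three times, once per
* comma-separated token; each token is nul-terminated in place, the
* overwritten character is kept in 'save' to decide whether another
* token follows, and kp.arg advances by elemsize between calls.
*/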
int param_array_set(const char *val, struct kernel_param *kp)
{
struct kparam_array *arr = kp->arg;
return param_array(kp->name, val, 1, arr->max, arr->elem,
arr->elemsize, arr->set, arr->num ?: &arr->max);
}
int param_array_get(char *buffer, struct kernel_param *kp)
{
int i, off, ret;
struct kparam_array *arr = kp->arg;
struct kernel_param p;
p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
if (i)
buffer[off++] = ',';
p.arg = arr->elem + arr->elemsize * i;
ret = arr->get(buffer + off, &p);
if (ret < 0)
return ret;
off += ret;
}
buffer[off] = '\0';
return off;
}
int param_set_copystring(const char *val, struct kernel_param *kp)
{
struct kparam_string *kps = kp->arg;
if (strlen(val)+1 > kps->maxlen) {
printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
strcpy(kps->string, val);
return 0;
}
int param_get_string(char *buffer, struct kernel_param *kp)
{
struct kparam_string *kps = kp->arg;
return strlcpy(buffer, kps->string, kps->maxlen);
}
/* sysfs output in /sys/module/XYZ/parameters/ */
extern struct kernel_param __start___param[], __stop___param[];
#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
struct param_attribute
{
struct module_attribute mattr;
struct kernel_param *param;
};
struct module_param_attrs
{
struct attribute_group grp;
struct param_attribute attrs[0];
};
#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
static ssize_t param_attr_show(struct module_attribute *mattr,
struct module *mod, char *buf)
{
int count;
struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->get)
return -EPERM;
count = attribute->param->get(buf, attribute->param);
if (count > 0) {
strcat(buf, "\n");
++count;
}
return count;
}
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
static ssize_t param_attr_store(struct module_attribute *mattr,
struct module *owner,
const char *buf, size_t len)
{
int err;
struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->set)
return -EPERM;
err = attribute->param->set(buf, attribute->param);
if (!err)
return len;
return err;
}
#ifdef CONFIG_MODULES
#define __modinit
#else
#define __modinit __init
#endif
/*
* param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
* @mk: struct module_kobject (contains parent kobject)
* @kparam: array of struct kernel_param, the actual parameter definitions
* @num_params: number of entries in array
* @name_skip: offset where the parameter name starts in kparam[].name. Needed for built-in "modules"
*
* Create a kobject for a (per-module) group of parameters, and create files
* in sysfs. A pointer to the module_param_attrs is returned on success,
* NULL if there are no parameters to export, or ERR_PTR(err) on failure.
*/
static __modinit struct module_param_attrs *
param_sysfs_setup(struct module_kobject *mk,
struct kernel_param *kparam,
unsigned int num_params,
unsigned int name_skip)
{
struct module_param_attrs *mp;
unsigned int valid_attrs = 0;
unsigned int i, size[2];
struct param_attribute *pattr;
struct attribute **gattr;
int err;
for (i=0; i<num_params; i++) {
if (kparam[i].perm)
valid_attrs++;
}
if (!valid_attrs)
return NULL;
size[0] = ALIGN(sizeof(*mp) +
valid_attrs * sizeof(mp->attrs[0]),
sizeof(mp->grp.attrs[0]));
size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
mp = kmalloc(size[0] + size[1], GFP_KERNEL);
if (!mp)
return ERR_PTR(-ENOMEM);
mp->grp.name = "parameters";
mp->grp.attrs = (void *)mp + size[0];
pattr = &mp->attrs[0];
gattr = &mp->grp.attrs[0];
for (i = 0; i < num_params; i++) {
struct kernel_param *kp = &kparam[i];
if (kp->perm) {
pattr->param = kp;
pattr->mattr.show = param_attr_show;
pattr->mattr.store = param_attr_store;
pattr->mattr.attr.name = (char *)&kp->name[name_skip];
pattr->mattr.attr.owner = mk->mod;
pattr->mattr.attr.mode = kp->perm;
*(gattr++) = &(pattr++)->mattr.attr;
}
}
*gattr = NULL;
if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
kfree(mp);
return ERR_PTR(err);
}
return mp;
}
#ifdef CONFIG_MODULES
/*
* module_param_sysfs_setup - setup sysfs support for one module
* @mod: module
* @kparam: module parameters (array)
* @num_params: number of module parameters
*
* Adds sysfs entries for module parameters under
* /sys/module/[mod->name]/parameters/.
*/
int module_param_sysfs_setup(struct module *mod,
struct kernel_param *kparam,
unsigned int num_params)
{
struct module_param_attrs *mp;
mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
if (IS_ERR(mp))
return PTR_ERR(mp);
mod->param_attrs = mp;
return 0;
}
/*
* module_param_sysfs_remove - remove sysfs support for one module
* @mod: module
*
* Remove sysfs entries for module parameters and the corresponding
* kobject.
*/
void module_param_sysfs_remove(struct module *mod)
{
if (mod->param_attrs) {
sysfs_remove_group(&mod->mkobj.kobj,
&mod->param_attrs->grp);
/* We are positive that no one is using any param
* attrs at this point. Deallocate immediately. */
kfree(mod->param_attrs);
mod->param_attrs = NULL;
}
}
#endif
/*
* kernel_param_sysfs_setup - wrapper for built-in params support
*/
static void __init kernel_param_sysfs_setup(const char *name,
struct kernel_param *kparam,
unsigned int num_params,
unsigned int name_skip)
{
struct module_kobject *mk;
mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
if (!mk)
return;
memset(mk, 0, sizeof(struct module_kobject));
mk->mod = THIS_MODULE;
kobj_set_kset_s(mk, module_subsys);
kobject_set_name(&mk->kobj, name);
kobject_register(&mk->kobj);
/* no need to keep the kobject if no parameter is exported */
if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
kobject_unregister(&mk->kobj);
kfree(mk);
}
}
/*
* param_sysfs_builtin - add contents in /sys/module for built-in modules
*
* Add module_parameters to sysfs for "modules" built into the kernel.
*
* The "module" name (KBUILD_MODNAME) is stored before a dot, the
* "parameter" name is stored behind a dot in kernel_param->name. So,
* extract the "module" name for all built-in kernel_param-eters,
* and for all who have the same, call kernel_param_sysfs_setup.
*/
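/*
* Illustrative example (not part of the original file; the names are
* made up): a built-in parameter named "usbcore.blinkenlights" is
* split at the dot into the "module" name "usbcore" and the parameter
* name "blinkenlights", and is exported as
* /sys/module/usbcore/parameters/blinkenlights.
*/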
static void __init param_sysfs_builtin(void)
{
struct kernel_param *kp, *kp_begin = NULL;
unsigned int i, name_len, count = 0;
char modname[MAX_KBUILD_MODNAME + 1] = "";
for (i=0; i < __stop___param - __start___param; i++) {
char *dot;
kp = &__start___param[i];
/* We do not handle args without periods. */
dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME);
if (!dot) {
DEBUGP("couldn't find period in %s\n", kp->name);
continue;
}
name_len = dot - kp->name;
/* new kbuild_modname? */
if (strlen(modname) != name_len
|| strncmp(modname, kp->name, name_len) != 0) {
/* add a new kobject for previous kernel_params. */
if (count)
kernel_param_sysfs_setup(modname,
kp_begin,
count,
strlen(modname)+1);
strncpy(modname, kp->name, name_len);
modname[name_len] = '\0';
count = 0;
kp_begin = kp;
}
count++;
}
/* last kernel_params need to be registered as well */
if (count)
kernel_param_sysfs_setup(modname, kp_begin, count,
strlen(modname)+1);
}
/* module-related sysfs stuff */
#ifdef CONFIG_MODULES
#define to_module_attr(n) container_of(n, struct module_attribute, attr)
#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
static ssize_t module_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
attribute = to_module_attr(attr);
mk = to_module_kobject(kobj);
if (!attribute->show)
return -EPERM;
if (!try_module_get(mk->mod))
return -ENODEV;
ret = attribute->show(attribute, mk->mod, buf);
module_put(mk->mod);
return ret;
}
static ssize_t module_attr_store(struct kobject *kobj,
struct attribute *attr,
const char *buf, size_t len)
{
struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
attribute = to_module_attr(attr);
mk = to_module_kobject(kobj);
if (!attribute->store)
return -EPERM;
if (!try_module_get(mk->mod))
return -ENODEV;
ret = attribute->store(attribute, mk->mod, buf, len);
module_put(mk->mod);
return ret;
}
static struct sysfs_ops module_sysfs_ops = {
.show = module_attr_show,
.store = module_attr_store,
};
#else
static struct sysfs_ops module_sysfs_ops = {
.show = NULL,
.store = NULL,
};
#endif
static struct kobj_type module_ktype = {
.sysfs_ops = &module_sysfs_ops,
};
decl_subsys(module, &module_ktype, NULL);
/*
* param_sysfs_init - wrapper for built-in params support
*/
static int __init param_sysfs_init(void)
{
subsystem_register(&module_subsys);
param_sysfs_builtin();
return 0;
}
__initcall(param_sysfs_init);
EXPORT_SYMBOL(param_set_byte);
EXPORT_SYMBOL(param_get_byte);
EXPORT_SYMBOL(param_set_short);
EXPORT_SYMBOL(param_get_short);
EXPORT_SYMBOL(param_set_ushort);
EXPORT_SYMBOL(param_get_ushort);
EXPORT_SYMBOL(param_set_int);
EXPORT_SYMBOL(param_get_int);
EXPORT_SYMBOL(param_set_uint);
EXPORT_SYMBOL(param_get_uint);
EXPORT_SYMBOL(param_set_long);
EXPORT_SYMBOL(param_get_long);
EXPORT_SYMBOL(param_set_ulong);
EXPORT_SYMBOL(param_get_ulong);
EXPORT_SYMBOL(param_set_charp);
EXPORT_SYMBOL(param_get_charp);
EXPORT_SYMBOL(param_set_bool);
EXPORT_SYMBOL(param_get_bool);
EXPORT_SYMBOL(param_set_invbool);
EXPORT_SYMBOL(param_get_invbool);
EXPORT_SYMBOL(param_array_set);
EXPORT_SYMBOL(param_array_get);
EXPORT_SYMBOL(param_set_copystring);
EXPORT_SYMBOL(param_get_string);

292
kernel/pid.c Normal file
View File

@@ -0,0 +1,292 @@
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
* (C) 2002-2003 William Irwin, IBM
* (C) 2004 William Irwin, Oracle
* (C) 2002-2004 Ingo Molnar, Red Hat
*
* pid-structures are backing objects for tasks sharing a given ID to chain
* against. There is very little to them aside from hashing them and
* parking tasks using given IDs on a list.
*
* The hash is always changed with the tasklist_lock write-acquired,
* and the hash is only accessed with the tasklist_lock at least
* read-acquired, so there's no additional SMP locking needed here.
*
* We have a list of bitmap pages, whose bitmaps represent the PID space.
* Allocating and freeing PIDs is completely lockless. In the worst-case
* allocation scenario, when all but one of the 1 million possible PIDs
* are already allocated, we scan 32 list entries and at most PAGE_SIZE
* bytes. The typical fastpath is a single successful set_bit. Freeing is O(1).
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
static struct hlist_head *pid_hash[PIDTYPE_MAX];
static int pidhash_shift;
int pid_max = PID_MAX_DEFAULT;
int last_pid;
#define RESERVED_PIDS 300
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off))
#define find_next_offset(map, off) \
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
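/*
* Illustrative arithmetic (not part of the original file): with 4 KiB
* pages, BITS_PER_PAGE is 32768, so PID 40000 lives in pidmap_array[1]
* at bit offset 7232, and mk_pid(&pidmap_array[1], 7232) maps back
* to 40000.
*/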
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales up to 4 million PIDs at runtime.
*/
typedef struct pidmap {
atomic_t nr_free;
void *page;
} pidmap_t;
static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
{ [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
fastcall void free_pidmap(int pid)
{
pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
int offset = pid & BITS_PER_PAGE_MASK;
clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
}
int alloc_pidmap(void)
{
int i, offset, max_scan, pid, last = last_pid;
pidmap_t *map;
pid = last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
map = &pidmap_array[pid/BITS_PER_PAGE];
max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
if (unlikely(!map->page)) {
unsigned long page = get_zeroed_page(GFP_KERNEL);
/*
* Free the page if someone raced with us
* installing it:
*/
spin_lock(&pidmap_lock);
if (map->page)
free_page(page);
else
map->page = (void *)page;
spin_unlock(&pidmap_lock);
if (unlikely(!map->page))
break;
}
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
last_pid = pid;
return pid;
}
offset = find_next_offset(map, offset);
pid = mk_pid(map, offset);
/*
* find_next_offset() found a bit, the pid from it
* is in-bounds, and if we fell back to the last
* bitmap block and the final block was the same
* as the starting point, pid is before last_pid.
*/
} while (offset < BITS_PER_PAGE && pid < pid_max &&
(i != max_scan || pid < last ||
!((last+1) & BITS_PER_PAGE_MASK)));
}
if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
++map;
offset = 0;
} else {
map = &pidmap_array[0];
offset = RESERVED_PIDS;
if (unlikely(last == offset))
break;
}
pid = mk_pid(map, offset);
}
return -1;
}
struct pid * fastcall find_pid(enum pid_type type, int nr)
{
struct hlist_node *elem;
struct pid *pid;
hlist_for_each_entry(pid, elem,
&pid_hash[type][pid_hashfn(nr)], pid_chain) {
if (pid->nr == nr)
return pid;
}
return NULL;
}
int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
{
struct pid *pid, *task_pid;
task_pid = &task->pids[type];
pid = find_pid(type, nr);
if (pid == NULL) {
hlist_add_head(&task_pid->pid_chain,
&pid_hash[type][pid_hashfn(nr)]);
INIT_LIST_HEAD(&task_pid->pid_list);
} else {
INIT_HLIST_NODE(&task_pid->pid_chain);
list_add_tail(&task_pid->pid_list, &pid->pid_list);
}
task_pid->nr = nr;
return 0;
}
static fastcall int __detach_pid(task_t *task, enum pid_type type)
{
struct pid *pid, *pid_next;
int nr = 0;
pid = &task->pids[type];
if (!hlist_unhashed(&pid->pid_chain)) {
hlist_del(&pid->pid_chain);
if (list_empty(&pid->pid_list))
nr = pid->nr;
else {
pid_next = list_entry(pid->pid_list.next,
struct pid, pid_list);
/* insert next pid from pid_list to hash */
hlist_add_head(&pid_next->pid_chain,
&pid_hash[type][pid_hashfn(pid_next->nr)]);
}
}
list_del(&pid->pid_list);
pid->nr = 0;
return nr;
}
void fastcall detach_pid(task_t *task, enum pid_type type)
{
int tmp, nr;
nr = __detach_pid(task, type);
if (!nr)
return;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (tmp != type && find_pid(tmp, nr))
return;
free_pidmap(nr);
}
task_t *find_task_by_pid_type(int type, int nr)
{
struct pid *pid;
pid = find_pid(type, nr);
if (!pid)
return NULL;
return pid_task(&pid->pid_list, type);
}
EXPORT_SYMBOL(find_task_by_pid_type);
/*
* This function switches the PIDs if a non-leader thread calls
* sys_execve() - this must be done without releasing the PID
* (which a detach_pid() would eventually do).
*/
void switch_exec_pids(task_t *leader, task_t *thread)
{
__detach_pid(leader, PIDTYPE_PID);
__detach_pid(leader, PIDTYPE_TGID);
__detach_pid(leader, PIDTYPE_PGID);
__detach_pid(leader, PIDTYPE_SID);
__detach_pid(thread, PIDTYPE_PID);
__detach_pid(thread, PIDTYPE_TGID);
leader->pid = leader->tgid = thread->pid;
thread->pid = thread->tgid;
attach_pid(thread, PIDTYPE_PID, thread->pid);
attach_pid(thread, PIDTYPE_TGID, thread->tgid);
attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
attach_pid(thread, PIDTYPE_SID, thread->signal->session);
list_add_tail(&thread->tasks, &init_task.tasks);
attach_pid(leader, PIDTYPE_PID, leader->pid);
attach_pid(leader, PIDTYPE_TGID, leader->tgid);
attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
attach_pid(leader, PIDTYPE_SID, leader->signal->session);
}
/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
* more.
*/
void __init pidhash_init(void)
{
int i, j, pidhash_size;
unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
pidhash_shift = max(4, fls(megabytes * 4));
pidhash_shift = min(12, pidhash_shift);
pidhash_size = 1 << pidhash_shift;
printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
pidhash_size, pidhash_shift,
PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head));
for (i = 0; i < PIDTYPE_MAX; i++) {
pid_hash[i] = alloc_bootmem(pidhash_size *
sizeof(*(pid_hash[i])));
if (!pid_hash[i])
panic("Could not alloc pidhash!\n");
for (j = 0; j < pidhash_size; j++)
INIT_HLIST_HEAD(&pid_hash[i][j]);
}
}
void __init pidmap_init(void)
{
int i;
pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
set_bit(0, pidmap_array->page);
atomic_dec(&pidmap_array->nr_free);
/*
* Allocate PID 0, and hash it via all PID types:
*/
for (i = 0; i < PIDTYPE_MAX; i++)
attach_pid(current, i, 0);
}

1559
kernel/posix-cpu-timers.c Normal file

File diff suppressed because it is too large

1584
kernel/posix-timers.c Normal file

File diff suppressed because it is too large

74
kernel/power/Kconfig Normal file
View File

@@ -0,0 +1,74 @@
config PM
bool "Power Management support"
---help---
"Power Management" means that parts of your computer are shut
off or put into a power conserving "sleep" mode if they are not
being used. There are two competing standards for doing this: APM
and ACPI. If you want to use either one, say Y here and then also
to the requisite support below.
Power Management is most important for battery powered laptop
computers; if you have a laptop, check out the Linux Laptop home
page on the WWW at <http://www.linux-on-laptops.com/> or
Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
Note that, even if you say N here, Linux on the x86 architecture
will issue the hlt instruction if nothing is to be done, thereby
sending the processor to sleep and saving power.
config PM_DEBUG
bool "Power Management Debug Support"
depends on PM
---help---
This option enables verbose debugging support in the Power Management
code. This is helpful when debugging and reporting various PM bugs,
like suspend support.
config SOFTWARE_SUSPEND
bool "Software Suspend (EXPERIMENTAL)"
depends on EXPERIMENTAL && PM && SWAP
---help---
Enable the possibility of suspending the machine.
It doesn't need APM.
You may suspend your machine by 'swsusp' or 'shutdown -z <time>'
(patch for sysvinit needed).
It creates an image which is saved in your active swap. Upon next
boot, pass the 'resume=/dev/swappartition' argument to the kernel to
have it detect the saved image, restore memory state from it, and
continue to run as before. If you do not want the previous state to
be reloaded, then use the 'noresume' kernel argument. However, note
that your partitions will be fsck'd and you must re-mkswap your swap
partitions. It does not work with swap files.
Right now you may boot without resuming and then later resume but
in meantime you cannot use those swap partitions/files which were
involved in suspending. Also in this case there is a risk that buffers
on disk won't match with saved ones.
For more information take a look at <file:Documentation/power/swsusp.txt>.
config PM_STD_PARTITION
string "Default resume partition"
depends on SOFTWARE_SUSPEND
default ""
---help---
The default resume partition is the partition that the suspend-
to-disk implementation will look in for a suspended disk image.
The partition specified here will be different for almost every user.
It should be a valid swap partition (at least for now) that is turned
on before suspending.
The partition specified can be overridden by specifying:
resume=/dev/<other device>
which will set the resume partition to the device specified.
Note that there is currently no way to specify which device to save the
suspended image to. It will simply pick the first available swap
device.

11
kernel/power/Makefile Normal file
View File

@@ -0,0 +1,11 @@
ifeq ($(CONFIG_PM_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif
swsusp-smp-$(CONFIG_SMP) += smp.o
obj-y := main.o process.o console.o pm.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o

58
kernel/power/console.c Normal file
View File

@@ -0,0 +1,58 @@
/*
* kernel/power/console.c - Functions for saving/restoring the console.
*
* Originally from swsusp.
*/
#include <linux/vt_kern.h>
#include <linux/kbd_kern.h>
#include <linux/console.h>
#include "power.h"
static int new_loglevel = 10;
static int orig_loglevel;
#ifdef SUSPEND_CONSOLE
static int orig_fgconsole, orig_kmsg;
#endif
int pm_prepare_console(void)
{
orig_loglevel = console_loglevel;
console_loglevel = new_loglevel;
#ifdef SUSPEND_CONSOLE
acquire_console_sem();
orig_fgconsole = fg_console;
if (vc_allocate(SUSPEND_CONSOLE)) {
/* we can't get a free VC right now. Too bad,
* we don't want to mess up the screen for now. */
release_console_sem();
return 1;
}
set_console(SUSPEND_CONSOLE);
release_console_sem();
if (vt_waitactive(SUSPEND_CONSOLE)) {
pr_debug("Suspend: Can't switch VCs.");
return 1;
}
orig_kmsg = kmsg_redirect;
kmsg_redirect = SUSPEND_CONSOLE;
#endif
return 0;
}
void pm_restore_console(void)
{
console_loglevel = orig_loglevel;
#ifdef SUSPEND_CONSOLE
acquire_console_sem();
set_console(orig_fgconsole);
release_console_sem();
kmsg_redirect = orig_kmsg;
#endif
return;
}

431
kernel/power/disk.c Normal file
View File

@@ -0,0 +1,431 @@
/*
* kernel/power/disk.c - Suspend-to-disk support.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
*
* This file is released under the GPLv2.
*
*/
#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/fs.h>
#include "power.h"
extern suspend_disk_method_t pm_disk_mode;
extern struct pm_ops * pm_ops;
extern int swsusp_suspend(void);
extern int swsusp_write(void);
extern int swsusp_check(void);
extern int swsusp_read(void);
extern void swsusp_close(void);
extern int swsusp_resume(void);
extern int swsusp_free(void);
static int noresume = 0;
char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
/**
* power_down - Shut machine down for hibernate.
* @mode: Suspend-to-disk mode
*
* Use the platform driver, if so configured, and return gracefully if it
* fails.
* Otherwise, try to power off or reboot. If that fails, halt the machine;
* there ain't no turning back.
*/
static void power_down(suspend_disk_method_t mode)
{
unsigned long flags;
int error = 0;
local_irq_save(flags);
switch(mode) {
case PM_DISK_PLATFORM:
device_shutdown();
error = pm_ops->enter(PM_SUSPEND_DISK);
break;
case PM_DISK_SHUTDOWN:
printk("Powering off system\n");
device_shutdown();
machine_power_off();
break;
case PM_DISK_REBOOT:
device_shutdown();
machine_restart(NULL);
break;
}
machine_halt();
/* Valid image is on the disk, if we continue we risk serious data corruption
after resume. */
printk(KERN_CRIT "Please power me down manually\n");
while(1);
}
static int in_suspend __nosavedata = 0;
/**
* free_some_memory - Try to free as much memory as possible
*
* ... but do not OOM-kill anyone
*
* Notice: all userland should be stopped at this point, or
* livelock is possible.
*/
static void free_some_memory(void)
{
unsigned int i = 0;
unsigned int tmp;
unsigned long pages = 0;
char *p = "-\\|/";
printk("Freeing memory... ");
while ((tmp = shrink_all_memory(10000))) {
pages += tmp;
printk("\b%c", p[i]);
i++;
if (i > 3)
i = 0;
}
printk("\bdone (%li pages freed)\n", pages);
}
static inline void platform_finish(void)
{
if (pm_disk_mode == PM_DISK_PLATFORM) {
if (pm_ops && pm_ops->finish)
pm_ops->finish(PM_SUSPEND_DISK);
}
}
static void finish(void)
{
device_resume();
platform_finish();
enable_nonboot_cpus();
thaw_processes();
pm_restore_console();
}
static int prepare_processes(void)
{
int error;
pm_prepare_console();
sys_sync();
if (freeze_processes()) {
error = -EBUSY;
return error;
}
if (pm_disk_mode == PM_DISK_PLATFORM) {
if (pm_ops && pm_ops->prepare) {
if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
return error;
}
}
/* Free memory before shutting down devices. */
free_some_memory();
return 0;
}
static void unprepare_processes(void)
{
enable_nonboot_cpus();
thaw_processes();
pm_restore_console();
}
static int prepare_devices(void)
{
int error;
disable_nonboot_cpus();
if ((error = device_suspend(PMSG_FREEZE))) {
printk("Some devices failed to suspend\n");
platform_finish();
enable_nonboot_cpus();
return error;
}
return 0;
}
/**
* pm_suspend_disk - The grandpappy of power management.
*
* If we're going through the firmware, then get it over with quickly.
*
* If not, then call swsusp to do its thing, then figure out how
* to power down the system.
*/
int pm_suspend_disk(void)
{
int error;
error = prepare_processes();
if (!error) {
error = prepare_devices();
}
if (error) {
unprepare_processes();
return error;
}
pr_debug("PM: Attempting to suspend to disk.\n");
if (pm_disk_mode == PM_DISK_FIRMWARE)
return pm_ops->enter(PM_SUSPEND_DISK);
pr_debug("PM: snapshotting memory.\n");
in_suspend = 1;
if ((error = swsusp_suspend()))
goto Done;
if (in_suspend) {
pr_debug("PM: writing image.\n");
error = swsusp_write();
if (!error)
power_down(pm_disk_mode);
} else
pr_debug("PM: Image restored successfully.\n");
swsusp_free();
Done:
finish();
return error;
}
/**
* software_resume - Resume from a saved image.
*
* Called as a late_initcall (so all devices are discovered and
* initialized), we call swsusp to see if we have a saved image or not.
* If so, we quiesce devices, then restore the saved image. We will
* return above (in pm_suspend_disk()) if everything goes well.
* Otherwise, we fail gracefully and return to the normally
* scheduled program.
*
*/
static int software_resume(void)
{
int error;
if (noresume) {
/**
* FIXME: If noresume is specified, we need to find the partition
* and reset it back to normal swap space.
*/
return 0;
}
pr_debug("PM: Checking swsusp image.\n");
if ((error = swsusp_check()))
goto Done;
pr_debug("PM: Preparing processes for restore.\n");
if ((error = prepare_processes())) {
swsusp_close();
goto Cleanup;
}
pr_debug("PM: Reading swsusp image.\n");
if ((error = swsusp_read()))
goto Cleanup;
pr_debug("PM: Preparing devices for restore.\n");
if ((error = prepare_devices()))
goto Free;
mb();
pr_debug("PM: Restoring saved image.\n");
swsusp_resume();
pr_debug("PM: Restore failed, recovering.n");
finish();
Free:
swsusp_free();
Cleanup:
unprepare_processes();
Done:
pr_debug("PM: Resume from disk failed.\n");
return 0;
}
late_initcall(software_resume);
static char * pm_disk_modes[] = {
[PM_DISK_FIRMWARE] = "firmware",
[PM_DISK_PLATFORM] = "platform",
[PM_DISK_SHUTDOWN] = "shutdown",
[PM_DISK_REBOOT] = "reboot",
};
/**
* disk - Control suspend-to-disk mode
*
* Suspend-to-disk can be handled in several ways. The greatest
* distinction is who writes memory to disk - the firmware or the OS.
* If the firmware does it, we assume that it also handles suspending
* the system.
* If the OS does it, then we have three options for putting the system
* to sleep - using the platform driver (e.g. ACPI or other PM registers),
* powering off the system or rebooting the system (for testing).
*
* The system will support either 'firmware' or 'platform', and that is
* known a priori (and encoded in pm_ops). But, the user may choose
* 'shutdown' or 'reboot' as alternatives.
*
* show() will display what the mode is currently set to.
* store() will accept one of
*
* 'firmware'
* 'platform'
* 'shutdown'
* 'reboot'
*
* It will only change to 'firmware' or 'platform' if the system
* supports it (as determined from pm_ops->pm_disk_mode).
*/
static ssize_t disk_show(struct subsystem * subsys, char * buf)
{
return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
}
static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
{
int error = 0;
int i;
int len;
char *p;
suspend_disk_method_t mode = 0;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
down(&pm_sem);
for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
if (!strncmp(buf, pm_disk_modes[i], len)) {
mode = i;
break;
}
}
if (mode) {
if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT)
pm_disk_mode = mode;
else {
if (pm_ops && pm_ops->enter &&
(mode == pm_ops->pm_disk_mode))
pm_disk_mode = mode;
else
error = -EINVAL;
}
} else
error = -EINVAL;
pr_debug("PM: suspend-to-disk mode set to '%s'\n",
pm_disk_modes[mode]);
up(&pm_sem);
return error ? error : n;
}
power_attr(disk);
static ssize_t resume_show(struct subsystem * subsys, char *buf)
{
return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
MINOR(swsusp_resume_device));
}
static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n)
{
int len;
char *p;
unsigned int maj, min;
int error = -EINVAL;
dev_t res;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
res = MKDEV(maj,min);
if (maj == MAJOR(res) && min == MINOR(res)) {
swsusp_resume_device = res;
printk("Attempting manual resume\n");
noresume = 0;
software_resume();
error = 0;
}
}
return error >= 0 ? n : error;
}
power_attr(resume);
static struct attribute * g[] = {
&disk_attr.attr,
&resume_attr.attr,
NULL,
};
static struct attribute_group attr_group = {
.attrs = g,
};
static int __init pm_disk_init(void)
{
return sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
}
core_initcall(pm_disk_init);
static int __init resume_setup(char *str)
{
if (noresume)
return 1;
strncpy( resume_file, str, 255 );
return 1;
}
static int __init noresume_setup(char *str)
{
noresume = 1;
return 1;
}
__setup("noresume", noresume_setup);
__setup("resume=", resume_setup);

269
kernel/power/main.c Normal file
View File

@@ -0,0 +1,269 @@
/*
* kernel/power/main.c - PM subsystem core functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
*
* This file is released under the GPLv2
*
*/
#include <linux/suspend.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/pm.h>
#include "power.h"
DECLARE_MUTEX(pm_sem);
struct pm_ops * pm_ops = NULL;
suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
/**
* pm_set_ops - Set the global power method table.
* @ops: Pointer to ops structure.
*/
void pm_set_ops(struct pm_ops * ops)
{
down(&pm_sem);
pm_ops = ops;
up(&pm_sem);
}
/**
* suspend_prepare - Do prep work before entering low-power state.
* @state: State we're entering.
*
* This is common code that is called for each state that we're
* entering. Allocate a console, stop all processes, then make sure
* the platform can enter the requested state.
*/
static int suspend_prepare(suspend_state_t state)
{
int error = 0;
if (!pm_ops || !pm_ops->enter)
return -EPERM;
pm_prepare_console();
if (freeze_processes()) {
error = -EAGAIN;
goto Thaw;
}
if (pm_ops->prepare) {
if ((error = pm_ops->prepare(state)))
goto Thaw;
}
if ((error = device_suspend(PMSG_SUSPEND))) {
printk(KERN_ERR "Some devices failed to suspend\n");
goto Finish;
}
return 0;
Finish:
if (pm_ops->finish)
pm_ops->finish(state);
Thaw:
thaw_processes();
pm_restore_console();
return error;
}
static int suspend_enter(suspend_state_t state)
{
int error = 0;
unsigned long flags;
local_irq_save(flags);
if ((error = device_power_down(PMSG_SUSPEND))) {
printk(KERN_ERR "Some devices failed to power down\n");
goto Done;
}
error = pm_ops->enter(state);
device_power_up();
Done:
local_irq_restore(flags);
return error;
}
/**
* suspend_finish - Do final work before exiting suspend sequence.
* @state: State we're coming out of.
*
* Call platform code to clean up, restart processes, and free the
* console that we've allocated. This is not called for suspend-to-disk.
*/
static void suspend_finish(suspend_state_t state)
{
device_resume();
if (pm_ops && pm_ops->finish)
pm_ops->finish(state);
thaw_processes();
pm_restore_console();
}
static char * pm_states[] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
[PM_SUSPEND_DISK] = "disk",
NULL,
};
/**
* enter_state - Do common work of entering low-power state.
* @state: pm_state structure for state we're entering.
*
* Make sure we're the only ones trying to enter a sleep state. Fail
* if someone has beaten us to it, since we don't want anything weird to
* happen when we wake up.
* Then, do the setup for suspend, enter the state, and clean up (after
* we've woken up).
*/
static int enter_state(suspend_state_t state)
{
int error;
if (down_trylock(&pm_sem))
return -EBUSY;
if (state == PM_SUSPEND_DISK) {
error = pm_suspend_disk();
goto Unlock;
}
/* Suspend is hard to get right on SMP. */
if (num_online_cpus() != 1) {
error = -EPERM;
goto Unlock;
}
pr_debug("PM: Preparing system for suspend\n");
if ((error = suspend_prepare(state)))
goto Unlock;
pr_debug("PM: Entering state.\n");
error = suspend_enter(state);
pr_debug("PM: Finishing up.\n");
suspend_finish(state);
Unlock:
up(&pm_sem);
return error;
}
/*
* This is the main interface to the outside world. It needs to be
* called from process context.
*/
int software_suspend(void)
{
return enter_state(PM_SUSPEND_DISK);
}
/**
* pm_suspend - Externally visible function for suspending system.
* @state: Enumerated value of the state to enter.
*
* Determine whether or not the value is within range, and if so
* enter the state (above).
*/
int pm_suspend(suspend_state_t state)
{
if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
return enter_state(state);
return -EINVAL;
}
decl_subsys(power,NULL,NULL);
/**
* state - control system power state.
*
* show() returns what states are supported, which is hard-coded to
* 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
* 'disk' (Suspend-to-Disk).
*
* store() accepts one of those strings, translates it into the
* proper enumerated value, and initiates a suspend transition.
*/
static ssize_t state_show(struct subsystem * subsys, char * buf)
{
int i;
char * s = buf;
for (i = 0; i < PM_SUSPEND_MAX; i++) {
if (pm_states[i])
s += sprintf(s,"%s ",pm_states[i]);
}
s += sprintf(s,"\n");
return (s - buf);
}
static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
{
suspend_state_t state = PM_SUSPEND_STANDBY;
char ** s;
char *p;
int error;
int len;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
if (*s && !strncmp(buf, *s, len))
break;
}
if (*s)
error = enter_state(state);
else
error = -EINVAL;
return error ? error : n;
}
power_attr(state);
static struct attribute * g[] = {
&state_attr.attr,
NULL,
};
static struct attribute_group attr_group = {
.attrs = g,
};
static int __init pm_init(void)
{
int error = subsystem_register(&power_subsys);
if (!error)
error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
return error;
}
core_initcall(pm_init);

265
kernel/power/pm.c Normal file
View File

@@ -0,0 +1,265 @@
/*
* pm.c - Power management interface
*
* Copyright (C) 2000 Andrew Henroid
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/interrupt.h>
int pm_active;
/*
* Locking notes:
* pm_devs_lock can be a semaphore provided that pm ops are not called
* from an interrupt handler (already a bad idea so no change here). Each
* change must be protected so that an unlink of an entry doesn't clash
* with a pm send - which is permitted to sleep in the current architecture
*
* Module unloads clashing with pm events now work out safely, the module
* unload path will block until the event has been sent. It may well block
* until a resume but that will be fine.
*/
static DECLARE_MUTEX(pm_devs_lock);
static LIST_HEAD(pm_devs);
/**
* pm_register - register a device with power management
* @type: device type
* @id: device ID
* @callback: callback function
*
* Add a device to the list of devices that wish to be notified about
* power management events. A &pm_dev structure is returned on success,
* on failure the return is %NULL.
*
* The callback function will be called in process context and
* it may sleep.
*/
struct pm_dev *pm_register(pm_dev_t type,
unsigned long id,
pm_callback callback)
{
struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
if (dev) {
memset(dev, 0, sizeof(*dev));
dev->type = type;
dev->id = id;
dev->callback = callback;
down(&pm_devs_lock);
list_add(&dev->entry, &pm_devs);
up(&pm_devs_lock);
}
return dev;
}
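/*
* Illustrative caller sketch (hypothetical, not part of the original
* file); "mydev_pm_event" is a made-up callback and PM_SYS_DEV is
* assumed to be a valid pm_dev_t value:
*
* static int mydev_pm_event(struct pm_dev *dev,
*                           pm_request_t rqst, void *data)
* {
*  return 0;
* }
*
* struct pm_dev *dev = pm_register(PM_SYS_DEV, 0, mydev_pm_event);
* ...
* pm_unregister(dev);
*
* Returning non-zero from the callback vetoes a suspend request.
*/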
/**
* pm_unregister - unregister a device with power management
* @dev: device to unregister
*
* Remove a device from the power management notification lists. The
* dev passed must be a handle previously returned by pm_register.
*/
void pm_unregister(struct pm_dev *dev)
{
if (dev) {
down(&pm_devs_lock);
list_del(&dev->entry);
up(&pm_devs_lock);
kfree(dev);
}
}
static void __pm_unregister(struct pm_dev *dev)
{
if (dev) {
list_del(&dev->entry);
kfree(dev);
}
}
/**
* pm_unregister_all - unregister all devices with matching callback
* @callback: callback function pointer
*
* Unregister every device that would call the callback passed. This
* is primarily meant as a helper function for loadable modules. It
* enables a module to give up all its managed devices without keeping
* its own private list.
*/
void pm_unregister_all(pm_callback callback)
{
struct list_head *entry;
if (!callback)
return;
down(&pm_devs_lock);
entry = pm_devs.next;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
entry = entry->next;
if (dev->callback == callback)
__pm_unregister(dev);
}
up(&pm_devs_lock);
}
/**
* pm_send - send request to a single device
* @dev: device to send to
* @rqst: power management request
* @data: data for the callback
*
* Issue a power management request to a given device. The
* %PM_SUSPEND and %PM_RESUME events are handled specially. The
* data field must hold the intended next state. No call is made
* if the state matches.
*
* BUGS: what stops two power management requests occurring in parallel
* and conflicting?
*
* WARNING: Calling pm_send directly is not generally recommended, in
* particular there is no locking against the pm_dev going away. The
* caller must maintain all needed locking or have 'inside knowledge'
* on the safety. Also remember that this function is not locked against
* pm_unregister. This means that you must handle SMP races on callback
* execution and unload yourself.
*/
static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
{
int status = 0;
unsigned long prev_state, next_state;
if (in_interrupt())
BUG();
switch (rqst) {
case PM_SUSPEND:
case PM_RESUME:
prev_state = dev->state;
next_state = (unsigned long) data;
if (prev_state != next_state) {
if (dev->callback)
status = (*dev->callback)(dev, rqst, data);
if (!status) {
dev->state = next_state;
dev->prev_state = prev_state;
}
}
else {
dev->prev_state = prev_state;
}
break;
default:
if (dev->callback)
status = (*dev->callback)(dev, rqst, data);
break;
}
return status;
}
/*
* Undo incomplete request
*/
static void pm_undo_all(struct pm_dev *last)
{
struct list_head *entry = last->entry.prev;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
if (dev->state != dev->prev_state) {
/* previous state was zero (running) resume or
* previous state was non-zero (suspended) suspend
*/
pm_request_t undo = (dev->prev_state
? PM_SUSPEND:PM_RESUME);
pm_send(dev, undo, (void*) dev->prev_state);
}
entry = entry->prev;
}
}
/**
* pm_send_all - send request to all managed devices
* @rqst: power management request
* @data: data for the callback
*
* Issue a power management request to all devices. The
* %PM_SUSPEND events are handled specially. Any device is
* permitted to fail a suspend by returning a non-zero (error)
* value from its callback function. If any device vetoes a
* suspend request then all other devices that have suspended
* during the processing of this request are restored to their
* previous state.
*
* WARNING: This function takes the pm_devs_lock. The lock is not dropped until
* the callbacks have completed. This prevents races against pm locking
* functions, races against module unload pm_unregister code. It does
* mean however that you must not issue pm_ functions within the callback
* or you will deadlock and users will hate you.
*
* Zero is returned on success. If a suspend fails then the status
* from the device that vetoes the suspend is returned.
*
* BUGS: what stops two power management requests occurring in parallel
* and conflicting?
*/
int pm_send_all(pm_request_t rqst, void *data)
{
struct list_head *entry;
down(&pm_devs_lock);
entry = pm_devs.next;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
if (dev->callback) {
int status = pm_send(dev, rqst, data);
if (status) {
/* return devices to previous state on
* failed suspend request
*/
if (rqst == PM_SUSPEND)
pm_undo_all(dev);
up(&pm_devs_lock);
return status;
}
}
entry = entry->next;
}
up(&pm_devs_lock);
return 0;
}
EXPORT_SYMBOL(pm_register);
EXPORT_SYMBOL(pm_unregister);
EXPORT_SYMBOL(pm_unregister_all);
EXPORT_SYMBOL(pm_send_all);
EXPORT_SYMBOL(pm_active);

52
kernel/power/power.h Normal file
View File

@@ -0,0 +1,52 @@
#include <linux/suspend.h>
#include <linux/utsname.h>
/* With SUSPEND_CONSOLE defined, suspend looks *really* cool, but
we probably do not take enough locks for switching consoles, etc.,
so bad things might happen.
*/
#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
#endif
struct swsusp_info {
struct new_utsname uts;
u32 version_code;
unsigned long num_physpages;
int cpus;
unsigned long image_pages;
unsigned long pagedir_pages;
suspend_pagedir_t * suspend_pagedir;
swp_entry_t pagedir[768];
} __attribute__((aligned(PAGE_SIZE)));
#ifdef CONFIG_SOFTWARE_SUSPEND
extern int pm_suspend_disk(void);
#else
static inline int pm_suspend_disk(void)
{
return -EPERM;
}
#endif
extern struct semaphore pm_sem;
#define power_attr(_name) \
static struct subsys_attribute _name##_attr = { \
.attr = { \
.name = __stringify(_name), \
.mode = 0644, \
}, \
.show = _name##_show, \
.store = _name##_store, \
}
extern struct subsystem power_subsys;
extern int freeze_processes(void);
extern void thaw_processes(void);
extern int pm_prepare_console(void);
extern void pm_restore_console(void);

45
kernel/power/poweroff.c Normal file
View File

@@ -0,0 +1,45 @@
/*
* poweroff.c - sysrq handler to gracefully power down machine.
*
* This file is released under the GPL v2
*/
#include <linux/kernel.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/pm.h>
#include <linux/workqueue.h>
/*
* When the user hits Sysrq-o to power down the machine, this is the
* callback we use.
*/
static void do_poweroff(void *dummy)
{
if (pm_power_off)
pm_power_off();
}
static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
static void handle_poweroff(int key, struct pt_regs *pt_regs,
struct tty_struct *tty)
{
schedule_work(&poweroff_work);
}
static struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
.help_msg = "powerOff",
.action_msg = "Power Off",
.enable_mask = SYSRQ_ENABLE_BOOT,
};
static int pm_sysrq_init(void)
{
register_sysrq_key('o', &sysrq_poweroff_op);
return 0;
}
subsys_initcall(pm_sysrq_init);

121
kernel/power/process.c Normal file
View File

@@ -0,0 +1,121 @@
/*
* kernel/power/process.c - Functions for starting/stopping processes on
* suspend transitions.
*
* Originally from swsusp.
*/
#undef DEBUG
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
/*
* Timeout for stopping processes
*/
#define TIMEOUT (6 * HZ)
static inline int freezeable(struct task_struct * p)
{
if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
(p->exit_state == EXIT_ZOMBIE) ||
(p->exit_state == EXIT_DEAD) ||
(p->state == TASK_STOPPED) ||
(p->state == TASK_TRACED))
return 0;
return 1;
}
/* The refrigerator is the place where frozen processes are stored :-). */
void refrigerator(unsigned long flag)
{
/* Hmm, should we be allowed to suspend when there are realtime
processes around? */
long save;
save = current->state;
current->state = TASK_UNINTERRUPTIBLE;
pr_debug("%s entered refrigerator\n", current->comm);
printk("=");
current->flags &= ~PF_FREEZE;
spin_lock_irq(&current->sighand->siglock);
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
current->flags |= PF_FROZEN;
while (current->flags & PF_FROZEN)
schedule();
pr_debug("%s left refrigerator\n", current->comm);
current->state = save;
}
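/*
* Illustrative usage (not part of the original file): a freezeable
* kernel thread is expected to poll for the freeze request in its
* main loop, roughly:
*
* if (current->flags & PF_FREEZE)
*  refrigerator(PF_FREEZE);
*/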
/* 0 = success, else # of processes that we failed to stop */
int freeze_processes(void)
{
int todo;
unsigned long start_time;
struct task_struct *g, *p;
printk( "Stopping tasks: " );
start_time = jiffies;
do {
todo = 0;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
unsigned long flags;
if (!freezeable(p))
continue;
if ((p->flags & PF_FROZEN) ||
(p->state == TASK_TRACED) ||
(p->state == TASK_STOPPED))
continue;
/* FIXME: smp problem here: we may not access other process' flags
without locking */
p->flags |= PF_FREEZE;
spin_lock_irqsave(&p->sighand->siglock, flags);
signal_wake_up(p, 0);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
yield(); /* Yield is okay here */
if (time_after(jiffies, start_time + TIMEOUT)) {
printk( "\n" );
printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
return todo;
}
} while(todo);
printk( "|\n" );
BUG_ON(in_atomic());
return 0;
}
void thaw_processes(void)
{
struct task_struct *g, *p;
printk( "Restarting tasks..." );
read_lock(&tasklist_lock);
do_each_thread(g, p) {
if (!freezeable(p))
continue;
if (p->flags & PF_FROZEN) {
p->flags &= ~PF_FROZEN;
wake_up_process(p);
} else
printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
schedule();
printk( " done\n" );
}
EXPORT_SYMBOL(refrigerator);

85
kernel/power/smp.c Normal file
View File

@@ -0,0 +1,85 @@
/*
* kernel/power/smp.c - Functions for stopping other CPUs.
*
* Copyright 2004 Pavel Machek <pavel@suse.cz>
* Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
*
* This file is released under the GPLv2.
*/
#undef DEBUG
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
#include <asm/atomic.h>
#include <asm/tlbflush.h>
static atomic_t cpu_counter, freeze;
static void smp_pause(void * data)
{
struct saved_context ctxt;
__save_processor_state(&ctxt);
printk("Sleeping in:\n");
dump_stack();
atomic_inc(&cpu_counter);
while (atomic_read(&freeze)) {
/* FIXME: restore takes place at a random point inside this loop.
This should probably be written in assembly, and should
preserve general-purpose registers, too.
What about the stack? We may need to move to a new stack here.
This had better be run with interrupts disabled.
*/
cpu_relax();
barrier();
}
atomic_dec(&cpu_counter);
__restore_processor_state(&ctxt);
}
static cpumask_t oldmask;
void disable_nonboot_cpus(void)
{
printk("Freezing CPUs (at %d)", smp_processor_id());
oldmask = current->cpus_allowed;
set_cpus_allowed(current, cpumask_of_cpu(0));
current->state = TASK_INTERRUPTIBLE;
schedule_timeout(HZ);
printk("...");
BUG_ON(smp_processor_id() != 0);
/* FIXME: for this to work, all the CPUs must be running
* "idle" thread (or we deadlock). Is that guaranteed? */
atomic_set(&cpu_counter, 0);
atomic_set(&freeze, 1);
smp_call_function(smp_pause, NULL, 0, 0);
while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
cpu_relax();
barrier();
}
printk("ok\n");
}
void enable_nonboot_cpus(void)
{
printk("Restarting CPUs");
atomic_set(&freeze, 0);
while (atomic_read(&cpu_counter)) {
cpu_relax();
barrier();
}
printk("...");
set_cpus_allowed(current, oldmask);
schedule();
printk("ok\n");
}

1433
kernel/power/swsusp.c Normal file

File diff suppressed because it is too large

996
kernel/printk.c Normal file
View File

@@ -0,0 +1,996 @@
/*
* linux/kernel/printk.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* Modified to make sys_syslog() more flexible: added commands to
* return the last 4k of kernel messages, regardless of whether
* they've been read or not. Added option to suppress kernel printk's
* to the console. Added hook for sending the console messages
* elsewhere, in preparation for a serial line console (someday).
* Ted Ts'o, 2/11/93.
* Modified for sysctl support, 1/8/97, Chris Horn.
* Fixed SMP synchronization, 08/08/99, Manfred Spraul
* manfreds@colorfullife.com
* Rewrote bits to get rid of console_lock
* 01Mar01 Andrew Morton <andrewm@uow.edu.au>
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/smp_lock.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/interrupt.h> /* For in_interrupt() */
#include <linux/config.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
/* printk's without a loglevel use this.. */
#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
/* We show everything that is MORE important than this.. */
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
DECLARE_WAIT_QUEUE_HEAD(log_wait);
int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
EXPORT_SYMBOL(console_printk);
/*
* Low level drivers may need this to know if they can schedule in
* their unblank() callback or not. So let's export it.
*/
int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
* console_sem protects the console_drivers list, and also
* provides serialisation for access to the entire console
* driver system.
*/
static DECLARE_MUTEX(console_sem);
struct console *console_drivers;
/*
* This is used for debugging the mess that is the VT code by
* keeping track of whether we have the console semaphore held. It's
* definitely not the perfect debug tool (we don't know if _WE_
* hold it and are racing), but it helps tracking down those weird
* code paths in the console code where we end up in places I want
* locked without the console semaphore held.
*/
static int console_locked;
/*
* logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars.
* It is also used in interesting ways to provide interlocking in
* release_console_sem().
*/
static DEFINE_SPINLOCK(logbuf_lock);
static char __log_buf[__LOG_BUF_LEN];
static char *log_buf = __log_buf;
static int log_buf_len = __LOG_BUF_LEN;
#define LOG_BUF_MASK (log_buf_len-1)
#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
/*
* The indices into log_buf are not constrained to log_buf_len - they
* must be masked before subscripting
*/
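/*
* Worked example (illustrative): with log_buf_len == 16, LOG_BUF_MASK
* is 15, so the ever-growing index 17 addresses LOG_BUF(17) ==
* log_buf[17 & 15] == log_buf[1]; the indices wrap around the ring
* transparently, which is why they are masked only at access time.
*/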
static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */
static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */
static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */
static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
/*
* Array of consoles built from command line options (console=)
*/
struct console_cmdline
{
char name[8]; /* Name of the driver */
int index; /* Minor dev. to use */
char *options; /* Options for the driver */
};
#define MAX_CMDLINECONSOLES 8
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int selected_console = -1;
static int preferred_console = -1;
/* Flag: console code may call schedule() */
static int console_may_schedule;
/*
* Set up a list of consoles. Called from init/main.c.
*/
static int __init console_setup(char *str)
{
char name[sizeof(console_cmdline[0].name)];
char *s, *options;
int idx;
/*
* Decode str into name, index, options.
*/
if (str[0] >= '0' && str[0] <= '9') {
strcpy(name, "ttyS");
strncpy(name + 4, str, sizeof(name) - 5);
} else
strncpy(name, str, sizeof(name) - 1);
name[sizeof(name) - 1] = 0;
if ((options = strchr(str, ',')) != NULL)
*(options++) = 0;
#ifdef __sparc__
if (!strcmp(str, "ttya"))
strcpy(name, "ttyS0");
if (!strcmp(str, "ttyb"))
strcpy(name, "ttyS1");
#endif
for (s = name; *s; s++)
if ((*s >= '0' && *s <= '9') || *s == ',')
break;
idx = simple_strtoul(s, NULL, 10);
*s = 0;
add_preferred_console(name, idx, options);
return 1;
}
__setup("console=", console_setup);
/**
* add_preferred_console - add a device to the list of preferred consoles.
*
* The last preferred console added will be used for kernel messages
* and stdin/out/err for init. Normally this is used by console_setup
* above to handle user-supplied console arguments; however it can also
* be used by arch-specific code either to override the user or more
* commonly to provide a default console (ie from PROM variables) when
* the user has not supplied one.
*/
int __init add_preferred_console(char *name, int idx, char *options)
{
struct console_cmdline *c;
int i;
/*
* See if this tty is not yet registered, and
* if we have a slot free.
*/
for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
if (strcmp(console_cmdline[i].name, name) == 0 &&
console_cmdline[i].index == idx) {
selected_console = i;
return 0;
}
if (i == MAX_CMDLINECONSOLES)
return -E2BIG;
selected_console = i;
c = &console_cmdline[i];
memcpy(c->name, name, sizeof(c->name));
c->name[sizeof(c->name) - 1] = 0;
c->options = options;
c->index = idx;
return 0;
}
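/*
* Example (illustrative): booting with "console=ttyS0,115200n8" makes
* console_setup() split off the options at the comma and end up calling
* add_preferred_console("ttyS", 0, "115200n8") - name "ttyS", index 0,
* and the remainder passed through as driver options.
*/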
static int __init log_buf_len_setup(char *str)
{
unsigned long size = memparse(str, &str);
unsigned long flags;
if (size)
size = roundup_pow_of_two(size);
if (size > log_buf_len) {
unsigned long start, dest_idx, offset;
char * new_log_buf;
new_log_buf = alloc_bootmem(size);
if (!new_log_buf) {
printk("log_buf_len: allocation failed\n");
goto out;
}
spin_lock_irqsave(&logbuf_lock, flags);
log_buf_len = size;
log_buf = new_log_buf;
offset = start = min(con_start, log_start);
dest_idx = 0;
while (start != log_end) {
log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)];
start++;
dest_idx++;
}
log_start -= offset;
con_start -= offset;
log_end -= offset;
spin_unlock_irqrestore(&logbuf_lock, flags);
printk("log_buf_len: %d\n", log_buf_len);
}
out:
return 1;
}
__setup("log_buf_len=", log_buf_len_setup);
/*
* Commands to do_syslog:
*
* 0 -- Close the log. Currently a NOP.
* 1 -- Open the log. Currently a NOP.
* 2 -- Read from the log.
* 3 -- Read all messages remaining in the ring buffer.
* 4 -- Read and clear all messages remaining in the ring buffer.
* 5 -- Clear ring buffer.
* 6 -- Disable printk's to console.
* 7 -- Enable printk's to console.
* 8 -- Set level of messages printed to console.
* 9 -- Return number of unread characters in the log buffer.
* 10 -- Return size of the log buffer.
*/
int do_syslog(int type, char __user * buf, int len)
{
unsigned long i, j, limit, count;
int do_clear = 0;
char c;
int error = 0;
error = security_syslog(type);
if (error)
return error;
switch (type) {
case 0: /* Close log */
break;
case 1: /* Open log */
break;
case 2: /* Read from log */
error = -EINVAL;
if (!buf || len < 0)
goto out;
error = 0;
if (!len)
goto out;
if (!access_ok(VERIFY_WRITE, buf, len)) {
error = -EFAULT;
goto out;
}
error = wait_event_interruptible(log_wait, (log_start - log_end));
if (error)
goto out;
i = 0;
spin_lock_irq(&logbuf_lock);
while (!error && (log_start != log_end) && i < len) {
c = LOG_BUF(log_start);
log_start++;
spin_unlock_irq(&logbuf_lock);
error = __put_user(c,buf);
buf++;
i++;
cond_resched();
spin_lock_irq(&logbuf_lock);
}
spin_unlock_irq(&logbuf_lock);
if (!error)
error = i;
break;
case 4: /* Read/clear last kernel messages */
do_clear = 1;
/* FALL THRU */
case 3: /* Read last kernel messages */
error = -EINVAL;
if (!buf || len < 0)
goto out;
error = 0;
if (!len)
goto out;
if (!access_ok(VERIFY_WRITE, buf, len)) {
error = -EFAULT;
goto out;
}
count = len;
if (count > log_buf_len)
count = log_buf_len;
spin_lock_irq(&logbuf_lock);
if (count > logged_chars)
count = logged_chars;
if (do_clear)
logged_chars = 0;
limit = log_end;
/*
* __put_user() could sleep, and while we sleep
* printk() could overwrite the messages
* we try to copy to user space. Therefore
* the messages are copied in reverse. <manfreds>
*/
for (i = 0; i < count && !error; i++) {
j = limit-1-i;
if (j + log_buf_len < log_end)
break;
c = LOG_BUF(j);
spin_unlock_irq(&logbuf_lock);
error = __put_user(c,&buf[count-1-i]);
cond_resched();
spin_lock_irq(&logbuf_lock);
}
spin_unlock_irq(&logbuf_lock);
if (error)
break;
error = i;
if (i != count) {
int offset = count - error;
/* buffer overflow during copy, correct user buffer. */
for (i = 0; i < error; i++) {
if (__get_user(c, &buf[i + offset]) ||
__put_user(c, &buf[i])) {
error = -EFAULT;
break;
}
cond_resched();
}
}
break;
case 5: /* Clear ring buffer */
logged_chars = 0;
break;
case 6: /* Disable logging to console */
console_loglevel = minimum_console_loglevel;
break;
case 7: /* Enable logging to console */
console_loglevel = default_console_loglevel;
break;
case 8: /* Set level of messages printed to console */
error = -EINVAL;
if (len < 1 || len > 8)
goto out;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
error = 0;
break;
case 9: /* Number of chars in the log buffer */
error = log_end - log_start;
break;
case 10: /* Size of the log buffer */
error = log_buf_len;
break;
default:
error = -EINVAL;
break;
}
out:
return error;
}
asmlinkage long sys_syslog(int type, char __user * buf, int len)
{
return do_syslog(type, buf, len);
}
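/*
* Illustrative userspace sketch (assuming the glibc klogctl() wrapper
* for this syscall): command 10 queries the buffer size and command 3
* then reads the whole ring without consuming it.
*
*	int len = klogctl(10, NULL, 0);
*	char *buf = malloc(len);
*	if (buf)
*		len = klogctl(3, buf, len);
*/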
/*
* Call the console drivers on a range of log_buf
*/
static void __call_console_drivers(unsigned long start, unsigned long end)
{
struct console *con;
for (con = console_drivers; con; con = con->next) {
if ((con->flags & CON_ENABLED) && con->write)
con->write(con, &LOG_BUF(start), end - start);
}
}
/*
* Write out chars from start to end - 1 inclusive
*/
static void _call_console_drivers(unsigned long start,
unsigned long end, int msg_log_level)
{
if (msg_log_level < console_loglevel &&
console_drivers && start != end) {
if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
/* wrapped write */
__call_console_drivers(start & LOG_BUF_MASK,
log_buf_len);
__call_console_drivers(0, end & LOG_BUF_MASK);
} else {
__call_console_drivers(start, end);
}
}
}
/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
* The console_sem must be held.
*/
static void call_console_drivers(unsigned long start, unsigned long end)
{
unsigned long cur_index, start_print;
static int msg_level = -1;
if (((long)(start - end)) > 0)
BUG();
cur_index = start;
start_print = start;
while (cur_index != end) {
if ( msg_level < 0 &&
((end - cur_index) > 2) &&
LOG_BUF(cur_index + 0) == '<' &&
LOG_BUF(cur_index + 1) >= '0' &&
LOG_BUF(cur_index + 1) <= '7' &&
LOG_BUF(cur_index + 2) == '>')
{
msg_level = LOG_BUF(cur_index + 1) - '0';
cur_index += 3;
start_print = cur_index;
}
while (cur_index != end) {
char c = LOG_BUF(cur_index);
cur_index++;
if (c == '\n') {
if (msg_level < 0) {
/*
* printk() has already given us loglevel tags in
* the buffer. This code is here in case the
* log buffer has wrapped right round and scribbled
* on those tags
*/
msg_level = default_message_loglevel;
}
_call_console_drivers(start_print, cur_index, msg_level);
msg_level = -1;
start_print = cur_index;
break;
}
}
}
_call_console_drivers(start_print, end, msg_level);
}
static void emit_log_char(char c)
{
LOG_BUF(log_end) = c;
log_end++;
if (log_end - log_start > log_buf_len)
log_start = log_end - log_buf_len;
if (log_end - con_start > log_buf_len)
con_start = log_end - log_buf_len;
if (logged_chars < log_buf_len)
logged_chars++;
}
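/*
* Worked example (illustrative): with a 16-byte buffer, once log_end
* reaches 20 the oldest four characters have been overwritten, so
* log_start and con_start are dragged forward to 20 - 16 == 4; a
* reader can never trail the writer by more than log_buf_len chars.
*/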
/*
* Zap console related locks when oopsing. Only zap at most once
* every 30 seconds, to leave time for slow consoles to print a
* full oops.
*/
static void zap_locks(void)
{
static unsigned long oops_timestamp;
if (time_after_eq(jiffies, oops_timestamp) &&
!time_after(jiffies, oops_timestamp + 30*HZ))
return;
oops_timestamp = jiffies;
/* If a crash is occurring, make sure we can't deadlock */
spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
init_MUTEX(&console_sem);
}
#if defined(CONFIG_PRINTK_TIME)
static int printk_time = 1;
#else
static int printk_time = 0;
#endif
static int __init printk_time_setup(char *str)
{
if (*str)
return 0;
printk_time = 1;
return 1;
}
__setup("time", printk_time_setup);
/*
* This is printk. It can be called from any context. We want it to work.
*
* We try to grab the console_sem. If we succeed, it's easy - we log the output and
* call the console drivers. If we fail to get the semaphore we place the output
* into the log buffer and return. The current holder of the console_sem will
* notice the new output in release_console_sem() and will send it to the
* consoles before releasing the semaphore.
*
* One effect of this deferred printing is that code which calls printk() and
* then changes console_loglevel may break. This is because console_loglevel
* is inspected when the actual printing occurs.
*/
asmlinkage int printk(const char *fmt, ...)
{
va_list args;
int r;
va_start(args, fmt);
r = vprintk(fmt, args);
va_end(args);
return r;
}
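/*
* Illustrative sketch of the pitfall described above: if another CPU
* holds console_sem, the message below is only buffered here, and the
* loglevel check happens later, against the updated console_loglevel.
*
*	printk(KERN_DEBUG "state dump\n");
*	console_loglevel = 15;	(may still affect the line above)
*/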
asmlinkage int vprintk(const char *fmt, va_list args)
{
unsigned long flags;
int printed_len;
char *p;
static char printk_buf[1024];
static int log_level_unknown = 1;
if (unlikely(oops_in_progress))
zap_locks();
/* This stops the holder of console_sem just where we want him */
spin_lock_irqsave(&logbuf_lock, flags);
/* Emit the output into the temporary buffer */
printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
/*
* Copy the output into log_buf. If the caller didn't provide
* appropriate log level tags, we insert them here
*/
for (p = printk_buf; *p; p++) {
if (log_level_unknown) {
/* log_level_unknown signals the start of a new line */
if (printk_time) {
int loglev_char;
char tbuf[50], *tp;
unsigned tlen;
unsigned long long t;
unsigned long nanosec_rem;
/*
* force the log level token to be
* before the time output.
*/
if (p[0] == '<' && p[1] >='0' &&
p[1] <= '7' && p[2] == '>') {
loglev_char = p[1];
p += 3;
printed_len += 3;
} else {
loglev_char = default_message_loglevel
+ '0';
}
t = sched_clock();
nanosec_rem = do_div(t, 1000000000);
tlen = sprintf(tbuf,
"<%c>[%5lu.%06lu] ",
loglev_char,
(unsigned long)t,
nanosec_rem/1000);
for (tp = tbuf; tp < tbuf + tlen; tp++)
emit_log_char(*tp);
printed_len += tlen - 3;
} else {
if (p[0] != '<' || p[1] < '0' ||
p[1] > '7' || p[2] != '>') {
emit_log_char('<');
emit_log_char(default_message_loglevel
+ '0');
emit_log_char('>');
}
printed_len += 3;
}
log_level_unknown = 0;
if (!*p)
break;
}
emit_log_char(*p);
if (*p == '\n')
log_level_unknown = 1;
}
if (!cpu_online(smp_processor_id()) &&
system_state != SYSTEM_RUNNING) {
/*
* Some console drivers may assume that per-cpu resources have
* been allocated. So don't allow them to be called by this
* CPU until it is officially up. We shouldn't be calling into
* random console drivers on a CPU which doesn't exist yet..
*/
spin_unlock_irqrestore(&logbuf_lock, flags);
goto out;
}
if (!down_trylock(&console_sem)) {
console_locked = 1;
/*
* We own the drivers. We can drop the spinlock and let
* release_console_sem() print the text
*/
spin_unlock_irqrestore(&logbuf_lock, flags);
console_may_schedule = 0;
release_console_sem();
} else {
/*
* Someone else owns the drivers. We drop the spinlock, which
* allows the semaphore holder to proceed and to call the
* console drivers with the output which we just produced.
*/
spin_unlock_irqrestore(&logbuf_lock, flags);
}
out:
return printed_len;
}
EXPORT_SYMBOL(printk);
EXPORT_SYMBOL(vprintk);
/**
* acquire_console_sem - lock the console system for exclusive use.
*
* Acquires a semaphore which guarantees that the caller has
* exclusive access to the console system and the console_drivers list.
*
* Can sleep, returns nothing.
*/
void acquire_console_sem(void)
{
if (in_interrupt())
BUG();
down(&console_sem);
console_locked = 1;
console_may_schedule = 1;
}
EXPORT_SYMBOL(acquire_console_sem);
int try_acquire_console_sem(void)
{
if (down_trylock(&console_sem))
return -1;
console_locked = 1;
console_may_schedule = 0;
return 0;
}
EXPORT_SYMBOL(try_acquire_console_sem);
int is_console_locked(void)
{
return console_locked;
}
EXPORT_SYMBOL(is_console_locked);
/**
* release_console_sem - unlock the console system
*
* Releases the semaphore which the caller holds on the console system
* and the console driver list.
*
* While the semaphore was held, console output may have been buffered
* by printk(). If this is the case, release_console_sem() emits
* the output prior to releasing the semaphore.
*
* If there is output waiting for klogd, we wake it up.
*
* release_console_sem() may be called from any context.
*/
void release_console_sem(void)
{
unsigned long flags;
unsigned long _con_start, _log_end;
unsigned long wake_klogd = 0;
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
wake_klogd |= log_start - log_end;
if (con_start == log_end)
break; /* Nothing to print */
_con_start = con_start;
_log_end = log_end;
con_start = log_end; /* Flush */
spin_unlock(&logbuf_lock);
call_console_drivers(_con_start, _log_end);
local_irq_restore(flags);
}
console_locked = 0;
console_may_schedule = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
wake_up_interruptible(&log_wait);
}
EXPORT_SYMBOL(release_console_sem);
/**
* console_conditional_schedule - yield the CPU if required
*
* If the console code is currently allowed to sleep, and
* if this CPU should yield the CPU to another task, do
* so here.
*
* Must be called within acquire_console_sem().
*/
void __sched console_conditional_schedule(void)
{
if (console_may_schedule)
cond_resched();
}
EXPORT_SYMBOL(console_conditional_schedule);
void console_print(const char *s)
{
printk(KERN_EMERG "%s", s);
}
EXPORT_SYMBOL(console_print);
void console_unblank(void)
{
struct console *c;
/*
* console_unblank can no longer be called in interrupt context unless
* oops_in_progress is set to 1..
*/
if (oops_in_progress) {
if (down_trylock(&console_sem) != 0)
return;
} else
acquire_console_sem();
console_locked = 1;
console_may_schedule = 0;
for (c = console_drivers; c != NULL; c = c->next)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
release_console_sem();
}
EXPORT_SYMBOL(console_unblank);
/*
* Return the console tty driver structure and its associated index
*/
struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
acquire_console_sem();
for (c = console_drivers; c != NULL; c = c->next) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
release_console_sem();
return driver;
}
/*
* Prevent further output on the passed console device so that (for example)
* serial drivers can disable console output before suspending a port, and can
* re-enable output afterwards.
*/
void console_stop(struct console *console)
{
acquire_console_sem();
console->flags &= ~CON_ENABLED;
release_console_sem();
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
acquire_console_sem();
console->flags |= CON_ENABLED;
release_console_sem();
}
EXPORT_SYMBOL(console_start);
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
* print any messages that were printed by the kernel before the
* console driver was initialized.
*/
void register_console(struct console * console)
{
int i;
unsigned long flags;
if (preferred_console < 0)
preferred_console = selected_console;
/*
* See if we want to use this console driver. If we
* didn't select a console we take the first one
* that registers here.
*/
if (preferred_console < 0) {
if (console->index < 0)
console->index = 0;
if (console->setup == NULL ||
console->setup(console, NULL) == 0) {
console->flags |= CON_ENABLED | CON_CONSDEV;
preferred_console = 0;
}
}
/*
* See if this console matches one we selected on
* the command line.
*/
for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) {
if (strcmp(console_cmdline[i].name, console->name) != 0)
continue;
if (console->index >= 0 &&
console->index != console_cmdline[i].index)
continue;
if (console->index < 0)
console->index = console_cmdline[i].index;
if (console->setup &&
console->setup(console, console_cmdline[i].options) != 0)
break;
console->flags |= CON_ENABLED;
console->index = console_cmdline[i].index;
if (i == preferred_console)
console->flags |= CON_CONSDEV;
break;
}
if (!(console->flags & CON_ENABLED))
return;
if (console_drivers && (console_drivers->flags & CON_BOOT)) {
unregister_console(console_drivers);
console->flags &= ~CON_PRINTBUFFER;
}
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
acquire_console_sem();
if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
console->next = console_drivers;
console_drivers = console;
} else {
console->next = console_drivers->next;
console_drivers->next = console;
}
if (console->flags & CON_PRINTBUFFER) {
/*
* release_console_sem() will print out the buffered messages
* for us.
*/
spin_lock_irqsave(&logbuf_lock, flags);
con_start = log_start;
spin_unlock_irqrestore(&logbuf_lock, flags);
}
release_console_sem();
}
EXPORT_SYMBOL(register_console);
int unregister_console(struct console * console)
{
struct console *a, *b;
int res = 1;
acquire_console_sem();
if (console_drivers == console) {
console_drivers = console->next;
res = 0;
} else {
for (a = console_drivers->next, b = console_drivers;
a; b = a, a = b->next) {
if (a == console) {
b->next = a->next;
res = 0;
break;
}
}
}
/* If last console is removed, we re-enable picking the first
* one that gets registered. Without that, pmac early boot console
* would prevent fbcon from taking over.
*/
if (console_drivers == NULL)
preferred_console = selected_console;
release_console_sem();
return res;
}
EXPORT_SYMBOL(unregister_console);
/**
* tty_write_message - write a message to a certain tty, not just the console.
*
* This is used for messages that need to be redirected to a specific tty.
* We don't put it into the syslog queue right now; maybe in the future,
* if really needed.
*/
void tty_write_message(struct tty_struct *tty, char *msg)
{
if (tty && tty->driver->write)
tty->driver->write(tty, msg, strlen(msg));
return;
}
/*
* printk rate limiting, lifted from the networking subsystem.
*
* This enforces a rate limit: not more than one kernel message
* every printk_ratelimit_jiffies to make a denial-of-service
* attack impossible.
*/
int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
{
static DEFINE_SPINLOCK(ratelimit_lock);
static unsigned long toks = 10*5*HZ;
static unsigned long last_msg;
static int missed;
unsigned long flags;
unsigned long now = jiffies;
spin_lock_irqsave(&ratelimit_lock, flags);
toks += now - last_msg;
last_msg = now;
if (toks > (ratelimit_burst * ratelimit_jiffies))
toks = ratelimit_burst * ratelimit_jiffies;
if (toks >= ratelimit_jiffies) {
int lost = missed;
missed = 0;
toks -= ratelimit_jiffies;
spin_unlock_irqrestore(&ratelimit_lock, flags);
if (lost)
printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
return 1;
}
missed++;
spin_unlock_irqrestore(&ratelimit_lock, flags);
return 0;
}
EXPORT_SYMBOL(__printk_ratelimit);
/* minimum time in jiffies between messages */
int printk_ratelimit_jiffies = 5*HZ;
/* number of messages we send before ratelimiting */
int printk_ratelimit_burst = 10;
int printk_ratelimit(void)
{
return __printk_ratelimit(printk_ratelimit_jiffies,
printk_ratelimit_burst);
}
EXPORT_SYMBOL(printk_ratelimit);
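/*
* Typical usage (illustrative; "dev" and the message are made up):
* guard diagnostics on potentially hot paths with the default-tuned
* helper so a message flood cannot livelock the console.
*
*	if (printk_ratelimit())
*		printk(KERN_WARNING "%s: bad packet dropped\n", dev->name);
*/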

563
kernel/profile.c Normal file

@@ -0,0 +1,563 @@
/*
* linux/kernel/profile.c
* Simple profiling. Manages a direct-mapped profile hit count buffer,
* with configurable resolution, support for restricting the cpus on
* which profiling is done, and switching between cpu time and
* schedule() calls via kernel command line parameters passed at boot.
*
* Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
* Red Hat, July 2004
* Consolidation of architecture support code for profiling,
* William Irwin, Oracle, July 2004
* Amortized hit count accounting via per-cpu open-addressed hashtables
* to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/profile.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <asm/sections.h>
#include <asm/semaphore.h>
struct profile_hit {
u32 pc, hits;
};
#define PROFILE_GRPSHIFT 3
#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
/* Oprofile timer tick hook */
int (*timer_hook)(struct pt_regs *);
static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;
static int prof_on;
static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DECLARE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */
static int __init profile_setup(char * str)
{
int par;
if (!strncmp(str, "schedule", 8)) {
prof_on = SCHED_PROFILING;
printk(KERN_INFO "kernel schedule profiling enabled\n");
/* skip the "schedule," prefix before parsing the shift */
if (str[8] == ',')
str += 9;
if (get_option(&str, &par))
prof_shift = par;
} else if (get_option(&str, &par)) {
prof_shift = par;
prof_on = CPU_PROFILING;
printk(KERN_INFO "kernel profiling enabled (shift: %lu)\n",
prof_shift);
}
return 1;
}
__setup("profile=", profile_setup);
void __init profile_init(void)
{
if (!prof_on)
return;
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
}
/* Profile event notifications */
#ifdef CONFIG_PROFILING
static DECLARE_RWSEM(profile_rwsem);
static DEFINE_RWLOCK(handoff_lock);
static struct notifier_block * task_exit_notifier;
static struct notifier_block * task_free_notifier;
static struct notifier_block * munmap_notifier;
void profile_task_exit(struct task_struct * task)
{
down_read(&profile_rwsem);
notifier_call_chain(&task_exit_notifier, 0, task);
up_read(&profile_rwsem);
}
int profile_handoff_task(struct task_struct * task)
{
int ret;
read_lock(&handoff_lock);
ret = notifier_call_chain(&task_free_notifier, 0, task);
read_unlock(&handoff_lock);
return (ret == NOTIFY_OK) ? 1 : 0;
}
void profile_munmap(unsigned long addr)
{
down_read(&profile_rwsem);
notifier_call_chain(&munmap_notifier, 0, (void *)addr);
up_read(&profile_rwsem);
}
int task_handoff_register(struct notifier_block * n)
{
int err = -EINVAL;
write_lock(&handoff_lock);
err = notifier_chain_register(&task_free_notifier, n);
write_unlock(&handoff_lock);
return err;
}
int task_handoff_unregister(struct notifier_block * n)
{
int err = -EINVAL;
write_lock(&handoff_lock);
err = notifier_chain_unregister(&task_free_notifier, n);
write_unlock(&handoff_lock);
return err;
}
int profile_event_register(enum profile_type type, struct notifier_block * n)
{
int err = -EINVAL;
down_write(&profile_rwsem);
switch (type) {
case PROFILE_TASK_EXIT:
err = notifier_chain_register(&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = notifier_chain_register(&munmap_notifier, n);
break;
}
up_write(&profile_rwsem);
return err;
}
int profile_event_unregister(enum profile_type type, struct notifier_block * n)
{
int err = -EINVAL;
down_write(&profile_rwsem);
switch (type) {
case PROFILE_TASK_EXIT:
err = notifier_chain_unregister(&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = notifier_chain_unregister(&munmap_notifier, n);
break;
}
up_write(&profile_rwsem);
return err;
}
int register_timer_hook(int (*hook)(struct pt_regs *))
{
if (timer_hook)
return -EBUSY;
timer_hook = hook;
return 0;
}
void unregister_timer_hook(int (*hook)(struct pt_regs *))
{
WARN_ON(hook != timer_hook);
timer_hook = NULL;
/* make sure all CPUs see the NULL hook */
synchronize_kernel();
}
EXPORT_SYMBOL_GPL(register_timer_hook);
EXPORT_SYMBOL_GPL(unregister_timer_hook);
EXPORT_SYMBOL_GPL(task_handoff_register);
EXPORT_SYMBOL_GPL(task_handoff_unregister);
#endif /* CONFIG_PROFILING */
EXPORT_SYMBOL_GPL(profile_event_register);
EXPORT_SYMBOL_GPL(profile_event_unregister);
#ifdef CONFIG_SMP
/*
* Each cpu has a pair of open-addressed hashtables for pending
* profile hits. read_profile() IPIs all cpus to request that they
* flip buffers, and flushes their contents to prof_buffer itself.
* Flip requests are serialized by the profile_flip_mutex. The sole
* use of having a second hashtable is for avoiding cacheline
* contention that would otherwise happen during flushes of pending
* profile hits required for the accuracy of reported profile hits
* and so resurrect the interrupt livelock issue.
*
* The open-addressed hashtables are indexed by profile buffer slot
* and hold the number of pending hits to that profile buffer slot on
* a cpu in an entry. When the hashtable overflows, all pending hits
* are accounted to their corresponding profile buffer slots with
* atomic_add() and the hashtable emptied. As numerous pending hits
* may be accounted to a profile buffer slot in a hashtable entry,
* this amortizes a number of atomic profile buffer increments likely
* to be far larger than the number of entries in the hashtable,
* particularly given that the number of distinct profile buffer
* positions to which hits are accounted during short intervals (e.g.
* several seconds) is usually very small. Exclusion from buffer
* flipping is provided by interrupt disablement (note that for
* SCHED_PROFILING profile_hit() may be called from process context).
* The hash function is meant to be lightweight as opposed to strong,
* and was vaguely inspired by ppc64 firmware-supported inverted
* pagetable hash functions, but uses a full hashtable full of finite
* collision chains, not just pairs of them.
*
* -- wli
*/
static void __profile_flip_buffers(void *unused)
{
int cpu = smp_processor_id();
per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
}
static void profile_flip_buffers(void)
{
int i, j, cpu;
down(&profile_flip_mutex);
j = per_cpu(cpu_profile_flip, get_cpu());
put_cpu();
on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
for_each_online_cpu(cpu) {
struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
for (i = 0; i < NR_PROFILE_HIT; ++i) {
if (!hits[i].hits) {
if (hits[i].pc)
hits[i].pc = 0;
continue;
}
atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
hits[i].hits = hits[i].pc = 0;
}
}
up(&profile_flip_mutex);
}
static void profile_discard_flip_buffers(void)
{
int i, cpu;
down(&profile_flip_mutex);
i = per_cpu(cpu_profile_flip, get_cpu());
put_cpu();
on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
for_each_online_cpu(cpu) {
struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
}
up(&profile_flip_mutex);
}
void profile_hit(int type, void *__pc)
{
unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
int i, j, cpu;
struct profile_hit *hits;
if (prof_on != type || !prof_buffer)
return;
pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
cpu = get_cpu();
hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
if (!hits) {
put_cpu();
return;
}
local_irq_save(flags);
do {
for (j = 0; j < PROFILE_GRPSZ; ++j) {
if (hits[i + j].pc == pc) {
hits[i + j].hits++;
goto out;
} else if (!hits[i + j].hits) {
hits[i + j].pc = pc;
hits[i + j].hits = 1;
goto out;
}
}
i = (i + secondary) & (NR_PROFILE_HIT - 1);
} while (i != primary);
atomic_inc(&prof_buffer[pc]);
for (i = 0; i < NR_PROFILE_HIT; ++i) {
atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
hits[i].pc = hits[i].hits = 0;
}
out:
local_irq_restore(flags);
put_cpu();
}
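/*
* Worked example (illustrative): a hit on profile slot pc first probes
* the PROFILE_GRPSZ entries of its primary group i, then the groups at
* i + secondary, i + 2*secondary, ... (mod NR_PROFILE_HIT), stopping at
* a matching pc or a free entry; only when it wraps back to the primary
* group does it fall through to the atomic flush path above.
*/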
#ifdef CONFIG_HOTPLUG_CPU
static int __devinit profile_cpu_callback(struct notifier_block *info,
unsigned long action, void *__cpu)
{
int node, cpu = (unsigned long)__cpu;
struct page *page;
switch (action) {
case CPU_UP_PREPARE:
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
return NOTIFY_BAD;
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_free;
per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
}
break;
out_free:
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
per_cpu(cpu_profile_hits, cpu)[1] = NULL;
__free_page(page);
return NOTIFY_BAD;
case CPU_ONLINE:
cpu_set(cpu, prof_cpu_mask);
break;
case CPU_UP_CANCELED:
case CPU_DEAD:
cpu_clear(cpu, prof_cpu_mask);
if (per_cpu(cpu_profile_hits, cpu)[0]) {
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
per_cpu(cpu_profile_hits, cpu)[0] = NULL;
__free_page(page);
}
if (per_cpu(cpu_profile_hits, cpu)[1]) {
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
per_cpu(cpu_profile_hits, cpu)[1] = NULL;
__free_page(page);
}
break;
}
return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#else /* !CONFIG_SMP */
#define profile_flip_buffers() do { } while (0)
#define profile_discard_flip_buffers() do { } while (0)
void profile_hit(int type, void *__pc)
{
unsigned long pc;
if (prof_on != type || !prof_buffer)
return;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
void profile_tick(int type, struct pt_regs *regs)
{
if (type == CPU_PROFILING && timer_hook)
timer_hook(regs);
if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
profile_hit(type, (void *)profile_pc(regs));
}
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>
static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
int count, int *eof, void *data)
{
int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
if (count - len < 2)
return -EINVAL;
len += sprintf(page + len, "\n");
return len;
}
static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
unsigned long count, void *data)
{
cpumask_t *mask = (cpumask_t *)data;
unsigned long full_count = count, err;
cpumask_t new_value;
err = cpumask_parse(buffer, count, new_value);
if (err)
return err;
*mask = new_value;
return full_count;
}
void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
{
struct proc_dir_entry *entry;
/* create /proc/irq/prof_cpu_mask */
if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
return;
entry->nlink = 1;
entry->data = (void *)&prof_cpu_mask;
entry->read_proc = prof_cpu_mask_read_proc;
entry->write_proc = prof_cpu_mask_write_proc;
}
/*
* This function accesses profiling information. The returned data is
* binary: the sampling step and the actual contents of the profile
* buffer. Use of the program readprofile is recommended in order to
* get meaningful info out of these data.
*/
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
unsigned long p = *ppos;
ssize_t read;
char * pnt;
unsigned int sample_step = 1 << prof_shift;
profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
return 0;
if (count > (prof_len+1)*sizeof(unsigned int) - p)
count = (prof_len+1)*sizeof(unsigned int) - p;
read = 0;
while (p < sizeof(unsigned int) && count > 0) {
put_user(*((char *)(&sample_step)+p),buf);
buf++; p++; count--; read++;
}
pnt = (char *)prof_buffer + p - sizeof(atomic_t);
if (copy_to_user(buf,(void *)pnt,count))
return -EFAULT;
read += count;
*ppos += read;
return read;
}
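/*
* Illustrative usage from userspace: the readprofile(1) utility decodes
* this layout, e.g. "readprofile -m /boot/System.map" (exact flags per
* the local man page).
*/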
/*
* Writing to /proc/profile resets the counters
*
* Writing a 'profiling multiplier' value into it also re-sets the profiling
* interrupt frequency, on architectures that support this.
*/
static ssize_t write_profile(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
extern int setup_profiling_timer (unsigned int multiplier);
if (count == sizeof(int)) {
unsigned int multiplier;
if (copy_from_user(&multiplier, buf, sizeof(int)))
return -EFAULT;
if (setup_profiling_timer(multiplier))
return -EINVAL;
}
#endif
profile_discard_flip_buffers();
memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
return count;
}
static struct file_operations proc_profile_operations = {
.read = read_profile,
.write = write_profile,
};
#ifdef CONFIG_SMP
static void __init profile_nop(void *unused)
{
}
static int __init create_hash_tables(void)
{
int cpu;
for_each_online_cpu(cpu) {
int node = cpu_to_node(cpu);
struct page *page;
page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[0]
= (struct profile_hit *)page_address(page);
}
return 0;
out_cleanup:
prof_on = 0;
mb();
on_each_cpu(profile_nop, NULL, 0, 1);
for_each_online_cpu(cpu) {
struct page *page;
if (per_cpu(cpu_profile_hits, cpu)[0]) {
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
per_cpu(cpu_profile_hits, cpu)[0] = NULL;
__free_page(page);
}
if (per_cpu(cpu_profile_hits, cpu)[1]) {
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
per_cpu(cpu_profile_hits, cpu)[1] = NULL;
__free_page(page);
}
}
return -1;
}
#else
#define create_hash_tables() ({ 0; })
#endif
static int __init create_proc_profile(void)
{
struct proc_dir_entry *entry;
if (!prof_on)
return 0;
if (create_hash_tables())
return -1;
if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
return 0;
entry->proc_fops = &proc_profile_operations;
entry->size = (1+prof_len) * sizeof(atomic_t);
hotcpu_notifier(profile_cpu_callback, 0);
return 0;
}
module_init(create_proc_profile);
#endif /* CONFIG_PROC_FS */

389
kernel/ptrace.c Normal file

@@ -0,0 +1,389 @@
/*
* linux/kernel/ptrace.c
*
* (C) Copyright 1999 Linus Torvalds
*
* Common interfaces for "ptrace()" which we do not want
* to continually duplicate across every architecture.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
void __ptrace_link(task_t *child, task_t *new_parent)
{
if (!list_empty(&child->ptrace_list))
BUG();
if (child->parent == new_parent)
return;
list_add(&child->ptrace_list, &child->parent->ptrace_children);
REMOVE_LINKS(child);
child->parent = new_parent;
SET_LINKS(child);
}
/*
* Turn a tracing stop into a normal stop now, since with no tracer there
* would be no way to wake it up with SIGCONT or SIGKILL. If there was a
* signal sent that would resume the child, but didn't because it was in
* TASK_TRACED, resume it now.
* Requires that irqs be disabled.
*/
void ptrace_untrace(task_t *child)
{
spin_lock(&child->sighand->siglock);
if (child->state == TASK_TRACED) {
if (child->signal->flags & SIGNAL_STOP_STOPPED) {
child->state = TASK_STOPPED;
} else {
signal_wake_up(child, 1);
}
}
spin_unlock(&child->sighand->siglock);
}
/*
* unptrace a task: move it back to its original parent and
* remove it from the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
void __ptrace_unlink(task_t *child)
{
if (!child->ptrace)
BUG();
child->ptrace = 0;
if (!list_empty(&child->ptrace_list)) {
list_del_init(&child->ptrace_list);
REMOVE_LINKS(child);
child->parent = child->real_parent;
SET_LINKS(child);
}
if (child->state == TASK_TRACED)
ptrace_untrace(child);
}
/*
* Check that we have indeed attached to the thing..
*/
int ptrace_check_attach(struct task_struct *child, int kill)
{
int ret = -ESRCH;
/*
* We take the read lock around doing both checks to close a
* possible race where someone else was tracing our child and
* detached between these two checks. After this locked check,
* we are sure that this is our traced child and that can only
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
if ((child->ptrace & PT_PTRACED) && child->parent == current &&
(!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
&& child->signal != NULL) {
ret = 0;
spin_lock_irq(&child->sighand->siglock);
if (child->state == TASK_STOPPED) {
child->state = TASK_TRACED;
} else if (child->state != TASK_TRACED && !kill) {
ret = -ESRCH;
}
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
if (!ret && !kill) {
wait_task_inactive(child);
}
/* All systems go.. */
return ret;
}
int ptrace_attach(struct task_struct *task)
{
int retval;
task_lock(task);
retval = -EPERM;
if (task->pid <= 1)
goto bad;
if (task == current)
goto bad;
if (!task->mm)
goto bad;
if (((current->uid != task->euid) ||
(current->uid != task->suid) ||
(current->uid != task->uid) ||
(current->gid != task->egid) ||
(current->gid != task->sgid) ||
(current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
goto bad;
rmb();
if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
goto bad;
/* the same process cannot be attached many times */
if (task->ptrace & PT_PTRACED)
goto bad;
retval = security_ptrace(current, task);
if (retval)
goto bad;
/* Go */
task->ptrace |= PT_PTRACED | ((task->real_parent != current)
? PT_ATTACHED : 0);
if (capable(CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
task_unlock(task);
write_lock_irq(&tasklist_lock);
__ptrace_link(task, current);
write_unlock_irq(&tasklist_lock);
force_sig_specific(SIGSTOP, task);
return 0;
bad:
task_unlock(task);
return retval;
}
int ptrace_detach(struct task_struct *child, unsigned int data)
{
if ((unsigned long) data > _NSIG)
return -EIO;
/* Architecture-specific hardware disable .. */
ptrace_disable(child);
/* .. re-parent .. */
child->exit_code = data;
write_lock_irq(&tasklist_lock);
__ptrace_unlink(child);
/* .. and wake it up. */
if (child->exit_state != EXIT_ZOMBIE)
wake_up_process(child);
write_unlock_irq(&tasklist_lock);
return 0;
}
/*
* Access another process' address space.
* Source/target buffer must be kernel space,
* Do not walk the page table directly, use get_user_pages
*/
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
struct mm_struct *mm;
struct vm_area_struct *vma;
struct page *page;
void *old_buf = buf;
mm = get_task_mm(tsk);
if (!mm)
return 0;
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
ret = get_user_pages(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0)
break;
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
kunmap(page);
page_cache_release(page);
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);
mmput(mm);
return buf - old_buf;
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
int copied = 0;
while (len > 0) {
char buf[128];
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
retval = access_process_vm(tsk, src, buf, this_len, 0);
if (!retval) {
if (copied)
break;
return -EIO;
}
if (copy_to_user(dst, buf, retval))
return -EFAULT;
copied += retval;
src += retval;
dst += retval;
len -= retval;
}
return copied;
}
int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
{
int copied = 0;
while (len > 0) {
char buf[128];
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
retval = access_process_vm(tsk, dst, buf, this_len, 1);
if (!retval) {
if (copied)
break;
return -EIO;
}
copied += retval;
src += retval;
dst += retval;
len -= retval;
}
return copied;
}
static int ptrace_setoptions(struct task_struct *child, long data)
{
child->ptrace &= ~PT_TRACE_MASK;
if (data & PTRACE_O_TRACESYSGOOD)
child->ptrace |= PT_TRACESYSGOOD;
if (data & PTRACE_O_TRACEFORK)
child->ptrace |= PT_TRACE_FORK;
if (data & PTRACE_O_TRACEVFORK)
child->ptrace |= PT_TRACE_VFORK;
if (data & PTRACE_O_TRACECLONE)
child->ptrace |= PT_TRACE_CLONE;
if (data & PTRACE_O_TRACEEXEC)
child->ptrace |= PT_TRACE_EXEC;
if (data & PTRACE_O_TRACEVFORKDONE)
child->ptrace |= PT_TRACE_VFORK_DONE;
if (data & PTRACE_O_TRACEEXIT)
child->ptrace |= PT_TRACE_EXIT;
return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
}
static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
{
siginfo_t lastinfo;
int error = -ESRCH;
read_lock(&tasklist_lock);
if (likely(child->sighand != NULL)) {
error = -EINVAL;
spin_lock_irq(&child->sighand->siglock);
if (likely(child->last_siginfo != NULL)) {
lastinfo = *child->last_siginfo;
error = 0;
}
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
if (!error)
return copy_siginfo_to_user(data, &lastinfo);
return error;
}
static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
{
siginfo_t newinfo;
int error = -ESRCH;
if (copy_from_user(&newinfo, data, sizeof (siginfo_t)))
return -EFAULT;
read_lock(&tasklist_lock);
if (likely(child->sighand != NULL)) {
error = -EINVAL;
spin_lock_irq(&child->sighand->siglock);
if (likely(child->last_siginfo != NULL)) {
*child->last_siginfo = newinfo;
error = 0;
}
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
return error;
}
int ptrace_request(struct task_struct *child, long request,
long addr, long data)
{
int ret = -EIO;
switch (request) {
#ifdef PTRACE_OLDSETOPTIONS
case PTRACE_OLDSETOPTIONS:
#endif
case PTRACE_SETOPTIONS:
ret = ptrace_setoptions(child, data);
break;
case PTRACE_GETEVENTMSG:
ret = put_user(child->ptrace_message, (unsigned long __user *) data);
break;
case PTRACE_GETSIGINFO:
ret = ptrace_getsiginfo(child, (siginfo_t __user *) data);
break;
case PTRACE_SETSIGINFO:
ret = ptrace_setsiginfo(child, (siginfo_t __user *) data);
break;
default:
break;
}
return ret;
}
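/*
* Illustrative userspace sketch: a tracer attached to "pid" can enable
* fork tracing and distinguishable syscall stops via the option bits
* handled above.
*
*	ptrace(PTRACE_SETOPTIONS, pid, NULL,
*	       PTRACE_O_TRACEFORK | PTRACE_O_TRACESYSGOOD);
*/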

470
kernel/rcupdate.c Normal file

@@ -0,0 +1,470 @@
/*
* Read-Copy Update mechanism for mutual exclusion
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
* Based on the original work by Paul McKenney <paulmck@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* http://lse.sourceforge.net/locking/rcupdate.html
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
/* Definition for rcupdate control block. */
struct rcu_ctrlblk rcu_ctrlblk =
{ .cur = -300, .completed = -300 };
struct rcu_ctrlblk rcu_bh_ctrlblk =
{ .cur = -300, .completed = -300 };
/* Bookkeeping of the progress of the grace period */
struct rcu_state {
spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */
cpumask_t cpumask; /* CPUs that need to switch in order */
/* for current batch to proceed. */
};
static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
{.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
{.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
/* Fake initialization required by compiler */
static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
static int maxbatch = 10;
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
void fastcall call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
local_irq_restore(flags);
}
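/*
* Typical usage (illustrative; struct foo and foo_free_rcu() are made
* up): embed a struct rcu_head in the protected object and free the
* object from the callback once a grace period has elapsed.
*
*	static void foo_free_rcu(struct rcu_head *head)
*	{
*		kfree(container_of(head, struct foo, rcu));
*	}
*
*	list_del_rcu(&f->list);
*	call_rcu(&f->rcu, foo_free_rcu);
*/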
/**
* call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_bh() assumes
* that the read-side critical sections end on completion of a softirq
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
* RCU read-side critical sections are delimited by rcu_read_lock() and
* rcu_read_unlock() if in interrupt context, or by rcu_read_lock_bh()
* and rcu_read_unlock_bh() if in process context. These may be nested.
*/
void fastcall call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_bh_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
local_irq_restore(flags);
}
/*
* Invoke the completed RCU callbacks. They are expected to be in
* a per-cpu list.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
struct rcu_head *next, *list;
int count = 0;
list = rdp->donelist;
while (list) {
next = rdp->donelist = list->next;
list->func(list);
list = next;
if (++count >= maxbatch)
break;
}
if (!rdp->donelist)
rdp->donetail = &rdp->donelist;
else
tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
}
/*
* Grace period handling:
* The grace period handling consists of two steps:
* - A new grace period is started.
* This is done by rcu_start_batch. The start is not broadcast to
* all cpus; they must pick it up by comparing rcp->cur with
* rdp->quiescbatch. All cpus are recorded in the
* rcu_state.cpumask bitmap.
* - All cpus must go through a quiescent state.
* Since the start of the grace period is not broadcast, at least two
* calls to rcu_check_quiescent_state are required:
* The first call just notices that a new grace period is running. The
* following calls check if there was a quiescent state since the beginning
* of the grace period. If so, it updates rcu_state.cpumask. If
* the bitmap is empty, then the grace period is completed.
* rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
* period (if necessary).
*/
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
* Caller must hold rcu_state.lock.
*/
static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp,
int next_pending)
{
if (next_pending)
rcp->next_pending = 1;
if (rcp->next_pending &&
rcp->completed == rcp->cur) {
/* Can't change, since spin lock held. */
cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
rcp->next_pending = 0;
/* next_pending == 0 must be visible in __rcu_process_callbacks()
* before it can see new value of cur.
*/
smp_wmb();
rcp->cur++;
}
}
/*
* A cpu went through a quiescent state since the beginning of the grace
* period. Clear it from the cpu mask and complete the grace period if it
* was the last cpu. Start another grace period if someone has further
* entries pending.
*/
static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
{
cpu_clear(cpu, rsp->cpumask);
if (cpus_empty(rsp->cpumask)) {
/* batch completed ! */
rcp->completed = rcp->cur;
rcu_start_batch(rcp, rsp, 0);
}
}
/*
* Check if the cpu has gone through a quiescent state (say context
* switch). If so and if it already hasn't done so in this RCU
* quiescent cycle, then indicate that it has done so.
*/
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
struct rcu_state *rsp, struct rcu_data *rdp)
{
if (rdp->quiescbatch != rcp->cur) {
/* start new grace period: */
rdp->qs_pending = 1;
rdp->passed_quiesc = 0;
rdp->quiescbatch = rcp->cur;
return;
}
/* Grace period already completed for this cpu?
* qs_pending is checked instead of the actual bitmap to avoid
* cacheline thrashing.
*/
if (!rdp->qs_pending)
return;
/*
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
if (!rdp->passed_quiesc)
return;
rdp->qs_pending = 0;
spin_lock(&rsp->lock);
/*
* rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
* during cpu startup. Ignore the quiescent state.
*/
if (likely(rdp->quiescbatch == rcp->cur))
cpu_quiet(rdp->cpu, rcp, rsp);
spin_unlock(&rsp->lock);
}
#ifdef CONFIG_HOTPLUG_CPU
/* Warning! This is a helper for rcu_offline_cpu; do not use it elsewhere
* without reviewing the locking requirements: the list it pulls from has
* to belong to a cpu which is dead and hence not processing interrupts.
*/
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
struct rcu_head **tail)
{
local_irq_disable();
*this_rdp->nxttail = list;
if (list)
this_rdp->nxttail = tail;
local_irq_enable();
}
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp)
{
/* if the cpu going offline owns the grace period
* we can block indefinitely waiting for it, so flush
* it here
*/
spin_lock_bh(&rsp->lock);
if (rcp->cur != rcp->completed)
cpu_quiet(rdp->cpu, rcp, rsp);
spin_unlock_bh(&rsp->lock);
rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
}
static void rcu_offline_cpu(int cpu)
{
struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
__rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state,
&per_cpu(rcu_data, cpu));
__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state,
&per_cpu(rcu_bh_data, cpu));
put_cpu_var(rcu_data);
put_cpu_var(rcu_bh_data);
tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
}
#else
static void rcu_offline_cpu(int cpu)
{
}
#endif
/*
* This does the RCU processing work from tasklet context.
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
struct rcu_state *rsp, struct rcu_data *rdp)
{
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
*rdp->donetail = rdp->curlist;
rdp->donetail = rdp->curtail;
rdp->curlist = NULL;
rdp->curtail = &rdp->curlist;
}
local_irq_disable();
if (rdp->nxtlist && !rdp->curlist) {
rdp->curlist = rdp->nxtlist;
rdp->curtail = rdp->nxttail;
rdp->nxtlist = NULL;
rdp->nxttail = &rdp->nxtlist;
local_irq_enable();
/*
* start the next batch of callbacks
*/
/* determine batch number */
rdp->batch = rcp->cur + 1;
/* see the comment and corresponding wmb() in
* the rcu_start_batch()
*/
smp_rmb();
if (!rcp->next_pending) {
/* and start it/schedule start if it's a new batch */
spin_lock(&rsp->lock);
rcu_start_batch(rcp, rsp, 1);
spin_unlock(&rsp->lock);
}
} else {
local_irq_enable();
}
rcu_check_quiescent_state(rcp, rsp, rdp);
if (rdp->donelist)
rcu_do_batch(rdp);
}
static void rcu_process_callbacks(unsigned long unused)
{
__rcu_process_callbacks(&rcu_ctrlblk, &rcu_state,
&__get_cpu_var(rcu_data));
__rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state,
&__get_cpu_var(rcu_bh_data));
}
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
(idle_cpu(cpu) && !in_softirq() &&
hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
rcu_qsctr_inc(cpu);
rcu_bh_qsctr_inc(cpu);
} else if (!in_softirq())
rcu_bh_qsctr_inc(cpu);
tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
}
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
memset(rdp, 0, sizeof(*rdp));
rdp->curtail = &rdp->curlist;
rdp->nxttail = &rdp->nxtlist;
rdp->donetail = &rdp->donelist;
rdp->quiescbatch = rcp->completed;
rdp->qs_pending = 0;
rdp->cpu = cpu;
}
static void __devinit rcu_online_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
}
static int __devinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
rcu_online_cpu(cpu);
break;
case CPU_DEAD:
rcu_offline_cpu(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __devinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
/*
 * Initializes the RCU mechanism. Assumed to be called early, i.e.
 * before the local timer (SMP) or the jiffies timer (uniprocessor) is set up.
* Note that rcu_qsctr and friends are implicitly
* initialized due to the choice of ``0'' for RCU_CTR_INVALID.
*/
void __init rcu_init(void)
{
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
}
struct rcu_synchronize {
struct rcu_head head;
struct completion completion;
};
/* Because of the FASTCALL declaration of complete(), we use this wrapper */
static void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
rcu = container_of(head, struct rcu_synchronize, head);
complete(&rcu->completion);
}
/**
* synchronize_kernel - wait until a grace period has elapsed.
*
* Control will return to the caller some time after a full grace
* period has elapsed, in other words after all currently executing RCU
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
void synchronize_kernel(void)
{
struct rcu_synchronize rcu;
init_completion(&rcu.completion);
/* Will wake me after RCU finished */
call_rcu(&rcu.head, wakeme_after_rcu);
/* Wait for it */
wait_for_completion(&rcu.completion);
}
module_param(maxbatch, int, 0);
EXPORT_SYMBOL_GPL(call_rcu);
EXPORT_SYMBOL_GPL(call_rcu_bh);
EXPORT_SYMBOL_GPL(synchronize_kernel);
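/*
 * Illustrative sketch (not part of the original file): a typical RCU
 * update-side pattern pairing list_del_rcu() with call_rcu().  The
 * struct and both functions below are hypothetical; assumes
 * <linux/list.h> and <linux/slab.h>.
 */
struct rcu_demo_node {
	struct list_head list;
	int data;
	struct rcu_head rcu;
};
static void rcu_demo_reclaim(struct rcu_head *head)
{
	/* runs after a grace period, when no reader can still see the node */
	kfree(container_of(head, struct rcu_demo_node, rcu));
}
static void rcu_demo_remove(struct rcu_demo_node *p)
{
	list_del_rcu(&p->list);			/* unlink under the update-side lock */
	call_rcu(&p->rcu, rcu_demo_reclaim);	/* defer the free */
}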

551
kernel/resource.c Normal file

@@ -0,0 +1,551 @@
/*
* linux/kernel/resource.c
*
* Copyright (C) 1999 Linus Torvalds
* Copyright (C) 1999 Martin Mares <mj@ucw.cz>
*
* Arbitrary resource management.
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <asm/io.h>
struct resource ioport_resource = {
.name = "PCI IO",
.start = 0x0000,
.end = IO_SPACE_LIMIT,
.flags = IORESOURCE_IO,
};
EXPORT_SYMBOL(ioport_resource);
struct resource iomem_resource = {
.name = "PCI mem",
.start = 0UL,
.end = ~0UL,
.flags = IORESOURCE_MEM,
};
EXPORT_SYMBOL(iomem_resource);
static DEFINE_RWLOCK(resource_lock);
#ifdef CONFIG_PROC_FS
enum { MAX_IORES_LEVEL = 5 };
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
(*pos)++;
if (p->child)
return p->child;
while (!p->sibling && p->parent)
p = p->parent;
return p->sibling;
}
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
struct resource *p = m->private;
loff_t l = 0;
read_lock(&resource_lock);
for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
;
return p;
}
static void r_stop(struct seq_file *m, void *v)
__releases(resource_lock)
{
read_unlock(&resource_lock);
}
static int r_show(struct seq_file *m, void *v)
{
struct resource *root = m->private;
struct resource *r = v, *p;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
seq_printf(m, "%*s%0*lx-%0*lx : %s\n",
depth * 2, "",
width, r->start,
width, r->end,
r->name ? r->name : "<BAD>");
return 0;
}
static struct seq_operations resource_op = {
.start = r_start,
.next = r_next,
.stop = r_stop,
.show = r_show,
};
static int ioports_open(struct inode *inode, struct file *file)
{
int res = seq_open(file, &resource_op);
if (!res) {
struct seq_file *m = file->private_data;
m->private = &ioport_resource;
}
return res;
}
static int iomem_open(struct inode *inode, struct file *file)
{
int res = seq_open(file, &resource_op);
if (!res) {
struct seq_file *m = file->private_data;
m->private = &iomem_resource;
}
return res;
}
static struct file_operations proc_ioports_operations = {
.open = ioports_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static struct file_operations proc_iomem_operations = {
.open = iomem_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init ioresources_init(void)
{
struct proc_dir_entry *entry;
entry = create_proc_entry("ioports", 0, NULL);
if (entry)
entry->proc_fops = &proc_ioports_operations;
entry = create_proc_entry("iomem", 0, NULL);
if (entry)
entry->proc_fops = &proc_iomem_operations;
return 0;
}
__initcall(ioresources_init);
#endif /* CONFIG_PROC_FS */
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
unsigned long start = new->start;
unsigned long end = new->end;
struct resource *tmp, **p;
if (end < start)
return root;
if (start < root->start)
return root;
if (end > root->end)
return root;
p = &root->child;
for (;;) {
tmp = *p;
if (!tmp || tmp->start > end) {
new->sibling = tmp;
*p = new;
new->parent = root;
return NULL;
}
p = &tmp->sibling;
if (tmp->end < start)
continue;
return tmp;
}
}
static int __release_resource(struct resource *old)
{
struct resource *tmp, **p;
p = &old->parent->child;
for (;;) {
tmp = *p;
if (!tmp)
break;
if (tmp == old) {
*p = tmp->sibling;
old->parent = NULL;
return 0;
}
p = &tmp->sibling;
}
return -EINVAL;
}
int request_resource(struct resource *root, struct resource *new)
{
struct resource *conflict;
write_lock(&resource_lock);
conflict = __request_resource(root, new);
write_unlock(&resource_lock);
return conflict ? -EBUSY : 0;
}
EXPORT_SYMBOL(request_resource);
struct resource *____request_resource(struct resource *root, struct resource *new)
{
struct resource *conflict;
write_lock(&resource_lock);
conflict = __request_resource(root, new);
write_unlock(&resource_lock);
return conflict;
}
EXPORT_SYMBOL(____request_resource);
int release_resource(struct resource *old)
{
int retval;
write_lock(&resource_lock);
retval = __release_resource(old);
write_unlock(&resource_lock);
return retval;
}
EXPORT_SYMBOL(release_resource);
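/*
 * Illustrative sketch (not part of the original file): claiming a fixed
 * MMIO window under iomem_resource and releasing it again.  The name,
 * address range and function below are hypothetical.
 */
static struct resource demo_res = {
	.name	= "demo-device",
	.start	= 0xfe000000,
	.end	= 0xfe000fff,
	.flags	= IORESOURCE_MEM,
};
static int demo_claim_window(void)
{
	if (request_resource(&iomem_resource, &demo_res))
		return -EBUSY;		/* range already owned by someone */
	/* ... program the device ... */
	release_resource(&demo_res);
	return 0;
}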
/*
* Find empty slot in the resource tree given range and alignment.
*/
static int find_resource(struct resource *root, struct resource *new,
unsigned long size,
unsigned long min, unsigned long max,
unsigned long align,
void (*alignf)(void *, struct resource *,
unsigned long, unsigned long),
void *alignf_data)
{
struct resource *this = root->child;
new->start = root->start;
/*
* Skip past an allocated resource that starts at 0, since the assignment
* of this->start - 1 to new->end below would cause an underflow.
*/
if (this && this->start == 0) {
new->start = this->end + 1;
this = this->sibling;
}
for(;;) {
if (this)
new->end = this->start - 1;
else
new->end = root->end;
if (new->start < min)
new->start = min;
if (new->end > max)
new->end = max;
new->start = (new->start + align - 1) & ~(align - 1);
if (alignf)
alignf(alignf_data, new, size, align);
if (new->start < new->end && new->end - new->start + 1 >= size) {
new->end = new->start + size - 1;
return 0;
}
if (!this)
break;
new->start = this->end + 1;
this = this->sibling;
}
return -EBUSY;
}
/*
* Allocate empty slot in the resource tree given range and alignment.
*/
int allocate_resource(struct resource *root, struct resource *new,
unsigned long size,
unsigned long min, unsigned long max,
unsigned long align,
void (*alignf)(void *, struct resource *,
unsigned long, unsigned long),
void *alignf_data)
{
int err;
write_lock(&resource_lock);
err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
return err;
}
EXPORT_SYMBOL(allocate_resource);
/**
* insert_resource - Inserts a resource in the resource tree
* @parent: parent of the new resource
* @new: new resource to insert
*
* Returns 0 on success, -EBUSY if the resource can't be inserted.
*
 * This function is equivalent to request_resource() when no conflict
 * happens. If a conflict happens, and the conflicting resources
 * entirely fit within the range of the new resource, then the new
 * resource is inserted and the conflicting resources become children of
 * the new resource. Otherwise the new resource becomes the child of
 * the conflicting resource.
*/
int insert_resource(struct resource *parent, struct resource *new)
{
int result;
struct resource *first, *next;
write_lock(&resource_lock);
begin:
result = 0;
first = __request_resource(parent, new);
if (!first)
goto out;
result = -EBUSY;
if (first == parent)
goto out;
/* Resource fully contained by the clashing resource? Recurse into it */
if (first->start <= new->start && first->end >= new->end) {
parent = first;
goto begin;
}
for (next = first; ; next = next->sibling) {
/* Partial overlap? Bad, and unfixable */
if (next->start < new->start || next->end > new->end)
goto out;
if (!next->sibling)
break;
if (next->sibling->start > new->end)
break;
}
result = 0;
new->parent = parent;
new->sibling = next->sibling;
new->child = first;
next->sibling = NULL;
for (next = first; next; next = next->sibling)
next->parent = new;
if (parent->child == first) {
parent->child = new;
} else {
next = parent->child;
while (next->sibling != first)
next = next->sibling;
next->sibling = new;
}
out:
write_unlock(&resource_lock);
return result;
}
EXPORT_SYMBOL(insert_resource);
/*
* Given an existing resource, change its start and size to match the
* arguments. Returns -EBUSY if it can't fit. Existing children of
* the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, unsigned long start, unsigned long size)
{
struct resource *tmp, *parent = res->parent;
unsigned long end = start + size - 1;
int result = -EBUSY;
write_lock(&resource_lock);
if ((start < parent->start) || (end > parent->end))
goto out;
for (tmp = res->child; tmp; tmp = tmp->sibling) {
if ((tmp->start < start) || (tmp->end > end))
goto out;
}
if (res->sibling && (res->sibling->start <= end))
goto out;
tmp = parent->child;
if (tmp != res) {
while (tmp->sibling != res)
tmp = tmp->sibling;
if (start <= tmp->end)
goto out;
}
res->start = start;
res->end = end;
result = 0;
out:
write_unlock(&resource_lock);
return result;
}
EXPORT_SYMBOL(adjust_resource);
/*
* This is compatibility stuff for IO resources.
*
* Note how this, unlike the above, knows about
* the IO flag meanings (busy etc).
*
* Request-region creates a new busy region.
*
* Check-region returns non-zero if the area is already busy
*
* Release-region releases a matching busy region.
*/
struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
{
struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
if (res) {
memset(res, 0, sizeof(*res));
res->name = name;
res->start = start;
res->end = start + n - 1;
res->flags = IORESOURCE_BUSY;
write_lock(&resource_lock);
for (;;) {
struct resource *conflict;
conflict = __request_resource(parent, res);
if (!conflict)
break;
if (conflict != parent) {
parent = conflict;
if (!(conflict->flags & IORESOURCE_BUSY))
continue;
}
/* Uhhuh, that didn't work out.. */
kfree(res);
res = NULL;
break;
}
write_unlock(&resource_lock);
}
return res;
}
EXPORT_SYMBOL(__request_region);
int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n)
{
struct resource * res;
res = __request_region(parent, start, n, "check-region");
if (!res)
return -EBUSY;
release_resource(res);
kfree(res);
return 0;
}
EXPORT_SYMBOL(__check_region);
void __release_region(struct resource *parent, unsigned long start, unsigned long n)
{
struct resource **p;
unsigned long end;
p = &parent->child;
end = start + n - 1;
write_lock(&resource_lock);
for (;;) {
struct resource *res = *p;
if (!res)
break;
if (res->start <= start && res->end >= end) {
if (!(res->flags & IORESOURCE_BUSY)) {
p = &res->child;
continue;
}
if (res->start != start || res->end != end)
break;
*p = res->sibling;
write_unlock(&resource_lock);
kfree(res);
return;
}
p = &res->sibling;
}
write_unlock(&resource_lock);
printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end);
}
EXPORT_SYMBOL(__release_region);
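/*
 * Illustrative sketch: drivers normally reach these helpers through the
 * request_region()/release_region() wrappers from <linux/ioport.h>.
 * The port base, length and function name below are hypothetical.
 */
static int demo_probe_ports(void)
{
	if (!request_region(0x300, 8, "demo"))
		return -EBUSY;		/* ports are busy, back off */
	/* ... talk to the hardware at 0x300-0x307 ... */
	release_region(0x300, 8);
	return 0;
}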
/*
* Called from init/main.c to reserve IO ports.
*/
#define MAXRESERVE 4
static int __init reserve_setup(char *str)
{
static int reserved;
static struct resource reserve[MAXRESERVE];
for (;;) {
int io_start, io_num;
int x = reserved;
if (get_option (&str, &io_start) != 2)
break;
if (get_option (&str, &io_num) == 0)
break;
if (x < MAXRESERVE) {
struct resource *res = reserve + x;
res->name = "reserved";
res->start = io_start;
res->end = io_start + io_num - 1;
res->flags = IORESOURCE_BUSY;
res->child = NULL;
if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
reserved = x+1;
}
}
return 1;
}
__setup("reserve=", reserve_setup);

5004
kernel/sched.c Normal file

File diff suppressed because it is too large.

56
kernel/seccomp.c Normal file

@@ -0,0 +1,56 @@
/*
* linux/kernel/seccomp.c
*
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
*
* This defines a simple but solid secure-computing mode.
*/
#include <linux/seccomp.h>
#include <linux/sched.h>
/* #define SECCOMP_DEBUG 1 */
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
* To be fully secure this must be combined with rlimit
* to limit the stack allocations too.
*/
static int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
0, /* null terminated */
};
#ifdef TIF_32BIT
static int mode1_syscalls_32[] = {
__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
0, /* null terminated */
};
#endif
void __secure_computing(int this_syscall)
{
int mode = current->seccomp.mode;
int * syscall;
switch (mode) {
case 1:
syscall = mode1_syscalls;
#ifdef TIF_32BIT
if (test_thread_flag(TIF_32BIT))
syscall = mode1_syscalls_32;
#endif
do {
if (*syscall == this_syscall)
return;
} while (*++syscall);
break;
default:
BUG();
}
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
do_exit(SIGKILL);
}
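/*
 * Illustrative sketch (not part of the original file): architectures
 * reach __secure_computing() from their syscall entry path through an
 * inline in <linux/seccomp.h>, along these (simplified) lines:
 *
 *	static inline void secure_computing(int this_syscall)
 *	{
 *		if (unlikely(test_thread_flag(TIF_SECCOMP)))
 *			__secure_computing(this_syscall);
 *	}
 */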

2662
kernel/signal.c Normal file

File diff suppressed because it is too large.

496
kernel/softirq.c Normal file

@@ -0,0 +1,496 @@
/*
* linux/kernel/softirq.c
*
* Copyright (C) 1992 Linus Torvalds
*
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
*/
#include <linux/module.h>
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/kthread.h>
#include <linux/rcupdate.h>
#include <asm/irq.h>
/*
 - No shared variables; all the data are CPU-local.
 - If a softirq needs serialization, let it serialize itself
 by its own spinlocks.
 - Even if a softirq is serialized, only the local cpu is marked for
 execution. Hence, we get something of a weak cpu binding,
 though it is still not clear whether this results in better
 locality or not.
Examples:
- NET RX softirq. It is multithreaded and does not require
any global serialization.
- NET TX softirq. It kicks software netdevice queues, hence
it is logically serialized per device, but this serialization
is invisible to common code.
- Tasklets: serialized wrt itself.
*/
#ifndef __ARCH_IRQ_STAT
irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
EXPORT_SYMBOL(irq_stat);
#endif
static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
/*
 * We cannot loop indefinitely here, to avoid userspace starvation,
 * but we also don't want to introduce a worst-case 1/HZ latency
 * for pending events, so we let the scheduler balance
 * the softirq load for us.
*/
static inline void wakeup_softirqd(void)
{
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __get_cpu_var(ksoftirqd);
if (tsk && tsk->state != TASK_RUNNING)
wake_up_process(tsk);
}
/*
* We restart softirq processing MAX_SOFTIRQ_RESTART times,
* and we fall back to softirqd after that.
*
* This number has been established via experimentation.
 * The two things to balance are latency and fairness -
* we want to handle softirqs as soon as possible, but they
* should not be able to lock up the box.
*/
#define MAX_SOFTIRQ_RESTART 10
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
__u32 pending;
int max_restart = MAX_SOFTIRQ_RESTART;
int cpu;
pending = local_softirq_pending();
local_bh_disable();
cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
local_softirq_pending() = 0;
local_irq_enable();
h = softirq_vec;
do {
if (pending & 1) {
h->action(h);
rcu_bh_qsctr_inc(cpu);
}
h++;
pending >>= 1;
} while (pending);
local_irq_disable();
pending = local_softirq_pending();
if (pending && --max_restart)
goto restart;
if (pending)
wakeup_softirqd();
__local_bh_enable();
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
asmlinkage void do_softirq(void)
{
__u32 pending;
unsigned long flags;
if (in_interrupt())
return;
local_irq_save(flags);
pending = local_softirq_pending();
if (pending)
__do_softirq();
local_irq_restore(flags);
}
EXPORT_SYMBOL(do_softirq);
#endif
void local_bh_enable(void)
{
WARN_ON(irqs_disabled());
/*
* Keep preemption disabled until we are done with
* softirq processing:
*/
sub_preempt_count(SOFTIRQ_OFFSET - 1);
if (unlikely(!in_interrupt() && local_softirq_pending()))
do_softirq();
dec_preempt_count();
preempt_check_resched();
}
EXPORT_SYMBOL(local_bh_enable);
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
# define invoke_softirq() __do_softirq()
#else
# define invoke_softirq() do_softirq()
#endif
/*
* Exit an interrupt context. Process softirqs if needed and possible:
*/
void irq_exit(void)
{
account_system_vtime(current);
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
preempt_enable_no_resched();
}
/*
* This function must run with irqs disabled!
*/
inline fastcall void raise_softirq_irqoff(unsigned int nr)
{
__raise_softirq_irqoff(nr);
/*
* If we're in an interrupt or softirq, we're done
* (this also catches softirq-disabled code). We will
* actually run the softirq once we return from
* the irq or softirq.
*
* Otherwise we wake up ksoftirqd to make sure we
* schedule the softirq soon.
*/
if (!in_interrupt())
wakeup_softirqd();
}
EXPORT_SYMBOL(raise_softirq_irqoff);
void fastcall raise_softirq(unsigned int nr)
{
unsigned long flags;
local_irq_save(flags);
raise_softirq_irqoff(nr);
local_irq_restore(flags);
}
void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
{
softirq_vec[nr].data = data;
softirq_vec[nr].action = action;
}
EXPORT_SYMBOL(open_softirq);
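/*
 * Illustrative sketch: registering and raising a softirq.  DEMO_SOFTIRQ
 * is hypothetical; real slots are the fixed constants in
 * <linux/interrupt.h> (HI_SOFTIRQ, TASKLET_SOFTIRQ, ...).
 */
static void demo_softirq_action(struct softirq_action *a)
{
	/* runs with interrupts enabled but softirqs disabled */
}
/* at init time:      open_softirq(DEMO_SOFTIRQ, demo_softirq_action, NULL); */
/* from irq context:  raise_softirq(DEMO_SOFTIRQ); */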
/* Tasklets */
struct tasklet_head
{
struct tasklet_struct *list;
};
/* Some compilers disobey section attribute on statics when not
initialized -- RR */
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
void fastcall __tasklet_schedule(struct tasklet_struct *t)
{
unsigned long flags;
local_irq_save(flags);
t->next = __get_cpu_var(tasklet_vec).list;
__get_cpu_var(tasklet_vec).list = t;
raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__tasklet_schedule);
void fastcall __tasklet_hi_schedule(struct tasklet_struct *t)
{
unsigned long flags;
local_irq_save(flags);
t->next = __get_cpu_var(tasklet_hi_vec).list;
__get_cpu_var(tasklet_hi_vec).list = t;
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
static void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
local_irq_disable();
list = __get_cpu_var(tasklet_vec).list;
__get_cpu_var(tasklet_vec).list = NULL;
local_irq_enable();
while (list) {
struct tasklet_struct *t = list;
list = list->next;
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
continue;
}
tasklet_unlock(t);
}
local_irq_disable();
t->next = __get_cpu_var(tasklet_vec).list;
__get_cpu_var(tasklet_vec).list = t;
__raise_softirq_irqoff(TASKLET_SOFTIRQ);
local_irq_enable();
}
}
static void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
local_irq_disable();
list = __get_cpu_var(tasklet_hi_vec).list;
__get_cpu_var(tasklet_hi_vec).list = NULL;
local_irq_enable();
while (list) {
struct tasklet_struct *t = list;
list = list->next;
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
continue;
}
tasklet_unlock(t);
}
local_irq_disable();
t->next = __get_cpu_var(tasklet_hi_vec).list;
__get_cpu_var(tasklet_hi_vec).list = t;
__raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_enable();
}
}
void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data)
{
t->next = NULL;
t->state = 0;
atomic_set(&t->count, 0);
t->func = func;
t->data = data;
}
EXPORT_SYMBOL(tasklet_init);
void tasklet_kill(struct tasklet_struct *t)
{
if (in_interrupt())
printk("Attempt to kill tasklet from interrupt\n");
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
do
yield();
while (test_bit(TASKLET_STATE_SCHED, &t->state));
}
tasklet_unlock_wait(t);
clear_bit(TASKLET_STATE_SCHED, &t->state);
}
EXPORT_SYMBOL(tasklet_kill);
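/*
 * Illustrative sketch: declaring, scheduling and tearing down a
 * tasklet.  The names are hypothetical.
 */
static void demo_tasklet_fn(unsigned long data)
{
	/* deferred work; runs in softirq context, never concurrently
	 * with itself */
}
static DECLARE_TASKLET(demo_tasklet, demo_tasklet_fn, 0);
/* from an interrupt handler:  tasklet_schedule(&demo_tasklet); */
/* on module/device teardown:  tasklet_kill(&demo_tasklet);     */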
void __init softirq_init(void)
{
open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
}
static int ksoftirqd(void * __bind_cpu)
{
set_user_nice(current, 19);
current->flags |= PF_NOFREEZE;
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
preempt_disable();
if (!local_softirq_pending()) {
preempt_enable_no_resched();
schedule();
preempt_disable();
}
__set_current_state(TASK_RUNNING);
while (local_softirq_pending()) {
/* Preempt disable stops cpu going offline.
If already offline, we'll be on wrong CPU:
don't process */
if (cpu_is_offline((long)__bind_cpu))
goto wait_to_die;
do_softirq();
preempt_enable_no_resched();
cond_resched();
preempt_disable();
}
preempt_enable();
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return 0;
wait_to_die:
preempt_enable();
/* Wait for kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
schedule();
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
/*
* tasklet_kill_immediate is called to remove a tasklet which can already be
* scheduled for execution on @cpu.
*
* Unlike tasklet_kill, this function removes the tasklet
* _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
*
* When this function is called, @cpu must be in the CPU_DEAD state.
*/
void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
{
struct tasklet_struct **i;
BUG_ON(cpu_online(cpu));
BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
if (!test_bit(TASKLET_STATE_SCHED, &t->state))
return;
/* CPU is dead, so no lock needed. */
for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) {
if (*i == t) {
*i = t->next;
return;
}
}
BUG();
}
static void takeover_tasklets(unsigned int cpu)
{
struct tasklet_struct **i;
/* CPU is dead, so no lock needed. */
local_irq_disable();
/* Find end, append list for that CPU. */
for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next);
*i = per_cpu(tasklet_vec, cpu).list;
per_cpu(tasklet_vec, cpu).list = NULL;
raise_softirq_irqoff(TASKLET_SOFTIRQ);
for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next);
*i = per_cpu(tasklet_hi_vec, cpu).list;
per_cpu(tasklet_hi_vec, cpu).list = NULL;
raise_softirq_irqoff(HI_SOFTIRQ);
local_irq_enable();
}
#endif /* CONFIG_HOTPLUG_CPU */
static int __devinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
struct task_struct *p;
switch (action) {
case CPU_UP_PREPARE:
BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
return NOTIFY_BAD;
}
kthread_bind(p, hotcpu);
per_cpu(ksoftirqd, hotcpu) = p;
break;
case CPU_ONLINE:
wake_up_process(per_cpu(ksoftirqd, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id());
case CPU_DEAD:
p = per_cpu(ksoftirqd, hotcpu);
per_cpu(ksoftirqd, hotcpu) = NULL;
kthread_stop(p);
takeover_tasklets(hotcpu);
break;
#endif /* CONFIG_HOTPLUG_CPU */
}
return NOTIFY_OK;
}
static struct notifier_block __devinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
__init int spawn_ksoftirqd(void)
{
void *cpu = (void *)(long)smp_processor_id();
cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
return 0;
}

371
kernel/spinlock.c Normal file

@@ -0,0 +1,371 @@
/*
* Copyright (2004) Linus Torvalds
*
* Author: Zwane Mwaikambo <zwane@fsmlabs.com>
*
* Copyright (2004) Ingo Molnar
*/
#include <linux/config.h>
#include <linux/linkage.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/module.h>
/*
* Generic declaration of the raw read_trylock() function,
* architectures are supposed to optimize this:
*/
int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
{
_raw_read_lock(lock);
return 1;
}
EXPORT_SYMBOL(generic_raw_read_trylock);
int __lockfunc _spin_trylock(spinlock_t *lock)
{
preempt_disable();
if (_raw_spin_trylock(lock))
return 1;
preempt_enable();
return 0;
}
EXPORT_SYMBOL(_spin_trylock);
int __lockfunc _read_trylock(rwlock_t *lock)
{
preempt_disable();
if (_raw_read_trylock(lock))
return 1;
preempt_enable();
return 0;
}
EXPORT_SYMBOL(_read_trylock);
int __lockfunc _write_trylock(rwlock_t *lock)
{
preempt_disable();
if (_raw_write_trylock(lock))
return 1;
preempt_enable();
return 0;
}
EXPORT_SYMBOL(_write_trylock);
#ifndef CONFIG_PREEMPT
void __lockfunc _read_lock(rwlock_t *lock)
{
preempt_disable();
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock);
unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
{
unsigned long flags;
local_irq_save(flags);
preempt_disable();
_raw_spin_lock_flags(lock, flags);
return flags;
}
EXPORT_SYMBOL(_spin_lock_irqsave);
void __lockfunc _spin_lock_irq(spinlock_t *lock)
{
local_irq_disable();
preempt_disable();
_raw_spin_lock(lock);
}
EXPORT_SYMBOL(_spin_lock_irq);
void __lockfunc _spin_lock_bh(spinlock_t *lock)
{
local_bh_disable();
preempt_disable();
_raw_spin_lock(lock);
}
EXPORT_SYMBOL(_spin_lock_bh);
unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
{
unsigned long flags;
local_irq_save(flags);
preempt_disable();
_raw_read_lock(lock);
return flags;
}
EXPORT_SYMBOL(_read_lock_irqsave);
void __lockfunc _read_lock_irq(rwlock_t *lock)
{
local_irq_disable();
preempt_disable();
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock_irq);
void __lockfunc _read_lock_bh(rwlock_t *lock)
{
local_bh_disable();
preempt_disable();
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock_bh);
unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
{
unsigned long flags;
local_irq_save(flags);
preempt_disable();
_raw_write_lock(lock);
return flags;
}
EXPORT_SYMBOL(_write_lock_irqsave);
void __lockfunc _write_lock_irq(rwlock_t *lock)
{
local_irq_disable();
preempt_disable();
_raw_write_lock(lock);
}
EXPORT_SYMBOL(_write_lock_irq);
void __lockfunc _write_lock_bh(rwlock_t *lock)
{
local_bh_disable();
preempt_disable();
_raw_write_lock(lock);
}
EXPORT_SYMBOL(_write_lock_bh);
void __lockfunc _spin_lock(spinlock_t *lock)
{
preempt_disable();
_raw_spin_lock(lock);
}
EXPORT_SYMBOL(_spin_lock);
void __lockfunc _write_lock(rwlock_t *lock)
{
preempt_disable();
_raw_write_lock(lock);
}
EXPORT_SYMBOL(_write_lock);
#else /* CONFIG_PREEMPT: */
/*
* This could be a long-held lock. We both prepare to spin for a long
* time (making _this_ CPU preemptable if possible), and we also signal
* towards that other CPU that it should break the lock ASAP.
*
* (We do this in a function because inlining it would be excessive.)
*/
#define BUILD_LOCK_OPS(op, locktype) \
void __lockfunc _##op##_lock(locktype##_t *lock) \
{ \
preempt_disable(); \
for (;;) { \
if (likely(_raw_##op##_trylock(lock))) \
break; \
preempt_enable(); \
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
while (!op##_can_lock(lock) && (lock)->break_lock) \
cpu_relax(); \
preempt_disable(); \
} \
(lock)->break_lock = 0; \
} \
\
EXPORT_SYMBOL(_##op##_lock); \
\
unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
{ \
unsigned long flags; \
\
preempt_disable(); \
for (;;) { \
local_irq_save(flags); \
if (likely(_raw_##op##_trylock(lock))) \
break; \
local_irq_restore(flags); \
\
preempt_enable(); \
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
while (!op##_can_lock(lock) && (lock)->break_lock) \
cpu_relax(); \
preempt_disable(); \
} \
(lock)->break_lock = 0; \
return flags; \
} \
\
EXPORT_SYMBOL(_##op##_lock_irqsave); \
\
void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
{ \
_##op##_lock_irqsave(lock); \
} \
\
EXPORT_SYMBOL(_##op##_lock_irq); \
\
void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
{ \
unsigned long flags; \
\
	/*							\
	 * Careful: we must exclude softirqs too, hence the	\
	 * irq-disabling. We use the generic preemption-aware	\
	 * function:						\
	 */							\
flags = _##op##_lock_irqsave(lock); \
local_bh_disable(); \
local_irq_restore(flags); \
} \
\
EXPORT_SYMBOL(_##op##_lock_bh)
/*
* Build preemption-friendly versions of the following
* lock-spinning functions:
*
* _[spin|read|write]_lock()
* _[spin|read|write]_lock_irq()
* _[spin|read|write]_lock_irqsave()
* _[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, spinlock);
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
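/*
 * For illustration, BUILD_LOCK_OPS(spin, spinlock) expands _spin_lock()
 * to roughly the following (simplified sketch):
 *
 *	void __lockfunc _spin_lock(spinlock_t *lock)
 *	{
 *		preempt_disable();
 *		for (;;) {
 *			if (likely(_raw_spin_trylock(lock)))
 *				break;
 *			preempt_enable();
 *			if (!lock->break_lock)
 *				lock->break_lock = 1;
 *			while (!spin_can_lock(lock) && lock->break_lock)
 *				cpu_relax();
 *			preempt_disable();
 *		}
 *		lock->break_lock = 0;
 *	}
 */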
#endif /* CONFIG_PREEMPT */
void __lockfunc _spin_unlock(spinlock_t *lock)
{
_raw_spin_unlock(lock);
preempt_enable();
}
EXPORT_SYMBOL(_spin_unlock);
void __lockfunc _write_unlock(rwlock_t *lock)
{
_raw_write_unlock(lock);
preempt_enable();
}
EXPORT_SYMBOL(_write_unlock);
void __lockfunc _read_unlock(rwlock_t *lock)
{
_raw_read_unlock(lock);
preempt_enable();
}
EXPORT_SYMBOL(_read_unlock);
void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
_raw_spin_unlock(lock);
local_irq_restore(flags);
preempt_enable();
}
EXPORT_SYMBOL(_spin_unlock_irqrestore);
void __lockfunc _spin_unlock_irq(spinlock_t *lock)
{
_raw_spin_unlock(lock);
local_irq_enable();
preempt_enable();
}
EXPORT_SYMBOL(_spin_unlock_irq);
void __lockfunc _spin_unlock_bh(spinlock_t *lock)
{
_raw_spin_unlock(lock);
preempt_enable();
local_bh_enable();
}
EXPORT_SYMBOL(_spin_unlock_bh);
void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
_raw_read_unlock(lock);
local_irq_restore(flags);
preempt_enable();
}
EXPORT_SYMBOL(_read_unlock_irqrestore);
void __lockfunc _read_unlock_irq(rwlock_t *lock)
{
_raw_read_unlock(lock);
local_irq_enable();
preempt_enable();
}
EXPORT_SYMBOL(_read_unlock_irq);
void __lockfunc _read_unlock_bh(rwlock_t *lock)
{
_raw_read_unlock(lock);
preempt_enable();
local_bh_enable();
}
EXPORT_SYMBOL(_read_unlock_bh);
void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
_raw_write_unlock(lock);
local_irq_restore(flags);
preempt_enable();
}
EXPORT_SYMBOL(_write_unlock_irqrestore);
void __lockfunc _write_unlock_irq(rwlock_t *lock)
{
_raw_write_unlock(lock);
local_irq_enable();
preempt_enable();
}
EXPORT_SYMBOL(_write_unlock_irq);
void __lockfunc _write_unlock_bh(rwlock_t *lock)
{
_raw_write_unlock(lock);
preempt_enable();
local_bh_enable();
}
EXPORT_SYMBOL(_write_unlock_bh);
int __lockfunc _spin_trylock_bh(spinlock_t *lock)
{
local_bh_disable();
preempt_disable();
if (_raw_spin_trylock(lock))
return 1;
preempt_enable();
local_bh_enable();
return 0;
}
EXPORT_SYMBOL(_spin_trylock_bh);
int in_lock_functions(unsigned long addr)
{
/* Linker adds these: start and end of __lockfunc functions */
extern char __lock_text_start[], __lock_text_end[];
return addr >= (unsigned long)__lock_text_start
&& addr < (unsigned long)__lock_text_end;
}
EXPORT_SYMBOL(in_lock_functions);

212
kernel/stop_machine.c Normal file

@@ -0,0 +1,212 @@
#include <linux/stop_machine.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/syscalls.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
#include <asm/uaccess.h>
/* Since we affect priority and affinity (both of which are visible
 * to, and settable by, outside processes) we do indirection via a
 * kthread. */
/* Thread to stop each CPU in user context. */
enum stopmachine_state {
STOPMACHINE_WAIT,
STOPMACHINE_PREPARE,
STOPMACHINE_DISABLE_IRQ,
STOPMACHINE_EXIT,
};
static enum stopmachine_state stopmachine_state;
static unsigned int stopmachine_num_threads;
static atomic_t stopmachine_thread_ack;
static DECLARE_MUTEX(stopmachine_mutex);
static int stopmachine(void *cpu)
{
int irqs_disabled = 0;
int prepared = 0;
set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
/* Ack: we are alive */
mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
atomic_inc(&stopmachine_thread_ack);
/* Simple state machine */
while (stopmachine_state != STOPMACHINE_EXIT) {
if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
&& !irqs_disabled) {
local_irq_disable();
irqs_disabled = 1;
/* Ack: irqs disabled. */
mb(); /* Must read state first. */
atomic_inc(&stopmachine_thread_ack);
} else if (stopmachine_state == STOPMACHINE_PREPARE
&& !prepared) {
/* Everyone is in place, hold CPU. */
preempt_disable();
prepared = 1;
mb(); /* Must read state first. */
atomic_inc(&stopmachine_thread_ack);
}
/* Yield in first stage: migration threads need to
* help our sisters onto their CPUs. */
if (!prepared && !irqs_disabled)
yield();
else
cpu_relax();
}
/* Ack: we are exiting. */
mb(); /* Must read state first. */
atomic_inc(&stopmachine_thread_ack);
if (irqs_disabled)
local_irq_enable();
if (prepared)
preempt_enable();
return 0;
}
/* Change the thread state */
static void stopmachine_set_state(enum stopmachine_state state)
{
atomic_set(&stopmachine_thread_ack, 0);
wmb();
stopmachine_state = state;
while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
cpu_relax();
}
static int stop_machine(void)
{
int i, ret = 0;
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
mm_segment_t old_fs = get_fs();
/* One high-prio thread per cpu. We'll do this one. */
set_fs(KERNEL_DS);
sys_sched_setscheduler(current->pid, SCHED_FIFO,
(struct sched_param __user *)&param);
set_fs(old_fs);
atomic_set(&stopmachine_thread_ack, 0);
stopmachine_num_threads = 0;
stopmachine_state = STOPMACHINE_WAIT;
for_each_online_cpu(i) {
if (i == _smp_processor_id())
continue;
ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
if (ret < 0)
break;
stopmachine_num_threads++;
}
/* Wait for them all to come to life. */
while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
yield();
/* If some failed, kill them all. */
if (ret < 0) {
stopmachine_set_state(STOPMACHINE_EXIT);
up(&stopmachine_mutex);
return ret;
}
/* Don't schedule us away at this point, please. */
local_irq_disable();
/* Now they are all started, make them hold the CPUs, ready. */
stopmachine_set_state(STOPMACHINE_PREPARE);
/* Make them disable irqs. */
stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
return 0;
}
static void restart_machine(void)
{
stopmachine_set_state(STOPMACHINE_EXIT);
local_irq_enable();
}
struct stop_machine_data
{
int (*fn)(void *);
void *data;
struct completion done;
};
static int do_stop(void *_smdata)
{
struct stop_machine_data *smdata = _smdata;
int ret;
ret = stop_machine();
if (ret == 0) {
ret = smdata->fn(smdata->data);
restart_machine();
}
/* We're done: you can kthread_stop us now */
complete(&smdata->done);
/* Wait for kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
schedule();
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return ret;
}
struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
unsigned int cpu)
{
struct stop_machine_data smdata;
struct task_struct *p;
smdata.fn = fn;
smdata.data = data;
init_completion(&smdata.done);
down(&stopmachine_mutex);
/* If they don't care which CPU fn runs on, bind to any online one. */
if (cpu == NR_CPUS)
cpu = _smp_processor_id();
p = kthread_create(do_stop, &smdata, "kstopmachine");
if (!IS_ERR(p)) {
kthread_bind(p, cpu);
wake_up_process(p);
wait_for_completion(&smdata.done);
}
up(&stopmachine_mutex);
return p;
}
int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
{
struct task_struct *p;
int ret;
/* No CPUs can come up or down during this. */
lock_cpu_hotplug();
p = __stop_machine_run(fn, data, cpu);
if (!IS_ERR(p))
ret = kthread_stop(p);
else
ret = PTR_ERR(p);
unlock_cpu_hotplug();
return ret;
}
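/*
 * Illustrative sketch (not part of the original file): freezing the
 * machine to run a callback.  demo_patch is hypothetical; passing
 * NR_CPUS as the cpu argument means "any online CPU".
 */
static int demo_patch(void *data)
{
	/* runs on one CPU with every other CPU spinning, irqs off */
	return 0;
}
/* caller:  int err = stop_machine_run(demo_patch, NULL, NR_CPUS); */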

1725
kernel/sys.c Normal file

File diff suppressed because it is too large.

86
kernel/sys_ni.c Normal file

@@ -0,0 +1,86 @@
#include <linux/linkage.h>
#include <linux/errno.h>
#include <asm/unistd.h>
/*
* Non-implemented system calls get redirected here.
*/
asmlinkage long sys_ni_syscall(void)
{
return -ENOSYS;
}
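/*
 * Illustrative sketch: cond_syscall() is defined per architecture
 * (typically in <asm/unistd.h>).  A common definition makes the symbol
 * a weak alias of sys_ni_syscall, roughly (i386-style, simplified):
 *
 *	#define cond_syscall(x) \
 *		asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
 *
 * so a syscall that is compiled out by Kconfig still resolves and
 * returns -ENOSYS.
 */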
cond_syscall(sys_nfsservctl);
cond_syscall(sys_quotactl);
cond_syscall(sys_acct);
cond_syscall(sys_lookup_dcookie);
cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
cond_syscall(sys_init_module);
cond_syscall(sys_delete_module);
cond_syscall(sys_socketpair);
cond_syscall(sys_bind);
cond_syscall(sys_listen);
cond_syscall(sys_accept);
cond_syscall(sys_connect);
cond_syscall(sys_getsockname);
cond_syscall(sys_getpeername);
cond_syscall(sys_sendto);
cond_syscall(sys_send);
cond_syscall(sys_recvfrom);
cond_syscall(sys_recv);
cond_syscall(sys_socket);
cond_syscall(sys_setsockopt);
cond_syscall(sys_getsockopt);
cond_syscall(sys_shutdown);
cond_syscall(sys_sendmsg);
cond_syscall(sys_recvmsg);
cond_syscall(sys_socketcall);
cond_syscall(sys_futex);
cond_syscall(compat_sys_futex);
cond_syscall(sys_epoll_create);
cond_syscall(sys_epoll_ctl);
cond_syscall(sys_epoll_wait);
cond_syscall(sys_semget);
cond_syscall(sys_semop);
cond_syscall(sys_semtimedop);
cond_syscall(sys_semctl);
cond_syscall(sys_msgget);
cond_syscall(sys_msgsnd);
cond_syscall(sys_msgrcv);
cond_syscall(sys_msgctl);
cond_syscall(sys_shmget);
cond_syscall(sys_shmdt);
cond_syscall(sys_shmctl);
cond_syscall(sys_mq_open);
cond_syscall(sys_mq_unlink);
cond_syscall(sys_mq_timedsend);
cond_syscall(sys_mq_timedreceive);
cond_syscall(sys_mq_notify);
cond_syscall(sys_mq_getsetattr);
cond_syscall(compat_sys_mq_open);
cond_syscall(compat_sys_mq_timedsend);
cond_syscall(compat_sys_mq_timedreceive);
cond_syscall(compat_sys_mq_notify);
cond_syscall(compat_sys_mq_getsetattr);
cond_syscall(sys_mbind);
cond_syscall(sys_get_mempolicy);
cond_syscall(sys_set_mempolicy);
cond_syscall(compat_sys_mbind);
cond_syscall(compat_sys_get_mempolicy);
cond_syscall(compat_sys_set_mempolicy);
cond_syscall(sys_add_key);
cond_syscall(sys_request_key);
cond_syscall(sys_keyctl);
cond_syscall(compat_sys_keyctl);
cond_syscall(compat_sys_socketcall);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
cond_syscall(sys_pciconfig_write);
cond_syscall(sys_pciconfig_iobase);
cond_syscall(sys32_ipc);
cond_syscall(sys32_sysctl);
cond_syscall(ppc_rtas);

2337
kernel/sysctl.c Normal file

File diff suppressed because it is too large.

599
kernel/time.c Normal file

@@ -0,0 +1,599 @@
/*
* linux/kernel/time.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* This file contains the interface functions for the various
* time related system calls: time, stime, gettimeofday, settimeofday,
* adjtime
*/
/*
* Modification history kernel/time.c
*
* 1993-09-02 Philip Gladstone
* Created file with time related functions from sched.c and adjtimex()
* 1993-10-08 Torsten Duwe
* adjtime interface update and CMOS clock write code
* 1995-08-13 Torsten Duwe
* kernel PLL updated to 1994-12-13 specs (rfc-1589)
* 1999-01-16 Ulrich Windl
* Introduced error checking for many cases in adjtimex().
* Updated NTP code according to technical memorandum Jan '96
* "A Kernel Model for Precision Timekeeping" by Dave Mills
* Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
* (Even though the technical memorandum forbids it)
* 2004-07-14 Christoph Lameter
* Added getnstimeofday to allow the posix timer functions to return
* with nanosecond accuracy
*/
#include <linux/module.h>
#include <linux/timex.h>
#include <linux/errno.h>
#include <linux/smp_lock.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
/*
* The timezone where the local system is located. Used as a default by some
 * programs that obtain this value by calling gettimeofday().
*/
struct timezone sys_tz;
EXPORT_SYMBOL(sys_tz);
#ifdef __ARCH_WANT_SYS_TIME
/*
* sys_time() can be implemented in user-level using
* sys_gettimeofday(). Is this for backwards compatibility? If so,
* why not move it into the appropriate arch directory (for those
* architectures that need it).
*/
asmlinkage long sys_time(time_t __user * tloc)
{
time_t i;
struct timeval tv;
do_gettimeofday(&tv);
i = tv.tv_sec;
if (tloc) {
if (put_user(i,tloc))
i = -EFAULT;
}
return i;
}
/*
* sys_stime() can be implemented in user-level using
* sys_settimeofday(). Is this for backwards compatibility? If so,
* why not move it into the appropriate arch directory (for those
* architectures that need it).
*/
asmlinkage long sys_stime(time_t __user *tptr)
{
struct timespec tv;
int err;
if (get_user(tv.tv_sec, tptr))
return -EFAULT;
tv.tv_nsec = 0;
err = security_settime(&tv, NULL);
if (err)
return err;
do_settimeofday(&tv);
return 0;
}
#endif /* __ARCH_WANT_SYS_TIME */
asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz)
{
if (likely(tv != NULL)) {
struct timeval ktv;
do_gettimeofday(&ktv);
if (copy_to_user(tv, &ktv, sizeof(ktv)))
return -EFAULT;
}
if (unlikely(tz != NULL)) {
if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
return -EFAULT;
}
return 0;
}
/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time.
*
* This is ugly, but preferable to the alternatives. Otherwise we
* would either need to write a program to do it in /etc/rc (and risk
* confusion if the program gets run more than once; it would also be
* hard to make the program warp the clock precisely n hours) or
* compile in the timezone information into the kernel. Bad, bad....
*
* - TYT, 1992-01-01
*
* The best thing to do is to keep the CMOS clock in universal time (UTC)
* as real UNIX machines always do it. This avoids all headaches about
* daylight saving times and warping kernel clocks.
*/
static inline void warp_clock(void)
{
write_seqlock_irq(&xtime_lock);
wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
xtime.tv_sec += sys_tz.tz_minuteswest * 60;
time_interpolator_reset();
write_sequnlock_irq(&xtime_lock);
clock_was_set();
}
/*
 * In case the CMOS clock has for some reason not been running in UTC
 * but in some local time: the first time we set the timezone,
 * we will warp the clock so that it is ticking UTC time instead of
* local time. Presumably, if someone is setting the timezone then we
* are running in an environment where the programs understand about
* timezones. This should be done at boot time in the /etc/rc script,
* as soon as possible, so that the clock can be set right. Otherwise,
* various programs will get confused when the clock gets warped.
*/
int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
{
static int firsttime = 1;
int error = 0;
error = security_settime(tv, tz);
if (error)
return error;
if (tz) {
/* SMP safe, global irq locking makes it work. */
sys_tz = *tz;
if (firsttime) {
firsttime = 0;
if (!tv)
warp_clock();
}
}
if (tv)
{
/* SMP safe, again the code in arch/foo/time.c should
* globally block out interrupts when it runs.
*/
return do_settimeofday(tv);
}
return 0;
}
asmlinkage long sys_settimeofday(struct timeval __user *tv,
struct timezone __user *tz)
{
struct timeval user_tv;
struct timespec new_ts;
struct timezone new_tz;
if (tv) {
if (copy_from_user(&user_tv, tv, sizeof(*tv)))
return -EFAULT;
new_ts.tv_sec = user_tv.tv_sec;
new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
if (tz) {
if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
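/*
 * Illustrative sketch (userspace side, hypothetical values): the first
 * settimeofday() call that supplies only a timezone triggers
 * warp_clock() above, e.g.
 *
 *	struct timezone tz = { .tz_minuteswest = 300, .tz_dsttime = 0 };
 *	settimeofday(NULL, &tz);	// warp a local-time CMOS clock to UTC
 */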
long pps_offset; /* pps time offset (us) */
long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */
long pps_freq; /* frequency offset (scaled ppm) */
long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
long pps_valid = PPS_VALID; /* pps signal watchdog counter */
int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
long pps_jitcnt; /* jitter limit exceeded */
long pps_calcnt; /* calibration intervals */
long pps_errcnt; /* calibration errors */
long pps_stbcnt; /* stability limit exceeded */
/* hook for a loadable hardpps kernel module */
void (*hardpps_ptr)(struct timeval *);
/* we call this to notify the arch when the clock is being
* controlled. If no such arch routine, do nothing.
*/
void __attribute__ ((weak)) notify_arch_cmos_timer(void)
{
return;
}
/* adjtimex mainly allows reading (and writing, if superuser) of
 * kernel time-keeping variables. Used by xntpd.
*/
int do_adjtimex(struct timex *txc)
{
long ltemp, mtemp, save_adjust;
int result;
/* In order to modify anything, you gotta be super-user! */
if (txc->modes && !capable(CAP_SYS_TIME))
return -EPERM;
/* Now we validate the data before disabling interrupts */
if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
/* singleshot must not be used with any other mode bits */
if (txc->modes != ADJ_OFFSET_SINGLESHOT)
return -EINVAL;
if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
/* adjustment offset limited to +/- 0.512 seconds */
if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
return -EINVAL;
/* if the quartz is off by more than 10% something is VERY wrong ! */
if (txc->modes & ADJ_TICK)
if (txc->tick < 900000/USER_HZ ||
txc->tick > 1100000/USER_HZ)
return -EINVAL;
write_seqlock_irq(&xtime_lock);
result = time_state; /* mostly `TIME_OK' */
/* Save for later - semantics of adjtime is to return old value */
save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
#if 0 /* STA_CLOCKERR is never set yet */
time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
#endif
/* If there are input parameters, then process them */
if (txc->modes)
{
if (txc->modes & ADJ_STATUS) /* only set allowed bits */
time_status = (txc->status & ~STA_RONLY) |
(time_status & STA_RONLY);
if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
result = -EINVAL;
goto leave;
}
time_freq = txc->freq - pps_freq;
}
if (txc->modes & ADJ_MAXERROR) {
if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
result = -EINVAL;
goto leave;
}
time_maxerror = txc->maxerror;
}
if (txc->modes & ADJ_ESTERROR) {
if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
result = -EINVAL;
goto leave;
}
time_esterror = txc->esterror;
}
if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
if (txc->constant < 0) { /* NTP v4 uses values > 6 */
result = -EINVAL;
goto leave;
}
time_constant = txc->constant;
}
if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
/* adjtime() is independent from ntp_adjtime() */
if ((time_next_adjust = txc->offset) == 0)
time_adjust = 0;
}
else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
(STA_PPSTIME | STA_PPSSIGNAL) ?
pps_offset : txc->offset;
/*
* Scale the phase adjustment and
* clamp to the operating range.
*/
if (ltemp > MAXPHASE)
time_offset = MAXPHASE << SHIFT_UPDATE;
else if (ltemp < -MAXPHASE)
time_offset = -(MAXPHASE << SHIFT_UPDATE);
else
time_offset = ltemp << SHIFT_UPDATE;
/*
* Select whether the frequency is to be controlled
* and in which mode (PLL or FLL). Clamp to the operating
* range. Ugly multiply/divide should be replaced someday.
*/
if (time_status & STA_FREQHOLD || time_reftime == 0)
time_reftime = xtime.tv_sec;
mtemp = xtime.tv_sec - time_reftime;
time_reftime = xtime.tv_sec;
if (time_status & STA_FLL) {
if (mtemp >= MINSEC) {
ltemp = (time_offset / mtemp) << (SHIFT_USEC -
SHIFT_UPDATE);
if (ltemp < 0)
time_freq -= -ltemp >> SHIFT_KH;
else
time_freq += ltemp >> SHIFT_KH;
} else /* calibration interval too short (p. 12) */
result = TIME_ERROR;
} else { /* PLL mode */
if (mtemp < MAXSEC) {
ltemp *= mtemp;
if (ltemp < 0)
time_freq -= -ltemp >> (time_constant +
time_constant +
SHIFT_KF - SHIFT_USEC);
else
time_freq += ltemp >> (time_constant +
time_constant +
SHIFT_KF - SHIFT_USEC);
} else /* calibration interval too long (p. 12) */
result = TIME_ERROR;
}
if (time_freq > time_tolerance)
time_freq = time_tolerance;
else if (time_freq < -time_tolerance)
time_freq = -time_tolerance;
} /* STA_PLL || STA_PPSTIME */
} /* txc->modes & ADJ_OFFSET */
if (txc->modes & ADJ_TICK) {
tick_usec = txc->tick;
tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
}
} /* txc->modes */
leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
|| ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
&& (time_status & STA_PPSSIGNAL) == 0)
/* p. 24, (b) */
|| ((time_status & (STA_PPSTIME|STA_PPSJITTER))
== (STA_PPSTIME|STA_PPSJITTER))
/* p. 24, (c) */
|| ((time_status & STA_PPSFREQ) != 0
&& (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
/* p. 24, (d) */
result = TIME_ERROR;
if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
txc->offset = save_adjust;
else {
if (time_offset < 0)
txc->offset = -(-time_offset >> SHIFT_UPDATE);
else
txc->offset = time_offset >> SHIFT_UPDATE;
}
txc->freq = time_freq + pps_freq;
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
txc->constant = time_constant;
txc->precision = time_precision;
txc->tolerance = time_tolerance;
txc->tick = tick_usec;
txc->ppsfreq = pps_freq;
txc->jitter = pps_jitter >> PPS_AVG;
txc->shift = pps_shift;
txc->stabil = pps_stabil;
txc->jitcnt = pps_jitcnt;
txc->calcnt = pps_calcnt;
txc->errcnt = pps_errcnt;
txc->stbcnt = pps_stbcnt;
write_sequnlock_irq(&xtime_lock);
do_gettimeofday(&txc->time);
notify_arch_cmos_timer();
return(result);
}
asmlinkage long sys_adjtimex(struct timex __user *txc_p)
{
struct timex txc; /* Local copy of parameter */
int ret;
/* Copy the user data space into the kernel copy
 * structure. But bear in mind that the structures
 * may change.
*/
if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
return -EFAULT;
ret = do_adjtimex(&txc);
return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
}
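/*
 * Illustrative sketch (userspace side): querying the timekeeping state
 * without modifying it via adjtimex(2).
 *
 *	struct timex tx = { .modes = 0 };	// read-only query
 *	int state = adjtimex(&tx);		// TIME_OK, TIME_ERROR, ...
 */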
inline struct timespec current_kernel_time(void)
{
struct timespec now;
unsigned long seq;
do {
seq = read_seqbegin(&xtime_lock);
now = xtime;
} while (read_seqretry(&xtime_lock, seq));
return now;
}
EXPORT_SYMBOL(current_kernel_time);
/**
* current_fs_time - Return FS time
* @sb: Superblock.
*
 * Return the current time truncated to the time granularity supported by
* the fs.
*/
struct timespec current_fs_time(struct super_block *sb)
{
struct timespec now = current_kernel_time();
return timespec_trunc(now, sb->s_time_gran);
}
EXPORT_SYMBOL(current_fs_time);
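/*
 * Illustrative sketch: a filesystem updating an inode timestamp with
 * the granularity-aware helper.  demo_touch is hypothetical.
 */
static void demo_touch(struct inode *inode)
{
	inode->i_mtime = current_fs_time(inode->i_sb);
}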
/**
 * timespec_trunc - Truncate timespec to a granularity
 * @t: Timespec
 * @gran: Granularity in ns.
 *
 * Truncate a timespec to a granularity. gran must be smaller than a second.
 * Always rounds down.
 *
 * This function should only be used for timestamps returned by
 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
 * it doesn't handle the better resolution of the latter.
*/
struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
/*
* Division is pretty slow so avoid it for common cases.
* Currently current_kernel_time() never returns better than
* jiffies resolution. Exploit that.
*/
if (gran <= jiffies_to_usecs(1) * 1000) {
/* nothing */
} else if (gran == 1000000000) {
t.tv_nsec = 0;
} else {
t.tv_nsec -= t.tv_nsec % gran;
}
return t;
}
EXPORT_SYMBOL(timespec_trunc);
#ifdef CONFIG_TIME_INTERPOLATION
void getnstimeofday (struct timespec *tv)
{
unsigned long seq,sec,nsec;
do {
seq = read_seqbegin(&xtime_lock);
sec = xtime.tv_sec;
nsec = xtime.tv_nsec+time_interpolator_get_offset();
} while (unlikely(read_seqretry(&xtime_lock, seq)));
while (unlikely(nsec >= NSEC_PER_SEC)) {
nsec -= NSEC_PER_SEC;
++sec;
}
tv->tv_sec = sec;
tv->tv_nsec = nsec;
}
EXPORT_SYMBOL_GPL(getnstimeofday);
int do_settimeofday (struct timespec *tv)
{
time_t wtm_sec, sec = tv->tv_sec;
long wtm_nsec, nsec = tv->tv_nsec;
if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
write_seqlock_irq(&xtime_lock);
{
/*
* This is revolting. We need to set "xtime" correctly. However, the value
* in this location is the value at the most recent update of wall time.
* Discover what correction gettimeofday would have done, and then undo
* it!
*/
nsec -= time_interpolator_get_offset();
wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
set_normalized_timespec(&xtime, sec, nsec);
set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
time_interpolator_reset();
}
write_sequnlock_irq(&xtime_lock);
clock_was_set();
return 0;
}
void do_gettimeofday (struct timeval *tv)
{
unsigned long seq, nsec, usec, sec, offset;
do {
seq = read_seqbegin(&xtime_lock);
offset = time_interpolator_get_offset();
sec = xtime.tv_sec;
nsec = xtime.tv_nsec;
} while (unlikely(read_seqretry(&xtime_lock, seq)));
usec = (nsec + offset) / 1000;
while (unlikely(usec >= USEC_PER_SEC)) {
usec -= USEC_PER_SEC;
++sec;
}
tv->tv_sec = sec;
tv->tv_usec = usec;
}
EXPORT_SYMBOL(do_gettimeofday);
#else
/*
 * Simulate getnstimeofday() using do_gettimeofday(), which only allows a
 * timeval and therefore only yields usec accuracy.
*/
void getnstimeofday(struct timespec *tv)
{
struct timeval x;
do_gettimeofday(&x);
tv->tv_sec = x.tv_sec;
tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
}
#endif
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
{
unsigned long seq;
u64 ret;
do {
seq = read_seqbegin(&xtime_lock);
ret = jiffies_64;
} while (read_seqretry(&xtime_lock, seq));
return ret;
}
EXPORT_SYMBOL(get_jiffies_64);
#endif
EXPORT_SYMBOL(jiffies);

1611
kernel/timer.c Normal file

File diff suppressed because it is too large.

196
kernel/uid16.c Normal file

@@ -0,0 +1,196 @@
/*
 * Wrapper functions for 16-bit uid backward compatibility. All nicely tied
 * together in the faint hope we can take them out in five years' time.
*/
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/smp_lock.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
{
return sys_chown(filename, low2highuid(user), low2highgid(group));
}
asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
{
return sys_lchown(filename, low2highuid(user), low2highgid(group));
}
asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
{
return sys_fchown(fd, low2highuid(user), low2highgid(group));
}
asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
{
return sys_setregid(low2highgid(rgid), low2highgid(egid));
}
asmlinkage long sys_setgid16(old_gid_t gid)
{
return sys_setgid(low2highgid(gid));
}
asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
{
return sys_setreuid(low2highuid(ruid), low2highuid(euid));
}
asmlinkage long sys_setuid16(old_uid_t uid)
{
return sys_setuid(low2highuid(uid));
}
asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
{
return sys_setresuid(low2highuid(ruid), low2highuid(euid),
low2highuid(suid));
}
asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
{
int retval;
if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
!(retval = put_user(high2lowuid(current->euid), euid)))
retval = put_user(high2lowuid(current->suid), suid);
return retval;
}
asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
{
return sys_setresgid(low2highgid(rgid), low2highgid(egid),
low2highgid(sgid));
}
asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
{
int retval;
if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
!(retval = put_user(high2lowgid(current->egid), egid)))
retval = put_user(high2lowgid(current->sgid), sgid);
return retval;
}
asmlinkage long sys_setfsuid16(old_uid_t uid)
{
return sys_setfsuid(low2highuid(uid));
}
asmlinkage long sys_setfsgid16(old_gid_t gid)
{
return sys_setfsgid(low2highgid(gid));
}
static int groups16_to_user(old_gid_t __user *grouplist,
struct group_info *group_info)
{
int i;
old_gid_t group;
for (i = 0; i < group_info->ngroups; i++) {
group = high2lowgid(GROUP_AT(group_info, i));
if (put_user(group, grouplist+i))
return -EFAULT;
}
return 0;
}
static int groups16_from_user(struct group_info *group_info,
old_gid_t __user *grouplist)
{
int i;
old_gid_t group;
for (i = 0; i < group_info->ngroups; i++) {
if (get_user(group, grouplist+i))
return -EFAULT;
GROUP_AT(group_info, i) = low2highgid(group);
}
return 0;
}
asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
{
int i = 0;
if (gidsetsize < 0)
return -EINVAL;
get_group_info(current->group_info);
i = current->group_info->ngroups;
if (gidsetsize) {
if (i > gidsetsize) {
i = -EINVAL;
goto out;
}
if (groups16_to_user(grouplist, current->group_info)) {
i = -EFAULT;
goto out;
}
}
out:
put_group_info(current->group_info);
return i;
}
asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
{
struct group_info *group_info;
int retval;
if (!capable(CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
group_info = groups_alloc(gidsetsize);
if (!group_info)
return -ENOMEM;
retval = groups16_from_user(group_info, grouplist);
if (retval) {
put_group_info(group_info);
return retval;
}
retval = set_current_groups(group_info);
put_group_info(group_info);
return retval;
}
asmlinkage long sys_getuid16(void)
{
return high2lowuid(current->uid);
}
asmlinkage long sys_geteuid16(void)
{
return high2lowuid(current->euid);
}
asmlinkage long sys_getgid16(void)
{
return high2lowgid(current->gid);
}
asmlinkage long sys_getegid16(void)
{
return high2lowgid(current->egid);
}

189
kernel/user.c Normal file
View File

@@ -0,0 +1,189 @@
/*
* The "user cache".
*
* (C) Copyright 1991-2000 Linus Torvalds
*
* We have a per-user structure to keep track of how many
* processes, files etc the user has claimed, in order to be
* able to have per-user limits for system resources.
*/
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/key.h>
/*
 * UID task count cache, to get fast user lookup in "alloc_uid"
 * when changing user IDs (i.e. setuid() and friends).
 */
#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
#define UIDHASH_SZ (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
static kmem_cache_t *uid_cachep;
static struct list_head uidhash_table[UIDHASH_SZ];
static DEFINE_SPINLOCK(uidhash_lock);
struct user_struct root_user = {
.__count = ATOMIC_INIT(1),
.processes = ATOMIC_INIT(1),
.files = ATOMIC_INIT(0),
.sigpending = ATOMIC_INIT(0),
.mq_bytes = 0,
.locked_shm = 0,
#ifdef CONFIG_KEYS
.uid_keyring = &root_user_keyring,
.session_keyring = &root_session_keyring,
#endif
};
/*
* These routines must be called with the uidhash spinlock held!
*/
static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
{
list_add(&up->uidhash_list, hashent);
}
static inline void uid_hash_remove(struct user_struct *up)
{
list_del(&up->uidhash_list);
}
static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent)
{
struct list_head *up;
list_for_each(up, hashent) {
struct user_struct *user;
user = list_entry(up, struct user_struct, uidhash_list);
if (user->uid == uid) {
atomic_inc(&user->__count);
return user;
}
}
return NULL;
}
/*
* Locate the user_struct for the passed UID. If found, take a ref on it. The
* caller must undo that ref with free_uid().
*
* If the user_struct could not be found, return NULL.
*/
struct user_struct *find_user(uid_t uid)
{
struct user_struct *ret;
spin_lock(&uidhash_lock);
ret = uid_hash_find(uid, uidhashentry(uid));
spin_unlock(&uidhash_lock);
return ret;
}
void free_uid(struct user_struct *up)
{
if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
uid_hash_remove(up);
key_put(up->uid_keyring);
key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
spin_unlock(&uidhash_lock);
}
}
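/*
 * A minimal sketch of the lookup/ref protocol described above; the
 * function and its name are hypothetical, not part of this file:
 * take a reference via find_user(), drop it with free_uid().
 */
static int my_user_nr_tasks(uid_t uid)
{
	struct user_struct *user = find_user(uid);
	int nr = 0;

	if (user) {
		nr = atomic_read(&user->processes);
		free_uid(user);		/* undo the ref find_user() took */
	}
	return nr;
}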
struct user_struct * alloc_uid(uid_t uid)
{
struct list_head *hashent = uidhashentry(uid);
struct user_struct *up;
spin_lock(&uidhash_lock);
up = uid_hash_find(uid, hashent);
spin_unlock(&uidhash_lock);
if (!up) {
struct user_struct *new;
new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
if (!new)
return NULL;
new->uid = uid;
atomic_set(&new->__count, 1);
atomic_set(&new->processes, 0);
atomic_set(&new->files, 0);
atomic_set(&new->sigpending, 0);
new->mq_bytes = 0;
new->locked_shm = 0;
if (alloc_uid_keyring(new) < 0) {
kmem_cache_free(uid_cachep, new);
return NULL;
}
/*
* Before adding this, check whether we raced
* on adding the same user already..
*/
spin_lock(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
} else {
uid_hash_insert(new, hashent);
up = new;
}
spin_unlock(&uidhash_lock);
}
return up;
}
void switch_uid(struct user_struct *new_user)
{
struct user_struct *old_user;
/* What if a process setreuid()'s and this brings the
 * new uid over its NPROC rlimit? We can now check this
 * cheaply with the new uid cache, so if it matters
 * we should be checking for it. -DaveM
 */
old_user = current->user;
atomic_inc(&new_user->processes);
atomic_dec(&old_user->processes);
switch_uid_keyring(new_user);
current->user = new_user;
free_uid(old_user);
suid_keys(current);
}
static int __init uid_cache_init(void)
{
int n;
uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
for (n = 0; n < UIDHASH_SZ; ++n)
INIT_LIST_HEAD(uidhash_table + n);
/* Insert the root user immediately (init already runs as root) */
spin_lock(&uidhash_lock);
uid_hash_insert(&root_user, uidhashentry(0));
spin_unlock(&uidhash_lock);
return 0;
}
module_init(uid_cache_init);

246
kernel/wait.c Normal file
View File

@@ -0,0 +1,246 @@
/*
* Generic waiting primitives.
*
* (C) 2004 William Irwin, Oracle
*/
#include <linux/config.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);
void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue_tail(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__remove_wait_queue(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(remove_wait_queue);
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
* wake-function that tests for the wait-queue being active
* will be guaranteed to see waitqueue addition _or_ subsequent
* tests in this thread will see the wakeup having taken place.
*
* The spin_unlock() itself is semi-permeable and only protects
* one way (it only protects stuff inside the critical region and
* stops them from bleeding out - it would still allow subsequent
 * loads to move into the critical region).
*/
void fastcall
prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait);
/*
* don't alter the task state if this is just going to
* queue an async wait queue callback
*/
if (is_sync_wait(wait))
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);
void fastcall
prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue_tail(q, wait);
/*
* don't alter the task state if this is just going to
* queue an async wait queue callback
*/
if (is_sync_wait(wait))
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
__set_current_state(TASK_RUNNING);
/*
* We can check for list emptiness outside the lock
* IFF:
 * - we use the "careful" check that verifies both
 * the next and prev pointers, so that there cannot
 * be any half-pending updates in progress on other
 * CPUs that we haven't seen yet (and that might
 * still change the stack area), and
 * - all other users take the lock (i.e. we can only
 * have _one_ other CPU that looks at or modifies
 * the list).
*/
if (!list_empty_careful(&wait->task_list)) {
spin_lock_irqsave(&q->lock, flags);
list_del_init(&wait->task_list);
spin_unlock_irqrestore(&q->lock, flags);
}
}
EXPORT_SYMBOL(finish_wait);
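/*
 * A minimal sketch of the open-coded wait loop these primitives are
 * designed for; my_wq and my_condition are hypothetical names, not
 * part of this file.
 */
static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int my_condition;

static int my_wait_for_condition(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&my_wq, &wait, TASK_INTERRUPTIBLE);
		if (my_condition)
			break;
		if (signal_pending(current))
			break;
		schedule();
	}
	finish_wait(&my_wq, &wait);
	return my_condition ? 0 : -ERESTARTSYS;
}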
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
if (ret)
list_del_init(&wait->task_list);
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_bit_key *key = arg;
struct wait_bit_queue *wait_bit
= container_of(wait, struct wait_bit_queue, wait);
if (wait_bit->key.flags != key->flags ||
wait_bit->key.bit_nr != key->bit_nr ||
test_bit(key->bit_nr, key->flags))
return 0;
else
return autoremove_wake_function(wait, mode, sync, key);
}
EXPORT_SYMBOL(wake_bit_function);
/*
 * To allow interruptible waiting and asynchronous (i.e. nonblocking)
 * waiting, the action functions passed to __wait_on_bit() and
 * __wait_on_bit_lock() may return nonzero codes; a nonzero return
 * halts the wait and is propagated back to the caller.
 */
int __sched fastcall
__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
int (*action)(void *), unsigned mode)
{
int ret = 0;
do {
prepare_to_wait(wq, &q->wait, mode);
if (test_bit(q->key.bit_nr, q->key.flags))
ret = (*action)(q->key.flags);
} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
finish_wait(wq, &q->wait);
return ret;
}
EXPORT_SYMBOL(__wait_on_bit);
int __sched fastcall out_of_line_wait_on_bit(void *word, int bit,
int (*action)(void *), unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
DEFINE_WAIT_BIT(wait, word, bit);
return __wait_on_bit(wq, &wait, action, mode);
}
EXPORT_SYMBOL(out_of_line_wait_on_bit);
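/*
 * A sketch of the kind of "action" routine described above;
 * my_bit_wait() is a hypothetical example, not part of this file.
 * A typical call would be:
 *
 *	out_of_line_wait_on_bit(&flags, MY_BIT, my_bit_wait,
 *				TASK_INTERRUPTIBLE);
 */
static int my_bit_wait(void *word)
{
	if (signal_pending(current))
		return -EINTR;		/* nonzero halts the wait */
	schedule();
	return 0;			/* keep waiting */
}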
int __sched fastcall
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
int (*action)(void *), unsigned mode)
{
int ret = 0;
do {
prepare_to_wait_exclusive(wq, &q->wait, mode);
if (test_bit(q->key.bit_nr, q->key.flags)) {
if ((ret = (*action)(q->key.flags)))
break;
}
} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
finish_wait(wq, &q->wait);
return ret;
}
EXPORT_SYMBOL(__wait_on_bit_lock);
int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit,
int (*action)(void *), unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
DEFINE_WAIT_BIT(wait, word, bit);
return __wait_on_bit_lock(wq, &wait, action, mode);
}
EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
{
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
if (waitqueue_active(wq))
__wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key);
}
EXPORT_SYMBOL(__wake_up_bit);
/**
* wake_up_bit - wake up a waiter on a bit
* @word: the word being waited on, a kernel virtual address
* @bit: the bit of the word being waited on
*
* There is a standard hashed waitqueue table for generic use. This
* is the part of the hashtable's accessor API that wakes up waiters
* on a bit. For instance, if one were to have waiters on a bitflag,
* one would call wake_up_bit() after clearing the bit.
*
* In order for this to function properly, as it uses waitqueue_active()
* internally, some kind of memory barrier must be done prior to calling
* this. Typically, this will be smp_mb__after_clear_bit(), but in some
* cases where bitflags are manipulated non-atomically under a lock, one
 * may need to use a less regular barrier, such as fs/inode.c's smp_mb(),
* because spin_unlock() does not guarantee a memory barrier.
*/
void fastcall wake_up_bit(void *word, int bit)
{
__wake_up_bit(bit_waitqueue(word, bit), word, bit);
}
EXPORT_SYMBOL(wake_up_bit);
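/*
 * A sketch of the clear-then-wake pattern prescribed above; my_flags
 * and MY_LOCK_BIT are hypothetical names, not part of this file.
 */
static unsigned long my_flags;
#define MY_LOCK_BIT 0

static void my_unlock(void)
{
	clear_bit(MY_LOCK_BIT, &my_flags);
	/* order the clear before wake_up_bit()'s waitqueue_active() test */
	smp_mb__after_clear_bit();
	wake_up_bit(&my_flags, MY_LOCK_BIT);
}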
fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit)
{
const int shift = BITS_PER_LONG == 32 ? 5 : 6;
const struct zone *zone = page_zone(virt_to_page(word));
unsigned long val = (unsigned long)word << shift | bit;
return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
}
EXPORT_SYMBOL(bit_waitqueue);

555
kernel/workqueue.c Normal file
View File

@@ -0,0 +1,555 @@
/*
* linux/kernel/workqueue.c
*
* Generic mechanism for defining kernel helper threads for running
* arbitrary tasks in process context.
*
* Started by Ingo Molnar, Copyright (C) 2002
*
* Derived from the taskqueue/keventd code by:
*
* David Woodhouse <dwmw2@infradead.org>
* Andrew Morton <andrewm@uow.edu.au>
* Kai Petzke <wpp@marie.physik.tu-berlin.de>
* Theodore Ts'o <tytso@mit.edu>
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
/*
* The per-CPU workqueue (if single thread, we always use cpu 0's).
*
* The sequence counters are for flush_scheduled_work(). It wants to wait
 * until all currently-scheduled works are completed, but it doesn't
* want to be livelocked by new, incoming ones. So it waits until
* remove_sequence is >= the insert_sequence which pertained when
* flush_scheduled_work() was called.
*/
struct cpu_workqueue_struct {
spinlock_t lock;
long remove_sequence; /* Least-recently added (next to run) */
long insert_sequence; /* Next to add */
struct list_head worklist;
wait_queue_head_t more_work;
wait_queue_head_t work_done;
struct workqueue_struct *wq;
task_t *thread;
int run_depth; /* Detect run_workqueue() recursion depth */
} ____cacheline_aligned;
/*
* The externally visible workqueue abstraction is an array of
* per-CPU workqueues:
*/
struct workqueue_struct {
struct cpu_workqueue_struct cpu_wq[NR_CPUS];
const char *name;
struct list_head list; /* Empty if single thread */
};
/* All the per-cpu workqueues on the system, so that cpu hotplug can
   add/remove threads to each one as cpus come and go. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);
/* If it's single threaded, it isn't in the list of workqueues. */
static inline int is_single_threaded(struct workqueue_struct *wq)
{
return list_empty(&wq->list);
}
/* Preempt must be disabled. */
static void __queue_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work)
{
unsigned long flags;
spin_lock_irqsave(&cwq->lock, flags);
work->wq_data = cwq;
list_add_tail(&work->entry, &cwq->worklist);
cwq->insert_sequence++;
wake_up(&cwq->more_work);
spin_unlock_irqrestore(&cwq->lock, flags);
}
/*
* Queue work on a workqueue. Return non-zero if it was successfully
* added.
*
 * We queue the work to the CPU on which it was submitted, but there is no
* guarantee that it will be processed by that CPU.
*/
int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
int ret = 0, cpu = get_cpu();
if (!test_and_set_bit(0, &work->pending)) {
if (unlikely(is_single_threaded(wq)))
cpu = 0;
BUG_ON(!list_empty(&work->entry));
__queue_work(wq->cpu_wq + cpu, work);
ret = 1;
}
put_cpu();
return ret;
}
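/*
 * A minimal usage sketch; my_handler, my_work and my_wq are
 * hypothetical names, not part of this file.
 */
static void my_handler(void *data)
{
	/* runs in a worker thread: process context, may sleep */
}

static DECLARE_WORK(my_work, my_handler, NULL);
static struct workqueue_struct *my_wq;

static int my_setup(void)
{
	my_wq = create_workqueue("mywq");	/* one worker thread per CPU */
	if (!my_wq)
		return -ENOMEM;
	queue_work(my_wq, &my_work);		/* returns 0 if already pending */
	return 0;
}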
static void delayed_work_timer_fn(unsigned long __data)
{
struct work_struct *work = (struct work_struct *)__data;
struct workqueue_struct *wq = work->wq_data;
int cpu = smp_processor_id();
if (unlikely(is_single_threaded(wq)))
cpu = 0;
__queue_work(wq->cpu_wq + cpu, work);
}
int fastcall queue_delayed_work(struct workqueue_struct *wq,
struct work_struct *work, unsigned long delay)
{
int ret = 0;
struct timer_list *timer = &work->timer;
if (!test_and_set_bit(0, &work->pending)) {
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
/* This stores wq for the moment, for the timer_fn */
work->wq_data = wq;
timer->expires = jiffies + delay;
timer->data = (unsigned long)work;
timer->function = delayed_work_timer_fn;
add_timer(timer);
ret = 1;
}
return ret;
}
static inline void run_workqueue(struct cpu_workqueue_struct *cwq)
{
unsigned long flags;
/*
* Keep taking off work from the queue until
* done.
*/
spin_lock_irqsave(&cwq->lock, flags);
cwq->run_depth++;
if (cwq->run_depth > 3) {
/* morton gets to eat his hat */
printk("%s: recursion depth exceeded: %d\n",
__FUNCTION__, cwq->run_depth);
dump_stack();
}
while (!list_empty(&cwq->worklist)) {
struct work_struct *work = list_entry(cwq->worklist.next,
struct work_struct, entry);
void (*f) (void *) = work->func;
void *data = work->data;
list_del_init(cwq->worklist.next);
spin_unlock_irqrestore(&cwq->lock, flags);
BUG_ON(work->wq_data != cwq);
clear_bit(0, &work->pending);
f(data);
spin_lock_irqsave(&cwq->lock, flags);
cwq->remove_sequence++;
wake_up(&cwq->work_done);
}
cwq->run_depth--;
spin_unlock_irqrestore(&cwq->lock, flags);
}
static int worker_thread(void *__cwq)
{
struct cpu_workqueue_struct *cwq = __cwq;
DECLARE_WAITQUEUE(wait, current);
struct k_sigaction sa;
sigset_t blocked;
current->flags |= PF_NOFREEZE;
set_user_nice(current, -5);
/* Block and flush all signals */
sigfillset(&blocked);
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
/* SIG_IGN makes children autoreap: see do_notify_parent(). */
sa.sa.sa_handler = SIG_IGN;
sa.sa.sa_flags = 0;
siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
add_wait_queue(&cwq->more_work, &wait);
if (list_empty(&cwq->worklist))
schedule();
else
__set_current_state(TASK_RUNNING);
remove_wait_queue(&cwq->more_work, &wait);
if (!list_empty(&cwq->worklist))
run_workqueue(cwq);
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return 0;
}
static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
{
if (cwq->thread == current) {
/*
* Probably keventd trying to flush its own queue. So simply run
* it by hand rather than deadlocking.
*/
run_workqueue(cwq);
} else {
DEFINE_WAIT(wait);
long sequence_needed;
spin_lock_irq(&cwq->lock);
sequence_needed = cwq->insert_sequence;
while (sequence_needed - cwq->remove_sequence > 0) {
prepare_to_wait(&cwq->work_done, &wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&cwq->lock);
schedule();
spin_lock_irq(&cwq->lock);
}
finish_wait(&cwq->work_done, &wait);
spin_unlock_irq(&cwq->lock);
}
}
/*
* flush_workqueue - ensure that any scheduled work has run to completion.
*
* Forces execution of the workqueue and blocks until its completion.
* This is typically used in driver shutdown handlers.
*
 * This function will sample each cpu workqueue's current insert_sequence
 * number and will sleep until its remove_sequence catches up to it. This
 * means that we sleep until all works which were queued on entry have been
 * handled, but we are not livelocked by new incoming ones.
*
* This function used to run the workqueues itself. Now we just wait for the
* helper threads to do it.
*/
void fastcall flush_workqueue(struct workqueue_struct *wq)
{
might_sleep();
if (is_single_threaded(wq)) {
/* Always use cpu 0's area. */
flush_cpu_workqueue(wq->cpu_wq + 0);
} else {
int cpu;
lock_cpu_hotplug();
for_each_online_cpu(cpu)
flush_cpu_workqueue(wq->cpu_wq + cpu);
unlock_cpu_hotplug();
}
}
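/*
 * A sketch of the shutdown-handler usage mentioned above; my_wq and
 * the surrounding driver are hypothetical.
 */
static void my_driver_shutdown(struct workqueue_struct *my_wq)
{
	/*
	 * Once this returns, every work queued before the call has
	 * finished, so data the handlers touch can be freed safely.
	 */
	flush_workqueue(my_wq);
}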
static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
int cpu)
{
struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
struct task_struct *p;
spin_lock_init(&cwq->lock);
cwq->wq = wq;
cwq->thread = NULL;
cwq->insert_sequence = 0;
cwq->remove_sequence = 0;
INIT_LIST_HEAD(&cwq->worklist);
init_waitqueue_head(&cwq->more_work);
init_waitqueue_head(&cwq->work_done);
if (is_single_threaded(wq))
p = kthread_create(worker_thread, cwq, "%s", wq->name);
else
p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
if (IS_ERR(p))
return NULL;
cwq->thread = p;
return p;
}
struct workqueue_struct *__create_workqueue(const char *name,
int singlethread)
{
int cpu, destroy = 0;
struct workqueue_struct *wq;
struct task_struct *p;
BUG_ON(strlen(name) > 10);
wq = kmalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return NULL;
memset(wq, 0, sizeof(*wq));
wq->name = name;
/* We don't need the distraction of CPUs appearing and vanishing. */
lock_cpu_hotplug();
if (singlethread) {
INIT_LIST_HEAD(&wq->list);
p = create_workqueue_thread(wq, 0);
if (!p)
destroy = 1;
else
wake_up_process(p);
} else {
spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
spin_unlock(&workqueue_lock);
for_each_online_cpu(cpu) {
p = create_workqueue_thread(wq, cpu);
if (p) {
kthread_bind(p, cpu);
wake_up_process(p);
} else
destroy = 1;
}
}
unlock_cpu_hotplug();
/*
* Was there any error during startup? If yes then clean up:
*/
if (destroy) {
destroy_workqueue(wq);
wq = NULL;
}
return wq;
}
static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
{
struct cpu_workqueue_struct *cwq;
unsigned long flags;
struct task_struct *p;
cwq = wq->cpu_wq + cpu;
spin_lock_irqsave(&cwq->lock, flags);
p = cwq->thread;
cwq->thread = NULL;
spin_unlock_irqrestore(&cwq->lock, flags);
if (p)
kthread_stop(p);
}
void destroy_workqueue(struct workqueue_struct *wq)
{
int cpu;
flush_workqueue(wq);
/* We don't need the distraction of CPUs appearing and vanishing. */
lock_cpu_hotplug();
if (is_single_threaded(wq))
cleanup_workqueue_thread(wq, 0);
else {
for_each_online_cpu(cpu)
cleanup_workqueue_thread(wq, cpu);
spin_lock(&workqueue_lock);
list_del(&wq->list);
spin_unlock(&workqueue_lock);
}
unlock_cpu_hotplug();
kfree(wq);
}
static struct workqueue_struct *keventd_wq;
int fastcall schedule_work(struct work_struct *work)
{
return queue_work(keventd_wq, work);
}
int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
{
return queue_delayed_work(keventd_wq, work, delay);
}
int schedule_delayed_work_on(int cpu,
struct work_struct *work, unsigned long delay)
{
int ret = 0;
struct timer_list *timer = &work->timer;
if (!test_and_set_bit(0, &work->pending)) {
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
/* This stores keventd_wq for the moment, for the timer_fn */
work->wq_data = keventd_wq;
timer->expires = jiffies + delay;
timer->data = (unsigned long)work;
timer->function = delayed_work_timer_fn;
add_timer_on(timer, cpu);
ret = 1;
}
return ret;
}
void flush_scheduled_work(void)
{
flush_workqueue(keventd_wq);
}
/**
* cancel_rearming_delayed_workqueue - reliably kill off a delayed
* work whose handler rearms the delayed work.
* @wq: the controlling workqueue structure
* @work: the delayed work struct
*/
static void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
struct work_struct *work)
{
while (!cancel_delayed_work(work))
flush_workqueue(wq);
}
/**
* cancel_rearming_delayed_work - reliably kill off a delayed keventd
* work whose handler rearms the delayed work.
* @work: the delayed work struct
*/
void cancel_rearming_delayed_work(struct work_struct *work)
{
cancel_rearming_delayed_workqueue(keventd_wq, work);
}
EXPORT_SYMBOL(cancel_rearming_delayed_work);
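/*
 * A sketch of the self-rearming pattern these helpers exist for;
 * my_poll and my_poll_work are hypothetical names, not part of this
 * file.
 */
static void my_poll(void *data);
static DECLARE_WORK(my_poll_work, my_poll, NULL);

static void my_poll(void *data)
{
	/* ... periodic work ... */
	schedule_delayed_work(&my_poll_work, HZ);	/* rearm, one second out */
}

static void my_poll_stop(void)
{
	/* keeps cancelling and flushing until the work can no longer rearm */
	cancel_rearming_delayed_work(&my_poll_work);
}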
int keventd_up(void)
{
return keventd_wq != NULL;
}
int current_is_keventd(void)
{
struct cpu_workqueue_struct *cwq;
int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */
int ret = 0;
BUG_ON(!keventd_wq);
cwq = keventd_wq->cpu_wq + cpu;
if (current == cwq->thread)
ret = 1;
return ret;
}
#ifdef CONFIG_HOTPLUG_CPU
/* Take the work from this (downed) CPU. */
static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
{
struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
LIST_HEAD(list);
struct work_struct *work;
spin_lock_irq(&cwq->lock);
list_splice_init(&cwq->worklist, &list);
while (!list_empty(&list)) {
printk("Taking work for %s\n", wq->name);
work = list_entry(list.next, struct work_struct, entry);
list_del(&work->entry);
__queue_work(wq->cpu_wq + smp_processor_id(), work);
}
spin_unlock_irq(&cwq->lock);
}
/* We're holding the cpucontrol mutex here */
static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
unsigned int hotcpu = (unsigned long)hcpu;
struct workqueue_struct *wq;
switch (action) {
case CPU_UP_PREPARE:
/* Create a new workqueue thread for it. */
list_for_each_entry(wq, &workqueues, list) {
if (create_workqueue_thread(wq, hotcpu) < 0) {
printk("workqueue for %i failed\n", hotcpu);
return NOTIFY_BAD;
}
}
break;
case CPU_ONLINE:
/* Kick off worker threads. */
list_for_each_entry(wq, &workqueues, list) {
kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu);
wake_up_process(wq->cpu_wq[hotcpu].thread);
}
break;
case CPU_UP_CANCELED:
list_for_each_entry(wq, &workqueues, list) {
/* Unbind so it can run. */
kthread_bind(wq->cpu_wq[hotcpu].thread,
smp_processor_id());
cleanup_workqueue_thread(wq, hotcpu);
}
break;
case CPU_DEAD:
list_for_each_entry(wq, &workqueues, list)
cleanup_workqueue_thread(wq, hotcpu);
list_for_each_entry(wq, &workqueues, list)
take_over_work(wq, hotcpu);
break;
}
return NOTIFY_OK;
}
#endif
void init_workqueues(void)
{
hotcpu_notifier(workqueue_cpu_callback, 0);
keventd_wq = create_workqueue("events");
BUG_ON(!keventd_wq);
}
EXPORT_SYMBOL_GPL(__create_workqueue);
EXPORT_SYMBOL_GPL(queue_work);
EXPORT_SYMBOL_GPL(queue_delayed_work);
EXPORT_SYMBOL_GPL(flush_workqueue);
EXPORT_SYMBOL_GPL(destroy_workqueue);
EXPORT_SYMBOL(schedule_work);
EXPORT_SYMBOL(schedule_delayed_work);
EXPORT_SYMBOL(schedule_delayed_work_on);
EXPORT_SYMBOL(flush_scheduled_work);