From 778d03a089c7cc31c5d3f983ecd18704d287fa0e Mon Sep 17 00:00:00 2001 From: "iap10@labyrinth.cl.cam.ac.uk" Date: Thu, 15 Apr 2004 15:55:59 +0000 Subject: bitkeeper revision 1.864.1.1 (407eb08fW2eUAKOIh6v4T-Ew4bCchg) upgrade to linux 2.4.26 --- xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c | 1653 ++++++++++++++ xenolinux-2.4.26-sparse/drivers/char/mem.c | 815 +++++++ xenolinux-2.4.26-sparse/drivers/char/tty_io.c | 2472 +++++++++++++++++++++ 3 files changed, 4940 insertions(+) create mode 100644 xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c create mode 100644 xenolinux-2.4.26-sparse/drivers/char/mem.c create mode 100644 xenolinux-2.4.26-sparse/drivers/char/tty_io.c (limited to 'xenolinux-2.4.26-sparse/drivers') diff --git a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c new file mode 100644 index 0000000000..20a934addd --- /dev/null +++ b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c @@ -0,0 +1,1653 @@ +/* + * linux/drivers/block/ll_rw_blk.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1994, Karl Keyte: Added support for disk statistics + * Elevator latency, (C) 2000 Andrea Arcangeli SuSE + * Queue request tables / lock, selectable elevator, Jens Axboe + * kernel-doc documentation started by NeilBrown - July2000 + */ + +/* + * This handles all read/write requests to block devices + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * MAC Floppy IWM hooks + */ + +#ifdef CONFIG_MAC_FLOPPY_IWM +extern int mac_floppy_init(void); +#endif + +/* + * For the allocated request tables + */ +static kmem_cache_t *request_cachep; + +/* + * The "disk" task queue is used to start the actual requests + * after a plug + */ +DECLARE_TASK_QUEUE(tq_disk); + +/* + * Protect the request list against multiple users.. + * + * With this spinlock the Linux block IO subsystem is 100% SMP threaded + * from the IRQ event side, and almost 100% SMP threaded from the syscall + * side (we still have protect against block device array operations, and + * the do_request() side is casually still unsafe. The kernel lock protects + * this part currently.). + * + * there is a fair chance that things will work just OK if these functions + * are called with no global kernel lock held ... + */ +spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED; + +/* This specifies how many sectors to read ahead on the disk. */ + +int read_ahead[MAX_BLKDEV]; + +/* blk_dev_struct is: + * *request_fn + * *current_request + */ +struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */ + +/* + * blk_size contains the size of all block-devices in units of 1024 byte + * sectors: + * + * blk_size[MAJOR][MINOR] + * + * if (!blk_size[MAJOR]) then no minor size checking is done. + */ +int * blk_size[MAX_BLKDEV]; + +/* + * blksize_size contains the size of all block-devices: + * + * blksize_size[MAJOR][MINOR] + * + * if (!blksize_size[MAJOR]) then 1024 bytes is assumed. + */ +int * blksize_size[MAX_BLKDEV]; + +/* + * hardsect_size contains the size of the hardware sector of a device. + * + * hardsect_size[MAJOR][MINOR] + * + * if (!hardsect_size[MAJOR]) + * then 512 bytes is assumed. + * else + * sector_size is hardsect_size[MAJOR][MINOR] + * This is currently set by some scsi devices and read by the msdos fs driver. + * Other uses may appear later. + */ +int * hardsect_size[MAX_BLKDEV]; + +/* + * The following tunes the read-ahead algorithm in mm/filemap.c + */ +int * max_readahead[MAX_BLKDEV]; + +/* + * Max number of sectors per request + */ +int * max_sectors[MAX_BLKDEV]; + +unsigned long blk_max_low_pfn, blk_max_pfn; +int blk_nohighio = 0; + +int block_dump = 0; + +static struct timer_list writeback_timer; + +static inline int get_max_sectors(kdev_t dev) +{ + if (!max_sectors[MAJOR(dev)]) + return MAX_SECTORS; + return max_sectors[MAJOR(dev)][MINOR(dev)]; +} + +inline request_queue_t *blk_get_queue(kdev_t dev) +{ + struct blk_dev_struct *bdev = blk_dev + MAJOR(dev); + + if (bdev->queue) + return bdev->queue(dev); + else + return &blk_dev[MAJOR(dev)].request_queue; +} + +static int __blk_cleanup_queue(struct request_list *list) +{ + struct list_head *head = &list->free; + struct request *rq; + int i = 0; + + while (!list_empty(head)) { + rq = list_entry(head->next, struct request, queue); + list_del(&rq->queue); + kmem_cache_free(request_cachep, rq); + i++; + }; + + if (i != list->count) + printk("request list leak!\n"); + + list->count = 0; + return i; +} + +/** + * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed + * @q: the request queue to be released + * + * Description: + * blk_cleanup_queue is the pair to blk_init_queue(). It should + * be called when a request queue is being released; typically + * when a block device is being de-registered. Currently, its + * primary task it to free all the &struct request structures that + * were allocated to the queue. + * Caveat: + * Hopefully the low level driver will have finished any + * outstanding requests first... + **/ +void blk_cleanup_queue(request_queue_t * q) +{ + int count = q->nr_requests; + + count -= __blk_cleanup_queue(&q->rq); + + if (count) + printk("blk_cleanup_queue: leaked requests (%d)\n", count); + if (atomic_read(&q->nr_sectors)) + printk("blk_cleanup_queue: leaked sectors (%d)\n", atomic_read(&q->nr_sectors)); + + memset(q, 0, sizeof(*q)); +} + +/** + * blk_queue_headactive - indicate whether head of request queue may be active + * @q: The queue which this applies to. + * @active: A flag indication where the head of the queue is active. + * + * Description: + * The driver for a block device may choose to leave the currently active + * request on the request queue, removing it only when it has completed. + * The queue handling routines assume this by default for safety reasons + * and will not involve the head of the request queue in any merging or + * reordering of requests when the queue is unplugged (and thus may be + * working on this particular request). + * + * If a driver removes requests from the queue before processing them, then + * it may indicate that it does so, there by allowing the head of the queue + * to be involved in merging and reordering. This is done be calling + * blk_queue_headactive() with an @active flag of %0. + * + * If a driver processes several requests at once, it must remove them (or + * at least all but one of them) from the request queue. + * + * When a queue is plugged the head will be assumed to be inactive. + **/ + +void blk_queue_headactive(request_queue_t * q, int active) +{ + q->head_active = active; +} + +/** + * blk_queue_throttle_sectors - indicates you will call sector throttling funcs + * @q: The queue which this applies to. + * @active: A flag indication if you want sector throttling on + * + * Description: + * The sector throttling code allows us to put a limit on the number of + * sectors pending io to the disk at a given time, sending @active nonzero + * indicates you will call blk_started_sectors and blk_finished_sectors in + * addition to calling blk_started_io and blk_finished_io in order to + * keep track of the number of sectors in flight. + **/ + +void blk_queue_throttle_sectors(request_queue_t * q, int active) +{ + q->can_throttle = active; +} + +/** + * blk_queue_make_request - define an alternate make_request function for a device + * @q: the request queue for the device to be affected + * @mfn: the alternate make_request function + * + * Description: + * The normal way for &struct buffer_heads to be passed to a device + * driver is for them to be collected into requests on a request + * queue, and then to allow the device driver to select requests + * off that queue when it is ready. This works well for many block + * devices. However some block devices (typically virtual devices + * such as md or lvm) do not benefit from the processing on the + * request queue, and are served best by having the requests passed + * directly to them. This can be achieved by providing a function + * to blk_queue_make_request(). + * + * Caveat: + * The driver that does this *must* be able to deal appropriately + * with buffers in "highmemory", either by calling bh_kmap() to get + * a kernel mapping, to by calling create_bounce() to create a + * buffer in normal memory. + **/ + +void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) +{ + q->make_request_fn = mfn; +} + +/** + * blk_queue_bounce_limit - set bounce buffer limit for queue + * @q: the request queue for the device + * @dma_addr: bus address limit + * + * Description: + * Different hardware can have different requirements as to what pages + * it can do I/O directly to. A low level driver can call + * blk_queue_bounce_limit to have lower memory pages allocated as bounce + * buffers for doing I/O to pages residing above @page. By default + * the block layer sets this to the highest numbered "low" memory page. + **/ +void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) +{ + unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; + unsigned long mb = dma_addr >> 20; + static request_queue_t *old_q; + + /* + * keep this for debugging for now... + */ + if (dma_addr != BLK_BOUNCE_HIGH && q != old_q) { + old_q = q; + printk("blk: queue %p, ", q); + if (dma_addr == BLK_BOUNCE_ANY) + printk("no I/O memory limit\n"); + else + printk("I/O limit %luMb (mask 0x%Lx)\n", mb, + (long long) dma_addr); + } + + q->bounce_pfn = bounce_pfn; +} + + +/* + * can we merge the two segments, or do we need to start a new one? + */ +inline int blk_seg_merge_ok(struct buffer_head *bh, struct buffer_head *nxt) +{ + /* + * if bh and nxt are contigous and don't cross a 4g boundary, it's ok + */ + if (BH_CONTIG(bh, nxt) && BH_PHYS_4G(bh, nxt)) + return 1; + + return 0; +} + +static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments) +{ + if (req->nr_segments < max_segments) { + req->nr_segments++; + return 1; + } + return 0; +} + +static int ll_back_merge_fn(request_queue_t *q, struct request *req, + struct buffer_head *bh, int max_segments) +{ + if (blk_seg_merge_ok(req->bhtail, bh)) + return 1; + + return ll_new_segment(q, req, max_segments); +} + +static int ll_front_merge_fn(request_queue_t *q, struct request *req, + struct buffer_head *bh, int max_segments) +{ + if (blk_seg_merge_ok(bh, req->bh)) + return 1; + + return ll_new_segment(q, req, max_segments); +} + +static int ll_merge_requests_fn(request_queue_t *q, struct request *req, + struct request *next, int max_segments) +{ + int total_segments = req->nr_segments + next->nr_segments; + + if (blk_seg_merge_ok(req->bhtail, next->bh)) + total_segments--; + + if (total_segments > max_segments) + return 0; + + req->nr_segments = total_segments; + return 1; +} + +/* + * "plug" the device if there are no outstanding requests: this will + * force the transfer to start only after we have put all the requests + * on the list. + * + * This is called with interrupts off and no requests on the queue. + * (and with the request spinlock acquired) + */ +static void generic_plug_device(request_queue_t *q, kdev_t dev) +{ + /* + * no need to replug device + */ + if (!list_empty(&q->queue_head) || q->plugged) + return; + + q->plugged = 1; + queue_task(&q->plug_tq, &tq_disk); +} + +/* + * remove the plug and let it rip.. + */ +static inline void __generic_unplug_device(request_queue_t *q) +{ + if (q->plugged) { + q->plugged = 0; + if (!list_empty(&q->queue_head)) + q->request_fn(q); + } +} + +void generic_unplug_device(void *data) +{ + request_queue_t *q = (request_queue_t *) data; + unsigned long flags; + + spin_lock_irqsave(&io_request_lock, flags); + __generic_unplug_device(q); + spin_unlock_irqrestore(&io_request_lock, flags); +} + +/** blk_grow_request_list + * @q: The &request_queue_t + * @nr_requests: how many requests are desired + * + * More free requests are added to the queue's free lists, bringing + * the total number of requests to @nr_requests. + * + * The requests are added equally to the request queue's read + * and write freelists. + * + * This function can sleep. + * + * Returns the (new) number of requests which the queue has available. + */ +int blk_grow_request_list(request_queue_t *q, int nr_requests, int max_queue_sectors) +{ + unsigned long flags; + /* Several broken drivers assume that this function doesn't sleep, + * this causes system hangs during boot. + * As a temporary fix, make the function non-blocking. + */ + spin_lock_irqsave(&io_request_lock, flags); + while (q->nr_requests < nr_requests) { + struct request *rq; + + rq = kmem_cache_alloc(request_cachep, SLAB_ATOMIC); + if (rq == NULL) + break; + memset(rq, 0, sizeof(*rq)); + rq->rq_status = RQ_INACTIVE; + list_add(&rq->queue, &q->rq.free); + q->rq.count++; + + q->nr_requests++; + } + + /* + * Wakeup waiters after both one quarter of the + * max-in-fligh queue and one quarter of the requests + * are available again. + */ + + q->batch_requests = q->nr_requests / 4; + if (q->batch_requests > 32) + q->batch_requests = 32; + q->batch_sectors = max_queue_sectors / 4; + + q->max_queue_sectors = max_queue_sectors; + + BUG_ON(!q->batch_sectors); + atomic_set(&q->nr_sectors, 0); + + spin_unlock_irqrestore(&io_request_lock, flags); + return q->nr_requests; +} + +static void blk_init_free_list(request_queue_t *q) +{ + struct sysinfo si; + int megs; /* Total memory, in megabytes */ + int nr_requests, max_queue_sectors = MAX_QUEUE_SECTORS; + + INIT_LIST_HEAD(&q->rq.free); + q->rq.count = 0; + q->rq.pending[READ] = q->rq.pending[WRITE] = 0; + q->nr_requests = 0; + + si_meminfo(&si); + megs = si.totalram >> (20 - PAGE_SHIFT); + nr_requests = MAX_NR_REQUESTS; + if (megs < 30) { + nr_requests /= 2; + max_queue_sectors /= 2; + } + /* notice early if anybody screwed the defaults */ + BUG_ON(!nr_requests); + BUG_ON(!max_queue_sectors); + + blk_grow_request_list(q, nr_requests, max_queue_sectors); + + init_waitqueue_head(&q->wait_for_requests); + + spin_lock_init(&q->queue_lock); +} + +static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh); + +/** + * blk_init_queue - prepare a request queue for use with a block device + * @q: The &request_queue_t to be initialised + * @rfn: The function to be called to process requests that have been + * placed on the queue. + * + * Description: + * If a block device wishes to use the standard request handling procedures, + * which sorts requests and coalesces adjacent requests, then it must + * call blk_init_queue(). The function @rfn will be called when there + * are requests on the queue that need to be processed. If the device + * supports plugging, then @rfn may not be called immediately when requests + * are available on the queue, but may be called at some time later instead. + * Plugged queues are generally unplugged when a buffer belonging to one + * of the requests on the queue is needed, or due to memory pressure. + * + * @rfn is not required, or even expected, to remove all requests off the + * queue, but only as many as it can handle at a time. If it does leave + * requests on the queue, it is responsible for arranging that the requests + * get dealt with eventually. + * + * A global spin lock $io_request_lock must be held while manipulating the + * requests on the request queue. + * + * The request on the head of the queue is by default assumed to be + * potentially active, and it is not considered for re-ordering or merging + * whenever the given queue is unplugged. This behaviour can be changed with + * blk_queue_headactive(). + * + * Note: + * blk_init_queue() must be paired with a blk_cleanup_queue() call + * when the block device is deactivated (such as at module unload). + **/ +void blk_init_queue(request_queue_t * q, request_fn_proc * rfn) +{ + INIT_LIST_HEAD(&q->queue_head); + elevator_init(&q->elevator, ELEVATOR_LINUS); + blk_init_free_list(q); + q->request_fn = rfn; + q->back_merge_fn = ll_back_merge_fn; + q->front_merge_fn = ll_front_merge_fn; + q->merge_requests_fn = ll_merge_requests_fn; + q->make_request_fn = __make_request; + q->plug_tq.sync = 0; + q->plug_tq.routine = &generic_unplug_device; + q->plug_tq.data = q; + q->plugged = 0; + q->can_throttle = 0; + + /* + * These booleans describe the queue properties. We set the + * default (and most common) values here. Other drivers can + * use the appropriate functions to alter the queue properties. + * as appropriate. + */ + q->plug_device_fn = generic_plug_device; + q->head_active = 1; + + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); +} + +#define blkdev_free_rq(list) list_entry((list)->next, struct request, queue); +/* + * Get a free request. io_request_lock must be held and interrupts + * disabled on the way in. Returns NULL if there are no free requests. + */ +static struct request *get_request(request_queue_t *q, int rw) +{ + struct request *rq = NULL; + struct request_list *rl = &q->rq; + + if (blk_oversized_queue(q)) { + int rlim = q->nr_requests >> 5; + + if (rlim < 4) + rlim = 4; + + /* + * if its a write, or we have more than a handful of reads + * pending, bail out + */ + if ((rw == WRITE) || (rw == READ && rl->pending[READ] > rlim)) + return NULL; + if (blk_oversized_queue_reads(q)) + return NULL; + } + + if (!list_empty(&rl->free)) { + rq = blkdev_free_rq(&rl->free); + list_del(&rq->queue); + rl->count--; + rl->pending[rw]++; + rq->rq_status = RQ_ACTIVE; + rq->cmd = rw; + rq->special = NULL; + rq->q = q; + } + + return rq; +} + +/* + * Here's the request allocation design, low latency version: + * + * 1: Blocking on request exhaustion is a key part of I/O throttling. + * + * 2: We want to be `fair' to all requesters. We must avoid starvation, and + * attempt to ensure that all requesters sleep for a similar duration. Hence + * no stealing requests when there are other processes waiting. + * + * There used to be more here, attempting to allow a process to send in a + * number of requests once it has woken up. But, there's no way to + * tell if a process has just been woken up, or if it is a new process + * coming in to steal requests from the waiters. So, we give up and force + * everyone to wait fairly. + * + * So here's what we do: + * + * a) A READA requester fails if free_requests < batch_requests + * + * We don't want READA requests to prevent sleepers from ever + * waking. Note that READA is used extremely rarely - a few + * filesystems use it for directory readahead. + * + * When a process wants a new request: + * + * b) If free_requests == 0, the requester sleeps in FIFO manner, and + * the queue full condition is set. The full condition is not + * cleared until there are no longer any waiters. Once the full + * condition is set, all new io must wait, hopefully for a very + * short period of time. + * + * When a request is released: + * + * c) If free_requests < batch_requests, do nothing. + * + * d) If free_requests >= batch_requests, wake up a single waiter. + * + * As each waiter gets a request, he wakes another waiter. We do this + * to prevent a race where an unplug might get run before a request makes + * it's way onto the queue. The result is a cascade of wakeups, so delaying + * the initial wakeup until we've got batch_requests available helps avoid + * wakeups where there aren't any requests available yet. + */ + +static struct request *__get_request_wait(request_queue_t *q, int rw) +{ + register struct request *rq; + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&q->wait_for_requests, &wait); + + do { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irq(&io_request_lock); + if (blk_oversized_queue(q) || q->rq.count == 0) { + __generic_unplug_device(q); + spin_unlock_irq(&io_request_lock); + schedule(); + spin_lock_irq(&io_request_lock); + } + rq = get_request(q, rw); + spin_unlock_irq(&io_request_lock); + } while (rq == NULL); + remove_wait_queue(&q->wait_for_requests, &wait); + current->state = TASK_RUNNING; + + return rq; +} + +static void get_request_wait_wakeup(request_queue_t *q, int rw) +{ + /* + * avoid losing an unplug if a second __get_request_wait did the + * generic_unplug_device while our __get_request_wait was running + * w/o the queue_lock held and w/ our request out of the queue. + */ + if (waitqueue_active(&q->wait_for_requests)) + wake_up(&q->wait_for_requests); +} + +/* RO fail safe mechanism */ + +static long ro_bits[MAX_BLKDEV][8]; + +int is_read_only(kdev_t dev) +{ + int minor,major; + + major = MAJOR(dev); + minor = MINOR(dev); + if (major < 0 || major >= MAX_BLKDEV) return 0; + return ro_bits[major][minor >> 5] & (1 << (minor & 31)); +} + +void set_device_ro(kdev_t dev,int flag) +{ + int minor,major; + + major = MAJOR(dev); + minor = MINOR(dev); + if (major < 0 || major >= MAX_BLKDEV) return; + if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31); + else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31)); +} + +inline void drive_stat_acct (kdev_t dev, int rw, + unsigned long nr_sectors, int new_io) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + kstat.dk_drive[major][index] += new_io; + if (rw == READ) { + kstat.dk_drive_rio[major][index] += new_io; + kstat.dk_drive_rblk[major][index] += nr_sectors; + } else if (rw == WRITE) { + kstat.dk_drive_wio[major][index] += new_io; + kstat.dk_drive_wblk[major][index] += nr_sectors; + } else + printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n"); +} + +#ifdef CONFIG_BLK_STATS +/* + * Return up to two hd_structs on which to do IO accounting for a given + * request. + * + * On a partitioned device, we want to account both against the partition + * and against the whole disk. + */ +static void locate_hd_struct(struct request *req, + struct hd_struct **hd1, + struct hd_struct **hd2) +{ + struct gendisk *gd; + + *hd1 = NULL; + *hd2 = NULL; + + gd = get_gendisk(req->rq_dev); + if (gd && gd->part) { + /* Mask out the partition bits: account for the entire disk */ + int devnr = MINOR(req->rq_dev) >> gd->minor_shift; + int whole_minor = devnr << gd->minor_shift; + + *hd1 = &gd->part[whole_minor]; + if (whole_minor != MINOR(req->rq_dev)) + *hd2= &gd->part[MINOR(req->rq_dev)]; + } +} + +/* + * Round off the performance stats on an hd_struct. + * + * The average IO queue length and utilisation statistics are maintained + * by observing the current state of the queue length and the amount of + * time it has been in this state for. + * Normally, that accounting is done on IO completion, but that can result + * in more than a second's worth of IO being accounted for within any one + * second, leading to >100% utilisation. To deal with that, we do a + * round-off before returning the results when reading /proc/partitions, + * accounting immediately for all queue usage up to the current jiffies and + * restarting the counters again. + */ +void disk_round_stats(struct hd_struct *hd) +{ + unsigned long now = jiffies; + + hd->aveq += (hd->ios_in_flight * (jiffies - hd->last_queue_change)); + hd->last_queue_change = now; + + if (hd->ios_in_flight) + hd->io_ticks += (now - hd->last_idle_time); + hd->last_idle_time = now; +} + +static inline void down_ios(struct hd_struct *hd) +{ + disk_round_stats(hd); + --hd->ios_in_flight; +} + +static inline void up_ios(struct hd_struct *hd) +{ + disk_round_stats(hd); + ++hd->ios_in_flight; +} + +static void account_io_start(struct hd_struct *hd, struct request *req, + int merge, int sectors) +{ + switch (req->cmd) { + case READ: + if (merge) + hd->rd_merges++; + hd->rd_sectors += sectors; + break; + case WRITE: + if (merge) + hd->wr_merges++; + hd->wr_sectors += sectors; + break; + } + if (!merge) + up_ios(hd); +} + +static void account_io_end(struct hd_struct *hd, struct request *req) +{ + unsigned long duration = jiffies - req->start_time; + switch (req->cmd) { + case READ: + hd->rd_ticks += duration; + hd->rd_ios++; + break; + case WRITE: + hd->wr_ticks += duration; + hd->wr_ios++; + break; + } + down_ios(hd); +} + +void req_new_io(struct request *req, int merge, int sectors) +{ + struct hd_struct *hd1, *hd2; + + locate_hd_struct(req, &hd1, &hd2); + if (hd1) + account_io_start(hd1, req, merge, sectors); + if (hd2) + account_io_start(hd2, req, merge, sectors); +} + +void req_merged_io(struct request *req) +{ + struct hd_struct *hd1, *hd2; + + locate_hd_struct(req, &hd1, &hd2); + if (hd1) + down_ios(hd1); + if (hd2) + down_ios(hd2); +} + +void req_finished_io(struct request *req) +{ + struct hd_struct *hd1, *hd2; + + locate_hd_struct(req, &hd1, &hd2); + if (hd1) + account_io_end(hd1, req); + if (hd2) + account_io_end(hd2, req); +} +EXPORT_SYMBOL(req_finished_io); +#endif /* CONFIG_BLK_STATS */ + +/* + * add-request adds a request to the linked list. + * io_request_lock is held and interrupts disabled, as we muck with the + * request queue list. + * + * By this point, req->cmd is always either READ/WRITE, never READA, + * which is important for drive_stat_acct() above. + */ +static inline void add_request(request_queue_t * q, struct request * req, + struct list_head *insert_here) +{ + drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1); + + if (!q->plugged && q->head_active && insert_here == &q->queue_head) { + spin_unlock_irq(&io_request_lock); + BUG(); + } + + /* + * elevator indicated where it wants this request to be + * inserted at elevator_merge time + */ + list_add(&req->queue, insert_here); +} + +/* + * Must be called with io_request_lock held and interrupts disabled + */ +void blkdev_release_request(struct request *req) +{ + request_queue_t *q = req->q; + + req->rq_status = RQ_INACTIVE; + req->q = NULL; + + /* + * Request may not have originated from ll_rw_blk. if not, + * assume it has free buffers and check waiters + */ + if (q) { + struct request_list *rl = &q->rq; + int oversized_batch = 0; + + if (q->can_throttle) + oversized_batch = blk_oversized_queue_batch(q); + rl->count++; + /* + * paranoia check + */ + if (req->cmd == READ || req->cmd == WRITE) + rl->pending[req->cmd]--; + if (rl->pending[READ] > q->nr_requests) + printk("blk: reads: %u\n", rl->pending[READ]); + if (rl->pending[WRITE] > q->nr_requests) + printk("blk: writes: %u\n", rl->pending[WRITE]); + if (rl->pending[READ] + rl->pending[WRITE] > q->nr_requests) + printk("blk: r/w: %u + %u > %u\n", rl->pending[READ], rl->pending[WRITE], q->nr_requests); + list_add(&req->queue, &rl->free); + if (rl->count >= q->batch_requests && !oversized_batch) { + smp_mb(); + if (waitqueue_active(&q->wait_for_requests)) + wake_up(&q->wait_for_requests); + } + } +} + +/* + * Has to be called with the request spinlock acquired + */ +static void attempt_merge(request_queue_t * q, + struct request *req, + int max_sectors, + int max_segments) +{ + struct request *next; + + next = blkdev_next_request(req); + if (req->sector + req->nr_sectors != next->sector) + return; + if (req->cmd != next->cmd + || req->rq_dev != next->rq_dev + || req->nr_sectors + next->nr_sectors > max_sectors + || next->waiting) + return; + /* + * If we are not allowed to merge these requests, then + * return. If we are allowed to merge, then the count + * will have been updated to the appropriate number, + * and we shouldn't do it here too. + */ + if (!q->merge_requests_fn(q, req, next, max_segments)) + return; + + q->elevator.elevator_merge_req_fn(req, next); + + /* At this point we have either done a back merge + * or front merge. We need the smaller start_time of + * the merged requests to be the current request + * for accounting purposes. + */ + if (time_after(req->start_time, next->start_time)) + req->start_time = next->start_time; + + req->bhtail->b_reqnext = next->bh; + req->bhtail = next->bhtail; + req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; + list_del(&next->queue); + + /* One last thing: we have removed a request, so we now have one + less expected IO to complete for accounting purposes. */ + req_merged_io(req); + + blkdev_release_request(next); +} + +static inline void attempt_back_merge(request_queue_t * q, + struct request *req, + int max_sectors, + int max_segments) +{ + if (&req->queue == q->queue_head.prev) + return; + attempt_merge(q, req, max_sectors, max_segments); +} + +static inline void attempt_front_merge(request_queue_t * q, + struct list_head * head, + struct request *req, + int max_sectors, + int max_segments) +{ + struct list_head * prev; + + prev = req->queue.prev; + if (head == prev) + return; + attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments); +} + +static int __make_request(request_queue_t * q, int rw, + struct buffer_head * bh) +{ + unsigned int sector, count, sync; + int max_segments = MAX_SEGMENTS; + struct request * req, *freereq = NULL; + int rw_ahead, max_sectors, el_ret; + struct list_head *head, *insert_here; + int latency; + elevator_t *elevator = &q->elevator; + int should_wake = 0; + + count = bh->b_size >> 9; + sector = bh->b_rsector; + sync = test_and_clear_bit(BH_Sync, &bh->b_state); + + rw_ahead = 0; /* normal case; gets changed below for READA */ + switch (rw) { + case READA: +#if 0 /* bread() misinterprets failed READA attempts as IO errors on SMP */ + rw_ahead = 1; +#endif + rw = READ; /* drop into READ */ + case READ: + case WRITE: + latency = elevator_request_latency(elevator, rw); + break; + default: + BUG(); + goto end_io; + } + + /* We'd better have a real physical mapping! + Check this bit only if the buffer was dirty and just locked + down by us so at this point flushpage will block and + won't clear the mapped bit under us. */ + if (!buffer_mapped(bh)) + BUG(); + + /* + * Temporary solution - in 2.5 this will be done by the lowlevel + * driver. Create a bounce buffer if the buffer data points into + * high memory - keep the original buffer otherwise. + */ + bh = blk_queue_bounce(q, rw, bh); + +/* look for a free request. */ + /* + * Try to coalesce the new request with old requests + */ + max_sectors = get_max_sectors(bh->b_rdev); + + req = NULL; + head = &q->queue_head; + /* + * Now we acquire the request spinlock, we have to be mega careful + * not to schedule or do something nonatomic + */ + spin_lock_irq(&io_request_lock); + +again: + insert_here = head->prev; + + if (list_empty(head)) { + q->plug_device_fn(q, bh->b_rdev); /* is atomic */ + goto get_rq; + } else if (q->head_active && !q->plugged) + head = head->next; + + el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors); + switch (el_ret) { + + case ELEVATOR_BACK_MERGE: + if (!q->back_merge_fn(q, req, bh, max_segments)) { + insert_here = &req->queue; + break; + } + req->bhtail->b_reqnext = bh; + req->bhtail = bh; + req->nr_sectors = req->hard_nr_sectors += count; + blk_started_io(count); + blk_started_sectors(req, count); + drive_stat_acct(req->rq_dev, req->cmd, count, 0); + req_new_io(req, 1, count); + attempt_back_merge(q, req, max_sectors, max_segments); + goto out; + + case ELEVATOR_FRONT_MERGE: + if (!q->front_merge_fn(q, req, bh, max_segments)) { + insert_here = req->queue.prev; + break; + } + bh->b_reqnext = req->bh; + req->bh = bh; + /* + * may not be valid, but queues not having bounce + * enabled for highmem pages must not look at + * ->buffer anyway + */ + req->buffer = bh->b_data; + req->current_nr_sectors = req->hard_cur_sectors = count; + req->sector = req->hard_sector = sector; + req->nr_sectors = req->hard_nr_sectors += count; + blk_started_io(count); + blk_started_sectors(req, count); + drive_stat_acct(req->rq_dev, req->cmd, count, 0); + req_new_io(req, 1, count); + attempt_front_merge(q, head, req, max_sectors, max_segments); + goto out; + + /* + * elevator says don't/can't merge. get new request + */ + case ELEVATOR_NO_MERGE: + /* + * use elevator hints as to where to insert the + * request. if no hints, just add it to the back + * of the queue + */ + if (req) + insert_here = &req->queue; + break; + + default: + printk("elevator returned crap (%d)\n", el_ret); + BUG(); + } + +get_rq: + if (freereq) { + req = freereq; + freereq = NULL; + } else { + /* + * See description above __get_request_wait() + */ + if (rw_ahead) { + if (q->rq.count < q->batch_requests || blk_oversized_queue_batch(q)) { + spin_unlock_irq(&io_request_lock); + goto end_io; + } + req = get_request(q, rw); + if (req == NULL) + BUG(); + } else { + req = get_request(q, rw); + if (req == NULL) { + spin_unlock_irq(&io_request_lock); + freereq = __get_request_wait(q, rw); + head = &q->queue_head; + spin_lock_irq(&io_request_lock); + should_wake = 1; + goto again; + } + } + } + +/* fill up the request-info, and add it to the queue */ + req->elevator_sequence = latency; + req->cmd = rw; + req->errors = 0; + req->hard_sector = req->sector = sector; + req->hard_nr_sectors = req->nr_sectors = count; + req->current_nr_sectors = req->hard_cur_sectors = count; + req->nr_segments = 1; /* Always 1 for a new request. */ + req->nr_hw_segments = 1; /* Always 1 for a new request. */ + req->buffer = bh->b_data; + req->waiting = NULL; + req->bh = bh; + req->bhtail = bh; + req->rq_dev = bh->b_rdev; + req->start_time = jiffies; + req_new_io(req, 0, count); + blk_started_io(count); + blk_started_sectors(req, count); + add_request(q, req, insert_here); +out: + if (freereq) + blkdev_release_request(freereq); + if (should_wake) + get_request_wait_wakeup(q, rw); + if (sync) + __generic_unplug_device(q); + spin_unlock_irq(&io_request_lock); + return 0; +end_io: + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); + return 0; +} + +/** + * generic_make_request: hand a buffer head to it's device driver for I/O + * @rw: READ, WRITE, or READA - what sort of I/O is desired. + * @bh: The buffer head describing the location in memory and on the device. + * + * generic_make_request() is used to make I/O requests of block + * devices. It is passed a &struct buffer_head and a &rw value. The + * %READ and %WRITE options are (hopefully) obvious in meaning. The + * %READA value means that a read is required, but that the driver is + * free to fail the request if, for example, it cannot get needed + * resources immediately. + * + * generic_make_request() does not return any status. The + * success/failure status of the request, along with notification of + * completion, is delivered asynchronously through the bh->b_end_io + * function described (one day) else where. + * + * The caller of generic_make_request must make sure that b_page, + * b_addr, b_size are set to describe the memory buffer, that b_rdev + * and b_rsector are set to describe the device address, and the + * b_end_io and optionally b_private are set to describe how + * completion notification should be signaled. BH_Mapped should also + * be set (to confirm that b_dev and b_blocknr are valid). + * + * generic_make_request and the drivers it calls may use b_reqnext, + * and may change b_rdev and b_rsector. So the values of these fields + * should NOT be depended on after the call to generic_make_request. + * Because of this, the caller should record the device address + * information in b_dev and b_blocknr. + * + * Apart from those fields mentioned above, no other fields, and in + * particular, no other flags, are changed by generic_make_request or + * any lower level drivers. + * */ +void generic_make_request (int rw, struct buffer_head * bh) +{ + int major = MAJOR(bh->b_rdev); + int minorsize = 0; + request_queue_t *q; + + if (!bh->b_end_io) + BUG(); + + /* Test device size, when known. */ + if (blk_size[major]) + minorsize = blk_size[major][MINOR(bh->b_rdev)]; + if (minorsize) { + unsigned long maxsector = (minorsize << 1) + 1; + unsigned long sector = bh->b_rsector; + unsigned int count = bh->b_size >> 9; + + if (maxsector < count || maxsector - count < sector) { + /* Yecch */ + bh->b_state &= ~(1 << BH_Dirty); + + /* This may well happen - the kernel calls bread() + without checking the size of the device, e.g., + when mounting a device. */ + printk(KERN_INFO + "attempt to access beyond end of device\n"); + printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n", + kdevname(bh->b_rdev), rw, + (sector + count)>>1, minorsize); + + bh->b_end_io(bh, 0); + return; + } + } + + /* + * Resolve the mapping until finished. (drivers are + * still free to implement/resolve their own stacking + * by explicitly returning 0) + */ + /* NOTE: we don't repeat the blk_size check for each new device. + * Stacking drivers are expected to know what they are doing. + */ + do { + q = blk_get_queue(bh->b_rdev); + if (!q) { + printk(KERN_ERR + "generic_make_request: Trying to access " + "nonexistent block-device %s (%ld)\n", + kdevname(bh->b_rdev), bh->b_rsector); + buffer_IO_error(bh); + break; + } + } while (q->make_request_fn(q, rw, bh)); +} + + +/** + * submit_bh: submit a buffer_head to the block device later for I/O + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bh: The &struct buffer_head which describes the I/O + * + * submit_bh() is very similar in purpose to generic_make_request(), and + * uses that function to do most of the work. + * + * The extra functionality provided by submit_bh is to determine + * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev. + * This is is appropriate for IO requests that come from the buffer + * cache and page cache which (currently) always use aligned blocks. + */ +void submit_bh(int rw, struct buffer_head * bh) +{ + int count = bh->b_size >> 9; + + if (!test_bit(BH_Lock, &bh->b_state)) + BUG(); + + set_bit(BH_Req, &bh->b_state); + set_bit(BH_Launder, &bh->b_state); + + /* + * First step, 'identity mapping' - RAID or LVM might + * further remap this. + */ + bh->b_rdev = bh->b_dev; + bh->b_rsector = bh->b_blocknr * count; + + get_bh(bh); + generic_make_request(rw, bh); + + /* fix race condition with wait_on_buffer() */ + smp_mb(); /* spin_unlock may have inclusive semantics */ + if (waitqueue_active(&bh->b_wait)) + wake_up(&bh->b_wait); + + if (block_dump) + printk(KERN_DEBUG "%s: %s block %lu/%u on %s\n", current->comm, rw == WRITE ? "WRITE" : "READ", bh->b_rsector, count, kdevname(bh->b_rdev)); + + put_bh(bh); + switch (rw) { + case WRITE: + kstat.pgpgout += count; + break; + default: + kstat.pgpgin += count; + break; + } +} + +/** + * ll_rw_block: low-level access to block devices + * @rw: whether to %READ or %WRITE or maybe %READA (readahead) + * @nr: number of &struct buffer_heads in the array + * @bhs: array of pointers to &struct buffer_head + * + * ll_rw_block() takes an array of pointers to &struct buffer_heads, + * and requests an I/O operation on them, either a %READ or a %WRITE. + * The third %READA option is described in the documentation for + * generic_make_request() which ll_rw_block() calls. + * + * This function provides extra functionality that is not in + * generic_make_request() that is relevant to buffers in the buffer + * cache or page cache. In particular it drops any buffer that it + * cannot get a lock on (with the BH_Lock state bit), any buffer that + * appears to be clean when doing a write request, and any buffer that + * appears to be up-to-date when doing read request. Further it marks + * as clean buffers that are processed for writing (the buffer cache + * wont assume that they are actually clean until the buffer gets + * unlocked). + * + * ll_rw_block sets b_end_io to simple completion handler that marks + * the buffer up-to-date (if approriate), unlocks the buffer and wakes + * any waiters. As client that needs a more interesting completion + * routine should call submit_bh() (or generic_make_request()) + * directly. + * + * Caveat: + * All of the buffers must be for the same device, and must also be + * of the current approved size for the device. */ + +void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]) +{ + unsigned int major; + int correct_size; + int i; + + if (!nr) + return; + + major = MAJOR(bhs[0]->b_dev); + + /* Determine correct block size for this device. */ + correct_size = get_hardsect_size(bhs[0]->b_dev); + + /* Verify requested block sizes. */ + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + if (bh->b_size % correct_size) { + printk(KERN_NOTICE "ll_rw_block: device %s: " + "only %d-char blocks implemented (%u)\n", + kdevname(bhs[0]->b_dev), + correct_size, bh->b_size); + goto sorry; + } + } + + if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) { + printk(KERN_NOTICE "Can't write to read-only device %s\n", + kdevname(bhs[0]->b_dev)); + goto sorry; + } + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + lock_buffer(bh); + + /* We have the buffer lock */ + atomic_inc(&bh->b_count); + bh->b_end_io = end_buffer_io_sync; + + switch(rw) { + case WRITE: + if (!atomic_set_buffer_clean(bh)) + /* Hmmph! Nothing to write */ + goto end_io; + __mark_buffer_clean(bh); + break; + + case READA: + case READ: + if (buffer_uptodate(bh)) + /* Hmmph! Already have it */ + goto end_io; + break; + default: + BUG(); + end_io: + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); + continue; + } + + submit_bh(rw, bh); + } + return; + +sorry: + /* Make sure we don't get infinite dirty retries.. */ + for (i = 0; i < nr; i++) + mark_buffer_clean(bhs[i]); +} + +#ifdef CONFIG_STRAM_SWAP +extern int stram_device_init (void); +#endif + +static void blk_writeback_timer(unsigned long data) +{ + wakeup_bdflush(); + wakeup_kupdate(); +} + +/** + * end_that_request_first - end I/O on one buffer. + * @req: the request being processed + * @uptodate: 0 for I/O error + * @name: the name printed for an I/O error + * + * Description: + * Ends I/O on the first buffer attached to @req, and sets it up + * for the next buffer_head (if any) in the cluster. + * + * Return: + * 0 - we are done with this request, call end_that_request_last() + * 1 - still buffers pending for this request + * + * Caveat: + * Drivers implementing their own end_request handling must call + * blk_finished_io() appropriately. + **/ + +int end_that_request_first (struct request *req, int uptodate, char *name) +{ + struct buffer_head * bh; + int nsect; + + req->errors = 0; + if (!uptodate) + printk("end_request: I/O error, dev %s (%s), sector %lu\n", + kdevname(req->rq_dev), name, req->sector); + + if ((bh = req->bh) != NULL) { + nsect = bh->b_size >> 9; + blk_finished_io(nsect); + blk_finished_sectors(req, nsect); + req->bh = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, uptodate); + if ((bh = req->bh) != NULL) { + req->hard_sector += nsect; + req->hard_nr_sectors -= nsect; + req->sector = req->hard_sector; + req->nr_sectors = req->hard_nr_sectors; + + req->current_nr_sectors = bh->b_size >> 9; + req->hard_cur_sectors = req->current_nr_sectors; + if (req->nr_sectors < req->current_nr_sectors) { + req->nr_sectors = req->current_nr_sectors; + printk("end_request: buffer-list destroyed\n"); + } + req->buffer = bh->b_data; + return 1; + } + } + return 0; +} + +extern int laptop_mode; + +void end_that_request_last(struct request *req) +{ + struct completion *waiting = req->waiting; + + /* + * schedule the writeout of pending dirty data when the disk is idle + */ + if (laptop_mode && req->cmd == READ) + mod_timer(&writeback_timer, jiffies + 5 * HZ); + + req_finished_io(req); + blkdev_release_request(req); + if (waiting) + complete(waiting); +} + +int __init blk_dev_init(void) +{ + struct blk_dev_struct *dev; + + request_cachep = kmem_cache_create("blkdev_requests", + sizeof(struct request), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + + if (!request_cachep) + panic("Can't create request pool slab cache\n"); + + for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) + dev->queue = NULL; + + memset(ro_bits,0,sizeof(ro_bits)); + memset(max_readahead, 0, sizeof(max_readahead)); + memset(max_sectors, 0, sizeof(max_sectors)); + + blk_max_low_pfn = max_low_pfn - 1; + blk_max_pfn = max_pfn - 1; + + init_timer(&writeback_timer); + writeback_timer.function = blk_writeback_timer; + +#ifdef CONFIG_AMIGA_Z2RAM + z2_init(); +#endif +#ifdef CONFIG_STRAM_SWAP + stram_device_init(); +#endif +#ifdef CONFIG_ISP16_CDI + isp16_init(); +#endif +#ifdef CONFIG_BLK_DEV_PS2 + ps2esdi_init(); +#endif +#ifdef CONFIG_BLK_DEV_XD + xd_init(); +#endif +#ifdef CONFIG_BLK_DEV_MFM + mfm_init(); +#endif +#ifdef CONFIG_PARIDE + { extern void paride_init(void); paride_init(); }; +#endif +#ifdef CONFIG_MAC_FLOPPY + swim3_init(); +#endif +#ifdef CONFIG_BLK_DEV_SWIM_IOP + swimiop_init(); +#endif +#ifdef CONFIG_AMIGA_FLOPPY + amiga_floppy_init(); +#endif +#ifdef CONFIG_ATARI_FLOPPY + atari_floppy_init(); +#endif +#ifdef CONFIG_BLK_DEV_FD + floppy_init(); +#else +#if defined(__i386__) && !defined(CONFIG_XEN) /* Do we even need this? */ + outb_p(0xc, 0x3f2); +#endif +#endif +#ifdef CONFIG_CDU31A + cdu31a_init(); +#endif +#ifdef CONFIG_ATARI_ACSI + acsi_init(); +#endif +#ifdef CONFIG_MCD + mcd_init(); +#endif +#ifdef CONFIG_MCDX + mcdx_init(); +#endif +#ifdef CONFIG_SBPCD + sbpcd_init(); +#endif +#ifdef CONFIG_AZTCD + aztcd_init(); +#endif +#ifdef CONFIG_CDU535 + sony535_init(); +#endif +#ifdef CONFIG_GSCD + gscd_init(); +#endif +#ifdef CONFIG_CM206 + cm206_init(); +#endif +#ifdef CONFIG_OPTCD + optcd_init(); +#endif +#ifdef CONFIG_SJCD + sjcd_init(); +#endif +#ifdef CONFIG_APBLOCK + ap_init(); +#endif +#ifdef CONFIG_DDV + ddv_init(); +#endif +#ifdef CONFIG_MDISK + mdisk_init(); +#endif +#ifdef CONFIG_DASD + dasd_init(); +#endif +#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_BLOCK) + tapeblock_init(); +#endif +#ifdef CONFIG_BLK_DEV_XPRAM + xpram_init(); +#endif + +#ifdef CONFIG_SUN_JSFLASH + jsfd_init(); +#endif + +#ifdef CONFIG_XEN_VBD + xlblk_init(); +#endif + + return 0; +}; + +EXPORT_SYMBOL(io_request_lock); +EXPORT_SYMBOL(end_that_request_first); +EXPORT_SYMBOL(end_that_request_last); +EXPORT_SYMBOL(blk_grow_request_list); +EXPORT_SYMBOL(blk_init_queue); +EXPORT_SYMBOL(blk_get_queue); +EXPORT_SYMBOL(blk_cleanup_queue); +EXPORT_SYMBOL(blk_queue_headactive); +EXPORT_SYMBOL(blk_queue_throttle_sectors); +EXPORT_SYMBOL(blk_queue_make_request); +EXPORT_SYMBOL(generic_make_request); +EXPORT_SYMBOL(blkdev_release_request); +EXPORT_SYMBOL(generic_unplug_device); +EXPORT_SYMBOL(blk_queue_bounce_limit); +EXPORT_SYMBOL(blk_max_low_pfn); +EXPORT_SYMBOL(blk_max_pfn); +EXPORT_SYMBOL(blk_seg_merge_ok); +EXPORT_SYMBOL(blk_nohighio); diff --git a/xenolinux-2.4.26-sparse/drivers/char/mem.c b/xenolinux-2.4.26-sparse/drivers/char/mem.c new file mode 100644 index 0000000000..dbc10d6382 --- /dev/null +++ b/xenolinux-2.4.26-sparse/drivers/char/mem.c @@ -0,0 +1,815 @@ +/* + * linux/drivers/char/mem.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Added devfs support. + * Jan-11-1998, C. Scott Ananian + * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar + * + * MODIFIED FOR XEN by Keir Fraser, 10th July 2003. + * Linux running on Xen has strange semantics for /dev/mem and /dev/kmem!! + * 1. mmap will not work on /dev/kmem + * 2. mmap on /dev/mem interprets the 'file offset' as a machine address + * rather than a physical address. + * I don't believe anyone sane mmaps /dev/kmem, but /dev/mem is mmapped + * to get at memory-mapped I/O spaces (eg. the VESA X server does this). + * For this to work at all we need to expect machine addresses. + * Reading/writing of /dev/kmem expects kernel virtual addresses, as usual. + * Reading/writing of /dev/mem expects 'physical addresses' as usual -- this + * is because /dev/mem can only read/write existing kernel mappings, which + * will be normal RAM, and we should present pseudo-physical layout for all + * except I/O (which is the sticky case that mmap is hacked to deal with). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_I2C +extern int i2c_init_all(void); +#endif +#ifdef CONFIG_FB +extern void fbmem_init(void); +#endif +#ifdef CONFIG_PROM_CONSOLE +extern void prom_con_init(void); +#endif +#ifdef CONFIG_MDA_CONSOLE +extern void mda_console_init(void); +#endif +#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) +extern void tapechar_init(void); +#endif + +static ssize_t do_write_mem(struct file * file, void *p, unsigned long realp, + const char * buf, size_t count, loff_t *ppos) +{ + ssize_t written; + + written = 0; +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. */ + if (realp < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE-realp; + if (sz > count) sz = count; + /* Hmm. Do something? */ + buf+=sz; + p+=sz; + count-=sz; + written+=sz; + } +#endif + if (copy_from_user(p, buf, count)) + return -EFAULT; + written += count; + *ppos += written; + return written; +} + + +/* + * This funcion reads the *physical* memory. The f_pos points directly to the + * memory location. + */ +static ssize_t read_mem(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + unsigned long end_mem; + ssize_t read; + + end_mem = __pa(high_memory); + if (p >= end_mem) + return 0; + if (count > end_mem - p) + count = end_mem - p; + read = 0; +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. */ + if (p < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE-p; + if (sz > count) + sz = count; + if (sz > 0) { + if (clear_user(buf, sz)) + return -EFAULT; + buf += sz; + p += sz; + count -= sz; + read += sz; + } + } +#endif + if (copy_to_user(buf, __va(p), count)) + return -EFAULT; + read += count; + *ppos += read; + return read; +} + +static ssize_t write_mem(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + unsigned long end_mem; + + end_mem = __pa(high_memory); + if (p >= end_mem) + return 0; + if (count > end_mem - p) + count = end_mem - p; + return do_write_mem(file, __va(p), p, buf, count, ppos); +} + +#ifndef pgprot_noncached + +/* + * This should probably be per-architecture in + */ +static inline pgprot_t pgprot_noncached(pgprot_t _prot) +{ + unsigned long prot = pgprot_val(_prot); + +#if defined(__i386__) || defined(__x86_64__) + /* On PPro and successors, PCD alone doesn't always mean + uncached because of interactions with the MTRRs. PCD | PWT + means definitely uncached. */ + if (boot_cpu_data.x86 > 3) + prot |= _PAGE_PCD | _PAGE_PWT; +#elif defined(__powerpc__) + prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; +#elif defined(__mc68000__) +#ifdef SUN3_PAGE_NOCACHE + if (MMU_IS_SUN3) + prot |= SUN3_PAGE_NOCACHE; + else +#endif + if (MMU_IS_851 || MMU_IS_030) + prot |= _PAGE_NOCACHE030; + /* Use no-cache mode, serialized */ + else if (MMU_IS_040 || MMU_IS_060) + prot = (prot & _CACHEMASK040) | _PAGE_NOCACHE_S; +#endif + + return __pgprot(prot); +} + +#endif /* !pgprot_noncached */ + +/* + * Architectures vary in how they handle caching for addresses + * outside of main memory. + */ +static inline int noncached_address(unsigned long addr) +{ +#if defined(__i386__) + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting PCD or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + return !( test_bit(X86_FEATURE_MTRR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_K6_MTRR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CYRIX_ARR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CENTAUR_MCR, &boot_cpu_data.x86_capability) ) + && addr >= __pa(high_memory); +#else + return addr >= __pa(high_memory); +#endif +} + +static int mmap_mem(struct file * file, struct vm_area_struct * vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + +#if defined(CONFIG_XEN) && defined(CONFIG_XEN_PRIVILEGED_GUEST) + if (!(start_info.flags & SIF_PRIVILEGED)) + return -ENXIO; + + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset, + vma->vm_end-vma->vm_start, vma->vm_page_prot)) + return -EAGAIN; + return 0; +#elif defined(CONFIG_XEN) + return -ENXIO; +#else + /* + * Accessing memory above the top the kernel knows about or + * through a file pointer that was marked O_SYNC will be + * done non-cached. + */ + if (noncached_address(offset) || (file->f_flags & O_SYNC)) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + /* Don't try to swap out physical pages.. */ + vma->vm_flags |= VM_RESERVED; + + /* + * Don't dump addresses that are not real memory to a core file. + */ + if (offset >= __pa(high_memory) || (file->f_flags & O_SYNC)) + vma->vm_flags |= VM_IO; + + if (remap_page_range(vma->vm_start, offset, vma->vm_end-vma->vm_start, + vma->vm_page_prot)) + return -EAGAIN; + return 0; +#endif +} + +/* + * This function reads the *virtual* memory as seen by the kernel. + */ +static ssize_t read_kmem(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t read = 0; + ssize_t virtr = 0; + char * kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ + + if (p < (unsigned long) high_memory) { + read = count; + if (count > (unsigned long) high_memory - p) + read = (unsigned long) high_memory - p; + +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. */ + if (p < PAGE_SIZE && read > 0) { + size_t tmp = PAGE_SIZE - p; + if (tmp > read) tmp = read; + if (clear_user(buf, tmp)) + return -EFAULT; + buf += tmp; + p += tmp; + read -= tmp; + count -= tmp; + } +#endif + if (copy_to_user(buf, (char *)p, read)) + return -EFAULT; + p += read; + buf += read; + count -= read; + } + + if (count > 0) { + kbuf = (char *)__get_free_page(GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + while (count > 0) { + int len = count; + + if (len > PAGE_SIZE) + len = PAGE_SIZE; + len = vread(kbuf, (char *)p, len); + if (!len) + break; + if (copy_to_user(buf, kbuf, len)) { + free_page((unsigned long)kbuf); + return -EFAULT; + } + count -= len; + buf += len; + virtr += len; + p += len; + } + free_page((unsigned long)kbuf); + } + *ppos = p; + return virtr + read; +} + +extern long vwrite(char *buf, char *addr, unsigned long count); + +/* + * This function writes to the *virtual* memory as seen by the kernel. + */ +static ssize_t write_kmem(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t wrote = 0; + ssize_t virtr = 0; + char * kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ + + if (p < (unsigned long) high_memory) { + wrote = count; + if (count > (unsigned long) high_memory - p) + wrote = (unsigned long) high_memory - p; + + wrote = do_write_mem(file, (void*)p, p, buf, wrote, ppos); + + p += wrote; + buf += wrote; + count -= wrote; + } + + if (count > 0) { + kbuf = (char *)__get_free_page(GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + while (count > 0) { + int len = count; + + if (len > PAGE_SIZE) + len = PAGE_SIZE; + if (len && copy_from_user(kbuf, buf, len)) { + free_page((unsigned long)kbuf); + return -EFAULT; + } + len = vwrite(kbuf, (char *)p, len); + count -= len; + buf += len; + virtr += len; + p += len; + } + free_page((unsigned long)kbuf); + } + + *ppos = p; + return virtr + wrote; +} + +#if defined(CONFIG_ISA) || !defined(__mc68000__) +static ssize_t read_port(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long i = *ppos; + char *tmp = buf; + + if (verify_area(VERIFY_WRITE,buf,count)) + return -EFAULT; + while (count-- > 0 && i < 65536) { + if (__put_user(inb(i),tmp) < 0) + return -EFAULT; + i++; + tmp++; + } + *ppos = i; + return tmp-buf; +} + +static ssize_t write_port(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long i = *ppos; + const char * tmp = buf; + + if (verify_area(VERIFY_READ,buf,count)) + return -EFAULT; + while (count-- > 0 && i < 65536) { + char c; + if (__get_user(c, tmp)) + return -EFAULT; + outb(c,i); + i++; + tmp++; + } + *ppos = i; + return tmp-buf; +} +#endif + +static ssize_t read_null(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t write_null(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return count; +} + +/* + * For fun, we are using the MMU for this. + */ +static inline size_t read_zero_pagealigned(char * buf, size_t size) +{ + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long addr=(unsigned long)buf; + + mm = current->mm; + /* Oops, this was forgotten before. -ben */ + down_read(&mm->mmap_sem); + + /* For private mappings, just map in zero pages. */ + for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { + unsigned long count; + + if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) + goto out_up; + if (vma->vm_flags & VM_SHARED) + break; +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) + if (vma->vm_flags & VM_IO) + break; +#endif + count = vma->vm_end - addr; + if (count > size) + count = size; + + zap_page_range(mm, addr, count); + zeromap_page_range(addr, count, PAGE_COPY); + + size -= count; + buf += count; + addr += count; + if (size == 0) + goto out_up; + } + + up_read(&mm->mmap_sem); + + /* The shared case is hard. Let's do the conventional zeroing. */ + do { + unsigned long unwritten = clear_user(buf, PAGE_SIZE); + if (unwritten) + return size + unwritten - PAGE_SIZE; + if (current->need_resched) + schedule(); + buf += PAGE_SIZE; + size -= PAGE_SIZE; + } while (size); + + return size; +out_up: + up_read(&mm->mmap_sem); + return size; +} + +static ssize_t read_zero(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long left, unwritten, written = 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + left = count; + + /* do we want to be clever? Arbitrary cut-off */ + if (count >= PAGE_SIZE*4) { + unsigned long partial; + + /* How much left of the page? */ + partial = (PAGE_SIZE-1) & -(unsigned long) buf; + unwritten = clear_user(buf, partial); + written = partial - unwritten; + if (unwritten) + goto out; + left -= partial; + buf += partial; + unwritten = read_zero_pagealigned(buf, left & PAGE_MASK); + written += (left & PAGE_MASK) - unwritten; + if (unwritten) + goto out; + buf += left & PAGE_MASK; + left &= ~PAGE_MASK; + } + unwritten = clear_user(buf, left); + written += left - unwritten; +out: + return written ? written : -EFAULT; +} + +static int mmap_zero(struct file * file, struct vm_area_struct * vma) +{ + if (vma->vm_flags & VM_SHARED) + return shmem_zero_setup(vma); + if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) + return -EAGAIN; + return 0; +} + +static ssize_t write_full(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return -ENOSPC; +} + +/* + * Special lseek() function for /dev/null and /dev/zero. Most notably, you + * can fopen() both devices with "a" now. This was previously impossible. + * -- SRB. + */ + +static loff_t null_lseek(struct file * file, loff_t offset, int orig) +{ + return file->f_pos = 0; +} + +/* + * The memory devices use the full 32/64 bits of the offset, and so we cannot + * check against negative addresses: they are ok. The return value is weird, + * though, in that case (0). + * + * also note that seeking relative to the "end of file" isn't supported: + * it has no meaning, so it returns -EINVAL. + */ +static loff_t memory_lseek(struct file * file, loff_t offset, int orig) +{ + loff_t ret; + + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + force_successful_syscall_return(); + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + force_successful_syscall_return(); + break; + default: + ret = -EINVAL; + } + return ret; +} + +static int open_port(struct inode * inode, struct file * filp) +{ + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +struct page *kmem_vm_nopage(struct vm_area_struct *vma, unsigned long address, int write) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long kaddr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *ptep, pte; + struct page *page = NULL; + + /* address is user VA; convert to kernel VA of desired page */ + kaddr = (address - vma->vm_start) + offset; + kaddr = VMALLOC_VMADDR(kaddr); + + spin_lock(&init_mm.page_table_lock); + + /* Lookup page structure for kernel VA */ + pgd = pgd_offset(&init_mm, kaddr); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + goto out; + pmd = pmd_offset(pgd, kaddr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + goto out; + ptep = pte_offset(pmd, kaddr); + if (!ptep) + goto out; + pte = *ptep; + if (!pte_present(pte)) + goto out; + if (write && !pte_write(pte)) + goto out; + page = pte_page(pte); + if (!VALID_PAGE(page)) { + page = NULL; + goto out; + } + + /* Increment reference count on page */ + get_page(page); + +out: + spin_unlock(&init_mm.page_table_lock); + + return page; +} + +struct vm_operations_struct kmem_vm_ops = { + nopage: kmem_vm_nopage, +}; + +static int mmap_kmem(struct file * file, struct vm_area_struct * vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + +#if defined(CONFIG_XEN) + return -ENXIO; +#endif + + /* + * If the user is not attempting to mmap a high memory address then + * the standard mmap_mem mechanism will work. High memory addresses + * need special handling, as remap_page_range expects a physically- + * contiguous range of kernel addresses (such as obtained in kmalloc). + */ + if ((offset + size) < (unsigned long) high_memory) + return mmap_mem(file, vma); + + /* + * Accessing memory above the top the kernel knows about or + * through a file pointer that was marked O_SYNC will be + * done non-cached. + */ + if (noncached_address(offset) || (file->f_flags & O_SYNC)) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + /* Don't do anything here; "nopage" will fill the holes */ + vma->vm_ops = &kmem_vm_ops; + + /* Don't try to swap out physical pages.. */ + vma->vm_flags |= VM_RESERVED; + + /* + * Don't dump addresses that are not real memory to a core file. + */ + vma->vm_flags |= VM_IO; + + return 0; +} + +#define zero_lseek null_lseek +#define full_lseek null_lseek +#define write_zero write_null +#define read_full read_zero +#define open_mem open_port +#define open_kmem open_mem + +static struct file_operations mem_fops = { + llseek: memory_lseek, + read: read_mem, + write: write_mem, + mmap: mmap_mem, + open: open_mem, +}; + +static struct file_operations kmem_fops = { + llseek: memory_lseek, + read: read_kmem, + write: write_kmem, + mmap: mmap_kmem, + open: open_kmem, +}; + +static struct file_operations null_fops = { + llseek: null_lseek, + read: read_null, + write: write_null, +}; + +#if defined(CONFIG_ISA) || !defined(__mc68000__) +static struct file_operations port_fops = { + llseek: memory_lseek, + read: read_port, + write: write_port, + open: open_port, +}; +#endif + +static struct file_operations zero_fops = { + llseek: zero_lseek, + read: read_zero, + write: write_zero, + mmap: mmap_zero, +}; + +static struct file_operations full_fops = { + llseek: full_lseek, + read: read_full, + write: write_full, +}; + +static int memory_open(struct inode * inode, struct file * filp) +{ + switch (MINOR(inode->i_rdev)) { + case 1: + filp->f_op = &mem_fops; + break; + case 2: + filp->f_op = &kmem_fops; + break; + case 3: + filp->f_op = &null_fops; + break; +#if defined(CONFIG_ISA) || !defined(__mc68000__) + case 4: +#if defined(CONFIG_XEN) +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) + if (!(start_info.flags & SIF_PRIVILEGED)) +#endif + return -ENXIO; +#endif + filp->f_op = &port_fops; + break; +#endif + case 5: + filp->f_op = &zero_fops; + break; + case 7: + filp->f_op = &full_fops; + break; + case 8: + filp->f_op = &random_fops; + break; + case 9: + filp->f_op = &urandom_fops; + break; + default: + return -ENXIO; + } + if (filp->f_op && filp->f_op->open) + return filp->f_op->open(inode,filp); + return 0; +} + +void __init memory_devfs_register (void) +{ + /* These are never unregistered */ + static const struct { + unsigned short minor; + char *name; + umode_t mode; + struct file_operations *fops; + } list[] = { /* list of minor devices */ + {1, "mem", S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops}, + {2, "kmem", S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops}, + {3, "null", S_IRUGO | S_IWUGO, &null_fops}, +#if defined(CONFIG_ISA) || !defined(__mc68000__) + {4, "port", S_IRUSR | S_IWUSR | S_IRGRP, &port_fops}, +#endif + {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, + {7, "full", S_IRUGO | S_IWUGO, &full_fops}, + {8, "random", S_IRUGO | S_IWUSR, &random_fops}, + {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops} + }; + int i; + + for (i=0; i<(sizeof(list)/sizeof(*list)); i++) + devfs_register (NULL, list[i].name, DEVFS_FL_NONE, + MEM_MAJOR, list[i].minor, + list[i].mode | S_IFCHR, + list[i].fops, NULL); +} + +static struct file_operations memory_fops = { + open: memory_open, /* just a selector for the real open */ +}; + +int __init chr_dev_init(void) +{ + if (devfs_register_chrdev(MEM_MAJOR,"mem",&memory_fops)) + printk("unable to get major %d for memory devs\n", MEM_MAJOR); + memory_devfs_register(); + rand_initialize(); +#ifdef CONFIG_I2C + i2c_init_all(); +#endif +#if defined (CONFIG_FB) + fbmem_init(); +#endif +#if defined (CONFIG_PROM_CONSOLE) + prom_con_init(); +#endif +#if defined (CONFIG_MDA_CONSOLE) + mda_console_init(); +#endif + tty_init(); +#ifdef CONFIG_M68K_PRINTER + lp_m68k_init(); +#endif + misc_init(); +#if CONFIG_QIC02_TAPE + qic02_tape_init(); +#endif +#ifdef CONFIG_FTAPE + ftape_init(); +#endif +#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) + tapechar_init(); +#endif + return 0; +} + +__initcall(chr_dev_init); diff --git a/xenolinux-2.4.26-sparse/drivers/char/tty_io.c b/xenolinux-2.4.26-sparse/drivers/char/tty_io.c new file mode 100644 index 0000000000..53fb06b465 --- /dev/null +++ b/xenolinux-2.4.26-sparse/drivers/char/tty_io.c @@ -0,0 +1,2472 @@ +/* + * linux/drivers/char/tty_io.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles + * or rs-channels. It also implements echoing, cooked mode etc. + * + * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. + * + * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the + * tty_struct and tty_queue structures. Previously there was an array + * of 256 tty_struct's which was statically allocated, and the + * tty_queue structures were allocated at boot time. Both are now + * dynamically allocated only when the tty is open. + * + * Also restructured routines so that there is more of a separation + * between the high-level tty routines (tty_io.c and tty_ioctl.c) and + * the low-level tty routines (serial.c, pty.c, console.c). This + * makes for cleaner and more compact code. -TYT, 9/17/92 + * + * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines + * which can be dynamically activated and de-activated by the line + * discipline handling modules (like SLIP). + * + * NOTE: pay no attention to the line discipline code (yet); its + * interface is still subject to change in this version... + * -- TYT, 1/31/92 + * + * Added functionality to the OPOST tty handling. No delays, but all + * other bits should be there. + * -- Nick Holloway , 27th May 1993. + * + * Rewrote canonical mode and added more termios flags. + * -- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94 + * + * Reorganized FASYNC support so mouse code can share it. + * -- ctm@ardi.com, 9Sep95 + * + * New TIOCLINUX variants added. + * -- mj@k332.feld.cvut.cz, 19-Nov-95 + * + * Restrict vt switching via ioctl() + * -- grif@cs.ucr.edu, 5-Dec-95 + * + * Move console and virtual terminal code to more appropriate files, + * implement CONFIG_VT and generalize console device interface. + * -- Marko Kohtala , March 97 + * + * Rewrote init_dev and release_dev to eliminate races. + * -- Bill Hawes , June 97 + * + * Added devfs support. + * -- C. Scott Ananian , 13-Jan-1998 + * + * Added support for a Unix98-style ptmx device. + * -- C. Scott Ananian , 14-Jan-1998 + * + * Reduced memory usage for older ARM systems + * -- Russell King + * + * Move do_SAK() into process context. Less stack use in devfs functions. + * alloc_tty_struct() always uses kmalloc() -- Andrew Morton 17Mar01 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#ifdef CONFIG_XEN_CONSOLE +extern void xen_console_init(void); +#endif + +#ifdef CONFIG_VT +extern void con_init_devfs (void); +#endif + +extern void disable_early_printk(void); + +#define CONSOLE_DEV MKDEV(TTY_MAJOR,0) +#define TTY_DEV MKDEV(TTYAUX_MAJOR,0) +#define SYSCONS_DEV MKDEV(TTYAUX_MAJOR,1) +#define PTMX_DEV MKDEV(TTYAUX_MAJOR,2) + +#undef TTY_DEBUG_HANGUP + +#define TTY_PARANOIA_CHECK 1 +#define CHECK_TTY_COUNT 1 + +struct termios tty_std_termios; /* for the benefit of tty drivers */ +struct tty_driver *tty_drivers; /* linked list of tty drivers */ +struct tty_ldisc ldiscs[NR_LDISCS]; /* line disc dispatch table */ + +#ifdef CONFIG_UNIX98_PTYS +extern struct tty_driver ptm_driver[]; /* Unix98 pty masters; for /dev/ptmx */ +extern struct tty_driver pts_driver[]; /* Unix98 pty slaves; for /dev/ptmx */ +#endif + +static void initialize_tty_struct(struct tty_struct *tty); + +static ssize_t tty_read(struct file *, char *, size_t, loff_t *); +static ssize_t tty_write(struct file *, const char *, size_t, loff_t *); +static unsigned int tty_poll(struct file *, poll_table *); +static int tty_open(struct inode *, struct file *); +static int tty_release(struct inode *, struct file *); +int tty_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg); +static int tty_fasync(int fd, struct file * filp, int on); +extern int vme_scc_init (void); +extern long vme_scc_console_init(void); +extern int serial167_init(void); +extern long serial167_console_init(void); +extern void console_8xx_init(void); +extern void au1x00_serial_console_init(void); +extern int rs_8xx_init(void); +extern void mac_scc_console_init(void); +extern void hwc_console_init(void); +extern void hwc_tty_init(void); +extern void con3215_init(void); +extern void tty3215_init(void); +extern void tub3270_con_init(void); +extern void tub3270_init(void); +extern void rs285_console_init(void); +extern void sa1100_rs_console_init(void); +extern void sgi_serial_console_init(void); +extern void sn_sal_serial_console_init(void); +extern void sci_console_init(void); +extern void dec_serial_console_init(void); +extern void tx3912_console_init(void); +extern void tx3912_rs_init(void); +extern void txx927_console_init(void); +extern void txx9_rs_init(void); +extern void txx9_serial_console_init(void); +extern void sb1250_serial_console_init(void); +extern void arc_console_init(void); +extern int hvc_console_init(void); + +#ifndef MIN +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a,b) ((a) < (b) ? (b) : (a)) +#endif + +static struct tty_struct *alloc_tty_struct(void) +{ + struct tty_struct *tty; + + tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL); + if (tty) + memset(tty, 0, sizeof(struct tty_struct)); + return tty; +} + +static inline void free_tty_struct(struct tty_struct *tty) +{ + kfree(tty); +} + +/* + * This routine returns the name of tty. + */ +static char * +_tty_make_name(struct tty_struct *tty, const char *name, char *buf) +{ + int idx = (tty)?MINOR(tty->device) - tty->driver.minor_start:0; + + if (!tty) /* Hmm. NULL pointer. That's fun. */ + strcpy(buf, "NULL tty"); + else + sprintf(buf, name, + idx + tty->driver.name_base); + + return buf; +} + +#define TTY_NUMBER(tty) (MINOR((tty)->device) - (tty)->driver.minor_start + \ + (tty)->driver.name_base) + +char *tty_name(struct tty_struct *tty, char *buf) +{ + return _tty_make_name(tty, (tty)?tty->driver.name:NULL, buf); +} + +inline int tty_paranoia_check(struct tty_struct *tty, kdev_t device, + const char *routine) +{ +#ifdef TTY_PARANOIA_CHECK + static const char badmagic[] = KERN_WARNING + "Warning: bad magic number for tty struct (%s) in %s\n"; + static const char badtty[] = KERN_WARNING + "Warning: null TTY for (%s) in %s\n"; + + if (!tty) { + printk(badtty, kdevname(device), routine); + return 1; + } + if (tty->magic != TTY_MAGIC) { + printk(badmagic, kdevname(device), routine); + return 1; + } +#endif + return 0; +} + +static int check_tty_count(struct tty_struct *tty, const char *routine) +{ +#ifdef CHECK_TTY_COUNT + struct list_head *p; + int count = 0; + + file_list_lock(); + for(p = tty->tty_files.next; p != &tty->tty_files; p = p->next) { + if(list_entry(p, struct file, f_list)->private_data == tty) + count++; + } + file_list_unlock(); + if (tty->driver.type == TTY_DRIVER_TYPE_PTY && + tty->driver.subtype == PTY_TYPE_SLAVE && + tty->link && tty->link->count) + count++; + if (tty->count != count) { + printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " + "!= #fd's(%d) in %s\n", + kdevname(tty->device), tty->count, count, routine); + return count; + } +#endif + return 0; +} + +int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) +{ + if (disc < N_TTY || disc >= NR_LDISCS) + return -EINVAL; + + if (new_ldisc) { + ldiscs[disc] = *new_ldisc; + ldiscs[disc].flags |= LDISC_FLAG_DEFINED; + ldiscs[disc].num = disc; + } else + memset(&ldiscs[disc], 0, sizeof(struct tty_ldisc)); + + return 0; +} + +EXPORT_SYMBOL(tty_register_ldisc); + +/* Set the discipline of a tty line. */ +static int tty_set_ldisc(struct tty_struct *tty, int ldisc) +{ + int retval = 0; + struct tty_ldisc o_ldisc; + char buf[64]; + + if ((ldisc < N_TTY) || (ldisc >= NR_LDISCS)) + return -EINVAL; + /* Eduardo Blanco */ + /* Cyrus Durgin */ + if (!(ldiscs[ldisc].flags & LDISC_FLAG_DEFINED)) { + char modname [20]; + sprintf(modname, "tty-ldisc-%d", ldisc); + request_module (modname); + } + if (!(ldiscs[ldisc].flags & LDISC_FLAG_DEFINED)) + return -EINVAL; + + if (tty->ldisc.num == ldisc) + return 0; /* We are already in the desired discipline */ + o_ldisc = tty->ldisc; + + tty_wait_until_sent(tty, 0); + + /* Shutdown the current discipline. */ + if (tty->ldisc.close) + (tty->ldisc.close)(tty); + + /* Now set up the new line discipline. */ + tty->ldisc = ldiscs[ldisc]; + tty->termios->c_line = ldisc; + if (tty->ldisc.open) + retval = (tty->ldisc.open)(tty); + if (retval < 0) { + tty->ldisc = o_ldisc; + tty->termios->c_line = tty->ldisc.num; + if (tty->ldisc.open && (tty->ldisc.open(tty) < 0)) { + tty->ldisc = ldiscs[N_TTY]; + tty->termios->c_line = N_TTY; + if (tty->ldisc.open) { + int r = tty->ldisc.open(tty); + + if (r < 0) + panic("Couldn't open N_TTY ldisc for " + "%s --- error %d.", + tty_name(tty, buf), r); + } + } + } + if (tty->ldisc.num != o_ldisc.num && tty->driver.set_ldisc) + tty->driver.set_ldisc(tty); + return retval; +} + +/* + * This routine returns a tty driver structure, given a device number + */ +struct tty_driver *get_tty_driver(kdev_t device) +{ + int major, minor; + struct tty_driver *p; + + minor = MINOR(device); + major = MAJOR(device); + + for (p = tty_drivers; p; p = p->next) { + if (p->major != major) + continue; + if (minor < p->minor_start) + continue; + if (minor >= p->minor_start + p->num) + continue; + return p; + } + return NULL; +} + +/* + * If we try to write to, or set the state of, a terminal and we're + * not in the foreground, send a SIGTTOU. If the signal is blocked or + * ignored, go ahead and perform the operation. (POSIX 7.2) + */ +int tty_check_change(struct tty_struct * tty) +{ + if (current->tty != tty) + return 0; + if (tty->pgrp <= 0) { + printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n"); + return 0; + } + if (current->pgrp == tty->pgrp) + return 0; + if (is_ignored(SIGTTOU)) + return 0; + if (is_orphaned_pgrp(current->pgrp)) + return -EIO; + (void) kill_pg(current->pgrp,SIGTTOU,1); + return -ERESTARTSYS; +} + +static ssize_t hung_up_tty_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + /* Can't seek (pread) on ttys. */ + if (ppos != &file->f_pos) + return -ESPIPE; + return 0; +} + +static ssize_t hung_up_tty_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + /* Can't seek (pwrite) on ttys. */ + if (ppos != &file->f_pos) + return -ESPIPE; + return -EIO; +} + +/* No kernel lock held - none needed ;) */ +static unsigned int hung_up_tty_poll(struct file * filp, poll_table * wait) +{ + return POLLIN | POLLOUT | POLLERR | POLLHUP | POLLRDNORM | POLLWRNORM; +} + +static int hung_up_tty_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg) +{ + return cmd == TIOCSPGRP ? -ENOTTY : -EIO; +} + +static struct file_operations tty_fops = { + llseek: no_llseek, + read: tty_read, + write: tty_write, + poll: tty_poll, + ioctl: tty_ioctl, + open: tty_open, + release: tty_release, + fasync: tty_fasync, +}; + +static struct file_operations hung_up_tty_fops = { + llseek: no_llseek, + read: hung_up_tty_read, + write: hung_up_tty_write, + poll: hung_up_tty_poll, + ioctl: hung_up_tty_ioctl, + release: tty_release, +}; + +static spinlock_t redirect_lock = SPIN_LOCK_UNLOCKED; +static struct file *redirect; +/* + * This can be called by the "eventd" kernel thread. That is process synchronous, + * but doesn't hold any locks, so we need to make sure we have the appropriate + * locks for what we're doing.. + */ +void do_tty_hangup(void *data) +{ + struct tty_struct *tty = (struct tty_struct *) data; + struct file * cons_filp = NULL; + struct file *f = NULL; + struct task_struct *p; + struct list_head *l; + int closecount = 0, n; + + if (!tty) + return; + + /* inuse_filps is protected by the single kernel lock */ + lock_kernel(); + + spin_lock(&redirect_lock); + if (redirect && redirect->private_data == tty) { + f = redirect; + redirect = NULL; + } + spin_unlock(&redirect_lock); + + check_tty_count(tty, "do_tty_hangup"); + file_list_lock(); + for (l = tty->tty_files.next; l != &tty->tty_files; l = l->next) { + struct file * filp = list_entry(l, struct file, f_list); + if (filp->f_dentry->d_inode->i_rdev == CONSOLE_DEV || + filp->f_dentry->d_inode->i_rdev == SYSCONS_DEV) { + cons_filp = filp; + continue; + } + if (filp->f_op != &tty_fops) + continue; + closecount++; + tty_fasync(-1, filp, 0); /* can't block */ + filp->f_op = &hung_up_tty_fops; + } + file_list_unlock(); + + /* FIXME! What are the locking issues here? This may me overdoing things.. */ + { + unsigned long flags; + + save_flags(flags); cli(); + if (tty->ldisc.flush_buffer) + tty->ldisc.flush_buffer(tty); + if (tty->driver.flush_buffer) + tty->driver.flush_buffer(tty); + if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && + tty->ldisc.write_wakeup) + (tty->ldisc.write_wakeup)(tty); + restore_flags(flags); + } + + wake_up_interruptible(&tty->write_wait); + wake_up_interruptible(&tty->read_wait); + + /* + * Shutdown the current line discipline, and reset it to + * N_TTY. + */ + if (tty->driver.flags & TTY_DRIVER_RESET_TERMIOS) + *tty->termios = tty->driver.init_termios; + if (tty->ldisc.num != ldiscs[N_TTY].num) { + if (tty->ldisc.close) + (tty->ldisc.close)(tty); + tty->ldisc = ldiscs[N_TTY]; + tty->termios->c_line = N_TTY; + if (tty->ldisc.open) { + int i = (tty->ldisc.open)(tty); + if (i < 0) + printk(KERN_ERR "do_tty_hangup: N_TTY open: " + "error %d\n", -i); + } + } + + read_lock(&tasklist_lock); + for_each_task(p) { + if ((tty->session > 0) && (p->session == tty->session) && + p->leader) { + send_sig(SIGHUP,p,1); + send_sig(SIGCONT,p,1); + if (tty->pgrp > 0) + p->tty_old_pgrp = tty->pgrp; + } + if (p->tty == tty) + p->tty = NULL; + } + read_unlock(&tasklist_lock); + + tty->flags = 0; + tty->session = 0; + tty->pgrp = -1; + tty->ctrl_status = 0; + /* + * If one of the devices matches a console pointer, we + * cannot just call hangup() because that will cause + * tty->count and state->count to go out of sync. + * So we just call close() the right number of times. + */ + if (cons_filp) { + if (tty->driver.close) + for (n = 0; n < closecount; n++) + tty->driver.close(tty, cons_filp); + } else if (tty->driver.hangup) + (tty->driver.hangup)(tty); + unlock_kernel(); + if (f) + fput(f); +} + +void tty_hangup(struct tty_struct * tty) +{ +#ifdef TTY_DEBUG_HANGUP + char buf[64]; + + printk(KERN_DEBUG "%s hangup...\n", tty_name(tty, buf)); +#endif + schedule_task(&tty->tq_hangup); +} + +void tty_vhangup(struct tty_struct * tty) +{ +#ifdef TTY_DEBUG_HANGUP + char buf[64]; + + printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty, buf)); +#endif + do_tty_hangup((void *) tty); +} + +int tty_hung_up_p(struct file * filp) +{ + return (filp->f_op == &hung_up_tty_fops); +} + +/* + * This function is typically called only by the session leader, when + * it wants to disassociate itself from its controlling tty. + * + * It performs the following functions: + * (1) Sends a SIGHUP and SIGCONT to the foreground process group + * (2) Clears the tty from being controlling the session + * (3) Clears the controlling tty for all processes in the + * session group. + * + * The argument on_exit is set to 1 if called when a process is + * exiting; it is 0 if called by the ioctl TIOCNOTTY. + */ +void disassociate_ctty(int on_exit) +{ + struct tty_struct *tty = current->tty; + struct task_struct *p; + int tty_pgrp = -1; + + if (tty) { + tty_pgrp = tty->pgrp; + if (on_exit && tty->driver.type != TTY_DRIVER_TYPE_PTY) + tty_vhangup(tty); + } else { + if (current->tty_old_pgrp) { + kill_pg(current->tty_old_pgrp, SIGHUP, on_exit); + kill_pg(current->tty_old_pgrp, SIGCONT, on_exit); + } + return; + } + if (tty_pgrp > 0) { + kill_pg(tty_pgrp, SIGHUP, on_exit); + if (!on_exit) + kill_pg(tty_pgrp, SIGCONT, on_exit); + } + + current->tty_old_pgrp = 0; + tty->session = 0; + tty->pgrp = -1; + + read_lock(&tasklist_lock); + for_each_task(p) + if (p->session == current->session) + p->tty = NULL; + read_unlock(&tasklist_lock); +} + +void stop_tty(struct tty_struct *tty) +{ + if (tty->stopped) + return; + tty->stopped = 1; + if (tty->link && tty->link->packet) { + tty->ctrl_status &= ~TIOCPKT_START; + tty->ctrl_status |= TIOCPKT_STOP; + wake_up_interruptible(&tty->link->read_wait); + } + if (tty->driver.stop) + (tty->driver.stop)(tty); +} + +void start_tty(struct tty_struct *tty) +{ + if (!tty->stopped || tty->flow_stopped) + return; + tty->stopped = 0; + if (tty->link && tty->link->packet) { + tty->ctrl_status &= ~TIOCPKT_STOP; + tty->ctrl_status |= TIOCPKT_START; + wake_up_interruptible(&tty->link->read_wait); + } + if (tty->driver.start) + (tty->driver.start)(tty); + if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && + tty->ldisc.write_wakeup) + (tty->ldisc.write_wakeup)(tty); + wake_up_interruptible(&tty->write_wait); +} + +static ssize_t tty_read(struct file * file, char * buf, size_t count, + loff_t *ppos) +{ + int i; + struct tty_struct * tty; + struct inode *inode; + + /* Can't seek (pread) on ttys. */ + if (ppos != &file->f_pos) + return -ESPIPE; + + tty = (struct tty_struct *)file->private_data; + inode = file->f_dentry->d_inode; + if (tty_paranoia_check(tty, inode->i_rdev, "tty_read")) + return -EIO; + if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags))) + return -EIO; + + /* This check not only needs to be done before reading, but also + whenever read_chan() gets woken up after sleeping, so I've + moved it to there. This should only be done for the N_TTY + line discipline, anyway. Same goes for write_chan(). -- jlc. */ +#if 0 + if ((inode->i_rdev != CONSOLE_DEV) && /* don't stop on /dev/console */ + (tty->pgrp > 0) && + (current->tty == tty) && + (tty->pgrp != current->pgrp)) + if (is_ignored(SIGTTIN) || is_orphaned_pgrp(current->pgrp)) + return -EIO; + else { + (void) kill_pg(current->pgrp, SIGTTIN, 1); + return -ERESTARTSYS; + } +#endif + lock_kernel(); + if (tty->ldisc.read) + i = (tty->ldisc.read)(tty,file,buf,count); + else + i = -EIO; + unlock_kernel(); + if (i > 0) + inode->i_atime = CURRENT_TIME; + return i; +} + +/* + * Split writes up in sane blocksizes to avoid + * denial-of-service type attacks + */ +static inline ssize_t do_tty_write( + ssize_t (*write)(struct tty_struct *, struct file *, const unsigned char *, size_t), + struct tty_struct *tty, + struct file *file, + const unsigned char *buf, + size_t count) +{ + ssize_t ret = 0, written = 0; + + if (file->f_flags & O_NONBLOCK) { + if (down_trylock(&tty->atomic_write)) + return -EAGAIN; + } + else { + if (down_interruptible(&tty->atomic_write)) + return -ERESTARTSYS; + } + if ( test_bit(TTY_NO_WRITE_SPLIT, &tty->flags) ) { + lock_kernel(); + written = write(tty, file, buf, count); + unlock_kernel(); + } else { + for (;;) { + unsigned long size = MAX(PAGE_SIZE*2,16384); + if (size > count) + size = count; + lock_kernel(); + ret = write(tty, file, buf, size); + unlock_kernel(); + if (ret <= 0) + break; + written += ret; + buf += ret; + count -= ret; + if (!count) + break; + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + if (current->need_resched) + schedule(); + } + } + if (written) { + file->f_dentry->d_inode->i_mtime = CURRENT_TIME; + ret = written; + } + up(&tty->atomic_write); + return ret; +} + + +static ssize_t tty_write(struct file * file, const char * buf, size_t count, + loff_t *ppos) +{ + int is_console; + struct tty_struct * tty; + struct inode *inode = file->f_dentry->d_inode; + + /* Can't seek (pwrite) on ttys. */ + if (ppos != &file->f_pos) + return -ESPIPE; + + /* + * For now, we redirect writes from /dev/console as + * well as /dev/tty0. + */ + inode = file->f_dentry->d_inode; + is_console = (inode->i_rdev == SYSCONS_DEV || + inode->i_rdev == CONSOLE_DEV); + + if (is_console) { + struct file *p = NULL; + + spin_lock(&redirect_lock); + if (redirect) { + get_file(redirect); + p = redirect; + } + spin_unlock(&redirect_lock); + + if (p) { + ssize_t res = p->f_op->write(p, buf, count, &p->f_pos); + fput(p); + return res; + } + } + + tty = (struct tty_struct *)file->private_data; + if (tty_paranoia_check(tty, inode->i_rdev, "tty_write")) + return -EIO; + if (!tty || !tty->driver.write || (test_bit(TTY_IO_ERROR, &tty->flags))) + return -EIO; +#if 0 + if (!is_console && L_TOSTOP(tty) && (tty->pgrp > 0) && + (current->tty == tty) && (tty->pgrp != current->pgrp)) { + if (is_orphaned_pgrp(current->pgrp)) + return -EIO; + if (!is_ignored(SIGTTOU)) { + (void) kill_pg(current->pgrp, SIGTTOU, 1); + return -ERESTARTSYS; + } + } +#endif + if (!tty->ldisc.write) + return -EIO; + return do_tty_write(tty->ldisc.write, tty, file, + (const unsigned char *)buf, count); +} + +/* Semaphore to protect creating and releasing a tty */ +static DECLARE_MUTEX(tty_sem); + +static void down_tty_sem(int index) +{ + down(&tty_sem); +} + +static void up_tty_sem(int index) +{ + up(&tty_sem); +} + +static void release_mem(struct tty_struct *tty, int idx); + +/* + * WSH 06/09/97: Rewritten to remove races and properly clean up after a + * failed open. The new code protects the open with a semaphore, so it's + * really quite straightforward. The semaphore locking can probably be + * relaxed for the (most common) case of reopening a tty. + */ +static int init_dev(kdev_t device, struct tty_struct **ret_tty) +{ + struct tty_struct *tty, *o_tty; + struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; + struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; + struct tty_driver *driver; + int retval=0; + int idx; + + driver = get_tty_driver(device); + if (!driver) + return -ENODEV; + + idx = MINOR(device) - driver->minor_start; + + /* + * Check whether we need to acquire the tty semaphore to avoid + * race conditions. For now, play it safe. + */ + down_tty_sem(idx); + + /* check whether we're reopening an existing tty */ + tty = driver->table[idx]; + if (tty) goto fast_track; + + /* + * First time open is complex, especially for PTY devices. + * This code guarantees that either everything succeeds and the + * TTY is ready for operation, or else the table slots are vacated + * and the allocated memory released. (Except that the termios + * and locked termios may be retained.) + */ + + o_tty = NULL; + tp = o_tp = NULL; + ltp = o_ltp = NULL; + + tty = alloc_tty_struct(); + if(!tty) + goto fail_no_mem; + initialize_tty_struct(tty); + tty->device = device; + tty->driver = *driver; + + tp_loc = &driver->termios[idx]; + if (!*tp_loc) { + tp = (struct termios *) kmalloc(sizeof(struct termios), + GFP_KERNEL); + if (!tp) + goto free_mem_out; + *tp = driver->init_termios; + } + + ltp_loc = &driver->termios_locked[idx]; + if (!*ltp_loc) { + ltp = (struct termios *) kmalloc(sizeof(struct termios), + GFP_KERNEL); + if (!ltp) + goto free_mem_out; + memset(ltp, 0, sizeof(struct termios)); + } + + if (driver->type == TTY_DRIVER_TYPE_PTY) { + o_tty = alloc_tty_struct(); + if (!o_tty) + goto free_mem_out; + initialize_tty_struct(o_tty); + o_tty->device = (kdev_t) MKDEV(driver->other->major, + driver->other->minor_start + idx); + o_tty->driver = *driver->other; + + o_tp_loc = &driver->other->termios[idx]; + if (!*o_tp_loc) { + o_tp = (struct termios *) + kmalloc(sizeof(struct termios), GFP_KERNEL); + if (!o_tp) + goto free_mem_out; + *o_tp = driver->other->init_termios; + } + + o_ltp_loc = &driver->other->termios_locked[idx]; + if (!*o_ltp_loc) { + o_ltp = (struct termios *) + kmalloc(sizeof(struct termios), GFP_KERNEL); + if (!o_ltp) + goto free_mem_out; + memset(o_ltp, 0, sizeof(struct termios)); + } + + /* + * Everything allocated ... set up the o_tty structure. + */ + driver->other->table[idx] = o_tty; + if (!*o_tp_loc) + *o_tp_loc = o_tp; + if (!*o_ltp_loc) + *o_ltp_loc = o_ltp; + o_tty->termios = *o_tp_loc; + o_tty->termios_locked = *o_ltp_loc; + (*driver->other->refcount)++; + if (driver->subtype == PTY_TYPE_MASTER) + o_tty->count++; + + /* Establish the links in both directions */ + tty->link = o_tty; + o_tty->link = tty; + } + + /* + * All structures have been allocated, so now we install them. + * Failures after this point use release_mem to clean up, so + * there's no need to null out the local pointers. + */ + driver->table[idx] = tty; + + if (!*tp_loc) + *tp_loc = tp; + if (!*ltp_loc) + *ltp_loc = ltp; + tty->termios = *tp_loc; + tty->termios_locked = *ltp_loc; + (*driver->refcount)++; + tty->count++; + + /* + * Structures all installed ... call the ldisc open routines. + * If we fail here just call release_mem to clean up. No need + * to decrement the use counts, as release_mem doesn't care. + */ + if (tty->ldisc.open) { + retval = (tty->ldisc.open)(tty); + if (retval) + goto release_mem_out; + } + if (o_tty && o_tty->ldisc.open) { + retval = (o_tty->ldisc.open)(o_tty); + if (retval) { + if (tty->ldisc.close) + (tty->ldisc.close)(tty); + goto release_mem_out; + } + } + goto success; + + /* + * This fast open can be used if the tty is already open. + * No memory is allocated, and the only failures are from + * attempting to open a closing tty or attempting multiple + * opens on a pty master. + */ +fast_track: + if (test_bit(TTY_CLOSING, &tty->flags)) { + retval = -EIO; + goto end_init; + } + if (driver->type == TTY_DRIVER_TYPE_PTY && + driver->subtype == PTY_TYPE_MASTER) { + /* + * special case for PTY masters: only one open permitted, + * and the slave side open count is incremented as well. + */ + if (tty->count) { + retval = -EIO; + goto end_init; + } + tty->link->count++; + } + tty->count++; + tty->driver = *driver; /* N.B. why do this every time?? */ + +success: + *ret_tty = tty; + + /* All paths come through here to release the semaphore */ +end_init: + up_tty_sem(idx); + return retval; + + /* Release locally allocated memory ... nothing placed in slots */ +free_mem_out: + if (o_tp) + kfree(o_tp); + if (o_tty) + free_tty_struct(o_tty); + if (ltp) + kfree(ltp); + if (tp) + kfree(tp); + free_tty_struct(tty); + +fail_no_mem: + retval = -ENOMEM; + goto end_init; + + /* call the tty release_mem routine to clean out this slot */ +release_mem_out: + printk(KERN_INFO "init_dev: ldisc open failed, " + "clearing slot %d\n", idx); + release_mem(tty, idx); + goto end_init; +} + +/* + * Releases memory associated with a tty structure, and clears out the + * driver table slots. + */ +static void release_mem(struct tty_struct *tty, int idx) +{ + struct tty_struct *o_tty; + struct termios *tp; + + if ((o_tty = tty->link) != NULL) { + o_tty->driver.table[idx] = NULL; + if (o_tty->driver.flags & TTY_DRIVER_RESET_TERMIOS) { + tp = o_tty->driver.termios[idx]; + o_tty->driver.termios[idx] = NULL; + kfree(tp); + } + o_tty->magic = 0; + (*o_tty->driver.refcount)--; + list_del_init(&o_tty->tty_files); + free_tty_struct(o_tty); + } + + tty->driver.table[idx] = NULL; + if (tty->driver.flags & TTY_DRIVER_RESET_TERMIOS) { + tp = tty->driver.termios[idx]; + tty->driver.termios[idx] = NULL; + kfree(tp); + } + tty->magic = 0; + (*tty->driver.refcount)--; + list_del_init(&tty->tty_files); + free_tty_struct(tty); +} + +/* + * Even releasing the tty structures is a tricky business.. We have + * to be very careful that the structures are all released at the + * same time, as interrupts might otherwise get the wrong pointers. + * + * WSH 09/09/97: rewritten to avoid some nasty race conditions that could + * lead to double frees or releasing memory still in use. + */ +static void release_dev(struct file * filp) +{ + struct tty_struct *tty, *o_tty; + int pty_master, tty_closing, o_tty_closing, do_sleep; + int idx; + char buf[64]; + + tty = (struct tty_struct *)filp->private_data; + if (tty_paranoia_check(tty, filp->f_dentry->d_inode->i_rdev, "release_dev")) + return; + + check_tty_count(tty, "release_dev"); + + tty_fasync(-1, filp, 0); + + idx = MINOR(tty->device) - tty->driver.minor_start; + pty_master = (tty->driver.type == TTY_DRIVER_TYPE_PTY && + tty->driver.subtype == PTY_TYPE_MASTER); + o_tty = tty->link; + +#ifdef TTY_PARANOIA_CHECK + if (idx < 0 || idx >= tty->driver.num) { + printk(KERN_DEBUG "release_dev: bad idx when trying to " + "free (%s)\n", kdevname(tty->device)); + return; + } + if (tty != tty->driver.table[idx]) { + printk(KERN_DEBUG "release_dev: driver.table[%d] not tty " + "for (%s)\n", idx, kdevname(tty->device)); + return; + } + if (tty->termios != tty->driver.termios[idx]) { + printk(KERN_DEBUG "release_dev: driver.termios[%d] not termios " + "for (%s)\n", + idx, kdevname(tty->device)); + return; + } + if (tty->termios_locked != tty->driver.termios_locked[idx]) { + printk(KERN_DEBUG "release_dev: driver.termios_locked[%d] not " + "termios_locked for (%s)\n", + idx, kdevname(tty->device)); + return; + } +#endif + +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "release_dev of %s (tty count=%d)...", + tty_name(tty, buf), tty->count); +#endif + +#ifdef TTY_PARANOIA_CHECK + if (tty->driver.other) { + if (o_tty != tty->driver.other->table[idx]) { + printk(KERN_DEBUG "release_dev: other->table[%d] " + "not o_tty for (%s)\n", + idx, kdevname(tty->device)); + return; + } + if (o_tty->termios != tty->driver.other->termios[idx]) { + printk(KERN_DEBUG "release_dev: other->termios[%d] " + "not o_termios for (%s)\n", + idx, kdevname(tty->device)); + return; + } + if (o_tty->termios_locked != + tty->driver.other->termios_locked[idx]) { + printk(KERN_DEBUG "release_dev: other->termios_locked[" + "%d] not o_termios_locked for (%s)\n", + idx, kdevname(tty->device)); + return; + } + if (o_tty->link != tty) { + printk(KERN_DEBUG "release_dev: bad pty pointers\n"); + return; + } + } +#endif + + if (tty->driver.close) + tty->driver.close(tty, filp); + + /* + * Sanity check: if tty->count is going to zero, there shouldn't be + * any waiters on tty->read_wait or tty->write_wait. We test the + * wait queues and kick everyone out _before_ actually starting to + * close. This ensures that we won't block while releasing the tty + * structure. + * + * The test for the o_tty closing is necessary, since the master and + * slave sides may close in any order. If the slave side closes out + * first, its count will be one, since the master side holds an open. + * Thus this test wouldn't be triggered at the time the slave closes, + * so we do it now. + * + * Note that it's possible for the tty to be opened again while we're + * flushing out waiters. By recalculating the closing flags before + * each iteration we avoid any problems. + */ + while (1) { + tty_closing = tty->count <= 1; + o_tty_closing = o_tty && + (o_tty->count <= (pty_master ? 1 : 0)); + do_sleep = 0; + + if (tty_closing) { + if (waitqueue_active(&tty->read_wait)) { + wake_up(&tty->read_wait); + do_sleep++; + } + if (waitqueue_active(&tty->write_wait)) { + wake_up(&tty->write_wait); + do_sleep++; + } + } + if (o_tty_closing) { + if (waitqueue_active(&o_tty->read_wait)) { + wake_up(&o_tty->read_wait); + do_sleep++; + } + if (waitqueue_active(&o_tty->write_wait)) { + wake_up(&o_tty->write_wait); + do_sleep++; + } + } + if (!do_sleep) + break; + + printk(KERN_WARNING "release_dev: %s: read/write wait queue " + "active!\n", tty_name(tty, buf)); + schedule(); + } + + /* + * The closing flags are now consistent with the open counts on + * both sides, and we've completed the last operation that could + * block, so it's safe to proceed with closing. + */ + if (pty_master) { + if (--o_tty->count < 0) { + printk(KERN_WARNING "release_dev: bad pty slave count " + "(%d) for %s\n", + o_tty->count, tty_name(o_tty, buf)); + o_tty->count = 0; + } + } + if (--tty->count < 0) { + printk(KERN_WARNING "release_dev: bad tty->count (%d) for %s\n", + tty->count, tty_name(tty, buf)); + tty->count = 0; + } + + /* + * We've decremented tty->count, so we should zero out + * filp->private_data, to break the link between the tty and + * the file descriptor. Otherwise if filp_close() blocks before + * the file descriptor is removed from the inuse_filp + * list, check_tty_count() could observe a discrepancy and + * printk a warning message to the user. + */ + filp->private_data = 0; + + /* + * Perform some housekeeping before deciding whether to return. + * + * Set the TTY_CLOSING flag if this was the last open. In the + * case of a pty we may have to wait around for the other side + * to close, and TTY_CLOSING makes sure we can't be reopened. + */ + if(tty_closing) + set_bit(TTY_CLOSING, &tty->flags); + if(o_tty_closing) + set_bit(TTY_CLOSING, &o_tty->flags); + + /* + * If _either_ side is closing, make sure there aren't any + * processes that still think tty or o_tty is their controlling + * tty. + */ + if (tty_closing || o_tty_closing) { + struct task_struct *p; + + read_lock(&tasklist_lock); + for_each_task(p) { + if (p->tty == tty || (o_tty && p->tty == o_tty)) + p->tty = NULL; + } + read_unlock(&tasklist_lock); + } + + /* check whether both sides are closing ... */ + if (!tty_closing || (o_tty && !o_tty_closing)) + return; + +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "freeing tty structure..."); +#endif + + /* + * Shutdown the current line discipline, and reset it to N_TTY. + * N.B. why reset ldisc when we're releasing the memory?? + */ + if (tty->ldisc.close) + (tty->ldisc.close)(tty); + tty->ldisc = ldiscs[N_TTY]; + tty->termios->c_line = N_TTY; + if (o_tty) { + if (o_tty->ldisc.close) + (o_tty->ldisc.close)(o_tty); + o_tty->ldisc = ldiscs[N_TTY]; + } + + /* + * Make sure that the tty's task queue isn't activated. + */ + run_task_queue(&tq_timer); + flush_scheduled_tasks(); + + /* + * The release_mem function takes care of the details of clearing + * the slots and preserving the termios structure. + */ + release_mem(tty, idx); +} + +/* + * tty_open and tty_release keep up the tty count that contains the + * number of opens done on a tty. We cannot use the inode-count, as + * different inodes might point to the same tty. + * + * Open-counting is needed for pty masters, as well as for keeping + * track of serial lines: DTR is dropped when the last close happens. + * (This is not done solely through tty->count, now. - Ted 1/27/92) + * + * The termios state of a pty is reset on first open so that + * settings don't persist across reuse. + */ +static int tty_open(struct inode * inode, struct file * filp) +{ + struct tty_struct *tty; + int noctty, retval; + kdev_t device; + unsigned short saved_flags; + char buf[64]; + + saved_flags = filp->f_flags; +retry_open: + noctty = filp->f_flags & O_NOCTTY; + device = inode->i_rdev; + if (device == TTY_DEV) { + if (!current->tty) + return -ENXIO; + device = current->tty->device; + filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ + /* noctty = 1; */ + } +#ifdef CONFIG_VT + if (device == CONSOLE_DEV) { + extern int fg_console; + device = MKDEV(TTY_MAJOR, fg_console + 1); + noctty = 1; + } +#endif + if (device == SYSCONS_DEV) { + struct console *c = console_drivers; + while(c && !c->device) + c = c->next; + if (!c) + return -ENODEV; + device = c->device(c); + filp->f_flags |= O_NONBLOCK; /* Don't let /dev/console block */ + noctty = 1; + } + + if (device == PTMX_DEV) { +#ifdef CONFIG_UNIX98_PTYS + + /* find a free pty. */ + int major, minor; + struct tty_driver *driver; + + /* find a device that is not in use. */ + retval = -1; + for ( major = 0 ; major < UNIX98_NR_MAJORS ; major++ ) { + driver = &ptm_driver[major]; + for (minor = driver->minor_start ; + minor < driver->minor_start + driver->num ; + minor++) { + device = MKDEV(driver->major, minor); + if (!init_dev(device, &tty)) goto ptmx_found; /* ok! */ + } + } + return -EIO; /* no free ptys */ + ptmx_found: + set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ + minor -= driver->minor_start; + devpts_pty_new(driver->other->name_base + minor, MKDEV(driver->other->major, minor + driver->other->minor_start)); + tty_register_devfs(&pts_driver[major], DEVFS_FL_DEFAULT, + pts_driver[major].minor_start + minor); + noctty = 1; + goto init_dev_done; + +#else /* CONFIG_UNIX_98_PTYS */ + + return -ENODEV; + +#endif /* CONFIG_UNIX_98_PTYS */ + } + + retval = init_dev(device, &tty); + if (retval) + return retval; + +#ifdef CONFIG_UNIX98_PTYS +init_dev_done: +#endif + filp->private_data = tty; + file_move(filp, &tty->tty_files); + check_tty_count(tty, "tty_open"); + if (tty->driver.type == TTY_DRIVER_TYPE_PTY && + tty->driver.subtype == PTY_TYPE_MASTER) + noctty = 1; +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "opening %s...", tty_name(tty, buf)); +#endif + if (tty->driver.open) + retval = tty->driver.open(tty, filp); + else + retval = -ENODEV; + filp->f_flags = saved_flags; + + if (!retval && test_bit(TTY_EXCLUSIVE, &tty->flags) && !suser()) + retval = -EBUSY; + + if (retval) { +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "error %d in opening %s...", retval, + tty_name(tty, buf)); +#endif + + release_dev(filp); + if (retval != -ERESTARTSYS) + return retval; + if (signal_pending(current)) + return retval; + schedule(); + /* + * Need to reset f_op in case a hangup happened. + */ + filp->f_op = &tty_fops; + goto retry_open; + } + if (!noctty && + current->leader && + !current->tty && + tty->session == 0) { + task_lock(current); + current->tty = tty; + task_unlock(current); + current->tty_old_pgrp = 0; + tty->session = current->session; + tty->pgrp = current->pgrp; + } + if ((tty->driver.type == TTY_DRIVER_TYPE_SERIAL) && + (tty->driver.subtype == SERIAL_TYPE_CALLOUT) && + (tty->count == 1)) { + static int nr_warns; + if (nr_warns < 5) { + printk(KERN_WARNING "tty_io.c: " + "process %d (%s) used obsolete /dev/%s - " + "update software to use /dev/ttyS%d\n", + current->pid, current->comm, + tty_name(tty, buf), TTY_NUMBER(tty)); + nr_warns++; + } + } + return 0; +} + +static int tty_release(struct inode * inode, struct file * filp) +{ + lock_kernel(); + release_dev(filp); + unlock_kernel(); + return 0; +} + +/* No kernel lock held - fine */ +static unsigned int tty_poll(struct file * filp, poll_table * wait) +{ + struct tty_struct * tty; + + tty = (struct tty_struct *)filp->private_data; + if (tty_paranoia_check(tty, filp->f_dentry->d_inode->i_rdev, "tty_poll")) + return 0; + + if (tty->ldisc.poll) + return (tty->ldisc.poll)(tty, filp, wait); + return 0; +} + +static int tty_fasync(int fd, struct file * filp, int on) +{ + struct tty_struct * tty; + int retval; + + tty = (struct tty_struct *)filp->private_data; + if (tty_paranoia_check(tty, filp->f_dentry->d_inode->i_rdev, "tty_fasync")) + return 0; + + retval = fasync_helper(fd, filp, on, &tty->fasync); + if (retval <= 0) + return retval; + + if (on) { + if (!waitqueue_active(&tty->read_wait)) + tty->minimum_to_wake = 1; + if (filp->f_owner.pid == 0) { + filp->f_owner.pid = (-tty->pgrp) ? : current->pid; + filp->f_owner.uid = current->uid; + filp->f_owner.euid = current->euid; + } + } else { + if (!tty->fasync && !waitqueue_active(&tty->read_wait)) + tty->minimum_to_wake = N_TTY_BUF_SIZE; + } + return 0; +} + +static int tiocsti(struct tty_struct *tty, char * arg) +{ + char ch, mbz = 0; + + if ((current->tty != tty) && !suser()) + return -EPERM; + if (get_user(ch, arg)) + return -EFAULT; + tty->ldisc.receive_buf(tty, &ch, &mbz, 1); + return 0; +} + +static int tiocgwinsz(struct tty_struct *tty, struct winsize * arg) +{ + if (copy_to_user(arg, &tty->winsize, sizeof(*arg))) + return -EFAULT; + return 0; +} + +static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty, + struct winsize * arg) +{ + struct winsize tmp_ws; + + if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) + return -EFAULT; + if (!memcmp(&tmp_ws, &tty->winsize, sizeof(*arg))) + return 0; + if (tty->pgrp > 0) + kill_pg(tty->pgrp, SIGWINCH, 1); + if ((real_tty->pgrp != tty->pgrp) && (real_tty->pgrp > 0)) + kill_pg(real_tty->pgrp, SIGWINCH, 1); + tty->winsize = tmp_ws; + real_tty->winsize = tmp_ws; + return 0; +} + +static int tioccons(struct inode *inode, struct file *file) +{ + if (inode->i_rdev == SYSCONS_DEV || + inode->i_rdev == CONSOLE_DEV) { + struct file *f; + if (!suser()) + return -EPERM; + spin_lock(&redirect_lock); + f = redirect; + redirect = NULL; + spin_unlock(&redirect_lock); + if (f) + fput(f); + return 0; + } + spin_lock(&redirect_lock); + if (redirect) { + spin_unlock(&redirect_lock); + return -EBUSY; + } + get_file(file); + redirect = file; + spin_unlock(&redirect_lock); + return 0; +} + + +static int fionbio(struct file *file, int *arg) +{ + int nonblock; + + if (get_user(nonblock, arg)) + return -EFAULT; + + if (nonblock) + file->f_flags |= O_NONBLOCK; + else + file->f_flags &= ~O_NONBLOCK; + return 0; +} + +static int tiocsctty(struct tty_struct *tty, int arg) +{ + if (current->leader && + (current->session == tty->session)) + return 0; + /* + * The process must be a session leader and + * not have a controlling tty already. + */ + if (!current->leader || current->tty) + return -EPERM; + if (tty->session > 0) { + /* + * This tty is already the controlling + * tty for another session group! + */ + if ((arg == 1) && suser()) { + /* + * Steal it away + */ + struct task_struct *p; + + read_lock(&tasklist_lock); + for_each_task(p) + if (p->tty == tty) + p->tty = NULL; + read_unlock(&tasklist_lock); + } else + return -EPERM; + } + task_lock(current); + current->tty = tty; + task_unlock(current); + current->tty_old_pgrp = 0; + tty->session = current->session; + tty->pgrp = current->pgrp; + return 0; +} + +static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) +{ + /* + * (tty == real_tty) is a cheap way of + * testing if the tty is NOT a master pty. + */ + if (tty == real_tty && current->tty != real_tty) + return -ENOTTY; + return put_user(real_tty->pgrp, arg); +} + +static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) +{ + pid_t pgrp; + int retval = tty_check_change(real_tty); + + if (retval == -EIO) + return -ENOTTY; + if (retval) + return retval; + if (!current->tty || + (current->tty != real_tty) || + (real_tty->session != current->session)) + return -ENOTTY; + if (get_user(pgrp, (pid_t *) arg)) + return -EFAULT; + if (pgrp < 0) + return -EINVAL; + if (session_of_pgrp(pgrp) != current->session) + return -EPERM; + real_tty->pgrp = pgrp; + return 0; +} + +static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) +{ + /* + * (tty == real_tty) is a cheap way of + * testing if the tty is NOT a master pty. + */ + if (tty == real_tty && current->tty != real_tty) + return -ENOTTY; + if (real_tty->session <= 0) + return -ENOTTY; + return put_user(real_tty->session, arg); +} + +static int tiocttygstruct(struct tty_struct *tty, struct tty_struct *arg) +{ + if (copy_to_user(arg, tty, sizeof(*arg))) + return -EFAULT; + return 0; +} + +static int tiocsetd(struct tty_struct *tty, int *arg) +{ + int ldisc; + + if (get_user(ldisc, arg)) + return -EFAULT; + return tty_set_ldisc(tty, ldisc); +} + +static int send_break(struct tty_struct *tty, int duration) +{ + set_current_state(TASK_INTERRUPTIBLE); + + tty->driver.break_ctl(tty, -1); + if (!signal_pending(current)) + schedule_timeout(duration); + tty->driver.break_ctl(tty, 0); + if (signal_pending(current)) + return -EINTR; + return 0; +} + +static int tty_generic_brk(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) +{ + if (cmd == TCSBRK && arg) + { + /* tcdrain case */ + int retval = tty_check_change(tty); + if (retval) + return retval; + tty_wait_until_sent(tty, 0); + if (signal_pending(current)) + return -EINTR; + } + return 0; +} + +/* + * Split this up, as gcc can choke on it otherwise.. + */ +int tty_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg) +{ + struct tty_struct *tty, *real_tty; + int retval; + + tty = (struct tty_struct *)file->private_data; + if (tty_paranoia_check(tty, inode->i_rdev, "tty_ioctl")) + return -EINVAL; + + real_tty = tty; + if (tty->driver.type == TTY_DRIVER_TYPE_PTY && + tty->driver.subtype == PTY_TYPE_MASTER) + real_tty = tty->link; + + /* + * Break handling by driver + */ + if (!tty->driver.break_ctl) { + switch(cmd) { + case TIOCSBRK: + case TIOCCBRK: + if (tty->driver.ioctl) + return tty->driver.ioctl(tty, file, cmd, arg); + return -EINVAL; + + /* These two ioctl's always return success; even if */ + /* the driver doesn't support them. */ + case TCSBRK: + case TCSBRKP: + retval = -ENOIOCTLCMD; + if (tty->driver.ioctl) + retval = tty->driver.ioctl(tty, file, cmd, arg); + /* Not driver handled */ + if (retval == -ENOIOCTLCMD) + retval = tty_generic_brk(tty, file, cmd, arg); + return retval; + } + } + + /* + * Factor out some common prep work + */ + switch (cmd) { + case TIOCSETD: + case TIOCSBRK: + case TIOCCBRK: + case TCSBRK: + case TCSBRKP: + retval = tty_check_change(tty); + if (retval) + return retval; + if (cmd != TIOCCBRK) { + tty_wait_until_sent(tty, 0); + if (signal_pending(current)) + return -EINTR; + } + break; + } + + switch (cmd) { + case TIOCSTI: + return tiocsti(tty, (char *)arg); + case TIOCGWINSZ: + return tiocgwinsz(tty, (struct winsize *) arg); + case TIOCSWINSZ: + return tiocswinsz(tty, real_tty, (struct winsize *) arg); + case TIOCCONS: + return real_tty!=tty ? -EINVAL : tioccons(inode, file); + case FIONBIO: + return fionbio(file, (int *) arg); + case TIOCEXCL: + set_bit(TTY_EXCLUSIVE, &tty->flags); + return 0; + case TIOCNXCL: + clear_bit(TTY_EXCLUSIVE, &tty->flags); + return 0; + case TIOCNOTTY: + if (current->tty != tty) + return -ENOTTY; + if (current->leader) + disassociate_ctty(0); + task_lock(current); + current->tty = NULL; + task_unlock(current); + return 0; + case TIOCSCTTY: + return tiocsctty(tty, arg); + case TIOCGPGRP: + return tiocgpgrp(tty, real_tty, (pid_t *) arg); + case TIOCSPGRP: + return tiocspgrp(tty, real_tty, (pid_t *) arg); + case TIOCGSID: + return tiocgsid(tty, real_tty, (pid_t *) arg); + case TIOCGETD: + return put_user(tty->ldisc.num, (int *) arg); + case TIOCSETD: + return tiocsetd(tty, (int *) arg); +#ifdef CONFIG_VT + case TIOCLINUX: + return tioclinux(tty, arg); +#endif + case TIOCTTYGSTRUCT: + return tiocttygstruct(tty, (struct tty_struct *) arg); + + /* + * Break handling + */ + case TIOCSBRK: /* Turn break on, unconditionally */ + tty->driver.break_ctl(tty, -1); + return 0; + + case TIOCCBRK: /* Turn break off, unconditionally */ + tty->driver.break_ctl(tty, 0); + return 0; + case TCSBRK: /* SVID version: non-zero arg --> no break */ + /* + * XXX is the above comment correct, or the + * code below correct? Is this ioctl used at + * all by anyone? + */ + if (!arg) + return send_break(tty, HZ/4); + return 0; + case TCSBRKP: /* support for POSIX tcsendbreak() */ + return send_break(tty, arg ? arg*(HZ/10) : HZ/4); + } + if (tty->driver.ioctl) { + int retval = (tty->driver.ioctl)(tty, file, cmd, arg); + if (retval != -ENOIOCTLCMD) + return retval; + } + if (tty->ldisc.ioctl) { + int retval = (tty->ldisc.ioctl)(tty, file, cmd, arg); + if (retval != -ENOIOCTLCMD) + return retval; + } + return -EINVAL; +} + + +/* + * This implements the "Secure Attention Key" --- the idea is to + * prevent trojan horses by killing all processes associated with this + * tty when the user hits the "Secure Attention Key". Required for + * super-paranoid applications --- see the Orange Book for more details. + * + * This code could be nicer; ideally it should send a HUP, wait a few + * seconds, then send a INT, and then a KILL signal. But you then + * have to coordinate with the init process, since all processes associated + * with the current tty must be dead before the new getty is allowed + * to spawn. + * + * Now, if it would be correct ;-/ The current code has a nasty hole - + * it doesn't catch files in flight. We may send the descriptor to ourselves + * via AF_UNIX socket, close it and later fetch from socket. FIXME. + * + * Nasty bug: do_SAK is being called in interrupt context. This can + * deadlock. We punt it up to process context. AKPM - 16Mar2001 + */ +static void __do_SAK(void *arg) +{ +#ifdef TTY_SOFT_SAK + tty_hangup(tty); +#else + struct tty_struct *tty = arg; + struct task_struct *p; + int session; + int i; + struct file *filp; + + if (!tty) + return; + session = tty->session; + if (tty->ldisc.flush_buffer) + tty->ldisc.flush_buffer(tty); + if (tty->driver.flush_buffer) + tty->driver.flush_buffer(tty); + read_lock(&tasklist_lock); + for_each_task(p) { + if ((p->tty == tty) || + ((session > 0) && (p->session == session))) { + send_sig(SIGKILL, p, 1); + continue; + } + task_lock(p); + if (p->files) { + read_lock(&p->files->file_lock); + for (i=0; i < p->files->max_fds; i++) { + filp = fcheck_files(p->files, i); + if (filp && (filp->f_op == &tty_fops) && + (filp->private_data == tty)) { + send_sig(SIGKILL, p, 1); + break; + } + } + read_unlock(&p->files->file_lock); + } + task_unlock(p); + } + read_unlock(&tasklist_lock); +#endif +} + +/* + * The tq handling here is a little racy - tty->SAK_tq may already be queued. + * But there's no mechanism to fix that without futzing with tqueue_lock. + * Fortunately we don't need to worry, because if ->SAK_tq is already queued, + * the values which we write to it will be identical to the values which it + * already has. --akpm + */ +void do_SAK(struct tty_struct *tty) +{ + if (!tty) + return; + PREPARE_TQUEUE(&tty->SAK_tq, __do_SAK, tty); + schedule_task(&tty->SAK_tq); +} + +/* + * This routine is called out of the software interrupt to flush data + * from the flip buffer to the line discipline. + */ +static void flush_to_ldisc(void *private_) +{ + struct tty_struct *tty = (struct tty_struct *) private_; + unsigned char *cp; + char *fp; + int count; + unsigned long flags; + + if (test_bit(TTY_DONT_FLIP, &tty->flags)) { + queue_task(&tty->flip.tqueue, &tq_timer); + return; + } + if (tty->flip.buf_num) { + cp = tty->flip.char_buf + TTY_FLIPBUF_SIZE; + fp = tty->flip.flag_buf + TTY_FLIPBUF_SIZE; + tty->flip.buf_num = 0; + + save_flags(flags); cli(); + tty->flip.char_buf_ptr = tty->flip.char_buf; + tty->flip.flag_buf_ptr = tty->flip.flag_buf; + } else { + cp = tty->flip.char_buf; + fp = tty->flip.flag_buf; + tty->flip.buf_num = 1; + + save_flags(flags); cli(); + tty->flip.char_buf_ptr = tty->flip.char_buf + TTY_FLIPBUF_SIZE; + tty->flip.flag_buf_ptr = tty->flip.flag_buf + TTY_FLIPBUF_SIZE; + } + count = tty->flip.count; + tty->flip.count = 0; + restore_flags(flags); + + tty->ldisc.receive_buf(tty, cp, fp, count); +} + +/* + * Routine which returns the baud rate of the tty + * + * Note that the baud_table needs to be kept in sync with the + * include/asm/termbits.h file. + */ +static int baud_table[] = { + 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, + 9600, 19200, 38400, 57600, 115200, 230400, 460800, +#ifdef __sparc__ + 76800, 153600, 307200, 614400, 921600 +#else + 500000, 576000, 921600, 1000000, 1152000, 1500000, 2000000, + 2500000, 3000000, 3500000, 4000000 +#endif +}; + +static int n_baud_table = sizeof(baud_table)/sizeof(int); + +int tty_get_baud_rate(struct tty_struct *tty) +{ + unsigned int cflag, i; + + cflag = tty->termios->c_cflag; + + i = cflag & CBAUD; + if (i & CBAUDEX) { + i &= ~CBAUDEX; + if (i < 1 || i+15 >= n_baud_table) + tty->termios->c_cflag &= ~CBAUDEX; + else + i += 15; + } + if (i==15 && tty->alt_speed) { + if (!tty->warned) { + printk(KERN_WARNING "Use of setserial/setrocket to " + "set SPD_* flags is deprecated\n"); + tty->warned = 1; + } + return(tty->alt_speed); + } + + return baud_table[i]; +} + +void tty_flip_buffer_push(struct tty_struct *tty) +{ + if (tty->low_latency) + flush_to_ldisc((void *) tty); + else + queue_task(&tty->flip.tqueue, &tq_timer); +} + +/* + * This subroutine initializes a tty structure. + */ +static void initialize_tty_struct(struct tty_struct *tty) +{ + memset(tty, 0, sizeof(struct tty_struct)); + tty->magic = TTY_MAGIC; + tty->ldisc = ldiscs[N_TTY]; + tty->pgrp = -1; + tty->flip.char_buf_ptr = tty->flip.char_buf; + tty->flip.flag_buf_ptr = tty->flip.flag_buf; + tty->flip.tqueue.routine = flush_to_ldisc; + tty->flip.tqueue.data = tty; + init_MUTEX(&tty->flip.pty_sem); + init_waitqueue_head(&tty->write_wait); + init_waitqueue_head(&tty->read_wait); + tty->tq_hangup.routine = do_tty_hangup; + tty->tq_hangup.data = tty; + sema_init(&tty->atomic_read, 1); + sema_init(&tty->atomic_write, 1); + spin_lock_init(&tty->read_lock); + INIT_LIST_HEAD(&tty->tty_files); + INIT_TQUEUE(&tty->SAK_tq, 0, 0); +} + +/* + * The default put_char routine if the driver did not define one. + */ +void tty_default_put_char(struct tty_struct *tty, unsigned char ch) +{ + tty->driver.write(tty, 0, &ch, 1); +} + +/* + * Register a tty device described by , with minor number . + */ +void tty_register_devfs (struct tty_driver *driver, unsigned int flags, unsigned minor) +{ +#ifdef CONFIG_DEVFS_FS + umode_t mode = S_IFCHR | S_IRUSR | S_IWUSR; + kdev_t device = MKDEV (driver->major, minor); + int idx = minor - driver->minor_start; + char buf[32]; + + switch (device) { + case TTY_DEV: + case PTMX_DEV: + mode |= S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; + break; + default: + if (driver->major == PTY_MASTER_MAJOR) + mode |= S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; + break; + } + if ( (minor < driver->minor_start) || + (minor >= driver->minor_start + driver->num) ) { + printk(KERN_ERR "Attempt to register invalid minor number " + "with devfs (%d:%d).\n", (int)driver->major,(int)minor); + return; + } +# ifdef CONFIG_UNIX98_PTYS + if ( (driver->major >= UNIX98_PTY_SLAVE_MAJOR) && + (driver->major < UNIX98_PTY_SLAVE_MAJOR + UNIX98_NR_MAJORS) ) + flags |= DEVFS_FL_CURRENT_OWNER; +# endif + sprintf(buf, driver->name, idx + driver->name_base); + devfs_register (NULL, buf, flags | DEVFS_FL_DEFAULT, + driver->major, minor, mode, &tty_fops, NULL); +#endif /* CONFIG_DEVFS_FS */ +} + +void tty_unregister_devfs (struct tty_driver *driver, unsigned minor) +{ +#ifdef CONFIG_DEVFS_FS + void * handle; + int idx = minor - driver->minor_start; + char buf[32]; + + sprintf(buf, driver->name, idx + driver->name_base); + handle = devfs_find_handle (NULL, buf, driver->major, minor, + DEVFS_SPECIAL_CHR, 0); + devfs_unregister (handle); +#endif /* CONFIG_DEVFS_FS */ +} + +EXPORT_SYMBOL(tty_register_devfs); +EXPORT_SYMBOL(tty_unregister_devfs); + +/* + * Called by a tty driver to register itself. + */ +int tty_register_driver(struct tty_driver *driver) +{ + int error; + int i; + + if (driver->flags & TTY_DRIVER_INSTALLED) + return 0; + + error = devfs_register_chrdev(driver->major, driver->name, &tty_fops); + if (error < 0) + return error; + else if(driver->major == 0) + driver->major = error; + + if (!driver->put_char) + driver->put_char = tty_default_put_char; + + driver->prev = 0; + driver->next = tty_drivers; + if (tty_drivers) tty_drivers->prev = driver; + tty_drivers = driver; + + if ( !(driver->flags & TTY_DRIVER_NO_DEVFS) ) { + for(i = 0; i < driver->num; i++) + tty_register_devfs(driver, 0, driver->minor_start + i); + } + proc_tty_register_driver(driver); + return error; +} + +/* + * Called by a tty driver to unregister itself. + */ +int tty_unregister_driver(struct tty_driver *driver) +{ + int retval; + struct tty_driver *p; + int i, found = 0; + struct termios *tp; + const char *othername = NULL; + + if (*driver->refcount) + return -EBUSY; + + for (p = tty_drivers; p; p = p->next) { + if (p == driver) + found++; + else if (p->major == driver->major) + othername = p->name; + } + + if (!found) + return -ENOENT; + + if (othername == NULL) { + retval = devfs_unregister_chrdev(driver->major, driver->name); + if (retval) + return retval; + } else + devfs_register_chrdev(driver->major, othername, &tty_fops); + + if (driver->prev) + driver->prev->next = driver->next; + else + tty_drivers = driver->next; + + if (driver->next) + driver->next->prev = driver->prev; + + /* + * Free the termios and termios_locked structures because + * we don't want to get memory leaks when modular tty + * drivers are removed from the kernel. + */ + for (i = 0; i < driver->num; i++) { + tp = driver->termios[i]; + if (tp) { + driver->termios[i] = NULL; + kfree(tp); + } + tp = driver->termios_locked[i]; + if (tp) { + driver->termios_locked[i] = NULL; + kfree(tp); + } + tty_unregister_devfs(driver, driver->minor_start + i); + } + proc_tty_unregister_driver(driver); + return 0; +} + + +/* + * Initialize the console device. This is called *early*, so + * we can't necessarily depend on lots of kernel help here. + * Just do some early initializations, and do the complex setup + * later. + */ +void __init console_init(void) +{ + /* Setup the default TTY line discipline. */ + memset(ldiscs, 0, sizeof(ldiscs)); + (void) tty_register_ldisc(N_TTY, &tty_ldisc_N_TTY); + + /* + * Set up the standard termios. Individual tty drivers may + * deviate from this; this is used as a template. + */ + memset(&tty_std_termios, 0, sizeof(struct termios)); + memcpy(tty_std_termios.c_cc, INIT_C_CC, NCCS); + tty_std_termios.c_iflag = ICRNL | IXON; + tty_std_termios.c_oflag = OPOST | ONLCR; + tty_std_termios.c_cflag = B38400 | CS8 | CREAD | HUPCL; + tty_std_termios.c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | + ECHOCTL | ECHOKE | IEXTEN; + + /* + * set up the console device so that later boot sequences can + * inform about problems etc.. + */ +#ifdef CONFIG_EARLY_PRINTK + disable_early_printk(); +#endif + +#ifdef CONFIG_XEN_CONSOLE + xen_console_init(); +#endif + +#ifdef CONFIG_VT + con_init(); +#endif +#ifdef CONFIG_AU1X00_SERIAL_CONSOLE + au1x00_serial_console_init(); +#endif +#ifdef CONFIG_SERIAL_CONSOLE +#if (defined(CONFIG_8xx) || defined(CONFIG_CPM2)) + console_8xx_init(); +#elif defined(CONFIG_MAC_SERIAL) && defined(CONFIG_SERIAL) + if (_machine == _MACH_Pmac) + mac_scc_console_init(); + else + serial_console_init(); +#elif defined(CONFIG_MAC_SERIAL) + mac_scc_console_init(); +#elif defined(CONFIG_PARISC) + pdc_console_init(); +#elif defined(CONFIG_SERIAL) + serial_console_init(); +#endif /* CONFIG_8xx */ +#if defined(CONFIG_MVME162_SCC) || defined(CONFIG_BVME6000_SCC) || defined(CONFIG_MVME147_SCC) + vme_scc_console_init(); +#endif +#if defined(CONFIG_SERIAL167) + serial167_console_init(); +#endif +#if defined(CONFIG_SH_SCI) + sci_console_init(); +#endif +#endif +#ifdef CONFIG_SERIAL_DEC_CONSOLE + dec_serial_console_init(); +#endif +#ifdef CONFIG_TN3270_CONSOLE + tub3270_con_init(); +#endif +#ifdef CONFIG_TN3215 + con3215_init(); +#endif +#ifdef CONFIG_HWC + hwc_console_init(); +#endif +#ifdef CONFIG_STDIO_CONSOLE + stdio_console_init(); +#endif +#ifdef CONFIG_SERIAL_21285_CONSOLE + rs285_console_init(); +#endif +#ifdef CONFIG_SERIAL_SA1100_CONSOLE + sa1100_rs_console_init(); +#endif +#ifdef CONFIG_ARC_CONSOLE + arc_console_init(); +#endif +#ifdef CONFIG_SERIAL_AMBA_CONSOLE + ambauart_console_init(); +#endif +#ifdef CONFIG_SERIAL_TX3912_CONSOLE + tx3912_console_init(); +#endif +#ifdef CONFIG_TXX927_SERIAL_CONSOLE + txx927_console_init(); +#endif +#ifdef CONFIG_SERIAL_TXX9_CONSOLE + txx9_serial_console_init(); +#endif +#ifdef CONFIG_SIBYTE_SB1250_DUART_CONSOLE + sb1250_serial_console_init(); +#endif +#ifdef CONFIG_IP22_SERIAL + sgi_serial_console_init(); +#endif +} + +static struct tty_driver dev_tty_driver, dev_syscons_driver; +#ifdef CONFIG_UNIX98_PTYS +static struct tty_driver dev_ptmx_driver; +#endif +#ifdef CONFIG_HVC_CONSOLE + hvc_console_init(); +#endif +#ifdef CONFIG_VT +static struct tty_driver dev_console_driver; +#endif + +/* + * Ok, now we can initialize the rest of the tty devices and can count + * on memory allocations, interrupts etc.. + */ +void __init tty_init(void) +{ + /* + * dev_tty_driver and dev_console_driver are actually magic + * devices which get redirected at open time. Nevertheless, + * we register them so that register_chrdev is called + * appropriately. + */ + memset(&dev_tty_driver, 0, sizeof(struct tty_driver)); + dev_tty_driver.magic = TTY_DRIVER_MAGIC; + dev_tty_driver.driver_name = "/dev/tty"; + dev_tty_driver.name = dev_tty_driver.driver_name + 5; + dev_tty_driver.name_base = 0; + dev_tty_driver.major = TTYAUX_MAJOR; + dev_tty_driver.minor_start = 0; + dev_tty_driver.num = 1; + dev_tty_driver.type = TTY_DRIVER_TYPE_SYSTEM; + dev_tty_driver.subtype = SYSTEM_TYPE_TTY; + + if (tty_register_driver(&dev_tty_driver)) + panic("Couldn't register /dev/tty driver\n"); + + dev_syscons_driver = dev_tty_driver; + dev_syscons_driver.driver_name = "/dev/console"; + dev_syscons_driver.name = dev_syscons_driver.driver_name + 5; + dev_syscons_driver.major = TTYAUX_MAJOR; + dev_syscons_driver.minor_start = 1; + dev_syscons_driver.type = TTY_DRIVER_TYPE_SYSTEM; + dev_syscons_driver.subtype = SYSTEM_TYPE_SYSCONS; + + if (tty_register_driver(&dev_syscons_driver)) + panic("Couldn't register /dev/console driver\n"); + + /* console calls tty_register_driver() before kmalloc() works. + * Thus, we can't devfs_register() then. Do so now, instead. + */ +#ifdef CONFIG_VT + con_init_devfs(); +#endif + +#ifdef CONFIG_UNIX98_PTYS + dev_ptmx_driver = dev_tty_driver; + dev_ptmx_driver.driver_name = "/dev/ptmx"; + dev_ptmx_driver.name = dev_ptmx_driver.driver_name + 5; + dev_ptmx_driver.major= MAJOR(PTMX_DEV); + dev_ptmx_driver.minor_start = MINOR(PTMX_DEV); + dev_ptmx_driver.type = TTY_DRIVER_TYPE_SYSTEM; + dev_ptmx_driver.subtype = SYSTEM_TYPE_SYSPTMX; + + if (tty_register_driver(&dev_ptmx_driver)) + panic("Couldn't register /dev/ptmx driver\n"); +#endif + +#ifdef CONFIG_VT + dev_console_driver = dev_tty_driver; + dev_console_driver.driver_name = "/dev/vc/0"; + dev_console_driver.name = dev_console_driver.driver_name + 5; + dev_console_driver.major = TTY_MAJOR; + dev_console_driver.type = TTY_DRIVER_TYPE_SYSTEM; + dev_console_driver.subtype = SYSTEM_TYPE_CONSOLE; + + if (tty_register_driver(&dev_console_driver)) + panic("Couldn't register /dev/tty0 driver\n"); + + kbd_init(); +#endif + +#ifdef CONFIG_SGI_L1_SERIAL_CONSOLE + if (ia64_platform_is("sn2")) { + sn_sal_serial_console_init(); + return; /* only one console right now for SN2 */ + } +#endif +#ifdef CONFIG_ESPSERIAL /* init ESP before rs, so rs doesn't see the port */ + espserial_init(); +#endif +#if defined(CONFIG_MVME162_SCC) || defined(CONFIG_BVME6000_SCC) || defined(CONFIG_MVME147_SCC) + vme_scc_init(); +#endif +#ifdef CONFIG_SERIAL_TX3912 + tx3912_rs_init(); +#endif +#ifdef CONFIG_ROCKETPORT + rp_init(); +#endif +#ifdef CONFIG_SERIAL167 + serial167_init(); +#endif +#ifdef CONFIG_CYCLADES + cy_init(); +#endif +#ifdef CONFIG_STALLION + stl_init(); +#endif +#ifdef CONFIG_ISTALLION + stli_init(); +#endif +#ifdef CONFIG_DIGI + pcxe_init(); +#endif +#ifdef CONFIG_DIGIEPCA + pc_init(); +#endif +#ifdef CONFIG_SPECIALIX + specialix_init(); +#endif +#if (defined(CONFIG_8xx) || defined(CONFIG_CPM2)) + rs_8xx_init(); +#endif /* CONFIG_8xx */ + pty_init(); +#ifdef CONFIG_MOXA_SMARTIO + mxser_init(); +#endif +#ifdef CONFIG_MOXA_INTELLIO + moxa_init(); +#endif +#ifdef CONFIG_VT + vcs_init(); +#endif +#ifdef CONFIG_TN3270 + tub3270_init(); +#endif +#ifdef CONFIG_TN3215 + tty3215_init(); +#endif +#ifdef CONFIG_HWC + hwc_tty_init(); +#endif +#ifdef CONFIG_A2232 + a2232board_init(); +#endif +} -- cgit v1.2.3