Diffstat (limited to 'openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch')
-rw-r--r-- | openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch | 9859
1 file changed, 0 insertions, 9859 deletions
diff --git a/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch b/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch deleted file mode 100644 index 4075cab..0000000 --- a/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch +++ /dev/null @@ -1,9859 +0,0 @@ -diff -pruN ./drivers/md.dm/dm-bio-list.h ./drivers/md/dm-bio-list.h ---- ./drivers/md.dm/dm-bio-list.h 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-bio-list.h 2006-03-17 13:16:38.000000000 +0300 -@@ -33,6 +33,9 @@ static inline void bio_list_add(struct b - - static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) - { -+ if (!bl2->head) -+ return; -+ - if (bl->tail) - bl->tail->bi_next = bl2->head; - else -diff -pruN ./drivers/md.dm/dm-bio-record.h ./drivers/md/dm-bio-record.h ---- ./drivers/md.dm/dm-bio-record.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-bio-record.h 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,45 @@ -+/* -+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ */ -+ -+#ifndef DM_BIO_RECORD_H -+#define DM_BIO_RECORD_H -+ -+#include <linux/bio.h> -+ -+/* -+ * There are lots of mutable fields in the bio struct that get -+ * changed by the lower levels of the block layer. Some targets, -+ * such as multipath, may wish to resubmit a bio on error. The -+ * functions in this file help the target record and restore the -+ * original bio state. -+ */ -+struct dm_bio_details { -+ sector_t bi_sector; -+ struct block_device *bi_bdev; -+ unsigned int bi_size; -+ unsigned short bi_idx; -+ unsigned long bi_flags; -+}; -+ -+static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) -+{ -+ bd->bi_sector = bio->bi_sector; -+ bd->bi_bdev = bio->bi_bdev; -+ bd->bi_size = bio->bi_size; -+ bd->bi_idx = bio->bi_idx; -+ bd->bi_flags = bio->bi_flags; -+} -+ -+static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) -+{ -+ bio->bi_sector = bd->bi_sector; -+ bio->bi_bdev = bd->bi_bdev; -+ bio->bi_size = bd->bi_size; -+ bio->bi_idx = bd->bi_idx; -+ bio->bi_flags = bd->bi_flags; -+} -+ -+#endif -diff -pruN ./drivers/md.dm/dm.c ./drivers/md/dm.c ---- ./drivers/md.dm/dm.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm.c 2006-03-17 13:16:38.000000000 +0300 -@@ -15,15 +15,13 @@ - #include <linux/buffer_head.h> - #include <linux/mempool.h> - #include <linux/slab.h> -+#include <linux/idr.h> - - static const char *_name = DM_NAME; - - static unsigned int major = 0; - static unsigned int _major = 0; - --static int realloc_minor_bits(unsigned long requested_minor); --static void free_minor_bits(void); -- - /* - * One of these is allocated per bio. - */ -@@ -32,6 +30,7 @@ struct dm_io { - int error; - struct bio *bio; - atomic_t io_count; -+ unsigned long start_time; - }; - - /* -@@ -44,15 +43,23 @@ struct target_io { - union map_info info; - }; - -+union map_info *dm_get_mapinfo(struct bio *bio) -+{ -+ if (bio && bio->bi_private) -+ return &((struct target_io *)bio->bi_private)->info; -+ return NULL; -+} -+ - /* - * Bits for the md->flags field. 
- */ - #define DMF_BLOCK_IO 0 - #define DMF_SUSPENDED 1 --#define DMF_FS_LOCKED 2 -+#define DMF_FROZEN 2 - - struct mapped_device { -- struct rw_semaphore lock; -+ struct rw_semaphore io_lock; -+ struct semaphore suspend_lock; - rwlock_t map_lock; - atomic_t holders; - -@@ -61,6 +68,8 @@ struct mapped_device { - request_queue_t *queue; - struct gendisk *disk; - -+ void *interface_ptr; -+ - /* - * A list of ios that arrived while we were suspended. - */ -@@ -89,6 +98,7 @@ struct mapped_device { - * freeze/thaw support require holding onto a super block - */ - struct super_block *frozen_sb; -+ struct block_device *suspended_bdev; - }; - - #define MIN_IOS 256 -@@ -113,19 +123,11 @@ static int __init local_init(void) - return -ENOMEM; - } - -- r = realloc_minor_bits(1024); -- if (r < 0) { -- kmem_cache_destroy(_tio_cache); -- kmem_cache_destroy(_io_cache); -- return r; -- } -- - _major = major; - r = register_blkdev(_major, _name); - if (r < 0) { - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); -- free_minor_bits(); - return r; - } - -@@ -139,7 +141,6 @@ static void local_exit(void) - { - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); -- free_minor_bits(); - - if (unregister_blkdev(_major, _name) < 0) - DMERR("devfs_unregister_blkdev failed"); -@@ -238,21 +239,53 @@ static inline void free_tio(struct mappe - mempool_free(tio, md->tio_pool); - } - -+static void start_io_acct(struct dm_io *io) -+{ -+ struct mapped_device *md = io->md; -+ -+ io->start_time = jiffies; -+ -+ disk_round_stats(dm_disk(md)); -+ dm_disk(md)->in_flight = atomic_inc_return(&md->pending); -+} -+ -+static int end_io_acct(struct dm_io *io) -+{ -+ struct mapped_device *md = io->md; -+ struct bio *bio = io->bio; -+ unsigned long duration = jiffies - io->start_time; -+ int pending; -+ -+ disk_round_stats(dm_disk(md)); -+ dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); -+ -+ switch (bio_data_dir(bio)) { -+ case WRITE: -+ disk_stat_add(dm_disk(md), write_ticks, duration); -+ break; -+ case READ: -+ disk_stat_add(dm_disk(md), read_ticks, duration); -+ break; -+ } -+ -+ return !pending; -+} -+ - /* - * Add the bio to the list of deferred io. 
- */ - static int queue_io(struct mapped_device *md, struct bio *bio) - { -- down_write(&md->lock); -+ down_write(&md->io_lock); - - if (!test_bit(DMF_BLOCK_IO, &md->flags)) { -- up_write(&md->lock); -+ up_write(&md->io_lock); - return 1; - } - - bio_list_add(&md->deferred, bio); - -- up_write(&md->lock); -+ up_write(&md->io_lock); - return 0; /* deferred successfully */ - } - -@@ -293,7 +326,7 @@ static inline void dec_pending(struct dm - io->error = error; - - if (atomic_dec_and_test(&io->io_count)) { -- if (atomic_dec_and_test(&io->md->pending)) -+ if (end_io_acct(io)) - /* nudge anyone waiting on suspend queue */ - wake_up(&io->md->wait); - -@@ -342,8 +375,8 @@ static sector_t max_io_len(struct mapped - */ - if (ti->split_io) { - sector_t boundary; -- boundary = dm_round_up(offset + 1, ti->split_io) - offset; -- -+ boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) -+ - offset; - if (len > boundary) - len = boundary; - } -@@ -379,7 +412,7 @@ static void __map_bio(struct dm_target * - /* error the io and bail out */ - struct dm_io *io = tio->io; - free_tio(tio->io->md, tio); -- dec_pending(io, -EIO); -+ dec_pending(io, r); - bio_put(clone); - } - } -@@ -542,7 +575,7 @@ static void __split_bio(struct mapped_de - ci.sector_count = bio_sectors(bio); - ci.idx = bio->bi_idx; - -- atomic_inc(&md->pending); -+ start_io_acct(ci.io); - while (ci.sector_count) - __clone_and_map(&ci); - -@@ -563,14 +596,22 @@ static int dm_request(request_queue_t *q - int r; - struct mapped_device *md = q->queuedata; - -- down_read(&md->lock); -+ down_read(&md->io_lock); -+ -+ if (bio_data_dir(bio) == WRITE) { -+ disk_stat_inc(dm_disk(md), writes); -+ disk_stat_add(dm_disk(md), write_sectors, bio_sectors(bio)); -+ } else { -+ disk_stat_inc(dm_disk(md), reads); -+ disk_stat_add(dm_disk(md), read_sectors, bio_sectors(bio)); -+ } - - /* - * If we're suspended we have to queue - * this io for later. - */ - while (test_bit(DMF_BLOCK_IO, &md->flags)) { -- up_read(&md->lock); -+ up_read(&md->io_lock); - - if (bio_rw(bio) == READA) { - bio_io_error(bio, bio->bi_size); -@@ -589,14 +630,29 @@ static int dm_request(request_queue_t *q - * We're in a while loop, because someone could suspend - * before we get to the following read lock. - */ -- down_read(&md->lock); -+ down_read(&md->io_lock); - } - - __split_bio(md, bio); -- up_read(&md->lock); -+ up_read(&md->io_lock); - return 0; - } - -+static int dm_flush_all(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ struct mapped_device *md = q->queuedata; -+ struct dm_table *map = dm_get_table(md); -+ int ret = -ENXIO; -+ -+ if (map) { -+ ret = dm_table_flush_all(map); -+ dm_table_put(map); -+ } -+ -+ return ret; -+} -+ - static void dm_unplug_all(request_queue_t *q) - { - struct mapped_device *md = q->queuedata; -@@ -624,109 +680,86 @@ static int dm_any_congested(void *conges - } - - /*----------------------------------------------------------------- -- * A bitset is used to keep track of allocated minor numbers. -+ * An IDR is used to keep track of allocated minor numbers. 
- *---------------------------------------------------------------*/ - static DECLARE_MUTEX(_minor_lock); --static unsigned long *_minor_bits = NULL; --static unsigned long _max_minors = 0; -- --#define MINORS_SIZE(minors) ((minors / BITS_PER_LONG) * sizeof(unsigned long)) -- --static int realloc_minor_bits(unsigned long requested_minor) --{ -- unsigned long max_minors; -- unsigned long *minor_bits, *tmp; -- -- if (requested_minor < _max_minors) -- return -EINVAL; -- -- /* Round up the requested minor to the next power-of-2. */ -- max_minors = 1 << fls(requested_minor - 1); -- if (max_minors > (1 << MINORBITS)) -- return -EINVAL; -- -- minor_bits = kmalloc(MINORS_SIZE(max_minors), GFP_KERNEL); -- if (!minor_bits) -- return -ENOMEM; -- memset(minor_bits, 0, MINORS_SIZE(max_minors)); -- -- /* Copy the existing bit-set to the new one. */ -- if (_minor_bits) -- memcpy(minor_bits, _minor_bits, MINORS_SIZE(_max_minors)); -- -- tmp = _minor_bits; -- _minor_bits = minor_bits; -- _max_minors = max_minors; -- if (tmp) -- kfree(tmp); -- -- return 0; --} -- --static void free_minor_bits(void) --{ -- down(&_minor_lock); -- kfree(_minor_bits); -- _minor_bits = NULL; -- _max_minors = 0; -- up(&_minor_lock); --} -+static DEFINE_IDR(_minor_idr); - - static void free_minor(unsigned int minor) - { - down(&_minor_lock); -- if (minor < _max_minors) -- clear_bit(minor, _minor_bits); -+ idr_remove(&_minor_idr, minor); - up(&_minor_lock); - } - - /* - * See if the device with a specific minor # is free. - */ --static int specific_minor(unsigned int minor) -+static int specific_minor(struct mapped_device *md, unsigned int minor) - { -- int r = 0; -+ int r, m; - -- if (minor > (1 << MINORBITS)) -+ if (minor >= (1 << MINORBITS)) - return -EINVAL; - - down(&_minor_lock); -- if (minor >= _max_minors) { -- r = realloc_minor_bits(minor); -- if (r) { -- up(&_minor_lock); -- return r; -- } -+ -+ if (idr_find(&_minor_idr, minor)) { -+ r = -EBUSY; -+ goto out; -+ } -+ -+ r = idr_pre_get(&_minor_idr, GFP_KERNEL); -+ if (!r) { -+ r = -ENOMEM; -+ goto out; -+ } -+ -+ r = idr_get_new_above(&_minor_idr, md, minor, &m); -+ if (r) { -+ goto out; - } - -- if (test_and_set_bit(minor, _minor_bits)) -+ if (m != minor) { -+ idr_remove(&_minor_idr, m); - r = -EBUSY; -- up(&_minor_lock); -+ goto out; -+ } - -+out: -+ up(&_minor_lock); - return r; - } - --static int next_free_minor(unsigned int *minor) -+static int next_free_minor(struct mapped_device *md, unsigned int *minor) - { - int r; - unsigned int m; - - down(&_minor_lock); -- m = find_first_zero_bit(_minor_bits, _max_minors); -- if (m >= _max_minors) { -- r = realloc_minor_bits(_max_minors * 2); -- if (r) { -- up(&_minor_lock); -- return r; -- } -- m = find_first_zero_bit(_minor_bits, _max_minors); -+ -+ r = idr_pre_get(&_minor_idr, GFP_KERNEL); -+ if (!r) { -+ r = -ENOMEM; -+ goto out; -+ } -+ -+ r = idr_get_new(&_minor_idr, md, &m); -+ if (r) { -+ goto out; -+ } -+ -+ if (m >= (1 << MINORBITS)) { -+ idr_remove(&_minor_idr, m); -+ r = -ENOSPC; -+ goto out; - } - -- set_bit(m, _minor_bits); - *minor = m; -- up(&_minor_lock); - -- return 0; -+out: -+ up(&_minor_lock); -+ return r; - } - - static struct block_device_operations dm_blk_dops; -@@ -745,12 +778,13 @@ static struct mapped_device *alloc_dev(u - } - - /* get a minor number for the dev */ -- r = persistent ? specific_minor(minor) : next_free_minor(&minor); -+ r = persistent ? 
specific_minor(md, minor) : next_free_minor(md, &minor); - if (r < 0) - goto bad1; - - memset(md, 0, sizeof(*md)); -- init_rwsem(&md->lock); -+ init_rwsem(&md->io_lock); -+ init_MUTEX(&md->suspend_lock); - rwlock_init(&md->map_lock); - atomic_set(&md->holders, 1); - atomic_set(&md->event_nr, 0); -@@ -764,6 +798,7 @@ static struct mapped_device *alloc_dev(u - md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); - md->queue->unplug_fn = dm_unplug_all; -+ md->queue->issue_flush_fn = dm_flush_all; - - md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, - mempool_free_slab, _io_cache); -@@ -823,22 +858,17 @@ static void event_callback(void *context - { - struct mapped_device *md = (struct mapped_device *) context; - -- atomic_inc(&md->event_nr);; -+ atomic_inc(&md->event_nr); - wake_up(&md->eventq); - } - --static void __set_size(struct gendisk *disk, sector_t size) -+static void __set_size(struct mapped_device *md, sector_t size) - { -- struct block_device *bdev; -+ set_capacity(md->disk, size); - -- set_capacity(disk, size); -- bdev = bdget_disk(disk, 0); -- if (bdev) { -- down(&bdev->bd_inode->i_sem); -- i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); -- up(&bdev->bd_inode->i_sem); -- bdput(bdev); -- } -+ down(&md->suspended_bdev->bd_inode->i_sem); -+ i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); -+ up(&md->suspended_bdev->bd_inode->i_sem); - } - - static int __bind(struct mapped_device *md, struct dm_table *t) -@@ -847,17 +877,18 @@ static int __bind(struct mapped_device * - sector_t size; - - size = dm_table_get_size(t); -- __set_size(md->disk, size); -+ __set_size(md, size); - if (size == 0) - return 0; - -+ dm_table_get(t); -+ dm_table_event_callback(t, event_callback, md); -+ - write_lock(&md->map_lock); - md->map = t; -+ dm_table_set_restrictions(t, q); - write_unlock(&md->map_lock); - -- dm_table_get(t); -- dm_table_event_callback(md->map, event_callback, md); -- dm_table_set_restrictions(t, q); - return 0; - } - -@@ -901,6 +932,32 @@ int dm_create_with_minor(unsigned int mi - return create_aux(minor, 1, result); - } - -+void *dm_get_mdptr(dev_t dev) -+{ -+ struct mapped_device *md; -+ void *mdptr = NULL; -+ unsigned minor = MINOR(dev); -+ -+ if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) -+ return NULL; -+ -+ down(&_minor_lock); -+ -+ md = idr_find(&_minor_idr, minor); -+ -+ if (md && (dm_disk(md)->first_minor == minor)) -+ mdptr = md->interface_ptr; -+ -+ up(&_minor_lock); -+ -+ return mdptr; -+} -+ -+void dm_set_mdptr(struct mapped_device *md, void *ptr) -+{ -+ md->interface_ptr = ptr; -+} -+ - void dm_get(struct mapped_device *md) - { - atomic_inc(&md->holders); -@@ -911,8 +968,10 @@ void dm_put(struct mapped_device *md) - struct dm_table *map = dm_get_table(md); - - if (atomic_dec_and_test(&md->holders)) { -- if (!test_bit(DMF_SUSPENDED, &md->flags) && map) -- dm_table_suspend_targets(map); -+ if (!dm_suspended(md)) { -+ dm_table_presuspend_targets(map); -+ dm_table_postsuspend_targets(map); -+ } - __unbind(md); - free_dev(md); - } -@@ -940,69 +999,55 @@ static void __flush_deferred_io(struct m - */ - int dm_swap_table(struct mapped_device *md, struct dm_table *table) - { -- int r; -+ int r = -EINVAL; - -- down_write(&md->lock); -+ down(&md->suspend_lock); - - /* device must be suspended */ -- if (!test_bit(DMF_SUSPENDED, &md->flags)) { -- up_write(&md->lock); -- return -EPERM; -- } -+ if (!dm_suspended(md)) -+ goto out; - - __unbind(md); - r = __bind(md, table); -- if (r) -- return 
r; - -- up_write(&md->lock); -- return 0; -+out: -+ up(&md->suspend_lock); -+ return r; - } - - /* - * Functions to lock and unlock any filesystem running on the - * device. - */ --static int __lock_fs(struct mapped_device *md) -+static int lock_fs(struct mapped_device *md) - { -- struct block_device *bdev; -+ int r; - -- if (test_and_set_bit(DMF_FS_LOCKED, &md->flags)) -- return 0; -+ WARN_ON(md->frozen_sb); - -- bdev = bdget_disk(md->disk, 0); -- if (!bdev) { -- DMWARN("bdget failed in __lock_fs"); -- return -ENOMEM; -+ md->frozen_sb = freeze_bdev(md->suspended_bdev); -+ if (IS_ERR(md->frozen_sb)) { -+ r = PTR_ERR(md->frozen_sb); -+ md->frozen_sb = NULL; -+ return r; - } - -- WARN_ON(md->frozen_sb); -- md->frozen_sb = freeze_bdev(bdev); -+ set_bit(DMF_FROZEN, &md->flags); -+ - /* don't bdput right now, we don't want the bdev -- * to go away while it is locked. We'll bdput -- * in __unlock_fs -+ * to go away while it is locked. - */ - return 0; - } - --static int __unlock_fs(struct mapped_device *md) -+static void unlock_fs(struct mapped_device *md) - { -- struct block_device *bdev; -- -- if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags)) -- return 0; -- -- bdev = bdget_disk(md->disk, 0); -- if (!bdev) { -- DMWARN("bdget failed in __unlock_fs"); -- return -ENOMEM; -- } -+ if (!test_bit(DMF_FROZEN, &md->flags)) -+ return; - -- thaw_bdev(bdev, md->frozen_sb); -+ thaw_bdev(md->suspended_bdev, md->frozen_sb); - md->frozen_sb = NULL; -- bdput(bdev); -- bdput(bdev); -- return 0; -+ clear_bit(DMF_FROZEN, &md->flags); - } - - /* -@@ -1012,46 +1057,48 @@ static int __unlock_fs(struct mapped_dev - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ --int dm_suspend(struct mapped_device *md) -+int dm_suspend(struct mapped_device *md, int do_lockfs) - { -- struct dm_table *map; -+ struct dm_table *map = NULL; - DECLARE_WAITQUEUE(wait, current); -+ int r = -EINVAL; - -- /* Flush I/O to the device. */ -- down_read(&md->lock); -- if (test_bit(DMF_BLOCK_IO, &md->flags)) { -- up_read(&md->lock); -- return -EINVAL; -+ down(&md->suspend_lock); -+ -+ if (dm_suspended(md)) -+ goto out; -+ -+ map = dm_get_table(md); -+ -+ /* This does not get reverted if there's an error later. */ -+ dm_table_presuspend_targets(map); -+ -+ md->suspended_bdev = bdget_disk(md->disk, 0); -+ if (!md->suspended_bdev) { -+ DMWARN("bdget failed in dm_suspend"); -+ r = -ENOMEM; -+ goto out; - } - -- __lock_fs(md); -- up_read(&md->lock); -+ /* Flush I/O to the device. */ -+ if (do_lockfs) { -+ r = lock_fs(md); -+ if (r) -+ goto out; -+ } - - /* -- * First we set the BLOCK_IO flag so no more ios will be -- * mapped. -+ * First we set the BLOCK_IO flag so no more ios will be mapped. - */ -- down_write(&md->lock); -- if (test_bit(DMF_BLOCK_IO, &md->flags)) { -- /* -- * If we get here we know another thread is -- * trying to suspend as well, so we leave the fs -- * locked for this thread. 
-- */ -- up_write(&md->lock); -- return -EINVAL; -- } -- -+ down_write(&md->io_lock); - set_bit(DMF_BLOCK_IO, &md->flags); -+ - add_wait_queue(&md->wait, &wait); -- up_write(&md->lock); -+ up_write(&md->io_lock); - - /* unplug */ -- map = dm_get_table(md); -- if (map) { -+ if (map) - dm_table_unplug_all(map); -- dm_table_put(map); -- } - - /* - * Then we wait for the already mapped ios to -@@ -1067,54 +1114,75 @@ int dm_suspend(struct mapped_device *md) - } - set_current_state(TASK_RUNNING); - -- down_write(&md->lock); -+ down_write(&md->io_lock); - remove_wait_queue(&md->wait, &wait); - - /* were we interrupted ? */ -+ r = -EINTR; - if (atomic_read(&md->pending)) { -- __unlock_fs(md); -+ up_write(&md->io_lock); -+ unlock_fs(md); - clear_bit(DMF_BLOCK_IO, &md->flags); -- up_write(&md->lock); -- return -EINTR; -+ goto out; - } -+ up_write(&md->io_lock); -+ -+ dm_table_postsuspend_targets(map); - - set_bit(DMF_SUSPENDED, &md->flags); - -- map = dm_get_table(md); -- if (map) -- dm_table_suspend_targets(map); -- dm_table_put(map); -- up_write(&md->lock); -+ r = 0; - -- return 0; -+out: -+ if (r && md->suspended_bdev) { -+ bdput(md->suspended_bdev); -+ md->suspended_bdev = NULL; -+ } -+ -+ dm_table_put(map); -+ up(&md->suspend_lock); -+ return r; - } - - int dm_resume(struct mapped_device *md) - { -+ int r = -EINVAL; - struct bio *def; -- struct dm_table *map = dm_get_table(md); -+ struct dm_table *map = NULL; - -- down_write(&md->lock); -- if (!map || -- !test_bit(DMF_SUSPENDED, &md->flags) || -- !dm_table_get_size(map)) { -- up_write(&md->lock); -- dm_table_put(map); -- return -EINVAL; -- } -+ down(&md->suspend_lock); -+ if (!dm_suspended(md)) -+ goto out; -+ -+ map = dm_get_table(md); -+ if (!map || !dm_table_get_size(map)) -+ goto out; - - dm_table_resume_targets(map); -- clear_bit(DMF_SUSPENDED, &md->flags); -+ -+ down_write(&md->io_lock); - clear_bit(DMF_BLOCK_IO, &md->flags); - - def = bio_list_get(&md->deferred); - __flush_deferred_io(md, def); -- up_write(&md->lock); -- __unlock_fs(md); -+ up_write(&md->io_lock); -+ -+ unlock_fs(md); -+ -+ bdput(md->suspended_bdev); -+ md->suspended_bdev = NULL; -+ -+ clear_bit(DMF_SUSPENDED, &md->flags); -+ - dm_table_unplug_all(map); -+ -+ r = 0; -+ -+out: - dm_table_put(map); -+ up(&md->suspend_lock); - -- return 0; -+ return r; - } - - /*----------------------------------------------------------------- -@@ -1151,6 +1219,8 @@ static struct block_device_operations dm - .owner = THIS_MODULE - }; - -+EXPORT_SYMBOL(dm_get_mapinfo); -+ - /* - * module hooks - */ -@@ -1160,5 +1230,5 @@ module_exit(dm_exit); - module_param(major, uint, 0); - MODULE_PARM_DESC(major, "The major number of the device mapper"); - MODULE_DESCRIPTION(DM_NAME " driver"); --MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>"); -+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); - MODULE_LICENSE("GPL"); -diff -pruN ./drivers/md.dm/dm-crypt.c ./drivers/md/dm-crypt.c ---- ./drivers/md.dm/dm-crypt.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-crypt.c 2006-03-17 13:16:38.000000000 +0300 -@@ -40,8 +40,8 @@ struct convert_context { - struct bio *bio_out; - unsigned int offset_in; - unsigned int offset_out; -- int idx_in; -- int idx_out; -+ unsigned int idx_in; -+ unsigned int idx_out; - sector_t sector; - int write; - }; -@@ -67,8 +67,8 @@ struct crypt_config { - struct crypto_tfm *tfm; - sector_t iv_offset; - int (*iv_generator)(struct crypt_config *cc, u8 *iv, sector_t sector); -- int iv_size; -- int key_size; -+ unsigned int iv_size; -+ unsigned int key_size; - u8 
key[0]; - }; - -@@ -97,10 +97,8 @@ static void mempool_free_page(void *page - */ - static int crypt_iv_plain(struct crypt_config *cc, u8 *iv, sector_t sector) - { -+ memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); -- if (cc->iv_size > sizeof(u32) / sizeof(u8)) -- memset(iv + (sizeof(u32) / sizeof(u8)), 0, -- cc->iv_size - (sizeof(u32) / sizeof(u8))); - - return 0; - } -@@ -200,13 +198,13 @@ static int crypt_convert(struct crypt_co - */ - static struct bio * - crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, -- struct bio *base_bio, int *bio_vec_idx) -+ struct bio *base_bio, unsigned int *bio_vec_idx) - { - struct bio *bio; -- int nr_iovecs = dm_div_up(size, PAGE_SIZE); -+ unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - int gfp_mask = GFP_NOIO | __GFP_HIGHMEM; -- int flags = current->flags; -- int i; -+ unsigned long flags = current->flags; -+ unsigned int i; - - /* - * Tell VM to act less aggressively and fail earlier. -@@ -280,9 +278,8 @@ crypt_alloc_buffer(struct crypt_config * - static void crypt_free_buffer_pages(struct crypt_config *cc, - struct bio *bio, unsigned int bytes) - { -- unsigned int start, end; -+ unsigned int i, start, end; - struct bio_vec *bv; -- int i; - - /* - * This is ugly, but Jens Axboe thinks that using bi_idx in the -@@ -366,11 +363,11 @@ static void kcryptd_queue_io(struct cryp - /* - * Decode key from its hex representation - */ --static int crypt_decode_key(u8 *key, char *hex, int size) -+static int crypt_decode_key(u8 *key, char *hex, unsigned int size) - { - char buffer[3]; - char *endp; -- int i; -+ unsigned int i; - - buffer[2] = '\0'; - -@@ -393,9 +390,9 @@ static int crypt_decode_key(u8 *key, cha - /* - * Encode key into its hex representation - */ --static void crypt_encode_key(char *hex, u8 *key, int size) -+static void crypt_encode_key(char *hex, u8 *key, unsigned int size) - { -- int i; -+ unsigned int i; - - for(i = 0; i < size; i++) { - sprintf(hex, "%02x", *key); -@@ -415,8 +412,8 @@ static int crypt_ctr(struct dm_target *t - char *tmp; - char *cipher; - char *mode; -- int crypto_flags; -- int key_size; -+ unsigned int crypto_flags; -+ unsigned int key_size; - - if (argc != 5) { - ti->error = PFX "Not enough arguments"; -@@ -464,9 +461,9 @@ static int crypt_ctr(struct dm_target *t - } - - if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) -- /* at least a 32 bit sector number should fit in our buffer */ -+ /* at least a 64 bit sector number should fit in our buffer */ - cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), -- (unsigned int)(sizeof(u32) / sizeof(u8))); -+ (unsigned int)(sizeof(u64) / sizeof(u8))); - else { - cc->iv_size = 0; - if (cc->iv_generator) { -@@ -528,6 +525,8 @@ bad3: - bad2: - crypto_free_tfm(tfm); - bad1: -+ /* Must zero key material before freeing */ -+ memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); - return -EINVAL; - } -@@ -541,6 +540,9 @@ static void crypt_dtr(struct dm_target * - - crypto_free_tfm(cc->tfm); - dm_put_device(ti, cc->dev); -+ -+ /* Must zero key material before freeing */ -+ memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); - } - -@@ -577,7 +579,8 @@ static int crypt_endio(struct bio *bio, - - static inline struct bio * - crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, -- sector_t sector, int *bvec_idx, struct convert_context *ctx) -+ sector_t sector, unsigned int *bvec_idx, -+ struct convert_context *ctx) - { - struct bio *clone; - -@@ -630,7 +633,7 @@ static int 
crypt_map(struct dm_target *t - struct bio *clone; - unsigned int remaining = bio->bi_size; - sector_t sector = bio->bi_sector - ti->begin; -- int bvec_idx = 0; -+ unsigned int bvec_idx = 0; - - io->target = ti; - io->bio = bio; -@@ -693,7 +696,7 @@ static int crypt_status(struct dm_target - char buffer[32]; - const char *cipher; - const char *mode = NULL; -- int offset; -+ unsigned int offset; - - switch (type) { - case STATUSTYPE_INFO: -diff -pruN ./drivers/md.dm/dm-emc.c ./drivers/md/dm-emc.c ---- ./drivers/md.dm/dm-emc.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-emc.c 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,359 @@ -+/* -+ * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved. -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ * -+ * Multipath support for EMC CLARiiON AX/CX-series hardware. -+ */ -+ -+#include "dm.h" -+#include "dm-hw-handler.h" -+#include <scsi/scsi.h> -+#include <scsi/scsi_cmnd.h> -+ -+struct emc_handler { -+ spinlock_t lock; -+ -+ /* Whether we should send the short trespass command (FC-series) -+ * or the long version (default for AX/CX CLARiiON arrays). */ -+ unsigned short_trespass; -+ /* Whether or not to honor SCSI reservations when initiating a -+ * switch-over. Default: Don't. */ -+ unsigned hr; -+ -+ unsigned char sense[SCSI_SENSE_BUFFERSIZE]; -+}; -+ -+#define TRESPASS_PAGE 0x22 -+#define EMC_FAILOVER_TIMEOUT (60 * HZ) -+ -+/* Code borrowed from dm-lsi-rdac by Mike Christie */ -+ -+static inline void free_bio(struct bio *bio) -+{ -+ __free_page(bio->bi_io_vec[0].bv_page); -+ bio_put(bio); -+} -+ -+static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) -+{ -+ struct path *path = bio->bi_private; -+ -+ if (bio->bi_size) -+ return 1; -+ -+ /* We also need to look at the sense keys here whether or not to -+ * switch to the next PG etc. -+ * -+ * For now simple logic: either it works or it doesn't. -+ */ -+ if (error) -+ dm_pg_init_complete(path, MP_FAIL_PATH); -+ else -+ dm_pg_init_complete(path, 0); -+ -+ /* request is freed in block layer */ -+ free_bio(bio); -+ -+ return 0; -+} -+ -+static struct bio *get_failover_bio(struct path *path, unsigned data_size) -+{ -+ struct bio *bio; -+ struct page *page; -+ -+ bio = bio_alloc(GFP_ATOMIC, 1); -+ if (!bio) { -+ DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); -+ return NULL; -+ } -+ -+ bio->bi_rw |= (1 << BIO_RW); -+ bio->bi_bdev = path->dev->bdev; -+ bio->bi_sector = 0; -+ bio->bi_private = path; -+ bio->bi_end_io = emc_endio; -+ -+ page = alloc_page(GFP_ATOMIC); -+ if (!page) { -+ DMERR("dm-emc: get_failover_bio: alloc_page() failed."); -+ bio_put(bio); -+ return NULL; -+ } -+ -+ if (bio_add_page(bio, page, data_size, 0) != data_size) { -+ DMERR("dm-emc: get_failover_bio: alloc_page() failed."); -+ __free_page(page); -+ bio_put(bio); -+ return NULL; -+ } -+ -+ return bio; -+} -+ -+static struct request *get_failover_req(struct emc_handler *h, -+ struct bio *bio, struct path *path) -+{ -+ struct request *rq; -+ struct block_device *bdev = bio->bi_bdev; -+ struct request_queue *q = bdev_get_queue(bdev); -+ -+ /* FIXME: Figure out why it fails with GFP_ATOMIC. 
*/ -+ rq = blk_get_request(q, WRITE, __GFP_WAIT); -+ if (!rq) { -+ DMERR("dm-emc: get_failover_req: blk_get_request failed"); -+ return NULL; -+ } -+ -+ rq->bio = rq->biotail = bio; -+ blk_rq_bio_prep(q, rq, bio); -+ -+ rq->rq_disk = bdev->bd_contains->bd_disk; -+ -+ /* bio backed don't set data */ -+ rq->buffer = rq->data = NULL; -+ /* rq data_len used for pc cmd's request_bufflen */ -+ rq->data_len = bio->bi_size; -+ -+ rq->sense = h->sense; -+ memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); -+ rq->sense_len = 0; -+ -+ memset(&rq->cmd, 0, BLK_MAX_CDB); -+ -+ rq->timeout = EMC_FAILOVER_TIMEOUT; -+ rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); -+ -+ return rq; -+} -+ -+static struct request *emc_trespass_get(struct emc_handler *h, -+ struct path *path) -+{ -+ struct bio *bio; -+ struct request *rq; -+ unsigned char *page22; -+ unsigned char long_trespass_pg[] = { -+ 0, 0, 0, 0, -+ TRESPASS_PAGE, /* Page code */ -+ 0x09, /* Page length - 2 */ -+ h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ -+ 0xff, 0xff, /* Trespass target */ -+ 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */ -+ }; -+ unsigned char short_trespass_pg[] = { -+ 0, 0, 0, 0, -+ TRESPASS_PAGE, /* Page code */ -+ 0x02, /* Page length - 2 */ -+ h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ -+ 0xff, /* Trespass target */ -+ }; -+ unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) : -+ sizeof(long_trespass_pg); -+ -+ /* get bio backing */ -+ if (data_size > PAGE_SIZE) -+ /* this should never happen */ -+ return NULL; -+ -+ bio = get_failover_bio(path, data_size); -+ if (!bio) { -+ DMERR("dm-emc: emc_trespass_get: no bio"); -+ return NULL; -+ } -+ -+ page22 = (unsigned char *)bio_data(bio); -+ memset(page22, 0, data_size); -+ -+ memcpy(page22, h->short_trespass ? -+ short_trespass_pg : long_trespass_pg, data_size); -+ -+ /* get request for block layer packet command */ -+ rq = get_failover_req(h, bio, path); -+ if (!rq) { -+ DMERR("dm-emc: emc_trespass_get: no rq"); -+ free_bio(bio); -+ return NULL; -+ } -+ -+ /* Prepare the command. */ -+ rq->cmd[0] = MODE_SELECT; -+ rq->cmd[1] = 0x10; -+ rq->cmd[4] = data_size; -+ rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); -+ -+ return rq; -+} -+ -+static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, -+ struct path *path) -+{ -+ struct request *rq; -+ struct request_queue *q = bdev_get_queue(path->dev->bdev); -+ -+ /* -+ * We can either blindly init the pg (then look at the sense), -+ * or we can send some commands to get the state here (then -+ * possibly send the fo cmnd), or we can also have the -+ * initial state passed into us and then get an update here. -+ */ -+ if (!q) { -+ DMINFO("dm-emc: emc_pg_init: no queue"); -+ goto fail_path; -+ } -+ -+ /* FIXME: The request should be pre-allocated. 
*/ -+ rq = emc_trespass_get(hwh->context, path); -+ if (!rq) { -+ DMERR("dm-emc: emc_pg_init: no rq"); -+ goto fail_path; -+ } -+ -+ DMINFO("dm-emc: emc_pg_init: sending switch-over command"); -+ elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); -+ return; -+ -+fail_path: -+ dm_pg_init_complete(path, MP_FAIL_PATH); -+} -+ -+static struct emc_handler *alloc_emc_handler(void) -+{ -+ struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL); -+ -+ if (h) { -+ memset(h, 0, sizeof(*h)); -+ spin_lock_init(&h->lock); -+ } -+ -+ return h; -+} -+ -+static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) -+{ -+ struct emc_handler *h; -+ unsigned hr, short_trespass; -+ -+ if (argc == 0) { -+ /* No arguments: use defaults */ -+ hr = 0; -+ short_trespass = 0; -+ } else if (argc != 2) { -+ DMWARN("dm-emc hwhandler: incorrect number of arguments"); -+ return -EINVAL; -+ } else { -+ if ((sscanf(argv[0], "%u", &short_trespass) != 1) -+ || (short_trespass > 1)) { -+ DMWARN("dm-emc: invalid trespass mode selected"); -+ return -EINVAL; -+ } -+ -+ if ((sscanf(argv[1], "%u", &hr) != 1) -+ || (hr > 1)) { -+ DMWARN("dm-emc: invalid honor reservation flag selected"); -+ return -EINVAL; -+ } -+ } -+ -+ h = alloc_emc_handler(); -+ if (!h) -+ return -ENOMEM; -+ -+ hwh->context = h; -+ -+ if ((h->short_trespass = short_trespass)) -+ DMWARN("dm-emc: short trespass command will be send"); -+ else -+ DMWARN("dm-emc: long trespass command will be send"); -+ -+ if ((h->hr = hr)) -+ DMWARN("dm-emc: honor reservation bit will be set"); -+ else -+ DMWARN("dm-emc: honor reservation bit will not be set (default)"); -+ -+ return 0; -+} -+ -+static void emc_destroy(struct hw_handler *hwh) -+{ -+ struct emc_handler *h = (struct emc_handler *) hwh->context; -+ -+ kfree(h); -+ hwh->context = NULL; -+} -+ -+static unsigned emc_error(struct hw_handler *hwh, struct bio *bio) -+{ -+ /* FIXME: Patch from axboe still missing */ -+#if 0 -+ int sense; -+ -+ if (bio->bi_error & BIO_SENSE) { -+ sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */ -+ -+ if (sense == 0x020403) { -+ /* LUN Not Ready - Manual Intervention Required -+ * indicates this is a passive path. -+ * -+ * FIXME: However, if this is seen and EVPD C0 -+ * indicates that this is due to a NDU in -+ * progress, we should set FAIL_PATH too. -+ * This indicates we might have to do a SCSI -+ * inquiry in the end_io path. Ugh. */ -+ return MP_BYPASS_PG | MP_RETRY_IO; -+ } else if (sense == 0x052501) { -+ /* An array based copy is in progress. Do not -+ * fail the path, do not bypass to another PG, -+ * do not retry. Fail the IO immediately. -+ * (Actually this is the same conclusion as in -+ * the default handler, but lets make sure.) */ -+ return 0; -+ } else if (sense == 0x062900) { -+ /* Unit Attention Code. This is the first IO -+ * to the new path, so just retry. 
*/ -+ return MP_RETRY_IO; -+ } -+ } -+#endif -+ -+ /* Try default handler */ -+ return dm_scsi_err_handler(hwh, bio); -+} -+ -+static struct hw_handler_type emc_hwh = { -+ .name = "emc", -+ .module = THIS_MODULE, -+ .create = emc_create, -+ .destroy = emc_destroy, -+ .pg_init = emc_pg_init, -+ .error = emc_error, -+}; -+ -+static int __init dm_emc_init(void) -+{ -+ int r = dm_register_hw_handler(&emc_hwh); -+ -+ if (r < 0) -+ DMERR("emc: register failed %d", r); -+ -+ DMINFO("dm-emc version 0.0.3 loaded"); -+ -+ return r; -+} -+ -+static void __exit dm_emc_exit(void) -+{ -+ int r = dm_unregister_hw_handler(&emc_hwh); -+ -+ if (r < 0) -+ DMERR("emc: unregister failed %d", r); -+} -+ -+module_init(dm_emc_init); -+module_exit(dm_emc_exit); -+ -+MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath"); -+MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>"); -+MODULE_LICENSE("GPL"); -diff -pruN ./drivers/md.dm/dm.h ./drivers/md/dm.h ---- ./drivers/md.dm/dm.h 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm.h 2006-03-17 13:16:38.000000000 +0300 -@@ -19,6 +19,9 @@ - #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) - #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) - -+#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ -+ 0 : scnprintf(result + sz, maxlen - sz, x)) -+ - /* - * FIXME: I think this should be with the definition of sector_t - * in types.h. -@@ -40,6 +43,7 @@ struct dm_dev { - atomic_t count; - int mode; - struct block_device *bdev; -+ char name[16]; - }; - - struct dm_table; -@@ -51,6 +55,8 @@ struct mapped_device; - *---------------------------------------------------------------*/ - int dm_create(struct mapped_device **md); - int dm_create_with_minor(unsigned int minor, struct mapped_device **md); -+void dm_set_mdptr(struct mapped_device *md, void *ptr); -+void *dm_get_mdptr(dev_t dev); - - /* - * Reference counting for md. -@@ -61,7 +67,7 @@ void dm_put(struct mapped_device *md); - /* - * A device can still be used while suspended, but I/O is deferred. - */ --int dm_suspend(struct mapped_device *md); -+int dm_suspend(struct mapped_device *md, int with_lockfs); - int dm_resume(struct mapped_device *md); - - /* -@@ -109,10 +115,12 @@ void dm_table_set_restrictions(struct dm - unsigned int dm_table_get_num_targets(struct dm_table *t); - struct list_head *dm_table_get_devices(struct dm_table *t); - int dm_table_get_mode(struct dm_table *t); --void dm_table_suspend_targets(struct dm_table *t); -+void dm_table_presuspend_targets(struct dm_table *t); -+void dm_table_postsuspend_targets(struct dm_table *t); - void dm_table_resume_targets(struct dm_table *t); - int dm_table_any_congested(struct dm_table *t, int bdi_bits); - void dm_table_unplug_all(struct dm_table *t); -+int dm_table_flush_all(struct dm_table *t); - - /*----------------------------------------------------------------- - * A registry of target types. -@@ -135,21 +143,22 @@ static inline int array_too_big(unsigned - } - - /* -- * ceiling(n / size) * size -+ * Ceiling(n / sz) - */ --static inline unsigned long dm_round_up(unsigned long n, unsigned long size) --{ -- unsigned long r = n % size; -- return n + (r ? 
(size - r) : 0); --} -+#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz)) -+ -+#define dm_sector_div_up(n, sz) ( \ -+{ \ -+ sector_t _r = ((n) + (sz) - 1); \ -+ sector_div(_r, (sz)); \ -+ _r; \ -+} \ -+) - - /* -- * Ceiling(n / size) -+ * ceiling(n / size) * size - */ --static inline unsigned long dm_div_up(unsigned long n, unsigned long size) --{ -- return dm_round_up(n, size) / size; --} -+#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) - - static inline sector_t to_sector(unsigned long n) - { -@@ -161,6 +170,8 @@ static inline unsigned long to_bytes(sec - return (n << 9); - } - -+int dm_split_args(int *argc, char ***argvp, char *input); -+ - /* - * The device-mapper can be driven through one of two interfaces; - * ioctl or filesystem, depending which patch you have applied. -@@ -178,5 +189,6 @@ int dm_stripe_init(void); - void dm_stripe_exit(void); - - void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); -+union map_info *dm_get_mapinfo(struct bio *bio); - - #endif -diff -pruN ./drivers/md.dm/dm-hw-handler.c ./drivers/md/dm-hw-handler.c ---- ./drivers/md.dm/dm-hw-handler.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-hw-handler.c 2006-03-20 09:38:13.000000000 +0300 -@@ -0,0 +1,216 @@ -+/* -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ * -+ * Multipath hardware handler registration. -+ */ -+ -+#include "dm.h" -+#include "dm-hw-handler.h" -+ -+#include <linux/slab.h> -+ -+struct hwh_internal { -+ struct hw_handler_type hwht; -+ -+ struct list_head list; -+ long use; -+}; -+ -+#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht) -+ -+static LIST_HEAD(_hw_handlers); -+static DECLARE_RWSEM(_hwh_lock); -+ -+struct hwh_internal *__find_hw_handler_type(const char *name) -+{ -+ struct hwh_internal *hwhi; -+ -+ list_for_each_entry(hwhi, &_hw_handlers, list) { -+ if (!strcmp(name, hwhi->hwht.name)) -+ return hwhi; -+ } -+ -+ return NULL; -+} -+ -+static struct hwh_internal *get_hw_handler(const char *name) -+{ -+ struct hwh_internal *hwhi; -+ -+ down_read(&_hwh_lock); -+ hwhi = __find_hw_handler_type(name); -+ if (hwhi) { -+ if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module)) -+ hwhi = NULL; -+ else -+ hwhi->use++; -+ } -+ up_read(&_hwh_lock); -+ -+ return hwhi; -+} -+ -+struct hw_handler_type *dm_get_hw_handler(const char *name) -+{ -+ struct hwh_internal *hwhi; -+ -+ if (!name) -+ return NULL; -+ -+ hwhi = get_hw_handler(name); -+ if (!hwhi) { -+ request_module("dm-%s", name); -+ hwhi = get_hw_handler(name); -+ } -+ -+ return hwhi ? 
&hwhi->hwht : NULL; -+} -+ -+void dm_put_hw_handler(struct hw_handler_type *hwht) -+{ -+ struct hwh_internal *hwhi; -+ -+ if (!hwht) -+ return; -+ -+ down_read(&_hwh_lock); -+ hwhi = __find_hw_handler_type(hwht->name); -+ if (!hwhi) -+ goto out; -+ -+ if (--hwhi->use == 0) -+ module_put(hwhi->hwht.module); -+ -+ if (hwhi->use < 0) -+ BUG(); -+ -+ out: -+ up_read(&_hwh_lock); -+} -+ -+static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht) -+{ -+ struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL); -+ -+ if (hwhi) { -+ memset(hwhi, 0, sizeof(*hwhi)); -+ hwhi->hwht = *hwht; -+ } -+ -+ return hwhi; -+} -+ -+int dm_register_hw_handler(struct hw_handler_type *hwht) -+{ -+ int r = 0; -+ struct hwh_internal *hwhi = _alloc_hw_handler(hwht); -+ -+ if (!hwhi) -+ return -ENOMEM; -+ -+ down_write(&_hwh_lock); -+ -+ if (__find_hw_handler_type(hwht->name)) { -+ kfree(hwhi); -+ r = -EEXIST; -+ } else -+ list_add(&hwhi->list, &_hw_handlers); -+ -+ up_write(&_hwh_lock); -+ -+ return r; -+} -+ -+int dm_unregister_hw_handler(struct hw_handler_type *hwht) -+{ -+ struct hwh_internal *hwhi; -+ -+ down_write(&_hwh_lock); -+ -+ hwhi = __find_hw_handler_type(hwht->name); -+ if (!hwhi) { -+ up_write(&_hwh_lock); -+ return -EINVAL; -+ } -+ -+ if (hwhi->use) { -+ up_write(&_hwh_lock); -+ return -ETXTBSY; -+ } -+ -+ list_del(&hwhi->list); -+ -+ up_write(&_hwh_lock); -+ -+ kfree(hwhi); -+ -+ return 0; -+} -+ -+unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio) -+{ -+#if 0 -+ int sense_key, asc, ascq; -+ -+ if (bio->bi_error & BIO_SENSE) { -+ /* FIXME: This is just an initial guess. */ -+ /* key / asc / ascq */ -+ sense_key = (bio->bi_error >> 16) & 0xff; -+ asc = (bio->bi_error >> 8) & 0xff; -+ ascq = bio->bi_error & 0xff; -+ -+ switch (sense_key) { -+ /* This block as a whole comes from the device. -+ * So no point retrying on another path. */ -+ case 0x03: /* Medium error */ -+ case 0x05: /* Illegal request */ -+ case 0x07: /* Data protect */ -+ case 0x08: /* Blank check */ -+ case 0x0a: /* copy aborted */ -+ case 0x0c: /* obsolete - no clue ;-) */ -+ case 0x0d: /* volume overflow */ -+ case 0x0e: /* data miscompare */ -+ case 0x0f: /* reserved - no idea either. */ -+ return MP_ERROR_IO; -+ -+ /* For these errors it's unclear whether they -+ * come from the device or the controller. -+ * So just lets try a different path, and if -+ * it eventually succeeds, user-space will clear -+ * the paths again... */ -+ case 0x02: /* Not ready */ -+ case 0x04: /* Hardware error */ -+ case 0x09: /* vendor specific */ -+ case 0x0b: /* Aborted command */ -+ return MP_FAIL_PATH; -+ -+ case 0x06: /* Unit attention - might want to decode */ -+ if (asc == 0x04 && ascq == 0x01) -+ /* "Unit in the process of -+ * becoming ready" */ -+ return 0; -+ return MP_FAIL_PATH; -+ -+ /* FIXME: For Unit Not Ready we may want -+ * to have a generic pg activation -+ * feature (START_UNIT). */ -+ -+ /* Should these two ever end up in the -+ * error path? I don't think so. */ -+ case 0x00: /* No sense */ -+ case 0x01: /* Recovered error */ -+ return 0; -+ } -+ } -+#endif -+ -+ /* We got no idea how to decode the other kinds of errors -> -+ * assume generic error condition. 
*/ -+ return MP_FAIL_PATH; -+} -+ -+EXPORT_SYMBOL_GPL(dm_register_hw_handler); -+EXPORT_SYMBOL_GPL(dm_unregister_hw_handler); -+EXPORT_SYMBOL_GPL(dm_scsi_err_handler); -diff -pruN ./drivers/md.dm/dm-hw-handler.h ./drivers/md/dm-hw-handler.h ---- ./drivers/md.dm/dm-hw-handler.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-hw-handler.h 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,61 @@ -+/* -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ * -+ * Multipath hardware handler registration. -+ */ -+ -+#ifndef DM_HW_HANDLER_H -+#define DM_HW_HANDLER_H -+ -+#include <linux/device-mapper.h> -+ -+#include "dm-mpath.h" -+ -+struct hw_handler_type; -+struct hw_handler { -+ struct hw_handler_type *type; -+ void *context; -+}; -+ -+/* -+ * Constructs a hardware handler object, takes custom arguments -+ */ -+/* Information about a hardware handler type */ -+struct hw_handler_type { -+ char *name; -+ struct module *module; -+ -+ int (*create) (struct hw_handler *handler, unsigned int argc, -+ char **argv); -+ void (*destroy) (struct hw_handler *hwh); -+ -+ void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, -+ struct path *path); -+ unsigned (*error) (struct hw_handler *hwh, struct bio *bio); -+ int (*status) (struct hw_handler *hwh, status_type_t type, -+ char *result, unsigned int maxlen); -+}; -+ -+/* Register a hardware handler */ -+int dm_register_hw_handler(struct hw_handler_type *type); -+ -+/* Unregister a hardware handler */ -+int dm_unregister_hw_handler(struct hw_handler_type *type); -+ -+/* Returns a registered hardware handler type */ -+struct hw_handler_type *dm_get_hw_handler(const char *name); -+ -+/* Releases a hardware handler */ -+void dm_put_hw_handler(struct hw_handler_type *hwht); -+ -+/* Default err function */ -+unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio); -+ -+/* Error flags for err and dm_pg_init_complete */ -+#define MP_FAIL_PATH 1 -+#define MP_BYPASS_PG 2 -+#define MP_ERROR_IO 4 /* Don't retry this I/O */ -+ -+#endif -diff -pruN ./drivers/md.dm/dm-io.c ./drivers/md/dm-io.c ---- ./drivers/md.dm/dm-io.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-io.c 2006-03-17 13:16:38.000000000 +0300 -@@ -267,7 +267,7 @@ static int resize_pool(unsigned int new_ - /* create new pool */ - _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); - if (!_io_pool) -- r = -ENOMEM; -+ return -ENOMEM; - - r = bio_set_init(&_bios, "dm-io", 512, 1); - if (r) { -diff -pruN ./drivers/md.dm/dm-ioctl.c ./drivers/md/dm-ioctl.c ---- ./drivers/md.dm/dm-ioctl.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-ioctl.c 2006-03-17 13:16:38.000000000 +0300 -@@ -1,5 +1,6 @@ - /* - * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. -+ * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ -@@ -17,7 +18,7 @@ - - #include <asm/uaccess.h> - --#define DM_DRIVER_EMAIL "dm@uk.sistina.com" -+#define DM_DRIVER_EMAIL "dm-devel@redhat.com" - - /*----------------------------------------------------------------- - * The ioctl interface needs to be able to look up devices by -@@ -121,14 +122,6 @@ static struct hash_cell *__get_uuid_cell - /*----------------------------------------------------------------- - * Inserting, removing and renaming a device. 
- *---------------------------------------------------------------*/ --static inline char *kstrdup(const char *str) --{ -- char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); -- if (r) -- strcpy(r, str); -- return r; --} -- - static struct hash_cell *alloc_cell(const char *name, const char *uuid, - struct mapped_device *md) - { -@@ -138,7 +131,7 @@ static struct hash_cell *alloc_cell(cons - if (!hc) - return NULL; - -- hc->name = kstrdup(name); -+ hc->name = kstrdup(name, GFP_KERNEL); - if (!hc->name) { - kfree(hc); - return NULL; -@@ -148,7 +141,7 @@ static struct hash_cell *alloc_cell(cons - hc->uuid = NULL; - - else { -- hc->uuid = kstrdup(uuid); -+ hc->uuid = kstrdup(uuid, GFP_KERNEL); - if (!hc->uuid) { - kfree(hc->name); - kfree(hc); -@@ -224,6 +217,7 @@ static int dm_hash_insert(const char *na - } - register_with_devfs(cell); - dm_get(md); -+ dm_set_mdptr(md, cell); - up_write(&_hash_lock); - - return 0; -@@ -236,10 +230,20 @@ static int dm_hash_insert(const char *na - - static void __hash_remove(struct hash_cell *hc) - { -+ struct dm_table *table; -+ - /* remove from the dev hash */ - list_del(&hc->uuid_list); - list_del(&hc->name_list); - unregister_with_devfs(hc); -+ dm_set_mdptr(hc->md, NULL); -+ -+ table = dm_get_table(hc->md); -+ if (table) { -+ dm_table_event(table); -+ dm_table_put(table); -+ } -+ - dm_put(hc->md); - if (hc->new_map) - dm_table_put(hc->new_map); -@@ -266,11 +270,12 @@ static int dm_hash_rename(const char *ol - { - char *new_name, *old_name; - struct hash_cell *hc; -+ struct dm_table *table; - - /* - * duplicate new. - */ -- new_name = kstrdup(new); -+ new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) - return -ENOMEM; - -@@ -313,6 +318,15 @@ static int dm_hash_rename(const char *ol - /* rename the device node in devfs */ - register_with_devfs(hc); - -+ /* -+ * Wake up any dm event waiters. -+ */ -+ table = dm_get_table(hc->md); -+ if (table) { -+ dm_table_event(table); -+ dm_table_put(table); -+ } -+ - up_write(&_hash_lock); - kfree(old_name); - return 0; -@@ -421,8 +435,8 @@ static void list_version_get_needed(stru - { - size_t *needed = needed_param; - -+ *needed += sizeof(struct dm_target_versions); - *needed += strlen(tt->name); -- *needed += sizeof(tt->version); - *needed += ALIGN_MASK; - } - -@@ -517,19 +531,22 @@ static int __dev_status(struct mapped_de - if (dm_suspended(md)) - param->flags |= DM_SUSPEND_FLAG; - -- bdev = bdget_disk(disk, 0); -- if (!bdev) -- return -ENXIO; -- - param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); - -- /* -- * Yes, this will be out of date by the time it gets back -- * to userland, but it is still very useful ofr -- * debugging. -- */ -- param->open_count = bdev->bd_openers; -- bdput(bdev); -+ if (!(param->flags & DM_SKIP_BDGET_FLAG)) { -+ bdev = bdget_disk(disk, 0); -+ if (!bdev) -+ return -ENXIO; -+ -+ /* -+ * Yes, this will be out of date by the time it gets back -+ * to userland, but it is still very useful for -+ * debugging. -+ */ -+ param->open_count = bdev->bd_openers; -+ bdput(bdev); -+ } else -+ param->open_count = -1; - - if (disk->policy) - param->flags |= DM_READONLY_FLAG; -@@ -579,12 +596,16 @@ static int dev_create(struct dm_ioctl *p - } - - /* -- * Always use UUID for lookups if it's present, otherwise use name. -+ * Always use UUID for lookups if it's present, otherwise use name or dev. - */ - static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) - { -- return *param->uuid ? 
-- __get_uuid_cell(param->uuid) : __get_name_cell(param->name); -+ if (*param->uuid) -+ return __get_uuid_cell(param->uuid); -+ else if (*param->name) -+ return __get_name_cell(param->name); -+ else -+ return dm_get_mdptr(huge_decode_dev(param->dev)); - } - - static inline struct mapped_device *find_device(struct dm_ioctl *param) -@@ -596,6 +617,7 @@ static inline struct mapped_device *find - hc = __find_device_hash_cell(param); - if (hc) { - md = hc->md; -+ dm_get(md); - - /* - * Sneakily write in both the name and the uuid -@@ -611,8 +633,6 @@ static inline struct mapped_device *find - param->flags |= DM_INACTIVE_PRESENT_FLAG; - else - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; -- -- dm_get(md); - } - up_read(&_hash_lock); - -@@ -673,14 +693,18 @@ static int dev_rename(struct dm_ioctl *p - static int do_suspend(struct dm_ioctl *param) - { - int r = 0; -+ int do_lockfs = 1; - struct mapped_device *md; - - md = find_device(param); - if (!md) - return -ENXIO; - -+ if (param->flags & DM_SKIP_LOCKFS_FLAG) -+ do_lockfs = 0; -+ - if (!dm_suspended(md)) -- r = dm_suspend(md); -+ r = dm_suspend(md, do_lockfs); - - if (!r) - r = __dev_status(md, param); -@@ -692,6 +716,7 @@ static int do_suspend(struct dm_ioctl *p - static int do_resume(struct dm_ioctl *param) - { - int r = 0; -+ int do_lockfs = 1; - struct hash_cell *hc; - struct mapped_device *md; - struct dm_table *new_map; -@@ -717,8 +742,10 @@ static int do_resume(struct dm_ioctl *pa - /* Do we need to load a new map ? */ - if (new_map) { - /* Suspend if it isn't already suspended */ -+ if (param->flags & DM_SKIP_LOCKFS_FLAG) -+ do_lockfs = 0; - if (!dm_suspended(md)) -- dm_suspend(md); -+ dm_suspend(md, do_lockfs); - - r = dm_swap_table(md, new_map); - if (r) { -@@ -964,6 +991,7 @@ static int table_load(struct dm_ioctl *p - if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); - up_write(&_hash_lock); -+ dm_table_put(t); - return -ENXIO; - } - -@@ -1097,6 +1125,67 @@ static int table_status(struct dm_ioctl - return r; - } - -+/* -+ * Pass a message to the target that's at the supplied device offset. -+ */ -+static int target_message(struct dm_ioctl *param, size_t param_size) -+{ -+ int r, argc; -+ char **argv; -+ struct mapped_device *md; -+ struct dm_table *table; -+ struct dm_target *ti; -+ struct dm_target_msg *tmsg = (void *) param + param->data_start; -+ -+ md = find_device(param); -+ if (!md) -+ return -ENXIO; -+ -+ r = __dev_status(md, param); -+ if (r) -+ goto out; -+ -+ if (tmsg < (struct dm_target_msg *) (param + 1) || -+ invalid_str(tmsg->message, (void *) param + param_size)) { -+ DMWARN("Invalid target message parameters."); -+ r = -EINVAL; -+ goto out; -+ } -+ -+ r = dm_split_args(&argc, &argv, tmsg->message); -+ if (r) { -+ DMWARN("Failed to split target message parameters"); -+ goto out; -+ } -+ -+ table = dm_get_table(md); -+ if (!table) -+ goto out_argv; -+ -+ if (tmsg->sector >= dm_table_get_size(table)) { -+ DMWARN("Target message sector outside device."); -+ r = -EINVAL; -+ goto out_table; -+ } -+ -+ ti = dm_table_find_target(table, tmsg->sector); -+ if (ti->type->message) -+ r = ti->type->message(ti, argc, argv); -+ else { -+ DMWARN("Target type does not support messages"); -+ r = -EINVAL; -+ } -+ -+ out_table: -+ dm_table_put(table); -+ out_argv: -+ kfree(argv); -+ out: -+ param->data_size = 0; -+ dm_put(md); -+ return r; -+} -+ - /*----------------------------------------------------------------- - * Implementation of open/close/ioctl on the special char - * device. 
-@@ -1123,7 +1212,9 @@ static ioctl_fn lookup_ioctl(unsigned in - {DM_TABLE_DEPS_CMD, table_deps}, - {DM_TABLE_STATUS_CMD, table_status}, - -- {DM_LIST_VERSIONS_CMD, list_versions} -+ {DM_LIST_VERSIONS_CMD, list_versions}, -+ -+ {DM_TARGET_MSG_CMD, target_message} - }; - - return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; -@@ -1202,14 +1293,14 @@ static int validate_params(uint cmd, str - cmd == DM_LIST_VERSIONS_CMD) - return 0; - -- /* Unless creating, either name or uuid but not both */ -- if (cmd != DM_DEV_CREATE_CMD) { -- if ((!*param->uuid && !*param->name) || -- (*param->uuid && *param->name)) { -- DMWARN("one of name or uuid must be supplied, cmd(%u)", -- cmd); -+ if ((cmd == DM_DEV_CREATE_CMD)) { -+ if (!*param->name) { -+ DMWARN("name not supplied when creating device"); - return -EINVAL; - } -+ } else if ((*param->uuid && *param->name)) { -+ DMWARN("only supply one of name or uuid, cmd(%u)", cmd); -+ return -EINVAL; - } - - /* Ensure strings are terminated */ -@@ -1268,16 +1359,11 @@ static int ctl_ioctl(struct inode *inode - * Copy the parameters into kernel space. - */ - r = copy_params(user, ¶m); -- if (r) { -- current->flags &= ~PF_MEMALLOC; -- return r; -- } - -- /* -- * FIXME: eventually we will remove the PF_MEMALLOC flag -- * here. However the tools still do nasty things like -- * 'load' while a device is suspended. -- */ -+ current->flags &= ~PF_MEMALLOC; -+ -+ if (r) -+ return r; - - r = validate_params(cmd, param); - if (r) -@@ -1295,7 +1381,6 @@ static int ctl_ioctl(struct inode *inode - - out: - free_params(param); -- current->flags &= ~PF_MEMALLOC; - return r; - } - -diff -pruN ./drivers/md.dm/dm-linear.c ./drivers/md/dm-linear.c ---- ./drivers/md.dm/dm-linear.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-linear.c 2006-03-17 13:16:38.000000000 +0300 -@@ -80,7 +80,6 @@ static int linear_status(struct dm_targe - char *result, unsigned int maxlen) - { - struct linear_c *lc = (struct linear_c *) ti->private; -- char buffer[32]; - - switch (type) { - case STATUSTYPE_INFO: -@@ -88,8 +87,8 @@ static int linear_status(struct dm_targe - break; - - case STATUSTYPE_TABLE: -- format_dev_t(buffer, lc->dev->bdev->bd_dev); -- snprintf(result, maxlen, "%s " SECTOR_FORMAT, buffer, lc->start); -+ snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name, -+ lc->start); - break; - } - return 0; -diff -pruN ./drivers/md.dm/dm-log.c ./drivers/md/dm-log.c ---- ./drivers/md.dm/dm-log.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-log.c 2006-03-17 13:16:38.000000000 +0300 -@@ -17,9 +17,6 @@ static spinlock_t _lock = SPIN_LOCK_UNLO - - int dm_register_dirty_log_type(struct dirty_log_type *type) - { -- if (!try_module_get(type->module)) -- return -EINVAL; -- - spin_lock(&_lock); - type->use_count = 0; - list_add(&type->list, &_log_types); -@@ -33,11 +30,10 @@ int dm_unregister_dirty_log_type(struct - spin_lock(&_lock); - - if (type->use_count) -- DMWARN("Attempt to unregister a log type that is still in use"); -- else { -+ DMWARN("Unregister failed: log type '%s' still in use", -+ type->name); -+ else - list_del(&type->list); -- module_put(type->module); -- } - - spin_unlock(&_lock); - -@@ -51,6 +47,10 @@ static struct dirty_log_type *get_type(c - spin_lock(&_lock); - list_for_each_entry (type, &_log_types, list) - if (!strcmp(type_name, type->name)) { -+ if (!type->use_count && !try_module_get(type->module)){ -+ spin_unlock(&_lock); -+ return NULL; -+ } - type->use_count++; - spin_unlock(&_lock); - return type; -@@ -63,7 +63,8 @@ static struct 
dirty_log_type *get_type(c - static void put_type(struct dirty_log_type *type) - { - spin_lock(&_lock); -- type->use_count--; -+ if (!--type->use_count) -+ module_put(type->module); - spin_unlock(&_lock); - } - -@@ -112,7 +113,7 @@ void dm_destroy_dirty_log(struct dirty_l - /* - * The on-disk version of the metadata. - */ --#define MIRROR_DISK_VERSION 1 -+#define MIRROR_DISK_VERSION 2 - #define LOG_OFFSET 2 - - struct log_header { -@@ -129,20 +130,32 @@ struct log_header { - struct log_c { - struct dm_target *ti; - int touched; -- sector_t region_size; -+ uint32_t region_size; - unsigned int region_count; - region_t sync_count; - - unsigned bitset_uint32_count; - uint32_t *clean_bits; - uint32_t *sync_bits; -- uint32_t *recovering_bits; /* FIXME: this seems excessive */ -+ uint32_t *recovering_bits; - - int sync_search; - -+ /* Resync flag */ -+ enum sync { -+ DEFAULTSYNC, /* Synchronize if necessary */ -+ NOSYNC, /* Devices known to be already in sync */ -+ FORCESYNC, /* Force a sync to happen */ -+ } sync; -+ -+ int failure_response; -+ - /* - * Disk log fields - */ -+ int log_dev_failed; -+ atomic_t suspended; -+ struct completion failure_completion; - struct dm_dev *log_dev; - struct log_header header; - -@@ -150,7 +163,6 @@ struct log_c { - struct log_header *disk_header; - - struct io_region bits_location; -- uint32_t *disk_bits; - }; - - /* -@@ -159,20 +171,20 @@ struct log_c { - */ - static inline int log_test_bit(uint32_t *bs, unsigned bit) - { -- return test_bit(bit, (unsigned long *) bs) ? 1 : 0; -+ return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; - } - - static inline void log_set_bit(struct log_c *l, - uint32_t *bs, unsigned bit) - { -- set_bit(bit, (unsigned long *) bs); -+ ext2_set_bit(bit, (unsigned long *) bs); - l->touched = 1; - } - - static inline void log_clear_bit(struct log_c *l, - uint32_t *bs, unsigned bit) - { -- clear_bit(bit, (unsigned long *) bs); -+ ext2_clear_bit(bit, (unsigned long *) bs); - l->touched = 1; - } - -@@ -205,12 +217,19 @@ static int read_header(struct log_c *log - - header_from_disk(&log->header, log->disk_header); - -- if (log->header.magic != MIRROR_MAGIC) { -+ /* New log required? */ -+ if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { - log->header.magic = MIRROR_MAGIC; - log->header.version = MIRROR_DISK_VERSION; - log->header.nr_regions = 0; - } - -+ /* Version 2 is like version 1 but always little endian on disk. 
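 *
 * (A v1 header written on a little-endian host is thus byte-for-byte a
 * valid v2 header and is upgraded in place; on other hosts a v1 log
 * fails the version check below, which disk_resume() then treats as a
 * failed read and reinitialises the log.)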
*/ -+#ifdef __LITTLE_ENDIAN -+ if (log->header.version == 1) -+ log->header.version = 2; -+#endif -+ - if (log->header.version != MIRROR_DISK_VERSION) { - DMWARN("incompatible disk log version"); - return -EINVAL; -@@ -231,70 +250,69 @@ static inline int write_header(struct lo - /*---------------------------------------------------------------- - * Bits IO - *--------------------------------------------------------------*/ --static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count) --{ -- unsigned i; -- -- for (i = 0; i < count; i++) -- core[i] = le32_to_cpu(disk[i]); --} -- --static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count) --{ -- unsigned i; -- -- /* copy across the clean/dirty bitset */ -- for (i = 0; i < count; i++) -- disk[i] = cpu_to_le32(core[i]); --} -- - static int read_bits(struct log_c *log) - { - int r; - unsigned long ebits; - - r = dm_io_sync_vm(1, &log->bits_location, READ, -- log->disk_bits, &ebits); -+ log->clean_bits, &ebits); - if (r) - return r; - -- bits_to_core(log->clean_bits, log->disk_bits, -- log->bitset_uint32_count); - return 0; - } - - static int write_bits(struct log_c *log) - { - unsigned long ebits; -- bits_to_disk(log->clean_bits, log->disk_bits, -- log->bitset_uint32_count); - return dm_io_sync_vm(1, &log->bits_location, WRITE, -- log->disk_bits, &ebits); -+ log->clean_bits, &ebits); - } - - /*---------------------------------------------------------------- -- * constructor/destructor -+ * core log constructor/destructor -+ * -+ * argv contains: <region_size> [[no]sync] [block_on_error] - *--------------------------------------------------------------*/ - #define BYTE_SHIFT 3 - static int core_ctr(struct dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv) - { -+ enum sync sync = DEFAULTSYNC; -+ int failure_response = DMLOG_IOERR_IGNORE; -+ - struct log_c *lc; -- sector_t region_size; -+ uint32_t region_size; - unsigned int region_count; - size_t bitset_size; -+ unsigned i; - -- if (argc != 1) { -- DMWARN("wrong number of arguments to log_c"); -+ if (argc < 1 || argc > 3) { -+ DMWARN("wrong number of arguments to mirror log"); - return -EINVAL; - } - -- if (sscanf(argv[0], SECTOR_FORMAT, ®ion_size) != 1) { -+ for (i = 1; i < argc; i++) { -+ if (!strcmp(argv[i], "sync")) -+ sync = FORCESYNC; -+ else if (!strcmp(argv[i], "nosync")) -+ sync = NOSYNC; -+ else if (!strcmp(argv[i], "block_on_error")) -+ failure_response = DMLOG_IOERR_BLOCK; -+ else { -+ DMWARN("unrecognised sync argument to mirror log: %s", -+ argv[i]); -+ return -EINVAL; -+ } -+ } -+ -+ if (sscanf(argv[0], "%u", ®ion_size) != 1) { - DMWARN("invalid region size string"); - return -EINVAL; - } - -- region_count = dm_div_up(ti->len, region_size); -+ region_count = dm_sector_div_up(ti->len, region_size); - - lc = kmalloc(sizeof(*lc), GFP_KERNEL); - if (!lc) { -@@ -306,12 +324,14 @@ static int core_ctr(struct dirty_log *lo - lc->touched = 0; - lc->region_size = region_size; - lc->region_count = region_count; -+ lc->sync = sync; -+ lc->failure_response = failure_response; - - /* -- * Work out how many words we need to hold the bitset. -+ * Work out how many "unsigned long"s we need to hold the bitset. 
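 *
 * e.g. region_count = 1000 on a 64-bit host: rounded up to 1024 bits,
 * so bitset_size = 128 bytes and bitset_uint32_count = 32.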
- */ - bitset_size = dm_round_up(region_count, -- sizeof(*lc->clean_bits) << BYTE_SHIFT); -+ sizeof(unsigned long) << BYTE_SHIFT); - bitset_size >>= BYTE_SHIFT; - - lc->bitset_uint32_count = bitset_size / 4; -@@ -330,12 +350,12 @@ static int core_ctr(struct dirty_log *lo - kfree(lc); - return -ENOMEM; - } -- memset(lc->sync_bits, 0, bitset_size); -- lc->sync_count = 0; -+ memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); -+ lc->sync_count = (sync == NOSYNC) ? region_count : 0; - - lc->recovering_bits = vmalloc(bitset_size); - if (!lc->recovering_bits) { -- DMWARN("couldn't allocate sync bitset"); -+ DMWARN("couldn't allocate recovering bitset"); - vfree(lc->sync_bits); - vfree(lc->clean_bits); - kfree(lc); -@@ -356,6 +376,11 @@ static void core_dtr(struct dirty_log *l - kfree(lc); - } - -+/*---------------------------------------------------------------- -+ * disk log constructor/destructor -+ * -+ * argv contains log_device region_size followed optionally by [no]sync -+ *--------------------------------------------------------------*/ - static int disk_ctr(struct dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv) - { -@@ -364,8 +389,8 @@ static int disk_ctr(struct dirty_log *lo - struct log_c *lc; - struct dm_dev *dev; - -- if (argc != 2) { -- DMWARN("wrong number of arguments to log_d"); -+ if (argc < 2 || argc > 3) { -+ DMWARN("wrong number of arguments to disk mirror log"); - return -EINVAL; - } - -@@ -382,6 +407,8 @@ static int disk_ctr(struct dirty_log *lo - - lc = (struct log_c *) log->context; - lc->log_dev = dev; -+ lc->log_dev_failed = 0; -+ init_completion(&lc->failure_completion); - - /* setup the disk header fields */ - lc->header_location.bdev = lc->log_dev->bdev; -@@ -403,11 +430,6 @@ static int disk_ctr(struct dirty_log *lo - size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t), - 1 << SECTOR_SHIFT); - lc->bits_location.count = size >> SECTOR_SHIFT; -- lc->disk_bits = vmalloc(size); -- if (!lc->disk_bits) { -- vfree(lc->disk_header); -- goto bad; -- } - return 0; - - bad: -@@ -421,7 +443,6 @@ static void disk_dtr(struct dirty_log *l - struct log_c *lc = (struct log_c *) log->context; - dm_put_device(lc->ti, lc->log_dev); - vfree(lc->disk_header); -- vfree(lc->disk_bits); - core_dtr(log); - } - -@@ -435,42 +456,65 @@ static int count_bits32(uint32_t *addr, - return count; - } - -+static void fail_log_device(struct log_c *lc) -+{ -+ lc->log_dev_failed = 1; -+ if (lc->failure_response == DMLOG_IOERR_BLOCK) -+ dm_table_event(lc->ti->table); -+} -+ -+static void restore_log_device(struct log_c *lc) -+{ -+ lc->log_dev_failed = 0; -+} -+ - static int disk_resume(struct dirty_log *log) - { -- int r; -+ int r = 0; - unsigned i; - struct log_c *lc = (struct log_c *) log->context; - size_t size = lc->bitset_uint32_count * sizeof(uint32_t); - -- /* read the disk header */ -- r = read_header(lc); -- if (r) -- return r; -- -- /* read the bits */ -- r = read_bits(lc); -- if (r) -- return r; -- -- /* zero any new bits if the mirror has grown */ -- for (i = lc->header.nr_regions; i < lc->region_count; i++) -- /* FIXME: amazingly inefficient */ -- log_clear_bit(lc, lc->clean_bits, i); -+ /* -+ * Read the disk header, but only if we know it is good. -+ * Assume the worst in the event of failure. -+ */ -+ if (!lc->log_dev_failed && -+ ((r = read_header(lc)) || read_bits(lc))) { -+ DMWARN("Read %s failed on mirror log device, %s.", -+ r ? 
"header" : "bits", lc->log_dev->name); -+ fail_log_device(lc); -+ lc->header.nr_regions = 0; -+ } -+ -+ /* set or clear any new bits */ -+ if (lc->sync == NOSYNC) -+ for (i = lc->header.nr_regions; i < lc->region_count; i++) -+ /* FIXME: amazingly inefficient */ -+ log_set_bit(lc, lc->clean_bits, i); -+ else -+ for (i = lc->header.nr_regions; i < lc->region_count; i++) -+ /* FIXME: amazingly inefficient */ -+ log_clear_bit(lc, lc->clean_bits, i); - - /* copy clean across to sync */ - memcpy(lc->sync_bits, lc->clean_bits, size); - lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); - -- /* write the bits */ -- r = write_bits(lc); -- if (r) -- return r; -- - /* set the correct number of regions in the header */ - lc->header.nr_regions = lc->region_count; - -- /* write the new header */ -- return write_header(lc); -+ /* write out the log. 'i' tells us which has failed if any */ -+ i = 1; -+ if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) { -+ DMWARN("Write %s failed on mirror log device, %s.", -+ i ? "bits" : "header", lc->log_dev->name); -+ fail_log_device(lc); -+ } else -+ restore_log_device(lc); -+ -+ atomic_set(&lc->suspended, 0); -+ return r; - } - - static sector_t core_get_region_size(struct dirty_log *log) -@@ -497,6 +541,17 @@ static int core_flush(struct dirty_log * - return 0; - } - -+static int disk_presuspend(struct dirty_log *log) -+{ -+ struct log_c *lc = (struct log_c *) log->context; -+ -+ atomic_set(&lc->suspended, 1); -+ if (lc->log_dev_failed && (lc->failure_response == DMLOG_IOERR_BLOCK)) -+ complete(&lc->failure_completion); -+ -+ return 0; -+} -+ - static int disk_flush(struct dirty_log *log) - { - int r; -@@ -506,9 +561,24 @@ static int disk_flush(struct dirty_log * - if (!lc->touched) - return 0; - -+ /* -+ * If a failure occurs, we must wait for a suspension. -+ * We must not proceed in the event of a failure, -+ * because if the machine reboots with the log -+ * incorrect, recovery could be compromised -+ */ - r = write_bits(lc); -- if (!r) -+ if (!r) { - lc->touched = 0; -+ restore_log_device(lc); -+ } else { -+ DMERR("Write failure on mirror log device, %s.", -+ lc->log_dev->name); -+ fail_log_device(lc); -+ if (!atomic_read(&lc->suspended) && -+ (lc->failure_response == DMLOG_IOERR_BLOCK)) -+ wait_for_completion(&lc->failure_completion); -+ } - - return r; - } -@@ -538,7 +608,7 @@ static int core_get_resync_work(struct d - lc->sync_search); - lc->sync_search = *region + 1; - -- if (*region == lc->region_count) -+ if (*region >= lc->region_count) - return 0; - - } while (log_test_bit(lc->recovering_bits, *region)); -@@ -566,6 +636,60 @@ static region_t core_get_sync_count(stru - return lc->sync_count; - } - -+#define DMEMIT_SYNC \ -+ if (lc->sync != DEFAULTSYNC) \ -+ DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") -+ -+static int core_status(struct dirty_log *log, status_type_t status, -+ char *result, unsigned int maxlen) -+{ -+ int sz = 0; -+ struct log_c *lc = log->context; -+ -+ switch(status) { -+ case STATUSTYPE_INFO: -+ DMEMIT("1 core"); -+ break; -+ -+ case STATUSTYPE_TABLE: -+ DMEMIT("%s %u %u ", log->type->name, -+ lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); -+ DMEMIT_SYNC; -+ } -+ -+ return sz; -+} -+ -+static int disk_status(struct dirty_log *log, status_type_t status, -+ char *result, unsigned int maxlen) -+{ -+ int sz = 0; -+ struct log_c *lc = log->context; -+ -+ switch(status) { -+ case STATUSTYPE_INFO: -+ DMEMIT("3 disk %s %c", lc->log_dev->name, -+ lc->log_dev_failed ? 
'D' : 'A'); -+ break; -+ -+ case STATUSTYPE_TABLE: -+ DMEMIT("%s %u %s %u ", log->type->name, -+ lc->sync == DEFAULTSYNC ? 2 : 3, -+ lc->log_dev->name, -+ lc->region_size); -+ DMEMIT_SYNC; -+ } -+ -+ return sz; -+} -+ -+static int core_get_failure_response(struct dirty_log *log) -+{ -+ struct log_c *lc = log->context; -+ -+ return lc->failure_response; -+} -+ - static struct dirty_log_type _core_type = { - .name = "core", - .module = THIS_MODULE, -@@ -579,7 +703,9 @@ static struct dirty_log_type _core_type - .clear_region = core_clear_region, - .get_resync_work = core_get_resync_work, - .complete_resync_work = core_complete_resync_work, -- .get_sync_count = core_get_sync_count -+ .get_sync_count = core_get_sync_count, -+ .status = core_status, -+ .get_failure_response = core_get_failure_response, - }; - - static struct dirty_log_type _disk_type = { -@@ -587,7 +713,8 @@ static struct dirty_log_type _disk_type - .module = THIS_MODULE, - .ctr = disk_ctr, - .dtr = disk_dtr, -- .suspend = disk_flush, -+ .presuspend = disk_presuspend, -+ .postsuspend = disk_flush, - .resume = disk_resume, - .get_region_size = core_get_region_size, - .is_clean = core_is_clean, -@@ -597,7 +724,9 @@ static struct dirty_log_type _disk_type - .clear_region = core_clear_region, - .get_resync_work = core_get_resync_work, - .complete_resync_work = core_complete_resync_work, -- .get_sync_count = core_get_sync_count -+ .get_sync_count = core_get_sync_count, -+ .status = disk_status, -+ .get_failure_response = core_get_failure_response, - }; - - int __init dm_dirty_log_init(void) -diff -pruN ./drivers/md.dm/dm-log.h ./drivers/md/dm-log.h ---- ./drivers/md.dm/dm-log.h 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-log.h 2006-03-17 13:16:38.000000000 +0300 -@@ -9,6 +9,15 @@ - - #include "dm.h" - -+/* -+ * Values returned by get_failure_response() -+ * DMLOG_IOERR_IGNORE: ignore device failures -+ * DMLOG_IOERR_BLOCK: issue dm event, and do not complete -+ * I/O until presuspend is recieved. -+ */ -+#define DMLOG_IOERR_IGNORE 0 -+#define DMLOG_IOERR_BLOCK 1 -+ - typedef sector_t region_t; - - struct dirty_log_type; -@@ -32,7 +41,8 @@ struct dirty_log_type { - * There are times when we don't want the log to touch - * the disk. - */ -- int (*suspend)(struct dirty_log *log); -+ int (*presuspend)(struct dirty_log *log); -+ int (*postsuspend)(struct dirty_log *log); - int (*resume)(struct dirty_log *log); - - /* -@@ -48,6 +58,16 @@ struct dirty_log_type { - int (*is_clean)(struct dirty_log *log, region_t region); - - /* -+ * Returns: 0, 1 -+ * -+ * This is necessary for cluster mirroring. It provides -+ * a way to detect recovery on another node, so we -+ * aren't writing concurrently. This function is likely -+ * to block (when a cluster log is used). -+ */ -+ int (*is_remote_recovering)(struct dirty_log *log, region_t region); -+ -+ /* - * Returns: 0, 1, -EWOULDBLOCK, < 0 - * - * A predicate function to check the area given by -@@ -101,6 +121,18 @@ struct dirty_log_type { - * Returns the number of regions that are in sync. - */ - region_t (*get_sync_count)(struct dirty_log *log); -+ -+ /* -+ * Support function for mirror status requests. -+ */ -+ int (*status)(struct dirty_log *log, status_type_t status_type, -+ char *result, unsigned int maxlen); -+ -+ /* -+ * Return the code describing what to do in the event -+ * of a device failure. 
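 *
 * (One of DMLOG_IOERR_IGNORE or DMLOG_IOERR_BLOCK above, chosen when
 * the log is constructed with or without the "block_on_error" arg.)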
-+ */ -+ int (*get_failure_response)(struct dirty_log *log); - }; - - int dm_register_dirty_log_type(struct dirty_log_type *type); -diff -pruN ./drivers/md.dm/dm-mpath.c ./drivers/md/dm-mpath.c ---- ./drivers/md.dm/dm-mpath.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-mpath.c 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,1342 @@ -+/* -+ * Copyright (C) 2003 Sistina Software Limited. -+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ */ -+ -+#include "dm.h" -+#include "dm-path-selector.h" -+#include "dm-hw-handler.h" -+#include "dm-bio-list.h" -+#include "dm-bio-record.h" -+ -+#include <linux/ctype.h> -+#include <linux/init.h> -+#include <linux/mempool.h> -+#include <linux/module.h> -+#include <linux/pagemap.h> -+#include <linux/slab.h> -+#include <linux/time.h> -+#include <linux/workqueue.h> -+#include <asm/atomic.h> -+ -+#define MESG_STR(x) x, sizeof(x) -+ -+/* Path properties */ -+struct pgpath { -+ struct list_head list; -+ -+ struct priority_group *pg; /* Owning PG */ -+ unsigned fail_count; /* Cumulative failure count */ -+ -+ struct path path; -+}; -+ -+#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) -+ -+/* -+ * Paths are grouped into Priority Groups and numbered from 1 upwards. -+ * Each has a path selector which controls which path gets used. -+ */ -+struct priority_group { -+ struct list_head list; -+ -+ struct multipath *m; /* Owning multipath instance */ -+ struct path_selector ps; -+ -+ unsigned pg_num; /* Reference number */ -+ unsigned bypassed; /* Temporarily bypass this PG? */ -+ -+ unsigned nr_pgpaths; /* Number of paths in PG */ -+ struct list_head pgpaths; -+}; -+ -+/* Multipath context */ -+struct multipath { -+ struct list_head list; -+ struct dm_target *ti; -+ -+ spinlock_t lock; -+ -+ struct hw_handler hw_handler; -+ unsigned nr_priority_groups; -+ struct list_head priority_groups; -+ unsigned pg_init_required; /* pg_init needs calling? */ -+ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ -+ -+ unsigned nr_valid_paths; /* Total number of usable paths */ -+ struct pgpath *current_pgpath; -+ struct priority_group *current_pg; -+ struct priority_group *next_pg; /* Switch to this PG if set */ -+ unsigned repeat_count; /* I/Os left before calling PS again */ -+ -+ unsigned queue_io; /* Must we queue all I/O? */ -+ unsigned queue_if_no_path; /* Queue I/O if last path fails? */ -+ unsigned saved_queue_if_no_path;/* Saved state during suspension */ -+ -+ struct work_struct process_queued_ios; -+ struct bio_list queued_ios; -+ unsigned queue_size; -+ -+ struct work_struct trigger_event; -+ -+ /* -+ * We must use a mempool of mpath_io structs so that we -+ * can resubmit bios on error. -+ */ -+ mempool_t *mpio_pool; -+}; -+ -+/* -+ * Context information attached to each bio we process. 
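 *
 * One of these is allocated from mpio_pool for every bio we map; the
 * original bio fields are recorded so that do_end_io() can
 * dm_bio_restore() the bio and requeue it down another path on error.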
-+ */ -+struct mpath_io { -+ struct pgpath *pgpath; -+ struct dm_bio_details details; -+}; -+ -+typedef int (*action_fn) (struct pgpath *pgpath); -+ -+#define MIN_IOS 256 /* Mempool size */ -+ -+static kmem_cache_t *_mpio_cache; -+ -+struct workqueue_struct *kmultipathd; -+static void process_queued_ios(void *data); -+static void trigger_event(void *data); -+ -+ -+/*----------------------------------------------- -+ * Allocation routines -+ *-----------------------------------------------*/ -+ -+static struct pgpath *alloc_pgpath(void) -+{ -+ struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); -+ -+ if (pgpath) { -+ memset(pgpath, 0, sizeof(*pgpath)); -+ pgpath->path.is_active = 1; -+ } -+ -+ return pgpath; -+} -+ -+static inline void free_pgpath(struct pgpath *pgpath) -+{ -+ kfree(pgpath); -+} -+ -+static struct priority_group *alloc_priority_group(void) -+{ -+ struct priority_group *pg; -+ -+ pg = kmalloc(sizeof(*pg), GFP_KERNEL); -+ if (!pg) -+ return NULL; -+ -+ memset(pg, 0, sizeof(*pg)); -+ INIT_LIST_HEAD(&pg->pgpaths); -+ -+ return pg; -+} -+ -+static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) -+{ -+ struct pgpath *pgpath, *tmp; -+ -+ list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { -+ list_del(&pgpath->list); -+ dm_put_device(ti, pgpath->path.dev); -+ free_pgpath(pgpath); -+ } -+} -+ -+static void free_priority_group(struct priority_group *pg, -+ struct dm_target *ti) -+{ -+ struct path_selector *ps = &pg->ps; -+ -+ if (ps->type) { -+ ps->type->destroy(ps); -+ dm_put_path_selector(ps->type); -+ } -+ -+ free_pgpaths(&pg->pgpaths, ti); -+ kfree(pg); -+} -+ -+static struct multipath *alloc_multipath(void) -+{ -+ struct multipath *m; -+ -+ m = kmalloc(sizeof(*m), GFP_KERNEL); -+ if (m) { -+ memset(m, 0, sizeof(*m)); -+ INIT_LIST_HEAD(&m->priority_groups); -+ spin_lock_init(&m->lock); -+ m->queue_io = 1; -+ INIT_WORK(&m->process_queued_ios, process_queued_ios, m); -+ INIT_WORK(&m->trigger_event, trigger_event, m); -+ m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, -+ mempool_free_slab, _mpio_cache); -+ if (!m->mpio_pool) { -+ kfree(m); -+ return NULL; -+ } -+ } -+ -+ return m; -+} -+ -+static void free_multipath(struct multipath *m) -+{ -+ struct priority_group *pg, *tmp; -+ struct hw_handler *hwh = &m->hw_handler; -+ -+ list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { -+ list_del(&pg->list); -+ free_priority_group(pg, m->ti); -+ } -+ -+ if (hwh->type) { -+ hwh->type->destroy(hwh); -+ dm_put_hw_handler(hwh->type); -+ } -+ -+ mempool_destroy(m->mpio_pool); -+ kfree(m); -+} -+ -+ -+/*----------------------------------------------- -+ * Path selection -+ *-----------------------------------------------*/ -+ -+static void __switch_pg(struct multipath *m, struct pgpath *pgpath) -+{ -+ struct hw_handler *hwh = &m->hw_handler; -+ -+ m->current_pg = pgpath->pg; -+ -+ /* Must we initialise the PG first, and queue I/O till it's ready? 
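 *
 * If so, bios are queued until the hardware handler completes and
 * calls dm_pg_init_complete(), which clears queue_io and reschedules
 * process_queued_ios().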
*/ -+ if (hwh->type && hwh->type->pg_init) { -+ m->pg_init_required = 1; -+ m->queue_io = 1; -+ } else { -+ m->pg_init_required = 0; -+ m->queue_io = 0; -+ } -+} -+ -+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) -+{ -+ struct path *path; -+ -+ path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); -+ if (!path) -+ return -ENXIO; -+ -+ m->current_pgpath = path_to_pgpath(path); -+ -+ if (m->current_pg != pg) -+ __switch_pg(m, m->current_pgpath); -+ -+ return 0; -+} -+ -+static void __choose_pgpath(struct multipath *m) -+{ -+ struct priority_group *pg; -+ unsigned bypassed = 1; -+ -+ if (!m->nr_valid_paths) -+ goto failed; -+ -+ /* Were we instructed to switch PG? */ -+ if (m->next_pg) { -+ pg = m->next_pg; -+ m->next_pg = NULL; -+ if (!__choose_path_in_pg(m, pg)) -+ return; -+ } -+ -+ /* Don't change PG until it has no remaining paths */ -+ if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) -+ return; -+ -+ /* -+ * Loop through priority groups until we find a valid path. -+ * First time we skip PGs marked 'bypassed'. -+ * Second time we only try the ones we skipped. -+ */ -+ do { -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ if (pg->bypassed == bypassed) -+ continue; -+ if (!__choose_path_in_pg(m, pg)) -+ return; -+ } -+ } while (bypassed--); -+ -+failed: -+ m->current_pgpath = NULL; -+ m->current_pg = NULL; -+} -+ -+static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, -+ unsigned was_queued) -+{ -+ int r = 1; -+ unsigned long flags; -+ struct pgpath *pgpath; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ /* Do we need to select a new pgpath? */ -+ if (!m->current_pgpath || -+ (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) -+ __choose_pgpath(m); -+ -+ pgpath = m->current_pgpath; -+ -+ if (was_queued) -+ m->queue_size--; -+ -+ if ((pgpath && m->queue_io) || -+ (!pgpath && m->queue_if_no_path)) { -+ /* Queue for the daemon to resubmit */ -+ bio_list_add(&m->queued_ios, bio); -+ m->queue_size++; -+ if ((m->pg_init_required && !m->pg_init_in_progress) || -+ !m->queue_io) -+ queue_work(kmultipathd, &m->process_queued_ios); -+ pgpath = NULL; -+ r = 0; -+ } else if (!pgpath) -+ r = -EIO; /* Failed */ -+ else -+ bio->bi_bdev = pgpath->path.dev->bdev; -+ -+ mpio->pgpath = pgpath; -+ -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return r; -+} -+ -+/* -+ * If we run out of usable paths, should we queue I/O or error it? -+ */ -+static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, -+ unsigned save_old_value) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ if (save_old_value) -+ m->saved_queue_if_no_path = m->queue_if_no_path; -+ else -+ m->saved_queue_if_no_path = queue_if_no_path; -+ m->queue_if_no_path = queue_if_no_path; -+ if (!m->queue_if_no_path && m->queue_size) -+ queue_work(kmultipathd, &m->process_queued_ios); -+ -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return 0; -+} -+ -+/*----------------------------------------------------------------- -+ * The multipath daemon is responsible for resubmitting queued ios. 
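 *
 * process_queued_ios() picks a usable path (starting pg_init first if
 * the hardware handler requires it); unless the I/O must stay queued,
 * dispatch_queued_ios() then remaps each held bio with map_io() and
 * resubmits it via generic_make_request().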
-+ *---------------------------------------------------------------*/ -+ -+static void dispatch_queued_ios(struct multipath *m) -+{ -+ int r; -+ unsigned long flags; -+ struct bio *bio = NULL, *next; -+ struct mpath_io *mpio; -+ union map_info *info; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ bio = bio_list_get(&m->queued_ios); -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ while (bio) { -+ next = bio->bi_next; -+ bio->bi_next = NULL; -+ -+ info = dm_get_mapinfo(bio); -+ mpio = info->ptr; -+ -+ r = map_io(m, bio, mpio, 1); -+ if (r < 0) -+ bio_endio(bio, bio->bi_size, r); -+ else if (r == 1) -+ generic_make_request(bio); -+ -+ bio = next; -+ } -+} -+ -+static void process_queued_ios(void *data) -+{ -+ struct multipath *m = (struct multipath *) data; -+ struct hw_handler *hwh = &m->hw_handler; -+ struct pgpath *pgpath = NULL; -+ unsigned init_required = 0, must_queue = 1; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ if (!m->queue_size) -+ goto out; -+ -+ if (!m->current_pgpath) -+ __choose_pgpath(m); -+ -+ pgpath = m->current_pgpath; -+ -+ if ((pgpath && !m->queue_io) || -+ (!pgpath && !m->queue_if_no_path)) -+ must_queue = 0; -+ -+ if (m->pg_init_required && !m->pg_init_in_progress) { -+ m->pg_init_required = 0; -+ m->pg_init_in_progress = 1; -+ init_required = 1; -+ } -+ -+out: -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ if (init_required) -+ hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path); -+ -+ if (!must_queue) -+ dispatch_queued_ios(m); -+} -+ -+/* -+ * An event is triggered whenever a path is taken out of use. -+ * Includes path failure and PG bypass. -+ */ -+static void trigger_event(void *data) -+{ -+ struct multipath *m = (struct multipath *) data; -+ -+ dm_table_event(m->ti->table); -+} -+ -+/*----------------------------------------------------------------- -+ * Constructor/argument parsing: -+ * <#multipath feature args> [<arg>]* -+ * <#hw_handler args> [hw_handler [<arg>]*] -+ * <#priority groups> -+ * <initial priority group> -+ * [<selector> <#selector args> [<arg>]* -+ * <#paths> <#per-path selector args> -+ * [<path> [<arg>]* ]+ ]+ -+ *---------------------------------------------------------------*/ -+struct param { -+ unsigned min; -+ unsigned max; -+ char *error; -+}; -+ -+#define ESTR(s) ("dm-multipath: " s) -+ -+static int read_param(struct param *param, char *str, unsigned *v, char **error) -+{ -+ if (!str || -+ (sscanf(str, "%u", v) != 1) || -+ (*v < param->min) || -+ (*v > param->max)) { -+ *error = param->error; -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+struct arg_set { -+ unsigned argc; -+ char **argv; -+}; -+ -+static char *shift(struct arg_set *as) -+{ -+ char *r; -+ -+ if (as->argc) { -+ as->argc--; -+ r = *as->argv; -+ as->argv++; -+ return r; -+ } -+ -+ return NULL; -+} -+ -+static void consume(struct arg_set *as, unsigned n) -+{ -+ BUG_ON (as->argc < n); -+ as->argc -= n; -+ as->argv += n; -+} -+ -+static int parse_path_selector(struct arg_set *as, struct priority_group *pg, -+ struct dm_target *ti) -+{ -+ int r; -+ struct path_selector_type *pst; -+ unsigned ps_argc; -+ -+ static struct param _params[] = { -+ {0, 1024, ESTR("invalid number of path selector args")}, -+ }; -+ -+ pst = dm_get_path_selector(shift(as)); -+ if (!pst) { -+ ti->error = ESTR("unknown path selector type"); -+ return -EINVAL; -+ } -+ -+ r = read_param(_params, shift(as), &ps_argc, &ti->error); -+ if (r) -+ return -EINVAL; -+ -+ r = pst->create(&pg->ps, ps_argc, as->argv); -+ if (r) { -+ dm_put_path_selector(pst); -+ ti->error = 
ESTR("path selector constructor failed"); -+ return r; -+ } -+ -+ pg->ps.type = pst; -+ consume(as, ps_argc); -+ -+ return 0; -+} -+ -+static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, -+ struct dm_target *ti) -+{ -+ int r; -+ struct pgpath *p; -+ -+ /* we need at least a path arg */ -+ if (as->argc < 1) { -+ ti->error = ESTR("no device given"); -+ return NULL; -+ } -+ -+ p = alloc_pgpath(); -+ if (!p) -+ return NULL; -+ -+ r = dm_get_device(ti, shift(as), ti->begin, ti->len, -+ dm_table_get_mode(ti->table), &p->path.dev); -+ if (r) { -+ ti->error = ESTR("error getting device"); -+ goto bad; -+ } -+ -+ r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); -+ if (r) { -+ dm_put_device(ti, p->path.dev); -+ goto bad; -+ } -+ -+ return p; -+ -+ bad: -+ free_pgpath(p); -+ return NULL; -+} -+ -+static struct priority_group *parse_priority_group(struct arg_set *as, -+ struct multipath *m, -+ struct dm_target *ti) -+{ -+ static struct param _params[] = { -+ {1, 1024, ESTR("invalid number of paths")}, -+ {0, 1024, ESTR("invalid number of selector args")} -+ }; -+ -+ int r; -+ unsigned i, nr_selector_args, nr_params; -+ struct priority_group *pg; -+ -+ if (as->argc < 2) { -+ as->argc = 0; -+ ti->error = ESTR("not enough priority group aruments"); -+ return NULL; -+ } -+ -+ pg = alloc_priority_group(); -+ if (!pg) { -+ ti->error = ESTR("couldn't allocate priority group"); -+ return NULL; -+ } -+ pg->m = m; -+ -+ r = parse_path_selector(as, pg, ti); -+ if (r) -+ goto bad; -+ -+ /* -+ * read the paths -+ */ -+ r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); -+ if (r) -+ goto bad; -+ -+ r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); -+ if (r) -+ goto bad; -+ -+ nr_params = 1 + nr_selector_args; -+ for (i = 0; i < pg->nr_pgpaths; i++) { -+ struct pgpath *pgpath; -+ struct arg_set path_args; -+ -+ if (as->argc < nr_params) -+ goto bad; -+ -+ path_args.argc = nr_params; -+ path_args.argv = as->argv; -+ -+ pgpath = parse_path(&path_args, &pg->ps, ti); -+ if (!pgpath) -+ goto bad; -+ -+ pgpath->pg = pg; -+ list_add_tail(&pgpath->list, &pg->pgpaths); -+ consume(as, nr_params); -+ } -+ -+ return pg; -+ -+ bad: -+ free_priority_group(pg, ti); -+ return NULL; -+} -+ -+static int parse_hw_handler(struct arg_set *as, struct multipath *m, -+ struct dm_target *ti) -+{ -+ int r; -+ struct hw_handler_type *hwht; -+ unsigned hw_argc; -+ -+ static struct param _params[] = { -+ {0, 1024, ESTR("invalid number of hardware handler args")}, -+ }; -+ -+ r = read_param(_params, shift(as), &hw_argc, &ti->error); -+ if (r) -+ return -EINVAL; -+ -+ if (!hw_argc) -+ return 0; -+ -+ hwht = dm_get_hw_handler(shift(as)); -+ if (!hwht) { -+ ti->error = ESTR("unknown hardware handler type"); -+ return -EINVAL; -+ } -+ -+ r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); -+ if (r) { -+ dm_put_hw_handler(hwht); -+ ti->error = ESTR("hardware handler constructor failed"); -+ return r; -+ } -+ -+ m->hw_handler.type = hwht; -+ consume(as, hw_argc - 1); -+ -+ return 0; -+} -+ -+static int parse_features(struct arg_set *as, struct multipath *m, -+ struct dm_target *ti) -+{ -+ int r; -+ unsigned argc; -+ -+ static struct param _params[] = { -+ {0, 1, ESTR("invalid number of feature args")}, -+ }; -+ -+ r = read_param(_params, shift(as), &argc, &ti->error); -+ if (r) -+ return -EINVAL; -+ -+ if (!argc) -+ return 0; -+ -+ if (!strnicmp(shift(as), MESG_STR("queue_if_no_path"))) -+ return queue_if_no_path(m, 1, 0); -+ else { -+ ti->error = "Unrecognised 
multipath feature request"; -+ return -EINVAL; -+ } -+} -+ -+static int multipath_ctr(struct dm_target *ti, unsigned int argc, -+ char **argv) -+{ -+ /* target parameters */ -+ static struct param _params[] = { -+ {1, 1024, ESTR("invalid number of priority groups")}, -+ {1, 1024, ESTR("invalid initial priority group number")}, -+ }; -+ -+ int r; -+ struct multipath *m; -+ struct arg_set as; -+ unsigned pg_count = 0; -+ unsigned next_pg_num; -+ -+ as.argc = argc; -+ as.argv = argv; -+ -+ m = alloc_multipath(); -+ if (!m) { -+ ti->error = ESTR("can't allocate multipath"); -+ return -EINVAL; -+ } -+ -+ r = parse_features(&as, m, ti); -+ if (r) -+ goto bad; -+ -+ r = parse_hw_handler(&as, m, ti); -+ if (r) -+ goto bad; -+ -+ r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); -+ if (r) -+ goto bad; -+ -+ r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); -+ if (r) -+ goto bad; -+ -+ /* parse the priority groups */ -+ while (as.argc) { -+ struct priority_group *pg; -+ -+ pg = parse_priority_group(&as, m, ti); -+ if (!pg) { -+ r = -EINVAL; -+ goto bad; -+ } -+ -+ m->nr_valid_paths += pg->nr_pgpaths; -+ list_add_tail(&pg->list, &m->priority_groups); -+ pg_count++; -+ pg->pg_num = pg_count; -+ if (!--next_pg_num) -+ m->next_pg = pg; -+ } -+ -+ if (pg_count != m->nr_priority_groups) { -+ ti->error = ESTR("priority group count mismatch"); -+ r = -EINVAL; -+ goto bad; -+ } -+ -+ ti->private = m; -+ m->ti = ti; -+ -+ return 0; -+ -+ bad: -+ free_multipath(m); -+ return r; -+} -+ -+static void multipath_dtr(struct dm_target *ti) -+{ -+ struct multipath *m = (struct multipath *) ti->private; -+ -+ flush_workqueue(kmultipathd); -+ free_multipath(m); -+} -+ -+/* -+ * Map bios, recording original fields for later in case we have to resubmit -+ */ -+static int multipath_map(struct dm_target *ti, struct bio *bio, -+ union map_info *map_context) -+{ -+ int r; -+ struct mpath_io *mpio; -+ struct multipath *m = (struct multipath *) ti->private; -+ -+ if (bio_barrier(bio)) -+ return -EOPNOTSUPP; -+ -+ mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); -+ dm_bio_record(&mpio->details, bio); -+ -+ map_context->ptr = mpio; -+ bio->bi_rw |= (1 << BIO_RW_FAILFAST); -+ r = map_io(m, bio, mpio, 0); -+ if (r < 0) -+ mempool_free(mpio, m->mpio_pool); -+ -+ return r; -+} -+ -+/* -+ * Take a path out of use. 
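 *
 * Reached from do_end_io() after an I/O error, or explicitly through
 * the message interface, e.g. (device and path purely illustrative):
 *
 *	dmsetup message <mpath-dev> 0 fail_path 8:16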
-+ */ -+static int fail_path(struct pgpath *pgpath) -+{ -+ unsigned long flags; -+ struct multipath *m = pgpath->pg->m; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ if (!pgpath->path.is_active) -+ goto out; -+ -+ DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); -+ -+ pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); -+ pgpath->path.is_active = 0; -+ pgpath->fail_count++; -+ -+ m->nr_valid_paths--; -+ -+ if (pgpath == m->current_pgpath) -+ m->current_pgpath = NULL; -+ -+ queue_work(kmultipathd, &m->trigger_event); -+ -+out: -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return 0; -+} -+ -+/* -+ * Reinstate a previously-failed path -+ */ -+static int reinstate_path(struct pgpath *pgpath) -+{ -+ int r = 0; -+ unsigned long flags; -+ struct multipath *m = pgpath->pg->m; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ if (pgpath->path.is_active) -+ goto out; -+ -+ if (!pgpath->pg->ps.type) { -+ DMWARN("Reinstate path not supported by path selector %s", -+ pgpath->pg->ps.type->name); -+ r = -EINVAL; -+ goto out; -+ } -+ -+ r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); -+ if (r) -+ goto out; -+ -+ pgpath->path.is_active = 1; -+ -+ m->current_pgpath = NULL; -+ if (!m->nr_valid_paths++ && m->queue_size) -+ queue_work(kmultipathd, &m->process_queued_ios); -+ -+ queue_work(kmultipathd, &m->trigger_event); -+ -+out: -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return r; -+} -+ -+/* -+ * Fail or reinstate all paths that match the provided struct dm_dev. -+ */ -+static int action_dev(struct multipath *m, struct dm_dev *dev, -+ action_fn action) -+{ -+ int r = 0; -+ struct pgpath *pgpath; -+ struct priority_group *pg; -+ -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ list_for_each_entry(pgpath, &pg->pgpaths, list) { -+ if (pgpath->path.dev == dev) -+ r = action(pgpath); -+ } -+ } -+ -+ return r; -+} -+ -+/* -+ * Temporarily try to avoid having to use the specified PG -+ */ -+static void bypass_pg(struct multipath *m, struct priority_group *pg, -+ int bypassed) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ pg->bypassed = bypassed; -+ m->current_pgpath = NULL; -+ m->current_pg = NULL; -+ -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ queue_work(kmultipathd, &m->trigger_event); -+} -+ -+/* -+ * Switch to using the specified PG from the next I/O that gets mapped -+ */ -+static int switch_pg_num(struct multipath *m, const char *pgstr) -+{ -+ struct priority_group *pg; -+ unsigned pgnum; -+ unsigned long flags; -+ -+ if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || -+ (pgnum > m->nr_priority_groups)) { -+ DMWARN("invalid PG number supplied to switch_pg_num"); -+ return -EINVAL; -+ } -+ -+ spin_lock_irqsave(&m->lock, flags); -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ pg->bypassed = 0; -+ if (--pgnum) -+ continue; -+ -+ m->current_pgpath = NULL; -+ m->current_pg = NULL; -+ m->next_pg = pg; -+ } -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ queue_work(kmultipathd, &m->trigger_event); -+ return 0; -+} -+ -+/* -+ * Set/clear bypassed status of a PG. -+ * PGs are numbered upwards from 1 in the order they were declared. 
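 *
 * e.g. "dmsetup message <mpath-dev> 0 disable_group 2" bypasses PG 2,
 * "enable_group 2" clears the bypass and "switch_group 2" makes PG 2
 * the group used for subsequent I/O.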
-+ */ -+static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) -+{ -+ struct priority_group *pg; -+ unsigned pgnum; -+ -+ if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || -+ (pgnum > m->nr_priority_groups)) { -+ DMWARN("invalid PG number supplied to bypass_pg"); -+ return -EINVAL; -+ } -+ -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ if (!--pgnum) -+ break; -+ } -+ -+ bypass_pg(m, pg, bypassed); -+ return 0; -+} -+ -+/* -+ * pg_init must call this when it has completed its initialisation -+ */ -+void dm_pg_init_complete(struct path *path, unsigned err_flags) -+{ -+ struct pgpath *pgpath = path_to_pgpath(path); -+ struct priority_group *pg = pgpath->pg; -+ struct multipath *m = pg->m; -+ unsigned long flags; -+ -+ /* We insist on failing the path if the PG is already bypassed. */ -+ if (err_flags && pg->bypassed) -+ err_flags |= MP_FAIL_PATH; -+ -+ if (err_flags & MP_FAIL_PATH) -+ fail_path(pgpath); -+ -+ if (err_flags & MP_BYPASS_PG) -+ bypass_pg(m, pg, 1); -+ -+ spin_lock_irqsave(&m->lock, flags); -+ if (err_flags) { -+ m->current_pgpath = NULL; -+ m->current_pg = NULL; -+ } else if (!m->pg_init_required) -+ m->queue_io = 0; -+ -+ m->pg_init_in_progress = 0; -+ queue_work(kmultipathd, &m->process_queued_ios); -+ spin_unlock_irqrestore(&m->lock, flags); -+} -+ -+/* -+ * end_io handling -+ */ -+static int do_end_io(struct multipath *m, struct bio *bio, -+ int error, struct mpath_io *mpio) -+{ -+ struct hw_handler *hwh = &m->hw_handler; -+ unsigned err_flags = MP_FAIL_PATH; /* Default behavior */ -+ unsigned long flags; -+ -+ if (!error) -+ return 0; /* I/O complete */ -+ -+ if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) -+ return error; -+ -+ if (error == -EOPNOTSUPP) -+ return error; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ if (!m->nr_valid_paths) { -+ if (!m->queue_if_no_path) { -+ spin_unlock_irqrestore(&m->lock, flags); -+ return -EIO; -+ } else { -+ spin_unlock_irqrestore(&m->lock, flags); -+ goto requeue; -+ } -+ } -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ if (hwh->type && hwh->type->error) -+ err_flags = hwh->type->error(hwh, bio); -+ else -+ err_flags = dm_scsi_err_handler(hwh, bio); -+ -+ if (mpio->pgpath) { -+ if (err_flags & MP_FAIL_PATH) -+ fail_path(mpio->pgpath); -+ -+ if (err_flags & MP_BYPASS_PG) -+ bypass_pg(m, mpio->pgpath->pg, 1); -+ } -+ -+ if (err_flags & MP_ERROR_IO) -+ return -EIO; -+ -+ requeue: -+ dm_bio_restore(&mpio->details, bio); -+ -+ /* queue for the daemon to resubmit or fail */ -+ spin_lock_irqsave(&m->lock, flags); -+ bio_list_add(&m->queued_ios, bio); -+ m->queue_size++; -+ if (!m->queue_io) -+ queue_work(kmultipathd, &m->process_queued_ios); -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return 1; /* io not complete */ -+} -+ -+static int multipath_end_io(struct dm_target *ti, struct bio *bio, -+ int error, union map_info *map_context) -+{ -+ struct multipath *m = (struct multipath *) ti->private; -+ struct mpath_io *mpio = (struct mpath_io *) map_context->ptr; -+ struct pgpath *pgpath = mpio->pgpath; -+ struct path_selector *ps; -+ int r; -+ -+ r = do_end_io(m, bio, error, mpio); -+ if (pgpath) { -+ ps = &pgpath->pg->ps; -+ if (ps->type->end_io) -+ ps->type->end_io(ps, &pgpath->path); -+ } -+ if (r <= 0) -+ mempool_free(mpio, m->mpio_pool); -+ -+ return r; -+} -+ -+/* -+ * Suspend can't complete until all the I/O is processed so if -+ * the last path fails we must error any remaining I/O. 
-+ * Note that if the freeze_bdev fails while suspending, the -+ * queue_if_no_path state is lost - userspace should reset it. -+ */ -+static void multipath_presuspend(struct dm_target *ti) -+{ -+ struct multipath *m = (struct multipath *) ti->private; -+ -+ queue_if_no_path(m, 0, 1); -+} -+ -+/* -+ * Restore the queue_if_no_path setting. -+ */ -+static void multipath_resume(struct dm_target *ti) -+{ -+ struct multipath *m = (struct multipath *) ti->private; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ m->queue_if_no_path = m->saved_queue_if_no_path; -+ spin_unlock_irqrestore(&m->lock, flags); -+} -+ -+/* -+ * Info output has the following format: -+ * num_multipath_feature_args [multipath_feature_args]* -+ * num_handler_status_args [handler_status_args]* -+ * num_groups init_group_number -+ * [A|D|E num_ps_status_args [ps_status_args]* -+ * num_paths num_selector_args -+ * [path_dev A|F fail_count [selector_args]* ]+ ]+ -+ * -+ * Table output has the following format (identical to the constructor string): -+ * num_feature_args [features_args]* -+ * num_handler_args hw_handler [hw_handler_args]* -+ * num_groups init_group_number -+ * [priority selector-name num_ps_args [ps_args]* -+ * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ -+ */ -+static int multipath_status(struct dm_target *ti, status_type_t type, -+ char *result, unsigned int maxlen) -+{ -+ int sz = 0; -+ unsigned long flags; -+ struct multipath *m = (struct multipath *) ti->private; -+ struct hw_handler *hwh = &m->hw_handler; -+ struct priority_group *pg; -+ struct pgpath *p; -+ unsigned pg_num; -+ char state; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ /* Features */ -+ if (type == STATUSTYPE_INFO) -+ DMEMIT("1 %u ", m->queue_size); -+ else if (m->queue_if_no_path) -+ DMEMIT("1 queue_if_no_path "); -+ else -+ DMEMIT("0 "); -+ -+ if (hwh->type && hwh->type->status) -+ sz += hwh->type->status(hwh, type, result + sz, maxlen - sz); -+ else if (!hwh->type || type == STATUSTYPE_INFO) -+ DMEMIT("0 "); -+ else -+ DMEMIT("1 %s ", hwh->type->name); -+ -+ DMEMIT("%u ", m->nr_priority_groups); -+ -+ if (m->next_pg) -+ pg_num = m->next_pg->pg_num; -+ else if (m->current_pg) -+ pg_num = m->current_pg->pg_num; -+ else -+ pg_num = 1; -+ -+ DMEMIT("%u ", pg_num); -+ -+ switch (type) { -+ case STATUSTYPE_INFO: -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ if (pg->bypassed) -+ state = 'D'; /* Disabled */ -+ else if (pg == m->current_pg) -+ state = 'A'; /* Currently Active */ -+ else -+ state = 'E'; /* Enabled */ -+ -+ DMEMIT("%c ", state); -+ -+ if (pg->ps.type->status) -+ sz += pg->ps.type->status(&pg->ps, NULL, type, -+ result + sz, -+ maxlen - sz); -+ else -+ DMEMIT("0 "); -+ -+ DMEMIT("%u %u ", pg->nr_pgpaths, -+ pg->ps.type->info_args); -+ -+ list_for_each_entry(p, &pg->pgpaths, list) { -+ DMEMIT("%s %s %u ", p->path.dev->name, -+ p->path.is_active ? 
"A" : "F", -+ p->fail_count); -+ if (pg->ps.type->status) -+ sz += pg->ps.type->status(&pg->ps, -+ &p->path, type, result + sz, -+ maxlen - sz); -+ } -+ } -+ break; -+ -+ case STATUSTYPE_TABLE: -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ DMEMIT("%s ", pg->ps.type->name); -+ -+ if (pg->ps.type->status) -+ sz += pg->ps.type->status(&pg->ps, NULL, type, -+ result + sz, -+ maxlen - sz); -+ else -+ DMEMIT("0 "); -+ -+ DMEMIT("%u %u ", pg->nr_pgpaths, -+ pg->ps.type->table_args); -+ -+ list_for_each_entry(p, &pg->pgpaths, list) { -+ DMEMIT("%s ", p->path.dev->name); -+ if (pg->ps.type->status) -+ sz += pg->ps.type->status(&pg->ps, -+ &p->path, type, result + sz, -+ maxlen - sz); -+ } -+ } -+ break; -+ } -+ -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return 0; -+} -+ -+static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) -+{ -+ int r; -+ struct dm_dev *dev; -+ struct multipath *m = (struct multipath *) ti->private; -+ action_fn action; -+ -+ if (argc == 1) { -+ if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) -+ return queue_if_no_path(m, 1, 0); -+ else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) -+ return queue_if_no_path(m, 0, 0); -+ } -+ -+ if (argc != 2) -+ goto error; -+ -+ if (!strnicmp(argv[0], MESG_STR("disable_group"))) -+ return bypass_pg_num(m, argv[1], 1); -+ else if (!strnicmp(argv[0], MESG_STR("enable_group"))) -+ return bypass_pg_num(m, argv[1], 0); -+ else if (!strnicmp(argv[0], MESG_STR("switch_group"))) -+ return switch_pg_num(m, argv[1]); -+ else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) -+ action = reinstate_path; -+ else if (!strnicmp(argv[0], MESG_STR("fail_path"))) -+ action = fail_path; -+ else -+ goto error; -+ -+ r = dm_get_device(ti, argv[1], ti->begin, ti->len, -+ dm_table_get_mode(ti->table), &dev); -+ if (r) { -+ DMWARN("dm-multipath message: error getting device %s", -+ argv[1]); -+ return -EINVAL; -+ } -+ -+ r = action_dev(m, dev, action); -+ -+ dm_put_device(ti, dev); -+ -+ return r; -+ -+error: -+ DMWARN("Unrecognised multipath message received."); -+ return -EINVAL; -+} -+ -+/*----------------------------------------------------------------- -+ * Module setup -+ *---------------------------------------------------------------*/ -+static struct target_type multipath_target = { -+ .name = "multipath", -+ .version = {1, 0, 4}, -+ .module = THIS_MODULE, -+ .ctr = multipath_ctr, -+ .dtr = multipath_dtr, -+ .map = multipath_map, -+ .end_io = multipath_end_io, -+ .presuspend = multipath_presuspend, -+ .resume = multipath_resume, -+ .status = multipath_status, -+ .message = multipath_message, -+}; -+ -+static int __init dm_multipath_init(void) -+{ -+ int r; -+ -+ /* allocate a slab for the dm_ios */ -+ _mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io), -+ 0, 0, NULL, NULL); -+ if (!_mpio_cache) -+ return -ENOMEM; -+ -+ r = dm_register_target(&multipath_target); -+ if (r < 0) { -+ DMERR("%s: register failed %d", multipath_target.name, r); -+ kmem_cache_destroy(_mpio_cache); -+ return -EINVAL; -+ } -+ -+ kmultipathd = create_workqueue("kmpathd"); -+ if (!kmultipathd) { -+ DMERR("%s: failed to create workqueue kmpathd", -+ multipath_target.name); -+ dm_unregister_target(&multipath_target); -+ kmem_cache_destroy(_mpio_cache); -+ return -ENOMEM; -+ } -+ -+ DMINFO("dm-multipath version %u.%u.%u loaded", -+ multipath_target.version[0], multipath_target.version[1], -+ multipath_target.version[2]); -+ -+ return r; -+} -+ -+static void __exit dm_multipath_exit(void) -+{ -+ int r; -+ -+ 
destroy_workqueue(kmultipathd); -+ -+ r = dm_unregister_target(&multipath_target); -+ if (r < 0) -+ DMERR("%s: target unregister failed %d", -+ multipath_target.name, r); -+ kmem_cache_destroy(_mpio_cache); -+} -+ -+EXPORT_SYMBOL_GPL(dm_pg_init_complete); -+ -+module_init(dm_multipath_init); -+module_exit(dm_multipath_exit); -+ -+MODULE_DESCRIPTION(DM_NAME " multipath target"); -+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); -+MODULE_LICENSE("GPL"); -diff -pruN ./drivers/md.dm/dm-mpath.h ./drivers/md/dm-mpath.h ---- ./drivers/md.dm/dm-mpath.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-mpath.h 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,25 @@ -+/* -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ * -+ * Multipath. -+ */ -+ -+#ifndef DM_MPATH_H -+#define DM_MPATH_H -+ -+struct dm_dev; -+ -+struct path { -+ struct dm_dev *dev; /* Read-only */ -+ unsigned is_active; /* Read-only */ -+ -+ void *pscontext; /* For path-selector use */ -+ void *hwhcontext; /* For hw-handler use */ -+}; -+ -+/* Callback for hwh_pg_init_fn to use when complete */ -+void dm_pg_init_complete(struct path *path, unsigned err_flags); -+ -+#endif -diff -pruN ./drivers/md.dm/dm-path-selector.c ./drivers/md/dm-path-selector.c ---- ./drivers/md.dm/dm-path-selector.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-path-selector.c 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,156 @@ -+/* -+ * Copyright (C) 2003 Sistina Software. -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * Module Author: Heinz Mauelshagen -+ * -+ * This file is released under the GPL. -+ * -+ * Path selector registration. -+ */ -+ -+#include "dm.h" -+#include "dm-path-selector.h" -+ -+#include <linux/slab.h> -+ -+struct ps_internal { -+ struct path_selector_type pst; -+ -+ struct list_head list; -+ long use; -+}; -+ -+#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) -+ -+static LIST_HEAD(_path_selectors); -+static DECLARE_RWSEM(_ps_lock); -+ -+struct ps_internal *__find_path_selector_type(const char *name) -+{ -+ struct ps_internal *psi; -+ -+ list_for_each_entry(psi, &_path_selectors, list) { -+ if (!strcmp(name, psi->pst.name)) -+ return psi; -+ } -+ -+ return NULL; -+} -+ -+static struct ps_internal *get_path_selector(const char *name) -+{ -+ struct ps_internal *psi; -+ -+ down_read(&_ps_lock); -+ psi = __find_path_selector_type(name); -+ if (psi) { -+ if ((psi->use == 0) && !try_module_get(psi->pst.module)) -+ psi = NULL; -+ else -+ psi->use++; -+ } -+ up_read(&_ps_lock); -+ -+ return psi; -+} -+ -+struct path_selector_type *dm_get_path_selector(const char *name) -+{ -+ struct ps_internal *psi; -+ -+ if (!name) -+ return NULL; -+ -+ psi = get_path_selector(name); -+ if (!psi) { -+ request_module("dm-%s", name); -+ psi = get_path_selector(name); -+ } -+ -+ return psi ? 
&psi->pst : NULL; -+} -+ -+void dm_put_path_selector(struct path_selector_type *pst) -+{ -+ struct ps_internal *psi; -+ -+ if (!pst) -+ return; -+ -+ down_read(&_ps_lock); -+ psi = __find_path_selector_type(pst->name); -+ if (!psi) -+ goto out; -+ -+ if (--psi->use == 0) -+ module_put(psi->pst.module); -+ -+ if (psi->use < 0) -+ BUG(); -+ -+out: -+ up_read(&_ps_lock); -+} -+ -+static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) -+{ -+ struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL); -+ -+ if (psi) { -+ memset(psi, 0, sizeof(*psi)); -+ psi->pst = *pst; -+ } -+ -+ return psi; -+} -+ -+int dm_register_path_selector(struct path_selector_type *pst) -+{ -+ int r = 0; -+ struct ps_internal *psi = _alloc_path_selector(pst); -+ -+ if (!psi) -+ return -ENOMEM; -+ -+ down_write(&_ps_lock); -+ -+ if (__find_path_selector_type(pst->name)) { -+ kfree(psi); -+ r = -EEXIST; -+ } else -+ list_add(&psi->list, &_path_selectors); -+ -+ up_write(&_ps_lock); -+ -+ return r; -+} -+ -+int dm_unregister_path_selector(struct path_selector_type *pst) -+{ -+ struct ps_internal *psi; -+ -+ down_write(&_ps_lock); -+ -+ psi = __find_path_selector_type(pst->name); -+ if (!psi) { -+ up_write(&_ps_lock); -+ return -EINVAL; -+ } -+ -+ if (psi->use) { -+ up_write(&_ps_lock); -+ return -ETXTBSY; -+ } -+ -+ list_del(&psi->list); -+ -+ up_write(&_ps_lock); -+ -+ kfree(psi); -+ -+ return 0; -+} -+ -+EXPORT_SYMBOL_GPL(dm_register_path_selector); -+EXPORT_SYMBOL_GPL(dm_unregister_path_selector); -diff -pruN ./drivers/md.dm/dm-path-selector.h ./drivers/md/dm-path-selector.h ---- ./drivers/md.dm/dm-path-selector.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-path-selector.h 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,93 @@ -+/* -+ * Copyright (C) 2003 Sistina Software. -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * Module Author: Heinz Mauelshagen -+ * -+ * This file is released under the GPL. -+ * -+ * Path-Selector registration. -+ */ -+ -+#ifndef DM_PATH_SELECTOR_H -+#define DM_PATH_SELECTOR_H -+ -+#include <linux/device-mapper.h> -+ -+#include "dm-mpath.h" -+ -+/* -+ * We provide an abstraction for the code that chooses which path -+ * to send some io down. -+ */ -+struct path_selector_type; -+struct path_selector { -+ struct path_selector_type *type; -+ void *context; -+}; -+ -+/* Information about a path selector type */ -+struct path_selector_type { -+ char *name; -+ struct module *module; -+ -+ unsigned int table_args; -+ unsigned int info_args; -+ -+ /* -+ * Constructs a path selector object, takes custom arguments -+ */ -+ int (*create) (struct path_selector *ps, unsigned argc, char **argv); -+ void (*destroy) (struct path_selector *ps); -+ -+ /* -+ * Add an opaque path object, along with some selector specific -+ * path args (eg, path priority). -+ */ -+ int (*add_path) (struct path_selector *ps, struct path *path, -+ int argc, char **argv, char **error); -+ -+ /* -+ * Chooses a path for this io, if no paths are available then -+ * NULL will be returned. -+ * -+ * repeat_count is the number of times to use the path before -+ * calling the function again. 0 means don't call it again unless -+ * the path fails. -+ */ -+ struct path *(*select_path) (struct path_selector *ps, -+ unsigned *repeat_count); -+ -+ /* -+ * Notify the selector that a path has failed. -+ */ -+ void (*fail_path) (struct path_selector *ps, struct path *p); -+ -+ /* -+ * Ask selector to reinstate a path. 
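 *
 * Returns 0 on success; multipath's reinstate_path() propagates any
 * error and leaves the path failed.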
-+ */
-+ int (*reinstate_path) (struct path_selector *ps, struct path *p);
-+
-+ /*
-+ * Table content based on parameters added in ps_add_path_fn
-+ * or path selector status
-+ */
-+ int (*status) (struct path_selector *ps, struct path *path,
-+ status_type_t type, char *result, unsigned int maxlen);
-+
-+ int (*end_io) (struct path_selector *ps, struct path *path);
-+};
-+
-+/* Register a path selector */
-+int dm_register_path_selector(struct path_selector_type *type);
-+
-+/* Unregister a path selector */
-+int dm_unregister_path_selector(struct path_selector_type *type);
-+
-+/* Returns a registered path selector type */
-+struct path_selector_type *dm_get_path_selector(const char *name);
-+
-+/* Releases a path selector */
-+void dm_put_path_selector(struct path_selector_type *pst);
-+
-+#endif
-diff -pruN ./drivers/md.dm/dm-raid1.c ./drivers/md/dm-raid1.c
---- ./drivers/md.dm/dm-raid1.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-raid1.c 2006-03-17 13:16:38.000000000 +0300
-@@ -6,6 +6,7 @@
- 
- #include "dm.h"
- #include "dm-bio-list.h"
-+#include "dm-bio-record.h"
- #include "dm-io.h"
- #include "dm-log.h"
- #include "kcopyd.h"
-@@ -28,6 +29,8 @@ static inline void wake(void)
- queue_work(_kmirrord_wq, &_kmirrord_work);
- }
- 
-+static struct workqueue_struct *_kmir_mon_wq;
-+
- /*-----------------------------------------------------------------
- * Region hash
- *
-@@ -67,7 +70,7 @@ static inline void wake(void)
- struct mirror_set;
- struct region_hash {
- struct mirror_set *ms;
-- sector_t region_size;
-+ uint32_t region_size;
- unsigned region_shift;
- 
- /* holds persistent region state */
-@@ -135,7 +138,7 @@ static void region_free(void *element, v
- #define MIN_REGIONS 64
- #define MAX_RECOVERY 1
- static int rh_init(struct region_hash *rh, struct mirror_set *ms,
-- struct dirty_log *log, sector_t region_size,
-+ struct dirty_log *log, uint32_t region_size,
- region_t nr_regions)
- {
- unsigned int nr_buckets, max_buckets;
-@@ -253,9 +256,9 @@ static struct region *__rh_alloc(struct
- else {
- __rh_insert(rh, nreg);
- if (nreg->state == RH_CLEAN) {
-- spin_lock_irq(&rh->region_lock);
-+ spin_lock(&rh->region_lock);
- list_add(&nreg->list, &rh->clean_regions);
-- spin_unlock_irq(&rh->region_lock);
-+ spin_unlock(&rh->region_lock);
- }
- reg = nreg;
- }
-@@ -375,16 +378,19 @@ static void rh_inc(struct region_hash *r
- 
- read_lock(&rh->hash_lock);
- reg = __rh_find(rh, region);
-- if (reg->state == RH_CLEAN) {
-- rh->log->type->mark_region(rh->log, reg->key);
- 
-- spin_lock_irq(&rh->region_lock);
-+ spin_lock_irq(&rh->region_lock);
-+ atomic_inc(&reg->pending);
-+
-+ if (reg->state == RH_CLEAN) {
- reg->state = RH_DIRTY;
- list_del_init(&reg->list); /* take off the clean list */
- spin_unlock_irq(&rh->region_lock);
-- }
- 
-- atomic_inc(&reg->pending);
-+ rh->log->type->mark_region(rh->log, reg->key);
-+ } else
-+ spin_unlock_irq(&rh->region_lock);
-+
- read_unlock(&rh->hash_lock);
- }
- 
-@@ -406,17 +412,17 @@ static void rh_dec(struct region_hash *r
- reg = __rh_lookup(rh, region);
- read_unlock(&rh->hash_lock);
- 
-+ spin_lock_irqsave(&rh->region_lock, flags);
- if (atomic_dec_and_test(&reg->pending)) {
-- spin_lock_irqsave(&rh->region_lock, flags);
- if (reg->state == RH_RECOVERING) {
- list_add_tail(&reg->list, &rh->quiesced_regions);
- } else {
- reg->state = RH_CLEAN;
- list_add(&reg->list, &rh->clean_regions);
- }
-- spin_unlock_irqrestore(&rh->region_lock, flags);
- should_wake = 1;
- }
-+ spin_unlock_irqrestore(&rh->region_lock, flags);
- 
- if (should_wake)
- wake();
-@@ -539,7 +545,8 @@ static void rh_start_recovery(struct reg
- * Mirror set structures.
- *---------------------------------------------------------------*/
- struct mirror {
-- atomic_t error_count;
-+ atomic_t error_count; /* Error counter to flag mirror failure */
-+ struct mirror_set *ms;
- struct dm_dev *dev;
- sector_t offset;
- };
-@@ -550,36 +557,59 @@ struct mirror_set {
- struct region_hash rh;
- struct kcopyd_client *kcopyd_client;
- 
-- spinlock_t lock; /* protects the next two lists */
-+ spinlock_t lock; /* protects the lists */
- struct bio_list reads;
- struct bio_list writes;
-+ struct bio_list failures;
-+ struct work_struct failure_work;
-+ struct completion failure_completion;
- 
- /* recovery */
-+ atomic_t suspended;
- region_t nr_regions;
- int in_sync;
- 
- unsigned int nr_mirrors;
-- struct mirror mirror[0];
-+ spinlock_t choose_lock; /* protects select in choose_mirror(). */
-+ atomic_t read_count; /* Read counter for read balancing. */
-+ unsigned int read_mirror; /* Last mirror read. */
-+ struct mirror *default_mirror; /* Default mirror. */
-+ struct mirror mirror[0];
- };
- 
-+struct bio_map_info {
-+ struct mirror *bmi_m;
-+ struct dm_bio_details bmi_bd;
-+};
-+
-+static mempool_t *bio_map_info_pool = NULL;
-+
-+static void *bio_map_info_alloc(int gfp_mask, void *pool_data){
-+ return kmalloc(sizeof(struct bio_map_info), gfp_mask);
-+}
-+
-+static void bio_map_info_free(void *element, void *pool_data){
-+ kfree(element);
-+}
-+
- /*
- * Every mirror should look like this one.
- */
- #define DEFAULT_MIRROR 0
- 
- /*
-- * This is yucky. We squirrel the mirror_set struct away inside
-- * bi_next for write buffers. This is safe since the bh
-+ * This is yucky. We squirrel the mirror struct away inside
-+ * bi_next for read/write buffers. This is safe since the bh
- * doesn't get submitted to the lower levels of block layer.
- */ --static struct mirror_set *bio_get_ms(struct bio *bio) -+static struct mirror *bio_get_m(struct bio *bio) - { -- return (struct mirror_set *) bio->bi_next; -+ return (struct mirror *) bio->bi_next; - } - --static void bio_set_ms(struct bio *bio, struct mirror_set *ms) -+static void bio_set_m(struct bio *bio, struct mirror *m) - { -- bio->bi_next = (struct bio *) ms; -+ bio->bi_next = (struct bio *) m; - } - - /*----------------------------------------------------------------- -@@ -607,7 +637,7 @@ static int recover(struct mirror_set *ms - unsigned long flags = 0; - - /* fill in the source */ -- m = ms->mirror + DEFAULT_MIRROR; -+ m = ms->default_mirror; - from.bdev = m->dev->bdev; - from.sector = m->offset + region_to_sector(reg->rh, reg->key); - if (reg->key == (ms->nr_regions - 1)) { -@@ -623,7 +653,7 @@ static int recover(struct mirror_set *ms - - /* fill in the destinations */ - for (i = 0, dest = to; i < ms->nr_mirrors; i++) { -- if (i == DEFAULT_MIRROR) -+ if (&ms->mirror[i] == ms->default_mirror) - continue; - - m = ms->mirror + i; -@@ -673,42 +703,163 @@ static void do_recovery(struct mirror_se - } - - /*----------------------------------------------------------------- -- * Reads -+ * Misc Functions - *---------------------------------------------------------------*/ --static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) -+#define MIN_READS 128 -+/* -+ * choose_mirror -+ * @ms: the mirror set -+ * @m: mirror that has failed, or NULL if just choosing -+ * -+ * Returns: chosen mirror, or NULL on failure -+ */ -+static struct mirror *choose_mirror(struct mirror_set *ms, struct mirror *m) - { -- /* FIXME: add read balancing */ -- return ms->mirror + DEFAULT_MIRROR; -+ int i, retry; -+ unsigned long flags; -+ struct mirror *ret = NULL; -+ -+ spin_lock_irqsave(&ms->choose_lock, flags); -+ -+ if (unlikely(m == ms->default_mirror)) { -+ i = DEFAULT_MIRROR; -+ atomic_set(&ms->read_count, MIN_READS); -+ } else -+ i = ms->read_mirror; -+ -+ for (retry = 0; retry < ms->nr_mirrors; ) { -+ i %= ms->nr_mirrors; -+ ret = ms->mirror + i; -+ -+ if (unlikely(atomic_read(&ret->error_count))) { -+ retry++; -+ i++; -+ } else { -+ /* -+ * Guarantee that a number of read IOs -+ * get queued to the same mirror. -+ */ -+ if (atomic_dec_and_test(&ms->read_count)) { -+ atomic_set(&ms->read_count, MIN_READS); -+ i++; -+ } -+ -+ ms->read_mirror = i; -+ break; -+ } -+ } -+ -+ /* Check for failure of default mirror, reset if necessary */ -+ if (unlikely(m == ms->default_mirror)) -+ ms->default_mirror = ret; -+ -+ spin_unlock_irqrestore(&ms->choose_lock, flags); -+ -+ if (unlikely(atomic_read(&ret->error_count))) { -+ DMERR("All mirror devices are dead. Unable to choose mirror."); -+ return NULL; -+ } -+ -+ return ret; -+} -+ -+static void fail_mirror(struct mirror *m) -+{ -+ DMINFO("incrementing error_count on %s", m->dev->name); -+ atomic_inc(&m->error_count); -+ -+ choose_mirror(m->ms, m); -+} -+ -+static int default_ok(struct mirror *m) -+{ -+ return !atomic_read(&m->ms->default_mirror->error_count); - } - - /* - * remap a buffer to a particular mirror. 
- */
--static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
-+static sector_t map_sector(struct mirror *m, struct bio *bio)
-+{
-+ return m->offset + (bio->bi_sector - m->ms->ti->begin);
-+}
-+
-+static void map_bio(struct mirror *m, struct bio *bio)
- {
- bio->bi_bdev = m->dev->bdev;
-- bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
-+ bio->bi_sector = map_sector(m, bio);
-+}
-+
-+static void map_region(struct io_region *io, struct mirror *m,
-+ struct bio *bio)
-+{
-+ io->bdev = m->dev->bdev;
-+ io->sector = map_sector(m, bio);
-+ io->count = bio->bi_size >> 9;
-+}
-+
-+/*-----------------------------------------------------------------
-+ * Reads
-+ *---------------------------------------------------------------*/
-+static void read_callback(unsigned long error, void *context)
-+{
-+ struct bio *bio = (struct bio *)context;
-+ struct mirror *m;
-+
-+ m = bio_get_m(bio);
-+ bio_set_m(bio, NULL);
-+
-+ if (unlikely(error)) {
-+ DMWARN("A read failure occurred on a mirror device.");
-+ fail_mirror(m);
-+ if (likely(default_ok(m))) {
-+ DMWARN("Trying different device.");
-+ queue_bio(m->ms, bio, bio_rw(bio));
-+ } else {
-+ DMERR("No other device available, failing I/O.");
-+ bio_endio(bio, 0, -EIO);
-+ }
-+ } else
-+ bio_endio(bio, bio->bi_size, 0);
-+}
-+
-+/* Asynchronous read. */
-+static void read_async_bio(struct mirror *m, struct bio *bio)
-+{
-+ struct io_region io;
-+
-+ map_region(&io, m, bio);
-+ bio_set_m(bio, m);
-+ dm_io_async_bvec(1, &io, READ,
-+ bio->bi_io_vec + bio->bi_idx,
-+ read_callback, bio);
- }
- 
- static void do_reads(struct mirror_set *ms, struct bio_list *reads)
- {
-- region_t region;
- struct bio *bio;
- struct mirror *m;
- 
- while ((bio = bio_list_pop(reads))) {
-- region = bio_to_region(&ms->rh, bio);
--
- /*
- * We can only read balance if the region is in sync.
- */
-- if (rh_in_sync(&ms->rh, region, 0))
-- m = choose_mirror(ms, bio->bi_sector);
-- else
-- m = ms->mirror + DEFAULT_MIRROR;
-+ if (likely(rh_in_sync(&ms->rh,
-+ bio_to_region(&ms->rh, bio),
-+ 0)))
-+ m = choose_mirror(ms, NULL);
-+ else {
-+ m = ms->default_mirror;
-+
-+ /* If the default fails, we give up. */
-+ if (unlikely(m && atomic_read(&m->error_count)))
-+ m = NULL;
-+ }
- 
-- map_bio(ms, m, bio);
-- generic_make_request(bio);
-+ if (likely(m))
-+ read_async_bio(m, bio);
-+ else
-+ bio_endio(bio, 0, -EIO);
- }
- }
- 
-@@ -722,56 +873,116 @@ static void do_reads(struct mirror_set *
- * RECOVERING: delay the io until recovery completes
- * NOSYNC: increment pending, just write to the default mirror
- *---------------------------------------------------------------*/
-+static void write_failure_handler(void *data)
-+{
-+ struct bio *bio;
-+ struct bio_list failed_writes;
-+ struct mirror_set *ms = (struct mirror_set *)data;
-+ struct dirty_log *log = ms->rh.log;
-+
-+ if (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK) {
-+ dm_table_event(ms->ti->table);
-+ wait_for_completion(&ms->failure_completion);
-+ }
-+
-+ /* Take list out to handle endios. */
-+ spin_lock_irq(&ms->lock);
-+ failed_writes = ms->failures;
-+ bio_list_init(&ms->failures);
-+ spin_unlock_irq(&ms->lock);
-+
-+ while ((bio = bio_list_pop(&failed_writes)))
-+ bio_endio(bio, bio->bi_size, 0);
-+}
-+
- static void write_callback(unsigned long error, void *context)
- {
-- unsigned int i;
-- int uptodate = 1;
-+ unsigned int i, ret = 0;
- struct bio *bio = (struct bio *) context;
- struct mirror_set *ms;
--
-- ms = bio_get_ms(bio);
-- bio_set_ms(bio, NULL);
--
-+ int uptodate = 0, run;
-+
-+ ms = (bio_get_m(bio))->ms;
-+ bio_set_m(bio, NULL);
-+
- /*
- * NOTE: We don't decrement the pending count here,
- * instead it is done by the targets endio function.
- * This way we handle both writes to SYNC and NOSYNC
- * regions with the same code.
- */
-+ if (unlikely(error)) {
-+ DMERR("Error during write occurred.");
- 
-- if (error) {
- /*
-- * only error the io if all mirrors failed.
-- * FIXME: bogus
-+ * Test all bits - if all failed, fail io.
-+ * Otherwise, go through hassle of failing a device...
- */
-- uptodate = 0;
-- for (i = 0; i < ms->nr_mirrors; i++)
-- if (!test_bit(i, &error)) {
-+ for (i = 0; i < ms->nr_mirrors; i++) {
-+ if (test_bit(i, &error))
-+ fail_mirror(ms->mirror + i);
-+ else
- uptodate = 1;
-- break;
-+ }
-+
-+ if (likely(uptodate)) {
-+ spin_lock(&ms->lock);
-+ if (atomic_read(&ms->suspended)) {
-+ /*
-+ * The device is suspended, it is
-+ * safe to complete I/O.
-+ */
-+ spin_unlock(&ms->lock);
-+ } else {
-+ /*
-+ * Need to raise event. Since raising
-+ * events can block, we need to do it in
-+ * separate thread.
-+ *
-+ * run gets set if this will be the first
-+ * bio in the list.
-+ */
-+ run = !ms->failures.head;
-+ bio_list_add(&ms->failures, bio);
-+ spin_unlock(&ms->lock);
-+
-+ if (run)
-+ queue_work(_kmir_mon_wq,
-+ &ms->failure_work);
-+
-+ return;
- }
-+ } else {
-+ DMERR("All replicated volumes dead, failing I/O");
-+ /* None of the writes succeeded, fail the I/O. */
-+ ret = -EIO;
-+ }
- }
-- bio_endio(bio, bio->bi_size, 0);
-+
-+ bio_endio(bio, bio->bi_size, ret);
- }
- 
- static void do_write(struct mirror_set *ms, struct bio *bio)
- {
- unsigned int i;
-- struct io_region io[KCOPYD_MAX_REGIONS+1];
-+ struct io_region io[ms->nr_mirrors], *dest = io;
- struct mirror *m;
- 
-- for (i = 0; i < ms->nr_mirrors; i++) {
-- m = ms->mirror + i;
--
-- io[i].bdev = m->dev->bdev;
-- io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-- io[i].count = bio->bi_size >> 9;
-- }
-+ for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
-+ map_region(dest++, m, bio);
- 
-- bio_set_ms(bio, ms);
-- dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
-- bio->bi_io_vec + bio->bi_idx,
-- write_callback, bio);
-+ if (likely(dest - io)) {
-+ /*
-+ * We can use the default mirror here, because we
-+ * only need it in order to retrieve the reference
-+ * to the mirror set in write_callback().
-+ */
-+ bio_set_m(bio, ms->default_mirror);
-+ dm_io_async_bvec(dest - io, io, WRITE,
-+ bio->bi_io_vec + bio->bi_idx,
-+ write_callback, bio);
-+ } else
-+ bio_endio(bio, bio->bi_size, -EIO);
- }
- 
- static void do_writes(struct mirror_set *ms, struct bio_list *writes)
-@@ -779,6 +990,9 @@ static void do_writes(struct mirror_set
- int state;
- struct bio *bio;
- struct bio_list sync, nosync, recover, *this_list = NULL;
-+ struct bio_list requeue;
-+ struct dirty_log *log = ms->rh.log;
-+ region_t region;
- 
- if (!writes->head)
- return;
-@@ -789,9 +1003,18 @@ static void do_writes(struct mirror_set
- bio_list_init(&sync);
- bio_list_init(&nosync);
- bio_list_init(&recover);
-+ bio_list_init(&requeue);
- 
- while ((bio = bio_list_pop(writes))) {
-- state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
-+ region = bio_to_region(&ms->rh, bio);
-+
-+ if (log->type->is_remote_recovering &&
-+ log->type->is_remote_recovering(log, region)) {
-+ bio_list_add(&requeue, bio);
-+ continue;
-+ }
-+
-+ state = rh_state(&ms->rh, region, 1);
- switch (state) {
- case RH_CLEAN:
- case RH_DIRTY:
-@@ -810,6 +1033,8 @@ static void do_writes(struct mirror_set
- bio_list_add(this_list, bio);
- }
- 
-+ bio_list_merge(writes, &requeue);
-+
- /*
- * Increment the pending counts for any regions that will
- * be written to (writes to recover regions are going to
-@@ -829,7 +1054,7 @@ static void do_writes(struct mirror_set
- rh_delay(&ms->rh, bio);
- 
- while ((bio = bio_list_pop(&nosync))) {
-- map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
-+ map_bio(ms->default_mirror, bio);
- generic_make_request(bio);
- }
- }
-@@ -844,12 +1069,12 @@ static void do_mirror(struct mirror_set
- {
- struct bio_list reads, writes;
- 
-- spin_lock(&ms->lock);
-+ spin_lock_irq(&ms->lock);
- reads = ms->reads;
- writes = ms->writes;
- bio_list_init(&ms->reads);
- bio_list_init(&ms->writes);
-- spin_unlock(&ms->lock);
-+ spin_unlock_irq(&ms->lock);
- 
- rh_update_states(&ms->rh);
- do_recovery(ms);
-@@ -871,7 +1096,7 @@ static void do_work(void *ignored)
- * Target functions
- *---------------------------------------------------------------*/
- static struct mirror_set *alloc_context(unsigned int nr_mirrors,
-- sector_t region_size,
-+ uint32_t region_size,
- struct dm_target *ti,
- struct dirty_log *dl)
- {
-@@ -891,11 +1116,16 @@ static struct mirror_set *alloc_context(
- 
- memset(ms, 0, len);
- spin_lock_init(&ms->lock);
-+ spin_lock_init(&ms->choose_lock);
- 
- ms->ti = ti;
- ms->nr_mirrors = nr_mirrors;
-- ms->nr_regions = dm_div_up(ti->len, region_size);
-+ ms->nr_regions = dm_sector_div_up(ti->len, region_size);
- ms->in_sync = 0;
-+ ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
-+
-+ /* a resume must be issued to start the device */
-+ atomic_set(&ms->suspended, 1);
- 
- if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
- ti->error = "dm-mirror: Error creating dirty region hash";
-@@ -903,6 +1133,13 @@ static struct mirror_set *alloc_context(
- return NULL;
- }
- 
-+ atomic_set(&ms->read_count, MIN_READS);
-+
-+ bio_list_init(&ms->failures);
-+ INIT_WORK(&ms->failure_work, write_failure_handler, ms);
-+
-+ init_completion(&ms->failure_completion);
-+
- return ms;
- }
- 
-@@ -916,7 +1153,7 @@ static void free_context(struct mirror_s
- kfree(ms);
- }
- 
--static inline int _check_region_size(struct dm_target *ti, sector_t size)
-+static inline int _check_region_size(struct dm_target *ti, uint32_t size)
- {
- return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
- size > ti->len);
-@@ -940,6 +1177,8 @@ static int get_mirror(struct mirror_set
- }
- 
- ms->mirror[mirror].offset = offset;
-+ atomic_set(&(ms->mirror[mirror].error_count), 0);
-+ ms->mirror[mirror].ms = ms;
- 
- return 0;
- }
-@@ -1009,8 +1248,8 @@ static struct dirty_log *create_dirty_lo
- * log_type #log_params <log_params>
- * #mirrors [mirror_path offset]{2,}
- *
-- * For now, #log_params = 1, log_type = "core"
--
-+ * log_type is "core" or "disk"
-+ * #log_params is between 1 and 3
- */
- #define DM_IO_PAGES 64
- static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
-@@ -1060,6 +1299,7 @@ static int mirror_ctr(struct dm_target *
- }
- 
- ti->private = ms;
-+ ti->split_io = ms->rh.region_size;
- 
- r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
- if (r) {
-@@ -1082,14 +1322,15 @@ static void mirror_dtr(struct dm_target
- 
- static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
- {
-+ unsigned long flags;
- int should_wake = 0;
- struct bio_list *bl;
- 
- bl = (rw == WRITE) ? &ms->writes : &ms->reads;
-- spin_lock(&ms->lock);
-+ spin_lock_irqsave(&ms->lock, flags);
- should_wake = !(bl->head);
- bio_list_add(bl, bio);
-- spin_unlock(&ms->lock);
-+ spin_unlock_irqrestore(&ms->lock, flags);
- 
- if (should_wake)
- wake();
-@@ -1104,42 +1345,64 @@ static int mirror_map(struct dm_target *
- int r, rw = bio_rw(bio);
- struct mirror *m;
- struct mirror_set *ms = ti->private;
--
-- map_context->ll = bio->bi_sector >> ms->rh.region_shift;
-+ struct dm_bio_details *bd;
-+ struct bio_map_info *bmi;
- 
- if (rw == WRITE) {
-+ /* Save region for mirror_end_io() handler */
-+ map_context->ll = bio_to_region(&ms->rh, bio);
- queue_bio(ms, bio, rw);
- return 0;
- }
- 
-+ /* It's all about the READs now */
-+
- r = ms->rh.log->type->in_sync(ms->rh.log,
- bio_to_region(&ms->rh, bio), 0);
- if (r < 0 && r != -EWOULDBLOCK)
- return r;
- 
-- if (r == -EWOULDBLOCK) /* FIXME: ugly */
-+ if (r == -EWOULDBLOCK)
- r = 0;
- 
-- /*
-- * We don't want to fast track a recovery just for a read
-- * ahead. So we just let it silently fail.
-- * FIXME: get rid of this.
-- */
-- if (!r && rw == READA)
-- return -EIO;
-+ if (likely(r)) {
-+ /*
-+ * Optimize reads by avoiding to hand them to daemon.
-+ *
-+ * In case they fail, queue them for another shot
-+ * in the mirror_end_io() function.
-+ */
-+ m = choose_mirror(ms, NULL);
-+ if (likely(m)) {
-+ bmi = mempool_alloc(bio_map_info_pool, GFP_NOIO);
-+
-+ if (likely(bmi)) {
-+ /* without this, a read is not retryable */
-+ bd = &bmi->bmi_bd;
-+ dm_bio_record(bd, bio);
-+ map_context->ptr = bmi;
-+ bmi->bmi_m = m;
-+ } else {
-+ /* we could fail now, but we can at least **
-+ ** give it a shot. The bd is only used to **
-+ ** retry in the event of a failure anyway. **
-+ ** If we fail, we can fail the I/O then. */
-+ map_context->ptr = NULL;
-+ }
-+
-+ map_bio(m, bio);
-+ return 1; /* Mapped -> queue request. */
-+ } else
-+ return -EIO;
-+ } else {
-+ /* Either not clean, or -EWOULDBLOCK */
-+ if (rw == READA)
-+ return -EWOULDBLOCK;
- 
-- if (!r) {
-- /* Pass this io over to the daemon */
- queue_bio(ms, bio, rw);
-- return 0;
- }
- 
-- m = choose_mirror(ms, bio->bi_sector);
-- if (!m)
-- return -EIO;
--
-- map_bio(ms, m, bio);
-- return 1;
-+ return 0;
- }
- 
- static int mirror_end_io(struct dm_target *ti, struct bio *bio,
-@@ -1147,71 +1410,140 @@ static int mirror_end_io(struct dm_targe
- {
- int rw = bio_rw(bio);
- struct mirror_set *ms = (struct mirror_set *) ti->private;
-- region_t region = map_context->ll;
-+ struct mirror *m = NULL;
-+ struct dm_bio_details *bd = NULL;
- 
- /*
- * We need to dec pending if this was a write.
- */
-- if (rw == WRITE)
-- rh_dec(&ms->rh, region);
-+ if (rw == WRITE) {
-+ rh_dec(&ms->rh, map_context->ll);
-+ return error;
-+ }
- 
-- return 0;
-+ if (error == -EOPNOTSUPP)
-+ goto out;
-+
-+ if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-+ goto out;
-+
-+ if (unlikely(error)) {
-+ DMERR("A read failure occurred on a mirror device.");
-+ if (!map_context->ptr) {
-+ /*
-+ * There wasn't enough memory to record necessary
-+ * information for a retry.
-+ */
-+ DMERR("Out of memory causing inability to retry read.");
-+ return -EIO;
-+ }
-+ m = ((struct bio_map_info *)map_context->ptr)->bmi_m;
-+ fail_mirror(m); /* Flag error on mirror. */
-+
-+ /*
-+ * A failed read needs to get queued
-+ * to the daemon for another shot to
-+ * one (if any) intact mirrors.
-+ */
-+ if (default_ok(m)) {
-+ bd = &(((struct bio_map_info *)map_context->ptr)->bmi_bd);
-+
-+ DMWARN("Trying different device.");
-+ dm_bio_restore(bd, bio);
-+ mempool_free(map_context->ptr, bio_map_info_pool);
-+ map_context->ptr = NULL;
-+ queue_bio(ms, bio, rw);
-+ return 1; /* We want another shot on the bio. */
-+ }
-+ DMERR("All replicated volumes dead, failing I/O");
-+ }
-+
-+ out:
-+ if (map_context->ptr)
-+ mempool_free(map_context->ptr, bio_map_info_pool);
-+
-+ return error;
- }
- 
--static void mirror_suspend(struct dm_target *ti)
-+static void mirror_presuspend(struct dm_target *ti)
- {
- struct mirror_set *ms = (struct mirror_set *) ti->private;
- struct dirty_log *log = ms->rh.log;
-+ unsigned long flags;
-+ int run;
-+
-+ /*
-+ * Only run the completion if we are suspending after
-+ * a disk failure.
-+ */
-+ spin_lock_irqsave(&ms->lock, flags);
-+ run = ms->failures.head ? 1 : 0;
-+ spin_unlock_irqrestore(&ms->lock, flags);
-+
-+ if (run && (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK))
-+ complete(&ms->failure_completion);
-+
-+ if (log->type->presuspend && log->type->presuspend(log))
-+ /* FIXME: need better error handling */
-+ DMWARN("log presuspend failed");
-+
-+}
-+
-+static void mirror_postsuspend(struct dm_target *ti)
-+{
-+ struct mirror_set *ms = (struct mirror_set *) ti->private;
-+ struct dirty_log *log = ms->rh.log;
-+
- rh_stop_recovery(&ms->rh);
-- if (log->type->suspend && log->type->suspend(log))
-+ if (log->type->postsuspend && log->type->postsuspend(log))
- /* FIXME: need better error handling */
-- DMWARN("log suspend failed");
-+ DMWARN("log postsuspend failed");
-+ atomic_set(&ms->suspended, 1);
- }
- 
- static void mirror_resume(struct dm_target *ti)
- {
- struct mirror_set *ms = (struct mirror_set *) ti->private;
- struct dirty_log *log = ms->rh.log;
-+
- if (log->type->resume && log->type->resume(log))
- /* FIXME: need better error handling */
- DMWARN("log resume failed");
-- rh_start_recovery(&ms->rh);
-+
-+ if (atomic_dec_and_test(&ms->suspended))
-+ rh_start_recovery(&ms->rh);
-+ atomic_set(&ms->suspended, 0);
- }
- 
- static int mirror_status(struct dm_target *ti, status_type_t type,
- char *result, unsigned int maxlen)
- {
-- char buffer[32];
- unsigned int m, sz = 0;
- struct mirror_set *ms = (struct mirror_set *) ti->private;
--
--#define EMIT(x...) sz += ((sz >= maxlen) ? \
-- 0 : scnprintf(result + sz, maxlen - sz, x))
-+ char buffer[ms->nr_mirrors + 1];
- 
- switch (type) {
- case STATUSTYPE_INFO:
-- EMIT("%d ", ms->nr_mirrors);
--
-+ DMEMIT("%d ", ms->nr_mirrors);
- for (m = 0; m < ms->nr_mirrors; m++) {
-- format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev);
-- EMIT("%s ", buffer);
-+ DMEMIT("%s ", ms->mirror[m].dev->name);
-+ buffer[m] = atomic_read(&(ms->mirror[m].error_count)) ?
-+ 'D' : 'A'; - } -+ buffer[m] = '\0'; - -- EMIT(SECTOR_FORMAT "/" SECTOR_FORMAT, -- ms->rh.log->type->get_sync_count(ms->rh.log), -- ms->nr_regions); -+ DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT " 1 %s ", -+ ms->rh.log->type->get_sync_count(ms->rh.log), -+ ms->nr_regions, buffer); -+ ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); - break; - - case STATUSTYPE_TABLE: -- EMIT("%s 1 " SECTOR_FORMAT " %d ", -- ms->rh.log->type->name, ms->rh.region_size, -- ms->nr_mirrors); -- -- for (m = 0; m < ms->nr_mirrors; m++) { -- format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev); -- EMIT("%s " SECTOR_FORMAT " ", -- buffer, ms->mirror[m].offset); -- } -+ sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); -+ DMEMIT("%d ", ms->nr_mirrors); -+ for (m = 0; m < ms->nr_mirrors; m++) -+ DMEMIT("%s " SECTOR_FORMAT " ", -+ ms->mirror[m].dev->name, ms->mirror[m].offset); - } - - return 0; -@@ -1219,13 +1551,14 @@ static int mirror_status(struct dm_targe - - static struct target_type mirror_target = { - .name = "mirror", -- .version = {1, 0, 1}, -+ .version = {1, 1, 0}, - .module = THIS_MODULE, - .ctr = mirror_ctr, - .dtr = mirror_dtr, - .map = mirror_map, - .end_io = mirror_end_io, -- .suspend = mirror_suspend, -+ .presuspend = mirror_presuspend, -+ .postsuspend = mirror_postsuspend, - .resume = mirror_resume, - .status = mirror_status, - }; -@@ -1234,24 +1567,38 @@ static int __init dm_mirror_init(void) - { - int r; - -+ bio_map_info_pool = mempool_create(100, bio_map_info_alloc, -+ bio_map_info_free, NULL); -+ if (!bio_map_info_pool) -+ return -ENOMEM; -+ - r = dm_dirty_log_init(); - if (r) - return r; - -- _kmirrord_wq = create_workqueue("kmirrord"); -+ _kmirrord_wq = create_singlethread_workqueue("kmirrord"); - if (!_kmirrord_wq) { - DMERR("couldn't start kmirrord"); - dm_dirty_log_exit(); -- return r; -+ return -ENOMEM; - } - INIT_WORK(&_kmirrord_work, do_work, NULL); - -+ _kmir_mon_wq = create_singlethread_workqueue("kmir_mon"); -+ if (!_kmir_mon_wq) { -+ DMERR("couldn't start kmir_mon"); -+ dm_dirty_log_exit(); -+ destroy_workqueue(_kmirrord_wq); -+ return -ENOMEM; -+ } -+ - r = dm_register_target(&mirror_target); - if (r < 0) { - DMERR("%s: Failed to register mirror target", - mirror_target.name); - dm_dirty_log_exit(); - destroy_workqueue(_kmirrord_wq); -+ destroy_workqueue(_kmir_mon_wq); - } - - return r; -diff -pruN ./drivers/md.dm/dm-round-robin.c ./drivers/md/dm-round-robin.c ---- ./drivers/md.dm/dm-round-robin.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-round-robin.c 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,214 @@ -+/* -+ * Copyright (C) 2003 Sistina Software. -+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. -+ * -+ * Module Author: Heinz Mauelshagen -+ * -+ * This file is released under the GPL. -+ * -+ * Round-robin path selector. 
-+ */ -+ -+#include "dm.h" -+#include "dm-path-selector.h" -+ -+#include <linux/slab.h> -+ -+/*----------------------------------------------------------------- -+ * Path-handling code, paths are held in lists -+ *---------------------------------------------------------------*/ -+struct path_info { -+ struct list_head list; -+ struct path *path; -+ unsigned repeat_count; -+}; -+ -+static void free_paths(struct list_head *paths) -+{ -+ struct path_info *pi, *next; -+ -+ list_for_each_entry_safe(pi, next, paths, list) { -+ list_del(&pi->list); -+ kfree(pi); -+ } -+} -+ -+/*----------------------------------------------------------------- -+ * Round-robin selector -+ *---------------------------------------------------------------*/ -+ -+#define RR_MIN_IO 1000 -+ -+struct selector { -+ struct list_head valid_paths; -+ struct list_head invalid_paths; -+}; -+ -+static struct selector *alloc_selector(void) -+{ -+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); -+ -+ if (s) { -+ INIT_LIST_HEAD(&s->valid_paths); -+ INIT_LIST_HEAD(&s->invalid_paths); -+ } -+ -+ return s; -+} -+ -+static int rr_create(struct path_selector *ps, unsigned argc, char **argv) -+{ -+ struct selector *s; -+ -+ s = alloc_selector(); -+ if (!s) -+ return -ENOMEM; -+ -+ ps->context = s; -+ return 0; -+} -+ -+static void rr_destroy(struct path_selector *ps) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ -+ free_paths(&s->valid_paths); -+ free_paths(&s->invalid_paths); -+ kfree(s); -+ ps->context = NULL; -+} -+ -+static int rr_status(struct path_selector *ps, struct path *path, -+ status_type_t type, char *result, unsigned int maxlen) -+{ -+ struct path_info *pi; -+ int sz = 0; -+ -+ if (!path) -+ DMEMIT("0 "); -+ else { -+ switch(type) { -+ case STATUSTYPE_INFO: -+ break; -+ case STATUSTYPE_TABLE: -+ pi = path->pscontext; -+ DMEMIT("%u ", pi->repeat_count); -+ break; -+ } -+ } -+ -+ return sz; -+} -+ -+/* -+ * Called during initialisation to register each path with an -+ * optional repeat_count. 
-+ */ -+static int rr_add_path(struct path_selector *ps, struct path *path, -+ int argc, char **argv, char **error) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ struct path_info *pi; -+ unsigned repeat_count = RR_MIN_IO; -+ -+ if (argc > 1) { -+ *error = "round-robin ps: incorrect number of arguments"; -+ return -EINVAL; -+ } -+ -+ /* First path argument is number of I/Os before switching path */ -+ if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { -+ *error = "round-robin ps: invalid repeat count"; -+ return -EINVAL; -+ } -+ -+ /* allocate the path */ -+ pi = kmalloc(sizeof(*pi), GFP_KERNEL); -+ if (!pi) { -+ *error = "round-robin ps: Error allocating path context"; -+ return -ENOMEM; -+ } -+ -+ pi->path = path; -+ pi->repeat_count = repeat_count; -+ -+ path->pscontext = pi; -+ -+ list_add(&pi->list, &s->valid_paths); -+ -+ return 0; -+} -+ -+static void rr_fail_path(struct path_selector *ps, struct path *p) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ struct path_info *pi = p->pscontext; -+ -+ list_move(&pi->list, &s->invalid_paths); -+} -+ -+static int rr_reinstate_path(struct path_selector *ps, struct path *p) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ struct path_info *pi = p->pscontext; -+ -+ list_move(&pi->list, &s->valid_paths); -+ -+ return 0; -+} -+ -+static struct path *rr_select_path(struct path_selector *ps, -+ unsigned *repeat_count) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ struct path_info *pi = NULL; -+ -+ if (!list_empty(&s->valid_paths)) { -+ pi = list_entry(s->valid_paths.next, struct path_info, list); -+ list_move_tail(&pi->list, &s->valid_paths); -+ *repeat_count = pi->repeat_count; -+ } -+ -+ return pi ? pi->path : NULL; -+} -+ -+static struct path_selector_type rr_ps = { -+ .name = "round-robin", -+ .module = THIS_MODULE, -+ .table_args = 1, -+ .info_args = 0, -+ .create = rr_create, -+ .destroy = rr_destroy, -+ .status = rr_status, -+ .add_path = rr_add_path, -+ .fail_path = rr_fail_path, -+ .reinstate_path = rr_reinstate_path, -+ .select_path = rr_select_path, -+}; -+ -+static int __init dm_rr_init(void) -+{ -+ int r = dm_register_path_selector(&rr_ps); -+ -+ if (r < 0) -+ DMERR("round-robin: register failed %d", r); -+ -+ DMINFO("dm-round-robin version 1.0.0 loaded"); -+ -+ return r; -+} -+ -+static void __exit dm_rr_exit(void) -+{ -+ int r = dm_unregister_path_selector(&rr_ps); -+ -+ if (r < 0) -+ DMERR("round-robin: unregister failed %d", r); -+} -+ -+module_init(dm_rr_init); -+module_exit(dm_rr_exit); -+ -+MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); -+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); -+MODULE_LICENSE("GPL"); -diff -pruN ./drivers/md.dm/dm-snap.c ./drivers/md/dm-snap.c ---- ./drivers/md.dm/dm-snap.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-snap.c 2006-03-17 13:16:38.000000000 +0300 -@@ -49,6 +49,11 @@ struct pending_exception { - struct bio_list snapshot_bios; - - /* -+ * Short-term queue of pending exceptions prior to submission. -+ */ -+ struct list_head list; -+ -+ /* - * Other pending_exceptions that are processing this - * chunk. When this list is empty, we know we can - * complete the origins. 
-@@ -371,6 +376,15 @@ static inline ulong round_up(ulong n, ul - return (n + size) & ~size; - } - -+static void read_snapshot_metadata(struct dm_snapshot *s) -+{ -+ if (s->store.read_metadata(&s->store)) { -+ down_write(&s->lock); -+ s->valid = 0; -+ up_write(&s->lock); -+ } -+} -+ - /* - * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> - */ -@@ -457,7 +471,7 @@ static int snapshot_ctr(struct dm_target - s->chunk_shift = ffs(chunk_size) - 1; - - s->valid = 1; -- s->have_metadata = 0; -+ s->active = 0; - s->last_percent = 0; - init_rwsem(&s->lock); - s->table = ti->table; -@@ -492,7 +506,11 @@ static int snapshot_ctr(struct dm_target - goto bad5; - } - -+ /* Metadata must only be loaded into one table at once */ -+ read_snapshot_metadata(s); -+ - /* Add snapshot to the list of snapshots for this origin */ -+ /* Exceptions aren't triggered till snapshot_resume() is called */ - if (register_snapshot(s)) { - r = -EINVAL; - ti->error = "Cannot register snapshot origin"; -@@ -529,8 +547,12 @@ static void snapshot_dtr(struct dm_targe - { - struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - -+ /* Prevent further origin writes from using this snapshot. */ -+ /* After this returns there can be no new kcopyd jobs. */ - unregister_snapshot(s); - -+ kcopyd_client_destroy(s->kcopyd_client); -+ - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); - -@@ -539,7 +561,7 @@ static void snapshot_dtr(struct dm_targe - - dm_put_device(ti, s->origin); - dm_put_device(ti, s->cow); -- kcopyd_client_destroy(s->kcopyd_client); -+ - kfree(s); - } - -@@ -777,7 +799,10 @@ static int snapshot_map(struct dm_target - - /* Full snapshots are not usable */ - if (!s->valid) -- return -1; -+ return -EIO; -+ -+ if (unlikely(bio_barrier(bio))) -+ return -EOPNOTSUPP; - - /* - * Write to snapshot - higher level takes care of RW/RO -@@ -848,24 +873,15 @@ static void snapshot_resume(struct dm_ta - { - struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - -- if (s->have_metadata) -- return; -- -- if (s->store.read_metadata(&s->store)) { -- down_write(&s->lock); -- s->valid = 0; -- up_write(&s->lock); -- } -- -- s->have_metadata = 1; -+ down_write(&s->lock); -+ s->active = 1; -+ up_write(&s->lock); - } - - static int snapshot_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) - { - struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; -- char cow[32]; -- char org[32]; - - switch (type) { - case STATUSTYPE_INFO: -@@ -892,9 +908,8 @@ static int snapshot_status(struct dm_tar - * to make private copies if the output is to - * make sense. 
- */ -- format_dev_t(cow, snap->cow->bdev->bd_dev); -- format_dev_t(org, snap->origin->bdev->bd_dev); -- snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, org, cow, -+ snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, -+ snap->origin->name, snap->cow->name, - snap->type, snap->chunk_size); - break; - } -@@ -924,14 +939,19 @@ static int __origin_write(struct list_he - int r = 1, first = 1; - struct dm_snapshot *snap; - struct exception *e; -- struct pending_exception *pe, *last = NULL; -+ struct pending_exception *pe, *next_pe, *last = NULL; - chunk_t chunk; -+ LIST_HEAD(pe_queue); - - /* Do all the snapshots on this origin */ - list_for_each_entry (snap, snapshots, list) { - -- /* Only deal with valid snapshots */ -- if (!snap->valid) -+ /* Only deal with valid and active snapshots */ -+ if (!snap->valid || !snap->active) -+ continue; -+ -+ /* Nothing to do if writing beyond end of snapshot */ -+ if (bio->bi_sector >= dm_table_get_size(snap->table)) - continue; - - down_write(&snap->lock); -@@ -955,12 +975,19 @@ static int __origin_write(struct list_he - snap->valid = 0; - - } else { -- if (last) -+ if (first) { -+ bio_list_add(&pe->origin_bios, bio); -+ r = 0; -+ first = 0; -+ } -+ if (last && list_empty(&pe->siblings)) - list_merge(&pe->siblings, - &last->siblings); -- -+ if (!pe->started) { -+ pe->started = 1; -+ list_add_tail(&pe->list, &pe_queue); -+ } - last = pe; -- r = 0; - } - } - -@@ -970,24 +997,8 @@ static int __origin_write(struct list_he - /* - * Now that we have a complete pe list we can start the copying. - */ -- if (last) { -- pe = last; -- do { -- down_write(&pe->snap->lock); -- if (first) -- bio_list_add(&pe->origin_bios, bio); -- if (!pe->started) { -- pe->started = 1; -- up_write(&pe->snap->lock); -- start_copy(pe); -- } else -- up_write(&pe->snap->lock); -- first = 0; -- pe = list_entry(pe->siblings.next, -- struct pending_exception, siblings); -- -- } while (pe != last); -- } -+ list_for_each_entry_safe(pe, next_pe, &pe_queue, list) -+ start_copy(pe); - - return r; - } -@@ -1051,6 +1062,9 @@ static int origin_map(struct dm_target * - struct dm_dev *dev = (struct dm_dev *) ti->private; - bio->bi_bdev = dev->bdev; - -+ if (unlikely(bio_barrier(bio))) -+ return -EOPNOTSUPP; -+ - /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? 
do_origin(dev, bio) : 1; - } -@@ -1082,7 +1096,6 @@ static int origin_status(struct dm_targe - unsigned int maxlen) - { - struct dm_dev *dev = (struct dm_dev *) ti->private; -- char buffer[32]; - - switch (type) { - case STATUSTYPE_INFO: -@@ -1090,8 +1103,7 @@ static int origin_status(struct dm_targe - break; - - case STATUSTYPE_TABLE: -- format_dev_t(buffer, dev->bdev->bd_dev); -- snprintf(result, maxlen, "%s", buffer); -+ snprintf(result, maxlen, "%s", dev->name); - break; - } - -@@ -1100,7 +1112,7 @@ static int origin_status(struct dm_targe - - static struct target_type origin_target = { - .name = "snapshot-origin", -- .version = {1, 0, 1}, -+ .version = {1, 2, 0}, - .module = THIS_MODULE, - .ctr = origin_ctr, - .dtr = origin_dtr, -@@ -1111,7 +1123,7 @@ static struct target_type origin_target - - static struct target_type snapshot_target = { - .name = "snapshot", -- .version = {1, 0, 1}, -+ .version = {1, 2, 0}, - .module = THIS_MODULE, - .ctr = snapshot_ctr, - .dtr = snapshot_dtr, -diff -pruN ./drivers/md.dm/dm-snap.h ./drivers/md/dm-snap.h ---- ./drivers/md.dm/dm-snap.h 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-snap.h 2006-03-17 13:16:38.000000000 +0300 -@@ -99,7 +99,9 @@ struct dm_snapshot { - - /* You can't use a snapshot if this is 0 (e.g. if full) */ - int valid; -- int have_metadata; -+ -+ /* Origin writes don't trigger exceptions until this is set */ -+ int active; - - /* Used for display of table */ - char type; -diff -pruN ./drivers/md.dm/dm-stripe.c ./drivers/md/dm-stripe.c ---- ./drivers/md.dm/dm-stripe.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-stripe.c 2006-03-17 13:16:38.000000000 +0300 -@@ -21,7 +21,7 @@ struct stripe_c { - uint32_t stripes; - - /* The size of this target / num. stripes */ -- uint32_t stripe_width; -+ sector_t stripe_width; - - /* stripe chunk size */ - uint32_t chunk_shift; -@@ -173,9 +173,8 @@ static int stripe_map(struct dm_target * - struct stripe_c *sc = (struct stripe_c *) ti->private; - - sector_t offset = bio->bi_sector - ti->begin; -- uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); -- uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ -- chunk = chunk / sc->stripes; -+ sector_t chunk = offset >> sc->chunk_shift; -+ uint32_t stripe = do_div(chunk, sc->stripes); - - bio->bi_bdev = sc->stripe[stripe].dev->bdev; - bio->bi_sector = sc->stripe[stripe].physical_start + -@@ -189,10 +188,6 @@ static int stripe_status(struct dm_targe - struct stripe_c *sc = (struct stripe_c *) ti->private; - unsigned int sz = 0; - unsigned int i; -- char buffer[32]; -- --#define EMIT(x...) sz += ((sz >= maxlen) ? 
\ -- 0 : scnprintf(result + sz, maxlen - sz, x)) - - switch (type) { - case STATUSTYPE_INFO: -@@ -200,12 +195,10 @@ static int stripe_status(struct dm_targe - break; - - case STATUSTYPE_TABLE: -- EMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); -- for (i = 0; i < sc->stripes; i++) { -- format_dev_t(buffer, sc->stripe[i].dev->bdev->bd_dev); -- EMIT(" %s " SECTOR_FORMAT, buffer, -- sc->stripe[i].physical_start); -- } -+ DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); -+ for (i = 0; i < sc->stripes; i++) -+ DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name, -+ sc->stripe[i].physical_start); - break; - } - return 0; -@@ -213,7 +206,7 @@ static int stripe_status(struct dm_targe - - static struct target_type stripe_target = { - .name = "striped", -- .version= {1, 0, 1}, -+ .version= {1, 0, 2}, - .module = THIS_MODULE, - .ctr = stripe_ctr, - .dtr = stripe_dtr, -diff -pruN ./drivers/md.dm/dm-table.c ./drivers/md/dm-table.c ---- ./drivers/md.dm/dm-table.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-table.c 2006-03-17 13:16:38.000000000 +0300 -@@ -57,7 +57,7 @@ struct dm_table { - /* - * Similar to ceiling(log_size(n)) - */ --static unsigned int int_log(unsigned long n, unsigned long base) -+static unsigned int int_log(unsigned int n, unsigned int base) - { - int result = 0; - -@@ -454,6 +454,8 @@ static int __table_get_device(struct dm_ - return r; - } - -+ format_dev_t(dd->name, dev); -+ - atomic_set(&dd->count, 0); - list_add(&dd->list, &t->devices); - -@@ -575,7 +577,7 @@ static char **realloc_argv(unsigned *arr - /* - * Destructively splits up the argument list to pass to ctr. - */ --static int split_args(int *argc, char ***argvp, char *input) -+int dm_split_args(int *argc, char ***argvp, char *input) - { - char *start, *end = input, *out, **argv = NULL; - unsigned array_size = 0; -@@ -663,14 +665,14 @@ int dm_table_add_target(struct dm_table - - if (!len) { - tgt->error = "zero-length target"; -- DMERR(": %s\n", tgt->error); -+ DMERR("%s", tgt->error); - return -EINVAL; - } - - tgt->type = dm_get_target_type(type); - if (!tgt->type) { - tgt->error = "unknown target type"; -- DMERR(": %s\n", tgt->error); -+ DMERR("%s", tgt->error); - return -EINVAL; - } - -@@ -688,7 +690,7 @@ int dm_table_add_target(struct dm_table - goto bad; - } - -- r = split_args(&argc, &argv, params); -+ r = dm_split_args(&argc, &argv, params); - if (r) { - tgt->error = "couldn't split parameters (insufficient memory)"; - goto bad; -@@ -707,7 +709,7 @@ int dm_table_add_target(struct dm_table - return 0; - - bad: -- DMERR(": %s\n", tgt->error); -+ DMERR("%s", tgt->error); - dm_put_target_type(tgt->type); - return r; - } -@@ -825,7 +827,7 @@ void dm_table_set_restrictions(struct dm - * Make sure we obey the optimistic sub devices - * restrictions. 
- */
-- q->max_sectors = t->limits.max_sectors;
-+ blk_queue_max_sectors(q, t->limits.max_sectors);
- q->max_phys_segments = t->limits.max_phys_segments;
- q->max_hw_segments = t->limits.max_hw_segments;
- q->hardsect_size = t->limits.hardsect_size;
-@@ -848,18 +850,38 @@ int dm_table_get_mode(struct dm_table *t
- return t->mode;
- }
- 
--void dm_table_suspend_targets(struct dm_table *t)
-+static void suspend_targets(struct dm_table *t, unsigned postsuspend)
- {
-- int i;
-+ int i = t->num_targets;
-+ struct dm_target *ti = t->targets;
- 
-- for (i = 0; i < t->num_targets; i++) {
-- struct dm_target *ti = t->targets + i;
-+ while (i--) {
-+ if (postsuspend) {
-+ if (ti->type->postsuspend)
-+ ti->type->postsuspend(ti);
-+ } else if (ti->type->presuspend)
-+ ti->type->presuspend(ti);
- 
-- if (ti->type->suspend)
-- ti->type->suspend(ti);
-+ ti++;
- }
- }
- 
-+void dm_table_presuspend_targets(struct dm_table *t)
-+{
-+ if (!t)
-+ return;
-+
-+ return suspend_targets(t, 0);
-+}
-+
-+void dm_table_postsuspend_targets(struct dm_table *t)
-+{
-+ if (!t)
-+ return;
-+
-+ return suspend_targets(t, 1);
-+}
-+
- void dm_table_resume_targets(struct dm_table *t)
- {
- int i;
-@@ -900,11 +922,35 @@ void dm_table_unplug_all(struct dm_table
- }
- }
- 
-+int dm_table_flush_all(struct dm_table *t)
-+{
-+ struct list_head *d, *devices = dm_table_get_devices(t);
-+ int ret = 0;
-+
-+ for (d = devices->next; d != devices; d = d->next) {
-+ struct dm_dev *dd = list_entry(d, struct dm_dev, list);
-+ request_queue_t *q = bdev_get_queue(dd->bdev);
-+ int err;
-+
-+ if (!q->issue_flush_fn)
-+ err = -EOPNOTSUPP;
-+ else
-+ err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL);
-+
-+ if (!ret)
-+ ret = err;
-+ }
-+
-+ return ret;
-+}
-+
- EXPORT_SYMBOL(dm_vcalloc);
- EXPORT_SYMBOL(dm_get_device);
- EXPORT_SYMBOL(dm_put_device);
- EXPORT_SYMBOL(dm_table_event);
-+EXPORT_SYMBOL(dm_table_get_size);
- EXPORT_SYMBOL(dm_table_get_mode);
- EXPORT_SYMBOL(dm_table_put);
- EXPORT_SYMBOL(dm_table_get);
- EXPORT_SYMBOL(dm_table_unplug_all);
-+EXPORT_SYMBOL(dm_table_flush_all);
-diff -pruN ./drivers/md.dm/dm-target.c ./drivers/md/dm-target.c
---- ./drivers/md.dm/dm-target.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-target.c 2006-03-17 13:16:38.000000000 +0300
-@@ -120,10 +120,9 @@ int dm_register_target(struct target_typ
- return -ENOMEM;
- 
- down_write(&_lock);
-- if (__find_target_type(t->name)) {
-- kfree(ti);
-+ if (__find_target_type(t->name))
- rv = -EEXIST;
-- } else
-+ else
- list_add(&ti->list, &_targets);
- 
- up_write(&_lock);
-diff -pruN ./drivers/md.dm/Kconfig ./drivers/md/Kconfig
---- ./drivers/md.dm/Kconfig 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/Kconfig 2006-03-17 13:16:38.000000000 +0300
-@@ -85,6 +85,24 @@ config MD_RAID1
- 
- If unsure, say Y.
- 
-+config MD_RAID10
-+ tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
-+ depends on BLK_DEV_MD && EXPERIMENTAL
-+ ---help---
-+ RAID-10 provides a combination of striping (RAID-0) and
-+ mirroring (RAID-1) with easier configuration and more flexible
-+ layout.
-+ Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
-+ be the same size (or at least, only as much as the smallest device
-+ will be used).
-+ RAID-10 provides a variety of layouts that provide different levels
-+ of redundancy and performance.
-+ -+ RAID-10 requires mdadm-1.7.0 or later, available at: -+ -+ ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ -+ -+ - config MD_RAID5 - tristate "RAID-4/RAID-5 mode" - depends on BLK_DEV_MD -@@ -200,5 +218,17 @@ config DM_ZERO - A target that discards writes, and returns all zeroes for - reads. Useful in some recovery situations. - -+config DM_MULTIPATH -+ tristate "Multipath target (EXPERIMENTAL)" -+ depends on BLK_DEV_DM && EXPERIMENTAL -+ ---help--- -+ Allow volume managers to support multipath hardware. -+ -+config DM_MULTIPATH_EMC -+ tristate "EMC CX/AX multipath support (EXPERIMENTAL)" -+ depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL -+ ---help--- -+ Multipath support for EMC CX/AX series hardware. -+ - endmenu - -diff -pruN ./drivers/md.dm/kcopyd.c ./drivers/md/kcopyd.c ---- ./drivers/md.dm/kcopyd.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/kcopyd.c 2006-03-20 09:36:55.000000000 +0300 -@@ -43,6 +43,10 @@ struct kcopyd_client { - struct page_list *pages; - unsigned int nr_pages; - unsigned int nr_free_pages; -+#ifndef __GENKSYMS__ -+ wait_queue_head_t destroyq; -+ atomic_t nr_jobs; -+#endif - }; - - static struct page_list *alloc_pl(void) -@@ -292,10 +296,15 @@ static int run_complete_job(struct kcopy - int read_err = job->read_err; - unsigned int write_err = job->write_err; - kcopyd_notify_fn fn = job->fn; -+ struct kcopyd_client *kc = job->kc; - -- kcopyd_put_pages(job->kc, job->pages); -+ kcopyd_put_pages(kc, job->pages); - mempool_free(job, _job_pool); - fn(read_err, write_err, context); -+ -+ if (atomic_dec_and_test(&kc->nr_jobs)) -+ wake_up(&kc->destroyq); -+ - return 0; - } - -@@ -430,6 +439,7 @@ static void do_work(void *ignored) - */ - static void dispatch_job(struct kcopyd_job *job) - { -+ atomic_inc(&job->kc->nr_jobs); - push(&_pages_jobs, job); - wake(); - } -@@ -667,6 +677,9 @@ int kcopyd_client_create(unsigned int nr - return r; - } - -+ init_waitqueue_head(&kc->destroyq); -+ atomic_set(&kc->nr_jobs, 0); -+ - client_add(kc); - *result = kc; - return 0; -@@ -674,6 +687,9 @@ int kcopyd_client_create(unsigned int nr - - void kcopyd_client_destroy(struct kcopyd_client *kc) - { -+ /* Wait for completion of all jobs submitted by this client. 
*/ -+ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); -+ - dm_io_put(kc->nr_pages); - client_free_pages(kc); - client_del(kc); -diff -pruN ./drivers/md.dm/linear.c ./drivers/md/linear.c ---- ./drivers/md.dm/linear.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/linear.c 2006-03-17 13:16:38.000000000 +0300 -@@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde - return hash->dev0; - } - -- - /** - * linear_mergeable_bvec -- tell bio layer if a two requests can be merged - * @q: request queue -@@ -93,13 +92,35 @@ static void linear_unplug(request_queue_ - } - } - -+static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ linear_conf_t *conf = mddev_to_conf(mddev); -+ int i, ret = 0; -+ -+ for (i=0; i < mddev->raid_disks; i++) { -+ struct block_device *bdev = conf->disks[i].rdev->bdev; -+ request_queue_t *r_queue = bdev_get_queue(bdev); -+ -+ if (!r_queue->issue_flush_fn) { -+ ret = -EOPNOTSUPP; -+ break; -+ } -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ return ret; -+} - - static int linear_run (mddev_t *mddev) - { - linear_conf_t *conf; - struct linear_hash *table; - mdk_rdev_t *rdev; -- int size, i, nb_zone, cnt; -+ int i, nb_zone, cnt; -+ sector_t size; - unsigned int curr_offset; - struct list_head *tmp; - -@@ -137,7 +158,7 @@ static int linear_run (mddev_t *mddev) - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) -- mddev->queue->max_sectors = (PAGE_SIZE>>9); -+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - - disk->size = rdev->size; - mddev->array_size += rdev->size; -@@ -200,6 +221,7 @@ static int linear_run (mddev_t *mddev) - - blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->unplug_fn = linear_unplug; -+ mddev->queue->issue_flush_fn = linear_issue_flush; - return 0; - - out: -@@ -247,10 +269,11 @@ static int linear_make_request (request_ - char b[BDEVNAME_SIZE]; - - printk("linear_make_request: Block %llu out of bounds on " -- "dev %s size %ld offset %ld\n", -+ "dev %s size %llu offset %llu\n", - (unsigned long long)block, - bdevname(tmp_dev->rdev->bdev, b), -- tmp_dev->size, tmp_dev->offset); -+ (unsigned long long)tmp_dev->size, -+ (unsigned long long)tmp_dev->offset); - bio_io_error(bio, bio->bi_size); - return 0; - } -diff -pruN ./drivers/md.dm/Makefile ./drivers/md/Makefile ---- ./drivers/md.dm/Makefile 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/Makefile 2006-03-17 13:16:38.000000000 +0300 -@@ -4,13 +4,16 @@ - - dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ - dm-ioctl.o dm-io.o kcopyd.o -+dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o - dm-snapshot-objs := dm-snap.o dm-exception-store.o - dm-mirror-objs := dm-log.o dm-raid1.o - raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ - raid6int1.o raid6int2.o raid6int4.o \ - raid6int8.o raid6int16.o raid6int32.o \ - raid6mmx.o raid6sse1.o raid6sse2.o --host-progs := mktables -+hostprogs-y := mktables -+ -+CFLAGS_raid6int8.o += -O2 - - # Note: link order is important. 
All raid personalities - # and xor.o must come before md.o, as they each initialise -@@ -20,12 +23,15 @@ host-progs := mktables - obj-$(CONFIG_MD_LINEAR) += linear.o - obj-$(CONFIG_MD_RAID0) += raid0.o - obj-$(CONFIG_MD_RAID1) += raid1.o -+obj-$(CONFIG_MD_RAID10) += raid10.o - obj-$(CONFIG_MD_RAID5) += raid5.o xor.o - obj-$(CONFIG_MD_RAID6) += raid6.o xor.o - obj-$(CONFIG_MD_MULTIPATH) += multipath.o - obj-$(CONFIG_BLK_DEV_MD) += md.o - obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o - obj-$(CONFIG_DM_CRYPT) += dm-crypt.o -+obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o -+obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o - obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o - obj-$(CONFIG_DM_MIRROR) += dm-mirror.o - obj-$(CONFIG_DM_ZERO) += dm-zero.o -diff -pruN ./drivers/md.dm/md.c ./drivers/md/md.c ---- ./drivers/md.dm/md.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/md.c 2006-03-17 13:22:09.000000000 +0300 -@@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN - tmp = tmp->next;}) \ - ) - -+int md_flush_mddev(mddev_t *mddev, sector_t *error_sector) -+{ -+ struct list_head *tmp; -+ mdk_rdev_t *rdev; -+ int ret = 0; -+ -+ /* -+ * this list iteration is done without any locking in md?! -+ */ -+ ITERATE_RDEV(mddev, rdev, tmp) { -+ request_queue_t *r_queue = bdev_get_queue(rdev->bdev); -+ int err; -+ -+ if (!r_queue->issue_flush_fn) -+ err = -EOPNOTSUPP; -+ else -+ err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector); -+ -+ if (!ret) -+ ret = err; -+ } -+ -+ return ret; -+} -+ -+static int md_flush_all(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ -+ return md_flush_mddev(mddev, error_sector); -+} -+ - static int md_fail_request (request_queue_t *q, struct bio *bio) - { - bio_io_error(bio, bio->bi_size); -@@ -331,29 +364,24 @@ static int bi_complete(struct bio *bio, - static int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw) - { -- struct bio bio; -- struct bio_vec vec; -+ struct bio *bio = bio_alloc(GFP_NOIO, 1); - struct completion event; -+ int ret; - - rw |= (1 << BIO_RW_SYNC); - -- bio_init(&bio); -- bio.bi_io_vec = &vec; -- vec.bv_page = page; -- vec.bv_len = size; -- vec.bv_offset = 0; -- bio.bi_vcnt = 1; -- bio.bi_idx = 0; -- bio.bi_size = size; -- bio.bi_bdev = bdev; -- bio.bi_sector = sector; -+ bio->bi_bdev = bdev; -+ bio->bi_sector = sector; -+ bio_add_page(bio, page, size, 0); - init_completion(&event); -- bio.bi_private = &event; -- bio.bi_end_io = bi_complete; -- submit_bio(rw, &bio); -+ bio->bi_private = &event; -+ bio->bi_end_io = bi_complete; -+ submit_bio(rw, bio); - wait_for_completion(&event); - -- return test_bit(BIO_UPTODATE, &bio.bi_flags); -+ ret = test_bit(BIO_UPTODATE, &bio->bi_flags); -+ bio_put(bio); -+ return ret; - } - - static int read_disk_sb(mdk_rdev_t * rdev) -@@ -373,7 +401,7 @@ static int read_disk_sb(mdk_rdev_t * rde - return 0; - - fail: -- printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", -+ printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", - bdevname(rdev->bdev,b)); - return -EINVAL; - } -@@ -439,6 +467,31 @@ static unsigned int calc_sb_csum(mdp_sup - return csum; - } - -+/* csum_partial is not consistent between different architectures. -+ * Some (i386) do a 32bit csum. Some (alpha) do 16 bit. -+ * This makes it hard for user-space to know what to do. 
-+ * So we use calc_sb_csum to set the checksum to allow working -+ * with older kernels, but allow calc_sb_csum_common to -+ * be used when checking if a checksum is correct, to -+ * make life easier for user-space tools that might write -+ * a superblock. -+ */ -+static unsigned int calc_sb_csum_common(mdp_super_t *super) -+{ -+ unsigned int disk_csum = super->sb_csum; -+ unsigned long long newcsum = 0; -+ unsigned int csum; -+ int i; -+ unsigned int *superc = (int*) super; -+ super->sb_csum = 0; -+ -+ for (i=0; i<MD_SB_BYTES/4; i++) -+ newcsum+= superc[i]; -+ csum = (newcsum& 0xffffffff) + (newcsum>>32); -+ super->sb_csum = disk_csum; -+ return csum; -+} -+ - /* - * Handle superblock details. - * We want to be able to handle multiple superblock formats -@@ -521,7 +574,8 @@ static int super_90_load(mdk_rdev_t *rde - if (sb->raid_disks <= 0) - goto abort; - -- if (calc_sb_csum(sb) != sb->sb_csum) { -+ if (calc_sb_csum(sb) != sb->sb_csum && -+ calc_sb_csum_common(sb) != sb->sb_csum) { - printk(KERN_WARNING "md: invalid superblock checksum on %s\n", - b); - goto abort; -@@ -530,7 +584,7 @@ static int super_90_load(mdk_rdev_t *rde - rdev->preferred_minor = sb->md_minor; - rdev->data_offset = 0; - -- if (sb->level == MULTIPATH) -+ if (sb->level == LEVEL_MULTIPATH) - rdev->desc_nr = -1; - else - rdev->desc_nr = sb->this_disk.number; -@@ -745,11 +799,21 @@ static void super_90_sync(mddev_t *mddev - static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) - { - unsigned int disk_csum, csum; -+ unsigned long long newcsum; - int size = 256 + sb->max_dev*2; -+ unsigned int *isuper = (unsigned int*)sb; -+ int i; - - disk_csum = sb->sb_csum; - sb->sb_csum = 0; -- csum = csum_partial((void *)sb, size, 0); -+ newcsum = 0; -+ for (i=0; size>=4; size -= 4 ) -+ newcsum += le32_to_cpu(*isuper++); -+ -+ if (size == 2) -+ newcsum += le16_to_cpu(*(unsigned short*) isuper); -+ -+ csum = (newcsum & 0xffffffff) + (newcsum >> 32); - sb->sb_csum = disk_csum; - return csum; - } -@@ -924,12 +988,12 @@ static void super_1_sync(mddev_t *mddev, - - max_dev = 0; - ITERATE_RDEV(mddev,rdev2,tmp) -- if (rdev2->desc_nr > max_dev) -- max_dev = rdev2->desc_nr; -+ if (rdev2->desc_nr+1 > max_dev) -+ max_dev = rdev2->desc_nr+1; - - sb->max_dev = max_dev; - for (i=0; i<max_dev;i++) -- sb->dev_roles[max_dev] = cpu_to_le16(0xfffe); -+ sb->dev_roles[i] = cpu_to_le16(0xfffe); - - ITERATE_RDEV(mddev,rdev2,tmp) { - i = rdev2->desc_nr; -@@ -942,6 +1006,7 @@ static void super_1_sync(mddev_t *mddev, - } - - sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ -+ sb->sb_csum = calc_sb_1_csum(sb); - } - - -@@ -1042,20 +1107,24 @@ static void unbind_rdev_from_array(mdk_r - /* - * prevent the device from being mounted, repartitioned or - * otherwise reused by a RAID array (or any other kernel -- * subsystem), by opening the device. [simply getting an -- * inode is not enough, the SCSI module usage code needs -- * an explicit open() on the device] -+ * subsystem), by bd_claiming the device. 
- */ - static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) - { - int err = 0; - struct block_device *bdev; -+ char b[BDEVNAME_SIZE]; - - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); -- if (IS_ERR(bdev)) -+ if (IS_ERR(bdev)) { -+ printk(KERN_ERR "md: could not open %s.\n", -+ __bdevname(dev, b)); - return PTR_ERR(bdev); -+ } - err = bd_claim(bdev, rdev); - if (err) { -+ printk(KERN_ERR "md: could not bd_claim %s.\n", -+ bdevname(bdev, b)); - blkdev_put(bdev); - return err; - } -@@ -1117,10 +1186,7 @@ static void export_array(mddev_t *mddev) - - static void print_desc(mdp_disk_t *desc) - { -- char b[BDEVNAME_SIZE]; -- -- printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, -- __bdevname(MKDEV(desc->major, desc->minor), b), -+ printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, - desc->major,desc->minor,desc->raid_disk,desc->state); - } - -@@ -1312,8 +1378,7 @@ static mdk_rdev_t *md_import_device(dev_ - - rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); - if (!rdev) { -- printk(KERN_ERR "md: could not alloc mem for %s!\n", -- __bdevname(newdev, b)); -+ printk(KERN_ERR "md: could not alloc mem for new device!\n"); - return ERR_PTR(-ENOMEM); - } - memset(rdev, 0, sizeof(*rdev)); -@@ -1322,11 +1387,9 @@ static mdk_rdev_t *md_import_device(dev_ - goto abort_free; - - err = lock_rdev(rdev, newdev); -- if (err) { -- printk(KERN_ERR "md: could not lock %s.\n", -- __bdevname(newdev, b)); -+ if (err) - goto abort_free; -- } -+ - rdev->desc_nr = -1; - rdev->faulty = 0; - rdev->in_sync = 0; -@@ -1436,9 +1499,8 @@ static int analyze_sbs(mddev_t * mddev) - goto abort; - } - -- if ((mddev->recovery_cp != MaxSector) && -- ((mddev->level == 1) || -- ((mddev->level >= 4) && (mddev->level <= 6)))) -+ if (mddev->recovery_cp != MaxSector && -+ mddev->level >= 1) - printk(KERN_ERR "md: %s: raid array is not clean" - " -- starting background reconstruction\n", - mdname(mddev)); -@@ -1615,6 +1677,8 @@ static int do_md_run(mddev_t * mddev) - mddev->pers = pers[pnum]; - spin_unlock(&pers_lock); - -+ mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ -+ - err = mddev->pers->run(mddev); - if (err) { - printk(KERN_ERR "md: pers->run() failed ...\n"); -@@ -1645,6 +1709,7 @@ static int do_md_run(mddev_t * mddev) - */ - mddev->queue->queuedata = mddev; - mddev->queue->make_request_fn = mddev->pers->make_request; -+ mddev->queue->issue_flush_fn = md_flush_all; - - mddev->changed = 1; - return 0; -@@ -1881,11 +1946,9 @@ static int autostart_array(dev_t startde - mdk_rdev_t *start_rdev = NULL, *rdev; - - start_rdev = md_import_device(startdev, 0, 0); -- if (IS_ERR(start_rdev)) { -- printk(KERN_WARNING "md: could not import %s!\n", -- __bdevname(startdev, b)); -+ if (IS_ERR(start_rdev)) - return err; -- } -+ - - /* NOTE: this can only work for 0.90.0 superblocks */ - sb = (mdp_super_t*)page_address(start_rdev->sb_page); -@@ -1916,12 +1979,9 @@ static int autostart_array(dev_t startde - if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) - continue; - rdev = md_import_device(dev, 0, 0); -- if (IS_ERR(rdev)) { -- printk(KERN_WARNING "md: could not import %s," -- " trying to run array nevertheless.\n", -- __bdevname(dev, b)); -+ if (IS_ERR(rdev)) - continue; -- } -+ - list_add(&rdev->same_set, &pending_raid_disks); - } - -@@ -2153,42 +2213,6 @@ static int add_new_disk(mddev_t * mddev, - return 0; - } - --static int hot_generate_error(mddev_t * mddev, dev_t dev) --{ -- char b[BDEVNAME_SIZE]; -- struct request_queue *q; -- mdk_rdev_t *rdev; -- -- if (!mddev->pers) -- 
return -ENODEV; -- -- printk(KERN_INFO "md: trying to generate %s error in %s ... \n", -- __bdevname(dev, b), mdname(mddev)); -- -- rdev = find_rdev(mddev, dev); -- if (!rdev) { -- /* MD_BUG(); */ /* like hell - it's not a driver bug */ -- return -ENXIO; -- } -- -- if (rdev->desc_nr == -1) { -- MD_BUG(); -- return -EINVAL; -- } -- if (!rdev->in_sync) -- return -ENODEV; -- -- q = bdev_get_queue(rdev->bdev); -- if (!q) { -- MD_BUG(); -- return -ENODEV; -- } -- printk(KERN_INFO "md: okay, generating error!\n"); --// q->oneshot_error = 1; // disabled for now -- -- return 0; --} -- - static int hot_remove_disk(mddev_t * mddev, dev_t dev) - { - char b[BDEVNAME_SIZE]; -@@ -2197,9 +2221,6 @@ static int hot_remove_disk(mddev_t * mdd - if (!mddev->pers) - return -ENODEV; - -- printk(KERN_INFO "md: trying to remove %s from %s ... \n", -- __bdevname(dev, b), mdname(mddev)); -- - rdev = find_rdev(mddev, dev); - if (!rdev) - return -ENXIO; -@@ -2227,9 +2248,6 @@ static int hot_add_disk(mddev_t * mddev, - if (!mddev->pers) - return -ENODEV; - -- printk(KERN_INFO "md: trying to hot-add %s to %s ... \n", -- __bdevname(dev, b), mdname(mddev)); -- - if (mddev->major_version != 0) { - printk(KERN_WARNING "%s: HOT_ADD may only be used with" - " version-0 superblocks.\n", -@@ -2478,6 +2496,9 @@ static int set_disk_faulty(mddev_t *mdde - { - mdk_rdev_t *rdev; - -+ if (mddev->pers == NULL) -+ return -ENODEV; -+ - rdev = find_rdev(mddev, dev); - if (!rdev) - return -ENODEV; -@@ -2489,7 +2510,6 @@ static int set_disk_faulty(mddev_t *mdde - static int md_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) - { -- char b[BDEVNAME_SIZE]; - int err = 0; - void __user *argp = (void __user *)arg; - struct hd_geometry __user *loc = argp; -@@ -2548,8 +2568,7 @@ static int md_ioctl(struct inode *inode, - } - err = autostart_array(new_decode_dev(arg)); - if (err) { -- printk(KERN_WARNING "md: autostart %s failed!\n", -- __bdevname(arg, b)); -+ printk(KERN_WARNING "md: autostart failed!\n"); - goto abort; - } - goto done; -@@ -2690,9 +2709,7 @@ static int md_ioctl(struct inode *inode, - err = add_new_disk(mddev, &info); - goto done_unlock; - } -- case HOT_GENERATE_ERROR: -- err = hot_generate_error(mddev, new_decode_dev(arg)); -- goto done_unlock; -+ - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; -@@ -2876,7 +2893,7 @@ mdk_thread_t *md_register_thread(void (* - return thread; - } - --void md_interrupt_thread(mdk_thread_t *thread) -+static void md_interrupt_thread(mdk_thread_t *thread) - { - if (!thread->tsk) { - MD_BUG(); -@@ -2919,6 +2936,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t - if (!mddev->pers->error_handler) - return; - mddev->pers->error_handler(mddev,rdev); -+ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - } -@@ -2951,7 +2969,11 @@ static void status_resync(struct seq_fil - unsigned long max_blocks, resync, res, dt, db, rt; - - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; -- max_blocks = mddev->size; -+ -+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) -+ max_blocks = mddev->resync_max_sectors >> 1; -+ else -+ max_blocks = mddev->size; - - /* - * Should not happen. 
-@@ -3187,11 +3209,6 @@ int unregister_md_personality(int pnum)
- 	return 0;
- }
- 
--void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
--{
--	rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
--}
--
- static int is_mddev_idle(mddev_t *mddev)
- {
- 	mdk_rdev_t * rdev;
-@@ -3204,8 +3221,12 @@ static int is_mddev_idle(mddev_t *mddev)
- 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
- 		curr_events = disk_stat_read(disk, read_sectors) + 
- 			      disk_stat_read(disk, write_sectors) - 
--			      disk->sync_io;
--		if ((curr_events - rdev->last_events) > 32) {
-+			      atomic_read(&disk->sync_io);
-+		/* Allow some slack between value of curr_events and last_events,
-+		 * as there are some uninteresting races.
-+		 * Note: the following is an unsigned comparison.
-+		 */
-+		if ((curr_events - rdev->last_events + 32) > 64) {
- 			rdev->last_events = curr_events;
- 			idle = 0;
- 		}
-@@ -3339,7 +3360,14 @@ static void md_do_sync(mddev_t *mddev)
- 		}
- 	} while (mddev->curr_resync < 2);
- 
--	max_sectors = mddev->size << 1;
-+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-+		/* resync follows the size requested by the personality,
-+		 * which defaults to physical size, but can be virtual size
-+		 */
-+		max_sectors = mddev->resync_max_sectors;
-+	else
-+		/* recovery follows the physical size of devices */
-+		max_sectors = mddev->size << 1;
- 
- 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
- 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-@@ -3372,10 +3400,12 @@ static void md_do_sync(mddev_t *mddev)
- 	init_waitqueue_head(&mddev->recovery_wait);
- 	last_check = 0;
- 
--	if (j)
-+	if (j>2) {
- 		printk(KERN_INFO 
- 		       "md: resuming recovery of %s from checkpoint.\n",
- 		       mdname(mddev));
-+		mddev->curr_resync = j;
-+	}
- 
- 	while (j < max_sectors) {
- 		int sectors;
-@@ -3458,7 +3488,7 @@ static void md_do_sync(mddev_t *mddev)
- 
- 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
- 	    mddev->curr_resync > 2 &&
--	    mddev->curr_resync > mddev->recovery_cp) {
-+	    mddev->curr_resync >= mddev->recovery_cp) {
- 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- 			printk(KERN_INFO
- 			       "md: checkpointing recovery of %s.\n",
-@@ -3697,7 +3727,6 @@ void md_autodetect_dev(dev_t dev)
- 
- static void autostart_arrays(int part)
- {
--	char b[BDEVNAME_SIZE];
- 	mdk_rdev_t *rdev;
- 	int i;
- 
-@@ -3707,11 +3736,9 @@ static void autostart_arrays(int part)
- 		dev_t dev = detected_devices[i];
- 
- 		rdev = md_import_device(dev,0, 0);
--		if (IS_ERR(rdev)) {
--			printk(KERN_ALERT "md: could not import %s!\n",
--				__bdevname(dev, b));
-+		if (IS_ERR(rdev))
- 			continue;
--		}
-+
- 		if (rdev->faulty) {
- 			MD_BUG();
- 			continue;
-@@ -3762,7 +3789,6 @@ module_exit(md_exit)
- EXPORT_SYMBOL(register_md_personality);
- EXPORT_SYMBOL(unregister_md_personality);
- EXPORT_SYMBOL(md_error);
--EXPORT_SYMBOL(md_sync_acct);
- EXPORT_SYMBOL(md_done_sync);
- EXPORT_SYMBOL(md_write_start);
- EXPORT_SYMBOL(md_write_end);
-@@ -3771,6 +3797,5 @@ EXPORT_SYMBOL(md_register_thread);
- EXPORT_SYMBOL(md_unregister_thread);
- EXPORT_SYMBOL(md_wakeup_thread);
- EXPORT_SYMBOL(md_print_devices);
---EXPORT_SYMBOL(md_interrupt_thread);
- EXPORT_SYMBOL(md_check_recovery);
- MODULE_LICENSE("GPL");
-diff -pruN ./drivers/md.dm/multipath.c ./drivers/md/multipath.c
---- ./drivers/md.dm/multipath.c	2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/multipath.c	2006-03-17 13:16:38.000000000 +0300
-@@ -99,12 +99,12 @@ static void multipath_reschedule_retry (
-  * operation and are ready to return a success/failure code to the buffer
-  * cache layer. 
- */ --static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate) -+static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) - { - struct bio *bio = mp_bh->master_bio; - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); - -- bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO); -+ bio_endio(bio, bio->bi_size, err); - mempool_free(mp_bh, conf->pool); - } - -@@ -119,8 +119,8 @@ int multipath_end_request(struct bio *bi - return 1; - - if (uptodate) -- multipath_end_bh_io(mp_bh, uptodate); -- else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) { -+ multipath_end_bh_io(mp_bh, 0); -+ else if (!bio_rw_ahead(bio)) { - /* - * oops, IO error: - */ -@@ -131,7 +131,7 @@ int multipath_end_request(struct bio *bi - (unsigned long long)bio->bi_sector); - multipath_reschedule_retry(mp_bh); - } else -- multipath_end_bh_io(mp_bh, 0); -+ multipath_end_bh_io(mp_bh, error); - rdev_dec_pending(rdev, conf->mddev); - return 0; - } -@@ -155,7 +155,7 @@ static void unplug_slaves(mddev_t *mddev - r_queue->unplug_fn(r_queue); - - spin_lock_irqsave(&conf->device_lock, flags); -- atomic_dec(&rdev->nr_pending); -+ rdev_dec_pending(rdev, mddev); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -@@ -217,6 +217,31 @@ static void multipath_status (struct seq - seq_printf (seq, "]"); - } - -+static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ multipath_conf_t *conf = mddev_to_conf(mddev); -+ int i, ret = 0; -+ -+ for (i=0; i<mddev->raid_disks; i++) { -+ mdk_rdev_t *rdev = conf->multipaths[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue = bdev_get_queue(bdev); -+ -+ if (!r_queue->issue_flush_fn) { -+ ret = -EOPNOTSUPP; -+ break; -+ } -+ -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ return ret; -+} - - /* - * Careful, this can execute in IRQ contexts as well! 
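
The multipath_issue_flush() hunk above, like the raid0 version added later in this patch, simply propagates a flush to every healthy member device: if any healthy member's queue lacks an issue_flush_fn hook, the whole call fails with -EOPNOTSUPP; otherwise the first member failure ends the walk and is returned. (The raid10 variant further below is slightly more lenient and silently skips members without the hook.) As a stand-alone user-space sketch of that propagation rule, where the path type and its flush callback are invented stand-ins for rdev/request_queue rather than kernel API:

#include <errno.h>
#include <stdio.h>

/* Invented stand-in for one member device and its request queue. */
typedef struct path {
	int faulty;			/* failed members are skipped, as in the hunk above */
	int (*flush)(struct path *p);	/* may be NULL, like a missing issue_flush_fn */
} path_t;

/* Return 0 only if every healthy path flushes cleanly; -EOPNOTSUPP if a
 * healthy path cannot flush at all; otherwise the first error seen. */
static int flush_all_paths(path_t *paths, int npaths)
{
	int i, ret = 0;

	for (i = 0; i < npaths; i++) {
		if (paths[i].faulty)
			continue;
		if (!paths[i].flush)
			return -EOPNOTSUPP;
		ret = paths[i].flush(&paths[i]);
		if (ret)
			break;
	}
	return ret;
}

static int ok_flush(struct path *p) { (void)p; return 0; }

int main(void)
{
	/* the middle member is faulty and flush-less; it must be skipped */
	path_t paths[] = { { 0, ok_flush }, { 1, NULL }, { 0, ok_flush } };

	printf("flush_all_paths -> %d\n", flush_all_paths(paths, 3));
	return 0;
}

Run, this prints 0: a faulty member never vetoes the flush, which mirrors the rdev->faulty test in the kernel code.
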
-@@ -300,7 +325,7 @@ static int multipath_add_disk(mddev_t *m
- 	 */
- 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
--			mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- 
- 		conf->working_disks++;
- 		rdev->raid_disk = path;
-@@ -377,7 +402,7 @@ static void multipathd (mddev_t *mddev)
- 				" error for block %llu\n",
- 				bdevname(bio->bi_bdev,b),
- 				(unsigned long long)bio->bi_sector);
--			multipath_end_bh_io(mp_bh, 0);
-+			multipath_end_bh_io(mp_bh, -EIO);
- 		} else {
- 			printk(KERN_ERR "multipath: %s: redirecting sector %llu"
- 				" to another IO path\n",
-@@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev
- 
- 	mddev->queue->unplug_fn = multipath_unplug;
- 
-+	mddev->queue->issue_flush_fn = multipath_issue_flush;
-+
- 	conf->working_disks = 0;
- 	ITERATE_RDEV(mddev,rdev,tmp) {
- 		disk_idx = rdev->raid_disk;
-@@ -452,7 +479,7 @@ static int multipath_run (mddev_t *mddev
- 		 * a merge_bvec_fn to be involved in multipath */
- 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
--			mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- 
- 		if (!rdev->faulty)
- 			conf->working_disks++;
-diff -pruN ./drivers/md.dm/raid0.c ./drivers/md/raid0.c
---- ./drivers/md.dm/raid0.c	2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/raid0.c	2006-03-17 13:16:38.000000000 +0300
-@@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t
- 	}
- }
- 
-+static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
-+			     sector_t *error_sector)
-+{
-+	mddev_t *mddev = q->queuedata;
-+	raid0_conf_t *conf = mddev_to_conf(mddev);
-+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
-+	int i, ret = 0;
-+
-+	for (i=0; i<mddev->raid_disks; i++) {
-+		struct block_device *bdev = devlist[i]->bdev;
-+		request_queue_t *r_queue = bdev_get_queue(bdev);
-+
-+		if (!r_queue->issue_flush_fn) {
-+			ret = -EOPNOTSUPP;
-+			break;
-+		}
-+
-+		ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+		if (ret)
-+			break;
-+	}
-+	return ret;
-+}
-+
-+
- static int create_strip_zones (mddev_t *mddev)
- {
- 	int i, c, j;
-@@ -137,7 +162,7 @@ static int create_strip_zones (mddev_t *
- 
- 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
- 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
--			mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- 
- 		if (!smallest || (rdev1->size <smallest->size))
- 			smallest = rdev1;
-@@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t *
- 
- 	mddev->queue->unplug_fn = raid0_unplug;
- 
-+	mddev->queue->issue_flush_fn = raid0_issue_flush;
-+
- 	printk("raid0: done.\n");
- 	return 0;
- abort:
-diff -pruN ./drivers/md.dm/raid10.c ./drivers/md/raid10.c
---- ./drivers/md.dm/raid10.c	1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/raid10.c	2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,1780 @@
-+/*
-+ * raid10.c : Multiple Devices driver for Linux
-+ *
-+ * Copyright (C) 2000-2004 Neil Brown
-+ *
-+ * RAID-10 support for md.
-+ *
-+ * Based on code in raid1.c. See raid1.c for further copyright information.
-+ *
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2, or (at your option)
-+ * any later version. 
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * (for example /usr/src/linux/COPYING); if not, write to the Free
-+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-+ */
-+
-+#include <linux/raid/raid10.h>
-+
-+/*
-+ * RAID10 provides a combination of RAID0 and RAID1 functionality.
-+ * The layout of data is defined by
-+ *    chunk_size
-+ *    raid_disks
-+ *    near_copies (stored in low byte of layout)
-+ *    far_copies (stored in second byte of layout)
-+ *
-+ * The data to be stored is divided into chunks using chunksize.
-+ * Each device is divided into far_copies sections.
-+ * In each section, chunks are laid out in a style similar to raid0, but
-+ * near_copies copies of each chunk are stored (each on a different drive).
-+ * The starting device for each section is offset near_copies from the starting
-+ * device of the previous section.
-+ * Thus there are (near_copies*far_copies) of each chunk, and each is on a different
-+ * drive.
-+ * near_copies and far_copies must be at least one, and their product is at most
-+ * raid_disks.
-+ */
-+
-+/*
-+ * Number of guaranteed r10bios in case of extreme VM load:
-+ */
-+#define NR_RAID10_BIOS 256
-+
-+static void unplug_slaves(mddev_t *mddev);
-+
-+static void * r10bio_pool_alloc(int gfp_flags, void *data)
-+{
-+	conf_t *conf = data;
-+	r10bio_t *r10_bio;
-+	int size = offsetof(struct r10bio_s, devs[conf->copies]);
-+
-+	/* allocate a r10bio with room for raid_disks entries in the bios array */
-+	r10_bio = kmalloc(size, gfp_flags);
-+	if (r10_bio)
-+		memset(r10_bio, 0, size);
-+	else
-+		unplug_slaves(conf->mddev);
-+
-+	return r10_bio;
-+}
-+
-+static void r10bio_pool_free(void *r10_bio, void *data)
-+{
-+	kfree(r10_bio);
-+}
-+
-+#define RESYNC_BLOCK_SIZE (64*1024)
-+//#define RESYNC_BLOCK_SIZE PAGE_SIZE
-+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-+#define RESYNC_WINDOW (2048*1024)
-+
-+/*
-+ * When performing a resync, we need to read and compare, so
-+ * we need as many pages as there are copies.
-+ * When performing a recovery, we need 2 bios, one for read,
-+ * one for write (we recover only one drive per r10buf)
-+ *
-+ */
-+static void * r10buf_pool_alloc(int gfp_flags, void *data)
-+{
-+	conf_t *conf = data;
-+	struct page *page;
-+	r10bio_t *r10_bio;
-+	struct bio *bio;
-+	int i, j;
-+	int nalloc;
-+
-+	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-+	if (!r10_bio) {
-+		unplug_slaves(conf->mddev);
-+		return NULL;
-+	}
-+
-+	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
-+		nalloc = conf->copies; /* resync */
-+	else
-+		nalloc = 2; /* recovery */
-+
-+	/*
-+	 * Allocate bios.
-+	 */
-+	for (j = nalloc ; j-- ; ) {
-+		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
-+		if (!bio)
-+			goto out_free_bio;
-+		r10_bio->devs[j].bio = bio;
-+	}
-+	/*
-+	 * Allocate RESYNC_PAGES data pages and attach them
-+	 * where needed. 
-+ */ -+ for (j = 0 ; j < nalloc; j++) { -+ bio = r10_bio->devs[j].bio; -+ for (i = 0; i < RESYNC_PAGES; i++) { -+ page = alloc_page(gfp_flags); -+ if (unlikely(!page)) -+ goto out_free_pages; -+ -+ bio->bi_io_vec[i].bv_page = page; -+ } -+ } -+ -+ return r10_bio; -+ -+out_free_pages: -+ for ( ; i > 0 ; i--) -+ __free_page(bio->bi_io_vec[i-1].bv_page); -+ while (j--) -+ for (i = 0; i < RESYNC_PAGES ; i++) -+ __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); -+ j = -1; -+out_free_bio: -+ while ( ++j < nalloc ) -+ bio_put(r10_bio->devs[j].bio); -+ r10bio_pool_free(r10_bio, conf); -+ return NULL; -+} -+ -+static void r10buf_pool_free(void *__r10_bio, void *data) -+{ -+ int i; -+ conf_t *conf = data; -+ r10bio_t *r10bio = __r10_bio; -+ int j; -+ -+ for (j=0; j < conf->copies; j++) { -+ struct bio *bio = r10bio->devs[j].bio; -+ if (bio) { -+ for (i = 0; i < RESYNC_PAGES; i++) { -+ __free_page(bio->bi_io_vec[i].bv_page); -+ bio->bi_io_vec[i].bv_page = NULL; -+ } -+ bio_put(bio); -+ } -+ } -+ r10bio_pool_free(r10bio, conf); -+} -+ -+static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) -+{ -+ int i; -+ -+ for (i = 0; i < conf->copies; i++) { -+ struct bio **bio = & r10_bio->devs[i].bio; -+ if (*bio) -+ bio_put(*bio); -+ *bio = NULL; -+ } -+} -+ -+static inline void free_r10bio(r10bio_t *r10_bio) -+{ -+ unsigned long flags; -+ -+ conf_t *conf = mddev_to_conf(r10_bio->mddev); -+ -+ /* -+ * Wake up any possible resync thread that waits for the device -+ * to go idle. -+ */ -+ spin_lock_irqsave(&conf->resync_lock, flags); -+ if (!--conf->nr_pending) { -+ wake_up(&conf->wait_idle); -+ wake_up(&conf->wait_resume); -+ } -+ spin_unlock_irqrestore(&conf->resync_lock, flags); -+ -+ put_all_bios(conf, r10_bio); -+ mempool_free(r10_bio, conf->r10bio_pool); -+} -+ -+static inline void put_buf(r10bio_t *r10_bio) -+{ -+ conf_t *conf = mddev_to_conf(r10_bio->mddev); -+ unsigned long flags; -+ -+ mempool_free(r10_bio, conf->r10buf_pool); -+ -+ spin_lock_irqsave(&conf->resync_lock, flags); -+ if (!conf->barrier) -+ BUG(); -+ --conf->barrier; -+ wake_up(&conf->wait_resume); -+ wake_up(&conf->wait_idle); -+ -+ if (!--conf->nr_pending) { -+ wake_up(&conf->wait_idle); -+ wake_up(&conf->wait_resume); -+ } -+ spin_unlock_irqrestore(&conf->resync_lock, flags); -+} -+ -+static void reschedule_retry(r10bio_t *r10_bio) -+{ -+ unsigned long flags; -+ mddev_t *mddev = r10_bio->mddev; -+ conf_t *conf = mddev_to_conf(mddev); -+ -+ spin_lock_irqsave(&conf->device_lock, flags); -+ list_add(&r10_bio->retry_list, &conf->retry_list); -+ spin_unlock_irqrestore(&conf->device_lock, flags); -+ -+ md_wakeup_thread(mddev->thread); -+} -+ -+/* -+ * raid_end_bio_io() is called when we have finished servicing a mirrored -+ * operation and are ready to return a success/failure code to the buffer -+ * cache layer. -+ */ -+static void raid_end_bio_io(r10bio_t *r10_bio) -+{ -+ struct bio *bio = r10_bio->master_bio; -+ -+ bio_endio(bio, bio->bi_size, -+ test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); -+ free_r10bio(r10_bio); -+} -+ -+/* -+ * Update disk head position estimator based on IRQ completion info. 
-+ */
-+static inline void update_head_pos(int slot, r10bio_t *r10_bio)
-+{
-+	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
-+		r10_bio->devs[slot].addr + (r10_bio->sectors);
-+}
-+
-+static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+	int slot, dev;
-+	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+	if (bio->bi_size)
-+		return 1;
-+
-+	slot = r10_bio->read_slot;
-+	dev = r10_bio->devs[slot].devnum;
-+	/*
-+	 * this branch is our 'one mirror IO has finished' event handler:
-+	 */
-+	if (!uptodate)
-+		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-+	else
-+		/*
-+		 * Set R10BIO_Uptodate in our master bio, so that
-+		 * we will return a good error code to the higher
-+		 * levels even if IO on some other mirrored buffer fails.
-+		 *
-+		 * The 'master' represents the composite IO operation to
-+		 * user-side. So if something waits for IO, then it will
-+		 * wait for the 'master' bio.
-+		 */
-+		set_bit(R10BIO_Uptodate, &r10_bio->state);
-+
-+	update_head_pos(slot, r10_bio);
-+
-+	/*
-+	 * we have only one bio on the read side
-+	 */
-+	if (uptodate)
-+		raid_end_bio_io(r10_bio);
-+	else {
-+		/*
-+		 * oops, read error:
-+		 */
-+		char b[BDEVNAME_SIZE];
-+		if (printk_ratelimit())
-+			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
-+			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
-+		reschedule_retry(r10_bio);
-+	}
-+
-+	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
-+	return 0;
-+}
-+
-+static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+	int slot, dev;
-+	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+	if (bio->bi_size)
-+		return 1;
-+
-+	for (slot = 0; slot < conf->copies; slot++)
-+		if (r10_bio->devs[slot].bio == bio)
-+			break;
-+	dev = r10_bio->devs[slot].devnum;
-+
-+	/*
-+	 * this branch is our 'one mirror IO has finished' event handler:
-+	 */
-+	if (!uptodate)
-+		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-+	else
-+		/*
-+		 * Set R10BIO_Uptodate in our master bio, so that
-+		 * we will return a good error code to the higher
-+		 * levels even if IO on some other mirrored buffer fails.
-+		 *
-+		 * The 'master' represents the composite IO operation to
-+		 * user-side. So if something waits for IO, then it will
-+		 * wait for the 'master' bio.
-+		 */
-+		set_bit(R10BIO_Uptodate, &r10_bio->state);
-+
-+	update_head_pos(slot, r10_bio);
-+
-+	/*
-+	 *
-+	 * Let's see if all mirrored write operations have finished
-+	 * already.
-+	 */
-+	if (atomic_dec_and_test(&r10_bio->remaining)) {
-+		md_write_end(r10_bio->mddev);
-+		raid_end_bio_io(r10_bio);
-+	}
-+
-+	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
-+	return 0;
-+}
-+
-+
-+/*
-+ * RAID10 layout manager
-+ * As well as the chunksize and raid_disks count, there are two
-+ * parameters: near_copies and far_copies.
-+ * near_copies * far_copies must be <= raid_disks.
-+ * Normally one of these will be 1.
-+ * If both are 1, we get raid0.
-+ * If near_copies == raid_disks, we get raid1.
-+ *
-+ * Chunks are laid out in raid0 style with near_copies copies of the
-+ * first chunk, followed by near_copies copies of the next chunk and
-+ * so on. 
-+ * If far_copies > 1, then after 1/far_copies of the array has been assigned
-+ * as described above, we start again with a device offset of near_copies.
-+ * So we effectively have another copy of the whole array further down all
-+ * the drives, but with blocks on different drives.
-+ * With this layout, a block is never stored twice on any one device.
-+ *
-+ * raid10_find_phys finds the sector offset of a given virtual sector
-+ * on each device that it is on. If a block isn't on a device,
-+ * that entry in the array is set to MaxSector.
-+ *
-+ * raid10_find_virt does the reverse mapping, from a device and a
-+ * sector offset to a virtual address
-+ */
-+
-+static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
-+{
-+	int n,f;
-+	sector_t sector;
-+	sector_t chunk;
-+	sector_t stripe;
-+	int dev;
-+
-+	int slot = 0;
-+
-+	/* now calculate first sector/dev */
-+	chunk = r10bio->sector >> conf->chunk_shift;
-+	sector = r10bio->sector & conf->chunk_mask;
-+
-+	chunk *= conf->near_copies;
-+	stripe = chunk;
-+	dev = sector_div(stripe, conf->raid_disks);
-+
-+	sector += stripe << conf->chunk_shift;
-+
-+	/* and calculate all the others */
-+	for (n=0; n < conf->near_copies; n++) {
-+		int d = dev;
-+		sector_t s = sector;
-+		r10bio->devs[slot].addr = sector;
-+		r10bio->devs[slot].devnum = d;
-+		slot++;
-+
-+		for (f = 1; f < conf->far_copies; f++) {
-+			d += conf->near_copies;
-+			if (d >= conf->raid_disks)
-+				d -= conf->raid_disks;
-+			s += conf->stride;
-+			r10bio->devs[slot].devnum = d;
-+			r10bio->devs[slot].addr = s;
-+			slot++;
-+		}
-+		dev++;
-+		if (dev >= conf->raid_disks) {
-+			dev = 0;
-+			sector += (conf->chunk_mask + 1);
-+		}
-+	}
-+	BUG_ON(slot != conf->copies);
-+}
-+
-+static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
-+{
-+	sector_t offset, chunk, vchunk;
-+
-+	while (sector > conf->stride) {
-+		sector -= conf->stride;
-+		if (dev < conf->near_copies)
-+			dev += conf->raid_disks - conf->near_copies;
-+		else
-+			dev -= conf->near_copies;
-+	}
-+
-+	offset = sector & conf->chunk_mask;
-+	chunk = sector >> conf->chunk_shift;
-+	vchunk = chunk * conf->raid_disks + dev;
-+	sector_div(vchunk, conf->near_copies);
-+	return (vchunk << conf->chunk_shift) + offset;
-+}
-+
-+/**
-+ * raid10_mergeable_bvec -- tell bio layer if two requests can be merged
-+ * @q: request queue
-+ * @bio: the buffer head that's been built up so far
-+ * @biovec: the request that could be merged to it.
-+ *
-+ * Return amount of bytes we can accept at this offset
-+ * If near_copies == raid_disks, there are no striping issues,
-+ * but in that case, the function isn't called at all.
-+ */
-+static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio,
-+				struct bio_vec *bio_vec)
-+{
-+	mddev_t *mddev = q->queuedata;
-+	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
-+	int max;
-+	unsigned int chunk_sectors = mddev->chunk_size >> 9;
-+	unsigned int bio_sectors = bio->bi_size >> 9;
-+
-+	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
-+	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
-+	if (max <= bio_vec->bv_len && bio_sectors == 0)
-+		return bio_vec->bv_len;
-+	else
-+		return max;
-+}
-+
-+/*
-+ * This routine returns the disk from which the requested read should
-+ * be done. There is a per-array 'next expected sequential IO' sector
-+ * number - if this matches on the next IO then we use the last disk. 
-+ * There is also a per-disk 'last known head position' sector that is
-+ * maintained from IRQ contexts, both the normal and the resync IO
-+ * completion handlers update this position correctly. If there is no
-+ * perfect sequential match then we pick the disk whose head is closest.
-+ *
-+ * If there are 2 mirrors in the same 2 devices, performance degrades
-+ * because position is mirror, not device based.
-+ *
-+ * The rdev for the device selected will have nr_pending incremented.
-+ */
-+
-+/*
-+ * FIXME: possibly should rethink readbalancing and do it differently
-+ * depending on near_copies / far_copies geometry.
-+ */
-+static int read_balance(conf_t *conf, r10bio_t *r10_bio)
-+{
-+	const unsigned long this_sector = r10_bio->sector;
-+	int disk, slot, nslot;
-+	const int sectors = r10_bio->sectors;
-+	sector_t new_distance, current_distance;
-+
-+	raid10_find_phys(conf, r10_bio);
-+	spin_lock_irq(&conf->device_lock);
-+	/*
-+	 * Check if we can balance. We can balance on the whole
-+	 * device if no resync is going on, or below the resync window.
-+	 * We take the first readable disk when above the resync window.
-+	 */
-+	if (conf->mddev->recovery_cp < MaxSector
-+	    && (this_sector + sectors >= conf->next_resync)) {
-+		/* make sure that disk is operational */
-+		slot = 0;
-+		disk = r10_bio->devs[slot].devnum;
-+
-+		while (!conf->mirrors[disk].rdev ||
-+		       !conf->mirrors[disk].rdev->in_sync) {
-+			slot++;
-+			if (slot == conf->copies) {
-+				slot = 0;
-+				disk = -1;
-+				break;
-+			}
-+			disk = r10_bio->devs[slot].devnum;
-+		}
-+		goto rb_out;
-+	}
-+
-+
-+	/* make sure the disk is operational */
-+	slot = 0;
-+	disk = r10_bio->devs[slot].devnum;
-+	while (!conf->mirrors[disk].rdev ||
-+	       !conf->mirrors[disk].rdev->in_sync) {
-+		slot++;
-+		if (slot == conf->copies) {
-+			disk = -1;
-+			goto rb_out;
-+		}
-+		disk = r10_bio->devs[slot].devnum;
-+	}
-+
-+
-+	current_distance = abs(this_sector - conf->mirrors[disk].head_position);
-+
-+	/* Find the disk whose head is closest */
-+
-+	for (nslot = slot; nslot < conf->copies; nslot++) {
-+		int ndisk = r10_bio->devs[nslot].devnum;
-+
-+
-+		if (!conf->mirrors[ndisk].rdev ||
-+		    !conf->mirrors[ndisk].rdev->in_sync)
-+			continue;
-+
-+		if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) {
-+			disk = ndisk;
-+			slot = nslot;
-+			break;
-+		}
-+		new_distance = abs(r10_bio->devs[nslot].addr -
-+				   conf->mirrors[ndisk].head_position);
-+		if (new_distance < current_distance) {
-+			current_distance = new_distance;
-+			disk = ndisk;
-+			slot = nslot;
-+		}
-+	}
-+
-+rb_out:
-+	r10_bio->read_slot = slot;
-+/*	conf->next_seq_sect = this_sector + sectors;*/
-+
-+	if (disk >= 0 && conf->mirrors[disk].rdev)
-+		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-+	spin_unlock_irq(&conf->device_lock);
-+
-+	return disk;
-+}
-+
-+static void unplug_slaves(mddev_t *mddev)
-+{
-+	conf_t *conf = mddev_to_conf(mddev);
-+	int i;
-+	unsigned long flags;
-+
-+	spin_lock_irqsave(&conf->device_lock, flags);
-+	for (i=0; i<mddev->raid_disks; i++) {
-+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
-+		if (rdev && atomic_read(&rdev->nr_pending)) {
-+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
-+
-+			atomic_inc(&rdev->nr_pending);
-+			spin_unlock_irqrestore(&conf->device_lock, flags);
-+
-+			if (r_queue->unplug_fn)
-+				r_queue->unplug_fn(r_queue);
-+
-+			spin_lock_irqsave(&conf->device_lock, flags);
-+			rdev_dec_pending(rdev, mddev);
-+		}
-+	}
-+	spin_unlock_irqrestore(&conf->device_lock, flags);
-+}
-+static void raid10_unplug(request_queue_t *q)
-+{
-+	unplug_slaves(q->queuedata);
-+}
-+
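
The layout and read-balance comments above compress a fair amount of address arithmetic, so here is a stand-alone user-space sketch that replays raid10_find_phys()'s mapping for one virtual sector. All parameter values are invented for the example (the kernel reads near/far copies from mddev->layout and derives stride from the device size in run(); a chunk-aligned per-device size is assumed here):

#include <stdio.h>
#include <stdint.h>

#define RAID_DISKS  4
#define NEAR_COPIES 2
#define FAR_COPIES  2
#define CHUNK_SHIFT 7			/* 128-sector (64KiB) chunks */
#define CHUNK_MASK  ((1ULL << CHUNK_SHIFT) - 1)
#define DEV_SECTORS (1ULL << 20)	/* per-device size in sectors */

int main(void)
{
	uint64_t vsector = 300000;	/* arbitrary virtual sector */
	uint64_t stride = DEV_SECTORS / FAR_COPIES;
	/* first copy: the same chunk/stripe arithmetic as raid10_find_phys() */
	uint64_t chunk = (vsector >> CHUNK_SHIFT) * NEAR_COPIES;
	uint64_t offset = vsector & CHUNK_MASK;
	uint64_t stripe = chunk / RAID_DISKS;
	int dev = chunk % RAID_DISKS;
	uint64_t sector = (stripe << CHUNK_SHIFT) + offset;
	int n, f;

	for (n = 0; n < NEAR_COPIES; n++) {
		int d = dev;
		uint64_t s = sector;

		printf("copy on disk %d at sector %llu\n",
		       d, (unsigned long long)s);
		for (f = 1; f < FAR_COPIES; f++) {
			d = (d + NEAR_COPIES) % RAID_DISKS;
			s += stride;	/* same data again, further down the disks */
			printf("copy on disk %d at sector %llu\n",
			       d, (unsigned long long)s);
		}
		if (++dev >= RAID_DISKS) {
			dev = 0;
			sector += CHUNK_MASK + 1;
		}
	}
	return 0;
}

With near_copies=2 and far_copies=2 over 4 disks this prints four copies, each on a different disk, with the far pair offset by stride: exactly the "(near_copies*far_copies) of each chunk, each on a different drive" property the layout comment promises.
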
-+static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ conf_t *conf = mddev_to_conf(mddev); -+ unsigned long flags; -+ int i, ret = 0; -+ -+ spin_lock_irqsave(&conf->device_lock, flags); -+ for (i=0; i<mddev->raid_disks; i++) { -+ mdk_rdev_t *rdev = conf->mirrors[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue = bdev_get_queue(bdev); -+ -+ if (r_queue->issue_flush_fn) { -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ } -+ spin_unlock_irqrestore(&conf->device_lock, flags); -+ return ret; -+} -+ -+/* -+ * Throttle resync depth, so that we can both get proper overlapping of -+ * requests, but are still able to handle normal requests quickly. -+ */ -+#define RESYNC_DEPTH 32 -+ -+static void device_barrier(conf_t *conf, sector_t sect) -+{ -+ spin_lock_irq(&conf->resync_lock); -+ wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), -+ conf->resync_lock, unplug_slaves(conf->mddev)); -+ -+ if (!conf->barrier++) { -+ wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, -+ conf->resync_lock, unplug_slaves(conf->mddev)); -+ if (conf->nr_pending) -+ BUG(); -+ } -+ wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, -+ conf->resync_lock, unplug_slaves(conf->mddev)); -+ conf->next_resync = sect; -+ spin_unlock_irq(&conf->resync_lock); -+} -+ -+static int make_request(request_queue_t *q, struct bio * bio) -+{ -+ mddev_t *mddev = q->queuedata; -+ conf_t *conf = mddev_to_conf(mddev); -+ mirror_info_t *mirror; -+ r10bio_t *r10_bio; -+ struct bio *read_bio; -+ int i; -+ int chunk_sects = conf->chunk_mask + 1; -+ -+ /* If this request crosses a chunk boundary, we need to -+ * split it. This will only happen for 1 PAGE (or less) requests. -+ */ -+ if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) -+ > chunk_sects && -+ conf->near_copies < conf->raid_disks)) { -+ struct bio_pair *bp; -+ /* Sanity check -- queue functions should prevent this happening */ -+ if (bio->bi_vcnt != 1 || -+ bio->bi_idx != 0) -+ goto bad_map; -+ /* This is a one page bio that upper layers -+ * refuse to split for us, so we need to split it. -+ */ -+ bp = bio_split(bio, bio_split_pool, -+ chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); -+ if (make_request(q, &bp->bio1)) -+ generic_make_request(&bp->bio1); -+ if (make_request(q, &bp->bio2)) -+ generic_make_request(&bp->bio2); -+ -+ bio_pair_release(bp); -+ return 0; -+ bad_map: -+ printk("raid10_make_request bug: can't convert block across chunks" -+ " or bigger than %dk %llu %d\n", chunk_sects/2, -+ (unsigned long long)bio->bi_sector, bio->bi_size >> 10); -+ -+ bio_io_error(bio, bio->bi_size); -+ return 0; -+ } -+ -+ /* -+ * Register the new request and wait if the reconstruction -+ * thread has put up a bar for new requests. -+ * Continue immediately if no resync is active currently. 
-+ */ -+ spin_lock_irq(&conf->resync_lock); -+ wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); -+ conf->nr_pending++; -+ spin_unlock_irq(&conf->resync_lock); -+ -+ if (bio_data_dir(bio)==WRITE) { -+ disk_stat_inc(mddev->gendisk, writes); -+ disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); -+ } else { -+ disk_stat_inc(mddev->gendisk, reads); -+ disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); -+ } -+ -+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); -+ -+ r10_bio->master_bio = bio; -+ r10_bio->sectors = bio->bi_size >> 9; -+ -+ r10_bio->mddev = mddev; -+ r10_bio->sector = bio->bi_sector; -+ -+ if (bio_data_dir(bio) == READ) { -+ /* -+ * read balancing logic: -+ */ -+ int disk = read_balance(conf, r10_bio); -+ int slot = r10_bio->read_slot; -+ if (disk < 0) { -+ raid_end_bio_io(r10_bio); -+ return 0; -+ } -+ mirror = conf->mirrors + disk; -+ -+ read_bio = bio_clone(bio, GFP_NOIO); -+ -+ r10_bio->devs[slot].bio = read_bio; -+ -+ read_bio->bi_sector = r10_bio->devs[slot].addr + -+ mirror->rdev->data_offset; -+ read_bio->bi_bdev = mirror->rdev->bdev; -+ read_bio->bi_end_io = raid10_end_read_request; -+ read_bio->bi_rw = READ; -+ read_bio->bi_private = r10_bio; -+ -+ generic_make_request(read_bio); -+ return 0; -+ } -+ -+ /* -+ * WRITE: -+ */ -+ /* first select target devices under spinlock and -+ * inc refcount on their rdev. Record them by setting -+ * bios[x] to bio -+ */ -+ raid10_find_phys(conf, r10_bio); -+ spin_lock_irq(&conf->device_lock); -+ for (i = 0; i < conf->copies; i++) { -+ int d = r10_bio->devs[i].devnum; -+ if (conf->mirrors[d].rdev && -+ !conf->mirrors[d].rdev->faulty) { -+ atomic_inc(&conf->mirrors[d].rdev->nr_pending); -+ r10_bio->devs[i].bio = bio; -+ } else -+ r10_bio->devs[i].bio = NULL; -+ } -+ spin_unlock_irq(&conf->device_lock); -+ -+ atomic_set(&r10_bio->remaining, 1); -+ md_write_start(mddev); -+ for (i = 0; i < conf->copies; i++) { -+ struct bio *mbio; -+ int d = r10_bio->devs[i].devnum; -+ if (!r10_bio->devs[i].bio) -+ continue; -+ -+ mbio = bio_clone(bio, GFP_NOIO); -+ r10_bio->devs[i].bio = mbio; -+ -+ mbio->bi_sector = r10_bio->devs[i].addr+ -+ conf->mirrors[d].rdev->data_offset; -+ mbio->bi_bdev = conf->mirrors[d].rdev->bdev; -+ mbio->bi_end_io = raid10_end_write_request; -+ mbio->bi_rw = WRITE; -+ mbio->bi_private = r10_bio; -+ -+ atomic_inc(&r10_bio->remaining); -+ generic_make_request(mbio); -+ } -+ -+ if (atomic_dec_and_test(&r10_bio->remaining)) { -+ md_write_end(mddev); -+ raid_end_bio_io(r10_bio); -+ } -+ -+ return 0; -+} -+ -+static void status(struct seq_file *seq, mddev_t *mddev) -+{ -+ conf_t *conf = mddev_to_conf(mddev); -+ int i; -+ -+ if (conf->near_copies < conf->raid_disks) -+ seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); -+ if (conf->near_copies > 1) -+ seq_printf(seq, " %d near-copies", conf->near_copies); -+ if (conf->far_copies > 1) -+ seq_printf(seq, " %d far-copies", conf->far_copies); -+ -+ seq_printf(seq, " [%d/%d] [", conf->raid_disks, -+ conf->working_disks); -+ for (i = 0; i < conf->raid_disks; i++) -+ seq_printf(seq, "%s", -+ conf->mirrors[i].rdev && -+ conf->mirrors[i].rdev->in_sync ? "U" : "_"); -+ seq_printf(seq, "]"); -+} -+ -+static void error(mddev_t *mddev, mdk_rdev_t *rdev) -+{ -+ char b[BDEVNAME_SIZE]; -+ conf_t *conf = mddev_to_conf(mddev); -+ -+ /* -+ * If it is not operational, then we have already marked it as dead -+ * else if it is the last working disks, ignore the error, let the -+ * next level up know. 
-+ * else mark the drive as failed -+ */ -+ if (rdev->in_sync -+ && conf->working_disks == 1) -+ /* -+ * Don't fail the drive, just return an IO error. -+ * The test should really be more sophisticated than -+ * "working_disks == 1", but it isn't critical, and -+ * can wait until we do more sophisticated "is the drive -+ * really dead" tests... -+ */ -+ return; -+ if (rdev->in_sync) { -+ mddev->degraded++; -+ conf->working_disks--; -+ /* -+ * if recovery is running, make sure it aborts. -+ */ -+ set_bit(MD_RECOVERY_ERR, &mddev->recovery); -+ } -+ rdev->in_sync = 0; -+ rdev->faulty = 1; -+ mddev->sb_dirty = 1; -+ printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" -+ " Operation continuing on %d devices\n", -+ bdevname(rdev->bdev,b), conf->working_disks); -+} -+ -+static void print_conf(conf_t *conf) -+{ -+ int i; -+ mirror_info_t *tmp; -+ -+ printk("RAID10 conf printout:\n"); -+ if (!conf) { -+ printk("(!conf)\n"); -+ return; -+ } -+ printk(" --- wd:%d rd:%d\n", conf->working_disks, -+ conf->raid_disks); -+ -+ for (i = 0; i < conf->raid_disks; i++) { -+ char b[BDEVNAME_SIZE]; -+ tmp = conf->mirrors + i; -+ if (tmp->rdev) -+ printk(" disk %d, wo:%d, o:%d, dev:%s\n", -+ i, !tmp->rdev->in_sync, !tmp->rdev->faulty, -+ bdevname(tmp->rdev->bdev,b)); -+ } -+} -+ -+static void close_sync(conf_t *conf) -+{ -+ spin_lock_irq(&conf->resync_lock); -+ wait_event_lock_irq(conf->wait_resume, !conf->barrier, -+ conf->resync_lock, unplug_slaves(conf->mddev)); -+ spin_unlock_irq(&conf->resync_lock); -+ -+ if (conf->barrier) BUG(); -+ if (waitqueue_active(&conf->wait_idle)) BUG(); -+ -+ mempool_destroy(conf->r10buf_pool); -+ conf->r10buf_pool = NULL; -+} -+ -+static int raid10_spare_active(mddev_t *mddev) -+{ -+ int i; -+ conf_t *conf = mddev->private; -+ mirror_info_t *tmp; -+ -+ spin_lock_irq(&conf->device_lock); -+ /* -+ * Find all non-in_sync disks within the RAID10 configuration -+ * and mark them in_sync -+ */ -+ for (i = 0; i < conf->raid_disks; i++) { -+ tmp = conf->mirrors + i; -+ if (tmp->rdev -+ && !tmp->rdev->faulty -+ && !tmp->rdev->in_sync) { -+ conf->working_disks++; -+ mddev->degraded--; -+ tmp->rdev->in_sync = 1; -+ } -+ } -+ spin_unlock_irq(&conf->device_lock); -+ -+ print_conf(conf); -+ return 0; -+} -+ -+ -+static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) -+{ -+ conf_t *conf = mddev->private; -+ int found = 0; -+ int mirror; -+ mirror_info_t *p; -+ -+ if (mddev->recovery_cp < MaxSector) -+ /* only hot-add to in-sync arrays, as recovery is -+ * very different from resync -+ */ -+ return 0; -+ spin_lock_irq(&conf->device_lock); -+ for (mirror=0; mirror < mddev->raid_disks; mirror++) -+ if ( !(p=conf->mirrors+mirror)->rdev) { -+ p->rdev = rdev; -+ -+ blk_queue_stack_limits(mddev->queue, -+ rdev->bdev->bd_disk->queue); -+ /* as we don't honour merge_bvec_fn, we must never risk -+ * violating it, so limit ->max_sector to one PAGE, as -+ * a one page request is never in violation. 
-+			 */
-+			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-+			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-+				mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+
-+			p->head_position = 0;
-+			rdev->raid_disk = mirror;
-+			found = 1;
-+			break;
-+		}
-+	spin_unlock_irq(&conf->device_lock);
-+
-+	print_conf(conf);
-+	return found;
-+}
-+
-+static int raid10_remove_disk(mddev_t *mddev, int number)
-+{
-+	conf_t *conf = mddev->private;
-+	int err = 1;
-+	mirror_info_t *p = conf->mirrors + number;
-+
-+	print_conf(conf);
-+	spin_lock_irq(&conf->device_lock);
-+	if (p->rdev) {
-+		if (p->rdev->in_sync ||
-+		    atomic_read(&p->rdev->nr_pending)) {
-+			err = -EBUSY;
-+			goto abort;
-+		}
-+		p->rdev = NULL;
-+		err = 0;
-+	}
-+	if (err)
-+		MD_BUG();
-+abort:
-+	spin_unlock_irq(&conf->device_lock);
-+
-+	print_conf(conf);
-+	return err;
-+}
-+
-+
-+static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+	int i,d;
-+
-+	if (bio->bi_size)
-+		return 1;
-+
-+	for (i=0; i<conf->copies; i++)
-+		if (r10_bio->devs[i].bio == bio)
-+			break;
-+	if (i == conf->copies)
-+		BUG();
-+	update_head_pos(i, r10_bio);
-+	d = r10_bio->devs[i].devnum;
-+	if (!uptodate)
-+		md_error(r10_bio->mddev,
-+			 conf->mirrors[d].rdev);
-+
-+	/* for reconstruct, we always reschedule after a read.
-+	 * for resync, only after all reads
-+	 */
-+	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
-+	    atomic_dec_and_test(&r10_bio->remaining)) {
-+		/* we have read all the blocks,
-+		 * do the comparison in process context in raid10d
-+		 */
-+		reschedule_retry(r10_bio);
-+	}
-+	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
-+	return 0;
-+}
-+
-+static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+	mddev_t *mddev = r10_bio->mddev;
-+	conf_t *conf = mddev_to_conf(mddev);
-+	int i,d;
-+
-+	if (bio->bi_size)
-+		return 1;
-+
-+	for (i = 0; i < conf->copies; i++)
-+		if (r10_bio->devs[i].bio == bio)
-+			break;
-+	d = r10_bio->devs[i].devnum;
-+
-+	if (!uptodate)
-+		md_error(mddev, conf->mirrors[d].rdev);
-+	update_head_pos(i, r10_bio);
-+
-+	while (atomic_dec_and_test(&r10_bio->remaining)) {
-+		if (r10_bio->master_bio == NULL) {
-+			/* the primary of several recovery bios */
-+			md_done_sync(mddev, r10_bio->sectors, 1);
-+			put_buf(r10_bio);
-+			break;
-+		} else {
-+			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
-+			put_buf(r10_bio);
-+			r10_bio = r10_bio2;
-+		}
-+	}
-+	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
-+	return 0;
-+}
-+
-+/*
-+ * Note: sync and recovery are handled very differently for raid10.
-+ * This code is for resync.
-+ * For resync, we read through virtual addresses and read all blocks.
-+ * If there is any error, we schedule a write. The lowest numbered
-+ * drive is authoritative.
-+ * However requests come for physical addresses, so we need to map.
-+ * For every physical address there are raid_disks/copies virtual addresses,
-+ * which is always at least one, but is not necessarily an integer.
-+ * This means that a physical address can span multiple chunks, so we may
-+ * have to submit multiple io requests for a single sync request. 
-+ */
-+/*
-+ * We check if all blocks are in-sync and only write to blocks that
-+ * aren't in sync
-+ */
-+static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
-+{
-+	conf_t *conf = mddev_to_conf(mddev);
-+	int i, first;
-+	struct bio *tbio, *fbio;
-+
-+	atomic_set(&r10_bio->remaining, 1);
-+
-+	/* find the first device with a block */
-+	for (i=0; i<conf->copies; i++)
-+		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-+			break;
-+
-+	if (i == conf->copies)
-+		goto done;
-+
-+	first = i;
-+	fbio = r10_bio->devs[i].bio;
-+
-+	/* now find blocks with errors */
-+	for (i=first+1 ; i < conf->copies ; i++) {
-+		int vcnt, j, d;
-+
-+		if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-+			continue;
-+		/* We know that the bi_io_vec layout is the same for
-+		 * both 'first' and 'i', so we just compare them.
-+		 * All vec entries are PAGE_SIZE;
-+		 */
-+		tbio = r10_bio->devs[i].bio;
-+		vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
-+		for (j = 0; j < vcnt; j++)
-+			if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-+				   page_address(tbio->bi_io_vec[j].bv_page),
-+				   PAGE_SIZE))
-+				break;
-+		if (j == vcnt)
-+			continue;
-+		/* Ok, we need to write this bio
-+		 * First we need to fixup bv_offset, bv_len and
-+		 * bi_vecs, as the read request might have corrupted these
-+		 */
-+		tbio->bi_vcnt = vcnt;
-+		tbio->bi_size = r10_bio->sectors << 9;
-+		tbio->bi_idx = 0;
-+		tbio->bi_phys_segments = 0;
-+		tbio->bi_hw_segments = 0;
-+		tbio->bi_hw_front_size = 0;
-+		tbio->bi_hw_back_size = 0;
-+		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-+		tbio->bi_flags |= 1 << BIO_UPTODATE;
-+		tbio->bi_next = NULL;
-+		tbio->bi_rw = WRITE;
-+		tbio->bi_private = r10_bio;
-+		tbio->bi_sector = r10_bio->devs[i].addr;
-+
-+		for (j=0; j < vcnt ; j++) {
-+			tbio->bi_io_vec[j].bv_offset = 0;
-+			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
-+
-+			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
-+			       page_address(fbio->bi_io_vec[j].bv_page),
-+			       PAGE_SIZE);
-+		}
-+		tbio->bi_end_io = end_sync_write;
-+
-+		d = r10_bio->devs[i].devnum;
-+		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+		atomic_inc(&r10_bio->remaining);
-+		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
-+
-+		generic_make_request(tbio);
-+	}
-+
-+done:
-+	if (atomic_dec_and_test(&r10_bio->remaining)) {
-+		md_done_sync(mddev, r10_bio->sectors, 1);
-+		put_buf(r10_bio);
-+	}
-+}
-+
-+/*
-+ * Now for the recovery code.
-+ * Recovery happens across physical sectors.
-+ * We recover all non-in_sync drives by finding the virtual address of
-+ * each, and then choose a working drive that also has that virt address.
-+ * There is a separate r10_bio for each non-in_sync drive.
-+ * Only the first two slots are in use. The first for reading,
-+ * the second for writing.
-+ *
-+ */
-+
-+static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
-+{
-+	conf_t *conf = mddev_to_conf(mddev);
-+	int i, d;
-+	struct bio *bio, *wbio;
-+
-+
-+	/* move the pages across to the second bio
-+	 * and submit the write request
-+	 */
-+	bio = r10_bio->devs[0].bio;
-+	wbio = r10_bio->devs[1].bio;
-+	for (i=0; i < wbio->bi_vcnt; i++) {
-+		struct page *p = bio->bi_io_vec[i].bv_page;
-+		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
-+		wbio->bi_io_vec[i].bv_page = p;
-+	}
-+	d = r10_bio->devs[1].devnum;
-+
-+	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-+	generic_make_request(wbio);
-+}
-+
-+
-+/*
-+ * This is a kernel thread which:
-+ *
-+ *	1. Retries failed read operations on working mirrors.
-+ *	2. Updates the raid superblock when problems are encountered.
-+ *	3. Performs writes following reads for array synchronising.
-+ */
-+
-+static void raid10d(mddev_t *mddev)
-+{
-+	r10bio_t *r10_bio;
-+	struct bio *bio;
-+	unsigned long flags;
-+	conf_t *conf = mddev_to_conf(mddev);
-+	struct list_head *head = &conf->retry_list;
-+	int unplug=0;
-+	mdk_rdev_t *rdev;
-+
-+	md_check_recovery(mddev);
-+	md_handle_safemode(mddev);
-+
-+	for (;;) {
-+		char b[BDEVNAME_SIZE];
-+		spin_lock_irqsave(&conf->device_lock, flags);
-+		if (list_empty(head))
-+			break;
-+		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
-+		list_del(head->prev);
-+		spin_unlock_irqrestore(&conf->device_lock, flags);
-+
-+		mddev = r10_bio->mddev;
-+		conf = mddev_to_conf(mddev);
-+		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
-+			sync_request_write(mddev, r10_bio);
-+			unplug = 1;
-+		} else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
-+			recovery_request_write(mddev, r10_bio);
-+			unplug = 1;
-+		} else {
-+			int mirror;
-+			bio = r10_bio->devs[r10_bio->read_slot].bio;
-+			r10_bio->devs[r10_bio->read_slot].bio = NULL;
-+			bio_put(bio);
-+			mirror = read_balance(conf, r10_bio);
-+			if (mirror == -1) {
-+				printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
-+				       " read error for block %llu\n",
-+				       bdevname(bio->bi_bdev,b),
-+				       (unsigned long long)r10_bio->sector);
-+				raid_end_bio_io(r10_bio);
-+			} else {
-+				rdev = conf->mirrors[mirror].rdev;
-+				if (printk_ratelimit())
-+					printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
-+					       " another mirror\n",
-+					       bdevname(rdev->bdev,b),
-+					       (unsigned long long)r10_bio->sector);
-+				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
-+				r10_bio->devs[r10_bio->read_slot].bio = bio;
-+				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
-+					+ rdev->data_offset;
-+				bio->bi_bdev = rdev->bdev;
-+				bio->bi_rw = READ;
-+				bio->bi_private = r10_bio;
-+				bio->bi_end_io = raid10_end_read_request;
-+				unplug = 1;
-+				generic_make_request(bio);
-+			}
-+		}
-+	}
-+	spin_unlock_irqrestore(&conf->device_lock, flags);
-+	if (unplug)
-+		unplug_slaves(mddev);
-+}
-+
-+
-+static int init_resync(conf_t *conf)
-+{
-+	int buffs;
-+
-+	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
-+	if (conf->r10buf_pool)
-+		BUG();
-+	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
-+	if (!conf->r10buf_pool)
-+		return -ENOMEM;
-+	conf->next_resync = 0;
-+	return 0;
-+}
-+
-+/*
-+ * perform a "sync" on one "block"
-+ *
-+ * We need to make sure that no normal I/O request - particularly write
-+ * requests - conflict with active sync requests.
-+ *
-+ * This is achieved by tracking pending requests and a 'barrier' concept
-+ * that can be installed to exclude normal IO requests.
-+ *
-+ * Resync and recovery are handled very differently.
-+ * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
-+ *
-+ * For resync, we iterate over virtual addresses, read all copies,
-+ * and update if there are differences. If only one copy is live,
-+ * skip it.
-+ * For recovery, we iterate over physical addresses, read a good
-+ * value for each non-in_sync drive, and over-write.
-+ *
-+ * So, for recovery we may have several outstanding complex requests for a
-+ * given address, one for each out-of-sync device. We model this by allocating
-+ * a number of r10_bio structures, one for each out-of-sync device. 
-+ * As we setup these structures, we collect all bio's together into a list -+ * which we then process collectively to add pages, and then process again -+ * to pass to generic_make_request. -+ * -+ * The r10_bio structures are linked using a borrowed master_bio pointer. -+ * This link is counted in ->remaining. When the r10_bio that points to NULL -+ * has its remaining count decremented to 0, the whole complex operation -+ * is complete. -+ * -+ */ -+ -+static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) -+{ -+ conf_t *conf = mddev_to_conf(mddev); -+ r10bio_t *r10_bio; -+ struct bio *biolist = NULL, *bio; -+ sector_t max_sector, nr_sectors; -+ int disk; -+ int i; -+ -+ sector_t sectors_skipped = 0; -+ int chunks_skipped = 0; -+ -+ if (!conf->r10buf_pool) -+ if (init_resync(conf)) -+ return -ENOMEM; -+ -+ skipped: -+ max_sector = mddev->size << 1; -+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) -+ max_sector = mddev->resync_max_sectors; -+ if (sector_nr >= max_sector) { -+ close_sync(conf); -+ return sectors_skipped; -+ } -+ if (chunks_skipped >= conf->raid_disks) { -+ /* if there has been nothing to do on any drive, -+ * then there is nothing to do at all.. -+ */ -+ sector_t sec = max_sector - sector_nr; -+ md_done_sync(mddev, sec, 1); -+ return sec + sectors_skipped; -+ } -+ -+ /* make sure whole request will fit in a chunk - if chunks -+ * are meaningful -+ */ -+ if (conf->near_copies < conf->raid_disks && -+ max_sector > (sector_nr | conf->chunk_mask)) -+ max_sector = (sector_nr | conf->chunk_mask) + 1; -+ /* -+ * If there is non-resync activity waiting for us then -+ * put in a delay to throttle resync. -+ */ -+ if (!go_faster && waitqueue_active(&conf->wait_resume)) -+ schedule_timeout(HZ); -+ device_barrier(conf, sector_nr + RESYNC_SECTORS); -+ -+ /* Again, very different code for resync and recovery. -+ * Both must result in an r10bio with a list of bios that -+ * have bi_end_io, bi_sector, bi_bdev set, -+ * and bi_private set to the r10bio. -+ * For recovery, we may actually create several r10bios -+ * with 2 bios in each, that correspond to the bios in the main one. -+ * In this case, the subordinate r10bios link back through a -+ * borrowed master_bio pointer, and the counter in the master -+ * includes a ref from each subordinate. -+ */ -+ /* First, we decide what to do and set ->bi_end_io -+ * To end_sync_read if we want to read, and -+ * end_sync_write if we will want to write. -+ */ -+ -+ if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { -+ /* recovery... 
the complicated one */ -+ int i, j, k; -+ r10_bio = NULL; -+ -+ for (i=0 ; i<conf->raid_disks; i++) -+ if (conf->mirrors[i].rdev && -+ !conf->mirrors[i].rdev->in_sync) { -+ /* want to reconstruct this device */ -+ r10bio_t *rb2 = r10_bio; -+ -+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); -+ spin_lock_irq(&conf->resync_lock); -+ conf->nr_pending++; -+ if (rb2) conf->barrier++; -+ spin_unlock_irq(&conf->resync_lock); -+ atomic_set(&r10_bio->remaining, 0); -+ -+ r10_bio->master_bio = (struct bio*)rb2; -+ if (rb2) -+ atomic_inc(&rb2->remaining); -+ r10_bio->mddev = mddev; -+ set_bit(R10BIO_IsRecover, &r10_bio->state); -+ r10_bio->sector = raid10_find_virt(conf, sector_nr, i); -+ raid10_find_phys(conf, r10_bio); -+ for (j=0; j<conf->copies;j++) { -+ int d = r10_bio->devs[j].devnum; -+ if (conf->mirrors[d].rdev && -+ conf->mirrors[d].rdev->in_sync) { -+ /* This is where we read from */ -+ bio = r10_bio->devs[0].bio; -+ bio->bi_next = biolist; -+ biolist = bio; -+ bio->bi_private = r10_bio; -+ bio->bi_end_io = end_sync_read; -+ bio->bi_rw = 0; -+ bio->bi_sector = r10_bio->devs[j].addr + -+ conf->mirrors[d].rdev->data_offset; -+ bio->bi_bdev = conf->mirrors[d].rdev->bdev; -+ atomic_inc(&conf->mirrors[d].rdev->nr_pending); -+ atomic_inc(&r10_bio->remaining); -+ /* and we write to 'i' */ -+ -+ for (k=0; k<conf->copies; k++) -+ if (r10_bio->devs[k].devnum == i) -+ break; -+ bio = r10_bio->devs[1].bio; -+ bio->bi_next = biolist; -+ biolist = bio; -+ bio->bi_private = r10_bio; -+ bio->bi_end_io = end_sync_write; -+ bio->bi_rw = 1; -+ bio->bi_sector = r10_bio->devs[k].addr + -+ conf->mirrors[i].rdev->data_offset; -+ bio->bi_bdev = conf->mirrors[i].rdev->bdev; -+ -+ r10_bio->devs[0].devnum = d; -+ r10_bio->devs[1].devnum = i; -+ -+ break; -+ } -+ } -+ if (j == conf->copies) { -+ BUG(); -+ } -+ } -+ if (biolist == NULL) { -+ while (r10_bio) { -+ r10bio_t *rb2 = r10_bio; -+ r10_bio = (r10bio_t*) rb2->master_bio; -+ rb2->master_bio = NULL; -+ put_buf(rb2); -+ } -+ goto giveup; -+ } -+ } else { -+ /* resync. 
Schedule a read for every block at this virt offset */ -+ int count = 0; -+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); -+ -+ spin_lock_irq(&conf->resync_lock); -+ conf->nr_pending++; -+ spin_unlock_irq(&conf->resync_lock); -+ -+ r10_bio->mddev = mddev; -+ atomic_set(&r10_bio->remaining, 0); -+ -+ r10_bio->master_bio = NULL; -+ r10_bio->sector = sector_nr; -+ set_bit(R10BIO_IsSync, &r10_bio->state); -+ raid10_find_phys(conf, r10_bio); -+ r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; -+ spin_lock_irq(&conf->device_lock); -+ for (i=0; i<conf->copies; i++) { -+ int d = r10_bio->devs[i].devnum; -+ bio = r10_bio->devs[i].bio; -+ bio->bi_end_io = NULL; -+ if (conf->mirrors[d].rdev == NULL || -+ conf->mirrors[d].rdev->faulty) -+ continue; -+ atomic_inc(&conf->mirrors[d].rdev->nr_pending); -+ atomic_inc(&r10_bio->remaining); -+ bio->bi_next = biolist; -+ biolist = bio; -+ bio->bi_private = r10_bio; -+ bio->bi_end_io = end_sync_read; -+ bio->bi_rw = 0; -+ bio->bi_sector = r10_bio->devs[i].addr + -+ conf->mirrors[d].rdev->data_offset; -+ bio->bi_bdev = conf->mirrors[d].rdev->bdev; -+ count++; -+ } -+ spin_unlock_irq(&conf->device_lock); -+ if (count < 2) { -+ for (i=0; i<conf->copies; i++) { -+ int d = r10_bio->devs[i].devnum; -+ if (r10_bio->devs[i].bio->bi_end_io) -+ rdev_dec_pending(conf->mirrors[d].rdev, mddev); -+ } -+ put_buf(r10_bio); -+ biolist = NULL; -+ goto giveup; -+ } -+ } -+ -+ for (bio = biolist; bio ; bio=bio->bi_next) { -+ -+ bio->bi_flags &= ~(BIO_POOL_MASK - 1); -+ if (bio->bi_end_io) -+ bio->bi_flags |= 1 << BIO_UPTODATE; -+ bio->bi_vcnt = 0; -+ bio->bi_idx = 0; -+ bio->bi_phys_segments = 0; -+ bio->bi_hw_segments = 0; -+ bio->bi_size = 0; -+ } -+ -+ nr_sectors = 0; -+ do { -+ struct page *page; -+ int len = PAGE_SIZE; -+ disk = 0; -+ if (sector_nr + (len>>9) > max_sector) -+ len = (max_sector - sector_nr) << 9; -+ if (len == 0) -+ break; -+ for (bio= biolist ; bio ; bio=bio->bi_next) { -+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page; -+ if (bio_add_page(bio, page, len, 0) == 0) { -+ /* stop here */ -+ struct bio *bio2; -+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page; -+ for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { -+ /* remove last page from this bio */ -+ bio2->bi_vcnt--; -+ bio2->bi_size -= len; -+ bio2->bi_flags &= ~(1<< BIO_SEG_VALID); -+ } -+ goto bio_full; -+ } -+ disk = i; -+ } -+ nr_sectors += len>>9; -+ sector_nr += len>>9; -+ } while (biolist->bi_vcnt < RESYNC_PAGES); -+ bio_full: -+ r10_bio->sectors = nr_sectors; -+ -+ while (biolist) { -+ bio = biolist; -+ biolist = biolist->bi_next; -+ -+ bio->bi_next = NULL; -+ r10_bio = bio->bi_private; -+ r10_bio->sectors = nr_sectors; -+ -+ if (bio->bi_end_io == end_sync_read) { -+ md_sync_acct(bio->bi_bdev, nr_sectors); -+ generic_make_request(bio); -+ } -+ } -+ -+ return sectors_skipped + nr_sectors; -+ giveup: -+ /* There is nowhere to write, so all non-sync -+ * drives must be failed, so try the next chunk... -+ */ -+ { -+ int sec = max_sector - sector_nr; -+ sectors_skipped += sec; -+ chunks_skipped ++; -+ sector_nr = max_sector; -+ md_done_sync(mddev, sec, 1); -+ goto skipped; -+ } -+} -+ -+static int run(mddev_t *mddev) -+{ -+ conf_t *conf; -+ int i, disk_idx; -+ mirror_info_t *disk; -+ mdk_rdev_t *rdev; -+ struct list_head *tmp; -+ int nc, fc; -+ sector_t stride, size; -+ -+ if (mddev->level != 10) { -+ printk(KERN_ERR "raid10: %s: raid level not set correctly... 
(%d)\n", -+ mdname(mddev), mddev->level); -+ goto out; -+ } -+ nc = mddev->layout & 255; -+ fc = (mddev->layout >> 8) & 255; -+ if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || -+ (mddev->layout >> 16)) { -+ printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", -+ mdname(mddev), mddev->layout); -+ goto out; -+ } -+ /* -+ * copy the already verified devices into our private RAID10 -+ * bookkeeping area. [whatever we allocate in run(), -+ * should be freed in stop()] -+ */ -+ conf = kmalloc(sizeof(conf_t), GFP_KERNEL); -+ mddev->private = conf; -+ if (!conf) { -+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -+ mdname(mddev)); -+ goto out; -+ } -+ memset(conf, 0, sizeof(*conf)); -+ conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, -+ GFP_KERNEL); -+ if (!conf->mirrors) { -+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -+ mdname(mddev)); -+ goto out_free_conf; -+ } -+ memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); -+ -+ conf->near_copies = nc; -+ conf->far_copies = fc; -+ conf->copies = nc*fc; -+ conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; -+ conf->chunk_shift = ffz(~mddev->chunk_size) - 9; -+ stride = mddev->size >> (conf->chunk_shift-1); -+ sector_div(stride, fc); -+ conf->stride = stride << conf->chunk_shift; -+ -+ conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, -+ r10bio_pool_free, conf); -+ if (!conf->r10bio_pool) { -+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -+ mdname(mddev)); -+ goto out_free_conf; -+ } -+ mddev->queue->unplug_fn = raid10_unplug; -+ -+ mddev->queue->issue_flush_fn = raid10_issue_flush; -+ -+ ITERATE_RDEV(mddev, rdev, tmp) { -+ disk_idx = rdev->raid_disk; -+ if (disk_idx >= mddev->raid_disks -+ || disk_idx < 0) -+ continue; -+ disk = conf->mirrors + disk_idx; -+ -+ disk->rdev = rdev; -+ -+ blk_queue_stack_limits(mddev->queue, -+ rdev->bdev->bd_disk->queue); -+ /* as we don't honour merge_bvec_fn, we must never risk -+ * violating it, so limit ->max_sector to one PAGE, as -+ * a one page request is never in violation. 
-+ */ -+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && -+ mddev->queue->max_sectors > (PAGE_SIZE>>9)) -+ mddev->queue->max_sectors = (PAGE_SIZE>>9); -+ -+ disk->head_position = 0; -+ if (!rdev->faulty && rdev->in_sync) -+ conf->working_disks++; -+ } -+ conf->raid_disks = mddev->raid_disks; -+ conf->mddev = mddev; -+ conf->device_lock = SPIN_LOCK_UNLOCKED; -+ INIT_LIST_HEAD(&conf->retry_list); -+ -+ conf->resync_lock = SPIN_LOCK_UNLOCKED; -+ init_waitqueue_head(&conf->wait_idle); -+ init_waitqueue_head(&conf->wait_resume); -+ -+ if (!conf->working_disks) { -+ printk(KERN_ERR "raid10: no operational mirrors for %s\n", -+ mdname(mddev)); -+ goto out_free_conf; -+ } -+ -+ mddev->degraded = 0; -+ for (i = 0; i < conf->raid_disks; i++) { -+ -+ disk = conf->mirrors + i; -+ -+ if (!disk->rdev) { -+ disk->head_position = 0; -+ mddev->degraded++; -+ } -+ } -+ -+ -+ mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); -+ if (!mddev->thread) { -+ printk(KERN_ERR -+ "raid10: couldn't allocate thread for %s\n", -+ mdname(mddev)); -+ goto out_free_conf; -+ } -+ -+ printk(KERN_INFO -+ "raid10: raid set %s active with %d out of %d devices\n", -+ mdname(mddev), mddev->raid_disks - mddev->degraded, -+ mddev->raid_disks); -+ /* -+ * Ok, everything is just fine now -+ */ -+ size = conf->stride * conf->raid_disks; -+ sector_div(size, conf->near_copies); -+ mddev->array_size = size/2; -+ mddev->resync_max_sectors = size; -+ -+ /* Calculate max read-ahead size. -+ * We need to readahead at least twice a whole stripe.... -+ * maybe... -+ */ -+ { -+ int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; -+ stripe /= conf->near_copies; -+ if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) -+ mddev->queue->backing_dev_info.ra_pages = 2* stripe; -+ } -+ -+ if (conf->near_copies < mddev->raid_disks) -+ blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); -+ return 0; -+ -+out_free_conf: -+ if (conf->r10bio_pool) -+ mempool_destroy(conf->r10bio_pool); -+ if (conf->mirrors) -+ kfree(conf->mirrors); -+ kfree(conf); -+ mddev->private = NULL; -+out: -+ return -EIO; -+} -+ -+static int stop(mddev_t *mddev) -+{ -+ conf_t *conf = mddev_to_conf(mddev); -+ -+ md_unregister_thread(mddev->thread); -+ mddev->thread = NULL; -+ if (conf->r10bio_pool) -+ mempool_destroy(conf->r10bio_pool); -+ if (conf->mirrors) -+ kfree(conf->mirrors); -+ kfree(conf); -+ mddev->private = NULL; -+ return 0; -+} -+ -+ -+static mdk_personality_t raid10_personality = -+{ -+ .name = "raid10", -+ .owner = THIS_MODULE, -+ .make_request = make_request, -+ .run = run, -+ .stop = stop, -+ .status = status, -+ .error_handler = error, -+ .hot_add_disk = raid10_add_disk, -+ .hot_remove_disk= raid10_remove_disk, -+ .spare_active = raid10_spare_active, -+ .sync_request = sync_request, -+}; -+ -+static int __init raid_init(void) -+{ -+ return register_md_personality(RAID10, &raid10_personality); -+} -+ -+static void raid_exit(void) -+{ -+ unregister_md_personality(RAID10); -+} -+ -+module_init(raid_init); -+module_exit(raid_exit); -+MODULE_LICENSE("GPL"); -+MODULE_ALIAS("md-personality-9"); /* RAID10 */ -diff -pruN ./drivers/md.dm/raid1.c ./drivers/md/raid1.c ---- ./drivers/md.dm/raid1.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/raid1.c 2006-03-17 13:16:38.000000000 +0300 -@@ -24,10 +24,6 @@ - - #include <linux/raid/raid1.h> - --#define MAJOR_NR MD_MAJOR --#define MD_DRIVER --#define MD_PERSONALITY -- - /* - * Number of guaranteed r1bios in case of extreme VM load: - */ -@@ -44,13 +40,12 @@ static void * 
r1bio_pool_alloc(int gfp_f - { - struct pool_info *pi = data; - r1bio_t *r1_bio; -+ int size = offsetof(r1bio_t, bios[pi->raid_disks]); - - /* allocate a r1bio with room for raid_disks entries in the bios array */ -- r1_bio = kmalloc(sizeof(r1bio_t) + sizeof(struct bio*)*pi->raid_disks, -- gfp_flags); -+ r1_bio = kmalloc(size, gfp_flags); - if (r1_bio) -- memset(r1_bio, 0, sizeof(*r1_bio) + -- sizeof(struct bio*) * pi->raid_disks); -+ memset(r1_bio, 0, size); - else - unplug_slaves(pi->mddev); - -@@ -104,7 +99,7 @@ static void * r1buf_pool_alloc(int gfp_f - bio->bi_io_vec[i].bv_page = page; - } - -- r1_bio->master_bio = bio; -+ r1_bio->master_bio = NULL; - - return r1_bio; - -@@ -189,32 +184,6 @@ static inline void put_buf(r1bio_t *r1_b - spin_unlock_irqrestore(&conf->resync_lock, flags); - } - --static int map(mddev_t *mddev, mdk_rdev_t **rdevp) --{ -- conf_t *conf = mddev_to_conf(mddev); -- int i, disks = conf->raid_disks; -- -- /* -- * Later we do read balancing on the read side -- * now we use the first available disk. -- */ -- -- spin_lock_irq(&conf->device_lock); -- for (i = 0; i < disks; i++) { -- mdk_rdev_t *rdev = conf->mirrors[i].rdev; -- if (rdev && rdev->in_sync) { -- *rdevp = rdev; -- atomic_inc(&rdev->nr_pending); -- spin_unlock_irq(&conf->device_lock); -- return i; -- } -- } -- spin_unlock_irq(&conf->device_lock); -- -- printk(KERN_ERR "raid1_map(): huh, no more operational devices?\n"); -- return -1; --} -- - static void reschedule_retry(r1bio_t *r1_bio) - { - unsigned long flags; -@@ -292,8 +261,9 @@ static int raid1_end_read_request(struct - * oops, read error: - */ - char b[BDEVNAME_SIZE]; -- printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", -- bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); -+ if (printk_ratelimit()) -+ printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", -+ bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); - reschedule_retry(r1_bio); - } - -@@ -363,12 +333,13 @@ static int raid1_end_write_request(struc - * - * The rdev for the device selected will have nr_pending incremented. - */ --static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio) -+static int read_balance(conf_t *conf, r1bio_t *r1_bio) - { - const unsigned long this_sector = r1_bio->sector; - int new_disk = conf->last_used, disk = new_disk; -- const int sectors = bio->bi_size >> 9; -+ const int sectors = r1_bio->sectors; - sector_t new_distance, current_distance; -+ mdk_rdev_t *new_rdev, *rdev; - - spin_lock_irq(&conf->device_lock); - /* -@@ -376,16 +347,17 @@ static int read_balance(conf_t *conf, st - * device if no resync is going on, or below the resync window. - * We take the first readable disk when above the resync window. 
- */ -+ retry: - if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) { -- /* make sure that disk is operational */ -+ /* Choose the first operation device, for consistancy */ - new_disk = 0; - -- while (!conf->mirrors[new_disk].rdev || -- !conf->mirrors[new_disk].rdev->in_sync) { -+ while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || -+ !new_rdev->in_sync) { - new_disk++; - if (new_disk == conf->raid_disks) { -- new_disk = 0; -+ new_disk = -1; - break; - } - } -@@ -394,13 +366,13 @@ static int read_balance(conf_t *conf, st - - - /* make sure the disk is operational */ -- while (!conf->mirrors[new_disk].rdev || -- !conf->mirrors[new_disk].rdev->in_sync) { -+ while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || -+ !new_rdev->in_sync) { - if (new_disk <= 0) - new_disk = conf->raid_disks; - new_disk--; - if (new_disk == disk) { -- new_disk = conf->last_used; -+ new_disk = -1; - goto rb_out; - } - } -@@ -424,29 +396,38 @@ static int read_balance(conf_t *conf, st - disk = conf->raid_disks; - disk--; - -- if (!conf->mirrors[disk].rdev || -- !conf->mirrors[disk].rdev->in_sync) -+ if ((rdev=conf->mirrors[disk].rdev) == NULL || -+ !rdev->in_sync) - continue; - -- if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) { -+ if (!atomic_read(&rdev->nr_pending)) { - new_disk = disk; -+ new_rdev = rdev; - break; - } - new_distance = abs(this_sector - conf->mirrors[disk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - new_disk = disk; -+ new_rdev = rdev; - } - } while (disk != conf->last_used); - - rb_out: -- r1_bio->read_disk = new_disk; -- conf->next_seq_sect = this_sector + sectors; - -- conf->last_used = new_disk; - -- if (conf->mirrors[new_disk].rdev) -- atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending); -+ if (new_disk >= 0) { -+ conf->next_seq_sect = this_sector + sectors; -+ conf->last_used = new_disk; -+ atomic_inc(&new_rdev->nr_pending); -+ if (!new_rdev->in_sync) { -+ /* cannot risk returning a device that failed -+ * before we inc'ed nr_pending -+ */ -+ atomic_dec(&new_rdev->nr_pending); -+ goto retry; -+ } -+ } - spin_unlock_irq(&conf->device_lock); - - return new_disk; -@@ -471,7 +452,7 @@ static void unplug_slaves(mddev_t *mddev - r_queue->unplug_fn(r_queue); - - spin_lock_irqsave(&conf->device_lock, flags); -- atomic_dec(&rdev->nr_pending); -+ rdev_dec_pending(rdev, mddev); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -@@ -481,6 +462,32 @@ static void raid1_unplug(request_queue_t - unplug_slaves(q->queuedata); - } - -+static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ conf_t *conf = mddev_to_conf(mddev); -+ unsigned long flags; -+ int i, ret = 0; -+ -+ spin_lock_irqsave(&conf->device_lock, flags); -+ for (i=0; i<mddev->raid_disks; i++) { -+ mdk_rdev_t *rdev = conf->mirrors[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue = bdev_get_queue(bdev); -+ -+ if (r_queue->issue_flush_fn) { -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ } -+ spin_unlock_irqrestore(&conf->device_lock, flags); -+ return ret; -+} -+ - /* - * Throttle resync depth, so that we can both get proper overlapping of - * requests, but are still able to handle normal requests quickly. 
-@@ -513,6 +520,7 @@ static int make_request(request_queue_t - r1bio_t *r1_bio; - struct bio *read_bio; - int i, disks; -+ mdk_rdev_t *rdev; - - /* - * Register the new request and wait if the reconstruction -@@ -545,15 +553,26 @@ static int make_request(request_queue_t - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector; - -+ r1_bio->state = 0; -+ - if (bio_data_dir(bio) == READ) { - /* - * read balancing logic: - */ -- mirror = conf->mirrors + read_balance(conf, bio, r1_bio); -+ int rdisk = read_balance(conf, r1_bio); -+ -+ if (rdisk < 0) { -+ /* couldn't find anywhere to read from */ -+ raid_end_bio_io(r1_bio); -+ return 0; -+ } -+ mirror = conf->mirrors + rdisk; -+ -+ r1_bio->read_disk = rdisk; - - read_bio = bio_clone(bio, GFP_NOIO); - -- r1_bio->bios[r1_bio->read_disk] = read_bio; -+ r1_bio->bios[rdisk] = read_bio; - - read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; -@@ -575,10 +594,14 @@ static int make_request(request_queue_t - disks = conf->raid_disks; - spin_lock_irq(&conf->device_lock); - for (i = 0; i < disks; i++) { -- if (conf->mirrors[i].rdev && -- !conf->mirrors[i].rdev->faulty) { -- atomic_inc(&conf->mirrors[i].rdev->nr_pending); -- r1_bio->bios[i] = bio; -+ if ((rdev=conf->mirrors[i].rdev) != NULL && -+ !rdev->faulty) { -+ atomic_inc(&rdev->nr_pending); -+ if (rdev->faulty) { -+ atomic_dec(&rdev->nr_pending); -+ r1_bio->bios[i] = NULL; -+ } else -+ r1_bio->bios[i] = bio; - } else - r1_bio->bios[i] = NULL; - } -@@ -746,7 +769,7 @@ static int raid1_add_disk(mddev_t *mddev - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) -- mddev->queue->max_sectors = (PAGE_SIZE>>9); -+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - - p->head_position = 0; - rdev->raid_disk = mirror; -@@ -877,7 +900,7 @@ static void sync_request_write(mddev_t * - - atomic_inc(&conf->mirrors[i].rdev->nr_pending); - atomic_inc(&r1_bio->remaining); -- md_sync_acct(conf->mirrors[i].rdev, wbio->bi_size >> 9); -+ md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); - generic_make_request(wbio); - } - -@@ -925,7 +948,7 @@ static void raid1d(mddev_t *mddev) - } else { - int disk; - bio = r1_bio->bios[r1_bio->read_disk]; -- if ((disk=map(mddev, &rdev)) == -1) { -+ if ((disk=read_balance(conf, r1_bio)) == -1) { - printk(KERN_ALERT "raid1: %s: unrecoverable I/O" - " read error for block %llu\n", - bdevname(bio->bi_bdev,b), -@@ -934,14 +957,20 @@ static void raid1d(mddev_t *mddev) - } else { - r1_bio->bios[r1_bio->read_disk] = NULL; - r1_bio->read_disk = disk; -+ bio_put(bio); -+ bio = bio_clone(r1_bio->master_bio, GFP_NOIO); - r1_bio->bios[r1_bio->read_disk] = bio; -- printk(KERN_ERR "raid1: %s: redirecting sector %llu to" -- " another mirror\n", -- bdevname(rdev->bdev,b), -- (unsigned long long)r1_bio->sector); -- bio->bi_bdev = rdev->bdev; -+ rdev = conf->mirrors[disk].rdev; -+ if (printk_ratelimit()) -+ printk(KERN_ERR "raid1: %s: redirecting sector %llu to" -+ " another mirror\n", -+ bdevname(rdev->bdev,b), -+ (unsigned long long)r1_bio->sector); - bio->bi_sector = r1_bio->sector + rdev->data_offset; -+ bio->bi_bdev = rdev->bdev; -+ bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ; -+ bio->bi_private = r1_bio; - unplug = 1; - generic_make_request(bio); - } -@@ -1078,7 +1107,7 @@ static int sync_request(mddev_t *mddev, - int rv = max_sector - sector_nr; - md_done_sync(mddev, rv, 1); - put_buf(r1_bio); -- atomic_dec(&conf->mirrors[disk].rdev->nr_pending); -+ 
rdev_dec_pending(conf->mirrors[disk].rdev, mddev); - return rv; - } - -@@ -1117,7 +1146,7 @@ static int sync_request(mddev_t *mddev, - bio = r1_bio->bios[disk]; - r1_bio->sectors = nr_sectors; - -- md_sync_acct(mirror->rdev, nr_sectors); -+ md_sync_acct(mirror->rdev->bdev, nr_sectors); - - generic_make_request(bio); - -@@ -1168,6 +1197,7 @@ static int run(mddev_t *mddev) - - mddev->queue->unplug_fn = raid1_unplug; - -+ mddev->queue->issue_flush_fn = raid1_issue_flush; - - ITERATE_RDEV(mddev, rdev, tmp) { - disk_idx = rdev->raid_disk; -@@ -1186,7 +1216,7 @@ static int run(mddev_t *mddev) - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) -- mddev->queue->max_sectors = (PAGE_SIZE>>9); -+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - - disk->head_position = 0; - if (!rdev->faulty && rdev->in_sync) -@@ -1328,7 +1358,7 @@ static int raid1_reshape(mddev_t *mddev, - if (conf->mirrors[d].rdev) - return -EBUSY; - -- newpoolinfo = kmalloc(sizeof(newpoolinfo), GFP_KERNEL); -+ newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); - if (!newpoolinfo) - return -ENOMEM; - newpoolinfo->mddev = mddev; -diff -pruN ./drivers/md.dm/raid5.c ./drivers/md/raid5.c ---- ./drivers/md.dm/raid5.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/raid5.c 2006-03-17 13:16:38.000000000 +0300 -@@ -457,6 +457,7 @@ static void raid5_build_block (struct st - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; -+ dev->req.bi_max_vecs++; - dev->vec.bv_page = dev->page; - dev->vec.bv_len = STRIPE_SIZE; - dev->vec.bv_offset = 0; -@@ -477,8 +478,8 @@ static void error(mddev_t *mddev, mdk_rd - - if (!rdev->faulty) { - mddev->sb_dirty = 1; -- conf->working_disks--; - if (rdev->in_sync) { -+ conf->working_disks--; - mddev->degraded++; - conf->failed_disks++; - rdev->in_sync = 0; -@@ -1071,7 +1072,8 @@ static void handle_stripe(struct stripe_ - PRINTK("Reading block %d (sync=%d)\n", - i, syncing); - if (syncing) -- md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS); -+ md_sync_acct(conf->disks[i].rdev->bdev, -+ STRIPE_SECTORS); - } - } - } -@@ -1256,7 +1258,7 @@ static void handle_stripe(struct stripe_ - - if (rdev) { - if (test_bit(R5_Syncio, &sh->dev[i].flags)) -- md_sync_acct(rdev, STRIPE_SECTORS); -+ md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", -@@ -1265,6 +1267,7 @@ static void handle_stripe(struct stripe_ - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; -+ bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; -@@ -1316,7 +1319,7 @@ static void unplug_slaves(mddev_t *mddev - r_queue->unplug_fn(r_queue); - - spin_lock_irqsave(&conf->device_lock, flags); -- atomic_dec(&rdev->nr_pending); -+ rdev_dec_pending(rdev, mddev); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -@@ -1328,6 +1331,8 @@ static void raid5_unplug_device(request_ - raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned long flags; - -+ if (!conf) return; -+ - spin_lock_irqsave(&conf->device_lock, flags); - - if (blk_remove_plug(q)) -@@ -1339,6 +1344,39 @@ static void raid5_unplug_device(request_ - unplug_slaves(mddev); - } - -+static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ raid5_conf_t *conf = mddev_to_conf(mddev); -+ int i, ret = 0; -+ -+ for (i=0; i<mddev->raid_disks; i++) { -+ 
mdk_rdev_t *rdev = conf->disks[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue; -+ -+ if (!bdev) -+ continue; -+ -+ r_queue = bdev_get_queue(bdev); -+ if (!r_queue) -+ continue; -+ -+ if (!r_queue->issue_flush_fn) { -+ ret = -EOPNOTSUPP; -+ break; -+ } -+ -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ return ret; -+} -+ - static inline void raid5_plug_device(raid5_conf_t *conf) - { - spin_lock_irq(&conf->device_lock); -@@ -1545,6 +1583,7 @@ static int run (mddev_t *mddev) - atomic_set(&conf->preread_active_stripes, 0); - - mddev->queue->unplug_fn = raid5_unplug_device; -+ mddev->queue->issue_flush_fn = raid5_issue_flush; - - PRINTK("raid5: run(%s) called.\n", mdname(mddev)); - -diff -pruN ./drivers/md.dm/raid6main.c ./drivers/md/raid6main.c ---- ./drivers/md.dm/raid6main.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/raid6main.c 2006-03-17 13:16:38.000000000 +0300 -@@ -478,6 +478,7 @@ static void raid6_build_block (struct st - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; -+ dev->req.bi_max_vecs++; - dev->vec.bv_page = dev->page; - dev->vec.bv_len = STRIPE_SIZE; - dev->vec.bv_offset = 0; -@@ -498,8 +499,8 @@ static void error(mddev_t *mddev, mdk_rd - - if (!rdev->faulty) { - mddev->sb_dirty = 1; -- conf->working_disks--; - if (rdev->in_sync) { -+ conf->working_disks--; - mddev->degraded++; - conf->failed_disks++; - rdev->in_sync = 0; -@@ -1208,7 +1209,8 @@ static void handle_stripe(struct stripe_ - PRINTK("Reading block %d (sync=%d)\n", - i, syncing); - if (syncing) -- md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS); -+ md_sync_acct(conf->disks[i].rdev->bdev, -+ STRIPE_SECTORS); - } - } - } -@@ -1418,7 +1420,7 @@ static void handle_stripe(struct stripe_ - - if (rdev) { - if (test_bit(R5_Syncio, &sh->dev[i].flags)) -- md_sync_acct(rdev, STRIPE_SECTORS); -+ md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", -@@ -1427,6 +1429,7 @@ static void handle_stripe(struct stripe_ - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; -+ bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; -@@ -1478,7 +1481,7 @@ static void unplug_slaves(mddev_t *mddev - r_queue->unplug_fn(r_queue); - - spin_lock_irqsave(&conf->device_lock, flags); -- atomic_dec(&rdev->nr_pending); -+ rdev_dec_pending(rdev, mddev); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -@@ -1501,6 +1504,39 @@ static void raid6_unplug_device(request_ - unplug_slaves(mddev); - } - -+static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ raid6_conf_t *conf = mddev_to_conf(mddev); -+ int i, ret = 0; -+ -+ for (i=0; i<mddev->raid_disks; i++) { -+ mdk_rdev_t *rdev = conf->disks[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue; -+ -+ if (!bdev) -+ continue; -+ -+ r_queue = bdev_get_queue(bdev); -+ if (!r_queue) -+ continue; -+ -+ if (!r_queue->issue_flush_fn) { -+ ret = -EOPNOTSUPP; -+ break; -+ } -+ -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ return ret; -+} -+ - static inline void raid6_plug_device(raid6_conf_t *conf) - { - spin_lock_irq(&conf->device_lock); -@@ -1708,6 +1744,7 @@ 
static int run (mddev_t *mddev) - atomic_set(&conf->preread_active_stripes, 0); - - mddev->queue->unplug_fn = raid6_unplug_device; -+ mddev->queue->issue_flush_fn = raid6_issue_flush; - - PRINTK("raid6: run(%s) called.\n", mdname(mddev)); - ---- ./include/linux/compat_ioctl.h.dm 2006-03-17 08:58:47.000000000 +0300 -+++ ./include/linux/compat_ioctl.h 2006-03-17 08:16:12.000000000 +0300 -@@ -102,6 +102,7 @@ COMPATIBLE_IOCTL(BLKROGET) - COMPATIBLE_IOCTL(BLKRRPART) - COMPATIBLE_IOCTL(BLKFLSBUF) - COMPATIBLE_IOCTL(BLKSECTSET) -+COMPATIBLE_IOCTL(BLKSECTGET) - COMPATIBLE_IOCTL(BLKSSZGET) - ULONG_IOCTL(BLKRASET) - ULONG_IOCTL(BLKFRASET) -@@ -141,6 +142,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR_32) - COMPATIBLE_IOCTL(DM_TABLE_DEPS_32) - COMPATIBLE_IOCTL(DM_TABLE_STATUS_32) - COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32) -+COMPATIBLE_IOCTL(DM_TARGET_MSG_32) - COMPATIBLE_IOCTL(DM_VERSION) - COMPATIBLE_IOCTL(DM_REMOVE_ALL) - COMPATIBLE_IOCTL(DM_LIST_DEVICES) -@@ -155,6 +157,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR) - COMPATIBLE_IOCTL(DM_TABLE_DEPS) - COMPATIBLE_IOCTL(DM_TABLE_STATUS) - COMPATIBLE_IOCTL(DM_LIST_VERSIONS) -+COMPATIBLE_IOCTL(DM_TARGET_MSG) - /* Big K */ - COMPATIBLE_IOCTL(PIO_FONT) - COMPATIBLE_IOCTL(GIO_FONT) -@@ -387,6 +390,7 @@ COMPATIBLE_IOCTL(DVD_WRITE_STRUCT) - COMPATIBLE_IOCTL(DVD_AUTH) - /* Big L */ - ULONG_IOCTL(LOOP_SET_FD) -+ULONG_IOCTL(LOOP_CHANGE_FD) - COMPATIBLE_IOCTL(LOOP_CLR_FD) - COMPATIBLE_IOCTL(LOOP_GET_STATUS64) - COMPATIBLE_IOCTL(LOOP_SET_STATUS64) -@@ -595,13 +599,15 @@ COMPATIBLE_IOCTL(ATMTCP_CREATE) - COMPATIBLE_IOCTL(ATMTCP_REMOVE) - COMPATIBLE_IOCTL(ATMMPC_CTRL) - COMPATIBLE_IOCTL(ATMMPC_DATA) --/* Big W */ --/* WIOC_GETSUPPORT not yet implemented -E */ -+/* Watchdog */ -+COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) - COMPATIBLE_IOCTL(WDIOC_GETSTATUS) - COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) - COMPATIBLE_IOCTL(WDIOC_GETTEMP) - COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) - COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) -+COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) -+COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) - /* Big R */ - COMPATIBLE_IOCTL(RNDGETENTCNT) - COMPATIBLE_IOCTL(RNDADDTOENTCNT) -@@ -735,3 +741,20 @@ COMPATIBLE_IOCTL(SIOCSIWRETRY) - COMPATIBLE_IOCTL(SIOCGIWRETRY) - COMPATIBLE_IOCTL(SIOCSIWPOWER) - COMPATIBLE_IOCTL(SIOCGIWPOWER) -+/* hiddev */ -+COMPATIBLE_IOCTL(HIDIOCGVERSION) -+COMPATIBLE_IOCTL(HIDIOCAPPLICATION) -+COMPATIBLE_IOCTL(HIDIOCGDEVINFO) -+COMPATIBLE_IOCTL(HIDIOCGSTRING) -+COMPATIBLE_IOCTL(HIDIOCINITREPORT) -+COMPATIBLE_IOCTL(HIDIOCGREPORT) -+COMPATIBLE_IOCTL(HIDIOCSREPORT) -+COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) -+COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) -+COMPATIBLE_IOCTL(HIDIOCGUSAGE) -+COMPATIBLE_IOCTL(HIDIOCSUSAGE) -+COMPATIBLE_IOCTL(HIDIOCGUCODE) -+COMPATIBLE_IOCTL(HIDIOCGFLAG) -+COMPATIBLE_IOCTL(HIDIOCSFLAG) -+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) -+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) ---- ./include/linux/device-mapper.h.dm 2006-03-17 08:58:56.000000000 +0300 -+++ ./include/linux/device-mapper.h 2006-03-17 08:16:12.000000000 +0300 -@@ -51,12 +51,15 @@ typedef int (*dm_endio_fn) (struct dm_ta - struct bio *bio, int error, - union map_info *map_context); - --typedef void (*dm_suspend_fn) (struct dm_target *ti); -+typedef void (*dm_presuspend_fn) (struct dm_target *ti); -+typedef void (*dm_postsuspend_fn) (struct dm_target *ti); - typedef void (*dm_resume_fn) (struct dm_target *ti); - - typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type, - char *result, unsigned int maxlen); - -+typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv); -+ - void 
dm_error(const char *message); - - /* -@@ -79,9 +82,11 @@ struct target_type { - dm_dtr_fn dtr; - dm_map_fn map; - dm_endio_fn end_io; -- dm_suspend_fn suspend; -+ dm_presuspend_fn presuspend; -+ dm_postsuspend_fn postsuspend; - dm_resume_fn resume; - dm_status_fn status; -+ dm_message_fn message; - }; - - struct io_restrictions { -@@ -102,6 +107,7 @@ struct dm_target { - sector_t len; - - /* FIXME: turn this into a mask, and merge with io_restrictions */ -+ /* Always a power of 2 */ - sector_t split_io; - - /* ---- ./include/linux/dm-ioctl.h.dm 2006-03-17 08:59:07.000000000 +0300 -+++ ./include/linux/dm-ioctl.h 2006-03-17 08:16:12.000000000 +0300 -@@ -1,5 +1,6 @@ - /* - * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. -+ * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. - * - * This file is released under the LGPL. - */ -@@ -76,6 +77,9 @@ - * - * DM_TABLE_STATUS: - * Return the targets status for the 'active' table. -+ * -+ * DM_TARGET_MSG: -+ * Pass a message string to the target at a specific offset of a device. - */ - - /* -@@ -179,6 +183,15 @@ struct dm_target_versions { - }; - - /* -+ * Used to pass message to a target -+ */ -+struct dm_target_msg { -+ uint64_t sector; /* Device sector */ -+ -+ char message[0]; -+}; -+ -+/* - * If you change this make sure you make the corresponding change - * to dm-ioctl.c:lookup_ioctl() - */ -@@ -204,6 +217,7 @@ enum { - - /* Added later */ - DM_LIST_VERSIONS_CMD, -+ DM_TARGET_MSG_CMD, - }; - - /* -@@ -232,6 +246,7 @@ typedef char ioctl_struct[308]; - #define DM_TABLE_DEPS_32 _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, ioctl_struct) - #define DM_TABLE_STATUS_32 _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct) - #define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct) -+#define DM_TARGET_MSG_32 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct) - #endif - - #define DM_IOCTL 0xfd -@@ -254,10 +269,12 @@ typedef char ioctl_struct[308]; - - #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) - -+#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) -+ - #define DM_VERSION_MAJOR 4 --#define DM_VERSION_MINOR 1 -+#define DM_VERSION_MINOR 5 - #define DM_VERSION_PATCHLEVEL 0 --#define DM_VERSION_EXTRA "-ioctl (2003-12-10)" -+#define DM_VERSION_EXTRA "-ioctl (2005-10-04)" - - /* Status bits */ - #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -@@ -283,4 +300,14 @@ typedef char ioctl_struct[308]; - */ - #define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ - -+/* -+ * Set this to improve performance when you aren't going to use open_count. -+ */ -+#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */ -+ -+/* -+ * Set this to avoid attempting to freeze any filesystem when suspending. 
-+ */ -+#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */ -+ - #endif /* _LINUX_DM_IOCTL_H */ ---- ./include/linux/genhd.h.dm 2006-03-20 08:42:40.000000000 +0300 -+++ ./include/linux/genhd.h 2006-03-17 13:44:40.000000000 +0300 -@@ -100,7 +100,7 @@ struct gendisk { - struct timer_rand_state *random; - int policy; - -- unsigned sync_io; /* RAID */ -+ atomic_t sync_io; /* RAID */ - unsigned long stamp, stamp_idle; - int in_flight; - #ifdef CONFIG_SMP -diff -pruN ./include/linux/raid.dm/linear.h ./include/linux/raid/linear.h ---- ./include/linux/raid.dm/linear.h 2006-03-17 13:26:03.000000000 +0300 -+++ ./include/linux/raid/linear.h 2006-03-17 13:26:59.000000000 +0300 -@@ -5,8 +5,8 @@ - - struct dev_info { - mdk_rdev_t *rdev; -- unsigned long size; -- unsigned long offset; -+ sector_t size; -+ sector_t offset; - }; - - typedef struct dev_info dev_info_t; -diff -pruN ./include/linux/raid.dm/md.h ./include/linux/raid/md.h ---- ./include/linux/raid.dm/md.h 2006-03-17 13:26:03.000000000 +0300 -+++ ./include/linux/raid/md.h 2006-03-17 13:26:59.000000000 +0300 -@@ -69,12 +69,10 @@ extern mdk_thread_t * md_register_thread - extern void md_unregister_thread (mdk_thread_t *thread); - extern void md_wakeup_thread(mdk_thread_t *thread); - extern void md_check_recovery(mddev_t *mddev); --extern void md_interrupt_thread (mdk_thread_t *thread); - extern void md_write_start(mddev_t *mddev); - extern void md_write_end(mddev_t *mddev); - extern void md_handle_safemode(mddev_t *mddev); - extern void md_done_sync(mddev_t *mddev, int blocks, int ok); --extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors); - extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev); - extern void md_unplug_mddev(mddev_t *mddev); - -diff -pruN ./include/linux/raid.dm/md_k.h ./include/linux/raid/md_k.h ---- ./include/linux/raid.dm/md_k.h 2006-03-17 13:26:03.000000000 +0300 -+++ ./include/linux/raid/md_k.h 2006-03-17 13:26:59.000000000 +0300 -@@ -24,7 +24,8 @@ - #define HSM 6UL - #define MULTIPATH 7UL - #define RAID6 8UL --#define MAX_PERSONALITY 9UL -+#define RAID10 9UL -+#define MAX_PERSONALITY 10UL - - #define LEVEL_MULTIPATH (-4) - #define LEVEL_LINEAR (-1) -@@ -43,6 +44,7 @@ static inline int pers_to_level (int per - case RAID1: return 1; - case RAID5: return 5; - case RAID6: return 6; -+ case RAID10: return 10; - } - BUG(); - return MD_RESERVED; -@@ -60,6 +62,7 @@ static inline int level_to_pers (int lev - case 4: - case 5: return RAID5; - case 6: return RAID6; -+ case 10: return RAID10; - } - return MD_RESERVED; - } -@@ -216,6 +219,7 @@ struct mddev_s - unsigned long resync_mark; /* a recent timestamp */ - sector_t resync_mark_cnt;/* blocks written at resync_mark */ - -+ sector_t resync_max_sectors; /* may be set by personality */ - /* recovery/resync flags - * NEEDED: we might need to start a resync/recover - * RUNNING: a thread is running, or about to be started -@@ -263,6 +267,11 @@ static inline void rdev_dec_pending(mdk_ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - } - -+static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -+{ -+ atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); -+} -+ - struct mdk_personality_s - { - char *name; -diff -pruN ./include/linux/raid.dm/raid10.h ./include/linux/raid/raid10.h ---- ./include/linux/raid.dm/raid10.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./include/linux/raid/raid10.h 2006-03-17 13:26:59.000000000 +0300 -@@ -0,0 +1,103 @@ -+#ifndef _RAID10_H -+#define _RAID10_H -+ -+#include <linux/raid/md.h> -+ -+typedef struct 
mirror_info mirror_info_t;
-+
-+struct mirror_info {
-+	mdk_rdev_t	*rdev;
-+	sector_t	head_position;
-+};
-+
-+typedef struct r10bio_s r10bio_t;
-+
-+struct r10_private_data_s {
-+	mddev_t			*mddev;
-+	mirror_info_t		*mirrors;
-+	int			raid_disks;
-+	int			working_disks;
-+	spinlock_t		device_lock;
-+
-+	/* geometry */
-+	int			near_copies;  /* number of copies layed out raid0 style */
-+	int			far_copies;   /* number of copies layed out
-+					       * at large strides across drives
-+					       */
-+	int			copies;	      /* near_copies * far_copies.
-+					       * must be <= raid_disks
-+					       */
-+	sector_t		stride;	      /* distance between far copies.
-+					       * This is size / far_copies
-+					       */
-+
-+	int			chunk_shift;  /* shift from chunks to sectors */
-+	sector_t		chunk_mask;
-+
-+	struct list_head	retry_list;
-+	/* for use when syncing mirrors: */
-+
-+	spinlock_t		resync_lock;
-+	int			nr_pending;
-+	int			barrier;
-+	sector_t		next_resync;
-+
-+	wait_queue_head_t	wait_idle;
-+	wait_queue_head_t	wait_resume;
-+
-+	mempool_t		*r10bio_pool;
-+	mempool_t		*r10buf_pool;
-+};
-+
-+typedef struct r10_private_data_s conf_t;
-+
-+/*
-+ * this is the only point in the RAID code where we violate
-+ * C type safety. mddev->private is an 'opaque' pointer.
-+ */
-+#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
-+
-+/*
-+ * this is our 'private' RAID10 bio.
-+ *
-+ * it contains information about what kind of IO operations were started
-+ * for this RAID10 operation, and about their status:
-+ */
-+
-+struct r10bio_s {
-+	atomic_t		remaining; /* 'have we finished' count,
-+					    * used from IRQ handlers
-+					    */
-+	sector_t		sector;	/* virtual sector number */
-+	int			sectors;
-+	unsigned long		state;
-+	mddev_t			*mddev;
-+	/*
-+	 * original bio going to /dev/mdx
-+	 */
-+	struct bio		*master_bio;
-+	/*
-+	 * if the IO is in READ direction, then this is where we read
-+	 */
-+	int			read_slot;
-+
-+	struct list_head	retry_list;
-+	/*
-+	 * if the IO is in WRITE direction, then multiple bios are used,
-+	 * one for each copy.
-+	 * When resyncing we also use one for each copy.
-+	 * When reconstructing, we use 2 bios, one for read, one for write.
-+	 * We choose the number when they are allocated.
-+	 */
-+	struct {
-+		struct bio	*bio;
-+		sector_t	addr;
-+		int		devnum;
-+	} devs[0];
-+};
-+
-+/* bits for r10bio.state */
-+#define R10BIO_Uptodate	0
-+#define R10BIO_IsSync	1
-+#define R10BIO_IsRecover	2
-+#endif
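
The raid10 run() above derives the array geometry from mddev->layout (near copies in the low byte, far copies in the next byte) and from the chunk size: chunk_mask is the chunk size in sectors minus one, and chunk_shift is its log2. A minimal user-space sketch of the near-copy mapping, loosely modelled on what raid10_find_phys() computes in this patch; it assumes far_copies == 1 and raid_disks divisible by near_copies, and every name in it is hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Simplified raid10 geometry, near copies only (far_copies == 1).
 * chunk_mask/chunk_shift are derived as in run() above:
 * chunk_mask = chunk sectors - 1, chunk_shift = log2(chunk sectors). */
struct geom {
	int raid_disks;		/* assumed a multiple of near_copies here */
	int near_copies;
	int chunk_shift;
	uint64_t chunk_mask;
};

/* Map virtual sector vs to its n-th copy (0 <= n < near_copies). */
static void map_near(const struct geom *g, uint64_t vs, int n,
		     int *dev, uint64_t *dsector)
{
	uint64_t chunk = vs >> g->chunk_shift;	/* virtual chunk number */
	uint64_t off = vs & g->chunk_mask;	/* offset inside the chunk */
	uint64_t slot = chunk * g->near_copies + n; /* copies sit on adjacent disks */

	*dev = (int)(slot % g->raid_disks);
	*dsector = ((slot / g->raid_disks) << g->chunk_shift) + off;
}

int main(void)
{
	struct geom g = { 4, 2, 7, 127 };	/* 4 disks, 2 copies, 64K chunks */

	for (uint64_t vs = 0; vs < 512; vs += 128)
		for (int n = 0; n < g.near_copies; n++) {
			int dev;
			uint64_t ds;

			map_near(&g, vs, n, &dev, &ds);
			printf("vsector %3llu copy %d -> disk %d, sector %llu\n",
			       (unsigned long long)vs, n, dev,
			       (unsigned long long)ds);
		}
	return 0;
}

With raid_disks = 4 and near_copies = 2 this prints the familiar RAID1+0 picture: each chunk and its mirror occupy the same sector range on a pair of adjacent disks, and successive chunks rotate across the pairs.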
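
The resync path in sync_request() above grows every bio in biolist by one page per pass, and as soon as bio_add_page() refuses a page for any of them, it walks the earlier bios and backs the page out again so that all of them end up describing exactly the same sector range. A sketch of that fill-in-lockstep, roll-back-on-failure pattern, with the bios reduced to plain counters (all names hypothetical):

#include <stdio.h>

#define NBUF 3

struct buf { int pages, cap; };

/* Mimics bio_add_page(): returns 0 when the buffer refuses the page. */
static int add_page(struct buf *b)
{
	if (b->pages >= b->cap)
		return 0;
	b->pages++;
	return 1;
}

int main(void)
{
	/* the middle buffer is deliberately smaller, like a bio whose
	 * queue limits bite first */
	struct buf bufs[NBUF] = { { 0, 4 }, { 0, 2 }, { 0, 4 } };
	int rounds = 0;

	for (;;) {
		int i;

		for (i = 0; i < NBUF; i++) {
			if (!add_page(&bufs[i])) {
				/* back the page out of everyone that already
				 * took it this round, as the bio2 loop does */
				for (int j = 0; j < i; j++)
					bufs[j].pages--;
				goto full;
			}
		}
		rounds++;	/* a full round: every buffer grew by one page */
	}
full:
	printf("every buffer ends with %d pages\n", rounds);
	return 0;
}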
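
Several hunks above (read_balance(), the write loop in make_request()) pin a device by incrementing nr_pending and only then re-test its failure state, dropping the reference and retrying elsewhere if the device failed in the window between the first check and the increment; the "cannot risk returning a device that failed before we inc'ed nr_pending" comment states the invariant. A compressed model of that ordering, using C11 atomics rather than the kernel's primitives; the struct below is a stand-in, not the kernel's rdev:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct dev {
	atomic_int nr_pending;	/* references held by in-flight I/O */
	atomic_bool faulty;	/* flipped asynchronously on error */
};

/* Pin a device for I/O. The re-test after the increment is the point:
 * a device that failed after we looked but before we took the
 * reference must be released again, and the caller retries elsewhere. */
static bool pin_dev(struct dev *d)
{
	if (atomic_load(&d->faulty))
		return false;
	atomic_fetch_add(&d->nr_pending, 1);
	if (atomic_load(&d->faulty)) {
		atomic_fetch_sub(&d->nr_pending, 1);
		return false;
	}
	return true;
}

int main(void)
{
	struct dev d = { 0 };

	printf("healthy: pinned=%d pending=%d\n",
	       pin_dev(&d), atomic_load(&d.nr_pending));
	atomic_store(&d.faulty, true);
	printf("faulty:  pinned=%d pending=%d\n",
	       pin_dev(&d), atomic_load(&d.nr_pending));
	return 0;
}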
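
The raid1_issue_flush(), raid5_issue_flush() and raid6_issue_flush() additions above share one control flow: walk the member devices, skip missing or faulty ones, and stop at the first failure; the raid5/raid6 variants additionally fail with -EOPNOTSUPP when a member queue lacks an issue_flush_fn, while raid1 silently skips such members. The same shape with the queues reduced to function pointers (all names hypothetical):

#include <errno.h>
#include <stdio.h>

/* One hook per member device; NULL stands for a queue that has no
 * issue_flush_fn, which raid5/raid6 above turn into -EOPNOTSUPP. */
typedef int (*flush_fn)(int dev);

static int flush_ok(int dev)  { (void)dev; return 0; }
static int flush_eio(int dev) { (void)dev; return -EIO; }

static int array_issue_flush(flush_fn fns[], int n)
{
	int ret = 0;

	for (int i = 0; i < n; i++) {
		if (!fns[i]) {
			ret = -EOPNOTSUPP;
			break;
		}
		ret = fns[i](i);	/* first hard error wins */
		if (ret)
			break;
	}
	return ret;
}

int main(void)
{
	flush_fn all_good[] = { flush_ok, flush_ok };
	flush_fn one_bad[]  = { flush_ok, flush_eio };
	flush_fn no_hook[]  = { flush_ok, NULL };

	printf("%d %d %d\n",
	       array_issue_flush(all_good, 2),
	       array_issue_flush(one_bad, 2),
	       array_issue_flush(no_hook, 2));
	return 0;
}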
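
The DM_TARGET_MSG ioctl added to dm-ioctl.h above carries its text as a zero-length array (char message[0]) behind a fixed header, so a sender sizes the allocation as header plus string. A user-space sketch of building that payload; the real call additionally wraps it in struct dm_ioctl, which is not shown here, and the message text is only an example:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mirrors struct dm_target_msg above; [] is the C99 spelling of the
 * header's zero-length array. */
struct dm_target_msg {
	uint64_t sector;	/* picks the target by device offset */
	char message[];		/* NUL-terminated text follows the header */
};

static struct dm_target_msg *build_msg(uint64_t sector, const char *text)
{
	size_t len = strlen(text) + 1;		/* keep the NUL */
	struct dm_target_msg *m = malloc(sizeof(*m) + len);

	if (!m)
		return NULL;
	m->sector = sector;
	memcpy(m->message, text, len);
	return m;
}

int main(void)
{
	struct dm_target_msg *m = build_msg(0, "fail_if_no_path");

	if (!m)
		return 1;
	printf("sector %llu, message \"%s\"\n",
	       (unsigned long long)m->sector, m->message);
	free(m);
	return 0;
}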