Diffstat (limited to 'openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch')
-rw-r--r--  openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch | 9859
1 file changed, 0 insertions, 9859 deletions
diff --git a/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch b/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch
deleted file mode 100644
index 4075cab..0000000
--- a/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch
+++ /dev/null
@@ -1,9859 +0,0 @@
-diff -pruN ./drivers/md.dm/dm-bio-list.h ./drivers/md/dm-bio-list.h
---- ./drivers/md.dm/dm-bio-list.h 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-bio-list.h 2006-03-17 13:16:38.000000000 +0300
-@@ -33,6 +33,9 @@ static inline void bio_list_add(struct b
-
- static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
- {
-+ if (!bl2->head)
-+ return;
-+
- if (bl->tail)
- bl->tail->bi_next = bl2->head;
- else
-diff -pruN ./drivers/md.dm/dm-bio-record.h ./drivers/md/dm-bio-record.h
---- ./drivers/md.dm/dm-bio-record.h 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-bio-record.h 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,45 @@
-+/*
-+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
-+ *
-+ * This file is released under the GPL.
-+ */
-+
-+#ifndef DM_BIO_RECORD_H
-+#define DM_BIO_RECORD_H
-+
-+#include <linux/bio.h>
-+
-+/*
-+ * There are lots of mutable fields in the bio struct that get
-+ * changed by the lower levels of the block layer. Some targets,
-+ * such as multipath, may wish to resubmit a bio on error. The
-+ * functions in this file help the target record and restore the
-+ * original bio state.
-+ */
-+struct dm_bio_details {
-+ sector_t bi_sector;
-+ struct block_device *bi_bdev;
-+ unsigned int bi_size;
-+ unsigned short bi_idx;
-+ unsigned long bi_flags;
-+};
-+
-+static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
-+{
-+ bd->bi_sector = bio->bi_sector;
-+ bd->bi_bdev = bio->bi_bdev;
-+ bd->bi_size = bio->bi_size;
-+ bd->bi_idx = bio->bi_idx;
-+ bd->bi_flags = bio->bi_flags;
-+}
-+
-+static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
-+{
-+ bio->bi_sector = bd->bi_sector;
-+ bio->bi_bdev = bd->bi_bdev;
-+ bio->bi_size = bd->bi_size;
-+ bio->bi_idx = bd->bi_idx;
-+ bio->bi_flags = bd->bi_flags;
-+}
-+
-+#endif
-diff -pruN ./drivers/md.dm/dm.c ./drivers/md/dm.c
---- ./drivers/md.dm/dm.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm.c 2006-03-17 13:16:38.000000000 +0300
-@@ -15,15 +15,13 @@
- #include <linux/buffer_head.h>
- #include <linux/mempool.h>
- #include <linux/slab.h>
-+#include <linux/idr.h>
-
- static const char *_name = DM_NAME;
-
- static unsigned int major = 0;
- static unsigned int _major = 0;
-
--static int realloc_minor_bits(unsigned long requested_minor);
--static void free_minor_bits(void);
--
- /*
- * One of these is allocated per bio.
- */
-@@ -32,6 +30,7 @@ struct dm_io {
- int error;
- struct bio *bio;
- atomic_t io_count;
-+ unsigned long start_time;
- };
-
- /*
-@@ -44,15 +43,23 @@ struct target_io {
- union map_info info;
- };
-
-+union map_info *dm_get_mapinfo(struct bio *bio)
-+{
-+ if (bio && bio->bi_private)
-+ return &((struct target_io *)bio->bi_private)->info;
-+ return NULL;
-+}
-+
- /*
- * Bits for the md->flags field.
- */
- #define DMF_BLOCK_IO 0
- #define DMF_SUSPENDED 1
--#define DMF_FS_LOCKED 2
-+#define DMF_FROZEN 2
-
- struct mapped_device {
-- struct rw_semaphore lock;
-+ struct rw_semaphore io_lock;
-+ struct semaphore suspend_lock;
- rwlock_t map_lock;
- atomic_t holders;
-
-@@ -61,6 +68,8 @@ struct mapped_device {
- request_queue_t *queue;
- struct gendisk *disk;
-
-+ void *interface_ptr;
-+
- /*
- * A list of ios that arrived while we were suspended.
- */
-@@ -89,6 +98,7 @@ struct mapped_device {
- * freeze/thaw support require holding onto a super block
- */
- struct super_block *frozen_sb;
-+ struct block_device *suspended_bdev;
- };
-
- #define MIN_IOS 256
-@@ -113,19 +123,11 @@ static int __init local_init(void)
- return -ENOMEM;
- }
-
-- r = realloc_minor_bits(1024);
-- if (r < 0) {
-- kmem_cache_destroy(_tio_cache);
-- kmem_cache_destroy(_io_cache);
-- return r;
-- }
--
- _major = major;
- r = register_blkdev(_major, _name);
- if (r < 0) {
- kmem_cache_destroy(_tio_cache);
- kmem_cache_destroy(_io_cache);
-- free_minor_bits();
- return r;
- }
-
-@@ -139,7 +141,6 @@ static void local_exit(void)
- {
- kmem_cache_destroy(_tio_cache);
- kmem_cache_destroy(_io_cache);
-- free_minor_bits();
-
- if (unregister_blkdev(_major, _name) < 0)
- DMERR("devfs_unregister_blkdev failed");
-@@ -238,21 +239,53 @@ static inline void free_tio(struct mappe
- mempool_free(tio, md->tio_pool);
- }
-
-+static void start_io_acct(struct dm_io *io)
-+{
-+ struct mapped_device *md = io->md;
-+
-+ io->start_time = jiffies;
-+
-+ disk_round_stats(dm_disk(md));
-+ dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
-+}
-+
-+static int end_io_acct(struct dm_io *io)
-+{
-+ struct mapped_device *md = io->md;
-+ struct bio *bio = io->bio;
-+ unsigned long duration = jiffies - io->start_time;
-+ int pending;
-+
-+ disk_round_stats(dm_disk(md));
-+ dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);
-+
-+ switch (bio_data_dir(bio)) {
-+ case WRITE:
-+ disk_stat_add(dm_disk(md), write_ticks, duration);
-+ break;
-+ case READ:
-+ disk_stat_add(dm_disk(md), read_ticks, duration);
-+ break;
-+ }
-+
-+ return !pending;
-+}
-+
- /*
- * Add the bio to the list of deferred io.
- */
- static int queue_io(struct mapped_device *md, struct bio *bio)
- {
-- down_write(&md->lock);
-+ down_write(&md->io_lock);
-
- if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
-- up_write(&md->lock);
-+ up_write(&md->io_lock);
- return 1;
- }
-
- bio_list_add(&md->deferred, bio);
-
-- up_write(&md->lock);
-+ up_write(&md->io_lock);
- return 0; /* deferred successfully */
- }
-
-@@ -293,7 +326,7 @@ static inline void dec_pending(struct dm
- io->error = error;
-
- if (atomic_dec_and_test(&io->io_count)) {
-- if (atomic_dec_and_test(&io->md->pending))
-+ if (end_io_acct(io))
- /* nudge anyone waiting on suspend queue */
- wake_up(&io->md->wait);
-
-@@ -342,8 +375,8 @@ static sector_t max_io_len(struct mapped
- */
- if (ti->split_io) {
- sector_t boundary;
-- boundary = dm_round_up(offset + 1, ti->split_io) - offset;
--
-+ boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
-+ - offset;
- if (len > boundary)
- len = boundary;
- }
-@@ -379,7 +412,7 @@ static void __map_bio(struct dm_target *
- /* error the io and bail out */
- struct dm_io *io = tio->io;
- free_tio(tio->io->md, tio);
-- dec_pending(io, -EIO);
-+ dec_pending(io, r);
- bio_put(clone);
- }
- }
-@@ -542,7 +575,7 @@ static void __split_bio(struct mapped_de
- ci.sector_count = bio_sectors(bio);
- ci.idx = bio->bi_idx;
-
-- atomic_inc(&md->pending);
-+ start_io_acct(ci.io);
- while (ci.sector_count)
- __clone_and_map(&ci);
-
-@@ -563,14 +596,22 @@ static int dm_request(request_queue_t *q
- int r;
- struct mapped_device *md = q->queuedata;
-
-- down_read(&md->lock);
-+ down_read(&md->io_lock);
-+
-+ if (bio_data_dir(bio) == WRITE) {
-+ disk_stat_inc(dm_disk(md), writes);
-+ disk_stat_add(dm_disk(md), write_sectors, bio_sectors(bio));
-+ } else {
-+ disk_stat_inc(dm_disk(md), reads);
-+ disk_stat_add(dm_disk(md), read_sectors, bio_sectors(bio));
-+ }
-
- /*
- * If we're suspended we have to queue
- * this io for later.
- */
- while (test_bit(DMF_BLOCK_IO, &md->flags)) {
-- up_read(&md->lock);
-+ up_read(&md->io_lock);
-
- if (bio_rw(bio) == READA) {
- bio_io_error(bio, bio->bi_size);
-@@ -589,14 +630,29 @@ static int dm_request(request_queue_t *q
- * We're in a while loop, because someone could suspend
- * before we get to the following read lock.
- */
-- down_read(&md->lock);
-+ down_read(&md->io_lock);
- }
-
- __split_bio(md, bio);
-- up_read(&md->lock);
-+ up_read(&md->io_lock);
- return 0;
- }
-
-+static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ struct mapped_device *md = q->queuedata;
-+ struct dm_table *map = dm_get_table(md);
-+ int ret = -ENXIO;
-+
-+ if (map) {
-+ ret = dm_table_flush_all(map);
-+ dm_table_put(map);
-+ }
-+
-+ return ret;
-+}
-+
- static void dm_unplug_all(request_queue_t *q)
- {
- struct mapped_device *md = q->queuedata;
-@@ -624,109 +680,86 @@ static int dm_any_congested(void *conges
- }
-
- /*-----------------------------------------------------------------
-- * A bitset is used to keep track of allocated minor numbers.
-+ * An IDR is used to keep track of allocated minor numbers.
- *---------------------------------------------------------------*/
- static DECLARE_MUTEX(_minor_lock);
--static unsigned long *_minor_bits = NULL;
--static unsigned long _max_minors = 0;
--
--#define MINORS_SIZE(minors) ((minors / BITS_PER_LONG) * sizeof(unsigned long))
--
--static int realloc_minor_bits(unsigned long requested_minor)
--{
-- unsigned long max_minors;
-- unsigned long *minor_bits, *tmp;
--
-- if (requested_minor < _max_minors)
-- return -EINVAL;
--
-- /* Round up the requested minor to the next power-of-2. */
-- max_minors = 1 << fls(requested_minor - 1);
-- if (max_minors > (1 << MINORBITS))
-- return -EINVAL;
--
-- minor_bits = kmalloc(MINORS_SIZE(max_minors), GFP_KERNEL);
-- if (!minor_bits)
-- return -ENOMEM;
-- memset(minor_bits, 0, MINORS_SIZE(max_minors));
--
-- /* Copy the existing bit-set to the new one. */
-- if (_minor_bits)
-- memcpy(minor_bits, _minor_bits, MINORS_SIZE(_max_minors));
--
-- tmp = _minor_bits;
-- _minor_bits = minor_bits;
-- _max_minors = max_minors;
-- if (tmp)
-- kfree(tmp);
--
-- return 0;
--}
--
--static void free_minor_bits(void)
--{
-- down(&_minor_lock);
-- kfree(_minor_bits);
-- _minor_bits = NULL;
-- _max_minors = 0;
-- up(&_minor_lock);
--}
-+static DEFINE_IDR(_minor_idr);
-
- static void free_minor(unsigned int minor)
- {
- down(&_minor_lock);
-- if (minor < _max_minors)
-- clear_bit(minor, _minor_bits);
-+ idr_remove(&_minor_idr, minor);
- up(&_minor_lock);
- }
-
- /*
- * See if the device with a specific minor # is free.
- */
--static int specific_minor(unsigned int minor)
-+static int specific_minor(struct mapped_device *md, unsigned int minor)
- {
-- int r = 0;
-+ int r, m;
-
-- if (minor > (1 << MINORBITS))
-+ if (minor >= (1 << MINORBITS))
- return -EINVAL;
-
- down(&_minor_lock);
-- if (minor >= _max_minors) {
-- r = realloc_minor_bits(minor);
-- if (r) {
-- up(&_minor_lock);
-- return r;
-- }
-+
-+ if (idr_find(&_minor_idr, minor)) {
-+ r = -EBUSY;
-+ goto out;
-+ }
-+
-+ r = idr_pre_get(&_minor_idr, GFP_KERNEL);
-+ if (!r) {
-+ r = -ENOMEM;
-+ goto out;
-+ }
-+
-+ r = idr_get_new_above(&_minor_idr, md, minor, &m);
-+ if (r) {
-+ goto out;
- }
-
-- if (test_and_set_bit(minor, _minor_bits))
-+ if (m != minor) {
-+ idr_remove(&_minor_idr, m);
- r = -EBUSY;
-- up(&_minor_lock);
-+ goto out;
-+ }
-
-+out:
-+ up(&_minor_lock);
- return r;
- }
-
--static int next_free_minor(unsigned int *minor)
-+static int next_free_minor(struct mapped_device *md, unsigned int *minor)
- {
- int r;
- unsigned int m;
-
- down(&_minor_lock);
-- m = find_first_zero_bit(_minor_bits, _max_minors);
-- if (m >= _max_minors) {
-- r = realloc_minor_bits(_max_minors * 2);
-- if (r) {
-- up(&_minor_lock);
-- return r;
-- }
-- m = find_first_zero_bit(_minor_bits, _max_minors);
-+
-+ r = idr_pre_get(&_minor_idr, GFP_KERNEL);
-+ if (!r) {
-+ r = -ENOMEM;
-+ goto out;
-+ }
-+
-+ r = idr_get_new(&_minor_idr, md, &m);
-+ if (r) {
-+ goto out;
-+ }
-+
-+ if (m >= (1 << MINORBITS)) {
-+ idr_remove(&_minor_idr, m);
-+ r = -ENOSPC;
-+ goto out;
- }
-
-- set_bit(m, _minor_bits);
- *minor = m;
-- up(&_minor_lock);
-
-- return 0;
-+out:
-+ up(&_minor_lock);
-+ return r;
- }
-
- static struct block_device_operations dm_blk_dops;
-@@ -745,12 +778,13 @@ static struct mapped_device *alloc_dev(u
- }
-
- /* get a minor number for the dev */
-- r = persistent ? specific_minor(minor) : next_free_minor(&minor);
-+ r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor);
- if (r < 0)
- goto bad1;
-
- memset(md, 0, sizeof(*md));
-- init_rwsem(&md->lock);
-+ init_rwsem(&md->io_lock);
-+ init_MUTEX(&md->suspend_lock);
- rwlock_init(&md->map_lock);
- atomic_set(&md->holders, 1);
- atomic_set(&md->event_nr, 0);
-@@ -764,6 +798,7 @@ static struct mapped_device *alloc_dev(u
- md->queue->backing_dev_info.congested_data = md;
- blk_queue_make_request(md->queue, dm_request);
- md->queue->unplug_fn = dm_unplug_all;
-+ md->queue->issue_flush_fn = dm_flush_all;
-
- md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
- mempool_free_slab, _io_cache);
-@@ -823,22 +858,17 @@ static void event_callback(void *context
- {
- struct mapped_device *md = (struct mapped_device *) context;
-
-- atomic_inc(&md->event_nr);;
-+ atomic_inc(&md->event_nr);
- wake_up(&md->eventq);
- }
-
--static void __set_size(struct gendisk *disk, sector_t size)
-+static void __set_size(struct mapped_device *md, sector_t size)
- {
-- struct block_device *bdev;
-+ set_capacity(md->disk, size);
-
-- set_capacity(disk, size);
-- bdev = bdget_disk(disk, 0);
-- if (bdev) {
-- down(&bdev->bd_inode->i_sem);
-- i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-- up(&bdev->bd_inode->i_sem);
-- bdput(bdev);
-- }
-+ down(&md->suspended_bdev->bd_inode->i_sem);
-+ i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-+ up(&md->suspended_bdev->bd_inode->i_sem);
- }
-
- static int __bind(struct mapped_device *md, struct dm_table *t)
-@@ -847,17 +877,18 @@ static int __bind(struct mapped_device *
- sector_t size;
-
- size = dm_table_get_size(t);
-- __set_size(md->disk, size);
-+ __set_size(md, size);
- if (size == 0)
- return 0;
-
-+ dm_table_get(t);
-+ dm_table_event_callback(t, event_callback, md);
-+
- write_lock(&md->map_lock);
- md->map = t;
-+ dm_table_set_restrictions(t, q);
- write_unlock(&md->map_lock);
-
-- dm_table_get(t);
-- dm_table_event_callback(md->map, event_callback, md);
-- dm_table_set_restrictions(t, q);
- return 0;
- }
-
-@@ -901,6 +932,32 @@ int dm_create_with_minor(unsigned int mi
- return create_aux(minor, 1, result);
- }
-
-+void *dm_get_mdptr(dev_t dev)
-+{
-+ struct mapped_device *md;
-+ void *mdptr = NULL;
-+ unsigned minor = MINOR(dev);
-+
-+ if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
-+ return NULL;
-+
-+ down(&_minor_lock);
-+
-+ md = idr_find(&_minor_idr, minor);
-+
-+ if (md && (dm_disk(md)->first_minor == minor))
-+ mdptr = md->interface_ptr;
-+
-+ up(&_minor_lock);
-+
-+ return mdptr;
-+}
-+
-+void dm_set_mdptr(struct mapped_device *md, void *ptr)
-+{
-+ md->interface_ptr = ptr;
-+}
-+
- void dm_get(struct mapped_device *md)
- {
- atomic_inc(&md->holders);
-@@ -911,8 +968,10 @@ void dm_put(struct mapped_device *md)
- struct dm_table *map = dm_get_table(md);
-
- if (atomic_dec_and_test(&md->holders)) {
-- if (!test_bit(DMF_SUSPENDED, &md->flags) && map)
-- dm_table_suspend_targets(map);
-+ if (!dm_suspended(md)) {
-+ dm_table_presuspend_targets(map);
-+ dm_table_postsuspend_targets(map);
-+ }
- __unbind(md);
- free_dev(md);
- }
-@@ -940,69 +999,55 @@ static void __flush_deferred_io(struct m
- */
- int dm_swap_table(struct mapped_device *md, struct dm_table *table)
- {
-- int r;
-+ int r = -EINVAL;
-
-- down_write(&md->lock);
-+ down(&md->suspend_lock);
-
- /* device must be suspended */
-- if (!test_bit(DMF_SUSPENDED, &md->flags)) {
-- up_write(&md->lock);
-- return -EPERM;
-- }
-+ if (!dm_suspended(md))
-+ goto out;
-
- __unbind(md);
- r = __bind(md, table);
-- if (r)
-- return r;
-
-- up_write(&md->lock);
-- return 0;
-+out:
-+ up(&md->suspend_lock);
-+ return r;
- }
-
- /*
- * Functions to lock and unlock any filesystem running on the
- * device.
- */
--static int __lock_fs(struct mapped_device *md)
-+static int lock_fs(struct mapped_device *md)
- {
-- struct block_device *bdev;
-+ int r;
-
-- if (test_and_set_bit(DMF_FS_LOCKED, &md->flags))
-- return 0;
-+ WARN_ON(md->frozen_sb);
-
-- bdev = bdget_disk(md->disk, 0);
-- if (!bdev) {
-- DMWARN("bdget failed in __lock_fs");
-- return -ENOMEM;
-+ md->frozen_sb = freeze_bdev(md->suspended_bdev);
-+ if (IS_ERR(md->frozen_sb)) {
-+ r = PTR_ERR(md->frozen_sb);
-+ md->frozen_sb = NULL;
-+ return r;
- }
-
-- WARN_ON(md->frozen_sb);
-- md->frozen_sb = freeze_bdev(bdev);
-+ set_bit(DMF_FROZEN, &md->flags);
-+
- /* don't bdput right now, we don't want the bdev
-- * to go away while it is locked. We'll bdput
-- * in __unlock_fs
-+ * to go away while it is locked.
- */
- return 0;
- }
-
--static int __unlock_fs(struct mapped_device *md)
-+static void unlock_fs(struct mapped_device *md)
- {
-- struct block_device *bdev;
--
-- if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags))
-- return 0;
--
-- bdev = bdget_disk(md->disk, 0);
-- if (!bdev) {
-- DMWARN("bdget failed in __unlock_fs");
-- return -ENOMEM;
-- }
-+ if (!test_bit(DMF_FROZEN, &md->flags))
-+ return;
-
-- thaw_bdev(bdev, md->frozen_sb);
-+ thaw_bdev(md->suspended_bdev, md->frozen_sb);
- md->frozen_sb = NULL;
-- bdput(bdev);
-- bdput(bdev);
-- return 0;
-+ clear_bit(DMF_FROZEN, &md->flags);
- }
-
- /*
-@@ -1012,46 +1057,48 @@ static int __unlock_fs(struct mapped_dev
- * dm_bind_table, dm_suspend must be called to flush any in
- * flight bios and ensure that any further io gets deferred.
- */
--int dm_suspend(struct mapped_device *md)
-+int dm_suspend(struct mapped_device *md, int do_lockfs)
- {
-- struct dm_table *map;
-+ struct dm_table *map = NULL;
- DECLARE_WAITQUEUE(wait, current);
-+ int r = -EINVAL;
-
-- /* Flush I/O to the device. */
-- down_read(&md->lock);
-- if (test_bit(DMF_BLOCK_IO, &md->flags)) {
-- up_read(&md->lock);
-- return -EINVAL;
-+ down(&md->suspend_lock);
-+
-+ if (dm_suspended(md))
-+ goto out;
-+
-+ map = dm_get_table(md);
-+
-+ /* This does not get reverted if there's an error later. */
-+ dm_table_presuspend_targets(map);
-+
-+ md->suspended_bdev = bdget_disk(md->disk, 0);
-+ if (!md->suspended_bdev) {
-+ DMWARN("bdget failed in dm_suspend");
-+ r = -ENOMEM;
-+ goto out;
- }
-
-- __lock_fs(md);
-- up_read(&md->lock);
-+ /* Flush I/O to the device. */
-+ if (do_lockfs) {
-+ r = lock_fs(md);
-+ if (r)
-+ goto out;
-+ }
-
- /*
-- * First we set the BLOCK_IO flag so no more ios will be
-- * mapped.
-+ * First we set the BLOCK_IO flag so no more ios will be mapped.
- */
-- down_write(&md->lock);
-- if (test_bit(DMF_BLOCK_IO, &md->flags)) {
-- /*
-- * If we get here we know another thread is
-- * trying to suspend as well, so we leave the fs
-- * locked for this thread.
-- */
-- up_write(&md->lock);
-- return -EINVAL;
-- }
--
-+ down_write(&md->io_lock);
- set_bit(DMF_BLOCK_IO, &md->flags);
-+
- add_wait_queue(&md->wait, &wait);
-- up_write(&md->lock);
-+ up_write(&md->io_lock);
-
- /* unplug */
-- map = dm_get_table(md);
-- if (map) {
-+ if (map)
- dm_table_unplug_all(map);
-- dm_table_put(map);
-- }
-
- /*
- * Then we wait for the already mapped ios to
-@@ -1067,54 +1114,75 @@ int dm_suspend(struct mapped_device *md)
- }
- set_current_state(TASK_RUNNING);
-
-- down_write(&md->lock);
-+ down_write(&md->io_lock);
- remove_wait_queue(&md->wait, &wait);
-
- /* were we interrupted ? */
-+ r = -EINTR;
- if (atomic_read(&md->pending)) {
-- __unlock_fs(md);
-+ up_write(&md->io_lock);
-+ unlock_fs(md);
- clear_bit(DMF_BLOCK_IO, &md->flags);
-- up_write(&md->lock);
-- return -EINTR;
-+ goto out;
- }
-+ up_write(&md->io_lock);
-+
-+ dm_table_postsuspend_targets(map);
-
- set_bit(DMF_SUSPENDED, &md->flags);
-
-- map = dm_get_table(md);
-- if (map)
-- dm_table_suspend_targets(map);
-- dm_table_put(map);
-- up_write(&md->lock);
-+ r = 0;
-
-- return 0;
-+out:
-+ if (r && md->suspended_bdev) {
-+ bdput(md->suspended_bdev);
-+ md->suspended_bdev = NULL;
-+ }
-+
-+ dm_table_put(map);
-+ up(&md->suspend_lock);
-+ return r;
- }
-
- int dm_resume(struct mapped_device *md)
- {
-+ int r = -EINVAL;
- struct bio *def;
-- struct dm_table *map = dm_get_table(md);
-+ struct dm_table *map = NULL;
-
-- down_write(&md->lock);
-- if (!map ||
-- !test_bit(DMF_SUSPENDED, &md->flags) ||
-- !dm_table_get_size(map)) {
-- up_write(&md->lock);
-- dm_table_put(map);
-- return -EINVAL;
-- }
-+ down(&md->suspend_lock);
-+ if (!dm_suspended(md))
-+ goto out;
-+
-+ map = dm_get_table(md);
-+ if (!map || !dm_table_get_size(map))
-+ goto out;
-
- dm_table_resume_targets(map);
-- clear_bit(DMF_SUSPENDED, &md->flags);
-+
-+ down_write(&md->io_lock);
- clear_bit(DMF_BLOCK_IO, &md->flags);
-
- def = bio_list_get(&md->deferred);
- __flush_deferred_io(md, def);
-- up_write(&md->lock);
-- __unlock_fs(md);
-+ up_write(&md->io_lock);
-+
-+ unlock_fs(md);
-+
-+ bdput(md->suspended_bdev);
-+ md->suspended_bdev = NULL;
-+
-+ clear_bit(DMF_SUSPENDED, &md->flags);
-+
- dm_table_unplug_all(map);
-+
-+ r = 0;
-+
-+out:
- dm_table_put(map);
-+ up(&md->suspend_lock);
-
-- return 0;
-+ return r;
- }
-
- /*-----------------------------------------------------------------
-@@ -1151,6 +1219,8 @@ static struct block_device_operations dm
- .owner = THIS_MODULE
- };
-
-+EXPORT_SYMBOL(dm_get_mapinfo);
-+
- /*
- * module hooks
- */
-@@ -1160,5 +1230,5 @@ module_exit(dm_exit);
- module_param(major, uint, 0);
- MODULE_PARM_DESC(major, "The major number of the device mapper");
- MODULE_DESCRIPTION(DM_NAME " driver");
--MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
-+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
- MODULE_LICENSE("GPL");
-diff -pruN ./drivers/md.dm/dm-crypt.c ./drivers/md/dm-crypt.c
---- ./drivers/md.dm/dm-crypt.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-crypt.c 2006-03-17 13:16:38.000000000 +0300
-@@ -40,8 +40,8 @@ struct convert_context {
- struct bio *bio_out;
- unsigned int offset_in;
- unsigned int offset_out;
-- int idx_in;
-- int idx_out;
-+ unsigned int idx_in;
-+ unsigned int idx_out;
- sector_t sector;
- int write;
- };
-@@ -67,8 +67,8 @@ struct crypt_config {
- struct crypto_tfm *tfm;
- sector_t iv_offset;
- int (*iv_generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
-- int iv_size;
-- int key_size;
-+ unsigned int iv_size;
-+ unsigned int key_size;
- u8 key[0];
- };
-
-@@ -97,10 +97,8 @@ static void mempool_free_page(void *page
- */
- static int crypt_iv_plain(struct crypt_config *cc, u8 *iv, sector_t sector)
- {
-+ memset(iv, 0, cc->iv_size);
- *(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
-- if (cc->iv_size > sizeof(u32) / sizeof(u8))
-- memset(iv + (sizeof(u32) / sizeof(u8)), 0,
-- cc->iv_size - (sizeof(u32) / sizeof(u8)));
-
- return 0;
- }
-@@ -200,13 +198,13 @@ static int crypt_convert(struct crypt_co
- */
- static struct bio *
- crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
-- struct bio *base_bio, int *bio_vec_idx)
-+ struct bio *base_bio, unsigned int *bio_vec_idx)
- {
- struct bio *bio;
-- int nr_iovecs = dm_div_up(size, PAGE_SIZE);
-+ unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- int gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
-- int flags = current->flags;
-- int i;
-+ unsigned long flags = current->flags;
-+ unsigned int i;
-
- /*
- * Tell VM to act less aggressively and fail earlier.
-@@ -280,9 +278,8 @@ crypt_alloc_buffer(struct crypt_config *
- static void crypt_free_buffer_pages(struct crypt_config *cc,
- struct bio *bio, unsigned int bytes)
- {
-- unsigned int start, end;
-+ unsigned int i, start, end;
- struct bio_vec *bv;
-- int i;
-
- /*
- * This is ugly, but Jens Axboe thinks that using bi_idx in the
-@@ -366,11 +363,11 @@ static void kcryptd_queue_io(struct cryp
- /*
- * Decode key from its hex representation
- */
--static int crypt_decode_key(u8 *key, char *hex, int size)
-+static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
- {
- char buffer[3];
- char *endp;
-- int i;
-+ unsigned int i;
-
- buffer[2] = '\0';
-
-@@ -393,9 +390,9 @@ static int crypt_decode_key(u8 *key, cha
- /*
- * Encode key into its hex representation
- */
--static void crypt_encode_key(char *hex, u8 *key, int size)
-+static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
- {
-- int i;
-+ unsigned int i;
-
- for(i = 0; i < size; i++) {
- sprintf(hex, "%02x", *key);
-@@ -415,8 +412,8 @@ static int crypt_ctr(struct dm_target *t
- char *tmp;
- char *cipher;
- char *mode;
-- int crypto_flags;
-- int key_size;
-+ unsigned int crypto_flags;
-+ unsigned int key_size;
-
- if (argc != 5) {
- ti->error = PFX "Not enough arguments";
-@@ -464,9 +461,9 @@ static int crypt_ctr(struct dm_target *t
- }
-
- if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv)
-- /* at least a 32 bit sector number should fit in our buffer */
-+ /* at least a 64 bit sector number should fit in our buffer */
- cc->iv_size = max(crypto_tfm_alg_ivsize(tfm),
-- (unsigned int)(sizeof(u32) / sizeof(u8)));
-+ (unsigned int)(sizeof(u64) / sizeof(u8)));
- else {
- cc->iv_size = 0;
- if (cc->iv_generator) {
-@@ -528,6 +525,8 @@ bad3:
- bad2:
- crypto_free_tfm(tfm);
- bad1:
-+ /* Must zero key material before freeing */
-+ memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
- kfree(cc);
- return -EINVAL;
- }
-@@ -541,6 +540,9 @@ static void crypt_dtr(struct dm_target *
-
- crypto_free_tfm(cc->tfm);
- dm_put_device(ti, cc->dev);
-+
-+ /* Must zero key material before freeing */
-+ memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
- kfree(cc);
- }
-
-@@ -577,7 +579,8 @@ static int crypt_endio(struct bio *bio,
-
- static inline struct bio *
- crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio,
-- sector_t sector, int *bvec_idx, struct convert_context *ctx)
-+ sector_t sector, unsigned int *bvec_idx,
-+ struct convert_context *ctx)
- {
- struct bio *clone;
-
-@@ -630,7 +633,7 @@ static int crypt_map(struct dm_target *t
- struct bio *clone;
- unsigned int remaining = bio->bi_size;
- sector_t sector = bio->bi_sector - ti->begin;
-- int bvec_idx = 0;
-+ unsigned int bvec_idx = 0;
-
- io->target = ti;
- io->bio = bio;
-@@ -693,7 +696,7 @@ static int crypt_status(struct dm_target
- char buffer[32];
- const char *cipher;
- const char *mode = NULL;
-- int offset;
-+ unsigned int offset;
-
- switch (type) {
- case STATUSTYPE_INFO:
-diff -pruN ./drivers/md.dm/dm-emc.c ./drivers/md/dm-emc.c
---- ./drivers/md.dm/dm-emc.c 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-emc.c 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,359 @@
-+/*
-+ * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved.
-+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
-+ *
-+ * This file is released under the GPL.
-+ *
-+ * Multipath support for EMC CLARiiON AX/CX-series hardware.
-+ */
-+
-+#include "dm.h"
-+#include "dm-hw-handler.h"
-+#include <scsi/scsi.h>
-+#include <scsi/scsi_cmnd.h>
-+
-+struct emc_handler {
-+ spinlock_t lock;
-+
-+ /* Whether we should send the short trespass command (FC-series)
-+ * or the long version (default for AX/CX CLARiiON arrays). */
-+ unsigned short_trespass;
-+ /* Whether or not to honor SCSI reservations when initiating a
-+ * switch-over. Default: Don't. */
-+ unsigned hr;
-+
-+ unsigned char sense[SCSI_SENSE_BUFFERSIZE];
-+};
-+
-+#define TRESPASS_PAGE 0x22
-+#define EMC_FAILOVER_TIMEOUT (60 * HZ)
-+
-+/* Code borrowed from dm-lsi-rdac by Mike Christie */
-+
-+static inline void free_bio(struct bio *bio)
-+{
-+ __free_page(bio->bi_io_vec[0].bv_page);
-+ bio_put(bio);
-+}
-+
-+static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+ struct path *path = bio->bi_private;
-+
-+ if (bio->bi_size)
-+ return 1;
-+
-+ /* We also need to look at the sense keys here whether or not to
-+ * switch to the next PG etc.
-+ *
-+ * For now simple logic: either it works or it doesn't.
-+ */
-+ if (error)
-+ dm_pg_init_complete(path, MP_FAIL_PATH);
-+ else
-+ dm_pg_init_complete(path, 0);
-+
-+ /* request is freed in block layer */
-+ free_bio(bio);
-+
-+ return 0;
-+}
-+
-+static struct bio *get_failover_bio(struct path *path, unsigned data_size)
-+{
-+ struct bio *bio;
-+ struct page *page;
-+
-+ bio = bio_alloc(GFP_ATOMIC, 1);
-+ if (!bio) {
-+ DMERR("dm-emc: get_failover_bio: bio_alloc() failed.");
-+ return NULL;
-+ }
-+
-+ bio->bi_rw |= (1 << BIO_RW);
-+ bio->bi_bdev = path->dev->bdev;
-+ bio->bi_sector = 0;
-+ bio->bi_private = path;
-+ bio->bi_end_io = emc_endio;
-+
-+ page = alloc_page(GFP_ATOMIC);
-+ if (!page) {
-+ DMERR("dm-emc: get_failover_bio: alloc_page() failed.");
-+ bio_put(bio);
-+ return NULL;
-+ }
-+
-+ if (bio_add_page(bio, page, data_size, 0) != data_size) {
-+ DMERR("dm-emc: get_failover_bio: alloc_page() failed.");
-+ __free_page(page);
-+ bio_put(bio);
-+ return NULL;
-+ }
-+
-+ return bio;
-+}
-+
-+static struct request *get_failover_req(struct emc_handler *h,
-+ struct bio *bio, struct path *path)
-+{
-+ struct request *rq;
-+ struct block_device *bdev = bio->bi_bdev;
-+ struct request_queue *q = bdev_get_queue(bdev);
-+
-+ /* FIXME: Figure out why it fails with GFP_ATOMIC. */
-+ rq = blk_get_request(q, WRITE, __GFP_WAIT);
-+ if (!rq) {
-+ DMERR("dm-emc: get_failover_req: blk_get_request failed");
-+ return NULL;
-+ }
-+
-+ rq->bio = rq->biotail = bio;
-+ blk_rq_bio_prep(q, rq, bio);
-+
-+ rq->rq_disk = bdev->bd_contains->bd_disk;
-+
-+ /* bio backed don't set data */
-+ rq->buffer = rq->data = NULL;
-+ /* rq data_len used for pc cmd's request_bufflen */
-+ rq->data_len = bio->bi_size;
-+
-+ rq->sense = h->sense;
-+ memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
-+ rq->sense_len = 0;
-+
-+ memset(&rq->cmd, 0, BLK_MAX_CDB);
-+
-+ rq->timeout = EMC_FAILOVER_TIMEOUT;
-+ rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE);
-+
-+ return rq;
-+}
-+
-+static struct request *emc_trespass_get(struct emc_handler *h,
-+ struct path *path)
-+{
-+ struct bio *bio;
-+ struct request *rq;
-+ unsigned char *page22;
-+ unsigned char long_trespass_pg[] = {
-+ 0, 0, 0, 0,
-+ TRESPASS_PAGE, /* Page code */
-+ 0x09, /* Page length - 2 */
-+ h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */
-+ 0xff, 0xff, /* Trespass target */
-+ 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */
-+ };
-+ unsigned char short_trespass_pg[] = {
-+ 0, 0, 0, 0,
-+ TRESPASS_PAGE, /* Page code */
-+ 0x02, /* Page length - 2 */
-+ h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */
-+ 0xff, /* Trespass target */
-+ };
-+ unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) :
-+ sizeof(long_trespass_pg);
-+
-+ /* get bio backing */
-+ if (data_size > PAGE_SIZE)
-+ /* this should never happen */
-+ return NULL;
-+
-+ bio = get_failover_bio(path, data_size);
-+ if (!bio) {
-+ DMERR("dm-emc: emc_trespass_get: no bio");
-+ return NULL;
-+ }
-+
-+ page22 = (unsigned char *)bio_data(bio);
-+ memset(page22, 0, data_size);
-+
-+ memcpy(page22, h->short_trespass ?
-+ short_trespass_pg : long_trespass_pg, data_size);
-+
-+ /* get request for block layer packet command */
-+ rq = get_failover_req(h, bio, path);
-+ if (!rq) {
-+ DMERR("dm-emc: emc_trespass_get: no rq");
-+ free_bio(bio);
-+ return NULL;
-+ }
-+
-+ /* Prepare the command. */
-+ rq->cmd[0] = MODE_SELECT;
-+ rq->cmd[1] = 0x10;
-+ rq->cmd[4] = data_size;
-+ rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
-+
-+ return rq;
-+}
-+
-+static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
-+ struct path *path)
-+{
-+ struct request *rq;
-+ struct request_queue *q = bdev_get_queue(path->dev->bdev);
-+
-+ /*
-+ * We can either blindly init the pg (then look at the sense),
-+ * or we can send some commands to get the state here (then
-+ * possibly send the fo cmnd), or we can also have the
-+ * initial state passed into us and then get an update here.
-+ */
-+ if (!q) {
-+ DMINFO("dm-emc: emc_pg_init: no queue");
-+ goto fail_path;
-+ }
-+
-+ /* FIXME: The request should be pre-allocated. */
-+ rq = emc_trespass_get(hwh->context, path);
-+ if (!rq) {
-+ DMERR("dm-emc: emc_pg_init: no rq");
-+ goto fail_path;
-+ }
-+
-+ DMINFO("dm-emc: emc_pg_init: sending switch-over command");
-+ elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
-+ return;
-+
-+fail_path:
-+ dm_pg_init_complete(path, MP_FAIL_PATH);
-+}
-+
-+static struct emc_handler *alloc_emc_handler(void)
-+{
-+ struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL);
-+
-+ if (h) {
-+ memset(h, 0, sizeof(*h));
-+ spin_lock_init(&h->lock);
-+ }
-+
-+ return h;
-+}
-+
-+static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
-+{
-+ struct emc_handler *h;
-+ unsigned hr, short_trespass;
-+
-+ if (argc == 0) {
-+ /* No arguments: use defaults */
-+ hr = 0;
-+ short_trespass = 0;
-+ } else if (argc != 2) {
-+ DMWARN("dm-emc hwhandler: incorrect number of arguments");
-+ return -EINVAL;
-+ } else {
-+ if ((sscanf(argv[0], "%u", &short_trespass) != 1)
-+ || (short_trespass > 1)) {
-+ DMWARN("dm-emc: invalid trespass mode selected");
-+ return -EINVAL;
-+ }
-+
-+ if ((sscanf(argv[1], "%u", &hr) != 1)
-+ || (hr > 1)) {
-+ DMWARN("dm-emc: invalid honor reservation flag selected");
-+ return -EINVAL;
-+ }
-+ }
-+
-+ h = alloc_emc_handler();
-+ if (!h)
-+ return -ENOMEM;
-+
-+ hwh->context = h;
-+
-+ if ((h->short_trespass = short_trespass))
-+ DMWARN("dm-emc: short trespass command will be send");
-+ else
-+ DMWARN("dm-emc: long trespass command will be send");
-+
-+ if ((h->hr = hr))
-+ DMWARN("dm-emc: honor reservation bit will be set");
-+ else
-+ DMWARN("dm-emc: honor reservation bit will not be set (default)");
-+
-+ return 0;
-+}
-+
-+static void emc_destroy(struct hw_handler *hwh)
-+{
-+ struct emc_handler *h = (struct emc_handler *) hwh->context;
-+
-+ kfree(h);
-+ hwh->context = NULL;
-+}
-+
-+static unsigned emc_error(struct hw_handler *hwh, struct bio *bio)
-+{
-+ /* FIXME: Patch from axboe still missing */
-+#if 0
-+ int sense;
-+
-+ if (bio->bi_error & BIO_SENSE) {
-+ sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */
-+
-+ if (sense == 0x020403) {
-+ /* LUN Not Ready - Manual Intervention Required
-+ * indicates this is a passive path.
-+ *
-+ * FIXME: However, if this is seen and EVPD C0
-+ * indicates that this is due to a NDU in
-+ * progress, we should set FAIL_PATH too.
-+ * This indicates we might have to do a SCSI
-+ * inquiry in the end_io path. Ugh. */
-+ return MP_BYPASS_PG | MP_RETRY_IO;
-+ } else if (sense == 0x052501) {
-+ /* An array based copy is in progress. Do not
-+ * fail the path, do not bypass to another PG,
-+ * do not retry. Fail the IO immediately.
-+ * (Actually this is the same conclusion as in
-+ * the default handler, but lets make sure.) */
-+ return 0;
-+ } else if (sense == 0x062900) {
-+ /* Unit Attention Code. This is the first IO
-+ * to the new path, so just retry. */
-+ return MP_RETRY_IO;
-+ }
-+ }
-+#endif
-+
-+ /* Try default handler */
-+ return dm_scsi_err_handler(hwh, bio);
-+}
-+
-+static struct hw_handler_type emc_hwh = {
-+ .name = "emc",
-+ .module = THIS_MODULE,
-+ .create = emc_create,
-+ .destroy = emc_destroy,
-+ .pg_init = emc_pg_init,
-+ .error = emc_error,
-+};
-+
-+static int __init dm_emc_init(void)
-+{
-+ int r = dm_register_hw_handler(&emc_hwh);
-+
-+ if (r < 0)
-+ DMERR("emc: register failed %d", r);
-+
-+ DMINFO("dm-emc version 0.0.3 loaded");
-+
-+ return r;
-+}
-+
-+static void __exit dm_emc_exit(void)
-+{
-+ int r = dm_unregister_hw_handler(&emc_hwh);
-+
-+ if (r < 0)
-+ DMERR("emc: unregister failed %d", r);
-+}
-+
-+module_init(dm_emc_init);
-+module_exit(dm_emc_exit);
-+
-+MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath");
-+MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>");
-+MODULE_LICENSE("GPL");
-diff -pruN ./drivers/md.dm/dm.h ./drivers/md/dm.h
---- ./drivers/md.dm/dm.h 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm.h 2006-03-17 13:16:38.000000000 +0300
-@@ -19,6 +19,9 @@
- #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
- #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
-
-+#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
-+ 0 : scnprintf(result + sz, maxlen - sz, x))
-+
- /*
- * FIXME: I think this should be with the definition of sector_t
- * in types.h.
-@@ -40,6 +43,7 @@ struct dm_dev {
- atomic_t count;
- int mode;
- struct block_device *bdev;
-+ char name[16];
- };
-
- struct dm_table;
-@@ -51,6 +55,8 @@ struct mapped_device;
- *---------------------------------------------------------------*/
- int dm_create(struct mapped_device **md);
- int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
-+void dm_set_mdptr(struct mapped_device *md, void *ptr);
-+void *dm_get_mdptr(dev_t dev);
-
- /*
- * Reference counting for md.
-@@ -61,7 +67,7 @@ void dm_put(struct mapped_device *md);
- /*
- * A device can still be used while suspended, but I/O is deferred.
- */
--int dm_suspend(struct mapped_device *md);
-+int dm_suspend(struct mapped_device *md, int with_lockfs);
- int dm_resume(struct mapped_device *md);
-
- /*
-@@ -109,10 +115,12 @@ void dm_table_set_restrictions(struct dm
- unsigned int dm_table_get_num_targets(struct dm_table *t);
- struct list_head *dm_table_get_devices(struct dm_table *t);
- int dm_table_get_mode(struct dm_table *t);
--void dm_table_suspend_targets(struct dm_table *t);
-+void dm_table_presuspend_targets(struct dm_table *t);
-+void dm_table_postsuspend_targets(struct dm_table *t);
- void dm_table_resume_targets(struct dm_table *t);
- int dm_table_any_congested(struct dm_table *t, int bdi_bits);
- void dm_table_unplug_all(struct dm_table *t);
-+int dm_table_flush_all(struct dm_table *t);
-
- /*-----------------------------------------------------------------
- * A registry of target types.
-@@ -135,21 +143,22 @@ static inline int array_too_big(unsigned
- }
-
- /*
-- * ceiling(n / size) * size
-+ * Ceiling(n / sz)
- */
--static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
--{
-- unsigned long r = n % size;
-- return n + (r ? (size - r) : 0);
--}
-+#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz))
-+
-+#define dm_sector_div_up(n, sz) ( \
-+{ \
-+ sector_t _r = ((n) + (sz) - 1); \
-+ sector_div(_r, (sz)); \
-+ _r; \
-+} \
-+)
-
- /*
-- * Ceiling(n / size)
-+ * ceiling(n / size) * size
- */
--static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
--{
-- return dm_round_up(n, size) / size;
--}
-+#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz))
-
- static inline sector_t to_sector(unsigned long n)
- {
-@@ -161,6 +170,8 @@ static inline unsigned long to_bytes(sec
- return (n << 9);
- }
-
-+int dm_split_args(int *argc, char ***argvp, char *input);
-+
- /*
- * The device-mapper can be driven through one of two interfaces;
- * ioctl or filesystem, depending which patch you have applied.
-@@ -178,5 +189,6 @@ int dm_stripe_init(void);
- void dm_stripe_exit(void);
-
- void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
-+union map_info *dm_get_mapinfo(struct bio *bio);
-
- #endif
-diff -pruN ./drivers/md.dm/dm-hw-handler.c ./drivers/md/dm-hw-handler.c
---- ./drivers/md.dm/dm-hw-handler.c 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-hw-handler.c 2006-03-20 09:38:13.000000000 +0300
-@@ -0,0 +1,216 @@
-+/*
-+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
-+ *
-+ * This file is released under the GPL.
-+ *
-+ * Multipath hardware handler registration.
-+ */
-+
-+#include "dm.h"
-+#include "dm-hw-handler.h"
-+
-+#include <linux/slab.h>
-+
-+struct hwh_internal {
-+ struct hw_handler_type hwht;
-+
-+ struct list_head list;
-+ long use;
-+};
-+
-+#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht)
-+
-+static LIST_HEAD(_hw_handlers);
-+static DECLARE_RWSEM(_hwh_lock);
-+
-+struct hwh_internal *__find_hw_handler_type(const char *name)
-+{
-+ struct hwh_internal *hwhi;
-+
-+ list_for_each_entry(hwhi, &_hw_handlers, list) {
-+ if (!strcmp(name, hwhi->hwht.name))
-+ return hwhi;
-+ }
-+
-+ return NULL;
-+}
-+
-+static struct hwh_internal *get_hw_handler(const char *name)
-+{
-+ struct hwh_internal *hwhi;
-+
-+ down_read(&_hwh_lock);
-+ hwhi = __find_hw_handler_type(name);
-+ if (hwhi) {
-+ if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module))
-+ hwhi = NULL;
-+ else
-+ hwhi->use++;
-+ }
-+ up_read(&_hwh_lock);
-+
-+ return hwhi;
-+}
-+
-+struct hw_handler_type *dm_get_hw_handler(const char *name)
-+{
-+ struct hwh_internal *hwhi;
-+
-+ if (!name)
-+ return NULL;
-+
-+ hwhi = get_hw_handler(name);
-+ if (!hwhi) {
-+ request_module("dm-%s", name);
-+ hwhi = get_hw_handler(name);
-+ }
-+
-+ return hwhi ? &hwhi->hwht : NULL;
-+}
-+
-+void dm_put_hw_handler(struct hw_handler_type *hwht)
-+{
-+ struct hwh_internal *hwhi;
-+
-+ if (!hwht)
-+ return;
-+
-+ down_read(&_hwh_lock);
-+ hwhi = __find_hw_handler_type(hwht->name);
-+ if (!hwhi)
-+ goto out;
-+
-+ if (--hwhi->use == 0)
-+ module_put(hwhi->hwht.module);
-+
-+ if (hwhi->use < 0)
-+ BUG();
-+
-+ out:
-+ up_read(&_hwh_lock);
-+}
-+
-+static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht)
-+{
-+ struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL);
-+
-+ if (hwhi) {
-+ memset(hwhi, 0, sizeof(*hwhi));
-+ hwhi->hwht = *hwht;
-+ }
-+
-+ return hwhi;
-+}
-+
-+int dm_register_hw_handler(struct hw_handler_type *hwht)
-+{
-+ int r = 0;
-+ struct hwh_internal *hwhi = _alloc_hw_handler(hwht);
-+
-+ if (!hwhi)
-+ return -ENOMEM;
-+
-+ down_write(&_hwh_lock);
-+
-+ if (__find_hw_handler_type(hwht->name)) {
-+ kfree(hwhi);
-+ r = -EEXIST;
-+ } else
-+ list_add(&hwhi->list, &_hw_handlers);
-+
-+ up_write(&_hwh_lock);
-+
-+ return r;
-+}
-+
-+int dm_unregister_hw_handler(struct hw_handler_type *hwht)
-+{
-+ struct hwh_internal *hwhi;
-+
-+ down_write(&_hwh_lock);
-+
-+ hwhi = __find_hw_handler_type(hwht->name);
-+ if (!hwhi) {
-+ up_write(&_hwh_lock);
-+ return -EINVAL;
-+ }
-+
-+ if (hwhi->use) {
-+ up_write(&_hwh_lock);
-+ return -ETXTBSY;
-+ }
-+
-+ list_del(&hwhi->list);
-+
-+ up_write(&_hwh_lock);
-+
-+ kfree(hwhi);
-+
-+ return 0;
-+}
-+
-+unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio)
-+{
-+#if 0
-+ int sense_key, asc, ascq;
-+
-+ if (bio->bi_error & BIO_SENSE) {
-+ /* FIXME: This is just an initial guess. */
-+ /* key / asc / ascq */
-+ sense_key = (bio->bi_error >> 16) & 0xff;
-+ asc = (bio->bi_error >> 8) & 0xff;
-+ ascq = bio->bi_error & 0xff;
-+
-+ switch (sense_key) {
-+ /* This block as a whole comes from the device.
-+ * So no point retrying on another path. */
-+ case 0x03: /* Medium error */
-+ case 0x05: /* Illegal request */
-+ case 0x07: /* Data protect */
-+ case 0x08: /* Blank check */
-+ case 0x0a: /* copy aborted */
-+ case 0x0c: /* obsolete - no clue ;-) */
-+ case 0x0d: /* volume overflow */
-+ case 0x0e: /* data miscompare */
-+ case 0x0f: /* reserved - no idea either. */
-+ return MP_ERROR_IO;
-+
-+ /* For these errors it's unclear whether they
-+ * come from the device or the controller.
-+ * So just lets try a different path, and if
-+ * it eventually succeeds, user-space will clear
-+ * the paths again... */
-+ case 0x02: /* Not ready */
-+ case 0x04: /* Hardware error */
-+ case 0x09: /* vendor specific */
-+ case 0x0b: /* Aborted command */
-+ return MP_FAIL_PATH;
-+
-+ case 0x06: /* Unit attention - might want to decode */
-+ if (asc == 0x04 && ascq == 0x01)
-+ /* "Unit in the process of
-+ * becoming ready" */
-+ return 0;
-+ return MP_FAIL_PATH;
-+
-+ /* FIXME: For Unit Not Ready we may want
-+ * to have a generic pg activation
-+ * feature (START_UNIT). */
-+
-+ /* Should these two ever end up in the
-+ * error path? I don't think so. */
-+ case 0x00: /* No sense */
-+ case 0x01: /* Recovered error */
-+ return 0;
-+ }
-+ }
-+#endif
-+
-+ /* We got no idea how to decode the other kinds of errors ->
-+ * assume generic error condition. */
-+ return MP_FAIL_PATH;
-+}
-+
-+EXPORT_SYMBOL_GPL(dm_register_hw_handler);
-+EXPORT_SYMBOL_GPL(dm_unregister_hw_handler);
-+EXPORT_SYMBOL_GPL(dm_scsi_err_handler);
-diff -pruN ./drivers/md.dm/dm-hw-handler.h ./drivers/md/dm-hw-handler.h
---- ./drivers/md.dm/dm-hw-handler.h 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-hw-handler.h 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,61 @@
-+/*
-+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
-+ *
-+ * This file is released under the GPL.
-+ *
-+ * Multipath hardware handler registration.
-+ */
-+
-+#ifndef DM_HW_HANDLER_H
-+#define DM_HW_HANDLER_H
-+
-+#include <linux/device-mapper.h>
-+
-+#include "dm-mpath.h"
-+
-+struct hw_handler_type;
-+struct hw_handler {
-+ struct hw_handler_type *type;
-+ void *context;
-+};
-+
-+/*
-+ * Constructs a hardware handler object, takes custom arguments
-+ */
-+/* Information about a hardware handler type */
-+struct hw_handler_type {
-+ char *name;
-+ struct module *module;
-+
-+ int (*create) (struct hw_handler *handler, unsigned int argc,
-+ char **argv);
-+ void (*destroy) (struct hw_handler *hwh);
-+
-+ void (*pg_init) (struct hw_handler *hwh, unsigned bypassed,
-+ struct path *path);
-+ unsigned (*error) (struct hw_handler *hwh, struct bio *bio);
-+ int (*status) (struct hw_handler *hwh, status_type_t type,
-+ char *result, unsigned int maxlen);
-+};
-+
-+/* Register a hardware handler */
-+int dm_register_hw_handler(struct hw_handler_type *type);
-+
-+/* Unregister a hardware handler */
-+int dm_unregister_hw_handler(struct hw_handler_type *type);
-+
-+/* Returns a registered hardware handler type */
-+struct hw_handler_type *dm_get_hw_handler(const char *name);
-+
-+/* Releases a hardware handler */
-+void dm_put_hw_handler(struct hw_handler_type *hwht);
-+
-+/* Default err function */
-+unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio);
-+
-+/* Error flags for err and dm_pg_init_complete */
-+#define MP_FAIL_PATH 1
-+#define MP_BYPASS_PG 2
-+#define MP_ERROR_IO 4 /* Don't retry this I/O */
-+
-+#endif
-diff -pruN ./drivers/md.dm/dm-io.c ./drivers/md/dm-io.c
---- ./drivers/md.dm/dm-io.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-io.c 2006-03-17 13:16:38.000000000 +0300
-@@ -267,7 +267,7 @@ static int resize_pool(unsigned int new_
- /* create new pool */
- _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL);
- if (!_io_pool)
-- r = -ENOMEM;
-+ return -ENOMEM;
-
- r = bio_set_init(&_bios, "dm-io", 512, 1);
- if (r) {
-diff -pruN ./drivers/md.dm/dm-ioctl.c ./drivers/md/dm-ioctl.c
---- ./drivers/md.dm/dm-ioctl.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-ioctl.c 2006-03-17 13:16:38.000000000 +0300
-@@ -1,5 +1,6 @@
- /*
- * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
-+ * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved.
- *
- * This file is released under the GPL.
- */
-@@ -17,7 +18,7 @@
-
- #include <asm/uaccess.h>
-
--#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
-+#define DM_DRIVER_EMAIL "dm-devel@redhat.com"
-
- /*-----------------------------------------------------------------
- * The ioctl interface needs to be able to look up devices by
-@@ -121,14 +122,6 @@ static struct hash_cell *__get_uuid_cell
- /*-----------------------------------------------------------------
- * Inserting, removing and renaming a device.
- *---------------------------------------------------------------*/
--static inline char *kstrdup(const char *str)
--{
-- char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
-- if (r)
-- strcpy(r, str);
-- return r;
--}
--
- static struct hash_cell *alloc_cell(const char *name, const char *uuid,
- struct mapped_device *md)
- {
-@@ -138,7 +131,7 @@ static struct hash_cell *alloc_cell(cons
- if (!hc)
- return NULL;
-
-- hc->name = kstrdup(name);
-+ hc->name = kstrdup(name, GFP_KERNEL);
- if (!hc->name) {
- kfree(hc);
- return NULL;
-@@ -148,7 +141,7 @@ static struct hash_cell *alloc_cell(cons
- hc->uuid = NULL;
-
- else {
-- hc->uuid = kstrdup(uuid);
-+ hc->uuid = kstrdup(uuid, GFP_KERNEL);
- if (!hc->uuid) {
- kfree(hc->name);
- kfree(hc);
-@@ -224,6 +217,7 @@ static int dm_hash_insert(const char *na
- }
- register_with_devfs(cell);
- dm_get(md);
-+ dm_set_mdptr(md, cell);
- up_write(&_hash_lock);
-
- return 0;
-@@ -236,10 +230,20 @@ static int dm_hash_insert(const char *na
-
- static void __hash_remove(struct hash_cell *hc)
- {
-+ struct dm_table *table;
-+
- /* remove from the dev hash */
- list_del(&hc->uuid_list);
- list_del(&hc->name_list);
- unregister_with_devfs(hc);
-+ dm_set_mdptr(hc->md, NULL);
-+
-+ table = dm_get_table(hc->md);
-+ if (table) {
-+ dm_table_event(table);
-+ dm_table_put(table);
-+ }
-+
- dm_put(hc->md);
- if (hc->new_map)
- dm_table_put(hc->new_map);
-@@ -266,11 +270,12 @@ static int dm_hash_rename(const char *ol
- {
- char *new_name, *old_name;
- struct hash_cell *hc;
-+ struct dm_table *table;
-
- /*
- * duplicate new.
- */
-- new_name = kstrdup(new);
-+ new_name = kstrdup(new, GFP_KERNEL);
- if (!new_name)
- return -ENOMEM;
-
-@@ -313,6 +318,15 @@ static int dm_hash_rename(const char *ol
- /* rename the device node in devfs */
- register_with_devfs(hc);
-
-+ /*
-+ * Wake up any dm event waiters.
-+ */
-+ table = dm_get_table(hc->md);
-+ if (table) {
-+ dm_table_event(table);
-+ dm_table_put(table);
-+ }
-+
- up_write(&_hash_lock);
- kfree(old_name);
- return 0;
-@@ -421,8 +435,8 @@ static void list_version_get_needed(stru
- {
- size_t *needed = needed_param;
-
-+ *needed += sizeof(struct dm_target_versions);
- *needed += strlen(tt->name);
-- *needed += sizeof(tt->version);
- *needed += ALIGN_MASK;
- }
-
-@@ -517,19 +531,22 @@ static int __dev_status(struct mapped_de
- if (dm_suspended(md))
- param->flags |= DM_SUSPEND_FLAG;
-
-- bdev = bdget_disk(disk, 0);
-- if (!bdev)
-- return -ENXIO;
--
- param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
-
-- /*
-- * Yes, this will be out of date by the time it gets back
-- * to userland, but it is still very useful ofr
-- * debugging.
-- */
-- param->open_count = bdev->bd_openers;
-- bdput(bdev);
-+ if (!(param->flags & DM_SKIP_BDGET_FLAG)) {
-+ bdev = bdget_disk(disk, 0);
-+ if (!bdev)
-+ return -ENXIO;
-+
-+ /*
-+ * Yes, this will be out of date by the time it gets back
-+ * to userland, but it is still very useful for
-+ * debugging.
-+ */
-+ param->open_count = bdev->bd_openers;
-+ bdput(bdev);
-+ } else
-+ param->open_count = -1;
-
- if (disk->policy)
- param->flags |= DM_READONLY_FLAG;
-@@ -579,12 +596,16 @@ static int dev_create(struct dm_ioctl *p
- }
-
- /*
-- * Always use UUID for lookups if it's present, otherwise use name.
-+ * Always use UUID for lookups if it's present, otherwise use name or dev.
- */
- static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
- {
-- return *param->uuid ?
-- __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
-+ if (*param->uuid)
-+ return __get_uuid_cell(param->uuid);
-+ else if (*param->name)
-+ return __get_name_cell(param->name);
-+ else
-+ return dm_get_mdptr(huge_decode_dev(param->dev));
- }
-
- static inline struct mapped_device *find_device(struct dm_ioctl *param)
-@@ -596,6 +617,7 @@ static inline struct mapped_device *find
- hc = __find_device_hash_cell(param);
- if (hc) {
- md = hc->md;
-+ dm_get(md);
-
- /*
- * Sneakily write in both the name and the uuid
-@@ -611,8 +633,6 @@ static inline struct mapped_device *find
- param->flags |= DM_INACTIVE_PRESENT_FLAG;
- else
- param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
--
-- dm_get(md);
- }
- up_read(&_hash_lock);
-
-@@ -673,14 +693,18 @@ static int dev_rename(struct dm_ioctl *p
- static int do_suspend(struct dm_ioctl *param)
- {
- int r = 0;
-+ int do_lockfs = 1;
- struct mapped_device *md;
-
- md = find_device(param);
- if (!md)
- return -ENXIO;
-
-+ if (param->flags & DM_SKIP_LOCKFS_FLAG)
-+ do_lockfs = 0;
-+
- if (!dm_suspended(md))
-- r = dm_suspend(md);
-+ r = dm_suspend(md, do_lockfs);
-
- if (!r)
- r = __dev_status(md, param);
-@@ -692,6 +716,7 @@ static int do_suspend(struct dm_ioctl *p
- static int do_resume(struct dm_ioctl *param)
- {
- int r = 0;
-+ int do_lockfs = 1;
- struct hash_cell *hc;
- struct mapped_device *md;
- struct dm_table *new_map;
-@@ -717,8 +742,10 @@ static int do_resume(struct dm_ioctl *pa
- /* Do we need to load a new map ? */
- if (new_map) {
- /* Suspend if it isn't already suspended */
-+ if (param->flags & DM_SKIP_LOCKFS_FLAG)
-+ do_lockfs = 0;
- if (!dm_suspended(md))
-- dm_suspend(md);
-+ dm_suspend(md, do_lockfs);
-
- r = dm_swap_table(md, new_map);
- if (r) {
-@@ -964,6 +991,7 @@ static int table_load(struct dm_ioctl *p
- if (!hc) {
- DMWARN("device doesn't appear to be in the dev hash table.");
- up_write(&_hash_lock);
-+ dm_table_put(t);
- return -ENXIO;
- }
-
-@@ -1097,6 +1125,67 @@ static int table_status(struct dm_ioctl
- return r;
- }
-
-+/*
-+ * Pass a message to the target that's at the supplied device offset.
-+ */
-+static int target_message(struct dm_ioctl *param, size_t param_size)
-+{
-+ int r, argc;
-+ char **argv;
-+ struct mapped_device *md;
-+ struct dm_table *table;
-+ struct dm_target *ti;
-+ struct dm_target_msg *tmsg = (void *) param + param->data_start;
-+
-+ md = find_device(param);
-+ if (!md)
-+ return -ENXIO;
-+
-+ r = __dev_status(md, param);
-+ if (r)
-+ goto out;
-+
-+ if (tmsg < (struct dm_target_msg *) (param + 1) ||
-+ invalid_str(tmsg->message, (void *) param + param_size)) {
-+ DMWARN("Invalid target message parameters.");
-+ r = -EINVAL;
-+ goto out;
-+ }
-+
-+ r = dm_split_args(&argc, &argv, tmsg->message);
-+ if (r) {
-+ DMWARN("Failed to split target message parameters");
-+ goto out;
-+ }
-+
-+ table = dm_get_table(md);
-+ if (!table)
-+ goto out_argv;
-+
-+ if (tmsg->sector >= dm_table_get_size(table)) {
-+ DMWARN("Target message sector outside device.");
-+ r = -EINVAL;
-+ goto out_table;
-+ }
-+
-+ ti = dm_table_find_target(table, tmsg->sector);
-+ if (ti->type->message)
-+ r = ti->type->message(ti, argc, argv);
-+ else {
-+ DMWARN("Target type does not support messages");
-+ r = -EINVAL;
-+ }
-+
-+ out_table:
-+ dm_table_put(table);
-+ out_argv:
-+ kfree(argv);
-+ out:
-+ param->data_size = 0;
-+ dm_put(md);
-+ return r;
-+}
-+
- /*-----------------------------------------------------------------
- * Implementation of open/close/ioctl on the special char
- * device.
-@@ -1123,7 +1212,9 @@ static ioctl_fn lookup_ioctl(unsigned in
- {DM_TABLE_DEPS_CMD, table_deps},
- {DM_TABLE_STATUS_CMD, table_status},
-
-- {DM_LIST_VERSIONS_CMD, list_versions}
-+ {DM_LIST_VERSIONS_CMD, list_versions},
-+
-+ {DM_TARGET_MSG_CMD, target_message}
- };
-
- return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
-@@ -1202,14 +1293,14 @@ static int validate_params(uint cmd, str
- cmd == DM_LIST_VERSIONS_CMD)
- return 0;
-
-- /* Unless creating, either name or uuid but not both */
-- if (cmd != DM_DEV_CREATE_CMD) {
-- if ((!*param->uuid && !*param->name) ||
-- (*param->uuid && *param->name)) {
-- DMWARN("one of name or uuid must be supplied, cmd(%u)",
-- cmd);
-+ if ((cmd == DM_DEV_CREATE_CMD)) {
-+ if (!*param->name) {
-+ DMWARN("name not supplied when creating device");
- return -EINVAL;
- }
-+ } else if ((*param->uuid && *param->name)) {
-+ DMWARN("only supply one of name or uuid, cmd(%u)", cmd);
-+ return -EINVAL;
- }
-
- /* Ensure strings are terminated */
-@@ -1268,16 +1359,11 @@ static int ctl_ioctl(struct inode *inode
- * Copy the parameters into kernel space.
- */
- r = copy_params(user, &param);
-- if (r) {
-- current->flags &= ~PF_MEMALLOC;
-- return r;
-- }
-
-- /*
-- * FIXME: eventually we will remove the PF_MEMALLOC flag
-- * here. However the tools still do nasty things like
-- * 'load' while a device is suspended.
-- */
-+ current->flags &= ~PF_MEMALLOC;
-+
-+ if (r)
-+ return r;
-
- r = validate_params(cmd, param);
- if (r)
-@@ -1295,7 +1381,6 @@ static int ctl_ioctl(struct inode *inode
-
- out:
- free_params(param);
-- current->flags &= ~PF_MEMALLOC;
- return r;
- }
-
-diff -pruN ./drivers/md.dm/dm-linear.c ./drivers/md/dm-linear.c
---- ./drivers/md.dm/dm-linear.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-linear.c 2006-03-17 13:16:38.000000000 +0300
-@@ -80,7 +80,6 @@ static int linear_status(struct dm_targe
- char *result, unsigned int maxlen)
- {
- struct linear_c *lc = (struct linear_c *) ti->private;
-- char buffer[32];
-
- switch (type) {
- case STATUSTYPE_INFO:
-@@ -88,8 +87,8 @@ static int linear_status(struct dm_targe
- break;
-
- case STATUSTYPE_TABLE:
-- format_dev_t(buffer, lc->dev->bdev->bd_dev);
-- snprintf(result, maxlen, "%s " SECTOR_FORMAT, buffer, lc->start);
-+ snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name,
-+ lc->start);
- break;
- }
- return 0;
-diff -pruN ./drivers/md.dm/dm-log.c ./drivers/md/dm-log.c
---- ./drivers/md.dm/dm-log.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-log.c 2006-03-17 13:16:38.000000000 +0300
-@@ -17,9 +17,6 @@ static spinlock_t _lock = SPIN_LOCK_UNLO
-
- int dm_register_dirty_log_type(struct dirty_log_type *type)
- {
-- if (!try_module_get(type->module))
-- return -EINVAL;
--
- spin_lock(&_lock);
- type->use_count = 0;
- list_add(&type->list, &_log_types);
-@@ -33,11 +30,10 @@ int dm_unregister_dirty_log_type(struct
- spin_lock(&_lock);
-
- if (type->use_count)
-- DMWARN("Attempt to unregister a log type that is still in use");
-- else {
-+ DMWARN("Unregister failed: log type '%s' still in use",
-+ type->name);
-+ else
- list_del(&type->list);
-- module_put(type->module);
-- }
-
- spin_unlock(&_lock);
-
-@@ -51,6 +47,10 @@ static struct dirty_log_type *get_type(c
- spin_lock(&_lock);
- list_for_each_entry (type, &_log_types, list)
- if (!strcmp(type_name, type->name)) {
-+ if (!type->use_count && !try_module_get(type->module)){
-+ spin_unlock(&_lock);
-+ return NULL;
-+ }
- type->use_count++;
- spin_unlock(&_lock);
- return type;
-@@ -63,7 +63,8 @@ static struct dirty_log_type *get_type(c
- static void put_type(struct dirty_log_type *type)
- {
- spin_lock(&_lock);
-- type->use_count--;
-+ if (!--type->use_count)
-+ module_put(type->module);
- spin_unlock(&_lock);
- }
-
-@@ -112,7 +113,7 @@ void dm_destroy_dirty_log(struct dirty_l
- /*
- * The on-disk version of the metadata.
- */
--#define MIRROR_DISK_VERSION 1
-+#define MIRROR_DISK_VERSION 2
- #define LOG_OFFSET 2
-
- struct log_header {
-@@ -129,20 +130,32 @@ struct log_header {
- struct log_c {
- struct dm_target *ti;
- int touched;
-- sector_t region_size;
-+ uint32_t region_size;
- unsigned int region_count;
- region_t sync_count;
-
- unsigned bitset_uint32_count;
- uint32_t *clean_bits;
- uint32_t *sync_bits;
-- uint32_t *recovering_bits; /* FIXME: this seems excessive */
-+ uint32_t *recovering_bits;
-
- int sync_search;
-
-+ /* Resync flag */
-+ enum sync {
-+ DEFAULTSYNC, /* Synchronize if necessary */
-+ NOSYNC, /* Devices known to be already in sync */
-+ FORCESYNC, /* Force a sync to happen */
-+ } sync;
-+
-+ int failure_response;
-+
- /*
- * Disk log fields
- */
-+ int log_dev_failed;
-+ atomic_t suspended;
-+ struct completion failure_completion;
- struct dm_dev *log_dev;
- struct log_header header;
-
-@@ -150,7 +163,6 @@ struct log_c {
- struct log_header *disk_header;
-
- struct io_region bits_location;
-- uint32_t *disk_bits;
- };
-
- /*
-@@ -159,20 +171,20 @@ struct log_c {
- */
- static inline int log_test_bit(uint32_t *bs, unsigned bit)
- {
-- return test_bit(bit, (unsigned long *) bs) ? 1 : 0;
-+ return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0;
- }
-
- static inline void log_set_bit(struct log_c *l,
- uint32_t *bs, unsigned bit)
- {
-- set_bit(bit, (unsigned long *) bs);
-+ ext2_set_bit(bit, (unsigned long *) bs);
- l->touched = 1;
- }
-
- static inline void log_clear_bit(struct log_c *l,
- uint32_t *bs, unsigned bit)
- {
-- clear_bit(bit, (unsigned long *) bs);
-+ ext2_clear_bit(bit, (unsigned long *) bs);
- l->touched = 1;
- }
-
-@@ -205,12 +217,19 @@ static int read_header(struct log_c *log
-
- header_from_disk(&log->header, log->disk_header);
-
-- if (log->header.magic != MIRROR_MAGIC) {
-+ /* New log required? */
-+ if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) {
- log->header.magic = MIRROR_MAGIC;
- log->header.version = MIRROR_DISK_VERSION;
- log->header.nr_regions = 0;
- }
-
-+ /* Version 2 is like version 1 but always little endian on disk. */
-+#ifdef __LITTLE_ENDIAN
-+ if (log->header.version == 1)
-+ log->header.version = 2;
-+#endif
-+
- if (log->header.version != MIRROR_DISK_VERSION) {
- DMWARN("incompatible disk log version");
- return -EINVAL;
-@@ -231,70 +250,69 @@ static inline int write_header(struct lo
- /*----------------------------------------------------------------
- * Bits IO
- *--------------------------------------------------------------*/
--static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count)
--{
-- unsigned i;
--
-- for (i = 0; i < count; i++)
-- core[i] = le32_to_cpu(disk[i]);
--}
--
--static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count)
--{
-- unsigned i;
--
-- /* copy across the clean/dirty bitset */
-- for (i = 0; i < count; i++)
-- disk[i] = cpu_to_le32(core[i]);
--}
--
- static int read_bits(struct log_c *log)
- {
- int r;
- unsigned long ebits;
-
- r = dm_io_sync_vm(1, &log->bits_location, READ,
-- log->disk_bits, &ebits);
-+ log->clean_bits, &ebits);
- if (r)
- return r;
-
-- bits_to_core(log->clean_bits, log->disk_bits,
-- log->bitset_uint32_count);
- return 0;
- }
-
- static int write_bits(struct log_c *log)
- {
- unsigned long ebits;
-- bits_to_disk(log->clean_bits, log->disk_bits,
-- log->bitset_uint32_count);
- return dm_io_sync_vm(1, &log->bits_location, WRITE,
-- log->disk_bits, &ebits);
-+ log->clean_bits, &ebits);
- }
-
- /*----------------------------------------------------------------
-- * constructor/destructor
-+ * core log constructor/destructor
-+ *
-+ * argv contains: <region_size> [[no]sync] [block_on_error]
- *--------------------------------------------------------------*/
- #define BYTE_SHIFT 3
- static int core_ctr(struct dirty_log *log, struct dm_target *ti,
- unsigned int argc, char **argv)
- {
-+ enum sync sync = DEFAULTSYNC;
-+ int failure_response = DMLOG_IOERR_IGNORE;
-+
- struct log_c *lc;
-- sector_t region_size;
-+ uint32_t region_size;
- unsigned int region_count;
- size_t bitset_size;
-+ unsigned i;
-
-- if (argc != 1) {
-- DMWARN("wrong number of arguments to log_c");
-+ if (argc < 1 || argc > 3) {
-+ DMWARN("wrong number of arguments to mirror log");
- return -EINVAL;
- }
-
-- if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
-+ for (i = 1; i < argc; i++) {
-+ if (!strcmp(argv[i], "sync"))
-+ sync = FORCESYNC;
-+ else if (!strcmp(argv[i], "nosync"))
-+ sync = NOSYNC;
-+ else if (!strcmp(argv[i], "block_on_error"))
-+ failure_response = DMLOG_IOERR_BLOCK;
-+ else {
-+ DMWARN("unrecognised sync argument to mirror log: %s",
-+ argv[i]);
-+ return -EINVAL;
-+ }
-+ }
-+
-+ if (sscanf(argv[0], "%u", &region_size) != 1) {
- DMWARN("invalid region size string");
- return -EINVAL;
- }
-
-- region_count = dm_div_up(ti->len, region_size);
-+ region_count = dm_sector_div_up(ti->len, region_size);
-
- lc = kmalloc(sizeof(*lc), GFP_KERNEL);
- if (!lc) {
-@@ -306,12 +324,14 @@ static int core_ctr(struct dirty_log *lo
- lc->touched = 0;
- lc->region_size = region_size;
- lc->region_count = region_count;
-+ lc->sync = sync;
-+ lc->failure_response = failure_response;
-
- /*
-- * Work out how many words we need to hold the bitset.
-+ * Work out how many "unsigned long"s we need to hold the bitset.
- */
- bitset_size = dm_round_up(region_count,
-- sizeof(*lc->clean_bits) << BYTE_SHIFT);
-+ sizeof(unsigned long) << BYTE_SHIFT);
- bitset_size >>= BYTE_SHIFT;
-
- lc->bitset_uint32_count = bitset_size / 4;
-@@ -330,12 +350,12 @@ static int core_ctr(struct dirty_log *lo
- kfree(lc);
- return -ENOMEM;
- }
-- memset(lc->sync_bits, 0, bitset_size);
-- lc->sync_count = 0;
-+ memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
-+ lc->sync_count = (sync == NOSYNC) ? region_count : 0;
-
- lc->recovering_bits = vmalloc(bitset_size);
- if (!lc->recovering_bits) {
-- DMWARN("couldn't allocate sync bitset");
-+ DMWARN("couldn't allocate recovering bitset");
- vfree(lc->sync_bits);
- vfree(lc->clean_bits);
- kfree(lc);
-@@ -356,6 +376,11 @@ static void core_dtr(struct dirty_log *l
- kfree(lc);
- }
-
-+/*----------------------------------------------------------------
-+ * disk log constructor/destructor
-+ *
-+ * argv contains log_device region_size followed optionally by [no]sync
-+ *--------------------------------------------------------------*/
- static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
- unsigned int argc, char **argv)
- {
-@@ -364,8 +389,8 @@ static int disk_ctr(struct dirty_log *lo
- struct log_c *lc;
- struct dm_dev *dev;
-
-- if (argc != 2) {
-- DMWARN("wrong number of arguments to log_d");
-+ if (argc < 2 || argc > 3) {
-+ DMWARN("wrong number of arguments to disk mirror log");
- return -EINVAL;
- }
-
-@@ -382,6 +407,8 @@ static int disk_ctr(struct dirty_log *lo
-
- lc = (struct log_c *) log->context;
- lc->log_dev = dev;
-+ lc->log_dev_failed = 0;
-+ init_completion(&lc->failure_completion);
-
- /* setup the disk header fields */
- lc->header_location.bdev = lc->log_dev->bdev;
-@@ -403,11 +430,6 @@ static int disk_ctr(struct dirty_log *lo
- size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t),
- 1 << SECTOR_SHIFT);
- lc->bits_location.count = size >> SECTOR_SHIFT;
-- lc->disk_bits = vmalloc(size);
-- if (!lc->disk_bits) {
-- vfree(lc->disk_header);
-- goto bad;
-- }
- return 0;
-
- bad:
-@@ -421,7 +443,6 @@ static void disk_dtr(struct dirty_log *l
- struct log_c *lc = (struct log_c *) log->context;
- dm_put_device(lc->ti, lc->log_dev);
- vfree(lc->disk_header);
-- vfree(lc->disk_bits);
- core_dtr(log);
- }
-
-@@ -435,42 +456,65 @@ static int count_bits32(uint32_t *addr,
- return count;
- }
-
-+static void fail_log_device(struct log_c *lc)
-+{
-+ lc->log_dev_failed = 1;
-+ if (lc->failure_response == DMLOG_IOERR_BLOCK)
-+ dm_table_event(lc->ti->table);
-+}
-+
-+static void restore_log_device(struct log_c *lc)
-+{
-+ lc->log_dev_failed = 0;
-+}
-+
- static int disk_resume(struct dirty_log *log)
- {
-- int r;
-+ int r = 0;
- unsigned i;
- struct log_c *lc = (struct log_c *) log->context;
- size_t size = lc->bitset_uint32_count * sizeof(uint32_t);
-
-- /* read the disk header */
-- r = read_header(lc);
-- if (r)
-- return r;
--
-- /* read the bits */
-- r = read_bits(lc);
-- if (r)
-- return r;
--
-- /* zero any new bits if the mirror has grown */
-- for (i = lc->header.nr_regions; i < lc->region_count; i++)
-- /* FIXME: amazingly inefficient */
-- log_clear_bit(lc, lc->clean_bits, i);
-+ /*
-+ * Read the disk header, but only if we know it is good.
-+ * Assume the worst in the event of failure.
-+ */
-+ if (!lc->log_dev_failed &&
-+ ((r = read_header(lc)) || read_bits(lc))) {
-+ DMWARN("Read %s failed on mirror log device, %s.",
-+ r ? "header" : "bits", lc->log_dev->name);
-+ fail_log_device(lc);
-+ lc->header.nr_regions = 0;
-+ }
-+
-+ /* set or clear any new bits */
-+ if (lc->sync == NOSYNC)
-+ for (i = lc->header.nr_regions; i < lc->region_count; i++)
-+ /* FIXME: amazingly inefficient */
-+ log_set_bit(lc, lc->clean_bits, i);
-+ else
-+ for (i = lc->header.nr_regions; i < lc->region_count; i++)
-+ /* FIXME: amazingly inefficient */
-+ log_clear_bit(lc, lc->clean_bits, i);
-
- /* copy clean across to sync */
- memcpy(lc->sync_bits, lc->clean_bits, size);
- lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
-
-- /* write the bits */
-- r = write_bits(lc);
-- if (r)
-- return r;
--
- /* set the correct number of regions in the header */
- lc->header.nr_regions = lc->region_count;
-
-- /* write the new header */
-- return write_header(lc);
-+ /* write out the log. 'i' tells us which has failed if any */
-+ i = 1;
-+ if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) {
-+ DMWARN("Write %s failed on mirror log device, %s.",
-+ i ? "bits" : "header", lc->log_dev->name);
-+ fail_log_device(lc);
-+ } else
-+ restore_log_device(lc);
-+
-+ atomic_set(&lc->suspended, 0);
-+ return r;
- }
-
- static sector_t core_get_region_size(struct dirty_log *log)
-@@ -497,6 +541,17 @@ static int core_flush(struct dirty_log *
- return 0;
- }
-
-+static int disk_presuspend(struct dirty_log *log)
-+{
-+ struct log_c *lc = (struct log_c *) log->context;
-+
-+ atomic_set(&lc->suspended, 1);
-+ if (lc->log_dev_failed && (lc->failure_response == DMLOG_IOERR_BLOCK))
-+ complete(&lc->failure_completion);
-+
-+ return 0;
-+}
-+
- static int disk_flush(struct dirty_log *log)
- {
- int r;
-@@ -506,9 +561,24 @@ static int disk_flush(struct dirty_log *
- if (!lc->touched)
- return 0;
-
-+ /*
-+ * If a failure occurs, we must wait for a suspension.
-+ * We must not proceed in the event of a failure,
-+ * because if the machine reboots with the log
-+ * incorrect, recovery could be compromised
-+ */
- r = write_bits(lc);
-- if (!r)
-+ if (!r) {
- lc->touched = 0;
-+ restore_log_device(lc);
-+ } else {
-+ DMERR("Write failure on mirror log device, %s.",
-+ lc->log_dev->name);
-+ fail_log_device(lc);
-+ if (!atomic_read(&lc->suspended) &&
-+ (lc->failure_response == DMLOG_IOERR_BLOCK))
-+ wait_for_completion(&lc->failure_completion);
-+ }
-
- return r;
- }
-@@ -538,7 +608,7 @@ static int core_get_resync_work(struct d
- lc->sync_search);
- lc->sync_search = *region + 1;
-
-- if (*region == lc->region_count)
-+ if (*region >= lc->region_count)
- return 0;
-
- } while (log_test_bit(lc->recovering_bits, *region));
-@@ -566,6 +636,60 @@ static region_t core_get_sync_count(stru
- return lc->sync_count;
- }
-
-+#define DMEMIT_SYNC \
-+ if (lc->sync != DEFAULTSYNC) \
-+ DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "")
-+
-+static int core_status(struct dirty_log *log, status_type_t status,
-+ char *result, unsigned int maxlen)
-+{
-+ int sz = 0;
-+ struct log_c *lc = log->context;
-+
-+ switch(status) {
-+ case STATUSTYPE_INFO:
-+ DMEMIT("1 core");
-+ break;
-+
-+ case STATUSTYPE_TABLE:
-+ DMEMIT("%s %u %u ", log->type->name,
-+ lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size);
-+ DMEMIT_SYNC;
-+ }
-+
-+ return sz;
-+}
-+
-+static int disk_status(struct dirty_log *log, status_type_t status,
-+ char *result, unsigned int maxlen)
-+{
-+ int sz = 0;
-+ struct log_c *lc = log->context;
-+
-+ switch(status) {
-+ case STATUSTYPE_INFO:
-+ DMEMIT("3 disk %s %c", lc->log_dev->name,
-+ lc->log_dev_failed ? 'D' : 'A');
-+ break;
-+
-+ case STATUSTYPE_TABLE:
-+ DMEMIT("%s %u %s %u ", log->type->name,
-+ lc->sync == DEFAULTSYNC ? 2 : 3,
-+ lc->log_dev->name,
-+ lc->region_size);
-+ DMEMIT_SYNC;
-+ }
-+
-+ return sz;
-+}
-+
-+static int core_get_failure_response(struct dirty_log *log)
-+{
-+ struct log_c *lc = log->context;
-+
-+ return lc->failure_response;
-+}
-+
- static struct dirty_log_type _core_type = {
- .name = "core",
- .module = THIS_MODULE,
-@@ -579,7 +703,9 @@ static struct dirty_log_type _core_type
- .clear_region = core_clear_region,
- .get_resync_work = core_get_resync_work,
- .complete_resync_work = core_complete_resync_work,
-- .get_sync_count = core_get_sync_count
-+ .get_sync_count = core_get_sync_count,
-+ .status = core_status,
-+ .get_failure_response = core_get_failure_response,
- };
-
- static struct dirty_log_type _disk_type = {
-@@ -587,7 +713,8 @@ static struct dirty_log_type _disk_type
- .module = THIS_MODULE,
- .ctr = disk_ctr,
- .dtr = disk_dtr,
-- .suspend = disk_flush,
-+ .presuspend = disk_presuspend,
-+ .postsuspend = disk_flush,
- .resume = disk_resume,
- .get_region_size = core_get_region_size,
- .is_clean = core_is_clean,
-@@ -597,7 +724,9 @@ static struct dirty_log_type _disk_type
- .clear_region = core_clear_region,
- .get_resync_work = core_get_resync_work,
- .complete_resync_work = core_complete_resync_work,
-- .get_sync_count = core_get_sync_count
-+ .get_sync_count = core_get_sync_count,
-+ .status = disk_status,
-+ .get_failure_response = core_get_failure_response,
- };
-
- int __init dm_dirty_log_init(void)
-diff -pruN ./drivers/md.dm/dm-log.h ./drivers/md/dm-log.h
---- ./drivers/md.dm/dm-log.h 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-log.h 2006-03-17 13:16:38.000000000 +0300
-@@ -9,6 +9,15 @@
-
- #include "dm.h"
-
-+/*
-+ * Values returned by get_failure_response()
-+ * DMLOG_IOERR_IGNORE: ignore device failures
-+ * DMLOG_IOERR_BLOCK: issue dm event, and do not complete
-+ * I/O until presuspend is received.
-+ */
-+#define DMLOG_IOERR_IGNORE 0
-+#define DMLOG_IOERR_BLOCK 1
-+
- typedef sector_t region_t;
-
- struct dirty_log_type;
-@@ -32,7 +41,8 @@ struct dirty_log_type {
- * There are times when we don't want the log to touch
- * the disk.
- */
-- int (*suspend)(struct dirty_log *log);
-+ int (*presuspend)(struct dirty_log *log);
-+ int (*postsuspend)(struct dirty_log *log);
- int (*resume)(struct dirty_log *log);
-
- /*
-@@ -48,6 +58,16 @@ struct dirty_log_type {
- int (*is_clean)(struct dirty_log *log, region_t region);
-
- /*
-+ * Returns: 0, 1
-+ *
-+ * This is necessary for cluster mirroring. It provides
-+ * a way to detect recovery on another node, so we
-+ * aren't writing concurrently. This function is likely
-+ * to block (when a cluster log is used).
-+ */
-+ int (*is_remote_recovering)(struct dirty_log *log, region_t region);
-+
-+ /*
- * Returns: 0, 1, -EWOULDBLOCK, < 0
- *
- * A predicate function to check the area given by
-@@ -101,6 +121,18 @@ struct dirty_log_type {
- * Returns the number of regions that are in sync.
- */
- region_t (*get_sync_count)(struct dirty_log *log);
-+
-+ /*
-+ * Support function for mirror status requests.
-+ */
-+ int (*status)(struct dirty_log *log, status_type_t status_type,
-+ char *result, unsigned int maxlen);
-+
-+ /*
-+ * Return the code describing what to do in the event
-+ * of a device failure.
-+ */
-+ int (*get_failure_response)(struct dirty_log *log);
- };
-
- int dm_register_dirty_log_type(struct dirty_log_type *type);
-diff -pruN ./drivers/md.dm/dm-mpath.c ./drivers/md/dm-mpath.c
---- ./drivers/md.dm/dm-mpath.c 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-mpath.c 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,1342 @@
-+/*
-+ * Copyright (C) 2003 Sistina Software Limited.
-+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
-+ *
-+ * This file is released under the GPL.
-+ */
-+
-+#include "dm.h"
-+#include "dm-path-selector.h"
-+#include "dm-hw-handler.h"
-+#include "dm-bio-list.h"
-+#include "dm-bio-record.h"
-+
-+#include <linux/ctype.h>
-+#include <linux/init.h>
-+#include <linux/mempool.h>
-+#include <linux/module.h>
-+#include <linux/pagemap.h>
-+#include <linux/slab.h>
-+#include <linux/time.h>
-+#include <linux/workqueue.h>
-+#include <asm/atomic.h>
-+
-+#define MESG_STR(x) x, sizeof(x)
-+
-+/* Path properties */
-+struct pgpath {
-+ struct list_head list;
-+
-+ struct priority_group *pg; /* Owning PG */
-+ unsigned fail_count; /* Cumulative failure count */
-+
-+ struct path path;
-+};
-+
-+#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
-+
-+/*
-+ * Paths are grouped into Priority Groups and numbered from 1 upwards.
-+ * Each has a path selector which controls which path gets used.
-+ */
-+struct priority_group {
-+ struct list_head list;
-+
-+ struct multipath *m; /* Owning multipath instance */
-+ struct path_selector ps;
-+
-+ unsigned pg_num; /* Reference number */
-+ unsigned bypassed; /* Temporarily bypass this PG? */
-+
-+ unsigned nr_pgpaths; /* Number of paths in PG */
-+ struct list_head pgpaths;
-+};
-+
-+/* Multipath context */
-+struct multipath {
-+ struct list_head list;
-+ struct dm_target *ti;
-+
-+ spinlock_t lock;
-+
-+ struct hw_handler hw_handler;
-+ unsigned nr_priority_groups;
-+ struct list_head priority_groups;
-+ unsigned pg_init_required; /* pg_init needs calling? */
-+ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
-+
-+ unsigned nr_valid_paths; /* Total number of usable paths */
-+ struct pgpath *current_pgpath;
-+ struct priority_group *current_pg;
-+ struct priority_group *next_pg; /* Switch to this PG if set */
-+ unsigned repeat_count; /* I/Os left before calling PS again */
-+
-+ unsigned queue_io; /* Must we queue all I/O? */
-+ unsigned queue_if_no_path; /* Queue I/O if last path fails? */
-+ unsigned saved_queue_if_no_path;/* Saved state during suspension */
-+
-+ struct work_struct process_queued_ios;
-+ struct bio_list queued_ios;
-+ unsigned queue_size;
-+
-+ struct work_struct trigger_event;
-+
-+ /*
-+ * We must use a mempool of mpath_io structs so that we
-+ * can resubmit bios on error.
-+ */
-+ mempool_t *mpio_pool;
-+};
-+
-+/*
-+ * Context information attached to each bio we process.
-+ */
-+struct mpath_io {
-+ struct pgpath *pgpath;
-+ struct dm_bio_details details;
-+};
-+
-+typedef int (*action_fn) (struct pgpath *pgpath);
-+
-+#define MIN_IOS 256 /* Mempool size */
-+
-+static kmem_cache_t *_mpio_cache;
-+
-+struct workqueue_struct *kmultipathd;
-+static void process_queued_ios(void *data);
-+static void trigger_event(void *data);
-+
-+
-+/*-----------------------------------------------
-+ * Allocation routines
-+ *-----------------------------------------------*/
-+
-+static struct pgpath *alloc_pgpath(void)
-+{
-+ struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL);
-+
-+ if (pgpath) {
-+ memset(pgpath, 0, sizeof(*pgpath));
-+ pgpath->path.is_active = 1;
-+ }
-+
-+ return pgpath;
-+}
-+
-+static inline void free_pgpath(struct pgpath *pgpath)
-+{
-+ kfree(pgpath);
-+}
-+
-+static struct priority_group *alloc_priority_group(void)
-+{
-+ struct priority_group *pg;
-+
-+ pg = kmalloc(sizeof(*pg), GFP_KERNEL);
-+ if (!pg)
-+ return NULL;
-+
-+ memset(pg, 0, sizeof(*pg));
-+ INIT_LIST_HEAD(&pg->pgpaths);
-+
-+ return pg;
-+}
-+
-+static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
-+{
-+ struct pgpath *pgpath, *tmp;
-+
-+ list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
-+ list_del(&pgpath->list);
-+ dm_put_device(ti, pgpath->path.dev);
-+ free_pgpath(pgpath);
-+ }
-+}
-+
-+static void free_priority_group(struct priority_group *pg,
-+ struct dm_target *ti)
-+{
-+ struct path_selector *ps = &pg->ps;
-+
-+ if (ps->type) {
-+ ps->type->destroy(ps);
-+ dm_put_path_selector(ps->type);
-+ }
-+
-+ free_pgpaths(&pg->pgpaths, ti);
-+ kfree(pg);
-+}
-+
-+static struct multipath *alloc_multipath(void)
-+{
-+ struct multipath *m;
-+
-+ m = kmalloc(sizeof(*m), GFP_KERNEL);
-+ if (m) {
-+ memset(m, 0, sizeof(*m));
-+ INIT_LIST_HEAD(&m->priority_groups);
-+ spin_lock_init(&m->lock);
-+ m->queue_io = 1;
-+ INIT_WORK(&m->process_queued_ios, process_queued_ios, m);
-+ INIT_WORK(&m->trigger_event, trigger_event, m);
-+ m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
-+ mempool_free_slab, _mpio_cache);
-+ if (!m->mpio_pool) {
-+ kfree(m);
-+ return NULL;
-+ }
-+ }
-+
-+ return m;
-+}
-+
-+static void free_multipath(struct multipath *m)
-+{
-+ struct priority_group *pg, *tmp;
-+ struct hw_handler *hwh = &m->hw_handler;
-+
-+ list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
-+ list_del(&pg->list);
-+ free_priority_group(pg, m->ti);
-+ }
-+
-+ if (hwh->type) {
-+ hwh->type->destroy(hwh);
-+ dm_put_hw_handler(hwh->type);
-+ }
-+
-+ mempool_destroy(m->mpio_pool);
-+ kfree(m);
-+}
-+
-+
-+/*-----------------------------------------------
-+ * Path selection
-+ *-----------------------------------------------*/
-+
-+static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
-+{
-+ struct hw_handler *hwh = &m->hw_handler;
-+
-+ m->current_pg = pgpath->pg;
-+
-+ /* Must we initialise the PG first, and queue I/O till it's ready? */
-+ if (hwh->type && hwh->type->pg_init) {
-+ m->pg_init_required = 1;
-+ m->queue_io = 1;
-+ } else {
-+ m->pg_init_required = 0;
-+ m->queue_io = 0;
-+ }
-+}
-+
-+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
-+{
-+ struct path *path;
-+
-+ path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
-+ if (!path)
-+ return -ENXIO;
-+
-+ m->current_pgpath = path_to_pgpath(path);
-+
-+ if (m->current_pg != pg)
-+ __switch_pg(m, m->current_pgpath);
-+
-+ return 0;
-+}
-+
-+static void __choose_pgpath(struct multipath *m)
-+{
-+ struct priority_group *pg;
-+ unsigned bypassed = 1;
-+
-+ if (!m->nr_valid_paths)
-+ goto failed;
-+
-+ /* Were we instructed to switch PG? */
-+ if (m->next_pg) {
-+ pg = m->next_pg;
-+ m->next_pg = NULL;
-+ if (!__choose_path_in_pg(m, pg))
-+ return;
-+ }
-+
-+ /* Don't change PG until it has no remaining paths */
-+ if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
-+ return;
-+
-+ /*
-+ * Loop through priority groups until we find a valid path.
-+ * First time we skip PGs marked 'bypassed'.
-+ * Second time we only try the ones we skipped.
-+ */
-+ do {
-+ list_for_each_entry(pg, &m->priority_groups, list) {
-+ if (pg->bypassed == bypassed)
-+ continue;
-+ if (!__choose_path_in_pg(m, pg))
-+ return;
-+ }
-+ } while (bypassed--);
-+
-+failed:
-+ m->current_pgpath = NULL;
-+ m->current_pg = NULL;
-+}
-+
-+static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio,
-+ unsigned was_queued)
-+{
-+ int r = 1;
-+ unsigned long flags;
-+ struct pgpath *pgpath;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+
-+ /* Do we need to select a new pgpath? */
-+ if (!m->current_pgpath ||
-+ (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
-+ __choose_pgpath(m);
-+
-+ pgpath = m->current_pgpath;
-+
-+ if (was_queued)
-+ m->queue_size--;
-+
-+ if ((pgpath && m->queue_io) ||
-+ (!pgpath && m->queue_if_no_path)) {
-+ /* Queue for the daemon to resubmit */
-+ bio_list_add(&m->queued_ios, bio);
-+ m->queue_size++;
-+ if ((m->pg_init_required && !m->pg_init_in_progress) ||
-+ !m->queue_io)
-+ queue_work(kmultipathd, &m->process_queued_ios);
-+ pgpath = NULL;
-+ r = 0;
-+ } else if (!pgpath)
-+ r = -EIO; /* Failed */
-+ else
-+ bio->bi_bdev = pgpath->path.dev->bdev;
-+
-+ mpio->pgpath = pgpath;
-+
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ return r;
-+}
-+
-+/*
-+ * If we run out of usable paths, should we queue I/O or error it?
-+ */
-+static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
-+ unsigned save_old_value)
-+{
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+
-+ if (save_old_value)
-+ m->saved_queue_if_no_path = m->queue_if_no_path;
-+ else
-+ m->saved_queue_if_no_path = queue_if_no_path;
-+ m->queue_if_no_path = queue_if_no_path;
-+ if (!m->queue_if_no_path && m->queue_size)
-+ queue_work(kmultipathd, &m->process_queued_ios);
-+
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ return 0;
-+}
-+
-+/*-----------------------------------------------------------------
-+ * The multipath daemon is responsible for resubmitting queued ios.
-+ *---------------------------------------------------------------*/
-+
-+static void dispatch_queued_ios(struct multipath *m)
-+{
-+ int r;
-+ unsigned long flags;
-+ struct bio *bio = NULL, *next;
-+ struct mpath_io *mpio;
-+ union map_info *info;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+ bio = bio_list_get(&m->queued_ios);
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ while (bio) {
-+ next = bio->bi_next;
-+ bio->bi_next = NULL;
-+
-+ info = dm_get_mapinfo(bio);
-+ mpio = info->ptr;
-+
-+ r = map_io(m, bio, mpio, 1);
-+ if (r < 0)
-+ bio_endio(bio, bio->bi_size, r);
-+ else if (r == 1)
-+ generic_make_request(bio);
-+
-+ bio = next;
-+ }
-+}
-+
-+static void process_queued_ios(void *data)
-+{
-+ struct multipath *m = (struct multipath *) data;
-+ struct hw_handler *hwh = &m->hw_handler;
-+ struct pgpath *pgpath = NULL;
-+ unsigned init_required = 0, must_queue = 1;
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+
-+ if (!m->queue_size)
-+ goto out;
-+
-+ if (!m->current_pgpath)
-+ __choose_pgpath(m);
-+
-+ pgpath = m->current_pgpath;
-+
-+ if ((pgpath && !m->queue_io) ||
-+ (!pgpath && !m->queue_if_no_path))
-+ must_queue = 0;
-+
-+ if (m->pg_init_required && !m->pg_init_in_progress) {
-+ m->pg_init_required = 0;
-+ m->pg_init_in_progress = 1;
-+ init_required = 1;
-+ }
-+
-+out:
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ if (init_required)
-+ hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path);
-+
-+ if (!must_queue)
-+ dispatch_queued_ios(m);
-+}
-+
-+/*
-+ * An event is triggered whenever a path is taken out of use.
-+ * Includes path failure and PG bypass.
-+ */
-+static void trigger_event(void *data)
-+{
-+ struct multipath *m = (struct multipath *) data;
-+
-+ dm_table_event(m->ti->table);
-+}
-+
-+/*-----------------------------------------------------------------
-+ * Constructor/argument parsing:
-+ * <#multipath feature args> [<arg>]*
-+ * <#hw_handler args> [hw_handler [<arg>]*]
-+ * <#priority groups>
-+ * <initial priority group>
-+ * [<selector> <#selector args> [<arg>]*
-+ * <#paths> <#per-path selector args>
-+ * [<path> [<arg>]* ]+ ]+
-+ *---------------------------------------------------------------*/
-+struct param {
-+ unsigned min;
-+ unsigned max;
-+ char *error;
-+};
-+
-+#define ESTR(s) ("dm-multipath: " s)
-+
-+static int read_param(struct param *param, char *str, unsigned *v, char **error)
-+{
-+ if (!str ||
-+ (sscanf(str, "%u", v) != 1) ||
-+ (*v < param->min) ||
-+ (*v > param->max)) {
-+ *error = param->error;
-+ return -EINVAL;
-+ }
-+
-+ return 0;
-+}
-+
-+struct arg_set {
-+ unsigned argc;
-+ char **argv;
-+};
-+
-+static char *shift(struct arg_set *as)
-+{
-+ char *r;
-+
-+ if (as->argc) {
-+ as->argc--;
-+ r = *as->argv;
-+ as->argv++;
-+ return r;
-+ }
-+
-+ return NULL;
-+}
-+
-+static void consume(struct arg_set *as, unsigned n)
-+{
-+ BUG_ON (as->argc < n);
-+ as->argc -= n;
-+ as->argv += n;
-+}
-+
-+static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
-+ struct dm_target *ti)
-+{
-+ int r;
-+ struct path_selector_type *pst;
-+ unsigned ps_argc;
-+
-+ static struct param _params[] = {
-+ {0, 1024, ESTR("invalid number of path selector args")},
-+ };
-+
-+ pst = dm_get_path_selector(shift(as));
-+ if (!pst) {
-+ ti->error = ESTR("unknown path selector type");
-+ return -EINVAL;
-+ }
-+
-+ r = read_param(_params, shift(as), &ps_argc, &ti->error);
-+ if (r)
-+ return -EINVAL;
-+
-+ r = pst->create(&pg->ps, ps_argc, as->argv);
-+ if (r) {
-+ dm_put_path_selector(pst);
-+ ti->error = ESTR("path selector constructor failed");
-+ return r;
-+ }
-+
-+ pg->ps.type = pst;
-+ consume(as, ps_argc);
-+
-+ return 0;
-+}
-+
-+static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
-+ struct dm_target *ti)
-+{
-+ int r;
-+ struct pgpath *p;
-+
-+ /* we need at least a path arg */
-+ if (as->argc < 1) {
-+ ti->error = ESTR("no device given");
-+ return NULL;
-+ }
-+
-+ p = alloc_pgpath();
-+ if (!p)
-+ return NULL;
-+
-+ r = dm_get_device(ti, shift(as), ti->begin, ti->len,
-+ dm_table_get_mode(ti->table), &p->path.dev);
-+ if (r) {
-+ ti->error = ESTR("error getting device");
-+ goto bad;
-+ }
-+
-+ r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
-+ if (r) {
-+ dm_put_device(ti, p->path.dev);
-+ goto bad;
-+ }
-+
-+ return p;
-+
-+ bad:
-+ free_pgpath(p);
-+ return NULL;
-+}
-+
-+static struct priority_group *parse_priority_group(struct arg_set *as,
-+ struct multipath *m,
-+ struct dm_target *ti)
-+{
-+ static struct param _params[] = {
-+ {1, 1024, ESTR("invalid number of paths")},
-+ {0, 1024, ESTR("invalid number of selector args")}
-+ };
-+
-+ int r;
-+ unsigned i, nr_selector_args, nr_params;
-+ struct priority_group *pg;
-+
-+ if (as->argc < 2) {
-+ as->argc = 0;
-+ ti->error = ESTR("not enough priority group aruments");
-+ return NULL;
-+ }
-+
-+ pg = alloc_priority_group();
-+ if (!pg) {
-+ ti->error = ESTR("couldn't allocate priority group");
-+ return NULL;
-+ }
-+ pg->m = m;
-+
-+ r = parse_path_selector(as, pg, ti);
-+ if (r)
-+ goto bad;
-+
-+ /*
-+ * read the paths
-+ */
-+ r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
-+ if (r)
-+ goto bad;
-+
-+ r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
-+ if (r)
-+ goto bad;
-+
-+ nr_params = 1 + nr_selector_args;
-+ for (i = 0; i < pg->nr_pgpaths; i++) {
-+ struct pgpath *pgpath;
-+ struct arg_set path_args;
-+
-+ if (as->argc < nr_params)
-+ goto bad;
-+
-+ path_args.argc = nr_params;
-+ path_args.argv = as->argv;
-+
-+ pgpath = parse_path(&path_args, &pg->ps, ti);
-+ if (!pgpath)
-+ goto bad;
-+
-+ pgpath->pg = pg;
-+ list_add_tail(&pgpath->list, &pg->pgpaths);
-+ consume(as, nr_params);
-+ }
-+
-+ return pg;
-+
-+ bad:
-+ free_priority_group(pg, ti);
-+ return NULL;
-+}
-+
-+static int parse_hw_handler(struct arg_set *as, struct multipath *m,
-+ struct dm_target *ti)
-+{
-+ int r;
-+ struct hw_handler_type *hwht;
-+ unsigned hw_argc;
-+
-+ static struct param _params[] = {
-+ {0, 1024, ESTR("invalid number of hardware handler args")},
-+ };
-+
-+ r = read_param(_params, shift(as), &hw_argc, &ti->error);
-+ if (r)
-+ return -EINVAL;
-+
-+ if (!hw_argc)
-+ return 0;
-+
-+ hwht = dm_get_hw_handler(shift(as));
-+ if (!hwht) {
-+ ti->error = ESTR("unknown hardware handler type");
-+ return -EINVAL;
-+ }
-+
-+ r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
-+ if (r) {
-+ dm_put_hw_handler(hwht);
-+ ti->error = ESTR("hardware handler constructor failed");
-+ return r;
-+ }
-+
-+ m->hw_handler.type = hwht;
-+ consume(as, hw_argc - 1);
-+
-+ return 0;
-+}
-+
-+static int parse_features(struct arg_set *as, struct multipath *m,
-+ struct dm_target *ti)
-+{
-+ int r;
-+ unsigned argc;
-+
-+ static struct param _params[] = {
-+ {0, 1, ESTR("invalid number of feature args")},
-+ };
-+
-+ r = read_param(_params, shift(as), &argc, &ti->error);
-+ if (r)
-+ return -EINVAL;
-+
-+ if (!argc)
-+ return 0;
-+
-+ if (!strnicmp(shift(as), MESG_STR("queue_if_no_path")))
-+ return queue_if_no_path(m, 1, 0);
-+ else {
-+ ti->error = "Unrecognised multipath feature request";
-+ return -EINVAL;
-+ }
-+}
-+
-+static int multipath_ctr(struct dm_target *ti, unsigned int argc,
-+ char **argv)
-+{
-+ /* target parameters */
-+ static struct param _params[] = {
-+ {1, 1024, ESTR("invalid number of priority groups")},
-+ {1, 1024, ESTR("invalid initial priority group number")},
-+ };
-+
-+ int r;
-+ struct multipath *m;
-+ struct arg_set as;
-+ unsigned pg_count = 0;
-+ unsigned next_pg_num;
-+
-+ as.argc = argc;
-+ as.argv = argv;
-+
-+ m = alloc_multipath();
-+ if (!m) {
-+ ti->error = ESTR("can't allocate multipath");
-+ return -EINVAL;
-+ }
-+
-+ r = parse_features(&as, m, ti);
-+ if (r)
-+ goto bad;
-+
-+ r = parse_hw_handler(&as, m, ti);
-+ if (r)
-+ goto bad;
-+
-+ r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
-+ if (r)
-+ goto bad;
-+
-+ r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
-+ if (r)
-+ goto bad;
-+
-+ /* parse the priority groups */
-+ while (as.argc) {
-+ struct priority_group *pg;
-+
-+ pg = parse_priority_group(&as, m, ti);
-+ if (!pg) {
-+ r = -EINVAL;
-+ goto bad;
-+ }
-+
-+ m->nr_valid_paths += pg->nr_pgpaths;
-+ list_add_tail(&pg->list, &m->priority_groups);
-+ pg_count++;
-+ pg->pg_num = pg_count;
-+ if (!--next_pg_num)
-+ m->next_pg = pg;
-+ }
-+
-+ if (pg_count != m->nr_priority_groups) {
-+ ti->error = ESTR("priority group count mismatch");
-+ r = -EINVAL;
-+ goto bad;
-+ }
-+
-+ ti->private = m;
-+ m->ti = ti;
-+
-+ return 0;
-+
-+ bad:
-+ free_multipath(m);
-+ return r;
-+}
-+
-+static void multipath_dtr(struct dm_target *ti)
-+{
-+ struct multipath *m = (struct multipath *) ti->private;
-+
-+ flush_workqueue(kmultipathd);
-+ free_multipath(m);
-+}
-+
-+/*
-+ * Map bios, recording original fields for later in case we have to resubmit
-+ */
-+static int multipath_map(struct dm_target *ti, struct bio *bio,
-+ union map_info *map_context)
-+{
-+ int r;
-+ struct mpath_io *mpio;
-+ struct multipath *m = (struct multipath *) ti->private;
-+
-+ if (bio_barrier(bio))
-+ return -EOPNOTSUPP;
-+
-+ mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
-+ dm_bio_record(&mpio->details, bio);
-+
-+ map_context->ptr = mpio;
-+ bio->bi_rw |= (1 << BIO_RW_FAILFAST);
-+ r = map_io(m, bio, mpio, 0);
-+ if (r < 0)
-+ mempool_free(mpio, m->mpio_pool);
-+
-+ return r;
-+}
-+
-+/*
-+ * Take a path out of use.
-+ */
-+static int fail_path(struct pgpath *pgpath)
-+{
-+ unsigned long flags;
-+ struct multipath *m = pgpath->pg->m;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+
-+ if (!pgpath->path.is_active)
-+ goto out;
-+
-+ DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name);
-+
-+ pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
-+ pgpath->path.is_active = 0;
-+ pgpath->fail_count++;
-+
-+ m->nr_valid_paths--;
-+
-+ if (pgpath == m->current_pgpath)
-+ m->current_pgpath = NULL;
-+
-+ queue_work(kmultipathd, &m->trigger_event);
-+
-+out:
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Reinstate a previously-failed path
-+ */
-+static int reinstate_path(struct pgpath *pgpath)
-+{
-+ int r = 0;
-+ unsigned long flags;
-+ struct multipath *m = pgpath->pg->m;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+
-+ if (pgpath->path.is_active)
-+ goto out;
-+
-+ if (!pgpath->pg->ps.type->reinstate_path) {
-+ DMWARN("Reinstate path not supported by path selector %s",
-+ pgpath->pg->ps.type->name);
-+ r = -EINVAL;
-+ goto out;
-+ }
-+
-+ r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
-+ if (r)
-+ goto out;
-+
-+ pgpath->path.is_active = 1;
-+
-+ m->current_pgpath = NULL;
-+ if (!m->nr_valid_paths++ && m->queue_size)
-+ queue_work(kmultipathd, &m->process_queued_ios);
-+
-+ queue_work(kmultipathd, &m->trigger_event);
-+
-+out:
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ return r;
-+}
-+
-+/*
-+ * Fail or reinstate all paths that match the provided struct dm_dev.
-+ */
-+static int action_dev(struct multipath *m, struct dm_dev *dev,
-+ action_fn action)
-+{
-+ int r = 0;
-+ struct pgpath *pgpath;
-+ struct priority_group *pg;
-+
-+ list_for_each_entry(pg, &m->priority_groups, list) {
-+ list_for_each_entry(pgpath, &pg->pgpaths, list) {
-+ if (pgpath->path.dev == dev)
-+ r = action(pgpath);
-+ }
-+ }
-+
-+ return r;
-+}
-+
-+/*
-+ * Temporarily try to avoid having to use the specified PG
-+ */
-+static void bypass_pg(struct multipath *m, struct priority_group *pg,
-+ int bypassed)
-+{
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+
-+ pg->bypassed = bypassed;
-+ m->current_pgpath = NULL;
-+ m->current_pg = NULL;
-+
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ queue_work(kmultipathd, &m->trigger_event);
-+}
-+
-+/*
-+ * Switch to using the specified PG from the next I/O that gets mapped
-+ */
-+static int switch_pg_num(struct multipath *m, const char *pgstr)
-+{
-+ struct priority_group *pg;
-+ unsigned pgnum;
-+ unsigned long flags;
-+
-+ if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
-+ (pgnum > m->nr_priority_groups)) {
-+ DMWARN("invalid PG number supplied to switch_pg_num");
-+ return -EINVAL;
-+ }
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+ list_for_each_entry(pg, &m->priority_groups, list) {
-+ pg->bypassed = 0;
-+ if (--pgnum)
-+ continue;
-+
-+ m->current_pgpath = NULL;
-+ m->current_pg = NULL;
-+ m->next_pg = pg;
-+ }
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ queue_work(kmultipathd, &m->trigger_event);
-+ return 0;
-+}
-+
-+/*
-+ * Set/clear bypassed status of a PG.
-+ * PGs are numbered upwards from 1 in the order they were declared.
-+ */
-+static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
-+{
-+ struct priority_group *pg;
-+ unsigned pgnum;
-+
-+ if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
-+ (pgnum > m->nr_priority_groups)) {
-+ DMWARN("invalid PG number supplied to bypass_pg");
-+ return -EINVAL;
-+ }
-+
-+ list_for_each_entry(pg, &m->priority_groups, list) {
-+ if (!--pgnum)
-+ break;
-+ }
-+
-+ bypass_pg(m, pg, bypassed);
-+ return 0;
-+}
-+
-+/*
-+ * pg_init must call this when it has completed its initialisation
-+ */
-+void dm_pg_init_complete(struct path *path, unsigned err_flags)
-+{
-+ struct pgpath *pgpath = path_to_pgpath(path);
-+ struct priority_group *pg = pgpath->pg;
-+ struct multipath *m = pg->m;
-+ unsigned long flags;
-+
-+ /* We insist on failing the path if the PG is already bypassed. */
-+ if (err_flags && pg->bypassed)
-+ err_flags |= MP_FAIL_PATH;
-+
-+ if (err_flags & MP_FAIL_PATH)
-+ fail_path(pgpath);
-+
-+ if (err_flags & MP_BYPASS_PG)
-+ bypass_pg(m, pg, 1);
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+ if (err_flags) {
-+ m->current_pgpath = NULL;
-+ m->current_pg = NULL;
-+ } else if (!m->pg_init_required)
-+ m->queue_io = 0;
-+
-+ m->pg_init_in_progress = 0;
-+ queue_work(kmultipathd, &m->process_queued_ios);
-+ spin_unlock_irqrestore(&m->lock, flags);
-+}
-+
-+/*
-+ * end_io handling
-+ */
-+static int do_end_io(struct multipath *m, struct bio *bio,
-+ int error, struct mpath_io *mpio)
-+{
-+ struct hw_handler *hwh = &m->hw_handler;
-+ unsigned err_flags = MP_FAIL_PATH; /* Default behavior */
-+ unsigned long flags;
-+
-+ if (!error)
-+ return 0; /* I/O complete */
-+
-+ if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-+ return error;
-+
-+ if (error == -EOPNOTSUPP)
-+ return error;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+ if (!m->nr_valid_paths) {
-+ if (!m->queue_if_no_path) {
-+ spin_unlock_irqrestore(&m->lock, flags);
-+ return -EIO;
-+ } else {
-+ spin_unlock_irqrestore(&m->lock, flags);
-+ goto requeue;
-+ }
-+ }
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ if (hwh->type && hwh->type->error)
-+ err_flags = hwh->type->error(hwh, bio);
-+ else
-+ err_flags = dm_scsi_err_handler(hwh, bio);
-+
-+ if (mpio->pgpath) {
-+ if (err_flags & MP_FAIL_PATH)
-+ fail_path(mpio->pgpath);
-+
-+ if (err_flags & MP_BYPASS_PG)
-+ bypass_pg(m, mpio->pgpath->pg, 1);
-+ }
-+
-+ if (err_flags & MP_ERROR_IO)
-+ return -EIO;
-+
-+ requeue:
-+ dm_bio_restore(&mpio->details, bio);
-+
-+ /* queue for the daemon to resubmit or fail */
-+ spin_lock_irqsave(&m->lock, flags);
-+ bio_list_add(&m->queued_ios, bio);
-+ m->queue_size++;
-+ if (!m->queue_io)
-+ queue_work(kmultipathd, &m->process_queued_ios);
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ return 1; /* io not complete */
-+}
-+
-+static int multipath_end_io(struct dm_target *ti, struct bio *bio,
-+ int error, union map_info *map_context)
-+{
-+ struct multipath *m = (struct multipath *) ti->private;
-+ struct mpath_io *mpio = (struct mpath_io *) map_context->ptr;
-+ struct pgpath *pgpath = mpio->pgpath;
-+ struct path_selector *ps;
-+ int r;
-+
-+ r = do_end_io(m, bio, error, mpio);
-+ if (pgpath) {
-+ ps = &pgpath->pg->ps;
-+ if (ps->type->end_io)
-+ ps->type->end_io(ps, &pgpath->path);
-+ }
-+ if (r <= 0)
-+ mempool_free(mpio, m->mpio_pool);
-+
-+ return r;
-+}
-+
-+/*
-+ * Suspend can't complete until all the I/O is processed so if
-+ * the last path fails we must error any remaining I/O.
-+ * Note that if freeze_bdev fails while suspending, the
-+ * queue_if_no_path state is lost - userspace should reset it.
-+ */
-+static void multipath_presuspend(struct dm_target *ti)
-+{
-+ struct multipath *m = (struct multipath *) ti->private;
-+
-+ queue_if_no_path(m, 0, 1);
-+}
-+
-+/*
-+ * Restore the queue_if_no_path setting.
-+ */
-+static void multipath_resume(struct dm_target *ti)
-+{
-+ struct multipath *m = (struct multipath *) ti->private;
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+ m->queue_if_no_path = m->saved_queue_if_no_path;
-+ spin_unlock_irqrestore(&m->lock, flags);
-+}
-+
-+/*
-+ * Info output has the following format:
-+ * num_multipath_feature_args [multipath_feature_args]*
-+ * num_handler_status_args [handler_status_args]*
-+ * num_groups init_group_number
-+ * [A|D|E num_ps_status_args [ps_status_args]*
-+ * num_paths num_selector_args
-+ * [path_dev A|F fail_count [selector_args]* ]+ ]+
-+ *
-+ * Table output has the following format (identical to the constructor string):
-+ * num_feature_args [features_args]*
-+ * num_handler_args hw_handler [hw_handler_args]*
-+ * num_groups init_group_number
-+ * [priority selector-name num_ps_args [ps_args]*
-+ * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
-+ */
-+static int multipath_status(struct dm_target *ti, status_type_t type,
-+ char *result, unsigned int maxlen)
-+{
-+ int sz = 0;
-+ unsigned long flags;
-+ struct multipath *m = (struct multipath *) ti->private;
-+ struct hw_handler *hwh = &m->hw_handler;
-+ struct priority_group *pg;
-+ struct pgpath *p;
-+ unsigned pg_num;
-+ char state;
-+
-+ spin_lock_irqsave(&m->lock, flags);
-+
-+ /* Features */
-+ if (type == STATUSTYPE_INFO)
-+ DMEMIT("1 %u ", m->queue_size);
-+ else if (m->queue_if_no_path)
-+ DMEMIT("1 queue_if_no_path ");
-+ else
-+ DMEMIT("0 ");
-+
-+ if (hwh->type && hwh->type->status)
-+ sz += hwh->type->status(hwh, type, result + sz, maxlen - sz);
-+ else if (!hwh->type || type == STATUSTYPE_INFO)
-+ DMEMIT("0 ");
-+ else
-+ DMEMIT("1 %s ", hwh->type->name);
-+
-+ DMEMIT("%u ", m->nr_priority_groups);
-+
-+ if (m->next_pg)
-+ pg_num = m->next_pg->pg_num;
-+ else if (m->current_pg)
-+ pg_num = m->current_pg->pg_num;
-+ else
-+ pg_num = 1;
-+
-+ DMEMIT("%u ", pg_num);
-+
-+ switch (type) {
-+ case STATUSTYPE_INFO:
-+ list_for_each_entry(pg, &m->priority_groups, list) {
-+ if (pg->bypassed)
-+ state = 'D'; /* Disabled */
-+ else if (pg == m->current_pg)
-+ state = 'A'; /* Currently Active */
-+ else
-+ state = 'E'; /* Enabled */
-+
-+ DMEMIT("%c ", state);
-+
-+ if (pg->ps.type->status)
-+ sz += pg->ps.type->status(&pg->ps, NULL, type,
-+ result + sz,
-+ maxlen - sz);
-+ else
-+ DMEMIT("0 ");
-+
-+ DMEMIT("%u %u ", pg->nr_pgpaths,
-+ pg->ps.type->info_args);
-+
-+ list_for_each_entry(p, &pg->pgpaths, list) {
-+ DMEMIT("%s %s %u ", p->path.dev->name,
-+ p->path.is_active ? "A" : "F",
-+ p->fail_count);
-+ if (pg->ps.type->status)
-+ sz += pg->ps.type->status(&pg->ps,
-+ &p->path, type, result + sz,
-+ maxlen - sz);
-+ }
-+ }
-+ break;
-+
-+ case STATUSTYPE_TABLE:
-+ list_for_each_entry(pg, &m->priority_groups, list) {
-+ DMEMIT("%s ", pg->ps.type->name);
-+
-+ if (pg->ps.type->status)
-+ sz += pg->ps.type->status(&pg->ps, NULL, type,
-+ result + sz,
-+ maxlen - sz);
-+ else
-+ DMEMIT("0 ");
-+
-+ DMEMIT("%u %u ", pg->nr_pgpaths,
-+ pg->ps.type->table_args);
-+
-+ list_for_each_entry(p, &pg->pgpaths, list) {
-+ DMEMIT("%s ", p->path.dev->name);
-+ if (pg->ps.type->status)
-+ sz += pg->ps.type->status(&pg->ps,
-+ &p->path, type, result + sz,
-+ maxlen - sz);
-+ }
-+ }
-+ break;
-+ }
-+
-+ spin_unlock_irqrestore(&m->lock, flags);
-+
-+ return 0;
-+}
-+
-+static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
-+{
-+ int r;
-+ struct dm_dev *dev;
-+ struct multipath *m = (struct multipath *) ti->private;
-+ action_fn action;
-+
-+ if (argc == 1) {
-+ if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
-+ return queue_if_no_path(m, 1, 0);
-+ else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
-+ return queue_if_no_path(m, 0, 0);
-+ }
-+
-+ if (argc != 2)
-+ goto error;
-+
-+ if (!strnicmp(argv[0], MESG_STR("disable_group")))
-+ return bypass_pg_num(m, argv[1], 1);
-+ else if (!strnicmp(argv[0], MESG_STR("enable_group")))
-+ return bypass_pg_num(m, argv[1], 0);
-+ else if (!strnicmp(argv[0], MESG_STR("switch_group")))
-+ return switch_pg_num(m, argv[1]);
-+ else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
-+ action = reinstate_path;
-+ else if (!strnicmp(argv[0], MESG_STR("fail_path")))
-+ action = fail_path;
-+ else
-+ goto error;
-+
-+ r = dm_get_device(ti, argv[1], ti->begin, ti->len,
-+ dm_table_get_mode(ti->table), &dev);
-+ if (r) {
-+ DMWARN("dm-multipath message: error getting device %s",
-+ argv[1]);
-+ return -EINVAL;
-+ }
-+
-+ r = action_dev(m, dev, action);
-+
-+ dm_put_device(ti, dev);
-+
-+ return r;
-+
-+error:
-+ DMWARN("Unrecognised multipath message received.");
-+ return -EINVAL;
-+}
-+
-+/*-----------------------------------------------------------------
-+ * Module setup
-+ *---------------------------------------------------------------*/
-+static struct target_type multipath_target = {
-+ .name = "multipath",
-+ .version = {1, 0, 4},
-+ .module = THIS_MODULE,
-+ .ctr = multipath_ctr,
-+ .dtr = multipath_dtr,
-+ .map = multipath_map,
-+ .end_io = multipath_end_io,
-+ .presuspend = multipath_presuspend,
-+ .resume = multipath_resume,
-+ .status = multipath_status,
-+ .message = multipath_message,
-+};
-+
-+static int __init dm_multipath_init(void)
-+{
-+ int r;
-+
-+ /* allocate a slab for the dm_ios */
-+ _mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io),
-+ 0, 0, NULL, NULL);
-+ if (!_mpio_cache)
-+ return -ENOMEM;
-+
-+ r = dm_register_target(&multipath_target);
-+ if (r < 0) {
-+ DMERR("%s: register failed %d", multipath_target.name, r);
-+ kmem_cache_destroy(_mpio_cache);
-+ return -EINVAL;
-+ }
-+
-+ kmultipathd = create_workqueue("kmpathd");
-+ if (!kmultipathd) {
-+ DMERR("%s: failed to create workqueue kmpathd",
-+ multipath_target.name);
-+ dm_unregister_target(&multipath_target);
-+ kmem_cache_destroy(_mpio_cache);
-+ return -ENOMEM;
-+ }
-+
-+ DMINFO("dm-multipath version %u.%u.%u loaded",
-+ multipath_target.version[0], multipath_target.version[1],
-+ multipath_target.version[2]);
-+
-+ return r;
-+}
-+
-+static void __exit dm_multipath_exit(void)
-+{
-+ int r;
-+
-+ destroy_workqueue(kmultipathd);
-+
-+ r = dm_unregister_target(&multipath_target);
-+ if (r < 0)
-+ DMERR("%s: target unregister failed %d",
-+ multipath_target.name, r);
-+ kmem_cache_destroy(_mpio_cache);
-+}
-+
-+EXPORT_SYMBOL_GPL(dm_pg_init_complete);
-+
-+module_init(dm_multipath_init);
-+module_exit(dm_multipath_exit);
-+
-+MODULE_DESCRIPTION(DM_NAME " multipath target");
-+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
-+MODULE_LICENSE("GPL");
-diff -pruN ./drivers/md.dm/dm-mpath.h ./drivers/md/dm-mpath.h
---- ./drivers/md.dm/dm-mpath.h 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-mpath.h 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,25 @@
-+/*
-+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
-+ *
-+ * This file is released under the GPL.
-+ *
-+ * Multipath.
-+ */
-+
-+#ifndef DM_MPATH_H
-+#define DM_MPATH_H
-+
-+struct dm_dev;
-+
-+struct path {
-+ struct dm_dev *dev; /* Read-only */
-+ unsigned is_active; /* Read-only */
-+
-+ void *pscontext; /* For path-selector use */
-+ void *hwhcontext; /* For hw-handler use */
-+};
-+
-+/* Callback for hwh_pg_init_fn to use when complete */
-+void dm_pg_init_complete(struct path *path, unsigned err_flags);
-+
-+#endif
-diff -pruN ./drivers/md.dm/dm-path-selector.c ./drivers/md/dm-path-selector.c
---- ./drivers/md.dm/dm-path-selector.c 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-path-selector.c 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,156 @@
-+/*
-+ * Copyright (C) 2003 Sistina Software.
-+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
-+ *
-+ * Module Author: Heinz Mauelshagen
-+ *
-+ * This file is released under the GPL.
-+ *
-+ * Path selector registration.
-+ */
-+
-+#include "dm.h"
-+#include "dm-path-selector.h"
-+
-+#include <linux/slab.h>
-+
-+struct ps_internal {
-+ struct path_selector_type pst;
-+
-+ struct list_head list;
-+ long use;
-+};
-+
-+#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
-+
-+static LIST_HEAD(_path_selectors);
-+static DECLARE_RWSEM(_ps_lock);
-+
-+struct ps_internal *__find_path_selector_type(const char *name)
-+{
-+ struct ps_internal *psi;
-+
-+ list_for_each_entry(psi, &_path_selectors, list) {
-+ if (!strcmp(name, psi->pst.name))
-+ return psi;
-+ }
-+
-+ return NULL;
-+}
-+
-+static struct ps_internal *get_path_selector(const char *name)
-+{
-+ struct ps_internal *psi;
-+
-+ down_read(&_ps_lock);
-+ psi = __find_path_selector_type(name);
-+ if (psi) {
-+ if ((psi->use == 0) && !try_module_get(psi->pst.module))
-+ psi = NULL;
-+ else
-+ psi->use++;
-+ }
-+ up_read(&_ps_lock);
-+
-+ return psi;
-+}
-+
-+struct path_selector_type *dm_get_path_selector(const char *name)
-+{
-+ struct ps_internal *psi;
-+
-+ if (!name)
-+ return NULL;
-+
-+ psi = get_path_selector(name);
-+ if (!psi) {
-+ request_module("dm-%s", name);
-+ psi = get_path_selector(name);
-+ }
-+
-+ return psi ? &psi->pst : NULL;
-+}
-+
-+void dm_put_path_selector(struct path_selector_type *pst)
-+{
-+ struct ps_internal *psi;
-+
-+ if (!pst)
-+ return;
-+
-+ down_read(&_ps_lock);
-+ psi = __find_path_selector_type(pst->name);
-+ if (!psi)
-+ goto out;
-+
-+ if (--psi->use == 0)
-+ module_put(psi->pst.module);
-+
-+ if (psi->use < 0)
-+ BUG();
-+
-+out:
-+ up_read(&_ps_lock);
-+}
-+
-+static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst)
-+{
-+ struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL);
-+
-+ if (psi) {
-+ memset(psi, 0, sizeof(*psi));
-+ psi->pst = *pst;
-+ }
-+
-+ return psi;
-+}
-+
-+int dm_register_path_selector(struct path_selector_type *pst)
-+{
-+ int r = 0;
-+ struct ps_internal *psi = _alloc_path_selector(pst);
-+
-+ if (!psi)
-+ return -ENOMEM;
-+
-+ down_write(&_ps_lock);
-+
-+ if (__find_path_selector_type(pst->name)) {
-+ kfree(psi);
-+ r = -EEXIST;
-+ } else
-+ list_add(&psi->list, &_path_selectors);
-+
-+ up_write(&_ps_lock);
-+
-+ return r;
-+}
-+
-+int dm_unregister_path_selector(struct path_selector_type *pst)
-+{
-+ struct ps_internal *psi;
-+
-+ down_write(&_ps_lock);
-+
-+ psi = __find_path_selector_type(pst->name);
-+ if (!psi) {
-+ up_write(&_ps_lock);
-+ return -EINVAL;
-+ }
-+
-+ if (psi->use) {
-+ up_write(&_ps_lock);
-+ return -ETXTBSY;
-+ }
-+
-+ list_del(&psi->list);
-+
-+ up_write(&_ps_lock);
-+
-+ kfree(psi);
-+
-+ return 0;
-+}
-+
-+EXPORT_SYMBOL_GPL(dm_register_path_selector);
-+EXPORT_SYMBOL_GPL(dm_unregister_path_selector);
-diff -pruN ./drivers/md.dm/dm-path-selector.h ./drivers/md/dm-path-selector.h
---- ./drivers/md.dm/dm-path-selector.h 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-path-selector.h 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,93 @@
-+/*
-+ * Copyright (C) 2003 Sistina Software.
-+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
-+ *
-+ * Module Author: Heinz Mauelshagen
-+ *
-+ * This file is released under the GPL.
-+ *
-+ * Path-Selector registration.
-+ */
-+
-+#ifndef DM_PATH_SELECTOR_H
-+#define DM_PATH_SELECTOR_H
-+
-+#include <linux/device-mapper.h>
-+
-+#include "dm-mpath.h"
-+
-+/*
-+ * We provide an abstraction for the code that chooses which path
-+ * to send some io down.
-+ */
-+struct path_selector_type;
-+struct path_selector {
-+ struct path_selector_type *type;
-+ void *context;
-+};
-+
-+/* Information about a path selector type */
-+struct path_selector_type {
-+ char *name;
-+ struct module *module;
-+
-+ unsigned int table_args;
-+ unsigned int info_args;
-+
-+ /*
-+ * Constructs a path selector object, takes custom arguments
-+ */
-+ int (*create) (struct path_selector *ps, unsigned argc, char **argv);
-+ void (*destroy) (struct path_selector *ps);
-+
-+ /*
-+ * Add an opaque path object, along with some selector specific
-+ * path args (eg, path priority).
-+ */
-+ int (*add_path) (struct path_selector *ps, struct path *path,
-+ int argc, char **argv, char **error);
-+
-+ /*
-+ * Chooses a path for this io; if no paths are available then
-+ * NULL will be returned.
-+ *
-+ * repeat_count is the number of times to use the path before
-+ * calling the function again. 0 means don't call it again unless
-+ * the path fails.
-+ */
-+ struct path *(*select_path) (struct path_selector *ps,
-+ unsigned *repeat_count);
-+
-+ /*
-+ * Notify the selector that a path has failed.
-+ */
-+ void (*fail_path) (struct path_selector *ps, struct path *p);
-+
-+ /*
-+ * Ask selector to reinstate a path.
-+ */
-+ int (*reinstate_path) (struct path_selector *ps, struct path *p);
-+
-+ /*
-+ * Table content based on parameters added in ps_add_path_fn
-+ * or path selector status
-+ */
-+ int (*status) (struct path_selector *ps, struct path *path,
-+ status_type_t type, char *result, unsigned int maxlen);
-+
-+ int (*end_io) (struct path_selector *ps, struct path *path);
-+};
-+
-+/* Register a path selector */
-+int dm_register_path_selector(struct path_selector_type *type);
-+
-+/* Unregister a path selector */
-+int dm_unregister_path_selector(struct path_selector_type *type);
-+
-+/* Returns a registered path selector type */
-+struct path_selector_type *dm_get_path_selector(const char *name);
-+
-+/* Releases a path selector */
-+void dm_put_path_selector(struct path_selector_type *pst);
-+
-+#endif
-diff -pruN ./drivers/md.dm/dm-raid1.c ./drivers/md/dm-raid1.c
---- ./drivers/md.dm/dm-raid1.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-raid1.c 2006-03-17 13:16:38.000000000 +0300
-@@ -6,6 +6,7 @@
-
- #include "dm.h"
- #include "dm-bio-list.h"
-+#include "dm-bio-record.h"
- #include "dm-io.h"
- #include "dm-log.h"
- #include "kcopyd.h"
-@@ -28,6 +29,8 @@ static inline void wake(void)
- queue_work(_kmirrord_wq, &_kmirrord_work);
- }
-
-+static struct workqueue_struct *_kmir_mon_wq;
-+
- /*-----------------------------------------------------------------
- * Region hash
- *
-@@ -67,7 +70,7 @@ static inline void wake(void)
- struct mirror_set;
- struct region_hash {
- struct mirror_set *ms;
-- sector_t region_size;
-+ uint32_t region_size;
- unsigned region_shift;
-
- /* holds persistent region state */
-@@ -135,7 +138,7 @@ static void region_free(void *element, v
- #define MIN_REGIONS 64
- #define MAX_RECOVERY 1
- static int rh_init(struct region_hash *rh, struct mirror_set *ms,
-- struct dirty_log *log, sector_t region_size,
-+ struct dirty_log *log, uint32_t region_size,
- region_t nr_regions)
- {
- unsigned int nr_buckets, max_buckets;
-@@ -253,9 +256,9 @@ static struct region *__rh_alloc(struct
- else {
- __rh_insert(rh, nreg);
- if (nreg->state == RH_CLEAN) {
-- spin_lock_irq(&rh->region_lock);
-+ spin_lock(&rh->region_lock);
- list_add(&nreg->list, &rh->clean_regions);
-- spin_unlock_irq(&rh->region_lock);
-+ spin_unlock(&rh->region_lock);
- }
- reg = nreg;
- }
-@@ -375,16 +378,19 @@ static void rh_inc(struct region_hash *r
-
- read_lock(&rh->hash_lock);
- reg = __rh_find(rh, region);
-- if (reg->state == RH_CLEAN) {
-- rh->log->type->mark_region(rh->log, reg->key);
-
-- spin_lock_irq(&rh->region_lock);
-+ spin_lock_irq(&rh->region_lock);
-+ atomic_inc(&reg->pending);
-+
-+ if (reg->state == RH_CLEAN) {
- reg->state = RH_DIRTY;
- list_del_init(&reg->list); /* take off the clean list */
- spin_unlock_irq(&rh->region_lock);
-- }
-
-- atomic_inc(&reg->pending);
-+ rh->log->type->mark_region(rh->log, reg->key);
-+ } else
-+ spin_unlock_irq(&rh->region_lock);
-+
- read_unlock(&rh->hash_lock);
- }
-
-@@ -406,17 +412,17 @@ static void rh_dec(struct region_hash *r
- reg = __rh_lookup(rh, region);
- read_unlock(&rh->hash_lock);
-
-+ spin_lock_irqsave(&rh->region_lock, flags);
- if (atomic_dec_and_test(&reg->pending)) {
-- spin_lock_irqsave(&rh->region_lock, flags);
- if (reg->state == RH_RECOVERING) {
- list_add_tail(&reg->list, &rh->quiesced_regions);
- } else {
- reg->state = RH_CLEAN;
- list_add(&reg->list, &rh->clean_regions);
- }
-- spin_unlock_irqrestore(&rh->region_lock, flags);
- should_wake = 1;
- }
-+ spin_unlock_irqrestore(&rh->region_lock, flags);
-
- if (should_wake)
- wake();
-@@ -539,7 +545,8 @@ static void rh_start_recovery(struct reg
- * Mirror set structures.
- *---------------------------------------------------------------*/
- struct mirror {
-- atomic_t error_count;
-+ atomic_t error_count; /* Error counter to flag mirror failure */
-+ struct mirror_set *ms;
- struct dm_dev *dev;
- sector_t offset;
- };
-@@ -550,36 +557,59 @@ struct mirror_set {
- struct region_hash rh;
- struct kcopyd_client *kcopyd_client;
-
-- spinlock_t lock; /* protects the next two lists */
-+ spinlock_t lock; /* protects the lists */
- struct bio_list reads;
- struct bio_list writes;
-+ struct bio_list failures;
-+ struct work_struct failure_work;
-+ struct completion failure_completion;
-
- /* recovery */
-+ atomic_t suspended;
- region_t nr_regions;
- int in_sync;
-
- unsigned int nr_mirrors;
-- struct mirror mirror[0];
-+ spinlock_t choose_lock; /* protects select in choose_mirror(). */
-+ atomic_t read_count; /* Read counter for read balancing. */
-+ unsigned int read_mirror; /* Last mirror read. */
-+ struct mirror *default_mirror; /* Default mirror. */
-+ struct mirror mirror[0];
- };
-
-+struct bio_map_info {
-+ struct mirror *bmi_m;
-+ struct dm_bio_details bmi_bd;
-+};
-+
-+static mempool_t *bio_map_info_pool = NULL;
-+
-+static void *bio_map_info_alloc(int gfp_mask, void *pool_data){
-+ return kmalloc(sizeof(struct bio_map_info), gfp_mask);
-+}
-+
-+static void bio_map_info_free(void *element, void *pool_data){
-+ kfree(element);
-+}
-+
- /*
- * Every mirror should look like this one.
- */
- #define DEFAULT_MIRROR 0
-
- /*
-- * This is yucky. We squirrel the mirror_set struct away inside
-- * bi_next for write buffers. This is safe since the bh
-+ * This is yucky. We squirrel the mirror struct away inside
-+ * bi_next for read/write buffers. This is safe since the bh
- * doesn't get submitted to the lower levels of block layer.
- */
--static struct mirror_set *bio_get_ms(struct bio *bio)
-+static struct mirror *bio_get_m(struct bio *bio)
- {
-- return (struct mirror_set *) bio->bi_next;
-+ return (struct mirror *) bio->bi_next;
- }
-
--static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
-+static void bio_set_m(struct bio *bio, struct mirror *m)
- {
-- bio->bi_next = (struct bio *) ms;
-+ bio->bi_next = (struct bio *) m;
- }
-
- /*-----------------------------------------------------------------
-@@ -607,7 +637,7 @@ static int recover(struct mirror_set *ms
- unsigned long flags = 0;
-
- /* fill in the source */
-- m = ms->mirror + DEFAULT_MIRROR;
-+ m = ms->default_mirror;
- from.bdev = m->dev->bdev;
- from.sector = m->offset + region_to_sector(reg->rh, reg->key);
- if (reg->key == (ms->nr_regions - 1)) {
-@@ -623,7 +653,7 @@ static int recover(struct mirror_set *ms
-
- /* fill in the destinations */
- for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
-- if (i == DEFAULT_MIRROR)
-+ if (&ms->mirror[i] == ms->default_mirror)
- continue;
-
- m = ms->mirror + i;
-@@ -673,42 +703,163 @@ static void do_recovery(struct mirror_se
- }
-
- /*-----------------------------------------------------------------
-- * Reads
-+ * Misc Functions
- *---------------------------------------------------------------*/
--static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
-+#define MIN_READS 128
-+/*
-+ * choose_mirror
-+ * @ms: the mirror set
-+ * @m: mirror that has failed, or NULL if just choosing
-+ *
-+ * Returns: chosen mirror, or NULL on failure
-+ */
-+static struct mirror *choose_mirror(struct mirror_set *ms, struct mirror *m)
- {
-- /* FIXME: add read balancing */
-- return ms->mirror + DEFAULT_MIRROR;
-+ int i, retry;
-+ unsigned long flags;
-+ struct mirror *ret = NULL;
-+
-+ spin_lock_irqsave(&ms->choose_lock, flags);
-+
-+ if (unlikely(m == ms->default_mirror)) {
-+ i = DEFAULT_MIRROR;
-+ atomic_set(&ms->read_count, MIN_READS);
-+ } else
-+ i = ms->read_mirror;
-+
-+ for (retry = 0; retry < ms->nr_mirrors; ) {
-+ i %= ms->nr_mirrors;
-+ ret = ms->mirror + i;
-+
-+ if (unlikely(atomic_read(&ret->error_count))) {
-+ retry++;
-+ i++;
-+ } else {
-+ /*
-+ * Guarantee that a number of read IOs
-+ * get queued to the same mirror.
-+ */
-+ if (atomic_dec_and_test(&ms->read_count)) {
-+ atomic_set(&ms->read_count, MIN_READS);
-+ i++;
-+ }
-+
-+ ms->read_mirror = i;
-+ break;
-+ }
-+ }
-+
-+ /* Check for failure of default mirror, reset if necessary */
-+ if (unlikely(m == ms->default_mirror))
-+ ms->default_mirror = ret;
-+
-+ spin_unlock_irqrestore(&ms->choose_lock, flags);
-+
-+ if (unlikely(atomic_read(&ret->error_count))) {
-+ DMERR("All mirror devices are dead. Unable to choose mirror.");
-+ return NULL;
-+ }
-+
-+ return ret;
-+}
-+
-+static void fail_mirror(struct mirror *m)
-+{
-+ DMINFO("incrementing error_count on %s", m->dev->name);
-+ atomic_inc(&m->error_count);
-+
-+ choose_mirror(m->ms, m);
-+}
-+
-+static int default_ok(struct mirror *m)
-+{
-+ return !atomic_read(&m->ms->default_mirror->error_count);
- }
-
- /*
- * remap a buffer to a particular mirror.
- */
--static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
-+static sector_t map_sector(struct mirror *m, struct bio *bio)
-+{
-+ return m->offset + (bio->bi_sector - m->ms->ti->begin);
-+}
-+
-+static void map_bio(struct mirror *m, struct bio *bio)
- {
- bio->bi_bdev = m->dev->bdev;
-- bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
-+ bio->bi_sector = map_sector(m, bio);
-+}
-+
-+static void map_region(struct io_region *io, struct mirror *m,
-+ struct bio *bio)
-+{
-+ io->bdev = m->dev->bdev;
-+ io->sector = map_sector(m, bio);
-+ io->count = bio->bi_size >> 9;
-+}
-+
-+/*-----------------------------------------------------------------
-+ * Reads
-+ *---------------------------------------------------------------*/
-+static void read_callback(unsigned long error, void *context)
-+{
-+ struct bio *bio = (struct bio *)context;
-+ struct mirror *m;
-+
-+ m = bio_get_m(bio);
-+ bio_set_m(bio, NULL);
-+
-+ if (unlikely(error)) {
-+ DMWARN("A read failure occurred on a mirror device.");
-+ fail_mirror(m);
-+ if (likely(default_ok(m))) {
-+ DMWARN("Trying different device.");
-+ queue_bio(m->ms, bio, bio_rw(bio));
-+ } else {
-+ DMERR("No other device available, failing I/O.");
-+ bio_endio(bio, 0, -EIO);
-+ }
-+ } else
-+ bio_endio(bio, bio->bi_size, 0);
-+}
-+
-+/* Asynchronous read. */
-+static void read_async_bio(struct mirror *m, struct bio *bio)
-+{
-+ struct io_region io;
-+
-+ map_region(&io, m, bio);
-+ bio_set_m(bio, m);
-+ dm_io_async_bvec(1, &io, READ,
-+ bio->bi_io_vec + bio->bi_idx,
-+ read_callback, bio);
- }
-
- static void do_reads(struct mirror_set *ms, struct bio_list *reads)
- {
-- region_t region;
- struct bio *bio;
- struct mirror *m;
-
- while ((bio = bio_list_pop(reads))) {
-- region = bio_to_region(&ms->rh, bio);
--
- /*
- * We can only read balance if the region is in sync.
- */
-- if (rh_in_sync(&ms->rh, region, 0))
-- m = choose_mirror(ms, bio->bi_sector);
-- else
-- m = ms->mirror + DEFAULT_MIRROR;
-+ if (likely(rh_in_sync(&ms->rh,
-+ bio_to_region(&ms->rh, bio),
-+ 0)))
-+ m = choose_mirror(ms, NULL);
-+ else {
-+ m = ms->default_mirror;
-+
-+			/* If the default fails, we give up. */
-+ if (unlikely(m && atomic_read(&m->error_count)))
-+ m = NULL;
-+ }
-
-- map_bio(ms, m, bio);
-- generic_make_request(bio);
-+ if (likely(m))
-+ read_async_bio(m, bio);
-+ else
-+ bio_endio(bio, 0, -EIO);
- }
- }
-
-@@ -722,56 +873,116 @@ static void do_reads(struct mirror_set *
- * RECOVERING: delay the io until recovery completes
- * NOSYNC: increment pending, just write to the default mirror
- *---------------------------------------------------------------*/
-+static void write_failure_handler(void *data)
-+{
-+ struct bio *bio;
-+ struct bio_list failed_writes;
-+ struct mirror_set *ms = (struct mirror_set *)data;
-+ struct dirty_log *log = ms->rh.log;
-+
-+ if (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK) {
-+ dm_table_event(ms->ti->table);
-+ wait_for_completion(&ms->failure_completion);
-+ }
-+
-+ /* Take list out to handle endios. */
-+ spin_lock_irq(&ms->lock);
-+ failed_writes = ms->failures;
-+ bio_list_init(&ms->failures);
-+ spin_unlock_irq(&ms->lock);
-+
-+ while ((bio = bio_list_pop(&failed_writes)))
-+ bio_endio(bio, bio->bi_size, 0);
-+}
-+
- static void write_callback(unsigned long error, void *context)
- {
-- unsigned int i;
-- int uptodate = 1;
-+ unsigned int i, ret = 0;
- struct bio *bio = (struct bio *) context;
- struct mirror_set *ms;
--
-- ms = bio_get_ms(bio);
-- bio_set_ms(bio, NULL);
--
-+ int uptodate = 0, run;
-+
-+ ms = (bio_get_m(bio))->ms;
-+ bio_set_m(bio, NULL);
-+
- /*
- * NOTE: We don't decrement the pending count here,
- * instead it is done by the targets endio function.
- * This way we handle both writes to SYNC and NOSYNC
- * regions with the same code.
- */
-+ if (unlikely(error)) {
-+ DMERR("Error during write occurred.");
-
-- if (error) {
- /*
-- * only error the io if all mirrors failed.
-- * FIXME: bogus
-+ * Test all bits - if all failed, fail io.
-+ * Otherwise, go through hassle of failing a device...
- */
-- uptodate = 0;
-- for (i = 0; i < ms->nr_mirrors; i++)
-- if (!test_bit(i, &error)) {
-+ for (i = 0; i < ms->nr_mirrors; i++) {
-+ if (test_bit(i, &error))
-+ fail_mirror(ms->mirror + i);
-+ else
- uptodate = 1;
-- break;
-+ }
-+
-+ if (likely(uptodate)) {
-+ spin_lock(&ms->lock);
-+ if (atomic_read(&ms->suspended)) {
-+ /*
-+ * The device is suspended, it is
-+ * safe to complete I/O.
-+ */
-+ spin_unlock(&ms->lock);
-+ } else {
-+ /*
-+ * Need to raise event. Since raising
-+ * events can block, we need to do it in
-+				 * a separate thread.
-+ *
-+ * run gets set if this will be the first
-+ * bio in the list.
-+ */
-+ run = !ms->failures.head;
-+ bio_list_add(&ms->failures, bio);
-+ spin_unlock(&ms->lock);
-+
-+ if (run)
-+ queue_work(_kmir_mon_wq,
-+ &ms->failure_work);
-+
-+ return;
- }
-+ } else {
-+ DMERR("All replicated volumes dead, failing I/O");
-+ /* None of the writes succeeded, fail the I/O. */
-+ ret = -EIO;
-+ }
- }
-- bio_endio(bio, bio->bi_size, 0);
-+
-+ bio_endio(bio, bio->bi_size, ret);
- }
-
- static void do_write(struct mirror_set *ms, struct bio *bio)
- {
- unsigned int i;
-- struct io_region io[KCOPYD_MAX_REGIONS+1];
-+ struct io_region io[ms->nr_mirrors], *dest = io;
- struct mirror *m;
-
-- for (i = 0; i < ms->nr_mirrors; i++) {
-- m = ms->mirror + i;
--
-- io[i].bdev = m->dev->bdev;
-- io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-- io[i].count = bio->bi_size >> 9;
-- }
-+ for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
-+ map_region(dest++, m, bio);
-
-- bio_set_ms(bio, ms);
-- dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
-- bio->bi_io_vec + bio->bi_idx,
-- write_callback, bio);
-+ if (likely(dest - io)) {
-+ /*
-+ * We can use the default mirror here, because we
-+ * only need it in order to retrieve the reference
-+ * to the mirror set in write_callback().
-+ */
-+ bio_set_m(bio, ms->default_mirror);
-+ dm_io_async_bvec(dest - io, io, WRITE,
-+ bio->bi_io_vec + bio->bi_idx,
-+ write_callback, bio);
-+ } else
-+ bio_endio(bio, bio->bi_size, -EIO);
- }
-
- static void do_writes(struct mirror_set *ms, struct bio_list *writes)
-@@ -779,6 +990,9 @@ static void do_writes(struct mirror_set
- int state;
- struct bio *bio;
- struct bio_list sync, nosync, recover, *this_list = NULL;
-+ struct bio_list requeue;
-+ struct dirty_log *log = ms->rh.log;
-+ region_t region;
-
- if (!writes->head)
- return;
-@@ -789,9 +1003,18 @@ static void do_writes(struct mirror_set
- bio_list_init(&sync);
- bio_list_init(&nosync);
- bio_list_init(&recover);
-+ bio_list_init(&requeue);
-
- while ((bio = bio_list_pop(writes))) {
-- state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
-+ region = bio_to_region(&ms->rh, bio);
-+
-+ if (log->type->is_remote_recovering &&
-+ log->type->is_remote_recovering(log, region)) {
-+ bio_list_add(&requeue, bio);
-+ continue;
-+ }
-+
-+ state = rh_state(&ms->rh, region, 1);
- switch (state) {
- case RH_CLEAN:
- case RH_DIRTY:
-@@ -810,6 +1033,8 @@ static void do_writes(struct mirror_set
- bio_list_add(this_list, bio);
- }
-
-+ bio_list_merge(writes, &requeue);
-+
- /*
- * Increment the pending counts for any regions that will
- * be written to (writes to recover regions are going to
-@@ -829,7 +1054,7 @@ static void do_writes(struct mirror_set
- rh_delay(&ms->rh, bio);
-
- while ((bio = bio_list_pop(&nosync))) {
-- map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
-+ map_bio(ms->default_mirror, bio);
- generic_make_request(bio);
- }
- }
-@@ -844,12 +1069,12 @@ static void do_mirror(struct mirror_set
- {
- struct bio_list reads, writes;
-
-- spin_lock(&ms->lock);
-+ spin_lock_irq(&ms->lock);
- reads = ms->reads;
- writes = ms->writes;
- bio_list_init(&ms->reads);
- bio_list_init(&ms->writes);
-- spin_unlock(&ms->lock);
-+ spin_unlock_irq(&ms->lock);
-
- rh_update_states(&ms->rh);
- do_recovery(ms);
-@@ -871,7 +1096,7 @@ static void do_work(void *ignored)
- * Target functions
- *---------------------------------------------------------------*/
- static struct mirror_set *alloc_context(unsigned int nr_mirrors,
-- sector_t region_size,
-+ uint32_t region_size,
- struct dm_target *ti,
- struct dirty_log *dl)
- {
-@@ -891,11 +1116,16 @@ static struct mirror_set *alloc_context(
-
- memset(ms, 0, len);
- spin_lock_init(&ms->lock);
-+ spin_lock_init(&ms->choose_lock);
-
- ms->ti = ti;
- ms->nr_mirrors = nr_mirrors;
-- ms->nr_regions = dm_div_up(ti->len, region_size);
-+ ms->nr_regions = dm_sector_div_up(ti->len, region_size);
- ms->in_sync = 0;
-+ ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
-+
-+ /* a resume must be issued to start the device */
-+ atomic_set(&ms->suspended, 1);
-
- if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
- ti->error = "dm-mirror: Error creating dirty region hash";
-@@ -903,6 +1133,13 @@ static struct mirror_set *alloc_context(
- return NULL;
- }
-
-+ atomic_set(&ms->read_count, MIN_READS);
-+
-+ bio_list_init(&ms->failures);
-+ INIT_WORK(&ms->failure_work, write_failure_handler, ms);
-+
-+ init_completion(&ms->failure_completion);
-+
- return ms;
- }
-
-@@ -916,7 +1153,7 @@ static void free_context(struct mirror_s
- kfree(ms);
- }
-
--static inline int _check_region_size(struct dm_target *ti, sector_t size)
-+static inline int _check_region_size(struct dm_target *ti, uint32_t size)
- {
- return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
- size > ti->len);
-@@ -940,6 +1177,8 @@ static int get_mirror(struct mirror_set
- }
-
- ms->mirror[mirror].offset = offset;
-+ atomic_set(&(ms->mirror[mirror].error_count), 0);
-+ ms->mirror[mirror].ms = ms;
-
- return 0;
- }
-@@ -1009,8 +1248,8 @@ static struct dirty_log *create_dirty_lo
- * log_type #log_params <log_params>
- * #mirrors [mirror_path offset]{2,}
- *
-- * For now, #log_params = 1, log_type = "core"
-- *
-+ * log_type is "core" or "disk"
-+ * #log_params is between 1 and 3
- */
- #define DM_IO_PAGES 64
- static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
-@@ -1060,6 +1299,7 @@ static int mirror_ctr(struct dm_target *
- }
-
- ti->private = ms;
-+ ti->split_io = ms->rh.region_size;
-
- r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
- if (r) {
-@@ -1082,14 +1322,15 @@ static void mirror_dtr(struct dm_target
-
- static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
- {
-+ unsigned long flags;
- int should_wake = 0;
- struct bio_list *bl;
-
- bl = (rw == WRITE) ? &ms->writes : &ms->reads;
-- spin_lock(&ms->lock);
-+ spin_lock_irqsave(&ms->lock, flags);
- should_wake = !(bl->head);
- bio_list_add(bl, bio);
-- spin_unlock(&ms->lock);
-+ spin_unlock_irqrestore(&ms->lock, flags);
-
- if (should_wake)
- wake();
-@@ -1104,42 +1345,64 @@ static int mirror_map(struct dm_target *
- int r, rw = bio_rw(bio);
- struct mirror *m;
- struct mirror_set *ms = ti->private;
--
-- map_context->ll = bio->bi_sector >> ms->rh.region_shift;
-+ struct dm_bio_details *bd;
-+ struct bio_map_info *bmi;
-
- if (rw == WRITE) {
-+ /* Save region for mirror_end_io() handler */
-+ map_context->ll = bio_to_region(&ms->rh, bio);
- queue_bio(ms, bio, rw);
- return 0;
- }
-
-+ /* It's all about the READs now */
-+
- r = ms->rh.log->type->in_sync(ms->rh.log,
- bio_to_region(&ms->rh, bio), 0);
- if (r < 0 && r != -EWOULDBLOCK)
- return r;
-
-- if (r == -EWOULDBLOCK) /* FIXME: ugly */
-+ if (r == -EWOULDBLOCK)
- r = 0;
-
-- /*
-- * We don't want to fast track a recovery just for a read
-- * ahead. So we just let it silently fail.
-- * FIXME: get rid of this.
-- */
-- if (!r && rw == READA)
-- return -EIO;
-+ if (likely(r)) {
-+ /*
-+		 * Optimize reads by avoiding handing them to the daemon.
-+ *
-+ * In case they fail, queue them for another shot
-+ * in the mirror_end_io() function.
-+ */
-+ m = choose_mirror(ms, NULL);
-+ if (likely(m)) {
-+ bmi = mempool_alloc(bio_map_info_pool, GFP_NOIO);
-+
-+ if (likely(bmi)) {
-+ /* without this, a read is not retryable */
-+ bd = &bmi->bmi_bd;
-+ dm_bio_record(bd, bio);
-+ map_context->ptr = bmi;
-+ bmi->bmi_m = m;
-+ } else {
-+				/* We could fail now, but we can at least
-+				 * give it a shot.  The bd is only used to
-+				 * retry in the event of a failure anyway.
-+				 * If we fail, we can fail the I/O then. */
-+ map_context->ptr = NULL;
-+ }
-+
-+ map_bio(m, bio);
-+ return 1; /* Mapped -> queue request. */
-+ } else
-+ return -EIO;
-+ } else {
-+ /* Either not clean, or -EWOULDBLOCK */
-+ if (rw == READA)
-+ return -EWOULDBLOCK;
-
-- if (!r) {
-- /* Pass this io over to the daemon */
- queue_bio(ms, bio, rw);
-- return 0;
- }
-
-- m = choose_mirror(ms, bio->bi_sector);
-- if (!m)
-- return -EIO;
--
-- map_bio(ms, m, bio);
-- return 1;
-+ return 0;
- }
-
- static int mirror_end_io(struct dm_target *ti, struct bio *bio,
-@@ -1147,71 +1410,140 @@ static int mirror_end_io(struct dm_targe
- {
- int rw = bio_rw(bio);
- struct mirror_set *ms = (struct mirror_set *) ti->private;
-- region_t region = map_context->ll;
-+ struct mirror *m = NULL;
-+ struct dm_bio_details *bd = NULL;
-
- /*
- * We need to dec pending if this was a write.
- */
-- if (rw == WRITE)
-- rh_dec(&ms->rh, region);
-+ if (rw == WRITE) {
-+ rh_dec(&ms->rh, map_context->ll);
-+ return error;
-+ }
-
-- return 0;
-+ if (error == -EOPNOTSUPP)
-+ goto out;
-+
-+ if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-+ goto out;
-+
-+ if (unlikely(error)) {
-+ DMERR("A read failure occurred on a mirror device.");
-+ if (!map_context->ptr) {
-+ /*
-+ * There wasn't enough memory to record necessary
-+ * information for a retry.
-+ */
-+ DMERR("Out of memory causing inability to retry read.");
-+ return -EIO;
-+ }
-+ m = ((struct bio_map_info *)map_context->ptr)->bmi_m;
-+ fail_mirror(m); /* Flag error on mirror. */
-+
-+ /*
-+ * A failed read needs to get queued
-+		 * to the daemon for another shot at
-+		 * one of the intact mirrors, if any.
-+ */
-+ if (default_ok(m)) {
-+ bd = &(((struct bio_map_info *)map_context->ptr)->bmi_bd);
-+
-+ DMWARN("Trying different device.");
-+ dm_bio_restore(bd, bio);
-+ mempool_free(map_context->ptr, bio_map_info_pool);
-+ map_context->ptr = NULL;
-+ queue_bio(ms, bio, rw);
-+ return 1; /* We want another shot on the bio. */
-+ }
-+ DMERR("All replicated volumes dead, failing I/O");
-+ }
-+
-+ out:
-+ if (map_context->ptr)
-+ mempool_free(map_context->ptr, bio_map_info_pool);
-+
-+ return error;
- }
-
--static void mirror_suspend(struct dm_target *ti)
-+static void mirror_presuspend(struct dm_target *ti)
- {
- struct mirror_set *ms = (struct mirror_set *) ti->private;
- struct dirty_log *log = ms->rh.log;
-+ unsigned long flags;
-+ int run;
-+
-+ /*
-+ * Only run the completion if we are suspending after
-+ * a disk failure.
-+ */
-+ spin_lock_irqsave(&ms->lock, flags);
-+ run = ms->failures.head ? 1 : 0;
-+ spin_unlock_irqrestore(&ms->lock, flags);
-+
-+ if (run && (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK))
-+ complete(&ms->failure_completion);
-+
-+ if (log->type->presuspend && log->type->presuspend(log))
-+ /* FIXME: need better error handling */
-+ DMWARN("log presuspend failed");
-+
-+}
-+
-+static void mirror_postsuspend(struct dm_target *ti)
-+{
-+ struct mirror_set *ms = (struct mirror_set *) ti->private;
-+ struct dirty_log *log = ms->rh.log;
-+
- rh_stop_recovery(&ms->rh);
-- if (log->type->suspend && log->type->suspend(log))
-+ if (log->type->postsuspend && log->type->postsuspend(log))
- /* FIXME: need better error handling */
-- DMWARN("log suspend failed");
-+ DMWARN("log postsuspend failed");
-+ atomic_set(&ms->suspended, 1);
- }
-
- static void mirror_resume(struct dm_target *ti)
- {
- struct mirror_set *ms = (struct mirror_set *) ti->private;
- struct dirty_log *log = ms->rh.log;
-+
- if (log->type->resume && log->type->resume(log))
- /* FIXME: need better error handling */
- DMWARN("log resume failed");
-- rh_start_recovery(&ms->rh);
-+
-+ if (atomic_dec_and_test(&ms->suspended))
-+ rh_start_recovery(&ms->rh);
-+ atomic_set(&ms->suspended, 0);
- }
-
- static int mirror_status(struct dm_target *ti, status_type_t type,
- char *result, unsigned int maxlen)
- {
-- char buffer[32];
- unsigned int m, sz = 0;
- struct mirror_set *ms = (struct mirror_set *) ti->private;
--
--#define EMIT(x...) sz += ((sz >= maxlen) ? \
-- 0 : scnprintf(result + sz, maxlen - sz, x))
-+ char buffer[ms->nr_mirrors + 1];
-
- switch (type) {
- case STATUSTYPE_INFO:
-- EMIT("%d ", ms->nr_mirrors);
--
-+ DMEMIT("%d ", ms->nr_mirrors);
- for (m = 0; m < ms->nr_mirrors; m++) {
-- format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev);
-- EMIT("%s ", buffer);
-+ DMEMIT("%s ", ms->mirror[m].dev->name);
-+ buffer[m] = atomic_read(&(ms->mirror[m].error_count)) ?
-+ 'D' : 'A';
- }
-+ buffer[m] = '\0';
-
-- EMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
-- ms->rh.log->type->get_sync_count(ms->rh.log),
-- ms->nr_regions);
-+ DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT " 1 %s ",
-+ ms->rh.log->type->get_sync_count(ms->rh.log),
-+ ms->nr_regions, buffer);
-+ ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz);
- break;
-
- case STATUSTYPE_TABLE:
-- EMIT("%s 1 " SECTOR_FORMAT " %d ",
-- ms->rh.log->type->name, ms->rh.region_size,
-- ms->nr_mirrors);
--
-- for (m = 0; m < ms->nr_mirrors; m++) {
-- format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev);
-- EMIT("%s " SECTOR_FORMAT " ",
-- buffer, ms->mirror[m].offset);
-- }
-+ sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
-+ DMEMIT("%d ", ms->nr_mirrors);
-+ for (m = 0; m < ms->nr_mirrors; m++)
-+ DMEMIT("%s " SECTOR_FORMAT " ",
-+ ms->mirror[m].dev->name, ms->mirror[m].offset);
- }
-
- return 0;
-@@ -1219,13 +1551,14 @@ static int mirror_status(struct dm_targe
-
- static struct target_type mirror_target = {
- .name = "mirror",
-- .version = {1, 0, 1},
-+ .version = {1, 1, 0},
- .module = THIS_MODULE,
- .ctr = mirror_ctr,
- .dtr = mirror_dtr,
- .map = mirror_map,
- .end_io = mirror_end_io,
-- .suspend = mirror_suspend,
-+ .presuspend = mirror_presuspend,
-+ .postsuspend = mirror_postsuspend,
- .resume = mirror_resume,
- .status = mirror_status,
- };
-@@ -1234,24 +1567,38 @@ static int __init dm_mirror_init(void)
- {
- int r;
-
-+ bio_map_info_pool = mempool_create(100, bio_map_info_alloc,
-+ bio_map_info_free, NULL);
-+ if (!bio_map_info_pool)
-+ return -ENOMEM;
-+
- r = dm_dirty_log_init();
- if (r)
- return r;
-
-- _kmirrord_wq = create_workqueue("kmirrord");
-+ _kmirrord_wq = create_singlethread_workqueue("kmirrord");
- if (!_kmirrord_wq) {
- DMERR("couldn't start kmirrord");
- dm_dirty_log_exit();
-- return r;
-+ return -ENOMEM;
- }
- INIT_WORK(&_kmirrord_work, do_work, NULL);
-
-+ _kmir_mon_wq = create_singlethread_workqueue("kmir_mon");
-+ if (!_kmir_mon_wq) {
-+ DMERR("couldn't start kmir_mon");
-+ dm_dirty_log_exit();
-+ destroy_workqueue(_kmirrord_wq);
-+ return -ENOMEM;
-+ }
-+
- r = dm_register_target(&mirror_target);
- if (r < 0) {
- DMERR("%s: Failed to register mirror target",
- mirror_target.name);
- dm_dirty_log_exit();
- destroy_workqueue(_kmirrord_wq);
-+ destroy_workqueue(_kmir_mon_wq);
- }
-
- return r;
-diff -pruN ./drivers/md.dm/dm-round-robin.c ./drivers/md/dm-round-robin.c
---- ./drivers/md.dm/dm-round-robin.c 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/dm-round-robin.c 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,214 @@
-+/*
-+ * Copyright (C) 2003 Sistina Software.
-+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
-+ *
-+ * Module Author: Heinz Mauelshagen
-+ *
-+ * This file is released under the GPL.
-+ *
-+ * Round-robin path selector.
-+ */
-+
-+#include "dm.h"
-+#include "dm-path-selector.h"
-+
-+#include <linux/slab.h>
-+
-+/*-----------------------------------------------------------------
-+ * Path-handling code, paths are held in lists
-+ *---------------------------------------------------------------*/
-+struct path_info {
-+ struct list_head list;
-+ struct path *path;
-+ unsigned repeat_count;
-+};
-+
-+static void free_paths(struct list_head *paths)
-+{
-+ struct path_info *pi, *next;
-+
-+ list_for_each_entry_safe(pi, next, paths, list) {
-+ list_del(&pi->list);
-+ kfree(pi);
-+ }
-+}
-+
-+/*-----------------------------------------------------------------
-+ * Round-robin selector
-+ *---------------------------------------------------------------*/
-+
-+#define RR_MIN_IO 1000
-+
-+struct selector {
-+ struct list_head valid_paths;
-+ struct list_head invalid_paths;
-+};
-+
-+static struct selector *alloc_selector(void)
-+{
-+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
-+
-+ if (s) {
-+ INIT_LIST_HEAD(&s->valid_paths);
-+ INIT_LIST_HEAD(&s->invalid_paths);
-+ }
-+
-+ return s;
-+}
-+
-+static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
-+{
-+ struct selector *s;
-+
-+ s = alloc_selector();
-+ if (!s)
-+ return -ENOMEM;
-+
-+ ps->context = s;
-+ return 0;
-+}
-+
-+static void rr_destroy(struct path_selector *ps)
-+{
-+ struct selector *s = (struct selector *) ps->context;
-+
-+ free_paths(&s->valid_paths);
-+ free_paths(&s->invalid_paths);
-+ kfree(s);
-+ ps->context = NULL;
-+}
-+
-+static int rr_status(struct path_selector *ps, struct path *path,
-+ status_type_t type, char *result, unsigned int maxlen)
-+{
-+ struct path_info *pi;
-+ int sz = 0;
-+
-+ if (!path)
-+ DMEMIT("0 ");
-+ else {
-+ switch(type) {
-+ case STATUSTYPE_INFO:
-+ break;
-+ case STATUSTYPE_TABLE:
-+ pi = path->pscontext;
-+ DMEMIT("%u ", pi->repeat_count);
-+ break;
-+ }
-+ }
-+
-+ return sz;
-+}
-+
-+/*
-+ * Called during initialisation to register each path with an
-+ * optional repeat_count.
-+ */
-+static int rr_add_path(struct path_selector *ps, struct path *path,
-+ int argc, char **argv, char **error)
-+{
-+ struct selector *s = (struct selector *) ps->context;
-+ struct path_info *pi;
-+ unsigned repeat_count = RR_MIN_IO;
-+
-+ if (argc > 1) {
-+ *error = "round-robin ps: incorrect number of arguments";
-+ return -EINVAL;
-+ }
-+
-+ /* First path argument is number of I/Os before switching path */
-+ if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
-+ *error = "round-robin ps: invalid repeat count";
-+ return -EINVAL;
-+ }
-+
-+ /* allocate the path */
-+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
-+ if (!pi) {
-+ *error = "round-robin ps: Error allocating path context";
-+ return -ENOMEM;
-+ }
-+
-+ pi->path = path;
-+ pi->repeat_count = repeat_count;
-+
-+ path->pscontext = pi;
-+
-+ list_add(&pi->list, &s->valid_paths);
-+
-+ return 0;
-+}
-+
-+static void rr_fail_path(struct path_selector *ps, struct path *p)
-+{
-+ struct selector *s = (struct selector *) ps->context;
-+ struct path_info *pi = p->pscontext;
-+
-+ list_move(&pi->list, &s->invalid_paths);
-+}
-+
-+static int rr_reinstate_path(struct path_selector *ps, struct path *p)
-+{
-+ struct selector *s = (struct selector *) ps->context;
-+ struct path_info *pi = p->pscontext;
-+
-+ list_move(&pi->list, &s->valid_paths);
-+
-+ return 0;
-+}
-+
-+static struct path *rr_select_path(struct path_selector *ps,
-+ unsigned *repeat_count)
-+{
-+ struct selector *s = (struct selector *) ps->context;
-+ struct path_info *pi = NULL;
-+
-+ if (!list_empty(&s->valid_paths)) {
-+ pi = list_entry(s->valid_paths.next, struct path_info, list);
-+ list_move_tail(&pi->list, &s->valid_paths);
-+ *repeat_count = pi->repeat_count;
-+ }
-+
-+ return pi ? pi->path : NULL;
-+}
-+
-+static struct path_selector_type rr_ps = {
-+ .name = "round-robin",
-+ .module = THIS_MODULE,
-+ .table_args = 1,
-+ .info_args = 0,
-+ .create = rr_create,
-+ .destroy = rr_destroy,
-+ .status = rr_status,
-+ .add_path = rr_add_path,
-+ .fail_path = rr_fail_path,
-+ .reinstate_path = rr_reinstate_path,
-+ .select_path = rr_select_path,
-+};
-+
-+static int __init dm_rr_init(void)
-+{
-+ int r = dm_register_path_selector(&rr_ps);
-+
-+ if (r < 0)
-+ DMERR("round-robin: register failed %d", r);
-+
-+ DMINFO("dm-round-robin version 1.0.0 loaded");
-+
-+ return r;
-+}
-+
-+static void __exit dm_rr_exit(void)
-+{
-+ int r = dm_unregister_path_selector(&rr_ps);
-+
-+ if (r < 0)
-+ DMERR("round-robin: unregister failed %d", r);
-+}
-+
-+module_init(dm_rr_init);
-+module_exit(dm_rr_exit);
-+
-+MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector");
-+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
-+MODULE_LICENSE("GPL");
-diff -pruN ./drivers/md.dm/dm-snap.c ./drivers/md/dm-snap.c
---- ./drivers/md.dm/dm-snap.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-snap.c 2006-03-17 13:16:38.000000000 +0300
-@@ -49,6 +49,11 @@ struct pending_exception {
- struct bio_list snapshot_bios;
-
- /*
-+ * Short-term queue of pending exceptions prior to submission.
-+ */
-+ struct list_head list;
-+
-+ /*
- * Other pending_exceptions that are processing this
- * chunk. When this list is empty, we know we can
- * complete the origins.
-@@ -371,6 +376,15 @@ static inline ulong round_up(ulong n, ul
- return (n + size) & ~size;
- }
-
-+static void read_snapshot_metadata(struct dm_snapshot *s)
-+{
-+ if (s->store.read_metadata(&s->store)) {
-+ down_write(&s->lock);
-+ s->valid = 0;
-+ up_write(&s->lock);
-+ }
-+}
-+
- /*
- * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
- */
-@@ -457,7 +471,7 @@ static int snapshot_ctr(struct dm_target
- s->chunk_shift = ffs(chunk_size) - 1;
-
- s->valid = 1;
-- s->have_metadata = 0;
-+ s->active = 0;
- s->last_percent = 0;
- init_rwsem(&s->lock);
- s->table = ti->table;
-@@ -492,7 +506,11 @@ static int snapshot_ctr(struct dm_target
- goto bad5;
- }
-
-+	/* Metadata must only be loaded into one table at a time */
-+ read_snapshot_metadata(s);
-+
- /* Add snapshot to the list of snapshots for this origin */
-+ /* Exceptions aren't triggered till snapshot_resume() is called */
- if (register_snapshot(s)) {
- r = -EINVAL;
- ti->error = "Cannot register snapshot origin";
-@@ -529,8 +547,12 @@ static void snapshot_dtr(struct dm_targe
- {
- struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
-
-+ /* Prevent further origin writes from using this snapshot. */
-+ /* After this returns there can be no new kcopyd jobs. */
- unregister_snapshot(s);
-
-+ kcopyd_client_destroy(s->kcopyd_client);
-+
- exit_exception_table(&s->pending, pending_cache);
- exit_exception_table(&s->complete, exception_cache);
-
-@@ -539,7 +561,7 @@ static void snapshot_dtr(struct dm_targe
-
- dm_put_device(ti, s->origin);
- dm_put_device(ti, s->cow);
-- kcopyd_client_destroy(s->kcopyd_client);
-+
- kfree(s);
- }
-
-@@ -777,7 +799,10 @@ static int snapshot_map(struct dm_target
-
- /* Full snapshots are not usable */
- if (!s->valid)
-- return -1;
-+ return -EIO;
-+
-+ if (unlikely(bio_barrier(bio)))
-+ return -EOPNOTSUPP;
-
- /*
- * Write to snapshot - higher level takes care of RW/RO
-@@ -848,24 +873,15 @@ static void snapshot_resume(struct dm_ta
- {
- struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
-
-- if (s->have_metadata)
-- return;
--
-- if (s->store.read_metadata(&s->store)) {
-- down_write(&s->lock);
-- s->valid = 0;
-- up_write(&s->lock);
-- }
--
-- s->have_metadata = 1;
-+ down_write(&s->lock);
-+ s->active = 1;
-+ up_write(&s->lock);
- }
-
- static int snapshot_status(struct dm_target *ti, status_type_t type,
- char *result, unsigned int maxlen)
- {
- struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
-- char cow[32];
-- char org[32];
-
- switch (type) {
- case STATUSTYPE_INFO:
-@@ -892,9 +908,8 @@ static int snapshot_status(struct dm_tar
- * to make private copies if the output is to
- * make sense.
- */
-- format_dev_t(cow, snap->cow->bdev->bd_dev);
-- format_dev_t(org, snap->origin->bdev->bd_dev);
-- snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, org, cow,
-+ snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT,
-+ snap->origin->name, snap->cow->name,
- snap->type, snap->chunk_size);
- break;
- }
-@@ -924,14 +939,19 @@ static int __origin_write(struct list_he
- int r = 1, first = 1;
- struct dm_snapshot *snap;
- struct exception *e;
-- struct pending_exception *pe, *last = NULL;
-+ struct pending_exception *pe, *next_pe, *last = NULL;
- chunk_t chunk;
-+ LIST_HEAD(pe_queue);
-
- /* Do all the snapshots on this origin */
- list_for_each_entry (snap, snapshots, list) {
-
-- /* Only deal with valid snapshots */
-- if (!snap->valid)
-+ /* Only deal with valid and active snapshots */
-+ if (!snap->valid || !snap->active)
-+ continue;
-+
-+ /* Nothing to do if writing beyond end of snapshot */
-+ if (bio->bi_sector >= dm_table_get_size(snap->table))
- continue;
-
- down_write(&snap->lock);
-@@ -955,12 +975,19 @@ static int __origin_write(struct list_he
- snap->valid = 0;
-
- } else {
-- if (last)
-+ if (first) {
-+ bio_list_add(&pe->origin_bios, bio);
-+ r = 0;
-+ first = 0;
-+ }
-+ if (last && list_empty(&pe->siblings))
- list_merge(&pe->siblings,
- &last->siblings);
--
-+ if (!pe->started) {
-+ pe->started = 1;
-+ list_add_tail(&pe->list, &pe_queue);
-+ }
- last = pe;
-- r = 0;
- }
- }
-
-@@ -970,24 +997,8 @@ static int __origin_write(struct list_he
- /*
- * Now that we have a complete pe list we can start the copying.
- */
-- if (last) {
-- pe = last;
-- do {
-- down_write(&pe->snap->lock);
-- if (first)
-- bio_list_add(&pe->origin_bios, bio);
-- if (!pe->started) {
-- pe->started = 1;
-- up_write(&pe->snap->lock);
-- start_copy(pe);
-- } else
-- up_write(&pe->snap->lock);
-- first = 0;
-- pe = list_entry(pe->siblings.next,
-- struct pending_exception, siblings);
--
-- } while (pe != last);
-- }
-+ list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
-+ start_copy(pe);
-
- return r;
- }
-@@ -1051,6 +1062,9 @@ static int origin_map(struct dm_target *
- struct dm_dev *dev = (struct dm_dev *) ti->private;
- bio->bi_bdev = dev->bdev;
-
-+ if (unlikely(bio_barrier(bio)))
-+ return -EOPNOTSUPP;
-+
- /* Only tell snapshots if this is a write */
- return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
- }
-@@ -1082,7 +1096,6 @@ static int origin_status(struct dm_targe
- unsigned int maxlen)
- {
- struct dm_dev *dev = (struct dm_dev *) ti->private;
-- char buffer[32];
-
- switch (type) {
- case STATUSTYPE_INFO:
-@@ -1090,8 +1103,7 @@ static int origin_status(struct dm_targe
- break;
-
- case STATUSTYPE_TABLE:
-- format_dev_t(buffer, dev->bdev->bd_dev);
-- snprintf(result, maxlen, "%s", buffer);
-+ snprintf(result, maxlen, "%s", dev->name);
- break;
- }
-
-@@ -1100,7 +1112,7 @@ static int origin_status(struct dm_targe
-
- static struct target_type origin_target = {
- .name = "snapshot-origin",
-- .version = {1, 0, 1},
-+ .version = {1, 2, 0},
- .module = THIS_MODULE,
- .ctr = origin_ctr,
- .dtr = origin_dtr,
-@@ -1111,7 +1123,7 @@ static struct target_type origin_target
-
- static struct target_type snapshot_target = {
- .name = "snapshot",
-- .version = {1, 0, 1},
-+ .version = {1, 2, 0},
- .module = THIS_MODULE,
- .ctr = snapshot_ctr,
- .dtr = snapshot_dtr,
-diff -pruN ./drivers/md.dm/dm-snap.h ./drivers/md/dm-snap.h
---- ./drivers/md.dm/dm-snap.h 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-snap.h 2006-03-17 13:16:38.000000000 +0300
-@@ -99,7 +99,9 @@ struct dm_snapshot {
-
- /* You can't use a snapshot if this is 0 (e.g. if full) */
- int valid;
-- int have_metadata;
-+
-+ /* Origin writes don't trigger exceptions until this is set */
-+ int active;
-
- /* Used for display of table */
- char type;
-diff -pruN ./drivers/md.dm/dm-stripe.c ./drivers/md/dm-stripe.c
---- ./drivers/md.dm/dm-stripe.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-stripe.c 2006-03-17 13:16:38.000000000 +0300
-@@ -21,7 +21,7 @@ struct stripe_c {
- uint32_t stripes;
-
- /* The size of this target / num. stripes */
-- uint32_t stripe_width;
-+ sector_t stripe_width;
-
- /* stripe chunk size */
- uint32_t chunk_shift;
-@@ -173,9 +173,8 @@ static int stripe_map(struct dm_target *
- struct stripe_c *sc = (struct stripe_c *) ti->private;
-
- sector_t offset = bio->bi_sector - ti->begin;
-- uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
-- uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */
-- chunk = chunk / sc->stripes;
-+ sector_t chunk = offset >> sc->chunk_shift;
-+ uint32_t stripe = do_div(chunk, sc->stripes);
-
- bio->bi_bdev = sc->stripe[stripe].dev->bdev;
- bio->bi_sector = sc->stripe[stripe].physical_start +
-@@ -189,10 +188,6 @@ static int stripe_status(struct dm_targe
- struct stripe_c *sc = (struct stripe_c *) ti->private;
- unsigned int sz = 0;
- unsigned int i;
-- char buffer[32];
--
--#define EMIT(x...) sz += ((sz >= maxlen) ? \
-- 0 : scnprintf(result + sz, maxlen - sz, x))
-
- switch (type) {
- case STATUSTYPE_INFO:
-@@ -200,12 +195,10 @@ static int stripe_status(struct dm_targe
- break;
-
- case STATUSTYPE_TABLE:
-- EMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1);
-- for (i = 0; i < sc->stripes; i++) {
-- format_dev_t(buffer, sc->stripe[i].dev->bdev->bd_dev);
-- EMIT(" %s " SECTOR_FORMAT, buffer,
-- sc->stripe[i].physical_start);
-- }
-+ DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1);
-+ for (i = 0; i < sc->stripes; i++)
-+ DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name,
-+ sc->stripe[i].physical_start);
- break;
- }
- return 0;
-@@ -213,7 +206,7 @@ static int stripe_status(struct dm_targe
-
- static struct target_type stripe_target = {
- .name = "striped",
-- .version= {1, 0, 1},
-+ .version= {1, 0, 2},
- .module = THIS_MODULE,
- .ctr = stripe_ctr,
- .dtr = stripe_dtr,
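
The stripe_map() hunk above replaces the 32-bit "chunk % sc->stripes" with
do_div(), so the chunk number can stay a full sector_t. A small worked example
of the new mapping (illustrative; it assumes the usual remainder-within-chunk
term of the original function, which this hunk truncates):

	/*
	 * Assume 4 stripes and 64-sector chunks (chunk_shift = 6,
	 * chunk_mask = 63), and a bio 300 sectors into the target:
	 *
	 *	sector_t chunk  = 300 >> 6;		chunk  = 4
	 *	uint32_t stripe = do_div(chunk, 4);	stripe = 0, chunk = 1
	 *
	 *	bio->bi_sector  = sc->stripe[0].physical_start
	 *			+ (chunk << 6)		64: second chunk there
	 *			+ (300 & 63);		44: offset within chunk
	 *
	 * do_div() returns the remainder and leaves the quotient in its
	 * first argument, which is what makes the 64-bit-safe replacement
	 * a one-liner.
	 */
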
-diff -pruN ./drivers/md.dm/dm-table.c ./drivers/md/dm-table.c
---- ./drivers/md.dm/dm-table.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-table.c 2006-03-17 13:16:38.000000000 +0300
-@@ -57,7 +57,7 @@ struct dm_table {
- /*
- * Similar to ceiling(log_size(n))
- */
--static unsigned int int_log(unsigned long n, unsigned long base)
-+static unsigned int int_log(unsigned int n, unsigned int base)
- {
- int result = 0;
-
-@@ -454,6 +454,8 @@ static int __table_get_device(struct dm_
- return r;
- }
-
-+ format_dev_t(dd->name, dev);
-+
- atomic_set(&dd->count, 0);
- list_add(&dd->list, &t->devices);
-
-@@ -575,7 +577,7 @@ static char **realloc_argv(unsigned *arr
- /*
- * Destructively splits up the argument list to pass to ctr.
- */
--static int split_args(int *argc, char ***argvp, char *input)
-+int dm_split_args(int *argc, char ***argvp, char *input)
- {
- char *start, *end = input, *out, **argv = NULL;
- unsigned array_size = 0;
-@@ -663,14 +665,14 @@ int dm_table_add_target(struct dm_table
-
- if (!len) {
- tgt->error = "zero-length target";
-- DMERR(": %s\n", tgt->error);
-+ DMERR("%s", tgt->error);
- return -EINVAL;
- }
-
- tgt->type = dm_get_target_type(type);
- if (!tgt->type) {
- tgt->error = "unknown target type";
-- DMERR(": %s\n", tgt->error);
-+ DMERR("%s", tgt->error);
- return -EINVAL;
- }
-
-@@ -688,7 +690,7 @@ int dm_table_add_target(struct dm_table
- goto bad;
- }
-
-- r = split_args(&argc, &argv, params);
-+ r = dm_split_args(&argc, &argv, params);
- if (r) {
- tgt->error = "couldn't split parameters (insufficient memory)";
- goto bad;
-@@ -707,7 +709,7 @@ int dm_table_add_target(struct dm_table
- return 0;
-
- bad:
-- DMERR(": %s\n", tgt->error);
-+ DMERR("%s", tgt->error);
- dm_put_target_type(tgt->type);
- return r;
- }
-@@ -825,7 +827,7 @@ void dm_table_set_restrictions(struct dm
- * Make sure we obey the optimistic sub devices
- * restrictions.
- */
-- q->max_sectors = t->limits.max_sectors;
-+ blk_queue_max_sectors(q, t->limits.max_sectors);
- q->max_phys_segments = t->limits.max_phys_segments;
- q->max_hw_segments = t->limits.max_hw_segments;
- q->hardsect_size = t->limits.hardsect_size;
-@@ -848,18 +850,38 @@ int dm_table_get_mode(struct dm_table *t
- return t->mode;
- }
-
--void dm_table_suspend_targets(struct dm_table *t)
-+static void suspend_targets(struct dm_table *t, unsigned postsuspend)
- {
-- int i;
-+ int i = t->num_targets;
-+ struct dm_target *ti = t->targets;
-
-- for (i = 0; i < t->num_targets; i++) {
-- struct dm_target *ti = t->targets + i;
-+ while (i--) {
-+ if (postsuspend) {
-+ if (ti->type->postsuspend)
-+ ti->type->postsuspend(ti);
-+ } else if (ti->type->presuspend)
-+ ti->type->presuspend(ti);
-
-- if (ti->type->suspend)
-- ti->type->suspend(ti);
-+ ti++;
- }
- }
-
-+void dm_table_presuspend_targets(struct dm_table *t)
-+{
-+ if (!t)
-+ return;
-+
-+ return suspend_targets(t, 0);
-+}
-+
-+void dm_table_postsuspend_targets(struct dm_table *t)
-+{
-+ if (!t)
-+ return;
-+
-+ return suspend_targets(t, 1);
-+}
-+
- void dm_table_resume_targets(struct dm_table *t)
- {
- int i;
-@@ -900,11 +922,35 @@ void dm_table_unplug_all(struct dm_table
- }
- }
-
-+int dm_table_flush_all(struct dm_table *t)
-+{
-+ struct list_head *d, *devices = dm_table_get_devices(t);
-+ int ret = 0;
-+
-+ for (d = devices->next; d != devices; d = d->next) {
-+ struct dm_dev *dd = list_entry(d, struct dm_dev, list);
-+ request_queue_t *q = bdev_get_queue(dd->bdev);
-+ int err;
-+
-+ if (!q->issue_flush_fn)
-+ err = -EOPNOTSUPP;
-+ else
-+ err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL);
-+
-+ if (!ret)
-+ ret = err;
-+ }
-+
-+ return ret;
-+}
-+
- EXPORT_SYMBOL(dm_vcalloc);
- EXPORT_SYMBOL(dm_get_device);
- EXPORT_SYMBOL(dm_put_device);
- EXPORT_SYMBOL(dm_table_event);
-+EXPORT_SYMBOL(dm_table_get_size);
- EXPORT_SYMBOL(dm_table_get_mode);
- EXPORT_SYMBOL(dm_table_put);
- EXPORT_SYMBOL(dm_table_get);
- EXPORT_SYMBOL(dm_table_unplug_all);
-+EXPORT_SYMBOL(dm_table_flush_all);
-diff -pruN ./drivers/md.dm/dm-target.c ./drivers/md/dm-target.c
---- ./drivers/md.dm/dm-target.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-target.c 2006-03-17 13:16:38.000000000 +0300
-@@ -120,10 +120,9 @@ int dm_register_target(struct target_typ
- return -ENOMEM;
-
- down_write(&_lock);
-- if (__find_target_type(t->name)) {
-- kfree(ti);
-+ if (__find_target_type(t->name))
- rv = -EEXIST;
-- } else
-+ else
- list_add(&ti->list, &_targets);
-
- up_write(&_lock);
-diff -pruN ./drivers/md.dm/Kconfig ./drivers/md/Kconfig
---- ./drivers/md.dm/Kconfig 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/Kconfig 2006-03-17 13:16:38.000000000 +0300
-@@ -85,6 +85,24 @@ config MD_RAID1
-
- If unsure, say Y.
-
-+config MD_RAID10
-+ tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
-+ depends on BLK_DEV_MD && EXPERIMENTAL
-+ ---help---
-+ RAID-10 provides a combination of striping (RAID-0) and
-+	  mirroring (RAID-1) with easier configuration and more flexible
-+	  layout.
-+	  Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
-+	  be the same size (or at least, only as much as the smallest device
-+	  will be used).
-+ RAID-10 provides a variety of layouts that provide different levels
-+ of redundancy and performance.
-+
-+ RAID-10 requires mdadm-1.7.0 or later, available at:
-+
-+ ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
-+
-+
- config MD_RAID5
- tristate "RAID-4/RAID-5 mode"
- depends on BLK_DEV_MD
-@@ -200,5 +218,17 @@ config DM_ZERO
- A target that discards writes, and returns all zeroes for
- reads. Useful in some recovery situations.
-
-+config DM_MULTIPATH
-+ tristate "Multipath target (EXPERIMENTAL)"
-+ depends on BLK_DEV_DM && EXPERIMENTAL
-+ ---help---
-+ Allow volume managers to support multipath hardware.
-+
-+config DM_MULTIPATH_EMC
-+ tristate "EMC CX/AX multipath support (EXPERIMENTAL)"
-+ depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL
-+ ---help---
-+ Multipath support for EMC CX/AX series hardware.
-+
- endmenu
-
-diff -pruN ./drivers/md.dm/kcopyd.c ./drivers/md/kcopyd.c
---- ./drivers/md.dm/kcopyd.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/kcopyd.c 2006-03-20 09:36:55.000000000 +0300
-@@ -43,6 +43,10 @@ struct kcopyd_client {
- struct page_list *pages;
- unsigned int nr_pages;
- unsigned int nr_free_pages;
-+#ifndef __GENKSYMS__
-+ wait_queue_head_t destroyq;
-+ atomic_t nr_jobs;
-+#endif
- };
-
- static struct page_list *alloc_pl(void)
-@@ -292,10 +296,15 @@ static int run_complete_job(struct kcopy
- int read_err = job->read_err;
- unsigned int write_err = job->write_err;
- kcopyd_notify_fn fn = job->fn;
-+ struct kcopyd_client *kc = job->kc;
-
-- kcopyd_put_pages(job->kc, job->pages);
-+ kcopyd_put_pages(kc, job->pages);
- mempool_free(job, _job_pool);
- fn(read_err, write_err, context);
-+
-+ if (atomic_dec_and_test(&kc->nr_jobs))
-+ wake_up(&kc->destroyq);
-+
- return 0;
- }
-
-@@ -430,6 +439,7 @@ static void do_work(void *ignored)
- */
- static void dispatch_job(struct kcopyd_job *job)
- {
-+ atomic_inc(&job->kc->nr_jobs);
- push(&_pages_jobs, job);
- wake();
- }
-@@ -667,6 +677,9 @@ int kcopyd_client_create(unsigned int nr
- return r;
- }
-
-+ init_waitqueue_head(&kc->destroyq);
-+ atomic_set(&kc->nr_jobs, 0);
-+
- client_add(kc);
- *result = kc;
- return 0;
-@@ -674,6 +687,9 @@ int kcopyd_client_create(unsigned int nr
-
- void kcopyd_client_destroy(struct kcopyd_client *kc)
- {
-+ /* Wait for completion of all jobs submitted by this client. */
-+ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
-+
- dm_io_put(kc->nr_pages);
- client_free_pages(kc);
- client_del(kc);
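
The kcopyd hunks above add an in-flight job counter and a waitqueue so that
kcopyd_client_destroy() can wait for every outstanding job before tearing the
client down. The same pattern in isolation, as a generic sketch (the names
below are invented; only the mechanism mirrors the patch):

	#include <linux/wait.h>
	#include <asm/atomic.h>

	struct async_ctx {
		atomic_t nr_jobs;
		wait_queue_head_t destroyq;
	};

	static void ctx_init(struct async_ctx *c)
	{
		atomic_set(&c->nr_jobs, 0);
		init_waitqueue_head(&c->destroyq);
	}

	static void job_submit(struct async_ctx *c)
	{
		atomic_inc(&c->nr_jobs);	/* before the job is queued */
		/* ... hand the job to the worker ... */
	}

	static void job_complete(struct async_ctx *c)
	{
		/* ... run the completion callback first ... */
		if (atomic_dec_and_test(&c->nr_jobs))
			wake_up(&c->destroyq);
	}

	static void ctx_destroy(struct async_ctx *c)
	{
		/* Blocks until every submitted job has completed. */
		wait_event(c->destroyq, !atomic_read(&c->nr_jobs));
	}
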
-diff -pruN ./drivers/md.dm/linear.c ./drivers/md/linear.c
---- ./drivers/md.dm/linear.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/linear.c 2006-03-17 13:16:38.000000000 +0300
-@@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde
- return hash->dev0;
- }
-
--
- /**
- * linear_mergeable_bvec -- tell bio layer if a two requests can be merged
- * @q: request queue
-@@ -93,13 +92,35 @@ static void linear_unplug(request_queue_
- }
- }
-
-+static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ linear_conf_t *conf = mddev_to_conf(mddev);
-+ int i, ret = 0;
-+
-+ for (i=0; i < mddev->raid_disks; i++) {
-+ struct block_device *bdev = conf->disks[i].rdev->bdev;
-+ request_queue_t *r_queue = bdev_get_queue(bdev);
-+
-+ if (!r_queue->issue_flush_fn) {
-+ ret = -EOPNOTSUPP;
-+ break;
-+ }
-+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+ if (ret)
-+ break;
-+ }
-+ return ret;
-+}
-
- static int linear_run (mddev_t *mddev)
- {
- linear_conf_t *conf;
- struct linear_hash *table;
- mdk_rdev_t *rdev;
-- int size, i, nb_zone, cnt;
-+ int i, nb_zone, cnt;
-+ sector_t size;
- unsigned int curr_offset;
- struct list_head *tmp;
-
-@@ -137,7 +158,7 @@ static int linear_run (mddev_t *mddev)
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- mddev->queue->max_sectors > (PAGE_SIZE>>9))
-- mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
- disk->size = rdev->size;
- mddev->array_size += rdev->size;
-@@ -200,6 +221,7 @@ static int linear_run (mddev_t *mddev)
-
- blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
- mddev->queue->unplug_fn = linear_unplug;
-+ mddev->queue->issue_flush_fn = linear_issue_flush;
- return 0;
-
- out:
-@@ -247,10 +269,11 @@ static int linear_make_request (request_
- char b[BDEVNAME_SIZE];
-
- printk("linear_make_request: Block %llu out of bounds on "
-- "dev %s size %ld offset %ld\n",
-+ "dev %s size %llu offset %llu\n",
- (unsigned long long)block,
- bdevname(tmp_dev->rdev->bdev, b),
-- tmp_dev->size, tmp_dev->offset);
-+ (unsigned long long)tmp_dev->size,
-+ (unsigned long long)tmp_dev->offset);
- bio_io_error(bio, bio->bi_size);
- return 0;
- }
-diff -pruN ./drivers/md.dm/Makefile ./drivers/md/Makefile
---- ./drivers/md.dm/Makefile 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/Makefile 2006-03-17 13:16:38.000000000 +0300
-@@ -4,13 +4,16 @@
-
- dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
- dm-ioctl.o dm-io.o kcopyd.o
-+dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
- dm-snapshot-objs := dm-snap.o dm-exception-store.o
- dm-mirror-objs := dm-log.o dm-raid1.o
- raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \
- raid6int1.o raid6int2.o raid6int4.o \
- raid6int8.o raid6int16.o raid6int32.o \
- raid6mmx.o raid6sse1.o raid6sse2.o
--host-progs := mktables
-+hostprogs-y := mktables
-+
-+CFLAGS_raid6int8.o += -O2
-
- # Note: link order is important. All raid personalities
- # and xor.o must come before md.o, as they each initialise
-@@ -20,12 +23,15 @@ host-progs := mktables
- obj-$(CONFIG_MD_LINEAR) += linear.o
- obj-$(CONFIG_MD_RAID0) += raid0.o
- obj-$(CONFIG_MD_RAID1) += raid1.o
-+obj-$(CONFIG_MD_RAID10) += raid10.o
- obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
- obj-$(CONFIG_MD_RAID6) += raid6.o xor.o
- obj-$(CONFIG_MD_MULTIPATH) += multipath.o
- obj-$(CONFIG_BLK_DEV_MD) += md.o
- obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
- obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
-+obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
-+obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o
- obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
- obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
- obj-$(CONFIG_DM_ZERO) += dm-zero.o
-diff -pruN ./drivers/md.dm/md.c ./drivers/md/md.c
---- ./drivers/md.dm/md.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/md.c 2006-03-17 13:22:09.000000000 +0300
-@@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN
- tmp = tmp->next;}) \
- )
-
-+int md_flush_mddev(mddev_t *mddev, sector_t *error_sector)
-+{
-+ struct list_head *tmp;
-+ mdk_rdev_t *rdev;
-+ int ret = 0;
-+
-+ /*
-+ * this list iteration is done without any locking in md?!
-+ */
-+ ITERATE_RDEV(mddev, rdev, tmp) {
-+ request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
-+ int err;
-+
-+ if (!r_queue->issue_flush_fn)
-+ err = -EOPNOTSUPP;
-+ else
-+ err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector);
-+
-+ if (!ret)
-+ ret = err;
-+ }
-+
-+ return ret;
-+}
-+
-+static int md_flush_all(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ mddev_t *mddev = q->queuedata;
-+
-+ return md_flush_mddev(mddev, error_sector);
-+}
-+
- static int md_fail_request (request_queue_t *q, struct bio *bio)
- {
- bio_io_error(bio, bio->bi_size);
-@@ -331,29 +364,24 @@ static int bi_complete(struct bio *bio,
- static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
- struct page *page, int rw)
- {
-- struct bio bio;
-- struct bio_vec vec;
-+ struct bio *bio = bio_alloc(GFP_NOIO, 1);
- struct completion event;
-+ int ret;
-
- rw |= (1 << BIO_RW_SYNC);
-
-- bio_init(&bio);
-- bio.bi_io_vec = &vec;
-- vec.bv_page = page;
-- vec.bv_len = size;
-- vec.bv_offset = 0;
-- bio.bi_vcnt = 1;
-- bio.bi_idx = 0;
-- bio.bi_size = size;
-- bio.bi_bdev = bdev;
-- bio.bi_sector = sector;
-+ bio->bi_bdev = bdev;
-+ bio->bi_sector = sector;
-+ bio_add_page(bio, page, size, 0);
- init_completion(&event);
-- bio.bi_private = &event;
-- bio.bi_end_io = bi_complete;
-- submit_bio(rw, &bio);
-+ bio->bi_private = &event;
-+ bio->bi_end_io = bi_complete;
-+ submit_bio(rw, bio);
- wait_for_completion(&event);
-
-- return test_bit(BIO_UPTODATE, &bio.bi_flags);
-+ ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+ bio_put(bio);
-+ return ret;
- }
-
- static int read_disk_sb(mdk_rdev_t * rdev)
-@@ -373,7 +401,7 @@ static int read_disk_sb(mdk_rdev_t * rde
- return 0;
-
- fail:
-- printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
-+ printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
- bdevname(rdev->bdev,b));
- return -EINVAL;
- }
-@@ -439,6 +467,31 @@ static unsigned int calc_sb_csum(mdp_sup
- return csum;
- }
-
-+/* csum_partial is not consistent between different architectures.
-+ * Some (i386) do a 32bit csum. Some (alpha) do 16 bit.
-+ * This makes it hard for user-space to know what to do.
-+ * So we use calc_sb_csum to set the checksum to allow working
-+ * with older kernels, but allow calc_sb_csum_common to
-+ * be used when checking if a checksum is correct, to
-+ * make life easier for user-space tools that might write
-+ * a superblock.
-+ */
-+static unsigned int calc_sb_csum_common(mdp_super_t *super)
-+{
-+ unsigned int disk_csum = super->sb_csum;
-+ unsigned long long newcsum = 0;
-+ unsigned int csum;
-+ int i;
-+ unsigned int *superc = (int*) super;
-+ super->sb_csum = 0;
-+
-+ for (i=0; i<MD_SB_BYTES/4; i++)
-+ newcsum+= superc[i];
-+ csum = (newcsum& 0xffffffff) + (newcsum>>32);
-+ super->sb_csum = disk_csum;
-+ return csum;
-+}
-+
- /*
- * Handle superblock details.
- * We want to be able to handle multiple superblock formats
-@@ -521,7 +574,8 @@ static int super_90_load(mdk_rdev_t *rde
- if (sb->raid_disks <= 0)
- goto abort;
-
-- if (calc_sb_csum(sb) != sb->sb_csum) {
-+ if (calc_sb_csum(sb) != sb->sb_csum &&
-+ calc_sb_csum_common(sb) != sb->sb_csum) {
- printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
- b);
- goto abort;
-@@ -530,7 +584,7 @@ static int super_90_load(mdk_rdev_t *rde
- rdev->preferred_minor = sb->md_minor;
- rdev->data_offset = 0;
-
-- if (sb->level == MULTIPATH)
-+ if (sb->level == LEVEL_MULTIPATH)
- rdev->desc_nr = -1;
- else
- rdev->desc_nr = sb->this_disk.number;
-@@ -745,11 +799,21 @@ static void super_90_sync(mddev_t *mddev
- static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
- {
- unsigned int disk_csum, csum;
-+ unsigned long long newcsum;
- int size = 256 + sb->max_dev*2;
-+ unsigned int *isuper = (unsigned int*)sb;
-+ int i;
-
- disk_csum = sb->sb_csum;
- sb->sb_csum = 0;
-- csum = csum_partial((void *)sb, size, 0);
-+ newcsum = 0;
-+ for (i=0; size>=4; size -= 4 )
-+ newcsum += le32_to_cpu(*isuper++);
-+
-+ if (size == 2)
-+ newcsum += le16_to_cpu(*(unsigned short*) isuper);
-+
-+ csum = (newcsum & 0xffffffff) + (newcsum >> 32);
- sb->sb_csum = disk_csum;
- return csum;
- }
-@@ -924,12 +988,12 @@ static void super_1_sync(mddev_t *mddev,
-
- max_dev = 0;
- ITERATE_RDEV(mddev,rdev2,tmp)
-- if (rdev2->desc_nr > max_dev)
-- max_dev = rdev2->desc_nr;
-+ if (rdev2->desc_nr+1 > max_dev)
-+ max_dev = rdev2->desc_nr+1;
-
- sb->max_dev = max_dev;
- for (i=0; i<max_dev;i++)
-- sb->dev_roles[max_dev] = cpu_to_le16(0xfffe);
-+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
-
- ITERATE_RDEV(mddev,rdev2,tmp) {
- i = rdev2->desc_nr;
-@@ -942,6 +1006,7 @@ static void super_1_sync(mddev_t *mddev,
- }
-
- sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
-+ sb->sb_csum = calc_sb_1_csum(sb);
- }
-
-
-@@ -1042,20 +1107,24 @@ static void unbind_rdev_from_array(mdk_r
- /*
- * prevent the device from being mounted, repartitioned or
- * otherwise reused by a RAID array (or any other kernel
-- * subsystem), by opening the device. [simply getting an
-- * inode is not enough, the SCSI module usage code needs
-- * an explicit open() on the device]
-+ * subsystem), by bd_claiming the device.
- */
- static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
- {
- int err = 0;
- struct block_device *bdev;
-+ char b[BDEVNAME_SIZE];
-
- bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
-- if (IS_ERR(bdev))
-+ if (IS_ERR(bdev)) {
-+ printk(KERN_ERR "md: could not open %s.\n",
-+ __bdevname(dev, b));
- return PTR_ERR(bdev);
-+ }
- err = bd_claim(bdev, rdev);
- if (err) {
-+ printk(KERN_ERR "md: could not bd_claim %s.\n",
-+ bdevname(bdev, b));
- blkdev_put(bdev);
- return err;
- }
-@@ -1117,10 +1186,7 @@ static void export_array(mddev_t *mddev)
-
- static void print_desc(mdp_disk_t *desc)
- {
-- char b[BDEVNAME_SIZE];
--
-- printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
-- __bdevname(MKDEV(desc->major, desc->minor), b),
-+ printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
- desc->major,desc->minor,desc->raid_disk,desc->state);
- }
-
-@@ -1312,8 +1378,7 @@ static mdk_rdev_t *md_import_device(dev_
-
- rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
- if (!rdev) {
-- printk(KERN_ERR "md: could not alloc mem for %s!\n",
-- __bdevname(newdev, b));
-+ printk(KERN_ERR "md: could not alloc mem for new device!\n");
- return ERR_PTR(-ENOMEM);
- }
- memset(rdev, 0, sizeof(*rdev));
-@@ -1322,11 +1387,9 @@ static mdk_rdev_t *md_import_device(dev_
- goto abort_free;
-
- err = lock_rdev(rdev, newdev);
-- if (err) {
-- printk(KERN_ERR "md: could not lock %s.\n",
-- __bdevname(newdev, b));
-+ if (err)
- goto abort_free;
-- }
-+
- rdev->desc_nr = -1;
- rdev->faulty = 0;
- rdev->in_sync = 0;
-@@ -1436,9 +1499,8 @@ static int analyze_sbs(mddev_t * mddev)
- goto abort;
- }
-
-- if ((mddev->recovery_cp != MaxSector) &&
-- ((mddev->level == 1) ||
-- ((mddev->level >= 4) && (mddev->level <= 6))))
-+ if (mddev->recovery_cp != MaxSector &&
-+ mddev->level >= 1)
- printk(KERN_ERR "md: %s: raid array is not clean"
- " -- starting background reconstruction\n",
- mdname(mddev));
-@@ -1615,6 +1677,8 @@ static int do_md_run(mddev_t * mddev)
- mddev->pers = pers[pnum];
- spin_unlock(&pers_lock);
-
-+ mddev->resync_max_sectors = mddev->size << 1; /* may be overridden by personality */
-+
- err = mddev->pers->run(mddev);
- if (err) {
- printk(KERN_ERR "md: pers->run() failed ...\n");
-@@ -1645,6 +1709,7 @@ static int do_md_run(mddev_t * mddev)
- */
- mddev->queue->queuedata = mddev;
- mddev->queue->make_request_fn = mddev->pers->make_request;
-+ mddev->queue->issue_flush_fn = md_flush_all;
-
- mddev->changed = 1;
- return 0;
-@@ -1881,11 +1946,9 @@ static int autostart_array(dev_t startde
- mdk_rdev_t *start_rdev = NULL, *rdev;
-
- start_rdev = md_import_device(startdev, 0, 0);
-- if (IS_ERR(start_rdev)) {
-- printk(KERN_WARNING "md: could not import %s!\n",
-- __bdevname(startdev, b));
-+ if (IS_ERR(start_rdev))
- return err;
-- }
-+
-
- /* NOTE: this can only work for 0.90.0 superblocks */
- sb = (mdp_super_t*)page_address(start_rdev->sb_page);
-@@ -1916,12 +1979,9 @@ static int autostart_array(dev_t startde
- if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
- continue;
- rdev = md_import_device(dev, 0, 0);
-- if (IS_ERR(rdev)) {
-- printk(KERN_WARNING "md: could not import %s,"
-- " trying to run array nevertheless.\n",
-- __bdevname(dev, b));
-+ if (IS_ERR(rdev))
- continue;
-- }
-+
- list_add(&rdev->same_set, &pending_raid_disks);
- }
-
-@@ -2153,42 +2213,6 @@ static int add_new_disk(mddev_t * mddev,
- return 0;
- }
-
--static int hot_generate_error(mddev_t * mddev, dev_t dev)
--{
-- char b[BDEVNAME_SIZE];
-- struct request_queue *q;
-- mdk_rdev_t *rdev;
--
-- if (!mddev->pers)
-- return -ENODEV;
--
-- printk(KERN_INFO "md: trying to generate %s error in %s ... \n",
-- __bdevname(dev, b), mdname(mddev));
--
-- rdev = find_rdev(mddev, dev);
-- if (!rdev) {
-- /* MD_BUG(); */ /* like hell - it's not a driver bug */
-- return -ENXIO;
-- }
--
-- if (rdev->desc_nr == -1) {
-- MD_BUG();
-- return -EINVAL;
-- }
-- if (!rdev->in_sync)
-- return -ENODEV;
--
-- q = bdev_get_queue(rdev->bdev);
-- if (!q) {
-- MD_BUG();
-- return -ENODEV;
-- }
-- printk(KERN_INFO "md: okay, generating error!\n");
--// q->oneshot_error = 1; // disabled for now
--
-- return 0;
--}
--
- static int hot_remove_disk(mddev_t * mddev, dev_t dev)
- {
- char b[BDEVNAME_SIZE];
-@@ -2197,9 +2221,6 @@ static int hot_remove_disk(mddev_t * mdd
- if (!mddev->pers)
- return -ENODEV;
-
-- printk(KERN_INFO "md: trying to remove %s from %s ... \n",
-- __bdevname(dev, b), mdname(mddev));
--
- rdev = find_rdev(mddev, dev);
- if (!rdev)
- return -ENXIO;
-@@ -2227,9 +2248,6 @@ static int hot_add_disk(mddev_t * mddev,
- if (!mddev->pers)
- return -ENODEV;
-
-- printk(KERN_INFO "md: trying to hot-add %s to %s ... \n",
-- __bdevname(dev, b), mdname(mddev));
--
- if (mddev->major_version != 0) {
- printk(KERN_WARNING "%s: HOT_ADD may only be used with"
- " version-0 superblocks.\n",
-@@ -2478,6 +2496,9 @@ static int set_disk_faulty(mddev_t *mdde
- {
- mdk_rdev_t *rdev;
-
-+ if (mddev->pers == NULL)
-+ return -ENODEV;
-+
- rdev = find_rdev(mddev, dev);
- if (!rdev)
- return -ENODEV;
-@@ -2489,7 +2510,6 @@ static int set_disk_faulty(mddev_t *mdde
- static int md_ioctl(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
- {
-- char b[BDEVNAME_SIZE];
- int err = 0;
- void __user *argp = (void __user *)arg;
- struct hd_geometry __user *loc = argp;
-@@ -2548,8 +2568,7 @@ static int md_ioctl(struct inode *inode,
- }
- err = autostart_array(new_decode_dev(arg));
- if (err) {
-- printk(KERN_WARNING "md: autostart %s failed!\n",
-- __bdevname(arg, b));
-+ printk(KERN_WARNING "md: autostart failed!\n");
- goto abort;
- }
- goto done;
-@@ -2690,9 +2709,7 @@ static int md_ioctl(struct inode *inode,
- err = add_new_disk(mddev, &info);
- goto done_unlock;
- }
-- case HOT_GENERATE_ERROR:
-- err = hot_generate_error(mddev, new_decode_dev(arg));
-- goto done_unlock;
-+
- case HOT_REMOVE_DISK:
- err = hot_remove_disk(mddev, new_decode_dev(arg));
- goto done_unlock;
-@@ -2876,7 +2893,7 @@ mdk_thread_t *md_register_thread(void (*
- return thread;
- }
-
--void md_interrupt_thread(mdk_thread_t *thread)
-+static void md_interrupt_thread(mdk_thread_t *thread)
- {
- if (!thread->tsk) {
- MD_BUG();
-@@ -2919,6 +2936,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t
- if (!mddev->pers->error_handler)
- return;
- mddev->pers->error_handler(mddev,rdev);
-+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
- }
-@@ -2951,7 +2969,11 @@ static void status_resync(struct seq_fil
- unsigned long max_blocks, resync, res, dt, db, rt;
-
- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
-- max_blocks = mddev->size;
-+
-+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-+ max_blocks = mddev->resync_max_sectors >> 1;
-+ else
-+ max_blocks = mddev->size;
-
- /*
- * Should not happen.
-@@ -3187,11 +3209,6 @@ int unregister_md_personality(int pnum)
- return 0;
- }
-
--void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
--{
-- rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
--}
--
- static int is_mddev_idle(mddev_t *mddev)
- {
- mdk_rdev_t * rdev;
-@@ -3204,8 +3221,12 @@ static int is_mddev_idle(mddev_t *mddev)
- struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
- curr_events = disk_stat_read(disk, read_sectors) +
- disk_stat_read(disk, write_sectors) -
-- disk->sync_io;
-- if ((curr_events - rdev->last_events) > 32) {
-+ atomic_read(&disk->sync_io);
-+ /* Allow some slack between the values of curr_events and last_events,
-+ * as there are some uninteresting races.
-+ * Note: the following is an unsigned comparison, so differences in the
-+ * range [-32, 32] are treated as idle.
-+ */
-+ if ((curr_events - rdev->last_events + 32) > 64) {
- rdev->last_events = curr_events;
- idle = 0;
- }
-@@ -3339,7 +3360,14 @@ static void md_do_sync(mddev_t *mddev)
- }
- } while (mddev->curr_resync < 2);
-
-- max_sectors = mddev->size << 1;
-+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-+ /* resync follows the size requested by the personality,
-+ * which defaults to the physical size, but can be the virtual size
-+ */
-+ max_sectors = mddev->resync_max_sectors;
-+ else
-+ /* recovery follows the physical size of devices */
-+ max_sectors = mddev->size << 1;
-
- printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
- printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-@@ -3372,10 +3400,12 @@ static void md_do_sync(mddev_t *mddev)
- init_waitqueue_head(&mddev->recovery_wait);
- last_check = 0;
-
-- if (j)
-+ if (j>2) {
- printk(KERN_INFO
- "md: resuming recovery of %s from checkpoint.\n",
- mdname(mddev));
-+ mddev->curr_resync = j;
-+ }
-
- while (j < max_sectors) {
- int sectors;
-@@ -3458,7 +3488,7 @@ static void md_do_sync(mddev_t *mddev)
-
- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
- mddev->curr_resync > 2 &&
-- mddev->curr_resync > mddev->recovery_cp) {
-+ mddev->curr_resync >= mddev->recovery_cp) {
- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- printk(KERN_INFO
- "md: checkpointing recovery of %s.\n",
-@@ -3697,7 +3727,6 @@ void md_autodetect_dev(dev_t dev)
-
- static void autostart_arrays(int part)
- {
-- char b[BDEVNAME_SIZE];
- mdk_rdev_t *rdev;
- int i;
-
-@@ -3707,11 +3736,9 @@ static void autostart_arrays(int part)
- dev_t dev = detected_devices[i];
-
- rdev = md_import_device(dev,0, 0);
-- if (IS_ERR(rdev)) {
-- printk(KERN_ALERT "md: could not import %s!\n",
-- __bdevname(dev, b));
-+ if (IS_ERR(rdev))
- continue;
-- }
-+
- if (rdev->faulty) {
- MD_BUG();
- continue;
-@@ -3762,7 +3789,6 @@ module_exit(md_exit)
- EXPORT_SYMBOL(register_md_personality);
- EXPORT_SYMBOL(unregister_md_personality);
- EXPORT_SYMBOL(md_error);
--EXPORT_SYMBOL(md_sync_acct);
- EXPORT_SYMBOL(md_done_sync);
- EXPORT_SYMBOL(md_write_start);
- EXPORT_SYMBOL(md_write_end);
-@@ -3771,6 +3797,5 @@ EXPORT_SYMBOL(md_register_thread);
- EXPORT_SYMBOL(md_unregister_thread);
- EXPORT_SYMBOL(md_wakeup_thread);
- EXPORT_SYMBOL(md_print_devices);
--EXPORT_SYMBOL(md_interrupt_thread);
- EXPORT_SYMBOL(md_check_recovery);
- MODULE_LICENSE("GPL");
-diff -pruN ./drivers/md.dm/multipath.c ./drivers/md/multipath.c
---- ./drivers/md.dm/multipath.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/multipath.c 2006-03-17 13:16:38.000000000 +0300
-@@ -99,12 +99,12 @@ static void multipath_reschedule_retry (
- * operation and are ready to return a success/failure code to the buffer
- * cache layer.
- */
--static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate)
-+static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
- {
- struct bio *bio = mp_bh->master_bio;
- multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
-
-- bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO);
-+ bio_endio(bio, bio->bi_size, err);
- mempool_free(mp_bh, conf->pool);
- }
-
-@@ -119,8 +119,8 @@ int multipath_end_request(struct bio *bi
- return 1;
-
- if (uptodate)
-- multipath_end_bh_io(mp_bh, uptodate);
-- else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) {
-+ multipath_end_bh_io(mp_bh, 0);
-+ else if (!bio_rw_ahead(bio)) {
- /*
- * oops, IO error:
- */
-@@ -131,7 +131,7 @@ int multipath_end_request(struct bio *bi
- (unsigned long long)bio->bi_sector);
- multipath_reschedule_retry(mp_bh);
- } else
-- multipath_end_bh_io(mp_bh, 0);
-+ multipath_end_bh_io(mp_bh, error);
- rdev_dec_pending(rdev, conf->mddev);
- return 0;
- }
-@@ -155,7 +155,7 @@ static void unplug_slaves(mddev_t *mddev
- r_queue->unplug_fn(r_queue);
-
- spin_lock_irqsave(&conf->device_lock, flags);
-- atomic_dec(&rdev->nr_pending);
-+ rdev_dec_pending(rdev, mddev);
- }
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
-@@ -217,6 +217,31 @@ static void multipath_status (struct seq
- seq_printf (seq, "]");
- }
-
-+static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ multipath_conf_t *conf = mddev_to_conf(mddev);
-+ int i, ret = 0;
-+
-+ for (i=0; i<mddev->raid_disks; i++) {
-+ mdk_rdev_t *rdev = conf->multipaths[i].rdev;
-+ if (rdev && !rdev->faulty) {
-+ struct block_device *bdev = rdev->bdev;
-+ request_queue_t *r_queue = bdev_get_queue(bdev);
-+
-+ if (!r_queue->issue_flush_fn) {
-+ ret = -EOPNOTSUPP;
-+ break;
-+ }
-+
-+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+ if (ret)
-+ break;
-+ }
-+ }
-+ return ret;
-+}
-
- /*
- * Careful, this can execute in IRQ contexts as well!
-@@ -300,7 +325,7 @@ static int multipath_add_disk(mddev_t *m
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- mddev->queue->max_sectors > (PAGE_SIZE>>9))
-- mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
- conf->working_disks++;
- rdev->raid_disk = path;
-@@ -377,7 +402,7 @@ static void multipathd (mddev_t *mddev)
- " error for block %llu\n",
- bdevname(bio->bi_bdev,b),
- (unsigned long long)bio->bi_sector);
-- multipath_end_bh_io(mp_bh, 0);
-+ multipath_end_bh_io(mp_bh, -EIO);
- } else {
- printk(KERN_ERR "multipath: %s: redirecting sector %llu"
- " to another IO path\n",
-@@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev
-
- mddev->queue->unplug_fn = multipath_unplug;
-
-+ mddev->queue->issue_flush_fn = multipath_issue_flush;
-+
- conf->working_disks = 0;
- ITERATE_RDEV(mddev,rdev,tmp) {
- disk_idx = rdev->raid_disk;
-@@ -452,7 +479,7 @@ static int multipath_run (mddev_t *mddev
- * a merge_bvec_fn to be involved in multipath */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- mddev->queue->max_sectors > (PAGE_SIZE>>9))
-- mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
- if (!rdev->faulty)
- conf->working_disks++;
-diff -pruN ./drivers/md.dm/raid0.c ./drivers/md/raid0.c
---- ./drivers/md.dm/raid0.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/raid0.c 2006-03-17 13:16:38.000000000 +0300
-@@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t
- }
- }
-
-+static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ raid0_conf_t *conf = mddev_to_conf(mddev);
-+ mdk_rdev_t **devlist = conf->strip_zone[0].dev;
-+ int i, ret = 0;
-+
-+ for (i=0; i<mddev->raid_disks; i++) {
-+ struct block_device *bdev = devlist[i]->bdev;
-+ request_queue_t *r_queue = bdev_get_queue(bdev);
-+
-+ if (!r_queue->issue_flush_fn) {
-+ ret = -EOPNOTSUPP;
-+ break;
-+ }
-+
-+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+ if (ret)
-+ break;
-+ }
-+ return ret;
-+}
-+
-+
- static int create_strip_zones (mddev_t *mddev)
- {
- int i, c, j;
-@@ -137,7 +162,7 @@ static int create_strip_zones (mddev_t *
-
- if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
- mddev->queue->max_sectors > (PAGE_SIZE>>9))
-- mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
- if (!smallest || (rdev1->size <smallest->size))
- smallest = rdev1;
-@@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t *
-
- mddev->queue->unplug_fn = raid0_unplug;
-
-+ mddev->queue->issue_flush_fn = raid0_issue_flush;
-+
- printk("raid0: done.\n");
- return 0;
- abort:
-diff -pruN ./drivers/md.dm/raid10.c ./drivers/md/raid10.c
---- ./drivers/md.dm/raid10.c 1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/raid10.c 2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,1780 @@
-+/*
-+ * raid10.c : Multiple Devices driver for Linux
-+ *
-+ * Copyright (C) 2000-2004 Neil Brown
-+ *
-+ * RAID-10 support for md.
-+ *
-+ * Based on code in raid1.c. See raid1.c for further copyright information.
-+ *
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2, or (at your option)
-+ * any later version.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * (for example /usr/src/linux/COPYING); if not, write to the Free
-+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-+ */
-+
-+#include <linux/raid/raid10.h>
-+
-+/*
-+ * RAID10 provides a combination of RAID0 and RAID1 functionality.
-+ * The layout of data is defined by
-+ * chunk_size
-+ * raid_disks
-+ * near_copies (stored in low byte of layout)
-+ * far_copies (stored in second byte of layout)
-+ *
-+ * The data to be stored is divided into chunks using chunksize.
-+ * Each device is divided into far_copies sections.
-+ * In each section, chunks are laid out in a style similar to raid0, but
-+ * near_copies copies of each chunk are stored (each on a different drive).
-+ * The starting device for each section is offset near_copies from the starting
-+ * device of the previous section.
-+ * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a different
-+ * drive.
-+ * near_copies and far_copies must be at least one, and their product is at most
-+ * raid_disks.
-+ */
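-+
-+/* For example (illustrative only): with raid_disks=4, near_copies=2 and
-+ * far_copies=1, chunk 0 is stored on drives 0 and 1, chunk 1 on drives 2
-+ * and 3, chunk 2 on drives 0 and 1 again, and so on - the familiar RAID1+0
-+ * arrangement.
-+ */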
-+
-+/*
-+ * Number of guaranteed r10bios in case of extreme VM load:
-+ */
-+#define NR_RAID10_BIOS 256
-+
-+static void unplug_slaves(mddev_t *mddev);
-+
-+static void * r10bio_pool_alloc(int gfp_flags, void *data)
-+{
-+ conf_t *conf = data;
-+ r10bio_t *r10_bio;
-+ int size = offsetof(struct r10bio_s, devs[conf->copies]);
-+
-+ /* allocate a r10bio with room for raid_disks entries in the bios array */
-+ r10_bio = kmalloc(size, gfp_flags);
-+ if (r10_bio)
-+ memset(r10_bio, 0, size);
-+ else
-+ unplug_slaves(conf->mddev);
-+
-+ return r10_bio;
-+}
-+
-+static void r10bio_pool_free(void *r10_bio, void *data)
-+{
-+ kfree(r10_bio);
-+}
-+
-+#define RESYNC_BLOCK_SIZE (64*1024)
-+//#define RESYNC_BLOCK_SIZE PAGE_SIZE
-+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-+#define RESYNC_WINDOW (2048*1024)
-+
-+/*
-+ * When performing a resync, we need to read and compare, so
-+ * we need as many pages as there are copies.
-+ * When performing a recovery, we need 2 bios, one for read,
-+ * one for write (we recover only one drive per r10buf)
-+ *
-+ */
-+static void * r10buf_pool_alloc(int gfp_flags, void *data)
-+{
-+ conf_t *conf = data;
-+ struct page *page;
-+ r10bio_t *r10_bio;
-+ struct bio *bio;
-+ int i, j;
-+ int nalloc;
-+
-+ r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-+ if (!r10_bio) {
-+ unplug_slaves(conf->mddev);
-+ return NULL;
-+ }
-+
-+ if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
-+ nalloc = conf->copies; /* resync */
-+ else
-+ nalloc = 2; /* recovery */
-+
-+ /*
-+ * Allocate bios.
-+ */
-+ for (j = nalloc ; j-- ; ) {
-+ bio = bio_alloc(gfp_flags, RESYNC_PAGES);
-+ if (!bio)
-+ goto out_free_bio;
-+ r10_bio->devs[j].bio = bio;
-+ }
-+ /*
-+ * Allocate RESYNC_PAGES data pages and attach them
-+ * where needed.
-+ */
-+ for (j = 0 ; j < nalloc; j++) {
-+ bio = r10_bio->devs[j].bio;
-+ for (i = 0; i < RESYNC_PAGES; i++) {
-+ page = alloc_page(gfp_flags);
-+ if (unlikely(!page))
-+ goto out_free_pages;
-+
-+ bio->bi_io_vec[i].bv_page = page;
-+ }
-+ }
-+
-+ return r10_bio;
-+
-+out_free_pages:
-+ for ( ; i > 0 ; i--)
-+ __free_page(bio->bi_io_vec[i-1].bv_page);
-+ while (j--)
-+ for (i = 0; i < RESYNC_PAGES ; i++)
-+ __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
-+ j = -1;
-+out_free_bio:
-+ while ( ++j < nalloc )
-+ bio_put(r10_bio->devs[j].bio);
-+ r10bio_pool_free(r10_bio, conf);
-+ return NULL;
-+}
-+
-+static void r10buf_pool_free(void *__r10_bio, void *data)
-+{
-+ int i;
-+ conf_t *conf = data;
-+ r10bio_t *r10bio = __r10_bio;
-+ int j;
-+
-+ for (j=0; j < conf->copies; j++) {
-+ struct bio *bio = r10bio->devs[j].bio;
-+ if (bio) {
-+ for (i = 0; i < RESYNC_PAGES; i++) {
-+ __free_page(bio->bi_io_vec[i].bv_page);
-+ bio->bi_io_vec[i].bv_page = NULL;
-+ }
-+ bio_put(bio);
-+ }
-+ }
-+ r10bio_pool_free(r10bio, conf);
-+}
-+
-+static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
-+{
-+ int i;
-+
-+ for (i = 0; i < conf->copies; i++) {
-+ struct bio **bio = & r10_bio->devs[i].bio;
-+ if (*bio)
-+ bio_put(*bio);
-+ *bio = NULL;
-+ }
-+}
-+
-+static inline void free_r10bio(r10bio_t *r10_bio)
-+{
-+ unsigned long flags;
-+
-+ conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+ /*
-+ * Wake up any possible resync thread that waits for the device
-+ * to go idle.
-+ */
-+ spin_lock_irqsave(&conf->resync_lock, flags);
-+ if (!--conf->nr_pending) {
-+ wake_up(&conf->wait_idle);
-+ wake_up(&conf->wait_resume);
-+ }
-+ spin_unlock_irqrestore(&conf->resync_lock, flags);
-+
-+ put_all_bios(conf, r10_bio);
-+ mempool_free(r10_bio, conf->r10bio_pool);
-+}
-+
-+static inline void put_buf(r10bio_t *r10_bio)
-+{
-+ conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+ unsigned long flags;
-+
-+ mempool_free(r10_bio, conf->r10buf_pool);
-+
-+ spin_lock_irqsave(&conf->resync_lock, flags);
-+ if (!conf->barrier)
-+ BUG();
-+ --conf->barrier;
-+ wake_up(&conf->wait_resume);
-+ wake_up(&conf->wait_idle);
-+
-+ if (!--conf->nr_pending) {
-+ wake_up(&conf->wait_idle);
-+ wake_up(&conf->wait_resume);
-+ }
-+ spin_unlock_irqrestore(&conf->resync_lock, flags);
-+}
-+
-+static void reschedule_retry(r10bio_t *r10_bio)
-+{
-+ unsigned long flags;
-+ mddev_t *mddev = r10_bio->mddev;
-+ conf_t *conf = mddev_to_conf(mddev);
-+
-+ spin_lock_irqsave(&conf->device_lock, flags);
-+ list_add(&r10_bio->retry_list, &conf->retry_list);
-+ spin_unlock_irqrestore(&conf->device_lock, flags);
-+
-+ md_wakeup_thread(mddev->thread);
-+}
-+
-+/*
-+ * raid_end_bio_io() is called when we have finished servicing a mirrored
-+ * operation and are ready to return a success/failure code to the buffer
-+ * cache layer.
-+ */
-+static void raid_end_bio_io(r10bio_t *r10_bio)
-+{
-+ struct bio *bio = r10_bio->master_bio;
-+
-+ bio_endio(bio, bio->bi_size,
-+ test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
-+ free_r10bio(r10_bio);
-+}
-+
-+/*
-+ * Update disk head position estimator based on IRQ completion info.
-+ */
-+static inline void update_head_pos(int slot, r10bio_t *r10_bio)
-+{
-+ conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+ conf->mirrors[r10_bio->devs[slot].devnum].head_position =
-+ r10_bio->devs[slot].addr + (r10_bio->sectors);
-+}
-+
-+static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+ r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+ int slot, dev;
-+ conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+ if (bio->bi_size)
-+ return 1;
-+
-+ slot = r10_bio->read_slot;
-+ dev = r10_bio->devs[slot].devnum;
-+ /*
-+ * this branch is our 'one mirror IO has finished' event handler:
-+ */
-+ if (!uptodate)
-+ md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-+ else
-+ /*
-+ * Set R10BIO_Uptodate in our master bio, so that
-+ * we will return a good error code to the higher
-+ * levels even if IO on some other mirrored buffer fails.
-+ *
-+ * The 'master' represents the composite IO operation to
-+ * user-side. So if something waits for IO, then it will
-+ * wait for the 'master' bio.
-+ */
-+ set_bit(R10BIO_Uptodate, &r10_bio->state);
-+
-+ update_head_pos(slot, r10_bio);
-+
-+ /*
-+ * we have only one bio on the read side
-+ */
-+ if (uptodate)
-+ raid_end_bio_io(r10_bio);
-+ else {
-+ /*
-+ * oops, read error:
-+ */
-+ char b[BDEVNAME_SIZE];
-+ if (printk_ratelimit())
-+ printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
-+ bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
-+ reschedule_retry(r10_bio);
-+ }
-+
-+ rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
-+ return 0;
-+}
-+
-+static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+ r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+ int slot, dev;
-+ conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+ if (bio->bi_size)
-+ return 1;
-+
-+ for (slot = 0; slot < conf->copies; slot++)
-+ if (r10_bio->devs[slot].bio == bio)
-+ break;
-+ dev = r10_bio->devs[slot].devnum;
-+
-+ /*
-+ * this branch is our 'one mirror IO has finished' event handler:
-+ */
-+ if (!uptodate)
-+ md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-+ else
-+ /*
-+ * Set R10BIO_Uptodate in our master bio, so that
-+ * we will return a good error code to the higher
-+ * levels even if IO on some other mirrored buffer fails.
-+ *
-+ * The 'master' represents the composite IO operation to
-+ * user-side. So if something waits for IO, then it will
-+ * wait for the 'master' bio.
-+ */
-+ set_bit(R10BIO_Uptodate, &r10_bio->state);
-+
-+ update_head_pos(slot, r10_bio);
-+
-+ /*
-+ *
-+ * Let's see if all mirrored write operations have finished
-+ * already.
-+ */
-+ if (atomic_dec_and_test(&r10_bio->remaining)) {
-+ md_write_end(r10_bio->mddev);
-+ raid_end_bio_io(r10_bio);
-+ }
-+
-+ rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
-+ return 0;
-+}
-+
-+
-+/*
-+ * RAID10 layout manager
-+ * As well as the chunksize and raid_disks count, there are two
-+ * parameters: near_copies and far_copies.
-+ * near_copies * far_copies must be <= raid_disks.
-+ * Normally one of these will be 1.
-+ * If both are 1, we get raid0.
-+ * If near_copies == raid_disks, we get raid1.
-+ *
-+ * Chunks are laid out in raid0 style with near_copies copies of the
-+ * first chunk, followed by near_copies copies of the next chunk and
-+ * so on.
-+ * If far_copies > 1, then after 1/far_copies of the array has been assigned
-+ * as described above, we start again with a device offset of near_copies.
-+ * So we effectively have another copy of the whole array further down all
-+ * the drives, but with blocks on different drives.
-+ * With this layout, a block is never stored twice on the same device.
-+ *
-+ * raid10_find_phys finds the sector offset of a given virtual sector
-+ * on each device that it is on. If a block isn't on a device,
-+ * that entry in the array is set to MaxSector.
-+ *
-+ * raid10_find_virt does the reverse mapping, from a device and a
-+ * sector offset to a virtual address
-+ */
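-+
-+/* Illustrative example: with near_copies=1, far_copies=2 and two drives,
-+ * the chunk at sector s on drive d is repeated at sector s + stride on
-+ * drive (d + 1) % 2, so the first section of each drive is mirrored in the
-+ * second section of the other drive.
-+ */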
-+
-+static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
-+{
-+ int n,f;
-+ sector_t sector;
-+ sector_t chunk;
-+ sector_t stripe;
-+ int dev;
-+
-+ int slot = 0;
-+
-+ /* now calculate first sector/dev */
-+ chunk = r10bio->sector >> conf->chunk_shift;
-+ sector = r10bio->sector & conf->chunk_mask;
-+
-+ chunk *= conf->near_copies;
-+ stripe = chunk;
-+ dev = sector_div(stripe, conf->raid_disks);
-+
-+ sector += stripe << conf->chunk_shift;
-+
-+ /* and calculate all the others */
-+ for (n=0; n < conf->near_copies; n++) {
-+ int d = dev;
-+ sector_t s = sector;
-+ r10bio->devs[slot].addr = sector;
-+ r10bio->devs[slot].devnum = d;
-+ slot++;
-+
-+ for (f = 1; f < conf->far_copies; f++) {
-+ d += conf->near_copies;
-+ if (d >= conf->raid_disks)
-+ d -= conf->raid_disks;
-+ s += conf->stride;
-+ r10bio->devs[slot].devnum = d;
-+ r10bio->devs[slot].addr = s;
-+ slot++;
-+ }
-+ dev++;
-+ if (dev >= conf->raid_disks) {
-+ dev = 0;
-+ sector += (conf->chunk_mask + 1);
-+ }
-+ }
-+ BUG_ON(slot != conf->copies);
-+}
-+
-+static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
-+{
-+ sector_t offset, chunk, vchunk;
-+
-+ while (sector > conf->stride) {
-+ sector -= conf->stride;
-+ if (dev < conf->near_copies)
-+ dev += conf->raid_disks - conf->near_copies;
-+ else
-+ dev -= conf->near_copies;
-+ }
-+
-+ offset = sector & conf->chunk_mask;
-+ chunk = sector >> conf->chunk_shift;
-+ vchunk = chunk * conf->raid_disks + dev;
-+ sector_div(vchunk, conf->near_copies);
-+ return (vchunk << conf->chunk_shift) + offset;
-+}
-+
-+/**
-+ * raid10_mergeable_bvec -- tell the bio layer whether two requests can be merged
-+ * @q: request queue
-+ * @bio: the buffer head that's been built up so far
-+ * @biovec: the request that could be merged to it.
-+ *
-+ * Return amount of bytes we can accept at this offset
-+ * If near_copies == raid_disks, there are no striping issues,
-+ * but in that case, the function isn't called at all.
-+ */
-+static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio,
-+ struct bio_vec *bio_vec)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
-+ int max;
-+ unsigned int chunk_sectors = mddev->chunk_size >> 9;
-+ unsigned int bio_sectors = bio->bi_size >> 9;
-+
-+ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
-+ if (max < 0) max = 0; /* bio_add cannot handle a negative return */
-+ if (max <= bio_vec->bv_len && bio_sectors == 0)
-+ return bio_vec->bv_len;
-+ else
-+ return max;
-+}
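-+
-+/* Worked example (illustrative): with 64KiB chunks (chunk_sectors = 128),
-+ * a bio whose first sector lies 96 sectors into a chunk and which already
-+ * holds 24 sectors may grow by at most (128 - 120) << 9 = 4096 bytes before
-+ * crossing the chunk boundary.
-+ */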
-+
-+/*
-+ * This routine returns the disk from which the requested read should
-+ * be done. There is a per-array 'next expected sequential IO' sector
-+ * number - if this matches on the next IO then we use the last disk.
-+ * There is also a per-disk 'last known head position' sector that is
-+ * maintained from IRQ contexts; both the normal and the resync IO
-+ * completion handlers update this position correctly. If there is no
-+ * perfect sequential match then we pick the disk whose head is closest.
-+ *
-+ * If there are 2 mirrors in the same 2 devices, performance degrades
-+ * because the head position is tracked per mirror, not per device.
-+ *
-+ * The rdev for the device selected will have nr_pending incremented.
-+ */
-+
-+/*
-+ * FIXME: possibly should rethink readbalancing and do it differently
-+ * depending on near_copies / far_copies geometry.
-+ */
-+static int read_balance(conf_t *conf, r10bio_t *r10_bio)
-+{
-+ const unsigned long this_sector = r10_bio->sector;
-+ int disk, slot, nslot;
-+ const int sectors = r10_bio->sectors;
-+ sector_t new_distance, current_distance;
-+
-+ raid10_find_phys(conf, r10_bio);
-+ spin_lock_irq(&conf->device_lock);
-+ /*
-+ * Check if we can balance. We can balance on the whole
-+ * device if no resync is going on, or below the resync window.
-+ * We take the first readable disk when above the resync window.
-+ */
-+ if (conf->mddev->recovery_cp < MaxSector
-+ && (this_sector + sectors >= conf->next_resync)) {
-+ /* make sure that disk is operational */
-+ slot = 0;
-+ disk = r10_bio->devs[slot].devnum;
-+
-+ while (!conf->mirrors[disk].rdev ||
-+ !conf->mirrors[disk].rdev->in_sync) {
-+ slot++;
-+ if (slot == conf->copies) {
-+ slot = 0;
-+ disk = -1;
-+ break;
-+ }
-+ disk = r10_bio->devs[slot].devnum;
-+ }
-+ goto rb_out;
-+ }
-+
-+
-+ /* make sure the disk is operational */
-+ slot = 0;
-+ disk = r10_bio->devs[slot].devnum;
-+ while (!conf->mirrors[disk].rdev ||
-+ !conf->mirrors[disk].rdev->in_sync) {
-+ slot ++;
-+ if (slot == conf->copies) {
-+ disk = -1;
-+ goto rb_out;
-+ }
-+ disk = r10_bio->devs[slot].devnum;
-+ }
-+
-+
-+ current_distance = abs(this_sector - conf->mirrors[disk].head_position);
-+
-+ /* Find the disk whose head is closest */
-+
-+ for (nslot = slot; nslot < conf->copies; nslot++) {
-+ int ndisk = r10_bio->devs[nslot].devnum;
-+
-+
-+ if (!conf->mirrors[ndisk].rdev ||
-+ !conf->mirrors[ndisk].rdev->in_sync)
-+ continue;
-+
-+ if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) {
-+ disk = ndisk;
-+ slot = nslot;
-+ break;
-+ }
-+ new_distance = abs(r10_bio->devs[nslot].addr -
-+ conf->mirrors[ndisk].head_position);
-+ if (new_distance < current_distance) {
-+ current_distance = new_distance;
-+ disk = ndisk;
-+ slot = nslot;
-+ }
-+ }
-+
-+rb_out:
-+ r10_bio->read_slot = slot;
-+/* conf->next_seq_sect = this_sector + sectors;*/
-+
-+ if (disk >= 0 && conf->mirrors[disk].rdev)
-+ atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-+ spin_unlock_irq(&conf->device_lock);
-+
-+ return disk;
-+}
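-+
-+/* Roughly, the policy above is: while the array is not fully in sync and the
-+ * request reaches past the resync point, the first operational mirror is
-+ * used; otherwise an idle mirror (nr_pending == 0) is preferred, falling
-+ * back to the mirror whose recorded head position is closest to the request.
-+ */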
-+
-+static void unplug_slaves(mddev_t *mddev)
-+{
-+ conf_t *conf = mddev_to_conf(mddev);
-+ int i;
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&conf->device_lock, flags);
-+ for (i=0; i<mddev->raid_disks; i++) {
-+ mdk_rdev_t *rdev = conf->mirrors[i].rdev;
-+ if (rdev && atomic_read(&rdev->nr_pending)) {
-+ request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
-+
-+ atomic_inc(&rdev->nr_pending);
-+ spin_unlock_irqrestore(&conf->device_lock, flags);
-+
-+ if (r_queue->unplug_fn)
-+ r_queue->unplug_fn(r_queue);
-+
-+ spin_lock_irqsave(&conf->device_lock, flags);
-+ rdev_dec_pending(rdev, mddev);
-+ }
-+ }
-+ spin_unlock_irqrestore(&conf->device_lock, flags);
-+}
-+static void raid10_unplug(request_queue_t *q)
-+{
-+ unplug_slaves(q->queuedata);
-+}
-+
-+static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ conf_t *conf = mddev_to_conf(mddev);
-+ unsigned long flags;
-+ int i, ret = 0;
-+
-+ spin_lock_irqsave(&conf->device_lock, flags);
-+ for (i=0; i<mddev->raid_disks; i++) {
-+ mdk_rdev_t *rdev = conf->mirrors[i].rdev;
-+ if (rdev && !rdev->faulty) {
-+ struct block_device *bdev = rdev->bdev;
-+ request_queue_t *r_queue = bdev_get_queue(bdev);
-+
-+ if (r_queue->issue_flush_fn) {
-+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+ if (ret)
-+ break;
-+ }
-+ }
-+ }
-+ spin_unlock_irqrestore(&conf->device_lock, flags);
-+ return ret;
-+}
-+
-+/*
-+ * Throttle resync depth, so that we get proper overlapping of
-+ * requests but are still able to handle normal requests quickly.
-+ */
-+#define RESYNC_DEPTH 32
-+
-+static void device_barrier(conf_t *conf, sector_t sect)
-+{
-+ spin_lock_irq(&conf->resync_lock);
-+ wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
-+ conf->resync_lock, unplug_slaves(conf->mddev));
-+
-+ if (!conf->barrier++) {
-+ wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-+ conf->resync_lock, unplug_slaves(conf->mddev));
-+ if (conf->nr_pending)
-+ BUG();
-+ }
-+ wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
-+ conf->resync_lock, unplug_slaves(conf->mddev));
-+ conf->next_resync = sect;
-+ spin_unlock_irq(&conf->resync_lock);
-+}
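-+
-+/* Note: the first caller to raise the barrier waits for all pending normal
-+ * IO to drain (nr_pending == 0); make_request() below blocks new IO while
-+ * any barrier is up, and RESYNC_DEPTH bounds how many resync barriers may
-+ * be outstanding at once.
-+ */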
-+
-+static int make_request(request_queue_t *q, struct bio * bio)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ conf_t *conf = mddev_to_conf(mddev);
-+ mirror_info_t *mirror;
-+ r10bio_t *r10_bio;
-+ struct bio *read_bio;
-+ int i;
-+ int chunk_sects = conf->chunk_mask + 1;
-+
-+ /* If this request crosses a chunk boundary, we need to
-+ * split it. This will only happen for 1 PAGE (or less) requests.
-+ */
-+ if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
-+ > chunk_sects &&
-+ conf->near_copies < conf->raid_disks)) {
-+ struct bio_pair *bp;
-+ /* Sanity check -- queue functions should prevent this happening */
-+ if (bio->bi_vcnt != 1 ||
-+ bio->bi_idx != 0)
-+ goto bad_map;
-+ /* This is a one page bio that upper layers
-+ * refuse to split for us, so we need to split it.
-+ */
-+ bp = bio_split(bio, bio_split_pool,
-+ chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
-+ if (make_request(q, &bp->bio1))
-+ generic_make_request(&bp->bio1);
-+ if (make_request(q, &bp->bio2))
-+ generic_make_request(&bp->bio2);
-+
-+ bio_pair_release(bp);
-+ return 0;
-+ bad_map:
-+ printk("raid10_make_request bug: can't convert block across chunks"
-+ " or bigger than %dk %llu %d\n", chunk_sects/2,
-+ (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
-+
-+ bio_io_error(bio, bio->bi_size);
-+ return 0;
-+ }
-+
-+ /*
-+ * Register the new request and wait if the reconstruction
-+ * thread has put up a bar for new requests.
-+ * Continue immediately if no resync is active currently.
-+ */
-+ spin_lock_irq(&conf->resync_lock);
-+ wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
-+ conf->nr_pending++;
-+ spin_unlock_irq(&conf->resync_lock);
-+
-+ if (bio_data_dir(bio)==WRITE) {
-+ disk_stat_inc(mddev->gendisk, writes);
-+ disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
-+ } else {
-+ disk_stat_inc(mddev->gendisk, reads);
-+ disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
-+ }
-+
-+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-+
-+ r10_bio->master_bio = bio;
-+ r10_bio->sectors = bio->bi_size >> 9;
-+
-+ r10_bio->mddev = mddev;
-+ r10_bio->sector = bio->bi_sector;
-+
-+ if (bio_data_dir(bio) == READ) {
-+ /*
-+ * read balancing logic:
-+ */
-+ int disk = read_balance(conf, r10_bio);
-+ int slot = r10_bio->read_slot;
-+ if (disk < 0) {
-+ raid_end_bio_io(r10_bio);
-+ return 0;
-+ }
-+ mirror = conf->mirrors + disk;
-+
-+ read_bio = bio_clone(bio, GFP_NOIO);
-+
-+ r10_bio->devs[slot].bio = read_bio;
-+
-+ read_bio->bi_sector = r10_bio->devs[slot].addr +
-+ mirror->rdev->data_offset;
-+ read_bio->bi_bdev = mirror->rdev->bdev;
-+ read_bio->bi_end_io = raid10_end_read_request;
-+ read_bio->bi_rw = READ;
-+ read_bio->bi_private = r10_bio;
-+
-+ generic_make_request(read_bio);
-+ return 0;
-+ }
-+
-+ /*
-+ * WRITE:
-+ */
-+ /* first select target devices under spinlock and
-+ * inc refcount on their rdev. Record them by setting
-+ * bios[x] to bio
-+ */
-+ raid10_find_phys(conf, r10_bio);
-+ spin_lock_irq(&conf->device_lock);
-+ for (i = 0; i < conf->copies; i++) {
-+ int d = r10_bio->devs[i].devnum;
-+ if (conf->mirrors[d].rdev &&
-+ !conf->mirrors[d].rdev->faulty) {
-+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+ r10_bio->devs[i].bio = bio;
-+ } else
-+ r10_bio->devs[i].bio = NULL;
-+ }
-+ spin_unlock_irq(&conf->device_lock);
-+
-+ atomic_set(&r10_bio->remaining, 1);
-+ md_write_start(mddev);
-+ for (i = 0; i < conf->copies; i++) {
-+ struct bio *mbio;
-+ int d = r10_bio->devs[i].devnum;
-+ if (!r10_bio->devs[i].bio)
-+ continue;
-+
-+ mbio = bio_clone(bio, GFP_NOIO);
-+ r10_bio->devs[i].bio = mbio;
-+
-+ mbio->bi_sector = r10_bio->devs[i].addr+
-+ conf->mirrors[d].rdev->data_offset;
-+ mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
-+ mbio->bi_end_io = raid10_end_write_request;
-+ mbio->bi_rw = WRITE;
-+ mbio->bi_private = r10_bio;
-+
-+ atomic_inc(&r10_bio->remaining);
-+ generic_make_request(mbio);
-+ }
-+
-+ if (atomic_dec_and_test(&r10_bio->remaining)) {
-+ md_write_end(mddev);
-+ raid_end_bio_io(r10_bio);
-+ }
-+
-+ return 0;
-+}
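-+
-+/* Note on the write path above: the master bio is cloned once per usable
-+ * mirror; r10_bio->remaining starts at 1 so that the final decrement in
-+ * make_request() itself completes the master bio if every clone has already
-+ * finished (or if no mirror was usable).
-+ */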
-+
-+static void status(struct seq_file *seq, mddev_t *mddev)
-+{
-+ conf_t *conf = mddev_to_conf(mddev);
-+ int i;
-+
-+ if (conf->near_copies < conf->raid_disks)
-+ seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
-+ if (conf->near_copies > 1)
-+ seq_printf(seq, " %d near-copies", conf->near_copies);
-+ if (conf->far_copies > 1)
-+ seq_printf(seq, " %d far-copies", conf->far_copies);
-+
-+ seq_printf(seq, " [%d/%d] [", conf->raid_disks,
-+ conf->working_disks);
-+ for (i = 0; i < conf->raid_disks; i++)
-+ seq_printf(seq, "%s",
-+ conf->mirrors[i].rdev &&
-+ conf->mirrors[i].rdev->in_sync ? "U" : "_");
-+ seq_printf(seq, "]");
-+}
-+
-+static void error(mddev_t *mddev, mdk_rdev_t *rdev)
-+{
-+ char b[BDEVNAME_SIZE];
-+ conf_t *conf = mddev_to_conf(mddev);
-+
-+ /*
-+ * If it is not operational, then we have already marked it as dead
-+ * else if it is the last working disk, ignore the error, let the
-+ * next level up know.
-+ * else mark the drive as failed
-+ */
-+ if (rdev->in_sync
-+ && conf->working_disks == 1)
-+ /*
-+ * Don't fail the drive, just return an IO error.
-+ * The test should really be more sophisticated than
-+ * "working_disks == 1", but it isn't critical, and
-+ * can wait until we do more sophisticated "is the drive
-+ * really dead" tests...
-+ */
-+ return;
-+ if (rdev->in_sync) {
-+ mddev->degraded++;
-+ conf->working_disks--;
-+ /*
-+ * if recovery is running, make sure it aborts.
-+ */
-+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
-+ }
-+ rdev->in_sync = 0;
-+ rdev->faulty = 1;
-+ mddev->sb_dirty = 1;
-+ printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n"
-+ " Operation continuing on %d devices\n",
-+ bdevname(rdev->bdev,b), conf->working_disks);
-+}
-+
-+static void print_conf(conf_t *conf)
-+{
-+ int i;
-+ mirror_info_t *tmp;
-+
-+ printk("RAID10 conf printout:\n");
-+ if (!conf) {
-+ printk("(!conf)\n");
-+ return;
-+ }
-+ printk(" --- wd:%d rd:%d\n", conf->working_disks,
-+ conf->raid_disks);
-+
-+ for (i = 0; i < conf->raid_disks; i++) {
-+ char b[BDEVNAME_SIZE];
-+ tmp = conf->mirrors + i;
-+ if (tmp->rdev)
-+ printk(" disk %d, wo:%d, o:%d, dev:%s\n",
-+ i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
-+ bdevname(tmp->rdev->bdev,b));
-+ }
-+}
-+
-+static void close_sync(conf_t *conf)
-+{
-+ spin_lock_irq(&conf->resync_lock);
-+ wait_event_lock_irq(conf->wait_resume, !conf->barrier,
-+ conf->resync_lock, unplug_slaves(conf->mddev));
-+ spin_unlock_irq(&conf->resync_lock);
-+
-+ if (conf->barrier) BUG();
-+ if (waitqueue_active(&conf->wait_idle)) BUG();
-+
-+ mempool_destroy(conf->r10buf_pool);
-+ conf->r10buf_pool = NULL;
-+}
-+
-+static int raid10_spare_active(mddev_t *mddev)
-+{
-+ int i;
-+ conf_t *conf = mddev->private;
-+ mirror_info_t *tmp;
-+
-+ spin_lock_irq(&conf->device_lock);
-+ /*
-+ * Find all non-in_sync disks within the RAID10 configuration
-+ * and mark them in_sync
-+ */
-+ for (i = 0; i < conf->raid_disks; i++) {
-+ tmp = conf->mirrors + i;
-+ if (tmp->rdev
-+ && !tmp->rdev->faulty
-+ && !tmp->rdev->in_sync) {
-+ conf->working_disks++;
-+ mddev->degraded--;
-+ tmp->rdev->in_sync = 1;
-+ }
-+ }
-+ spin_unlock_irq(&conf->device_lock);
-+
-+ print_conf(conf);
-+ return 0;
-+}
-+
-+
-+static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
-+{
-+ conf_t *conf = mddev->private;
-+ int found = 0;
-+ int mirror;
-+ mirror_info_t *p;
-+
-+ if (mddev->recovery_cp < MaxSector)
-+ /* only hot-add to in-sync arrays, as recovery is
-+ * very different from resync
-+ */
-+ return 0;
-+ spin_lock_irq(&conf->device_lock);
-+ for (mirror=0; mirror < mddev->raid_disks; mirror++)
-+ if ( !(p=conf->mirrors+mirror)->rdev) {
-+ p->rdev = rdev;
-+
-+ blk_queue_stack_limits(mddev->queue,
-+ rdev->bdev->bd_disk->queue);
-+ /* as we don't honour merge_bvec_fn, we must never risk
-+ * violating it, so limit ->max_sectors to one PAGE, as
-+ * a one page request is never in violation.
-+ */
-+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-+ mddev->queue->max_sectors > (PAGE_SIZE>>9))
-+ mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+
-+ p->head_position = 0;
-+ rdev->raid_disk = mirror;
-+ found = 1;
-+ break;
-+ }
-+ spin_unlock_irq(&conf->device_lock);
-+
-+ print_conf(conf);
-+ return found;
-+}
-+
-+static int raid10_remove_disk(mddev_t *mddev, int number)
-+{
-+ conf_t *conf = mddev->private;
-+ int err = 1;
-+ mirror_info_t *p = conf->mirrors+ number;
-+
-+ print_conf(conf);
-+ spin_lock_irq(&conf->device_lock);
-+ if (p->rdev) {
-+ if (p->rdev->in_sync ||
-+ atomic_read(&p->rdev->nr_pending)) {
-+ err = -EBUSY;
-+ goto abort;
-+ }
-+ p->rdev = NULL;
-+ err = 0;
-+ }
-+ if (err)
-+ MD_BUG();
-+abort:
-+ spin_unlock_irq(&conf->device_lock);
-+
-+ print_conf(conf);
-+ return err;
-+}
-+
-+
-+static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+ r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+ conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+ int i,d;
-+
-+ if (bio->bi_size)
-+ return 1;
-+
-+ for (i=0; i<conf->copies; i++)
-+ if (r10_bio->devs[i].bio == bio)
-+ break;
-+ if (i == conf->copies)
-+ BUG();
-+ update_head_pos(i, r10_bio);
-+ d = r10_bio->devs[i].devnum;
-+ if (!uptodate)
-+ md_error(r10_bio->mddev,
-+ conf->mirrors[d].rdev);
-+
-+ /* for reconstruct, we always reschedule after a read.
-+ * for resync, only after all reads
-+ */
-+ if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
-+ atomic_dec_and_test(&r10_bio->remaining)) {
-+ /* we have read all the blocks,
-+ * do the comparison in process context in raid10d
-+ */
-+ reschedule_retry(r10_bio);
-+ }
-+ rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
-+ return 0;
-+}
-+
-+static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+ r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+ mddev_t *mddev = r10_bio->mddev;
-+ conf_t *conf = mddev_to_conf(mddev);
-+ int i,d;
-+
-+ if (bio->bi_size)
-+ return 1;
-+
-+ for (i = 0; i < conf->copies; i++)
-+ if (r10_bio->devs[i].bio == bio)
-+ break;
-+ d = r10_bio->devs[i].devnum;
-+
-+ if (!uptodate)
-+ md_error(mddev, conf->mirrors[d].rdev);
-+ update_head_pos(i, r10_bio);
-+
-+ while (atomic_dec_and_test(&r10_bio->remaining)) {
-+ if (r10_bio->master_bio == NULL) {
-+ /* the primary of several recovery bios */
-+ md_done_sync(mddev, r10_bio->sectors, 1);
-+ put_buf(r10_bio);
-+ break;
-+ } else {
-+ r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
-+ put_buf(r10_bio);
-+ r10_bio = r10_bio2;
-+ }
-+ }
-+ rdev_dec_pending(conf->mirrors[d].rdev, mddev);
-+ return 0;
-+}
-+
-+/*
-+ * Note: sync and recovery are handled very differently for raid10.
-+ * This code is for resync.
-+ * For resync, we read through virtual addresses and read all blocks.
-+ * If there is any error, we schedule a write. The lowest numbered
-+ * drive is authoritative.
-+ * However, requests come in for physical addresses, so we need to map.
-+ * For every physical address there are raid_disks/copies virtual addresses,
-+ * which is always at least one, but is not necessarily an integer.
-+ * This means that a physical address can span multiple chunks, so we may
-+ * have to submit multiple io requests for a single sync request.
-+ */
-+/*
-+ * We check if all blocks are in-sync and only write to blocks that
-+ * aren't in sync
-+ */
-+static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
-+{
-+ conf_t *conf = mddev_to_conf(mddev);
-+ int i, first;
-+ struct bio *tbio, *fbio;
-+
-+ atomic_set(&r10_bio->remaining, 1);
-+
-+ /* find the first device with a block */
-+ for (i=0; i<conf->copies; i++)
-+ if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-+ break;
-+
-+ if (i == conf->copies)
-+ goto done;
-+
-+ first = i;
-+ fbio = r10_bio->devs[i].bio;
-+
-+ /* now find blocks with errors */
-+ for (i=first+1 ; i < conf->copies ; i++) {
-+ int vcnt, j, d;
-+
-+ if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-+ continue;
-+ /* We know that the bi_io_vec layout is the same for
-+ * both 'first' and 'i', so we just compare them.
-+ * All vec entries are PAGE_SIZE;
-+ */
-+ tbio = r10_bio->devs[i].bio;
-+ vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
-+ for (j = 0; j < vcnt; j++)
-+ if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-+ page_address(tbio->bi_io_vec[j].bv_page),
-+ PAGE_SIZE))
-+ break;
-+ if (j == vcnt)
-+ continue;
-+ /* Ok, we need to write this bio.
-+ * First we need to fix up bv_offset, bv_len and
-+ * bi_vecs, as the read request might have corrupted these
-+ */
-+ tbio->bi_vcnt = vcnt;
-+ tbio->bi_size = r10_bio->sectors << 9;
-+ tbio->bi_idx = 0;
-+ tbio->bi_phys_segments = 0;
-+ tbio->bi_hw_segments = 0;
-+ tbio->bi_hw_front_size = 0;
-+ tbio->bi_hw_back_size = 0;
-+ tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-+ tbio->bi_flags |= 1 << BIO_UPTODATE;
-+ tbio->bi_next = NULL;
-+ tbio->bi_rw = WRITE;
-+ tbio->bi_private = r10_bio;
-+ tbio->bi_sector = r10_bio->devs[i].addr;
-+
-+ for (j=0; j < vcnt ; j++) {
-+ tbio->bi_io_vec[j].bv_offset = 0;
-+ tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
-+
-+ memcpy(page_address(tbio->bi_io_vec[j].bv_page),
-+ page_address(fbio->bi_io_vec[j].bv_page),
-+ PAGE_SIZE);
-+ }
-+ tbio->bi_end_io = end_sync_write;
-+
-+ d = r10_bio->devs[i].devnum;
-+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+ atomic_inc(&r10_bio->remaining);
-+ md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
-+
-+ generic_make_request(tbio);
-+ }
-+
-+done:
-+ if (atomic_dec_and_test(&r10_bio->remaining)) {
-+ md_done_sync(mddev, r10_bio->sectors, 1);
-+ put_buf(r10_bio);
-+ }
-+}
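-+
-+/* In short: the first copy that was read back successfully is taken as
-+ * authoritative; any other successfully read copy whose pages differ from
-+ * it is rewritten with the authoritative pages.
-+ */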
-+
-+/*
-+ * Now for the recovery code.
-+ * Recovery happens across physical sectors.
-+ * We recover all non-in_sync drives by finding the virtual address of
-+ * each, and then choosing a working drive that also has that virt address.
-+ * There is a separate r10_bio for each non-in_sync drive.
-+ * Only the first two slots are in use: the first for reading,
-+ * the second for writing.
-+ *
-+ */
-+
-+static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
-+{
-+ conf_t *conf = mddev_to_conf(mddev);
-+ int i, d;
-+ struct bio *bio, *wbio;
-+
-+
-+ /* move the pages across to the second bio
-+ * and submit the write request
-+ */
-+ bio = r10_bio->devs[0].bio;
-+ wbio = r10_bio->devs[1].bio;
-+ for (i=0; i < wbio->bi_vcnt; i++) {
-+ struct page *p = bio->bi_io_vec[i].bv_page;
-+ bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
-+ wbio->bi_io_vec[i].bv_page = p;
-+ }
-+ d = r10_bio->devs[1].devnum;
-+
-+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+ md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-+ generic_make_request(wbio);
-+}
-+
-+
-+/*
-+ * This is a kernel thread which:
-+ *
-+ * 1. Retries failed read operations on working mirrors.
-+ * 2. Updates the raid superblock when problems are encountered.
-+ * 3. Performs writes following reads for array synchronising.
-+ */
-+
-+static void raid10d(mddev_t *mddev)
-+{
-+ r10bio_t *r10_bio;
-+ struct bio *bio;
-+ unsigned long flags;
-+ conf_t *conf = mddev_to_conf(mddev);
-+ struct list_head *head = &conf->retry_list;
-+ int unplug=0;
-+ mdk_rdev_t *rdev;
-+
-+ md_check_recovery(mddev);
-+ md_handle_safemode(mddev);
-+
-+ for (;;) {
-+ char b[BDEVNAME_SIZE];
-+ spin_lock_irqsave(&conf->device_lock, flags);
-+ if (list_empty(head))
-+ break;
-+ r10_bio = list_entry(head->prev, r10bio_t, retry_list);
-+ list_del(head->prev);
-+ spin_unlock_irqrestore(&conf->device_lock, flags);
-+
-+ mddev = r10_bio->mddev;
-+ conf = mddev_to_conf(mddev);
-+ if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
-+ sync_request_write(mddev, r10_bio);
-+ unplug = 1;
-+ } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
-+ recovery_request_write(mddev, r10_bio);
-+ unplug = 1;
-+ } else {
-+ int mirror;
-+ bio = r10_bio->devs[r10_bio->read_slot].bio;
-+ r10_bio->devs[r10_bio->read_slot].bio = NULL;
-+ bio_put(bio);
-+ mirror = read_balance(conf, r10_bio);
-+ if (mirror == -1) {
-+ printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
-+ " read error for block %llu\n",
-+ bdevname(bio->bi_bdev,b),
-+ (unsigned long long)r10_bio->sector);
-+ raid_end_bio_io(r10_bio);
-+ } else {
-+ rdev = conf->mirrors[mirror].rdev;
-+ if (printk_ratelimit())
-+ printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
-+ " another mirror\n",
-+ bdevname(rdev->bdev,b),
-+ (unsigned long long)r10_bio->sector);
-+ bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
-+ r10_bio->devs[r10_bio->read_slot].bio = bio;
-+ bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
-+ + rdev->data_offset;
-+ bio->bi_bdev = rdev->bdev;
-+ bio->bi_rw = READ;
-+ bio->bi_private = r10_bio;
-+ bio->bi_end_io = raid10_end_read_request;
-+ unplug = 1;
-+ generic_make_request(bio);
-+ }
-+ }
-+ }
-+ spin_unlock_irqrestore(&conf->device_lock, flags);
-+ if (unplug)
-+ unplug_slaves(mddev);
-+}
-+
-+
-+static int init_resync(conf_t *conf)
-+{
-+ int buffs;
-+
-+ buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
-+ if (conf->r10buf_pool)
-+ BUG();
-+ conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
-+ if (!conf->r10buf_pool)
-+ return -ENOMEM;
-+ conf->next_resync = 0;
-+ return 0;
-+}
-+
-+/*
-+ * perform a "sync" on one "block"
-+ *
-+ * We need to make sure that no normal I/O request - particularly write
-+ * requests - conflict with active sync requests.
-+ *
-+ * This is achieved by tracking pending requests and a 'barrier' concept
-+ * that can be installed to exclude normal IO requests.
-+ *
-+ * Resync and recovery are handled very differently.
-+ * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
-+ *
-+ * For resync, we iterate over virtual addresses, read all copies,
-+ * and update if there are differences. If only one copy is live,
-+ * skip it.
-+ * For recovery, we iterate over physical addresses, read a good
-+ * value for each non-in_sync drive, and over-write.
-+ *
-+ * So, for recovery we may have several outstanding complex requests for a
-+ * given address, one for each out-of-sync device. We model this by allocating
-+ * a number of r10_bio structures, one for each out-of-sync device.
-+ * As we set up these structures, we collect all bios together into a list
-+ * which we then process collectively to add pages, and then process again
-+ * to pass to generic_make_request.
-+ *
-+ * The r10_bio structures are linked using a borrowed master_bio pointer.
-+ * This link is counted in ->remaining. When the r10_bio that points to NULL
-+ * has its remaining count decremented to 0, the whole complex operation
-+ * is complete.
-+ *
-+ */
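-+
-+/* Illustration of the recovery chain: each newly allocated r10_bio points
-+ * back to the previously allocated one through master_bio and takes a
-+ * reference on it, so end_sync_write() walks the chain as counts drop and
-+ * md_done_sync() is only called for the r10_bio whose master_bio is NULL.
-+ */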
-+
-+static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
-+{
-+ conf_t *conf = mddev_to_conf(mddev);
-+ r10bio_t *r10_bio;
-+ struct bio *biolist = NULL, *bio;
-+ sector_t max_sector, nr_sectors;
-+ int disk;
-+ int i;
-+
-+ sector_t sectors_skipped = 0;
-+ int chunks_skipped = 0;
-+
-+ if (!conf->r10buf_pool)
-+ if (init_resync(conf))
-+ return -ENOMEM;
-+
-+ skipped:
-+ max_sector = mddev->size << 1;
-+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-+ max_sector = mddev->resync_max_sectors;
-+ if (sector_nr >= max_sector) {
-+ close_sync(conf);
-+ return sectors_skipped;
-+ }
-+ if (chunks_skipped >= conf->raid_disks) {
-+ /* if there has been nothing to do on any drive,
-+ * then there is nothing to do at all..
-+ */
-+ sector_t sec = max_sector - sector_nr;
-+ md_done_sync(mddev, sec, 1);
-+ return sec + sectors_skipped;
-+ }
-+
-+ /* make sure whole request will fit in a chunk - if chunks
-+ * are meaningful
-+ */
-+ if (conf->near_copies < conf->raid_disks &&
-+ max_sector > (sector_nr | conf->chunk_mask))
-+ max_sector = (sector_nr | conf->chunk_mask) + 1;
-+ /*
-+ * If there is non-resync activity waiting for us then
-+ * put in a delay to throttle resync.
-+ */
-+ if (!go_faster && waitqueue_active(&conf->wait_resume))
-+ schedule_timeout(HZ);
-+ device_barrier(conf, sector_nr + RESYNC_SECTORS);
-+
-+ /* Again, very different code for resync and recovery.
-+ * Both must result in an r10bio with a list of bios that
-+ * have bi_end_io, bi_sector, bi_bdev set,
-+ * and bi_private set to the r10bio.
-+ * For recovery, we may actually create several r10bios
-+ * with 2 bios in each, that correspond to the bios in the main one.
-+ * In this case, the subordinate r10bios link back through a
-+ * borrowed master_bio pointer, and the counter in the master
-+ * includes a ref from each subordinate.
-+ */
-+ /* First, we decide what to do and set ->bi_end_io
-+ * to end_sync_read if we want to read, and
-+ * to end_sync_write if we will want to write.
-+ */
-+
-+ if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-+ /* recovery... the complicated one */
-+ int i, j, k;
-+ r10_bio = NULL;
-+
-+ for (i=0 ; i<conf->raid_disks; i++)
-+ if (conf->mirrors[i].rdev &&
-+ !conf->mirrors[i].rdev->in_sync) {
-+ /* want to reconstruct this device */
-+ r10bio_t *rb2 = r10_bio;
-+
-+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-+ spin_lock_irq(&conf->resync_lock);
-+ conf->nr_pending++;
-+ if (rb2) conf->barrier++;
-+ spin_unlock_irq(&conf->resync_lock);
-+ atomic_set(&r10_bio->remaining, 0);
-+
-+ r10_bio->master_bio = (struct bio*)rb2;
-+ if (rb2)
-+ atomic_inc(&rb2->remaining);
-+ r10_bio->mddev = mddev;
-+ set_bit(R10BIO_IsRecover, &r10_bio->state);
-+ r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
-+ raid10_find_phys(conf, r10_bio);
-+ for (j=0; j<conf->copies;j++) {
-+ int d = r10_bio->devs[j].devnum;
-+ if (conf->mirrors[d].rdev &&
-+ conf->mirrors[d].rdev->in_sync) {
-+ /* This is where we read from */
-+ bio = r10_bio->devs[0].bio;
-+ bio->bi_next = biolist;
-+ biolist = bio;
-+ bio->bi_private = r10_bio;
-+ bio->bi_end_io = end_sync_read;
-+ bio->bi_rw = 0;
-+ bio->bi_sector = r10_bio->devs[j].addr +
-+ conf->mirrors[d].rdev->data_offset;
-+ bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+ atomic_inc(&r10_bio->remaining);
-+ /* and we write to 'i' */
-+
-+ for (k=0; k<conf->copies; k++)
-+ if (r10_bio->devs[k].devnum == i)
-+ break;
-+ bio = r10_bio->devs[1].bio;
-+ bio->bi_next = biolist;
-+ biolist = bio;
-+ bio->bi_private = r10_bio;
-+ bio->bi_end_io = end_sync_write;
-+ bio->bi_rw = 1;
-+ bio->bi_sector = r10_bio->devs[k].addr +
-+ conf->mirrors[i].rdev->data_offset;
-+ bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-+
-+ r10_bio->devs[0].devnum = d;
-+ r10_bio->devs[1].devnum = i;
-+
-+ break;
-+ }
-+ }
-+ if (j == conf->copies) {
-+ BUG();
-+ }
-+ }
-+ if (biolist == NULL) {
-+ while (r10_bio) {
-+ r10bio_t *rb2 = r10_bio;
-+ r10_bio = (r10bio_t*) rb2->master_bio;
-+ rb2->master_bio = NULL;
-+ put_buf(rb2);
-+ }
-+ goto giveup;
-+ }
-+ } else {
-+ /* resync. Schedule a read for every block at this virt offset */
-+ int count = 0;
-+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-+
-+ spin_lock_irq(&conf->resync_lock);
-+ conf->nr_pending++;
-+ spin_unlock_irq(&conf->resync_lock);
-+
-+ r10_bio->mddev = mddev;
-+ atomic_set(&r10_bio->remaining, 0);
-+
-+ r10_bio->master_bio = NULL;
-+ r10_bio->sector = sector_nr;
-+ set_bit(R10BIO_IsSync, &r10_bio->state);
-+ raid10_find_phys(conf, r10_bio);
-+ r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
-+ spin_lock_irq(&conf->device_lock);
-+ for (i=0; i<conf->copies; i++) {
-+ int d = r10_bio->devs[i].devnum;
-+ bio = r10_bio->devs[i].bio;
-+ bio->bi_end_io = NULL;
-+ if (conf->mirrors[d].rdev == NULL ||
-+ conf->mirrors[d].rdev->faulty)
-+ continue;
-+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+ atomic_inc(&r10_bio->remaining);
-+ bio->bi_next = biolist;
-+ biolist = bio;
-+ bio->bi_private = r10_bio;
-+ bio->bi_end_io = end_sync_read;
-+ bio->bi_rw = 0;
-+ bio->bi_sector = r10_bio->devs[i].addr +
-+ conf->mirrors[d].rdev->data_offset;
-+ bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-+ count++;
-+ }
-+ spin_unlock_irq(&conf->device_lock);
-+ if (count < 2) {
-+ for (i=0; i<conf->copies; i++) {
-+ int d = r10_bio->devs[i].devnum;
-+ if (r10_bio->devs[i].bio->bi_end_io)
-+ rdev_dec_pending(conf->mirrors[d].rdev, mddev);
-+ }
-+ put_buf(r10_bio);
-+ biolist = NULL;
-+ goto giveup;
-+ }
-+ }
-+
-+ for (bio = biolist; bio ; bio=bio->bi_next) {
-+
-+ bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-+ if (bio->bi_end_io)
-+ bio->bi_flags |= 1 << BIO_UPTODATE;
-+ bio->bi_vcnt = 0;
-+ bio->bi_idx = 0;
-+ bio->bi_phys_segments = 0;
-+ bio->bi_hw_segments = 0;
-+ bio->bi_size = 0;
-+ }
-+
-+ nr_sectors = 0;
-+ do {
-+ struct page *page;
-+ int len = PAGE_SIZE;
-+ disk = 0;
-+ if (sector_nr + (len>>9) > max_sector)
-+ len = (max_sector - sector_nr) << 9;
-+ if (len == 0)
-+ break;
-+ for (bio= biolist ; bio ; bio=bio->bi_next) {
-+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
-+ if (bio_add_page(bio, page, len, 0) == 0) {
-+ /* stop here */
-+ struct bio *bio2;
-+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
-+ for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
-+ /* remove last page from this bio */
-+ bio2->bi_vcnt--;
-+ bio2->bi_size -= len;
-+ bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
-+ }
-+ goto bio_full;
-+ }
-+ disk = i;
-+ }
-+ nr_sectors += len>>9;
-+ sector_nr += len>>9;
-+ } while (biolist->bi_vcnt < RESYNC_PAGES);
-+ bio_full:
-+ r10_bio->sectors = nr_sectors;
-+
-+ while (biolist) {
-+ bio = biolist;
-+ biolist = biolist->bi_next;
-+
-+ bio->bi_next = NULL;
-+ r10_bio = bio->bi_private;
-+ r10_bio->sectors = nr_sectors;
-+
-+ if (bio->bi_end_io == end_sync_read) {
-+ md_sync_acct(bio->bi_bdev, nr_sectors);
-+ generic_make_request(bio);
-+ }
-+ }
-+
-+ return sectors_skipped + nr_sectors;
-+ giveup:
-+ /* There is nowhere to write, so all non-sync
-+ * drives must be failed, so try the next chunk...
-+ */
-+ {
-+ int sec = max_sector - sector_nr;
-+ sectors_skipped += sec;
-+ chunks_skipped ++;
-+ sector_nr = max_sector;
-+ md_done_sync(mddev, sec, 1);
-+ goto skipped;
-+ }
-+}
-+
-+static int run(mddev_t *mddev)
-+{
-+ conf_t *conf;
-+ int i, disk_idx;
-+ mirror_info_t *disk;
-+ mdk_rdev_t *rdev;
-+ struct list_head *tmp;
-+ int nc, fc;
-+ sector_t stride, size;
-+
-+ if (mddev->level != 10) {
-+ printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n",
-+ mdname(mddev), mddev->level);
-+ goto out;
-+ }
-+ nc = mddev->layout & 255;
-+ fc = (mddev->layout >> 8) & 255;
-+ if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
-+ (mddev->layout >> 16)) {
-+ printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
-+ mdname(mddev), mddev->layout);
-+ goto out;
-+ }
-+ /*
-+ * copy the already verified devices into our private RAID10
-+ * bookkeeping area. [whatever we allocate in run(),
-+ * should be freed in stop()]
-+ */
-+ conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
-+ mddev->private = conf;
-+ if (!conf) {
-+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
-+ mdname(mddev));
-+ goto out;
-+ }
-+ memset(conf, 0, sizeof(*conf));
-+ conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
-+ GFP_KERNEL);
-+ if (!conf->mirrors) {
-+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
-+ mdname(mddev));
-+ goto out_free_conf;
-+ }
-+ memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
-+
-+ conf->near_copies = nc;
-+ conf->far_copies = fc;
-+ conf->copies = nc*fc;
-+ conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
-+ conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
-+ stride = mddev->size >> (conf->chunk_shift-1);
-+ sector_div(stride, fc);
-+ conf->stride = stride << conf->chunk_shift;
-+
-+ conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
-+ r10bio_pool_free, conf);
-+ if (!conf->r10bio_pool) {
-+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
-+ mdname(mddev));
-+ goto out_free_conf;
-+ }
-+ mddev->queue->unplug_fn = raid10_unplug;
-+
-+ mddev->queue->issue_flush_fn = raid10_issue_flush;
-+
-+ ITERATE_RDEV(mddev, rdev, tmp) {
-+ disk_idx = rdev->raid_disk;
-+ if (disk_idx >= mddev->raid_disks
-+ || disk_idx < 0)
-+ continue;
-+ disk = conf->mirrors + disk_idx;
-+
-+ disk->rdev = rdev;
-+
-+ blk_queue_stack_limits(mddev->queue,
-+ rdev->bdev->bd_disk->queue);
-+ /* as we don't honour merge_bvec_fn, we must never risk
-+ * violating it, so limit ->max_sector to one PAGE, as
-+ * a one page request is never in violation.
-+ */
-+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-+ mddev->queue->max_sectors > (PAGE_SIZE>>9))
-+ mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+
-+ disk->head_position = 0;
-+ if (!rdev->faulty && rdev->in_sync)
-+ conf->working_disks++;
-+ }
-+ conf->raid_disks = mddev->raid_disks;
-+ conf->mddev = mddev;
-+ conf->device_lock = SPIN_LOCK_UNLOCKED;
-+ INIT_LIST_HEAD(&conf->retry_list);
-+
-+ conf->resync_lock = SPIN_LOCK_UNLOCKED;
-+ init_waitqueue_head(&conf->wait_idle);
-+ init_waitqueue_head(&conf->wait_resume);
-+
-+ if (!conf->working_disks) {
-+ printk(KERN_ERR "raid10: no operational mirrors for %s\n",
-+ mdname(mddev));
-+ goto out_free_conf;
-+ }
-+
-+ mddev->degraded = 0;
-+ for (i = 0; i < conf->raid_disks; i++) {
-+
-+ disk = conf->mirrors + i;
-+
-+ if (!disk->rdev) {
-+ disk->head_position = 0;
-+ mddev->degraded++;
-+ }
-+ }
-+
-+
-+ mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
-+ if (!mddev->thread) {
-+ printk(KERN_ERR
-+ "raid10: couldn't allocate thread for %s\n",
-+ mdname(mddev));
-+ goto out_free_conf;
-+ }
-+
-+ printk(KERN_INFO
-+ "raid10: raid set %s active with %d out of %d devices\n",
-+ mdname(mddev), mddev->raid_disks - mddev->degraded,
-+ mddev->raid_disks);
-+ /*
-+ * Ok, everything is just fine now
-+ */
-+ size = conf->stride * conf->raid_disks;
-+ sector_div(size, conf->near_copies);
-+ mddev->array_size = size/2;
-+ mddev->resync_max_sectors = size;
-+
-+ /* Calculate max read-ahead size.
-+ * We need to readahead at least twice a whole stripe....
-+ * maybe...
-+ */
-+ {
-+ int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE;
-+ stripe /= conf->near_copies;
-+ if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
-+ mddev->queue->backing_dev_info.ra_pages = 2* stripe;
-+ }
-+
-+ if (conf->near_copies < mddev->raid_disks)
-+ blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-+ return 0;
-+
-+out_free_conf:
-+ if (conf->r10bio_pool)
-+ mempool_destroy(conf->r10bio_pool);
-+ if (conf->mirrors)
-+ kfree(conf->mirrors);
-+ kfree(conf);
-+ mddev->private = NULL;
-+out:
-+ return -EIO;
-+}
-+
-+static int stop(mddev_t *mddev)
-+{
-+ conf_t *conf = mddev_to_conf(mddev);
-+
-+ md_unregister_thread(mddev->thread);
-+ mddev->thread = NULL;
-+ if (conf->r10bio_pool)
-+ mempool_destroy(conf->r10bio_pool);
-+ if (conf->mirrors)
-+ kfree(conf->mirrors);
-+ kfree(conf);
-+ mddev->private = NULL;
-+ return 0;
-+}
-+
-+
-+static mdk_personality_t raid10_personality =
-+{
-+ .name = "raid10",
-+ .owner = THIS_MODULE,
-+ .make_request = make_request,
-+ .run = run,
-+ .stop = stop,
-+ .status = status,
-+ .error_handler = error,
-+ .hot_add_disk = raid10_add_disk,
-+ .hot_remove_disk= raid10_remove_disk,
-+ .spare_active = raid10_spare_active,
-+ .sync_request = sync_request,
-+};
-+
-+static int __init raid_init(void)
-+{
-+ return register_md_personality(RAID10, &raid10_personality);
-+}
-+
-+static void raid_exit(void)
-+{
-+ unregister_md_personality(RAID10);
-+}
-+
-+module_init(raid_init);
-+module_exit(raid_exit);
-+MODULE_LICENSE("GPL");
-+MODULE_ALIAS("md-personality-9"); /* RAID10 */
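The run() hunk above decodes the RAID10 geometry from mddev->layout: the low byte holds the near-copy count, the next byte the far-copy count, and a layout with higher bits set, or a total copy count outside 2..raid_disks, is rejected. A minimal stand-alone sketch of that decoding, using a hypothetical raid10_decode_layout() helper rather than the kernel code itself:

#include <stdio.h>

/* Hypothetical helper, not part of the patch: decodes a RAID10
 * layout word the same way run() does above.
 */
static int raid10_decode_layout(unsigned int layout, int raid_disks,
                                int *nc, int *fc)
{
        int near = layout & 255;
        int far = (layout >> 8) & 255;

        if (near * far < 2 || near * far > raid_disks || (layout >> 16))
                return -1;              /* unsupported layout */
        *nc = near;
        *fc = far;
        return 0;
}

int main(void)
{
        int nc, fc;

        /* 0x0102: two near copies, one far copy, on four disks */
        if (raid10_decode_layout(0x0102, 4, &nc, &fc) == 0)
                printf("near=%d far=%d copies=%d\n", nc, fc, nc * fc);
        return 0;
}

With layout 0x0102 this prints near=2 far=1 copies=2, matching the nc/fc arithmetic and the sanity check in run().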
-diff -pruN ./drivers/md.dm/raid1.c ./drivers/md/raid1.c
---- ./drivers/md.dm/raid1.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/raid1.c 2006-03-17 13:16:38.000000000 +0300
-@@ -24,10 +24,6 @@
-
- #include <linux/raid/raid1.h>
-
--#define MAJOR_NR MD_MAJOR
--#define MD_DRIVER
--#define MD_PERSONALITY
--
- /*
- * Number of guaranteed r1bios in case of extreme VM load:
- */
-@@ -44,13 +40,12 @@ static void * r1bio_pool_alloc(int gfp_f
- {
- struct pool_info *pi = data;
- r1bio_t *r1_bio;
-+ int size = offsetof(r1bio_t, bios[pi->raid_disks]);
-
- /* allocate a r1bio with room for raid_disks entries in the bios array */
-- r1_bio = kmalloc(sizeof(r1bio_t) + sizeof(struct bio*)*pi->raid_disks,
-- gfp_flags);
-+ r1_bio = kmalloc(size, gfp_flags);
- if (r1_bio)
-- memset(r1_bio, 0, sizeof(*r1_bio) +
-- sizeof(struct bio*) * pi->raid_disks);
-+ memset(r1_bio, 0, size);
- else
- unplug_slaves(pi->mddev);
-
-@@ -104,7 +99,7 @@ static void * r1buf_pool_alloc(int gfp_f
- bio->bi_io_vec[i].bv_page = page;
- }
-
-- r1_bio->master_bio = bio;
-+ r1_bio->master_bio = NULL;
-
- return r1_bio;
-
-@@ -189,32 +184,6 @@ static inline void put_buf(r1bio_t *r1_b
- spin_unlock_irqrestore(&conf->resync_lock, flags);
- }
-
--static int map(mddev_t *mddev, mdk_rdev_t **rdevp)
--{
-- conf_t *conf = mddev_to_conf(mddev);
-- int i, disks = conf->raid_disks;
--
-- /*
-- * Later we do read balancing on the read side
-- * now we use the first available disk.
-- */
--
-- spin_lock_irq(&conf->device_lock);
-- for (i = 0; i < disks; i++) {
-- mdk_rdev_t *rdev = conf->mirrors[i].rdev;
-- if (rdev && rdev->in_sync) {
-- *rdevp = rdev;
-- atomic_inc(&rdev->nr_pending);
-- spin_unlock_irq(&conf->device_lock);
-- return i;
-- }
-- }
-- spin_unlock_irq(&conf->device_lock);
--
-- printk(KERN_ERR "raid1_map(): huh, no more operational devices?\n");
-- return -1;
--}
--
- static void reschedule_retry(r1bio_t *r1_bio)
- {
- unsigned long flags;
-@@ -292,8 +261,9 @@ static int raid1_end_read_request(struct
- * oops, read error:
- */
- char b[BDEVNAME_SIZE];
-- printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
-- bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
-+ if (printk_ratelimit())
-+ printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
-+ bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
- reschedule_retry(r1_bio);
- }
-
-@@ -363,12 +333,13 @@ static int raid1_end_write_request(struc
- *
- * The rdev for the device selected will have nr_pending incremented.
- */
--static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
-+static int read_balance(conf_t *conf, r1bio_t *r1_bio)
- {
- const unsigned long this_sector = r1_bio->sector;
- int new_disk = conf->last_used, disk = new_disk;
-- const int sectors = bio->bi_size >> 9;
-+ const int sectors = r1_bio->sectors;
- sector_t new_distance, current_distance;
-+ mdk_rdev_t *new_rdev, *rdev;
-
- spin_lock_irq(&conf->device_lock);
- /*
-@@ -376,16 +347,17 @@ static int read_balance(conf_t *conf, st
- * device if no resync is going on, or below the resync window.
- * We take the first readable disk when above the resync window.
- */
-+ retry:
- if (conf->mddev->recovery_cp < MaxSector &&
- (this_sector + sectors >= conf->next_resync)) {
-- /* make sure that disk is operational */
-+ /* Choose the first operational device, for consistency */
- new_disk = 0;
-
-- while (!conf->mirrors[new_disk].rdev ||
-- !conf->mirrors[new_disk].rdev->in_sync) {
-+ while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
-+ !new_rdev->in_sync) {
- new_disk++;
- if (new_disk == conf->raid_disks) {
-- new_disk = 0;
-+ new_disk = -1;
- break;
- }
- }
-@@ -394,13 +366,13 @@ static int read_balance(conf_t *conf, st
-
-
- /* make sure the disk is operational */
-- while (!conf->mirrors[new_disk].rdev ||
-- !conf->mirrors[new_disk].rdev->in_sync) {
-+ while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
-+ !new_rdev->in_sync) {
- if (new_disk <= 0)
- new_disk = conf->raid_disks;
- new_disk--;
- if (new_disk == disk) {
-- new_disk = conf->last_used;
-+ new_disk = -1;
- goto rb_out;
- }
- }
-@@ -424,29 +396,38 @@ static int read_balance(conf_t *conf, st
- disk = conf->raid_disks;
- disk--;
-
-- if (!conf->mirrors[disk].rdev ||
-- !conf->mirrors[disk].rdev->in_sync)
-+ if ((rdev=conf->mirrors[disk].rdev) == NULL ||
-+ !rdev->in_sync)
- continue;
-
-- if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) {
-+ if (!atomic_read(&rdev->nr_pending)) {
- new_disk = disk;
-+ new_rdev = rdev;
- break;
- }
- new_distance = abs(this_sector - conf->mirrors[disk].head_position);
- if (new_distance < current_distance) {
- current_distance = new_distance;
- new_disk = disk;
-+ new_rdev = rdev;
- }
- } while (disk != conf->last_used);
-
- rb_out:
-- r1_bio->read_disk = new_disk;
-- conf->next_seq_sect = this_sector + sectors;
-
-- conf->last_used = new_disk;
-
-- if (conf->mirrors[new_disk].rdev)
-- atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending);
-+ if (new_disk >= 0) {
-+ conf->next_seq_sect = this_sector + sectors;
-+ conf->last_used = new_disk;
-+ atomic_inc(&new_rdev->nr_pending);
-+ if (!new_rdev->in_sync) {
-+ /* cannot risk returning a device that failed
-+ * before we inc'ed nr_pending
-+ */
-+ atomic_dec(&new_rdev->nr_pending);
-+ goto retry;
-+ }
-+ }
- spin_unlock_irq(&conf->device_lock);
-
- return new_disk;
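The retry path added to read_balance() above takes the nr_pending reference first and only then re-checks in_sync, dropping the reference and retrying if the device failed in between. A small userspace sketch of that "increment, then re-check" idiom, assuming C11 atomics and a made-up demo_rdev structure (not the kernel mdk_rdev_t):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_rdev {
        atomic_int nr_pending;
        atomic_bool in_sync;
};

static bool try_get_rdev(struct demo_rdev *rdev)
{
        atomic_fetch_add(&rdev->nr_pending, 1);
        if (!atomic_load(&rdev->in_sync)) {
                /* raced with a failure: drop the reference, caller retries */
                atomic_fetch_sub(&rdev->nr_pending, 1);
                return false;
        }
        return true;
}

int main(void)
{
        struct demo_rdev rdev = { .nr_pending = 0, .in_sync = true };

        if (try_get_rdev(&rdev))
                printf("got device, nr_pending=%d\n",
                       atomic_load(&rdev.nr_pending));
        return 0;
}

Taking the reference before the re-check means a concurrent failure is either seen here (and the reference released) or happens after the count is held, so the caller never keeps a device it cannot pin.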
-@@ -471,7 +452,7 @@ static void unplug_slaves(mddev_t *mddev
- r_queue->unplug_fn(r_queue);
-
- spin_lock_irqsave(&conf->device_lock, flags);
-- atomic_dec(&rdev->nr_pending);
-+ rdev_dec_pending(rdev, mddev);
- }
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
-@@ -481,6 +462,32 @@ static void raid1_unplug(request_queue_t
- unplug_slaves(q->queuedata);
- }
-
-+static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ conf_t *conf = mddev_to_conf(mddev);
-+ unsigned long flags;
-+ int i, ret = 0;
-+
-+ spin_lock_irqsave(&conf->device_lock, flags);
-+ for (i=0; i<mddev->raid_disks; i++) {
-+ mdk_rdev_t *rdev = conf->mirrors[i].rdev;
-+ if (rdev && !rdev->faulty) {
-+ struct block_device *bdev = rdev->bdev;
-+ request_queue_t *r_queue = bdev_get_queue(bdev);
-+
-+ if (r_queue->issue_flush_fn) {
-+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+ if (ret)
-+ break;
-+ }
-+ }
-+ }
-+ spin_unlock_irqrestore(&conf->device_lock, flags);
-+ return ret;
-+}
-+
- /*
- * Throttle resync depth, so that we can both get proper overlapping of
- * requests, but are still able to handle normal requests quickly.
-@@ -513,6 +520,7 @@ static int make_request(request_queue_t
- r1bio_t *r1_bio;
- struct bio *read_bio;
- int i, disks;
-+ mdk_rdev_t *rdev;
-
- /*
- * Register the new request and wait if the reconstruction
-@@ -545,15 +553,26 @@ static int make_request(request_queue_t
- r1_bio->mddev = mddev;
- r1_bio->sector = bio->bi_sector;
-
-+ r1_bio->state = 0;
-+
- if (bio_data_dir(bio) == READ) {
- /*
- * read balancing logic:
- */
-- mirror = conf->mirrors + read_balance(conf, bio, r1_bio);
-+ int rdisk = read_balance(conf, r1_bio);
-+
-+ if (rdisk < 0) {
-+ /* couldn't find anywhere to read from */
-+ raid_end_bio_io(r1_bio);
-+ return 0;
-+ }
-+ mirror = conf->mirrors + rdisk;
-+
-+ r1_bio->read_disk = rdisk;
-
- read_bio = bio_clone(bio, GFP_NOIO);
-
-- r1_bio->bios[r1_bio->read_disk] = read_bio;
-+ r1_bio->bios[rdisk] = read_bio;
-
- read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
- read_bio->bi_bdev = mirror->rdev->bdev;
-@@ -575,10 +594,14 @@ static int make_request(request_queue_t
- disks = conf->raid_disks;
- spin_lock_irq(&conf->device_lock);
- for (i = 0; i < disks; i++) {
-- if (conf->mirrors[i].rdev &&
-- !conf->mirrors[i].rdev->faulty) {
-- atomic_inc(&conf->mirrors[i].rdev->nr_pending);
-- r1_bio->bios[i] = bio;
-+ if ((rdev=conf->mirrors[i].rdev) != NULL &&
-+ !rdev->faulty) {
-+ atomic_inc(&rdev->nr_pending);
-+ if (rdev->faulty) {
-+ atomic_dec(&rdev->nr_pending);
-+ r1_bio->bios[i] = NULL;
-+ } else
-+ r1_bio->bios[i] = bio;
- } else
- r1_bio->bios[i] = NULL;
- }
-@@ -746,7 +769,7 @@ static int raid1_add_disk(mddev_t *mddev
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- mddev->queue->max_sectors > (PAGE_SIZE>>9))
-- mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
- p->head_position = 0;
- rdev->raid_disk = mirror;
-@@ -877,7 +900,7 @@ static void sync_request_write(mddev_t *
-
- atomic_inc(&conf->mirrors[i].rdev->nr_pending);
- atomic_inc(&r1_bio->remaining);
-- md_sync_acct(conf->mirrors[i].rdev, wbio->bi_size >> 9);
-+ md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
- generic_make_request(wbio);
- }
-
-@@ -925,7 +948,7 @@ static void raid1d(mddev_t *mddev)
- } else {
- int disk;
- bio = r1_bio->bios[r1_bio->read_disk];
-- if ((disk=map(mddev, &rdev)) == -1) {
-+ if ((disk=read_balance(conf, r1_bio)) == -1) {
- printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
- " read error for block %llu\n",
- bdevname(bio->bi_bdev,b),
-@@ -934,14 +957,20 @@ static void raid1d(mddev_t *mddev)
- } else {
- r1_bio->bios[r1_bio->read_disk] = NULL;
- r1_bio->read_disk = disk;
-+ bio_put(bio);
-+ bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
- r1_bio->bios[r1_bio->read_disk] = bio;
-- printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
-- " another mirror\n",
-- bdevname(rdev->bdev,b),
-- (unsigned long long)r1_bio->sector);
-- bio->bi_bdev = rdev->bdev;
-+ rdev = conf->mirrors[disk].rdev;
-+ if (printk_ratelimit())
-+ printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
-+ " another mirror\n",
-+ bdevname(rdev->bdev,b),
-+ (unsigned long long)r1_bio->sector);
- bio->bi_sector = r1_bio->sector + rdev->data_offset;
-+ bio->bi_bdev = rdev->bdev;
-+ bio->bi_end_io = raid1_end_read_request;
- bio->bi_rw = READ;
-+ bio->bi_private = r1_bio;
- unplug = 1;
- generic_make_request(bio);
- }
-@@ -1078,7 +1107,7 @@ static int sync_request(mddev_t *mddev,
- int rv = max_sector - sector_nr;
- md_done_sync(mddev, rv, 1);
- put_buf(r1_bio);
-- atomic_dec(&conf->mirrors[disk].rdev->nr_pending);
-+ rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
- return rv;
- }
-
-@@ -1117,7 +1146,7 @@ static int sync_request(mddev_t *mddev,
- bio = r1_bio->bios[disk];
- r1_bio->sectors = nr_sectors;
-
-- md_sync_acct(mirror->rdev, nr_sectors);
-+ md_sync_acct(mirror->rdev->bdev, nr_sectors);
-
- generic_make_request(bio);
-
-@@ -1168,6 +1197,7 @@ static int run(mddev_t *mddev)
-
- mddev->queue->unplug_fn = raid1_unplug;
-
-+ mddev->queue->issue_flush_fn = raid1_issue_flush;
-
- ITERATE_RDEV(mddev, rdev, tmp) {
- disk_idx = rdev->raid_disk;
-@@ -1186,7 +1216,7 @@ static int run(mddev_t *mddev)
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- mddev->queue->max_sectors > (PAGE_SIZE>>9))
-- mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
- disk->head_position = 0;
- if (!rdev->faulty && rdev->in_sync)
-@@ -1328,7 +1358,7 @@ static int raid1_reshape(mddev_t *mddev,
- if (conf->mirrors[d].rdev)
- return -EBUSY;
-
-- newpoolinfo = kmalloc(sizeof(newpoolinfo), GFP_KERNEL);
-+ newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
- if (!newpoolinfo)
- return -ENOMEM;
- newpoolinfo->mddev = mddev;
-diff -pruN ./drivers/md.dm/raid5.c ./drivers/md/raid5.c
---- ./drivers/md.dm/raid5.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/raid5.c 2006-03-17 13:16:38.000000000 +0300
-@@ -457,6 +457,7 @@ static void raid5_build_block (struct st
- bio_init(&dev->req);
- dev->req.bi_io_vec = &dev->vec;
- dev->req.bi_vcnt++;
-+ dev->req.bi_max_vecs++;
- dev->vec.bv_page = dev->page;
- dev->vec.bv_len = STRIPE_SIZE;
- dev->vec.bv_offset = 0;
-@@ -477,8 +478,8 @@ static void error(mddev_t *mddev, mdk_rd
-
- if (!rdev->faulty) {
- mddev->sb_dirty = 1;
-- conf->working_disks--;
- if (rdev->in_sync) {
-+ conf->working_disks--;
- mddev->degraded++;
- conf->failed_disks++;
- rdev->in_sync = 0;
-@@ -1071,7 +1072,8 @@ static void handle_stripe(struct stripe_
- PRINTK("Reading block %d (sync=%d)\n",
- i, syncing);
- if (syncing)
-- md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS);
-+ md_sync_acct(conf->disks[i].rdev->bdev,
-+ STRIPE_SECTORS);
- }
- }
- }
-@@ -1256,7 +1258,7 @@ static void handle_stripe(struct stripe_
-
- if (rdev) {
- if (test_bit(R5_Syncio, &sh->dev[i].flags))
-- md_sync_acct(rdev, STRIPE_SECTORS);
-+ md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
- bi->bi_bdev = rdev->bdev;
- PRINTK("for %llu schedule op %ld on disc %d\n",
-@@ -1265,6 +1267,7 @@ static void handle_stripe(struct stripe_
- bi->bi_sector = sh->sector + rdev->data_offset;
- bi->bi_flags = 1 << BIO_UPTODATE;
- bi->bi_vcnt = 1;
-+ bi->bi_max_vecs = 1;
- bi->bi_idx = 0;
- bi->bi_io_vec = &sh->dev[i].vec;
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
-@@ -1316,7 +1319,7 @@ static void unplug_slaves(mddev_t *mddev
- r_queue->unplug_fn(r_queue);
-
- spin_lock_irqsave(&conf->device_lock, flags);
-- atomic_dec(&rdev->nr_pending);
-+ rdev_dec_pending(rdev, mddev);
- }
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
-@@ -1328,6 +1331,8 @@ static void raid5_unplug_device(request_
- raid5_conf_t *conf = mddev_to_conf(mddev);
- unsigned long flags;
-
-+ if (!conf) return;
-+
- spin_lock_irqsave(&conf->device_lock, flags);
-
- if (blk_remove_plug(q))
-@@ -1339,6 +1344,39 @@ static void raid5_unplug_device(request_
- unplug_slaves(mddev);
- }
-
-+static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ raid5_conf_t *conf = mddev_to_conf(mddev);
-+ int i, ret = 0;
-+
-+ for (i=0; i<mddev->raid_disks; i++) {
-+ mdk_rdev_t *rdev = conf->disks[i].rdev;
-+ if (rdev && !rdev->faulty) {
-+ struct block_device *bdev = rdev->bdev;
-+ request_queue_t *r_queue;
-+
-+ if (!bdev)
-+ continue;
-+
-+ r_queue = bdev_get_queue(bdev);
-+ if (!r_queue)
-+ continue;
-+
-+ if (!r_queue->issue_flush_fn) {
-+ ret = -EOPNOTSUPP;
-+ break;
-+ }
-+
-+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+ if (ret)
-+ break;
-+ }
-+ }
-+ return ret;
-+}
-+
- static inline void raid5_plug_device(raid5_conf_t *conf)
- {
- spin_lock_irq(&conf->device_lock);
-@@ -1545,6 +1583,7 @@ static int run (mddev_t *mddev)
- atomic_set(&conf->preread_active_stripes, 0);
-
- mddev->queue->unplug_fn = raid5_unplug_device;
-+ mddev->queue->issue_flush_fn = raid5_issue_flush;
-
- PRINTK("raid5: run(%s) called.\n", mdname(mddev));
-
-diff -pruN ./drivers/md.dm/raid6main.c ./drivers/md/raid6main.c
---- ./drivers/md.dm/raid6main.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/raid6main.c 2006-03-17 13:16:38.000000000 +0300
-@@ -478,6 +478,7 @@ static void raid6_build_block (struct st
- bio_init(&dev->req);
- dev->req.bi_io_vec = &dev->vec;
- dev->req.bi_vcnt++;
-+ dev->req.bi_max_vecs++;
- dev->vec.bv_page = dev->page;
- dev->vec.bv_len = STRIPE_SIZE;
- dev->vec.bv_offset = 0;
-@@ -498,8 +499,8 @@ static void error(mddev_t *mddev, mdk_rd
-
- if (!rdev->faulty) {
- mddev->sb_dirty = 1;
-- conf->working_disks--;
- if (rdev->in_sync) {
-+ conf->working_disks--;
- mddev->degraded++;
- conf->failed_disks++;
- rdev->in_sync = 0;
-@@ -1208,7 +1209,8 @@ static void handle_stripe(struct stripe_
- PRINTK("Reading block %d (sync=%d)\n",
- i, syncing);
- if (syncing)
-- md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS);
-+ md_sync_acct(conf->disks[i].rdev->bdev,
-+ STRIPE_SECTORS);
- }
- }
- }
-@@ -1418,7 +1420,7 @@ static void handle_stripe(struct stripe_
-
- if (rdev) {
- if (test_bit(R5_Syncio, &sh->dev[i].flags))
-- md_sync_acct(rdev, STRIPE_SECTORS);
-+ md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
- bi->bi_bdev = rdev->bdev;
- PRINTK("for %llu schedule op %ld on disc %d\n",
-@@ -1427,6 +1429,7 @@ static void handle_stripe(struct stripe_
- bi->bi_sector = sh->sector + rdev->data_offset;
- bi->bi_flags = 1 << BIO_UPTODATE;
- bi->bi_vcnt = 1;
-+ bi->bi_max_vecs = 1;
- bi->bi_idx = 0;
- bi->bi_io_vec = &sh->dev[i].vec;
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
-@@ -1478,7 +1481,7 @@ static void unplug_slaves(mddev_t *mddev
- r_queue->unplug_fn(r_queue);
-
- spin_lock_irqsave(&conf->device_lock, flags);
-- atomic_dec(&rdev->nr_pending);
-+ rdev_dec_pending(rdev, mddev);
- }
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
-@@ -1501,6 +1504,39 @@ static void raid6_unplug_device(request_
- unplug_slaves(mddev);
- }
-
-+static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
-+ sector_t *error_sector)
-+{
-+ mddev_t *mddev = q->queuedata;
-+ raid6_conf_t *conf = mddev_to_conf(mddev);
-+ int i, ret = 0;
-+
-+ for (i=0; i<mddev->raid_disks; i++) {
-+ mdk_rdev_t *rdev = conf->disks[i].rdev;
-+ if (rdev && !rdev->faulty) {
-+ struct block_device *bdev = rdev->bdev;
-+ request_queue_t *r_queue;
-+
-+ if (!bdev)
-+ continue;
-+
-+ r_queue = bdev_get_queue(bdev);
-+ if (!r_queue)
-+ continue;
-+
-+ if (!r_queue->issue_flush_fn) {
-+ ret = -EOPNOTSUPP;
-+ break;
-+ }
-+
-+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+ if (ret)
-+ break;
-+ }
-+ }
-+ return ret;
-+}
-+
- static inline void raid6_plug_device(raid6_conf_t *conf)
- {
- spin_lock_irq(&conf->device_lock);
-@@ -1708,6 +1744,7 @@ static int run (mddev_t *mddev)
- atomic_set(&conf->preread_active_stripes, 0);
-
- mddev->queue->unplug_fn = raid6_unplug_device;
-+ mddev->queue->issue_flush_fn = raid6_issue_flush;
-
- PRINTK("raid6: run(%s) called.\n", mdname(mddev));
-
---- ./include/linux/compat_ioctl.h.dm 2006-03-17 08:58:47.000000000 +0300
-+++ ./include/linux/compat_ioctl.h 2006-03-17 08:16:12.000000000 +0300
-@@ -102,6 +102,7 @@ COMPATIBLE_IOCTL(BLKROGET)
- COMPATIBLE_IOCTL(BLKRRPART)
- COMPATIBLE_IOCTL(BLKFLSBUF)
- COMPATIBLE_IOCTL(BLKSECTSET)
-+COMPATIBLE_IOCTL(BLKSECTGET)
- COMPATIBLE_IOCTL(BLKSSZGET)
- ULONG_IOCTL(BLKRASET)
- ULONG_IOCTL(BLKFRASET)
-@@ -141,6 +142,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR_32)
- COMPATIBLE_IOCTL(DM_TABLE_DEPS_32)
- COMPATIBLE_IOCTL(DM_TABLE_STATUS_32)
- COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32)
-+COMPATIBLE_IOCTL(DM_TARGET_MSG_32)
- COMPATIBLE_IOCTL(DM_VERSION)
- COMPATIBLE_IOCTL(DM_REMOVE_ALL)
- COMPATIBLE_IOCTL(DM_LIST_DEVICES)
-@@ -155,6 +157,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
- COMPATIBLE_IOCTL(DM_TABLE_DEPS)
- COMPATIBLE_IOCTL(DM_TABLE_STATUS)
- COMPATIBLE_IOCTL(DM_LIST_VERSIONS)
-+COMPATIBLE_IOCTL(DM_TARGET_MSG)
- /* Big K */
- COMPATIBLE_IOCTL(PIO_FONT)
- COMPATIBLE_IOCTL(GIO_FONT)
-@@ -387,6 +390,7 @@ COMPATIBLE_IOCTL(DVD_WRITE_STRUCT)
- COMPATIBLE_IOCTL(DVD_AUTH)
- /* Big L */
- ULONG_IOCTL(LOOP_SET_FD)
-+ULONG_IOCTL(LOOP_CHANGE_FD)
- COMPATIBLE_IOCTL(LOOP_CLR_FD)
- COMPATIBLE_IOCTL(LOOP_GET_STATUS64)
- COMPATIBLE_IOCTL(LOOP_SET_STATUS64)
-@@ -595,13 +599,15 @@ COMPATIBLE_IOCTL(ATMTCP_CREATE)
- COMPATIBLE_IOCTL(ATMTCP_REMOVE)
- COMPATIBLE_IOCTL(ATMMPC_CTRL)
- COMPATIBLE_IOCTL(ATMMPC_DATA)
--/* Big W */
--/* WIOC_GETSUPPORT not yet implemented -E */
-+/* Watchdog */
-+COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
- COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
- COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS)
- COMPATIBLE_IOCTL(WDIOC_GETTEMP)
- COMPATIBLE_IOCTL(WDIOC_SETOPTIONS)
- COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
-+COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
-+COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
- /* Big R */
- COMPATIBLE_IOCTL(RNDGETENTCNT)
- COMPATIBLE_IOCTL(RNDADDTOENTCNT)
-@@ -735,3 +741,20 @@ COMPATIBLE_IOCTL(SIOCSIWRETRY)
- COMPATIBLE_IOCTL(SIOCGIWRETRY)
- COMPATIBLE_IOCTL(SIOCSIWPOWER)
- COMPATIBLE_IOCTL(SIOCGIWPOWER)
-+/* hiddev */
-+COMPATIBLE_IOCTL(HIDIOCGVERSION)
-+COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
-+COMPATIBLE_IOCTL(HIDIOCGDEVINFO)
-+COMPATIBLE_IOCTL(HIDIOCGSTRING)
-+COMPATIBLE_IOCTL(HIDIOCINITREPORT)
-+COMPATIBLE_IOCTL(HIDIOCGREPORT)
-+COMPATIBLE_IOCTL(HIDIOCSREPORT)
-+COMPATIBLE_IOCTL(HIDIOCGREPORTINFO)
-+COMPATIBLE_IOCTL(HIDIOCGFIELDINFO)
-+COMPATIBLE_IOCTL(HIDIOCGUSAGE)
-+COMPATIBLE_IOCTL(HIDIOCSUSAGE)
-+COMPATIBLE_IOCTL(HIDIOCGUCODE)
-+COMPATIBLE_IOCTL(HIDIOCGFLAG)
-+COMPATIBLE_IOCTL(HIDIOCSFLAG)
-+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
-+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
---- ./include/linux/device-mapper.h.dm 2006-03-17 08:58:56.000000000 +0300
-+++ ./include/linux/device-mapper.h 2006-03-17 08:16:12.000000000 +0300
-@@ -51,12 +51,15 @@ typedef int (*dm_endio_fn) (struct dm_ta
- struct bio *bio, int error,
- union map_info *map_context);
-
--typedef void (*dm_suspend_fn) (struct dm_target *ti);
-+typedef void (*dm_presuspend_fn) (struct dm_target *ti);
-+typedef void (*dm_postsuspend_fn) (struct dm_target *ti);
- typedef void (*dm_resume_fn) (struct dm_target *ti);
-
- typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type,
- char *result, unsigned int maxlen);
-
-+typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv);
-+
- void dm_error(const char *message);
-
- /*
-@@ -79,9 +82,11 @@ struct target_type {
- dm_dtr_fn dtr;
- dm_map_fn map;
- dm_endio_fn end_io;
-- dm_suspend_fn suspend;
-+ dm_presuspend_fn presuspend;
-+ dm_postsuspend_fn postsuspend;
- dm_resume_fn resume;
- dm_status_fn status;
-+ dm_message_fn message;
- };
-
- struct io_restrictions {
-@@ -102,6 +107,7 @@ struct dm_target {
- sector_t len;
-
- /* FIXME: turn this into a mask, and merge with io_restrictions */
-+ /* Always a power of 2 */
- sector_t split_io;
-
- /*
---- ./include/linux/dm-ioctl.h.dm 2006-03-17 08:59:07.000000000 +0300
-+++ ./include/linux/dm-ioctl.h 2006-03-17 08:16:12.000000000 +0300
-@@ -1,5 +1,6 @@
- /*
- * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
-+ * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved.
- *
- * This file is released under the LGPL.
- */
-@@ -76,6 +77,9 @@
- *
- * DM_TABLE_STATUS:
- * Return the targets status for the 'active' table.
-+ *
-+ * DM_TARGET_MSG:
-+ * Pass a message string to the target at a specific offset of a device.
- */
-
- /*
-@@ -179,6 +183,15 @@ struct dm_target_versions {
- };
-
- /*
-+ * Used to pass message to a target
-+ */
-+struct dm_target_msg {
-+ uint64_t sector; /* Device sector */
-+
-+ char message[0];
-+};
-+
-+/*
- * If you change this make sure you make the corresponding change
- * to dm-ioctl.c:lookup_ioctl()
- */
-@@ -204,6 +217,7 @@ enum {
-
- /* Added later */
- DM_LIST_VERSIONS_CMD,
-+ DM_TARGET_MSG_CMD,
- };
-
- /*
-@@ -232,6 +246,7 @@ typedef char ioctl_struct[308];
- #define DM_TABLE_DEPS_32 _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, ioctl_struct)
- #define DM_TABLE_STATUS_32 _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct)
- #define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct)
-+#define DM_TARGET_MSG_32 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct)
- #endif
-
- #define DM_IOCTL 0xfd
-@@ -254,10 +269,12 @@ typedef char ioctl_struct[308];
-
- #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl)
-
-+#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl)
-+
- #define DM_VERSION_MAJOR 4
--#define DM_VERSION_MINOR 1
-+#define DM_VERSION_MINOR 5
- #define DM_VERSION_PATCHLEVEL 0
--#define DM_VERSION_EXTRA "-ioctl (2003-12-10)"
-+#define DM_VERSION_EXTRA "-ioctl (2005-10-04)"
-
- /* Status bits */
- #define DM_READONLY_FLAG (1 << 0) /* In/Out */
-@@ -283,4 +300,14 @@ typedef char ioctl_struct[308];
- */
- #define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */
-
-+/*
-+ * Set this to improve performance when you aren't going to use open_count.
-+ */
-+#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */
-+
-+/*
-+ * Set this to avoid attempting to freeze any filesystem when suspending.
-+ */
-+#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */
-+
- #endif /* _LINUX_DM_IOCTL_H */
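DM_TARGET_MSG carries a struct dm_target_msg whose text follows the fixed header through the zero-length message[] member. A rough userspace sketch of sizing and filling such a payload; the struct dm_ioctl wrapping and the ioctl() call itself are omitted, and the message string is just an arbitrary example:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mirrors the declaration above; message[0] is the GNU
 * zero-length-array form used throughout the kernel.
 */
struct dm_target_msg {
        uint64_t sector;        /* device sector the message is aimed at */
        char message[0];        /* NUL-terminated text follows the header */
};

int main(void)
{
        const char *text = "example message";
        size_t size = sizeof(struct dm_target_msg) + strlen(text) + 1;
        struct dm_target_msg *msg = calloc(1, size);

        if (!msg)
                return 1;
        msg->sector = 0;
        memcpy(msg->message, text, strlen(text) + 1);
        printf("payload is %zu bytes\n", size);
        free(msg);
        return 0;
}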
---- ./include/linux/genhd.h.dm 2006-03-20 08:42:40.000000000 +0300
-+++ ./include/linux/genhd.h 2006-03-17 13:44:40.000000000 +0300
-@@ -100,7 +100,7 @@ struct gendisk {
- struct timer_rand_state *random;
- int policy;
-
-- unsigned sync_io; /* RAID */
-+ atomic_t sync_io; /* RAID */
- unsigned long stamp, stamp_idle;
- int in_flight;
- #ifdef CONFIG_SMP
-diff -pruN ./include/linux/raid.dm/linear.h ./include/linux/raid/linear.h
---- ./include/linux/raid.dm/linear.h 2006-03-17 13:26:03.000000000 +0300
-+++ ./include/linux/raid/linear.h 2006-03-17 13:26:59.000000000 +0300
-@@ -5,8 +5,8 @@
-
- struct dev_info {
- mdk_rdev_t *rdev;
-- unsigned long size;
-- unsigned long offset;
-+ sector_t size;
-+ sector_t offset;
- };
-
- typedef struct dev_info dev_info_t;
-diff -pruN ./include/linux/raid.dm/md.h ./include/linux/raid/md.h
---- ./include/linux/raid.dm/md.h 2006-03-17 13:26:03.000000000 +0300
-+++ ./include/linux/raid/md.h 2006-03-17 13:26:59.000000000 +0300
-@@ -69,12 +69,10 @@ extern mdk_thread_t * md_register_thread
- extern void md_unregister_thread (mdk_thread_t *thread);
- extern void md_wakeup_thread(mdk_thread_t *thread);
- extern void md_check_recovery(mddev_t *mddev);
--extern void md_interrupt_thread (mdk_thread_t *thread);
- extern void md_write_start(mddev_t *mddev);
- extern void md_write_end(mddev_t *mddev);
- extern void md_handle_safemode(mddev_t *mddev);
- extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
--extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
- extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
- extern void md_unplug_mddev(mddev_t *mddev);
-
-diff -pruN ./include/linux/raid.dm/md_k.h ./include/linux/raid/md_k.h
---- ./include/linux/raid.dm/md_k.h 2006-03-17 13:26:03.000000000 +0300
-+++ ./include/linux/raid/md_k.h 2006-03-17 13:26:59.000000000 +0300
-@@ -24,7 +24,8 @@
- #define HSM 6UL
- #define MULTIPATH 7UL
- #define RAID6 8UL
--#define MAX_PERSONALITY 9UL
-+#define RAID10 9UL
-+#define MAX_PERSONALITY 10UL
-
- #define LEVEL_MULTIPATH (-4)
- #define LEVEL_LINEAR (-1)
-@@ -43,6 +44,7 @@ static inline int pers_to_level (int per
- case RAID1: return 1;
- case RAID5: return 5;
- case RAID6: return 6;
-+ case RAID10: return 10;
- }
- BUG();
- return MD_RESERVED;
-@@ -60,6 +62,7 @@ static inline int level_to_pers (int lev
- case 4:
- case 5: return RAID5;
- case 6: return RAID6;
-+ case 10: return RAID10;
- }
- return MD_RESERVED;
- }
-@@ -216,6 +219,7 @@ struct mddev_s
- unsigned long resync_mark; /* a recent timestamp */
- sector_t resync_mark_cnt;/* blocks written at resync_mark */
-
-+ sector_t resync_max_sectors; /* may be set by personality */
- /* recovery/resync flags
- * NEEDED: we might need to start a resync/recover
- * RUNNING: a thread is running, or about to be started
-@@ -263,6 +267,11 @@ static inline void rdev_dec_pending(mdk_
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- }
-
-+static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
-+{
-+ atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
-+}
-+
- struct mdk_personality_s
- {
- char *name;
-diff -pruN ./include/linux/raid.dm/raid10.h ./include/linux/raid/raid10.h
---- ./include/linux/raid.dm/raid10.h 1970-01-01 03:00:00.000000000 +0300
-+++ ./include/linux/raid/raid10.h 2006-03-17 13:26:59.000000000 +0300
-@@ -0,0 +1,103 @@
-+#ifndef _RAID10_H
-+#define _RAID10_H
-+
-+#include <linux/raid/md.h>
-+
-+typedef struct mirror_info mirror_info_t;
-+
-+struct mirror_info {
-+ mdk_rdev_t *rdev;
-+ sector_t head_position;
-+};
-+
-+typedef struct r10bio_s r10bio_t;
-+
-+struct r10_private_data_s {
-+ mddev_t *mddev;
-+ mirror_info_t *mirrors;
-+ int raid_disks;
-+ int working_disks;
-+ spinlock_t device_lock;
-+
-+ /* geometry */
-+ int near_copies; /* number of copies laid out raid0 style */
-+ int far_copies; /* number of copies laid out
-+ * at large strides across drives
-+ */
-+ int copies; /* near_copies * far_copies.
-+ * must be <= raid_disks
-+ */
-+ sector_t stride; /* distance between far copies.
-+ * This is size / far_copies
-+ */
-+
-+ int chunk_shift; /* shift from chunks to sectors */
-+ sector_t chunk_mask;
-+
-+ struct list_head retry_list;
-+ /* for use when syncing mirrors: */
-+
-+ spinlock_t resync_lock;
-+ int nr_pending;
-+ int barrier;
-+ sector_t next_resync;
-+
-+ wait_queue_head_t wait_idle;
-+ wait_queue_head_t wait_resume;
-+
-+ mempool_t *r10bio_pool;
-+ mempool_t *r10buf_pool;
-+};
-+
-+typedef struct r10_private_data_s conf_t;
-+
-+/*
-+ * this is the only point in the RAID code where we violate
-+ * C type safety. mddev->private is an 'opaque' pointer.
-+ */
-+#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
-+
-+/*
-+ * this is our 'private' RAID10 bio.
-+ *
-+ * it contains information about what kind of IO operations were started
-+ * for this RAID10 operation, and about their status:
-+ */
-+
-+struct r10bio_s {
-+ atomic_t remaining; /* 'have we finished' count,
-+ * used from IRQ handlers
-+ */
-+ sector_t sector; /* virtual sector number */
-+ int sectors;
-+ unsigned long state;
-+ mddev_t *mddev;
-+ /*
-+ * original bio going to /dev/mdx
-+ */
-+ struct bio *master_bio;
-+ /*
-+ * if the IO is in READ direction, then this is where we read
-+ */
-+ int read_slot;
-+
-+ struct list_head retry_list;
-+ /*
-+ * if the IO is in WRITE direction, then multiple bios are used,
-+ * one for each copy.
-+ * When resyncing we also use one for each copy.
-+ * When reconstructing, we use 2 bios, one for read, one for write.
-+ * We choose the number when they are allocated.
-+ */
-+ struct {
-+ struct bio *bio;
-+ sector_t addr;
-+ int devnum;
-+ } devs[0];
-+};
-+
-+/* bits for r10bio.state */
-+#define R10BIO_Uptodate 0
-+#define R10BIO_IsSync 1
-+#define R10BIO_IsRecover 2
-+#endif
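r10bio_s ends in a zero-length devs[] array, so each allocation has to account for one entry per copy, the same way the r1bio_pool_alloc() hunk earlier switched to offsetof(r1bio_t, bios[pi->raid_disks]). A stand-alone sketch of that sizing idiom with a made-up demo_bio structure (zero-length arrays and offsetof on an array element are GCC extensions, as in the kernel):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

enum { COPIES = 2 };

struct demo_bio {
        int sectors;
        struct {
                long addr;
                int devnum;
        } devs[0];              /* one entry per copy, sized at allocation */
};

int main(void)
{
        size_t size = offsetof(struct demo_bio, devs[COPIES]);
        struct demo_bio *rb = calloc(1, size);

        if (!rb)
                return 1;
        printf("%d copies need %zu bytes\n", COPIES, size);
        free(rb);
        return 0;
}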