Diffstat (limited to 'openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch')
-rw-r--r-- | openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch | 9859
1 file changed, 0 insertions, 9859 deletions
diff --git a/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch b/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch deleted file mode 100644 index 4075cab..0000000 --- a/openvz-sources/022.078-r3/5127_linux-2.6.8-dm-20051004.patch +++ /dev/null @@ -1,9859 +0,0 @@ -diff -pruN ./drivers/md.dm/dm-bio-list.h ./drivers/md/dm-bio-list.h ---- ./drivers/md.dm/dm-bio-list.h 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-bio-list.h 2006-03-17 13:16:38.000000000 +0300 -@@ -33,6 +33,9 @@ static inline void bio_list_add(struct b - - static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) - { -+ if (!bl2->head) -+ return; -+ - if (bl->tail) - bl->tail->bi_next = bl2->head; - else -diff -pruN ./drivers/md.dm/dm-bio-record.h ./drivers/md/dm-bio-record.h ---- ./drivers/md.dm/dm-bio-record.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-bio-record.h 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,45 @@ -+/* -+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ */ -+ -+#ifndef DM_BIO_RECORD_H -+#define DM_BIO_RECORD_H -+ -+#include <linux/bio.h> -+ -+/* -+ * There are lots of mutable fields in the bio struct that get -+ * changed by the lower levels of the block layer. Some targets, -+ * such as multipath, may wish to resubmit a bio on error. The -+ * functions in this file help the target record and restore the -+ * original bio state. -+ */ -+struct dm_bio_details { -+ sector_t bi_sector; -+ struct block_device *bi_bdev; -+ unsigned int bi_size; -+ unsigned short bi_idx; -+ unsigned long bi_flags; -+}; -+ -+static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) -+{ -+ bd->bi_sector = bio->bi_sector; -+ bd->bi_bdev = bio->bi_bdev; -+ bd->bi_size = bio->bi_size; -+ bd->bi_idx = bio->bi_idx; -+ bd->bi_flags = bio->bi_flags; -+} -+ -+static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) -+{ -+ bio->bi_sector = bd->bi_sector; -+ bio->bi_bdev = bd->bi_bdev; -+ bio->bi_size = bd->bi_size; -+ bio->bi_idx = bd->bi_idx; -+ bio->bi_flags = bd->bi_flags; -+} -+ -+#endif -diff -pruN ./drivers/md.dm/dm.c ./drivers/md/dm.c ---- ./drivers/md.dm/dm.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm.c 2006-03-17 13:16:38.000000000 +0300 -@@ -15,15 +15,13 @@ - #include <linux/buffer_head.h> - #include <linux/mempool.h> - #include <linux/slab.h> -+#include <linux/idr.h> - - static const char *_name = DM_NAME; - - static unsigned int major = 0; - static unsigned int _major = 0; - --static int realloc_minor_bits(unsigned long requested_minor); --static void free_minor_bits(void); -- - /* - * One of these is allocated per bio. - */ -@@ -32,6 +30,7 @@ struct dm_io { - int error; - struct bio *bio; - atomic_t io_count; -+ unsigned long start_time; - }; - - /* -@@ -44,15 +43,23 @@ struct target_io { - union map_info info; - }; - -+union map_info *dm_get_mapinfo(struct bio *bio) -+{ -+ if (bio && bio->bi_private) -+ return &((struct target_io *)bio->bi_private)->info; -+ return NULL; -+} -+ - /* - * Bits for the md->flags field. 
- */ - #define DMF_BLOCK_IO 0 - #define DMF_SUSPENDED 1 --#define DMF_FS_LOCKED 2 -+#define DMF_FROZEN 2 - - struct mapped_device { -- struct rw_semaphore lock; -+ struct rw_semaphore io_lock; -+ struct semaphore suspend_lock; - rwlock_t map_lock; - atomic_t holders; - -@@ -61,6 +68,8 @@ struct mapped_device { - request_queue_t *queue; - struct gendisk *disk; - -+ void *interface_ptr; -+ - /* - * A list of ios that arrived while we were suspended. - */ -@@ -89,6 +98,7 @@ struct mapped_device { - * freeze/thaw support require holding onto a super block - */ - struct super_block *frozen_sb; -+ struct block_device *suspended_bdev; - }; - - #define MIN_IOS 256 -@@ -113,19 +123,11 @@ static int __init local_init(void) - return -ENOMEM; - } - -- r = realloc_minor_bits(1024); -- if (r < 0) { -- kmem_cache_destroy(_tio_cache); -- kmem_cache_destroy(_io_cache); -- return r; -- } -- - _major = major; - r = register_blkdev(_major, _name); - if (r < 0) { - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); -- free_minor_bits(); - return r; - } - -@@ -139,7 +141,6 @@ static void local_exit(void) - { - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); -- free_minor_bits(); - - if (unregister_blkdev(_major, _name) < 0) - DMERR("devfs_unregister_blkdev failed"); -@@ -238,21 +239,53 @@ static inline void free_tio(struct mappe - mempool_free(tio, md->tio_pool); - } - -+static void start_io_acct(struct dm_io *io) -+{ -+ struct mapped_device *md = io->md; -+ -+ io->start_time = jiffies; -+ -+ disk_round_stats(dm_disk(md)); -+ dm_disk(md)->in_flight = atomic_inc_return(&md->pending); -+} -+ -+static int end_io_acct(struct dm_io *io) -+{ -+ struct mapped_device *md = io->md; -+ struct bio *bio = io->bio; -+ unsigned long duration = jiffies - io->start_time; -+ int pending; -+ -+ disk_round_stats(dm_disk(md)); -+ dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); -+ -+ switch (bio_data_dir(bio)) { -+ case WRITE: -+ disk_stat_add(dm_disk(md), write_ticks, duration); -+ break; -+ case READ: -+ disk_stat_add(dm_disk(md), read_ticks, duration); -+ break; -+ } -+ -+ return !pending; -+} -+ - /* - * Add the bio to the list of deferred io. 
- */ - static int queue_io(struct mapped_device *md, struct bio *bio) - { -- down_write(&md->lock); -+ down_write(&md->io_lock); - - if (!test_bit(DMF_BLOCK_IO, &md->flags)) { -- up_write(&md->lock); -+ up_write(&md->io_lock); - return 1; - } - - bio_list_add(&md->deferred, bio); - -- up_write(&md->lock); -+ up_write(&md->io_lock); - return 0; /* deferred successfully */ - } - -@@ -293,7 +326,7 @@ static inline void dec_pending(struct dm - io->error = error; - - if (atomic_dec_and_test(&io->io_count)) { -- if (atomic_dec_and_test(&io->md->pending)) -+ if (end_io_acct(io)) - /* nudge anyone waiting on suspend queue */ - wake_up(&io->md->wait); - -@@ -342,8 +375,8 @@ static sector_t max_io_len(struct mapped - */ - if (ti->split_io) { - sector_t boundary; -- boundary = dm_round_up(offset + 1, ti->split_io) - offset; -- -+ boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) -+ - offset; - if (len > boundary) - len = boundary; - } -@@ -379,7 +412,7 @@ static void __map_bio(struct dm_target * - /* error the io and bail out */ - struct dm_io *io = tio->io; - free_tio(tio->io->md, tio); -- dec_pending(io, -EIO); -+ dec_pending(io, r); - bio_put(clone); - } - } -@@ -542,7 +575,7 @@ static void __split_bio(struct mapped_de - ci.sector_count = bio_sectors(bio); - ci.idx = bio->bi_idx; - -- atomic_inc(&md->pending); -+ start_io_acct(ci.io); - while (ci.sector_count) - __clone_and_map(&ci); - -@@ -563,14 +596,22 @@ static int dm_request(request_queue_t *q - int r; - struct mapped_device *md = q->queuedata; - -- down_read(&md->lock); -+ down_read(&md->io_lock); -+ -+ if (bio_data_dir(bio) == WRITE) { -+ disk_stat_inc(dm_disk(md), writes); -+ disk_stat_add(dm_disk(md), write_sectors, bio_sectors(bio)); -+ } else { -+ disk_stat_inc(dm_disk(md), reads); -+ disk_stat_add(dm_disk(md), read_sectors, bio_sectors(bio)); -+ } - - /* - * If we're suspended we have to queue - * this io for later. - */ - while (test_bit(DMF_BLOCK_IO, &md->flags)) { -- up_read(&md->lock); -+ up_read(&md->io_lock); - - if (bio_rw(bio) == READA) { - bio_io_error(bio, bio->bi_size); -@@ -589,14 +630,29 @@ static int dm_request(request_queue_t *q - * We're in a while loop, because someone could suspend - * before we get to the following read lock. - */ -- down_read(&md->lock); -+ down_read(&md->io_lock); - } - - __split_bio(md, bio); -- up_read(&md->lock); -+ up_read(&md->io_lock); - return 0; - } - -+static int dm_flush_all(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ struct mapped_device *md = q->queuedata; -+ struct dm_table *map = dm_get_table(md); -+ int ret = -ENXIO; -+ -+ if (map) { -+ ret = dm_table_flush_all(map); -+ dm_table_put(map); -+ } -+ -+ return ret; -+} -+ - static void dm_unplug_all(request_queue_t *q) - { - struct mapped_device *md = q->queuedata; -@@ -624,109 +680,86 @@ static int dm_any_congested(void *conges - } - - /*----------------------------------------------------------------- -- * A bitset is used to keep track of allocated minor numbers. -+ * An IDR is used to keep track of allocated minor numbers. 
- *---------------------------------------------------------------*/ - static DECLARE_MUTEX(_minor_lock); --static unsigned long *_minor_bits = NULL; --static unsigned long _max_minors = 0; -- --#define MINORS_SIZE(minors) ((minors / BITS_PER_LONG) * sizeof(unsigned long)) -- --static int realloc_minor_bits(unsigned long requested_minor) --{ -- unsigned long max_minors; -- unsigned long *minor_bits, *tmp; -- -- if (requested_minor < _max_minors) -- return -EINVAL; -- -- /* Round up the requested minor to the next power-of-2. */ -- max_minors = 1 << fls(requested_minor - 1); -- if (max_minors > (1 << MINORBITS)) -- return -EINVAL; -- -- minor_bits = kmalloc(MINORS_SIZE(max_minors), GFP_KERNEL); -- if (!minor_bits) -- return -ENOMEM; -- memset(minor_bits, 0, MINORS_SIZE(max_minors)); -- -- /* Copy the existing bit-set to the new one. */ -- if (_minor_bits) -- memcpy(minor_bits, _minor_bits, MINORS_SIZE(_max_minors)); -- -- tmp = _minor_bits; -- _minor_bits = minor_bits; -- _max_minors = max_minors; -- if (tmp) -- kfree(tmp); -- -- return 0; --} -- --static void free_minor_bits(void) --{ -- down(&_minor_lock); -- kfree(_minor_bits); -- _minor_bits = NULL; -- _max_minors = 0; -- up(&_minor_lock); --} -+static DEFINE_IDR(_minor_idr); - - static void free_minor(unsigned int minor) - { - down(&_minor_lock); -- if (minor < _max_minors) -- clear_bit(minor, _minor_bits); -+ idr_remove(&_minor_idr, minor); - up(&_minor_lock); - } - - /* - * See if the device with a specific minor # is free. - */ --static int specific_minor(unsigned int minor) -+static int specific_minor(struct mapped_device *md, unsigned int minor) - { -- int r = 0; -+ int r, m; - -- if (minor > (1 << MINORBITS)) -+ if (minor >= (1 << MINORBITS)) - return -EINVAL; - - down(&_minor_lock); -- if (minor >= _max_minors) { -- r = realloc_minor_bits(minor); -- if (r) { -- up(&_minor_lock); -- return r; -- } -+ -+ if (idr_find(&_minor_idr, minor)) { -+ r = -EBUSY; -+ goto out; -+ } -+ -+ r = idr_pre_get(&_minor_idr, GFP_KERNEL); -+ if (!r) { -+ r = -ENOMEM; -+ goto out; -+ } -+ -+ r = idr_get_new_above(&_minor_idr, md, minor, &m); -+ if (r) { -+ goto out; - } - -- if (test_and_set_bit(minor, _minor_bits)) -+ if (m != minor) { -+ idr_remove(&_minor_idr, m); - r = -EBUSY; -- up(&_minor_lock); -+ goto out; -+ } - -+out: -+ up(&_minor_lock); - return r; - } - --static int next_free_minor(unsigned int *minor) -+static int next_free_minor(struct mapped_device *md, unsigned int *minor) - { - int r; - unsigned int m; - - down(&_minor_lock); -- m = find_first_zero_bit(_minor_bits, _max_minors); -- if (m >= _max_minors) { -- r = realloc_minor_bits(_max_minors * 2); -- if (r) { -- up(&_minor_lock); -- return r; -- } -- m = find_first_zero_bit(_minor_bits, _max_minors); -+ -+ r = idr_pre_get(&_minor_idr, GFP_KERNEL); -+ if (!r) { -+ r = -ENOMEM; -+ goto out; -+ } -+ -+ r = idr_get_new(&_minor_idr, md, &m); -+ if (r) { -+ goto out; -+ } -+ -+ if (m >= (1 << MINORBITS)) { -+ idr_remove(&_minor_idr, m); -+ r = -ENOSPC; -+ goto out; - } - -- set_bit(m, _minor_bits); - *minor = m; -- up(&_minor_lock); - -- return 0; -+out: -+ up(&_minor_lock); -+ return r; - } - - static struct block_device_operations dm_blk_dops; -@@ -745,12 +778,13 @@ static struct mapped_device *alloc_dev(u - } - - /* get a minor number for the dev */ -- r = persistent ? specific_minor(minor) : next_free_minor(&minor); -+ r = persistent ? 
specific_minor(md, minor) : next_free_minor(md, &minor); - if (r < 0) - goto bad1; - - memset(md, 0, sizeof(*md)); -- init_rwsem(&md->lock); -+ init_rwsem(&md->io_lock); -+ init_MUTEX(&md->suspend_lock); - rwlock_init(&md->map_lock); - atomic_set(&md->holders, 1); - atomic_set(&md->event_nr, 0); -@@ -764,6 +798,7 @@ static struct mapped_device *alloc_dev(u - md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); - md->queue->unplug_fn = dm_unplug_all; -+ md->queue->issue_flush_fn = dm_flush_all; - - md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, - mempool_free_slab, _io_cache); -@@ -823,22 +858,17 @@ static void event_callback(void *context - { - struct mapped_device *md = (struct mapped_device *) context; - -- atomic_inc(&md->event_nr);; -+ atomic_inc(&md->event_nr); - wake_up(&md->eventq); - } - --static void __set_size(struct gendisk *disk, sector_t size) -+static void __set_size(struct mapped_device *md, sector_t size) - { -- struct block_device *bdev; -+ set_capacity(md->disk, size); - -- set_capacity(disk, size); -- bdev = bdget_disk(disk, 0); -- if (bdev) { -- down(&bdev->bd_inode->i_sem); -- i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); -- up(&bdev->bd_inode->i_sem); -- bdput(bdev); -- } -+ down(&md->suspended_bdev->bd_inode->i_sem); -+ i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); -+ up(&md->suspended_bdev->bd_inode->i_sem); - } - - static int __bind(struct mapped_device *md, struct dm_table *t) -@@ -847,17 +877,18 @@ static int __bind(struct mapped_device * - sector_t size; - - size = dm_table_get_size(t); -- __set_size(md->disk, size); -+ __set_size(md, size); - if (size == 0) - return 0; - -+ dm_table_get(t); -+ dm_table_event_callback(t, event_callback, md); -+ - write_lock(&md->map_lock); - md->map = t; -+ dm_table_set_restrictions(t, q); - write_unlock(&md->map_lock); - -- dm_table_get(t); -- dm_table_event_callback(md->map, event_callback, md); -- dm_table_set_restrictions(t, q); - return 0; - } - -@@ -901,6 +932,32 @@ int dm_create_with_minor(unsigned int mi - return create_aux(minor, 1, result); - } - -+void *dm_get_mdptr(dev_t dev) -+{ -+ struct mapped_device *md; -+ void *mdptr = NULL; -+ unsigned minor = MINOR(dev); -+ -+ if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) -+ return NULL; -+ -+ down(&_minor_lock); -+ -+ md = idr_find(&_minor_idr, minor); -+ -+ if (md && (dm_disk(md)->first_minor == minor)) -+ mdptr = md->interface_ptr; -+ -+ up(&_minor_lock); -+ -+ return mdptr; -+} -+ -+void dm_set_mdptr(struct mapped_device *md, void *ptr) -+{ -+ md->interface_ptr = ptr; -+} -+ - void dm_get(struct mapped_device *md) - { - atomic_inc(&md->holders); -@@ -911,8 +968,10 @@ void dm_put(struct mapped_device *md) - struct dm_table *map = dm_get_table(md); - - if (atomic_dec_and_test(&md->holders)) { -- if (!test_bit(DMF_SUSPENDED, &md->flags) && map) -- dm_table_suspend_targets(map); -+ if (!dm_suspended(md)) { -+ dm_table_presuspend_targets(map); -+ dm_table_postsuspend_targets(map); -+ } - __unbind(md); - free_dev(md); - } -@@ -940,69 +999,55 @@ static void __flush_deferred_io(struct m - */ - int dm_swap_table(struct mapped_device *md, struct dm_table *table) - { -- int r; -+ int r = -EINVAL; - -- down_write(&md->lock); -+ down(&md->suspend_lock); - - /* device must be suspended */ -- if (!test_bit(DMF_SUSPENDED, &md->flags)) { -- up_write(&md->lock); -- return -EPERM; -- } -+ if (!dm_suspended(md)) -+ goto out; - - __unbind(md); - r = __bind(md, table); -- if (r) -- return 
r; - -- up_write(&md->lock); -- return 0; -+out: -+ up(&md->suspend_lock); -+ return r; - } - - /* - * Functions to lock and unlock any filesystem running on the - * device. - */ --static int __lock_fs(struct mapped_device *md) -+static int lock_fs(struct mapped_device *md) - { -- struct block_device *bdev; -+ int r; - -- if (test_and_set_bit(DMF_FS_LOCKED, &md->flags)) -- return 0; -+ WARN_ON(md->frozen_sb); - -- bdev = bdget_disk(md->disk, 0); -- if (!bdev) { -- DMWARN("bdget failed in __lock_fs"); -- return -ENOMEM; -+ md->frozen_sb = freeze_bdev(md->suspended_bdev); -+ if (IS_ERR(md->frozen_sb)) { -+ r = PTR_ERR(md->frozen_sb); -+ md->frozen_sb = NULL; -+ return r; - } - -- WARN_ON(md->frozen_sb); -- md->frozen_sb = freeze_bdev(bdev); -+ set_bit(DMF_FROZEN, &md->flags); -+ - /* don't bdput right now, we don't want the bdev -- * to go away while it is locked. We'll bdput -- * in __unlock_fs -+ * to go away while it is locked. - */ - return 0; - } - --static int __unlock_fs(struct mapped_device *md) -+static void unlock_fs(struct mapped_device *md) - { -- struct block_device *bdev; -- -- if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags)) -- return 0; -- -- bdev = bdget_disk(md->disk, 0); -- if (!bdev) { -- DMWARN("bdget failed in __unlock_fs"); -- return -ENOMEM; -- } -+ if (!test_bit(DMF_FROZEN, &md->flags)) -+ return; - -- thaw_bdev(bdev, md->frozen_sb); -+ thaw_bdev(md->suspended_bdev, md->frozen_sb); - md->frozen_sb = NULL; -- bdput(bdev); -- bdput(bdev); -- return 0; -+ clear_bit(DMF_FROZEN, &md->flags); - } - - /* -@@ -1012,46 +1057,48 @@ static int __unlock_fs(struct mapped_dev - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ --int dm_suspend(struct mapped_device *md) -+int dm_suspend(struct mapped_device *md, int do_lockfs) - { -- struct dm_table *map; -+ struct dm_table *map = NULL; - DECLARE_WAITQUEUE(wait, current); -+ int r = -EINVAL; - -- /* Flush I/O to the device. */ -- down_read(&md->lock); -- if (test_bit(DMF_BLOCK_IO, &md->flags)) { -- up_read(&md->lock); -- return -EINVAL; -+ down(&md->suspend_lock); -+ -+ if (dm_suspended(md)) -+ goto out; -+ -+ map = dm_get_table(md); -+ -+ /* This does not get reverted if there's an error later. */ -+ dm_table_presuspend_targets(map); -+ -+ md->suspended_bdev = bdget_disk(md->disk, 0); -+ if (!md->suspended_bdev) { -+ DMWARN("bdget failed in dm_suspend"); -+ r = -ENOMEM; -+ goto out; - } - -- __lock_fs(md); -- up_read(&md->lock); -+ /* Flush I/O to the device. */ -+ if (do_lockfs) { -+ r = lock_fs(md); -+ if (r) -+ goto out; -+ } - - /* -- * First we set the BLOCK_IO flag so no more ios will be -- * mapped. -+ * First we set the BLOCK_IO flag so no more ios will be mapped. - */ -- down_write(&md->lock); -- if (test_bit(DMF_BLOCK_IO, &md->flags)) { -- /* -- * If we get here we know another thread is -- * trying to suspend as well, so we leave the fs -- * locked for this thread. 
-- */ -- up_write(&md->lock); -- return -EINVAL; -- } -- -+ down_write(&md->io_lock); - set_bit(DMF_BLOCK_IO, &md->flags); -+ - add_wait_queue(&md->wait, &wait); -- up_write(&md->lock); -+ up_write(&md->io_lock); - - /* unplug */ -- map = dm_get_table(md); -- if (map) { -+ if (map) - dm_table_unplug_all(map); -- dm_table_put(map); -- } - - /* - * Then we wait for the already mapped ios to -@@ -1067,54 +1114,75 @@ int dm_suspend(struct mapped_device *md) - } - set_current_state(TASK_RUNNING); - -- down_write(&md->lock); -+ down_write(&md->io_lock); - remove_wait_queue(&md->wait, &wait); - - /* were we interrupted ? */ -+ r = -EINTR; - if (atomic_read(&md->pending)) { -- __unlock_fs(md); -+ up_write(&md->io_lock); -+ unlock_fs(md); - clear_bit(DMF_BLOCK_IO, &md->flags); -- up_write(&md->lock); -- return -EINTR; -+ goto out; - } -+ up_write(&md->io_lock); -+ -+ dm_table_postsuspend_targets(map); - - set_bit(DMF_SUSPENDED, &md->flags); - -- map = dm_get_table(md); -- if (map) -- dm_table_suspend_targets(map); -- dm_table_put(map); -- up_write(&md->lock); -+ r = 0; - -- return 0; -+out: -+ if (r && md->suspended_bdev) { -+ bdput(md->suspended_bdev); -+ md->suspended_bdev = NULL; -+ } -+ -+ dm_table_put(map); -+ up(&md->suspend_lock); -+ return r; - } - - int dm_resume(struct mapped_device *md) - { -+ int r = -EINVAL; - struct bio *def; -- struct dm_table *map = dm_get_table(md); -+ struct dm_table *map = NULL; - -- down_write(&md->lock); -- if (!map || -- !test_bit(DMF_SUSPENDED, &md->flags) || -- !dm_table_get_size(map)) { -- up_write(&md->lock); -- dm_table_put(map); -- return -EINVAL; -- } -+ down(&md->suspend_lock); -+ if (!dm_suspended(md)) -+ goto out; -+ -+ map = dm_get_table(md); -+ if (!map || !dm_table_get_size(map)) -+ goto out; - - dm_table_resume_targets(map); -- clear_bit(DMF_SUSPENDED, &md->flags); -+ -+ down_write(&md->io_lock); - clear_bit(DMF_BLOCK_IO, &md->flags); - - def = bio_list_get(&md->deferred); - __flush_deferred_io(md, def); -- up_write(&md->lock); -- __unlock_fs(md); -+ up_write(&md->io_lock); -+ -+ unlock_fs(md); -+ -+ bdput(md->suspended_bdev); -+ md->suspended_bdev = NULL; -+ -+ clear_bit(DMF_SUSPENDED, &md->flags); -+ - dm_table_unplug_all(map); -+ -+ r = 0; -+ -+out: - dm_table_put(map); -+ up(&md->suspend_lock); - -- return 0; -+ return r; - } - - /*----------------------------------------------------------------- -@@ -1151,6 +1219,8 @@ static struct block_device_operations dm - .owner = THIS_MODULE - }; - -+EXPORT_SYMBOL(dm_get_mapinfo); -+ - /* - * module hooks - */ -@@ -1160,5 +1230,5 @@ module_exit(dm_exit); - module_param(major, uint, 0); - MODULE_PARM_DESC(major, "The major number of the device mapper"); - MODULE_DESCRIPTION(DM_NAME " driver"); --MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>"); -+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); - MODULE_LICENSE("GPL"); -diff -pruN ./drivers/md.dm/dm-crypt.c ./drivers/md/dm-crypt.c ---- ./drivers/md.dm/dm-crypt.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-crypt.c 2006-03-17 13:16:38.000000000 +0300 -@@ -40,8 +40,8 @@ struct convert_context { - struct bio *bio_out; - unsigned int offset_in; - unsigned int offset_out; -- int idx_in; -- int idx_out; -+ unsigned int idx_in; -+ unsigned int idx_out; - sector_t sector; - int write; - }; -@@ -67,8 +67,8 @@ struct crypt_config { - struct crypto_tfm *tfm; - sector_t iv_offset; - int (*iv_generator)(struct crypt_config *cc, u8 *iv, sector_t sector); -- int iv_size; -- int key_size; -+ unsigned int iv_size; -+ unsigned int key_size; - u8 
key[0]; - }; - -@@ -97,10 +97,8 @@ static void mempool_free_page(void *page - */ - static int crypt_iv_plain(struct crypt_config *cc, u8 *iv, sector_t sector) - { -+ memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); -- if (cc->iv_size > sizeof(u32) / sizeof(u8)) -- memset(iv + (sizeof(u32) / sizeof(u8)), 0, -- cc->iv_size - (sizeof(u32) / sizeof(u8))); - - return 0; - } -@@ -200,13 +198,13 @@ static int crypt_convert(struct crypt_co - */ - static struct bio * - crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, -- struct bio *base_bio, int *bio_vec_idx) -+ struct bio *base_bio, unsigned int *bio_vec_idx) - { - struct bio *bio; -- int nr_iovecs = dm_div_up(size, PAGE_SIZE); -+ unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - int gfp_mask = GFP_NOIO | __GFP_HIGHMEM; -- int flags = current->flags; -- int i; -+ unsigned long flags = current->flags; -+ unsigned int i; - - /* - * Tell VM to act less aggressively and fail earlier. -@@ -280,9 +278,8 @@ crypt_alloc_buffer(struct crypt_config * - static void crypt_free_buffer_pages(struct crypt_config *cc, - struct bio *bio, unsigned int bytes) - { -- unsigned int start, end; -+ unsigned int i, start, end; - struct bio_vec *bv; -- int i; - - /* - * This is ugly, but Jens Axboe thinks that using bi_idx in the -@@ -366,11 +363,11 @@ static void kcryptd_queue_io(struct cryp - /* - * Decode key from its hex representation - */ --static int crypt_decode_key(u8 *key, char *hex, int size) -+static int crypt_decode_key(u8 *key, char *hex, unsigned int size) - { - char buffer[3]; - char *endp; -- int i; -+ unsigned int i; - - buffer[2] = '\0'; - -@@ -393,9 +390,9 @@ static int crypt_decode_key(u8 *key, cha - /* - * Encode key into its hex representation - */ --static void crypt_encode_key(char *hex, u8 *key, int size) -+static void crypt_encode_key(char *hex, u8 *key, unsigned int size) - { -- int i; -+ unsigned int i; - - for(i = 0; i < size; i++) { - sprintf(hex, "%02x", *key); -@@ -415,8 +412,8 @@ static int crypt_ctr(struct dm_target *t - char *tmp; - char *cipher; - char *mode; -- int crypto_flags; -- int key_size; -+ unsigned int crypto_flags; -+ unsigned int key_size; - - if (argc != 5) { - ti->error = PFX "Not enough arguments"; -@@ -464,9 +461,9 @@ static int crypt_ctr(struct dm_target *t - } - - if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) -- /* at least a 32 bit sector number should fit in our buffer */ -+ /* at least a 64 bit sector number should fit in our buffer */ - cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), -- (unsigned int)(sizeof(u32) / sizeof(u8))); -+ (unsigned int)(sizeof(u64) / sizeof(u8))); - else { - cc->iv_size = 0; - if (cc->iv_generator) { -@@ -528,6 +525,8 @@ bad3: - bad2: - crypto_free_tfm(tfm); - bad1: -+ /* Must zero key material before freeing */ -+ memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); - return -EINVAL; - } -@@ -541,6 +540,9 @@ static void crypt_dtr(struct dm_target * - - crypto_free_tfm(cc->tfm); - dm_put_device(ti, cc->dev); -+ -+ /* Must zero key material before freeing */ -+ memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); - } - -@@ -577,7 +579,8 @@ static int crypt_endio(struct bio *bio, - - static inline struct bio * - crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, -- sector_t sector, int *bvec_idx, struct convert_context *ctx) -+ sector_t sector, unsigned int *bvec_idx, -+ struct convert_context *ctx) - { - struct bio *clone; - -@@ -630,7 +633,7 @@ static int 
crypt_map(struct dm_target *t - struct bio *clone; - unsigned int remaining = bio->bi_size; - sector_t sector = bio->bi_sector - ti->begin; -- int bvec_idx = 0; -+ unsigned int bvec_idx = 0; - - io->target = ti; - io->bio = bio; -@@ -693,7 +696,7 @@ static int crypt_status(struct dm_target - char buffer[32]; - const char *cipher; - const char *mode = NULL; -- int offset; -+ unsigned int offset; - - switch (type) { - case STATUSTYPE_INFO: -diff -pruN ./drivers/md.dm/dm-emc.c ./drivers/md/dm-emc.c ---- ./drivers/md.dm/dm-emc.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-emc.c 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,359 @@ -+/* -+ * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved. -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ * -+ * Multipath support for EMC CLARiiON AX/CX-series hardware. -+ */ -+ -+#include "dm.h" -+#include "dm-hw-handler.h" -+#include <scsi/scsi.h> -+#include <scsi/scsi_cmnd.h> -+ -+struct emc_handler { -+ spinlock_t lock; -+ -+ /* Whether we should send the short trespass command (FC-series) -+ * or the long version (default for AX/CX CLARiiON arrays). */ -+ unsigned short_trespass; -+ /* Whether or not to honor SCSI reservations when initiating a -+ * switch-over. Default: Don't. */ -+ unsigned hr; -+ -+ unsigned char sense[SCSI_SENSE_BUFFERSIZE]; -+}; -+ -+#define TRESPASS_PAGE 0x22 -+#define EMC_FAILOVER_TIMEOUT (60 * HZ) -+ -+/* Code borrowed from dm-lsi-rdac by Mike Christie */ -+ -+static inline void free_bio(struct bio *bio) -+{ -+ __free_page(bio->bi_io_vec[0].bv_page); -+ bio_put(bio); -+} -+ -+static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) -+{ -+ struct path *path = bio->bi_private; -+ -+ if (bio->bi_size) -+ return 1; -+ -+ /* We also need to look at the sense keys here whether or not to -+ * switch to the next PG etc. -+ * -+ * For now simple logic: either it works or it doesn't. -+ */ -+ if (error) -+ dm_pg_init_complete(path, MP_FAIL_PATH); -+ else -+ dm_pg_init_complete(path, 0); -+ -+ /* request is freed in block layer */ -+ free_bio(bio); -+ -+ return 0; -+} -+ -+static struct bio *get_failover_bio(struct path *path, unsigned data_size) -+{ -+ struct bio *bio; -+ struct page *page; -+ -+ bio = bio_alloc(GFP_ATOMIC, 1); -+ if (!bio) { -+ DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); -+ return NULL; -+ } -+ -+ bio->bi_rw |= (1 << BIO_RW); -+ bio->bi_bdev = path->dev->bdev; -+ bio->bi_sector = 0; -+ bio->bi_private = path; -+ bio->bi_end_io = emc_endio; -+ -+ page = alloc_page(GFP_ATOMIC); -+ if (!page) { -+ DMERR("dm-emc: get_failover_bio: alloc_page() failed."); -+ bio_put(bio); -+ return NULL; -+ } -+ -+ if (bio_add_page(bio, page, data_size, 0) != data_size) { -+ DMERR("dm-emc: get_failover_bio: alloc_page() failed."); -+ __free_page(page); -+ bio_put(bio); -+ return NULL; -+ } -+ -+ return bio; -+} -+ -+static struct request *get_failover_req(struct emc_handler *h, -+ struct bio *bio, struct path *path) -+{ -+ struct request *rq; -+ struct block_device *bdev = bio->bi_bdev; -+ struct request_queue *q = bdev_get_queue(bdev); -+ -+ /* FIXME: Figure out why it fails with GFP_ATOMIC. 
*/ -+ rq = blk_get_request(q, WRITE, __GFP_WAIT); -+ if (!rq) { -+ DMERR("dm-emc: get_failover_req: blk_get_request failed"); -+ return NULL; -+ } -+ -+ rq->bio = rq->biotail = bio; -+ blk_rq_bio_prep(q, rq, bio); -+ -+ rq->rq_disk = bdev->bd_contains->bd_disk; -+ -+ /* bio backed don't set data */ -+ rq->buffer = rq->data = NULL; -+ /* rq data_len used for pc cmd's request_bufflen */ -+ rq->data_len = bio->bi_size; -+ -+ rq->sense = h->sense; -+ memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); -+ rq->sense_len = 0; -+ -+ memset(&rq->cmd, 0, BLK_MAX_CDB); -+ -+ rq->timeout = EMC_FAILOVER_TIMEOUT; -+ rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); -+ -+ return rq; -+} -+ -+static struct request *emc_trespass_get(struct emc_handler *h, -+ struct path *path) -+{ -+ struct bio *bio; -+ struct request *rq; -+ unsigned char *page22; -+ unsigned char long_trespass_pg[] = { -+ 0, 0, 0, 0, -+ TRESPASS_PAGE, /* Page code */ -+ 0x09, /* Page length - 2 */ -+ h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ -+ 0xff, 0xff, /* Trespass target */ -+ 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */ -+ }; -+ unsigned char short_trespass_pg[] = { -+ 0, 0, 0, 0, -+ TRESPASS_PAGE, /* Page code */ -+ 0x02, /* Page length - 2 */ -+ h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ -+ 0xff, /* Trespass target */ -+ }; -+ unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) : -+ sizeof(long_trespass_pg); -+ -+ /* get bio backing */ -+ if (data_size > PAGE_SIZE) -+ /* this should never happen */ -+ return NULL; -+ -+ bio = get_failover_bio(path, data_size); -+ if (!bio) { -+ DMERR("dm-emc: emc_trespass_get: no bio"); -+ return NULL; -+ } -+ -+ page22 = (unsigned char *)bio_data(bio); -+ memset(page22, 0, data_size); -+ -+ memcpy(page22, h->short_trespass ? -+ short_trespass_pg : long_trespass_pg, data_size); -+ -+ /* get request for block layer packet command */ -+ rq = get_failover_req(h, bio, path); -+ if (!rq) { -+ DMERR("dm-emc: emc_trespass_get: no rq"); -+ free_bio(bio); -+ return NULL; -+ } -+ -+ /* Prepare the command. */ -+ rq->cmd[0] = MODE_SELECT; -+ rq->cmd[1] = 0x10; -+ rq->cmd[4] = data_size; -+ rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); -+ -+ return rq; -+} -+ -+static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, -+ struct path *path) -+{ -+ struct request *rq; -+ struct request_queue *q = bdev_get_queue(path->dev->bdev); -+ -+ /* -+ * We can either blindly init the pg (then look at the sense), -+ * or we can send some commands to get the state here (then -+ * possibly send the fo cmnd), or we can also have the -+ * initial state passed into us and then get an update here. -+ */ -+ if (!q) { -+ DMINFO("dm-emc: emc_pg_init: no queue"); -+ goto fail_path; -+ } -+ -+ /* FIXME: The request should be pre-allocated. 
*/ -+ rq = emc_trespass_get(hwh->context, path); -+ if (!rq) { -+ DMERR("dm-emc: emc_pg_init: no rq"); -+ goto fail_path; -+ } -+ -+ DMINFO("dm-emc: emc_pg_init: sending switch-over command"); -+ elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); -+ return; -+ -+fail_path: -+ dm_pg_init_complete(path, MP_FAIL_PATH); -+} -+ -+static struct emc_handler *alloc_emc_handler(void) -+{ -+ struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL); -+ -+ if (h) { -+ memset(h, 0, sizeof(*h)); -+ spin_lock_init(&h->lock); -+ } -+ -+ return h; -+} -+ -+static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) -+{ -+ struct emc_handler *h; -+ unsigned hr, short_trespass; -+ -+ if (argc == 0) { -+ /* No arguments: use defaults */ -+ hr = 0; -+ short_trespass = 0; -+ } else if (argc != 2) { -+ DMWARN("dm-emc hwhandler: incorrect number of arguments"); -+ return -EINVAL; -+ } else { -+ if ((sscanf(argv[0], "%u", &short_trespass) != 1) -+ || (short_trespass > 1)) { -+ DMWARN("dm-emc: invalid trespass mode selected"); -+ return -EINVAL; -+ } -+ -+ if ((sscanf(argv[1], "%u", &hr) != 1) -+ || (hr > 1)) { -+ DMWARN("dm-emc: invalid honor reservation flag selected"); -+ return -EINVAL; -+ } -+ } -+ -+ h = alloc_emc_handler(); -+ if (!h) -+ return -ENOMEM; -+ -+ hwh->context = h; -+ -+ if ((h->short_trespass = short_trespass)) -+ DMWARN("dm-emc: short trespass command will be send"); -+ else -+ DMWARN("dm-emc: long trespass command will be send"); -+ -+ if ((h->hr = hr)) -+ DMWARN("dm-emc: honor reservation bit will be set"); -+ else -+ DMWARN("dm-emc: honor reservation bit will not be set (default)"); -+ -+ return 0; -+} -+ -+static void emc_destroy(struct hw_handler *hwh) -+{ -+ struct emc_handler *h = (struct emc_handler *) hwh->context; -+ -+ kfree(h); -+ hwh->context = NULL; -+} -+ -+static unsigned emc_error(struct hw_handler *hwh, struct bio *bio) -+{ -+ /* FIXME: Patch from axboe still missing */ -+#if 0 -+ int sense; -+ -+ if (bio->bi_error & BIO_SENSE) { -+ sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */ -+ -+ if (sense == 0x020403) { -+ /* LUN Not Ready - Manual Intervention Required -+ * indicates this is a passive path. -+ * -+ * FIXME: However, if this is seen and EVPD C0 -+ * indicates that this is due to a NDU in -+ * progress, we should set FAIL_PATH too. -+ * This indicates we might have to do a SCSI -+ * inquiry in the end_io path. Ugh. */ -+ return MP_BYPASS_PG | MP_RETRY_IO; -+ } else if (sense == 0x052501) { -+ /* An array based copy is in progress. Do not -+ * fail the path, do not bypass to another PG, -+ * do not retry. Fail the IO immediately. -+ * (Actually this is the same conclusion as in -+ * the default handler, but lets make sure.) */ -+ return 0; -+ } else if (sense == 0x062900) { -+ /* Unit Attention Code. This is the first IO -+ * to the new path, so just retry. 
*/ -+ return MP_RETRY_IO; -+ } -+ } -+#endif -+ -+ /* Try default handler */ -+ return dm_scsi_err_handler(hwh, bio); -+} -+ -+static struct hw_handler_type emc_hwh = { -+ .name = "emc", -+ .module = THIS_MODULE, -+ .create = emc_create, -+ .destroy = emc_destroy, -+ .pg_init = emc_pg_init, -+ .error = emc_error, -+}; -+ -+static int __init dm_emc_init(void) -+{ -+ int r = dm_register_hw_handler(&emc_hwh); -+ -+ if (r < 0) -+ DMERR("emc: register failed %d", r); -+ -+ DMINFO("dm-emc version 0.0.3 loaded"); -+ -+ return r; -+} -+ -+static void __exit dm_emc_exit(void) -+{ -+ int r = dm_unregister_hw_handler(&emc_hwh); -+ -+ if (r < 0) -+ DMERR("emc: unregister failed %d", r); -+} -+ -+module_init(dm_emc_init); -+module_exit(dm_emc_exit); -+ -+MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath"); -+MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>"); -+MODULE_LICENSE("GPL"); -diff -pruN ./drivers/md.dm/dm.h ./drivers/md/dm.h ---- ./drivers/md.dm/dm.h 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm.h 2006-03-17 13:16:38.000000000 +0300 -@@ -19,6 +19,9 @@ - #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) - #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) - -+#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ -+ 0 : scnprintf(result + sz, maxlen - sz, x)) -+ - /* - * FIXME: I think this should be with the definition of sector_t - * in types.h. -@@ -40,6 +43,7 @@ struct dm_dev { - atomic_t count; - int mode; - struct block_device *bdev; -+ char name[16]; - }; - - struct dm_table; -@@ -51,6 +55,8 @@ struct mapped_device; - *---------------------------------------------------------------*/ - int dm_create(struct mapped_device **md); - int dm_create_with_minor(unsigned int minor, struct mapped_device **md); -+void dm_set_mdptr(struct mapped_device *md, void *ptr); -+void *dm_get_mdptr(dev_t dev); - - /* - * Reference counting for md. -@@ -61,7 +67,7 @@ void dm_put(struct mapped_device *md); - /* - * A device can still be used while suspended, but I/O is deferred. - */ --int dm_suspend(struct mapped_device *md); -+int dm_suspend(struct mapped_device *md, int with_lockfs); - int dm_resume(struct mapped_device *md); - - /* -@@ -109,10 +115,12 @@ void dm_table_set_restrictions(struct dm - unsigned int dm_table_get_num_targets(struct dm_table *t); - struct list_head *dm_table_get_devices(struct dm_table *t); - int dm_table_get_mode(struct dm_table *t); --void dm_table_suspend_targets(struct dm_table *t); -+void dm_table_presuspend_targets(struct dm_table *t); -+void dm_table_postsuspend_targets(struct dm_table *t); - void dm_table_resume_targets(struct dm_table *t); - int dm_table_any_congested(struct dm_table *t, int bdi_bits); - void dm_table_unplug_all(struct dm_table *t); -+int dm_table_flush_all(struct dm_table *t); - - /*----------------------------------------------------------------- - * A registry of target types. -@@ -135,21 +143,22 @@ static inline int array_too_big(unsigned - } - - /* -- * ceiling(n / size) * size -+ * Ceiling(n / sz) - */ --static inline unsigned long dm_round_up(unsigned long n, unsigned long size) --{ -- unsigned long r = n % size; -- return n + (r ? 
(size - r) : 0); --} -+#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz)) -+ -+#define dm_sector_div_up(n, sz) ( \ -+{ \ -+ sector_t _r = ((n) + (sz) - 1); \ -+ sector_div(_r, (sz)); \ -+ _r; \ -+} \ -+) - - /* -- * Ceiling(n / size) -+ * ceiling(n / size) * size - */ --static inline unsigned long dm_div_up(unsigned long n, unsigned long size) --{ -- return dm_round_up(n, size) / size; --} -+#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) - - static inline sector_t to_sector(unsigned long n) - { -@@ -161,6 +170,8 @@ static inline unsigned long to_bytes(sec - return (n << 9); - } - -+int dm_split_args(int *argc, char ***argvp, char *input); -+ - /* - * The device-mapper can be driven through one of two interfaces; - * ioctl or filesystem, depending which patch you have applied. -@@ -178,5 +189,6 @@ int dm_stripe_init(void); - void dm_stripe_exit(void); - - void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); -+union map_info *dm_get_mapinfo(struct bio *bio); - - #endif -diff -pruN ./drivers/md.dm/dm-hw-handler.c ./drivers/md/dm-hw-handler.c ---- ./drivers/md.dm/dm-hw-handler.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-hw-handler.c 2006-03-20 09:38:13.000000000 +0300 -@@ -0,0 +1,216 @@ -+/* -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ * -+ * Multipath hardware handler registration. -+ */ -+ -+#include "dm.h" -+#include "dm-hw-handler.h" -+ -+#include <linux/slab.h> -+ -+struct hwh_internal { -+ struct hw_handler_type hwht; -+ -+ struct list_head list; -+ long use; -+}; -+ -+#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht) -+ -+static LIST_HEAD(_hw_handlers); -+static DECLARE_RWSEM(_hwh_lock); -+ -+struct hwh_internal *__find_hw_handler_type(const char *name) -+{ -+ struct hwh_internal *hwhi; -+ -+ list_for_each_entry(hwhi, &_hw_handlers, list) { -+ if (!strcmp(name, hwhi->hwht.name)) -+ return hwhi; -+ } -+ -+ return NULL; -+} -+ -+static struct hwh_internal *get_hw_handler(const char *name) -+{ -+ struct hwh_internal *hwhi; -+ -+ down_read(&_hwh_lock); -+ hwhi = __find_hw_handler_type(name); -+ if (hwhi) { -+ if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module)) -+ hwhi = NULL; -+ else -+ hwhi->use++; -+ } -+ up_read(&_hwh_lock); -+ -+ return hwhi; -+} -+ -+struct hw_handler_type *dm_get_hw_handler(const char *name) -+{ -+ struct hwh_internal *hwhi; -+ -+ if (!name) -+ return NULL; -+ -+ hwhi = get_hw_handler(name); -+ if (!hwhi) { -+ request_module("dm-%s", name); -+ hwhi = get_hw_handler(name); -+ } -+ -+ return hwhi ? 
&hwhi->hwht : NULL; -+} -+ -+void dm_put_hw_handler(struct hw_handler_type *hwht) -+{ -+ struct hwh_internal *hwhi; -+ -+ if (!hwht) -+ return; -+ -+ down_read(&_hwh_lock); -+ hwhi = __find_hw_handler_type(hwht->name); -+ if (!hwhi) -+ goto out; -+ -+ if (--hwhi->use == 0) -+ module_put(hwhi->hwht.module); -+ -+ if (hwhi->use < 0) -+ BUG(); -+ -+ out: -+ up_read(&_hwh_lock); -+} -+ -+static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht) -+{ -+ struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL); -+ -+ if (hwhi) { -+ memset(hwhi, 0, sizeof(*hwhi)); -+ hwhi->hwht = *hwht; -+ } -+ -+ return hwhi; -+} -+ -+int dm_register_hw_handler(struct hw_handler_type *hwht) -+{ -+ int r = 0; -+ struct hwh_internal *hwhi = _alloc_hw_handler(hwht); -+ -+ if (!hwhi) -+ return -ENOMEM; -+ -+ down_write(&_hwh_lock); -+ -+ if (__find_hw_handler_type(hwht->name)) { -+ kfree(hwhi); -+ r = -EEXIST; -+ } else -+ list_add(&hwhi->list, &_hw_handlers); -+ -+ up_write(&_hwh_lock); -+ -+ return r; -+} -+ -+int dm_unregister_hw_handler(struct hw_handler_type *hwht) -+{ -+ struct hwh_internal *hwhi; -+ -+ down_write(&_hwh_lock); -+ -+ hwhi = __find_hw_handler_type(hwht->name); -+ if (!hwhi) { -+ up_write(&_hwh_lock); -+ return -EINVAL; -+ } -+ -+ if (hwhi->use) { -+ up_write(&_hwh_lock); -+ return -ETXTBSY; -+ } -+ -+ list_del(&hwhi->list); -+ -+ up_write(&_hwh_lock); -+ -+ kfree(hwhi); -+ -+ return 0; -+} -+ -+unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio) -+{ -+#if 0 -+ int sense_key, asc, ascq; -+ -+ if (bio->bi_error & BIO_SENSE) { -+ /* FIXME: This is just an initial guess. */ -+ /* key / asc / ascq */ -+ sense_key = (bio->bi_error >> 16) & 0xff; -+ asc = (bio->bi_error >> 8) & 0xff; -+ ascq = bio->bi_error & 0xff; -+ -+ switch (sense_key) { -+ /* This block as a whole comes from the device. -+ * So no point retrying on another path. */ -+ case 0x03: /* Medium error */ -+ case 0x05: /* Illegal request */ -+ case 0x07: /* Data protect */ -+ case 0x08: /* Blank check */ -+ case 0x0a: /* copy aborted */ -+ case 0x0c: /* obsolete - no clue ;-) */ -+ case 0x0d: /* volume overflow */ -+ case 0x0e: /* data miscompare */ -+ case 0x0f: /* reserved - no idea either. */ -+ return MP_ERROR_IO; -+ -+ /* For these errors it's unclear whether they -+ * come from the device or the controller. -+ * So just lets try a different path, and if -+ * it eventually succeeds, user-space will clear -+ * the paths again... */ -+ case 0x02: /* Not ready */ -+ case 0x04: /* Hardware error */ -+ case 0x09: /* vendor specific */ -+ case 0x0b: /* Aborted command */ -+ return MP_FAIL_PATH; -+ -+ case 0x06: /* Unit attention - might want to decode */ -+ if (asc == 0x04 && ascq == 0x01) -+ /* "Unit in the process of -+ * becoming ready" */ -+ return 0; -+ return MP_FAIL_PATH; -+ -+ /* FIXME: For Unit Not Ready we may want -+ * to have a generic pg activation -+ * feature (START_UNIT). */ -+ -+ /* Should these two ever end up in the -+ * error path? I don't think so. */ -+ case 0x00: /* No sense */ -+ case 0x01: /* Recovered error */ -+ return 0; -+ } -+ } -+#endif -+ -+ /* We got no idea how to decode the other kinds of errors -> -+ * assume generic error condition. 
*/ -+ return MP_FAIL_PATH; -+} -+ -+EXPORT_SYMBOL_GPL(dm_register_hw_handler); -+EXPORT_SYMBOL_GPL(dm_unregister_hw_handler); -+EXPORT_SYMBOL_GPL(dm_scsi_err_handler); -diff -pruN ./drivers/md.dm/dm-hw-handler.h ./drivers/md/dm-hw-handler.h ---- ./drivers/md.dm/dm-hw-handler.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-hw-handler.h 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,61 @@ -+/* -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ * -+ * Multipath hardware handler registration. -+ */ -+ -+#ifndef DM_HW_HANDLER_H -+#define DM_HW_HANDLER_H -+ -+#include <linux/device-mapper.h> -+ -+#include "dm-mpath.h" -+ -+struct hw_handler_type; -+struct hw_handler { -+ struct hw_handler_type *type; -+ void *context; -+}; -+ -+/* -+ * Constructs a hardware handler object, takes custom arguments -+ */ -+/* Information about a hardware handler type */ -+struct hw_handler_type { -+ char *name; -+ struct module *module; -+ -+ int (*create) (struct hw_handler *handler, unsigned int argc, -+ char **argv); -+ void (*destroy) (struct hw_handler *hwh); -+ -+ void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, -+ struct path *path); -+ unsigned (*error) (struct hw_handler *hwh, struct bio *bio); -+ int (*status) (struct hw_handler *hwh, status_type_t type, -+ char *result, unsigned int maxlen); -+}; -+ -+/* Register a hardware handler */ -+int dm_register_hw_handler(struct hw_handler_type *type); -+ -+/* Unregister a hardware handler */ -+int dm_unregister_hw_handler(struct hw_handler_type *type); -+ -+/* Returns a registered hardware handler type */ -+struct hw_handler_type *dm_get_hw_handler(const char *name); -+ -+/* Releases a hardware handler */ -+void dm_put_hw_handler(struct hw_handler_type *hwht); -+ -+/* Default err function */ -+unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio); -+ -+/* Error flags for err and dm_pg_init_complete */ -+#define MP_FAIL_PATH 1 -+#define MP_BYPASS_PG 2 -+#define MP_ERROR_IO 4 /* Don't retry this I/O */ -+ -+#endif -diff -pruN ./drivers/md.dm/dm-io.c ./drivers/md/dm-io.c ---- ./drivers/md.dm/dm-io.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-io.c 2006-03-17 13:16:38.000000000 +0300 -@@ -267,7 +267,7 @@ static int resize_pool(unsigned int new_ - /* create new pool */ - _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); - if (!_io_pool) -- r = -ENOMEM; -+ return -ENOMEM; - - r = bio_set_init(&_bios, "dm-io", 512, 1); - if (r) { -diff -pruN ./drivers/md.dm/dm-ioctl.c ./drivers/md/dm-ioctl.c ---- ./drivers/md.dm/dm-ioctl.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-ioctl.c 2006-03-17 13:16:38.000000000 +0300 -@@ -1,5 +1,6 @@ - /* - * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. -+ * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. - * - * This file is released under the GPL. - */ -@@ -17,7 +18,7 @@ - - #include <asm/uaccess.h> - --#define DM_DRIVER_EMAIL "dm@uk.sistina.com" -+#define DM_DRIVER_EMAIL "dm-devel@redhat.com" - - /*----------------------------------------------------------------- - * The ioctl interface needs to be able to look up devices by -@@ -121,14 +122,6 @@ static struct hash_cell *__get_uuid_cell - /*----------------------------------------------------------------- - * Inserting, removing and renaming a device. 
- *---------------------------------------------------------------*/ --static inline char *kstrdup(const char *str) --{ -- char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); -- if (r) -- strcpy(r, str); -- return r; --} -- - static struct hash_cell *alloc_cell(const char *name, const char *uuid, - struct mapped_device *md) - { -@@ -138,7 +131,7 @@ static struct hash_cell *alloc_cell(cons - if (!hc) - return NULL; - -- hc->name = kstrdup(name); -+ hc->name = kstrdup(name, GFP_KERNEL); - if (!hc->name) { - kfree(hc); - return NULL; -@@ -148,7 +141,7 @@ static struct hash_cell *alloc_cell(cons - hc->uuid = NULL; - - else { -- hc->uuid = kstrdup(uuid); -+ hc->uuid = kstrdup(uuid, GFP_KERNEL); - if (!hc->uuid) { - kfree(hc->name); - kfree(hc); -@@ -224,6 +217,7 @@ static int dm_hash_insert(const char *na - } - register_with_devfs(cell); - dm_get(md); -+ dm_set_mdptr(md, cell); - up_write(&_hash_lock); - - return 0; -@@ -236,10 +230,20 @@ static int dm_hash_insert(const char *na - - static void __hash_remove(struct hash_cell *hc) - { -+ struct dm_table *table; -+ - /* remove from the dev hash */ - list_del(&hc->uuid_list); - list_del(&hc->name_list); - unregister_with_devfs(hc); -+ dm_set_mdptr(hc->md, NULL); -+ -+ table = dm_get_table(hc->md); -+ if (table) { -+ dm_table_event(table); -+ dm_table_put(table); -+ } -+ - dm_put(hc->md); - if (hc->new_map) - dm_table_put(hc->new_map); -@@ -266,11 +270,12 @@ static int dm_hash_rename(const char *ol - { - char *new_name, *old_name; - struct hash_cell *hc; -+ struct dm_table *table; - - /* - * duplicate new. - */ -- new_name = kstrdup(new); -+ new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) - return -ENOMEM; - -@@ -313,6 +318,15 @@ static int dm_hash_rename(const char *ol - /* rename the device node in devfs */ - register_with_devfs(hc); - -+ /* -+ * Wake up any dm event waiters. -+ */ -+ table = dm_get_table(hc->md); -+ if (table) { -+ dm_table_event(table); -+ dm_table_put(table); -+ } -+ - up_write(&_hash_lock); - kfree(old_name); - return 0; -@@ -421,8 +435,8 @@ static void list_version_get_needed(stru - { - size_t *needed = needed_param; - -+ *needed += sizeof(struct dm_target_versions); - *needed += strlen(tt->name); -- *needed += sizeof(tt->version); - *needed += ALIGN_MASK; - } - -@@ -517,19 +531,22 @@ static int __dev_status(struct mapped_de - if (dm_suspended(md)) - param->flags |= DM_SUSPEND_FLAG; - -- bdev = bdget_disk(disk, 0); -- if (!bdev) -- return -ENXIO; -- - param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); - -- /* -- * Yes, this will be out of date by the time it gets back -- * to userland, but it is still very useful ofr -- * debugging. -- */ -- param->open_count = bdev->bd_openers; -- bdput(bdev); -+ if (!(param->flags & DM_SKIP_BDGET_FLAG)) { -+ bdev = bdget_disk(disk, 0); -+ if (!bdev) -+ return -ENXIO; -+ -+ /* -+ * Yes, this will be out of date by the time it gets back -+ * to userland, but it is still very useful for -+ * debugging. -+ */ -+ param->open_count = bdev->bd_openers; -+ bdput(bdev); -+ } else -+ param->open_count = -1; - - if (disk->policy) - param->flags |= DM_READONLY_FLAG; -@@ -579,12 +596,16 @@ static int dev_create(struct dm_ioctl *p - } - - /* -- * Always use UUID for lookups if it's present, otherwise use name. -+ * Always use UUID for lookups if it's present, otherwise use name or dev. - */ - static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) - { -- return *param->uuid ? 
-- __get_uuid_cell(param->uuid) : __get_name_cell(param->name); -+ if (*param->uuid) -+ return __get_uuid_cell(param->uuid); -+ else if (*param->name) -+ return __get_name_cell(param->name); -+ else -+ return dm_get_mdptr(huge_decode_dev(param->dev)); - } - - static inline struct mapped_device *find_device(struct dm_ioctl *param) -@@ -596,6 +617,7 @@ static inline struct mapped_device *find - hc = __find_device_hash_cell(param); - if (hc) { - md = hc->md; -+ dm_get(md); - - /* - * Sneakily write in both the name and the uuid -@@ -611,8 +633,6 @@ static inline struct mapped_device *find - param->flags |= DM_INACTIVE_PRESENT_FLAG; - else - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; -- -- dm_get(md); - } - up_read(&_hash_lock); - -@@ -673,14 +693,18 @@ static int dev_rename(struct dm_ioctl *p - static int do_suspend(struct dm_ioctl *param) - { - int r = 0; -+ int do_lockfs = 1; - struct mapped_device *md; - - md = find_device(param); - if (!md) - return -ENXIO; - -+ if (param->flags & DM_SKIP_LOCKFS_FLAG) -+ do_lockfs = 0; -+ - if (!dm_suspended(md)) -- r = dm_suspend(md); -+ r = dm_suspend(md, do_lockfs); - - if (!r) - r = __dev_status(md, param); -@@ -692,6 +716,7 @@ static int do_suspend(struct dm_ioctl *p - static int do_resume(struct dm_ioctl *param) - { - int r = 0; -+ int do_lockfs = 1; - struct hash_cell *hc; - struct mapped_device *md; - struct dm_table *new_map; -@@ -717,8 +742,10 @@ static int do_resume(struct dm_ioctl *pa - /* Do we need to load a new map ? */ - if (new_map) { - /* Suspend if it isn't already suspended */ -+ if (param->flags & DM_SKIP_LOCKFS_FLAG) -+ do_lockfs = 0; - if (!dm_suspended(md)) -- dm_suspend(md); -+ dm_suspend(md, do_lockfs); - - r = dm_swap_table(md, new_map); - if (r) { -@@ -964,6 +991,7 @@ static int table_load(struct dm_ioctl *p - if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); - up_write(&_hash_lock); -+ dm_table_put(t); - return -ENXIO; - } - -@@ -1097,6 +1125,67 @@ static int table_status(struct dm_ioctl - return r; - } - -+/* -+ * Pass a message to the target that's at the supplied device offset. -+ */ -+static int target_message(struct dm_ioctl *param, size_t param_size) -+{ -+ int r, argc; -+ char **argv; -+ struct mapped_device *md; -+ struct dm_table *table; -+ struct dm_target *ti; -+ struct dm_target_msg *tmsg = (void *) param + param->data_start; -+ -+ md = find_device(param); -+ if (!md) -+ return -ENXIO; -+ -+ r = __dev_status(md, param); -+ if (r) -+ goto out; -+ -+ if (tmsg < (struct dm_target_msg *) (param + 1) || -+ invalid_str(tmsg->message, (void *) param + param_size)) { -+ DMWARN("Invalid target message parameters."); -+ r = -EINVAL; -+ goto out; -+ } -+ -+ r = dm_split_args(&argc, &argv, tmsg->message); -+ if (r) { -+ DMWARN("Failed to split target message parameters"); -+ goto out; -+ } -+ -+ table = dm_get_table(md); -+ if (!table) -+ goto out_argv; -+ -+ if (tmsg->sector >= dm_table_get_size(table)) { -+ DMWARN("Target message sector outside device."); -+ r = -EINVAL; -+ goto out_table; -+ } -+ -+ ti = dm_table_find_target(table, tmsg->sector); -+ if (ti->type->message) -+ r = ti->type->message(ti, argc, argv); -+ else { -+ DMWARN("Target type does not support messages"); -+ r = -EINVAL; -+ } -+ -+ out_table: -+ dm_table_put(table); -+ out_argv: -+ kfree(argv); -+ out: -+ param->data_size = 0; -+ dm_put(md); -+ return r; -+} -+ - /*----------------------------------------------------------------- - * Implementation of open/close/ioctl on the special char - * device. 
-@@ -1123,7 +1212,9 @@ static ioctl_fn lookup_ioctl(unsigned in - {DM_TABLE_DEPS_CMD, table_deps}, - {DM_TABLE_STATUS_CMD, table_status}, - -- {DM_LIST_VERSIONS_CMD, list_versions} -+ {DM_LIST_VERSIONS_CMD, list_versions}, -+ -+ {DM_TARGET_MSG_CMD, target_message} - }; - - return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; -@@ -1202,14 +1293,14 @@ static int validate_params(uint cmd, str - cmd == DM_LIST_VERSIONS_CMD) - return 0; - -- /* Unless creating, either name or uuid but not both */ -- if (cmd != DM_DEV_CREATE_CMD) { -- if ((!*param->uuid && !*param->name) || -- (*param->uuid && *param->name)) { -- DMWARN("one of name or uuid must be supplied, cmd(%u)", -- cmd); -+ if ((cmd == DM_DEV_CREATE_CMD)) { -+ if (!*param->name) { -+ DMWARN("name not supplied when creating device"); - return -EINVAL; - } -+ } else if ((*param->uuid && *param->name)) { -+ DMWARN("only supply one of name or uuid, cmd(%u)", cmd); -+ return -EINVAL; - } - - /* Ensure strings are terminated */ -@@ -1268,16 +1359,11 @@ static int ctl_ioctl(struct inode *inode - * Copy the parameters into kernel space. - */ - r = copy_params(user, ¶m); -- if (r) { -- current->flags &= ~PF_MEMALLOC; -- return r; -- } - -- /* -- * FIXME: eventually we will remove the PF_MEMALLOC flag -- * here. However the tools still do nasty things like -- * 'load' while a device is suspended. -- */ -+ current->flags &= ~PF_MEMALLOC; -+ -+ if (r) -+ return r; - - r = validate_params(cmd, param); - if (r) -@@ -1295,7 +1381,6 @@ static int ctl_ioctl(struct inode *inode - - out: - free_params(param); -- current->flags &= ~PF_MEMALLOC; - return r; - } - -diff -pruN ./drivers/md.dm/dm-linear.c ./drivers/md/dm-linear.c ---- ./drivers/md.dm/dm-linear.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-linear.c 2006-03-17 13:16:38.000000000 +0300 -@@ -80,7 +80,6 @@ static int linear_status(struct dm_targe - char *result, unsigned int maxlen) - { - struct linear_c *lc = (struct linear_c *) ti->private; -- char buffer[32]; - - switch (type) { - case STATUSTYPE_INFO: -@@ -88,8 +87,8 @@ static int linear_status(struct dm_targe - break; - - case STATUSTYPE_TABLE: -- format_dev_t(buffer, lc->dev->bdev->bd_dev); -- snprintf(result, maxlen, "%s " SECTOR_FORMAT, buffer, lc->start); -+ snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name, -+ lc->start); - break; - } - return 0; -diff -pruN ./drivers/md.dm/dm-log.c ./drivers/md/dm-log.c ---- ./drivers/md.dm/dm-log.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-log.c 2006-03-17 13:16:38.000000000 +0300 -@@ -17,9 +17,6 @@ static spinlock_t _lock = SPIN_LOCK_UNLO - - int dm_register_dirty_log_type(struct dirty_log_type *type) - { -- if (!try_module_get(type->module)) -- return -EINVAL; -- - spin_lock(&_lock); - type->use_count = 0; - list_add(&type->list, &_log_types); -@@ -33,11 +30,10 @@ int dm_unregister_dirty_log_type(struct - spin_lock(&_lock); - - if (type->use_count) -- DMWARN("Attempt to unregister a log type that is still in use"); -- else { -+ DMWARN("Unregister failed: log type '%s' still in use", -+ type->name); -+ else - list_del(&type->list); -- module_put(type->module); -- } - - spin_unlock(&_lock); - -@@ -51,6 +47,10 @@ static struct dirty_log_type *get_type(c - spin_lock(&_lock); - list_for_each_entry (type, &_log_types, list) - if (!strcmp(type_name, type->name)) { -+ if (!type->use_count && !try_module_get(type->module)){ -+ spin_unlock(&_lock); -+ return NULL; -+ } - type->use_count++; - spin_unlock(&_lock); - return type; -@@ -63,7 +63,8 @@ static struct 
dirty_log_type *get_type(c - static void put_type(struct dirty_log_type *type) - { - spin_lock(&_lock); -- type->use_count--; -+ if (!--type->use_count) -+ module_put(type->module); - spin_unlock(&_lock); - } - -@@ -112,7 +113,7 @@ void dm_destroy_dirty_log(struct dirty_l - /* - * The on-disk version of the metadata. - */ --#define MIRROR_DISK_VERSION 1 -+#define MIRROR_DISK_VERSION 2 - #define LOG_OFFSET 2 - - struct log_header { -@@ -129,20 +130,32 @@ struct log_header { - struct log_c { - struct dm_target *ti; - int touched; -- sector_t region_size; -+ uint32_t region_size; - unsigned int region_count; - region_t sync_count; - - unsigned bitset_uint32_count; - uint32_t *clean_bits; - uint32_t *sync_bits; -- uint32_t *recovering_bits; /* FIXME: this seems excessive */ -+ uint32_t *recovering_bits; - - int sync_search; - -+ /* Resync flag */ -+ enum sync { -+ DEFAULTSYNC, /* Synchronize if necessary */ -+ NOSYNC, /* Devices known to be already in sync */ -+ FORCESYNC, /* Force a sync to happen */ -+ } sync; -+ -+ int failure_response; -+ - /* - * Disk log fields - */ -+ int log_dev_failed; -+ atomic_t suspended; -+ struct completion failure_completion; - struct dm_dev *log_dev; - struct log_header header; - -@@ -150,7 +163,6 @@ struct log_c { - struct log_header *disk_header; - - struct io_region bits_location; -- uint32_t *disk_bits; - }; - - /* -@@ -159,20 +171,20 @@ struct log_c { - */ - static inline int log_test_bit(uint32_t *bs, unsigned bit) - { -- return test_bit(bit, (unsigned long *) bs) ? 1 : 0; -+ return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; - } - - static inline void log_set_bit(struct log_c *l, - uint32_t *bs, unsigned bit) - { -- set_bit(bit, (unsigned long *) bs); -+ ext2_set_bit(bit, (unsigned long *) bs); - l->touched = 1; - } - - static inline void log_clear_bit(struct log_c *l, - uint32_t *bs, unsigned bit) - { -- clear_bit(bit, (unsigned long *) bs); -+ ext2_clear_bit(bit, (unsigned long *) bs); - l->touched = 1; - } - -@@ -205,12 +217,19 @@ static int read_header(struct log_c *log - - header_from_disk(&log->header, log->disk_header); - -- if (log->header.magic != MIRROR_MAGIC) { -+ /* New log required? */ -+ if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { - log->header.magic = MIRROR_MAGIC; - log->header.version = MIRROR_DISK_VERSION; - log->header.nr_regions = 0; - } - -+ /* Version 2 is like version 1 but always little endian on disk. 
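 *
 * (A v1 header written on a little-endian host is thus byte-for-byte a
 * valid v2 header and is upgraded in place; on other hosts a v1 log
 * fails the version check below, which disk_resume() then treats as a
 * failed read and reinitialises the log.)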
*/ -+#ifdef __LITTLE_ENDIAN -+ if (log->header.version == 1) -+ log->header.version = 2; -+#endif -+ - if (log->header.version != MIRROR_DISK_VERSION) { - DMWARN("incompatible disk log version"); - return -EINVAL; -@@ -231,70 +250,69 @@ static inline int write_header(struct lo - /*---------------------------------------------------------------- - * Bits IO - *--------------------------------------------------------------*/ --static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count) --{ -- unsigned i; -- -- for (i = 0; i < count; i++) -- core[i] = le32_to_cpu(disk[i]); --} -- --static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count) --{ -- unsigned i; -- -- /* copy across the clean/dirty bitset */ -- for (i = 0; i < count; i++) -- disk[i] = cpu_to_le32(core[i]); --} -- - static int read_bits(struct log_c *log) - { - int r; - unsigned long ebits; - - r = dm_io_sync_vm(1, &log->bits_location, READ, -- log->disk_bits, &ebits); -+ log->clean_bits, &ebits); - if (r) - return r; - -- bits_to_core(log->clean_bits, log->disk_bits, -- log->bitset_uint32_count); - return 0; - } - - static int write_bits(struct log_c *log) - { - unsigned long ebits; -- bits_to_disk(log->clean_bits, log->disk_bits, -- log->bitset_uint32_count); - return dm_io_sync_vm(1, &log->bits_location, WRITE, -- log->disk_bits, &ebits); -+ log->clean_bits, &ebits); - } - - /*---------------------------------------------------------------- -- * constructor/destructor -+ * core log constructor/destructor -+ * -+ * argv contains: <region_size> [[no]sync] [block_on_error] - *--------------------------------------------------------------*/ - #define BYTE_SHIFT 3 - static int core_ctr(struct dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv) - { -+ enum sync sync = DEFAULTSYNC; -+ int failure_response = DMLOG_IOERR_IGNORE; -+ - struct log_c *lc; -- sector_t region_size; -+ uint32_t region_size; - unsigned int region_count; - size_t bitset_size; -+ unsigned i; - -- if (argc != 1) { -- DMWARN("wrong number of arguments to log_c"); -+ if (argc < 1 || argc > 3) { -+ DMWARN("wrong number of arguments to mirror log"); - return -EINVAL; - } - -- if (sscanf(argv[0], SECTOR_FORMAT, ®ion_size) != 1) { -+ for (i = 1; i < argc; i++) { -+ if (!strcmp(argv[i], "sync")) -+ sync = FORCESYNC; -+ else if (!strcmp(argv[i], "nosync")) -+ sync = NOSYNC; -+ else if (!strcmp(argv[i], "block_on_error")) -+ failure_response = DMLOG_IOERR_BLOCK; -+ else { -+ DMWARN("unrecognised sync argument to mirror log: %s", -+ argv[i]); -+ return -EINVAL; -+ } -+ } -+ -+ if (sscanf(argv[0], "%u", ®ion_size) != 1) { - DMWARN("invalid region size string"); - return -EINVAL; - } - -- region_count = dm_div_up(ti->len, region_size); -+ region_count = dm_sector_div_up(ti->len, region_size); - - lc = kmalloc(sizeof(*lc), GFP_KERNEL); - if (!lc) { -@@ -306,12 +324,14 @@ static int core_ctr(struct dirty_log *lo - lc->touched = 0; - lc->region_size = region_size; - lc->region_count = region_count; -+ lc->sync = sync; -+ lc->failure_response = failure_response; - - /* -- * Work out how many words we need to hold the bitset. -+ * Work out how many "unsigned long"s we need to hold the bitset. 
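 *
 * e.g. region_count = 1000 on a 64-bit host: rounded up to 1024 bits,
 * so bitset_size = 128 bytes and bitset_uint32_count = 32.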
- */ - bitset_size = dm_round_up(region_count, -- sizeof(*lc->clean_bits) << BYTE_SHIFT); -+ sizeof(unsigned long) << BYTE_SHIFT); - bitset_size >>= BYTE_SHIFT; - - lc->bitset_uint32_count = bitset_size / 4; -@@ -330,12 +350,12 @@ static int core_ctr(struct dirty_log *lo - kfree(lc); - return -ENOMEM; - } -- memset(lc->sync_bits, 0, bitset_size); -- lc->sync_count = 0; -+ memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); -+ lc->sync_count = (sync == NOSYNC) ? region_count : 0; - - lc->recovering_bits = vmalloc(bitset_size); - if (!lc->recovering_bits) { -- DMWARN("couldn't allocate sync bitset"); -+ DMWARN("couldn't allocate recovering bitset"); - vfree(lc->sync_bits); - vfree(lc->clean_bits); - kfree(lc); -@@ -356,6 +376,11 @@ static void core_dtr(struct dirty_log *l - kfree(lc); - } - -+/*---------------------------------------------------------------- -+ * disk log constructor/destructor -+ * -+ * argv contains log_device region_size followed optionally by [no]sync -+ *--------------------------------------------------------------*/ - static int disk_ctr(struct dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv) - { -@@ -364,8 +389,8 @@ static int disk_ctr(struct dirty_log *lo - struct log_c *lc; - struct dm_dev *dev; - -- if (argc != 2) { -- DMWARN("wrong number of arguments to log_d"); -+ if (argc < 2 || argc > 3) { -+ DMWARN("wrong number of arguments to disk mirror log"); - return -EINVAL; - } - -@@ -382,6 +407,8 @@ static int disk_ctr(struct dirty_log *lo - - lc = (struct log_c *) log->context; - lc->log_dev = dev; -+ lc->log_dev_failed = 0; -+ init_completion(&lc->failure_completion); - - /* setup the disk header fields */ - lc->header_location.bdev = lc->log_dev->bdev; -@@ -403,11 +430,6 @@ static int disk_ctr(struct dirty_log *lo - size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t), - 1 << SECTOR_SHIFT); - lc->bits_location.count = size >> SECTOR_SHIFT; -- lc->disk_bits = vmalloc(size); -- if (!lc->disk_bits) { -- vfree(lc->disk_header); -- goto bad; -- } - return 0; - - bad: -@@ -421,7 +443,6 @@ static void disk_dtr(struct dirty_log *l - struct log_c *lc = (struct log_c *) log->context; - dm_put_device(lc->ti, lc->log_dev); - vfree(lc->disk_header); -- vfree(lc->disk_bits); - core_dtr(log); - } - -@@ -435,42 +456,65 @@ static int count_bits32(uint32_t *addr, - return count; - } - -+static void fail_log_device(struct log_c *lc) -+{ -+ lc->log_dev_failed = 1; -+ if (lc->failure_response == DMLOG_IOERR_BLOCK) -+ dm_table_event(lc->ti->table); -+} -+ -+static void restore_log_device(struct log_c *lc) -+{ -+ lc->log_dev_failed = 0; -+} -+ - static int disk_resume(struct dirty_log *log) - { -- int r; -+ int r = 0; - unsigned i; - struct log_c *lc = (struct log_c *) log->context; - size_t size = lc->bitset_uint32_count * sizeof(uint32_t); - -- /* read the disk header */ -- r = read_header(lc); -- if (r) -- return r; -- -- /* read the bits */ -- r = read_bits(lc); -- if (r) -- return r; -- -- /* zero any new bits if the mirror has grown */ -- for (i = lc->header.nr_regions; i < lc->region_count; i++) -- /* FIXME: amazingly inefficient */ -- log_clear_bit(lc, lc->clean_bits, i); -+ /* -+ * Read the disk header, but only if we know it is good. -+ * Assume the worst in the event of failure. -+ */ -+ if (!lc->log_dev_failed && -+ ((r = read_header(lc)) || read_bits(lc))) { -+ DMWARN("Read %s failed on mirror log device, %s.", -+ r ? 
"header" : "bits", lc->log_dev->name); -+ fail_log_device(lc); -+ lc->header.nr_regions = 0; -+ } -+ -+ /* set or clear any new bits */ -+ if (lc->sync == NOSYNC) -+ for (i = lc->header.nr_regions; i < lc->region_count; i++) -+ /* FIXME: amazingly inefficient */ -+ log_set_bit(lc, lc->clean_bits, i); -+ else -+ for (i = lc->header.nr_regions; i < lc->region_count; i++) -+ /* FIXME: amazingly inefficient */ -+ log_clear_bit(lc, lc->clean_bits, i); - - /* copy clean across to sync */ - memcpy(lc->sync_bits, lc->clean_bits, size); - lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); - -- /* write the bits */ -- r = write_bits(lc); -- if (r) -- return r; -- - /* set the correct number of regions in the header */ - lc->header.nr_regions = lc->region_count; - -- /* write the new header */ -- return write_header(lc); -+ /* write out the log. 'i' tells us which has failed if any */ -+ i = 1; -+ if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) { -+ DMWARN("Write %s failed on mirror log device, %s.", -+ i ? "bits" : "header", lc->log_dev->name); -+ fail_log_device(lc); -+ } else -+ restore_log_device(lc); -+ -+ atomic_set(&lc->suspended, 0); -+ return r; - } - - static sector_t core_get_region_size(struct dirty_log *log) -@@ -497,6 +541,17 @@ static int core_flush(struct dirty_log * - return 0; - } - -+static int disk_presuspend(struct dirty_log *log) -+{ -+ struct log_c *lc = (struct log_c *) log->context; -+ -+ atomic_set(&lc->suspended, 1); -+ if (lc->log_dev_failed && (lc->failure_response == DMLOG_IOERR_BLOCK)) -+ complete(&lc->failure_completion); -+ -+ return 0; -+} -+ - static int disk_flush(struct dirty_log *log) - { - int r; -@@ -506,9 +561,24 @@ static int disk_flush(struct dirty_log * - if (!lc->touched) - return 0; - -+ /* -+ * If a failure occurs, we must wait for a suspension. -+ * We must not proceed in the event of a failure, -+ * because if the machine reboots with the log -+ * incorrect, recovery could be compromised -+ */ - r = write_bits(lc); -- if (!r) -+ if (!r) { - lc->touched = 0; -+ restore_log_device(lc); -+ } else { -+ DMERR("Write failure on mirror log device, %s.", -+ lc->log_dev->name); -+ fail_log_device(lc); -+ if (!atomic_read(&lc->suspended) && -+ (lc->failure_response == DMLOG_IOERR_BLOCK)) -+ wait_for_completion(&lc->failure_completion); -+ } - - return r; - } -@@ -538,7 +608,7 @@ static int core_get_resync_work(struct d - lc->sync_search); - lc->sync_search = *region + 1; - -- if (*region == lc->region_count) -+ if (*region >= lc->region_count) - return 0; - - } while (log_test_bit(lc->recovering_bits, *region)); -@@ -566,6 +636,60 @@ static region_t core_get_sync_count(stru - return lc->sync_count; - } - -+#define DMEMIT_SYNC \ -+ if (lc->sync != DEFAULTSYNC) \ -+ DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") -+ -+static int core_status(struct dirty_log *log, status_type_t status, -+ char *result, unsigned int maxlen) -+{ -+ int sz = 0; -+ struct log_c *lc = log->context; -+ -+ switch(status) { -+ case STATUSTYPE_INFO: -+ DMEMIT("1 core"); -+ break; -+ -+ case STATUSTYPE_TABLE: -+ DMEMIT("%s %u %u ", log->type->name, -+ lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); -+ DMEMIT_SYNC; -+ } -+ -+ return sz; -+} -+ -+static int disk_status(struct dirty_log *log, status_type_t status, -+ char *result, unsigned int maxlen) -+{ -+ int sz = 0; -+ struct log_c *lc = log->context; -+ -+ switch(status) { -+ case STATUSTYPE_INFO: -+ DMEMIT("3 disk %s %c", lc->log_dev->name, -+ lc->log_dev_failed ? 
'D' : 'A'); -+ break; -+ -+ case STATUSTYPE_TABLE: -+ DMEMIT("%s %u %s %u ", log->type->name, -+ lc->sync == DEFAULTSYNC ? 2 : 3, -+ lc->log_dev->name, -+ lc->region_size); -+ DMEMIT_SYNC; -+ } -+ -+ return sz; -+} -+ -+static int core_get_failure_response(struct dirty_log *log) -+{ -+ struct log_c *lc = log->context; -+ -+ return lc->failure_response; -+} -+ - static struct dirty_log_type _core_type = { - .name = "core", - .module = THIS_MODULE, -@@ -579,7 +703,9 @@ static struct dirty_log_type _core_type - .clear_region = core_clear_region, - .get_resync_work = core_get_resync_work, - .complete_resync_work = core_complete_resync_work, -- .get_sync_count = core_get_sync_count -+ .get_sync_count = core_get_sync_count, -+ .status = core_status, -+ .get_failure_response = core_get_failure_response, - }; - - static struct dirty_log_type _disk_type = { -@@ -587,7 +713,8 @@ static struct dirty_log_type _disk_type - .module = THIS_MODULE, - .ctr = disk_ctr, - .dtr = disk_dtr, -- .suspend = disk_flush, -+ .presuspend = disk_presuspend, -+ .postsuspend = disk_flush, - .resume = disk_resume, - .get_region_size = core_get_region_size, - .is_clean = core_is_clean, -@@ -597,7 +724,9 @@ static struct dirty_log_type _disk_type - .clear_region = core_clear_region, - .get_resync_work = core_get_resync_work, - .complete_resync_work = core_complete_resync_work, -- .get_sync_count = core_get_sync_count -+ .get_sync_count = core_get_sync_count, -+ .status = disk_status, -+ .get_failure_response = core_get_failure_response, - }; - - int __init dm_dirty_log_init(void) -diff -pruN ./drivers/md.dm/dm-log.h ./drivers/md/dm-log.h ---- ./drivers/md.dm/dm-log.h 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-log.h 2006-03-17 13:16:38.000000000 +0300 -@@ -9,6 +9,15 @@ - - #include "dm.h" - -+/* -+ * Values returned by get_failure_response() -+ * DMLOG_IOERR_IGNORE: ignore device failures -+ * DMLOG_IOERR_BLOCK: issue dm event, and do not complete -+ * I/O until presuspend is recieved. -+ */ -+#define DMLOG_IOERR_IGNORE 0 -+#define DMLOG_IOERR_BLOCK 1 -+ - typedef sector_t region_t; - - struct dirty_log_type; -@@ -32,7 +41,8 @@ struct dirty_log_type { - * There are times when we don't want the log to touch - * the disk. - */ -- int (*suspend)(struct dirty_log *log); -+ int (*presuspend)(struct dirty_log *log); -+ int (*postsuspend)(struct dirty_log *log); - int (*resume)(struct dirty_log *log); - - /* -@@ -48,6 +58,16 @@ struct dirty_log_type { - int (*is_clean)(struct dirty_log *log, region_t region); - - /* -+ * Returns: 0, 1 -+ * -+ * This is necessary for cluster mirroring. It provides -+ * a way to detect recovery on another node, so we -+ * aren't writing concurrently. This function is likely -+ * to block (when a cluster log is used). -+ */ -+ int (*is_remote_recovering)(struct dirty_log *log, region_t region); -+ -+ /* - * Returns: 0, 1, -EWOULDBLOCK, < 0 - * - * A predicate function to check the area given by -@@ -101,6 +121,18 @@ struct dirty_log_type { - * Returns the number of regions that are in sync. - */ - region_t (*get_sync_count)(struct dirty_log *log); -+ -+ /* -+ * Support function for mirror status requests. -+ */ -+ int (*status)(struct dirty_log *log, status_type_t status_type, -+ char *result, unsigned int maxlen); -+ -+ /* -+ * Return the code describing what to do in the event -+ * of a device failure. 
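 *
 * (One of DMLOG_IOERR_IGNORE or DMLOG_IOERR_BLOCK above, chosen when
 * the log is constructed with or without the "block_on_error" arg.)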
-+ */ -+ int (*get_failure_response)(struct dirty_log *log); - }; - - int dm_register_dirty_log_type(struct dirty_log_type *type); -diff -pruN ./drivers/md.dm/dm-mpath.c ./drivers/md/dm-mpath.c ---- ./drivers/md.dm/dm-mpath.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-mpath.c 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,1342 @@ -+/* -+ * Copyright (C) 2003 Sistina Software Limited. -+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ */ -+ -+#include "dm.h" -+#include "dm-path-selector.h" -+#include "dm-hw-handler.h" -+#include "dm-bio-list.h" -+#include "dm-bio-record.h" -+ -+#include <linux/ctype.h> -+#include <linux/init.h> -+#include <linux/mempool.h> -+#include <linux/module.h> -+#include <linux/pagemap.h> -+#include <linux/slab.h> -+#include <linux/time.h> -+#include <linux/workqueue.h> -+#include <asm/atomic.h> -+ -+#define MESG_STR(x) x, sizeof(x) -+ -+/* Path properties */ -+struct pgpath { -+ struct list_head list; -+ -+ struct priority_group *pg; /* Owning PG */ -+ unsigned fail_count; /* Cumulative failure count */ -+ -+ struct path path; -+}; -+ -+#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) -+ -+/* -+ * Paths are grouped into Priority Groups and numbered from 1 upwards. -+ * Each has a path selector which controls which path gets used. -+ */ -+struct priority_group { -+ struct list_head list; -+ -+ struct multipath *m; /* Owning multipath instance */ -+ struct path_selector ps; -+ -+ unsigned pg_num; /* Reference number */ -+ unsigned bypassed; /* Temporarily bypass this PG? */ -+ -+ unsigned nr_pgpaths; /* Number of paths in PG */ -+ struct list_head pgpaths; -+}; -+ -+/* Multipath context */ -+struct multipath { -+ struct list_head list; -+ struct dm_target *ti; -+ -+ spinlock_t lock; -+ -+ struct hw_handler hw_handler; -+ unsigned nr_priority_groups; -+ struct list_head priority_groups; -+ unsigned pg_init_required; /* pg_init needs calling? */ -+ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ -+ -+ unsigned nr_valid_paths; /* Total number of usable paths */ -+ struct pgpath *current_pgpath; -+ struct priority_group *current_pg; -+ struct priority_group *next_pg; /* Switch to this PG if set */ -+ unsigned repeat_count; /* I/Os left before calling PS again */ -+ -+ unsigned queue_io; /* Must we queue all I/O? */ -+ unsigned queue_if_no_path; /* Queue I/O if last path fails? */ -+ unsigned saved_queue_if_no_path;/* Saved state during suspension */ -+ -+ struct work_struct process_queued_ios; -+ struct bio_list queued_ios; -+ unsigned queue_size; -+ -+ struct work_struct trigger_event; -+ -+ /* -+ * We must use a mempool of mpath_io structs so that we -+ * can resubmit bios on error. -+ */ -+ mempool_t *mpio_pool; -+}; -+ -+/* -+ * Context information attached to each bio we process. 
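 *
 * One of these is allocated from mpio_pool for every bio we map; the
 * original bio fields are recorded so that do_end_io() can
 * dm_bio_restore() the bio and requeue it down another path on error.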
-+ */ -+struct mpath_io { -+ struct pgpath *pgpath; -+ struct dm_bio_details details; -+}; -+ -+typedef int (*action_fn) (struct pgpath *pgpath); -+ -+#define MIN_IOS 256 /* Mempool size */ -+ -+static kmem_cache_t *_mpio_cache; -+ -+struct workqueue_struct *kmultipathd; -+static void process_queued_ios(void *data); -+static void trigger_event(void *data); -+ -+ -+/*----------------------------------------------- -+ * Allocation routines -+ *-----------------------------------------------*/ -+ -+static struct pgpath *alloc_pgpath(void) -+{ -+ struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); -+ -+ if (pgpath) { -+ memset(pgpath, 0, sizeof(*pgpath)); -+ pgpath->path.is_active = 1; -+ } -+ -+ return pgpath; -+} -+ -+static inline void free_pgpath(struct pgpath *pgpath) -+{ -+ kfree(pgpath); -+} -+ -+static struct priority_group *alloc_priority_group(void) -+{ -+ struct priority_group *pg; -+ -+ pg = kmalloc(sizeof(*pg), GFP_KERNEL); -+ if (!pg) -+ return NULL; -+ -+ memset(pg, 0, sizeof(*pg)); -+ INIT_LIST_HEAD(&pg->pgpaths); -+ -+ return pg; -+} -+ -+static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) -+{ -+ struct pgpath *pgpath, *tmp; -+ -+ list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { -+ list_del(&pgpath->list); -+ dm_put_device(ti, pgpath->path.dev); -+ free_pgpath(pgpath); -+ } -+} -+ -+static void free_priority_group(struct priority_group *pg, -+ struct dm_target *ti) -+{ -+ struct path_selector *ps = &pg->ps; -+ -+ if (ps->type) { -+ ps->type->destroy(ps); -+ dm_put_path_selector(ps->type); -+ } -+ -+ free_pgpaths(&pg->pgpaths, ti); -+ kfree(pg); -+} -+ -+static struct multipath *alloc_multipath(void) -+{ -+ struct multipath *m; -+ -+ m = kmalloc(sizeof(*m), GFP_KERNEL); -+ if (m) { -+ memset(m, 0, sizeof(*m)); -+ INIT_LIST_HEAD(&m->priority_groups); -+ spin_lock_init(&m->lock); -+ m->queue_io = 1; -+ INIT_WORK(&m->process_queued_ios, process_queued_ios, m); -+ INIT_WORK(&m->trigger_event, trigger_event, m); -+ m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, -+ mempool_free_slab, _mpio_cache); -+ if (!m->mpio_pool) { -+ kfree(m); -+ return NULL; -+ } -+ } -+ -+ return m; -+} -+ -+static void free_multipath(struct multipath *m) -+{ -+ struct priority_group *pg, *tmp; -+ struct hw_handler *hwh = &m->hw_handler; -+ -+ list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { -+ list_del(&pg->list); -+ free_priority_group(pg, m->ti); -+ } -+ -+ if (hwh->type) { -+ hwh->type->destroy(hwh); -+ dm_put_hw_handler(hwh->type); -+ } -+ -+ mempool_destroy(m->mpio_pool); -+ kfree(m); -+} -+ -+ -+/*----------------------------------------------- -+ * Path selection -+ *-----------------------------------------------*/ -+ -+static void __switch_pg(struct multipath *m, struct pgpath *pgpath) -+{ -+ struct hw_handler *hwh = &m->hw_handler; -+ -+ m->current_pg = pgpath->pg; -+ -+ /* Must we initialise the PG first, and queue I/O till it's ready? 
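 *
 * If so, bios are queued until the hardware handler completes and
 * calls dm_pg_init_complete(), which clears queue_io and reschedules
 * process_queued_ios().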
*/ -+ if (hwh->type && hwh->type->pg_init) { -+ m->pg_init_required = 1; -+ m->queue_io = 1; -+ } else { -+ m->pg_init_required = 0; -+ m->queue_io = 0; -+ } -+} -+ -+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) -+{ -+ struct path *path; -+ -+ path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); -+ if (!path) -+ return -ENXIO; -+ -+ m->current_pgpath = path_to_pgpath(path); -+ -+ if (m->current_pg != pg) -+ __switch_pg(m, m->current_pgpath); -+ -+ return 0; -+} -+ -+static void __choose_pgpath(struct multipath *m) -+{ -+ struct priority_group *pg; -+ unsigned bypassed = 1; -+ -+ if (!m->nr_valid_paths) -+ goto failed; -+ -+ /* Were we instructed to switch PG? */ -+ if (m->next_pg) { -+ pg = m->next_pg; -+ m->next_pg = NULL; -+ if (!__choose_path_in_pg(m, pg)) -+ return; -+ } -+ -+ /* Don't change PG until it has no remaining paths */ -+ if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) -+ return; -+ -+ /* -+ * Loop through priority groups until we find a valid path. -+ * First time we skip PGs marked 'bypassed'. -+ * Second time we only try the ones we skipped. -+ */ -+ do { -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ if (pg->bypassed == bypassed) -+ continue; -+ if (!__choose_path_in_pg(m, pg)) -+ return; -+ } -+ } while (bypassed--); -+ -+failed: -+ m->current_pgpath = NULL; -+ m->current_pg = NULL; -+} -+ -+static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, -+ unsigned was_queued) -+{ -+ int r = 1; -+ unsigned long flags; -+ struct pgpath *pgpath; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ /* Do we need to select a new pgpath? */ -+ if (!m->current_pgpath || -+ (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) -+ __choose_pgpath(m); -+ -+ pgpath = m->current_pgpath; -+ -+ if (was_queued) -+ m->queue_size--; -+ -+ if ((pgpath && m->queue_io) || -+ (!pgpath && m->queue_if_no_path)) { -+ /* Queue for the daemon to resubmit */ -+ bio_list_add(&m->queued_ios, bio); -+ m->queue_size++; -+ if ((m->pg_init_required && !m->pg_init_in_progress) || -+ !m->queue_io) -+ queue_work(kmultipathd, &m->process_queued_ios); -+ pgpath = NULL; -+ r = 0; -+ } else if (!pgpath) -+ r = -EIO; /* Failed */ -+ else -+ bio->bi_bdev = pgpath->path.dev->bdev; -+ -+ mpio->pgpath = pgpath; -+ -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return r; -+} -+ -+/* -+ * If we run out of usable paths, should we queue I/O or error it? -+ */ -+static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, -+ unsigned save_old_value) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ if (save_old_value) -+ m->saved_queue_if_no_path = m->queue_if_no_path; -+ else -+ m->saved_queue_if_no_path = queue_if_no_path; -+ m->queue_if_no_path = queue_if_no_path; -+ if (!m->queue_if_no_path && m->queue_size) -+ queue_work(kmultipathd, &m->process_queued_ios); -+ -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return 0; -+} -+ -+/*----------------------------------------------------------------- -+ * The multipath daemon is responsible for resubmitting queued ios. 
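 *
 * process_queued_ios() picks a usable path (starting pg_init first if
 * the hardware handler requires it); unless the I/O must stay queued,
 * dispatch_queued_ios() then remaps each held bio with map_io() and
 * resubmits it via generic_make_request().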
-+ *---------------------------------------------------------------*/ -+ -+static void dispatch_queued_ios(struct multipath *m) -+{ -+ int r; -+ unsigned long flags; -+ struct bio *bio = NULL, *next; -+ struct mpath_io *mpio; -+ union map_info *info; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ bio = bio_list_get(&m->queued_ios); -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ while (bio) { -+ next = bio->bi_next; -+ bio->bi_next = NULL; -+ -+ info = dm_get_mapinfo(bio); -+ mpio = info->ptr; -+ -+ r = map_io(m, bio, mpio, 1); -+ if (r < 0) -+ bio_endio(bio, bio->bi_size, r); -+ else if (r == 1) -+ generic_make_request(bio); -+ -+ bio = next; -+ } -+} -+ -+static void process_queued_ios(void *data) -+{ -+ struct multipath *m = (struct multipath *) data; -+ struct hw_handler *hwh = &m->hw_handler; -+ struct pgpath *pgpath = NULL; -+ unsigned init_required = 0, must_queue = 1; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ if (!m->queue_size) -+ goto out; -+ -+ if (!m->current_pgpath) -+ __choose_pgpath(m); -+ -+ pgpath = m->current_pgpath; -+ -+ if ((pgpath && !m->queue_io) || -+ (!pgpath && !m->queue_if_no_path)) -+ must_queue = 0; -+ -+ if (m->pg_init_required && !m->pg_init_in_progress) { -+ m->pg_init_required = 0; -+ m->pg_init_in_progress = 1; -+ init_required = 1; -+ } -+ -+out: -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ if (init_required) -+ hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path); -+ -+ if (!must_queue) -+ dispatch_queued_ios(m); -+} -+ -+/* -+ * An event is triggered whenever a path is taken out of use. -+ * Includes path failure and PG bypass. -+ */ -+static void trigger_event(void *data) -+{ -+ struct multipath *m = (struct multipath *) data; -+ -+ dm_table_event(m->ti->table); -+} -+ -+/*----------------------------------------------------------------- -+ * Constructor/argument parsing: -+ * <#multipath feature args> [<arg>]* -+ * <#hw_handler args> [hw_handler [<arg>]*] -+ * <#priority groups> -+ * <initial priority group> -+ * [<selector> <#selector args> [<arg>]* -+ * <#paths> <#per-path selector args> -+ * [<path> [<arg>]* ]+ ]+ -+ *---------------------------------------------------------------*/ -+struct param { -+ unsigned min; -+ unsigned max; -+ char *error; -+}; -+ -+#define ESTR(s) ("dm-multipath: " s) -+ -+static int read_param(struct param *param, char *str, unsigned *v, char **error) -+{ -+ if (!str || -+ (sscanf(str, "%u", v) != 1) || -+ (*v < param->min) || -+ (*v > param->max)) { -+ *error = param->error; -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+struct arg_set { -+ unsigned argc; -+ char **argv; -+}; -+ -+static char *shift(struct arg_set *as) -+{ -+ char *r; -+ -+ if (as->argc) { -+ as->argc--; -+ r = *as->argv; -+ as->argv++; -+ return r; -+ } -+ -+ return NULL; -+} -+ -+static void consume(struct arg_set *as, unsigned n) -+{ -+ BUG_ON (as->argc < n); -+ as->argc -= n; -+ as->argv += n; -+} -+ -+static int parse_path_selector(struct arg_set *as, struct priority_group *pg, -+ struct dm_target *ti) -+{ -+ int r; -+ struct path_selector_type *pst; -+ unsigned ps_argc; -+ -+ static struct param _params[] = { -+ {0, 1024, ESTR("invalid number of path selector args")}, -+ }; -+ -+ pst = dm_get_path_selector(shift(as)); -+ if (!pst) { -+ ti->error = ESTR("unknown path selector type"); -+ return -EINVAL; -+ } -+ -+ r = read_param(_params, shift(as), &ps_argc, &ti->error); -+ if (r) -+ return -EINVAL; -+ -+ r = pst->create(&pg->ps, ps_argc, as->argv); -+ if (r) { -+ dm_put_path_selector(pst); -+ ti->error = 
ESTR("path selector constructor failed"); -+ return r; -+ } -+ -+ pg->ps.type = pst; -+ consume(as, ps_argc); -+ -+ return 0; -+} -+ -+static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, -+ struct dm_target *ti) -+{ -+ int r; -+ struct pgpath *p; -+ -+ /* we need at least a path arg */ -+ if (as->argc < 1) { -+ ti->error = ESTR("no device given"); -+ return NULL; -+ } -+ -+ p = alloc_pgpath(); -+ if (!p) -+ return NULL; -+ -+ r = dm_get_device(ti, shift(as), ti->begin, ti->len, -+ dm_table_get_mode(ti->table), &p->path.dev); -+ if (r) { -+ ti->error = ESTR("error getting device"); -+ goto bad; -+ } -+ -+ r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); -+ if (r) { -+ dm_put_device(ti, p->path.dev); -+ goto bad; -+ } -+ -+ return p; -+ -+ bad: -+ free_pgpath(p); -+ return NULL; -+} -+ -+static struct priority_group *parse_priority_group(struct arg_set *as, -+ struct multipath *m, -+ struct dm_target *ti) -+{ -+ static struct param _params[] = { -+ {1, 1024, ESTR("invalid number of paths")}, -+ {0, 1024, ESTR("invalid number of selector args")} -+ }; -+ -+ int r; -+ unsigned i, nr_selector_args, nr_params; -+ struct priority_group *pg; -+ -+ if (as->argc < 2) { -+ as->argc = 0; -+ ti->error = ESTR("not enough priority group aruments"); -+ return NULL; -+ } -+ -+ pg = alloc_priority_group(); -+ if (!pg) { -+ ti->error = ESTR("couldn't allocate priority group"); -+ return NULL; -+ } -+ pg->m = m; -+ -+ r = parse_path_selector(as, pg, ti); -+ if (r) -+ goto bad; -+ -+ /* -+ * read the paths -+ */ -+ r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); -+ if (r) -+ goto bad; -+ -+ r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); -+ if (r) -+ goto bad; -+ -+ nr_params = 1 + nr_selector_args; -+ for (i = 0; i < pg->nr_pgpaths; i++) { -+ struct pgpath *pgpath; -+ struct arg_set path_args; -+ -+ if (as->argc < nr_params) -+ goto bad; -+ -+ path_args.argc = nr_params; -+ path_args.argv = as->argv; -+ -+ pgpath = parse_path(&path_args, &pg->ps, ti); -+ if (!pgpath) -+ goto bad; -+ -+ pgpath->pg = pg; -+ list_add_tail(&pgpath->list, &pg->pgpaths); -+ consume(as, nr_params); -+ } -+ -+ return pg; -+ -+ bad: -+ free_priority_group(pg, ti); -+ return NULL; -+} -+ -+static int parse_hw_handler(struct arg_set *as, struct multipath *m, -+ struct dm_target *ti) -+{ -+ int r; -+ struct hw_handler_type *hwht; -+ unsigned hw_argc; -+ -+ static struct param _params[] = { -+ {0, 1024, ESTR("invalid number of hardware handler args")}, -+ }; -+ -+ r = read_param(_params, shift(as), &hw_argc, &ti->error); -+ if (r) -+ return -EINVAL; -+ -+ if (!hw_argc) -+ return 0; -+ -+ hwht = dm_get_hw_handler(shift(as)); -+ if (!hwht) { -+ ti->error = ESTR("unknown hardware handler type"); -+ return -EINVAL; -+ } -+ -+ r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); -+ if (r) { -+ dm_put_hw_handler(hwht); -+ ti->error = ESTR("hardware handler constructor failed"); -+ return r; -+ } -+ -+ m->hw_handler.type = hwht; -+ consume(as, hw_argc - 1); -+ -+ return 0; -+} -+ -+static int parse_features(struct arg_set *as, struct multipath *m, -+ struct dm_target *ti) -+{ -+ int r; -+ unsigned argc; -+ -+ static struct param _params[] = { -+ {0, 1, ESTR("invalid number of feature args")}, -+ }; -+ -+ r = read_param(_params, shift(as), &argc, &ti->error); -+ if (r) -+ return -EINVAL; -+ -+ if (!argc) -+ return 0; -+ -+ if (!strnicmp(shift(as), MESG_STR("queue_if_no_path"))) -+ return queue_if_no_path(m, 1, 0); -+ else { -+ ti->error = "Unrecognised 
multipath feature request"; -+ return -EINVAL; -+ } -+} -+ -+static int multipath_ctr(struct dm_target *ti, unsigned int argc, -+ char **argv) -+{ -+ /* target parameters */ -+ static struct param _params[] = { -+ {1, 1024, ESTR("invalid number of priority groups")}, -+ {1, 1024, ESTR("invalid initial priority group number")}, -+ }; -+ -+ int r; -+ struct multipath *m; -+ struct arg_set as; -+ unsigned pg_count = 0; -+ unsigned next_pg_num; -+ -+ as.argc = argc; -+ as.argv = argv; -+ -+ m = alloc_multipath(); -+ if (!m) { -+ ti->error = ESTR("can't allocate multipath"); -+ return -EINVAL; -+ } -+ -+ r = parse_features(&as, m, ti); -+ if (r) -+ goto bad; -+ -+ r = parse_hw_handler(&as, m, ti); -+ if (r) -+ goto bad; -+ -+ r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); -+ if (r) -+ goto bad; -+ -+ r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); -+ if (r) -+ goto bad; -+ -+ /* parse the priority groups */ -+ while (as.argc) { -+ struct priority_group *pg; -+ -+ pg = parse_priority_group(&as, m, ti); -+ if (!pg) { -+ r = -EINVAL; -+ goto bad; -+ } -+ -+ m->nr_valid_paths += pg->nr_pgpaths; -+ list_add_tail(&pg->list, &m->priority_groups); -+ pg_count++; -+ pg->pg_num = pg_count; -+ if (!--next_pg_num) -+ m->next_pg = pg; -+ } -+ -+ if (pg_count != m->nr_priority_groups) { -+ ti->error = ESTR("priority group count mismatch"); -+ r = -EINVAL; -+ goto bad; -+ } -+ -+ ti->private = m; -+ m->ti = ti; -+ -+ return 0; -+ -+ bad: -+ free_multipath(m); -+ return r; -+} -+ -+static void multipath_dtr(struct dm_target *ti) -+{ -+ struct multipath *m = (struct multipath *) ti->private; -+ -+ flush_workqueue(kmultipathd); -+ free_multipath(m); -+} -+ -+/* -+ * Map bios, recording original fields for later in case we have to resubmit -+ */ -+static int multipath_map(struct dm_target *ti, struct bio *bio, -+ union map_info *map_context) -+{ -+ int r; -+ struct mpath_io *mpio; -+ struct multipath *m = (struct multipath *) ti->private; -+ -+ if (bio_barrier(bio)) -+ return -EOPNOTSUPP; -+ -+ mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); -+ dm_bio_record(&mpio->details, bio); -+ -+ map_context->ptr = mpio; -+ bio->bi_rw |= (1 << BIO_RW_FAILFAST); -+ r = map_io(m, bio, mpio, 0); -+ if (r < 0) -+ mempool_free(mpio, m->mpio_pool); -+ -+ return r; -+} -+ -+/* -+ * Take a path out of use. 
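 *
 * Reached from do_end_io() after an I/O error, or explicitly through
 * the message interface, e.g. (device and path purely illustrative):
 *
 *	dmsetup message <mpath-dev> 0 fail_path 8:16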
-+ */ -+static int fail_path(struct pgpath *pgpath) -+{ -+ unsigned long flags; -+ struct multipath *m = pgpath->pg->m; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ if (!pgpath->path.is_active) -+ goto out; -+ -+ DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); -+ -+ pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); -+ pgpath->path.is_active = 0; -+ pgpath->fail_count++; -+ -+ m->nr_valid_paths--; -+ -+ if (pgpath == m->current_pgpath) -+ m->current_pgpath = NULL; -+ -+ queue_work(kmultipathd, &m->trigger_event); -+ -+out: -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return 0; -+} -+ -+/* -+ * Reinstate a previously-failed path -+ */ -+static int reinstate_path(struct pgpath *pgpath) -+{ -+ int r = 0; -+ unsigned long flags; -+ struct multipath *m = pgpath->pg->m; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ if (pgpath->path.is_active) -+ goto out; -+ -+ if (!pgpath->pg->ps.type) { -+ DMWARN("Reinstate path not supported by path selector %s", -+ pgpath->pg->ps.type->name); -+ r = -EINVAL; -+ goto out; -+ } -+ -+ r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); -+ if (r) -+ goto out; -+ -+ pgpath->path.is_active = 1; -+ -+ m->current_pgpath = NULL; -+ if (!m->nr_valid_paths++ && m->queue_size) -+ queue_work(kmultipathd, &m->process_queued_ios); -+ -+ queue_work(kmultipathd, &m->trigger_event); -+ -+out: -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return r; -+} -+ -+/* -+ * Fail or reinstate all paths that match the provided struct dm_dev. -+ */ -+static int action_dev(struct multipath *m, struct dm_dev *dev, -+ action_fn action) -+{ -+ int r = 0; -+ struct pgpath *pgpath; -+ struct priority_group *pg; -+ -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ list_for_each_entry(pgpath, &pg->pgpaths, list) { -+ if (pgpath->path.dev == dev) -+ r = action(pgpath); -+ } -+ } -+ -+ return r; -+} -+ -+/* -+ * Temporarily try to avoid having to use the specified PG -+ */ -+static void bypass_pg(struct multipath *m, struct priority_group *pg, -+ int bypassed) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ pg->bypassed = bypassed; -+ m->current_pgpath = NULL; -+ m->current_pg = NULL; -+ -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ queue_work(kmultipathd, &m->trigger_event); -+} -+ -+/* -+ * Switch to using the specified PG from the next I/O that gets mapped -+ */ -+static int switch_pg_num(struct multipath *m, const char *pgstr) -+{ -+ struct priority_group *pg; -+ unsigned pgnum; -+ unsigned long flags; -+ -+ if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || -+ (pgnum > m->nr_priority_groups)) { -+ DMWARN("invalid PG number supplied to switch_pg_num"); -+ return -EINVAL; -+ } -+ -+ spin_lock_irqsave(&m->lock, flags); -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ pg->bypassed = 0; -+ if (--pgnum) -+ continue; -+ -+ m->current_pgpath = NULL; -+ m->current_pg = NULL; -+ m->next_pg = pg; -+ } -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ queue_work(kmultipathd, &m->trigger_event); -+ return 0; -+} -+ -+/* -+ * Set/clear bypassed status of a PG. -+ * PGs are numbered upwards from 1 in the order they were declared. 
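 *
 * e.g. "dmsetup message <mpath-dev> 0 disable_group 2" bypasses PG 2,
 * "enable_group 2" clears the bypass and "switch_group 2" makes PG 2
 * the group used for subsequent I/O.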
-+ */ -+static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) -+{ -+ struct priority_group *pg; -+ unsigned pgnum; -+ -+ if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || -+ (pgnum > m->nr_priority_groups)) { -+ DMWARN("invalid PG number supplied to bypass_pg"); -+ return -EINVAL; -+ } -+ -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ if (!--pgnum) -+ break; -+ } -+ -+ bypass_pg(m, pg, bypassed); -+ return 0; -+} -+ -+/* -+ * pg_init must call this when it has completed its initialisation -+ */ -+void dm_pg_init_complete(struct path *path, unsigned err_flags) -+{ -+ struct pgpath *pgpath = path_to_pgpath(path); -+ struct priority_group *pg = pgpath->pg; -+ struct multipath *m = pg->m; -+ unsigned long flags; -+ -+ /* We insist on failing the path if the PG is already bypassed. */ -+ if (err_flags && pg->bypassed) -+ err_flags |= MP_FAIL_PATH; -+ -+ if (err_flags & MP_FAIL_PATH) -+ fail_path(pgpath); -+ -+ if (err_flags & MP_BYPASS_PG) -+ bypass_pg(m, pg, 1); -+ -+ spin_lock_irqsave(&m->lock, flags); -+ if (err_flags) { -+ m->current_pgpath = NULL; -+ m->current_pg = NULL; -+ } else if (!m->pg_init_required) -+ m->queue_io = 0; -+ -+ m->pg_init_in_progress = 0; -+ queue_work(kmultipathd, &m->process_queued_ios); -+ spin_unlock_irqrestore(&m->lock, flags); -+} -+ -+/* -+ * end_io handling -+ */ -+static int do_end_io(struct multipath *m, struct bio *bio, -+ int error, struct mpath_io *mpio) -+{ -+ struct hw_handler *hwh = &m->hw_handler; -+ unsigned err_flags = MP_FAIL_PATH; /* Default behavior */ -+ unsigned long flags; -+ -+ if (!error) -+ return 0; /* I/O complete */ -+ -+ if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) -+ return error; -+ -+ if (error == -EOPNOTSUPP) -+ return error; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ if (!m->nr_valid_paths) { -+ if (!m->queue_if_no_path) { -+ spin_unlock_irqrestore(&m->lock, flags); -+ return -EIO; -+ } else { -+ spin_unlock_irqrestore(&m->lock, flags); -+ goto requeue; -+ } -+ } -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ if (hwh->type && hwh->type->error) -+ err_flags = hwh->type->error(hwh, bio); -+ else -+ err_flags = dm_scsi_err_handler(hwh, bio); -+ -+ if (mpio->pgpath) { -+ if (err_flags & MP_FAIL_PATH) -+ fail_path(mpio->pgpath); -+ -+ if (err_flags & MP_BYPASS_PG) -+ bypass_pg(m, mpio->pgpath->pg, 1); -+ } -+ -+ if (err_flags & MP_ERROR_IO) -+ return -EIO; -+ -+ requeue: -+ dm_bio_restore(&mpio->details, bio); -+ -+ /* queue for the daemon to resubmit or fail */ -+ spin_lock_irqsave(&m->lock, flags); -+ bio_list_add(&m->queued_ios, bio); -+ m->queue_size++; -+ if (!m->queue_io) -+ queue_work(kmultipathd, &m->process_queued_ios); -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return 1; /* io not complete */ -+} -+ -+static int multipath_end_io(struct dm_target *ti, struct bio *bio, -+ int error, union map_info *map_context) -+{ -+ struct multipath *m = (struct multipath *) ti->private; -+ struct mpath_io *mpio = (struct mpath_io *) map_context->ptr; -+ struct pgpath *pgpath = mpio->pgpath; -+ struct path_selector *ps; -+ int r; -+ -+ r = do_end_io(m, bio, error, mpio); -+ if (pgpath) { -+ ps = &pgpath->pg->ps; -+ if (ps->type->end_io) -+ ps->type->end_io(ps, &pgpath->path); -+ } -+ if (r <= 0) -+ mempool_free(mpio, m->mpio_pool); -+ -+ return r; -+} -+ -+/* -+ * Suspend can't complete until all the I/O is processed so if -+ * the last path fails we must error any remaining I/O. 
-+ * Note that if the freeze_bdev fails while suspending, the -+ * queue_if_no_path state is lost - userspace should reset it. -+ */ -+static void multipath_presuspend(struct dm_target *ti) -+{ -+ struct multipath *m = (struct multipath *) ti->private; -+ -+ queue_if_no_path(m, 0, 1); -+} -+ -+/* -+ * Restore the queue_if_no_path setting. -+ */ -+static void multipath_resume(struct dm_target *ti) -+{ -+ struct multipath *m = (struct multipath *) ti->private; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ m->queue_if_no_path = m->saved_queue_if_no_path; -+ spin_unlock_irqrestore(&m->lock, flags); -+} -+ -+/* -+ * Info output has the following format: -+ * num_multipath_feature_args [multipath_feature_args]* -+ * num_handler_status_args [handler_status_args]* -+ * num_groups init_group_number -+ * [A|D|E num_ps_status_args [ps_status_args]* -+ * num_paths num_selector_args -+ * [path_dev A|F fail_count [selector_args]* ]+ ]+ -+ * -+ * Table output has the following format (identical to the constructor string): -+ * num_feature_args [features_args]* -+ * num_handler_args hw_handler [hw_handler_args]* -+ * num_groups init_group_number -+ * [priority selector-name num_ps_args [ps_args]* -+ * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ -+ */ -+static int multipath_status(struct dm_target *ti, status_type_t type, -+ char *result, unsigned int maxlen) -+{ -+ int sz = 0; -+ unsigned long flags; -+ struct multipath *m = (struct multipath *) ti->private; -+ struct hw_handler *hwh = &m->hw_handler; -+ struct priority_group *pg; -+ struct pgpath *p; -+ unsigned pg_num; -+ char state; -+ -+ spin_lock_irqsave(&m->lock, flags); -+ -+ /* Features */ -+ if (type == STATUSTYPE_INFO) -+ DMEMIT("1 %u ", m->queue_size); -+ else if (m->queue_if_no_path) -+ DMEMIT("1 queue_if_no_path "); -+ else -+ DMEMIT("0 "); -+ -+ if (hwh->type && hwh->type->status) -+ sz += hwh->type->status(hwh, type, result + sz, maxlen - sz); -+ else if (!hwh->type || type == STATUSTYPE_INFO) -+ DMEMIT("0 "); -+ else -+ DMEMIT("1 %s ", hwh->type->name); -+ -+ DMEMIT("%u ", m->nr_priority_groups); -+ -+ if (m->next_pg) -+ pg_num = m->next_pg->pg_num; -+ else if (m->current_pg) -+ pg_num = m->current_pg->pg_num; -+ else -+ pg_num = 1; -+ -+ DMEMIT("%u ", pg_num); -+ -+ switch (type) { -+ case STATUSTYPE_INFO: -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ if (pg->bypassed) -+ state = 'D'; /* Disabled */ -+ else if (pg == m->current_pg) -+ state = 'A'; /* Currently Active */ -+ else -+ state = 'E'; /* Enabled */ -+ -+ DMEMIT("%c ", state); -+ -+ if (pg->ps.type->status) -+ sz += pg->ps.type->status(&pg->ps, NULL, type, -+ result + sz, -+ maxlen - sz); -+ else -+ DMEMIT("0 "); -+ -+ DMEMIT("%u %u ", pg->nr_pgpaths, -+ pg->ps.type->info_args); -+ -+ list_for_each_entry(p, &pg->pgpaths, list) { -+ DMEMIT("%s %s %u ", p->path.dev->name, -+ p->path.is_active ? 
"A" : "F", -+ p->fail_count); -+ if (pg->ps.type->status) -+ sz += pg->ps.type->status(&pg->ps, -+ &p->path, type, result + sz, -+ maxlen - sz); -+ } -+ } -+ break; -+ -+ case STATUSTYPE_TABLE: -+ list_for_each_entry(pg, &m->priority_groups, list) { -+ DMEMIT("%s ", pg->ps.type->name); -+ -+ if (pg->ps.type->status) -+ sz += pg->ps.type->status(&pg->ps, NULL, type, -+ result + sz, -+ maxlen - sz); -+ else -+ DMEMIT("0 "); -+ -+ DMEMIT("%u %u ", pg->nr_pgpaths, -+ pg->ps.type->table_args); -+ -+ list_for_each_entry(p, &pg->pgpaths, list) { -+ DMEMIT("%s ", p->path.dev->name); -+ if (pg->ps.type->status) -+ sz += pg->ps.type->status(&pg->ps, -+ &p->path, type, result + sz, -+ maxlen - sz); -+ } -+ } -+ break; -+ } -+ -+ spin_unlock_irqrestore(&m->lock, flags); -+ -+ return 0; -+} -+ -+static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) -+{ -+ int r; -+ struct dm_dev *dev; -+ struct multipath *m = (struct multipath *) ti->private; -+ action_fn action; -+ -+ if (argc == 1) { -+ if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) -+ return queue_if_no_path(m, 1, 0); -+ else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) -+ return queue_if_no_path(m, 0, 0); -+ } -+ -+ if (argc != 2) -+ goto error; -+ -+ if (!strnicmp(argv[0], MESG_STR("disable_group"))) -+ return bypass_pg_num(m, argv[1], 1); -+ else if (!strnicmp(argv[0], MESG_STR("enable_group"))) -+ return bypass_pg_num(m, argv[1], 0); -+ else if (!strnicmp(argv[0], MESG_STR("switch_group"))) -+ return switch_pg_num(m, argv[1]); -+ else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) -+ action = reinstate_path; -+ else if (!strnicmp(argv[0], MESG_STR("fail_path"))) -+ action = fail_path; -+ else -+ goto error; -+ -+ r = dm_get_device(ti, argv[1], ti->begin, ti->len, -+ dm_table_get_mode(ti->table), &dev); -+ if (r) { -+ DMWARN("dm-multipath message: error getting device %s", -+ argv[1]); -+ return -EINVAL; -+ } -+ -+ r = action_dev(m, dev, action); -+ -+ dm_put_device(ti, dev); -+ -+ return r; -+ -+error: -+ DMWARN("Unrecognised multipath message received."); -+ return -EINVAL; -+} -+ -+/*----------------------------------------------------------------- -+ * Module setup -+ *---------------------------------------------------------------*/ -+static struct target_type multipath_target = { -+ .name = "multipath", -+ .version = {1, 0, 4}, -+ .module = THIS_MODULE, -+ .ctr = multipath_ctr, -+ .dtr = multipath_dtr, -+ .map = multipath_map, -+ .end_io = multipath_end_io, -+ .presuspend = multipath_presuspend, -+ .resume = multipath_resume, -+ .status = multipath_status, -+ .message = multipath_message, -+}; -+ -+static int __init dm_multipath_init(void) -+{ -+ int r; -+ -+ /* allocate a slab for the dm_ios */ -+ _mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io), -+ 0, 0, NULL, NULL); -+ if (!_mpio_cache) -+ return -ENOMEM; -+ -+ r = dm_register_target(&multipath_target); -+ if (r < 0) { -+ DMERR("%s: register failed %d", multipath_target.name, r); -+ kmem_cache_destroy(_mpio_cache); -+ return -EINVAL; -+ } -+ -+ kmultipathd = create_workqueue("kmpathd"); -+ if (!kmultipathd) { -+ DMERR("%s: failed to create workqueue kmpathd", -+ multipath_target.name); -+ dm_unregister_target(&multipath_target); -+ kmem_cache_destroy(_mpio_cache); -+ return -ENOMEM; -+ } -+ -+ DMINFO("dm-multipath version %u.%u.%u loaded", -+ multipath_target.version[0], multipath_target.version[1], -+ multipath_target.version[2]); -+ -+ return r; -+} -+ -+static void __exit dm_multipath_exit(void) -+{ -+ int r; -+ -+ 
destroy_workqueue(kmultipathd); -+ -+ r = dm_unregister_target(&multipath_target); -+ if (r < 0) -+ DMERR("%s: target unregister failed %d", -+ multipath_target.name, r); -+ kmem_cache_destroy(_mpio_cache); -+} -+ -+EXPORT_SYMBOL_GPL(dm_pg_init_complete); -+ -+module_init(dm_multipath_init); -+module_exit(dm_multipath_exit); -+ -+MODULE_DESCRIPTION(DM_NAME " multipath target"); -+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); -+MODULE_LICENSE("GPL"); -diff -pruN ./drivers/md.dm/dm-mpath.h ./drivers/md/dm-mpath.h ---- ./drivers/md.dm/dm-mpath.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-mpath.h 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,25 @@ -+/* -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * This file is released under the GPL. -+ * -+ * Multipath. -+ */ -+ -+#ifndef DM_MPATH_H -+#define DM_MPATH_H -+ -+struct dm_dev; -+ -+struct path { -+ struct dm_dev *dev; /* Read-only */ -+ unsigned is_active; /* Read-only */ -+ -+ void *pscontext; /* For path-selector use */ -+ void *hwhcontext; /* For hw-handler use */ -+}; -+ -+/* Callback for hwh_pg_init_fn to use when complete */ -+void dm_pg_init_complete(struct path *path, unsigned err_flags); -+ -+#endif -diff -pruN ./drivers/md.dm/dm-path-selector.c ./drivers/md/dm-path-selector.c ---- ./drivers/md.dm/dm-path-selector.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-path-selector.c 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,156 @@ -+/* -+ * Copyright (C) 2003 Sistina Software. -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * Module Author: Heinz Mauelshagen -+ * -+ * This file is released under the GPL. -+ * -+ * Path selector registration. -+ */ -+ -+#include "dm.h" -+#include "dm-path-selector.h" -+ -+#include <linux/slab.h> -+ -+struct ps_internal { -+ struct path_selector_type pst; -+ -+ struct list_head list; -+ long use; -+}; -+ -+#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) -+ -+static LIST_HEAD(_path_selectors); -+static DECLARE_RWSEM(_ps_lock); -+ -+struct ps_internal *__find_path_selector_type(const char *name) -+{ -+ struct ps_internal *psi; -+ -+ list_for_each_entry(psi, &_path_selectors, list) { -+ if (!strcmp(name, psi->pst.name)) -+ return psi; -+ } -+ -+ return NULL; -+} -+ -+static struct ps_internal *get_path_selector(const char *name) -+{ -+ struct ps_internal *psi; -+ -+ down_read(&_ps_lock); -+ psi = __find_path_selector_type(name); -+ if (psi) { -+ if ((psi->use == 0) && !try_module_get(psi->pst.module)) -+ psi = NULL; -+ else -+ psi->use++; -+ } -+ up_read(&_ps_lock); -+ -+ return psi; -+} -+ -+struct path_selector_type *dm_get_path_selector(const char *name) -+{ -+ struct ps_internal *psi; -+ -+ if (!name) -+ return NULL; -+ -+ psi = get_path_selector(name); -+ if (!psi) { -+ request_module("dm-%s", name); -+ psi = get_path_selector(name); -+ } -+ -+ return psi ? 
&psi->pst : NULL; -+} -+ -+void dm_put_path_selector(struct path_selector_type *pst) -+{ -+ struct ps_internal *psi; -+ -+ if (!pst) -+ return; -+ -+ down_read(&_ps_lock); -+ psi = __find_path_selector_type(pst->name); -+ if (!psi) -+ goto out; -+ -+ if (--psi->use == 0) -+ module_put(psi->pst.module); -+ -+ if (psi->use < 0) -+ BUG(); -+ -+out: -+ up_read(&_ps_lock); -+} -+ -+static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) -+{ -+ struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL); -+ -+ if (psi) { -+ memset(psi, 0, sizeof(*psi)); -+ psi->pst = *pst; -+ } -+ -+ return psi; -+} -+ -+int dm_register_path_selector(struct path_selector_type *pst) -+{ -+ int r = 0; -+ struct ps_internal *psi = _alloc_path_selector(pst); -+ -+ if (!psi) -+ return -ENOMEM; -+ -+ down_write(&_ps_lock); -+ -+ if (__find_path_selector_type(pst->name)) { -+ kfree(psi); -+ r = -EEXIST; -+ } else -+ list_add(&psi->list, &_path_selectors); -+ -+ up_write(&_ps_lock); -+ -+ return r; -+} -+ -+int dm_unregister_path_selector(struct path_selector_type *pst) -+{ -+ struct ps_internal *psi; -+ -+ down_write(&_ps_lock); -+ -+ psi = __find_path_selector_type(pst->name); -+ if (!psi) { -+ up_write(&_ps_lock); -+ return -EINVAL; -+ } -+ -+ if (psi->use) { -+ up_write(&_ps_lock); -+ return -ETXTBSY; -+ } -+ -+ list_del(&psi->list); -+ -+ up_write(&_ps_lock); -+ -+ kfree(psi); -+ -+ return 0; -+} -+ -+EXPORT_SYMBOL_GPL(dm_register_path_selector); -+EXPORT_SYMBOL_GPL(dm_unregister_path_selector); -diff -pruN ./drivers/md.dm/dm-path-selector.h ./drivers/md/dm-path-selector.h ---- ./drivers/md.dm/dm-path-selector.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-path-selector.h 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,93 @@ -+/* -+ * Copyright (C) 2003 Sistina Software. -+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved. -+ * -+ * Module Author: Heinz Mauelshagen -+ * -+ * This file is released under the GPL. -+ * -+ * Path-Selector registration. -+ */ -+ -+#ifndef DM_PATH_SELECTOR_H -+#define DM_PATH_SELECTOR_H -+ -+#include <linux/device-mapper.h> -+ -+#include "dm-mpath.h" -+ -+/* -+ * We provide an abstraction for the code that chooses which path -+ * to send some io down. -+ */ -+struct path_selector_type; -+struct path_selector { -+ struct path_selector_type *type; -+ void *context; -+}; -+ -+/* Information about a path selector type */ -+struct path_selector_type { -+ char *name; -+ struct module *module; -+ -+ unsigned int table_args; -+ unsigned int info_args; -+ -+ /* -+ * Constructs a path selector object, takes custom arguments -+ */ -+ int (*create) (struct path_selector *ps, unsigned argc, char **argv); -+ void (*destroy) (struct path_selector *ps); -+ -+ /* -+ * Add an opaque path object, along with some selector specific -+ * path args (eg, path priority). -+ */ -+ int (*add_path) (struct path_selector *ps, struct path *path, -+ int argc, char **argv, char **error); -+ -+ /* -+ * Chooses a path for this io, if no paths are available then -+ * NULL will be returned. -+ * -+ * repeat_count is the number of times to use the path before -+ * calling the function again. 0 means don't call it again unless -+ * the path fails. -+ */ -+ struct path *(*select_path) (struct path_selector *ps, -+ unsigned *repeat_count); -+ -+ /* -+ * Notify the selector that a path has failed. -+ */ -+ void (*fail_path) (struct path_selector *ps, struct path *p); -+ -+ /* -+ * Ask selector to reinstate a path. 
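 *
 * Returns 0 on success; multipath's reinstate_path() propagates any
 * error and leaves the path failed.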
-+ */
-+ int (*reinstate_path) (struct path_selector *ps, struct path *p);
-+
-+ /*
-+ * Table content based on parameters added in ps_add_path_fn
-+ * or path selector status
-+ */
-+ int (*status) (struct path_selector *ps, struct path *path,
-+ status_type_t type, char *result, unsigned int maxlen);
-+
-+ int (*end_io) (struct path_selector *ps, struct path *path);
-+};
-+
-+/* Register a path selector */
-+int dm_register_path_selector(struct path_selector_type *type);
-+
-+/* Unregister a path selector */
-+int dm_unregister_path_selector(struct path_selector_type *type);
-+
-+/* Returns a registered path selector type */
-+struct path_selector_type *dm_get_path_selector(const char *name);
-+
-+/* Releases a path selector */
-+void dm_put_path_selector(struct path_selector_type *pst);
-+
-+#endif
-diff -pruN ./drivers/md.dm/dm-raid1.c ./drivers/md/dm-raid1.c
---- ./drivers/md.dm/dm-raid1.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-raid1.c 2006-03-17 13:16:38.000000000 +0300
-@@ -6,6 +6,7 @@
- 
- #include "dm.h"
- #include "dm-bio-list.h"
-+#include "dm-bio-record.h"
- #include "dm-io.h"
- #include "dm-log.h"
- #include "kcopyd.h"
-@@ -28,6 +29,8 @@ static inline void wake(void)
- queue_work(_kmirrord_wq, &_kmirrord_work);
- }
- 
-+static struct workqueue_struct *_kmir_mon_wq;
-+
- /*-----------------------------------------------------------------
- * Region hash
- *
-@@ -67,7 +70,7 @@ static inline void wake(void)
- struct mirror_set;
- struct region_hash {
- struct mirror_set *ms;
-- sector_t region_size;
-+ uint32_t region_size;
- unsigned region_shift;
- 
- /* holds persistent region state */
-@@ -135,7 +138,7 @@ static void region_free(void *element, v
- #define MIN_REGIONS 64
- #define MAX_RECOVERY 1
- static int rh_init(struct region_hash *rh, struct mirror_set *ms,
-- struct dirty_log *log, sector_t region_size,
-+ struct dirty_log *log, uint32_t region_size,
- region_t nr_regions)
- {
- unsigned int nr_buckets, max_buckets;
-@@ -253,9 +256,9 @@ static struct region *__rh_alloc(struct
- else {
- __rh_insert(rh, nreg);
- if (nreg->state == RH_CLEAN) {
-- spin_lock_irq(&rh->region_lock);
-+ spin_lock(&rh->region_lock);
- list_add(&nreg->list, &rh->clean_regions);
-- spin_unlock_irq(&rh->region_lock);
-+ spin_unlock(&rh->region_lock);
- }
- reg = nreg;
- }
-@@ -375,16 +378,19 @@ static void rh_inc(struct region_hash *r
- 
- read_lock(&rh->hash_lock);
- reg = __rh_find(rh, region);
-- if (reg->state == RH_CLEAN) {
-- rh->log->type->mark_region(rh->log, reg->key);
- 
-- spin_lock_irq(&rh->region_lock);
-+ spin_lock_irq(&rh->region_lock);
-+ atomic_inc(&reg->pending);
-+
-+ if (reg->state == RH_CLEAN) {
- reg->state = RH_DIRTY;
- list_del_init(&reg->list); /* take off the clean list */
- spin_unlock_irq(&rh->region_lock);
-- }
- 
-- atomic_inc(&reg->pending);
-+ rh->log->type->mark_region(rh->log, reg->key);
-+ } else
-+ spin_unlock_irq(&rh->region_lock);
-+
- read_unlock(&rh->hash_lock);
- }
- 
-@@ -406,17 +412,17 @@ static void rh_dec(struct region_hash *r
- reg = __rh_lookup(rh, region);
- read_unlock(&rh->hash_lock);
- 
-+ spin_lock_irqsave(&rh->region_lock, flags);
- if (atomic_dec_and_test(&reg->pending)) {
-- spin_lock_irqsave(&rh->region_lock, flags);
- if (reg->state == RH_RECOVERING) {
- list_add_tail(&reg->list, &rh->quiesced_regions);
- } else {
- reg->state = RH_CLEAN;
- list_add(&reg->list, &rh->clean_regions);
- }
-- spin_unlock_irqrestore(&rh->region_lock, flags);
- should_wake = 1;
- }
-+ spin_unlock_irqrestore(&rh->region_lock, flags);
- 
- if (should_wake)
- wake();
-@@ -539,7 +545,8 @@ static void rh_start_recovery(struct reg
- * Mirror set structures.
- *---------------------------------------------------------------*/
- struct mirror {
-- atomic_t error_count;
-+ atomic_t error_count; /* Error counter to flag mirror failure */
-+ struct mirror_set *ms;
- struct dm_dev *dev;
- sector_t offset;
- };
-@@ -550,36 +557,59 @@ struct mirror_set {
- struct region_hash rh;
- struct kcopyd_client *kcopyd_client;
- 
-- spinlock_t lock; /* protects the next two lists */
-+ spinlock_t lock; /* protects the lists */
- struct bio_list reads;
- struct bio_list writes;
-+ struct bio_list failures;
-+ struct work_struct failure_work;
-+ struct completion failure_completion;
- 
- /* recovery */
-+ atomic_t suspended;
- region_t nr_regions;
- int in_sync;
- 
- unsigned int nr_mirrors;
-- struct mirror mirror[0];
-+ spinlock_t choose_lock; /* protects select in choose_mirror(). */
-+ atomic_t read_count; /* Read counter for read balancing. */
-+ unsigned int read_mirror; /* Last mirror read. */
-+ struct mirror *default_mirror; /* Default mirror. */
-+ struct mirror mirror[0];
- };
- 
-+struct bio_map_info {
-+ struct mirror *bmi_m;
-+ struct dm_bio_details bmi_bd;
-+};
-+
-+static mempool_t *bio_map_info_pool = NULL;
-+
-+static void *bio_map_info_alloc(int gfp_mask, void *pool_data){
-+ return kmalloc(sizeof(struct bio_map_info), gfp_mask);
-+}
-+
-+static void bio_map_info_free(void *element, void *pool_data){
-+ kfree(element);
-+}
-+
- /*
- * Every mirror should look like this one.
- */
- #define DEFAULT_MIRROR 0
- 
- /*
-- * This is yucky. We squirrel the mirror_set struct away inside
-- * bi_next for write buffers. This is safe since the bh
-+ * This is yucky. We squirrel the mirror struct away inside
-+ * bi_next for read/write buffers. This is safe since the bh
- * doesn't get submitted to the lower levels of block layer.
- */ --static struct mirror_set *bio_get_ms(struct bio *bio) -+static struct mirror *bio_get_m(struct bio *bio) - { -- return (struct mirror_set *) bio->bi_next; -+ return (struct mirror *) bio->bi_next; - } - --static void bio_set_ms(struct bio *bio, struct mirror_set *ms) -+static void bio_set_m(struct bio *bio, struct mirror *m) - { -- bio->bi_next = (struct bio *) ms; -+ bio->bi_next = (struct bio *) m; - } - - /*----------------------------------------------------------------- -@@ -607,7 +637,7 @@ static int recover(struct mirror_set *ms - unsigned long flags = 0; - - /* fill in the source */ -- m = ms->mirror + DEFAULT_MIRROR; -+ m = ms->default_mirror; - from.bdev = m->dev->bdev; - from.sector = m->offset + region_to_sector(reg->rh, reg->key); - if (reg->key == (ms->nr_regions - 1)) { -@@ -623,7 +653,7 @@ static int recover(struct mirror_set *ms - - /* fill in the destinations */ - for (i = 0, dest = to; i < ms->nr_mirrors; i++) { -- if (i == DEFAULT_MIRROR) -+ if (&ms->mirror[i] == ms->default_mirror) - continue; - - m = ms->mirror + i; -@@ -673,42 +703,163 @@ static void do_recovery(struct mirror_se - } - - /*----------------------------------------------------------------- -- * Reads -+ * Misc Functions - *---------------------------------------------------------------*/ --static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) -+#define MIN_READS 128 -+/* -+ * choose_mirror -+ * @ms: the mirror set -+ * @m: mirror that has failed, or NULL if just choosing -+ * -+ * Returns: chosen mirror, or NULL on failure -+ */ -+static struct mirror *choose_mirror(struct mirror_set *ms, struct mirror *m) - { -- /* FIXME: add read balancing */ -- return ms->mirror + DEFAULT_MIRROR; -+ int i, retry; -+ unsigned long flags; -+ struct mirror *ret = NULL; -+ -+ spin_lock_irqsave(&ms->choose_lock, flags); -+ -+ if (unlikely(m == ms->default_mirror)) { -+ i = DEFAULT_MIRROR; -+ atomic_set(&ms->read_count, MIN_READS); -+ } else -+ i = ms->read_mirror; -+ -+ for (retry = 0; retry < ms->nr_mirrors; ) { -+ i %= ms->nr_mirrors; -+ ret = ms->mirror + i; -+ -+ if (unlikely(atomic_read(&ret->error_count))) { -+ retry++; -+ i++; -+ } else { -+ /* -+ * Guarantee that a number of read IOs -+ * get queued to the same mirror. -+ */ -+ if (atomic_dec_and_test(&ms->read_count)) { -+ atomic_set(&ms->read_count, MIN_READS); -+ i++; -+ } -+ -+ ms->read_mirror = i; -+ break; -+ } -+ } -+ -+ /* Check for failure of default mirror, reset if necessary */ -+ if (unlikely(m == ms->default_mirror)) -+ ms->default_mirror = ret; -+ -+ spin_unlock_irqrestore(&ms->choose_lock, flags); -+ -+ if (unlikely(atomic_read(&ret->error_count))) { -+ DMERR("All mirror devices are dead. Unable to choose mirror."); -+ return NULL; -+ } -+ -+ return ret; -+} -+ -+static void fail_mirror(struct mirror *m) -+{ -+ DMINFO("incrementing error_count on %s", m->dev->name); -+ atomic_inc(&m->error_count); -+ -+ choose_mirror(m->ms, m); -+} -+ -+static int default_ok(struct mirror *m) -+{ -+ return !atomic_read(&m->ms->default_mirror->error_count); - } - - /* - * remap a buffer to a particular mirror. 
- */
--static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
-+static sector_t map_sector(struct mirror *m, struct bio *bio)
-+{
-+ return m->offset + (bio->bi_sector - m->ms->ti->begin);
-+}
-+
-+static void map_bio(struct mirror *m, struct bio *bio)
- {
- bio->bi_bdev = m->dev->bdev;
-- bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
-+ bio->bi_sector = map_sector(m, bio);
-+}
-+
-+static void map_region(struct io_region *io, struct mirror *m,
-+ struct bio *bio)
-+{
-+ io->bdev = m->dev->bdev;
-+ io->sector = map_sector(m, bio);
-+ io->count = bio->bi_size >> 9;
-+}
-+
-+/*-----------------------------------------------------------------
-+ * Reads
-+ *---------------------------------------------------------------*/
-+static void read_callback(unsigned long error, void *context)
-+{
-+ struct bio *bio = (struct bio *)context;
-+ struct mirror *m;
-+
-+ m = bio_get_m(bio);
-+ bio_set_m(bio, NULL);
-+
-+ if (unlikely(error)) {
-+ DMWARN("A read failure occurred on a mirror device.");
-+ fail_mirror(m);
-+ if (likely(default_ok(m))) {
-+ DMWARN("Trying different device.");
-+ queue_bio(m->ms, bio, bio_rw(bio));
-+ } else {
-+ DMERR("No other device available, failing I/O.");
-+ bio_endio(bio, 0, -EIO);
-+ }
-+ } else
-+ bio_endio(bio, bio->bi_size, 0);
-+}
-+
-+/* Asynchronous read. */
-+static void read_async_bio(struct mirror *m, struct bio *bio)
-+{
-+ struct io_region io;
-+
-+ map_region(&io, m, bio);
-+ bio_set_m(bio, m);
-+ dm_io_async_bvec(1, &io, READ,
-+ bio->bi_io_vec + bio->bi_idx,
-+ read_callback, bio);
- }
- 
- static void do_reads(struct mirror_set *ms, struct bio_list *reads)
- {
-- region_t region;
- struct bio *bio;
- struct mirror *m;
- 
- while ((bio = bio_list_pop(reads))) {
-- region = bio_to_region(&ms->rh, bio);
--
- /*
- * We can only read balance if the region is in sync.
- */
-- if (rh_in_sync(&ms->rh, region, 0))
-- m = choose_mirror(ms, bio->bi_sector);
-- else
-- m = ms->mirror + DEFAULT_MIRROR;
-+ if (likely(rh_in_sync(&ms->rh,
-+ bio_to_region(&ms->rh, bio),
-+ 0)))
-+ m = choose_mirror(ms, NULL);
-+ else {
-+ m = ms->default_mirror;
-+
-+ /* If the default fails, we give up. */
-+ if (unlikely(m && atomic_read(&m->error_count)))
-+ m = NULL;
-+ }
- 
-- map_bio(ms, m, bio);
-- generic_make_request(bio);
-+ if (likely(m))
-+ read_async_bio(m, bio);
-+ else
-+ bio_endio(bio, 0, -EIO);
- }
- }
- 
-@@ -722,56 +873,116 @@ static void do_reads(struct mirror_set *
- * RECOVERING: delay the io until recovery completes
- * NOSYNC: increment pending, just write to the default mirror
- *---------------------------------------------------------------*/
-+static void write_failure_handler(void *data)
-+{
-+ struct bio *bio;
-+ struct bio_list failed_writes;
-+ struct mirror_set *ms = (struct mirror_set *)data;
-+ struct dirty_log *log = ms->rh.log;
-+
-+ if (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK) {
-+ dm_table_event(ms->ti->table);
-+ wait_for_completion(&ms->failure_completion);
-+ }
-+
-+ /* Take list out to handle endios. */
-+ spin_lock_irq(&ms->lock);
-+ failed_writes = ms->failures;
-+ bio_list_init(&ms->failures);
-+ spin_unlock_irq(&ms->lock);
-+
-+ while ((bio = bio_list_pop(&failed_writes)))
-+ bio_endio(bio, bio->bi_size, 0);
-+}
-+
- static void write_callback(unsigned long error, void *context)
- {
-- unsigned int i;
-- int uptodate = 1;
-+ unsigned int i, ret = 0;
- struct bio *bio = (struct bio *) context;
- struct mirror_set *ms;
--
-- ms = bio_get_ms(bio);
-- bio_set_ms(bio, NULL);
--
-+ int uptodate = 0, run;
-+
-+ ms = (bio_get_m(bio))->ms;
-+ bio_set_m(bio, NULL);
-+
- /*
- * NOTE: We don't decrement the pending count here,
- * instead it is done by the targets endio function.
- * This way we handle both writes to SYNC and NOSYNC
- * regions with the same code.
- */
-+ if (unlikely(error)) {
-+ DMERR("Error during write occurred.");
- 
-- if (error) {
- /*
-- * only error the io if all mirrors failed.
-- * FIXME: bogus
-+ * Test all bits - if all failed, fail io.
-+ * Otherwise, go through hassle of failing a device...
- */
-- uptodate = 0;
-- for (i = 0; i < ms->nr_mirrors; i++)
-- if (!test_bit(i, &error)) {
-+ for (i = 0; i < ms->nr_mirrors; i++) {
-+ if (test_bit(i, &error))
-+ fail_mirror(ms->mirror + i);
-+ else
- uptodate = 1;
-- break;
-+ }
-+
-+ if (likely(uptodate)) {
-+ spin_lock(&ms->lock);
-+ if (atomic_read(&ms->suspended)) {
-+ /*
-+ * The device is suspended, it is
-+ * safe to complete I/O.
-+ */
-+ spin_unlock(&ms->lock);
-+ } else {
-+ /*
-+ * Need to raise event. Since raising
-+ * events can block, we need to do it in
-+ * separate thread.
-+ *
-+ * run gets set if this will be the first
-+ * bio in the list.
-+ */
-+ run = !ms->failures.head;
-+ bio_list_add(&ms->failures, bio);
-+ spin_unlock(&ms->lock);
-+
-+ if (run)
-+ queue_work(_kmir_mon_wq,
-+ &ms->failure_work);
-+
-+ return;
- }
-+ } else {
-+ DMERR("All replicated volumes dead, failing I/O");
-+ /* None of the writes succeeded, fail the I/O. */
-+ ret = -EIO;
-+ }
- }
-- bio_endio(bio, bio->bi_size, 0);
-+
-+ bio_endio(bio, bio->bi_size, ret);
- }
- 
- static void do_write(struct mirror_set *ms, struct bio *bio)
- {
- unsigned int i;
-- struct io_region io[KCOPYD_MAX_REGIONS+1];
-+ struct io_region io[ms->nr_mirrors], *dest = io;
- struct mirror *m;
- 
-- for (i = 0; i < ms->nr_mirrors; i++) {
-- m = ms->mirror + i;
--
-- io[i].bdev = m->dev->bdev;
-- io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-- io[i].count = bio->bi_size >> 9;
-- }
-+ for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
-+ map_region(dest++, m, bio);
- 
-- bio_set_ms(bio, ms);
-- dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
-- bio->bi_io_vec + bio->bi_idx,
-- write_callback, bio);
-+ if (likely(dest - io)) {
-+ /*
-+ * We can use the default mirror here, because we
-+ * only need it in order to retrieve the reference
-+ * to the mirror set in write_callback().
-+ */
-+ bio_set_m(bio, ms->default_mirror);
-+ dm_io_async_bvec(dest - io, io, WRITE,
-+ bio->bi_io_vec + bio->bi_idx,
-+ write_callback, bio);
-+ } else
-+ bio_endio(bio, bio->bi_size, -EIO);
- }
- 
- static void do_writes(struct mirror_set *ms, struct bio_list *writes)
-@@ -779,6 +990,9 @@ static void do_writes(struct mirror_set
- int state;
- struct bio *bio;
- struct bio_list sync, nosync, recover, *this_list = NULL;
-+ struct bio_list requeue;
-+ struct dirty_log *log = ms->rh.log;
-+ region_t region;
- 
- if (!writes->head)
- return;
-@@ -789,9 +1003,18 @@ static void do_writes(struct mirror_set
- bio_list_init(&sync);
- bio_list_init(&nosync);
- bio_list_init(&recover);
-+ bio_list_init(&requeue);
- 
- while ((bio = bio_list_pop(writes))) {
-- state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
-+ region = bio_to_region(&ms->rh, bio);
-+
-+ if (log->type->is_remote_recovering &&
-+ log->type->is_remote_recovering(log, region)) {
-+ bio_list_add(&requeue, bio);
-+ continue;
-+ }
-+
-+ state = rh_state(&ms->rh, region, 1);
- switch (state) {
- case RH_CLEAN:
- case RH_DIRTY:
-@@ -810,6 +1033,8 @@ static void do_writes(struct mirror_set
- bio_list_add(this_list, bio);
- }
- 
-+ bio_list_merge(writes, &requeue);
-+
- /*
- * Increment the pending counts for any regions that will
- * be written to (writes to recover regions are going to
-@@ -829,7 +1054,7 @@ static void do_writes(struct mirror_set
- rh_delay(&ms->rh, bio);
- 
- while ((bio = bio_list_pop(&nosync))) {
-- map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
-+ map_bio(ms->default_mirror, bio);
- generic_make_request(bio);
- }
- }
-@@ -844,12 +1069,12 @@ static void do_mirror(struct mirror_set
- {
- struct bio_list reads, writes;
- 
-- spin_lock(&ms->lock);
-+ spin_lock_irq(&ms->lock);
- reads = ms->reads;
- writes = ms->writes;
- bio_list_init(&ms->reads);
- bio_list_init(&ms->writes);
-- spin_unlock(&ms->lock);
-+ spin_unlock_irq(&ms->lock);
- 
- rh_update_states(&ms->rh);
- do_recovery(ms);
-@@ -871,7 +1096,7 @@ static void do_work(void *ignored)
- * Target functions
- *---------------------------------------------------------------*/
- static struct mirror_set *alloc_context(unsigned int nr_mirrors,
-- sector_t region_size,
-+ uint32_t region_size,
- struct dm_target *ti,
- struct dirty_log *dl)
- {
-@@ -891,11 +1116,16 @@ static struct mirror_set *alloc_context(
- 
- memset(ms, 0, len);
- spin_lock_init(&ms->lock);
-+ spin_lock_init(&ms->choose_lock);
- 
- ms->ti = ti;
- ms->nr_mirrors = nr_mirrors;
-- ms->nr_regions = dm_div_up(ti->len, region_size);
-+ ms->nr_regions = dm_sector_div_up(ti->len, region_size);
- ms->in_sync = 0;
-+ ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
-+
-+ /* a resume must be issued to start the device */
-+ atomic_set(&ms->suspended, 1);
- 
- if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
- ti->error = "dm-mirror: Error creating dirty region hash";
-@@ -903,6 +1133,13 @@ static struct mirror_set *alloc_context(
- return NULL;
- }
- 
-+ atomic_set(&ms->read_count, MIN_READS);
-+
-+ bio_list_init(&ms->failures);
-+ INIT_WORK(&ms->failure_work, write_failure_handler, ms);
-+
-+ init_completion(&ms->failure_completion);
-+
- return ms;
- }
- 
-@@ -916,7 +1153,7 @@ static void free_context(struct mirror_s
- kfree(ms);
- }
- 
--static inline int _check_region_size(struct dm_target *ti, sector_t size)
-+static inline int _check_region_size(struct dm_target *ti, uint32_t size)
- {
- return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
- size > ti->len);
-@@ -940,6 +1177,8 @@ static int get_mirror(struct mirror_set
- }
- 
- ms->mirror[mirror].offset = offset;
-+ atomic_set(&(ms->mirror[mirror].error_count), 0);
-+ ms->mirror[mirror].ms = ms;
- 
- return 0;
- }
-@@ -1009,8 +1248,8 @@ static struct dirty_log *create_dirty_lo
- * log_type #log_params <log_params>
- * #mirrors [mirror_path offset]{2,}
- *
-- * For now, #log_params = 1, log_type = "core"
--
-+ * log_type is "core" or "disk"
-+ * #log_params is between 1 and 3
- */
- #define DM_IO_PAGES 64
- static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
-@@ -1060,6 +1299,7 @@ static int mirror_ctr(struct dm_target *
- }
- 
- ti->private = ms;
-+ ti->split_io = ms->rh.region_size;
- 
- r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
- if (r) {
-@@ -1082,14 +1322,15 @@ static void mirror_dtr(struct dm_target
- 
- static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
- {
-+ unsigned long flags;
- int should_wake = 0;
- struct bio_list *bl;
- 
- bl = (rw == WRITE) ? &ms->writes : &ms->reads;
-- spin_lock(&ms->lock);
-+ spin_lock_irqsave(&ms->lock, flags);
- should_wake = !(bl->head);
- bio_list_add(bl, bio);
-- spin_unlock(&ms->lock);
-+ spin_unlock_irqrestore(&ms->lock, flags);
- 
- if (should_wake)
- wake();
-@@ -1104,42 +1345,64 @@ static int mirror_map(struct dm_target *
- int r, rw = bio_rw(bio);
- struct mirror *m;
- struct mirror_set *ms = ti->private;
--
-- map_context->ll = bio->bi_sector >> ms->rh.region_shift;
-+ struct dm_bio_details *bd;
-+ struct bio_map_info *bmi;
- 
- if (rw == WRITE) {
-+ /* Save region for mirror_end_io() handler */
-+ map_context->ll = bio_to_region(&ms->rh, bio);
- queue_bio(ms, bio, rw);
- return 0;
- }
- 
-+ /* It's all about the READs now */
-+
- r = ms->rh.log->type->in_sync(ms->rh.log,
- bio_to_region(&ms->rh, bio), 0);
- if (r < 0 && r != -EWOULDBLOCK)
- return r;
- 
-- if (r == -EWOULDBLOCK) /* FIXME: ugly */
-+ if (r == -EWOULDBLOCK)
- r = 0;
- 
-- /*
-- * We don't want to fast track a recovery just for a read
-- * ahead. So we just let it silently fail.
-- * FIXME: get rid of this.
-- */
-- if (!r && rw == READA)
-- return -EIO;
-+ if (likely(r)) {
-+ /*
-+ * Optimize reads by avoiding to hand them to daemon.
-+ *
-+ * In case they fail, queue them for another shot
-+ * in the mirror_end_io() function.
-+ */
-+ m = choose_mirror(ms, NULL);
-+ if (likely(m)) {
-+ bmi = mempool_alloc(bio_map_info_pool, GFP_NOIO);
-+
-+ if (likely(bmi)) {
-+ /* without this, a read is not retryable */
-+ bd = &bmi->bmi_bd;
-+ dm_bio_record(bd, bio);
-+ map_context->ptr = bmi;
-+ bmi->bmi_m = m;
-+ } else {
-+ /* we could fail now, but we can at least **
-+ ** give it a shot. The bd is only used to **
-+ ** retry in the event of a failure anyway. **
-+ ** If we fail, we can fail the I/O then. */
-+ map_context->ptr = NULL;
-+ }
-+
-+ map_bio(m, bio);
-+ return 1; /* Mapped -> queue request. */
-+ } else
-+ return -EIO;
-+ } else {
-+ /* Either not clean, or -EWOULDBLOCK */
-+ if (rw == READA)
-+ return -EWOULDBLOCK;
- 
-- if (!r) {
-- /* Pass this io over to the daemon */
- queue_bio(ms, bio, rw);
-- return 0;
- }
- 
-- m = choose_mirror(ms, bio->bi_sector);
-- if (!m)
-- return -EIO;
--
-- map_bio(ms, m, bio);
-- return 1;
-+ return 0;
- }
- 
- static int mirror_end_io(struct dm_target *ti, struct bio *bio,
-@@ -1147,71 +1410,140 @@ static int mirror_end_io(struct dm_targe
- {
- int rw = bio_rw(bio);
- struct mirror_set *ms = (struct mirror_set *) ti->private;
-- region_t region = map_context->ll;
-+ struct mirror *m = NULL;
-+ struct dm_bio_details *bd = NULL;
- 
- /*
- * We need to dec pending if this was a write.
- */
-- if (rw == WRITE)
-- rh_dec(&ms->rh, region);
-+ if (rw == WRITE) {
-+ rh_dec(&ms->rh, map_context->ll);
-+ return error;
-+ }
- 
-- return 0;
-+ if (error == -EOPNOTSUPP)
-+ goto out;
-+
-+ if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-+ goto out;
-+
-+ if (unlikely(error)) {
-+ DMERR("A read failure occurred on a mirror device.");
-+ if (!map_context->ptr) {
-+ /*
-+ * There wasn't enough memory to record necessary
-+ * information for a retry.
-+ */
-+ DMERR("Out of memory causing inability to retry read.");
-+ return -EIO;
-+ }
-+ m = ((struct bio_map_info *)map_context->ptr)->bmi_m;
-+ fail_mirror(m); /* Flag error on mirror. */
-+
-+ /*
-+ * A failed read needs to get queued
-+ * to the daemon for another shot to
-+ * one (if any) intact mirrors.
-+ */
-+ if (default_ok(m)) {
-+ bd = &(((struct bio_map_info *)map_context->ptr)->bmi_bd);
-+
-+ DMWARN("Trying different device.");
-+ dm_bio_restore(bd, bio);
-+ mempool_free(map_context->ptr, bio_map_info_pool);
-+ map_context->ptr = NULL;
-+ queue_bio(ms, bio, rw);
-+ return 1; /* We want another shot on the bio. */
-+ }
-+ DMERR("All replicated volumes dead, failing I/O");
-+ }
-+
-+ out:
-+ if (map_context->ptr)
-+ mempool_free(map_context->ptr, bio_map_info_pool);
-+
-+ return error;
- }
- 
--static void mirror_suspend(struct dm_target *ti)
-+static void mirror_presuspend(struct dm_target *ti)
- {
- struct mirror_set *ms = (struct mirror_set *) ti->private;
- struct dirty_log *log = ms->rh.log;
-+ unsigned long flags;
-+ int run;
-+
-+ /*
-+ * Only run the completion if we are suspending after
-+ * a disk failure.
-+ */
-+ spin_lock_irqsave(&ms->lock, flags);
-+ run = ms->failures.head ? 1 : 0;
-+ spin_unlock_irqrestore(&ms->lock, flags);
-+
-+ if (run && (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK))
-+ complete(&ms->failure_completion);
-+
-+ if (log->type->presuspend && log->type->presuspend(log))
-+ /* FIXME: need better error handling */
-+ DMWARN("log presuspend failed");
-+
-+}
-+
-+static void mirror_postsuspend(struct dm_target *ti)
-+{
-+ struct mirror_set *ms = (struct mirror_set *) ti->private;
-+ struct dirty_log *log = ms->rh.log;
-+
- rh_stop_recovery(&ms->rh);
-- if (log->type->suspend && log->type->suspend(log))
-+ if (log->type->postsuspend && log->type->postsuspend(log))
- /* FIXME: need better error handling */
-- DMWARN("log suspend failed");
-+ DMWARN("log postsuspend failed");
-+ atomic_set(&ms->suspended, 1);
- }
- 
- static void mirror_resume(struct dm_target *ti)
- {
- struct mirror_set *ms = (struct mirror_set *) ti->private;
- struct dirty_log *log = ms->rh.log;
-+
- if (log->type->resume && log->type->resume(log))
- /* FIXME: need better error handling */
- DMWARN("log resume failed");
-- rh_start_recovery(&ms->rh);
-+
-+ if (atomic_dec_and_test(&ms->suspended))
-+ rh_start_recovery(&ms->rh);
-+ atomic_set(&ms->suspended, 0);
- }
- 
- static int mirror_status(struct dm_target *ti, status_type_t type,
- char *result, unsigned int maxlen)
- {
-- char buffer[32];
- unsigned int m, sz = 0;
- struct mirror_set *ms = (struct mirror_set *) ti->private;
--
--#define EMIT(x...) sz += ((sz >= maxlen) ? \
-- 0 : scnprintf(result + sz, maxlen - sz, x))
-+ char buffer[ms->nr_mirrors + 1];
- 
- switch (type) {
- case STATUSTYPE_INFO:
-- EMIT("%d ", ms->nr_mirrors);
--
-+ DMEMIT("%d ", ms->nr_mirrors);
- for (m = 0; m < ms->nr_mirrors; m++) {
-- format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev);
-- EMIT("%s ", buffer);
-+ DMEMIT("%s ", ms->mirror[m].dev->name);
-+ buffer[m] = atomic_read(&(ms->mirror[m].error_count)) ?
-+ 'D' : 'A'; - } -+ buffer[m] = '\0'; - -- EMIT(SECTOR_FORMAT "/" SECTOR_FORMAT, -- ms->rh.log->type->get_sync_count(ms->rh.log), -- ms->nr_regions); -+ DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT " 1 %s ", -+ ms->rh.log->type->get_sync_count(ms->rh.log), -+ ms->nr_regions, buffer); -+ ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); - break; - - case STATUSTYPE_TABLE: -- EMIT("%s 1 " SECTOR_FORMAT " %d ", -- ms->rh.log->type->name, ms->rh.region_size, -- ms->nr_mirrors); -- -- for (m = 0; m < ms->nr_mirrors; m++) { -- format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev); -- EMIT("%s " SECTOR_FORMAT " ", -- buffer, ms->mirror[m].offset); -- } -+ sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); -+ DMEMIT("%d ", ms->nr_mirrors); -+ for (m = 0; m < ms->nr_mirrors; m++) -+ DMEMIT("%s " SECTOR_FORMAT " ", -+ ms->mirror[m].dev->name, ms->mirror[m].offset); - } - - return 0; -@@ -1219,13 +1551,14 @@ static int mirror_status(struct dm_targe - - static struct target_type mirror_target = { - .name = "mirror", -- .version = {1, 0, 1}, -+ .version = {1, 1, 0}, - .module = THIS_MODULE, - .ctr = mirror_ctr, - .dtr = mirror_dtr, - .map = mirror_map, - .end_io = mirror_end_io, -- .suspend = mirror_suspend, -+ .presuspend = mirror_presuspend, -+ .postsuspend = mirror_postsuspend, - .resume = mirror_resume, - .status = mirror_status, - }; -@@ -1234,24 +1567,38 @@ static int __init dm_mirror_init(void) - { - int r; - -+ bio_map_info_pool = mempool_create(100, bio_map_info_alloc, -+ bio_map_info_free, NULL); -+ if (!bio_map_info_pool) -+ return -ENOMEM; -+ - r = dm_dirty_log_init(); - if (r) - return r; - -- _kmirrord_wq = create_workqueue("kmirrord"); -+ _kmirrord_wq = create_singlethread_workqueue("kmirrord"); - if (!_kmirrord_wq) { - DMERR("couldn't start kmirrord"); - dm_dirty_log_exit(); -- return r; -+ return -ENOMEM; - } - INIT_WORK(&_kmirrord_work, do_work, NULL); - -+ _kmir_mon_wq = create_singlethread_workqueue("kmir_mon"); -+ if (!_kmir_mon_wq) { -+ DMERR("couldn't start kmir_mon"); -+ dm_dirty_log_exit(); -+ destroy_workqueue(_kmirrord_wq); -+ return -ENOMEM; -+ } -+ - r = dm_register_target(&mirror_target); - if (r < 0) { - DMERR("%s: Failed to register mirror target", - mirror_target.name); - dm_dirty_log_exit(); - destroy_workqueue(_kmirrord_wq); -+ destroy_workqueue(_kmir_mon_wq); - } - - return r; -diff -pruN ./drivers/md.dm/dm-round-robin.c ./drivers/md/dm-round-robin.c ---- ./drivers/md.dm/dm-round-robin.c 1970-01-01 03:00:00.000000000 +0300 -+++ ./drivers/md/dm-round-robin.c 2006-03-17 13:16:38.000000000 +0300 -@@ -0,0 +1,214 @@ -+/* -+ * Copyright (C) 2003 Sistina Software. -+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. -+ * -+ * Module Author: Heinz Mauelshagen -+ * -+ * This file is released under the GPL. -+ * -+ * Round-robin path selector. 
-+ */ -+ -+#include "dm.h" -+#include "dm-path-selector.h" -+ -+#include <linux/slab.h> -+ -+/*----------------------------------------------------------------- -+ * Path-handling code, paths are held in lists -+ *---------------------------------------------------------------*/ -+struct path_info { -+ struct list_head list; -+ struct path *path; -+ unsigned repeat_count; -+}; -+ -+static void free_paths(struct list_head *paths) -+{ -+ struct path_info *pi, *next; -+ -+ list_for_each_entry_safe(pi, next, paths, list) { -+ list_del(&pi->list); -+ kfree(pi); -+ } -+} -+ -+/*----------------------------------------------------------------- -+ * Round-robin selector -+ *---------------------------------------------------------------*/ -+ -+#define RR_MIN_IO 1000 -+ -+struct selector { -+ struct list_head valid_paths; -+ struct list_head invalid_paths; -+}; -+ -+static struct selector *alloc_selector(void) -+{ -+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); -+ -+ if (s) { -+ INIT_LIST_HEAD(&s->valid_paths); -+ INIT_LIST_HEAD(&s->invalid_paths); -+ } -+ -+ return s; -+} -+ -+static int rr_create(struct path_selector *ps, unsigned argc, char **argv) -+{ -+ struct selector *s; -+ -+ s = alloc_selector(); -+ if (!s) -+ return -ENOMEM; -+ -+ ps->context = s; -+ return 0; -+} -+ -+static void rr_destroy(struct path_selector *ps) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ -+ free_paths(&s->valid_paths); -+ free_paths(&s->invalid_paths); -+ kfree(s); -+ ps->context = NULL; -+} -+ -+static int rr_status(struct path_selector *ps, struct path *path, -+ status_type_t type, char *result, unsigned int maxlen) -+{ -+ struct path_info *pi; -+ int sz = 0; -+ -+ if (!path) -+ DMEMIT("0 "); -+ else { -+ switch(type) { -+ case STATUSTYPE_INFO: -+ break; -+ case STATUSTYPE_TABLE: -+ pi = path->pscontext; -+ DMEMIT("%u ", pi->repeat_count); -+ break; -+ } -+ } -+ -+ return sz; -+} -+ -+/* -+ * Called during initialisation to register each path with an -+ * optional repeat_count. 
-+ */ -+static int rr_add_path(struct path_selector *ps, struct path *path, -+ int argc, char **argv, char **error) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ struct path_info *pi; -+ unsigned repeat_count = RR_MIN_IO; -+ -+ if (argc > 1) { -+ *error = "round-robin ps: incorrect number of arguments"; -+ return -EINVAL; -+ } -+ -+ /* First path argument is number of I/Os before switching path */ -+ if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { -+ *error = "round-robin ps: invalid repeat count"; -+ return -EINVAL; -+ } -+ -+ /* allocate the path */ -+ pi = kmalloc(sizeof(*pi), GFP_KERNEL); -+ if (!pi) { -+ *error = "round-robin ps: Error allocating path context"; -+ return -ENOMEM; -+ } -+ -+ pi->path = path; -+ pi->repeat_count = repeat_count; -+ -+ path->pscontext = pi; -+ -+ list_add(&pi->list, &s->valid_paths); -+ -+ return 0; -+} -+ -+static void rr_fail_path(struct path_selector *ps, struct path *p) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ struct path_info *pi = p->pscontext; -+ -+ list_move(&pi->list, &s->invalid_paths); -+} -+ -+static int rr_reinstate_path(struct path_selector *ps, struct path *p) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ struct path_info *pi = p->pscontext; -+ -+ list_move(&pi->list, &s->valid_paths); -+ -+ return 0; -+} -+ -+static struct path *rr_select_path(struct path_selector *ps, -+ unsigned *repeat_count) -+{ -+ struct selector *s = (struct selector *) ps->context; -+ struct path_info *pi = NULL; -+ -+ if (!list_empty(&s->valid_paths)) { -+ pi = list_entry(s->valid_paths.next, struct path_info, list); -+ list_move_tail(&pi->list, &s->valid_paths); -+ *repeat_count = pi->repeat_count; -+ } -+ -+ return pi ? pi->path : NULL; -+} -+ -+static struct path_selector_type rr_ps = { -+ .name = "round-robin", -+ .module = THIS_MODULE, -+ .table_args = 1, -+ .info_args = 0, -+ .create = rr_create, -+ .destroy = rr_destroy, -+ .status = rr_status, -+ .add_path = rr_add_path, -+ .fail_path = rr_fail_path, -+ .reinstate_path = rr_reinstate_path, -+ .select_path = rr_select_path, -+}; -+ -+static int __init dm_rr_init(void) -+{ -+ int r = dm_register_path_selector(&rr_ps); -+ -+ if (r < 0) -+ DMERR("round-robin: register failed %d", r); -+ -+ DMINFO("dm-round-robin version 1.0.0 loaded"); -+ -+ return r; -+} -+ -+static void __exit dm_rr_exit(void) -+{ -+ int r = dm_unregister_path_selector(&rr_ps); -+ -+ if (r < 0) -+ DMERR("round-robin: unregister failed %d", r); -+} -+ -+module_init(dm_rr_init); -+module_exit(dm_rr_exit); -+ -+MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); -+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); -+MODULE_LICENSE("GPL"); -diff -pruN ./drivers/md.dm/dm-snap.c ./drivers/md/dm-snap.c ---- ./drivers/md.dm/dm-snap.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-snap.c 2006-03-17 13:16:38.000000000 +0300 -@@ -49,6 +49,11 @@ struct pending_exception { - struct bio_list snapshot_bios; - - /* -+ * Short-term queue of pending exceptions prior to submission. -+ */ -+ struct list_head list; -+ -+ /* - * Other pending_exceptions that are processing this - * chunk. When this list is empty, we know we can - * complete the origins. 
-@@ -371,6 +376,15 @@ static inline ulong round_up(ulong n, ul - return (n + size) & ~size; - } - -+static void read_snapshot_metadata(struct dm_snapshot *s) -+{ -+ if (s->store.read_metadata(&s->store)) { -+ down_write(&s->lock); -+ s->valid = 0; -+ up_write(&s->lock); -+ } -+} -+ - /* - * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> - */ -@@ -457,7 +471,7 @@ static int snapshot_ctr(struct dm_target - s->chunk_shift = ffs(chunk_size) - 1; - - s->valid = 1; -- s->have_metadata = 0; -+ s->active = 0; - s->last_percent = 0; - init_rwsem(&s->lock); - s->table = ti->table; -@@ -492,7 +506,11 @@ static int snapshot_ctr(struct dm_target - goto bad5; - } - -+ /* Metadata must only be loaded into one table at once */ -+ read_snapshot_metadata(s); -+ - /* Add snapshot to the list of snapshots for this origin */ -+ /* Exceptions aren't triggered till snapshot_resume() is called */ - if (register_snapshot(s)) { - r = -EINVAL; - ti->error = "Cannot register snapshot origin"; -@@ -529,8 +547,12 @@ static void snapshot_dtr(struct dm_targe - { - struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - -+ /* Prevent further origin writes from using this snapshot. */ -+ /* After this returns there can be no new kcopyd jobs. */ - unregister_snapshot(s); - -+ kcopyd_client_destroy(s->kcopyd_client); -+ - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); - -@@ -539,7 +561,7 @@ static void snapshot_dtr(struct dm_targe - - dm_put_device(ti, s->origin); - dm_put_device(ti, s->cow); -- kcopyd_client_destroy(s->kcopyd_client); -+ - kfree(s); - } - -@@ -777,7 +799,10 @@ static int snapshot_map(struct dm_target - - /* Full snapshots are not usable */ - if (!s->valid) -- return -1; -+ return -EIO; -+ -+ if (unlikely(bio_barrier(bio))) -+ return -EOPNOTSUPP; - - /* - * Write to snapshot - higher level takes care of RW/RO -@@ -848,24 +873,15 @@ static void snapshot_resume(struct dm_ta - { - struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - -- if (s->have_metadata) -- return; -- -- if (s->store.read_metadata(&s->store)) { -- down_write(&s->lock); -- s->valid = 0; -- up_write(&s->lock); -- } -- -- s->have_metadata = 1; -+ down_write(&s->lock); -+ s->active = 1; -+ up_write(&s->lock); - } - - static int snapshot_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) - { - struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; -- char cow[32]; -- char org[32]; - - switch (type) { - case STATUSTYPE_INFO: -@@ -892,9 +908,8 @@ static int snapshot_status(struct dm_tar - * to make private copies if the output is to - * make sense. 
- */ -- format_dev_t(cow, snap->cow->bdev->bd_dev); -- format_dev_t(org, snap->origin->bdev->bd_dev); -- snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, org, cow, -+ snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, -+ snap->origin->name, snap->cow->name, - snap->type, snap->chunk_size); - break; - } -@@ -924,14 +939,19 @@ static int __origin_write(struct list_he - int r = 1, first = 1; - struct dm_snapshot *snap; - struct exception *e; -- struct pending_exception *pe, *last = NULL; -+ struct pending_exception *pe, *next_pe, *last = NULL; - chunk_t chunk; -+ LIST_HEAD(pe_queue); - - /* Do all the snapshots on this origin */ - list_for_each_entry (snap, snapshots, list) { - -- /* Only deal with valid snapshots */ -- if (!snap->valid) -+ /* Only deal with valid and active snapshots */ -+ if (!snap->valid || !snap->active) -+ continue; -+ -+ /* Nothing to do if writing beyond end of snapshot */ -+ if (bio->bi_sector >= dm_table_get_size(snap->table)) - continue; - - down_write(&snap->lock); -@@ -955,12 +975,19 @@ static int __origin_write(struct list_he - snap->valid = 0; - - } else { -- if (last) -+ if (first) { -+ bio_list_add(&pe->origin_bios, bio); -+ r = 0; -+ first = 0; -+ } -+ if (last && list_empty(&pe->siblings)) - list_merge(&pe->siblings, - &last->siblings); -- -+ if (!pe->started) { -+ pe->started = 1; -+ list_add_tail(&pe->list, &pe_queue); -+ } - last = pe; -- r = 0; - } - } - -@@ -970,24 +997,8 @@ static int __origin_write(struct list_he - /* - * Now that we have a complete pe list we can start the copying. - */ -- if (last) { -- pe = last; -- do { -- down_write(&pe->snap->lock); -- if (first) -- bio_list_add(&pe->origin_bios, bio); -- if (!pe->started) { -- pe->started = 1; -- up_write(&pe->snap->lock); -- start_copy(pe); -- } else -- up_write(&pe->snap->lock); -- first = 0; -- pe = list_entry(pe->siblings.next, -- struct pending_exception, siblings); -- -- } while (pe != last); -- } -+ list_for_each_entry_safe(pe, next_pe, &pe_queue, list) -+ start_copy(pe); - - return r; - } -@@ -1051,6 +1062,9 @@ static int origin_map(struct dm_target * - struct dm_dev *dev = (struct dm_dev *) ti->private; - bio->bi_bdev = dev->bdev; - -+ if (unlikely(bio_barrier(bio))) -+ return -EOPNOTSUPP; -+ - /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? 
do_origin(dev, bio) : 1; - } -@@ -1082,7 +1096,6 @@ static int origin_status(struct dm_targe - unsigned int maxlen) - { - struct dm_dev *dev = (struct dm_dev *) ti->private; -- char buffer[32]; - - switch (type) { - case STATUSTYPE_INFO: -@@ -1090,8 +1103,7 @@ static int origin_status(struct dm_targe - break; - - case STATUSTYPE_TABLE: -- format_dev_t(buffer, dev->bdev->bd_dev); -- snprintf(result, maxlen, "%s", buffer); -+ snprintf(result, maxlen, "%s", dev->name); - break; - } - -@@ -1100,7 +1112,7 @@ static int origin_status(struct dm_targe - - static struct target_type origin_target = { - .name = "snapshot-origin", -- .version = {1, 0, 1}, -+ .version = {1, 2, 0}, - .module = THIS_MODULE, - .ctr = origin_ctr, - .dtr = origin_dtr, -@@ -1111,7 +1123,7 @@ static struct target_type origin_target - - static struct target_type snapshot_target = { - .name = "snapshot", -- .version = {1, 0, 1}, -+ .version = {1, 2, 0}, - .module = THIS_MODULE, - .ctr = snapshot_ctr, - .dtr = snapshot_dtr, -diff -pruN ./drivers/md.dm/dm-snap.h ./drivers/md/dm-snap.h ---- ./drivers/md.dm/dm-snap.h 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-snap.h 2006-03-17 13:16:38.000000000 +0300 -@@ -99,7 +99,9 @@ struct dm_snapshot { - - /* You can't use a snapshot if this is 0 (e.g. if full) */ - int valid; -- int have_metadata; -+ -+ /* Origin writes don't trigger exceptions until this is set */ -+ int active; - - /* Used for display of table */ - char type; -diff -pruN ./drivers/md.dm/dm-stripe.c ./drivers/md/dm-stripe.c ---- ./drivers/md.dm/dm-stripe.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-stripe.c 2006-03-17 13:16:38.000000000 +0300 -@@ -21,7 +21,7 @@ struct stripe_c { - uint32_t stripes; - - /* The size of this target / num. stripes */ -- uint32_t stripe_width; -+ sector_t stripe_width; - - /* stripe chunk size */ - uint32_t chunk_shift; -@@ -173,9 +173,8 @@ static int stripe_map(struct dm_target * - struct stripe_c *sc = (struct stripe_c *) ti->private; - - sector_t offset = bio->bi_sector - ti->begin; -- uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); -- uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ -- chunk = chunk / sc->stripes; -+ sector_t chunk = offset >> sc->chunk_shift; -+ uint32_t stripe = do_div(chunk, sc->stripes); - - bio->bi_bdev = sc->stripe[stripe].dev->bdev; - bio->bi_sector = sc->stripe[stripe].physical_start + -@@ -189,10 +188,6 @@ static int stripe_status(struct dm_targe - struct stripe_c *sc = (struct stripe_c *) ti->private; - unsigned int sz = 0; - unsigned int i; -- char buffer[32]; -- --#define EMIT(x...) sz += ((sz >= maxlen) ? 
\ -- 0 : scnprintf(result + sz, maxlen - sz, x)) - - switch (type) { - case STATUSTYPE_INFO: -@@ -200,12 +195,10 @@ static int stripe_status(struct dm_targe - break; - - case STATUSTYPE_TABLE: -- EMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); -- for (i = 0; i < sc->stripes; i++) { -- format_dev_t(buffer, sc->stripe[i].dev->bdev->bd_dev); -- EMIT(" %s " SECTOR_FORMAT, buffer, -- sc->stripe[i].physical_start); -- } -+ DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); -+ for (i = 0; i < sc->stripes; i++) -+ DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name, -+ sc->stripe[i].physical_start); - break; - } - return 0; -@@ -213,7 +206,7 @@ static int stripe_status(struct dm_targe - - static struct target_type stripe_target = { - .name = "striped", -- .version= {1, 0, 1}, -+ .version= {1, 0, 2}, - .module = THIS_MODULE, - .ctr = stripe_ctr, - .dtr = stripe_dtr, -diff -pruN ./drivers/md.dm/dm-table.c ./drivers/md/dm-table.c ---- ./drivers/md.dm/dm-table.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/dm-table.c 2006-03-17 13:16:38.000000000 +0300 -@@ -57,7 +57,7 @@ struct dm_table { - /* - * Similar to ceiling(log_size(n)) - */ --static unsigned int int_log(unsigned long n, unsigned long base) -+static unsigned int int_log(unsigned int n, unsigned int base) - { - int result = 0; - -@@ -454,6 +454,8 @@ static int __table_get_device(struct dm_ - return r; - } - -+ format_dev_t(dd->name, dev); -+ - atomic_set(&dd->count, 0); - list_add(&dd->list, &t->devices); - -@@ -575,7 +577,7 @@ static char **realloc_argv(unsigned *arr - /* - * Destructively splits up the argument list to pass to ctr. - */ --static int split_args(int *argc, char ***argvp, char *input) -+int dm_split_args(int *argc, char ***argvp, char *input) - { - char *start, *end = input, *out, **argv = NULL; - unsigned array_size = 0; -@@ -663,14 +665,14 @@ int dm_table_add_target(struct dm_table - - if (!len) { - tgt->error = "zero-length target"; -- DMERR(": %s\n", tgt->error); -+ DMERR("%s", tgt->error); - return -EINVAL; - } - - tgt->type = dm_get_target_type(type); - if (!tgt->type) { - tgt->error = "unknown target type"; -- DMERR(": %s\n", tgt->error); -+ DMERR("%s", tgt->error); - return -EINVAL; - } - -@@ -688,7 +690,7 @@ int dm_table_add_target(struct dm_table - goto bad; - } - -- r = split_args(&argc, &argv, params); -+ r = dm_split_args(&argc, &argv, params); - if (r) { - tgt->error = "couldn't split parameters (insufficient memory)"; - goto bad; -@@ -707,7 +709,7 @@ int dm_table_add_target(struct dm_table - return 0; - - bad: -- DMERR(": %s\n", tgt->error); -+ DMERR("%s", tgt->error); - dm_put_target_type(tgt->type); - return r; - } -@@ -825,7 +827,7 @@ void dm_table_set_restrictions(struct dm - * Make sure we obey the optimistic sub devices - * restrictions. 
- */
-- q->max_sectors = t->limits.max_sectors;
-+ blk_queue_max_sectors(q, t->limits.max_sectors);
- q->max_phys_segments = t->limits.max_phys_segments;
- q->max_hw_segments = t->limits.max_hw_segments;
- q->hardsect_size = t->limits.hardsect_size;
-@@ -848,18 +850,38 @@ int dm_table_get_mode(struct dm_table *t
- return t->mode;
- }
- 
--void dm_table_suspend_targets(struct dm_table *t)
-+static void suspend_targets(struct dm_table *t, unsigned postsuspend)
- {
-- int i;
-+ int i = t->num_targets;
-+ struct dm_target *ti = t->targets;
- 
-- for (i = 0; i < t->num_targets; i++) {
-- struct dm_target *ti = t->targets + i;
-+ while (i--) {
-+ if (postsuspend) {
-+ if (ti->type->postsuspend)
-+ ti->type->postsuspend(ti);
-+ } else if (ti->type->presuspend)
-+ ti->type->presuspend(ti);
- 
-- if (ti->type->suspend)
-- ti->type->suspend(ti);
-+ ti++;
- }
- }
- 
-+void dm_table_presuspend_targets(struct dm_table *t)
-+{
-+ if (!t)
-+ return;
-+
-+ return suspend_targets(t, 0);
-+}
-+
-+void dm_table_postsuspend_targets(struct dm_table *t)
-+{
-+ if (!t)
-+ return;
-+
-+ return suspend_targets(t, 1);
-+}
-+
- void dm_table_resume_targets(struct dm_table *t)
- {
- int i;
-@@ -900,11 +922,35 @@ void dm_table_unplug_all(struct dm_table
- }
- }
- 
-+int dm_table_flush_all(struct dm_table *t)
-+{
-+ struct list_head *d, *devices = dm_table_get_devices(t);
-+ int ret = 0;
-+
-+ for (d = devices->next; d != devices; d = d->next) {
-+ struct dm_dev *dd = list_entry(d, struct dm_dev, list);
-+ request_queue_t *q = bdev_get_queue(dd->bdev);
-+ int err;
-+
-+ if (!q->issue_flush_fn)
-+ err = -EOPNOTSUPP;
-+ else
-+ err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL);
-+
-+ if (!ret)
-+ ret = err;
-+ }
-+
-+ return ret;
-+}
-+
- EXPORT_SYMBOL(dm_vcalloc);
- EXPORT_SYMBOL(dm_get_device);
- EXPORT_SYMBOL(dm_put_device);
- EXPORT_SYMBOL(dm_table_event);
-+EXPORT_SYMBOL(dm_table_get_size);
- EXPORT_SYMBOL(dm_table_get_mode);
- EXPORT_SYMBOL(dm_table_put);
- EXPORT_SYMBOL(dm_table_get);
- EXPORT_SYMBOL(dm_table_unplug_all);
-+EXPORT_SYMBOL(dm_table_flush_all);
-diff -pruN ./drivers/md.dm/dm-target.c ./drivers/md/dm-target.c
---- ./drivers/md.dm/dm-target.c 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/dm-target.c 2006-03-17 13:16:38.000000000 +0300
-@@ -120,10 +120,9 @@ int dm_register_target(struct target_typ
- return -ENOMEM;
- 
- down_write(&_lock);
-- if (__find_target_type(t->name)) {
-- kfree(ti);
-+ if (__find_target_type(t->name))
- rv = -EEXIST;
-- } else
-+ else
- list_add(&ti->list, &_targets);
- 
- up_write(&_lock);
-diff -pruN ./drivers/md.dm/Kconfig ./drivers/md/Kconfig
---- ./drivers/md.dm/Kconfig 2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/Kconfig 2006-03-17 13:16:38.000000000 +0300
-@@ -85,6 +85,24 @@ config MD_RAID1
- 
- If unsure, say Y.
- 
-+config MD_RAID10
-+ tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
-+ depends on BLK_DEV_MD && EXPERIMENTAL
-+ ---help---
-+ RAID-10 provides a combination of striping (RAID-0) and
-+ mirroring (RAID-1) with easier configuration and more flexible
-+ layout.
-+ Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
-+ be the same size (or at least, only as much as the smallest device
-+ will be used).
-+ RAID-10 provides a variety of layouts that provide different levels
-+ of redundancy and performance.
-+ -+ RAID-10 requires mdadm-1.7.0 or later, available at: -+ -+ ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ -+ -+ - config MD_RAID5 - tristate "RAID-4/RAID-5 mode" - depends on BLK_DEV_MD -@@ -200,5 +218,17 @@ config DM_ZERO - A target that discards writes, and returns all zeroes for - reads. Useful in some recovery situations. - -+config DM_MULTIPATH -+ tristate "Multipath target (EXPERIMENTAL)" -+ depends on BLK_DEV_DM && EXPERIMENTAL -+ ---help--- -+ Allow volume managers to support multipath hardware. -+ -+config DM_MULTIPATH_EMC -+ tristate "EMC CX/AX multipath support (EXPERIMENTAL)" -+ depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL -+ ---help--- -+ Multipath support for EMC CX/AX series hardware. -+ - endmenu - -diff -pruN ./drivers/md.dm/kcopyd.c ./drivers/md/kcopyd.c ---- ./drivers/md.dm/kcopyd.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/kcopyd.c 2006-03-20 09:36:55.000000000 +0300 -@@ -43,6 +43,10 @@ struct kcopyd_client { - struct page_list *pages; - unsigned int nr_pages; - unsigned int nr_free_pages; -+#ifndef __GENKSYMS__ -+ wait_queue_head_t destroyq; -+ atomic_t nr_jobs; -+#endif - }; - - static struct page_list *alloc_pl(void) -@@ -292,10 +296,15 @@ static int run_complete_job(struct kcopy - int read_err = job->read_err; - unsigned int write_err = job->write_err; - kcopyd_notify_fn fn = job->fn; -+ struct kcopyd_client *kc = job->kc; - -- kcopyd_put_pages(job->kc, job->pages); -+ kcopyd_put_pages(kc, job->pages); - mempool_free(job, _job_pool); - fn(read_err, write_err, context); -+ -+ if (atomic_dec_and_test(&kc->nr_jobs)) -+ wake_up(&kc->destroyq); -+ - return 0; - } - -@@ -430,6 +439,7 @@ static void do_work(void *ignored) - */ - static void dispatch_job(struct kcopyd_job *job) - { -+ atomic_inc(&job->kc->nr_jobs); - push(&_pages_jobs, job); - wake(); - } -@@ -667,6 +677,9 @@ int kcopyd_client_create(unsigned int nr - return r; - } - -+ init_waitqueue_head(&kc->destroyq); -+ atomic_set(&kc->nr_jobs, 0); -+ - client_add(kc); - *result = kc; - return 0; -@@ -674,6 +687,9 @@ int kcopyd_client_create(unsigned int nr - - void kcopyd_client_destroy(struct kcopyd_client *kc) - { -+ /* Wait for completion of all jobs submitted by this client. 
*/ -+ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); -+ - dm_io_put(kc->nr_pages); - client_free_pages(kc); - client_del(kc); -diff -pruN ./drivers/md.dm/linear.c ./drivers/md/linear.c ---- ./drivers/md.dm/linear.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/linear.c 2006-03-17 13:16:38.000000000 +0300 -@@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde - return hash->dev0; - } - -- - /** - * linear_mergeable_bvec -- tell bio layer if a two requests can be merged - * @q: request queue -@@ -93,13 +92,35 @@ static void linear_unplug(request_queue_ - } - } - -+static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ linear_conf_t *conf = mddev_to_conf(mddev); -+ int i, ret = 0; -+ -+ for (i=0; i < mddev->raid_disks; i++) { -+ struct block_device *bdev = conf->disks[i].rdev->bdev; -+ request_queue_t *r_queue = bdev_get_queue(bdev); -+ -+ if (!r_queue->issue_flush_fn) { -+ ret = -EOPNOTSUPP; -+ break; -+ } -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ return ret; -+} - - static int linear_run (mddev_t *mddev) - { - linear_conf_t *conf; - struct linear_hash *table; - mdk_rdev_t *rdev; -- int size, i, nb_zone, cnt; -+ int i, nb_zone, cnt; -+ sector_t size; - unsigned int curr_offset; - struct list_head *tmp; - -@@ -137,7 +158,7 @@ static int linear_run (mddev_t *mddev) - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) -- mddev->queue->max_sectors = (PAGE_SIZE>>9); -+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - - disk->size = rdev->size; - mddev->array_size += rdev->size; -@@ -200,6 +221,7 @@ static int linear_run (mddev_t *mddev) - - blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->unplug_fn = linear_unplug; -+ mddev->queue->issue_flush_fn = linear_issue_flush; - return 0; - - out: -@@ -247,10 +269,11 @@ static int linear_make_request (request_ - char b[BDEVNAME_SIZE]; - - printk("linear_make_request: Block %llu out of bounds on " -- "dev %s size %ld offset %ld\n", -+ "dev %s size %llu offset %llu\n", - (unsigned long long)block, - bdevname(tmp_dev->rdev->bdev, b), -- tmp_dev->size, tmp_dev->offset); -+ (unsigned long long)tmp_dev->size, -+ (unsigned long long)tmp_dev->offset); - bio_io_error(bio, bio->bi_size); - return 0; - } -diff -pruN ./drivers/md.dm/Makefile ./drivers/md/Makefile ---- ./drivers/md.dm/Makefile 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/Makefile 2006-03-17 13:16:38.000000000 +0300 -@@ -4,13 +4,16 @@ - - dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ - dm-ioctl.o dm-io.o kcopyd.o -+dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o - dm-snapshot-objs := dm-snap.o dm-exception-store.o - dm-mirror-objs := dm-log.o dm-raid1.o - raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ - raid6int1.o raid6int2.o raid6int4.o \ - raid6int8.o raid6int16.o raid6int32.o \ - raid6mmx.o raid6sse1.o raid6sse2.o --host-progs := mktables -+hostprogs-y := mktables -+ -+CFLAGS_raid6int8.o += -O2 - - # Note: link order is important. 
All raid personalities - # and xor.o must come before md.o, as they each initialise -@@ -20,12 +23,15 @@ host-progs := mktables - obj-$(CONFIG_MD_LINEAR) += linear.o - obj-$(CONFIG_MD_RAID0) += raid0.o - obj-$(CONFIG_MD_RAID1) += raid1.o -+obj-$(CONFIG_MD_RAID10) += raid10.o - obj-$(CONFIG_MD_RAID5) += raid5.o xor.o - obj-$(CONFIG_MD_RAID6) += raid6.o xor.o - obj-$(CONFIG_MD_MULTIPATH) += multipath.o - obj-$(CONFIG_BLK_DEV_MD) += md.o - obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o - obj-$(CONFIG_DM_CRYPT) += dm-crypt.o -+obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o -+obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o - obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o - obj-$(CONFIG_DM_MIRROR) += dm-mirror.o - obj-$(CONFIG_DM_ZERO) += dm-zero.o -diff -pruN ./drivers/md.dm/md.c ./drivers/md/md.c ---- ./drivers/md.dm/md.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/md.c 2006-03-17 13:22:09.000000000 +0300 -@@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN - tmp = tmp->next;}) \ - ) - -+int md_flush_mddev(mddev_t *mddev, sector_t *error_sector) -+{ -+ struct list_head *tmp; -+ mdk_rdev_t *rdev; -+ int ret = 0; -+ -+ /* -+ * this list iteration is done without any locking in md?! -+ */ -+ ITERATE_RDEV(mddev, rdev, tmp) { -+ request_queue_t *r_queue = bdev_get_queue(rdev->bdev); -+ int err; -+ -+ if (!r_queue->issue_flush_fn) -+ err = -EOPNOTSUPP; -+ else -+ err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector); -+ -+ if (!ret) -+ ret = err; -+ } -+ -+ return ret; -+} -+ -+static int md_flush_all(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ -+ return md_flush_mddev(mddev, error_sector); -+} -+ - static int md_fail_request (request_queue_t *q, struct bio *bio) - { - bio_io_error(bio, bio->bi_size); -@@ -331,29 +364,24 @@ static int bi_complete(struct bio *bio, - static int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw) - { -- struct bio bio; -- struct bio_vec vec; -+ struct bio *bio = bio_alloc(GFP_NOIO, 1); - struct completion event; -+ int ret; - - rw |= (1 << BIO_RW_SYNC); - -- bio_init(&bio); -- bio.bi_io_vec = &vec; -- vec.bv_page = page; -- vec.bv_len = size; -- vec.bv_offset = 0; -- bio.bi_vcnt = 1; -- bio.bi_idx = 0; -- bio.bi_size = size; -- bio.bi_bdev = bdev; -- bio.bi_sector = sector; -+ bio->bi_bdev = bdev; -+ bio->bi_sector = sector; -+ bio_add_page(bio, page, size, 0); - init_completion(&event); -- bio.bi_private = &event; -- bio.bi_end_io = bi_complete; -- submit_bio(rw, &bio); -+ bio->bi_private = &event; -+ bio->bi_end_io = bi_complete; -+ submit_bio(rw, bio); - wait_for_completion(&event); - -- return test_bit(BIO_UPTODATE, &bio.bi_flags); -+ ret = test_bit(BIO_UPTODATE, &bio->bi_flags); -+ bio_put(bio); -+ return ret; - } - - static int read_disk_sb(mdk_rdev_t * rdev) -@@ -373,7 +401,7 @@ static int read_disk_sb(mdk_rdev_t * rde - return 0; - - fail: -- printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", -+ printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", - bdevname(rdev->bdev,b)); - return -EINVAL; - } -@@ -439,6 +467,31 @@ static unsigned int calc_sb_csum(mdp_sup - return csum; - } - -+/* csum_partial is not consistent between different architectures. -+ * Some (i386) do a 32bit csum. Some (alpha) do 16 bit. -+ * This makes it hard for user-space to know what to do. 
-+ * So we use calc_sb_csum to set the checksum to allow working -+ * with older kernels, but allow calc_sb_csum_common to -+ * be used when checking if a checksum is correct, to -+ * make life easier for user-space tools that might write -+ * a superblock. -+ */ -+static unsigned int calc_sb_csum_common(mdp_super_t *super) -+{ -+ unsigned int disk_csum = super->sb_csum; -+ unsigned long long newcsum = 0; -+ unsigned int csum; -+ int i; -+ unsigned int *superc = (int*) super; -+ super->sb_csum = 0; -+ -+ for (i=0; i<MD_SB_BYTES/4; i++) -+ newcsum+= superc[i]; -+ csum = (newcsum& 0xffffffff) + (newcsum>>32); -+ super->sb_csum = disk_csum; -+ return csum; -+} -+ - /* - * Handle superblock details. - * We want to be able to handle multiple superblock formats -@@ -521,7 +574,8 @@ static int super_90_load(mdk_rdev_t *rde - if (sb->raid_disks <= 0) - goto abort; - -- if (calc_sb_csum(sb) != sb->sb_csum) { -+ if (calc_sb_csum(sb) != sb->sb_csum && -+ calc_sb_csum_common(sb) != sb->sb_csum) { - printk(KERN_WARNING "md: invalid superblock checksum on %s\n", - b); - goto abort; -@@ -530,7 +584,7 @@ static int super_90_load(mdk_rdev_t *rde - rdev->preferred_minor = sb->md_minor; - rdev->data_offset = 0; - -- if (sb->level == MULTIPATH) -+ if (sb->level == LEVEL_MULTIPATH) - rdev->desc_nr = -1; - else - rdev->desc_nr = sb->this_disk.number; -@@ -745,11 +799,21 @@ static void super_90_sync(mddev_t *mddev - static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) - { - unsigned int disk_csum, csum; -+ unsigned long long newcsum; - int size = 256 + sb->max_dev*2; -+ unsigned int *isuper = (unsigned int*)sb; -+ int i; - - disk_csum = sb->sb_csum; - sb->sb_csum = 0; -- csum = csum_partial((void *)sb, size, 0); -+ newcsum = 0; -+ for (i=0; size>=4; size -= 4 ) -+ newcsum += le32_to_cpu(*isuper++); -+ -+ if (size == 2) -+ newcsum += le16_to_cpu(*(unsigned short*) isuper); -+ -+ csum = (newcsum & 0xffffffff) + (newcsum >> 32); - sb->sb_csum = disk_csum; - return csum; - } -@@ -924,12 +988,12 @@ static void super_1_sync(mddev_t *mddev, - - max_dev = 0; - ITERATE_RDEV(mddev,rdev2,tmp) -- if (rdev2->desc_nr > max_dev) -- max_dev = rdev2->desc_nr; -+ if (rdev2->desc_nr+1 > max_dev) -+ max_dev = rdev2->desc_nr+1; - - sb->max_dev = max_dev; - for (i=0; i<max_dev;i++) -- sb->dev_roles[max_dev] = cpu_to_le16(0xfffe); -+ sb->dev_roles[i] = cpu_to_le16(0xfffe); - - ITERATE_RDEV(mddev,rdev2,tmp) { - i = rdev2->desc_nr; -@@ -942,6 +1006,7 @@ static void super_1_sync(mddev_t *mddev, - } - - sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ -+ sb->sb_csum = calc_sb_1_csum(sb); - } - - -@@ -1042,20 +1107,24 @@ static void unbind_rdev_from_array(mdk_r - /* - * prevent the device from being mounted, repartitioned or - * otherwise reused by a RAID array (or any other kernel -- * subsystem), by opening the device. [simply getting an -- * inode is not enough, the SCSI module usage code needs -- * an explicit open() on the device] -+ * subsystem), by bd_claiming the device. 
- */ - static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) - { - int err = 0; - struct block_device *bdev; -+ char b[BDEVNAME_SIZE]; - - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); -- if (IS_ERR(bdev)) -+ if (IS_ERR(bdev)) { -+ printk(KERN_ERR "md: could not open %s.\n", -+ __bdevname(dev, b)); - return PTR_ERR(bdev); -+ } - err = bd_claim(bdev, rdev); - if (err) { -+ printk(KERN_ERR "md: could not bd_claim %s.\n", -+ bdevname(bdev, b)); - blkdev_put(bdev); - return err; - } -@@ -1117,10 +1186,7 @@ static void export_array(mddev_t *mddev) - - static void print_desc(mdp_disk_t *desc) - { -- char b[BDEVNAME_SIZE]; -- -- printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, -- __bdevname(MKDEV(desc->major, desc->minor), b), -+ printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, - desc->major,desc->minor,desc->raid_disk,desc->state); - } - -@@ -1312,8 +1378,7 @@ static mdk_rdev_t *md_import_device(dev_ - - rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); - if (!rdev) { -- printk(KERN_ERR "md: could not alloc mem for %s!\n", -- __bdevname(newdev, b)); -+ printk(KERN_ERR "md: could not alloc mem for new device!\n"); - return ERR_PTR(-ENOMEM); - } - memset(rdev, 0, sizeof(*rdev)); -@@ -1322,11 +1387,9 @@ static mdk_rdev_t *md_import_device(dev_ - goto abort_free; - - err = lock_rdev(rdev, newdev); -- if (err) { -- printk(KERN_ERR "md: could not lock %s.\n", -- __bdevname(newdev, b)); -+ if (err) - goto abort_free; -- } -+ - rdev->desc_nr = -1; - rdev->faulty = 0; - rdev->in_sync = 0; -@@ -1436,9 +1499,8 @@ static int analyze_sbs(mddev_t * mddev) - goto abort; - } - -- if ((mddev->recovery_cp != MaxSector) && -- ((mddev->level == 1) || -- ((mddev->level >= 4) && (mddev->level <= 6)))) -+ if (mddev->recovery_cp != MaxSector && -+ mddev->level >= 1) - printk(KERN_ERR "md: %s: raid array is not clean" - " -- starting background reconstruction\n", - mdname(mddev)); -@@ -1615,6 +1677,8 @@ static int do_md_run(mddev_t * mddev) - mddev->pers = pers[pnum]; - spin_unlock(&pers_lock); - -+ mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ -+ - err = mddev->pers->run(mddev); - if (err) { - printk(KERN_ERR "md: pers->run() failed ...\n"); -@@ -1645,6 +1709,7 @@ static int do_md_run(mddev_t * mddev) - */ - mddev->queue->queuedata = mddev; - mddev->queue->make_request_fn = mddev->pers->make_request; -+ mddev->queue->issue_flush_fn = md_flush_all; - - mddev->changed = 1; - return 0; -@@ -1881,11 +1946,9 @@ static int autostart_array(dev_t startde - mdk_rdev_t *start_rdev = NULL, *rdev; - - start_rdev = md_import_device(startdev, 0, 0); -- if (IS_ERR(start_rdev)) { -- printk(KERN_WARNING "md: could not import %s!\n", -- __bdevname(startdev, b)); -+ if (IS_ERR(start_rdev)) - return err; -- } -+ - - /* NOTE: this can only work for 0.90.0 superblocks */ - sb = (mdp_super_t*)page_address(start_rdev->sb_page); -@@ -1916,12 +1979,9 @@ static int autostart_array(dev_t startde - if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) - continue; - rdev = md_import_device(dev, 0, 0); -- if (IS_ERR(rdev)) { -- printk(KERN_WARNING "md: could not import %s," -- " trying to run array nevertheless.\n", -- __bdevname(dev, b)); -+ if (IS_ERR(rdev)) - continue; -- } -+ - list_add(&rdev->same_set, &pending_raid_disks); - } - -@@ -2153,42 +2213,6 @@ static int add_new_disk(mddev_t * mddev, - return 0; - } - --static int hot_generate_error(mddev_t * mddev, dev_t dev) --{ -- char b[BDEVNAME_SIZE]; -- struct request_queue *q; -- mdk_rdev_t *rdev; -- -- if (!mddev->pers) -- 
return -ENODEV; -- -- printk(KERN_INFO "md: trying to generate %s error in %s ... \n", -- __bdevname(dev, b), mdname(mddev)); -- -- rdev = find_rdev(mddev, dev); -- if (!rdev) { -- /* MD_BUG(); */ /* like hell - it's not a driver bug */ -- return -ENXIO; -- } -- -- if (rdev->desc_nr == -1) { -- MD_BUG(); -- return -EINVAL; -- } -- if (!rdev->in_sync) -- return -ENODEV; -- -- q = bdev_get_queue(rdev->bdev); -- if (!q) { -- MD_BUG(); -- return -ENODEV; -- } -- printk(KERN_INFO "md: okay, generating error!\n"); --// q->oneshot_error = 1; // disabled for now -- -- return 0; --} -- - static int hot_remove_disk(mddev_t * mddev, dev_t dev) - { - char b[BDEVNAME_SIZE]; -@@ -2197,9 +2221,6 @@ static int hot_remove_disk(mddev_t * mdd - if (!mddev->pers) - return -ENODEV; - -- printk(KERN_INFO "md: trying to remove %s from %s ... \n", -- __bdevname(dev, b), mdname(mddev)); -- - rdev = find_rdev(mddev, dev); - if (!rdev) - return -ENXIO; -@@ -2227,9 +2248,6 @@ static int hot_add_disk(mddev_t * mddev, - if (!mddev->pers) - return -ENODEV; - -- printk(KERN_INFO "md: trying to hot-add %s to %s ... \n", -- __bdevname(dev, b), mdname(mddev)); -- - if (mddev->major_version != 0) { - printk(KERN_WARNING "%s: HOT_ADD may only be used with" - " version-0 superblocks.\n", -@@ -2478,6 +2496,9 @@ static int set_disk_faulty(mddev_t *mdde - { - mdk_rdev_t *rdev; - -+ if (mddev->pers == NULL) -+ return -ENODEV; -+ - rdev = find_rdev(mddev, dev); - if (!rdev) - return -ENODEV; -@@ -2489,7 +2510,6 @@ static int set_disk_faulty(mddev_t *mdde - static int md_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) - { -- char b[BDEVNAME_SIZE]; - int err = 0; - void __user *argp = (void __user *)arg; - struct hd_geometry __user *loc = argp; -@@ -2548,8 +2568,7 @@ static int md_ioctl(struct inode *inode, - } - err = autostart_array(new_decode_dev(arg)); - if (err) { -- printk(KERN_WARNING "md: autostart %s failed!\n", -- __bdevname(arg, b)); -+ printk(KERN_WARNING "md: autostart failed!\n"); - goto abort; - } - goto done; -@@ -2690,9 +2709,7 @@ static int md_ioctl(struct inode *inode, - err = add_new_disk(mddev, &info); - goto done_unlock; - } -- case HOT_GENERATE_ERROR: -- err = hot_generate_error(mddev, new_decode_dev(arg)); -- goto done_unlock; -+ - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; -@@ -2876,7 +2893,7 @@ mdk_thread_t *md_register_thread(void (* - return thread; - } - --void md_interrupt_thread(mdk_thread_t *thread) -+static void md_interrupt_thread(mdk_thread_t *thread) - { - if (!thread->tsk) { - MD_BUG(); -@@ -2919,6 +2936,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t - if (!mddev->pers->error_handler) - return; - mddev->pers->error_handler(mddev,rdev); -+ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - } -@@ -2951,7 +2969,11 @@ static void status_resync(struct seq_fil - unsigned long max_blocks, resync, res, dt, db, rt; - - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; -- max_blocks = mddev->size; -+ -+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) -+ max_blocks = mddev->resync_max_sectors >> 1; -+ else -+ max_blocks = mddev->size; - - /* - * Should not happen. 
-@@ -3187,11 +3209,6 @@ int unregister_md_personality(int pnum)
- 	return 0;
- }
- 
--void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
--{
--	rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
--}
--
- static int is_mddev_idle(mddev_t *mddev)
- {
- 	mdk_rdev_t * rdev;
-@@ -3204,8 +3221,12 @@ static int is_mddev_idle(mddev_t *mddev)
- 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
- 		curr_events = disk_stat_read(disk, read_sectors) + 
- 			      disk_stat_read(disk, write_sectors) - 
--			      disk->sync_io;
--		if ((curr_events - rdev->last_events) > 32) {
-+			      atomic_read(&disk->sync_io);
-+		/* Allow some slack between value of curr_events and last_events,
-+		 * as there are some uninteresting races.
-+		 * Note: the following is an unsigned comparison.
-+		 */
-+		if ((curr_events - rdev->last_events + 32) > 64) {
- 			rdev->last_events = curr_events;
- 			idle = 0;
- 		}
-@@ -3339,7 +3360,14 @@ static void md_do_sync(mddev_t *mddev)
- 		}
- 	} while (mddev->curr_resync < 2);
- 
--	max_sectors = mddev->size << 1;
-+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-+		/* resync follows the size requested by the personality,
-+		 * which defaults to physical size, but can be virtual size
-+		 */
-+		max_sectors = mddev->resync_max_sectors;
-+	else
-+		/* recovery follows the physical size of devices */
-+		max_sectors = mddev->size << 1;
- 
- 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
- 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-@@ -3372,10 +3400,12 @@ static void md_do_sync(mddev_t *mddev)
- 	init_waitqueue_head(&mddev->recovery_wait);
- 	last_check = 0;
- 
--	if (j)
-+	if (j>2) {
- 		printk(KERN_INFO 
- 		       "md: resuming recovery of %s from checkpoint.\n",
- 		       mdname(mddev));
-+		mddev->curr_resync = j;
-+	}
- 
- 	while (j < max_sectors) {
- 		int sectors;
-@@ -3458,7 +3488,7 @@ static void md_do_sync(mddev_t *mddev)
- 
- 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
- 	    mddev->curr_resync > 2 &&
--	    mddev->curr_resync > mddev->recovery_cp) {
-+	    mddev->curr_resync >= mddev->recovery_cp) {
- 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- 			printk(KERN_INFO
- 			       "md: checkpointing recovery of %s.\n",
-@@ -3697,7 +3727,6 @@ void md_autodetect_dev(dev_t dev)
- 
- static void autostart_arrays(int part)
- {
--	char b[BDEVNAME_SIZE];
- 	mdk_rdev_t *rdev;
- 	int i;
- 
-@@ -3707,11 +3736,9 @@ static void autostart_arrays(int part)
- 		dev_t dev = detected_devices[i];
- 
- 		rdev = md_import_device(dev,0, 0);
--		if (IS_ERR(rdev)) {
--			printk(KERN_ALERT "md: could not import %s!\n",
--				__bdevname(dev, b));
-+		if (IS_ERR(rdev))
- 			continue;
--		}
-+
- 		if (rdev->faulty) {
- 			MD_BUG();
- 			continue;
-@@ -3762,7 +3789,6 @@ module_exit(md_exit)
- EXPORT_SYMBOL(register_md_personality);
- EXPORT_SYMBOL(unregister_md_personality);
- EXPORT_SYMBOL(md_error);
--EXPORT_SYMBOL(md_sync_acct);
- EXPORT_SYMBOL(md_done_sync);
- EXPORT_SYMBOL(md_write_start);
- EXPORT_SYMBOL(md_write_end);
-@@ -3771,6 +3797,5 @@ EXPORT_SYMBOL(md_register_thread);
- EXPORT_SYMBOL(md_unregister_thread);
- EXPORT_SYMBOL(md_wakeup_thread);
- EXPORT_SYMBOL(md_print_devices);
---EXPORT_SYMBOL(md_interrupt_thread);
- EXPORT_SYMBOL(md_check_recovery);
- MODULE_LICENSE("GPL");
-diff -pruN ./drivers/md.dm/multipath.c ./drivers/md/multipath.c
---- ./drivers/md.dm/multipath.c	2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/multipath.c	2006-03-17 13:16:38.000000000 +0300
-@@ -99,12 +99,12 @@ static void multipath_reschedule_retry (
-  * operation and are ready to return a success/failure code to the buffer
-  * cache layer. 
- */ --static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate) -+static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) - { - struct bio *bio = mp_bh->master_bio; - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); - -- bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO); -+ bio_endio(bio, bio->bi_size, err); - mempool_free(mp_bh, conf->pool); - } - -@@ -119,8 +119,8 @@ int multipath_end_request(struct bio *bi - return 1; - - if (uptodate) -- multipath_end_bh_io(mp_bh, uptodate); -- else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) { -+ multipath_end_bh_io(mp_bh, 0); -+ else if (!bio_rw_ahead(bio)) { - /* - * oops, IO error: - */ -@@ -131,7 +131,7 @@ int multipath_end_request(struct bio *bi - (unsigned long long)bio->bi_sector); - multipath_reschedule_retry(mp_bh); - } else -- multipath_end_bh_io(mp_bh, 0); -+ multipath_end_bh_io(mp_bh, error); - rdev_dec_pending(rdev, conf->mddev); - return 0; - } -@@ -155,7 +155,7 @@ static void unplug_slaves(mddev_t *mddev - r_queue->unplug_fn(r_queue); - - spin_lock_irqsave(&conf->device_lock, flags); -- atomic_dec(&rdev->nr_pending); -+ rdev_dec_pending(rdev, mddev); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -@@ -217,6 +217,31 @@ static void multipath_status (struct seq - seq_printf (seq, "]"); - } - -+static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ multipath_conf_t *conf = mddev_to_conf(mddev); -+ int i, ret = 0; -+ -+ for (i=0; i<mddev->raid_disks; i++) { -+ mdk_rdev_t *rdev = conf->multipaths[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue = bdev_get_queue(bdev); -+ -+ if (!r_queue->issue_flush_fn) { -+ ret = -EOPNOTSUPP; -+ break; -+ } -+ -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ return ret; -+} - - /* - * Careful, this can execute in IRQ contexts as well! 
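
The multipath_issue_flush() hunk above, like the raid0 version added later in this patch, simply propagates a flush to every healthy member device: if any healthy member's queue lacks an issue_flush_fn hook, the whole call fails with -EOPNOTSUPP; otherwise the first member failure ends the walk and is returned. (The raid10 variant further below is slightly more lenient and silently skips members without the hook.) As a stand-alone user-space sketch of that propagation rule, where the path type and its flush callback are invented stand-ins for rdev/request_queue rather than kernel API:

#include <errno.h>
#include <stdio.h>

/* Invented stand-in for one member device and its request queue. */
typedef struct path {
	int faulty;			/* failed members are skipped, as in the hunk above */
	int (*flush)(struct path *p);	/* may be NULL, like a missing issue_flush_fn */
} path_t;

/* Return 0 only if every healthy path flushes cleanly; -EOPNOTSUPP if a
 * healthy path cannot flush at all; otherwise the first error seen. */
static int flush_all_paths(path_t *paths, int npaths)
{
	int i, ret = 0;

	for (i = 0; i < npaths; i++) {
		if (paths[i].faulty)
			continue;
		if (!paths[i].flush)
			return -EOPNOTSUPP;
		ret = paths[i].flush(&paths[i]);
		if (ret)
			break;
	}
	return ret;
}

static int ok_flush(struct path *p) { (void)p; return 0; }

int main(void)
{
	/* the middle member is faulty and flush-less; it must be skipped */
	path_t paths[] = { { 0, ok_flush }, { 1, NULL }, { 0, ok_flush } };

	printf("flush_all_paths -> %d\n", flush_all_paths(paths, 3));
	return 0;
}

Run, this prints 0: a faulty member never vetoes the flush, which mirrors the rdev->faulty test in the kernel code.
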
-@@ -300,7 +325,7 @@ static int multipath_add_disk(mddev_t *m
- 	 */
- 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
--			mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- 
- 		conf->working_disks++;
- 		rdev->raid_disk = path;
-@@ -377,7 +402,7 @@ static void multipathd (mddev_t *mddev)
- 				" error for block %llu\n",
- 				bdevname(bio->bi_bdev,b),
- 				(unsigned long long)bio->bi_sector);
--			multipath_end_bh_io(mp_bh, 0);
-+			multipath_end_bh_io(mp_bh, -EIO);
- 		} else {
- 			printk(KERN_ERR "multipath: %s: redirecting sector %llu"
- 				" to another IO path\n",
-@@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev
- 
- 	mddev->queue->unplug_fn = multipath_unplug;
- 
-+	mddev->queue->issue_flush_fn = multipath_issue_flush;
-+
- 	conf->working_disks = 0;
- 	ITERATE_RDEV(mddev,rdev,tmp) {
- 		disk_idx = rdev->raid_disk;
-@@ -452,7 +479,7 @@ static int multipath_run (mddev_t *mddev
- 		 * a merge_bvec_fn to be involved in multipath */
- 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
--			mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- 
- 		if (!rdev->faulty)
- 			conf->working_disks++;
-diff -pruN ./drivers/md.dm/raid0.c ./drivers/md/raid0.c
---- ./drivers/md.dm/raid0.c	2006-03-17 08:57:42.000000000 +0300
-+++ ./drivers/md/raid0.c	2006-03-17 13:16:38.000000000 +0300
-@@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t
- 	}
- }
- 
-+static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
-+			     sector_t *error_sector)
-+{
-+	mddev_t *mddev = q->queuedata;
-+	raid0_conf_t *conf = mddev_to_conf(mddev);
-+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
-+	int i, ret = 0;
-+
-+	for (i=0; i<mddev->raid_disks; i++) {
-+		struct block_device *bdev = devlist[i]->bdev;
-+		request_queue_t *r_queue = bdev_get_queue(bdev);
-+
-+		if (!r_queue->issue_flush_fn) {
-+			ret = -EOPNOTSUPP;
-+			break;
-+		}
-+
-+		ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
-+		if (ret)
-+			break;
-+	}
-+	return ret;
-+}
-+
-+
- static int create_strip_zones (mddev_t *mddev)
- {
- 	int i, c, j;
-@@ -137,7 +162,7 @@ static int create_strip_zones (mddev_t *
- 
- 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
- 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
--			mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
- 
- 		if (!smallest || (rdev1->size <smallest->size))
- 			smallest = rdev1;
-@@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t *
- 
- 	mddev->queue->unplug_fn = raid0_unplug;
- 
-+	mddev->queue->issue_flush_fn = raid0_issue_flush;
-+
- 	printk("raid0: done.\n");
- 	return 0;
- abort:
-diff -pruN ./drivers/md.dm/raid10.c ./drivers/md/raid10.c
---- ./drivers/md.dm/raid10.c	1970-01-01 03:00:00.000000000 +0300
-+++ ./drivers/md/raid10.c	2006-03-17 13:16:38.000000000 +0300
-@@ -0,0 +1,1780 @@
-+/*
-+ * raid10.c : Multiple Devices driver for Linux
-+ *
-+ * Copyright (C) 2000-2004 Neil Brown
-+ *
-+ * RAID-10 support for md.
-+ *
-+ * Based on code in raid1.c. See raid1.c for further copyright information.
-+ *
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2, or (at your option)
-+ * any later version. 
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * (for example /usr/src/linux/COPYING); if not, write to the Free
-+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-+ */
-+
-+#include <linux/raid/raid10.h>
-+
-+/*
-+ * RAID10 provides a combination of RAID0 and RAID1 functionality.
-+ * The layout of data is defined by
-+ *    chunk_size
-+ *    raid_disks
-+ *    near_copies (stored in low byte of layout)
-+ *    far_copies (stored in second byte of layout)
-+ *
-+ * The data to be stored is divided into chunks using chunksize.
-+ * Each device is divided into far_copies sections.
-+ * In each section, chunks are laid out in a style similar to raid0, but
-+ * near_copies copies of each chunk are stored (each on a different drive).
-+ * The starting device for each section is offset near_copies from the starting
-+ * device of the previous section.
-+ * Thus there are (near_copies*far_copies) of each chunk, and each is on a different
-+ * drive.
-+ * near_copies and far_copies must be at least one, and their product is at most
-+ * raid_disks.
-+ */
-+
-+/*
-+ * Number of guaranteed r10bios in case of extreme VM load:
-+ */
-+#define NR_RAID10_BIOS 256
-+
-+static void unplug_slaves(mddev_t *mddev);
-+
-+static void * r10bio_pool_alloc(int gfp_flags, void *data)
-+{
-+	conf_t *conf = data;
-+	r10bio_t *r10_bio;
-+	int size = offsetof(struct r10bio_s, devs[conf->copies]);
-+
-+	/* allocate a r10bio with room for raid_disks entries in the bios array */
-+	r10_bio = kmalloc(size, gfp_flags);
-+	if (r10_bio)
-+		memset(r10_bio, 0, size);
-+	else
-+		unplug_slaves(conf->mddev);
-+
-+	return r10_bio;
-+}
-+
-+static void r10bio_pool_free(void *r10_bio, void *data)
-+{
-+	kfree(r10_bio);
-+}
-+
-+#define RESYNC_BLOCK_SIZE (64*1024)
-+//#define RESYNC_BLOCK_SIZE PAGE_SIZE
-+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-+#define RESYNC_WINDOW (2048*1024)
-+
-+/*
-+ * When performing a resync, we need to read and compare, so
-+ * we need as many pages as there are copies.
-+ * When performing a recovery, we need 2 bios, one for read,
-+ * one for write (we recover only one drive per r10buf)
-+ *
-+ */
-+static void * r10buf_pool_alloc(int gfp_flags, void *data)
-+{
-+	conf_t *conf = data;
-+	struct page *page;
-+	r10bio_t *r10_bio;
-+	struct bio *bio;
-+	int i, j;
-+	int nalloc;
-+
-+	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-+	if (!r10_bio) {
-+		unplug_slaves(conf->mddev);
-+		return NULL;
-+	}
-+
-+	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
-+		nalloc = conf->copies; /* resync */
-+	else
-+		nalloc = 2; /* recovery */
-+
-+	/*
-+	 * Allocate bios.
-+	 */
-+	for (j = nalloc ; j-- ; ) {
-+		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
-+		if (!bio)
-+			goto out_free_bio;
-+		r10_bio->devs[j].bio = bio;
-+	}
-+	/*
-+	 * Allocate RESYNC_PAGES data pages and attach them
-+	 * where needed. 
-+ */ -+ for (j = 0 ; j < nalloc; j++) { -+ bio = r10_bio->devs[j].bio; -+ for (i = 0; i < RESYNC_PAGES; i++) { -+ page = alloc_page(gfp_flags); -+ if (unlikely(!page)) -+ goto out_free_pages; -+ -+ bio->bi_io_vec[i].bv_page = page; -+ } -+ } -+ -+ return r10_bio; -+ -+out_free_pages: -+ for ( ; i > 0 ; i--) -+ __free_page(bio->bi_io_vec[i-1].bv_page); -+ while (j--) -+ for (i = 0; i < RESYNC_PAGES ; i++) -+ __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); -+ j = -1; -+out_free_bio: -+ while ( ++j < nalloc ) -+ bio_put(r10_bio->devs[j].bio); -+ r10bio_pool_free(r10_bio, conf); -+ return NULL; -+} -+ -+static void r10buf_pool_free(void *__r10_bio, void *data) -+{ -+ int i; -+ conf_t *conf = data; -+ r10bio_t *r10bio = __r10_bio; -+ int j; -+ -+ for (j=0; j < conf->copies; j++) { -+ struct bio *bio = r10bio->devs[j].bio; -+ if (bio) { -+ for (i = 0; i < RESYNC_PAGES; i++) { -+ __free_page(bio->bi_io_vec[i].bv_page); -+ bio->bi_io_vec[i].bv_page = NULL; -+ } -+ bio_put(bio); -+ } -+ } -+ r10bio_pool_free(r10bio, conf); -+} -+ -+static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) -+{ -+ int i; -+ -+ for (i = 0; i < conf->copies; i++) { -+ struct bio **bio = & r10_bio->devs[i].bio; -+ if (*bio) -+ bio_put(*bio); -+ *bio = NULL; -+ } -+} -+ -+static inline void free_r10bio(r10bio_t *r10_bio) -+{ -+ unsigned long flags; -+ -+ conf_t *conf = mddev_to_conf(r10_bio->mddev); -+ -+ /* -+ * Wake up any possible resync thread that waits for the device -+ * to go idle. -+ */ -+ spin_lock_irqsave(&conf->resync_lock, flags); -+ if (!--conf->nr_pending) { -+ wake_up(&conf->wait_idle); -+ wake_up(&conf->wait_resume); -+ } -+ spin_unlock_irqrestore(&conf->resync_lock, flags); -+ -+ put_all_bios(conf, r10_bio); -+ mempool_free(r10_bio, conf->r10bio_pool); -+} -+ -+static inline void put_buf(r10bio_t *r10_bio) -+{ -+ conf_t *conf = mddev_to_conf(r10_bio->mddev); -+ unsigned long flags; -+ -+ mempool_free(r10_bio, conf->r10buf_pool); -+ -+ spin_lock_irqsave(&conf->resync_lock, flags); -+ if (!conf->barrier) -+ BUG(); -+ --conf->barrier; -+ wake_up(&conf->wait_resume); -+ wake_up(&conf->wait_idle); -+ -+ if (!--conf->nr_pending) { -+ wake_up(&conf->wait_idle); -+ wake_up(&conf->wait_resume); -+ } -+ spin_unlock_irqrestore(&conf->resync_lock, flags); -+} -+ -+static void reschedule_retry(r10bio_t *r10_bio) -+{ -+ unsigned long flags; -+ mddev_t *mddev = r10_bio->mddev; -+ conf_t *conf = mddev_to_conf(mddev); -+ -+ spin_lock_irqsave(&conf->device_lock, flags); -+ list_add(&r10_bio->retry_list, &conf->retry_list); -+ spin_unlock_irqrestore(&conf->device_lock, flags); -+ -+ md_wakeup_thread(mddev->thread); -+} -+ -+/* -+ * raid_end_bio_io() is called when we have finished servicing a mirrored -+ * operation and are ready to return a success/failure code to the buffer -+ * cache layer. -+ */ -+static void raid_end_bio_io(r10bio_t *r10_bio) -+{ -+ struct bio *bio = r10_bio->master_bio; -+ -+ bio_endio(bio, bio->bi_size, -+ test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); -+ free_r10bio(r10_bio); -+} -+ -+/* -+ * Update disk head position estimator based on IRQ completion info. 
-+ */
-+static inline void update_head_pos(int slot, r10bio_t *r10_bio)
-+{
-+	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
-+		r10_bio->devs[slot].addr + (r10_bio->sectors);
-+}
-+
-+static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+	int slot, dev;
-+	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+	if (bio->bi_size)
-+		return 1;
-+
-+	slot = r10_bio->read_slot;
-+	dev = r10_bio->devs[slot].devnum;
-+	/*
-+	 * this branch is our 'one mirror IO has finished' event handler:
-+	 */
-+	if (!uptodate)
-+		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-+	else
-+		/*
-+		 * Set R10BIO_Uptodate in our master bio, so that
-+		 * we will return a good error code to the higher
-+		 * levels even if IO on some other mirrored buffer fails.
-+		 *
-+		 * The 'master' represents the composite IO operation to
-+		 * user-side. So if something waits for IO, then it will
-+		 * wait for the 'master' bio.
-+		 */
-+		set_bit(R10BIO_Uptodate, &r10_bio->state);
-+
-+	update_head_pos(slot, r10_bio);
-+
-+	/*
-+	 * we have only one bio on the read side
-+	 */
-+	if (uptodate)
-+		raid_end_bio_io(r10_bio);
-+	else {
-+		/*
-+		 * oops, read error:
-+		 */
-+		char b[BDEVNAME_SIZE];
-+		if (printk_ratelimit())
-+			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
-+			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
-+		reschedule_retry(r10_bio);
-+	}
-+
-+	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
-+	return 0;
-+}
-+
-+static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+	int slot, dev;
-+	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+
-+	if (bio->bi_size)
-+		return 1;
-+
-+	for (slot = 0; slot < conf->copies; slot++)
-+		if (r10_bio->devs[slot].bio == bio)
-+			break;
-+	dev = r10_bio->devs[slot].devnum;
-+
-+	/*
-+	 * this branch is our 'one mirror IO has finished' event handler:
-+	 */
-+	if (!uptodate)
-+		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-+	else
-+		/*
-+		 * Set R10BIO_Uptodate in our master bio, so that
-+		 * we will return a good error code to the higher
-+		 * levels even if IO on some other mirrored buffer fails.
-+		 *
-+		 * The 'master' represents the composite IO operation to
-+		 * user-side. So if something waits for IO, then it will
-+		 * wait for the 'master' bio.
-+		 */
-+		set_bit(R10BIO_Uptodate, &r10_bio->state);
-+
-+	update_head_pos(slot, r10_bio);
-+
-+	/*
-+	 *
-+	 * Let's see if all mirrored write operations have finished
-+	 * already.
-+	 */
-+	if (atomic_dec_and_test(&r10_bio->remaining)) {
-+		md_write_end(r10_bio->mddev);
-+		raid_end_bio_io(r10_bio);
-+	}
-+
-+	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
-+	return 0;
-+}
-+
-+
-+/*
-+ * RAID10 layout manager
-+ * As well as the chunksize and raid_disks count, there are two
-+ * parameters: near_copies and far_copies.
-+ * near_copies * far_copies must be <= raid_disks.
-+ * Normally one of these will be 1.
-+ * If both are 1, we get raid0.
-+ * If near_copies == raid_disks, we get raid1.
-+ *
-+ * Chunks are laid out in raid0 style with near_copies copies of the
-+ * first chunk, followed by near_copies copies of the next chunk and
-+ * so on. 
-+ * If far_copies > 1, then after 1/far_copies of the array has been assigned
-+ * as described above, we start again with a device offset of near_copies.
-+ * So we effectively have another copy of the whole array further down all
-+ * the drives, but with blocks on different drives.
-+ * With this layout, a block is never stored twice on any one device.
-+ *
-+ * raid10_find_phys finds the sector offset of a given virtual sector
-+ * on each device that it is on. If a block isn't on a device,
-+ * that entry in the array is set to MaxSector.
-+ *
-+ * raid10_find_virt does the reverse mapping, from a device and a
-+ * sector offset to a virtual address
-+ */
-+
-+static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
-+{
-+	int n,f;
-+	sector_t sector;
-+	sector_t chunk;
-+	sector_t stripe;
-+	int dev;
-+
-+	int slot = 0;
-+
-+	/* now calculate first sector/dev */
-+	chunk = r10bio->sector >> conf->chunk_shift;
-+	sector = r10bio->sector & conf->chunk_mask;
-+
-+	chunk *= conf->near_copies;
-+	stripe = chunk;
-+	dev = sector_div(stripe, conf->raid_disks);
-+
-+	sector += stripe << conf->chunk_shift;
-+
-+	/* and calculate all the others */
-+	for (n=0; n < conf->near_copies; n++) {
-+		int d = dev;
-+		sector_t s = sector;
-+		r10bio->devs[slot].addr = sector;
-+		r10bio->devs[slot].devnum = d;
-+		slot++;
-+
-+		for (f = 1; f < conf->far_copies; f++) {
-+			d += conf->near_copies;
-+			if (d >= conf->raid_disks)
-+				d -= conf->raid_disks;
-+			s += conf->stride;
-+			r10bio->devs[slot].devnum = d;
-+			r10bio->devs[slot].addr = s;
-+			slot++;
-+		}
-+		dev++;
-+		if (dev >= conf->raid_disks) {
-+			dev = 0;
-+			sector += (conf->chunk_mask + 1);
-+		}
-+	}
-+	BUG_ON(slot != conf->copies);
-+}
-+
-+static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
-+{
-+	sector_t offset, chunk, vchunk;
-+
-+	while (sector > conf->stride) {
-+		sector -= conf->stride;
-+		if (dev < conf->near_copies)
-+			dev += conf->raid_disks - conf->near_copies;
-+		else
-+			dev -= conf->near_copies;
-+	}
-+
-+	offset = sector & conf->chunk_mask;
-+	chunk = sector >> conf->chunk_shift;
-+	vchunk = chunk * conf->raid_disks + dev;
-+	sector_div(vchunk, conf->near_copies);
-+	return (vchunk << conf->chunk_shift) + offset;
-+}
-+
-+/**
-+ * raid10_mergeable_bvec -- tell bio layer if two requests can be merged
-+ * @q: request queue
-+ * @bio: the buffer head that's been built up so far
-+ * @biovec: the request that could be merged to it.
-+ *
-+ * Return amount of bytes we can accept at this offset
-+ * If near_copies == raid_disks, there are no striping issues,
-+ * but in that case, the function isn't called at all.
-+ */
-+static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio,
-+				struct bio_vec *bio_vec)
-+{
-+	mddev_t *mddev = q->queuedata;
-+	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
-+	int max;
-+	unsigned int chunk_sectors = mddev->chunk_size >> 9;
-+	unsigned int bio_sectors = bio->bi_size >> 9;
-+
-+	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
-+	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
-+	if (max <= bio_vec->bv_len && bio_sectors == 0)
-+		return bio_vec->bv_len;
-+	else
-+		return max;
-+}
-+
-+/*
-+ * This routine returns the disk from which the requested read should
-+ * be done. There is a per-array 'next expected sequential IO' sector
-+ * number - if this matches on the next IO then we use the last disk. 
-+ * There is also a per-disk 'last known head position' sector that is
-+ * maintained from IRQ contexts, both the normal and the resync IO
-+ * completion handlers update this position correctly. If there is no
-+ * perfect sequential match then we pick the disk whose head is closest.
-+ *
-+ * If there are 2 mirrors in the same 2 devices, performance degrades
-+ * because position is mirror, not device based.
-+ *
-+ * The rdev for the device selected will have nr_pending incremented.
-+ */
-+
-+/*
-+ * FIXME: possibly should rethink readbalancing and do it differently
-+ * depending on near_copies / far_copies geometry.
-+ */
-+static int read_balance(conf_t *conf, r10bio_t *r10_bio)
-+{
-+	const unsigned long this_sector = r10_bio->sector;
-+	int disk, slot, nslot;
-+	const int sectors = r10_bio->sectors;
-+	sector_t new_distance, current_distance;
-+
-+	raid10_find_phys(conf, r10_bio);
-+	spin_lock_irq(&conf->device_lock);
-+	/*
-+	 * Check if we can balance. We can balance on the whole
-+	 * device if no resync is going on, or below the resync window.
-+	 * We take the first readable disk when above the resync window.
-+	 */
-+	if (conf->mddev->recovery_cp < MaxSector
-+	    && (this_sector + sectors >= conf->next_resync)) {
-+		/* make sure that disk is operational */
-+		slot = 0;
-+		disk = r10_bio->devs[slot].devnum;
-+
-+		while (!conf->mirrors[disk].rdev ||
-+		       !conf->mirrors[disk].rdev->in_sync) {
-+			slot++;
-+			if (slot == conf->copies) {
-+				slot = 0;
-+				disk = -1;
-+				break;
-+			}
-+			disk = r10_bio->devs[slot].devnum;
-+		}
-+		goto rb_out;
-+	}
-+
-+
-+	/* make sure the disk is operational */
-+	slot = 0;
-+	disk = r10_bio->devs[slot].devnum;
-+	while (!conf->mirrors[disk].rdev ||
-+	       !conf->mirrors[disk].rdev->in_sync) {
-+		slot++;
-+		if (slot == conf->copies) {
-+			disk = -1;
-+			goto rb_out;
-+		}
-+		disk = r10_bio->devs[slot].devnum;
-+	}
-+
-+
-+	current_distance = abs(this_sector - conf->mirrors[disk].head_position);
-+
-+	/* Find the disk whose head is closest */
-+
-+	for (nslot = slot; nslot < conf->copies; nslot++) {
-+		int ndisk = r10_bio->devs[nslot].devnum;
-+
-+
-+		if (!conf->mirrors[ndisk].rdev ||
-+		    !conf->mirrors[ndisk].rdev->in_sync)
-+			continue;
-+
-+		if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) {
-+			disk = ndisk;
-+			slot = nslot;
-+			break;
-+		}
-+		new_distance = abs(r10_bio->devs[nslot].addr -
-+				   conf->mirrors[ndisk].head_position);
-+		if (new_distance < current_distance) {
-+			current_distance = new_distance;
-+			disk = ndisk;
-+			slot = nslot;
-+		}
-+	}
-+
-+rb_out:
-+	r10_bio->read_slot = slot;
-+/*	conf->next_seq_sect = this_sector + sectors;*/
-+
-+	if (disk >= 0 && conf->mirrors[disk].rdev)
-+		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-+	spin_unlock_irq(&conf->device_lock);
-+
-+	return disk;
-+}
-+
-+static void unplug_slaves(mddev_t *mddev)
-+{
-+	conf_t *conf = mddev_to_conf(mddev);
-+	int i;
-+	unsigned long flags;
-+
-+	spin_lock_irqsave(&conf->device_lock, flags);
-+	for (i=0; i<mddev->raid_disks; i++) {
-+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
-+		if (rdev && atomic_read(&rdev->nr_pending)) {
-+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
-+
-+			atomic_inc(&rdev->nr_pending);
-+			spin_unlock_irqrestore(&conf->device_lock, flags);
-+
-+			if (r_queue->unplug_fn)
-+				r_queue->unplug_fn(r_queue);
-+
-+			spin_lock_irqsave(&conf->device_lock, flags);
-+			rdev_dec_pending(rdev, mddev);
-+		}
-+	}
-+	spin_unlock_irqrestore(&conf->device_lock, flags);
-+}
-+static void raid10_unplug(request_queue_t *q)
-+{
-+	unplug_slaves(q->queuedata);
-+}
-+
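
The layout and read-balance comments above compress a fair amount of address arithmetic, so here is a stand-alone user-space sketch that replays raid10_find_phys()'s mapping for one virtual sector. All parameter values are invented for the example (the kernel reads near/far copies from mddev->layout and derives stride from the device size in run(); a chunk-aligned per-device size is assumed here):

#include <stdio.h>
#include <stdint.h>

#define RAID_DISKS  4
#define NEAR_COPIES 2
#define FAR_COPIES  2
#define CHUNK_SHIFT 7			/* 128-sector (64KiB) chunks */
#define CHUNK_MASK  ((1ULL << CHUNK_SHIFT) - 1)
#define DEV_SECTORS (1ULL << 20)	/* per-device size in sectors */

int main(void)
{
	uint64_t vsector = 300000;	/* arbitrary virtual sector */
	uint64_t stride = DEV_SECTORS / FAR_COPIES;
	/* first copy: the same chunk/stripe arithmetic as raid10_find_phys() */
	uint64_t chunk = (vsector >> CHUNK_SHIFT) * NEAR_COPIES;
	uint64_t offset = vsector & CHUNK_MASK;
	uint64_t stripe = chunk / RAID_DISKS;
	int dev = chunk % RAID_DISKS;
	uint64_t sector = (stripe << CHUNK_SHIFT) + offset;
	int n, f;

	for (n = 0; n < NEAR_COPIES; n++) {
		int d = dev;
		uint64_t s = sector;

		printf("copy on disk %d at sector %llu\n",
		       d, (unsigned long long)s);
		for (f = 1; f < FAR_COPIES; f++) {
			d = (d + NEAR_COPIES) % RAID_DISKS;
			s += stride;	/* same data again, further down the disks */
			printf("copy on disk %d at sector %llu\n",
			       d, (unsigned long long)s);
		}
		if (++dev >= RAID_DISKS) {
			dev = 0;
			sector += CHUNK_MASK + 1;
		}
	}
	return 0;
}

With near_copies=2 and far_copies=2 over 4 disks this prints four copies, each on a different disk, with the far pair offset by stride: exactly the "(near_copies*far_copies) of each chunk, each on a different drive" property the layout comment promises.
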
-+static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ conf_t *conf = mddev_to_conf(mddev); -+ unsigned long flags; -+ int i, ret = 0; -+ -+ spin_lock_irqsave(&conf->device_lock, flags); -+ for (i=0; i<mddev->raid_disks; i++) { -+ mdk_rdev_t *rdev = conf->mirrors[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue = bdev_get_queue(bdev); -+ -+ if (r_queue->issue_flush_fn) { -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ } -+ spin_unlock_irqrestore(&conf->device_lock, flags); -+ return ret; -+} -+ -+/* -+ * Throttle resync depth, so that we can both get proper overlapping of -+ * requests, but are still able to handle normal requests quickly. -+ */ -+#define RESYNC_DEPTH 32 -+ -+static void device_barrier(conf_t *conf, sector_t sect) -+{ -+ spin_lock_irq(&conf->resync_lock); -+ wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), -+ conf->resync_lock, unplug_slaves(conf->mddev)); -+ -+ if (!conf->barrier++) { -+ wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, -+ conf->resync_lock, unplug_slaves(conf->mddev)); -+ if (conf->nr_pending) -+ BUG(); -+ } -+ wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, -+ conf->resync_lock, unplug_slaves(conf->mddev)); -+ conf->next_resync = sect; -+ spin_unlock_irq(&conf->resync_lock); -+} -+ -+static int make_request(request_queue_t *q, struct bio * bio) -+{ -+ mddev_t *mddev = q->queuedata; -+ conf_t *conf = mddev_to_conf(mddev); -+ mirror_info_t *mirror; -+ r10bio_t *r10_bio; -+ struct bio *read_bio; -+ int i; -+ int chunk_sects = conf->chunk_mask + 1; -+ -+ /* If this request crosses a chunk boundary, we need to -+ * split it. This will only happen for 1 PAGE (or less) requests. -+ */ -+ if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) -+ > chunk_sects && -+ conf->near_copies < conf->raid_disks)) { -+ struct bio_pair *bp; -+ /* Sanity check -- queue functions should prevent this happening */ -+ if (bio->bi_vcnt != 1 || -+ bio->bi_idx != 0) -+ goto bad_map; -+ /* This is a one page bio that upper layers -+ * refuse to split for us, so we need to split it. -+ */ -+ bp = bio_split(bio, bio_split_pool, -+ chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); -+ if (make_request(q, &bp->bio1)) -+ generic_make_request(&bp->bio1); -+ if (make_request(q, &bp->bio2)) -+ generic_make_request(&bp->bio2); -+ -+ bio_pair_release(bp); -+ return 0; -+ bad_map: -+ printk("raid10_make_request bug: can't convert block across chunks" -+ " or bigger than %dk %llu %d\n", chunk_sects/2, -+ (unsigned long long)bio->bi_sector, bio->bi_size >> 10); -+ -+ bio_io_error(bio, bio->bi_size); -+ return 0; -+ } -+ -+ /* -+ * Register the new request and wait if the reconstruction -+ * thread has put up a bar for new requests. -+ * Continue immediately if no resync is active currently. 
-+ */ -+ spin_lock_irq(&conf->resync_lock); -+ wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); -+ conf->nr_pending++; -+ spin_unlock_irq(&conf->resync_lock); -+ -+ if (bio_data_dir(bio)==WRITE) { -+ disk_stat_inc(mddev->gendisk, writes); -+ disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); -+ } else { -+ disk_stat_inc(mddev->gendisk, reads); -+ disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); -+ } -+ -+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); -+ -+ r10_bio->master_bio = bio; -+ r10_bio->sectors = bio->bi_size >> 9; -+ -+ r10_bio->mddev = mddev; -+ r10_bio->sector = bio->bi_sector; -+ -+ if (bio_data_dir(bio) == READ) { -+ /* -+ * read balancing logic: -+ */ -+ int disk = read_balance(conf, r10_bio); -+ int slot = r10_bio->read_slot; -+ if (disk < 0) { -+ raid_end_bio_io(r10_bio); -+ return 0; -+ } -+ mirror = conf->mirrors + disk; -+ -+ read_bio = bio_clone(bio, GFP_NOIO); -+ -+ r10_bio->devs[slot].bio = read_bio; -+ -+ read_bio->bi_sector = r10_bio->devs[slot].addr + -+ mirror->rdev->data_offset; -+ read_bio->bi_bdev = mirror->rdev->bdev; -+ read_bio->bi_end_io = raid10_end_read_request; -+ read_bio->bi_rw = READ; -+ read_bio->bi_private = r10_bio; -+ -+ generic_make_request(read_bio); -+ return 0; -+ } -+ -+ /* -+ * WRITE: -+ */ -+ /* first select target devices under spinlock and -+ * inc refcount on their rdev. Record them by setting -+ * bios[x] to bio -+ */ -+ raid10_find_phys(conf, r10_bio); -+ spin_lock_irq(&conf->device_lock); -+ for (i = 0; i < conf->copies; i++) { -+ int d = r10_bio->devs[i].devnum; -+ if (conf->mirrors[d].rdev && -+ !conf->mirrors[d].rdev->faulty) { -+ atomic_inc(&conf->mirrors[d].rdev->nr_pending); -+ r10_bio->devs[i].bio = bio; -+ } else -+ r10_bio->devs[i].bio = NULL; -+ } -+ spin_unlock_irq(&conf->device_lock); -+ -+ atomic_set(&r10_bio->remaining, 1); -+ md_write_start(mddev); -+ for (i = 0; i < conf->copies; i++) { -+ struct bio *mbio; -+ int d = r10_bio->devs[i].devnum; -+ if (!r10_bio->devs[i].bio) -+ continue; -+ -+ mbio = bio_clone(bio, GFP_NOIO); -+ r10_bio->devs[i].bio = mbio; -+ -+ mbio->bi_sector = r10_bio->devs[i].addr+ -+ conf->mirrors[d].rdev->data_offset; -+ mbio->bi_bdev = conf->mirrors[d].rdev->bdev; -+ mbio->bi_end_io = raid10_end_write_request; -+ mbio->bi_rw = WRITE; -+ mbio->bi_private = r10_bio; -+ -+ atomic_inc(&r10_bio->remaining); -+ generic_make_request(mbio); -+ } -+ -+ if (atomic_dec_and_test(&r10_bio->remaining)) { -+ md_write_end(mddev); -+ raid_end_bio_io(r10_bio); -+ } -+ -+ return 0; -+} -+ -+static void status(struct seq_file *seq, mddev_t *mddev) -+{ -+ conf_t *conf = mddev_to_conf(mddev); -+ int i; -+ -+ if (conf->near_copies < conf->raid_disks) -+ seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); -+ if (conf->near_copies > 1) -+ seq_printf(seq, " %d near-copies", conf->near_copies); -+ if (conf->far_copies > 1) -+ seq_printf(seq, " %d far-copies", conf->far_copies); -+ -+ seq_printf(seq, " [%d/%d] [", conf->raid_disks, -+ conf->working_disks); -+ for (i = 0; i < conf->raid_disks; i++) -+ seq_printf(seq, "%s", -+ conf->mirrors[i].rdev && -+ conf->mirrors[i].rdev->in_sync ? "U" : "_"); -+ seq_printf(seq, "]"); -+} -+ -+static void error(mddev_t *mddev, mdk_rdev_t *rdev) -+{ -+ char b[BDEVNAME_SIZE]; -+ conf_t *conf = mddev_to_conf(mddev); -+ -+ /* -+ * If it is not operational, then we have already marked it as dead -+ * else if it is the last working disks, ignore the error, let the -+ * next level up know. 
-+ * else mark the drive as failed -+ */ -+ if (rdev->in_sync -+ && conf->working_disks == 1) -+ /* -+ * Don't fail the drive, just return an IO error. -+ * The test should really be more sophisticated than -+ * "working_disks == 1", but it isn't critical, and -+ * can wait until we do more sophisticated "is the drive -+ * really dead" tests... -+ */ -+ return; -+ if (rdev->in_sync) { -+ mddev->degraded++; -+ conf->working_disks--; -+ /* -+ * if recovery is running, make sure it aborts. -+ */ -+ set_bit(MD_RECOVERY_ERR, &mddev->recovery); -+ } -+ rdev->in_sync = 0; -+ rdev->faulty = 1; -+ mddev->sb_dirty = 1; -+ printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" -+ " Operation continuing on %d devices\n", -+ bdevname(rdev->bdev,b), conf->working_disks); -+} -+ -+static void print_conf(conf_t *conf) -+{ -+ int i; -+ mirror_info_t *tmp; -+ -+ printk("RAID10 conf printout:\n"); -+ if (!conf) { -+ printk("(!conf)\n"); -+ return; -+ } -+ printk(" --- wd:%d rd:%d\n", conf->working_disks, -+ conf->raid_disks); -+ -+ for (i = 0; i < conf->raid_disks; i++) { -+ char b[BDEVNAME_SIZE]; -+ tmp = conf->mirrors + i; -+ if (tmp->rdev) -+ printk(" disk %d, wo:%d, o:%d, dev:%s\n", -+ i, !tmp->rdev->in_sync, !tmp->rdev->faulty, -+ bdevname(tmp->rdev->bdev,b)); -+ } -+} -+ -+static void close_sync(conf_t *conf) -+{ -+ spin_lock_irq(&conf->resync_lock); -+ wait_event_lock_irq(conf->wait_resume, !conf->barrier, -+ conf->resync_lock, unplug_slaves(conf->mddev)); -+ spin_unlock_irq(&conf->resync_lock); -+ -+ if (conf->barrier) BUG(); -+ if (waitqueue_active(&conf->wait_idle)) BUG(); -+ -+ mempool_destroy(conf->r10buf_pool); -+ conf->r10buf_pool = NULL; -+} -+ -+static int raid10_spare_active(mddev_t *mddev) -+{ -+ int i; -+ conf_t *conf = mddev->private; -+ mirror_info_t *tmp; -+ -+ spin_lock_irq(&conf->device_lock); -+ /* -+ * Find all non-in_sync disks within the RAID10 configuration -+ * and mark them in_sync -+ */ -+ for (i = 0; i < conf->raid_disks; i++) { -+ tmp = conf->mirrors + i; -+ if (tmp->rdev -+ && !tmp->rdev->faulty -+ && !tmp->rdev->in_sync) { -+ conf->working_disks++; -+ mddev->degraded--; -+ tmp->rdev->in_sync = 1; -+ } -+ } -+ spin_unlock_irq(&conf->device_lock); -+ -+ print_conf(conf); -+ return 0; -+} -+ -+ -+static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) -+{ -+ conf_t *conf = mddev->private; -+ int found = 0; -+ int mirror; -+ mirror_info_t *p; -+ -+ if (mddev->recovery_cp < MaxSector) -+ /* only hot-add to in-sync arrays, as recovery is -+ * very different from resync -+ */ -+ return 0; -+ spin_lock_irq(&conf->device_lock); -+ for (mirror=0; mirror < mddev->raid_disks; mirror++) -+ if ( !(p=conf->mirrors+mirror)->rdev) { -+ p->rdev = rdev; -+ -+ blk_queue_stack_limits(mddev->queue, -+ rdev->bdev->bd_disk->queue); -+ /* as we don't honour merge_bvec_fn, we must never risk -+ * violating it, so limit ->max_sector to one PAGE, as -+ * a one page request is never in violation. 
-+			 */
-+			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-+			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-+				mddev->queue->max_sectors = (PAGE_SIZE>>9);
-+
-+			p->head_position = 0;
-+			rdev->raid_disk = mirror;
-+			found = 1;
-+			break;
-+		}
-+	spin_unlock_irq(&conf->device_lock);
-+
-+	print_conf(conf);
-+	return found;
-+}
-+
-+static int raid10_remove_disk(mddev_t *mddev, int number)
-+{
-+	conf_t *conf = mddev->private;
-+	int err = 1;
-+	mirror_info_t *p = conf->mirrors + number;
-+
-+	print_conf(conf);
-+	spin_lock_irq(&conf->device_lock);
-+	if (p->rdev) {
-+		if (p->rdev->in_sync ||
-+		    atomic_read(&p->rdev->nr_pending)) {
-+			err = -EBUSY;
-+			goto abort;
-+		}
-+		p->rdev = NULL;
-+		err = 0;
-+	}
-+	if (err)
-+		MD_BUG();
-+abort:
-+	spin_unlock_irq(&conf->device_lock);
-+
-+	print_conf(conf);
-+	return err;
-+}
-+
-+
-+static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-+	int i,d;
-+
-+	if (bio->bi_size)
-+		return 1;
-+
-+	for (i=0; i<conf->copies; i++)
-+		if (r10_bio->devs[i].bio == bio)
-+			break;
-+	if (i == conf->copies)
-+		BUG();
-+	update_head_pos(i, r10_bio);
-+	d = r10_bio->devs[i].devnum;
-+	if (!uptodate)
-+		md_error(r10_bio->mddev,
-+			 conf->mirrors[d].rdev);
-+
-+	/* for reconstruct, we always reschedule after a read.
-+	 * for resync, only after all reads
-+	 */
-+	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
-+	    atomic_dec_and_test(&r10_bio->remaining)) {
-+		/* we have read all the blocks,
-+		 * do the comparison in process context in raid10d
-+		 */
-+		reschedule_retry(r10_bio);
-+	}
-+	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
-+	return 0;
-+}
-+
-+static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
-+{
-+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-+	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-+	mddev_t *mddev = r10_bio->mddev;
-+	conf_t *conf = mddev_to_conf(mddev);
-+	int i,d;
-+
-+	if (bio->bi_size)
-+		return 1;
-+
-+	for (i = 0; i < conf->copies; i++)
-+		if (r10_bio->devs[i].bio == bio)
-+			break;
-+	d = r10_bio->devs[i].devnum;
-+
-+	if (!uptodate)
-+		md_error(mddev, conf->mirrors[d].rdev);
-+	update_head_pos(i, r10_bio);
-+
-+	while (atomic_dec_and_test(&r10_bio->remaining)) {
-+		if (r10_bio->master_bio == NULL) {
-+			/* the primary of several recovery bios */
-+			md_done_sync(mddev, r10_bio->sectors, 1);
-+			put_buf(r10_bio);
-+			break;
-+		} else {
-+			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
-+			put_buf(r10_bio);
-+			r10_bio = r10_bio2;
-+		}
-+	}
-+	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
-+	return 0;
-+}
-+
-+/*
-+ * Note: sync and recovery are handled very differently for raid10.
-+ * This code is for resync.
-+ * For resync, we read through virtual addresses and read all blocks.
-+ * If there is any error, we schedule a write. The lowest numbered
-+ * drive is authoritative.
-+ * However requests come for physical addresses, so we need to map.
-+ * For every physical address there are raid_disks/copies virtual addresses,
-+ * which is always at least one, but is not necessarily an integer.
-+ * This means that a physical address can span multiple chunks, so we may
-+ * have to submit multiple io requests for a single sync request. 
-+ */
-+/*
-+ * We check if all blocks are in-sync and only write to blocks that
-+ * aren't in sync
-+ */
-+static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
-+{
-+	conf_t *conf = mddev_to_conf(mddev);
-+	int i, first;
-+	struct bio *tbio, *fbio;
-+
-+	atomic_set(&r10_bio->remaining, 1);
-+
-+	/* find the first device with a block */
-+	for (i=0; i<conf->copies; i++)
-+		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-+			break;
-+
-+	if (i == conf->copies)
-+		goto done;
-+
-+	first = i;
-+	fbio = r10_bio->devs[i].bio;
-+
-+	/* now find blocks with errors */
-+	for (i=first+1 ; i < conf->copies ; i++) {
-+		int vcnt, j, d;
-+
-+		if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-+			continue;
-+		/* We know that the bi_io_vec layout is the same for
-+		 * both 'first' and 'i', so we just compare them.
-+		 * All vec entries are PAGE_SIZE;
-+		 */
-+		tbio = r10_bio->devs[i].bio;
-+		vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
-+		for (j = 0; j < vcnt; j++)
-+			if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-+				   page_address(tbio->bi_io_vec[j].bv_page),
-+				   PAGE_SIZE))
-+				break;
-+		if (j == vcnt)
-+			continue;
-+		/* Ok, we need to write this bio
-+		 * First we need to fixup bv_offset, bv_len and
-+		 * bi_vecs, as the read request might have corrupted these
-+		 */
-+		tbio->bi_vcnt = vcnt;
-+		tbio->bi_size = r10_bio->sectors << 9;
-+		tbio->bi_idx = 0;
-+		tbio->bi_phys_segments = 0;
-+		tbio->bi_hw_segments = 0;
-+		tbio->bi_hw_front_size = 0;
-+		tbio->bi_hw_back_size = 0;
-+		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-+		tbio->bi_flags |= 1 << BIO_UPTODATE;
-+		tbio->bi_next = NULL;
-+		tbio->bi_rw = WRITE;
-+		tbio->bi_private = r10_bio;
-+		tbio->bi_sector = r10_bio->devs[i].addr;
-+
-+		for (j=0; j < vcnt ; j++) {
-+			tbio->bi_io_vec[j].bv_offset = 0;
-+			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
-+
-+			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
-+			       page_address(fbio->bi_io_vec[j].bv_page),
-+			       PAGE_SIZE);
-+		}
-+		tbio->bi_end_io = end_sync_write;
-+
-+		d = r10_bio->devs[i].devnum;
-+		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+		atomic_inc(&r10_bio->remaining);
-+		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
-+
-+		generic_make_request(tbio);
-+	}
-+
-+done:
-+	if (atomic_dec_and_test(&r10_bio->remaining)) {
-+		md_done_sync(mddev, r10_bio->sectors, 1);
-+		put_buf(r10_bio);
-+	}
-+}
-+
-+/*
-+ * Now for the recovery code.
-+ * Recovery happens across physical sectors.
-+ * We recover all non-in_sync drives by finding the virtual address of
-+ * each, and then choose a working drive that also has that virt address.
-+ * There is a separate r10_bio for each non-in_sync drive.
-+ * Only the first two slots are in use. The first for reading,
-+ * the second for writing.
-+ *
-+ */
-+
-+static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
-+{
-+	conf_t *conf = mddev_to_conf(mddev);
-+	int i, d;
-+	struct bio *bio, *wbio;
-+
-+
-+	/* move the pages across to the second bio
-+	 * and submit the write request
-+	 */
-+	bio = r10_bio->devs[0].bio;
-+	wbio = r10_bio->devs[1].bio;
-+	for (i=0; i < wbio->bi_vcnt; i++) {
-+		struct page *p = bio->bi_io_vec[i].bv_page;
-+		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
-+		wbio->bi_io_vec[i].bv_page = p;
-+	}
-+	d = r10_bio->devs[1].devnum;
-+
-+	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-+	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-+	generic_make_request(wbio);
-+}
-+
-+
-+/*
-+ * This is a kernel thread which:
-+ *
-+ *	1. Retries failed read operations on working mirrors.
-+ *	2. Updates the raid superblock when problems are encountered.
-+ *	3. Performs writes following reads for array synchronising.
-+ */
-+
-+static void raid10d(mddev_t *mddev)
-+{
-+	r10bio_t *r10_bio;
-+	struct bio *bio;
-+	unsigned long flags;
-+	conf_t *conf = mddev_to_conf(mddev);
-+	struct list_head *head = &conf->retry_list;
-+	int unplug=0;
-+	mdk_rdev_t *rdev;
-+
-+	md_check_recovery(mddev);
-+	md_handle_safemode(mddev);
-+
-+	for (;;) {
-+		char b[BDEVNAME_SIZE];
-+		spin_lock_irqsave(&conf->device_lock, flags);
-+		if (list_empty(head))
-+			break;
-+		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
-+		list_del(head->prev);
-+		spin_unlock_irqrestore(&conf->device_lock, flags);
-+
-+		mddev = r10_bio->mddev;
-+		conf = mddev_to_conf(mddev);
-+		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
-+			sync_request_write(mddev, r10_bio);
-+			unplug = 1;
-+		} else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
-+			recovery_request_write(mddev, r10_bio);
-+			unplug = 1;
-+		} else {
-+			int mirror;
-+			bio = r10_bio->devs[r10_bio->read_slot].bio;
-+			r10_bio->devs[r10_bio->read_slot].bio = NULL;
-+			bio_put(bio);
-+			mirror = read_balance(conf, r10_bio);
-+			if (mirror == -1) {
-+				printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
-+				       " read error for block %llu\n",
-+				       bdevname(bio->bi_bdev,b),
-+				       (unsigned long long)r10_bio->sector);
-+				raid_end_bio_io(r10_bio);
-+			} else {
-+				rdev = conf->mirrors[mirror].rdev;
-+				if (printk_ratelimit())
-+					printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
-+					       " another mirror\n",
-+					       bdevname(rdev->bdev,b),
-+					       (unsigned long long)r10_bio->sector);
-+				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
-+				r10_bio->devs[r10_bio->read_slot].bio = bio;
-+				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
-+					+ rdev->data_offset;
-+				bio->bi_bdev = rdev->bdev;
-+				bio->bi_rw = READ;
-+				bio->bi_private = r10_bio;
-+				bio->bi_end_io = raid10_end_read_request;
-+				unplug = 1;
-+				generic_make_request(bio);
-+			}
-+		}
-+	}
-+	spin_unlock_irqrestore(&conf->device_lock, flags);
-+	if (unplug)
-+		unplug_slaves(mddev);
-+}
-+
-+
-+static int init_resync(conf_t *conf)
-+{
-+	int buffs;
-+
-+	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
-+	if (conf->r10buf_pool)
-+		BUG();
-+	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
-+	if (!conf->r10buf_pool)
-+		return -ENOMEM;
-+	conf->next_resync = 0;
-+	return 0;
-+}
-+
-+/*
-+ * perform a "sync" on one "block"
-+ *
-+ * We need to make sure that no normal I/O request - particularly write
-+ * requests - conflict with active sync requests.
-+ *
-+ * This is achieved by tracking pending requests and a 'barrier' concept
-+ * that can be installed to exclude normal IO requests.
-+ *
-+ * Resync and recovery are handled very differently.
-+ * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
-+ *
-+ * For resync, we iterate over virtual addresses, read all copies,
-+ * and update if there are differences. If only one copy is live,
-+ * skip it.
-+ * For recovery, we iterate over physical addresses, read a good
-+ * value for each non-in_sync drive, and over-write.
-+ *
-+ * So, for recovery we may have several outstanding complex requests for a
-+ * given address, one for each out-of-sync device. We model this by allocating
-+ * a number of r10_bio structures, one for each out-of-sync device. 
-+ * As we setup these structures, we collect all bio's together into a list -+ * which we then process collectively to add pages, and then process again -+ * to pass to generic_make_request. -+ * -+ * The r10_bio structures are linked using a borrowed master_bio pointer. -+ * This link is counted in ->remaining. When the r10_bio that points to NULL -+ * has its remaining count decremented to 0, the whole complex operation -+ * is complete. -+ * -+ */ -+ -+static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) -+{ -+ conf_t *conf = mddev_to_conf(mddev); -+ r10bio_t *r10_bio; -+ struct bio *biolist = NULL, *bio; -+ sector_t max_sector, nr_sectors; -+ int disk; -+ int i; -+ -+ sector_t sectors_skipped = 0; -+ int chunks_skipped = 0; -+ -+ if (!conf->r10buf_pool) -+ if (init_resync(conf)) -+ return -ENOMEM; -+ -+ skipped: -+ max_sector = mddev->size << 1; -+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) -+ max_sector = mddev->resync_max_sectors; -+ if (sector_nr >= max_sector) { -+ close_sync(conf); -+ return sectors_skipped; -+ } -+ if (chunks_skipped >= conf->raid_disks) { -+ /* if there has been nothing to do on any drive, -+ * then there is nothing to do at all.. -+ */ -+ sector_t sec = max_sector - sector_nr; -+ md_done_sync(mddev, sec, 1); -+ return sec + sectors_skipped; -+ } -+ -+ /* make sure whole request will fit in a chunk - if chunks -+ * are meaningful -+ */ -+ if (conf->near_copies < conf->raid_disks && -+ max_sector > (sector_nr | conf->chunk_mask)) -+ max_sector = (sector_nr | conf->chunk_mask) + 1; -+ /* -+ * If there is non-resync activity waiting for us then -+ * put in a delay to throttle resync. -+ */ -+ if (!go_faster && waitqueue_active(&conf->wait_resume)) -+ schedule_timeout(HZ); -+ device_barrier(conf, sector_nr + RESYNC_SECTORS); -+ -+ /* Again, very different code for resync and recovery. -+ * Both must result in an r10bio with a list of bios that -+ * have bi_end_io, bi_sector, bi_bdev set, -+ * and bi_private set to the r10bio. -+ * For recovery, we may actually create several r10bios -+ * with 2 bios in each, that correspond to the bios in the main one. -+ * In this case, the subordinate r10bios link back through a -+ * borrowed master_bio pointer, and the counter in the master -+ * includes a ref from each subordinate. -+ */ -+ /* First, we decide what to do and set ->bi_end_io -+ * To end_sync_read if we want to read, and -+ * end_sync_write if we will want to write. -+ */ -+ -+ if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { -+ /* recovery... 
the complicated one */ -+ int i, j, k; -+ r10_bio = NULL; -+ -+ for (i=0 ; i<conf->raid_disks; i++) -+ if (conf->mirrors[i].rdev && -+ !conf->mirrors[i].rdev->in_sync) { -+ /* want to reconstruct this device */ -+ r10bio_t *rb2 = r10_bio; -+ -+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); -+ spin_lock_irq(&conf->resync_lock); -+ conf->nr_pending++; -+ if (rb2) conf->barrier++; -+ spin_unlock_irq(&conf->resync_lock); -+ atomic_set(&r10_bio->remaining, 0); -+ -+ r10_bio->master_bio = (struct bio*)rb2; -+ if (rb2) -+ atomic_inc(&rb2->remaining); -+ r10_bio->mddev = mddev; -+ set_bit(R10BIO_IsRecover, &r10_bio->state); -+ r10_bio->sector = raid10_find_virt(conf, sector_nr, i); -+ raid10_find_phys(conf, r10_bio); -+ for (j=0; j<conf->copies;j++) { -+ int d = r10_bio->devs[j].devnum; -+ if (conf->mirrors[d].rdev && -+ conf->mirrors[d].rdev->in_sync) { -+ /* This is where we read from */ -+ bio = r10_bio->devs[0].bio; -+ bio->bi_next = biolist; -+ biolist = bio; -+ bio->bi_private = r10_bio; -+ bio->bi_end_io = end_sync_read; -+ bio->bi_rw = 0; -+ bio->bi_sector = r10_bio->devs[j].addr + -+ conf->mirrors[d].rdev->data_offset; -+ bio->bi_bdev = conf->mirrors[d].rdev->bdev; -+ atomic_inc(&conf->mirrors[d].rdev->nr_pending); -+ atomic_inc(&r10_bio->remaining); -+ /* and we write to 'i' */ -+ -+ for (k=0; k<conf->copies; k++) -+ if (r10_bio->devs[k].devnum == i) -+ break; -+ bio = r10_bio->devs[1].bio; -+ bio->bi_next = biolist; -+ biolist = bio; -+ bio->bi_private = r10_bio; -+ bio->bi_end_io = end_sync_write; -+ bio->bi_rw = 1; -+ bio->bi_sector = r10_bio->devs[k].addr + -+ conf->mirrors[i].rdev->data_offset; -+ bio->bi_bdev = conf->mirrors[i].rdev->bdev; -+ -+ r10_bio->devs[0].devnum = d; -+ r10_bio->devs[1].devnum = i; -+ -+ break; -+ } -+ } -+ if (j == conf->copies) { -+ BUG(); -+ } -+ } -+ if (biolist == NULL) { -+ while (r10_bio) { -+ r10bio_t *rb2 = r10_bio; -+ r10_bio = (r10bio_t*) rb2->master_bio; -+ rb2->master_bio = NULL; -+ put_buf(rb2); -+ } -+ goto giveup; -+ } -+ } else { -+ /* resync. 
Schedule a read for every block at this virt offset */ -+ int count = 0; -+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); -+ -+ spin_lock_irq(&conf->resync_lock); -+ conf->nr_pending++; -+ spin_unlock_irq(&conf->resync_lock); -+ -+ r10_bio->mddev = mddev; -+ atomic_set(&r10_bio->remaining, 0); -+ -+ r10_bio->master_bio = NULL; -+ r10_bio->sector = sector_nr; -+ set_bit(R10BIO_IsSync, &r10_bio->state); -+ raid10_find_phys(conf, r10_bio); -+ r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; -+ spin_lock_irq(&conf->device_lock); -+ for (i=0; i<conf->copies; i++) { -+ int d = r10_bio->devs[i].devnum; -+ bio = r10_bio->devs[i].bio; -+ bio->bi_end_io = NULL; -+ if (conf->mirrors[d].rdev == NULL || -+ conf->mirrors[d].rdev->faulty) -+ continue; -+ atomic_inc(&conf->mirrors[d].rdev->nr_pending); -+ atomic_inc(&r10_bio->remaining); -+ bio->bi_next = biolist; -+ biolist = bio; -+ bio->bi_private = r10_bio; -+ bio->bi_end_io = end_sync_read; -+ bio->bi_rw = 0; -+ bio->bi_sector = r10_bio->devs[i].addr + -+ conf->mirrors[d].rdev->data_offset; -+ bio->bi_bdev = conf->mirrors[d].rdev->bdev; -+ count++; -+ } -+ spin_unlock_irq(&conf->device_lock); -+ if (count < 2) { -+ for (i=0; i<conf->copies; i++) { -+ int d = r10_bio->devs[i].devnum; -+ if (r10_bio->devs[i].bio->bi_end_io) -+ rdev_dec_pending(conf->mirrors[d].rdev, mddev); -+ } -+ put_buf(r10_bio); -+ biolist = NULL; -+ goto giveup; -+ } -+ } -+ -+ for (bio = biolist; bio ; bio=bio->bi_next) { -+ -+ bio->bi_flags &= ~(BIO_POOL_MASK - 1); -+ if (bio->bi_end_io) -+ bio->bi_flags |= 1 << BIO_UPTODATE; -+ bio->bi_vcnt = 0; -+ bio->bi_idx = 0; -+ bio->bi_phys_segments = 0; -+ bio->bi_hw_segments = 0; -+ bio->bi_size = 0; -+ } -+ -+ nr_sectors = 0; -+ do { -+ struct page *page; -+ int len = PAGE_SIZE; -+ disk = 0; -+ if (sector_nr + (len>>9) > max_sector) -+ len = (max_sector - sector_nr) << 9; -+ if (len == 0) -+ break; -+ for (bio= biolist ; bio ; bio=bio->bi_next) { -+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page; -+ if (bio_add_page(bio, page, len, 0) == 0) { -+ /* stop here */ -+ struct bio *bio2; -+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page; -+ for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { -+ /* remove last page from this bio */ -+ bio2->bi_vcnt--; -+ bio2->bi_size -= len; -+ bio2->bi_flags &= ~(1<< BIO_SEG_VALID); -+ } -+ goto bio_full; -+ } -+ disk = i; -+ } -+ nr_sectors += len>>9; -+ sector_nr += len>>9; -+ } while (biolist->bi_vcnt < RESYNC_PAGES); -+ bio_full: -+ r10_bio->sectors = nr_sectors; -+ -+ while (biolist) { -+ bio = biolist; -+ biolist = biolist->bi_next; -+ -+ bio->bi_next = NULL; -+ r10_bio = bio->bi_private; -+ r10_bio->sectors = nr_sectors; -+ -+ if (bio->bi_end_io == end_sync_read) { -+ md_sync_acct(bio->bi_bdev, nr_sectors); -+ generic_make_request(bio); -+ } -+ } -+ -+ return sectors_skipped + nr_sectors; -+ giveup: -+ /* There is nowhere to write, so all non-sync -+ * drives must be failed, so try the next chunk... -+ */ -+ { -+ int sec = max_sector - sector_nr; -+ sectors_skipped += sec; -+ chunks_skipped ++; -+ sector_nr = max_sector; -+ md_done_sync(mddev, sec, 1); -+ goto skipped; -+ } -+} -+ -+static int run(mddev_t *mddev) -+{ -+ conf_t *conf; -+ int i, disk_idx; -+ mirror_info_t *disk; -+ mdk_rdev_t *rdev; -+ struct list_head *tmp; -+ int nc, fc; -+ sector_t stride, size; -+ -+ if (mddev->level != 10) { -+ printk(KERN_ERR "raid10: %s: raid level not set correctly... 
(%d)\n", -+ mdname(mddev), mddev->level); -+ goto out; -+ } -+ nc = mddev->layout & 255; -+ fc = (mddev->layout >> 8) & 255; -+ if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || -+ (mddev->layout >> 16)) { -+ printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", -+ mdname(mddev), mddev->layout); -+ goto out; -+ } -+ /* -+ * copy the already verified devices into our private RAID10 -+ * bookkeeping area. [whatever we allocate in run(), -+ * should be freed in stop()] -+ */ -+ conf = kmalloc(sizeof(conf_t), GFP_KERNEL); -+ mddev->private = conf; -+ if (!conf) { -+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -+ mdname(mddev)); -+ goto out; -+ } -+ memset(conf, 0, sizeof(*conf)); -+ conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, -+ GFP_KERNEL); -+ if (!conf->mirrors) { -+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -+ mdname(mddev)); -+ goto out_free_conf; -+ } -+ memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); -+ -+ conf->near_copies = nc; -+ conf->far_copies = fc; -+ conf->copies = nc*fc; -+ conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; -+ conf->chunk_shift = ffz(~mddev->chunk_size) - 9; -+ stride = mddev->size >> (conf->chunk_shift-1); -+ sector_div(stride, fc); -+ conf->stride = stride << conf->chunk_shift; -+ -+ conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, -+ r10bio_pool_free, conf); -+ if (!conf->r10bio_pool) { -+ printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", -+ mdname(mddev)); -+ goto out_free_conf; -+ } -+ mddev->queue->unplug_fn = raid10_unplug; -+ -+ mddev->queue->issue_flush_fn = raid10_issue_flush; -+ -+ ITERATE_RDEV(mddev, rdev, tmp) { -+ disk_idx = rdev->raid_disk; -+ if (disk_idx >= mddev->raid_disks -+ || disk_idx < 0) -+ continue; -+ disk = conf->mirrors + disk_idx; -+ -+ disk->rdev = rdev; -+ -+ blk_queue_stack_limits(mddev->queue, -+ rdev->bdev->bd_disk->queue); -+ /* as we don't honour merge_bvec_fn, we must never risk -+ * violating it, so limit ->max_sector to one PAGE, as -+ * a one page request is never in violation. 
-+ */ -+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && -+ mddev->queue->max_sectors > (PAGE_SIZE>>9)) -+ mddev->queue->max_sectors = (PAGE_SIZE>>9); -+ -+ disk->head_position = 0; -+ if (!rdev->faulty && rdev->in_sync) -+ conf->working_disks++; -+ } -+ conf->raid_disks = mddev->raid_disks; -+ conf->mddev = mddev; -+ conf->device_lock = SPIN_LOCK_UNLOCKED; -+ INIT_LIST_HEAD(&conf->retry_list); -+ -+ conf->resync_lock = SPIN_LOCK_UNLOCKED; -+ init_waitqueue_head(&conf->wait_idle); -+ init_waitqueue_head(&conf->wait_resume); -+ -+ if (!conf->working_disks) { -+ printk(KERN_ERR "raid10: no operational mirrors for %s\n", -+ mdname(mddev)); -+ goto out_free_conf; -+ } -+ -+ mddev->degraded = 0; -+ for (i = 0; i < conf->raid_disks; i++) { -+ -+ disk = conf->mirrors + i; -+ -+ if (!disk->rdev) { -+ disk->head_position = 0; -+ mddev->degraded++; -+ } -+ } -+ -+ -+ mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); -+ if (!mddev->thread) { -+ printk(KERN_ERR -+ "raid10: couldn't allocate thread for %s\n", -+ mdname(mddev)); -+ goto out_free_conf; -+ } -+ -+ printk(KERN_INFO -+ "raid10: raid set %s active with %d out of %d devices\n", -+ mdname(mddev), mddev->raid_disks - mddev->degraded, -+ mddev->raid_disks); -+ /* -+ * Ok, everything is just fine now -+ */ -+ size = conf->stride * conf->raid_disks; -+ sector_div(size, conf->near_copies); -+ mddev->array_size = size/2; -+ mddev->resync_max_sectors = size; -+ -+ /* Calculate max read-ahead size. -+ * We need to readahead at least twice a whole stripe.... -+ * maybe... -+ */ -+ { -+ int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; -+ stripe /= conf->near_copies; -+ if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) -+ mddev->queue->backing_dev_info.ra_pages = 2* stripe; -+ } -+ -+ if (conf->near_copies < mddev->raid_disks) -+ blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); -+ return 0; -+ -+out_free_conf: -+ if (conf->r10bio_pool) -+ mempool_destroy(conf->r10bio_pool); -+ if (conf->mirrors) -+ kfree(conf->mirrors); -+ kfree(conf); -+ mddev->private = NULL; -+out: -+ return -EIO; -+} -+ -+static int stop(mddev_t *mddev) -+{ -+ conf_t *conf = mddev_to_conf(mddev); -+ -+ md_unregister_thread(mddev->thread); -+ mddev->thread = NULL; -+ if (conf->r10bio_pool) -+ mempool_destroy(conf->r10bio_pool); -+ if (conf->mirrors) -+ kfree(conf->mirrors); -+ kfree(conf); -+ mddev->private = NULL; -+ return 0; -+} -+ -+ -+static mdk_personality_t raid10_personality = -+{ -+ .name = "raid10", -+ .owner = THIS_MODULE, -+ .make_request = make_request, -+ .run = run, -+ .stop = stop, -+ .status = status, -+ .error_handler = error, -+ .hot_add_disk = raid10_add_disk, -+ .hot_remove_disk= raid10_remove_disk, -+ .spare_active = raid10_spare_active, -+ .sync_request = sync_request, -+}; -+ -+static int __init raid_init(void) -+{ -+ return register_md_personality(RAID10, &raid10_personality); -+} -+ -+static void raid_exit(void) -+{ -+ unregister_md_personality(RAID10); -+} -+ -+module_init(raid_init); -+module_exit(raid_exit); -+MODULE_LICENSE("GPL"); -+MODULE_ALIAS("md-personality-9"); /* RAID10 */ -diff -pruN ./drivers/md.dm/raid1.c ./drivers/md/raid1.c ---- ./drivers/md.dm/raid1.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/raid1.c 2006-03-17 13:16:38.000000000 +0300 -@@ -24,10 +24,6 @@ - - #include <linux/raid/raid1.h> - --#define MAJOR_NR MD_MAJOR --#define MD_DRIVER --#define MD_PERSONALITY -- - /* - * Number of guaranteed r1bios in case of extreme VM load: - */ -@@ -44,13 +40,12 @@ static void * 
r1bio_pool_alloc(int gfp_f - { - struct pool_info *pi = data; - r1bio_t *r1_bio; -+ int size = offsetof(r1bio_t, bios[pi->raid_disks]); - - /* allocate a r1bio with room for raid_disks entries in the bios array */ -- r1_bio = kmalloc(sizeof(r1bio_t) + sizeof(struct bio*)*pi->raid_disks, -- gfp_flags); -+ r1_bio = kmalloc(size, gfp_flags); - if (r1_bio) -- memset(r1_bio, 0, sizeof(*r1_bio) + -- sizeof(struct bio*) * pi->raid_disks); -+ memset(r1_bio, 0, size); - else - unplug_slaves(pi->mddev); - -@@ -104,7 +99,7 @@ static void * r1buf_pool_alloc(int gfp_f - bio->bi_io_vec[i].bv_page = page; - } - -- r1_bio->master_bio = bio; -+ r1_bio->master_bio = NULL; - - return r1_bio; - -@@ -189,32 +184,6 @@ static inline void put_buf(r1bio_t *r1_b - spin_unlock_irqrestore(&conf->resync_lock, flags); - } - --static int map(mddev_t *mddev, mdk_rdev_t **rdevp) --{ -- conf_t *conf = mddev_to_conf(mddev); -- int i, disks = conf->raid_disks; -- -- /* -- * Later we do read balancing on the read side -- * now we use the first available disk. -- */ -- -- spin_lock_irq(&conf->device_lock); -- for (i = 0; i < disks; i++) { -- mdk_rdev_t *rdev = conf->mirrors[i].rdev; -- if (rdev && rdev->in_sync) { -- *rdevp = rdev; -- atomic_inc(&rdev->nr_pending); -- spin_unlock_irq(&conf->device_lock); -- return i; -- } -- } -- spin_unlock_irq(&conf->device_lock); -- -- printk(KERN_ERR "raid1_map(): huh, no more operational devices?\n"); -- return -1; --} -- - static void reschedule_retry(r1bio_t *r1_bio) - { - unsigned long flags; -@@ -292,8 +261,9 @@ static int raid1_end_read_request(struct - * oops, read error: - */ - char b[BDEVNAME_SIZE]; -- printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", -- bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); -+ if (printk_ratelimit()) -+ printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", -+ bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); - reschedule_retry(r1_bio); - } - -@@ -363,12 +333,13 @@ static int raid1_end_write_request(struc - * - * The rdev for the device selected will have nr_pending incremented. - */ --static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio) -+static int read_balance(conf_t *conf, r1bio_t *r1_bio) - { - const unsigned long this_sector = r1_bio->sector; - int new_disk = conf->last_used, disk = new_disk; -- const int sectors = bio->bi_size >> 9; -+ const int sectors = r1_bio->sectors; - sector_t new_distance, current_distance; -+ mdk_rdev_t *new_rdev, *rdev; - - spin_lock_irq(&conf->device_lock); - /* -@@ -376,16 +347,17 @@ static int read_balance(conf_t *conf, st - * device if no resync is going on, or below the resync window. - * We take the first readable disk when above the resync window. 
- */ -+ retry: - if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) { -- /* make sure that disk is operational */ -+ /* Choose the first operation device, for consistancy */ - new_disk = 0; - -- while (!conf->mirrors[new_disk].rdev || -- !conf->mirrors[new_disk].rdev->in_sync) { -+ while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || -+ !new_rdev->in_sync) { - new_disk++; - if (new_disk == conf->raid_disks) { -- new_disk = 0; -+ new_disk = -1; - break; - } - } -@@ -394,13 +366,13 @@ static int read_balance(conf_t *conf, st - - - /* make sure the disk is operational */ -- while (!conf->mirrors[new_disk].rdev || -- !conf->mirrors[new_disk].rdev->in_sync) { -+ while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || -+ !new_rdev->in_sync) { - if (new_disk <= 0) - new_disk = conf->raid_disks; - new_disk--; - if (new_disk == disk) { -- new_disk = conf->last_used; -+ new_disk = -1; - goto rb_out; - } - } -@@ -424,29 +396,38 @@ static int read_balance(conf_t *conf, st - disk = conf->raid_disks; - disk--; - -- if (!conf->mirrors[disk].rdev || -- !conf->mirrors[disk].rdev->in_sync) -+ if ((rdev=conf->mirrors[disk].rdev) == NULL || -+ !rdev->in_sync) - continue; - -- if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) { -+ if (!atomic_read(&rdev->nr_pending)) { - new_disk = disk; -+ new_rdev = rdev; - break; - } - new_distance = abs(this_sector - conf->mirrors[disk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - new_disk = disk; -+ new_rdev = rdev; - } - } while (disk != conf->last_used); - - rb_out: -- r1_bio->read_disk = new_disk; -- conf->next_seq_sect = this_sector + sectors; - -- conf->last_used = new_disk; - -- if (conf->mirrors[new_disk].rdev) -- atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending); -+ if (new_disk >= 0) { -+ conf->next_seq_sect = this_sector + sectors; -+ conf->last_used = new_disk; -+ atomic_inc(&new_rdev->nr_pending); -+ if (!new_rdev->in_sync) { -+ /* cannot risk returning a device that failed -+ * before we inc'ed nr_pending -+ */ -+ atomic_dec(&new_rdev->nr_pending); -+ goto retry; -+ } -+ } - spin_unlock_irq(&conf->device_lock); - - return new_disk; -@@ -471,7 +452,7 @@ static void unplug_slaves(mddev_t *mddev - r_queue->unplug_fn(r_queue); - - spin_lock_irqsave(&conf->device_lock, flags); -- atomic_dec(&rdev->nr_pending); -+ rdev_dec_pending(rdev, mddev); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -@@ -481,6 +462,32 @@ static void raid1_unplug(request_queue_t - unplug_slaves(q->queuedata); - } - -+static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ conf_t *conf = mddev_to_conf(mddev); -+ unsigned long flags; -+ int i, ret = 0; -+ -+ spin_lock_irqsave(&conf->device_lock, flags); -+ for (i=0; i<mddev->raid_disks; i++) { -+ mdk_rdev_t *rdev = conf->mirrors[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue = bdev_get_queue(bdev); -+ -+ if (r_queue->issue_flush_fn) { -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ } -+ spin_unlock_irqrestore(&conf->device_lock, flags); -+ return ret; -+} -+ - /* - * Throttle resync depth, so that we can both get proper overlapping of - * requests, but are still able to handle normal requests quickly. 
-@@ -513,6 +520,7 @@ static int make_request(request_queue_t - r1bio_t *r1_bio; - struct bio *read_bio; - int i, disks; -+ mdk_rdev_t *rdev; - - /* - * Register the new request and wait if the reconstruction -@@ -545,15 +553,26 @@ static int make_request(request_queue_t - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector; - -+ r1_bio->state = 0; -+ - if (bio_data_dir(bio) == READ) { - /* - * read balancing logic: - */ -- mirror = conf->mirrors + read_balance(conf, bio, r1_bio); -+ int rdisk = read_balance(conf, r1_bio); -+ -+ if (rdisk < 0) { -+ /* couldn't find anywhere to read from */ -+ raid_end_bio_io(r1_bio); -+ return 0; -+ } -+ mirror = conf->mirrors + rdisk; -+ -+ r1_bio->read_disk = rdisk; - - read_bio = bio_clone(bio, GFP_NOIO); - -- r1_bio->bios[r1_bio->read_disk] = read_bio; -+ r1_bio->bios[rdisk] = read_bio; - - read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; -@@ -575,10 +594,14 @@ static int make_request(request_queue_t - disks = conf->raid_disks; - spin_lock_irq(&conf->device_lock); - for (i = 0; i < disks; i++) { -- if (conf->mirrors[i].rdev && -- !conf->mirrors[i].rdev->faulty) { -- atomic_inc(&conf->mirrors[i].rdev->nr_pending); -- r1_bio->bios[i] = bio; -+ if ((rdev=conf->mirrors[i].rdev) != NULL && -+ !rdev->faulty) { -+ atomic_inc(&rdev->nr_pending); -+ if (rdev->faulty) { -+ atomic_dec(&rdev->nr_pending); -+ r1_bio->bios[i] = NULL; -+ } else -+ r1_bio->bios[i] = bio; - } else - r1_bio->bios[i] = NULL; - } -@@ -746,7 +769,7 @@ static int raid1_add_disk(mddev_t *mddev - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) -- mddev->queue->max_sectors = (PAGE_SIZE>>9); -+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - - p->head_position = 0; - rdev->raid_disk = mirror; -@@ -877,7 +900,7 @@ static void sync_request_write(mddev_t * - - atomic_inc(&conf->mirrors[i].rdev->nr_pending); - atomic_inc(&r1_bio->remaining); -- md_sync_acct(conf->mirrors[i].rdev, wbio->bi_size >> 9); -+ md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); - generic_make_request(wbio); - } - -@@ -925,7 +948,7 @@ static void raid1d(mddev_t *mddev) - } else { - int disk; - bio = r1_bio->bios[r1_bio->read_disk]; -- if ((disk=map(mddev, &rdev)) == -1) { -+ if ((disk=read_balance(conf, r1_bio)) == -1) { - printk(KERN_ALERT "raid1: %s: unrecoverable I/O" - " read error for block %llu\n", - bdevname(bio->bi_bdev,b), -@@ -934,14 +957,20 @@ static void raid1d(mddev_t *mddev) - } else { - r1_bio->bios[r1_bio->read_disk] = NULL; - r1_bio->read_disk = disk; -+ bio_put(bio); -+ bio = bio_clone(r1_bio->master_bio, GFP_NOIO); - r1_bio->bios[r1_bio->read_disk] = bio; -- printk(KERN_ERR "raid1: %s: redirecting sector %llu to" -- " another mirror\n", -- bdevname(rdev->bdev,b), -- (unsigned long long)r1_bio->sector); -- bio->bi_bdev = rdev->bdev; -+ rdev = conf->mirrors[disk].rdev; -+ if (printk_ratelimit()) -+ printk(KERN_ERR "raid1: %s: redirecting sector %llu to" -+ " another mirror\n", -+ bdevname(rdev->bdev,b), -+ (unsigned long long)r1_bio->sector); - bio->bi_sector = r1_bio->sector + rdev->data_offset; -+ bio->bi_bdev = rdev->bdev; -+ bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ; -+ bio->bi_private = r1_bio; - unplug = 1; - generic_make_request(bio); - } -@@ -1078,7 +1107,7 @@ static int sync_request(mddev_t *mddev, - int rv = max_sector - sector_nr; - md_done_sync(mddev, rv, 1); - put_buf(r1_bio); -- atomic_dec(&conf->mirrors[disk].rdev->nr_pending); -+ 
rdev_dec_pending(conf->mirrors[disk].rdev, mddev); - return rv; - } - -@@ -1117,7 +1146,7 @@ static int sync_request(mddev_t *mddev, - bio = r1_bio->bios[disk]; - r1_bio->sectors = nr_sectors; - -- md_sync_acct(mirror->rdev, nr_sectors); -+ md_sync_acct(mirror->rdev->bdev, nr_sectors); - - generic_make_request(bio); - -@@ -1168,6 +1197,7 @@ static int run(mddev_t *mddev) - - mddev->queue->unplug_fn = raid1_unplug; - -+ mddev->queue->issue_flush_fn = raid1_issue_flush; - - ITERATE_RDEV(mddev, rdev, tmp) { - disk_idx = rdev->raid_disk; -@@ -1186,7 +1216,7 @@ static int run(mddev_t *mddev) - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) -- mddev->queue->max_sectors = (PAGE_SIZE>>9); -+ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - - disk->head_position = 0; - if (!rdev->faulty && rdev->in_sync) -@@ -1328,7 +1358,7 @@ static int raid1_reshape(mddev_t *mddev, - if (conf->mirrors[d].rdev) - return -EBUSY; - -- newpoolinfo = kmalloc(sizeof(newpoolinfo), GFP_KERNEL); -+ newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); - if (!newpoolinfo) - return -ENOMEM; - newpoolinfo->mddev = mddev; -diff -pruN ./drivers/md.dm/raid5.c ./drivers/md/raid5.c ---- ./drivers/md.dm/raid5.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/raid5.c 2006-03-17 13:16:38.000000000 +0300 -@@ -457,6 +457,7 @@ static void raid5_build_block (struct st - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; -+ dev->req.bi_max_vecs++; - dev->vec.bv_page = dev->page; - dev->vec.bv_len = STRIPE_SIZE; - dev->vec.bv_offset = 0; -@@ -477,8 +478,8 @@ static void error(mddev_t *mddev, mdk_rd - - if (!rdev->faulty) { - mddev->sb_dirty = 1; -- conf->working_disks--; - if (rdev->in_sync) { -+ conf->working_disks--; - mddev->degraded++; - conf->failed_disks++; - rdev->in_sync = 0; -@@ -1071,7 +1072,8 @@ static void handle_stripe(struct stripe_ - PRINTK("Reading block %d (sync=%d)\n", - i, syncing); - if (syncing) -- md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS); -+ md_sync_acct(conf->disks[i].rdev->bdev, -+ STRIPE_SECTORS); - } - } - } -@@ -1256,7 +1258,7 @@ static void handle_stripe(struct stripe_ - - if (rdev) { - if (test_bit(R5_Syncio, &sh->dev[i].flags)) -- md_sync_acct(rdev, STRIPE_SECTORS); -+ md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", -@@ -1265,6 +1267,7 @@ static void handle_stripe(struct stripe_ - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; -+ bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; -@@ -1316,7 +1319,7 @@ static void unplug_slaves(mddev_t *mddev - r_queue->unplug_fn(r_queue); - - spin_lock_irqsave(&conf->device_lock, flags); -- atomic_dec(&rdev->nr_pending); -+ rdev_dec_pending(rdev, mddev); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -@@ -1328,6 +1331,8 @@ static void raid5_unplug_device(request_ - raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned long flags; - -+ if (!conf) return; -+ - spin_lock_irqsave(&conf->device_lock, flags); - - if (blk_remove_plug(q)) -@@ -1339,6 +1344,39 @@ static void raid5_unplug_device(request_ - unplug_slaves(mddev); - } - -+static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ raid5_conf_t *conf = mddev_to_conf(mddev); -+ int i, ret = 0; -+ -+ for (i=0; i<mddev->raid_disks; i++) { -+ 
mdk_rdev_t *rdev = conf->disks[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue; -+ -+ if (!bdev) -+ continue; -+ -+ r_queue = bdev_get_queue(bdev); -+ if (!r_queue) -+ continue; -+ -+ if (!r_queue->issue_flush_fn) { -+ ret = -EOPNOTSUPP; -+ break; -+ } -+ -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ return ret; -+} -+ - static inline void raid5_plug_device(raid5_conf_t *conf) - { - spin_lock_irq(&conf->device_lock); -@@ -1545,6 +1583,7 @@ static int run (mddev_t *mddev) - atomic_set(&conf->preread_active_stripes, 0); - - mddev->queue->unplug_fn = raid5_unplug_device; -+ mddev->queue->issue_flush_fn = raid5_issue_flush; - - PRINTK("raid5: run(%s) called.\n", mdname(mddev)); - -diff -pruN ./drivers/md.dm/raid6main.c ./drivers/md/raid6main.c ---- ./drivers/md.dm/raid6main.c 2006-03-17 08:57:42.000000000 +0300 -+++ ./drivers/md/raid6main.c 2006-03-17 13:16:38.000000000 +0300 -@@ -478,6 +478,7 @@ static void raid6_build_block (struct st - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; -+ dev->req.bi_max_vecs++; - dev->vec.bv_page = dev->page; - dev->vec.bv_len = STRIPE_SIZE; - dev->vec.bv_offset = 0; -@@ -498,8 +499,8 @@ static void error(mddev_t *mddev, mdk_rd - - if (!rdev->faulty) { - mddev->sb_dirty = 1; -- conf->working_disks--; - if (rdev->in_sync) { -+ conf->working_disks--; - mddev->degraded++; - conf->failed_disks++; - rdev->in_sync = 0; -@@ -1208,7 +1209,8 @@ static void handle_stripe(struct stripe_ - PRINTK("Reading block %d (sync=%d)\n", - i, syncing); - if (syncing) -- md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS); -+ md_sync_acct(conf->disks[i].rdev->bdev, -+ STRIPE_SECTORS); - } - } - } -@@ -1418,7 +1420,7 @@ static void handle_stripe(struct stripe_ - - if (rdev) { - if (test_bit(R5_Syncio, &sh->dev[i].flags)) -- md_sync_acct(rdev, STRIPE_SECTORS); -+ md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", -@@ -1427,6 +1429,7 @@ static void handle_stripe(struct stripe_ - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; -+ bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; -@@ -1478,7 +1481,7 @@ static void unplug_slaves(mddev_t *mddev - r_queue->unplug_fn(r_queue); - - spin_lock_irqsave(&conf->device_lock, flags); -- atomic_dec(&rdev->nr_pending); -+ rdev_dec_pending(rdev, mddev); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -@@ -1501,6 +1504,39 @@ static void raid6_unplug_device(request_ - unplug_slaves(mddev); - } - -+static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, -+ sector_t *error_sector) -+{ -+ mddev_t *mddev = q->queuedata; -+ raid6_conf_t *conf = mddev_to_conf(mddev); -+ int i, ret = 0; -+ -+ for (i=0; i<mddev->raid_disks; i++) { -+ mdk_rdev_t *rdev = conf->disks[i].rdev; -+ if (rdev && !rdev->faulty) { -+ struct block_device *bdev = rdev->bdev; -+ request_queue_t *r_queue; -+ -+ if (!bdev) -+ continue; -+ -+ r_queue = bdev_get_queue(bdev); -+ if (!r_queue) -+ continue; -+ -+ if (!r_queue->issue_flush_fn) { -+ ret = -EOPNOTSUPP; -+ break; -+ } -+ -+ ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); -+ if (ret) -+ break; -+ } -+ } -+ return ret; -+} -+ - static inline void raid6_plug_device(raid6_conf_t *conf) - { - spin_lock_irq(&conf->device_lock); -@@ -1708,6 +1744,7 @@ 
static int run (mddev_t *mddev) - atomic_set(&conf->preread_active_stripes, 0); - - mddev->queue->unplug_fn = raid6_unplug_device; -+ mddev->queue->issue_flush_fn = raid6_issue_flush; - - PRINTK("raid6: run(%s) called.\n", mdname(mddev)); - ---- ./include/linux/compat_ioctl.h.dm 2006-03-17 08:58:47.000000000 +0300 -+++ ./include/linux/compat_ioctl.h 2006-03-17 08:16:12.000000000 +0300 -@@ -102,6 +102,7 @@ COMPATIBLE_IOCTL(BLKROGET) - COMPATIBLE_IOCTL(BLKRRPART) - COMPATIBLE_IOCTL(BLKFLSBUF) - COMPATIBLE_IOCTL(BLKSECTSET) -+COMPATIBLE_IOCTL(BLKSECTGET) - COMPATIBLE_IOCTL(BLKSSZGET) - ULONG_IOCTL(BLKRASET) - ULONG_IOCTL(BLKFRASET) -@@ -141,6 +142,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR_32) - COMPATIBLE_IOCTL(DM_TABLE_DEPS_32) - COMPATIBLE_IOCTL(DM_TABLE_STATUS_32) - COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32) -+COMPATIBLE_IOCTL(DM_TARGET_MSG_32) - COMPATIBLE_IOCTL(DM_VERSION) - COMPATIBLE_IOCTL(DM_REMOVE_ALL) - COMPATIBLE_IOCTL(DM_LIST_DEVICES) -@@ -155,6 +157,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR) - COMPATIBLE_IOCTL(DM_TABLE_DEPS) - COMPATIBLE_IOCTL(DM_TABLE_STATUS) - COMPATIBLE_IOCTL(DM_LIST_VERSIONS) -+COMPATIBLE_IOCTL(DM_TARGET_MSG) - /* Big K */ - COMPATIBLE_IOCTL(PIO_FONT) - COMPATIBLE_IOCTL(GIO_FONT) -@@ -387,6 +390,7 @@ COMPATIBLE_IOCTL(DVD_WRITE_STRUCT) - COMPATIBLE_IOCTL(DVD_AUTH) - /* Big L */ - ULONG_IOCTL(LOOP_SET_FD) -+ULONG_IOCTL(LOOP_CHANGE_FD) - COMPATIBLE_IOCTL(LOOP_CLR_FD) - COMPATIBLE_IOCTL(LOOP_GET_STATUS64) - COMPATIBLE_IOCTL(LOOP_SET_STATUS64) -@@ -595,13 +599,15 @@ COMPATIBLE_IOCTL(ATMTCP_CREATE) - COMPATIBLE_IOCTL(ATMTCP_REMOVE) - COMPATIBLE_IOCTL(ATMMPC_CTRL) - COMPATIBLE_IOCTL(ATMMPC_DATA) --/* Big W */ --/* WIOC_GETSUPPORT not yet implemented -E */ -+/* Watchdog */ -+COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) - COMPATIBLE_IOCTL(WDIOC_GETSTATUS) - COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) - COMPATIBLE_IOCTL(WDIOC_GETTEMP) - COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) - COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) -+COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) -+COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) - /* Big R */ - COMPATIBLE_IOCTL(RNDGETENTCNT) - COMPATIBLE_IOCTL(RNDADDTOENTCNT) -@@ -735,3 +741,20 @@ COMPATIBLE_IOCTL(SIOCSIWRETRY) - COMPATIBLE_IOCTL(SIOCGIWRETRY) - COMPATIBLE_IOCTL(SIOCSIWPOWER) - COMPATIBLE_IOCTL(SIOCGIWPOWER) -+/* hiddev */ -+COMPATIBLE_IOCTL(HIDIOCGVERSION) -+COMPATIBLE_IOCTL(HIDIOCAPPLICATION) -+COMPATIBLE_IOCTL(HIDIOCGDEVINFO) -+COMPATIBLE_IOCTL(HIDIOCGSTRING) -+COMPATIBLE_IOCTL(HIDIOCINITREPORT) -+COMPATIBLE_IOCTL(HIDIOCGREPORT) -+COMPATIBLE_IOCTL(HIDIOCSREPORT) -+COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) -+COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) -+COMPATIBLE_IOCTL(HIDIOCGUSAGE) -+COMPATIBLE_IOCTL(HIDIOCSUSAGE) -+COMPATIBLE_IOCTL(HIDIOCGUCODE) -+COMPATIBLE_IOCTL(HIDIOCGFLAG) -+COMPATIBLE_IOCTL(HIDIOCSFLAG) -+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) -+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) ---- ./include/linux/device-mapper.h.dm 2006-03-17 08:58:56.000000000 +0300 -+++ ./include/linux/device-mapper.h 2006-03-17 08:16:12.000000000 +0300 -@@ -51,12 +51,15 @@ typedef int (*dm_endio_fn) (struct dm_ta - struct bio *bio, int error, - union map_info *map_context); - --typedef void (*dm_suspend_fn) (struct dm_target *ti); -+typedef void (*dm_presuspend_fn) (struct dm_target *ti); -+typedef void (*dm_postsuspend_fn) (struct dm_target *ti); - typedef void (*dm_resume_fn) (struct dm_target *ti); - - typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type, - char *result, unsigned int maxlen); - -+typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv); -+ - void 
dm_error(const char *message); - - /* -@@ -79,9 +82,11 @@ struct target_type { - dm_dtr_fn dtr; - dm_map_fn map; - dm_endio_fn end_io; -- dm_suspend_fn suspend; -+ dm_presuspend_fn presuspend; -+ dm_postsuspend_fn postsuspend; - dm_resume_fn resume; - dm_status_fn status; -+ dm_message_fn message; - }; - - struct io_restrictions { -@@ -102,6 +107,7 @@ struct dm_target { - sector_t len; - - /* FIXME: turn this into a mask, and merge with io_restrictions */ -+ /* Always a power of 2 */ - sector_t split_io; - - /* ---- ./include/linux/dm-ioctl.h.dm 2006-03-17 08:59:07.000000000 +0300 -+++ ./include/linux/dm-ioctl.h 2006-03-17 08:16:12.000000000 +0300 -@@ -1,5 +1,6 @@ - /* - * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. -+ * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. - * - * This file is released under the LGPL. - */ -@@ -76,6 +77,9 @@ - * - * DM_TABLE_STATUS: - * Return the targets status for the 'active' table. -+ * -+ * DM_TARGET_MSG: -+ * Pass a message string to the target at a specific offset of a device. - */ - - /* -@@ -179,6 +183,15 @@ struct dm_target_versions { - }; - - /* -+ * Used to pass message to a target -+ */ -+struct dm_target_msg { -+ uint64_t sector; /* Device sector */ -+ -+ char message[0]; -+}; -+ -+/* - * If you change this make sure you make the corresponding change - * to dm-ioctl.c:lookup_ioctl() - */ -@@ -204,6 +217,7 @@ enum { - - /* Added later */ - DM_LIST_VERSIONS_CMD, -+ DM_TARGET_MSG_CMD, - }; - - /* -@@ -232,6 +246,7 @@ typedef char ioctl_struct[308]; - #define DM_TABLE_DEPS_32 _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, ioctl_struct) - #define DM_TABLE_STATUS_32 _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct) - #define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct) -+#define DM_TARGET_MSG_32 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct) - #endif - - #define DM_IOCTL 0xfd -@@ -254,10 +269,12 @@ typedef char ioctl_struct[308]; - - #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) - -+#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) -+ - #define DM_VERSION_MAJOR 4 --#define DM_VERSION_MINOR 1 -+#define DM_VERSION_MINOR 5 - #define DM_VERSION_PATCHLEVEL 0 --#define DM_VERSION_EXTRA "-ioctl (2003-12-10)" -+#define DM_VERSION_EXTRA "-ioctl (2005-10-04)" - - /* Status bits */ - #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -@@ -283,4 +300,14 @@ typedef char ioctl_struct[308]; - */ - #define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ - -+/* -+ * Set this to improve performance when you aren't going to use open_count. -+ */ -+#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */ -+ -+/* -+ * Set this to avoid attempting to freeze any filesystem when suspending. 
-+ */ -+#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */ -+ - #endif /* _LINUX_DM_IOCTL_H */ ---- ./include/linux/genhd.h.dm 2006-03-20 08:42:40.000000000 +0300 -+++ ./include/linux/genhd.h 2006-03-17 13:44:40.000000000 +0300 -@@ -100,7 +100,7 @@ struct gendisk { - struct timer_rand_state *random; - int policy; - -- unsigned sync_io; /* RAID */ -+ atomic_t sync_io; /* RAID */ - unsigned long stamp, stamp_idle; - int in_flight; - #ifdef CONFIG_SMP -diff -pruN ./include/linux/raid.dm/linear.h ./include/linux/raid/linear.h ---- ./include/linux/raid.dm/linear.h 2006-03-17 13:26:03.000000000 +0300 -+++ ./include/linux/raid/linear.h 2006-03-17 13:26:59.000000000 +0300 -@@ -5,8 +5,8 @@ - - struct dev_info { - mdk_rdev_t *rdev; -- unsigned long size; -- unsigned long offset; -+ sector_t size; -+ sector_t offset; - }; - - typedef struct dev_info dev_info_t; -diff -pruN ./include/linux/raid.dm/md.h ./include/linux/raid/md.h ---- ./include/linux/raid.dm/md.h 2006-03-17 13:26:03.000000000 +0300 -+++ ./include/linux/raid/md.h 2006-03-17 13:26:59.000000000 +0300 -@@ -69,12 +69,10 @@ extern mdk_thread_t * md_register_thread - extern void md_unregister_thread (mdk_thread_t *thread); - extern void md_wakeup_thread(mdk_thread_t *thread); - extern void md_check_recovery(mddev_t *mddev); --extern void md_interrupt_thread (mdk_thread_t *thread); - extern void md_write_start(mddev_t *mddev); - extern void md_write_end(mddev_t *mddev); - extern void md_handle_safemode(mddev_t *mddev); - extern void md_done_sync(mddev_t *mddev, int blocks, int ok); --extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors); - extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev); - extern void md_unplug_mddev(mddev_t *mddev); - -diff -pruN ./include/linux/raid.dm/md_k.h ./include/linux/raid/md_k.h ---- ./include/linux/raid.dm/md_k.h 2006-03-17 13:26:03.000000000 +0300 -+++ ./include/linux/raid/md_k.h 2006-03-17 13:26:59.000000000 +0300 -@@ -24,7 +24,8 @@ - #define HSM 6UL - #define MULTIPATH 7UL - #define RAID6 8UL --#define MAX_PERSONALITY 9UL -+#define RAID10 9UL -+#define MAX_PERSONALITY 10UL - - #define LEVEL_MULTIPATH (-4) - #define LEVEL_LINEAR (-1) -@@ -43,6 +44,7 @@ static inline int pers_to_level (int per - case RAID1: return 1; - case RAID5: return 5; - case RAID6: return 6; -+ case RAID10: return 10; - } - BUG(); - return MD_RESERVED; -@@ -60,6 +62,7 @@ static inline int level_to_pers (int lev - case 4: - case 5: return RAID5; - case 6: return RAID6; -+ case 10: return RAID10; - } - return MD_RESERVED; - } -@@ -216,6 +219,7 @@ struct mddev_s - unsigned long resync_mark; /* a recent timestamp */ - sector_t resync_mark_cnt;/* blocks written at resync_mark */ - -+ sector_t resync_max_sectors; /* may be set by personality */ - /* recovery/resync flags - * NEEDED: we might need to start a resync/recover - * RUNNING: a thread is running, or about to be started -@@ -263,6 +267,11 @@ static inline void rdev_dec_pending(mdk_ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - } - -+static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -+{ -+ atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); -+} -+ - struct mdk_personality_s - { - char *name; -diff -pruN ./include/linux/raid.dm/raid10.h ./include/linux/raid/raid10.h ---- ./include/linux/raid.dm/raid10.h 1970-01-01 03:00:00.000000000 +0300 -+++ ./include/linux/raid/raid10.h 2006-03-17 13:26:59.000000000 +0300 -@@ -0,0 +1,103 @@ -+#ifndef _RAID10_H -+#define _RAID10_H -+ -+#include <linux/raid/md.h> -+ -+typedef struct 
mirror_info mirror_info_t;
-+
-+struct mirror_info {
-+	mdk_rdev_t	*rdev;
-+	sector_t	head_position;
-+};
-+
-+typedef struct r10bio_s r10bio_t;
-+
-+struct r10_private_data_s {
-+	mddev_t			*mddev;
-+	mirror_info_t		*mirrors;
-+	int			raid_disks;
-+	int			working_disks;
-+	spinlock_t		device_lock;
-+
-+	/* geometry */
-+	int			near_copies;  /* number of copies layed out raid0 style */
-+	int			far_copies;   /* number of copies layed out
-+					       * at large strides across drives
-+					       */
-+	int			copies;	      /* near_copies * far_copies.
-+					       * must be <= raid_disks
-+					       */
-+	sector_t		stride;	      /* distance between far copies.
-+					       * This is size / far_copies
-+					       */
-+
-+	int			chunk_shift;  /* shift from chunks to sectors */
-+	sector_t		chunk_mask;
-+
-+	struct list_head	retry_list;
-+	/* for use when syncing mirrors: */
-+
-+	spinlock_t		resync_lock;
-+	int			nr_pending;
-+	int			barrier;
-+	sector_t		next_resync;
-+
-+	wait_queue_head_t	wait_idle;
-+	wait_queue_head_t	wait_resume;
-+
-+	mempool_t		*r10bio_pool;
-+	mempool_t		*r10buf_pool;
-+};
-+
-+typedef struct r10_private_data_s conf_t;
-+
-+/*
-+ * this is the only point in the RAID code where we violate
-+ * C type safety. mddev->private is an 'opaque' pointer.
-+ */
-+#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
-+
-+/*
-+ * this is our 'private' RAID10 bio.
-+ *
-+ * it contains information about what kind of IO operations were started
-+ * for this RAID10 operation, and about their status:
-+ */
-+
-+struct r10bio_s {
-+	atomic_t		remaining; /* 'have we finished' count,
-+					    * used from IRQ handlers
-+					    */
-+	sector_t		sector;	/* virtual sector number */
-+	int			sectors;
-+	unsigned long		state;
-+	mddev_t			*mddev;
-+	/*
-+	 * original bio going to /dev/mdx
-+	 */
-+	struct bio		*master_bio;
-+	/*
-+	 * if the IO is in READ direction, then this is where we read
-+	 */
-+	int			read_slot;
-+
-+	struct list_head	retry_list;
-+	/*
-+	 * if the IO is in WRITE direction, then multiple bios are used,
-+	 * one for each copy.
-+	 * When resyncing we also use one for each copy.
-+	 * When reconstructing, we use 2 bios, one for read, one for write.
-+	 * We choose the number when they are allocated.
-+	 */
-+	struct {
-+		struct bio	*bio;
-+		sector_t	addr;
-+		int		devnum;
-+	} devs[0];
-+};
-+
-+/* bits for r10bio.state */
-+#define R10BIO_Uptodate	0
-+#define R10BIO_IsSync	1
-+#define R10BIO_IsRecover	2
-+#endif
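
The raid10 run() above derives the array geometry from mddev->layout (near copies in the low byte, far copies in the next byte) and from the chunk size: chunk_mask is the chunk size in sectors minus one, and chunk_shift is its log2. A minimal user-space sketch of the near-copy mapping, loosely modelled on what raid10_find_phys() computes in this patch; it assumes far_copies == 1 and raid_disks divisible by near_copies, and every name in it is hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Simplified raid10 geometry, near copies only (far_copies == 1).
 * chunk_mask/chunk_shift are derived as in run() above:
 * chunk_mask = chunk sectors - 1, chunk_shift = log2(chunk sectors). */
struct geom {
	int raid_disks;		/* assumed a multiple of near_copies here */
	int near_copies;
	int chunk_shift;
	uint64_t chunk_mask;
};

/* Map virtual sector vs to its n-th copy (0 <= n < near_copies). */
static void map_near(const struct geom *g, uint64_t vs, int n,
		     int *dev, uint64_t *dsector)
{
	uint64_t chunk = vs >> g->chunk_shift;	/* virtual chunk number */
	uint64_t off = vs & g->chunk_mask;	/* offset inside the chunk */
	uint64_t slot = chunk * g->near_copies + n; /* copies sit on adjacent disks */

	*dev = (int)(slot % g->raid_disks);
	*dsector = ((slot / g->raid_disks) << g->chunk_shift) + off;
}

int main(void)
{
	struct geom g = { 4, 2, 7, 127 };	/* 4 disks, 2 copies, 64K chunks */

	for (uint64_t vs = 0; vs < 512; vs += 128)
		for (int n = 0; n < g.near_copies; n++) {
			int dev;
			uint64_t ds;

			map_near(&g, vs, n, &dev, &ds);
			printf("vsector %3llu copy %d -> disk %d, sector %llu\n",
			       (unsigned long long)vs, n, dev,
			       (unsigned long long)ds);
		}
	return 0;
}

With raid_disks = 4 and near_copies = 2 this prints the familiar RAID1+0 picture: each chunk and its mirror occupy the same sector range on a pair of adjacent disks, and successive chunks rotate across the pairs.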
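
The resync path in sync_request() above grows every bio in biolist by one page per pass, and as soon as bio_add_page() refuses a page for any of them, it walks the earlier bios and backs the page out again so that all of them end up describing exactly the same sector range. A sketch of that fill-in-lockstep, roll-back-on-failure pattern, with the bios reduced to plain counters (all names hypothetical):

#include <stdio.h>

#define NBUF 3

struct buf { int pages, cap; };

/* Mimics bio_add_page(): returns 0 when the buffer refuses the page. */
static int add_page(struct buf *b)
{
	if (b->pages >= b->cap)
		return 0;
	b->pages++;
	return 1;
}

int main(void)
{
	/* the middle buffer is deliberately smaller, like a bio whose
	 * queue limits bite first */
	struct buf bufs[NBUF] = { { 0, 4 }, { 0, 2 }, { 0, 4 } };
	int rounds = 0;

	for (;;) {
		int i;

		for (i = 0; i < NBUF; i++) {
			if (!add_page(&bufs[i])) {
				/* back the page out of everyone that already
				 * took it this round, as the bio2 loop does */
				for (int j = 0; j < i; j++)
					bufs[j].pages--;
				goto full;
			}
		}
		rounds++;	/* a full round: every buffer grew by one page */
	}
full:
	printf("every buffer ends with %d pages\n", rounds);
	return 0;
}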
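
Several hunks above (read_balance(), the write loop in make_request()) pin a device by incrementing nr_pending and only then re-test its failure state, dropping the reference and retrying elsewhere if the device failed in the window between the first check and the increment; the "cannot risk returning a device that failed before we inc'ed nr_pending" comment states the invariant. A compressed model of that ordering, using C11 atomics rather than the kernel's primitives; the struct below is a stand-in, not the kernel's rdev:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct dev {
	atomic_int nr_pending;	/* references held by in-flight I/O */
	atomic_bool faulty;	/* flipped asynchronously on error */
};

/* Pin a device for I/O. The re-test after the increment is the point:
 * a device that failed after we looked but before we took the
 * reference must be released again, and the caller retries elsewhere. */
static bool pin_dev(struct dev *d)
{
	if (atomic_load(&d->faulty))
		return false;
	atomic_fetch_add(&d->nr_pending, 1);
	if (atomic_load(&d->faulty)) {
		atomic_fetch_sub(&d->nr_pending, 1);
		return false;
	}
	return true;
}

int main(void)
{
	struct dev d = { 0 };

	printf("healthy: pinned=%d pending=%d\n",
	       pin_dev(&d), atomic_load(&d.nr_pending));
	atomic_store(&d.faulty, true);
	printf("faulty:  pinned=%d pending=%d\n",
	       pin_dev(&d), atomic_load(&d.nr_pending));
	return 0;
}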
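
The raid1_issue_flush(), raid5_issue_flush() and raid6_issue_flush() additions above share one control flow: walk the member devices, skip missing or faulty ones, and stop at the first failure; the raid5/raid6 variants additionally fail with -EOPNOTSUPP when a member queue lacks an issue_flush_fn, while raid1 silently skips such members. The same shape with the queues reduced to function pointers (all names hypothetical):

#include <errno.h>
#include <stdio.h>

/* One hook per member device; NULL stands for a queue that has no
 * issue_flush_fn, which raid5/raid6 above turn into -EOPNOTSUPP. */
typedef int (*flush_fn)(int dev);

static int flush_ok(int dev)  { (void)dev; return 0; }
static int flush_eio(int dev) { (void)dev; return -EIO; }

static int array_issue_flush(flush_fn fns[], int n)
{
	int ret = 0;

	for (int i = 0; i < n; i++) {
		if (!fns[i]) {
			ret = -EOPNOTSUPP;
			break;
		}
		ret = fns[i](i);	/* first hard error wins */
		if (ret)
			break;
	}
	return ret;
}

int main(void)
{
	flush_fn all_good[] = { flush_ok, flush_ok };
	flush_fn one_bad[]  = { flush_ok, flush_eio };
	flush_fn no_hook[]  = { flush_ok, NULL };

	printf("%d %d %d\n",
	       array_issue_flush(all_good, 2),
	       array_issue_flush(one_bad, 2),
	       array_issue_flush(no_hook, 2));
	return 0;
}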
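
The DM_TARGET_MSG ioctl added to dm-ioctl.h above carries its text as a zero-length array (char message[0]) behind a fixed header, so a sender sizes the allocation as header plus string. A user-space sketch of building that payload; the real call additionally wraps it in struct dm_ioctl, which is not shown here, and the message text is only an example:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mirrors struct dm_target_msg above; [] is the C99 spelling of the
 * header's zero-length array. */
struct dm_target_msg {
	uint64_t sector;	/* picks the target by device offset */
	char message[];		/* NUL-terminated text follows the header */
};

static struct dm_target_msg *build_msg(uint64_t sector, const char *text)
{
	size_t len = strlen(text) + 1;		/* keep the NUL */
	struct dm_target_msg *m = malloc(sizeof(*m) + len);

	if (!m)
		return NULL;
	m->sector = sector;
	memcpy(m->message, text, len);
	return m;
}

int main(void)
{
	struct dm_target_msg *m = build_msg(0, "fail_if_no_path");

	if (!m)
		return 1;
	printf("sector %llu, message \"%s\"\n",
	       (unsigned long long)m->sector, m->message);
	free(m);
	return 0;
}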