diff -pruN ./drivers/md.dm/dm-bio-list.h ./drivers/md/dm-bio-list.h
--- ./drivers/md.dm/dm-bio-list.h	2006-03-17 08:57:42.000000000 +0300
+++ ./drivers/md/dm-bio-list.h	2006-03-17 13:16:38.000000000 +0300
@@ -33,6 +33,9 @@ static inline void bio_list_add(struct b
 
 static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
 {
+	if (!bl2->head)
+		return;
+
 	if (bl->tail)
 		bl->tail->bi_next = bl2->head;
 	else
diff -pruN ./drivers/md.dm/dm-bio-record.h ./drivers/md/dm-bio-record.h
--- ./drivers/md.dm/dm-bio-record.h	1970-01-01 03:00:00.000000000 +0300
+++ ./drivers/md/dm-bio-record.h	2006-03-17 13:16:38.000000000 +0300
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BIO_RECORD_H
+#define DM_BIO_RECORD_H
+
+#include <linux/bio.h>
+
+/*
+ * There are lots of mutable fields in the bio struct that get
+ * changed by the lower levels of the block layer. Some targets,
+ * such as multipath, may wish to resubmit a bio on error. The
+ * functions in this file help the target record and restore the
+ * original bio state.
+ */
+struct dm_bio_details {
+	sector_t bi_sector;
+	struct block_device *bi_bdev;
+	unsigned int bi_size;
+	unsigned short bi_idx;
+	unsigned long bi_flags;
+};
+
+static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
+{
+	bd->bi_sector = bio->bi_sector;
+	bd->bi_bdev = bio->bi_bdev;
+	bd->bi_size = bio->bi_size;
+	bd->bi_idx = bio->bi_idx;
+	bd->bi_flags = bio->bi_flags;
+}
+
+static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
+{
+	bio->bi_sector = bd->bi_sector;
+	bio->bi_bdev = bd->bi_bdev;
+	bio->bi_size = bd->bi_size;
+	bio->bi_idx = bd->bi_idx;
+	bio->bi_flags = bd->bi_flags;
+}
+
+#endif
diff -pruN ./drivers/md.dm/dm.c ./drivers/md/dm.c
--- ./drivers/md.dm/dm.c	2006-03-17 08:57:42.000000000 +0300
+++ ./drivers/md/dm.c	2006-03-17 13:16:38.000000000 +0300
@@ -15,15 +15,13 @@
 #include
 #include
 #include
+#include <linux/idr.h>
 
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
 static unsigned int _major = 0;
 
-static int realloc_minor_bits(unsigned long requested_minor);
-static void free_minor_bits(void);
-
 /*
  * One of these is allocated per bio.
  */
@@ -32,6 +30,7 @@ struct dm_io {
 	int error;
 	struct bio *bio;
 	atomic_t io_count;
+	unsigned long start_time;
 };
 
 /*
@@ -44,15 +43,23 @@ struct target_io {
 	union map_info info;
 };
 
+union map_info *dm_get_mapinfo(struct bio *bio)
+{
+	if (bio && bio->bi_private)
+		return &((struct target_io *)bio->bi_private)->info;
+	return NULL;
+}
+
 /*
  * Bits for the md->flags field.
  */
 #define DMF_BLOCK_IO 0
 #define DMF_SUSPENDED 1
-#define DMF_FS_LOCKED 2
+#define DMF_FROZEN 2
 
 struct mapped_device {
-	struct rw_semaphore lock;
+	struct rw_semaphore io_lock;
+	struct semaphore suspend_lock;
 	rwlock_t map_lock;
 	atomic_t holders;
 
@@ -61,6 +68,8 @@ struct mapped_device {
 	request_queue_t *queue;
 	struct gendisk *disk;
 
+	void *interface_ptr;
+
 	/*
 	 * A list of ios that arrived while we were suspended.
*/ @@ -89,6 +98,7 @@ struct mapped_device { * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; + struct block_device *suspended_bdev; }; #define MIN_IOS 256 @@ -113,19 +123,11 @@ static int __init local_init(void) return -ENOMEM; } - r = realloc_minor_bits(1024); - if (r < 0) { - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); - return r; - } - _major = major; r = register_blkdev(_major, _name); if (r < 0) { kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); - free_minor_bits(); return r; } @@ -139,7 +141,6 @@ static void local_exit(void) { kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); - free_minor_bits(); if (unregister_blkdev(_major, _name) < 0) DMERR("devfs_unregister_blkdev failed"); @@ -238,21 +239,53 @@ static inline void free_tio(struct mappe mempool_free(tio, md->tio_pool); } +static void start_io_acct(struct dm_io *io) +{ + struct mapped_device *md = io->md; + + io->start_time = jiffies; + + disk_round_stats(dm_disk(md)); + dm_disk(md)->in_flight = atomic_inc_return(&md->pending); +} + +static int end_io_acct(struct dm_io *io) +{ + struct mapped_device *md = io->md; + struct bio *bio = io->bio; + unsigned long duration = jiffies - io->start_time; + int pending; + + disk_round_stats(dm_disk(md)); + dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); + + switch (bio_data_dir(bio)) { + case WRITE: + disk_stat_add(dm_disk(md), write_ticks, duration); + break; + case READ: + disk_stat_add(dm_disk(md), read_ticks, duration); + break; + } + + return !pending; +} + /* * Add the bio to the list of deferred io. */ static int queue_io(struct mapped_device *md, struct bio *bio) { - down_write(&md->lock); + down_write(&md->io_lock); if (!test_bit(DMF_BLOCK_IO, &md->flags)) { - up_write(&md->lock); + up_write(&md->io_lock); return 1; } bio_list_add(&md->deferred, bio); - up_write(&md->lock); + up_write(&md->io_lock); return 0; /* deferred successfully */ } @@ -293,7 +326,7 @@ static inline void dec_pending(struct dm io->error = error; if (atomic_dec_and_test(&io->io_count)) { - if (atomic_dec_and_test(&io->md->pending)) + if (end_io_acct(io)) /* nudge anyone waiting on suspend queue */ wake_up(&io->md->wait); @@ -342,8 +375,8 @@ static sector_t max_io_len(struct mapped */ if (ti->split_io) { sector_t boundary; - boundary = dm_round_up(offset + 1, ti->split_io) - offset; - + boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) + - offset; if (len > boundary) len = boundary; } @@ -379,7 +412,7 @@ static void __map_bio(struct dm_target * /* error the io and bail out */ struct dm_io *io = tio->io; free_tio(tio->io->md, tio); - dec_pending(io, -EIO); + dec_pending(io, r); bio_put(clone); } } @@ -542,7 +575,7 @@ static void __split_bio(struct mapped_de ci.sector_count = bio_sectors(bio); ci.idx = bio->bi_idx; - atomic_inc(&md->pending); + start_io_acct(ci.io); while (ci.sector_count) __clone_and_map(&ci); @@ -563,14 +596,22 @@ static int dm_request(request_queue_t *q int r; struct mapped_device *md = q->queuedata; - down_read(&md->lock); + down_read(&md->io_lock); + + if (bio_data_dir(bio) == WRITE) { + disk_stat_inc(dm_disk(md), writes); + disk_stat_add(dm_disk(md), write_sectors, bio_sectors(bio)); + } else { + disk_stat_inc(dm_disk(md), reads); + disk_stat_add(dm_disk(md), read_sectors, bio_sectors(bio)); + } /* * If we're suspended we have to queue * this io for later. 
*/ while (test_bit(DMF_BLOCK_IO, &md->flags)) { - up_read(&md->lock); + up_read(&md->io_lock); if (bio_rw(bio) == READA) { bio_io_error(bio, bio->bi_size); @@ -589,14 +630,29 @@ static int dm_request(request_queue_t *q * We're in a while loop, because someone could suspend * before we get to the following read lock. */ - down_read(&md->lock); + down_read(&md->io_lock); } __split_bio(md, bio); - up_read(&md->lock); + up_read(&md->io_lock); return 0; } +static int dm_flush_all(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + int ret = -ENXIO; + + if (map) { + ret = dm_table_flush_all(map); + dm_table_put(map); + } + + return ret; +} + static void dm_unplug_all(request_queue_t *q) { struct mapped_device *md = q->queuedata; @@ -624,109 +680,86 @@ static int dm_any_congested(void *conges } /*----------------------------------------------------------------- - * A bitset is used to keep track of allocated minor numbers. + * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ static DECLARE_MUTEX(_minor_lock); -static unsigned long *_minor_bits = NULL; -static unsigned long _max_minors = 0; - -#define MINORS_SIZE(minors) ((minors / BITS_PER_LONG) * sizeof(unsigned long)) - -static int realloc_minor_bits(unsigned long requested_minor) -{ - unsigned long max_minors; - unsigned long *minor_bits, *tmp; - - if (requested_minor < _max_minors) - return -EINVAL; - - /* Round up the requested minor to the next power-of-2. */ - max_minors = 1 << fls(requested_minor - 1); - if (max_minors > (1 << MINORBITS)) - return -EINVAL; - - minor_bits = kmalloc(MINORS_SIZE(max_minors), GFP_KERNEL); - if (!minor_bits) - return -ENOMEM; - memset(minor_bits, 0, MINORS_SIZE(max_minors)); - - /* Copy the existing bit-set to the new one. */ - if (_minor_bits) - memcpy(minor_bits, _minor_bits, MINORS_SIZE(_max_minors)); - - tmp = _minor_bits; - _minor_bits = minor_bits; - _max_minors = max_minors; - if (tmp) - kfree(tmp); - - return 0; -} - -static void free_minor_bits(void) -{ - down(&_minor_lock); - kfree(_minor_bits); - _minor_bits = NULL; - _max_minors = 0; - up(&_minor_lock); -} +static DEFINE_IDR(_minor_idr); static void free_minor(unsigned int minor) { down(&_minor_lock); - if (minor < _max_minors) - clear_bit(minor, _minor_bits); + idr_remove(&_minor_idr, minor); up(&_minor_lock); } /* * See if the device with a specific minor # is free. 
*/ -static int specific_minor(unsigned int minor) +static int specific_minor(struct mapped_device *md, unsigned int minor) { - int r = 0; + int r, m; - if (minor > (1 << MINORBITS)) + if (minor >= (1 << MINORBITS)) return -EINVAL; down(&_minor_lock); - if (minor >= _max_minors) { - r = realloc_minor_bits(minor); - if (r) { - up(&_minor_lock); - return r; - } + + if (idr_find(&_minor_idr, minor)) { + r = -EBUSY; + goto out; + } + + r = idr_pre_get(&_minor_idr, GFP_KERNEL); + if (!r) { + r = -ENOMEM; + goto out; + } + + r = idr_get_new_above(&_minor_idr, md, minor, &m); + if (r) { + goto out; } - if (test_and_set_bit(minor, _minor_bits)) + if (m != minor) { + idr_remove(&_minor_idr, m); r = -EBUSY; - up(&_minor_lock); + goto out; + } +out: + up(&_minor_lock); return r; } -static int next_free_minor(unsigned int *minor) +static int next_free_minor(struct mapped_device *md, unsigned int *minor) { int r; unsigned int m; down(&_minor_lock); - m = find_first_zero_bit(_minor_bits, _max_minors); - if (m >= _max_minors) { - r = realloc_minor_bits(_max_minors * 2); - if (r) { - up(&_minor_lock); - return r; - } - m = find_first_zero_bit(_minor_bits, _max_minors); + + r = idr_pre_get(&_minor_idr, GFP_KERNEL); + if (!r) { + r = -ENOMEM; + goto out; + } + + r = idr_get_new(&_minor_idr, md, &m); + if (r) { + goto out; + } + + if (m >= (1 << MINORBITS)) { + idr_remove(&_minor_idr, m); + r = -ENOSPC; + goto out; } - set_bit(m, _minor_bits); *minor = m; - up(&_minor_lock); - return 0; +out: + up(&_minor_lock); + return r; } static struct block_device_operations dm_blk_dops; @@ -745,12 +778,13 @@ static struct mapped_device *alloc_dev(u } /* get a minor number for the dev */ - r = persistent ? specific_minor(minor) : next_free_minor(&minor); + r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); if (r < 0) goto bad1; memset(md, 0, sizeof(*md)); - init_rwsem(&md->lock); + init_rwsem(&md->io_lock); + init_MUTEX(&md->suspend_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->event_nr, 0); @@ -764,6 +798,7 @@ static struct mapped_device *alloc_dev(u md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); md->queue->unplug_fn = dm_unplug_all; + md->queue->issue_flush_fn = dm_flush_all; md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, mempool_free_slab, _io_cache); @@ -823,22 +858,17 @@ static void event_callback(void *context { struct mapped_device *md = (struct mapped_device *) context; - atomic_inc(&md->event_nr);; + atomic_inc(&md->event_nr); wake_up(&md->eventq); } -static void __set_size(struct gendisk *disk, sector_t size) +static void __set_size(struct mapped_device *md, sector_t size) { - struct block_device *bdev; + set_capacity(md->disk, size); - set_capacity(disk, size); - bdev = bdget_disk(disk, 0); - if (bdev) { - down(&bdev->bd_inode->i_sem); - i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - up(&bdev->bd_inode->i_sem); - bdput(bdev); - } + down(&md->suspended_bdev->bd_inode->i_sem); + i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); + up(&md->suspended_bdev->bd_inode->i_sem); } static int __bind(struct mapped_device *md, struct dm_table *t) @@ -847,17 +877,18 @@ static int __bind(struct mapped_device * sector_t size; size = dm_table_get_size(t); - __set_size(md->disk, size); + __set_size(md, size); if (size == 0) return 0; + dm_table_get(t); + dm_table_event_callback(t, event_callback, md); + write_lock(&md->map_lock); md->map = t; + dm_table_set_restrictions(t, 
q); write_unlock(&md->map_lock); - dm_table_get(t); - dm_table_event_callback(md->map, event_callback, md); - dm_table_set_restrictions(t, q); return 0; } @@ -901,6 +932,32 @@ int dm_create_with_minor(unsigned int mi return create_aux(minor, 1, result); } +void *dm_get_mdptr(dev_t dev) +{ + struct mapped_device *md; + void *mdptr = NULL; + unsigned minor = MINOR(dev); + + if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) + return NULL; + + down(&_minor_lock); + + md = idr_find(&_minor_idr, minor); + + if (md && (dm_disk(md)->first_minor == minor)) + mdptr = md->interface_ptr; + + up(&_minor_lock); + + return mdptr; +} + +void dm_set_mdptr(struct mapped_device *md, void *ptr) +{ + md->interface_ptr = ptr; +} + void dm_get(struct mapped_device *md) { atomic_inc(&md->holders); @@ -911,8 +968,10 @@ void dm_put(struct mapped_device *md) struct dm_table *map = dm_get_table(md); if (atomic_dec_and_test(&md->holders)) { - if (!test_bit(DMF_SUSPENDED, &md->flags) && map) - dm_table_suspend_targets(map); + if (!dm_suspended(md)) { + dm_table_presuspend_targets(map); + dm_table_postsuspend_targets(map); + } __unbind(md); free_dev(md); } @@ -940,69 +999,55 @@ static void __flush_deferred_io(struct m */ int dm_swap_table(struct mapped_device *md, struct dm_table *table) { - int r; + int r = -EINVAL; - down_write(&md->lock); + down(&md->suspend_lock); /* device must be suspended */ - if (!test_bit(DMF_SUSPENDED, &md->flags)) { - up_write(&md->lock); - return -EPERM; - } + if (!dm_suspended(md)) + goto out; __unbind(md); r = __bind(md, table); - if (r) - return r; - up_write(&md->lock); - return 0; +out: + up(&md->suspend_lock); + return r; } /* * Functions to lock and unlock any filesystem running on the * device. */ -static int __lock_fs(struct mapped_device *md) +static int lock_fs(struct mapped_device *md) { - struct block_device *bdev; + int r; - if (test_and_set_bit(DMF_FS_LOCKED, &md->flags)) - return 0; + WARN_ON(md->frozen_sb); - bdev = bdget_disk(md->disk, 0); - if (!bdev) { - DMWARN("bdget failed in __lock_fs"); - return -ENOMEM; + md->frozen_sb = freeze_bdev(md->suspended_bdev); + if (IS_ERR(md->frozen_sb)) { + r = PTR_ERR(md->frozen_sb); + md->frozen_sb = NULL; + return r; } - WARN_ON(md->frozen_sb); - md->frozen_sb = freeze_bdev(bdev); + set_bit(DMF_FROZEN, &md->flags); + /* don't bdput right now, we don't want the bdev - * to go away while it is locked. We'll bdput - * in __unlock_fs + * to go away while it is locked. */ return 0; } -static int __unlock_fs(struct mapped_device *md) +static void unlock_fs(struct mapped_device *md) { - struct block_device *bdev; - - if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags)) - return 0; - - bdev = bdget_disk(md->disk, 0); - if (!bdev) { - DMWARN("bdget failed in __unlock_fs"); - return -ENOMEM; - } + if (!test_bit(DMF_FROZEN, &md->flags)) + return; - thaw_bdev(bdev, md->frozen_sb); + thaw_bdev(md->suspended_bdev, md->frozen_sb); md->frozen_sb = NULL; - bdput(bdev); - bdput(bdev); - return 0; + clear_bit(DMF_FROZEN, &md->flags); } /* @@ -1012,46 +1057,48 @@ static int __unlock_fs(struct mapped_dev * dm_bind_table, dm_suspend must be called to flush any in * flight bios and ensure that any further io gets deferred. */ -int dm_suspend(struct mapped_device *md) +int dm_suspend(struct mapped_device *md, int do_lockfs) { - struct dm_table *map; + struct dm_table *map = NULL; DECLARE_WAITQUEUE(wait, current); + int r = -EINVAL; - /* Flush I/O to the device. 
*/ - down_read(&md->lock); - if (test_bit(DMF_BLOCK_IO, &md->flags)) { - up_read(&md->lock); - return -EINVAL; + down(&md->suspend_lock); + + if (dm_suspended(md)) + goto out; + + map = dm_get_table(md); + + /* This does not get reverted if there's an error later. */ + dm_table_presuspend_targets(map); + + md->suspended_bdev = bdget_disk(md->disk, 0); + if (!md->suspended_bdev) { + DMWARN("bdget failed in dm_suspend"); + r = -ENOMEM; + goto out; } - __lock_fs(md); - up_read(&md->lock); + /* Flush I/O to the device. */ + if (do_lockfs) { + r = lock_fs(md); + if (r) + goto out; + } /* - * First we set the BLOCK_IO flag so no more ios will be - * mapped. + * First we set the BLOCK_IO flag so no more ios will be mapped. */ - down_write(&md->lock); - if (test_bit(DMF_BLOCK_IO, &md->flags)) { - /* - * If we get here we know another thread is - * trying to suspend as well, so we leave the fs - * locked for this thread. - */ - up_write(&md->lock); - return -EINVAL; - } - + down_write(&md->io_lock); set_bit(DMF_BLOCK_IO, &md->flags); + add_wait_queue(&md->wait, &wait); - up_write(&md->lock); + up_write(&md->io_lock); /* unplug */ - map = dm_get_table(md); - if (map) { + if (map) dm_table_unplug_all(map); - dm_table_put(map); - } /* * Then we wait for the already mapped ios to @@ -1067,54 +1114,75 @@ int dm_suspend(struct mapped_device *md) } set_current_state(TASK_RUNNING); - down_write(&md->lock); + down_write(&md->io_lock); remove_wait_queue(&md->wait, &wait); /* were we interrupted ? */ + r = -EINTR; if (atomic_read(&md->pending)) { - __unlock_fs(md); + up_write(&md->io_lock); + unlock_fs(md); clear_bit(DMF_BLOCK_IO, &md->flags); - up_write(&md->lock); - return -EINTR; + goto out; } + up_write(&md->io_lock); + + dm_table_postsuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); - map = dm_get_table(md); - if (map) - dm_table_suspend_targets(map); - dm_table_put(map); - up_write(&md->lock); + r = 0; - return 0; +out: + if (r && md->suspended_bdev) { + bdput(md->suspended_bdev); + md->suspended_bdev = NULL; + } + + dm_table_put(map); + up(&md->suspend_lock); + return r; } int dm_resume(struct mapped_device *md) { + int r = -EINVAL; struct bio *def; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = NULL; - down_write(&md->lock); - if (!map || - !test_bit(DMF_SUSPENDED, &md->flags) || - !dm_table_get_size(map)) { - up_write(&md->lock); - dm_table_put(map); - return -EINVAL; - } + down(&md->suspend_lock); + if (!dm_suspended(md)) + goto out; + + map = dm_get_table(md); + if (!map || !dm_table_get_size(map)) + goto out; dm_table_resume_targets(map); - clear_bit(DMF_SUSPENDED, &md->flags); + + down_write(&md->io_lock); clear_bit(DMF_BLOCK_IO, &md->flags); def = bio_list_get(&md->deferred); __flush_deferred_io(md, def); - up_write(&md->lock); - __unlock_fs(md); + up_write(&md->io_lock); + + unlock_fs(md); + + bdput(md->suspended_bdev); + md->suspended_bdev = NULL; + + clear_bit(DMF_SUSPENDED, &md->flags); + dm_table_unplug_all(map); + + r = 0; + +out: dm_table_put(map); + up(&md->suspend_lock); - return 0; + return r; } /*----------------------------------------------------------------- @@ -1151,6 +1219,8 @@ static struct block_device_operations dm .owner = THIS_MODULE }; +EXPORT_SYMBOL(dm_get_mapinfo); + /* * module hooks */ @@ -1160,5 +1230,5 @@ module_exit(dm_exit); module_param(major, uint, 0); MODULE_PARM_DESC(major, "The major number of the device mapper"); MODULE_DESCRIPTION(DM_NAME " driver"); -MODULE_AUTHOR("Joe Thornber "); +MODULE_AUTHOR("Joe Thornber "); 
MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/dm-crypt.c ./drivers/md/dm-crypt.c --- ./drivers/md.dm/dm-crypt.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-crypt.c 2006-03-17 13:16:38.000000000 +0300 @@ -40,8 +40,8 @@ struct convert_context { struct bio *bio_out; unsigned int offset_in; unsigned int offset_out; - int idx_in; - int idx_out; + unsigned int idx_in; + unsigned int idx_out; sector_t sector; int write; }; @@ -67,8 +67,8 @@ struct crypt_config { struct crypto_tfm *tfm; sector_t iv_offset; int (*iv_generator)(struct crypt_config *cc, u8 *iv, sector_t sector); - int iv_size; - int key_size; + unsigned int iv_size; + unsigned int key_size; u8 key[0]; }; @@ -97,10 +97,8 @@ static void mempool_free_page(void *page */ static int crypt_iv_plain(struct crypt_config *cc, u8 *iv, sector_t sector) { + memset(iv, 0, cc->iv_size); *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); - if (cc->iv_size > sizeof(u32) / sizeof(u8)) - memset(iv + (sizeof(u32) / sizeof(u8)), 0, - cc->iv_size - (sizeof(u32) / sizeof(u8))); return 0; } @@ -200,13 +198,13 @@ static int crypt_convert(struct crypt_co */ static struct bio * crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, - struct bio *base_bio, int *bio_vec_idx) + struct bio *base_bio, unsigned int *bio_vec_idx) { struct bio *bio; - int nr_iovecs = dm_div_up(size, PAGE_SIZE); + unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; int gfp_mask = GFP_NOIO | __GFP_HIGHMEM; - int flags = current->flags; - int i; + unsigned long flags = current->flags; + unsigned int i; /* * Tell VM to act less aggressively and fail earlier. @@ -280,9 +278,8 @@ crypt_alloc_buffer(struct crypt_config * static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *bio, unsigned int bytes) { - unsigned int start, end; + unsigned int i, start, end; struct bio_vec *bv; - int i; /* * This is ugly, but Jens Axboe thinks that using bi_idx in the @@ -366,11 +363,11 @@ static void kcryptd_queue_io(struct cryp /* * Decode key from its hex representation */ -static int crypt_decode_key(u8 *key, char *hex, int size) +static int crypt_decode_key(u8 *key, char *hex, unsigned int size) { char buffer[3]; char *endp; - int i; + unsigned int i; buffer[2] = '\0'; @@ -393,9 +390,9 @@ static int crypt_decode_key(u8 *key, cha /* * Encode key into its hex representation */ -static void crypt_encode_key(char *hex, u8 *key, int size) +static void crypt_encode_key(char *hex, u8 *key, unsigned int size) { - int i; + unsigned int i; for(i = 0; i < size; i++) { sprintf(hex, "%02x", *key); @@ -415,8 +412,8 @@ static int crypt_ctr(struct dm_target *t char *tmp; char *cipher; char *mode; - int crypto_flags; - int key_size; + unsigned int crypto_flags; + unsigned int key_size; if (argc != 5) { ti->error = PFX "Not enough arguments"; @@ -464,9 +461,9 @@ static int crypt_ctr(struct dm_target *t } if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) - /* at least a 32 bit sector number should fit in our buffer */ + /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), - (unsigned int)(sizeof(u32) / sizeof(u8))); + (unsigned int)(sizeof(u64) / sizeof(u8))); else { cc->iv_size = 0; if (cc->iv_generator) { @@ -528,6 +525,8 @@ bad3: bad2: crypto_free_tfm(tfm); bad1: + /* Must zero key material before freeing */ + memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); kfree(cc); return -EINVAL; } @@ -541,6 +540,9 @@ static void crypt_dtr(struct dm_target * crypto_free_tfm(cc->tfm); dm_put_device(ti, 
cc->dev); + + /* Must zero key material before freeing */ + memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); kfree(cc); } @@ -577,7 +579,8 @@ static int crypt_endio(struct bio *bio, static inline struct bio * crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, - sector_t sector, int *bvec_idx, struct convert_context *ctx) + sector_t sector, unsigned int *bvec_idx, + struct convert_context *ctx) { struct bio *clone; @@ -630,7 +633,7 @@ static int crypt_map(struct dm_target *t struct bio *clone; unsigned int remaining = bio->bi_size; sector_t sector = bio->bi_sector - ti->begin; - int bvec_idx = 0; + unsigned int bvec_idx = 0; io->target = ti; io->bio = bio; @@ -693,7 +696,7 @@ static int crypt_status(struct dm_target char buffer[32]; const char *cipher; const char *mode = NULL; - int offset; + unsigned int offset; switch (type) { case STATUSTYPE_INFO: diff -pruN ./drivers/md.dm/dm-emc.c ./drivers/md/dm-emc.c --- ./drivers/md.dm/dm-emc.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-emc.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,359 @@ +/* + * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath support for EMC CLARiiON AX/CX-series hardware. + */ + +#include "dm.h" +#include "dm-hw-handler.h" +#include +#include + +struct emc_handler { + spinlock_t lock; + + /* Whether we should send the short trespass command (FC-series) + * or the long version (default for AX/CX CLARiiON arrays). */ + unsigned short_trespass; + /* Whether or not to honor SCSI reservations when initiating a + * switch-over. Default: Don't. */ + unsigned hr; + + unsigned char sense[SCSI_SENSE_BUFFERSIZE]; +}; + +#define TRESPASS_PAGE 0x22 +#define EMC_FAILOVER_TIMEOUT (60 * HZ) + +/* Code borrowed from dm-lsi-rdac by Mike Christie */ + +static inline void free_bio(struct bio *bio) +{ + __free_page(bio->bi_io_vec[0].bv_page); + bio_put(bio); +} + +static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) +{ + struct path *path = bio->bi_private; + + if (bio->bi_size) + return 1; + + /* We also need to look at the sense keys here whether or not to + * switch to the next PG etc. + * + * For now simple logic: either it works or it doesn't. + */ + if (error) + dm_pg_init_complete(path, MP_FAIL_PATH); + else + dm_pg_init_complete(path, 0); + + /* request is freed in block layer */ + free_bio(bio); + + return 0; +} + +static struct bio *get_failover_bio(struct path *path, unsigned data_size) +{ + struct bio *bio; + struct page *page; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) { + DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); + return NULL; + } + + bio->bi_rw |= (1 << BIO_RW); + bio->bi_bdev = path->dev->bdev; + bio->bi_sector = 0; + bio->bi_private = path; + bio->bi_end_io = emc_endio; + + page = alloc_page(GFP_ATOMIC); + if (!page) { + DMERR("dm-emc: get_failover_bio: alloc_page() failed."); + bio_put(bio); + return NULL; + } + + if (bio_add_page(bio, page, data_size, 0) != data_size) { + DMERR("dm-emc: get_failover_bio: alloc_page() failed."); + __free_page(page); + bio_put(bio); + return NULL; + } + + return bio; +} + +static struct request *get_failover_req(struct emc_handler *h, + struct bio *bio, struct path *path) +{ + struct request *rq; + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev_get_queue(bdev); + + /* FIXME: Figure out why it fails with GFP_ATOMIC. 
*/ + rq = blk_get_request(q, WRITE, __GFP_WAIT); + if (!rq) { + DMERR("dm-emc: get_failover_req: blk_get_request failed"); + return NULL; + } + + rq->bio = rq->biotail = bio; + blk_rq_bio_prep(q, rq, bio); + + rq->rq_disk = bdev->bd_contains->bd_disk; + + /* bio backed don't set data */ + rq->buffer = rq->data = NULL; + /* rq data_len used for pc cmd's request_bufflen */ + rq->data_len = bio->bi_size; + + rq->sense = h->sense; + memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); + rq->sense_len = 0; + + memset(&rq->cmd, 0, BLK_MAX_CDB); + + rq->timeout = EMC_FAILOVER_TIMEOUT; + rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); + + return rq; +} + +static struct request *emc_trespass_get(struct emc_handler *h, + struct path *path) +{ + struct bio *bio; + struct request *rq; + unsigned char *page22; + unsigned char long_trespass_pg[] = { + 0, 0, 0, 0, + TRESPASS_PAGE, /* Page code */ + 0x09, /* Page length - 2 */ + h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ + 0xff, 0xff, /* Trespass target */ + 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */ + }; + unsigned char short_trespass_pg[] = { + 0, 0, 0, 0, + TRESPASS_PAGE, /* Page code */ + 0x02, /* Page length - 2 */ + h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ + 0xff, /* Trespass target */ + }; + unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) : + sizeof(long_trespass_pg); + + /* get bio backing */ + if (data_size > PAGE_SIZE) + /* this should never happen */ + return NULL; + + bio = get_failover_bio(path, data_size); + if (!bio) { + DMERR("dm-emc: emc_trespass_get: no bio"); + return NULL; + } + + page22 = (unsigned char *)bio_data(bio); + memset(page22, 0, data_size); + + memcpy(page22, h->short_trespass ? + short_trespass_pg : long_trespass_pg, data_size); + + /* get request for block layer packet command */ + rq = get_failover_req(h, bio, path); + if (!rq) { + DMERR("dm-emc: emc_trespass_get: no rq"); + free_bio(bio); + return NULL; + } + + /* Prepare the command. */ + rq->cmd[0] = MODE_SELECT; + rq->cmd[1] = 0x10; + rq->cmd[4] = data_size; + rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); + + return rq; +} + +static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, + struct path *path) +{ + struct request *rq; + struct request_queue *q = bdev_get_queue(path->dev->bdev); + + /* + * We can either blindly init the pg (then look at the sense), + * or we can send some commands to get the state here (then + * possibly send the fo cmnd), or we can also have the + * initial state passed into us and then get an update here. + */ + if (!q) { + DMINFO("dm-emc: emc_pg_init: no queue"); + goto fail_path; + } + + /* FIXME: The request should be pre-allocated. 
*/ + rq = emc_trespass_get(hwh->context, path); + if (!rq) { + DMERR("dm-emc: emc_pg_init: no rq"); + goto fail_path; + } + + DMINFO("dm-emc: emc_pg_init: sending switch-over command"); + elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); + return; + +fail_path: + dm_pg_init_complete(path, MP_FAIL_PATH); +} + +static struct emc_handler *alloc_emc_handler(void) +{ + struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL); + + if (h) { + memset(h, 0, sizeof(*h)); + spin_lock_init(&h->lock); + } + + return h; +} + +static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) +{ + struct emc_handler *h; + unsigned hr, short_trespass; + + if (argc == 0) { + /* No arguments: use defaults */ + hr = 0; + short_trespass = 0; + } else if (argc != 2) { + DMWARN("dm-emc hwhandler: incorrect number of arguments"); + return -EINVAL; + } else { + if ((sscanf(argv[0], "%u", &short_trespass) != 1) + || (short_trespass > 1)) { + DMWARN("dm-emc: invalid trespass mode selected"); + return -EINVAL; + } + + if ((sscanf(argv[1], "%u", &hr) != 1) + || (hr > 1)) { + DMWARN("dm-emc: invalid honor reservation flag selected"); + return -EINVAL; + } + } + + h = alloc_emc_handler(); + if (!h) + return -ENOMEM; + + hwh->context = h; + + if ((h->short_trespass = short_trespass)) + DMWARN("dm-emc: short trespass command will be send"); + else + DMWARN("dm-emc: long trespass command will be send"); + + if ((h->hr = hr)) + DMWARN("dm-emc: honor reservation bit will be set"); + else + DMWARN("dm-emc: honor reservation bit will not be set (default)"); + + return 0; +} + +static void emc_destroy(struct hw_handler *hwh) +{ + struct emc_handler *h = (struct emc_handler *) hwh->context; + + kfree(h); + hwh->context = NULL; +} + +static unsigned emc_error(struct hw_handler *hwh, struct bio *bio) +{ + /* FIXME: Patch from axboe still missing */ +#if 0 + int sense; + + if (bio->bi_error & BIO_SENSE) { + sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */ + + if (sense == 0x020403) { + /* LUN Not Ready - Manual Intervention Required + * indicates this is a passive path. + * + * FIXME: However, if this is seen and EVPD C0 + * indicates that this is due to a NDU in + * progress, we should set FAIL_PATH too. + * This indicates we might have to do a SCSI + * inquiry in the end_io path. Ugh. */ + return MP_BYPASS_PG | MP_RETRY_IO; + } else if (sense == 0x052501) { + /* An array based copy is in progress. Do not + * fail the path, do not bypass to another PG, + * do not retry. Fail the IO immediately. + * (Actually this is the same conclusion as in + * the default handler, but lets make sure.) */ + return 0; + } else if (sense == 0x062900) { + /* Unit Attention Code. This is the first IO + * to the new path, so just retry. 
*/ + return MP_RETRY_IO; + } + } +#endif + + /* Try default handler */ + return dm_scsi_err_handler(hwh, bio); +} + +static struct hw_handler_type emc_hwh = { + .name = "emc", + .module = THIS_MODULE, + .create = emc_create, + .destroy = emc_destroy, + .pg_init = emc_pg_init, + .error = emc_error, +}; + +static int __init dm_emc_init(void) +{ + int r = dm_register_hw_handler(&emc_hwh); + + if (r < 0) + DMERR("emc: register failed %d", r); + + DMINFO("dm-emc version 0.0.3 loaded"); + + return r; +} + +static void __exit dm_emc_exit(void) +{ + int r = dm_unregister_hw_handler(&emc_hwh); + + if (r < 0) + DMERR("emc: unregister failed %d", r); +} + +module_init(dm_emc_init); +module_exit(dm_emc_exit); + +MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath"); +MODULE_AUTHOR("Lars Marowsky-Bree "); +MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/dm.h ./drivers/md/dm.h --- ./drivers/md.dm/dm.h 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm.h 2006-03-17 13:16:38.000000000 +0300 @@ -19,6 +19,9 @@ #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) +#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ + 0 : scnprintf(result + sz, maxlen - sz, x)) + /* * FIXME: I think this should be with the definition of sector_t * in types.h. @@ -40,6 +43,7 @@ struct dm_dev { atomic_t count; int mode; struct block_device *bdev; + char name[16]; }; struct dm_table; @@ -51,6 +55,8 @@ struct mapped_device; *---------------------------------------------------------------*/ int dm_create(struct mapped_device **md); int dm_create_with_minor(unsigned int minor, struct mapped_device **md); +void dm_set_mdptr(struct mapped_device *md, void *ptr); +void *dm_get_mdptr(dev_t dev); /* * Reference counting for md. @@ -61,7 +67,7 @@ void dm_put(struct mapped_device *md); /* * A device can still be used while suspended, but I/O is deferred. */ -int dm_suspend(struct mapped_device *md); +int dm_suspend(struct mapped_device *md, int with_lockfs); int dm_resume(struct mapped_device *md); /* @@ -109,10 +115,12 @@ void dm_table_set_restrictions(struct dm unsigned int dm_table_get_num_targets(struct dm_table *t); struct list_head *dm_table_get_devices(struct dm_table *t); int dm_table_get_mode(struct dm_table *t); -void dm_table_suspend_targets(struct dm_table *t); +void dm_table_presuspend_targets(struct dm_table *t); +void dm_table_postsuspend_targets(struct dm_table *t); void dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); void dm_table_unplug_all(struct dm_table *t); +int dm_table_flush_all(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. @@ -135,21 +143,22 @@ static inline int array_too_big(unsigned } /* - * ceiling(n / size) * size + * Ceiling(n / sz) */ -static inline unsigned long dm_round_up(unsigned long n, unsigned long size) -{ - unsigned long r = n % size; - return n + (r ? 
(size - r) : 0); -} +#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz)) + +#define dm_sector_div_up(n, sz) ( \ +{ \ + sector_t _r = ((n) + (sz) - 1); \ + sector_div(_r, (sz)); \ + _r; \ +} \ +) /* - * Ceiling(n / size) + * ceiling(n / size) * size */ -static inline unsigned long dm_div_up(unsigned long n, unsigned long size) -{ - return dm_round_up(n, size) / size; -} +#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) static inline sector_t to_sector(unsigned long n) { @@ -161,6 +170,8 @@ static inline unsigned long to_bytes(sec return (n << 9); } +int dm_split_args(int *argc, char ***argvp, char *input); + /* * The device-mapper can be driven through one of two interfaces; * ioctl or filesystem, depending which patch you have applied. @@ -178,5 +189,6 @@ int dm_stripe_init(void); void dm_stripe_exit(void); void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); +union map_info *dm_get_mapinfo(struct bio *bio); #endif diff -pruN ./drivers/md.dm/dm-hw-handler.c ./drivers/md/dm-hw-handler.c --- ./drivers/md.dm/dm-hw-handler.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-hw-handler.c 2006-03-20 09:38:13.000000000 +0300 @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath hardware handler registration. + */ + +#include "dm.h" +#include "dm-hw-handler.h" + +#include + +struct hwh_internal { + struct hw_handler_type hwht; + + struct list_head list; + long use; +}; + +#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht) + +static LIST_HEAD(_hw_handlers); +static DECLARE_RWSEM(_hwh_lock); + +struct hwh_internal *__find_hw_handler_type(const char *name) +{ + struct hwh_internal *hwhi; + + list_for_each_entry(hwhi, &_hw_handlers, list) { + if (!strcmp(name, hwhi->hwht.name)) + return hwhi; + } + + return NULL; +} + +static struct hwh_internal *get_hw_handler(const char *name) +{ + struct hwh_internal *hwhi; + + down_read(&_hwh_lock); + hwhi = __find_hw_handler_type(name); + if (hwhi) { + if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module)) + hwhi = NULL; + else + hwhi->use++; + } + up_read(&_hwh_lock); + + return hwhi; +} + +struct hw_handler_type *dm_get_hw_handler(const char *name) +{ + struct hwh_internal *hwhi; + + if (!name) + return NULL; + + hwhi = get_hw_handler(name); + if (!hwhi) { + request_module("dm-%s", name); + hwhi = get_hw_handler(name); + } + + return hwhi ? 
&hwhi->hwht : NULL; +} + +void dm_put_hw_handler(struct hw_handler_type *hwht) +{ + struct hwh_internal *hwhi; + + if (!hwht) + return; + + down_read(&_hwh_lock); + hwhi = __find_hw_handler_type(hwht->name); + if (!hwhi) + goto out; + + if (--hwhi->use == 0) + module_put(hwhi->hwht.module); + + if (hwhi->use < 0) + BUG(); + + out: + up_read(&_hwh_lock); +} + +static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht) +{ + struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL); + + if (hwhi) { + memset(hwhi, 0, sizeof(*hwhi)); + hwhi->hwht = *hwht; + } + + return hwhi; +} + +int dm_register_hw_handler(struct hw_handler_type *hwht) +{ + int r = 0; + struct hwh_internal *hwhi = _alloc_hw_handler(hwht); + + if (!hwhi) + return -ENOMEM; + + down_write(&_hwh_lock); + + if (__find_hw_handler_type(hwht->name)) { + kfree(hwhi); + r = -EEXIST; + } else + list_add(&hwhi->list, &_hw_handlers); + + up_write(&_hwh_lock); + + return r; +} + +int dm_unregister_hw_handler(struct hw_handler_type *hwht) +{ + struct hwh_internal *hwhi; + + down_write(&_hwh_lock); + + hwhi = __find_hw_handler_type(hwht->name); + if (!hwhi) { + up_write(&_hwh_lock); + return -EINVAL; + } + + if (hwhi->use) { + up_write(&_hwh_lock); + return -ETXTBSY; + } + + list_del(&hwhi->list); + + up_write(&_hwh_lock); + + kfree(hwhi); + + return 0; +} + +unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio) +{ +#if 0 + int sense_key, asc, ascq; + + if (bio->bi_error & BIO_SENSE) { + /* FIXME: This is just an initial guess. */ + /* key / asc / ascq */ + sense_key = (bio->bi_error >> 16) & 0xff; + asc = (bio->bi_error >> 8) & 0xff; + ascq = bio->bi_error & 0xff; + + switch (sense_key) { + /* This block as a whole comes from the device. + * So no point retrying on another path. */ + case 0x03: /* Medium error */ + case 0x05: /* Illegal request */ + case 0x07: /* Data protect */ + case 0x08: /* Blank check */ + case 0x0a: /* copy aborted */ + case 0x0c: /* obsolete - no clue ;-) */ + case 0x0d: /* volume overflow */ + case 0x0e: /* data miscompare */ + case 0x0f: /* reserved - no idea either. */ + return MP_ERROR_IO; + + /* For these errors it's unclear whether they + * come from the device or the controller. + * So just lets try a different path, and if + * it eventually succeeds, user-space will clear + * the paths again... */ + case 0x02: /* Not ready */ + case 0x04: /* Hardware error */ + case 0x09: /* vendor specific */ + case 0x0b: /* Aborted command */ + return MP_FAIL_PATH; + + case 0x06: /* Unit attention - might want to decode */ + if (asc == 0x04 && ascq == 0x01) + /* "Unit in the process of + * becoming ready" */ + return 0; + return MP_FAIL_PATH; + + /* FIXME: For Unit Not Ready we may want + * to have a generic pg activation + * feature (START_UNIT). */ + + /* Should these two ever end up in the + * error path? I don't think so. */ + case 0x00: /* No sense */ + case 0x01: /* Recovered error */ + return 0; + } + } +#endif + + /* We got no idea how to decode the other kinds of errors -> + * assume generic error condition. */ + return MP_FAIL_PATH; +} + +EXPORT_SYMBOL_GPL(dm_register_hw_handler); +EXPORT_SYMBOL_GPL(dm_unregister_hw_handler); +EXPORT_SYMBOL_GPL(dm_scsi_err_handler); diff -pruN ./drivers/md.dm/dm-hw-handler.h ./drivers/md/dm-hw-handler.h --- ./drivers/md.dm/dm-hw-handler.h 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-hw-handler.h 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+ * + * This file is released under the GPL. + * + * Multipath hardware handler registration. + */ + +#ifndef DM_HW_HANDLER_H +#define DM_HW_HANDLER_H + +#include + +#include "dm-mpath.h" + +struct hw_handler_type; +struct hw_handler { + struct hw_handler_type *type; + void *context; +}; + +/* + * Constructs a hardware handler object, takes custom arguments + */ +/* Information about a hardware handler type */ +struct hw_handler_type { + char *name; + struct module *module; + + int (*create) (struct hw_handler *handler, unsigned int argc, + char **argv); + void (*destroy) (struct hw_handler *hwh); + + void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, + struct path *path); + unsigned (*error) (struct hw_handler *hwh, struct bio *bio); + int (*status) (struct hw_handler *hwh, status_type_t type, + char *result, unsigned int maxlen); +}; + +/* Register a hardware handler */ +int dm_register_hw_handler(struct hw_handler_type *type); + +/* Unregister a hardware handler */ +int dm_unregister_hw_handler(struct hw_handler_type *type); + +/* Returns a registered hardware handler type */ +struct hw_handler_type *dm_get_hw_handler(const char *name); + +/* Releases a hardware handler */ +void dm_put_hw_handler(struct hw_handler_type *hwht); + +/* Default err function */ +unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio); + +/* Error flags for err and dm_pg_init_complete */ +#define MP_FAIL_PATH 1 +#define MP_BYPASS_PG 2 +#define MP_ERROR_IO 4 /* Don't retry this I/O */ + +#endif diff -pruN ./drivers/md.dm/dm-io.c ./drivers/md/dm-io.c --- ./drivers/md.dm/dm-io.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-io.c 2006-03-17 13:16:38.000000000 +0300 @@ -267,7 +267,7 @@ static int resize_pool(unsigned int new_ /* create new pool */ _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); if (!_io_pool) - r = -ENOMEM; + return -ENOMEM; r = bio_set_init(&_bios, "dm-io", 512, 1); if (r) { diff -pruN ./drivers/md.dm/dm-ioctl.c ./drivers/md/dm-ioctl.c --- ./drivers/md.dm/dm-ioctl.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-ioctl.c 2006-03-17 13:16:38.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -17,7 +18,7 @@ #include -#define DM_DRIVER_EMAIL "dm@uk.sistina.com" +#define DM_DRIVER_EMAIL "dm-devel@redhat.com" /*----------------------------------------------------------------- * The ioctl interface needs to be able to look up devices by @@ -121,14 +122,6 @@ static struct hash_cell *__get_uuid_cell /*----------------------------------------------------------------- * Inserting, removing and renaming a device. 
*---------------------------------------------------------------*/ -static inline char *kstrdup(const char *str) -{ - char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); - if (r) - strcpy(r, str); - return r; -} - static struct hash_cell *alloc_cell(const char *name, const char *uuid, struct mapped_device *md) { @@ -138,7 +131,7 @@ static struct hash_cell *alloc_cell(cons if (!hc) return NULL; - hc->name = kstrdup(name); + hc->name = kstrdup(name, GFP_KERNEL); if (!hc->name) { kfree(hc); return NULL; @@ -148,7 +141,7 @@ static struct hash_cell *alloc_cell(cons hc->uuid = NULL; else { - hc->uuid = kstrdup(uuid); + hc->uuid = kstrdup(uuid, GFP_KERNEL); if (!hc->uuid) { kfree(hc->name); kfree(hc); @@ -224,6 +217,7 @@ static int dm_hash_insert(const char *na } register_with_devfs(cell); dm_get(md); + dm_set_mdptr(md, cell); up_write(&_hash_lock); return 0; @@ -236,10 +230,20 @@ static int dm_hash_insert(const char *na static void __hash_remove(struct hash_cell *hc) { + struct dm_table *table; + /* remove from the dev hash */ list_del(&hc->uuid_list); list_del(&hc->name_list); unregister_with_devfs(hc); + dm_set_mdptr(hc->md, NULL); + + table = dm_get_table(hc->md); + if (table) { + dm_table_event(table); + dm_table_put(table); + } + dm_put(hc->md); if (hc->new_map) dm_table_put(hc->new_map); @@ -266,11 +270,12 @@ static int dm_hash_rename(const char *ol { char *new_name, *old_name; struct hash_cell *hc; + struct dm_table *table; /* * duplicate new. */ - new_name = kstrdup(new); + new_name = kstrdup(new, GFP_KERNEL); if (!new_name) return -ENOMEM; @@ -313,6 +318,15 @@ static int dm_hash_rename(const char *ol /* rename the device node in devfs */ register_with_devfs(hc); + /* + * Wake up any dm event waiters. + */ + table = dm_get_table(hc->md); + if (table) { + dm_table_event(table); + dm_table_put(table); + } + up_write(&_hash_lock); kfree(old_name); return 0; @@ -421,8 +435,8 @@ static void list_version_get_needed(stru { size_t *needed = needed_param; + *needed += sizeof(struct dm_target_versions); *needed += strlen(tt->name); - *needed += sizeof(tt->version); *needed += ALIGN_MASK; } @@ -517,19 +531,22 @@ static int __dev_status(struct mapped_de if (dm_suspended(md)) param->flags |= DM_SUSPEND_FLAG; - bdev = bdget_disk(disk, 0); - if (!bdev) - return -ENXIO; - param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); - /* - * Yes, this will be out of date by the time it gets back - * to userland, but it is still very useful ofr - * debugging. - */ - param->open_count = bdev->bd_openers; - bdput(bdev); + if (!(param->flags & DM_SKIP_BDGET_FLAG)) { + bdev = bdget_disk(disk, 0); + if (!bdev) + return -ENXIO; + + /* + * Yes, this will be out of date by the time it gets back + * to userland, but it is still very useful for + * debugging. + */ + param->open_count = bdev->bd_openers; + bdput(bdev); + } else + param->open_count = -1; if (disk->policy) param->flags |= DM_READONLY_FLAG; @@ -579,12 +596,16 @@ static int dev_create(struct dm_ioctl *p } /* - * Always use UUID for lookups if it's present, otherwise use name. + * Always use UUID for lookups if it's present, otherwise use name or dev. */ static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) { - return *param->uuid ? 
- __get_uuid_cell(param->uuid) : __get_name_cell(param->name); + if (*param->uuid) + return __get_uuid_cell(param->uuid); + else if (*param->name) + return __get_name_cell(param->name); + else + return dm_get_mdptr(huge_decode_dev(param->dev)); } static inline struct mapped_device *find_device(struct dm_ioctl *param) @@ -596,6 +617,7 @@ static inline struct mapped_device *find hc = __find_device_hash_cell(param); if (hc) { md = hc->md; + dm_get(md); /* * Sneakily write in both the name and the uuid @@ -611,8 +633,6 @@ static inline struct mapped_device *find param->flags |= DM_INACTIVE_PRESENT_FLAG; else param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - - dm_get(md); } up_read(&_hash_lock); @@ -673,14 +693,18 @@ static int dev_rename(struct dm_ioctl *p static int do_suspend(struct dm_ioctl *param) { int r = 0; + int do_lockfs = 1; struct mapped_device *md; md = find_device(param); if (!md) return -ENXIO; + if (param->flags & DM_SKIP_LOCKFS_FLAG) + do_lockfs = 0; + if (!dm_suspended(md)) - r = dm_suspend(md); + r = dm_suspend(md, do_lockfs); if (!r) r = __dev_status(md, param); @@ -692,6 +716,7 @@ static int do_suspend(struct dm_ioctl *p static int do_resume(struct dm_ioctl *param) { int r = 0; + int do_lockfs = 1; struct hash_cell *hc; struct mapped_device *md; struct dm_table *new_map; @@ -717,8 +742,10 @@ static int do_resume(struct dm_ioctl *pa /* Do we need to load a new map ? */ if (new_map) { /* Suspend if it isn't already suspended */ + if (param->flags & DM_SKIP_LOCKFS_FLAG) + do_lockfs = 0; if (!dm_suspended(md)) - dm_suspend(md); + dm_suspend(md, do_lockfs); r = dm_swap_table(md, new_map); if (r) { @@ -964,6 +991,7 @@ static int table_load(struct dm_ioctl *p if (!hc) { DMWARN("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); + dm_table_put(t); return -ENXIO; } @@ -1097,6 +1125,67 @@ static int table_status(struct dm_ioctl return r; } +/* + * Pass a message to the target that's at the supplied device offset. + */ +static int target_message(struct dm_ioctl *param, size_t param_size) +{ + int r, argc; + char **argv; + struct mapped_device *md; + struct dm_table *table; + struct dm_target *ti; + struct dm_target_msg *tmsg = (void *) param + param->data_start; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + if (r) + goto out; + + if (tmsg < (struct dm_target_msg *) (param + 1) || + invalid_str(tmsg->message, (void *) param + param_size)) { + DMWARN("Invalid target message parameters."); + r = -EINVAL; + goto out; + } + + r = dm_split_args(&argc, &argv, tmsg->message); + if (r) { + DMWARN("Failed to split target message parameters"); + goto out; + } + + table = dm_get_table(md); + if (!table) + goto out_argv; + + if (tmsg->sector >= dm_table_get_size(table)) { + DMWARN("Target message sector outside device."); + r = -EINVAL; + goto out_table; + } + + ti = dm_table_find_target(table, tmsg->sector); + if (ti->type->message) + r = ti->type->message(ti, argc, argv); + else { + DMWARN("Target type does not support messages"); + r = -EINVAL; + } + + out_table: + dm_table_put(table); + out_argv: + kfree(argv); + out: + param->data_size = 0; + dm_put(md); + return r; +} + /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char * device. 
@@ -1123,7 +1212,9 @@ static ioctl_fn lookup_ioctl(unsigned in {DM_TABLE_DEPS_CMD, table_deps}, {DM_TABLE_STATUS_CMD, table_status}, - {DM_LIST_VERSIONS_CMD, list_versions} + {DM_LIST_VERSIONS_CMD, list_versions}, + + {DM_TARGET_MSG_CMD, target_message} }; return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; @@ -1202,14 +1293,14 @@ static int validate_params(uint cmd, str cmd == DM_LIST_VERSIONS_CMD) return 0; - /* Unless creating, either name or uuid but not both */ - if (cmd != DM_DEV_CREATE_CMD) { - if ((!*param->uuid && !*param->name) || - (*param->uuid && *param->name)) { - DMWARN("one of name or uuid must be supplied, cmd(%u)", - cmd); + if ((cmd == DM_DEV_CREATE_CMD)) { + if (!*param->name) { + DMWARN("name not supplied when creating device"); return -EINVAL; } + } else if ((*param->uuid && *param->name)) { + DMWARN("only supply one of name or uuid, cmd(%u)", cmd); + return -EINVAL; } /* Ensure strings are terminated */ @@ -1268,16 +1359,11 @@ static int ctl_ioctl(struct inode *inode * Copy the parameters into kernel space. */ r = copy_params(user, ¶m); - if (r) { - current->flags &= ~PF_MEMALLOC; - return r; - } - /* - * FIXME: eventually we will remove the PF_MEMALLOC flag - * here. However the tools still do nasty things like - * 'load' while a device is suspended. - */ + current->flags &= ~PF_MEMALLOC; + + if (r) + return r; r = validate_params(cmd, param); if (r) @@ -1295,7 +1381,6 @@ static int ctl_ioctl(struct inode *inode out: free_params(param); - current->flags &= ~PF_MEMALLOC; return r; } diff -pruN ./drivers/md.dm/dm-linear.c ./drivers/md/dm-linear.c --- ./drivers/md.dm/dm-linear.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-linear.c 2006-03-17 13:16:38.000000000 +0300 @@ -80,7 +80,6 @@ static int linear_status(struct dm_targe char *result, unsigned int maxlen) { struct linear_c *lc = (struct linear_c *) ti->private; - char buffer[32]; switch (type) { case STATUSTYPE_INFO: @@ -88,8 +87,8 @@ static int linear_status(struct dm_targe break; case STATUSTYPE_TABLE: - format_dev_t(buffer, lc->dev->bdev->bd_dev); - snprintf(result, maxlen, "%s " SECTOR_FORMAT, buffer, lc->start); + snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name, + lc->start); break; } return 0; diff -pruN ./drivers/md.dm/dm-log.c ./drivers/md/dm-log.c --- ./drivers/md.dm/dm-log.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-log.c 2006-03-17 13:16:38.000000000 +0300 @@ -17,9 +17,6 @@ static spinlock_t _lock = SPIN_LOCK_UNLO int dm_register_dirty_log_type(struct dirty_log_type *type) { - if (!try_module_get(type->module)) - return -EINVAL; - spin_lock(&_lock); type->use_count = 0; list_add(&type->list, &_log_types); @@ -33,11 +30,10 @@ int dm_unregister_dirty_log_type(struct spin_lock(&_lock); if (type->use_count) - DMWARN("Attempt to unregister a log type that is still in use"); - else { + DMWARN("Unregister failed: log type '%s' still in use", + type->name); + else list_del(&type->list); - module_put(type->module); - } spin_unlock(&_lock); @@ -51,6 +47,10 @@ static struct dirty_log_type *get_type(c spin_lock(&_lock); list_for_each_entry (type, &_log_types, list) if (!strcmp(type_name, type->name)) { + if (!type->use_count && !try_module_get(type->module)){ + spin_unlock(&_lock); + return NULL; + } type->use_count++; spin_unlock(&_lock); return type; @@ -63,7 +63,8 @@ static struct dirty_log_type *get_type(c static void put_type(struct dirty_log_type *type) { spin_lock(&_lock); - type->use_count--; + if (!--type->use_count) + module_put(type->module); 
spin_unlock(&_lock); } @@ -112,7 +113,7 @@ void dm_destroy_dirty_log(struct dirty_l /* * The on-disk version of the metadata. */ -#define MIRROR_DISK_VERSION 1 +#define MIRROR_DISK_VERSION 2 #define LOG_OFFSET 2 struct log_header { @@ -129,20 +130,32 @@ struct log_header { struct log_c { struct dm_target *ti; int touched; - sector_t region_size; + uint32_t region_size; unsigned int region_count; region_t sync_count; unsigned bitset_uint32_count; uint32_t *clean_bits; uint32_t *sync_bits; - uint32_t *recovering_bits; /* FIXME: this seems excessive */ + uint32_t *recovering_bits; int sync_search; + /* Resync flag */ + enum sync { + DEFAULTSYNC, /* Synchronize if necessary */ + NOSYNC, /* Devices known to be already in sync */ + FORCESYNC, /* Force a sync to happen */ + } sync; + + int failure_response; + /* * Disk log fields */ + int log_dev_failed; + atomic_t suspended; + struct completion failure_completion; struct dm_dev *log_dev; struct log_header header; @@ -150,7 +163,6 @@ struct log_c { struct log_header *disk_header; struct io_region bits_location; - uint32_t *disk_bits; }; /* @@ -159,20 +171,20 @@ struct log_c { */ static inline int log_test_bit(uint32_t *bs, unsigned bit) { - return test_bit(bit, (unsigned long *) bs) ? 1 : 0; + return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; } static inline void log_set_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - set_bit(bit, (unsigned long *) bs); + ext2_set_bit(bit, (unsigned long *) bs); l->touched = 1; } static inline void log_clear_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - clear_bit(bit, (unsigned long *) bs); + ext2_clear_bit(bit, (unsigned long *) bs); l->touched = 1; } @@ -205,12 +217,19 @@ static int read_header(struct log_c *log header_from_disk(&log->header, log->disk_header); - if (log->header.magic != MIRROR_MAGIC) { + /* New log required? */ + if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { log->header.magic = MIRROR_MAGIC; log->header.version = MIRROR_DISK_VERSION; log->header.nr_regions = 0; } + /* Version 2 is like version 1 but always little endian on disk. 
*/ +#ifdef __LITTLE_ENDIAN + if (log->header.version == 1) + log->header.version = 2; +#endif + if (log->header.version != MIRROR_DISK_VERSION) { DMWARN("incompatible disk log version"); return -EINVAL; @@ -231,70 +250,69 @@ static inline int write_header(struct lo /*---------------------------------------------------------------- * Bits IO *--------------------------------------------------------------*/ -static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count) -{ - unsigned i; - - for (i = 0; i < count; i++) - core[i] = le32_to_cpu(disk[i]); -} - -static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count) -{ - unsigned i; - - /* copy across the clean/dirty bitset */ - for (i = 0; i < count; i++) - disk[i] = cpu_to_le32(core[i]); -} - static int read_bits(struct log_c *log) { int r; unsigned long ebits; r = dm_io_sync_vm(1, &log->bits_location, READ, - log->disk_bits, &ebits); + log->clean_bits, &ebits); if (r) return r; - bits_to_core(log->clean_bits, log->disk_bits, - log->bitset_uint32_count); return 0; } static int write_bits(struct log_c *log) { unsigned long ebits; - bits_to_disk(log->clean_bits, log->disk_bits, - log->bitset_uint32_count); return dm_io_sync_vm(1, &log->bits_location, WRITE, - log->disk_bits, &ebits); + log->clean_bits, &ebits); } /*---------------------------------------------------------------- - * constructor/destructor + * core log constructor/destructor + * + * argv contains: [[no]sync] [block_on_error] *--------------------------------------------------------------*/ #define BYTE_SHIFT 3 static int core_ctr(struct dirty_log *log, struct dm_target *ti, unsigned int argc, char **argv) { + enum sync sync = DEFAULTSYNC; + int failure_response = DMLOG_IOERR_IGNORE; + struct log_c *lc; - sector_t region_size; + uint32_t region_size; unsigned int region_count; size_t bitset_size; + unsigned i; - if (argc != 1) { - DMWARN("wrong number of arguments to log_c"); + if (argc < 1 || argc > 3) { + DMWARN("wrong number of arguments to mirror log"); return -EINVAL; } - if (sscanf(argv[0], SECTOR_FORMAT, ®ion_size) != 1) { + for (i = 1; i < argc; i++) { + if (!strcmp(argv[i], "sync")) + sync = FORCESYNC; + else if (!strcmp(argv[i], "nosync")) + sync = NOSYNC; + else if (!strcmp(argv[i], "block_on_error")) + failure_response = DMLOG_IOERR_BLOCK; + else { + DMWARN("unrecognised sync argument to mirror log: %s", + argv[i]); + return -EINVAL; + } + } + + if (sscanf(argv[0], "%u", ®ion_size) != 1) { DMWARN("invalid region size string"); return -EINVAL; } - region_count = dm_div_up(ti->len, region_size); + region_count = dm_sector_div_up(ti->len, region_size); lc = kmalloc(sizeof(*lc), GFP_KERNEL); if (!lc) { @@ -306,12 +324,14 @@ static int core_ctr(struct dirty_log *lo lc->touched = 0; lc->region_size = region_size; lc->region_count = region_count; + lc->sync = sync; + lc->failure_response = failure_response; /* - * Work out how many words we need to hold the bitset. + * Work out how many "unsigned long"s we need to hold the bitset. */ bitset_size = dm_round_up(region_count, - sizeof(*lc->clean_bits) << BYTE_SHIFT); + sizeof(unsigned long) << BYTE_SHIFT); bitset_size >>= BYTE_SHIFT; lc->bitset_uint32_count = bitset_size / 4; @@ -330,12 +350,12 @@ static int core_ctr(struct dirty_log *lo kfree(lc); return -ENOMEM; } - memset(lc->sync_bits, 0, bitset_size); - lc->sync_count = 0; + memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); + lc->sync_count = (sync == NOSYNC) ? 
region_count : 0; lc->recovering_bits = vmalloc(bitset_size); if (!lc->recovering_bits) { - DMWARN("couldn't allocate sync bitset"); + DMWARN("couldn't allocate recovering bitset"); vfree(lc->sync_bits); vfree(lc->clean_bits); kfree(lc); @@ -356,6 +376,11 @@ static void core_dtr(struct dirty_log *l kfree(lc); } +/*---------------------------------------------------------------- + * disk log constructor/destructor + * + * argv contains log_device region_size followed optionally by [no]sync + *--------------------------------------------------------------*/ static int disk_ctr(struct dirty_log *log, struct dm_target *ti, unsigned int argc, char **argv) { @@ -364,8 +389,8 @@ static int disk_ctr(struct dirty_log *lo struct log_c *lc; struct dm_dev *dev; - if (argc != 2) { - DMWARN("wrong number of arguments to log_d"); + if (argc < 2 || argc > 3) { + DMWARN("wrong number of arguments to disk mirror log"); return -EINVAL; } @@ -382,6 +407,8 @@ static int disk_ctr(struct dirty_log *lo lc = (struct log_c *) log->context; lc->log_dev = dev; + lc->log_dev_failed = 0; + init_completion(&lc->failure_completion); /* setup the disk header fields */ lc->header_location.bdev = lc->log_dev->bdev; @@ -403,11 +430,6 @@ static int disk_ctr(struct dirty_log *lo size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t), 1 << SECTOR_SHIFT); lc->bits_location.count = size >> SECTOR_SHIFT; - lc->disk_bits = vmalloc(size); - if (!lc->disk_bits) { - vfree(lc->disk_header); - goto bad; - } return 0; bad: @@ -421,7 +443,6 @@ static void disk_dtr(struct dirty_log *l struct log_c *lc = (struct log_c *) log->context; dm_put_device(lc->ti, lc->log_dev); vfree(lc->disk_header); - vfree(lc->disk_bits); core_dtr(log); } @@ -435,42 +456,65 @@ static int count_bits32(uint32_t *addr, return count; } +static void fail_log_device(struct log_c *lc) +{ + lc->log_dev_failed = 1; + if (lc->failure_response == DMLOG_IOERR_BLOCK) + dm_table_event(lc->ti->table); +} + +static void restore_log_device(struct log_c *lc) +{ + lc->log_dev_failed = 0; +} + static int disk_resume(struct dirty_log *log) { - int r; + int r = 0; unsigned i; struct log_c *lc = (struct log_c *) log->context; size_t size = lc->bitset_uint32_count * sizeof(uint32_t); - /* read the disk header */ - r = read_header(lc); - if (r) - return r; - - /* read the bits */ - r = read_bits(lc); - if (r) - return r; - - /* zero any new bits if the mirror has grown */ - for (i = lc->header.nr_regions; i < lc->region_count; i++) - /* FIXME: amazingly inefficient */ - log_clear_bit(lc, lc->clean_bits, i); + /* + * Read the disk header, but only if we know it is good. + * Assume the worst in the event of failure. + */ + if (!lc->log_dev_failed && + ((r = read_header(lc)) || read_bits(lc))) { + DMWARN("Read %s failed on mirror log device, %s.", + r ? 
"header" : "bits", lc->log_dev->name); + fail_log_device(lc); + lc->header.nr_regions = 0; + } + + /* set or clear any new bits */ + if (lc->sync == NOSYNC) + for (i = lc->header.nr_regions; i < lc->region_count; i++) + /* FIXME: amazingly inefficient */ + log_set_bit(lc, lc->clean_bits, i); + else + for (i = lc->header.nr_regions; i < lc->region_count; i++) + /* FIXME: amazingly inefficient */ + log_clear_bit(lc, lc->clean_bits, i); /* copy clean across to sync */ memcpy(lc->sync_bits, lc->clean_bits, size); lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); - /* write the bits */ - r = write_bits(lc); - if (r) - return r; - /* set the correct number of regions in the header */ lc->header.nr_regions = lc->region_count; - /* write the new header */ - return write_header(lc); + /* write out the log. 'i' tells us which has failed if any */ + i = 1; + if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) { + DMWARN("Write %s failed on mirror log device, %s.", + i ? "bits" : "header", lc->log_dev->name); + fail_log_device(lc); + } else + restore_log_device(lc); + + atomic_set(&lc->suspended, 0); + return r; } static sector_t core_get_region_size(struct dirty_log *log) @@ -497,6 +541,17 @@ static int core_flush(struct dirty_log * return 0; } +static int disk_presuspend(struct dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + atomic_set(&lc->suspended, 1); + if (lc->log_dev_failed && (lc->failure_response == DMLOG_IOERR_BLOCK)) + complete(&lc->failure_completion); + + return 0; +} + static int disk_flush(struct dirty_log *log) { int r; @@ -506,9 +561,24 @@ static int disk_flush(struct dirty_log * if (!lc->touched) return 0; + /* + * If a failure occurs, we must wait for a suspension. + * We must not proceed in the event of a failure, + * because if the machine reboots with the log + * incorrect, recovery could be compromised + */ r = write_bits(lc); - if (!r) + if (!r) { lc->touched = 0; + restore_log_device(lc); + } else { + DMERR("Write failure on mirror log device, %s.", + lc->log_dev->name); + fail_log_device(lc); + if (!atomic_read(&lc->suspended) && + (lc->failure_response == DMLOG_IOERR_BLOCK)) + wait_for_completion(&lc->failure_completion); + } return r; } @@ -538,7 +608,7 @@ static int core_get_resync_work(struct d lc->sync_search); lc->sync_search = *region + 1; - if (*region == lc->region_count) + if (*region >= lc->region_count) return 0; } while (log_test_bit(lc->recovering_bits, *region)); @@ -566,6 +636,60 @@ static region_t core_get_sync_count(stru return lc->sync_count; } +#define DMEMIT_SYNC \ + if (lc->sync != DEFAULTSYNC) \ + DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") + +static int core_status(struct dirty_log *log, status_type_t status, + char *result, unsigned int maxlen) +{ + int sz = 0; + struct log_c *lc = log->context; + + switch(status) { + case STATUSTYPE_INFO: + DMEMIT("1 core"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %u %u ", log->type->name, + lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); + DMEMIT_SYNC; + } + + return sz; +} + +static int disk_status(struct dirty_log *log, status_type_t status, + char *result, unsigned int maxlen) +{ + int sz = 0; + struct log_c *lc = log->context; + + switch(status) { + case STATUSTYPE_INFO: + DMEMIT("3 disk %s %c", lc->log_dev->name, + lc->log_dev_failed ? 'D' : 'A'); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %u %s %u ", log->type->name, + lc->sync == DEFAULTSYNC ? 
2 : 3, + lc->log_dev->name, + lc->region_size); + DMEMIT_SYNC; + } + + return sz; +} + +static int core_get_failure_response(struct dirty_log *log) +{ + struct log_c *lc = log->context; + + return lc->failure_response; +} + static struct dirty_log_type _core_type = { .name = "core", .module = THIS_MODULE, @@ -579,7 +703,9 @@ static struct dirty_log_type _core_type .clear_region = core_clear_region, .get_resync_work = core_get_resync_work, .complete_resync_work = core_complete_resync_work, - .get_sync_count = core_get_sync_count + .get_sync_count = core_get_sync_count, + .status = core_status, + .get_failure_response = core_get_failure_response, }; static struct dirty_log_type _disk_type = { @@ -587,7 +713,8 @@ static struct dirty_log_type _disk_type .module = THIS_MODULE, .ctr = disk_ctr, .dtr = disk_dtr, - .suspend = disk_flush, + .presuspend = disk_presuspend, + .postsuspend = disk_flush, .resume = disk_resume, .get_region_size = core_get_region_size, .is_clean = core_is_clean, @@ -597,7 +724,9 @@ static struct dirty_log_type _disk_type .clear_region = core_clear_region, .get_resync_work = core_get_resync_work, .complete_resync_work = core_complete_resync_work, - .get_sync_count = core_get_sync_count + .get_sync_count = core_get_sync_count, + .status = disk_status, + .get_failure_response = core_get_failure_response, }; int __init dm_dirty_log_init(void) diff -pruN ./drivers/md.dm/dm-log.h ./drivers/md/dm-log.h --- ./drivers/md.dm/dm-log.h 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-log.h 2006-03-17 13:16:38.000000000 +0300 @@ -9,6 +9,15 @@ #include "dm.h" +/* + * Values returned by get_failure_response() + * DMLOG_IOERR_IGNORE: ignore device failures + * DMLOG_IOERR_BLOCK: issue dm event, and do not complete + * I/O until presuspend is received. + */ +#define DMLOG_IOERR_IGNORE 0 +#define DMLOG_IOERR_BLOCK 1 + typedef sector_t region_t; struct dirty_log_type; @@ -32,7 +41,8 @@ struct dirty_log_type { * There are times when we don't want the log to touch * the disk. */ - int (*suspend)(struct dirty_log *log); + int (*presuspend)(struct dirty_log *log); + int (*postsuspend)(struct dirty_log *log); int (*resume)(struct dirty_log *log); /* @@ -48,6 +58,16 @@ struct dirty_log_type { int (*is_clean)(struct dirty_log *log, region_t region); /* + * Returns: 0, 1 + * + * This is necessary for cluster mirroring. It provides + * a way to detect recovery on another node, so we + * aren't writing concurrently. This function is likely + * to block (when a cluster log is used). + */ + int (*is_remote_recovering)(struct dirty_log *log, region_t region); + + /* * Returns: 0, 1, -EWOULDBLOCK, < 0 * * A predicate function to check the area given by @@ -101,6 +121,18 @@ struct dirty_log_type { * Returns the number of regions that are in sync. */ region_t (*get_sync_count)(struct dirty_log *log); + + /* + * Support function for mirror status requests. + */ + int (*status)(struct dirty_log *log, status_type_t status_type, + char *result, unsigned int maxlen); + + /* + * Return the code describing what to do in the event + * of a device failure. + */ + int (*get_failure_response)(struct dirty_log *log); }; int dm_register_dirty_log_type(struct dirty_log_type *type); diff -pruN ./drivers/md.dm/dm-mpath.c ./drivers/md/dm-mpath.c --- ./drivers/md.dm/dm-mpath.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-mpath.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,1342 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-path-selector.h" +#include "dm-hw-handler.h" +#include "dm-bio-list.h" +#include "dm-bio-record.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MESG_STR(x) x, sizeof(x) + +/* Path properties */ +struct pgpath { + struct list_head list; + + struct priority_group *pg; /* Owning PG */ + unsigned fail_count; /* Cumulative failure count */ + + struct path path; +}; + +#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) + +/* + * Paths are grouped into Priority Groups and numbered from 1 upwards. + * Each has a path selector which controls which path gets used. + */ +struct priority_group { + struct list_head list; + + struct multipath *m; /* Owning multipath instance */ + struct path_selector ps; + + unsigned pg_num; /* Reference number */ + unsigned bypassed; /* Temporarily bypass this PG? */ + + unsigned nr_pgpaths; /* Number of paths in PG */ + struct list_head pgpaths; +}; + +/* Multipath context */ +struct multipath { + struct list_head list; + struct dm_target *ti; + + spinlock_t lock; + + struct hw_handler hw_handler; + unsigned nr_priority_groups; + struct list_head priority_groups; + unsigned pg_init_required; /* pg_init needs calling? */ + unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ + + unsigned nr_valid_paths; /* Total number of usable paths */ + struct pgpath *current_pgpath; + struct priority_group *current_pg; + struct priority_group *next_pg; /* Switch to this PG if set */ + unsigned repeat_count; /* I/Os left before calling PS again */ + + unsigned queue_io; /* Must we queue all I/O? */ + unsigned queue_if_no_path; /* Queue I/O if last path fails? */ + unsigned saved_queue_if_no_path;/* Saved state during suspension */ + + struct work_struct process_queued_ios; + struct bio_list queued_ios; + unsigned queue_size; + + struct work_struct trigger_event; + + /* + * We must use a mempool of mpath_io structs so that we + * can resubmit bios on error. + */ + mempool_t *mpio_pool; +}; + +/* + * Context information attached to each bio we process. 
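+ * The pgpath field records which path the bio was sent down, and details is
+ * a dm_bio_record() snapshot taken in multipath_map() before the bio is
+ * remapped, so that do_end_io() can dm_bio_restore() the bio and queue it
+ * for another attempt on a different path.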
+ */ +struct mpath_io { + struct pgpath *pgpath; + struct dm_bio_details details; +}; + +typedef int (*action_fn) (struct pgpath *pgpath); + +#define MIN_IOS 256 /* Mempool size */ + +static kmem_cache_t *_mpio_cache; + +struct workqueue_struct *kmultipathd; +static void process_queued_ios(void *data); +static void trigger_event(void *data); + + +/*----------------------------------------------- + * Allocation routines + *-----------------------------------------------*/ + +static struct pgpath *alloc_pgpath(void) +{ + struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); + + if (pgpath) { + memset(pgpath, 0, sizeof(*pgpath)); + pgpath->path.is_active = 1; + } + + return pgpath; +} + +static inline void free_pgpath(struct pgpath *pgpath) +{ + kfree(pgpath); +} + +static struct priority_group *alloc_priority_group(void) +{ + struct priority_group *pg; + + pg = kmalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + memset(pg, 0, sizeof(*pg)); + INIT_LIST_HEAD(&pg->pgpaths); + + return pg; +} + +static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) +{ + struct pgpath *pgpath, *tmp; + + list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { + list_del(&pgpath->list); + dm_put_device(ti, pgpath->path.dev); + free_pgpath(pgpath); + } +} + +static void free_priority_group(struct priority_group *pg, + struct dm_target *ti) +{ + struct path_selector *ps = &pg->ps; + + if (ps->type) { + ps->type->destroy(ps); + dm_put_path_selector(ps->type); + } + + free_pgpaths(&pg->pgpaths, ti); + kfree(pg); +} + +static struct multipath *alloc_multipath(void) +{ + struct multipath *m; + + m = kmalloc(sizeof(*m), GFP_KERNEL); + if (m) { + memset(m, 0, sizeof(*m)); + INIT_LIST_HEAD(&m->priority_groups); + spin_lock_init(&m->lock); + m->queue_io = 1; + INIT_WORK(&m->process_queued_ios, process_queued_ios, m); + INIT_WORK(&m->trigger_event, trigger_event, m); + m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, + mempool_free_slab, _mpio_cache); + if (!m->mpio_pool) { + kfree(m); + return NULL; + } + } + + return m; +} + +static void free_multipath(struct multipath *m) +{ + struct priority_group *pg, *tmp; + struct hw_handler *hwh = &m->hw_handler; + + list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { + list_del(&pg->list); + free_priority_group(pg, m->ti); + } + + if (hwh->type) { + hwh->type->destroy(hwh); + dm_put_hw_handler(hwh->type); + } + + mempool_destroy(m->mpio_pool); + kfree(m); +} + + +/*----------------------------------------------- + * Path selection + *-----------------------------------------------*/ + +static void __switch_pg(struct multipath *m, struct pgpath *pgpath) +{ + struct hw_handler *hwh = &m->hw_handler; + + m->current_pg = pgpath->pg; + + /* Must we initialise the PG first, and queue I/O till it's ready? */ + if (hwh->type && hwh->type->pg_init) { + m->pg_init_required = 1; + m->queue_io = 1; + } else { + m->pg_init_required = 0; + m->queue_io = 0; + } +} + +static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) +{ + struct path *path; + + path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); + if (!path) + return -ENXIO; + + m->current_pgpath = path_to_pgpath(path); + + if (m->current_pg != pg) + __switch_pg(m, m->current_pgpath); + + return 0; +} + +static void __choose_pgpath(struct multipath *m) +{ + struct priority_group *pg; + unsigned bypassed = 1; + + if (!m->nr_valid_paths) + goto failed; + + /* Were we instructed to switch PG? 
*/ + if (m->next_pg) { + pg = m->next_pg; + m->next_pg = NULL; + if (!__choose_path_in_pg(m, pg)) + return; + } + + /* Don't change PG until it has no remaining paths */ + if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) + return; + + /* + * Loop through priority groups until we find a valid path. + * First time we skip PGs marked 'bypassed'. + * Second time we only try the ones we skipped. + */ + do { + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed == bypassed) + continue; + if (!__choose_path_in_pg(m, pg)) + return; + } + } while (bypassed--); + +failed: + m->current_pgpath = NULL; + m->current_pg = NULL; +} + +static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, + unsigned was_queued) +{ + int r = 1; + unsigned long flags; + struct pgpath *pgpath; + + spin_lock_irqsave(&m->lock, flags); + + /* Do we need to select a new pgpath? */ + if (!m->current_pgpath || + (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) + __choose_pgpath(m); + + pgpath = m->current_pgpath; + + if (was_queued) + m->queue_size--; + + if ((pgpath && m->queue_io) || + (!pgpath && m->queue_if_no_path)) { + /* Queue for the daemon to resubmit */ + bio_list_add(&m->queued_ios, bio); + m->queue_size++; + if ((m->pg_init_required && !m->pg_init_in_progress) || + !m->queue_io) + queue_work(kmultipathd, &m->process_queued_ios); + pgpath = NULL; + r = 0; + } else if (!pgpath) + r = -EIO; /* Failed */ + else + bio->bi_bdev = pgpath->path.dev->bdev; + + mpio->pgpath = pgpath; + + spin_unlock_irqrestore(&m->lock, flags); + + return r; +} + +/* + * If we run out of usable paths, should we queue I/O or error it? + */ +static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, + unsigned save_old_value) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + if (save_old_value) + m->saved_queue_if_no_path = m->queue_if_no_path; + else + m->saved_queue_if_no_path = queue_if_no_path; + m->queue_if_no_path = queue_if_no_path; + if (!m->queue_if_no_path && m->queue_size) + queue_work(kmultipathd, &m->process_queued_ios); + + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +/*----------------------------------------------------------------- + * The multipath daemon is responsible for resubmitting queued ios. 
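+ * process_queued_ios() runs on the kmultipathd workqueue: it selects a
+ * current pgpath if none is set, issues the hardware handler's pg_init if
+ * one is still required, and once I/O no longer has to be held it calls
+ * dispatch_queued_ios(), which remaps each held bio with map_io() and then
+ * either resubmits it with generic_make_request() or ends it with an error.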
+ *---------------------------------------------------------------*/ + +static void dispatch_queued_ios(struct multipath *m) +{ + int r; + unsigned long flags; + struct bio *bio = NULL, *next; + struct mpath_io *mpio; + union map_info *info; + + spin_lock_irqsave(&m->lock, flags); + bio = bio_list_get(&m->queued_ios); + spin_unlock_irqrestore(&m->lock, flags); + + while (bio) { + next = bio->bi_next; + bio->bi_next = NULL; + + info = dm_get_mapinfo(bio); + mpio = info->ptr; + + r = map_io(m, bio, mpio, 1); + if (r < 0) + bio_endio(bio, bio->bi_size, r); + else if (r == 1) + generic_make_request(bio); + + bio = next; + } +} + +static void process_queued_ios(void *data) +{ + struct multipath *m = (struct multipath *) data; + struct hw_handler *hwh = &m->hw_handler; + struct pgpath *pgpath = NULL; + unsigned init_required = 0, must_queue = 1; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + if (!m->queue_size) + goto out; + + if (!m->current_pgpath) + __choose_pgpath(m); + + pgpath = m->current_pgpath; + + if ((pgpath && !m->queue_io) || + (!pgpath && !m->queue_if_no_path)) + must_queue = 0; + + if (m->pg_init_required && !m->pg_init_in_progress) { + m->pg_init_required = 0; + m->pg_init_in_progress = 1; + init_required = 1; + } + +out: + spin_unlock_irqrestore(&m->lock, flags); + + if (init_required) + hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path); + + if (!must_queue) + dispatch_queued_ios(m); +} + +/* + * An event is triggered whenever a path is taken out of use. + * Includes path failure and PG bypass. + */ +static void trigger_event(void *data) +{ + struct multipath *m = (struct multipath *) data; + + dm_table_event(m->ti->table); +} + +/*----------------------------------------------------------------- + * Constructor/argument parsing: + * <#multipath feature args> []* + * <#hw_handler args> [hw_handler []*] + * <#priority groups> + * + * [ <#selector args> []* + * <#paths> <#per-path selector args> + * [ []* ]+ ]+ + *---------------------------------------------------------------*/ +struct param { + unsigned min; + unsigned max; + char *error; +}; + +#define ESTR(s) ("dm-multipath: " s) + +static int read_param(struct param *param, char *str, unsigned *v, char **error) +{ + if (!str || + (sscanf(str, "%u", v) != 1) || + (*v < param->min) || + (*v > param->max)) { + *error = param->error; + return -EINVAL; + } + + return 0; +} + +struct arg_set { + unsigned argc; + char **argv; +}; + +static char *shift(struct arg_set *as) +{ + char *r; + + if (as->argc) { + as->argc--; + r = *as->argv; + as->argv++; + return r; + } + + return NULL; +} + +static void consume(struct arg_set *as, unsigned n) +{ + BUG_ON (as->argc < n); + as->argc -= n; + as->argv += n; +} + +static int parse_path_selector(struct arg_set *as, struct priority_group *pg, + struct dm_target *ti) +{ + int r; + struct path_selector_type *pst; + unsigned ps_argc; + + static struct param _params[] = { + {0, 1024, ESTR("invalid number of path selector args")}, + }; + + pst = dm_get_path_selector(shift(as)); + if (!pst) { + ti->error = ESTR("unknown path selector type"); + return -EINVAL; + } + + r = read_param(_params, shift(as), &ps_argc, &ti->error); + if (r) + return -EINVAL; + + r = pst->create(&pg->ps, ps_argc, as->argv); + if (r) { + dm_put_path_selector(pst); + ti->error = ESTR("path selector constructor failed"); + return r; + } + + pg->ps.type = pst; + consume(as, ps_argc); + + return 0; +} + +static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, + struct dm_target 
*ti) +{ + int r; + struct pgpath *p; + + /* we need at least a path arg */ + if (as->argc < 1) { + ti->error = ESTR("no device given"); + return NULL; + } + + p = alloc_pgpath(); + if (!p) + return NULL; + + r = dm_get_device(ti, shift(as), ti->begin, ti->len, + dm_table_get_mode(ti->table), &p->path.dev); + if (r) { + ti->error = ESTR("error getting device"); + goto bad; + } + + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); + if (r) { + dm_put_device(ti, p->path.dev); + goto bad; + } + + return p; + + bad: + free_pgpath(p); + return NULL; +} + +static struct priority_group *parse_priority_group(struct arg_set *as, + struct multipath *m, + struct dm_target *ti) +{ + static struct param _params[] = { + {1, 1024, ESTR("invalid number of paths")}, + {0, 1024, ESTR("invalid number of selector args")} + }; + + int r; + unsigned i, nr_selector_args, nr_params; + struct priority_group *pg; + + if (as->argc < 2) { + as->argc = 0; + ti->error = ESTR("not enough priority group arguments"); + return NULL; + } + + pg = alloc_priority_group(); + if (!pg) { + ti->error = ESTR("couldn't allocate priority group"); + return NULL; + } + pg->m = m; + + r = parse_path_selector(as, pg, ti); + if (r) + goto bad; + + /* + * read the paths + */ + r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); + if (r) + goto bad; + + r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); + if (r) + goto bad; + + nr_params = 1 + nr_selector_args; + for (i = 0; i < pg->nr_pgpaths; i++) { + struct pgpath *pgpath; + struct arg_set path_args; + + if (as->argc < nr_params) + goto bad; + + path_args.argc = nr_params; + path_args.argv = as->argv; + + pgpath = parse_path(&path_args, &pg->ps, ti); + if (!pgpath) + goto bad; + + pgpath->pg = pg; + list_add_tail(&pgpath->list, &pg->pgpaths); + consume(as, nr_params); + } + + return pg; + + bad: + free_priority_group(pg, ti); + return NULL; +} + +static int parse_hw_handler(struct arg_set *as, struct multipath *m, + struct dm_target *ti) +{ + int r; + struct hw_handler_type *hwht; + unsigned hw_argc; + + static struct param _params[] = { + {0, 1024, ESTR("invalid number of hardware handler args")}, + }; + + r = read_param(_params, shift(as), &hw_argc, &ti->error); + if (r) + return -EINVAL; + + if (!hw_argc) + return 0; + + hwht = dm_get_hw_handler(shift(as)); + if (!hwht) { + ti->error = ESTR("unknown hardware handler type"); + return -EINVAL; + } + + r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); + if (r) { + dm_put_hw_handler(hwht); + ti->error = ESTR("hardware handler constructor failed"); + return r; + } + + m->hw_handler.type = hwht; + consume(as, hw_argc - 1); + + return 0; +} + +static int parse_features(struct arg_set *as, struct multipath *m, + struct dm_target *ti) +{ + int r; + unsigned argc; + + static struct param _params[] = { + {0, 1, ESTR("invalid number of feature args")}, + }; + + r = read_param(_params, shift(as), &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + if (!strnicmp(shift(as), MESG_STR("queue_if_no_path"))) + return queue_if_no_path(m, 1, 0); + else { + ti->error = "Unrecognised multipath feature request"; + return -EINVAL; + } +} + +static int multipath_ctr(struct dm_target *ti, unsigned int argc, + char **argv) +{ + /* target parameters */ + static struct param _params[] = { + {1, 1024, ESTR("invalid number of priority groups")}, + {1, 1024, ESTR("invalid initial priority group number")}, + }; + + int r; + struct multipath *m; + struct arg_set as; + unsigned pg_count
= 0; + unsigned next_pg_num; + + as.argc = argc; + as.argv = argv; + + m = alloc_multipath(); + if (!m) { + ti->error = ESTR("can't allocate multipath"); + return -EINVAL; + } + + r = parse_features(&as, m, ti); + if (r) + goto bad; + + r = parse_hw_handler(&as, m, ti); + if (r) + goto bad; + + r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); + if (r) + goto bad; + + r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); + if (r) + goto bad; + + /* parse the priority groups */ + while (as.argc) { + struct priority_group *pg; + + pg = parse_priority_group(&as, m, ti); + if (!pg) { + r = -EINVAL; + goto bad; + } + + m->nr_valid_paths += pg->nr_pgpaths; + list_add_tail(&pg->list, &m->priority_groups); + pg_count++; + pg->pg_num = pg_count; + if (!--next_pg_num) + m->next_pg = pg; + } + + if (pg_count != m->nr_priority_groups) { + ti->error = ESTR("priority group count mismatch"); + r = -EINVAL; + goto bad; + } + + ti->private = m; + m->ti = ti; + + return 0; + + bad: + free_multipath(m); + return r; +} + +static void multipath_dtr(struct dm_target *ti) +{ + struct multipath *m = (struct multipath *) ti->private; + + flush_workqueue(kmultipathd); + free_multipath(m); +} + +/* + * Map bios, recording original fields for later in case we have to resubmit + */ +static int multipath_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + int r; + struct mpath_io *mpio; + struct multipath *m = (struct multipath *) ti->private; + + if (bio_barrier(bio)) + return -EOPNOTSUPP; + + mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); + dm_bio_record(&mpio->details, bio); + + map_context->ptr = mpio; + bio->bi_rw |= (1 << BIO_RW_FAILFAST); + r = map_io(m, bio, mpio, 0); + if (r < 0) + mempool_free(mpio, m->mpio_pool); + + return r; +} + +/* + * Take a path out of use. + */ +static int fail_path(struct pgpath *pgpath) +{ + unsigned long flags; + struct multipath *m = pgpath->pg->m; + + spin_lock_irqsave(&m->lock, flags); + + if (!pgpath->path.is_active) + goto out; + + DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); + + pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); + pgpath->path.is_active = 0; + pgpath->fail_count++; + + m->nr_valid_paths--; + + if (pgpath == m->current_pgpath) + m->current_pgpath = NULL; + + queue_work(kmultipathd, &m->trigger_event); + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +/* + * Reinstate a previously-failed path + */ +static int reinstate_path(struct pgpath *pgpath) +{ + int r = 0; + unsigned long flags; + struct multipath *m = pgpath->pg->m; + + spin_lock_irqsave(&m->lock, flags); + + if (pgpath->path.is_active) + goto out; + + if (!pgpath->pg->ps.type) { + DMWARN("Reinstate path not supported by path selector %s", + pgpath->pg->ps.type->name); + r = -EINVAL; + goto out; + } + + r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); + if (r) + goto out; + + pgpath->path.is_active = 1; + + m->current_pgpath = NULL; + if (!m->nr_valid_paths++ && m->queue_size) + queue_work(kmultipathd, &m->process_queued_ios); + + queue_work(kmultipathd, &m->trigger_event); + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return r; +} + +/* + * Fail or reinstate all paths that match the provided struct dm_dev. 
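+ * This is driven from multipath_message(), which resolves the named device
+ * and then applies fail_path() or reinstate_path() to every matching pgpath.
+ * An illustrative invocation (map and device names assumed):
+ *
+ *     dmsetup message mpath0 0 fail_path /dev/sdb
+ *     dmsetup message mpath0 0 reinstate_path /dev/sdb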
+ */ +static int action_dev(struct multipath *m, struct dm_dev *dev, + action_fn action) +{ + int r = 0; + struct pgpath *pgpath; + struct priority_group *pg; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pgpath->path.dev == dev) + r = action(pgpath); + } + } + + return r; +} + +/* + * Temporarily try to avoid having to use the specified PG + */ +static void bypass_pg(struct multipath *m, struct priority_group *pg, + int bypassed) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + pg->bypassed = bypassed; + m->current_pgpath = NULL; + m->current_pg = NULL; + + spin_unlock_irqrestore(&m->lock, flags); + + queue_work(kmultipathd, &m->trigger_event); +} + +/* + * Switch to using the specified PG from the next I/O that gets mapped + */ +static int switch_pg_num(struct multipath *m, const char *pgstr) +{ + struct priority_group *pg; + unsigned pgnum; + unsigned long flags; + + if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || + (pgnum > m->nr_priority_groups)) { + DMWARN("invalid PG number supplied to switch_pg_num"); + return -EINVAL; + } + + spin_lock_irqsave(&m->lock, flags); + list_for_each_entry(pg, &m->priority_groups, list) { + pg->bypassed = 0; + if (--pgnum) + continue; + + m->current_pgpath = NULL; + m->current_pg = NULL; + m->next_pg = pg; + } + spin_unlock_irqrestore(&m->lock, flags); + + queue_work(kmultipathd, &m->trigger_event); + return 0; +} + +/* + * Set/clear bypassed status of a PG. + * PGs are numbered upwards from 1 in the order they were declared. + */ +static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) +{ + struct priority_group *pg; + unsigned pgnum; + + if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || + (pgnum > m->nr_priority_groups)) { + DMWARN("invalid PG number supplied to bypass_pg"); + return -EINVAL; + } + + list_for_each_entry(pg, &m->priority_groups, list) { + if (!--pgnum) + break; + } + + bypass_pg(m, pg, bypassed); + return 0; +} + +/* + * pg_init must call this when it has completed its initialisation + */ +void dm_pg_init_complete(struct path *path, unsigned err_flags) +{ + struct pgpath *pgpath = path_to_pgpath(path); + struct priority_group *pg = pgpath->pg; + struct multipath *m = pg->m; + unsigned long flags; + + /* We insist on failing the path if the PG is already bypassed. 
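+ * err_flags is the result reported by the hardware handler's pg_init:
+ * MP_FAIL_PATH takes the initialised path out of service, MP_BYPASS_PG
+ * sidelines its whole priority group, and any error also clears the current
+ * path and group so the next map_io() has to choose again.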
*/ + if (err_flags && pg->bypassed) + err_flags |= MP_FAIL_PATH; + + if (err_flags & MP_FAIL_PATH) + fail_path(pgpath); + + if (err_flags & MP_BYPASS_PG) + bypass_pg(m, pg, 1); + + spin_lock_irqsave(&m->lock, flags); + if (err_flags) { + m->current_pgpath = NULL; + m->current_pg = NULL; + } else if (!m->pg_init_required) + m->queue_io = 0; + + m->pg_init_in_progress = 0; + queue_work(kmultipathd, &m->process_queued_ios); + spin_unlock_irqrestore(&m->lock, flags); +} + +/* + * end_io handling + */ +static int do_end_io(struct multipath *m, struct bio *bio, + int error, struct mpath_io *mpio) +{ + struct hw_handler *hwh = &m->hw_handler; + unsigned err_flags = MP_FAIL_PATH; /* Default behavior */ + unsigned long flags; + + if (!error) + return 0; /* I/O complete */ + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + return error; + + if (error == -EOPNOTSUPP) + return error; + + spin_lock_irqsave(&m->lock, flags); + if (!m->nr_valid_paths) { + if (!m->queue_if_no_path) { + spin_unlock_irqrestore(&m->lock, flags); + return -EIO; + } else { + spin_unlock_irqrestore(&m->lock, flags); + goto requeue; + } + } + spin_unlock_irqrestore(&m->lock, flags); + + if (hwh->type && hwh->type->error) + err_flags = hwh->type->error(hwh, bio); + else + err_flags = dm_scsi_err_handler(hwh, bio); + + if (mpio->pgpath) { + if (err_flags & MP_FAIL_PATH) + fail_path(mpio->pgpath); + + if (err_flags & MP_BYPASS_PG) + bypass_pg(m, mpio->pgpath->pg, 1); + } + + if (err_flags & MP_ERROR_IO) + return -EIO; + + requeue: + dm_bio_restore(&mpio->details, bio); + + /* queue for the daemon to resubmit or fail */ + spin_lock_irqsave(&m->lock, flags); + bio_list_add(&m->queued_ios, bio); + m->queue_size++; + if (!m->queue_io) + queue_work(kmultipathd, &m->process_queued_ios); + spin_unlock_irqrestore(&m->lock, flags); + + return 1; /* io not complete */ +} + +static int multipath_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct multipath *m = (struct multipath *) ti->private; + struct mpath_io *mpio = (struct mpath_io *) map_context->ptr; + struct pgpath *pgpath = mpio->pgpath; + struct path_selector *ps; + int r; + + r = do_end_io(m, bio, error, mpio); + if (pgpath) { + ps = &pgpath->pg->ps; + if (ps->type->end_io) + ps->type->end_io(ps, &pgpath->path); + } + if (r <= 0) + mempool_free(mpio, m->mpio_pool); + + return r; +} + +/* + * Suspend can't complete until all the I/O is processed so if + * the last path fails we must error any remaining I/O. + * Note that if the freeze_bdev fails while suspending, the + * queue_if_no_path state is lost - userspace should reset it. + */ +static void multipath_presuspend(struct dm_target *ti) +{ + struct multipath *m = (struct multipath *) ti->private; + + queue_if_no_path(m, 0, 1); +} + +/* + * Restore the queue_if_no_path setting. 
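+ * multipath_presuspend() called queue_if_no_path(m, 0, 1), which stashed the
+ * user-configured value in saved_queue_if_no_path before clearing it so that
+ * outstanding I/O could be flushed or errored; resume simply copies the
+ * saved value back under the lock.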
+ */ +static void multipath_resume(struct dm_target *ti) +{ + struct multipath *m = (struct multipath *) ti->private; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + m->queue_if_no_path = m->saved_queue_if_no_path; + spin_unlock_irqrestore(&m->lock, flags); +} + +/* + * Info output has the following format: + * num_multipath_feature_args [multipath_feature_args]* + * num_handler_status_args [handler_status_args]* + * num_groups init_group_number + * [A|D|E num_ps_status_args [ps_status_args]* + * num_paths num_selector_args + * [path_dev A|F fail_count [selector_args]* ]+ ]+ + * + * Table output has the following format (identical to the constructor string): + * num_feature_args [features_args]* + * num_handler_args hw_handler [hw_handler_args]* + * num_groups init_group_number + * [priority selector-name num_ps_args [ps_args]* + * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ + */ +static int multipath_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + int sz = 0; + unsigned long flags; + struct multipath *m = (struct multipath *) ti->private; + struct hw_handler *hwh = &m->hw_handler; + struct priority_group *pg; + struct pgpath *p; + unsigned pg_num; + char state; + + spin_lock_irqsave(&m->lock, flags); + + /* Features */ + if (type == STATUSTYPE_INFO) + DMEMIT("1 %u ", m->queue_size); + else if (m->queue_if_no_path) + DMEMIT("1 queue_if_no_path "); + else + DMEMIT("0 "); + + if (hwh->type && hwh->type->status) + sz += hwh->type->status(hwh, type, result + sz, maxlen - sz); + else if (!hwh->type || type == STATUSTYPE_INFO) + DMEMIT("0 "); + else + DMEMIT("1 %s ", hwh->type->name); + + DMEMIT("%u ", m->nr_priority_groups); + + if (m->next_pg) + pg_num = m->next_pg->pg_num; + else if (m->current_pg) + pg_num = m->current_pg->pg_num; + else + pg_num = 1; + + DMEMIT("%u ", pg_num); + + switch (type) { + case STATUSTYPE_INFO: + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed) + state = 'D'; /* Disabled */ + else if (pg == m->current_pg) + state = 'A'; /* Currently Active */ + else + state = 'E'; /* Enabled */ + + DMEMIT("%c ", state); + + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, NULL, type, + result + sz, + maxlen - sz); + else + DMEMIT("0 "); + + DMEMIT("%u %u ", pg->nr_pgpaths, + pg->ps.type->info_args); + + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT("%s %s %u ", p->path.dev->name, + p->path.is_active ? 
"A" : "F", + p->fail_count); + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, + &p->path, type, result + sz, + maxlen - sz); + } + } + break; + + case STATUSTYPE_TABLE: + list_for_each_entry(pg, &m->priority_groups, list) { + DMEMIT("%s ", pg->ps.type->name); + + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, NULL, type, + result + sz, + maxlen - sz); + else + DMEMIT("0 "); + + DMEMIT("%u %u ", pg->nr_pgpaths, + pg->ps.type->table_args); + + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT("%s ", p->path.dev->name); + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, + &p->path, type, result + sz, + maxlen - sz); + } + } + break; + } + + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) +{ + int r; + struct dm_dev *dev; + struct multipath *m = (struct multipath *) ti->private; + action_fn action; + + if (argc == 1) { + if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) + return queue_if_no_path(m, 1, 0); + else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) + return queue_if_no_path(m, 0, 0); + } + + if (argc != 2) + goto error; + + if (!strnicmp(argv[0], MESG_STR("disable_group"))) + return bypass_pg_num(m, argv[1], 1); + else if (!strnicmp(argv[0], MESG_STR("enable_group"))) + return bypass_pg_num(m, argv[1], 0); + else if (!strnicmp(argv[0], MESG_STR("switch_group"))) + return switch_pg_num(m, argv[1]); + else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) + action = reinstate_path; + else if (!strnicmp(argv[0], MESG_STR("fail_path"))) + action = fail_path; + else + goto error; + + r = dm_get_device(ti, argv[1], ti->begin, ti->len, + dm_table_get_mode(ti->table), &dev); + if (r) { + DMWARN("dm-multipath message: error getting device %s", + argv[1]); + return -EINVAL; + } + + r = action_dev(m, dev, action); + + dm_put_device(ti, dev); + + return r; + +error: + DMWARN("Unrecognised multipath message received."); + return -EINVAL; +} + +/*----------------------------------------------------------------- + * Module setup + *---------------------------------------------------------------*/ +static struct target_type multipath_target = { + .name = "multipath", + .version = {1, 0, 4}, + .module = THIS_MODULE, + .ctr = multipath_ctr, + .dtr = multipath_dtr, + .map = multipath_map, + .end_io = multipath_end_io, + .presuspend = multipath_presuspend, + .resume = multipath_resume, + .status = multipath_status, + .message = multipath_message, +}; + +static int __init dm_multipath_init(void) +{ + int r; + + /* allocate a slab for the dm_ios */ + _mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io), + 0, 0, NULL, NULL); + if (!_mpio_cache) + return -ENOMEM; + + r = dm_register_target(&multipath_target); + if (r < 0) { + DMERR("%s: register failed %d", multipath_target.name, r); + kmem_cache_destroy(_mpio_cache); + return -EINVAL; + } + + kmultipathd = create_workqueue("kmpathd"); + if (!kmultipathd) { + DMERR("%s: failed to create workqueue kmpathd", + multipath_target.name); + dm_unregister_target(&multipath_target); + kmem_cache_destroy(_mpio_cache); + return -ENOMEM; + } + + DMINFO("dm-multipath version %u.%u.%u loaded", + multipath_target.version[0], multipath_target.version[1], + multipath_target.version[2]); + + return r; +} + +static void __exit dm_multipath_exit(void) +{ + int r; + + destroy_workqueue(kmultipathd); + + r = dm_unregister_target(&multipath_target); + if (r < 0) + DMERR("%s: target unregister failed %d", + 
multipath_target.name, r); + kmem_cache_destroy(_mpio_cache); +} + +EXPORT_SYMBOL_GPL(dm_pg_init_complete); + +module_init(dm_multipath_init); +module_exit(dm_multipath_exit); + +MODULE_DESCRIPTION(DM_NAME " multipath target"); +MODULE_AUTHOR("Sistina Software "); +MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/dm-mpath.h ./drivers/md/dm-mpath.h --- ./drivers/md.dm/dm-mpath.h 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-mpath.h 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath. + */ + +#ifndef DM_MPATH_H +#define DM_MPATH_H + +struct dm_dev; + +struct path { + struct dm_dev *dev; /* Read-only */ + unsigned is_active; /* Read-only */ + + void *pscontext; /* For path-selector use */ + void *hwhcontext; /* For hw-handler use */ +}; + +/* Callback for hwh_pg_init_fn to use when complete */ +void dm_pg_init_complete(struct path *path, unsigned err_flags); + +#endif diff -pruN ./drivers/md.dm/dm-path-selector.c ./drivers/md/dm-path-selector.c --- ./drivers/md.dm/dm-path-selector.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-path-selector.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path selector registration. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include + +struct ps_internal { + struct path_selector_type pst; + + struct list_head list; + long use; +}; + +#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) + +static LIST_HEAD(_path_selectors); +static DECLARE_RWSEM(_ps_lock); + +struct ps_internal *__find_path_selector_type(const char *name) +{ + struct ps_internal *psi; + + list_for_each_entry(psi, &_path_selectors, list) { + if (!strcmp(name, psi->pst.name)) + return psi; + } + + return NULL; +} + +static struct ps_internal *get_path_selector(const char *name) +{ + struct ps_internal *psi; + + down_read(&_ps_lock); + psi = __find_path_selector_type(name); + if (psi) { + if ((psi->use == 0) && !try_module_get(psi->pst.module)) + psi = NULL; + else + psi->use++; + } + up_read(&_ps_lock); + + return psi; +} + +struct path_selector_type *dm_get_path_selector(const char *name) +{ + struct ps_internal *psi; + + if (!name) + return NULL; + + psi = get_path_selector(name); + if (!psi) { + request_module("dm-%s", name); + psi = get_path_selector(name); + } + + return psi ? 
&psi->pst : NULL; +} + +void dm_put_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi; + + if (!pst) + return; + + down_read(&_ps_lock); + psi = __find_path_selector_type(pst->name); + if (!psi) + goto out; + + if (--psi->use == 0) + module_put(psi->pst.module); + + if (psi->use < 0) + BUG(); + +out: + up_read(&_ps_lock); +} + +static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL); + + if (psi) { + memset(psi, 0, sizeof(*psi)); + psi->pst = *pst; + } + + return psi; +} + +int dm_register_path_selector(struct path_selector_type *pst) +{ + int r = 0; + struct ps_internal *psi = _alloc_path_selector(pst); + + if (!psi) + return -ENOMEM; + + down_write(&_ps_lock); + + if (__find_path_selector_type(pst->name)) { + kfree(psi); + r = -EEXIST; + } else + list_add(&psi->list, &_path_selectors); + + up_write(&_ps_lock); + + return r; +} + +int dm_unregister_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi; + + down_write(&_ps_lock); + + psi = __find_path_selector_type(pst->name); + if (!psi) { + up_write(&_ps_lock); + return -EINVAL; + } + + if (psi->use) { + up_write(&_ps_lock); + return -ETXTBSY; + } + + list_del(&psi->list); + + up_write(&_ps_lock); + + kfree(psi); + + return 0; +} + +EXPORT_SYMBOL_GPL(dm_register_path_selector); +EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff -pruN ./drivers/md.dm/dm-path-selector.h ./drivers/md/dm-path-selector.h --- ./drivers/md.dm/dm-path-selector.h 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-path-selector.h 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path-Selector registration. + */ + +#ifndef DM_PATH_SELECTOR_H +#define DM_PATH_SELECTOR_H + +#include + +#include "dm-mpath.h" + +/* + * We provide an abstraction for the code that chooses which path + * to send some io down. + */ +struct path_selector_type; +struct path_selector { + struct path_selector_type *type; + void *context; +}; + +/* Information about a path selector type */ +struct path_selector_type { + char *name; + struct module *module; + + unsigned int table_args; + unsigned int info_args; + + /* + * Constructs a path selector object, takes custom arguments + */ + int (*create) (struct path_selector *ps, unsigned argc, char **argv); + void (*destroy) (struct path_selector *ps); + + /* + * Add an opaque path object, along with some selector specific + * path args (eg, path priority). + */ + int (*add_path) (struct path_selector *ps, struct path *path, + int argc, char **argv, char **error); + + /* + * Chooses a path for this io, if no paths are available then + * NULL will be returned. + * + * repeat_count is the number of times to use the path before + * calling the function again. 0 means don't call it again unless + * the path fails. + */ + struct path *(*select_path) (struct path_selector *ps, + unsigned *repeat_count); + + /* + * Notify the selector that a path has failed. + */ + void (*fail_path) (struct path_selector *ps, struct path *p); + + /* + * Ask selector to reinstate a path. 
+ */ + int (*reinstate_path) (struct path_selector *ps, struct path *p); + + /* + * Table content based on parameters added in ps_add_path_fn + * or path selector status + */ + int (*status) (struct path_selector *ps, struct path *path, + status_type_t type, char *result, unsigned int maxlen); + + int (*end_io) (struct path_selector *ps, struct path *path); +}; + +/* Register a path selector */ +int dm_register_path_selector(struct path_selector_type *type); + +/* Unregister a path selector */ +int dm_unregister_path_selector(struct path_selector_type *type); + +/* Returns a registered path selector type */ +struct path_selector_type *dm_get_path_selector(const char *name); + +/* Releases a path selector */ +void dm_put_path_selector(struct path_selector_type *pst); + +#endif diff -pruN ./drivers/md.dm/dm-raid1.c ./drivers/md/dm-raid1.c --- ./drivers/md.dm/dm-raid1.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-raid1.c 2006-03-17 13:16:38.000000000 +0300 @@ -6,6 +6,7 @@ #include "dm.h" #include "dm-bio-list.h" +#include "dm-bio-record.h" #include "dm-io.h" #include "dm-log.h" #include "kcopyd.h" @@ -28,6 +29,8 @@ static inline void wake(void) queue_work(_kmirrord_wq, &_kmirrord_work); } +static struct workqueue_struct *_kmir_mon_wq; + /*----------------------------------------------------------------- * Region hash * @@ -67,7 +70,7 @@ static inline void wake(void) struct mirror_set; struct region_hash { struct mirror_set *ms; - sector_t region_size; + uint32_t region_size; unsigned region_shift; /* holds persistent region state */ @@ -135,7 +138,7 @@ static void region_free(void *element, v #define MIN_REGIONS 64 #define MAX_RECOVERY 1 static int rh_init(struct region_hash *rh, struct mirror_set *ms, - struct dirty_log *log, sector_t region_size, + struct dirty_log *log, uint32_t region_size, region_t nr_regions) { unsigned int nr_buckets, max_buckets; @@ -253,9 +256,9 @@ static struct region *__rh_alloc(struct else { __rh_insert(rh, nreg); if (nreg->state == RH_CLEAN) { - spin_lock_irq(&rh->region_lock); + spin_lock(&rh->region_lock); list_add(&nreg->list, &rh->clean_regions); - spin_unlock_irq(&rh->region_lock); + spin_unlock(&rh->region_lock); } reg = nreg; } @@ -375,16 +378,19 @@ static void rh_inc(struct region_hash *r read_lock(&rh->hash_lock); reg = __rh_find(rh, region); - if (reg->state == RH_CLEAN) { - rh->log->type->mark_region(rh->log, reg->key); - spin_lock_irq(&rh->region_lock); + spin_lock_irq(&rh->region_lock); + atomic_inc(®->pending); + + if (reg->state == RH_CLEAN) { reg->state = RH_DIRTY; list_del_init(®->list); /* take off the clean list */ spin_unlock_irq(&rh->region_lock); - } - atomic_inc(®->pending); + rh->log->type->mark_region(rh->log, reg->key); + } else + spin_unlock_irq(&rh->region_lock); + read_unlock(&rh->hash_lock); } @@ -406,17 +412,17 @@ static void rh_dec(struct region_hash *r reg = __rh_lookup(rh, region); read_unlock(&rh->hash_lock); + spin_lock_irqsave(&rh->region_lock, flags); if (atomic_dec_and_test(®->pending)) { - spin_lock_irqsave(&rh->region_lock, flags); if (reg->state == RH_RECOVERING) { list_add_tail(®->list, &rh->quiesced_regions); } else { reg->state = RH_CLEAN; list_add(®->list, &rh->clean_regions); } - spin_unlock_irqrestore(&rh->region_lock, flags); should_wake = 1; } + spin_unlock_irqrestore(&rh->region_lock, flags); if (should_wake) wake(); @@ -539,7 +545,8 @@ static void rh_start_recovery(struct reg * Mirror set structures. 
*---------------------------------------------------------------*/ struct mirror { - atomic_t error_count; + atomic_t error_count; /* Error counter to flag mirror failure */ + struct mirror_set *ms; struct dm_dev *dev; sector_t offset; }; @@ -550,36 +557,59 @@ struct mirror_set { struct region_hash rh; struct kcopyd_client *kcopyd_client; - spinlock_t lock; /* protects the next two lists */ + spinlock_t lock; /* protects the lists */ struct bio_list reads; struct bio_list writes; + struct bio_list failures; + struct work_struct failure_work; + struct completion failure_completion; /* recovery */ + atomic_t suspended; region_t nr_regions; int in_sync; unsigned int nr_mirrors; - struct mirror mirror[0]; + spinlock_t choose_lock; /* protects select in choose_mirror(). */ + atomic_t read_count; /* Read counter for read balancing. */ + unsigned int read_mirror; /* Last mirror read. */ + struct mirror *default_mirror; /* Default mirror. */ + struct mirror mirror[0]; }; +struct bio_map_info { + struct mirror *bmi_m; + struct dm_bio_details bmi_bd; +}; + +static mempool_t *bio_map_info_pool = NULL; + +static void *bio_map_info_alloc(int gfp_mask, void *pool_data){ + return kmalloc(sizeof(struct bio_map_info), gfp_mask); +} + +static void bio_map_info_free(void *element, void *pool_data){ + kfree(element); +} + /* * Every mirror should look like this one. */ #define DEFAULT_MIRROR 0 /* - * This is yucky. We squirrel the mirror_set struct away inside - * bi_next for write buffers. This is safe since the bh + * This is yucky. We squirrel the mirror struct away inside + * bi_next for read/write buffers. This is safe since the bh * doesn't get submitted to the lower levels of block layer. */ -static struct mirror_set *bio_get_ms(struct bio *bio) +static struct mirror *bio_get_m(struct bio *bio) { - return (struct mirror_set *) bio->bi_next; + return (struct mirror *) bio->bi_next; } -static void bio_set_ms(struct bio *bio, struct mirror_set *ms) +static void bio_set_m(struct bio *bio, struct mirror *m) { - bio->bi_next = (struct bio *) ms; + bio->bi_next = (struct bio *) m; } /*----------------------------------------------------------------- @@ -607,7 +637,7 @@ static int recover(struct mirror_set *ms unsigned long flags = 0; /* fill in the source */ - m = ms->mirror + DEFAULT_MIRROR; + m = ms->default_mirror; from.bdev = m->dev->bdev; from.sector = m->offset + region_to_sector(reg->rh, reg->key); if (reg->key == (ms->nr_regions - 1)) { @@ -623,7 +653,7 @@ static int recover(struct mirror_set *ms /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (i == DEFAULT_MIRROR) + if (&ms->mirror[i] == ms->default_mirror) continue; m = ms->mirror + i; @@ -673,42 +703,163 @@ static void do_recovery(struct mirror_se } /*----------------------------------------------------------------- - * Reads + * Misc Functions *---------------------------------------------------------------*/ -static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) +#define MIN_READS 128 +/* + * choose_mirror + * @ms: the mirror set + * @m: mirror that has failed, or NULL if just choosing + * + * Returns: chosen mirror, or NULL on failure + */ +static struct mirror *choose_mirror(struct mirror_set *ms, struct mirror *m) { - /* FIXME: add read balancing */ - return ms->mirror + DEFAULT_MIRROR; + int i, retry; + unsigned long flags; + struct mirror *ret = NULL; + + spin_lock_irqsave(&ms->choose_lock, flags); + + if (unlikely(m == ms->default_mirror)) { + i = DEFAULT_MIRROR; + 
atomic_set(&ms->read_count, MIN_READS); + } else + i = ms->read_mirror; + + for (retry = 0; retry < ms->nr_mirrors; ) { + i %= ms->nr_mirrors; + ret = ms->mirror + i; + + if (unlikely(atomic_read(&ret->error_count))) { + retry++; + i++; + } else { + /* + * Guarantee that a number of read IOs + * get queued to the same mirror. + */ + if (atomic_dec_and_test(&ms->read_count)) { + atomic_set(&ms->read_count, MIN_READS); + i++; + } + + ms->read_mirror = i; + break; + } + } + + /* Check for failure of default mirror, reset if necessary */ + if (unlikely(m == ms->default_mirror)) + ms->default_mirror = ret; + + spin_unlock_irqrestore(&ms->choose_lock, flags); + + if (unlikely(atomic_read(&ret->error_count))) { + DMERR("All mirror devices are dead. Unable to choose mirror."); + return NULL; + } + + return ret; +} + +static void fail_mirror(struct mirror *m) +{ + DMINFO("incrementing error_count on %s", m->dev->name); + atomic_inc(&m->error_count); + + choose_mirror(m->ms, m); +} + +static int default_ok(struct mirror *m) +{ + return !atomic_read(&m->ms->default_mirror->error_count); } /* * remap a buffer to a particular mirror. */ -static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) +static sector_t map_sector(struct mirror *m, struct bio *bio) +{ + return m->offset + (bio->bi_sector - m->ms->ti->begin); +} + +static void map_bio(struct mirror *m, struct bio *bio) { bio->bi_bdev = m->dev->bdev; - bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); + bio->bi_sector = map_sector(m, bio); +} + +static void map_region(struct io_region *io, struct mirror *m, + struct bio *bio) +{ + io->bdev = m->dev->bdev; + io->sector = map_sector(m, bio); + io->count = bio->bi_size >> 9; +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static void read_callback(unsigned long error, void *context) +{ + struct bio *bio = (struct bio *)context; + struct mirror *m; + + m = bio_get_m(bio); + bio_set_m(bio, NULL); + + if (unlikely(error)) { + DMWARN("A read failure occurred on a mirror device."); + fail_mirror(m); + if (likely(default_ok(m))) { + DMWARN("Trying different device."); + queue_bio(m->ms, bio, bio_rw(bio)); + } else { + DMERR("No other device available, failing I/O."); + bio_endio(bio, 0, -EIO); + } + } else + bio_endio(bio, bio->bi_size, 0); +} + +/* Asynchronous read. */ +static void read_async_bio(struct mirror *m, struct bio *bio) +{ + struct io_region io; + + map_region(&io, m, bio); + bio_set_m(bio, m); + dm_io_async_bvec(1, &io, READ, + bio->bi_io_vec + bio->bi_idx, + read_callback, bio); } static void do_reads(struct mirror_set *ms, struct bio_list *reads) { - region_t region; struct bio *bio; struct mirror *m; while ((bio = bio_list_pop(reads))) { - region = bio_to_region(&ms->rh, bio); - /* * We can only read balance if the region is in sync. 
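 * choose_mirror() walks the legs round-robin, skipping any whose error_count
 * is non-zero, and keeps a batch of MIN_READS consecutive reads on the same
 * leg before moving to the next one; regions that are not yet in sync are
 * still read only from the default mirror.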
*/ - if (rh_in_sync(&ms->rh, region, 0)) - m = choose_mirror(ms, bio->bi_sector); - else - m = ms->mirror + DEFAULT_MIRROR; + if (likely(rh_in_sync(&ms->rh, + bio_to_region(&ms->rh, bio), + 0))) + m = choose_mirror(ms, NULL); + else { + m = ms->default_mirror; + + /* If the default fails, we give up. */ + if (unlikely(m && atomic_read(&m->error_count))) + m = NULL; + } - map_bio(ms, m, bio); - generic_make_request(bio); + if (likely(m)) + read_async_bio(m, bio); + else + bio_endio(bio, 0, -EIO); } } @@ -722,56 +873,116 @@ static void do_reads(struct mirror_set * * RECOVERING: delay the io until recovery completes * NOSYNC: increment pending, just write to the default mirror *---------------------------------------------------------------*/ +static void write_failure_handler(void *data) +{ + struct bio *bio; + struct bio_list failed_writes; + struct mirror_set *ms = (struct mirror_set *)data; + struct dirty_log *log = ms->rh.log; + + if (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK) { + dm_table_event(ms->ti->table); + wait_for_completion(&ms->failure_completion); + } + + /* Take list out to handle endios. */ + spin_lock_irq(&ms->lock); + failed_writes = ms->failures; + bio_list_init(&ms->failures); + spin_unlock_irq(&ms->lock); + + while ((bio = bio_list_pop(&failed_writes))) + bio_endio(bio, bio->bi_size, 0); +} + static void write_callback(unsigned long error, void *context) { - unsigned int i; - int uptodate = 1; + unsigned int i, ret = 0; struct bio *bio = (struct bio *) context; struct mirror_set *ms; - - ms = bio_get_ms(bio); - bio_set_ms(bio, NULL); - + int uptodate = 0, run; + + ms = (bio_get_m(bio))->ms; + bio_set_m(bio, NULL); + /* * NOTE: We don't decrement the pending count here, * instead it is done by the targets endio function. * This way we handle both writes to SYNC and NOSYNC * regions with the same code. */ + if (unlikely(error)) { + DMERR("Error during write occurred."); - if (error) { /* - * only error the io if all mirrors failed. - * FIXME: bogus + * Test all bits - if all failed, fail io. + * Otherwise, go through hassle of failing a device... */ - uptodate = 0; - for (i = 0; i < ms->nr_mirrors; i++) - if (!test_bit(i, &error)) { + for (i = 0; i < ms->nr_mirrors; i++) { + if (test_bit(i, &error)) + fail_mirror(ms->mirror + i); + else uptodate = 1; - break; + } + + if (likely(uptodate)) { + spin_lock(&ms->lock); + if (atomic_read(&ms->suspended)) { + /* + * The device is suspended, it is + * safe to complete I/O. + */ + spin_unlock(&ms->lock); + } else { + /* + * Need to raise event. Since raising + * events can block, we need to do it in + * separate thread. + * + * run gets set if this will be the first + * bio in the list. + */ + run = !ms->failures.head; + bio_list_add(&ms->failures, bio); + spin_unlock(&ms->lock); + + if (run) + queue_work(_kmir_mon_wq, + &ms->failure_work); + + return; } + } else { + DMERR("All replicated volumes dead, failing I/O"); + /* None of the writes succeeded, fail the I/O.
*/ + ret = -EIO; + } } - bio_endio(bio, bio->bi_size, 0); + + bio_endio(bio, bio->bi_size, ret); } static void do_write(struct mirror_set *ms, struct bio *bio) { unsigned int i; - struct io_region io[KCOPYD_MAX_REGIONS+1]; + struct io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; - for (i = 0; i < ms->nr_mirrors; i++) { - m = ms->mirror + i; - - io[i].bdev = m->dev->bdev; - io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); - io[i].count = bio->bi_size >> 9; - } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) + map_region(dest++, m, bio); - bio_set_ms(bio, ms); - dm_io_async_bvec(ms->nr_mirrors, io, WRITE, - bio->bi_io_vec + bio->bi_idx, - write_callback, bio); + if (likely(dest - io)) { + /* + * We can use the default mirror here, because we + * only need it in order to retrieve the reference + * to the mirror set in write_callback(). + */ + bio_set_m(bio, ms->default_mirror); + dm_io_async_bvec(dest - io, io, WRITE, + bio->bi_io_vec + bio->bi_idx, + write_callback, bio); + } else + bio_endio(bio, bio->bi_size, -EIO); } static void do_writes(struct mirror_set *ms, struct bio_list *writes) @@ -779,6 +990,9 @@ static void do_writes(struct mirror_set int state; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dirty_log *log = ms->rh.log; + region_t region; if (!writes->head) return; @@ -789,9 +1003,18 @@ static void do_writes(struct mirror_set bio_list_init(&sync); bio_list_init(&nosync); bio_list_init(&recover); + bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); + region = bio_to_region(&ms->rh, bio); + + if (log->type->is_remote_recovering && + log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = rh_state(&ms->rh, region, 1); switch (state) { case RH_CLEAN: case RH_DIRTY: @@ -810,6 +1033,8 @@ static void do_writes(struct mirror_set bio_list_add(this_list, bio); } + bio_list_merge(writes, &requeue); + /* * Increment the pending counts for any regions that will * be written to (writes to recover regions are going to @@ -829,7 +1054,7 @@ static void do_writes(struct mirror_set rh_delay(&ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); + map_bio(ms->default_mirror, bio); generic_make_request(bio); } } @@ -844,12 +1069,12 @@ static void do_mirror(struct mirror_set { struct bio_list reads, writes; - spin_lock(&ms->lock); + spin_lock_irq(&ms->lock); reads = ms->reads; writes = ms->writes; bio_list_init(&ms->reads); bio_list_init(&ms->writes); - spin_unlock(&ms->lock); + spin_unlock_irq(&ms->lock); rh_update_states(&ms->rh); do_recovery(ms); @@ -871,7 +1096,7 @@ static void do_work(void *ignored) * Target functions *---------------------------------------------------------------*/ static struct mirror_set *alloc_context(unsigned int nr_mirrors, - sector_t region_size, + uint32_t region_size, struct dm_target *ti, struct dirty_log *dl) { @@ -891,11 +1116,16 @@ static struct mirror_set *alloc_context( memset(ms, 0, len); spin_lock_init(&ms->lock); + spin_lock_init(&ms->choose_lock); ms->ti = ti; ms->nr_mirrors = nr_mirrors; - ms->nr_regions = dm_div_up(ti->len, region_size); + ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->in_sync = 0; + ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; + + /* a resume must be issued to start the device */ + atomic_set(&ms->suspended, 1); if (rh_init(&ms->rh, ms, dl, 
region_size, ms->nr_regions)) { ti->error = "dm-mirror: Error creating dirty region hash"; @@ -903,6 +1133,13 @@ static struct mirror_set *alloc_context( return NULL; } + atomic_set(&ms->read_count, MIN_READS); + + bio_list_init(&ms->failures); + INIT_WORK(&ms->failure_work, write_failure_handler, ms); + + init_completion(&ms->failure_completion); + return ms; } @@ -916,7 +1153,7 @@ static void free_context(struct mirror_s kfree(ms); } -static inline int _check_region_size(struct dm_target *ti, sector_t size) +static inline int _check_region_size(struct dm_target *ti, uint32_t size) { return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || size > ti->len); @@ -940,6 +1177,8 @@ static int get_mirror(struct mirror_set } ms->mirror[mirror].offset = offset; + atomic_set(&(ms->mirror[mirror].error_count), 0); + ms->mirror[mirror].ms = ms; return 0; } @@ -1009,8 +1248,8 @@ static struct dirty_log *create_dirty_lo * log_type #log_params * #mirrors [mirror_path offset]{2,} * - * For now, #log_params = 1, log_type = "core" - * + * log_type is "core" or "disk" + * #log_params is between 1 and 3 */ #define DM_IO_PAGES 64 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -1060,6 +1299,7 @@ static int mirror_ctr(struct dm_target * } ti->private = ms; + ti->split_io = ms->rh.region_size; r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); if (r) { @@ -1082,14 +1322,15 @@ static void mirror_dtr(struct dm_target static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) { + unsigned long flags; int should_wake = 0; struct bio_list *bl; bl = (rw == WRITE) ? &ms->writes : &ms->reads; - spin_lock(&ms->lock); + spin_lock_irqsave(&ms->lock, flags); should_wake = !(bl->head); bio_list_add(bl, bio); - spin_unlock(&ms->lock); + spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) wake(); @@ -1104,42 +1345,64 @@ static int mirror_map(struct dm_target * int r, rw = bio_rw(bio); struct mirror *m; struct mirror_set *ms = ti->private; - - map_context->ll = bio->bi_sector >> ms->rh.region_shift; + struct dm_bio_details *bd; + struct bio_map_info *bmi; if (rw == WRITE) { + /* Save region for mirror_end_io() handler */ + map_context->ll = bio_to_region(&ms->rh, bio); queue_bio(ms, bio, rw); return 0; } + /* It's all about the READs now */ + r = ms->rh.log->type->in_sync(ms->rh.log, bio_to_region(&ms->rh, bio), 0); if (r < 0 && r != -EWOULDBLOCK) return r; - if (r == -EWOULDBLOCK) /* FIXME: ugly */ + if (r == -EWOULDBLOCK) r = 0; - /* - * We don't want to fast track a recovery just for a read - * ahead. So we just let it silently fail. - * FIXME: get rid of this. - */ - if (!r && rw == READA) - return -EIO; + if (likely(r)) { + /* + * Optimize reads by avoiding to hand them to daemon. + * + * In case they fail, queue them for another shot + * in the mirror_end_io() function. + */ + m = choose_mirror(ms, NULL); + if (likely(m)) { + bmi = mempool_alloc(bio_map_info_pool, GFP_NOIO); + + if (likely(bmi)) { + /* without this, a read is not retryable */ + bd = &bmi->bmi_bd; + dm_bio_record(bd, bio); + map_context->ptr = bmi; + bmi->bmi_m = m; + } else { + /* we could fail now, but we can at least ** + ** give it a shot. The bd is only used to ** + ** retry in the event of a failure anyway. ** + ** If we fail, we can fail the I/O then. */ + map_context->ptr = NULL; + } + + map_bio(m, bio); + return 1; /* Mapped -> queue request. 
*/ + } else + return -EIO; + } else { + /* Either not clean, or -EWOULDBLOCK */ + if (rw == READA) + return -EWOULDBLOCK; - if (!r) { - /* Pass this io over to the daemon */ queue_bio(ms, bio, rw); - return 0; } - m = choose_mirror(ms, bio->bi_sector); - if (!m) - return -EIO; - - map_bio(ms, m, bio); - return 1; + return 0; } static int mirror_end_io(struct dm_target *ti, struct bio *bio, @@ -1147,71 +1410,140 @@ static int mirror_end_io(struct dm_targe { int rw = bio_rw(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; - region_t region = map_context->ll; + struct mirror *m = NULL; + struct dm_bio_details *bd = NULL; /* * We need to dec pending if this was a write. */ - if (rw == WRITE) - rh_dec(&ms->rh, region); + if (rw == WRITE) { + rh_dec(&ms->rh, map_context->ll); + return error; + } - return 0; + if (error == -EOPNOTSUPP) + goto out; + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + goto out; + + if (unlikely(error)) { + DMERR("A read failure occurred on a mirror device."); + if (!map_context->ptr) { + /* + * There wasn't enough memory to record necessary + * information for a retry. + */ + DMERR("Out of memory causing inability to retry read."); + return -EIO; + } + m = ((struct bio_map_info *)map_context->ptr)->bmi_m; + fail_mirror(m); /* Flag error on mirror. */ + + /* + * A failed read needs to get queued + * to the daemon for another shot to + * one (if any) intact mirrors. + */ + if (default_ok(m)) { + bd = &(((struct bio_map_info *)map_context->ptr)->bmi_bd); + + DMWARN("Trying different device."); + dm_bio_restore(bd, bio); + mempool_free(map_context->ptr, bio_map_info_pool); + map_context->ptr = NULL; + queue_bio(ms, bio, rw); + return 1; /* We want another shot on the bio. */ + } + DMERR("All replicated volumes dead, failing I/O"); + } + + out: + if (map_context->ptr) + mempool_free(map_context->ptr, bio_map_info_pool); + + return error; } -static void mirror_suspend(struct dm_target *ti) +static void mirror_presuspend(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; struct dirty_log *log = ms->rh.log; + unsigned long flags; + int run; + + /* + * Only run the completion if we are suspending after + * a disk failure. + */ + spin_lock_irqsave(&ms->lock, flags); + run = ms->failures.head ? 
1 : 0; + spin_unlock_irqrestore(&ms->lock, flags); + + if (run && (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK)) + complete(&ms->failure_completion); + + if (log->type->presuspend && log->type->presuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log presuspend failed"); + +} + +static void mirror_postsuspend(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + struct dirty_log *log = ms->rh.log; + rh_stop_recovery(&ms->rh); - if (log->type->suspend && log->type->suspend(log)) + if (log->type->postsuspend && log->type->postsuspend(log)) /* FIXME: need better error handling */ - DMWARN("log suspend failed"); + DMWARN("log postsuspend failed"); + atomic_set(&ms->suspended, 1); } static void mirror_resume(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; struct dirty_log *log = ms->rh.log; + if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); - rh_start_recovery(&ms->rh); + + if (atomic_dec_and_test(&ms->suspended)) + rh_start_recovery(&ms->rh); + atomic_set(&ms->suspended, 0); } static int mirror_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { - char buffer[32]; unsigned int m, sz = 0; struct mirror_set *ms = (struct mirror_set *) ti->private; - -#define EMIT(x...) sz += ((sz >= maxlen) ? \ - 0 : scnprintf(result + sz, maxlen - sz, x)) + char buffer[ms->nr_mirrors + 1]; switch (type) { case STATUSTYPE_INFO: - EMIT("%d ", ms->nr_mirrors); - + DMEMIT("%d ", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) { - format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev); - EMIT("%s ", buffer); + DMEMIT("%s ", ms->mirror[m].dev->name); + buffer[m] = atomic_read(&(ms->mirror[m].error_count)) ? 
+ 'D' : 'A'; } + buffer[m] = '\0'; - EMIT(SECTOR_FORMAT "/" SECTOR_FORMAT, - ms->rh.log->type->get_sync_count(ms->rh.log), - ms->nr_regions); + DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT " 1 %s ", + ms->rh.log->type->get_sync_count(ms->rh.log), + ms->nr_regions, buffer); + ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); break; case STATUSTYPE_TABLE: - EMIT("%s 1 " SECTOR_FORMAT " %d ", - ms->rh.log->type->name, ms->rh.region_size, - ms->nr_mirrors); - - for (m = 0; m < ms->nr_mirrors; m++) { - format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev); - EMIT("%s " SECTOR_FORMAT " ", - buffer, ms->mirror[m].offset); - } + sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); + DMEMIT("%d ", ms->nr_mirrors); + for (m = 0; m < ms->nr_mirrors; m++) + DMEMIT("%s " SECTOR_FORMAT " ", + ms->mirror[m].dev->name, ms->mirror[m].offset); } return 0; @@ -1219,13 +1551,14 @@ static int mirror_status(struct dm_targe static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 1}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, .map = mirror_map, .end_io = mirror_end_io, - .suspend = mirror_suspend, + .presuspend = mirror_presuspend, + .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status, }; @@ -1234,24 +1567,38 @@ static int __init dm_mirror_init(void) { int r; + bio_map_info_pool = mempool_create(100, bio_map_info_alloc, + bio_map_info_free, NULL); + if (!bio_map_info_pool) + return -ENOMEM; + r = dm_dirty_log_init(); if (r) return r; - _kmirrord_wq = create_workqueue("kmirrord"); + _kmirrord_wq = create_singlethread_workqueue("kmirrord"); if (!_kmirrord_wq) { DMERR("couldn't start kmirrord"); dm_dirty_log_exit(); - return r; + return -ENOMEM; } INIT_WORK(&_kmirrord_work, do_work, NULL); + _kmir_mon_wq = create_singlethread_workqueue("kmir_mon"); + if (!_kmir_mon_wq) { + DMERR("couldn't start kmir_mon"); + dm_dirty_log_exit(); + destroy_workqueue(_kmirrord_wq); + return -ENOMEM; + } + r = dm_register_target(&mirror_target); if (r < 0) { DMERR("%s: Failed to register mirror target", mirror_target.name); dm_dirty_log_exit(); destroy_workqueue(_kmirrord_wq); + destroy_workqueue(_kmir_mon_wq); } return r; diff -pruN ./drivers/md.dm/dm-round-robin.c ./drivers/md/dm-round-robin.c --- ./drivers/md.dm/dm-round-robin.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-round-robin.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Round-robin path selector. 
+ */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include + +/*----------------------------------------------------------------- + * Path-handling code, paths are held in lists + *---------------------------------------------------------------*/ +struct path_info { + struct list_head list; + struct path *path; + unsigned repeat_count; +}; + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +/*----------------------------------------------------------------- + * Round-robin selector + *---------------------------------------------------------------*/ + +#define RR_MIN_IO 1000 + +struct selector { + struct list_head valid_paths; + struct list_head invalid_paths; +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + } + + return s; +} + +static int rr_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s; + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void rr_destroy(struct path_selector *ps) +{ + struct selector *s = (struct selector *) ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->invalid_paths); + kfree(s); + ps->context = NULL; +} + +static int rr_status(struct path_selector *ps, struct path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + struct path_info *pi; + int sz = 0; + + if (!path) + DMEMIT("0 "); + else { + switch(type) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + pi = path->pscontext; + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +/* + * Called during initialisation to register each path with an + * optional repeat_count. 
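Editor's note: for reference, a stand-alone sketch of the selection policy implemented by rr_select_path() further down: the head of the valid list serves repeat_count I/Os and is then rotated to the tail. Path names, RR_MIN_IO and the round count are illustrative; the real selector keeps struct path_info entries on a kernel list and is driven by the multipath core.

#include <stdio.h>

#define RR_MIN_IO 1000
#define NR_PATHS  3

int main(void)
{
    const char *path[NR_PATHS] = { "8:16", "8:32", "8:48" };
    int order[NR_PATHS] = { 0, 1, 2 };   /* stand-in for the valid_paths list */
    unsigned long ios = 0;

    for (int round = 0; round < 6; round++) {
        int head = order[0];

        /* rr_select_path(): move the head to the tail ... */
        for (int i = 0; i < NR_PATHS - 1; i++)
            order[i] = order[i + 1];
        order[NR_PATHS - 1] = head;

        /* ... and the caller reuses that path repeat_count times */
        printf("I/O %lu-%lu -> path %s\n",
               ios, ios + RR_MIN_IO - 1, path[head]);
        ios += RR_MIN_IO;
    }
    return 0;
}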
+ */ +static int rr_add_path(struct path_selector *ps, struct path *path, + int argc, char **argv, char **error) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi; + unsigned repeat_count = RR_MIN_IO; + + if (argc > 1) { + *error = "round-robin ps: incorrect number of arguments"; + return -EINVAL; + } + + /* First path argument is number of I/Os before switching path */ + if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + *error = "round-robin ps: invalid repeat count"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "round-robin ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + path->pscontext = pi; + + list_add(&pi->list, &s->valid_paths); + + return 0; +} + +static void rr_fail_path(struct path_selector *ps, struct path *p) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = p->pscontext; + + list_move(&pi->list, &s->invalid_paths); +} + +static int rr_reinstate_path(struct path_selector *ps, struct path *p) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = p->pscontext; + + list_move(&pi->list, &s->valid_paths); + + return 0; +} + +static struct path *rr_select_path(struct path_selector *ps, + unsigned *repeat_count) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = NULL; + + if (!list_empty(&s->valid_paths)) { + pi = list_entry(s->valid_paths.next, struct path_info, list); + list_move_tail(&pi->list, &s->valid_paths); + *repeat_count = pi->repeat_count; + } + + return pi ? pi->path : NULL; +} + +static struct path_selector_type rr_ps = { + .name = "round-robin", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 0, + .create = rr_create, + .destroy = rr_destroy, + .status = rr_status, + .add_path = rr_add_path, + .fail_path = rr_fail_path, + .reinstate_path = rr_reinstate_path, + .select_path = rr_select_path, +}; + +static int __init dm_rr_init(void) +{ + int r = dm_register_path_selector(&rr_ps); + + if (r < 0) + DMERR("round-robin: register failed %d", r); + + DMINFO("dm-round-robin version 1.0.0 loaded"); + + return r; +} + +static void __exit dm_rr_exit(void) +{ + int r = dm_unregister_path_selector(&rr_ps); + + if (r < 0) + DMERR("round-robin: unregister failed %d", r); +} + +module_init(dm_rr_init); +module_exit(dm_rr_exit); + +MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); +MODULE_AUTHOR("Sistina Software "); +MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/dm-snap.c ./drivers/md/dm-snap.c --- ./drivers/md.dm/dm-snap.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-snap.c 2006-03-17 13:16:38.000000000 +0300 @@ -49,6 +49,11 @@ struct pending_exception { struct bio_list snapshot_bios; /* + * Short-term queue of pending exceptions prior to submission. + */ + struct list_head list; + + /* * Other pending_exceptions that are processing this * chunk. When this list is empty, we know we can * complete the origins. @@ -371,6 +376,15 @@ static inline ulong round_up(ulong n, ul return (n + size) & ~size; } +static void read_snapshot_metadata(struct dm_snapshot *s) +{ + if (s->store.read_metadata(&s->store)) { + down_write(&s->lock); + s->valid = 0; + up_write(&s->lock); + } +} + /* * Construct a snapshot mapping:
<origin_dev> <COW-dev> <p/n> <chunk-size>
*/ @@ -457,7 +471,7 @@ static int snapshot_ctr(struct dm_target s->chunk_shift = ffs(chunk_size) - 1; s->valid = 1; - s->have_metadata = 0; + s->active = 0; s->last_percent = 0; init_rwsem(&s->lock); s->table = ti->table; @@ -492,7 +506,11 @@ static int snapshot_ctr(struct dm_target goto bad5; } + /* Metadata must only be loaded into one table at once */ + read_snapshot_metadata(s); + /* Add snapshot to the list of snapshots for this origin */ + /* Exceptions aren't triggered till snapshot_resume() is called */ if (register_snapshot(s)) { r = -EINVAL; ti->error = "Cannot register snapshot origin"; @@ -529,8 +547,12 @@ static void snapshot_dtr(struct dm_targe { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + /* Prevent further origin writes from using this snapshot. */ + /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); + kcopyd_client_destroy(s->kcopyd_client); + exit_exception_table(&s->pending, pending_cache); exit_exception_table(&s->complete, exception_cache); @@ -539,7 +561,7 @@ static void snapshot_dtr(struct dm_targe dm_put_device(ti, s->origin); dm_put_device(ti, s->cow); - kcopyd_client_destroy(s->kcopyd_client); + kfree(s); } @@ -777,7 +799,10 @@ static int snapshot_map(struct dm_target /* Full snapshots are not usable */ if (!s->valid) - return -1; + return -EIO; + + if (unlikely(bio_barrier(bio))) + return -EOPNOTSUPP; /* * Write to snapshot - higher level takes care of RW/RO @@ -848,24 +873,15 @@ static void snapshot_resume(struct dm_ta { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - if (s->have_metadata) - return; - - if (s->store.read_metadata(&s->store)) { - down_write(&s->lock); - s->valid = 0; - up_write(&s->lock); - } - - s->have_metadata = 1; + down_write(&s->lock); + s->active = 1; + up_write(&s->lock); } static int snapshot_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; - char cow[32]; - char org[32]; switch (type) { case STATUSTYPE_INFO: @@ -892,9 +908,8 @@ static int snapshot_status(struct dm_tar * to make private copies if the output is to * make sense. 
*/ - format_dev_t(cow, snap->cow->bdev->bd_dev); - format_dev_t(org, snap->origin->bdev->bd_dev); - snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, org, cow, + snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, + snap->origin->name, snap->cow->name, snap->type, snap->chunk_size); break; } @@ -924,14 +939,19 @@ static int __origin_write(struct list_he int r = 1, first = 1; struct dm_snapshot *snap; struct exception *e; - struct pending_exception *pe, *last = NULL; + struct pending_exception *pe, *next_pe, *last = NULL; chunk_t chunk; + LIST_HEAD(pe_queue); /* Do all the snapshots on this origin */ list_for_each_entry (snap, snapshots, list) { - /* Only deal with valid snapshots */ - if (!snap->valid) + /* Only deal with valid and active snapshots */ + if (!snap->valid || !snap->active) + continue; + + /* Nothing to do if writing beyond end of snapshot */ + if (bio->bi_sector >= dm_table_get_size(snap->table)) continue; down_write(&snap->lock); @@ -955,12 +975,19 @@ static int __origin_write(struct list_he snap->valid = 0; } else { - if (last) + if (first) { + bio_list_add(&pe->origin_bios, bio); + r = 0; + first = 0; + } + if (last && list_empty(&pe->siblings)) list_merge(&pe->siblings, &last->siblings); - + if (!pe->started) { + pe->started = 1; + list_add_tail(&pe->list, &pe_queue); + } last = pe; - r = 0; } } @@ -970,24 +997,8 @@ static int __origin_write(struct list_he /* * Now that we have a complete pe list we can start the copying. */ - if (last) { - pe = last; - do { - down_write(&pe->snap->lock); - if (first) - bio_list_add(&pe->origin_bios, bio); - if (!pe->started) { - pe->started = 1; - up_write(&pe->snap->lock); - start_copy(pe); - } else - up_write(&pe->snap->lock); - first = 0; - pe = list_entry(pe->siblings.next, - struct pending_exception, siblings); - - } while (pe != last); - } + list_for_each_entry_safe(pe, next_pe, &pe_queue, list) + start_copy(pe); return r; } @@ -1051,6 +1062,9 @@ static int origin_map(struct dm_target * struct dm_dev *dev = (struct dm_dev *) ti->private; bio->bi_bdev = dev->bdev; + if (unlikely(bio_barrier(bio))) + return -EOPNOTSUPP; + /* Only tell snapshots if this is a write */ return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1; } @@ -1082,7 +1096,6 @@ static int origin_status(struct dm_targe unsigned int maxlen) { struct dm_dev *dev = (struct dm_dev *) ti->private; - char buffer[32]; switch (type) { case STATUSTYPE_INFO: @@ -1090,8 +1103,7 @@ static int origin_status(struct dm_targe break; case STATUSTYPE_TABLE: - format_dev_t(buffer, dev->bdev->bd_dev); - snprintf(result, maxlen, "%s", buffer); + snprintf(result, maxlen, "%s", dev->name); break; } @@ -1100,7 +1112,7 @@ static int origin_status(struct dm_targe static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 0, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1111,7 +1123,7 @@ static struct target_type origin_target static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 0, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff -pruN ./drivers/md.dm/dm-snap.h ./drivers/md/dm-snap.h --- ./drivers/md.dm/dm-snap.h 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-snap.h 2006-03-17 13:16:38.000000000 +0300 @@ -99,7 +99,9 @@ struct dm_snapshot { /* You can't use a snapshot if this is 0 (e.g. 
if full) */ int valid; - int have_metadata; + + /* Origin writes don't trigger exceptions until this is set */ + int active; /* Used for display of table */ char type; diff -pruN ./drivers/md.dm/dm-stripe.c ./drivers/md/dm-stripe.c --- ./drivers/md.dm/dm-stripe.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-stripe.c 2006-03-17 13:16:38.000000000 +0300 @@ -21,7 +21,7 @@ struct stripe_c { uint32_t stripes; /* The size of this target / num. stripes */ - uint32_t stripe_width; + sector_t stripe_width; /* stripe chunk size */ uint32_t chunk_shift; @@ -173,9 +173,8 @@ static int stripe_map(struct dm_target * struct stripe_c *sc = (struct stripe_c *) ti->private; sector_t offset = bio->bi_sector - ti->begin; - uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); - uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ - chunk = chunk / sc->stripes; + sector_t chunk = offset >> sc->chunk_shift; + uint32_t stripe = do_div(chunk, sc->stripes); bio->bi_bdev = sc->stripe[stripe].dev->bdev; bio->bi_sector = sc->stripe[stripe].physical_start + @@ -189,10 +188,6 @@ static int stripe_status(struct dm_targe struct stripe_c *sc = (struct stripe_c *) ti->private; unsigned int sz = 0; unsigned int i; - char buffer[32]; - -#define EMIT(x...) sz += ((sz >= maxlen) ? \ - 0 : scnprintf(result + sz, maxlen - sz, x)) switch (type) { case STATUSTYPE_INFO: @@ -200,12 +195,10 @@ static int stripe_status(struct dm_targe break; case STATUSTYPE_TABLE: - EMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); - for (i = 0; i < sc->stripes; i++) { - format_dev_t(buffer, sc->stripe[i].dev->bdev->bd_dev); - EMIT(" %s " SECTOR_FORMAT, buffer, - sc->stripe[i].physical_start); - } + DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); + for (i = 0; i < sc->stripes; i++) + DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name, + sc->stripe[i].physical_start); break; } return 0; @@ -213,7 +206,7 @@ static int stripe_status(struct dm_targe static struct target_type stripe_target = { .name = "striped", - .version= {1, 0, 1}, + .version= {1, 0, 2}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff -pruN ./drivers/md.dm/dm-table.c ./drivers/md/dm-table.c --- ./drivers/md.dm/dm-table.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-table.c 2006-03-17 13:16:38.000000000 +0300 @@ -57,7 +57,7 @@ struct dm_table { /* * Similar to ceiling(log_size(n)) */ -static unsigned int int_log(unsigned long n, unsigned long base) +static unsigned int int_log(unsigned int n, unsigned int base) { int result = 0; @@ -454,6 +454,8 @@ static int __table_get_device(struct dm_ return r; } + format_dev_t(dd->name, dev); + atomic_set(&dd->count, 0); list_add(&dd->list, &t->devices); @@ -575,7 +577,7 @@ static char **realloc_argv(unsigned *arr /* * Destructively splits up the argument list to pass to ctr. 
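Editor's note: a worked example of the remapping now done in stripe_map() above. In the kernel, do_div(chunk, sc->stripes) divides the 64-bit chunk number in place and returns the remainder; plain / and % stand in for it in this sketch, and the chunk size and offset are illustrative.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const uint32_t stripes = 3, chunk_shift = 7;      /* 128-sector (64 KiB) chunks */
    const uint64_t chunk_mask = (1u << chunk_shift) - 1;
    uint64_t offset = 1000;                           /* sectors into the target    */

    uint64_t chunk  = offset >> chunk_shift;          /* 1000 / 128 = 7             */
    uint32_t stripe = chunk % stripes;                /* do_div() remainder -> 1    */
    chunk /= stripes;                                 /* do_div() quotient  -> 2    */

    printf("offset %llu -> stripe %u, device sector offset %llu\n",
           (unsigned long long)offset, stripe,
           (unsigned long long)((chunk << chunk_shift) + (offset & chunk_mask)));
    return 0;
}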
*/ -static int split_args(int *argc, char ***argvp, char *input) +int dm_split_args(int *argc, char ***argvp, char *input) { char *start, *end = input, *out, **argv = NULL; unsigned array_size = 0; @@ -663,14 +665,14 @@ int dm_table_add_target(struct dm_table if (!len) { tgt->error = "zero-length target"; - DMERR(": %s\n", tgt->error); + DMERR("%s", tgt->error); return -EINVAL; } tgt->type = dm_get_target_type(type); if (!tgt->type) { tgt->error = "unknown target type"; - DMERR(": %s\n", tgt->error); + DMERR("%s", tgt->error); return -EINVAL; } @@ -688,7 +690,7 @@ int dm_table_add_target(struct dm_table goto bad; } - r = split_args(&argc, &argv, params); + r = dm_split_args(&argc, &argv, params); if (r) { tgt->error = "couldn't split parameters (insufficient memory)"; goto bad; @@ -707,7 +709,7 @@ int dm_table_add_target(struct dm_table return 0; bad: - DMERR(": %s\n", tgt->error); + DMERR("%s", tgt->error); dm_put_target_type(tgt->type); return r; } @@ -825,7 +827,7 @@ void dm_table_set_restrictions(struct dm * Make sure we obey the optimistic sub devices * restrictions. */ - q->max_sectors = t->limits.max_sectors; + blk_queue_max_sectors(q, t->limits.max_sectors); q->max_phys_segments = t->limits.max_phys_segments; q->max_hw_segments = t->limits.max_hw_segments; q->hardsect_size = t->limits.hardsect_size; @@ -848,18 +850,38 @@ int dm_table_get_mode(struct dm_table *t return t->mode; } -void dm_table_suspend_targets(struct dm_table *t) +static void suspend_targets(struct dm_table *t, unsigned postsuspend) { - int i; + int i = t->num_targets; + struct dm_target *ti = t->targets; - for (i = 0; i < t->num_targets; i++) { - struct dm_target *ti = t->targets + i; + while (i--) { + if (postsuspend) { + if (ti->type->postsuspend) + ti->type->postsuspend(ti); + } else if (ti->type->presuspend) + ti->type->presuspend(ti); - if (ti->type->suspend) - ti->type->suspend(ti); + ti++; } } +void dm_table_presuspend_targets(struct dm_table *t) +{ + if (!t) + return; + + return suspend_targets(t, 0); +} + +void dm_table_postsuspend_targets(struct dm_table *t) +{ + if (!t) + return; + + return suspend_targets(t, 1); +} + void dm_table_resume_targets(struct dm_table *t) { int i; @@ -900,11 +922,35 @@ void dm_table_unplug_all(struct dm_table } } +int dm_table_flush_all(struct dm_table *t) +{ + struct list_head *d, *devices = dm_table_get_devices(t); + int ret = 0; + + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + int err; + + if (!q->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); + + if (!ret) + ret = err; + } + + return ret; +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); EXPORT_SYMBOL(dm_table_event); +EXPORT_SYMBOL(dm_table_get_size); EXPORT_SYMBOL(dm_table_get_mode); EXPORT_SYMBOL(dm_table_put); EXPORT_SYMBOL(dm_table_get); EXPORT_SYMBOL(dm_table_unplug_all); +EXPORT_SYMBOL(dm_table_flush_all); diff -pruN ./drivers/md.dm/dm-target.c ./drivers/md/dm-target.c --- ./drivers/md.dm/dm-target.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-target.c 2006-03-17 13:16:38.000000000 +0300 @@ -120,10 +120,9 @@ int dm_register_target(struct target_typ return -ENOMEM; down_write(&_lock); - if (__find_target_type(t->name)) { - kfree(ti); + if (__find_target_type(t->name)) rv = -EEXIST; - } else + else list_add(&ti->list, &_targets); up_write(&_lock); diff -pruN ./drivers/md.dm/Kconfig 
./drivers/md/Kconfig --- ./drivers/md.dm/Kconfig 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/Kconfig 2006-03-17 13:16:38.000000000 +0300 @@ -85,6 +85,24 @@ config MD_RAID1 If unsure, say Y. +config MD_RAID10 + tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" + depends on BLK_DEV_MD && EXPERIMENTAL + ---help--- + RAID-10 provides a combination of striping (RAID-0) and + mirroring (RAID-1) with easier configuration and more flexable + layout. + Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to + be the same size (or atleast, only as much as the smallest device + will be used). + RAID-10 provides a variety of layouts that provide different levels + of redundancy and performance. + + RAID-10 requires mdadm-1.7.0 or later, available at: + + ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ + + config MD_RAID5 tristate "RAID-4/RAID-5 mode" depends on BLK_DEV_MD @@ -200,5 +218,17 @@ config DM_ZERO A target that discards writes, and returns all zeroes for reads. Useful in some recovery situations. +config DM_MULTIPATH + tristate "Multipath target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Allow volume managers to support multipath hardware. + +config DM_MULTIPATH_EMC + tristate "EMC CX/AX multipath support (EXPERIMENTAL)" + depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL + ---help--- + Multipath support for EMC CX/AX series hardware. + endmenu diff -pruN ./drivers/md.dm/kcopyd.c ./drivers/md/kcopyd.c --- ./drivers/md.dm/kcopyd.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/kcopyd.c 2006-03-20 09:36:55.000000000 +0300 @@ -43,6 +43,10 @@ struct kcopyd_client { struct page_list *pages; unsigned int nr_pages; unsigned int nr_free_pages; +#ifndef __GENKSYMS__ + wait_queue_head_t destroyq; + atomic_t nr_jobs; +#endif }; static struct page_list *alloc_pl(void) @@ -292,10 +296,15 @@ static int run_complete_job(struct kcopy int read_err = job->read_err; unsigned int write_err = job->write_err; kcopyd_notify_fn fn = job->fn; + struct kcopyd_client *kc = job->kc; - kcopyd_put_pages(job->kc, job->pages); + kcopyd_put_pages(kc, job->pages); mempool_free(job, _job_pool); fn(read_err, write_err, context); + + if (atomic_dec_and_test(&kc->nr_jobs)) + wake_up(&kc->destroyq); + return 0; } @@ -430,6 +439,7 @@ static void do_work(void *ignored) */ static void dispatch_job(struct kcopyd_job *job) { + atomic_inc(&job->kc->nr_jobs); push(&_pages_jobs, job); wake(); } @@ -667,6 +677,9 @@ int kcopyd_client_create(unsigned int nr return r; } + init_waitqueue_head(&kc->destroyq); + atomic_set(&kc->nr_jobs, 0); + client_add(kc); *result = kc; return 0; @@ -674,6 +687,9 @@ int kcopyd_client_create(unsigned int nr void kcopyd_client_destroy(struct kcopyd_client *kc) { + /* Wait for completion of all jobs submitted by this client. 
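Editor's note: the new destroyq/nr_jobs pair above makes kcopyd_client_destroy() wait until every outstanding job has completed before the client is torn down. Below is a stand-alone sketch of the same drain-then-free pattern using POSIX threads; the thread count and timing are illustrative, and the kernel uses wait_event() plus a mempool rather than a condition variable.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* nr_jobs is bumped when a job is dispatched and dropped when it
 * completes; destroy blocks until the counter drains to zero. */
static atomic_int nr_jobs;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

static void job_done(void)
{
    if (atomic_fetch_sub(&nr_jobs, 1) == 1) {   /* last job finished */
        pthread_mutex_lock(&lock);
        pthread_cond_signal(&drained);
        pthread_mutex_unlock(&lock);
    }
}

static void *worker(void *arg)
{
    (void)arg;
    usleep(1000);                               /* pretend to copy some data */
    job_done();
    return NULL;
}

int main(void)
{
    pthread_t t[8];

    for (int i = 0; i < 8; i++) {
        atomic_fetch_add(&nr_jobs, 1);          /* dispatch_job() side */
        pthread_create(&t[i], NULL, worker, NULL);
    }

    pthread_mutex_lock(&lock);                  /* client_destroy() side */
    while (atomic_load(&nr_jobs))
        pthread_cond_wait(&drained, &lock);
    pthread_mutex_unlock(&lock);
    puts("all jobs finished, safe to free the client");

    for (int i = 0; i < 8; i++)
        pthread_join(t[i], NULL);
    return 0;
}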
*/ + wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); + dm_io_put(kc->nr_pages); client_free_pages(kc); client_del(kc); diff -pruN ./drivers/md.dm/linear.c ./drivers/md/linear.c --- ./drivers/md.dm/linear.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/linear.c 2006-03-17 13:16:38.000000000 +0300 @@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde return hash->dev0; } - /** * linear_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue @@ -93,13 +92,35 @@ static void linear_unplug(request_queue_ } } +static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + linear_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i < mddev->raid_disks; i++) { + struct block_device *bdev = conf->disks[i].rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + return ret; +} static int linear_run (mddev_t *mddev) { linear_conf_t *conf; struct linear_hash *table; mdk_rdev_t *rdev; - int size, i, nb_zone, cnt; + int i, nb_zone, cnt; + sector_t size; unsigned int curr_offset; struct list_head *tmp; @@ -137,7 +158,7 @@ static int linear_run (mddev_t *mddev) */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->size = rdev->size; mddev->array_size += rdev->size; @@ -200,6 +221,7 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; + mddev->queue->issue_flush_fn = linear_issue_flush; return 0; out: @@ -247,10 +269,11 @@ static int linear_make_request (request_ char b[BDEVNAME_SIZE]; printk("linear_make_request: Block %llu out of bounds on " - "dev %s size %ld offset %ld\n", + "dev %s size %llu offset %llu\n", (unsigned long long)block, bdevname(tmp_dev->rdev->bdev, b), - tmp_dev->size, tmp_dev->offset); + (unsigned long long)tmp_dev->size, + (unsigned long long)tmp_dev->offset); bio_io_error(bio, bio->bi_size); return 0; } diff -pruN ./drivers/md.dm/Makefile ./drivers/md/Makefile --- ./drivers/md.dm/Makefile 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/Makefile 2006-03-17 13:16:38.000000000 +0300 @@ -4,13 +4,16 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o kcopyd.o +dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-mirror-objs := dm-log.o dm-raid1.o raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6mmx.o raid6sse1.o raid6sse2.o -host-progs := mktables +hostprogs-y := mktables + +CFLAGS_raid6int8.o += -O2 # Note: link order is important. 
All raid personalities # and xor.o must come before md.o, as they each initialise @@ -20,12 +23,15 @@ host-progs := mktables obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o +obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID5) += raid5.o xor.o obj-$(CONFIG_MD_RAID6) += raid6.o xor.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_BLK_DEV_MD) += md.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o +obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o +obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o obj-$(CONFIG_DM_ZERO) += dm-zero.o diff -pruN ./drivers/md.dm/md.c ./drivers/md/md.c --- ./drivers/md.dm/md.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/md.c 2006-03-17 13:22:09.000000000 +0300 @@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN tmp = tmp->next;}) \ ) +int md_flush_mddev(mddev_t *mddev, sector_t *error_sector) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + int ret = 0; + + /* + * this list iteration is done without any locking in md?! + */ + ITERATE_RDEV(mddev, rdev, tmp) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + int err; + + if (!r_queue->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector); + + if (!ret) + ret = err; + } + + return ret; +} + +static int md_flush_all(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + + return md_flush_mddev(mddev, error_sector); +} + static int md_fail_request (request_queue_t *q, struct bio *bio) { bio_io_error(bio, bio->bi_size); @@ -331,29 +364,24 @@ static int bi_complete(struct bio *bio, static int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw) { - struct bio bio; - struct bio_vec vec; + struct bio *bio = bio_alloc(GFP_NOIO, 1); struct completion event; + int ret; rw |= (1 << BIO_RW_SYNC); - bio_init(&bio); - bio.bi_io_vec = &vec; - vec.bv_page = page; - vec.bv_len = size; - vec.bv_offset = 0; - bio.bi_vcnt = 1; - bio.bi_idx = 0; - bio.bi_size = size; - bio.bi_bdev = bdev; - bio.bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); init_completion(&event); - bio.bi_private = &event; - bio.bi_end_io = bi_complete; - submit_bio(rw, &bio); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); wait_for_completion(&event); - return test_bit(BIO_UPTODATE, &bio.bi_flags); + ret = test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ret; } static int read_disk_sb(mdk_rdev_t * rdev) @@ -373,7 +401,7 @@ static int read_disk_sb(mdk_rdev_t * rde return 0; fail: - printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", + printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", bdevname(rdev->bdev,b)); return -EINVAL; } @@ -439,6 +467,31 @@ static unsigned int calc_sb_csum(mdp_sup return csum; } +/* csum_partial is not consistent between different architectures. + * Some (i386) do a 32bit csum. Some (alpha) do 16 bit. + * This makes it hard for user-space to know what to do. + * So we use calc_sb_csum to set the checksum to allow working + * with older kernels, but allow calc_sb_csum_common to + * be used when checking if a checksum is correct, to + * make life easier for user-space tools that might write + * a superblock. 
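Editor's note: the checksum helpers added above avoid csum_partial() by summing the superblock as 32-bit little-endian words into a 64-bit accumulator and folding the carries back in, which gives the same result on every architecture. A minimal user-space sketch of that folding, with made-up superblock contents:

#include <stdio.h>
#include <stdint.h>

static uint32_t fold_csum(const uint8_t *p, size_t len)
{
    uint64_t sum = 0;

    for (; len >= 4; len -= 4, p += 4)       /* 32-bit little-endian words */
        sum += (uint32_t)p[0] | (uint32_t)p[1] << 8 |
               (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;

    if (len == 2)                            /* trailing 16-bit word, as in
                                                calc_sb_1_csum()           */
        sum += (uint32_t)p[0] | (uint32_t)p[1] << 8;

    /* fold the high half back into the low half, as the patch does */
    return (uint32_t)((sum & 0xffffffff) + (sum >> 32));
}

int main(void)
{
    uint8_t sb[256] = { 0xfc, 0x4e, 0x2b, 0xa9 };    /* made-up superblock bytes */

    printf("csum = %#010x\n", fold_csum(sb, sizeof(sb)));
    return 0;
}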
+ */ +static unsigned int calc_sb_csum_common(mdp_super_t *super) +{ + unsigned int disk_csum = super->sb_csum; + unsigned long long newcsum = 0; + unsigned int csum; + int i; + unsigned int *superc = (int*) super; + super->sb_csum = 0; + + for (i=0; i>32); + super->sb_csum = disk_csum; + return csum; +} + /* * Handle superblock details. * We want to be able to handle multiple superblock formats @@ -521,7 +574,8 @@ static int super_90_load(mdk_rdev_t *rde if (sb->raid_disks <= 0) goto abort; - if (calc_sb_csum(sb) != sb->sb_csum) { + if (calc_sb_csum(sb) != sb->sb_csum && + calc_sb_csum_common(sb) != sb->sb_csum) { printk(KERN_WARNING "md: invalid superblock checksum on %s\n", b); goto abort; @@ -530,7 +584,7 @@ static int super_90_load(mdk_rdev_t *rde rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; - if (sb->level == MULTIPATH) + if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; else rdev->desc_nr = sb->this_disk.number; @@ -745,11 +799,21 @@ static void super_90_sync(mddev_t *mddev static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) { unsigned int disk_csum, csum; + unsigned long long newcsum; int size = 256 + sb->max_dev*2; + unsigned int *isuper = (unsigned int*)sb; + int i; disk_csum = sb->sb_csum; sb->sb_csum = 0; - csum = csum_partial((void *)sb, size, 0); + newcsum = 0; + for (i=0; size>=4; size -= 4 ) + newcsum += le32_to_cpu(*isuper++); + + if (size == 2) + newcsum += le16_to_cpu(*(unsigned short*) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); sb->sb_csum = disk_csum; return csum; } @@ -924,12 +988,12 @@ static void super_1_sync(mddev_t *mddev, max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) - if (rdev2->desc_nr > max_dev) - max_dev = rdev2->desc_nr; + if (rdev2->desc_nr+1 > max_dev) + max_dev = rdev2->desc_nr+1; sb->max_dev = max_dev; for (i=0; idev_roles[max_dev] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(0xfffe); ITERATE_RDEV(mddev,rdev2,tmp) { i = rdev2->desc_nr; @@ -942,6 +1006,7 @@ static void super_1_sync(mddev_t *mddev, } sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ + sb->sb_csum = calc_sb_1_csum(sb); } @@ -1042,20 +1107,24 @@ static void unbind_rdev_from_array(mdk_r /* * prevent the device from being mounted, repartitioned or * otherwise reused by a RAID array (or any other kernel - * subsystem), by opening the device. [simply getting an - * inode is not enough, the SCSI module usage code needs - * an explicit open() on the device] + * subsystem), by bd_claiming the device. 
*/ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) { int err = 0; struct block_device *bdev; + char b[BDEVNAME_SIZE]; bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); - if (IS_ERR(bdev)) + if (IS_ERR(bdev)) { + printk(KERN_ERR "md: could not open %s.\n", + __bdevname(dev, b)); return PTR_ERR(bdev); + } err = bd_claim(bdev, rdev); if (err) { + printk(KERN_ERR "md: could not bd_claim %s.\n", + bdevname(bdev, b)); blkdev_put(bdev); return err; } @@ -1117,10 +1186,7 @@ static void export_array(mddev_t *mddev) static void print_desc(mdp_disk_t *desc) { - char b[BDEVNAME_SIZE]; - - printk(" DISK\n", desc->number, - __bdevname(MKDEV(desc->major, desc->minor), b), + printk(" DISK\n", desc->number, desc->major,desc->minor,desc->raid_disk,desc->state); } @@ -1312,8 +1378,7 @@ static mdk_rdev_t *md_import_device(dev_ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) { - printk(KERN_ERR "md: could not alloc mem for %s!\n", - __bdevname(newdev, b)); + printk(KERN_ERR "md: could not alloc mem for new device!\n"); return ERR_PTR(-ENOMEM); } memset(rdev, 0, sizeof(*rdev)); @@ -1322,11 +1387,9 @@ static mdk_rdev_t *md_import_device(dev_ goto abort_free; err = lock_rdev(rdev, newdev); - if (err) { - printk(KERN_ERR "md: could not lock %s.\n", - __bdevname(newdev, b)); + if (err) goto abort_free; - } + rdev->desc_nr = -1; rdev->faulty = 0; rdev->in_sync = 0; @@ -1436,9 +1499,8 @@ static int analyze_sbs(mddev_t * mddev) goto abort; } - if ((mddev->recovery_cp != MaxSector) && - ((mddev->level == 1) || - ((mddev->level >= 4) && (mddev->level <= 6)))) + if (mddev->recovery_cp != MaxSector && + mddev->level >= 1) printk(KERN_ERR "md: %s: raid array is not clean" " -- starting background reconstruction\n", mdname(mddev)); @@ -1615,6 +1677,8 @@ static int do_md_run(mddev_t * mddev) mddev->pers = pers[pnum]; spin_unlock(&pers_lock); + mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + err = mddev->pers->run(mddev); if (err) { printk(KERN_ERR "md: pers->run() failed ...\n"); @@ -1645,6 +1709,7 @@ static int do_md_run(mddev_t * mddev) */ mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + mddev->queue->issue_flush_fn = md_flush_all; mddev->changed = 1; return 0; @@ -1881,11 +1946,9 @@ static int autostart_array(dev_t startde mdk_rdev_t *start_rdev = NULL, *rdev; start_rdev = md_import_device(startdev, 0, 0); - if (IS_ERR(start_rdev)) { - printk(KERN_WARNING "md: could not import %s!\n", - __bdevname(startdev, b)); + if (IS_ERR(start_rdev)) return err; - } + /* NOTE: this can only work for 0.90.0 superblocks */ sb = (mdp_super_t*)page_address(start_rdev->sb_page); @@ -1916,12 +1979,9 @@ static int autostart_array(dev_t startde if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) continue; rdev = md_import_device(dev, 0, 0); - if (IS_ERR(rdev)) { - printk(KERN_WARNING "md: could not import %s," - " trying to run array nevertheless.\n", - __bdevname(dev, b)); + if (IS_ERR(rdev)) continue; - } + list_add(&rdev->same_set, &pending_raid_disks); } @@ -2153,42 +2213,6 @@ static int add_new_disk(mddev_t * mddev, return 0; } -static int hot_generate_error(mddev_t * mddev, dev_t dev) -{ - char b[BDEVNAME_SIZE]; - struct request_queue *q; - mdk_rdev_t *rdev; - - if (!mddev->pers) - return -ENODEV; - - printk(KERN_INFO "md: trying to generate %s error in %s ... 
\n", - __bdevname(dev, b), mdname(mddev)); - - rdev = find_rdev(mddev, dev); - if (!rdev) { - /* MD_BUG(); */ /* like hell - it's not a driver bug */ - return -ENXIO; - } - - if (rdev->desc_nr == -1) { - MD_BUG(); - return -EINVAL; - } - if (!rdev->in_sync) - return -ENODEV; - - q = bdev_get_queue(rdev->bdev); - if (!q) { - MD_BUG(); - return -ENODEV; - } - printk(KERN_INFO "md: okay, generating error!\n"); -// q->oneshot_error = 1; // disabled for now - - return 0; -} - static int hot_remove_disk(mddev_t * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; @@ -2197,9 +2221,6 @@ static int hot_remove_disk(mddev_t * mdd if (!mddev->pers) return -ENODEV; - printk(KERN_INFO "md: trying to remove %s from %s ... \n", - __bdevname(dev, b), mdname(mddev)); - rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; @@ -2227,9 +2248,6 @@ static int hot_add_disk(mddev_t * mddev, if (!mddev->pers) return -ENODEV; - printk(KERN_INFO "md: trying to hot-add %s to %s ... \n", - __bdevname(dev, b), mdname(mddev)); - if (mddev->major_version != 0) { printk(KERN_WARNING "%s: HOT_ADD may only be used with" " version-0 superblocks.\n", @@ -2478,6 +2496,9 @@ static int set_disk_faulty(mddev_t *mdde { mdk_rdev_t *rdev; + if (mddev->pers == NULL) + return -ENODEV; + rdev = find_rdev(mddev, dev); if (!rdev) return -ENODEV; @@ -2489,7 +2510,6 @@ static int set_disk_faulty(mddev_t *mdde static int md_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - char b[BDEVNAME_SIZE]; int err = 0; void __user *argp = (void __user *)arg; struct hd_geometry __user *loc = argp; @@ -2548,8 +2568,7 @@ static int md_ioctl(struct inode *inode, } err = autostart_array(new_decode_dev(arg)); if (err) { - printk(KERN_WARNING "md: autostart %s failed!\n", - __bdevname(arg, b)); + printk(KERN_WARNING "md: autostart failed!\n"); goto abort; } goto done; @@ -2690,9 +2709,7 @@ static int md_ioctl(struct inode *inode, err = add_new_disk(mddev, &info); goto done_unlock; } - case HOT_GENERATE_ERROR: - err = hot_generate_error(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_REMOVE_DISK: err = hot_remove_disk(mddev, new_decode_dev(arg)); goto done_unlock; @@ -2876,7 +2893,7 @@ mdk_thread_t *md_register_thread(void (* return thread; } -void md_interrupt_thread(mdk_thread_t *thread) +static void md_interrupt_thread(mdk_thread_t *thread) { if (!thread->tsk) { MD_BUG(); @@ -2919,6 +2936,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t if (!mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } @@ -2951,7 +2969,11 @@ static void status_resync(struct seq_fil unsigned long max_blocks, resync, res, dt, db, rt; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; - max_blocks = mddev->size; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_blocks = mddev->resync_max_sectors >> 1; + else + max_blocks = mddev->size; /* * Should not happen. 
@@ -3187,11 +3209,6 @@ int unregister_md_personality(int pnum) return 0; } -void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) -{ - rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; -} - static int is_mddev_idle(mddev_t *mddev) { mdk_rdev_t * rdev; @@ -3204,8 +3221,12 @@ static int is_mddev_idle(mddev_t *mddev) struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; curr_events = disk_stat_read(disk, read_sectors) + disk_stat_read(disk, write_sectors) - - disk->sync_io; - if ((curr_events - rdev->last_events) > 32) { + atomic_read(&disk->sync_io); + /* Allow some slack between valud of curr_events and last_events, + * as there are some uninteresting races. + * Note: the following is an unsigned comparison. + */ + if ((curr_events - rdev->last_events + 32) > 64) { rdev->last_events = curr_events; idle = 0; } @@ -3339,7 +3360,14 @@ static void md_do_sync(mddev_t *mddev) } } while (mddev->curr_resync < 2); - max_sectors = mddev->size << 1; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + /* resync follows the size requested by the personality, + * which default to physical size, but can be virtual size + */ + max_sectors = mddev->resync_max_sectors; + else + /* recovery follows the physical size of devices */ + max_sectors = mddev->size << 1; printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" @@ -3372,10 +3400,12 @@ static void md_do_sync(mddev_t *mddev) init_waitqueue_head(&mddev->recovery_wait); last_check = 0; - if (j) + if (j>2) { printk(KERN_INFO "md: resuming recovery of %s from checkpoint.\n", mdname(mddev)); + mddev->curr_resync = j; + } while (j < max_sectors) { int sectors; @@ -3458,7 +3488,7 @@ static void md_do_sync(mddev_t *mddev) if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && mddev->curr_resync > 2 && - mddev->curr_resync > mddev->recovery_cp) { + mddev->curr_resync >= mddev->recovery_cp) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { printk(KERN_INFO "md: checkpointing recovery of %s.\n", @@ -3697,7 +3727,6 @@ void md_autodetect_dev(dev_t dev) static void autostart_arrays(int part) { - char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev; int i; @@ -3707,11 +3736,9 @@ static void autostart_arrays(int part) dev_t dev = detected_devices[i]; rdev = md_import_device(dev,0, 0); - if (IS_ERR(rdev)) { - printk(KERN_ALERT "md: could not import %s!\n", - __bdevname(dev, b)); + if (IS_ERR(rdev)) continue; - } + if (rdev->faulty) { MD_BUG(); continue; @@ -3762,7 +3789,6 @@ module_exit(md_exit) EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_sync_acct); EXPORT_SYMBOL(md_done_sync); EXPORT_SYMBOL(md_write_start); EXPORT_SYMBOL(md_write_end); @@ -3771,6 +3797,5 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_print_devices); -EXPORT_SYMBOL(md_interrupt_thread); EXPORT_SYMBOL(md_check_recovery); MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/multipath.c ./drivers/md/multipath.c --- ./drivers/md.dm/multipath.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/multipath.c 2006-03-17 13:16:38.000000000 +0300 @@ -99,12 +99,12 @@ static void multipath_reschedule_retry ( * operation and are ready to return a success/failure code to the buffer * cache layer. 
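Editor's note: a small illustration of the unsigned slack test added to is_mddev_idle() above. Because the subtraction wraps, (curr - last + 32) > 64 is false exactly when curr is within 32 events of last in either direction, so minor accounting races are ignored. The unsigned int types and the values below are illustrative.

#include <stdio.h>

static int busy(unsigned int curr, unsigned int last)
{
    return (curr - last + 32) > 64;
}

int main(void)
{
    printf("%d\n", busy(1000, 1000));   /* 0: no new activity          */
    printf("%d\n", busy(1010, 1000));   /* 0: inside the +/-32 window  */
    printf("%d\n", busy( 990, 1000));   /* 0: small negative drift     */
    printf("%d\n", busy(1200, 1000));   /* 1: real I/O happened        */
    return 0;
}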
*/ -static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate) +static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) { struct bio *bio = mp_bh->master_bio; multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); - bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO); + bio_endio(bio, bio->bi_size, err); mempool_free(mp_bh, conf->pool); } @@ -119,8 +119,8 @@ int multipath_end_request(struct bio *bi return 1; if (uptodate) - multipath_end_bh_io(mp_bh, uptodate); - else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) { + multipath_end_bh_io(mp_bh, 0); + else if (!bio_rw_ahead(bio)) { /* * oops, IO error: */ @@ -131,7 +131,7 @@ int multipath_end_request(struct bio *bi (unsigned long long)bio->bi_sector); multipath_reschedule_retry(mp_bh); } else - multipath_end_bh_io(mp_bh, 0); + multipath_end_bh_io(mp_bh, error); rdev_dec_pending(rdev, conf->mddev); return 0; } @@ -155,7 +155,7 @@ static void unplug_slaves(mddev_t *mddev r_queue->unplug_fn(r_queue); spin_lock_irqsave(&conf->device_lock, flags); - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -217,6 +217,31 @@ static void multipath_status (struct seq seq_printf (seq, "]"); } +static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + multipath_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; iraid_disks; i++) { + mdk_rdev_t *rdev = conf->multipaths[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} /* * Careful, this can execute in IRQ contexts as well! 
@@ -300,7 +325,7 @@ static int multipath_add_disk(mddev_t *m */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); conf->working_disks++; rdev->raid_disk = path; @@ -377,7 +402,7 @@ static void multipathd (mddev_t *mddev) " error for block %llu\n", bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_sector); - multipath_end_bh_io(mp_bh, 0); + multipath_end_bh_io(mp_bh, -EIO); } else { printk(KERN_ERR "multipath: %s: redirecting sector %llu" " to another IO path\n", @@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev mddev->queue->unplug_fn = multipath_unplug; + mddev->queue->issue_flush_fn = multipath_issue_flush; + conf->working_disks = 0; ITERATE_RDEV(mddev,rdev,tmp) { disk_idx = rdev->raid_disk; @@ -452,7 +479,7 @@ static int multipath_run (mddev_t *mddev * a merge_bvec_fn to be involved in multipath */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!rdev->faulty) conf->working_disks++; diff -pruN ./drivers/md.dm/raid0.c ./drivers/md/raid0.c --- ./drivers/md.dm/raid0.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/raid0.c 2006-03-17 13:16:38.000000000 +0300 @@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t } } +static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t **devlist = conf->strip_zone[0].dev; + int i, ret = 0; + + for (i=0; iraid_disks; i++) { + struct block_device *bdev = devlist[i]->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret =r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + return ret; +} + + static int create_strip_zones (mddev_t *mddev) { int i, c, j; @@ -137,7 +162,7 @@ static int create_strip_zones (mddev_t * if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!smallest || (rdev1->size size)) smallest = rdev1; @@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t * mddev->queue->unplug_fn = raid0_unplug; + mddev->queue->issue_flush_fn = raid0_issue_flush; + printk("raid0: done.\n"); return 0; abort: diff -pruN ./drivers/md.dm/raid10.c ./drivers/md/raid10.c --- ./drivers/md.dm/raid10.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/raid10.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,1780 @@ +/* + * raid10.c : Multiple Devices driver for Linux + * + * Copyright (C) 2000-2004 Neil Brown + * + * RAID-10 support for md. + * + * Base on code in raid1.c. See raid1.c for futher copyright information. + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +/* + * RAID10 provides a combination of RAID0 and RAID1 functionality. 
+ * The layout of data is defined by + * chunk_size + * raid_disks + * near_copies (stored in low byte of layout) + * far_copies (stored in second byte of layout) + * + * The data to be stored is divided into chunks using chunksize. + * Each device is divided into far_copies sections. + * In each section, chunks are laid out in a style similar to raid0, but + * near_copies copies of each chunk is stored (each on a different drive). + * The starting device for each section is offset near_copies from the starting + * device of the previous section. + * Thus there are (near_copies*far_copies) of each chunk, and each is on a different + * drive. + * near_copies and far_copies must be at least one, and there product is at most + * raid_disks. + */ + +/* + * Number of guaranteed r10bios in case of extreme VM load: + */ +#define NR_RAID10_BIOS 256 + +static void unplug_slaves(mddev_t *mddev); + +static void * r10bio_pool_alloc(int gfp_flags, void *data) +{ + conf_t *conf = data; + r10bio_t *r10_bio; + int size = offsetof(struct r10bio_s, devs[conf->copies]); + + /* allocate a r10bio with room for raid_disks entries in the bios array */ + r10_bio = kmalloc(size, gfp_flags); + if (r10_bio) + memset(r10_bio, 0, size); + else + unplug_slaves(conf->mddev); + + return r10_bio; +} + +static void r10bio_pool_free(void *r10_bio, void *data) +{ + kfree(r10_bio); +} + +#define RESYNC_BLOCK_SIZE (64*1024) +//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +#define RESYNC_WINDOW (2048*1024) + +/* + * When performing a resync, we need to read and compare, so + * we need as many pages are there are copies. + * When performing a recovery, we need 2 bios, one for read, + * one for write (we recover only one drive per r10buf) + * + */ +static void * r10buf_pool_alloc(int gfp_flags, void *data) +{ + conf_t *conf = data; + struct page *page; + r10bio_t *r10_bio; + struct bio *bio; + int i, j; + int nalloc; + + r10_bio = r10bio_pool_alloc(gfp_flags, conf); + if (!r10_bio) { + unplug_slaves(conf->mddev); + return NULL; + } + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + /* + * Allocate bios. + */ + for (j = nalloc ; j-- ; ) { + bio = bio_alloc(gfp_flags, RESYNC_PAGES); + if (!bio) + goto out_free_bio; + r10_bio->devs[j].bio = bio; + } + /* + * Allocate RESYNC_PAGES data pages and attach them + * where needed. 
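The layout word described above is decoded exactly this way by run() near the end of this file: near_copies from the low byte, far_copies from the next byte, anything in the upper bits rejected. A small stand-alone model of that decoding and of the sanity check run() applies (plain userspace C, example values only):

    #include <stdio.h>

    int main(void)
    {
            unsigned int layout     = 0x0102;  /* example: far_copies = 1, near_copies = 2 */
            unsigned int raid_disks = 4;

            unsigned int nc     = layout & 255;         /* near copies                */
            unsigned int fc     = (layout >> 8) & 255;  /* far copies                 */
            unsigned int copies = nc * fc;              /* total copies of each chunk */

            /* run() refuses fewer than 2 copies, more copies than disks,
             * or any bits set above the low 16. */
            if (copies < 2 || copies > raid_disks || (layout >> 16))
                    printf("unsupported raid10 layout 0x%x\n", layout);
            else
                    printf("near=%u far=%u -> %u copies across %u disks\n",
                           nc, fc, copies, raid_disks);
            return 0;
    }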
+ */ + for (j = 0 ; j < nalloc; j++) { + bio = r10_bio->devs[j].bio; + for (i = 0; i < RESYNC_PAGES; i++) { + page = alloc_page(gfp_flags); + if (unlikely(!page)) + goto out_free_pages; + + bio->bi_io_vec[i].bv_page = page; + } + } + + return r10_bio; + +out_free_pages: + for ( ; i > 0 ; i--) + __free_page(bio->bi_io_vec[i-1].bv_page); + while (j--) + for (i = 0; i < RESYNC_PAGES ; i++) + __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); + j = -1; +out_free_bio: + while ( ++j < nalloc ) + bio_put(r10_bio->devs[j].bio); + r10bio_pool_free(r10_bio, conf); + return NULL; +} + +static void r10buf_pool_free(void *__r10_bio, void *data) +{ + int i; + conf_t *conf = data; + r10bio_t *r10bio = __r10_bio; + int j; + + for (j=0; j < conf->copies; j++) { + struct bio *bio = r10bio->devs[j].bio; + if (bio) { + for (i = 0; i < RESYNC_PAGES; i++) { + __free_page(bio->bi_io_vec[i].bv_page); + bio->bi_io_vec[i].bv_page = NULL; + } + bio_put(bio); + } + } + r10bio_pool_free(r10bio, conf); +} + +static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) +{ + int i; + + for (i = 0; i < conf->copies; i++) { + struct bio **bio = & r10_bio->devs[i].bio; + if (*bio) + bio_put(*bio); + *bio = NULL; + } +} + +static inline void free_r10bio(r10bio_t *r10_bio) +{ + unsigned long flags; + + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + spin_lock_irqsave(&conf->resync_lock, flags); + if (!--conf->nr_pending) { + wake_up(&conf->wait_idle); + wake_up(&conf->wait_resume); + } + spin_unlock_irqrestore(&conf->resync_lock, flags); + + put_all_bios(conf, r10_bio); + mempool_free(r10_bio, conf->r10bio_pool); +} + +static inline void put_buf(r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(r10_bio->mddev); + unsigned long flags; + + mempool_free(r10_bio, conf->r10buf_pool); + + spin_lock_irqsave(&conf->resync_lock, flags); + if (!conf->barrier) + BUG(); + --conf->barrier; + wake_up(&conf->wait_resume); + wake_up(&conf->wait_idle); + + if (!--conf->nr_pending) { + wake_up(&conf->wait_idle); + wake_up(&conf->wait_resume); + } + spin_unlock_irqrestore(&conf->resync_lock, flags); +} + +static void reschedule_retry(r10bio_t *r10_bio) +{ + unsigned long flags; + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev_to_conf(mddev); + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&r10_bio->retry_list, &conf->retry_list); + spin_unlock_irqrestore(&conf->device_lock, flags); + + md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid_end_bio_io(r10bio_t *r10_bio) +{ + struct bio *bio = r10_bio->master_bio; + + bio_endio(bio, bio->bi_size, + test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); + free_r10bio(r10_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. 
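The out_free_pages/out_free_bio labels above are the usual goto-unwind idiom: on failure, free exactly what has been allocated so far, in reverse, and report the failure to the mempool layer. A minimal runnable illustration of the same idiom outside the kernel (names and sizes are made up for the example):

    #include <stdio.h>
    #include <stdlib.h>

    #define NBUF 4

    /* Allocate NBUF buffers or nothing at all, unwinding partial work on
     * failure just as r10buf_pool_alloc() does with its out_free_* labels. */
    static char **alloc_all(size_t sz)
    {
            char **v = calloc(NBUF, sizeof(*v));
            int i;

            if (!v)
                    return NULL;
            for (i = 0; i < NBUF; i++) {
                    v[i] = malloc(sz);
                    if (!v[i])
                            goto out_free;
            }
            return v;

    out_free:
            while (i--)                     /* free only what was really allocated */
                    free(v[i]);
            free(v);
            return NULL;
    }

    int main(void)
    {
            char **v = alloc_all(4096);
            int i;

            if (v) {
                    printf("all buffers allocated\n");
                    for (i = 0; i < NBUF; i++)
                            free(v[i]);
                    free(v);
            } else {
                    printf("allocation failed, nothing leaked\n");
            }
            return 0;
    }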
+ */ +static inline void update_head_pos(int slot, r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + conf->mirrors[r10_bio->devs[slot].devnum].head_position = + r10_bio->devs[slot].addr + (r10_bio->sectors); +} + +static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + int slot, dev; + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + if (bio->bi_size) + return 1; + + slot = r10_bio->read_slot; + dev = r10_bio->devs[slot].devnum; + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error(r10_bio->mddev, conf->mirrors[dev].rdev); + else + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + set_bit(R10BIO_Uptodate, &r10_bio->state); + + update_head_pos(slot, r10_bio); + + /* + * we have only one bio on the read side + */ + if (uptodate) + raid_end_bio_io(r10_bio); + else { + /* + * oops, read error: + */ + char b[BDEVNAME_SIZE]; + if (printk_ratelimit()) + printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", + bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); + reschedule_retry(r10_bio); + } + + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + return 0; +} + +static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + int slot, dev; + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + if (bio->bi_size) + return 1; + + for (slot = 0; slot < conf->copies; slot++) + if (r10_bio->devs[slot].bio == bio) + break; + dev = r10_bio->devs[slot].devnum; + + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error(r10_bio->mddev, conf->mirrors[dev].rdev); + else + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code for to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + set_bit(R10BIO_Uptodate, &r10_bio->state); + + update_head_pos(slot, r10_bio); + + /* + * + * Let's see if all mirrored write operations have finished + * already. + */ + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_write_end(r10_bio->mddev); + raid_end_bio_io(r10_bio); + } + + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + return 0; +} + + +/* + * RAID10 layout manager + * Aswell as the chunksize and raid_disks count, there are two + * parameters: near_copies and far_copies. + * near_copies * far_copies must be <= raid_disks. + * Normally one of these will be 1. + * If both are 1, we get raid0. + * If near_copies == raid_disks, we get raid1. + * + * Chunks are layed out in raid0 style with near_copies copies of the + * first chunk, followed by near_copies copies of the next chunk and + * so on. + * If far_copies > 1, then after 1/far_copies of the array has been assigned + * as described above, we start again with a device offset of near_copies. 
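The layout description continues below; as a concrete illustration of the rules given so far, here is where chunks C0, C1, ... land on a 4-disk array (worked out by hand from the description, so worth cross-checking against raid10_find_phys() below). With near_copies=2, far_copies=1:

            disk0  disk1  disk2  disk3
    row 0:   C0     C0     C1     C1
    row 1:   C2     C2     C3     C3
    row 2:   C4     C4     C5     C5

With near_copies=1, far_copies=2, each disk is split into two sections; the second section repeats the data with the start device rotated by near_copies:

    first section (plain raid0 layout):
            disk0  disk1  disk2  disk3
    row 0:   C0     C1     C2     C3
    row 1:   C4     C5     C6     C7

    second section (same chunks, start device shifted by 1):
    row 0:   C3     C0     C1     C2
    row 1:   C7     C4     C5     C6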
+ * So we effectively have another copy of the whole array further down all + * the drives, but with blocks on different drives. + * With this layout, and block is never stored twice on the one device. + * + * raid10_find_phys finds the sector offset of a given virtual sector + * on each device that it is on. If a block isn't on a device, + * that entry in the array is set to MaxSector. + * + * raid10_find_virt does the reverse mapping, from a device and a + * sector offset to a virtual address + */ + +static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) +{ + int n,f; + sector_t sector; + sector_t chunk; + sector_t stripe; + int dev; + + int slot = 0; + + /* now calculate first sector/dev */ + chunk = r10bio->sector >> conf->chunk_shift; + sector = r10bio->sector & conf->chunk_mask; + + chunk *= conf->near_copies; + stripe = chunk; + dev = sector_div(stripe, conf->raid_disks); + + sector += stripe << conf->chunk_shift; + + /* and calculate all the others */ + for (n=0; n < conf->near_copies; n++) { + int d = dev; + sector_t s = sector; + r10bio->devs[slot].addr = sector; + r10bio->devs[slot].devnum = d; + slot++; + + for (f = 1; f < conf->far_copies; f++) { + d += conf->near_copies; + if (d >= conf->raid_disks) + d -= conf->raid_disks; + s += conf->stride; + r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; + slot++; + } + dev++; + if (dev >= conf->raid_disks) { + dev = 0; + sector += (conf->chunk_mask + 1); + } + } + BUG_ON(slot != conf->copies); +} + +static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) +{ + sector_t offset, chunk, vchunk; + + while (sector > conf->stride) { + sector -= conf->stride; + if (dev < conf->near_copies) + dev += conf->raid_disks - conf->near_copies; + else + dev -= conf->near_copies; + } + + offset = sector & conf->chunk_mask; + chunk = sector >> conf->chunk_shift; + vchunk = chunk * conf->raid_disks + dev; + sector_div(vchunk, conf->near_copies); + return (vchunk << conf->chunk_shift) + offset; +} + +/** + * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged + * @q: request queue + * @bio: the buffer head that's been built up so far + * @biovec: the request that could be merged to it. + * + * Return amount of bytes we can accept at this offset + * If near_copies == raid_disk, there are no striping issues, + * but in that case, the function isn't called at all. + */ +static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, + struct bio_vec *bio_vec) +{ + mddev_t *mddev = q->queuedata; + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + int max; + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (max < 0) max = 0; /* bio_add cannot handle a negative return */ + if (max <= bio_vec->bv_len && bio_sectors == 0) + return bio_vec->bv_len; + else + return max; +} + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. + * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. 
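Stripped of the far-copy handling, raid10_find_phys() above is raid0 striping over "slots", where each virtual chunk occupies near_copies consecutive slots. A small userspace model of the first-copy mapping (far_copies == 1 assumed; simplified from, not equivalent to, the kernel code):

    #include <stdio.h>

    struct mapping { unsigned int dev; unsigned long long dev_sector; };

    /* chunk_shift is log2 of the chunk size in sectors. */
    static struct mapping map_first_copy(unsigned long long virt_sector,
                                         unsigned int chunk_shift,
                                         unsigned int raid_disks,
                                         unsigned int near_copies)
    {
            unsigned long long chunk  = virt_sector >> chunk_shift;
            unsigned long long offset = virt_sector & ((1ULL << chunk_shift) - 1);
            unsigned long long slot   = chunk * near_copies;  /* raid0 slot; the chunk is
                                                               * repeated on the next
                                                               * near_copies-1 devices   */
            struct mapping m;

            m.dev        = slot % raid_disks;
            m.dev_sector = ((slot / raid_disks) << chunk_shift) + offset;
            return m;
    }

    int main(void)
    {
            /* 64 KiB chunks = 128 sectors (shift 7), 4 disks, 2 near copies */
            struct mapping m = map_first_copy(300, 7, 4, 2);

            printf("virtual sector 300 -> disk %u, sector %llu\n", m.dev, m.dev_sector);
            return 0;   /* prints disk 0, sector 172 */
    }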
+ * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ + +/* + * FIXME: possibly should rethink readbalancing and do it differently + * depending on near_copies / far_copies geometry. + */ +static int read_balance(conf_t *conf, r10bio_t *r10_bio) +{ + const unsigned long this_sector = r10_bio->sector; + int disk, slot, nslot; + const int sectors = r10_bio->sectors; + sector_t new_distance, current_distance; + + raid10_find_phys(conf, r10_bio); + spin_lock_irq(&conf->device_lock); + /* + * Check if we can balance. We can balance on the whole + * device if no resync is going on, or below the resync window. + * We take the first readable disk when above the resync window. + */ + if (conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) { + /* make sure that disk is operational */ + slot = 0; + disk = r10_bio->devs[slot].devnum; + + while (!conf->mirrors[disk].rdev || + !conf->mirrors[disk].rdev->in_sync) { + slot++; + if (slot == conf->copies) { + slot = 0; + disk = -1; + break; + } + disk = r10_bio->devs[slot].devnum; + } + goto rb_out; + } + + + /* make sure the disk is operational */ + slot = 0; + disk = r10_bio->devs[slot].devnum; + while (!conf->mirrors[disk].rdev || + !conf->mirrors[disk].rdev->in_sync) { + slot ++; + if (slot == conf->copies) { + disk = -1; + goto rb_out; + } + disk = r10_bio->devs[slot].devnum; + } + + + current_distance = abs(this_sector - conf->mirrors[disk].head_position); + + /* Find the disk whose head is closest */ + + for (nslot = slot; nslot < conf->copies; nslot++) { + int ndisk = r10_bio->devs[nslot].devnum; + + + if (!conf->mirrors[ndisk].rdev || + !conf->mirrors[ndisk].rdev->in_sync) + continue; + + if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) { + disk = ndisk; + slot = nslot; + break; + } + new_distance = abs(r10_bio->devs[nslot].addr - + conf->mirrors[ndisk].head_position); + if (new_distance < current_distance) { + current_distance = new_distance; + disk = ndisk; + slot = nslot; + } + } + +rb_out: + r10_bio->read_slot = slot; +/* conf->next_seq_sect = this_sector + sectors;*/ + + if (disk >= 0 && conf->mirrors[disk].rdev) + atomic_inc(&conf->mirrors[disk].rdev->nr_pending); + spin_unlock_irq(&conf->device_lock); + + return disk; +} + +static void unplug_slaves(mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + int i; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + for (i=0; iraid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && atomic_read(&rdev->nr_pending)) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + + atomic_inc(&rdev->nr_pending); + spin_unlock_irqrestore(&conf->device_lock, flags); + + if (r_queue->unplug_fn) + r_queue->unplug_fn(r_queue); + + spin_lock_irqsave(&conf->device_lock, flags); + rdev_dec_pending(rdev, mddev); + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); +} +static void raid10_unplug(request_queue_t *q) +{ + unplug_slaves(q->queuedata); +} + +static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + unsigned long flags; + int i, ret = 0; + + spin_lock_irqsave(&conf->device_lock, flags); + for (i=0; iraid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + 
request_queue_t *r_queue = bdev_get_queue(bdev); + + if (r_queue->issue_flush_fn) { + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + return ret; +} + +/* + * Throttle resync depth, so that we can both get proper overlapping of + * requests, but are still able to handle normal requests quickly. + */ +#define RESYNC_DEPTH 32 + +static void device_barrier(conf_t *conf, sector_t sect) +{ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), + conf->resync_lock, unplug_slaves(conf->mddev)); + + if (!conf->barrier++) { + wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, + conf->resync_lock, unplug_slaves(conf->mddev)); + if (conf->nr_pending) + BUG(); + } + wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, + conf->resync_lock, unplug_slaves(conf->mddev)); + conf->next_resync = sect; + spin_unlock_irq(&conf->resync_lock); +} + +static int make_request(request_queue_t *q, struct bio * bio) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + mirror_info_t *mirror; + r10bio_t *r10_bio; + struct bio *read_bio; + int i; + int chunk_sects = conf->chunk_mask + 1; + + /* If this request crosses a chunk boundary, we need to + * split it. This will only happen for 1 PAGE (or less) requests. + */ + if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) + > chunk_sects && + conf->near_copies < conf->raid_disks)) { + struct bio_pair *bp; + /* Sanity check -- queue functions should prevent this happening */ + if (bio->bi_vcnt != 1 || + bio->bi_idx != 0) + goto bad_map; + /* This is a one page bio that upper layers + * refuse to split for us, so we need to split it. + */ + bp = bio_split(bio, bio_split_pool, + chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); + if (make_request(q, &bp->bio1)) + generic_make_request(&bp->bio1); + if (make_request(q, &bp->bio2)) + generic_make_request(&bp->bio2); + + bio_pair_release(bp); + return 0; + bad_map: + printk("raid10_make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", chunk_sects/2, + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + + bio_io_error(bio, bio->bi_size); + return 0; + } + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. 
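The split size that make_request() above hands to bio_split() is simply the number of sectors left before the next chunk boundary, so the first fragment ends exactly on the boundary and the second starts on it. A quick worked example with assumed values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long bi_sector   = 1020;  /* request start, in sectors  */
            unsigned int       bio_sectors = 8;     /* request length, in sectors */
            unsigned int       chunk_sects = 128;   /* 64 KiB chunks              */

            unsigned int into_chunk = bi_sector & (chunk_sects - 1);   /* 124 */

            if (into_chunk + bio_sectors > chunk_sects) {
                    unsigned int first = chunk_sects - into_chunk;     /* 4 sectors */
                    printf("split: %u sectors now, %u sectors in the second bio\n",
                           first, bio_sectors - first);
            } else {
                    printf("request fits inside one chunk, no split\n");
            }
            return 0;
    }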
+ */ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); + conf->nr_pending++; + spin_unlock_irq(&conf->resync_lock); + + if (bio_data_dir(bio)==WRITE) { + disk_stat_inc(mddev->gendisk, writes); + disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); + } else { + disk_stat_inc(mddev->gendisk, reads); + disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); + } + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio->bi_size >> 9; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector; + + if (bio_data_dir(bio) == READ) { + /* + * read balancing logic: + */ + int disk = read_balance(conf, r10_bio); + int slot = r10_bio->read_slot; + if (disk < 0) { + raid_end_bio_io(r10_bio); + return 0; + } + mirror = conf->mirrors + disk; + + read_bio = bio_clone(bio, GFP_NOIO); + + r10_bio->devs[slot].bio = read_bio; + + read_bio->bi_sector = r10_bio->devs[slot].addr + + mirror->rdev->data_offset; + read_bio->bi_bdev = mirror->rdev->bdev; + read_bio->bi_end_io = raid10_end_read_request; + read_bio->bi_rw = READ; + read_bio->bi_private = r10_bio; + + generic_make_request(read_bio); + return 0; + } + + /* + * WRITE: + */ + /* first select target devices under spinlock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + */ + raid10_find_phys(conf, r10_bio); + spin_lock_irq(&conf->device_lock); + for (i = 0; i < conf->copies; i++) { + int d = r10_bio->devs[i].devnum; + if (conf->mirrors[d].rdev && + !conf->mirrors[d].rdev->faulty) { + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + r10_bio->devs[i].bio = bio; + } else + r10_bio->devs[i].bio = NULL; + } + spin_unlock_irq(&conf->device_lock); + + atomic_set(&r10_bio->remaining, 1); + md_write_start(mddev); + for (i = 0; i < conf->copies; i++) { + struct bio *mbio; + int d = r10_bio->devs[i].devnum; + if (!r10_bio->devs[i].bio) + continue; + + mbio = bio_clone(bio, GFP_NOIO); + r10_bio->devs[i].bio = mbio; + + mbio->bi_sector = r10_bio->devs[i].addr+ + conf->mirrors[d].rdev->data_offset; + mbio->bi_bdev = conf->mirrors[d].rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE; + mbio->bi_private = r10_bio; + + atomic_inc(&r10_bio->remaining); + generic_make_request(mbio); + } + + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_write_end(mddev); + raid_end_bio_io(r10_bio); + } + + return 0; +} + +static void status(struct seq_file *seq, mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + int i; + + if (conf->near_copies < conf->raid_disks) + seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); + if (conf->near_copies > 1) + seq_printf(seq, " %d near-copies", conf->near_copies); + if (conf->far_copies > 1) + seq_printf(seq, " %d far-copies", conf->far_copies); + + seq_printf(seq, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + seq_printf(seq, "%s", + conf->mirrors[i].rdev && + conf->mirrors[i].rdev->in_sync ? "U" : "_"); + seq_printf(seq, "]"); +} + +static void error(mddev_t *mddev, mdk_rdev_t *rdev) +{ + char b[BDEVNAME_SIZE]; + conf_t *conf = mddev_to_conf(mddev); + + /* + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know. + * else mark the drive as failed + */ + if (rdev->in_sync + && conf->working_disks == 1) + /* + * Don't fail the drive, just return an IO error. 
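The write path above primes r10_bio->remaining with 1 before the clone loop so the parent cannot be completed while clones are still being issued; only the final atomic_dec_and_test(), either by the last clone's end_io or by the submitter dropping its bias, ends the master bio. A stand-alone model of that counting trick, using C11 atomics in place of the kernel's atomic_t (single-threaded simulation, illustration only):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int remaining;

    static void complete_parent(void) { printf("parent bio completed\n"); }

    /* Runs once per finished clone and once for the submitter's bias. */
    static void drop_ref(void)
    {
            if (atomic_fetch_sub(&remaining, 1) == 1)   /* we held the last reference */
                    complete_parent();
    }

    int main(void)
    {
            int i, nclones = 3;

            atomic_store(&remaining, 1);        /* bias: submitter holds a reference  */

            for (i = 0; i < nclones; i++) {
                    atomic_fetch_add(&remaining, 1);
                    /* generic_make_request(clone) would go here; the clone may
                     * even finish before this loop does */
                    drop_ref();                 /* simulate that clone completing     */
            }

            drop_ref();                         /* submitter drops its bias reference */
            return 0;
    }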
+ * The test should really be more sophisticated than + * "working_disks == 1", but it isn't critical, and + * can wait until we do more sophisticated "is the drive + * really dead" tests... + */ + return; + if (rdev->in_sync) { + mddev->degraded++; + conf->working_disks--; + /* + * if recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + } + rdev->in_sync = 0; + rdev->faulty = 1; + mddev->sb_dirty = 1; + printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" + " Operation continuing on %d devices\n", + bdevname(rdev->bdev,b), conf->working_disks); +} + +static void print_conf(conf_t *conf) +{ + int i; + mirror_info_t *tmp; + + printk("RAID10 conf printout:\n"); + if (!conf) { + printk("(!conf)\n"); + return; + } + printk(" --- wd:%d rd:%d\n", conf->working_disks, + conf->raid_disks); + + for (i = 0; i < conf->raid_disks; i++) { + char b[BDEVNAME_SIZE]; + tmp = conf->mirrors + i; + if (tmp->rdev) + printk(" disk %d, wo:%d, o:%d, dev:%s\n", + i, !tmp->rdev->in_sync, !tmp->rdev->faulty, + bdevname(tmp->rdev->bdev,b)); + } +} + +static void close_sync(conf_t *conf) +{ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_resume, !conf->barrier, + conf->resync_lock, unplug_slaves(conf->mddev)); + spin_unlock_irq(&conf->resync_lock); + + if (conf->barrier) BUG(); + if (waitqueue_active(&conf->wait_idle)) BUG(); + + mempool_destroy(conf->r10buf_pool); + conf->r10buf_pool = NULL; +} + +static int raid10_spare_active(mddev_t *mddev) +{ + int i; + conf_t *conf = mddev->private; + mirror_info_t *tmp; + + spin_lock_irq(&conf->device_lock); + /* + * Find all non-in_sync disks within the RAID10 configuration + * and mark them in_sync + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if (tmp->rdev + && !tmp->rdev->faulty + && !tmp->rdev->in_sync) { + conf->working_disks++; + mddev->degraded--; + tmp->rdev->in_sync = 1; + } + } + spin_unlock_irq(&conf->device_lock); + + print_conf(conf); + return 0; +} + + +static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + conf_t *conf = mddev->private; + int found = 0; + int mirror; + mirror_info_t *p; + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is + * very different from resync + */ + return 0; + spin_lock_irq(&conf->device_lock); + for (mirror=0; mirror < mddev->raid_disks; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + p->rdev = rdev; + + blk_queue_stack_limits(mddev->queue, + rdev->bdev->bd_disk->queue); + /* as we don't honour merge_bvec_fn, we must never risk + * violating it, so limit ->max_sector to one PAGE, as + * a one page request is never in violation. 
+ */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn && + mddev->queue->max_sectors > (PAGE_SIZE>>9)) + mddev->queue->max_sectors = (PAGE_SIZE>>9); + + p->head_position = 0; + rdev->raid_disk = mirror; + found = 1; + break; + } + spin_unlock_irq(&conf->device_lock); + + print_conf(conf); + return found; +} + +static int raid10_remove_disk(mddev_t *mddev, int number) +{ + conf_t *conf = mddev->private; + int err = 1; + mirror_info_t *p = conf->mirrors+ number; + + print_conf(conf); + spin_lock_irq(&conf->device_lock); + if (p->rdev) { + if (p->rdev->in_sync || + atomic_read(&p->rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + p->rdev = NULL; + err = 0; + } + if (err) + MD_BUG(); +abort: + spin_unlock_irq(&conf->device_lock); + + print_conf(conf); + return err; +} + + +static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + conf_t *conf = mddev_to_conf(r10_bio->mddev); + int i,d; + + if (bio->bi_size) + return 1; + + for (i=0; icopies; i++) + if (r10_bio->devs[i].bio == bio) + break; + if (i == conf->copies) + BUG(); + update_head_pos(i, r10_bio); + d = r10_bio->devs[i].devnum; + if (!uptodate) + md_error(r10_bio->mddev, + conf->mirrors[d].rdev); + + /* for reconstruct, we always reschedule after a read. + * for resync, only after all reads + */ + if (test_bit(R10BIO_IsRecover, &r10_bio->state) || + atomic_dec_and_test(&r10_bio->remaining)) { + /* we have read all the blocks, + * do the comparison in process context in raid10d + */ + reschedule_retry(r10_bio); + } + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); + return 0; +} + +static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev_to_conf(mddev); + int i,d; + + if (bio->bi_size) + return 1; + + for (i = 0; i < conf->copies; i++) + if (r10_bio->devs[i].bio == bio) + break; + d = r10_bio->devs[i].devnum; + + if (!uptodate) + md_error(mddev, conf->mirrors[d].rdev); + update_head_pos(i, r10_bio); + + while (atomic_dec_and_test(&r10_bio->remaining)) { + if (r10_bio->master_bio == NULL) { + /* the primary of several recovery bios */ + md_done_sync(mddev, r10_bio->sectors, 1); + put_buf(r10_bio); + break; + } else { + r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; + put_buf(r10_bio); + r10_bio = r10_bio2; + } + } + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + return 0; +} + +/* + * Note: sync and recover and handled very differently for raid10 + * This code is for resync. + * For resync, we read through virtual addresses and read all blocks. + * If there is any error, we schedule a write. The lowest numbered + * drive is authoritative. + * However requests come for physical address, so we need to map. + * For every physical address there are raid_disks/copies virtual addresses, + * which is always are least one, but is not necessarly an integer. + * This means that a physical address can span multiple chunks, so we may + * have to submit multiple io requests for a single sync request. 
+ */ +/* + * We check if all blocks are in-sync and only write to blocks that + * aren't in sync + */ +static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(mddev); + int i, first; + struct bio *tbio, *fbio; + + atomic_set(&r10_bio->remaining, 1); + + /* find the first device with a block */ + for (i=0; icopies; i++) + if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) + break; + + if (i == conf->copies) + goto done; + + first = i; + fbio = r10_bio->devs[i].bio; + + /* now find blocks with errors */ + for (i=first+1 ; i < conf->copies ; i++) { + int vcnt, j, d; + + if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) + continue; + /* We know that the bi_io_vec layout is the same for + * both 'first' and 'i', so we just compare them. + * All vec entries are PAGE_SIZE; + */ + tbio = r10_bio->devs[i].bio; + vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); + for (j = 0; j < vcnt; j++) + if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), + page_address(tbio->bi_io_vec[j].bv_page), + PAGE_SIZE)) + break; + if (j == vcnt) + continue; + /* Ok, we need to write this bio + * First we need to fixup bv_offset, bv_len and + * bi_vecs, as the read request might have corrupted these + */ + tbio->bi_vcnt = vcnt; + tbio->bi_size = r10_bio->sectors << 9; + tbio->bi_idx = 0; + tbio->bi_phys_segments = 0; + tbio->bi_hw_segments = 0; + tbio->bi_hw_front_size = 0; + tbio->bi_hw_back_size = 0; + tbio->bi_flags &= ~(BIO_POOL_MASK - 1); + tbio->bi_flags |= 1 << BIO_UPTODATE; + tbio->bi_next = NULL; + tbio->bi_rw = WRITE; + tbio->bi_private = r10_bio; + tbio->bi_sector = r10_bio->devs[i].addr; + + for (j=0; j < vcnt ; j++) { + tbio->bi_io_vec[j].bv_offset = 0; + tbio->bi_io_vec[j].bv_len = PAGE_SIZE; + + memcpy(page_address(tbio->bi_io_vec[j].bv_page), + page_address(fbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + } + tbio->bi_end_io = end_sync_write; + + d = r10_bio->devs[i].devnum; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); + + generic_make_request(tbio); + } + +done: + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_done_sync(mddev, r10_bio->sectors, 1); + put_buf(r10_bio); + } +} + +/* + * Now for the recovery code. + * Recovery happens across physical sectors. + * We recover all non-is_sync drives by finding the virtual address of + * each, and then choose a working drive that also has that virt address. + * There is a separate r10_bio for each non-in_sync drive. + * Only the first two slots are in use. The first for reading, + * The second for writing. + * + */ + +static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(mddev); + int i, d; + struct bio *bio, *wbio; + + + /* move the pages across to the second bio + * and submit the write request + */ + bio = r10_bio->devs[0].bio; + wbio = r10_bio->devs[1].bio; + for (i=0; i < wbio->bi_vcnt; i++) { + struct page *p = bio->bi_io_vec[i].bv_page; + bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; + wbio->bi_io_vec[i].bv_page = p; + } + d = r10_bio->devs[1].devnum; + + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); + generic_make_request(wbio); +} + + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. 
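The compare loop in sync_request_write() above walks vcnt = sectors >> (PAGE_SHIFT-9) page-sized vectors per copy. Assuming 4 KiB pages, a full 64 KiB resync buffer (RESYNC_BLOCK_SIZE, defined earlier) is 128 sectors and therefore 16 vectors, matching RESYNC_PAGES, so the memcmp() really does cover the whole buffer. The arithmetic, spelled out:

    #include <stdio.h>

    int main(void)
    {
            unsigned int resync_block = 64 * 1024;   /* RESYNC_BLOCK_SIZE */
            unsigned int page_size    = 4096;        /* assumed PAGE_SIZE */
            unsigned int page_shift   = 12;          /* log2(page_size)   */

            unsigned int sectors = resync_block >> 9;             /* 128 sectors     */
            unsigned int vcnt    = sectors >> (page_shift - 9);   /* 16 page vectors */
            unsigned int pages   = (resync_block + page_size - 1) / page_size;

            printf("sectors=%u vcnt=%u RESYNC_PAGES=%u\n", sectors, vcnt, pages);
            return 0;
    }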
+ */ + +static void raid10d(mddev_t *mddev) +{ + r10bio_t *r10_bio; + struct bio *bio; + unsigned long flags; + conf_t *conf = mddev_to_conf(mddev); + struct list_head *head = &conf->retry_list; + int unplug=0; + mdk_rdev_t *rdev; + + md_check_recovery(mddev); + md_handle_safemode(mddev); + + for (;;) { + char b[BDEVNAME_SIZE]; + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) + break; + r10_bio = list_entry(head->prev, r10bio_t, retry_list); + list_del(head->prev); + spin_unlock_irqrestore(&conf->device_lock, flags); + + mddev = r10_bio->mddev; + conf = mddev_to_conf(mddev); + if (test_bit(R10BIO_IsSync, &r10_bio->state)) { + sync_request_write(mddev, r10_bio); + unplug = 1; + } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { + recovery_request_write(mddev, r10_bio); + unplug = 1; + } else { + int mirror; + bio = r10_bio->devs[r10_bio->read_slot].bio; + r10_bio->devs[r10_bio->read_slot].bio = NULL; + bio_put(bio); + mirror = read_balance(conf, r10_bio); + if (mirror == -1) { + printk(KERN_ALERT "raid10: %s: unrecoverable I/O" + " read error for block %llu\n", + bdevname(bio->bi_bdev,b), + (unsigned long long)r10_bio->sector); + raid_end_bio_io(r10_bio); + } else { + rdev = conf->mirrors[mirror].rdev; + if (printk_ratelimit()) + printk(KERN_ERR "raid10: %s: redirecting sector %llu to" + " another mirror\n", + bdevname(rdev->bdev,b), + (unsigned long long)r10_bio->sector); + bio = bio_clone(r10_bio->master_bio, GFP_NOIO); + r10_bio->devs[r10_bio->read_slot].bio = bio; + bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_rw = READ; + bio->bi_private = r10_bio; + bio->bi_end_io = raid10_end_read_request; + unplug = 1; + generic_make_request(bio); + } + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (unplug) + unplug_slaves(mddev); +} + + +static int init_resync(conf_t *conf) +{ + int buffs; + + buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; + if (conf->r10buf_pool) + BUG(); + conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); + if (!conf->r10buf_pool) + return -ENOMEM; + conf->next_resync = 0; + return 0; +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * + * This is achieved by tracking pending requests and a 'barrier' concept + * that can be installed to exclude normal IO requests. + * + * Resync and recovery are handled very differently. + * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. + * + * For resync, we iterate over virtual addresses, read all copies, + * and update if there are differences. If only one copy is live, + * skip it. + * For recovery, we iterate over physical addresses, read a good + * value for each non-in_sync drive, and over-write. + * + * So, for recovery we may have several outstanding complex requests for a + * given address, one for each out-of-sync device. We model this by allocating + * a number of r10_bio structures, one for each out-of-sync device. + * As we setup these structures, we collect all bio's together into a list + * which we then process collectively to add pages, and then process again + * to pass to generic_make_request. + * + * The r10_bio structures are linked using a borrowed master_bio pointer. + * This link is counted in ->remaining. 
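Two details of the machinery above, made concrete. First, init_resync() reserves RESYNC_WINDOW / RESYNC_BLOCK_SIZE = 2 MiB / 64 KiB = 32 resync buffers, which happens to equal the RESYNC_DEPTH barrier limit used for throttling. Second, the borrowed master_bio chain is unwound by end_sync_write() (earlier in this file) walking parent links while each reference count hits zero; a simplified stand-alone model of that walk (hypothetical struct, single-threaded, illustration only):

    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-in for r10bio_t: 'master' borrows the master_bio
     * pointer to reach the next element up the chain; the head has
     * master == NULL and one extra reference per child hanging off it. */
    struct rbio {
            int remaining;
            struct rbio *master;
    };

    static void completed(struct rbio *r)
    {
            while (--r->remaining == 0) {
                    if (r->master == NULL) {
                            /* head of the chain: whole complex operation done
                             * (md_done_sync() + put_buf() in the real code)  */
                            printf("whole recovery operation finished\n");
                            break;
                    }
                    /* this element is done: free it (put_buf()) and drop the
                     * reference it held on its parent, then keep walking up */
                    r = r->master;
            }
    }

    int main(void)
    {
            struct rbio head  = { .remaining = 2, .master = NULL };  /* own IO + 1 child */
            struct rbio child = { .remaining = 1, .master = &head };

            completed(&child);   /* child finishes: frees child, drops one ref on head */
            completed(&head);    /* head's own IO finishes: chain complete             */
            return 0;
    }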
When the r10_bio that points to NULL + * has its remaining count decremented to 0, the whole complex operation + * is complete. + * + */ + +static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) +{ + conf_t *conf = mddev_to_conf(mddev); + r10bio_t *r10_bio; + struct bio *biolist = NULL, *bio; + sector_t max_sector, nr_sectors; + int disk; + int i; + + sector_t sectors_skipped = 0; + int chunks_skipped = 0; + + if (!conf->r10buf_pool) + if (init_resync(conf)) + return -ENOMEM; + + skipped: + max_sector = mddev->size << 1; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_sector = mddev->resync_max_sectors; + if (sector_nr >= max_sector) { + close_sync(conf); + return sectors_skipped; + } + if (chunks_skipped >= conf->raid_disks) { + /* if there has been nothing to do on any drive, + * then there is nothing to do at all.. + */ + sector_t sec = max_sector - sector_nr; + md_done_sync(mddev, sec, 1); + return sec + sectors_skipped; + } + + /* make sure whole request will fit in a chunk - if chunks + * are meaningful + */ + if (conf->near_copies < conf->raid_disks && + max_sector > (sector_nr | conf->chunk_mask)) + max_sector = (sector_nr | conf->chunk_mask) + 1; + /* + * If there is non-resync activity waiting for us then + * put in a delay to throttle resync. + */ + if (!go_faster && waitqueue_active(&conf->wait_resume)) + schedule_timeout(HZ); + device_barrier(conf, sector_nr + RESYNC_SECTORS); + + /* Again, very different code for resync and recovery. + * Both must result in an r10bio with a list of bios that + * have bi_end_io, bi_sector, bi_bdev set, + * and bi_private set to the r10bio. + * For recovery, we may actually create several r10bios + * with 2 bios in each, that correspond to the bios in the main one. + * In this case, the subordinate r10bios link back through a + * borrowed master_bio pointer, and the counter in the master + * includes a ref from each subordinate. + */ + /* First, we decide what to do and set ->bi_end_io + * To end_sync_read if we want to read, and + * end_sync_write if we will want to write. + */ + + if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* recovery... 
the complicated one */ + int i, j, k; + r10_bio = NULL; + + for (i=0 ; iraid_disks; i++) + if (conf->mirrors[i].rdev && + !conf->mirrors[i].rdev->in_sync) { + /* want to reconstruct this device */ + r10bio_t *rb2 = r10_bio; + + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + spin_lock_irq(&conf->resync_lock); + conf->nr_pending++; + if (rb2) conf->barrier++; + spin_unlock_irq(&conf->resync_lock); + atomic_set(&r10_bio->remaining, 0); + + r10_bio->master_bio = (struct bio*)rb2; + if (rb2) + atomic_inc(&rb2->remaining); + r10_bio->mddev = mddev; + set_bit(R10BIO_IsRecover, &r10_bio->state); + r10_bio->sector = raid10_find_virt(conf, sector_nr, i); + raid10_find_phys(conf, r10_bio); + for (j=0; jcopies;j++) { + int d = r10_bio->devs[j].devnum; + if (conf->mirrors[d].rdev && + conf->mirrors[d].rdev->in_sync) { + /* This is where we read from */ + bio = r10_bio->devs[0].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_read; + bio->bi_rw = 0; + bio->bi_sector = r10_bio->devs[j].addr + + conf->mirrors[d].rdev->data_offset; + bio->bi_bdev = conf->mirrors[d].rdev->bdev; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + /* and we write to 'i' */ + + for (k=0; kcopies; k++) + if (r10_bio->devs[k].devnum == i) + break; + bio = r10_bio->devs[1].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = 1; + bio->bi_sector = r10_bio->devs[k].addr + + conf->mirrors[i].rdev->data_offset; + bio->bi_bdev = conf->mirrors[i].rdev->bdev; + + r10_bio->devs[0].devnum = d; + r10_bio->devs[1].devnum = i; + + break; + } + } + if (j == conf->copies) { + BUG(); + } + } + if (biolist == NULL) { + while (r10_bio) { + r10bio_t *rb2 = r10_bio; + r10_bio = (r10bio_t*) rb2->master_bio; + rb2->master_bio = NULL; + put_buf(rb2); + } + goto giveup; + } + } else { + /* resync. 
Schedule a read for every block at this virt offset */ + int count = 0; + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + + spin_lock_irq(&conf->resync_lock); + conf->nr_pending++; + spin_unlock_irq(&conf->resync_lock); + + r10_bio->mddev = mddev; + atomic_set(&r10_bio->remaining, 0); + + r10_bio->master_bio = NULL; + r10_bio->sector = sector_nr; + set_bit(R10BIO_IsSync, &r10_bio->state); + raid10_find_phys(conf, r10_bio); + r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; + spin_lock_irq(&conf->device_lock); + for (i=0; icopies; i++) { + int d = r10_bio->devs[i].devnum; + bio = r10_bio->devs[i].bio; + bio->bi_end_io = NULL; + if (conf->mirrors[d].rdev == NULL || + conf->mirrors[d].rdev->faulty) + continue; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_read; + bio->bi_rw = 0; + bio->bi_sector = r10_bio->devs[i].addr + + conf->mirrors[d].rdev->data_offset; + bio->bi_bdev = conf->mirrors[d].rdev->bdev; + count++; + } + spin_unlock_irq(&conf->device_lock); + if (count < 2) { + for (i=0; icopies; i++) { + int d = r10_bio->devs[i].devnum; + if (r10_bio->devs[i].bio->bi_end_io) + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + } + put_buf(r10_bio); + biolist = NULL; + goto giveup; + } + } + + for (bio = biolist; bio ; bio=bio->bi_next) { + + bio->bi_flags &= ~(BIO_POOL_MASK - 1); + if (bio->bi_end_io) + bio->bi_flags |= 1 << BIO_UPTODATE; + bio->bi_vcnt = 0; + bio->bi_idx = 0; + bio->bi_phys_segments = 0; + bio->bi_hw_segments = 0; + bio->bi_size = 0; + } + + nr_sectors = 0; + do { + struct page *page; + int len = PAGE_SIZE; + disk = 0; + if (sector_nr + (len>>9) > max_sector) + len = (max_sector - sector_nr) << 9; + if (len == 0) + break; + for (bio= biolist ; bio ; bio=bio->bi_next) { + page = bio->bi_io_vec[bio->bi_vcnt].bv_page; + if (bio_add_page(bio, page, len, 0) == 0) { + /* stop here */ + struct bio *bio2; + bio->bi_io_vec[bio->bi_vcnt].bv_page = page; + for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { + /* remove last page from this bio */ + bio2->bi_vcnt--; + bio2->bi_size -= len; + bio2->bi_flags &= ~(1<< BIO_SEG_VALID); + } + goto bio_full; + } + disk = i; + } + nr_sectors += len>>9; + sector_nr += len>>9; + } while (biolist->bi_vcnt < RESYNC_PAGES); + bio_full: + r10_bio->sectors = nr_sectors; + + while (biolist) { + bio = biolist; + biolist = biolist->bi_next; + + bio->bi_next = NULL; + r10_bio = bio->bi_private; + r10_bio->sectors = nr_sectors; + + if (bio->bi_end_io == end_sync_read) { + md_sync_acct(bio->bi_bdev, nr_sectors); + generic_make_request(bio); + } + } + + return sectors_skipped + nr_sectors; + giveup: + /* There is nowhere to write, so all non-sync + * drives must be failed, so try the next chunk... + */ + { + int sec = max_sector - sector_nr; + sectors_skipped += sec; + chunks_skipped ++; + sector_nr = max_sector; + md_done_sync(mddev, sec, 1); + goto skipped; + } +} + +static int run(mddev_t *mddev) +{ + conf_t *conf; + int i, disk_idx; + mirror_info_t *disk; + mdk_rdev_t *rdev; + struct list_head *tmp; + int nc, fc; + sector_t stride, size; + + if (mddev->level != 10) { + printk(KERN_ERR "raid10: %s: raid level not set correctly... 
(%d)\n", + mdname(mddev), mddev->level); + goto out; + } + nc = mddev->layout & 255; + fc = (mddev->layout >> 8) & 255; + if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || + (mddev->layout >> 16)) { + printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", + mdname(mddev), mddev->layout); + goto out; + } + /* + * copy the already verified devices into our private RAID10 + * bookkeeping area. [whatever we allocate in run(), + * should be freed in stop()] + */ + conf = kmalloc(sizeof(conf_t), GFP_KERNEL); + mddev->private = conf; + if (!conf) { + printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out; + } + memset(conf, 0, sizeof(*conf)); + conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, + GFP_KERNEL); + if (!conf->mirrors) { + printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out_free_conf; + } + memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); + + conf->near_copies = nc; + conf->far_copies = fc; + conf->copies = nc*fc; + conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; + conf->chunk_shift = ffz(~mddev->chunk_size) - 9; + stride = mddev->size >> (conf->chunk_shift-1); + sector_div(stride, fc); + conf->stride = stride << conf->chunk_shift; + + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, + r10bio_pool_free, conf); + if (!conf->r10bio_pool) { + printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out_free_conf; + } + mddev->queue->unplug_fn = raid10_unplug; + + mddev->queue->issue_flush_fn = raid10_issue_flush; + + ITERATE_RDEV(mddev, rdev, tmp) { + disk_idx = rdev->raid_disk; + if (disk_idx >= mddev->raid_disks + || disk_idx < 0) + continue; + disk = conf->mirrors + disk_idx; + + disk->rdev = rdev; + + blk_queue_stack_limits(mddev->queue, + rdev->bdev->bd_disk->queue); + /* as we don't honour merge_bvec_fn, we must never risk + * violating it, so limit ->max_sector to one PAGE, as + * a one page request is never in violation. + */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn && + mddev->queue->max_sectors > (PAGE_SIZE>>9)) + mddev->queue->max_sectors = (PAGE_SIZE>>9); + + disk->head_position = 0; + if (!rdev->faulty && rdev->in_sync) + conf->working_disks++; + } + conf->raid_disks = mddev->raid_disks; + conf->mddev = mddev; + conf->device_lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&conf->retry_list); + + conf->resync_lock = SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_idle); + init_waitqueue_head(&conf->wait_resume); + + if (!conf->working_disks) { + printk(KERN_ERR "raid10: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + + mddev->degraded = 0; + for (i = 0; i < conf->raid_disks; i++) { + + disk = conf->mirrors + i; + + if (!disk->rdev) { + disk->head_position = 0; + mddev->degraded++; + } + } + + + mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); + if (!mddev->thread) { + printk(KERN_ERR + "raid10: couldn't allocate thread for %s\n", + mdname(mddev)); + goto out_free_conf; + } + + printk(KERN_INFO + "raid10: raid set %s active with %d out of %d devices\n", + mdname(mddev), mddev->raid_disks - mddev->degraded, + mddev->raid_disks); + /* + * Ok, everything is just fine now + */ + size = conf->stride * conf->raid_disks; + sector_div(size, conf->near_copies); + mddev->array_size = size/2; + mddev->resync_max_sectors = size; + + /* Calculate max read-ahead size. + * We need to readahead at least twice a whole stripe.... + * maybe... 
+ */ + { + int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; + stripe /= conf->near_copies; + if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) + mddev->queue->backing_dev_info.ra_pages = 2* stripe; + } + + if (conf->near_copies < mddev->raid_disks) + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + return 0; + +out_free_conf: + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + if (conf->mirrors) + kfree(conf->mirrors); + kfree(conf); + mddev->private = NULL; +out: + return -EIO; +} + +static int stop(mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + + md_unregister_thread(mddev->thread); + mddev->thread = NULL; + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + if (conf->mirrors) + kfree(conf->mirrors); + kfree(conf); + mddev->private = NULL; + return 0; +} + + +static mdk_personality_t raid10_personality = +{ + .name = "raid10", + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid10_add_disk, + .hot_remove_disk= raid10_remove_disk, + .spare_active = raid10_spare_active, + .sync_request = sync_request, +}; + +static int __init raid_init(void) +{ + return register_md_personality(RAID10, &raid10_personality); +} + +static void raid_exit(void) +{ + unregister_md_personality(RAID10); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-9"); /* RAID10 */ diff -pruN ./drivers/md.dm/raid1.c ./drivers/md/raid1.c --- ./drivers/md.dm/raid1.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/raid1.c 2006-03-17 13:16:38.000000000 +0300 @@ -24,10 +24,6 @@ #include -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY - /* * Number of guaranteed r1bios in case of extreme VM load: */ @@ -44,13 +40,12 @@ static void * r1bio_pool_alloc(int gfp_f { struct pool_info *pi = data; r1bio_t *r1_bio; + int size = offsetof(r1bio_t, bios[pi->raid_disks]); /* allocate a r1bio with room for raid_disks entries in the bios array */ - r1_bio = kmalloc(sizeof(r1bio_t) + sizeof(struct bio*)*pi->raid_disks, - gfp_flags); + r1_bio = kmalloc(size, gfp_flags); if (r1_bio) - memset(r1_bio, 0, sizeof(*r1_bio) + - sizeof(struct bio*) * pi->raid_disks); + memset(r1_bio, 0, size); else unplug_slaves(pi->mddev); @@ -104,7 +99,7 @@ static void * r1buf_pool_alloc(int gfp_f bio->bi_io_vec[i].bv_page = page; } - r1_bio->master_bio = bio; + r1_bio->master_bio = NULL; return r1_bio; @@ -189,32 +184,6 @@ static inline void put_buf(r1bio_t *r1_b spin_unlock_irqrestore(&conf->resync_lock, flags); } -static int map(mddev_t *mddev, mdk_rdev_t **rdevp) -{ - conf_t *conf = mddev_to_conf(mddev); - int i, disks = conf->raid_disks; - - /* - * Later we do read balancing on the read side - * now we use the first available disk. 
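The raid1 read_balance() rework in the hunks that follow bumps rdev->nr_pending first and only then re-checks in_sync, retrying elsewhere if the device failed in between; the write path gets the same inc-then-recheck treatment for ->faulty. A stand-alone model of that "take the reference, then re-validate" pattern, using C11 atomics instead of the kernel's atomic_t (no real concurrency in this toy, it only shows the ordering):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct dev {
            atomic_int  nr_pending;   /* in-flight references                    */
            atomic_bool in_sync;      /* cleared asynchronously if 'dev' fails   */
    };

    /* Pin 'd' for an I/O.  Returns false if it failed while we were looking. */
    static bool pin_device(struct dev *d)
    {
            atomic_fetch_add(&d->nr_pending, 1);          /* publish our interest first  */
            if (!atomic_load(&d->in_sync)) {              /* ...then re-check the status */
                    atomic_fetch_sub(&d->nr_pending, 1);  /* raced with a failure: undo  */
                    return false;                         /* caller picks another device */
            }
            return true;
    }

    int main(void)
    {
            struct dev d;

            atomic_store(&d.nr_pending, 0);
            atomic_store(&d.in_sync, true);

            if (pin_device(&d))
                    printf("device pinned, nr_pending=%d\n",
                           atomic_load(&d.nr_pending));
            return 0;
    }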
- */ - - spin_lock_irq(&conf->device_lock); - for (i = 0; i < disks; i++) { - mdk_rdev_t *rdev = conf->mirrors[i].rdev; - if (rdev && rdev->in_sync) { - *rdevp = rdev; - atomic_inc(&rdev->nr_pending); - spin_unlock_irq(&conf->device_lock); - return i; - } - } - spin_unlock_irq(&conf->device_lock); - - printk(KERN_ERR "raid1_map(): huh, no more operational devices?\n"); - return -1; -} - static void reschedule_retry(r1bio_t *r1_bio) { unsigned long flags; @@ -292,8 +261,9 @@ static int raid1_end_read_request(struct * oops, read error: */ char b[BDEVNAME_SIZE]; - printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", - bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); + if (printk_ratelimit()) + printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", + bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); reschedule_retry(r1_bio); } @@ -363,12 +333,13 @@ static int raid1_end_write_request(struc * * The rdev for the device selected will have nr_pending incremented. */ -static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio) +static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const unsigned long this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; - const int sectors = bio->bi_size >> 9; + const int sectors = r1_bio->sectors; sector_t new_distance, current_distance; + mdk_rdev_t *new_rdev, *rdev; spin_lock_irq(&conf->device_lock); /* @@ -376,16 +347,17 @@ static int read_balance(conf_t *conf, st * device if no resync is going on, or below the resync window. * We take the first readable disk when above the resync window. */ + retry: if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { - /* make sure that disk is operational */ + /* Choose the first operation device, for consistancy */ new_disk = 0; - while (!conf->mirrors[new_disk].rdev || - !conf->mirrors[new_disk].rdev->in_sync) { + while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || + !new_rdev->in_sync) { new_disk++; if (new_disk == conf->raid_disks) { - new_disk = 0; + new_disk = -1; break; } } @@ -394,13 +366,13 @@ static int read_balance(conf_t *conf, st /* make sure the disk is operational */ - while (!conf->mirrors[new_disk].rdev || - !conf->mirrors[new_disk].rdev->in_sync) { + while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || + !new_rdev->in_sync) { if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) { - new_disk = conf->last_used; + new_disk = -1; goto rb_out; } } @@ -424,29 +396,38 @@ static int read_balance(conf_t *conf, st disk = conf->raid_disks; disk--; - if (!conf->mirrors[disk].rdev || - !conf->mirrors[disk].rdev->in_sync) + if ((rdev=conf->mirrors[disk].rdev) == NULL || + !rdev->in_sync) continue; - if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) { + if (!atomic_read(&rdev->nr_pending)) { new_disk = disk; + new_rdev = rdev; break; } new_distance = abs(this_sector - conf->mirrors[disk].head_position); if (new_distance < current_distance) { current_distance = new_distance; new_disk = disk; + new_rdev = rdev; } } while (disk != conf->last_used); rb_out: - r1_bio->read_disk = new_disk; - conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; - if (conf->mirrors[new_disk].rdev) - atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending); + if (new_disk >= 0) { + conf->next_seq_sect = this_sector + sectors; + conf->last_used = new_disk; + atomic_inc(&new_rdev->nr_pending); + if (!new_rdev->in_sync) { + /* cannot 
risk returning a device that failed + * before we inc'ed nr_pending + */ + atomic_dec(&new_rdev->nr_pending); + goto retry; + } + } spin_unlock_irq(&conf->device_lock); return new_disk; @@ -471,7 +452,7 @@ static void unplug_slaves(mddev_t *mddev r_queue->unplug_fn(r_queue); spin_lock_irqsave(&conf->device_lock, flags); - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -481,6 +462,32 @@ static void raid1_unplug(request_queue_t unplug_slaves(q->queuedata); } +static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + unsigned long flags; + int i, ret = 0; + + spin_lock_irqsave(&conf->device_lock, flags); + for (i=0; iraid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (r_queue->issue_flush_fn) { + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + return ret; +} + /* * Throttle resync depth, so that we can both get proper overlapping of * requests, but are still able to handle normal requests quickly. @@ -513,6 +520,7 @@ static int make_request(request_queue_t r1bio_t *r1_bio; struct bio *read_bio; int i, disks; + mdk_rdev_t *rdev; /* * Register the new request and wait if the reconstruction @@ -545,15 +553,26 @@ static int make_request(request_queue_t r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; + r1_bio->state = 0; + if (bio_data_dir(bio) == READ) { /* * read balancing logic: */ - mirror = conf->mirrors + read_balance(conf, bio, r1_bio); + int rdisk = read_balance(conf, r1_bio); + + if (rdisk < 0) { + /* couldn't find anywhere to read from */ + raid_end_bio_io(r1_bio); + return 0; + } + mirror = conf->mirrors + rdisk; + + r1_bio->read_disk = rdisk; read_bio = bio_clone(bio, GFP_NOIO); - r1_bio->bios[r1_bio->read_disk] = read_bio; + r1_bio->bios[rdisk] = read_bio; read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; @@ -575,10 +594,14 @@ static int make_request(request_queue_t disks = conf->raid_disks; spin_lock_irq(&conf->device_lock); for (i = 0; i < disks; i++) { - if (conf->mirrors[i].rdev && - !conf->mirrors[i].rdev->faulty) { - atomic_inc(&conf->mirrors[i].rdev->nr_pending); - r1_bio->bios[i] = bio; + if ((rdev=conf->mirrors[i].rdev) != NULL && + !rdev->faulty) { + atomic_inc(&rdev->nr_pending); + if (rdev->faulty) { + atomic_dec(&rdev->nr_pending); + r1_bio->bios[i] = NULL; + } else + r1_bio->bios[i] = bio; } else r1_bio->bios[i] = NULL; } @@ -746,7 +769,7 @@ static int raid1_add_disk(mddev_t *mddev */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; @@ -877,7 +900,7 @@ static void sync_request_write(mddev_t * atomic_inc(&conf->mirrors[i].rdev->nr_pending); atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev, wbio->bi_size >> 9); + md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); generic_make_request(wbio); } @@ -925,7 +948,7 @@ static void raid1d(mddev_t *mddev) } else { int disk; bio = r1_bio->bios[r1_bio->read_disk]; - if ((disk=map(mddev, &rdev)) == -1) { + if 
((disk=read_balance(conf, r1_bio)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" " read error for block %llu\n", bdevname(bio->bi_bdev,b), @@ -934,14 +957,20 @@ static void raid1d(mddev_t *mddev) } else { r1_bio->bios[r1_bio->read_disk] = NULL; r1_bio->read_disk = disk; + bio_put(bio); + bio = bio_clone(r1_bio->master_bio, GFP_NOIO); r1_bio->bios[r1_bio->read_disk] = bio; - printk(KERN_ERR "raid1: %s: redirecting sector %llu to" - " another mirror\n", - bdevname(rdev->bdev,b), - (unsigned long long)r1_bio->sector); - bio->bi_bdev = rdev->bdev; + rdev = conf->mirrors[disk].rdev; + if (printk_ratelimit()) + printk(KERN_ERR "raid1: %s: redirecting sector %llu to" + " another mirror\n", + bdevname(rdev->bdev,b), + (unsigned long long)r1_bio->sector); bio->bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_end_io = raid1_end_read_request; bio->bi_rw = READ; + bio->bi_private = r1_bio; unplug = 1; generic_make_request(bio); } @@ -1078,7 +1107,7 @@ static int sync_request(mddev_t *mddev, int rv = max_sector - sector_nr; md_done_sync(mddev, rv, 1); put_buf(r1_bio); - atomic_dec(&conf->mirrors[disk].rdev->nr_pending); + rdev_dec_pending(conf->mirrors[disk].rdev, mddev); return rv; } @@ -1117,7 +1146,7 @@ static int sync_request(mddev_t *mddev, bio = r1_bio->bios[disk]; r1_bio->sectors = nr_sectors; - md_sync_acct(mirror->rdev, nr_sectors); + md_sync_acct(mirror->rdev->bdev, nr_sectors); generic_make_request(bio); @@ -1168,6 +1197,7 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; + mddev->queue->issue_flush_fn = raid1_issue_flush; ITERATE_RDEV(mddev, rdev, tmp) { disk_idx = rdev->raid_disk; @@ -1186,7 +1216,7 @@ static int run(mddev_t *mddev) */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; if (!rdev->faulty && rdev->in_sync) @@ -1328,7 +1358,7 @@ static int raid1_reshape(mddev_t *mddev, if (conf->mirrors[d].rdev) return -EBUSY; - newpoolinfo = kmalloc(sizeof(newpoolinfo), GFP_KERNEL); + newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); if (!newpoolinfo) return -ENOMEM; newpoolinfo->mddev = mddev; diff -pruN ./drivers/md.dm/raid5.c ./drivers/md/raid5.c --- ./drivers/md.dm/raid5.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/raid5.c 2006-03-17 13:16:38.000000000 +0300 @@ -457,6 +457,7 @@ static void raid5_build_block (struct st bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; dev->req.bi_vcnt++; + dev->req.bi_max_vecs++; dev->vec.bv_page = dev->page; dev->vec.bv_len = STRIPE_SIZE; dev->vec.bv_offset = 0; @@ -477,8 +478,8 @@ static void error(mddev_t *mddev, mdk_rd if (!rdev->faulty) { mddev->sb_dirty = 1; - conf->working_disks--; if (rdev->in_sync) { + conf->working_disks--; mddev->degraded++; conf->failed_disks++; rdev->in_sync = 0; @@ -1071,7 +1072,8 @@ static void handle_stripe(struct stripe_ PRINTK("Reading block %d (sync=%d)\n", i, syncing); if (syncing) - md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS); + md_sync_acct(conf->disks[i].rdev->bdev, + STRIPE_SECTORS); } } } @@ -1256,7 +1258,7 @@ static void handle_stripe(struct stripe_ if (rdev) { if (test_bit(R5_Syncio, &sh->dev[i].flags)) - md_sync_acct(rdev, STRIPE_SECTORS); + md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; PRINTK("for %llu schedule op %ld on disc %d\n", @@ -1265,6 +1267,7 @@ static void handle_stripe(struct stripe_ bi->bi_sector = sh->sector + 
rdev->data_offset; bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; bi->bi_idx = 0; bi->bi_io_vec = &sh->dev[i].vec; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1316,7 +1319,7 @@ static void unplug_slaves(mddev_t *mddev r_queue->unplug_fn(r_queue); spin_lock_irqsave(&conf->device_lock, flags); - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1328,6 +1331,8 @@ static void raid5_unplug_device(request_ raid5_conf_t *conf = mddev_to_conf(mddev); unsigned long flags; + if (!conf) return; + spin_lock_irqsave(&conf->device_lock, flags); if (blk_remove_plug(q)) @@ -1339,6 +1344,39 @@ static void raid5_unplug_device(request_ unplug_slaves(mddev); } +static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<conf->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue; + + if (!bdev) + continue; + + r_queue = bdev_get_queue(bdev); + if (!r_queue) + continue; + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} + static inline void raid5_plug_device(raid5_conf_t *conf) { spin_lock_irq(&conf->device_lock); @@ -1545,6 +1583,7 @@ static int run (mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); mddev->queue->unplug_fn = raid5_unplug_device; + mddev->queue->issue_flush_fn = raid5_issue_flush; PRINTK("raid5: run(%s) called.\n", mdname(mddev)); diff -pruN ./drivers/md.dm/raid6main.c ./drivers/md/raid6main.c --- ./drivers/md.dm/raid6main.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/raid6main.c 2006-03-17 13:16:38.000000000 +0300 @@ -478,6 +478,7 @@ static void raid6_build_block (struct st bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; dev->req.bi_vcnt++; + dev->req.bi_max_vecs++; dev->vec.bv_page = dev->page; dev->vec.bv_len = STRIPE_SIZE; dev->vec.bv_offset = 0; @@ -498,8 +499,8 @@ static void error(mddev_t *mddev, mdk_rd if (!rdev->faulty) { mddev->sb_dirty = 1; - conf->working_disks--; if (rdev->in_sync) { + conf->working_disks--; mddev->degraded++; conf->failed_disks++; rdev->in_sync = 0; @@ -1208,7 +1209,8 @@ static void handle_stripe(struct stripe_ PRINTK("Reading block %d (sync=%d)\n", i, syncing); if (syncing) - md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS); + md_sync_acct(conf->disks[i].rdev->bdev, + STRIPE_SECTORS); } } } @@ -1418,7 +1420,7 @@ static void handle_stripe(struct stripe_ if (rdev) { if (test_bit(R5_Syncio, &sh->dev[i].flags)) - md_sync_acct(rdev, STRIPE_SECTORS); + md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; PRINTK("for %llu schedule op %ld on disc %d\n", @@ -1427,6 +1429,7 @@ static void handle_stripe(struct stripe_ bi->bi_sector = sh->sector + rdev->data_offset; bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; bi->bi_idx = 0; bi->bi_io_vec = &sh->dev[i].vec; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1478,7 +1481,7 @@ static void unplug_slaves(mddev_t *mddev r_queue->unplug_fn(r_queue); spin_lock_irqsave(&conf->device_lock, flags); - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1501,6 +1504,39 @@ static void raid6_unplug_device(request_ 
unplug_slaves(mddev); } +static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid6_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<conf->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue; + + if (!bdev) + continue; + + r_queue = bdev_get_queue(bdev); + if (!r_queue) + continue; + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} + static inline void raid6_plug_device(raid6_conf_t *conf) { spin_lock_irq(&conf->device_lock); @@ -1708,6 +1744,7 @@ static int run (mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); mddev->queue->unplug_fn = raid6_unplug_device; + mddev->queue->issue_flush_fn = raid6_issue_flush; PRINTK("raid6: run(%s) called.\n", mdname(mddev)); --- ./include/linux/compat_ioctl.h.dm 2006-03-17 08:58:47.000000000 +0300 +++ ./include/linux/compat_ioctl.h 2006-03-17 08:16:12.000000000 +0300 @@ -102,6 +102,7 @@ COMPATIBLE_IOCTL(BLKROGET) COMPATIBLE_IOCTL(BLKRRPART) COMPATIBLE_IOCTL(BLKFLSBUF) COMPATIBLE_IOCTL(BLKSECTSET) +COMPATIBLE_IOCTL(BLKSECTGET) COMPATIBLE_IOCTL(BLKSSZGET) ULONG_IOCTL(BLKRASET) ULONG_IOCTL(BLKFRASET) @@ -141,6 +142,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR_32) COMPATIBLE_IOCTL(DM_TABLE_DEPS_32) COMPATIBLE_IOCTL(DM_TABLE_STATUS_32) COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32) +COMPATIBLE_IOCTL(DM_TARGET_MSG_32) COMPATIBLE_IOCTL(DM_VERSION) COMPATIBLE_IOCTL(DM_REMOVE_ALL) COMPATIBLE_IOCTL(DM_LIST_DEVICES) @@ -155,6 +157,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR) COMPATIBLE_IOCTL(DM_TABLE_DEPS) COMPATIBLE_IOCTL(DM_TABLE_STATUS) COMPATIBLE_IOCTL(DM_LIST_VERSIONS) +COMPATIBLE_IOCTL(DM_TARGET_MSG) /* Big K */ COMPATIBLE_IOCTL(PIO_FONT) COMPATIBLE_IOCTL(GIO_FONT) @@ -387,6 +390,7 @@ COMPATIBLE_IOCTL(DVD_WRITE_STRUCT) COMPATIBLE_IOCTL(DVD_AUTH) /* Big L */ ULONG_IOCTL(LOOP_SET_FD) +ULONG_IOCTL(LOOP_CHANGE_FD) COMPATIBLE_IOCTL(LOOP_CLR_FD) COMPATIBLE_IOCTL(LOOP_GET_STATUS64) COMPATIBLE_IOCTL(LOOP_SET_STATUS64) @@ -595,13 +599,15 @@ COMPATIBLE_IOCTL(ATMTCP_CREATE) COMPATIBLE_IOCTL(ATMTCP_REMOVE) COMPATIBLE_IOCTL(ATMMPC_CTRL) COMPATIBLE_IOCTL(ATMMPC_DATA) -/* Big W */ -/* WIOC_GETSUPPORT not yet implemented -E */ +/* Watchdog */ +COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) COMPATIBLE_IOCTL(WDIOC_GETSTATUS) COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) COMPATIBLE_IOCTL(WDIOC_GETTEMP) COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) +COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) /* Big R */ COMPATIBLE_IOCTL(RNDGETENTCNT) COMPATIBLE_IOCTL(RNDADDTOENTCNT) @@ -735,3 +741,20 @@ COMPATIBLE_IOCTL(SIOCSIWRETRY) COMPATIBLE_IOCTL(SIOCGIWRETRY) COMPATIBLE_IOCTL(SIOCSIWPOWER) COMPATIBLE_IOCTL(SIOCGIWPOWER) +/* hiddev */ +COMPATIBLE_IOCTL(HIDIOCGVERSION) +COMPATIBLE_IOCTL(HIDIOCAPPLICATION) +COMPATIBLE_IOCTL(HIDIOCGDEVINFO) +COMPATIBLE_IOCTL(HIDIOCGSTRING) +COMPATIBLE_IOCTL(HIDIOCINITREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORT) +COMPATIBLE_IOCTL(HIDIOCSREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) +COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) +COMPATIBLE_IOCTL(HIDIOCGUSAGE) +COMPATIBLE_IOCTL(HIDIOCSUSAGE) +COMPATIBLE_IOCTL(HIDIOCGUCODE) +COMPATIBLE_IOCTL(HIDIOCGFLAG) +COMPATIBLE_IOCTL(HIDIOCSFLAG) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) --- ./include/linux/device-mapper.h.dm 2006-03-17 
08:58:56.000000000 +0300 +++ ./include/linux/device-mapper.h 2006-03-17 08:16:12.000000000 +0300 @@ -51,12 +51,15 @@ typedef int (*dm_endio_fn) (struct dm_ta struct bio *bio, int error, union map_info *map_context); -typedef void (*dm_suspend_fn) (struct dm_target *ti); +typedef void (*dm_presuspend_fn) (struct dm_target *ti); +typedef void (*dm_postsuspend_fn) (struct dm_target *ti); typedef void (*dm_resume_fn) (struct dm_target *ti); typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type, char *result, unsigned int maxlen); +typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv); + void dm_error(const char *message); /* @@ -79,9 +82,11 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_endio_fn end_io; - dm_suspend_fn suspend; + dm_presuspend_fn presuspend; + dm_postsuspend_fn postsuspend; dm_resume_fn resume; dm_status_fn status; + dm_message_fn message; }; struct io_restrictions { @@ -102,6 +107,7 @@ struct dm_target { sector_t len; /* FIXME: turn this into a mask, and merge with io_restrictions */ + /* Always a power of 2 */ sector_t split_io; /* --- ./include/linux/dm-ioctl.h.dm 2006-03-17 08:59:07.000000000 +0300 +++ ./include/linux/dm-ioctl.h 2006-03-17 08:16:12.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. + * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. * * This file is released under the LGPL. */ @@ -76,6 +77,9 @@ * * DM_TABLE_STATUS: * Return the targets status for the 'active' table. + * + * DM_TARGET_MSG: + * Pass a message string to the target at a specific offset of a device. */ /* @@ -179,6 +183,15 @@ struct dm_target_versions { }; /* + * Used to pass message to a target + */ +struct dm_target_msg { + uint64_t sector; /* Device sector */ + + char message[0]; +}; + +/* * If you change this make sure you make the corresponding change * to dm-ioctl.c:lookup_ioctl() */ @@ -204,6 +217,7 @@ enum { /* Added later */ DM_LIST_VERSIONS_CMD, + DM_TARGET_MSG_CMD, }; /* @@ -232,6 +246,7 @@ typedef char ioctl_struct[308]; #define DM_TABLE_DEPS_32 _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, ioctl_struct) #define DM_TABLE_STATUS_32 _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct) #define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct) +#define DM_TARGET_MSG_32 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct) #endif #define DM_IOCTL 0xfd @@ -254,10 +269,12 @@ typedef char ioctl_struct[308]; #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) +#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) + #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 1 +#define DM_VERSION_MINOR 5 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2003-12-10)" +#define DM_VERSION_EXTRA "-ioctl (2005-10-04)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -283,4 +300,14 @@ typedef char ioctl_struct[308]; */ #define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ +/* + * Set this to improve performance when you aren't going to use open_count. + */ +#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */ + +/* + * Set this to avoid attempting to freeze any filesystem when suspending. 
+ */ +#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ --- ./include/linux/genhd.h.dm 2006-03-20 08:42:40.000000000 +0300 +++ ./include/linux/genhd.h 2006-03-17 13:44:40.000000000 +0300 @@ -100,7 +100,7 @@ struct gendisk { struct timer_rand_state *random; int policy; - unsigned sync_io; /* RAID */ + atomic_t sync_io; /* RAID */ unsigned long stamp, stamp_idle; int in_flight; #ifdef CONFIG_SMP diff -pruN ./include/linux/raid.dm/linear.h ./include/linux/raid/linear.h --- ./include/linux/raid.dm/linear.h 2006-03-17 13:26:03.000000000 +0300 +++ ./include/linux/raid/linear.h 2006-03-17 13:26:59.000000000 +0300 @@ -5,8 +5,8 @@ struct dev_info { mdk_rdev_t *rdev; - unsigned long size; - unsigned long offset; + sector_t size; + sector_t offset; }; typedef struct dev_info dev_info_t; diff -pruN ./include/linux/raid.dm/md.h ./include/linux/raid/md.h --- ./include/linux/raid.dm/md.h 2006-03-17 13:26:03.000000000 +0300 +++ ./include/linux/raid/md.h 2006-03-17 13:26:59.000000000 +0300 @@ -69,12 +69,10 @@ extern mdk_thread_t * md_register_thread extern void md_unregister_thread (mdk_thread_t *thread); extern void md_wakeup_thread(mdk_thread_t *thread); extern void md_check_recovery(mddev_t *mddev); -extern void md_interrupt_thread (mdk_thread_t *thread); extern void md_write_start(mddev_t *mddev); extern void md_write_end(mddev_t *mddev); extern void md_handle_safemode(mddev_t *mddev); extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors); extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev); extern void md_unplug_mddev(mddev_t *mddev); diff -pruN ./include/linux/raid.dm/md_k.h ./include/linux/raid/md_k.h --- ./include/linux/raid.dm/md_k.h 2006-03-17 13:26:03.000000000 +0300 +++ ./include/linux/raid/md_k.h 2006-03-17 13:26:59.000000000 +0300 @@ -24,7 +24,8 @@ #define HSM 6UL #define MULTIPATH 7UL #define RAID6 8UL -#define MAX_PERSONALITY 9UL +#define RAID10 9UL +#define MAX_PERSONALITY 10UL #define LEVEL_MULTIPATH (-4) #define LEVEL_LINEAR (-1) @@ -43,6 +44,7 @@ static inline int pers_to_level (int per case RAID1: return 1; case RAID5: return 5; case RAID6: return 6; + case RAID10: return 10; } BUG(); return MD_RESERVED; @@ -60,6 +62,7 @@ static inline int level_to_pers (int lev case 4: case 5: return RAID5; case 6: return RAID6; + case 10: return RAID10; } return MD_RESERVED; } @@ -216,6 +219,7 @@ struct mddev_s unsigned long resync_mark; /* a recent timestamp */ sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t resync_max_sectors; /* may be set by personality */ /* recovery/resync flags * NEEDED: we might need to start a resync/recover * RUNNING: a thread is running, or about to be started @@ -263,6 +267,11 @@ static inline void rdev_dec_pending(mdk_ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + struct mdk_personality_s { char *name; diff -pruN ./include/linux/raid.dm/raid10.h ./include/linux/raid/raid10.h --- ./include/linux/raid.dm/raid10.h 1970-01-01 03:00:00.000000000 +0300 +++ ./include/linux/raid/raid10.h 2006-03-17 13:26:59.000000000 +0300 @@ -0,0 +1,103 @@ +#ifndef _RAID10_H +#define _RAID10_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +typedef struct r10bio_s r10bio_t; + +struct r10_private_data_s { + mddev_t 
*mddev; + mirror_info_t *mirrors; + int raid_disks; + int working_disks; + spinlock_t device_lock; + + /* geometry */ + int near_copies; /* number of copies layed out raid0 style */ + int far_copies; /* number of copies layed out + * at large strides across drives + */ + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies + */ + + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + + struct list_head retry_list; + /* for use when syncing mirrors: */ + + spinlock_t resync_lock; + int nr_pending; + int barrier; + sector_t next_resync; + + wait_queue_head_t wait_idle; + wait_queue_head_t wait_resume; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; +}; + +typedef struct r10_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID10 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + */ + struct { + struct bio *bio; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* bits for r10bio.state */ +#define R10BIO_Uptodate 0 +#define R10BIO_IsSync 1 +#define R10BIO_IsRecover 2 +#endif
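
Editor's note: the three ->issue_flush_fn hooks added above (raid1_issue_flush, raid5_issue_flush, raid6_issue_flush) all follow the same shape: walk the member devices, skip missing or faulty ones, and forward the flush to each underlying queue, stopping at the first failure. The sketch below is illustrative only and not part of the patch; it assumes the 2.6-era block layer API used in the patch (request_queue_t, bdev_get_queue(), q->issue_flush_fn) and a hypothetical example_conf_t with a disks[] array.

/*
 * Illustrative sketch, not part of the patch: common shape of the
 * ->issue_flush_fn hooks added for raid1/raid5/raid6 above.
 * example_conf_t and example_issue_flush are hypothetical names.
 */
static int example_issue_flush(request_queue_t *q, struct gendisk *disk,
			       sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	example_conf_t *conf = mddev_to_conf(mddev);	/* hypothetical conf type */
	int i, ret = 0;

	for (i = 0; i < conf->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		request_queue_t *r_queue;

		if (!rdev || rdev->faulty || !rdev->bdev)
			continue;			/* skip absent/failed members */

		r_queue = bdev_get_queue(rdev->bdev);
		if (!r_queue || !r_queue->issue_flush_fn) {
			ret = -EOPNOTSUPP;		/* member cannot flush */
			break;
		}

		ret = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk,
					      error_sector);
		if (ret)				/* propagate first failure */
			break;
	}
	return ret;
}

The raid1 variant in the patch additionally holds conf->device_lock across the walk and silently skips members without a flush hook, while the raid5/raid6 variants return -EOPNOTSUPP in that case, as the sketch does.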
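A second recurring change in the raid1 hunks above is the nr_pending reference handling: both read_balance() (the "goto retry" path near the top of this section) and make_request() now take the reference with atomic_inc() first and then re-test ->faulty, because error() can mark the device faulty between the initial check and the increment. The annotated restatement below is only meant to make that ordering explicit; the names come from the patch context above.

	/* take the reference, then re-check ->faulty to close the race
	 * with error() marking the device failed concurrently */
	if ((rdev = conf->mirrors[i].rdev) != NULL && !rdev->faulty) {
		atomic_inc(&rdev->nr_pending);
		if (rdev->faulty) {
			/* lost the race: drop the reference, skip this device */
			atomic_dec(&rdev->nr_pending);
			r1_bio->bios[i] = NULL;
		} else
			r1_bio->bios[i] = bio;
	} else
		r1_bio->bios[i] = NULL;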
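The new DM_TARGET_MSG ioctl introduced in dm-ioctl.h above carries a struct dm_target_msg — a 64-bit device sector followed by a flexible message[] array — with the message text appended directly after the fixed header in the ioctl data area. A minimal userspace-style sketch of that layout is shown below; the helper name is hypothetical, and the surrounding dm_ioctl framing (version, data_size, data_start), normally handled by libdevmapper or dmsetup, is omitted.

/*
 * Hypothetical helper, not part of the patch: lay out a dm_target_msg
 * (device sector + NUL-terminated text) as it appears in the ioctl
 * data area.  Real callers normally go through dmsetup/libdevmapper.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct dm_target_msg {
	uint64_t sector;	/* Device sector the message is aimed at */
	char message[0];	/* message text follows the fixed header */
};

static struct dm_target_msg *pack_target_msg(uint64_t sector,
					     const char *text, size_t *len)
{
	struct dm_target_msg *msg;

	*len = sizeof(struct dm_target_msg) + strlen(text) + 1;
	msg = malloc(*len);
	if (!msg)
		return NULL;
	msg->sector = sector;
	strcpy(msg->message, text);
	return msg;
}

From the command line the same path is normally exercised with "dmsetup message <device> <sector> <message>", which targets such as multipath use for runtime control.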