diff -pruN ./drivers/md.dm/dm-bio-list.h ./drivers/md/dm-bio-list.h
--- ./drivers/md.dm/dm-bio-list.h	2006-03-17 08:57:42.000000000 +0300
+++ ./drivers/md/dm-bio-list.h	2006-03-17 13:16:38.000000000 +0300
@@ -33,6 +33,9 @@ static inline void bio_list_add(struct b
 
 static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
 {
+	if (!bl2->head)
+		return;
+
 	if (bl->tail)
 		bl->tail->bi_next = bl2->head;
 	else
diff -pruN ./drivers/md.dm/dm-bio-record.h ./drivers/md/dm-bio-record.h
--- ./drivers/md.dm/dm-bio-record.h	1970-01-01 03:00:00.000000000 +0300
+++ ./drivers/md/dm-bio-record.h	2006-03-17 13:16:38.000000000 +0300
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BIO_RECORD_H
+#define DM_BIO_RECORD_H
+
+#include <linux/bio.h>
+
+/*
+ * There are lots of mutable fields in the bio struct that get
+ * changed by the lower levels of the block layer. Some targets,
+ * such as multipath, may wish to resubmit a bio on error. The
+ * functions in this file help the target record and restore the
+ * original bio state.
+ */
+struct dm_bio_details {
+	sector_t bi_sector;
+	struct block_device *bi_bdev;
+	unsigned int bi_size;
+	unsigned short bi_idx;
+	unsigned long bi_flags;
+};
+
+static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
+{
+	bd->bi_sector = bio->bi_sector;
+	bd->bi_bdev = bio->bi_bdev;
+	bd->bi_size = bio->bi_size;
+	bd->bi_idx = bio->bi_idx;
+	bd->bi_flags = bio->bi_flags;
+}
+
+static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
+{
+	bio->bi_sector = bd->bi_sector;
+	bio->bi_bdev = bd->bi_bdev;
+	bio->bi_size = bd->bi_size;
+	bio->bi_idx = bd->bi_idx;
+	bio->bi_flags = bd->bi_flags;
+}
+
+#endif
diff -pruN ./drivers/md.dm/dm.c ./drivers/md/dm.c
--- ./drivers/md.dm/dm.c	2006-03-17 08:57:42.000000000 +0300
+++ ./drivers/md/dm.c	2006-03-17 13:16:38.000000000 +0300
@@ -15,15 +15,13 @@
 #include
 #include
 #include
+#include <linux/idr.h>
 
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
 static unsigned int _major = 0;
 
-static int realloc_minor_bits(unsigned long requested_minor);
-static void free_minor_bits(void);
-
 /*
  * One of these is allocated per bio.
  */
@@ -32,6 +30,7 @@ struct dm_io {
 	int error;
 	struct bio *bio;
 	atomic_t io_count;
+	unsigned long start_time;
 };
 
 /*
@@ -44,15 +43,23 @@ struct target_io {
 	union map_info info;
 };
 
+union map_info *dm_get_mapinfo(struct bio *bio)
+{
+	if (bio && bio->bi_private)
+		return &((struct target_io *)bio->bi_private)->info;
+	return NULL;
+}
+
 /*
  * Bits for the md->flags field.
  */
 #define DMF_BLOCK_IO 0
 #define DMF_SUSPENDED 1
-#define DMF_FS_LOCKED 2
+#define DMF_FROZEN 2
 
 struct mapped_device {
-	struct rw_semaphore lock;
+	struct rw_semaphore io_lock;
+	struct semaphore suspend_lock;
 	rwlock_t map_lock;
 	atomic_t holders;
 
@@ -61,6 +68,8 @@ struct mapped_device {
 	request_queue_t *queue;
 	struct gendisk *disk;
 
+	void *interface_ptr;
+
 	/*
 	 * A list of ios that arrived while we were suspended.
*/ @@ -89,6 +98,7 @@ struct mapped_device { * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; + struct block_device *suspended_bdev; }; #define MIN_IOS 256 @@ -113,19 +123,11 @@ static int __init local_init(void) return -ENOMEM; } - r = realloc_minor_bits(1024); - if (r < 0) { - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); - return r; - } - _major = major; r = register_blkdev(_major, _name); if (r < 0) { kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); - free_minor_bits(); return r; } @@ -139,7 +141,6 @@ static void local_exit(void) { kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); - free_minor_bits(); if (unregister_blkdev(_major, _name) < 0) DMERR("devfs_unregister_blkdev failed"); @@ -238,21 +239,53 @@ static inline void free_tio(struct mappe mempool_free(tio, md->tio_pool); } +static void start_io_acct(struct dm_io *io) +{ + struct mapped_device *md = io->md; + + io->start_time = jiffies; + + disk_round_stats(dm_disk(md)); + dm_disk(md)->in_flight = atomic_inc_return(&md->pending); +} + +static int end_io_acct(struct dm_io *io) +{ + struct mapped_device *md = io->md; + struct bio *bio = io->bio; + unsigned long duration = jiffies - io->start_time; + int pending; + + disk_round_stats(dm_disk(md)); + dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); + + switch (bio_data_dir(bio)) { + case WRITE: + disk_stat_add(dm_disk(md), write_ticks, duration); + break; + case READ: + disk_stat_add(dm_disk(md), read_ticks, duration); + break; + } + + return !pending; +} + /* * Add the bio to the list of deferred io. */ static int queue_io(struct mapped_device *md, struct bio *bio) { - down_write(&md->lock); + down_write(&md->io_lock); if (!test_bit(DMF_BLOCK_IO, &md->flags)) { - up_write(&md->lock); + up_write(&md->io_lock); return 1; } bio_list_add(&md->deferred, bio); - up_write(&md->lock); + up_write(&md->io_lock); return 0; /* deferred successfully */ } @@ -293,7 +326,7 @@ static inline void dec_pending(struct dm io->error = error; if (atomic_dec_and_test(&io->io_count)) { - if (atomic_dec_and_test(&io->md->pending)) + if (end_io_acct(io)) /* nudge anyone waiting on suspend queue */ wake_up(&io->md->wait); @@ -342,8 +375,8 @@ static sector_t max_io_len(struct mapped */ if (ti->split_io) { sector_t boundary; - boundary = dm_round_up(offset + 1, ti->split_io) - offset; - + boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) + - offset; if (len > boundary) len = boundary; } @@ -379,7 +412,7 @@ static void __map_bio(struct dm_target * /* error the io and bail out */ struct dm_io *io = tio->io; free_tio(tio->io->md, tio); - dec_pending(io, -EIO); + dec_pending(io, r); bio_put(clone); } } @@ -542,7 +575,7 @@ static void __split_bio(struct mapped_de ci.sector_count = bio_sectors(bio); ci.idx = bio->bi_idx; - atomic_inc(&md->pending); + start_io_acct(ci.io); while (ci.sector_count) __clone_and_map(&ci); @@ -563,14 +596,22 @@ static int dm_request(request_queue_t *q int r; struct mapped_device *md = q->queuedata; - down_read(&md->lock); + down_read(&md->io_lock); + + if (bio_data_dir(bio) == WRITE) { + disk_stat_inc(dm_disk(md), writes); + disk_stat_add(dm_disk(md), write_sectors, bio_sectors(bio)); + } else { + disk_stat_inc(dm_disk(md), reads); + disk_stat_add(dm_disk(md), read_sectors, bio_sectors(bio)); + } /* * If we're suspended we have to queue * this io for later. 
*/ while (test_bit(DMF_BLOCK_IO, &md->flags)) { - up_read(&md->lock); + up_read(&md->io_lock); if (bio_rw(bio) == READA) { bio_io_error(bio, bio->bi_size); @@ -589,14 +630,29 @@ static int dm_request(request_queue_t *q * We're in a while loop, because someone could suspend * before we get to the following read lock. */ - down_read(&md->lock); + down_read(&md->io_lock); } __split_bio(md, bio); - up_read(&md->lock); + up_read(&md->io_lock); return 0; } +static int dm_flush_all(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + struct mapped_device *md = q->queuedata; + struct dm_table *map = dm_get_table(md); + int ret = -ENXIO; + + if (map) { + ret = dm_table_flush_all(map); + dm_table_put(map); + } + + return ret; +} + static void dm_unplug_all(request_queue_t *q) { struct mapped_device *md = q->queuedata; @@ -624,109 +680,86 @@ static int dm_any_congested(void *conges } /*----------------------------------------------------------------- - * A bitset is used to keep track of allocated minor numbers. + * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ static DECLARE_MUTEX(_minor_lock); -static unsigned long *_minor_bits = NULL; -static unsigned long _max_minors = 0; - -#define MINORS_SIZE(minors) ((minors / BITS_PER_LONG) * sizeof(unsigned long)) - -static int realloc_minor_bits(unsigned long requested_minor) -{ - unsigned long max_minors; - unsigned long *minor_bits, *tmp; - - if (requested_minor < _max_minors) - return -EINVAL; - - /* Round up the requested minor to the next power-of-2. */ - max_minors = 1 << fls(requested_minor - 1); - if (max_minors > (1 << MINORBITS)) - return -EINVAL; - - minor_bits = kmalloc(MINORS_SIZE(max_minors), GFP_KERNEL); - if (!minor_bits) - return -ENOMEM; - memset(minor_bits, 0, MINORS_SIZE(max_minors)); - - /* Copy the existing bit-set to the new one. */ - if (_minor_bits) - memcpy(minor_bits, _minor_bits, MINORS_SIZE(_max_minors)); - - tmp = _minor_bits; - _minor_bits = minor_bits; - _max_minors = max_minors; - if (tmp) - kfree(tmp); - - return 0; -} - -static void free_minor_bits(void) -{ - down(&_minor_lock); - kfree(_minor_bits); - _minor_bits = NULL; - _max_minors = 0; - up(&_minor_lock); -} +static DEFINE_IDR(_minor_idr); static void free_minor(unsigned int minor) { down(&_minor_lock); - if (minor < _max_minors) - clear_bit(minor, _minor_bits); + idr_remove(&_minor_idr, minor); up(&_minor_lock); } /* * See if the device with a specific minor # is free. 
*/ -static int specific_minor(unsigned int minor) +static int specific_minor(struct mapped_device *md, unsigned int minor) { - int r = 0; + int r, m; - if (minor > (1 << MINORBITS)) + if (minor >= (1 << MINORBITS)) return -EINVAL; down(&_minor_lock); - if (minor >= _max_minors) { - r = realloc_minor_bits(minor); - if (r) { - up(&_minor_lock); - return r; - } + + if (idr_find(&_minor_idr, minor)) { + r = -EBUSY; + goto out; + } + + r = idr_pre_get(&_minor_idr, GFP_KERNEL); + if (!r) { + r = -ENOMEM; + goto out; + } + + r = idr_get_new_above(&_minor_idr, md, minor, &m); + if (r) { + goto out; } - if (test_and_set_bit(minor, _minor_bits)) + if (m != minor) { + idr_remove(&_minor_idr, m); r = -EBUSY; - up(&_minor_lock); + goto out; + } +out: + up(&_minor_lock); return r; } -static int next_free_minor(unsigned int *minor) +static int next_free_minor(struct mapped_device *md, unsigned int *minor) { int r; unsigned int m; down(&_minor_lock); - m = find_first_zero_bit(_minor_bits, _max_minors); - if (m >= _max_minors) { - r = realloc_minor_bits(_max_minors * 2); - if (r) { - up(&_minor_lock); - return r; - } - m = find_first_zero_bit(_minor_bits, _max_minors); + + r = idr_pre_get(&_minor_idr, GFP_KERNEL); + if (!r) { + r = -ENOMEM; + goto out; + } + + r = idr_get_new(&_minor_idr, md, &m); + if (r) { + goto out; + } + + if (m >= (1 << MINORBITS)) { + idr_remove(&_minor_idr, m); + r = -ENOSPC; + goto out; } - set_bit(m, _minor_bits); *minor = m; - up(&_minor_lock); - return 0; +out: + up(&_minor_lock); + return r; } static struct block_device_operations dm_blk_dops; @@ -745,12 +778,13 @@ static struct mapped_device *alloc_dev(u } /* get a minor number for the dev */ - r = persistent ? specific_minor(minor) : next_free_minor(&minor); + r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); if (r < 0) goto bad1; memset(md, 0, sizeof(*md)); - init_rwsem(&md->lock); + init_rwsem(&md->io_lock); + init_MUTEX(&md->suspend_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->event_nr, 0); @@ -764,6 +798,7 @@ static struct mapped_device *alloc_dev(u md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); md->queue->unplug_fn = dm_unplug_all; + md->queue->issue_flush_fn = dm_flush_all; md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, mempool_free_slab, _io_cache); @@ -823,22 +858,17 @@ static void event_callback(void *context { struct mapped_device *md = (struct mapped_device *) context; - atomic_inc(&md->event_nr);; + atomic_inc(&md->event_nr); wake_up(&md->eventq); } -static void __set_size(struct gendisk *disk, sector_t size) +static void __set_size(struct mapped_device *md, sector_t size) { - struct block_device *bdev; + set_capacity(md->disk, size); - set_capacity(disk, size); - bdev = bdget_disk(disk, 0); - if (bdev) { - down(&bdev->bd_inode->i_sem); - i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - up(&bdev->bd_inode->i_sem); - bdput(bdev); - } + down(&md->suspended_bdev->bd_inode->i_sem); + i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); + up(&md->suspended_bdev->bd_inode->i_sem); } static int __bind(struct mapped_device *md, struct dm_table *t) @@ -847,17 +877,18 @@ static int __bind(struct mapped_device * sector_t size; size = dm_table_get_size(t); - __set_size(md->disk, size); + __set_size(md, size); if (size == 0) return 0; + dm_table_get(t); + dm_table_event_callback(t, event_callback, md); + write_lock(&md->map_lock); md->map = t; + dm_table_set_restrictions(t, 
q); write_unlock(&md->map_lock); - dm_table_get(t); - dm_table_event_callback(md->map, event_callback, md); - dm_table_set_restrictions(t, q); return 0; } @@ -901,6 +932,32 @@ int dm_create_with_minor(unsigned int mi return create_aux(minor, 1, result); } +void *dm_get_mdptr(dev_t dev) +{ + struct mapped_device *md; + void *mdptr = NULL; + unsigned minor = MINOR(dev); + + if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) + return NULL; + + down(&_minor_lock); + + md = idr_find(&_minor_idr, minor); + + if (md && (dm_disk(md)->first_minor == minor)) + mdptr = md->interface_ptr; + + up(&_minor_lock); + + return mdptr; +} + +void dm_set_mdptr(struct mapped_device *md, void *ptr) +{ + md->interface_ptr = ptr; +} + void dm_get(struct mapped_device *md) { atomic_inc(&md->holders); @@ -911,8 +968,10 @@ void dm_put(struct mapped_device *md) struct dm_table *map = dm_get_table(md); if (atomic_dec_and_test(&md->holders)) { - if (!test_bit(DMF_SUSPENDED, &md->flags) && map) - dm_table_suspend_targets(map); + if (!dm_suspended(md)) { + dm_table_presuspend_targets(map); + dm_table_postsuspend_targets(map); + } __unbind(md); free_dev(md); } @@ -940,69 +999,55 @@ static void __flush_deferred_io(struct m */ int dm_swap_table(struct mapped_device *md, struct dm_table *table) { - int r; + int r = -EINVAL; - down_write(&md->lock); + down(&md->suspend_lock); /* device must be suspended */ - if (!test_bit(DMF_SUSPENDED, &md->flags)) { - up_write(&md->lock); - return -EPERM; - } + if (!dm_suspended(md)) + goto out; __unbind(md); r = __bind(md, table); - if (r) - return r; - up_write(&md->lock); - return 0; +out: + up(&md->suspend_lock); + return r; } /* * Functions to lock and unlock any filesystem running on the * device. */ -static int __lock_fs(struct mapped_device *md) +static int lock_fs(struct mapped_device *md) { - struct block_device *bdev; + int r; - if (test_and_set_bit(DMF_FS_LOCKED, &md->flags)) - return 0; + WARN_ON(md->frozen_sb); - bdev = bdget_disk(md->disk, 0); - if (!bdev) { - DMWARN("bdget failed in __lock_fs"); - return -ENOMEM; + md->frozen_sb = freeze_bdev(md->suspended_bdev); + if (IS_ERR(md->frozen_sb)) { + r = PTR_ERR(md->frozen_sb); + md->frozen_sb = NULL; + return r; } - WARN_ON(md->frozen_sb); - md->frozen_sb = freeze_bdev(bdev); + set_bit(DMF_FROZEN, &md->flags); + /* don't bdput right now, we don't want the bdev - * to go away while it is locked. We'll bdput - * in __unlock_fs + * to go away while it is locked. */ return 0; } -static int __unlock_fs(struct mapped_device *md) +static void unlock_fs(struct mapped_device *md) { - struct block_device *bdev; - - if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags)) - return 0; - - bdev = bdget_disk(md->disk, 0); - if (!bdev) { - DMWARN("bdget failed in __unlock_fs"); - return -ENOMEM; - } + if (!test_bit(DMF_FROZEN, &md->flags)) + return; - thaw_bdev(bdev, md->frozen_sb); + thaw_bdev(md->suspended_bdev, md->frozen_sb); md->frozen_sb = NULL; - bdput(bdev); - bdput(bdev); - return 0; + clear_bit(DMF_FROZEN, &md->flags); } /* @@ -1012,46 +1057,48 @@ static int __unlock_fs(struct mapped_dev * dm_bind_table, dm_suspend must be called to flush any in * flight bios and ensure that any further io gets deferred. */ -int dm_suspend(struct mapped_device *md) +int dm_suspend(struct mapped_device *md, int do_lockfs) { - struct dm_table *map; + struct dm_table *map = NULL; DECLARE_WAITQUEUE(wait, current); + int r = -EINVAL; - /* Flush I/O to the device. 
*/ - down_read(&md->lock); - if (test_bit(DMF_BLOCK_IO, &md->flags)) { - up_read(&md->lock); - return -EINVAL; + down(&md->suspend_lock); + + if (dm_suspended(md)) + goto out; + + map = dm_get_table(md); + + /* This does not get reverted if there's an error later. */ + dm_table_presuspend_targets(map); + + md->suspended_bdev = bdget_disk(md->disk, 0); + if (!md->suspended_bdev) { + DMWARN("bdget failed in dm_suspend"); + r = -ENOMEM; + goto out; } - __lock_fs(md); - up_read(&md->lock); + /* Flush I/O to the device. */ + if (do_lockfs) { + r = lock_fs(md); + if (r) + goto out; + } /* - * First we set the BLOCK_IO flag so no more ios will be - * mapped. + * First we set the BLOCK_IO flag so no more ios will be mapped. */ - down_write(&md->lock); - if (test_bit(DMF_BLOCK_IO, &md->flags)) { - /* - * If we get here we know another thread is - * trying to suspend as well, so we leave the fs - * locked for this thread. - */ - up_write(&md->lock); - return -EINVAL; - } - + down_write(&md->io_lock); set_bit(DMF_BLOCK_IO, &md->flags); + add_wait_queue(&md->wait, &wait); - up_write(&md->lock); + up_write(&md->io_lock); /* unplug */ - map = dm_get_table(md); - if (map) { + if (map) dm_table_unplug_all(map); - dm_table_put(map); - } /* * Then we wait for the already mapped ios to @@ -1067,54 +1114,75 @@ int dm_suspend(struct mapped_device *md) } set_current_state(TASK_RUNNING); - down_write(&md->lock); + down_write(&md->io_lock); remove_wait_queue(&md->wait, &wait); /* were we interrupted ? */ + r = -EINTR; if (atomic_read(&md->pending)) { - __unlock_fs(md); + up_write(&md->io_lock); + unlock_fs(md); clear_bit(DMF_BLOCK_IO, &md->flags); - up_write(&md->lock); - return -EINTR; + goto out; } + up_write(&md->io_lock); + + dm_table_postsuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); - map = dm_get_table(md); - if (map) - dm_table_suspend_targets(map); - dm_table_put(map); - up_write(&md->lock); + r = 0; - return 0; +out: + if (r && md->suspended_bdev) { + bdput(md->suspended_bdev); + md->suspended_bdev = NULL; + } + + dm_table_put(map); + up(&md->suspend_lock); + return r; } int dm_resume(struct mapped_device *md) { + int r = -EINVAL; struct bio *def; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = NULL; - down_write(&md->lock); - if (!map || - !test_bit(DMF_SUSPENDED, &md->flags) || - !dm_table_get_size(map)) { - up_write(&md->lock); - dm_table_put(map); - return -EINVAL; - } + down(&md->suspend_lock); + if (!dm_suspended(md)) + goto out; + + map = dm_get_table(md); + if (!map || !dm_table_get_size(map)) + goto out; dm_table_resume_targets(map); - clear_bit(DMF_SUSPENDED, &md->flags); + + down_write(&md->io_lock); clear_bit(DMF_BLOCK_IO, &md->flags); def = bio_list_get(&md->deferred); __flush_deferred_io(md, def); - up_write(&md->lock); - __unlock_fs(md); + up_write(&md->io_lock); + + unlock_fs(md); + + bdput(md->suspended_bdev); + md->suspended_bdev = NULL; + + clear_bit(DMF_SUSPENDED, &md->flags); + dm_table_unplug_all(map); + + r = 0; + +out: dm_table_put(map); + up(&md->suspend_lock); - return 0; + return r; } /*----------------------------------------------------------------- @@ -1151,6 +1219,8 @@ static struct block_device_operations dm .owner = THIS_MODULE }; +EXPORT_SYMBOL(dm_get_mapinfo); + /* * module hooks */ @@ -1160,5 +1230,5 @@ module_exit(dm_exit); module_param(major, uint, 0); MODULE_PARM_DESC(major, "The major number of the device mapper"); MODULE_DESCRIPTION(DM_NAME " driver"); -MODULE_AUTHOR("Joe Thornber "); +MODULE_AUTHOR("Joe Thornber "); 
MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/dm-crypt.c ./drivers/md/dm-crypt.c --- ./drivers/md.dm/dm-crypt.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-crypt.c 2006-03-17 13:16:38.000000000 +0300 @@ -40,8 +40,8 @@ struct convert_context { struct bio *bio_out; unsigned int offset_in; unsigned int offset_out; - int idx_in; - int idx_out; + unsigned int idx_in; + unsigned int idx_out; sector_t sector; int write; }; @@ -67,8 +67,8 @@ struct crypt_config { struct crypto_tfm *tfm; sector_t iv_offset; int (*iv_generator)(struct crypt_config *cc, u8 *iv, sector_t sector); - int iv_size; - int key_size; + unsigned int iv_size; + unsigned int key_size; u8 key[0]; }; @@ -97,10 +97,8 @@ static void mempool_free_page(void *page */ static int crypt_iv_plain(struct crypt_config *cc, u8 *iv, sector_t sector) { + memset(iv, 0, cc->iv_size); *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); - if (cc->iv_size > sizeof(u32) / sizeof(u8)) - memset(iv + (sizeof(u32) / sizeof(u8)), 0, - cc->iv_size - (sizeof(u32) / sizeof(u8))); return 0; } @@ -200,13 +198,13 @@ static int crypt_convert(struct crypt_co */ static struct bio * crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, - struct bio *base_bio, int *bio_vec_idx) + struct bio *base_bio, unsigned int *bio_vec_idx) { struct bio *bio; - int nr_iovecs = dm_div_up(size, PAGE_SIZE); + unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; int gfp_mask = GFP_NOIO | __GFP_HIGHMEM; - int flags = current->flags; - int i; + unsigned long flags = current->flags; + unsigned int i; /* * Tell VM to act less aggressively and fail earlier. @@ -280,9 +278,8 @@ crypt_alloc_buffer(struct crypt_config * static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *bio, unsigned int bytes) { - unsigned int start, end; + unsigned int i, start, end; struct bio_vec *bv; - int i; /* * This is ugly, but Jens Axboe thinks that using bi_idx in the @@ -366,11 +363,11 @@ static void kcryptd_queue_io(struct cryp /* * Decode key from its hex representation */ -static int crypt_decode_key(u8 *key, char *hex, int size) +static int crypt_decode_key(u8 *key, char *hex, unsigned int size) { char buffer[3]; char *endp; - int i; + unsigned int i; buffer[2] = '\0'; @@ -393,9 +390,9 @@ static int crypt_decode_key(u8 *key, cha /* * Encode key into its hex representation */ -static void crypt_encode_key(char *hex, u8 *key, int size) +static void crypt_encode_key(char *hex, u8 *key, unsigned int size) { - int i; + unsigned int i; for(i = 0; i < size; i++) { sprintf(hex, "%02x", *key); @@ -415,8 +412,8 @@ static int crypt_ctr(struct dm_target *t char *tmp; char *cipher; char *mode; - int crypto_flags; - int key_size; + unsigned int crypto_flags; + unsigned int key_size; if (argc != 5) { ti->error = PFX "Not enough arguments"; @@ -464,9 +461,9 @@ static int crypt_ctr(struct dm_target *t } if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv) - /* at least a 32 bit sector number should fit in our buffer */ + /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(crypto_tfm_alg_ivsize(tfm), - (unsigned int)(sizeof(u32) / sizeof(u8))); + (unsigned int)(sizeof(u64) / sizeof(u8))); else { cc->iv_size = 0; if (cc->iv_generator) { @@ -528,6 +525,8 @@ bad3: bad2: crypto_free_tfm(tfm); bad1: + /* Must zero key material before freeing */ + memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); kfree(cc); return -EINVAL; } @@ -541,6 +540,9 @@ static void crypt_dtr(struct dm_target * crypto_free_tfm(cc->tfm); dm_put_device(ti, 
cc->dev); + + /* Must zero key material before freeing */ + memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); kfree(cc); } @@ -577,7 +579,8 @@ static int crypt_endio(struct bio *bio, static inline struct bio * crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, - sector_t sector, int *bvec_idx, struct convert_context *ctx) + sector_t sector, unsigned int *bvec_idx, + struct convert_context *ctx) { struct bio *clone; @@ -630,7 +633,7 @@ static int crypt_map(struct dm_target *t struct bio *clone; unsigned int remaining = bio->bi_size; sector_t sector = bio->bi_sector - ti->begin; - int bvec_idx = 0; + unsigned int bvec_idx = 0; io->target = ti; io->bio = bio; @@ -693,7 +696,7 @@ static int crypt_status(struct dm_target char buffer[32]; const char *cipher; const char *mode = NULL; - int offset; + unsigned int offset; switch (type) { case STATUSTYPE_INFO: diff -pruN ./drivers/md.dm/dm-emc.c ./drivers/md/dm-emc.c --- ./drivers/md.dm/dm-emc.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-emc.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,359 @@ +/* + * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath support for EMC CLARiiON AX/CX-series hardware. + */ + +#include "dm.h" +#include "dm-hw-handler.h" +#include +#include + +struct emc_handler { + spinlock_t lock; + + /* Whether we should send the short trespass command (FC-series) + * or the long version (default for AX/CX CLARiiON arrays). */ + unsigned short_trespass; + /* Whether or not to honor SCSI reservations when initiating a + * switch-over. Default: Don't. */ + unsigned hr; + + unsigned char sense[SCSI_SENSE_BUFFERSIZE]; +}; + +#define TRESPASS_PAGE 0x22 +#define EMC_FAILOVER_TIMEOUT (60 * HZ) + +/* Code borrowed from dm-lsi-rdac by Mike Christie */ + +static inline void free_bio(struct bio *bio) +{ + __free_page(bio->bi_io_vec[0].bv_page); + bio_put(bio); +} + +static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) +{ + struct path *path = bio->bi_private; + + if (bio->bi_size) + return 1; + + /* We also need to look at the sense keys here whether or not to + * switch to the next PG etc. + * + * For now simple logic: either it works or it doesn't. + */ + if (error) + dm_pg_init_complete(path, MP_FAIL_PATH); + else + dm_pg_init_complete(path, 0); + + /* request is freed in block layer */ + free_bio(bio); + + return 0; +} + +static struct bio *get_failover_bio(struct path *path, unsigned data_size) +{ + struct bio *bio; + struct page *page; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) { + DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); + return NULL; + } + + bio->bi_rw |= (1 << BIO_RW); + bio->bi_bdev = path->dev->bdev; + bio->bi_sector = 0; + bio->bi_private = path; + bio->bi_end_io = emc_endio; + + page = alloc_page(GFP_ATOMIC); + if (!page) { + DMERR("dm-emc: get_failover_bio: alloc_page() failed."); + bio_put(bio); + return NULL; + } + + if (bio_add_page(bio, page, data_size, 0) != data_size) { + DMERR("dm-emc: get_failover_bio: alloc_page() failed."); + __free_page(page); + bio_put(bio); + return NULL; + } + + return bio; +} + +static struct request *get_failover_req(struct emc_handler *h, + struct bio *bio, struct path *path) +{ + struct request *rq; + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev_get_queue(bdev); + + /* FIXME: Figure out why it fails with GFP_ATOMIC. 
*/ + rq = blk_get_request(q, WRITE, __GFP_WAIT); + if (!rq) { + DMERR("dm-emc: get_failover_req: blk_get_request failed"); + return NULL; + } + + rq->bio = rq->biotail = bio; + blk_rq_bio_prep(q, rq, bio); + + rq->rq_disk = bdev->bd_contains->bd_disk; + + /* bio backed don't set data */ + rq->buffer = rq->data = NULL; + /* rq data_len used for pc cmd's request_bufflen */ + rq->data_len = bio->bi_size; + + rq->sense = h->sense; + memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); + rq->sense_len = 0; + + memset(&rq->cmd, 0, BLK_MAX_CDB); + + rq->timeout = EMC_FAILOVER_TIMEOUT; + rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); + + return rq; +} + +static struct request *emc_trespass_get(struct emc_handler *h, + struct path *path) +{ + struct bio *bio; + struct request *rq; + unsigned char *page22; + unsigned char long_trespass_pg[] = { + 0, 0, 0, 0, + TRESPASS_PAGE, /* Page code */ + 0x09, /* Page length - 2 */ + h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ + 0xff, 0xff, /* Trespass target */ + 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */ + }; + unsigned char short_trespass_pg[] = { + 0, 0, 0, 0, + TRESPASS_PAGE, /* Page code */ + 0x02, /* Page length - 2 */ + h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */ + 0xff, /* Trespass target */ + }; + unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) : + sizeof(long_trespass_pg); + + /* get bio backing */ + if (data_size > PAGE_SIZE) + /* this should never happen */ + return NULL; + + bio = get_failover_bio(path, data_size); + if (!bio) { + DMERR("dm-emc: emc_trespass_get: no bio"); + return NULL; + } + + page22 = (unsigned char *)bio_data(bio); + memset(page22, 0, data_size); + + memcpy(page22, h->short_trespass ? + short_trespass_pg : long_trespass_pg, data_size); + + /* get request for block layer packet command */ + rq = get_failover_req(h, bio, path); + if (!rq) { + DMERR("dm-emc: emc_trespass_get: no rq"); + free_bio(bio); + return NULL; + } + + /* Prepare the command. */ + rq->cmd[0] = MODE_SELECT; + rq->cmd[1] = 0x10; + rq->cmd[4] = data_size; + rq->cmd_len = COMMAND_SIZE(rq->cmd[0]); + + return rq; +} + +static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, + struct path *path) +{ + struct request *rq; + struct request_queue *q = bdev_get_queue(path->dev->bdev); + + /* + * We can either blindly init the pg (then look at the sense), + * or we can send some commands to get the state here (then + * possibly send the fo cmnd), or we can also have the + * initial state passed into us and then get an update here. + */ + if (!q) { + DMINFO("dm-emc: emc_pg_init: no queue"); + goto fail_path; + } + + /* FIXME: The request should be pre-allocated. 
*/ + rq = emc_trespass_get(hwh->context, path); + if (!rq) { + DMERR("dm-emc: emc_pg_init: no rq"); + goto fail_path; + } + + DMINFO("dm-emc: emc_pg_init: sending switch-over command"); + elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); + return; + +fail_path: + dm_pg_init_complete(path, MP_FAIL_PATH); +} + +static struct emc_handler *alloc_emc_handler(void) +{ + struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL); + + if (h) { + memset(h, 0, sizeof(*h)); + spin_lock_init(&h->lock); + } + + return h; +} + +static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) +{ + struct emc_handler *h; + unsigned hr, short_trespass; + + if (argc == 0) { + /* No arguments: use defaults */ + hr = 0; + short_trespass = 0; + } else if (argc != 2) { + DMWARN("dm-emc hwhandler: incorrect number of arguments"); + return -EINVAL; + } else { + if ((sscanf(argv[0], "%u", &short_trespass) != 1) + || (short_trespass > 1)) { + DMWARN("dm-emc: invalid trespass mode selected"); + return -EINVAL; + } + + if ((sscanf(argv[1], "%u", &hr) != 1) + || (hr > 1)) { + DMWARN("dm-emc: invalid honor reservation flag selected"); + return -EINVAL; + } + } + + h = alloc_emc_handler(); + if (!h) + return -ENOMEM; + + hwh->context = h; + + if ((h->short_trespass = short_trespass)) + DMWARN("dm-emc: short trespass command will be send"); + else + DMWARN("dm-emc: long trespass command will be send"); + + if ((h->hr = hr)) + DMWARN("dm-emc: honor reservation bit will be set"); + else + DMWARN("dm-emc: honor reservation bit will not be set (default)"); + + return 0; +} + +static void emc_destroy(struct hw_handler *hwh) +{ + struct emc_handler *h = (struct emc_handler *) hwh->context; + + kfree(h); + hwh->context = NULL; +} + +static unsigned emc_error(struct hw_handler *hwh, struct bio *bio) +{ + /* FIXME: Patch from axboe still missing */ +#if 0 + int sense; + + if (bio->bi_error & BIO_SENSE) { + sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */ + + if (sense == 0x020403) { + /* LUN Not Ready - Manual Intervention Required + * indicates this is a passive path. + * + * FIXME: However, if this is seen and EVPD C0 + * indicates that this is due to a NDU in + * progress, we should set FAIL_PATH too. + * This indicates we might have to do a SCSI + * inquiry in the end_io path. Ugh. */ + return MP_BYPASS_PG | MP_RETRY_IO; + } else if (sense == 0x052501) { + /* An array based copy is in progress. Do not + * fail the path, do not bypass to another PG, + * do not retry. Fail the IO immediately. + * (Actually this is the same conclusion as in + * the default handler, but lets make sure.) */ + return 0; + } else if (sense == 0x062900) { + /* Unit Attention Code. This is the first IO + * to the new path, so just retry. 
*/ + return MP_RETRY_IO; + } + } +#endif + + /* Try default handler */ + return dm_scsi_err_handler(hwh, bio); +} + +static struct hw_handler_type emc_hwh = { + .name = "emc", + .module = THIS_MODULE, + .create = emc_create, + .destroy = emc_destroy, + .pg_init = emc_pg_init, + .error = emc_error, +}; + +static int __init dm_emc_init(void) +{ + int r = dm_register_hw_handler(&emc_hwh); + + if (r < 0) + DMERR("emc: register failed %d", r); + + DMINFO("dm-emc version 0.0.3 loaded"); + + return r; +} + +static void __exit dm_emc_exit(void) +{ + int r = dm_unregister_hw_handler(&emc_hwh); + + if (r < 0) + DMERR("emc: unregister failed %d", r); +} + +module_init(dm_emc_init); +module_exit(dm_emc_exit); + +MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath"); +MODULE_AUTHOR("Lars Marowsky-Bree "); +MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/dm.h ./drivers/md/dm.h --- ./drivers/md.dm/dm.h 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm.h 2006-03-17 13:16:38.000000000 +0300 @@ -19,6 +19,9 @@ #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) +#define DMEMIT(x...) sz += ((sz >= maxlen) ? \ + 0 : scnprintf(result + sz, maxlen - sz, x)) + /* * FIXME: I think this should be with the definition of sector_t * in types.h. @@ -40,6 +43,7 @@ struct dm_dev { atomic_t count; int mode; struct block_device *bdev; + char name[16]; }; struct dm_table; @@ -51,6 +55,8 @@ struct mapped_device; *---------------------------------------------------------------*/ int dm_create(struct mapped_device **md); int dm_create_with_minor(unsigned int minor, struct mapped_device **md); +void dm_set_mdptr(struct mapped_device *md, void *ptr); +void *dm_get_mdptr(dev_t dev); /* * Reference counting for md. @@ -61,7 +67,7 @@ void dm_put(struct mapped_device *md); /* * A device can still be used while suspended, but I/O is deferred. */ -int dm_suspend(struct mapped_device *md); +int dm_suspend(struct mapped_device *md, int with_lockfs); int dm_resume(struct mapped_device *md); /* @@ -109,10 +115,12 @@ void dm_table_set_restrictions(struct dm unsigned int dm_table_get_num_targets(struct dm_table *t); struct list_head *dm_table_get_devices(struct dm_table *t); int dm_table_get_mode(struct dm_table *t); -void dm_table_suspend_targets(struct dm_table *t); +void dm_table_presuspend_targets(struct dm_table *t); +void dm_table_postsuspend_targets(struct dm_table *t); void dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); void dm_table_unplug_all(struct dm_table *t); +int dm_table_flush_all(struct dm_table *t); /*----------------------------------------------------------------- * A registry of target types. @@ -135,21 +143,22 @@ static inline int array_too_big(unsigned } /* - * ceiling(n / size) * size + * Ceiling(n / sz) */ -static inline unsigned long dm_round_up(unsigned long n, unsigned long size) -{ - unsigned long r = n % size; - return n + (r ? 
(size - r) : 0); -} +#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz)) + +#define dm_sector_div_up(n, sz) ( \ +{ \ + sector_t _r = ((n) + (sz) - 1); \ + sector_div(_r, (sz)); \ + _r; \ +} \ +) /* - * Ceiling(n / size) + * ceiling(n / size) * size */ -static inline unsigned long dm_div_up(unsigned long n, unsigned long size) -{ - return dm_round_up(n, size) / size; -} +#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) static inline sector_t to_sector(unsigned long n) { @@ -161,6 +170,8 @@ static inline unsigned long to_bytes(sec return (n << 9); } +int dm_split_args(int *argc, char ***argvp, char *input); + /* * The device-mapper can be driven through one of two interfaces; * ioctl or filesystem, depending which patch you have applied. @@ -178,5 +189,6 @@ int dm_stripe_init(void); void dm_stripe_exit(void); void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); +union map_info *dm_get_mapinfo(struct bio *bio); #endif diff -pruN ./drivers/md.dm/dm-hw-handler.c ./drivers/md/dm-hw-handler.c --- ./drivers/md.dm/dm-hw-handler.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-hw-handler.c 2006-03-20 09:38:13.000000000 +0300 @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath hardware handler registration. + */ + +#include "dm.h" +#include "dm-hw-handler.h" + +#include + +struct hwh_internal { + struct hw_handler_type hwht; + + struct list_head list; + long use; +}; + +#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht) + +static LIST_HEAD(_hw_handlers); +static DECLARE_RWSEM(_hwh_lock); + +struct hwh_internal *__find_hw_handler_type(const char *name) +{ + struct hwh_internal *hwhi; + + list_for_each_entry(hwhi, &_hw_handlers, list) { + if (!strcmp(name, hwhi->hwht.name)) + return hwhi; + } + + return NULL; +} + +static struct hwh_internal *get_hw_handler(const char *name) +{ + struct hwh_internal *hwhi; + + down_read(&_hwh_lock); + hwhi = __find_hw_handler_type(name); + if (hwhi) { + if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module)) + hwhi = NULL; + else + hwhi->use++; + } + up_read(&_hwh_lock); + + return hwhi; +} + +struct hw_handler_type *dm_get_hw_handler(const char *name) +{ + struct hwh_internal *hwhi; + + if (!name) + return NULL; + + hwhi = get_hw_handler(name); + if (!hwhi) { + request_module("dm-%s", name); + hwhi = get_hw_handler(name); + } + + return hwhi ? 
&hwhi->hwht : NULL; +} + +void dm_put_hw_handler(struct hw_handler_type *hwht) +{ + struct hwh_internal *hwhi; + + if (!hwht) + return; + + down_read(&_hwh_lock); + hwhi = __find_hw_handler_type(hwht->name); + if (!hwhi) + goto out; + + if (--hwhi->use == 0) + module_put(hwhi->hwht.module); + + if (hwhi->use < 0) + BUG(); + + out: + up_read(&_hwh_lock); +} + +static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht) +{ + struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL); + + if (hwhi) { + memset(hwhi, 0, sizeof(*hwhi)); + hwhi->hwht = *hwht; + } + + return hwhi; +} + +int dm_register_hw_handler(struct hw_handler_type *hwht) +{ + int r = 0; + struct hwh_internal *hwhi = _alloc_hw_handler(hwht); + + if (!hwhi) + return -ENOMEM; + + down_write(&_hwh_lock); + + if (__find_hw_handler_type(hwht->name)) { + kfree(hwhi); + r = -EEXIST; + } else + list_add(&hwhi->list, &_hw_handlers); + + up_write(&_hwh_lock); + + return r; +} + +int dm_unregister_hw_handler(struct hw_handler_type *hwht) +{ + struct hwh_internal *hwhi; + + down_write(&_hwh_lock); + + hwhi = __find_hw_handler_type(hwht->name); + if (!hwhi) { + up_write(&_hwh_lock); + return -EINVAL; + } + + if (hwhi->use) { + up_write(&_hwh_lock); + return -ETXTBSY; + } + + list_del(&hwhi->list); + + up_write(&_hwh_lock); + + kfree(hwhi); + + return 0; +} + +unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio) +{ +#if 0 + int sense_key, asc, ascq; + + if (bio->bi_error & BIO_SENSE) { + /* FIXME: This is just an initial guess. */ + /* key / asc / ascq */ + sense_key = (bio->bi_error >> 16) & 0xff; + asc = (bio->bi_error >> 8) & 0xff; + ascq = bio->bi_error & 0xff; + + switch (sense_key) { + /* This block as a whole comes from the device. + * So no point retrying on another path. */ + case 0x03: /* Medium error */ + case 0x05: /* Illegal request */ + case 0x07: /* Data protect */ + case 0x08: /* Blank check */ + case 0x0a: /* copy aborted */ + case 0x0c: /* obsolete - no clue ;-) */ + case 0x0d: /* volume overflow */ + case 0x0e: /* data miscompare */ + case 0x0f: /* reserved - no idea either. */ + return MP_ERROR_IO; + + /* For these errors it's unclear whether they + * come from the device or the controller. + * So just lets try a different path, and if + * it eventually succeeds, user-space will clear + * the paths again... */ + case 0x02: /* Not ready */ + case 0x04: /* Hardware error */ + case 0x09: /* vendor specific */ + case 0x0b: /* Aborted command */ + return MP_FAIL_PATH; + + case 0x06: /* Unit attention - might want to decode */ + if (asc == 0x04 && ascq == 0x01) + /* "Unit in the process of + * becoming ready" */ + return 0; + return MP_FAIL_PATH; + + /* FIXME: For Unit Not Ready we may want + * to have a generic pg activation + * feature (START_UNIT). */ + + /* Should these two ever end up in the + * error path? I don't think so. */ + case 0x00: /* No sense */ + case 0x01: /* Recovered error */ + return 0; + } + } +#endif + + /* We got no idea how to decode the other kinds of errors -> + * assume generic error condition. */ + return MP_FAIL_PATH; +} + +EXPORT_SYMBOL_GPL(dm_register_hw_handler); +EXPORT_SYMBOL_GPL(dm_unregister_hw_handler); +EXPORT_SYMBOL_GPL(dm_scsi_err_handler); diff -pruN ./drivers/md.dm/dm-hw-handler.h ./drivers/md/dm-hw-handler.h --- ./drivers/md.dm/dm-hw-handler.h 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-hw-handler.h 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+ * + * This file is released under the GPL. + * + * Multipath hardware handler registration. + */ + +#ifndef DM_HW_HANDLER_H +#define DM_HW_HANDLER_H + +#include + +#include "dm-mpath.h" + +struct hw_handler_type; +struct hw_handler { + struct hw_handler_type *type; + void *context; +}; + +/* + * Constructs a hardware handler object, takes custom arguments + */ +/* Information about a hardware handler type */ +struct hw_handler_type { + char *name; + struct module *module; + + int (*create) (struct hw_handler *handler, unsigned int argc, + char **argv); + void (*destroy) (struct hw_handler *hwh); + + void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, + struct path *path); + unsigned (*error) (struct hw_handler *hwh, struct bio *bio); + int (*status) (struct hw_handler *hwh, status_type_t type, + char *result, unsigned int maxlen); +}; + +/* Register a hardware handler */ +int dm_register_hw_handler(struct hw_handler_type *type); + +/* Unregister a hardware handler */ +int dm_unregister_hw_handler(struct hw_handler_type *type); + +/* Returns a registered hardware handler type */ +struct hw_handler_type *dm_get_hw_handler(const char *name); + +/* Releases a hardware handler */ +void dm_put_hw_handler(struct hw_handler_type *hwht); + +/* Default err function */ +unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio); + +/* Error flags for err and dm_pg_init_complete */ +#define MP_FAIL_PATH 1 +#define MP_BYPASS_PG 2 +#define MP_ERROR_IO 4 /* Don't retry this I/O */ + +#endif diff -pruN ./drivers/md.dm/dm-io.c ./drivers/md/dm-io.c --- ./drivers/md.dm/dm-io.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-io.c 2006-03-17 13:16:38.000000000 +0300 @@ -267,7 +267,7 @@ static int resize_pool(unsigned int new_ /* create new pool */ _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); if (!_io_pool) - r = -ENOMEM; + return -ENOMEM; r = bio_set_init(&_bios, "dm-io", 512, 1); if (r) { diff -pruN ./drivers/md.dm/dm-ioctl.c ./drivers/md/dm-ioctl.c --- ./drivers/md.dm/dm-ioctl.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-ioctl.c 2006-03-17 13:16:38.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -17,7 +18,7 @@ #include -#define DM_DRIVER_EMAIL "dm@uk.sistina.com" +#define DM_DRIVER_EMAIL "dm-devel@redhat.com" /*----------------------------------------------------------------- * The ioctl interface needs to be able to look up devices by @@ -121,14 +122,6 @@ static struct hash_cell *__get_uuid_cell /*----------------------------------------------------------------- * Inserting, removing and renaming a device. 
*---------------------------------------------------------------*/ -static inline char *kstrdup(const char *str) -{ - char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); - if (r) - strcpy(r, str); - return r; -} - static struct hash_cell *alloc_cell(const char *name, const char *uuid, struct mapped_device *md) { @@ -138,7 +131,7 @@ static struct hash_cell *alloc_cell(cons if (!hc) return NULL; - hc->name = kstrdup(name); + hc->name = kstrdup(name, GFP_KERNEL); if (!hc->name) { kfree(hc); return NULL; @@ -148,7 +141,7 @@ static struct hash_cell *alloc_cell(cons hc->uuid = NULL; else { - hc->uuid = kstrdup(uuid); + hc->uuid = kstrdup(uuid, GFP_KERNEL); if (!hc->uuid) { kfree(hc->name); kfree(hc); @@ -224,6 +217,7 @@ static int dm_hash_insert(const char *na } register_with_devfs(cell); dm_get(md); + dm_set_mdptr(md, cell); up_write(&_hash_lock); return 0; @@ -236,10 +230,20 @@ static int dm_hash_insert(const char *na static void __hash_remove(struct hash_cell *hc) { + struct dm_table *table; + /* remove from the dev hash */ list_del(&hc->uuid_list); list_del(&hc->name_list); unregister_with_devfs(hc); + dm_set_mdptr(hc->md, NULL); + + table = dm_get_table(hc->md); + if (table) { + dm_table_event(table); + dm_table_put(table); + } + dm_put(hc->md); if (hc->new_map) dm_table_put(hc->new_map); @@ -266,11 +270,12 @@ static int dm_hash_rename(const char *ol { char *new_name, *old_name; struct hash_cell *hc; + struct dm_table *table; /* * duplicate new. */ - new_name = kstrdup(new); + new_name = kstrdup(new, GFP_KERNEL); if (!new_name) return -ENOMEM; @@ -313,6 +318,15 @@ static int dm_hash_rename(const char *ol /* rename the device node in devfs */ register_with_devfs(hc); + /* + * Wake up any dm event waiters. + */ + table = dm_get_table(hc->md); + if (table) { + dm_table_event(table); + dm_table_put(table); + } + up_write(&_hash_lock); kfree(old_name); return 0; @@ -421,8 +435,8 @@ static void list_version_get_needed(stru { size_t *needed = needed_param; + *needed += sizeof(struct dm_target_versions); *needed += strlen(tt->name); - *needed += sizeof(tt->version); *needed += ALIGN_MASK; } @@ -517,19 +531,22 @@ static int __dev_status(struct mapped_de if (dm_suspended(md)) param->flags |= DM_SUSPEND_FLAG; - bdev = bdget_disk(disk, 0); - if (!bdev) - return -ENXIO; - param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); - /* - * Yes, this will be out of date by the time it gets back - * to userland, but it is still very useful ofr - * debugging. - */ - param->open_count = bdev->bd_openers; - bdput(bdev); + if (!(param->flags & DM_SKIP_BDGET_FLAG)) { + bdev = bdget_disk(disk, 0); + if (!bdev) + return -ENXIO; + + /* + * Yes, this will be out of date by the time it gets back + * to userland, but it is still very useful for + * debugging. + */ + param->open_count = bdev->bd_openers; + bdput(bdev); + } else + param->open_count = -1; if (disk->policy) param->flags |= DM_READONLY_FLAG; @@ -579,12 +596,16 @@ static int dev_create(struct dm_ioctl *p } /* - * Always use UUID for lookups if it's present, otherwise use name. + * Always use UUID for lookups if it's present, otherwise use name or dev. */ static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) { - return *param->uuid ? 
- __get_uuid_cell(param->uuid) : __get_name_cell(param->name); + if (*param->uuid) + return __get_uuid_cell(param->uuid); + else if (*param->name) + return __get_name_cell(param->name); + else + return dm_get_mdptr(huge_decode_dev(param->dev)); } static inline struct mapped_device *find_device(struct dm_ioctl *param) @@ -596,6 +617,7 @@ static inline struct mapped_device *find hc = __find_device_hash_cell(param); if (hc) { md = hc->md; + dm_get(md); /* * Sneakily write in both the name and the uuid @@ -611,8 +633,6 @@ static inline struct mapped_device *find param->flags |= DM_INACTIVE_PRESENT_FLAG; else param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - - dm_get(md); } up_read(&_hash_lock); @@ -673,14 +693,18 @@ static int dev_rename(struct dm_ioctl *p static int do_suspend(struct dm_ioctl *param) { int r = 0; + int do_lockfs = 1; struct mapped_device *md; md = find_device(param); if (!md) return -ENXIO; + if (param->flags & DM_SKIP_LOCKFS_FLAG) + do_lockfs = 0; + if (!dm_suspended(md)) - r = dm_suspend(md); + r = dm_suspend(md, do_lockfs); if (!r) r = __dev_status(md, param); @@ -692,6 +716,7 @@ static int do_suspend(struct dm_ioctl *p static int do_resume(struct dm_ioctl *param) { int r = 0; + int do_lockfs = 1; struct hash_cell *hc; struct mapped_device *md; struct dm_table *new_map; @@ -717,8 +742,10 @@ static int do_resume(struct dm_ioctl *pa /* Do we need to load a new map ? */ if (new_map) { /* Suspend if it isn't already suspended */ + if (param->flags & DM_SKIP_LOCKFS_FLAG) + do_lockfs = 0; if (!dm_suspended(md)) - dm_suspend(md); + dm_suspend(md, do_lockfs); r = dm_swap_table(md, new_map); if (r) { @@ -964,6 +991,7 @@ static int table_load(struct dm_ioctl *p if (!hc) { DMWARN("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); + dm_table_put(t); return -ENXIO; } @@ -1097,6 +1125,67 @@ static int table_status(struct dm_ioctl return r; } +/* + * Pass a message to the target that's at the supplied device offset. + */ +static int target_message(struct dm_ioctl *param, size_t param_size) +{ + int r, argc; + char **argv; + struct mapped_device *md; + struct dm_table *table; + struct dm_target *ti; + struct dm_target_msg *tmsg = (void *) param + param->data_start; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + if (r) + goto out; + + if (tmsg < (struct dm_target_msg *) (param + 1) || + invalid_str(tmsg->message, (void *) param + param_size)) { + DMWARN("Invalid target message parameters."); + r = -EINVAL; + goto out; + } + + r = dm_split_args(&argc, &argv, tmsg->message); + if (r) { + DMWARN("Failed to split target message parameters"); + goto out; + } + + table = dm_get_table(md); + if (!table) + goto out_argv; + + if (tmsg->sector >= dm_table_get_size(table)) { + DMWARN("Target message sector outside device."); + r = -EINVAL; + goto out_table; + } + + ti = dm_table_find_target(table, tmsg->sector); + if (ti->type->message) + r = ti->type->message(ti, argc, argv); + else { + DMWARN("Target type does not support messages"); + r = -EINVAL; + } + + out_table: + dm_table_put(table); + out_argv: + kfree(argv); + out: + param->data_size = 0; + dm_put(md); + return r; +} + /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char * device. 
@@ -1123,7 +1212,9 @@ static ioctl_fn lookup_ioctl(unsigned in {DM_TABLE_DEPS_CMD, table_deps}, {DM_TABLE_STATUS_CMD, table_status}, - {DM_LIST_VERSIONS_CMD, list_versions} + {DM_LIST_VERSIONS_CMD, list_versions}, + + {DM_TARGET_MSG_CMD, target_message} }; return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; @@ -1202,14 +1293,14 @@ static int validate_params(uint cmd, str cmd == DM_LIST_VERSIONS_CMD) return 0; - /* Unless creating, either name or uuid but not both */ - if (cmd != DM_DEV_CREATE_CMD) { - if ((!*param->uuid && !*param->name) || - (*param->uuid && *param->name)) { - DMWARN("one of name or uuid must be supplied, cmd(%u)", - cmd); + if ((cmd == DM_DEV_CREATE_CMD)) { + if (!*param->name) { + DMWARN("name not supplied when creating device"); return -EINVAL; } + } else if ((*param->uuid && *param->name)) { + DMWARN("only supply one of name or uuid, cmd(%u)", cmd); + return -EINVAL; } /* Ensure strings are terminated */ @@ -1268,16 +1359,11 @@ static int ctl_ioctl(struct inode *inode * Copy the parameters into kernel space. */ r = copy_params(user, ¶m); - if (r) { - current->flags &= ~PF_MEMALLOC; - return r; - } - /* - * FIXME: eventually we will remove the PF_MEMALLOC flag - * here. However the tools still do nasty things like - * 'load' while a device is suspended. - */ + current->flags &= ~PF_MEMALLOC; + + if (r) + return r; r = validate_params(cmd, param); if (r) @@ -1295,7 +1381,6 @@ static int ctl_ioctl(struct inode *inode out: free_params(param); - current->flags &= ~PF_MEMALLOC; return r; } diff -pruN ./drivers/md.dm/dm-linear.c ./drivers/md/dm-linear.c --- ./drivers/md.dm/dm-linear.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-linear.c 2006-03-17 13:16:38.000000000 +0300 @@ -80,7 +80,6 @@ static int linear_status(struct dm_targe char *result, unsigned int maxlen) { struct linear_c *lc = (struct linear_c *) ti->private; - char buffer[32]; switch (type) { case STATUSTYPE_INFO: @@ -88,8 +87,8 @@ static int linear_status(struct dm_targe break; case STATUSTYPE_TABLE: - format_dev_t(buffer, lc->dev->bdev->bd_dev); - snprintf(result, maxlen, "%s " SECTOR_FORMAT, buffer, lc->start); + snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name, + lc->start); break; } return 0; diff -pruN ./drivers/md.dm/dm-log.c ./drivers/md/dm-log.c --- ./drivers/md.dm/dm-log.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-log.c 2006-03-17 13:16:38.000000000 +0300 @@ -17,9 +17,6 @@ static spinlock_t _lock = SPIN_LOCK_UNLO int dm_register_dirty_log_type(struct dirty_log_type *type) { - if (!try_module_get(type->module)) - return -EINVAL; - spin_lock(&_lock); type->use_count = 0; list_add(&type->list, &_log_types); @@ -33,11 +30,10 @@ int dm_unregister_dirty_log_type(struct spin_lock(&_lock); if (type->use_count) - DMWARN("Attempt to unregister a log type that is still in use"); - else { + DMWARN("Unregister failed: log type '%s' still in use", + type->name); + else list_del(&type->list); - module_put(type->module); - } spin_unlock(&_lock); @@ -51,6 +47,10 @@ static struct dirty_log_type *get_type(c spin_lock(&_lock); list_for_each_entry (type, &_log_types, list) if (!strcmp(type_name, type->name)) { + if (!type->use_count && !try_module_get(type->module)){ + spin_unlock(&_lock); + return NULL; + } type->use_count++; spin_unlock(&_lock); return type; @@ -63,7 +63,8 @@ static struct dirty_log_type *get_type(c static void put_type(struct dirty_log_type *type) { spin_lock(&_lock); - type->use_count--; + if (!--type->use_count) + module_put(type->module); 
spin_unlock(&_lock); } @@ -112,7 +113,7 @@ void dm_destroy_dirty_log(struct dirty_l /* * The on-disk version of the metadata. */ -#define MIRROR_DISK_VERSION 1 +#define MIRROR_DISK_VERSION 2 #define LOG_OFFSET 2 struct log_header { @@ -129,20 +130,32 @@ struct log_header { struct log_c { struct dm_target *ti; int touched; - sector_t region_size; + uint32_t region_size; unsigned int region_count; region_t sync_count; unsigned bitset_uint32_count; uint32_t *clean_bits; uint32_t *sync_bits; - uint32_t *recovering_bits; /* FIXME: this seems excessive */ + uint32_t *recovering_bits; int sync_search; + /* Resync flag */ + enum sync { + DEFAULTSYNC, /* Synchronize if necessary */ + NOSYNC, /* Devices known to be already in sync */ + FORCESYNC, /* Force a sync to happen */ + } sync; + + int failure_response; + /* * Disk log fields */ + int log_dev_failed; + atomic_t suspended; + struct completion failure_completion; struct dm_dev *log_dev; struct log_header header; @@ -150,7 +163,6 @@ struct log_c { struct log_header *disk_header; struct io_region bits_location; - uint32_t *disk_bits; }; /* @@ -159,20 +171,20 @@ struct log_c { */ static inline int log_test_bit(uint32_t *bs, unsigned bit) { - return test_bit(bit, (unsigned long *) bs) ? 1 : 0; + return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; } static inline void log_set_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - set_bit(bit, (unsigned long *) bs); + ext2_set_bit(bit, (unsigned long *) bs); l->touched = 1; } static inline void log_clear_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - clear_bit(bit, (unsigned long *) bs); + ext2_clear_bit(bit, (unsigned long *) bs); l->touched = 1; } @@ -205,12 +217,19 @@ static int read_header(struct log_c *log header_from_disk(&log->header, log->disk_header); - if (log->header.magic != MIRROR_MAGIC) { + /* New log required? */ + if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) { log->header.magic = MIRROR_MAGIC; log->header.version = MIRROR_DISK_VERSION; log->header.nr_regions = 0; } + /* Version 2 is like version 1 but always little endian on disk. 
*/ +#ifdef __LITTLE_ENDIAN + if (log->header.version == 1) + log->header.version = 2; +#endif + if (log->header.version != MIRROR_DISK_VERSION) { DMWARN("incompatible disk log version"); return -EINVAL; @@ -231,70 +250,69 @@ static inline int write_header(struct lo /*---------------------------------------------------------------- * Bits IO *--------------------------------------------------------------*/ -static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count) -{ - unsigned i; - - for (i = 0; i < count; i++) - core[i] = le32_to_cpu(disk[i]); -} - -static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count) -{ - unsigned i; - - /* copy across the clean/dirty bitset */ - for (i = 0; i < count; i++) - disk[i] = cpu_to_le32(core[i]); -} - static int read_bits(struct log_c *log) { int r; unsigned long ebits; r = dm_io_sync_vm(1, &log->bits_location, READ, - log->disk_bits, &ebits); + log->clean_bits, &ebits); if (r) return r; - bits_to_core(log->clean_bits, log->disk_bits, - log->bitset_uint32_count); return 0; } static int write_bits(struct log_c *log) { unsigned long ebits; - bits_to_disk(log->clean_bits, log->disk_bits, - log->bitset_uint32_count); return dm_io_sync_vm(1, &log->bits_location, WRITE, - log->disk_bits, &ebits); + log->clean_bits, &ebits); } /*---------------------------------------------------------------- - * constructor/destructor + * core log constructor/destructor + * + * argv contains: [[no]sync] [block_on_error] *--------------------------------------------------------------*/ #define BYTE_SHIFT 3 static int core_ctr(struct dirty_log *log, struct dm_target *ti, unsigned int argc, char **argv) { + enum sync sync = DEFAULTSYNC; + int failure_response = DMLOG_IOERR_IGNORE; + struct log_c *lc; - sector_t region_size; + uint32_t region_size; unsigned int region_count; size_t bitset_size; + unsigned i; - if (argc != 1) { - DMWARN("wrong number of arguments to log_c"); + if (argc < 1 || argc > 3) { + DMWARN("wrong number of arguments to mirror log"); return -EINVAL; } - if (sscanf(argv[0], SECTOR_FORMAT, ®ion_size) != 1) { + for (i = 1; i < argc; i++) { + if (!strcmp(argv[i], "sync")) + sync = FORCESYNC; + else if (!strcmp(argv[i], "nosync")) + sync = NOSYNC; + else if (!strcmp(argv[i], "block_on_error")) + failure_response = DMLOG_IOERR_BLOCK; + else { + DMWARN("unrecognised sync argument to mirror log: %s", + argv[i]); + return -EINVAL; + } + } + + if (sscanf(argv[0], "%u", ®ion_size) != 1) { DMWARN("invalid region size string"); return -EINVAL; } - region_count = dm_div_up(ti->len, region_size); + region_count = dm_sector_div_up(ti->len, region_size); lc = kmalloc(sizeof(*lc), GFP_KERNEL); if (!lc) { @@ -306,12 +324,14 @@ static int core_ctr(struct dirty_log *lo lc->touched = 0; lc->region_size = region_size; lc->region_count = region_count; + lc->sync = sync; + lc->failure_response = failure_response; /* - * Work out how many words we need to hold the bitset. + * Work out how many "unsigned long"s we need to hold the bitset. */ bitset_size = dm_round_up(region_count, - sizeof(*lc->clean_bits) << BYTE_SHIFT); + sizeof(unsigned long) << BYTE_SHIFT); bitset_size >>= BYTE_SHIFT; lc->bitset_uint32_count = bitset_size / 4; @@ -330,12 +350,12 @@ static int core_ctr(struct dirty_log *lo kfree(lc); return -ENOMEM; } - memset(lc->sync_bits, 0, bitset_size); - lc->sync_count = 0; + memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); + lc->sync_count = (sync == NOSYNC) ? 
region_count : 0; lc->recovering_bits = vmalloc(bitset_size); if (!lc->recovering_bits) { - DMWARN("couldn't allocate sync bitset"); + DMWARN("couldn't allocate recovering bitset"); vfree(lc->sync_bits); vfree(lc->clean_bits); kfree(lc); @@ -356,6 +376,11 @@ static void core_dtr(struct dirty_log *l kfree(lc); } +/*---------------------------------------------------------------- + * disk log constructor/destructor + * + * argv contains log_device region_size followed optionally by [no]sync + *--------------------------------------------------------------*/ static int disk_ctr(struct dirty_log *log, struct dm_target *ti, unsigned int argc, char **argv) { @@ -364,8 +389,8 @@ static int disk_ctr(struct dirty_log *lo struct log_c *lc; struct dm_dev *dev; - if (argc != 2) { - DMWARN("wrong number of arguments to log_d"); + if (argc < 2 || argc > 3) { + DMWARN("wrong number of arguments to disk mirror log"); return -EINVAL; } @@ -382,6 +407,8 @@ static int disk_ctr(struct dirty_log *lo lc = (struct log_c *) log->context; lc->log_dev = dev; + lc->log_dev_failed = 0; + init_completion(&lc->failure_completion); /* setup the disk header fields */ lc->header_location.bdev = lc->log_dev->bdev; @@ -403,11 +430,6 @@ static int disk_ctr(struct dirty_log *lo size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t), 1 << SECTOR_SHIFT); lc->bits_location.count = size >> SECTOR_SHIFT; - lc->disk_bits = vmalloc(size); - if (!lc->disk_bits) { - vfree(lc->disk_header); - goto bad; - } return 0; bad: @@ -421,7 +443,6 @@ static void disk_dtr(struct dirty_log *l struct log_c *lc = (struct log_c *) log->context; dm_put_device(lc->ti, lc->log_dev); vfree(lc->disk_header); - vfree(lc->disk_bits); core_dtr(log); } @@ -435,42 +456,65 @@ static int count_bits32(uint32_t *addr, return count; } +static void fail_log_device(struct log_c *lc) +{ + lc->log_dev_failed = 1; + if (lc->failure_response == DMLOG_IOERR_BLOCK) + dm_table_event(lc->ti->table); +} + +static void restore_log_device(struct log_c *lc) +{ + lc->log_dev_failed = 0; +} + static int disk_resume(struct dirty_log *log) { - int r; + int r = 0; unsigned i; struct log_c *lc = (struct log_c *) log->context; size_t size = lc->bitset_uint32_count * sizeof(uint32_t); - /* read the disk header */ - r = read_header(lc); - if (r) - return r; - - /* read the bits */ - r = read_bits(lc); - if (r) - return r; - - /* zero any new bits if the mirror has grown */ - for (i = lc->header.nr_regions; i < lc->region_count; i++) - /* FIXME: amazingly inefficient */ - log_clear_bit(lc, lc->clean_bits, i); + /* + * Read the disk header, but only if we know it is good. + * Assume the worst in the event of failure. + */ + if (!lc->log_dev_failed && + ((r = read_header(lc)) || read_bits(lc))) { + DMWARN("Read %s failed on mirror log device, %s.", + r ? 
"header" : "bits", lc->log_dev->name); + fail_log_device(lc); + lc->header.nr_regions = 0; + } + + /* set or clear any new bits */ + if (lc->sync == NOSYNC) + for (i = lc->header.nr_regions; i < lc->region_count; i++) + /* FIXME: amazingly inefficient */ + log_set_bit(lc, lc->clean_bits, i); + else + for (i = lc->header.nr_regions; i < lc->region_count; i++) + /* FIXME: amazingly inefficient */ + log_clear_bit(lc, lc->clean_bits, i); /* copy clean across to sync */ memcpy(lc->sync_bits, lc->clean_bits, size); lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); - /* write the bits */ - r = write_bits(lc); - if (r) - return r; - /* set the correct number of regions in the header */ lc->header.nr_regions = lc->region_count; - /* write the new header */ - return write_header(lc); + /* write out the log. 'i' tells us which has failed if any */ + i = 1; + if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) { + DMWARN("Write %s failed on mirror log device, %s.", + i ? "bits" : "header", lc->log_dev->name); + fail_log_device(lc); + } else + restore_log_device(lc); + + atomic_set(&lc->suspended, 0); + return r; } static sector_t core_get_region_size(struct dirty_log *log) @@ -497,6 +541,17 @@ static int core_flush(struct dirty_log * return 0; } +static int disk_presuspend(struct dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + atomic_set(&lc->suspended, 1); + if (lc->log_dev_failed && (lc->failure_response == DMLOG_IOERR_BLOCK)) + complete(&lc->failure_completion); + + return 0; +} + static int disk_flush(struct dirty_log *log) { int r; @@ -506,9 +561,24 @@ static int disk_flush(struct dirty_log * if (!lc->touched) return 0; + /* + * If a failure occurs, we must wait for a suspension. + * We must not proceed in the event of a failure, + * because if the machine reboots with the log + * incorrect, recovery could be compromised + */ r = write_bits(lc); - if (!r) + if (!r) { lc->touched = 0; + restore_log_device(lc); + } else { + DMERR("Write failure on mirror log device, %s.", + lc->log_dev->name); + fail_log_device(lc); + if (!atomic_read(&lc->suspended) && + (lc->failure_response == DMLOG_IOERR_BLOCK)) + wait_for_completion(&lc->failure_completion); + } return r; } @@ -538,7 +608,7 @@ static int core_get_resync_work(struct d lc->sync_search); lc->sync_search = *region + 1; - if (*region == lc->region_count) + if (*region >= lc->region_count) return 0; } while (log_test_bit(lc->recovering_bits, *region)); @@ -566,6 +636,60 @@ static region_t core_get_sync_count(stru return lc->sync_count; } +#define DMEMIT_SYNC \ + if (lc->sync != DEFAULTSYNC) \ + DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "") + +static int core_status(struct dirty_log *log, status_type_t status, + char *result, unsigned int maxlen) +{ + int sz = 0; + struct log_c *lc = log->context; + + switch(status) { + case STATUSTYPE_INFO: + DMEMIT("1 core"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %u %u ", log->type->name, + lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size); + DMEMIT_SYNC; + } + + return sz; +} + +static int disk_status(struct dirty_log *log, status_type_t status, + char *result, unsigned int maxlen) +{ + int sz = 0; + struct log_c *lc = log->context; + + switch(status) { + case STATUSTYPE_INFO: + DMEMIT("3 disk %s %c", lc->log_dev->name, + lc->log_dev_failed ? 'D' : 'A'); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %u %s %u ", log->type->name, + lc->sync == DEFAULTSYNC ? 
2 : 3, + lc->log_dev->name, + lc->region_size); + DMEMIT_SYNC; + } + + return sz; +} + +static int core_get_failure_response(struct dirty_log *log) +{ + struct log_c *lc = log->context; + + return lc->failure_response; +} + static struct dirty_log_type _core_type = { .name = "core", .module = THIS_MODULE, @@ -579,7 +703,9 @@ static struct dirty_log_type _core_type .clear_region = core_clear_region, .get_resync_work = core_get_resync_work, .complete_resync_work = core_complete_resync_work, - .get_sync_count = core_get_sync_count + .get_sync_count = core_get_sync_count, + .status = core_status, + .get_failure_response = core_get_failure_response, }; static struct dirty_log_type _disk_type = { @@ -587,7 +713,8 @@ static struct dirty_log_type _disk_type .module = THIS_MODULE, .ctr = disk_ctr, .dtr = disk_dtr, - .suspend = disk_flush, + .presuspend = disk_presuspend, + .postsuspend = disk_flush, .resume = disk_resume, .get_region_size = core_get_region_size, .is_clean = core_is_clean, @@ -597,7 +724,9 @@ static struct dirty_log_type _disk_type .clear_region = core_clear_region, .get_resync_work = core_get_resync_work, .complete_resync_work = core_complete_resync_work, - .get_sync_count = core_get_sync_count + .get_sync_count = core_get_sync_count, + .status = disk_status, + .get_failure_response = core_get_failure_response, }; int __init dm_dirty_log_init(void) diff -pruN ./drivers/md.dm/dm-log.h ./drivers/md/dm-log.h --- ./drivers/md.dm/dm-log.h 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-log.h 2006-03-17 13:16:38.000000000 +0300 @@ -9,6 +9,15 @@ #include "dm.h" +/* + * Values returned by get_failure_response() + * DMLOG_IOERR_IGNORE: ignore device failures + * DMLOG_IOERR_BLOCK: issue dm event, and do not complete + * I/O until presuspend is received. + */ +#define DMLOG_IOERR_IGNORE 0 +#define DMLOG_IOERR_BLOCK 1 + typedef sector_t region_t; struct dirty_log_type; @@ -32,7 +41,8 @@ struct dirty_log_type { * There are times when we don't want the log to touch * the disk. */ - int (*suspend)(struct dirty_log *log); + int (*presuspend)(struct dirty_log *log); + int (*postsuspend)(struct dirty_log *log); int (*resume)(struct dirty_log *log); /* @@ -48,6 +58,16 @@ struct dirty_log_type { int (*is_clean)(struct dirty_log *log, region_t region); /* + * Returns: 0, 1 + * + * This is necessary for cluster mirroring. It provides + * a way to detect recovery on another node, so we + * aren't writing concurrently. This function is likely + * to block (when a cluster log is used). + */ + int (*is_remote_recovering)(struct dirty_log *log, region_t region); + + /* * Returns: 0, 1, -EWOULDBLOCK, < 0 * * A predicate function to check the area given by @@ -101,6 +121,18 @@ struct dirty_log_type { * Returns the number of regions that are in sync. */ region_t (*get_sync_count)(struct dirty_log *log); + + /* + * Support function for mirror status requests. + */ + int (*status)(struct dirty_log *log, status_type_t status_type, + char *result, unsigned int maxlen); + + /* + * Return the code describing what to do in the event + * of a device failure. + */ + int (*get_failure_response)(struct dirty_log *log); }; int dm_register_dirty_log_type(struct dirty_log_type *type); diff -pruN ./drivers/md.dm/dm-mpath.c ./drivers/md/dm-mpath.c --- ./drivers/md.dm/dm-mpath.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-mpath.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,1342 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-path-selector.h" +#include "dm-hw-handler.h" +#include "dm-bio-list.h" +#include "dm-bio-record.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MESG_STR(x) x, sizeof(x) + +/* Path properties */ +struct pgpath { + struct list_head list; + + struct priority_group *pg; /* Owning PG */ + unsigned fail_count; /* Cumulative failure count */ + + struct path path; +}; + +#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) + +/* + * Paths are grouped into Priority Groups and numbered from 1 upwards. + * Each has a path selector which controls which path gets used. + */ +struct priority_group { + struct list_head list; + + struct multipath *m; /* Owning multipath instance */ + struct path_selector ps; + + unsigned pg_num; /* Reference number */ + unsigned bypassed; /* Temporarily bypass this PG? */ + + unsigned nr_pgpaths; /* Number of paths in PG */ + struct list_head pgpaths; +}; + +/* Multipath context */ +struct multipath { + struct list_head list; + struct dm_target *ti; + + spinlock_t lock; + + struct hw_handler hw_handler; + unsigned nr_priority_groups; + struct list_head priority_groups; + unsigned pg_init_required; /* pg_init needs calling? */ + unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ + + unsigned nr_valid_paths; /* Total number of usable paths */ + struct pgpath *current_pgpath; + struct priority_group *current_pg; + struct priority_group *next_pg; /* Switch to this PG if set */ + unsigned repeat_count; /* I/Os left before calling PS again */ + + unsigned queue_io; /* Must we queue all I/O? */ + unsigned queue_if_no_path; /* Queue I/O if last path fails? */ + unsigned saved_queue_if_no_path;/* Saved state during suspension */ + + struct work_struct process_queued_ios; + struct bio_list queued_ios; + unsigned queue_size; + + struct work_struct trigger_event; + + /* + * We must use a mempool of mpath_io structs so that we + * can resubmit bios on error. + */ + mempool_t *mpio_pool; +}; + +/* + * Context information attached to each bio we process. 
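+ * The pgpath field records which path the bio was sent down, and details is
+ * a dm_bio_record() snapshot taken in multipath_map() before the bio is
+ * remapped, so that do_end_io() can dm_bio_restore() the bio and queue it
+ * for another attempt on a different path.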
+ */ +struct mpath_io { + struct pgpath *pgpath; + struct dm_bio_details details; +}; + +typedef int (*action_fn) (struct pgpath *pgpath); + +#define MIN_IOS 256 /* Mempool size */ + +static kmem_cache_t *_mpio_cache; + +struct workqueue_struct *kmultipathd; +static void process_queued_ios(void *data); +static void trigger_event(void *data); + + +/*----------------------------------------------- + * Allocation routines + *-----------------------------------------------*/ + +static struct pgpath *alloc_pgpath(void) +{ + struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); + + if (pgpath) { + memset(pgpath, 0, sizeof(*pgpath)); + pgpath->path.is_active = 1; + } + + return pgpath; +} + +static inline void free_pgpath(struct pgpath *pgpath) +{ + kfree(pgpath); +} + +static struct priority_group *alloc_priority_group(void) +{ + struct priority_group *pg; + + pg = kmalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + memset(pg, 0, sizeof(*pg)); + INIT_LIST_HEAD(&pg->pgpaths); + + return pg; +} + +static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) +{ + struct pgpath *pgpath, *tmp; + + list_for_each_entry_safe(pgpath, tmp, pgpaths, list) { + list_del(&pgpath->list); + dm_put_device(ti, pgpath->path.dev); + free_pgpath(pgpath); + } +} + +static void free_priority_group(struct priority_group *pg, + struct dm_target *ti) +{ + struct path_selector *ps = &pg->ps; + + if (ps->type) { + ps->type->destroy(ps); + dm_put_path_selector(ps->type); + } + + free_pgpaths(&pg->pgpaths, ti); + kfree(pg); +} + +static struct multipath *alloc_multipath(void) +{ + struct multipath *m; + + m = kmalloc(sizeof(*m), GFP_KERNEL); + if (m) { + memset(m, 0, sizeof(*m)); + INIT_LIST_HEAD(&m->priority_groups); + spin_lock_init(&m->lock); + m->queue_io = 1; + INIT_WORK(&m->process_queued_ios, process_queued_ios, m); + INIT_WORK(&m->trigger_event, trigger_event, m); + m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, + mempool_free_slab, _mpio_cache); + if (!m->mpio_pool) { + kfree(m); + return NULL; + } + } + + return m; +} + +static void free_multipath(struct multipath *m) +{ + struct priority_group *pg, *tmp; + struct hw_handler *hwh = &m->hw_handler; + + list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) { + list_del(&pg->list); + free_priority_group(pg, m->ti); + } + + if (hwh->type) { + hwh->type->destroy(hwh); + dm_put_hw_handler(hwh->type); + } + + mempool_destroy(m->mpio_pool); + kfree(m); +} + + +/*----------------------------------------------- + * Path selection + *-----------------------------------------------*/ + +static void __switch_pg(struct multipath *m, struct pgpath *pgpath) +{ + struct hw_handler *hwh = &m->hw_handler; + + m->current_pg = pgpath->pg; + + /* Must we initialise the PG first, and queue I/O till it's ready? */ + if (hwh->type && hwh->type->pg_init) { + m->pg_init_required = 1; + m->queue_io = 1; + } else { + m->pg_init_required = 0; + m->queue_io = 0; + } +} + +static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) +{ + struct path *path; + + path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); + if (!path) + return -ENXIO; + + m->current_pgpath = path_to_pgpath(path); + + if (m->current_pg != pg) + __switch_pg(m, m->current_pgpath); + + return 0; +} + +static void __choose_pgpath(struct multipath *m) +{ + struct priority_group *pg; + unsigned bypassed = 1; + + if (!m->nr_valid_paths) + goto failed; + + /* Were we instructed to switch PG? 
*/ + if (m->next_pg) { + pg = m->next_pg; + m->next_pg = NULL; + if (!__choose_path_in_pg(m, pg)) + return; + } + + /* Don't change PG until it has no remaining paths */ + if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) + return; + + /* + * Loop through priority groups until we find a valid path. + * First time we skip PGs marked 'bypassed'. + * Second time we only try the ones we skipped. + */ + do { + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed == bypassed) + continue; + if (!__choose_path_in_pg(m, pg)) + return; + } + } while (bypassed--); + +failed: + m->current_pgpath = NULL; + m->current_pg = NULL; +} + +static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, + unsigned was_queued) +{ + int r = 1; + unsigned long flags; + struct pgpath *pgpath; + + spin_lock_irqsave(&m->lock, flags); + + /* Do we need to select a new pgpath? */ + if (!m->current_pgpath || + (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) + __choose_pgpath(m); + + pgpath = m->current_pgpath; + + if (was_queued) + m->queue_size--; + + if ((pgpath && m->queue_io) || + (!pgpath && m->queue_if_no_path)) { + /* Queue for the daemon to resubmit */ + bio_list_add(&m->queued_ios, bio); + m->queue_size++; + if ((m->pg_init_required && !m->pg_init_in_progress) || + !m->queue_io) + queue_work(kmultipathd, &m->process_queued_ios); + pgpath = NULL; + r = 0; + } else if (!pgpath) + r = -EIO; /* Failed */ + else + bio->bi_bdev = pgpath->path.dev->bdev; + + mpio->pgpath = pgpath; + + spin_unlock_irqrestore(&m->lock, flags); + + return r; +} + +/* + * If we run out of usable paths, should we queue I/O or error it? + */ +static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, + unsigned save_old_value) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + if (save_old_value) + m->saved_queue_if_no_path = m->queue_if_no_path; + else + m->saved_queue_if_no_path = queue_if_no_path; + m->queue_if_no_path = queue_if_no_path; + if (!m->queue_if_no_path && m->queue_size) + queue_work(kmultipathd, &m->process_queued_ios); + + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +/*----------------------------------------------------------------- + * The multipath daemon is responsible for resubmitting queued ios. 
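+ * process_queued_ios() runs on the kmultipathd workqueue: it selects a
+ * current pgpath if none is set, issues the hardware handler's pg_init if
+ * one is still required, and once I/O no longer has to be held it calls
+ * dispatch_queued_ios(), which remaps each held bio with map_io() and then
+ * either resubmits it with generic_make_request() or ends it with an error.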
+ *---------------------------------------------------------------*/ + +static void dispatch_queued_ios(struct multipath *m) +{ + int r; + unsigned long flags; + struct bio *bio = NULL, *next; + struct mpath_io *mpio; + union map_info *info; + + spin_lock_irqsave(&m->lock, flags); + bio = bio_list_get(&m->queued_ios); + spin_unlock_irqrestore(&m->lock, flags); + + while (bio) { + next = bio->bi_next; + bio->bi_next = NULL; + + info = dm_get_mapinfo(bio); + mpio = info->ptr; + + r = map_io(m, bio, mpio, 1); + if (r < 0) + bio_endio(bio, bio->bi_size, r); + else if (r == 1) + generic_make_request(bio); + + bio = next; + } +} + +static void process_queued_ios(void *data) +{ + struct multipath *m = (struct multipath *) data; + struct hw_handler *hwh = &m->hw_handler; + struct pgpath *pgpath = NULL; + unsigned init_required = 0, must_queue = 1; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + if (!m->queue_size) + goto out; + + if (!m->current_pgpath) + __choose_pgpath(m); + + pgpath = m->current_pgpath; + + if ((pgpath && !m->queue_io) || + (!pgpath && !m->queue_if_no_path)) + must_queue = 0; + + if (m->pg_init_required && !m->pg_init_in_progress) { + m->pg_init_required = 0; + m->pg_init_in_progress = 1; + init_required = 1; + } + +out: + spin_unlock_irqrestore(&m->lock, flags); + + if (init_required) + hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path); + + if (!must_queue) + dispatch_queued_ios(m); +} + +/* + * An event is triggered whenever a path is taken out of use. + * Includes path failure and PG bypass. + */ +static void trigger_event(void *data) +{ + struct multipath *m = (struct multipath *) data; + + dm_table_event(m->ti->table); +} + +/*----------------------------------------------------------------- + * Constructor/argument parsing: + * <#multipath feature args> []* + * <#hw_handler args> [hw_handler []*] + * <#priority groups> + * + * [ <#selector args> []* + * <#paths> <#per-path selector args> + * [ []* ]+ ]+ + *---------------------------------------------------------------*/ +struct param { + unsigned min; + unsigned max; + char *error; +}; + +#define ESTR(s) ("dm-multipath: " s) + +static int read_param(struct param *param, char *str, unsigned *v, char **error) +{ + if (!str || + (sscanf(str, "%u", v) != 1) || + (*v < param->min) || + (*v > param->max)) { + *error = param->error; + return -EINVAL; + } + + return 0; +} + +struct arg_set { + unsigned argc; + char **argv; +}; + +static char *shift(struct arg_set *as) +{ + char *r; + + if (as->argc) { + as->argc--; + r = *as->argv; + as->argv++; + return r; + } + + return NULL; +} + +static void consume(struct arg_set *as, unsigned n) +{ + BUG_ON (as->argc < n); + as->argc -= n; + as->argv += n; +} + +static int parse_path_selector(struct arg_set *as, struct priority_group *pg, + struct dm_target *ti) +{ + int r; + struct path_selector_type *pst; + unsigned ps_argc; + + static struct param _params[] = { + {0, 1024, ESTR("invalid number of path selector args")}, + }; + + pst = dm_get_path_selector(shift(as)); + if (!pst) { + ti->error = ESTR("unknown path selector type"); + return -EINVAL; + } + + r = read_param(_params, shift(as), &ps_argc, &ti->error); + if (r) + return -EINVAL; + + r = pst->create(&pg->ps, ps_argc, as->argv); + if (r) { + dm_put_path_selector(pst); + ti->error = ESTR("path selector constructor failed"); + return r; + } + + pg->ps.type = pst; + consume(as, ps_argc); + + return 0; +} + +static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, + struct dm_target 
*ti) +{ + int r; + struct pgpath *p; + + /* we need at least a path arg */ + if (as->argc < 1) { + ti->error = ESTR("no device given"); + return NULL; + } + + p = alloc_pgpath(); + if (!p) + return NULL; + + r = dm_get_device(ti, shift(as), ti->begin, ti->len, + dm_table_get_mode(ti->table), &p->path.dev); + if (r) { + ti->error = ESTR("error getting device"); + goto bad; + } + + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); + if (r) { + dm_put_device(ti, p->path.dev); + goto bad; + } + + return p; + + bad: + free_pgpath(p); + return NULL; +} + +static struct priority_group *parse_priority_group(struct arg_set *as, + struct multipath *m, + struct dm_target *ti) +{ + static struct param _params[] = { + {1, 1024, ESTR("invalid number of paths")}, + {0, 1024, ESTR("invalid number of selector args")} + }; + + int r; + unsigned i, nr_selector_args, nr_params; + struct priority_group *pg; + + if (as->argc < 2) { + as->argc = 0; + ti->error = ESTR("not enough priority group arguments"); + return NULL; + } + + pg = alloc_priority_group(); + if (!pg) { + ti->error = ESTR("couldn't allocate priority group"); + return NULL; + } + pg->m = m; + + r = parse_path_selector(as, pg, ti); + if (r) + goto bad; + + /* + * read the paths + */ + r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); + if (r) + goto bad; + + r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); + if (r) + goto bad; + + nr_params = 1 + nr_selector_args; + for (i = 0; i < pg->nr_pgpaths; i++) { + struct pgpath *pgpath; + struct arg_set path_args; + + if (as->argc < nr_params) + goto bad; + + path_args.argc = nr_params; + path_args.argv = as->argv; + + pgpath = parse_path(&path_args, &pg->ps, ti); + if (!pgpath) + goto bad; + + pgpath->pg = pg; + list_add_tail(&pgpath->list, &pg->pgpaths); + consume(as, nr_params); + } + + return pg; + + bad: + free_priority_group(pg, ti); + return NULL; +} + +static int parse_hw_handler(struct arg_set *as, struct multipath *m, + struct dm_target *ti) +{ + int r; + struct hw_handler_type *hwht; + unsigned hw_argc; + + static struct param _params[] = { + {0, 1024, ESTR("invalid number of hardware handler args")}, + }; + + r = read_param(_params, shift(as), &hw_argc, &ti->error); + if (r) + return -EINVAL; + + if (!hw_argc) + return 0; + + hwht = dm_get_hw_handler(shift(as)); + if (!hwht) { + ti->error = ESTR("unknown hardware handler type"); + return -EINVAL; + } + + r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); + if (r) { + dm_put_hw_handler(hwht); + ti->error = ESTR("hardware handler constructor failed"); + return r; + } + + m->hw_handler.type = hwht; + consume(as, hw_argc - 1); + + return 0; +} + +static int parse_features(struct arg_set *as, struct multipath *m, + struct dm_target *ti) +{ + int r; + unsigned argc; + + static struct param _params[] = { + {0, 1, ESTR("invalid number of feature args")}, + }; + + r = read_param(_params, shift(as), &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + if (!strnicmp(shift(as), MESG_STR("queue_if_no_path"))) + return queue_if_no_path(m, 1, 0); + else { + ti->error = "Unrecognised multipath feature request"; + return -EINVAL; + } +} + +static int multipath_ctr(struct dm_target *ti, unsigned int argc, + char **argv) +{ + /* target parameters */ + static struct param _params[] = { + {1, 1024, ESTR("invalid number of priority groups")}, + {1, 1024, ESTR("invalid initial priority group number")}, + }; + + int r; + struct multipath *m; + struct arg_set as; + unsigned pg_count
= 0; + unsigned next_pg_num; + + as.argc = argc; + as.argv = argv; + + m = alloc_multipath(); + if (!m) { + ti->error = ESTR("can't allocate multipath"); + return -EINVAL; + } + + r = parse_features(&as, m, ti); + if (r) + goto bad; + + r = parse_hw_handler(&as, m, ti); + if (r) + goto bad; + + r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); + if (r) + goto bad; + + r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); + if (r) + goto bad; + + /* parse the priority groups */ + while (as.argc) { + struct priority_group *pg; + + pg = parse_priority_group(&as, m, ti); + if (!pg) { + r = -EINVAL; + goto bad; + } + + m->nr_valid_paths += pg->nr_pgpaths; + list_add_tail(&pg->list, &m->priority_groups); + pg_count++; + pg->pg_num = pg_count; + if (!--next_pg_num) + m->next_pg = pg; + } + + if (pg_count != m->nr_priority_groups) { + ti->error = ESTR("priority group count mismatch"); + r = -EINVAL; + goto bad; + } + + ti->private = m; + m->ti = ti; + + return 0; + + bad: + free_multipath(m); + return r; +} + +static void multipath_dtr(struct dm_target *ti) +{ + struct multipath *m = (struct multipath *) ti->private; + + flush_workqueue(kmultipathd); + free_multipath(m); +} + +/* + * Map bios, recording original fields for later in case we have to resubmit + */ +static int multipath_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + int r; + struct mpath_io *mpio; + struct multipath *m = (struct multipath *) ti->private; + + if (bio_barrier(bio)) + return -EOPNOTSUPP; + + mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); + dm_bio_record(&mpio->details, bio); + + map_context->ptr = mpio; + bio->bi_rw |= (1 << BIO_RW_FAILFAST); + r = map_io(m, bio, mpio, 0); + if (r < 0) + mempool_free(mpio, m->mpio_pool); + + return r; +} + +/* + * Take a path out of use. + */ +static int fail_path(struct pgpath *pgpath) +{ + unsigned long flags; + struct multipath *m = pgpath->pg->m; + + spin_lock_irqsave(&m->lock, flags); + + if (!pgpath->path.is_active) + goto out; + + DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); + + pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); + pgpath->path.is_active = 0; + pgpath->fail_count++; + + m->nr_valid_paths--; + + if (pgpath == m->current_pgpath) + m->current_pgpath = NULL; + + queue_work(kmultipathd, &m->trigger_event); + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +/* + * Reinstate a previously-failed path + */ +static int reinstate_path(struct pgpath *pgpath) +{ + int r = 0; + unsigned long flags; + struct multipath *m = pgpath->pg->m; + + spin_lock_irqsave(&m->lock, flags); + + if (pgpath->path.is_active) + goto out; + + if (!pgpath->pg->ps.type) { + DMWARN("Reinstate path not supported by path selector %s", + pgpath->pg->ps.type->name); + r = -EINVAL; + goto out; + } + + r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); + if (r) + goto out; + + pgpath->path.is_active = 1; + + m->current_pgpath = NULL; + if (!m->nr_valid_paths++ && m->queue_size) + queue_work(kmultipathd, &m->process_queued_ios); + + queue_work(kmultipathd, &m->trigger_event); + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return r; +} + +/* + * Fail or reinstate all paths that match the provided struct dm_dev. 
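+ * This is driven from multipath_message(), which resolves the named device
+ * and then applies fail_path() or reinstate_path() to every matching pgpath.
+ * An illustrative invocation (map and device names assumed):
+ *
+ *     dmsetup message mpath0 0 fail_path /dev/sdb
+ *     dmsetup message mpath0 0 reinstate_path /dev/sdb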
+ */ +static int action_dev(struct multipath *m, struct dm_dev *dev, + action_fn action) +{ + int r = 0; + struct pgpath *pgpath; + struct priority_group *pg; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pgpath->path.dev == dev) + r = action(pgpath); + } + } + + return r; +} + +/* + * Temporarily try to avoid having to use the specified PG + */ +static void bypass_pg(struct multipath *m, struct priority_group *pg, + int bypassed) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + pg->bypassed = bypassed; + m->current_pgpath = NULL; + m->current_pg = NULL; + + spin_unlock_irqrestore(&m->lock, flags); + + queue_work(kmultipathd, &m->trigger_event); +} + +/* + * Switch to using the specified PG from the next I/O that gets mapped + */ +static int switch_pg_num(struct multipath *m, const char *pgstr) +{ + struct priority_group *pg; + unsigned pgnum; + unsigned long flags; + + if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || + (pgnum > m->nr_priority_groups)) { + DMWARN("invalid PG number supplied to switch_pg_num"); + return -EINVAL; + } + + spin_lock_irqsave(&m->lock, flags); + list_for_each_entry(pg, &m->priority_groups, list) { + pg->bypassed = 0; + if (--pgnum) + continue; + + m->current_pgpath = NULL; + m->current_pg = NULL; + m->next_pg = pg; + } + spin_unlock_irqrestore(&m->lock, flags); + + queue_work(kmultipathd, &m->trigger_event); + return 0; +} + +/* + * Set/clear bypassed status of a PG. + * PGs are numbered upwards from 1 in the order they were declared. + */ +static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) +{ + struct priority_group *pg; + unsigned pgnum; + + if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || + (pgnum > m->nr_priority_groups)) { + DMWARN("invalid PG number supplied to bypass_pg"); + return -EINVAL; + } + + list_for_each_entry(pg, &m->priority_groups, list) { + if (!--pgnum) + break; + } + + bypass_pg(m, pg, bypassed); + return 0; +} + +/* + * pg_init must call this when it has completed its initialisation + */ +void dm_pg_init_complete(struct path *path, unsigned err_flags) +{ + struct pgpath *pgpath = path_to_pgpath(path); + struct priority_group *pg = pgpath->pg; + struct multipath *m = pg->m; + unsigned long flags; + + /* We insist on failing the path if the PG is already bypassed. 
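+ * err_flags is the result reported by the hardware handler's pg_init:
+ * MP_FAIL_PATH takes the initialised path out of service, MP_BYPASS_PG
+ * sidelines its whole priority group, and any error also clears the current
+ * path and group so the next map_io() has to choose again.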
*/ + if (err_flags && pg->bypassed) + err_flags |= MP_FAIL_PATH; + + if (err_flags & MP_FAIL_PATH) + fail_path(pgpath); + + if (err_flags & MP_BYPASS_PG) + bypass_pg(m, pg, 1); + + spin_lock_irqsave(&m->lock, flags); + if (err_flags) { + m->current_pgpath = NULL; + m->current_pg = NULL; + } else if (!m->pg_init_required) + m->queue_io = 0; + + m->pg_init_in_progress = 0; + queue_work(kmultipathd, &m->process_queued_ios); + spin_unlock_irqrestore(&m->lock, flags); +} + +/* + * end_io handling + */ +static int do_end_io(struct multipath *m, struct bio *bio, + int error, struct mpath_io *mpio) +{ + struct hw_handler *hwh = &m->hw_handler; + unsigned err_flags = MP_FAIL_PATH; /* Default behavior */ + unsigned long flags; + + if (!error) + return 0; /* I/O complete */ + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + return error; + + if (error == -EOPNOTSUPP) + return error; + + spin_lock_irqsave(&m->lock, flags); + if (!m->nr_valid_paths) { + if (!m->queue_if_no_path) { + spin_unlock_irqrestore(&m->lock, flags); + return -EIO; + } else { + spin_unlock_irqrestore(&m->lock, flags); + goto requeue; + } + } + spin_unlock_irqrestore(&m->lock, flags); + + if (hwh->type && hwh->type->error) + err_flags = hwh->type->error(hwh, bio); + else + err_flags = dm_scsi_err_handler(hwh, bio); + + if (mpio->pgpath) { + if (err_flags & MP_FAIL_PATH) + fail_path(mpio->pgpath); + + if (err_flags & MP_BYPASS_PG) + bypass_pg(m, mpio->pgpath->pg, 1); + } + + if (err_flags & MP_ERROR_IO) + return -EIO; + + requeue: + dm_bio_restore(&mpio->details, bio); + + /* queue for the daemon to resubmit or fail */ + spin_lock_irqsave(&m->lock, flags); + bio_list_add(&m->queued_ios, bio); + m->queue_size++; + if (!m->queue_io) + queue_work(kmultipathd, &m->process_queued_ios); + spin_unlock_irqrestore(&m->lock, flags); + + return 1; /* io not complete */ +} + +static int multipath_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct multipath *m = (struct multipath *) ti->private; + struct mpath_io *mpio = (struct mpath_io *) map_context->ptr; + struct pgpath *pgpath = mpio->pgpath; + struct path_selector *ps; + int r; + + r = do_end_io(m, bio, error, mpio); + if (pgpath) { + ps = &pgpath->pg->ps; + if (ps->type->end_io) + ps->type->end_io(ps, &pgpath->path); + } + if (r <= 0) + mempool_free(mpio, m->mpio_pool); + + return r; +} + +/* + * Suspend can't complete until all the I/O is processed so if + * the last path fails we must error any remaining I/O. + * Note that if the freeze_bdev fails while suspending, the + * queue_if_no_path state is lost - userspace should reset it. + */ +static void multipath_presuspend(struct dm_target *ti) +{ + struct multipath *m = (struct multipath *) ti->private; + + queue_if_no_path(m, 0, 1); +} + +/* + * Restore the queue_if_no_path setting. 
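+ * multipath_presuspend() called queue_if_no_path(m, 0, 1), which stashed the
+ * user-configured value in saved_queue_if_no_path before clearing it so that
+ * outstanding I/O could be flushed or errored; resume simply copies the
+ * saved value back under the lock.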
+ */ +static void multipath_resume(struct dm_target *ti) +{ + struct multipath *m = (struct multipath *) ti->private; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + m->queue_if_no_path = m->saved_queue_if_no_path; + spin_unlock_irqrestore(&m->lock, flags); +} + +/* + * Info output has the following format: + * num_multipath_feature_args [multipath_feature_args]* + * num_handler_status_args [handler_status_args]* + * num_groups init_group_number + * [A|D|E num_ps_status_args [ps_status_args]* + * num_paths num_selector_args + * [path_dev A|F fail_count [selector_args]* ]+ ]+ + * + * Table output has the following format (identical to the constructor string): + * num_feature_args [features_args]* + * num_handler_args hw_handler [hw_handler_args]* + * num_groups init_group_number + * [priority selector-name num_ps_args [ps_args]* + * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ + */ +static int multipath_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + int sz = 0; + unsigned long flags; + struct multipath *m = (struct multipath *) ti->private; + struct hw_handler *hwh = &m->hw_handler; + struct priority_group *pg; + struct pgpath *p; + unsigned pg_num; + char state; + + spin_lock_irqsave(&m->lock, flags); + + /* Features */ + if (type == STATUSTYPE_INFO) + DMEMIT("1 %u ", m->queue_size); + else if (m->queue_if_no_path) + DMEMIT("1 queue_if_no_path "); + else + DMEMIT("0 "); + + if (hwh->type && hwh->type->status) + sz += hwh->type->status(hwh, type, result + sz, maxlen - sz); + else if (!hwh->type || type == STATUSTYPE_INFO) + DMEMIT("0 "); + else + DMEMIT("1 %s ", hwh->type->name); + + DMEMIT("%u ", m->nr_priority_groups); + + if (m->next_pg) + pg_num = m->next_pg->pg_num; + else if (m->current_pg) + pg_num = m->current_pg->pg_num; + else + pg_num = 1; + + DMEMIT("%u ", pg_num); + + switch (type) { + case STATUSTYPE_INFO: + list_for_each_entry(pg, &m->priority_groups, list) { + if (pg->bypassed) + state = 'D'; /* Disabled */ + else if (pg == m->current_pg) + state = 'A'; /* Currently Active */ + else + state = 'E'; /* Enabled */ + + DMEMIT("%c ", state); + + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, NULL, type, + result + sz, + maxlen - sz); + else + DMEMIT("0 "); + + DMEMIT("%u %u ", pg->nr_pgpaths, + pg->ps.type->info_args); + + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT("%s %s %u ", p->path.dev->name, + p->path.is_active ? 
"A" : "F", + p->fail_count); + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, + &p->path, type, result + sz, + maxlen - sz); + } + } + break; + + case STATUSTYPE_TABLE: + list_for_each_entry(pg, &m->priority_groups, list) { + DMEMIT("%s ", pg->ps.type->name); + + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, NULL, type, + result + sz, + maxlen - sz); + else + DMEMIT("0 "); + + DMEMIT("%u %u ", pg->nr_pgpaths, + pg->ps.type->table_args); + + list_for_each_entry(p, &pg->pgpaths, list) { + DMEMIT("%s ", p->path.dev->name); + if (pg->ps.type->status) + sz += pg->ps.type->status(&pg->ps, + &p->path, type, result + sz, + maxlen - sz); + } + } + break; + } + + spin_unlock_irqrestore(&m->lock, flags); + + return 0; +} + +static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) +{ + int r; + struct dm_dev *dev; + struct multipath *m = (struct multipath *) ti->private; + action_fn action; + + if (argc == 1) { + if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) + return queue_if_no_path(m, 1, 0); + else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) + return queue_if_no_path(m, 0, 0); + } + + if (argc != 2) + goto error; + + if (!strnicmp(argv[0], MESG_STR("disable_group"))) + return bypass_pg_num(m, argv[1], 1); + else if (!strnicmp(argv[0], MESG_STR("enable_group"))) + return bypass_pg_num(m, argv[1], 0); + else if (!strnicmp(argv[0], MESG_STR("switch_group"))) + return switch_pg_num(m, argv[1]); + else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) + action = reinstate_path; + else if (!strnicmp(argv[0], MESG_STR("fail_path"))) + action = fail_path; + else + goto error; + + r = dm_get_device(ti, argv[1], ti->begin, ti->len, + dm_table_get_mode(ti->table), &dev); + if (r) { + DMWARN("dm-multipath message: error getting device %s", + argv[1]); + return -EINVAL; + } + + r = action_dev(m, dev, action); + + dm_put_device(ti, dev); + + return r; + +error: + DMWARN("Unrecognised multipath message received."); + return -EINVAL; +} + +/*----------------------------------------------------------------- + * Module setup + *---------------------------------------------------------------*/ +static struct target_type multipath_target = { + .name = "multipath", + .version = {1, 0, 4}, + .module = THIS_MODULE, + .ctr = multipath_ctr, + .dtr = multipath_dtr, + .map = multipath_map, + .end_io = multipath_end_io, + .presuspend = multipath_presuspend, + .resume = multipath_resume, + .status = multipath_status, + .message = multipath_message, +}; + +static int __init dm_multipath_init(void) +{ + int r; + + /* allocate a slab for the dm_ios */ + _mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io), + 0, 0, NULL, NULL); + if (!_mpio_cache) + return -ENOMEM; + + r = dm_register_target(&multipath_target); + if (r < 0) { + DMERR("%s: register failed %d", multipath_target.name, r); + kmem_cache_destroy(_mpio_cache); + return -EINVAL; + } + + kmultipathd = create_workqueue("kmpathd"); + if (!kmultipathd) { + DMERR("%s: failed to create workqueue kmpathd", + multipath_target.name); + dm_unregister_target(&multipath_target); + kmem_cache_destroy(_mpio_cache); + return -ENOMEM; + } + + DMINFO("dm-multipath version %u.%u.%u loaded", + multipath_target.version[0], multipath_target.version[1], + multipath_target.version[2]); + + return r; +} + +static void __exit dm_multipath_exit(void) +{ + int r; + + destroy_workqueue(kmultipathd); + + r = dm_unregister_target(&multipath_target); + if (r < 0) + DMERR("%s: target unregister failed %d", + 
multipath_target.name, r); + kmem_cache_destroy(_mpio_cache); +} + +EXPORT_SYMBOL_GPL(dm_pg_init_complete); + +module_init(dm_multipath_init); +module_exit(dm_multipath_exit); + +MODULE_DESCRIPTION(DM_NAME " multipath target"); +MODULE_AUTHOR("Sistina Software "); +MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/dm-mpath.h ./drivers/md/dm-mpath.h --- ./drivers/md.dm/dm-mpath.h 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-mpath.h 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + * + * Multipath. + */ + +#ifndef DM_MPATH_H +#define DM_MPATH_H + +struct dm_dev; + +struct path { + struct dm_dev *dev; /* Read-only */ + unsigned is_active; /* Read-only */ + + void *pscontext; /* For path-selector use */ + void *hwhcontext; /* For hw-handler use */ +}; + +/* Callback for hwh_pg_init_fn to use when complete */ +void dm_pg_init_complete(struct path *path, unsigned err_flags); + +#endif diff -pruN ./drivers/md.dm/dm-path-selector.c ./drivers/md/dm-path-selector.c --- ./drivers/md.dm/dm-path-selector.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-path-selector.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path selector registration. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include + +struct ps_internal { + struct path_selector_type pst; + + struct list_head list; + long use; +}; + +#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) + +static LIST_HEAD(_path_selectors); +static DECLARE_RWSEM(_ps_lock); + +struct ps_internal *__find_path_selector_type(const char *name) +{ + struct ps_internal *psi; + + list_for_each_entry(psi, &_path_selectors, list) { + if (!strcmp(name, psi->pst.name)) + return psi; + } + + return NULL; +} + +static struct ps_internal *get_path_selector(const char *name) +{ + struct ps_internal *psi; + + down_read(&_ps_lock); + psi = __find_path_selector_type(name); + if (psi) { + if ((psi->use == 0) && !try_module_get(psi->pst.module)) + psi = NULL; + else + psi->use++; + } + up_read(&_ps_lock); + + return psi; +} + +struct path_selector_type *dm_get_path_selector(const char *name) +{ + struct ps_internal *psi; + + if (!name) + return NULL; + + psi = get_path_selector(name); + if (!psi) { + request_module("dm-%s", name); + psi = get_path_selector(name); + } + + return psi ? 
&psi->pst : NULL; +} + +void dm_put_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi; + + if (!pst) + return; + + down_read(&_ps_lock); + psi = __find_path_selector_type(pst->name); + if (!psi) + goto out; + + if (--psi->use == 0) + module_put(psi->pst.module); + + if (psi->use < 0) + BUG(); + +out: + up_read(&_ps_lock); +} + +static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL); + + if (psi) { + memset(psi, 0, sizeof(*psi)); + psi->pst = *pst; + } + + return psi; +} + +int dm_register_path_selector(struct path_selector_type *pst) +{ + int r = 0; + struct ps_internal *psi = _alloc_path_selector(pst); + + if (!psi) + return -ENOMEM; + + down_write(&_ps_lock); + + if (__find_path_selector_type(pst->name)) { + kfree(psi); + r = -EEXIST; + } else + list_add(&psi->list, &_path_selectors); + + up_write(&_ps_lock); + + return r; +} + +int dm_unregister_path_selector(struct path_selector_type *pst) +{ + struct ps_internal *psi; + + down_write(&_ps_lock); + + psi = __find_path_selector_type(pst->name); + if (!psi) { + up_write(&_ps_lock); + return -EINVAL; + } + + if (psi->use) { + up_write(&_ps_lock); + return -ETXTBSY; + } + + list_del(&psi->list); + + up_write(&_ps_lock); + + kfree(psi); + + return 0; +} + +EXPORT_SYMBOL_GPL(dm_register_path_selector); +EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff -pruN ./drivers/md.dm/dm-path-selector.h ./drivers/md/dm-path-selector.h --- ./drivers/md.dm/dm-path-selector.h 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-path-selector.h 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Path-Selector registration. + */ + +#ifndef DM_PATH_SELECTOR_H +#define DM_PATH_SELECTOR_H + +#include + +#include "dm-mpath.h" + +/* + * We provide an abstraction for the code that chooses which path + * to send some io down. + */ +struct path_selector_type; +struct path_selector { + struct path_selector_type *type; + void *context; +}; + +/* Information about a path selector type */ +struct path_selector_type { + char *name; + struct module *module; + + unsigned int table_args; + unsigned int info_args; + + /* + * Constructs a path selector object, takes custom arguments + */ + int (*create) (struct path_selector *ps, unsigned argc, char **argv); + void (*destroy) (struct path_selector *ps); + + /* + * Add an opaque path object, along with some selector specific + * path args (eg, path priority). + */ + int (*add_path) (struct path_selector *ps, struct path *path, + int argc, char **argv, char **error); + + /* + * Chooses a path for this io, if no paths are available then + * NULL will be returned. + * + * repeat_count is the number of times to use the path before + * calling the function again. 0 means don't call it again unless + * the path fails. + */ + struct path *(*select_path) (struct path_selector *ps, + unsigned *repeat_count); + + /* + * Notify the selector that a path has failed. + */ + void (*fail_path) (struct path_selector *ps, struct path *p); + + /* + * Ask selector to reinstate a path. 
+ */ + int (*reinstate_path) (struct path_selector *ps, struct path *p); + + /* + * Table content based on parameters added in ps_add_path_fn + * or path selector status + */ + int (*status) (struct path_selector *ps, struct path *path, + status_type_t type, char *result, unsigned int maxlen); + + int (*end_io) (struct path_selector *ps, struct path *path); +}; + +/* Register a path selector */ +int dm_register_path_selector(struct path_selector_type *type); + +/* Unregister a path selector */ +int dm_unregister_path_selector(struct path_selector_type *type); + +/* Returns a registered path selector type */ +struct path_selector_type *dm_get_path_selector(const char *name); + +/* Releases a path selector */ +void dm_put_path_selector(struct path_selector_type *pst); + +#endif diff -pruN ./drivers/md.dm/dm-raid1.c ./drivers/md/dm-raid1.c --- ./drivers/md.dm/dm-raid1.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-raid1.c 2006-03-17 13:16:38.000000000 +0300 @@ -6,6 +6,7 @@ #include "dm.h" #include "dm-bio-list.h" +#include "dm-bio-record.h" #include "dm-io.h" #include "dm-log.h" #include "kcopyd.h" @@ -28,6 +29,8 @@ static inline void wake(void) queue_work(_kmirrord_wq, &_kmirrord_work); } +static struct workqueue_struct *_kmir_mon_wq; + /*----------------------------------------------------------------- * Region hash * @@ -67,7 +70,7 @@ static inline void wake(void) struct mirror_set; struct region_hash { struct mirror_set *ms; - sector_t region_size; + uint32_t region_size; unsigned region_shift; /* holds persistent region state */ @@ -135,7 +138,7 @@ static void region_free(void *element, v #define MIN_REGIONS 64 #define MAX_RECOVERY 1 static int rh_init(struct region_hash *rh, struct mirror_set *ms, - struct dirty_log *log, sector_t region_size, + struct dirty_log *log, uint32_t region_size, region_t nr_regions) { unsigned int nr_buckets, max_buckets; @@ -253,9 +256,9 @@ static struct region *__rh_alloc(struct else { __rh_insert(rh, nreg); if (nreg->state == RH_CLEAN) { - spin_lock_irq(&rh->region_lock); + spin_lock(&rh->region_lock); list_add(&nreg->list, &rh->clean_regions); - spin_unlock_irq(&rh->region_lock); + spin_unlock(&rh->region_lock); } reg = nreg; } @@ -375,16 +378,19 @@ static void rh_inc(struct region_hash *r read_lock(&rh->hash_lock); reg = __rh_find(rh, region); - if (reg->state == RH_CLEAN) { - rh->log->type->mark_region(rh->log, reg->key); - spin_lock_irq(&rh->region_lock); + spin_lock_irq(&rh->region_lock); + atomic_inc(®->pending); + + if (reg->state == RH_CLEAN) { reg->state = RH_DIRTY; list_del_init(®->list); /* take off the clean list */ spin_unlock_irq(&rh->region_lock); - } - atomic_inc(®->pending); + rh->log->type->mark_region(rh->log, reg->key); + } else + spin_unlock_irq(&rh->region_lock); + read_unlock(&rh->hash_lock); } @@ -406,17 +412,17 @@ static void rh_dec(struct region_hash *r reg = __rh_lookup(rh, region); read_unlock(&rh->hash_lock); + spin_lock_irqsave(&rh->region_lock, flags); if (atomic_dec_and_test(®->pending)) { - spin_lock_irqsave(&rh->region_lock, flags); if (reg->state == RH_RECOVERING) { list_add_tail(®->list, &rh->quiesced_regions); } else { reg->state = RH_CLEAN; list_add(®->list, &rh->clean_regions); } - spin_unlock_irqrestore(&rh->region_lock, flags); should_wake = 1; } + spin_unlock_irqrestore(&rh->region_lock, flags); if (should_wake) wake(); @@ -539,7 +545,8 @@ static void rh_start_recovery(struct reg * Mirror set structures. 
*---------------------------------------------------------------*/ struct mirror { - atomic_t error_count; + atomic_t error_count; /* Error counter to flag mirror failure */ + struct mirror_set *ms; struct dm_dev *dev; sector_t offset; }; @@ -550,36 +557,59 @@ struct mirror_set { struct region_hash rh; struct kcopyd_client *kcopyd_client; - spinlock_t lock; /* protects the next two lists */ + spinlock_t lock; /* protects the lists */ struct bio_list reads; struct bio_list writes; + struct bio_list failures; + struct work_struct failure_work; + struct completion failure_completion; /* recovery */ + atomic_t suspended; region_t nr_regions; int in_sync; unsigned int nr_mirrors; - struct mirror mirror[0]; + spinlock_t choose_lock; /* protects select in choose_mirror(). */ + atomic_t read_count; /* Read counter for read balancing. */ + unsigned int read_mirror; /* Last mirror read. */ + struct mirror *default_mirror; /* Default mirror. */ + struct mirror mirror[0]; }; +struct bio_map_info { + struct mirror *bmi_m; + struct dm_bio_details bmi_bd; +}; + +static mempool_t *bio_map_info_pool = NULL; + +static void *bio_map_info_alloc(int gfp_mask, void *pool_data){ + return kmalloc(sizeof(struct bio_map_info), gfp_mask); +} + +static void bio_map_info_free(void *element, void *pool_data){ + kfree(element); +} + /* * Every mirror should look like this one. */ #define DEFAULT_MIRROR 0 /* - * This is yucky. We squirrel the mirror_set struct away inside - * bi_next for write buffers. This is safe since the bh + * This is yucky. We squirrel the mirror struct away inside + * bi_next for read/write buffers. This is safe since the bh * doesn't get submitted to the lower levels of block layer. */ -static struct mirror_set *bio_get_ms(struct bio *bio) +static struct mirror *bio_get_m(struct bio *bio) { - return (struct mirror_set *) bio->bi_next; + return (struct mirror *) bio->bi_next; } -static void bio_set_ms(struct bio *bio, struct mirror_set *ms) +static void bio_set_m(struct bio *bio, struct mirror *m) { - bio->bi_next = (struct bio *) ms; + bio->bi_next = (struct bio *) m; } /*----------------------------------------------------------------- @@ -607,7 +637,7 @@ static int recover(struct mirror_set *ms unsigned long flags = 0; /* fill in the source */ - m = ms->mirror + DEFAULT_MIRROR; + m = ms->default_mirror; from.bdev = m->dev->bdev; from.sector = m->offset + region_to_sector(reg->rh, reg->key); if (reg->key == (ms->nr_regions - 1)) { @@ -623,7 +653,7 @@ static int recover(struct mirror_set *ms /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (i == DEFAULT_MIRROR) + if (&ms->mirror[i] == ms->default_mirror) continue; m = ms->mirror + i; @@ -673,42 +703,163 @@ static void do_recovery(struct mirror_se } /*----------------------------------------------------------------- - * Reads + * Misc Functions *---------------------------------------------------------------*/ -static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) +#define MIN_READS 128 +/* + * choose_mirror + * @ms: the mirror set + * @m: mirror that has failed, or NULL if just choosing + * + * Returns: chosen mirror, or NULL on failure + */ +static struct mirror *choose_mirror(struct mirror_set *ms, struct mirror *m) { - /* FIXME: add read balancing */ - return ms->mirror + DEFAULT_MIRROR; + int i, retry; + unsigned long flags; + struct mirror *ret = NULL; + + spin_lock_irqsave(&ms->choose_lock, flags); + + if (unlikely(m == ms->default_mirror)) { + i = DEFAULT_MIRROR; + 
atomic_set(&ms->read_count, MIN_READS); + } else + i = ms->read_mirror; + + for (retry = 0; retry < ms->nr_mirrors; ) { + i %= ms->nr_mirrors; + ret = ms->mirror + i; + + if (unlikely(atomic_read(&ret->error_count))) { + retry++; + i++; + } else { + /* + * Guarantee that a number of read IOs + * get queued to the same mirror. + */ + if (atomic_dec_and_test(&ms->read_count)) { + atomic_set(&ms->read_count, MIN_READS); + i++; + } + + ms->read_mirror = i; + break; + } + } + + /* Check for failure of default mirror, reset if necessary */ + if (unlikely(m == ms->default_mirror)) + ms->default_mirror = ret; + + spin_unlock_irqrestore(&ms->choose_lock, flags); + + if (unlikely(atomic_read(&ret->error_count))) { + DMERR("All mirror devices are dead. Unable to choose mirror."); + return NULL; + } + + return ret; +} + +static void fail_mirror(struct mirror *m) +{ + DMINFO("incrementing error_count on %s", m->dev->name); + atomic_inc(&m->error_count); + + choose_mirror(m->ms, m); +} + +static int default_ok(struct mirror *m) +{ + return !atomic_read(&m->ms->default_mirror->error_count); } /* * remap a buffer to a particular mirror. */ -static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) +static sector_t map_sector(struct mirror *m, struct bio *bio) +{ + return m->offset + (bio->bi_sector - m->ms->ti->begin); +} + +static void map_bio(struct mirror *m, struct bio *bio) { bio->bi_bdev = m->dev->bdev; - bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); + bio->bi_sector = map_sector(m, bio); +} + +static void map_region(struct io_region *io, struct mirror *m, + struct bio *bio) +{ + io->bdev = m->dev->bdev; + io->sector = map_sector(m, bio); + io->count = bio->bi_size >> 9; +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static void read_callback(unsigned long error, void *context) +{ + struct bio *bio = (struct bio *)context; + struct mirror *m; + + m = bio_get_m(bio); + bio_set_m(bio, NULL); + + if (unlikely(error)) { + DMWARN("A read failure occurred on a mirror device."); + fail_mirror(m); + if (likely(default_ok(m))) { + DMWARN("Trying different device."); + queue_bio(m->ms, bio, bio_rw(bio)); + } else { + DMERR("No other device available, failing I/O."); + bio_endio(bio, 0, -EIO); + } + } else + bio_endio(bio, bio->bi_size, 0); +} + +/* Asynchronous read. */ +static void read_async_bio(struct mirror *m, struct bio *bio) +{ + struct io_region io; + + map_region(&io, m, bio); + bio_set_m(bio, m); + dm_io_async_bvec(1, &io, READ, + bio->bi_io_vec + bio->bi_idx, + read_callback, bio); } static void do_reads(struct mirror_set *ms, struct bio_list *reads) { - region_t region; struct bio *bio; struct mirror *m; while ((bio = bio_list_pop(reads))) { - region = bio_to_region(&ms->rh, bio); - /* * We can only read balance if the region is in sync. 
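 * choose_mirror() walks the legs round-robin, skipping any whose error_count
 * is non-zero, and keeps a batch of MIN_READS consecutive reads on the same
 * leg before moving to the next one; regions that are not yet in sync are
 * still read only from the default mirror.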
*/ - if (rh_in_sync(&ms->rh, region, 0)) - m = choose_mirror(ms, bio->bi_sector); - else - m = ms->mirror + DEFAULT_MIRROR; + if (likely(rh_in_sync(&ms->rh, + bio_to_region(&ms->rh, bio), + 0))) + m = choose_mirror(ms, NULL); + else { + m = ms->default_mirror; + + /* If the default fails, we give up. */ + if (unlikely(m && atomic_read(&m->error_count))) + m = NULL; + } - map_bio(ms, m, bio); - generic_make_request(bio); + if (likely(m)) + read_async_bio(m, bio); + else + bio_endio(bio, 0, -EIO); } } @@ -722,56 +873,116 @@ static void do_reads(struct mirror_set * * RECOVERING: delay the io until recovery completes * NOSYNC: increment pending, just write to the default mirror *---------------------------------------------------------------*/ +static void write_failure_handler(void *data) +{ + struct bio *bio; + struct bio_list failed_writes; + struct mirror_set *ms = (struct mirror_set *)data; + struct dirty_log *log = ms->rh.log; + + if (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK) { + dm_table_event(ms->ti->table); + wait_for_completion(&ms->failure_completion); + } + + /* Take list out to handle endios. */ + spin_lock_irq(&ms->lock); + failed_writes = ms->failures; + bio_list_init(&ms->failures); + spin_unlock_irq(&ms->lock); + + while ((bio = bio_list_pop(&failed_writes))) + bio_endio(bio, bio->bi_size, 0); +} + static void write_callback(unsigned long error, void *context) { - unsigned int i; - int uptodate = 1; + unsigned int i, ret = 0; struct bio *bio = (struct bio *) context; struct mirror_set *ms; - - ms = bio_get_ms(bio); - bio_set_ms(bio, NULL); - + int uptodate = 0, run; + + ms = (bio_get_m(bio))->ms; + bio_set_m(bio, NULL); + /* * NOTE: We don't decrement the pending count here, * instead it is done by the targets endio function. * This way we handle both writes to SYNC and NOSYNC * regions with the same code. */ + if (unlikely(error)) { + DMERR("Error during write occurred."); - if (error) { /* - * only error the io if all mirrors failed. - * FIXME: bogus + * Test all bits - if all failed, fail io. + * Otherwise, go through hassle of failing a device... */ - uptodate = 0; - for (i = 0; i < ms->nr_mirrors; i++) - if (!test_bit(i, &error)) { + for (i = 0; i < ms->nr_mirrors; i++) { + if (test_bit(i, &error)) + fail_mirror(ms->mirror + i); + else uptodate = 1; - break; + } + + if (likely(uptodate)) { + spin_lock(&ms->lock); + if (atomic_read(&ms->suspended)) { + /* + * The device is suspended, it is + * safe to complete I/O. + */ + spin_unlock(&ms->lock); + } else { + /* + * Need to raise event. Since raising + * events can block, we need to do it in + * separate thread. + * + * run gets set if this will be the first + * bio in the list. + */ + run = !ms->failures.head; + bio_list_add(&ms->failures, bio); + spin_unlock(&ms->lock); + + if (run) + queue_work(_kmir_mon_wq, + &ms->failure_work); + + return; } + } else { + DMERR("All replicated volumes dead, failing I/O"); + /* None of the writes succeeded, fail the I/O.
*/ + ret = -EIO; + } } - bio_endio(bio, bio->bi_size, 0); + + bio_endio(bio, bio->bi_size, ret); } static void do_write(struct mirror_set *ms, struct bio *bio) { unsigned int i; - struct io_region io[KCOPYD_MAX_REGIONS+1]; + struct io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; - for (i = 0; i < ms->nr_mirrors; i++) { - m = ms->mirror + i; - - io[i].bdev = m->dev->bdev; - io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); - io[i].count = bio->bi_size >> 9; - } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) + map_region(dest++, m, bio); - bio_set_ms(bio, ms); - dm_io_async_bvec(ms->nr_mirrors, io, WRITE, - bio->bi_io_vec + bio->bi_idx, - write_callback, bio); + if (likely(dest - io)) { + /* + * We can use the default mirror here, because we + * only need it in order to retrieve the reference + * to the mirror set in write_callback(). + */ + bio_set_m(bio, ms->default_mirror); + dm_io_async_bvec(dest - io, io, WRITE, + bio->bi_io_vec + bio->bi_idx, + write_callback, bio); + } else + bio_endio(bio, bio->bi_size, -EIO); } static void do_writes(struct mirror_set *ms, struct bio_list *writes) @@ -779,6 +990,9 @@ static void do_writes(struct mirror_set int state; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dirty_log *log = ms->rh.log; + region_t region; if (!writes->head) return; @@ -789,9 +1003,18 @@ static void do_writes(struct mirror_set bio_list_init(&sync); bio_list_init(&nosync); bio_list_init(&recover); + bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); + region = bio_to_region(&ms->rh, bio); + + if (log->type->is_remote_recovering && + log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = rh_state(&ms->rh, region, 1); switch (state) { case RH_CLEAN: case RH_DIRTY: @@ -810,6 +1033,8 @@ static void do_writes(struct mirror_set bio_list_add(this_list, bio); } + bio_list_merge(writes, &requeue); + /* * Increment the pending counts for any regions that will * be written to (writes to recover regions are going to @@ -829,7 +1054,7 @@ static void do_writes(struct mirror_set rh_delay(&ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); + map_bio(ms->default_mirror, bio); generic_make_request(bio); } } @@ -844,12 +1069,12 @@ static void do_mirror(struct mirror_set { struct bio_list reads, writes; - spin_lock(&ms->lock); + spin_lock_irq(&ms->lock); reads = ms->reads; writes = ms->writes; bio_list_init(&ms->reads); bio_list_init(&ms->writes); - spin_unlock(&ms->lock); + spin_unlock_irq(&ms->lock); rh_update_states(&ms->rh); do_recovery(ms); @@ -871,7 +1096,7 @@ static void do_work(void *ignored) * Target functions *---------------------------------------------------------------*/ static struct mirror_set *alloc_context(unsigned int nr_mirrors, - sector_t region_size, + uint32_t region_size, struct dm_target *ti, struct dirty_log *dl) { @@ -891,11 +1116,16 @@ static struct mirror_set *alloc_context( memset(ms, 0, len); spin_lock_init(&ms->lock); + spin_lock_init(&ms->choose_lock); ms->ti = ti; ms->nr_mirrors = nr_mirrors; - ms->nr_regions = dm_div_up(ti->len, region_size); + ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->in_sync = 0; + ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; + + /* a resume must be issued to start the device */ + atomic_set(&ms->suspended, 1); if (rh_init(&ms->rh, ms, dl, 
region_size, ms->nr_regions)) { ti->error = "dm-mirror: Error creating dirty region hash"; @@ -903,6 +1133,13 @@ static struct mirror_set *alloc_context( return NULL; } + atomic_set(&ms->read_count, MIN_READS); + + bio_list_init(&ms->failures); + INIT_WORK(&ms->failure_work, write_failure_handler, ms); + + init_completion(&ms->failure_completion); + return ms; } @@ -916,7 +1153,7 @@ static void free_context(struct mirror_s kfree(ms); } -static inline int _check_region_size(struct dm_target *ti, sector_t size) +static inline int _check_region_size(struct dm_target *ti, uint32_t size) { return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || size > ti->len); @@ -940,6 +1177,8 @@ static int get_mirror(struct mirror_set } ms->mirror[mirror].offset = offset; + atomic_set(&(ms->mirror[mirror].error_count), 0); + ms->mirror[mirror].ms = ms; return 0; } @@ -1009,8 +1248,8 @@ static struct dirty_log *create_dirty_lo * log_type #log_params * #mirrors [mirror_path offset]{2,} * - * For now, #log_params = 1, log_type = "core" - * + * log_type is "core" or "disk" + * #log_params is between 1 and 3 */ #define DM_IO_PAGES 64 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -1060,6 +1299,7 @@ static int mirror_ctr(struct dm_target * } ti->private = ms; + ti->split_io = ms->rh.region_size; r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); if (r) { @@ -1082,14 +1322,15 @@ static void mirror_dtr(struct dm_target static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) { + unsigned long flags; int should_wake = 0; struct bio_list *bl; bl = (rw == WRITE) ? &ms->writes : &ms->reads; - spin_lock(&ms->lock); + spin_lock_irqsave(&ms->lock, flags); should_wake = !(bl->head); bio_list_add(bl, bio); - spin_unlock(&ms->lock); + spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) wake(); @@ -1104,42 +1345,64 @@ static int mirror_map(struct dm_target * int r, rw = bio_rw(bio); struct mirror *m; struct mirror_set *ms = ti->private; - - map_context->ll = bio->bi_sector >> ms->rh.region_shift; + struct dm_bio_details *bd; + struct bio_map_info *bmi; if (rw == WRITE) { + /* Save region for mirror_end_io() handler */ + map_context->ll = bio_to_region(&ms->rh, bio); queue_bio(ms, bio, rw); return 0; } + /* It's all about the READs now */ + r = ms->rh.log->type->in_sync(ms->rh.log, bio_to_region(&ms->rh, bio), 0); if (r < 0 && r != -EWOULDBLOCK) return r; - if (r == -EWOULDBLOCK) /* FIXME: ugly */ + if (r == -EWOULDBLOCK) r = 0; - /* - * We don't want to fast track a recovery just for a read - * ahead. So we just let it silently fail. - * FIXME: get rid of this. - */ - if (!r && rw == READA) - return -EIO; + if (likely(r)) { + /* + * Optimize reads by avoiding to hand them to daemon. + * + * In case they fail, queue them for another shot + * in the mirror_end_io() function. + */ + m = choose_mirror(ms, NULL); + if (likely(m)) { + bmi = mempool_alloc(bio_map_info_pool, GFP_NOIO); + + if (likely(bmi)) { + /* without this, a read is not retryable */ + bd = &bmi->bmi_bd; + dm_bio_record(bd, bio); + map_context->ptr = bmi; + bmi->bmi_m = m; + } else { + /* we could fail now, but we can at least ** + ** give it a shot. The bd is only used to ** + ** retry in the event of a failure anyway. ** + ** If we fail, we can fail the I/O then. */ + map_context->ptr = NULL; + } + + map_bio(m, bio); + return 1; /* Mapped -> queue request. 
*/ + } else + return -EIO; + } else { + /* Either not clean, or -EWOULDBLOCK */ + if (rw == READA) + return -EWOULDBLOCK; - if (!r) { - /* Pass this io over to the daemon */ queue_bio(ms, bio, rw); - return 0; } - m = choose_mirror(ms, bio->bi_sector); - if (!m) - return -EIO; - - map_bio(ms, m, bio); - return 1; + return 0; } static int mirror_end_io(struct dm_target *ti, struct bio *bio, @@ -1147,71 +1410,140 @@ static int mirror_end_io(struct dm_targe { int rw = bio_rw(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; - region_t region = map_context->ll; + struct mirror *m = NULL; + struct dm_bio_details *bd = NULL; /* * We need to dec pending if this was a write. */ - if (rw == WRITE) - rh_dec(&ms->rh, region); + if (rw == WRITE) { + rh_dec(&ms->rh, map_context->ll); + return error; + } - return 0; + if (error == -EOPNOTSUPP) + goto out; + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + goto out; + + if (unlikely(error)) { + DMERR("A read failure occurred on a mirror device."); + if (!map_context->ptr) { + /* + * There wasn't enough memory to record necessary + * information for a retry. + */ + DMERR("Out of memory causing inability to retry read."); + return -EIO; + } + m = ((struct bio_map_info *)map_context->ptr)->bmi_m; + fail_mirror(m); /* Flag error on mirror. */ + + /* + * A failed read needs to get queued + * to the daemon for another shot to + * one (if any) intact mirrors. + */ + if (default_ok(m)) { + bd = &(((struct bio_map_info *)map_context->ptr)->bmi_bd); + + DMWARN("Trying different device."); + dm_bio_restore(bd, bio); + mempool_free(map_context->ptr, bio_map_info_pool); + map_context->ptr = NULL; + queue_bio(ms, bio, rw); + return 1; /* We want another shot on the bio. */ + } + DMERR("All replicated volumes dead, failing I/O"); + } + + out: + if (map_context->ptr) + mempool_free(map_context->ptr, bio_map_info_pool); + + return error; } -static void mirror_suspend(struct dm_target *ti) +static void mirror_presuspend(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; struct dirty_log *log = ms->rh.log; + unsigned long flags; + int run; + + /* + * Only run the completion if we are suspending after + * a disk failure. + */ + spin_lock_irqsave(&ms->lock, flags); + run = ms->failures.head ? 
1 : 0; + spin_unlock_irqrestore(&ms->lock, flags); + + if (run && (log->type->get_failure_response(log) == DMLOG_IOERR_BLOCK)) + complete(&ms->failure_completion); + + if (log->type->presuspend && log->type->presuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log presuspend failed"); + +} + +static void mirror_postsuspend(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + struct dirty_log *log = ms->rh.log; + rh_stop_recovery(&ms->rh); - if (log->type->suspend && log->type->suspend(log)) + if (log->type->postsuspend && log->type->postsuspend(log)) /* FIXME: need better error handling */ - DMWARN("log suspend failed"); + DMWARN("log postsuspend failed"); + atomic_set(&ms->suspended, 1); } static void mirror_resume(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; struct dirty_log *log = ms->rh.log; + if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); - rh_start_recovery(&ms->rh); + + if (atomic_dec_and_test(&ms->suspended)) + rh_start_recovery(&ms->rh); + atomic_set(&ms->suspended, 0); } static int mirror_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { - char buffer[32]; unsigned int m, sz = 0; struct mirror_set *ms = (struct mirror_set *) ti->private; - -#define EMIT(x...) sz += ((sz >= maxlen) ? \ - 0 : scnprintf(result + sz, maxlen - sz, x)) + char buffer[ms->nr_mirrors + 1]; switch (type) { case STATUSTYPE_INFO: - EMIT("%d ", ms->nr_mirrors); - + DMEMIT("%d ", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) { - format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev); - EMIT("%s ", buffer); + DMEMIT("%s ", ms->mirror[m].dev->name); + buffer[m] = atomic_read(&(ms->mirror[m].error_count)) ? 
+ 'D' : 'A'; } + buffer[m] = '\0'; - EMIT(SECTOR_FORMAT "/" SECTOR_FORMAT, - ms->rh.log->type->get_sync_count(ms->rh.log), - ms->nr_regions); + DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT " 1 %s ", + ms->rh.log->type->get_sync_count(ms->rh.log), + ms->nr_regions, buffer); + ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); break; case STATUSTYPE_TABLE: - EMIT("%s 1 " SECTOR_FORMAT " %d ", - ms->rh.log->type->name, ms->rh.region_size, - ms->nr_mirrors); - - for (m = 0; m < ms->nr_mirrors; m++) { - format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev); - EMIT("%s " SECTOR_FORMAT " ", - buffer, ms->mirror[m].offset); - } + sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); + DMEMIT("%d ", ms->nr_mirrors); + for (m = 0; m < ms->nr_mirrors; m++) + DMEMIT("%s " SECTOR_FORMAT " ", + ms->mirror[m].dev->name, ms->mirror[m].offset); } return 0; @@ -1219,13 +1551,14 @@ static int mirror_status(struct dm_targe static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 1}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, .map = mirror_map, .end_io = mirror_end_io, - .suspend = mirror_suspend, + .presuspend = mirror_presuspend, + .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status, }; @@ -1234,24 +1567,38 @@ static int __init dm_mirror_init(void) { int r; + bio_map_info_pool = mempool_create(100, bio_map_info_alloc, + bio_map_info_free, NULL); + if (!bio_map_info_pool) + return -ENOMEM; + r = dm_dirty_log_init(); if (r) return r; - _kmirrord_wq = create_workqueue("kmirrord"); + _kmirrord_wq = create_singlethread_workqueue("kmirrord"); if (!_kmirrord_wq) { DMERR("couldn't start kmirrord"); dm_dirty_log_exit(); - return r; + return -ENOMEM; } INIT_WORK(&_kmirrord_work, do_work, NULL); + _kmir_mon_wq = create_singlethread_workqueue("kmir_mon"); + if (!_kmir_mon_wq) { + DMERR("couldn't start kmir_mon"); + dm_dirty_log_exit(); + destroy_workqueue(_kmirrord_wq); + return -ENOMEM; + } + r = dm_register_target(&mirror_target); if (r < 0) { DMERR("%s: Failed to register mirror target", mirror_target.name); dm_dirty_log_exit(); destroy_workqueue(_kmirrord_wq); + destroy_workqueue(_kmir_mon_wq); } return r; diff -pruN ./drivers/md.dm/dm-round-robin.c ./drivers/md/dm-round-robin.c --- ./drivers/md.dm/dm-round-robin.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/dm-round-robin.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Round-robin path selector. 
+ */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include + +/*----------------------------------------------------------------- + * Path-handling code, paths are held in lists + *---------------------------------------------------------------*/ +struct path_info { + struct list_head list; + struct path *path; + unsigned repeat_count; +}; + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +/*----------------------------------------------------------------- + * Round-robin selector + *---------------------------------------------------------------*/ + +#define RR_MIN_IO 1000 + +struct selector { + struct list_head valid_paths; + struct list_head invalid_paths; +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + } + + return s; +} + +static int rr_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s; + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void rr_destroy(struct path_selector *ps) +{ + struct selector *s = (struct selector *) ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->invalid_paths); + kfree(s); + ps->context = NULL; +} + +static int rr_status(struct path_selector *ps, struct path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + struct path_info *pi; + int sz = 0; + + if (!path) + DMEMIT("0 "); + else { + switch(type) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + pi = path->pscontext; + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +/* + * Called during initialisation to register each path with an + * optional repeat_count. 
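Editor's note: for reference, a stand-alone sketch of the selection policy implemented by rr_select_path() further down: the head of the valid list serves repeat_count I/Os and is then rotated to the tail. Path names, RR_MIN_IO and the round count are illustrative; the real selector keeps struct path_info entries on a kernel list and is driven by the multipath core.

#include <stdio.h>

#define RR_MIN_IO 1000
#define NR_PATHS  3

int main(void)
{
    const char *path[NR_PATHS] = { "8:16", "8:32", "8:48" };
    int order[NR_PATHS] = { 0, 1, 2 };   /* stand-in for the valid_paths list */
    unsigned long ios = 0;

    for (int round = 0; round < 6; round++) {
        int head = order[0];

        /* rr_select_path(): move the head to the tail ... */
        for (int i = 0; i < NR_PATHS - 1; i++)
            order[i] = order[i + 1];
        order[NR_PATHS - 1] = head;

        /* ... and the caller reuses that path repeat_count times */
        printf("I/O %lu-%lu -> path %s\n",
               ios, ios + RR_MIN_IO - 1, path[head]);
        ios += RR_MIN_IO;
    }
    return 0;
}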
+ */ +static int rr_add_path(struct path_selector *ps, struct path *path, + int argc, char **argv, char **error) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi; + unsigned repeat_count = RR_MIN_IO; + + if (argc > 1) { + *error = "round-robin ps: incorrect number of arguments"; + return -EINVAL; + } + + /* First path argument is number of I/Os before switching path */ + if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + *error = "round-robin ps: invalid repeat count"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "round-robin ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + path->pscontext = pi; + + list_add(&pi->list, &s->valid_paths); + + return 0; +} + +static void rr_fail_path(struct path_selector *ps, struct path *p) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = p->pscontext; + + list_move(&pi->list, &s->invalid_paths); +} + +static int rr_reinstate_path(struct path_selector *ps, struct path *p) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = p->pscontext; + + list_move(&pi->list, &s->valid_paths); + + return 0; +} + +static struct path *rr_select_path(struct path_selector *ps, + unsigned *repeat_count) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = NULL; + + if (!list_empty(&s->valid_paths)) { + pi = list_entry(s->valid_paths.next, struct path_info, list); + list_move_tail(&pi->list, &s->valid_paths); + *repeat_count = pi->repeat_count; + } + + return pi ? pi->path : NULL; +} + +static struct path_selector_type rr_ps = { + .name = "round-robin", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 0, + .create = rr_create, + .destroy = rr_destroy, + .status = rr_status, + .add_path = rr_add_path, + .fail_path = rr_fail_path, + .reinstate_path = rr_reinstate_path, + .select_path = rr_select_path, +}; + +static int __init dm_rr_init(void) +{ + int r = dm_register_path_selector(&rr_ps); + + if (r < 0) + DMERR("round-robin: register failed %d", r); + + DMINFO("dm-round-robin version 1.0.0 loaded"); + + return r; +} + +static void __exit dm_rr_exit(void) +{ + int r = dm_unregister_path_selector(&rr_ps); + + if (r < 0) + DMERR("round-robin: unregister failed %d", r); +} + +module_init(dm_rr_init); +module_exit(dm_rr_exit); + +MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); +MODULE_AUTHOR("Sistina Software "); +MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/dm-snap.c ./drivers/md/dm-snap.c --- ./drivers/md.dm/dm-snap.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-snap.c 2006-03-17 13:16:38.000000000 +0300 @@ -49,6 +49,11 @@ struct pending_exception { struct bio_list snapshot_bios; /* + * Short-term queue of pending exceptions prior to submission. + */ + struct list_head list; + + /* * Other pending_exceptions that are processing this * chunk. When this list is empty, we know we can * complete the origins. @@ -371,6 +376,15 @@ static inline ulong round_up(ulong n, ul return (n + size) & ~size; } +static void read_snapshot_metadata(struct dm_snapshot *s) +{ + if (s->store.read_metadata(&s->store)) { + down_write(&s->lock); + s->valid = 0; + up_write(&s->lock); + } +} + /* * Construct a snapshot mapping:
<origin_dev> <COW-dev> <p/n> <chunk-size>
*/ @@ -457,7 +471,7 @@ static int snapshot_ctr(struct dm_target s->chunk_shift = ffs(chunk_size) - 1; s->valid = 1; - s->have_metadata = 0; + s->active = 0; s->last_percent = 0; init_rwsem(&s->lock); s->table = ti->table; @@ -492,7 +506,11 @@ static int snapshot_ctr(struct dm_target goto bad5; } + /* Metadata must only be loaded into one table at once */ + read_snapshot_metadata(s); + /* Add snapshot to the list of snapshots for this origin */ + /* Exceptions aren't triggered till snapshot_resume() is called */ if (register_snapshot(s)) { r = -EINVAL; ti->error = "Cannot register snapshot origin"; @@ -529,8 +547,12 @@ static void snapshot_dtr(struct dm_targe { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + /* Prevent further origin writes from using this snapshot. */ + /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); + kcopyd_client_destroy(s->kcopyd_client); + exit_exception_table(&s->pending, pending_cache); exit_exception_table(&s->complete, exception_cache); @@ -539,7 +561,7 @@ static void snapshot_dtr(struct dm_targe dm_put_device(ti, s->origin); dm_put_device(ti, s->cow); - kcopyd_client_destroy(s->kcopyd_client); + kfree(s); } @@ -777,7 +799,10 @@ static int snapshot_map(struct dm_target /* Full snapshots are not usable */ if (!s->valid) - return -1; + return -EIO; + + if (unlikely(bio_barrier(bio))) + return -EOPNOTSUPP; /* * Write to snapshot - higher level takes care of RW/RO @@ -848,24 +873,15 @@ static void snapshot_resume(struct dm_ta { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - if (s->have_metadata) - return; - - if (s->store.read_metadata(&s->store)) { - down_write(&s->lock); - s->valid = 0; - up_write(&s->lock); - } - - s->have_metadata = 1; + down_write(&s->lock); + s->active = 1; + up_write(&s->lock); } static int snapshot_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; - char cow[32]; - char org[32]; switch (type) { case STATUSTYPE_INFO: @@ -892,9 +908,8 @@ static int snapshot_status(struct dm_tar * to make private copies if the output is to * make sense. 
*/ - format_dev_t(cow, snap->cow->bdev->bd_dev); - format_dev_t(org, snap->origin->bdev->bd_dev); - snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, org, cow, + snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, + snap->origin->name, snap->cow->name, snap->type, snap->chunk_size); break; } @@ -924,14 +939,19 @@ static int __origin_write(struct list_he int r = 1, first = 1; struct dm_snapshot *snap; struct exception *e; - struct pending_exception *pe, *last = NULL; + struct pending_exception *pe, *next_pe, *last = NULL; chunk_t chunk; + LIST_HEAD(pe_queue); /* Do all the snapshots on this origin */ list_for_each_entry (snap, snapshots, list) { - /* Only deal with valid snapshots */ - if (!snap->valid) + /* Only deal with valid and active snapshots */ + if (!snap->valid || !snap->active) + continue; + + /* Nothing to do if writing beyond end of snapshot */ + if (bio->bi_sector >= dm_table_get_size(snap->table)) continue; down_write(&snap->lock); @@ -955,12 +975,19 @@ static int __origin_write(struct list_he snap->valid = 0; } else { - if (last) + if (first) { + bio_list_add(&pe->origin_bios, bio); + r = 0; + first = 0; + } + if (last && list_empty(&pe->siblings)) list_merge(&pe->siblings, &last->siblings); - + if (!pe->started) { + pe->started = 1; + list_add_tail(&pe->list, &pe_queue); + } last = pe; - r = 0; } } @@ -970,24 +997,8 @@ static int __origin_write(struct list_he /* * Now that we have a complete pe list we can start the copying. */ - if (last) { - pe = last; - do { - down_write(&pe->snap->lock); - if (first) - bio_list_add(&pe->origin_bios, bio); - if (!pe->started) { - pe->started = 1; - up_write(&pe->snap->lock); - start_copy(pe); - } else - up_write(&pe->snap->lock); - first = 0; - pe = list_entry(pe->siblings.next, - struct pending_exception, siblings); - - } while (pe != last); - } + list_for_each_entry_safe(pe, next_pe, &pe_queue, list) + start_copy(pe); return r; } @@ -1051,6 +1062,9 @@ static int origin_map(struct dm_target * struct dm_dev *dev = (struct dm_dev *) ti->private; bio->bi_bdev = dev->bdev; + if (unlikely(bio_barrier(bio))) + return -EOPNOTSUPP; + /* Only tell snapshots if this is a write */ return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1; } @@ -1082,7 +1096,6 @@ static int origin_status(struct dm_targe unsigned int maxlen) { struct dm_dev *dev = (struct dm_dev *) ti->private; - char buffer[32]; switch (type) { case STATUSTYPE_INFO: @@ -1090,8 +1103,7 @@ static int origin_status(struct dm_targe break; case STATUSTYPE_TABLE: - format_dev_t(buffer, dev->bdev->bd_dev); - snprintf(result, maxlen, "%s", buffer); + snprintf(result, maxlen, "%s", dev->name); break; } @@ -1100,7 +1112,7 @@ static int origin_status(struct dm_targe static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 0, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1111,7 +1123,7 @@ static struct target_type origin_target static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 0, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff -pruN ./drivers/md.dm/dm-snap.h ./drivers/md/dm-snap.h --- ./drivers/md.dm/dm-snap.h 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-snap.h 2006-03-17 13:16:38.000000000 +0300 @@ -99,7 +99,9 @@ struct dm_snapshot { /* You can't use a snapshot if this is 0 (e.g. 
if full) */ int valid; - int have_metadata; + + /* Origin writes don't trigger exceptions until this is set */ + int active; /* Used for display of table */ char type; diff -pruN ./drivers/md.dm/dm-stripe.c ./drivers/md/dm-stripe.c --- ./drivers/md.dm/dm-stripe.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-stripe.c 2006-03-17 13:16:38.000000000 +0300 @@ -21,7 +21,7 @@ struct stripe_c { uint32_t stripes; /* The size of this target / num. stripes */ - uint32_t stripe_width; + sector_t stripe_width; /* stripe chunk size */ uint32_t chunk_shift; @@ -173,9 +173,8 @@ static int stripe_map(struct dm_target * struct stripe_c *sc = (struct stripe_c *) ti->private; sector_t offset = bio->bi_sector - ti->begin; - uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); - uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ - chunk = chunk / sc->stripes; + sector_t chunk = offset >> sc->chunk_shift; + uint32_t stripe = do_div(chunk, sc->stripes); bio->bi_bdev = sc->stripe[stripe].dev->bdev; bio->bi_sector = sc->stripe[stripe].physical_start + @@ -189,10 +188,6 @@ static int stripe_status(struct dm_targe struct stripe_c *sc = (struct stripe_c *) ti->private; unsigned int sz = 0; unsigned int i; - char buffer[32]; - -#define EMIT(x...) sz += ((sz >= maxlen) ? \ - 0 : scnprintf(result + sz, maxlen - sz, x)) switch (type) { case STATUSTYPE_INFO: @@ -200,12 +195,10 @@ static int stripe_status(struct dm_targe break; case STATUSTYPE_TABLE: - EMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); - for (i = 0; i < sc->stripes; i++) { - format_dev_t(buffer, sc->stripe[i].dev->bdev->bd_dev); - EMIT(" %s " SECTOR_FORMAT, buffer, - sc->stripe[i].physical_start); - } + DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1); + for (i = 0; i < sc->stripes; i++) + DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name, + sc->stripe[i].physical_start); break; } return 0; @@ -213,7 +206,7 @@ static int stripe_status(struct dm_targe static struct target_type stripe_target = { .name = "striped", - .version= {1, 0, 1}, + .version= {1, 0, 2}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff -pruN ./drivers/md.dm/dm-table.c ./drivers/md/dm-table.c --- ./drivers/md.dm/dm-table.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-table.c 2006-03-17 13:16:38.000000000 +0300 @@ -57,7 +57,7 @@ struct dm_table { /* * Similar to ceiling(log_size(n)) */ -static unsigned int int_log(unsigned long n, unsigned long base) +static unsigned int int_log(unsigned int n, unsigned int base) { int result = 0; @@ -454,6 +454,8 @@ static int __table_get_device(struct dm_ return r; } + format_dev_t(dd->name, dev); + atomic_set(&dd->count, 0); list_add(&dd->list, &t->devices); @@ -575,7 +577,7 @@ static char **realloc_argv(unsigned *arr /* * Destructively splits up the argument list to pass to ctr. 
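Editor's note: a worked example of the remapping now done in stripe_map() above. In the kernel, do_div(chunk, sc->stripes) divides the 64-bit chunk number in place and returns the remainder; plain / and % stand in for it in this sketch, and the chunk size and offset are illustrative.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const uint32_t stripes = 3, chunk_shift = 7;      /* 128-sector (64 KiB) chunks */
    const uint64_t chunk_mask = (1u << chunk_shift) - 1;
    uint64_t offset = 1000;                           /* sectors into the target    */

    uint64_t chunk  = offset >> chunk_shift;          /* 1000 / 128 = 7             */
    uint32_t stripe = chunk % stripes;                /* do_div() remainder -> 1    */
    chunk /= stripes;                                 /* do_div() quotient  -> 2    */

    printf("offset %llu -> stripe %u, device sector offset %llu\n",
           (unsigned long long)offset, stripe,
           (unsigned long long)((chunk << chunk_shift) + (offset & chunk_mask)));
    return 0;
}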
*/ -static int split_args(int *argc, char ***argvp, char *input) +int dm_split_args(int *argc, char ***argvp, char *input) { char *start, *end = input, *out, **argv = NULL; unsigned array_size = 0; @@ -663,14 +665,14 @@ int dm_table_add_target(struct dm_table if (!len) { tgt->error = "zero-length target"; - DMERR(": %s\n", tgt->error); + DMERR("%s", tgt->error); return -EINVAL; } tgt->type = dm_get_target_type(type); if (!tgt->type) { tgt->error = "unknown target type"; - DMERR(": %s\n", tgt->error); + DMERR("%s", tgt->error); return -EINVAL; } @@ -688,7 +690,7 @@ int dm_table_add_target(struct dm_table goto bad; } - r = split_args(&argc, &argv, params); + r = dm_split_args(&argc, &argv, params); if (r) { tgt->error = "couldn't split parameters (insufficient memory)"; goto bad; @@ -707,7 +709,7 @@ int dm_table_add_target(struct dm_table return 0; bad: - DMERR(": %s\n", tgt->error); + DMERR("%s", tgt->error); dm_put_target_type(tgt->type); return r; } @@ -825,7 +827,7 @@ void dm_table_set_restrictions(struct dm * Make sure we obey the optimistic sub devices * restrictions. */ - q->max_sectors = t->limits.max_sectors; + blk_queue_max_sectors(q, t->limits.max_sectors); q->max_phys_segments = t->limits.max_phys_segments; q->max_hw_segments = t->limits.max_hw_segments; q->hardsect_size = t->limits.hardsect_size; @@ -848,18 +850,38 @@ int dm_table_get_mode(struct dm_table *t return t->mode; } -void dm_table_suspend_targets(struct dm_table *t) +static void suspend_targets(struct dm_table *t, unsigned postsuspend) { - int i; + int i = t->num_targets; + struct dm_target *ti = t->targets; - for (i = 0; i < t->num_targets; i++) { - struct dm_target *ti = t->targets + i; + while (i--) { + if (postsuspend) { + if (ti->type->postsuspend) + ti->type->postsuspend(ti); + } else if (ti->type->presuspend) + ti->type->presuspend(ti); - if (ti->type->suspend) - ti->type->suspend(ti); + ti++; } } +void dm_table_presuspend_targets(struct dm_table *t) +{ + if (!t) + return; + + return suspend_targets(t, 0); +} + +void dm_table_postsuspend_targets(struct dm_table *t) +{ + if (!t) + return; + + return suspend_targets(t, 1); +} + void dm_table_resume_targets(struct dm_table *t) { int i; @@ -900,11 +922,35 @@ void dm_table_unplug_all(struct dm_table } } +int dm_table_flush_all(struct dm_table *t) +{ + struct list_head *d, *devices = dm_table_get_devices(t); + int ret = 0; + + for (d = devices->next; d != devices; d = d->next) { + struct dm_dev *dd = list_entry(d, struct dm_dev, list); + request_queue_t *q = bdev_get_queue(dd->bdev); + int err; + + if (!q->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL); + + if (!ret) + ret = err; + } + + return ret; +} + EXPORT_SYMBOL(dm_vcalloc); EXPORT_SYMBOL(dm_get_device); EXPORT_SYMBOL(dm_put_device); EXPORT_SYMBOL(dm_table_event); +EXPORT_SYMBOL(dm_table_get_size); EXPORT_SYMBOL(dm_table_get_mode); EXPORT_SYMBOL(dm_table_put); EXPORT_SYMBOL(dm_table_get); EXPORT_SYMBOL(dm_table_unplug_all); +EXPORT_SYMBOL(dm_table_flush_all); diff -pruN ./drivers/md.dm/dm-target.c ./drivers/md/dm-target.c --- ./drivers/md.dm/dm-target.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/dm-target.c 2006-03-17 13:16:38.000000000 +0300 @@ -120,10 +120,9 @@ int dm_register_target(struct target_typ return -ENOMEM; down_write(&_lock); - if (__find_target_type(t->name)) { - kfree(ti); + if (__find_target_type(t->name)) rv = -EEXIST; - } else + else list_add(&ti->list, &_targets); up_write(&_lock); diff -pruN ./drivers/md.dm/Kconfig 
./drivers/md/Kconfig --- ./drivers/md.dm/Kconfig 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/Kconfig 2006-03-17 13:16:38.000000000 +0300 @@ -85,6 +85,24 @@ config MD_RAID1 If unsure, say Y. +config MD_RAID10 + tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" + depends on BLK_DEV_MD && EXPERIMENTAL + ---help--- + RAID-10 provides a combination of striping (RAID-0) and + mirroring (RAID-1) with easier configuration and more flexable + layout. + Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to + be the same size (or atleast, only as much as the smallest device + will be used). + RAID-10 provides a variety of layouts that provide different levels + of redundancy and performance. + + RAID-10 requires mdadm-1.7.0 or later, available at: + + ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ + + config MD_RAID5 tristate "RAID-4/RAID-5 mode" depends on BLK_DEV_MD @@ -200,5 +218,17 @@ config DM_ZERO A target that discards writes, and returns all zeroes for reads. Useful in some recovery situations. +config DM_MULTIPATH + tristate "Multipath target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Allow volume managers to support multipath hardware. + +config DM_MULTIPATH_EMC + tristate "EMC CX/AX multipath support (EXPERIMENTAL)" + depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL + ---help--- + Multipath support for EMC CX/AX series hardware. + endmenu diff -pruN ./drivers/md.dm/kcopyd.c ./drivers/md/kcopyd.c --- ./drivers/md.dm/kcopyd.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/kcopyd.c 2006-03-20 09:36:55.000000000 +0300 @@ -43,6 +43,10 @@ struct kcopyd_client { struct page_list *pages; unsigned int nr_pages; unsigned int nr_free_pages; +#ifndef __GENKSYMS__ + wait_queue_head_t destroyq; + atomic_t nr_jobs; +#endif }; static struct page_list *alloc_pl(void) @@ -292,10 +296,15 @@ static int run_complete_job(struct kcopy int read_err = job->read_err; unsigned int write_err = job->write_err; kcopyd_notify_fn fn = job->fn; + struct kcopyd_client *kc = job->kc; - kcopyd_put_pages(job->kc, job->pages); + kcopyd_put_pages(kc, job->pages); mempool_free(job, _job_pool); fn(read_err, write_err, context); + + if (atomic_dec_and_test(&kc->nr_jobs)) + wake_up(&kc->destroyq); + return 0; } @@ -430,6 +439,7 @@ static void do_work(void *ignored) */ static void dispatch_job(struct kcopyd_job *job) { + atomic_inc(&job->kc->nr_jobs); push(&_pages_jobs, job); wake(); } @@ -667,6 +677,9 @@ int kcopyd_client_create(unsigned int nr return r; } + init_waitqueue_head(&kc->destroyq); + atomic_set(&kc->nr_jobs, 0); + client_add(kc); *result = kc; return 0; @@ -674,6 +687,9 @@ int kcopyd_client_create(unsigned int nr void kcopyd_client_destroy(struct kcopyd_client *kc) { + /* Wait for completion of all jobs submitted by this client. 
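Editor's note: the new destroyq/nr_jobs pair above makes kcopyd_client_destroy() wait until every outstanding job has completed before the client is torn down. Below is a stand-alone sketch of the same drain-then-free pattern using POSIX threads; the thread count and timing are illustrative, and the kernel uses wait_event() plus a mempool rather than a condition variable.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* nr_jobs is bumped when a job is dispatched and dropped when it
 * completes; destroy blocks until the counter drains to zero. */
static atomic_int nr_jobs;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

static void job_done(void)
{
    if (atomic_fetch_sub(&nr_jobs, 1) == 1) {   /* last job finished */
        pthread_mutex_lock(&lock);
        pthread_cond_signal(&drained);
        pthread_mutex_unlock(&lock);
    }
}

static void *worker(void *arg)
{
    (void)arg;
    usleep(1000);                               /* pretend to copy some data */
    job_done();
    return NULL;
}

int main(void)
{
    pthread_t t[8];

    for (int i = 0; i < 8; i++) {
        atomic_fetch_add(&nr_jobs, 1);          /* dispatch_job() side */
        pthread_create(&t[i], NULL, worker, NULL);
    }

    pthread_mutex_lock(&lock);                  /* client_destroy() side */
    while (atomic_load(&nr_jobs))
        pthread_cond_wait(&drained, &lock);
    pthread_mutex_unlock(&lock);
    puts("all jobs finished, safe to free the client");

    for (int i = 0; i < 8; i++)
        pthread_join(t[i], NULL);
    return 0;
}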
*/ + wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); + dm_io_put(kc->nr_pages); client_free_pages(kc); client_del(kc); diff -pruN ./drivers/md.dm/linear.c ./drivers/md/linear.c --- ./drivers/md.dm/linear.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/linear.c 2006-03-17 13:16:38.000000000 +0300 @@ -47,7 +47,6 @@ static inline dev_info_t *which_dev(mdde return hash->dev0; } - /** * linear_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue @@ -93,13 +92,35 @@ static void linear_unplug(request_queue_ } } +static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + linear_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i < mddev->raid_disks; i++) { + struct block_device *bdev = conf->disks[i].rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + return ret; +} static int linear_run (mddev_t *mddev) { linear_conf_t *conf; struct linear_hash *table; mdk_rdev_t *rdev; - int size, i, nb_zone, cnt; + int i, nb_zone, cnt; + sector_t size; unsigned int curr_offset; struct list_head *tmp; @@ -137,7 +158,7 @@ static int linear_run (mddev_t *mddev) */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->size = rdev->size; mddev->array_size += rdev->size; @@ -200,6 +221,7 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; + mddev->queue->issue_flush_fn = linear_issue_flush; return 0; out: @@ -247,10 +269,11 @@ static int linear_make_request (request_ char b[BDEVNAME_SIZE]; printk("linear_make_request: Block %llu out of bounds on " - "dev %s size %ld offset %ld\n", + "dev %s size %llu offset %llu\n", (unsigned long long)block, bdevname(tmp_dev->rdev->bdev, b), - tmp_dev->size, tmp_dev->offset); + (unsigned long long)tmp_dev->size, + (unsigned long long)tmp_dev->offset); bio_io_error(bio, bio->bi_size); return 0; } diff -pruN ./drivers/md.dm/Makefile ./drivers/md/Makefile --- ./drivers/md.dm/Makefile 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/Makefile 2006-03-17 13:16:38.000000000 +0300 @@ -4,13 +4,16 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o kcopyd.o +dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-mirror-objs := dm-log.o dm-raid1.o raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6mmx.o raid6sse1.o raid6sse2.o -host-progs := mktables +hostprogs-y := mktables + +CFLAGS_raid6int8.o += -O2 # Note: link order is important. 
All raid personalities # and xor.o must come before md.o, as they each initialise @@ -20,12 +23,15 @@ host-progs := mktables obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o +obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID5) += raid5.o xor.o obj-$(CONFIG_MD_RAID6) += raid6.o xor.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_BLK_DEV_MD) += md.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o +obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o +obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o obj-$(CONFIG_DM_ZERO) += dm-zero.o diff -pruN ./drivers/md.dm/md.c ./drivers/md/md.c --- ./drivers/md.dm/md.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/md.c 2006-03-17 13:22:09.000000000 +0300 @@ -154,6 +154,39 @@ static spinlock_t all_mddevs_lock = SPIN tmp = tmp->next;}) \ ) +int md_flush_mddev(mddev_t *mddev, sector_t *error_sector) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + int ret = 0; + + /* + * this list iteration is done without any locking in md?! + */ + ITERATE_RDEV(mddev, rdev, tmp) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + int err; + + if (!r_queue->issue_flush_fn) + err = -EOPNOTSUPP; + else + err = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk, error_sector); + + if (!ret) + ret = err; + } + + return ret; +} + +static int md_flush_all(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + + return md_flush_mddev(mddev, error_sector); +} + static int md_fail_request (request_queue_t *q, struct bio *bio) { bio_io_error(bio, bio->bi_size); @@ -331,29 +364,24 @@ static int bi_complete(struct bio *bio, static int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw) { - struct bio bio; - struct bio_vec vec; + struct bio *bio = bio_alloc(GFP_NOIO, 1); struct completion event; + int ret; rw |= (1 << BIO_RW_SYNC); - bio_init(&bio); - bio.bi_io_vec = &vec; - vec.bv_page = page; - vec.bv_len = size; - vec.bv_offset = 0; - bio.bi_vcnt = 1; - bio.bi_idx = 0; - bio.bi_size = size; - bio.bi_bdev = bdev; - bio.bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); init_completion(&event); - bio.bi_private = &event; - bio.bi_end_io = bi_complete; - submit_bio(rw, &bio); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(rw, bio); wait_for_completion(&event); - return test_bit(BIO_UPTODATE, &bio.bi_flags); + ret = test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ret; } static int read_disk_sb(mdk_rdev_t * rdev) @@ -373,7 +401,7 @@ static int read_disk_sb(mdk_rdev_t * rde return 0; fail: - printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", + printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", bdevname(rdev->bdev,b)); return -EINVAL; } @@ -439,6 +467,31 @@ static unsigned int calc_sb_csum(mdp_sup return csum; } +/* csum_partial is not consistent between different architectures. + * Some (i386) do a 32bit csum. Some (alpha) do 16 bit. + * This makes it hard for user-space to know what to do. + * So we use calc_sb_csum to set the checksum to allow working + * with older kernels, but allow calc_sb_csum_common to + * be used when checking if a checksum is correct, to + * make life easier for user-space tools that might write + * a superblock. 
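Editor's note: the checksum helpers added above avoid csum_partial() by summing the superblock as 32-bit little-endian words into a 64-bit accumulator and folding the carries back in, which gives the same result on every architecture. A minimal user-space sketch of that folding, with made-up superblock contents:

#include <stdio.h>
#include <stdint.h>

static uint32_t fold_csum(const uint8_t *p, size_t len)
{
    uint64_t sum = 0;

    for (; len >= 4; len -= 4, p += 4)       /* 32-bit little-endian words */
        sum += (uint32_t)p[0] | (uint32_t)p[1] << 8 |
               (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;

    if (len == 2)                            /* trailing 16-bit word, as in
                                                calc_sb_1_csum()           */
        sum += (uint32_t)p[0] | (uint32_t)p[1] << 8;

    /* fold the high half back into the low half, as the patch does */
    return (uint32_t)((sum & 0xffffffff) + (sum >> 32));
}

int main(void)
{
    uint8_t sb[256] = { 0xfc, 0x4e, 0x2b, 0xa9 };    /* made-up superblock bytes */

    printf("csum = %#010x\n", fold_csum(sb, sizeof(sb)));
    return 0;
}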
+ */ +static unsigned int calc_sb_csum_common(mdp_super_t *super) +{ + unsigned int disk_csum = super->sb_csum; + unsigned long long newcsum = 0; + unsigned int csum; + int i; + unsigned int *superc = (int*) super; + super->sb_csum = 0; + + for (i=0; i>32); + super->sb_csum = disk_csum; + return csum; +} + /* * Handle superblock details. * We want to be able to handle multiple superblock formats @@ -521,7 +574,8 @@ static int super_90_load(mdk_rdev_t *rde if (sb->raid_disks <= 0) goto abort; - if (calc_sb_csum(sb) != sb->sb_csum) { + if (calc_sb_csum(sb) != sb->sb_csum && + calc_sb_csum_common(sb) != sb->sb_csum) { printk(KERN_WARNING "md: invalid superblock checksum on %s\n", b); goto abort; @@ -530,7 +584,7 @@ static int super_90_load(mdk_rdev_t *rde rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; - if (sb->level == MULTIPATH) + if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; else rdev->desc_nr = sb->this_disk.number; @@ -745,11 +799,21 @@ static void super_90_sync(mddev_t *mddev static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) { unsigned int disk_csum, csum; + unsigned long long newcsum; int size = 256 + sb->max_dev*2; + unsigned int *isuper = (unsigned int*)sb; + int i; disk_csum = sb->sb_csum; sb->sb_csum = 0; - csum = csum_partial((void *)sb, size, 0); + newcsum = 0; + for (i=0; size>=4; size -= 4 ) + newcsum += le32_to_cpu(*isuper++); + + if (size == 2) + newcsum += le16_to_cpu(*(unsigned short*) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); sb->sb_csum = disk_csum; return csum; } @@ -924,12 +988,12 @@ static void super_1_sync(mddev_t *mddev, max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) - if (rdev2->desc_nr > max_dev) - max_dev = rdev2->desc_nr; + if (rdev2->desc_nr+1 > max_dev) + max_dev = rdev2->desc_nr+1; sb->max_dev = max_dev; for (i=0; idev_roles[max_dev] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(0xfffe); ITERATE_RDEV(mddev,rdev2,tmp) { i = rdev2->desc_nr; @@ -942,6 +1006,7 @@ static void super_1_sync(mddev_t *mddev, } sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ + sb->sb_csum = calc_sb_1_csum(sb); } @@ -1042,20 +1107,24 @@ static void unbind_rdev_from_array(mdk_r /* * prevent the device from being mounted, repartitioned or * otherwise reused by a RAID array (or any other kernel - * subsystem), by opening the device. [simply getting an - * inode is not enough, the SCSI module usage code needs - * an explicit open() on the device] + * subsystem), by bd_claiming the device. 
*/ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) { int err = 0; struct block_device *bdev; + char b[BDEVNAME_SIZE]; bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); - if (IS_ERR(bdev)) + if (IS_ERR(bdev)) { + printk(KERN_ERR "md: could not open %s.\n", + __bdevname(dev, b)); return PTR_ERR(bdev); + } err = bd_claim(bdev, rdev); if (err) { + printk(KERN_ERR "md: could not bd_claim %s.\n", + bdevname(bdev, b)); blkdev_put(bdev); return err; } @@ -1117,10 +1186,7 @@ static void export_array(mddev_t *mddev) static void print_desc(mdp_disk_t *desc) { - char b[BDEVNAME_SIZE]; - - printk(" DISK\n", desc->number, - __bdevname(MKDEV(desc->major, desc->minor), b), + printk(" DISK\n", desc->number, desc->major,desc->minor,desc->raid_disk,desc->state); } @@ -1312,8 +1378,7 @@ static mdk_rdev_t *md_import_device(dev_ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) { - printk(KERN_ERR "md: could not alloc mem for %s!\n", - __bdevname(newdev, b)); + printk(KERN_ERR "md: could not alloc mem for new device!\n"); return ERR_PTR(-ENOMEM); } memset(rdev, 0, sizeof(*rdev)); @@ -1322,11 +1387,9 @@ static mdk_rdev_t *md_import_device(dev_ goto abort_free; err = lock_rdev(rdev, newdev); - if (err) { - printk(KERN_ERR "md: could not lock %s.\n", - __bdevname(newdev, b)); + if (err) goto abort_free; - } + rdev->desc_nr = -1; rdev->faulty = 0; rdev->in_sync = 0; @@ -1436,9 +1499,8 @@ static int analyze_sbs(mddev_t * mddev) goto abort; } - if ((mddev->recovery_cp != MaxSector) && - ((mddev->level == 1) || - ((mddev->level >= 4) && (mddev->level <= 6)))) + if (mddev->recovery_cp != MaxSector && + mddev->level >= 1) printk(KERN_ERR "md: %s: raid array is not clean" " -- starting background reconstruction\n", mdname(mddev)); @@ -1615,6 +1677,8 @@ static int do_md_run(mddev_t * mddev) mddev->pers = pers[pnum]; spin_unlock(&pers_lock); + mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + err = mddev->pers->run(mddev); if (err) { printk(KERN_ERR "md: pers->run() failed ...\n"); @@ -1645,6 +1709,7 @@ static int do_md_run(mddev_t * mddev) */ mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + mddev->queue->issue_flush_fn = md_flush_all; mddev->changed = 1; return 0; @@ -1881,11 +1946,9 @@ static int autostart_array(dev_t startde mdk_rdev_t *start_rdev = NULL, *rdev; start_rdev = md_import_device(startdev, 0, 0); - if (IS_ERR(start_rdev)) { - printk(KERN_WARNING "md: could not import %s!\n", - __bdevname(startdev, b)); + if (IS_ERR(start_rdev)) return err; - } + /* NOTE: this can only work for 0.90.0 superblocks */ sb = (mdp_super_t*)page_address(start_rdev->sb_page); @@ -1916,12 +1979,9 @@ static int autostart_array(dev_t startde if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) continue; rdev = md_import_device(dev, 0, 0); - if (IS_ERR(rdev)) { - printk(KERN_WARNING "md: could not import %s," - " trying to run array nevertheless.\n", - __bdevname(dev, b)); + if (IS_ERR(rdev)) continue; - } + list_add(&rdev->same_set, &pending_raid_disks); } @@ -2153,42 +2213,6 @@ static int add_new_disk(mddev_t * mddev, return 0; } -static int hot_generate_error(mddev_t * mddev, dev_t dev) -{ - char b[BDEVNAME_SIZE]; - struct request_queue *q; - mdk_rdev_t *rdev; - - if (!mddev->pers) - return -ENODEV; - - printk(KERN_INFO "md: trying to generate %s error in %s ... 
\n", - __bdevname(dev, b), mdname(mddev)); - - rdev = find_rdev(mddev, dev); - if (!rdev) { - /* MD_BUG(); */ /* like hell - it's not a driver bug */ - return -ENXIO; - } - - if (rdev->desc_nr == -1) { - MD_BUG(); - return -EINVAL; - } - if (!rdev->in_sync) - return -ENODEV; - - q = bdev_get_queue(rdev->bdev); - if (!q) { - MD_BUG(); - return -ENODEV; - } - printk(KERN_INFO "md: okay, generating error!\n"); -// q->oneshot_error = 1; // disabled for now - - return 0; -} - static int hot_remove_disk(mddev_t * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; @@ -2197,9 +2221,6 @@ static int hot_remove_disk(mddev_t * mdd if (!mddev->pers) return -ENODEV; - printk(KERN_INFO "md: trying to remove %s from %s ... \n", - __bdevname(dev, b), mdname(mddev)); - rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; @@ -2227,9 +2248,6 @@ static int hot_add_disk(mddev_t * mddev, if (!mddev->pers) return -ENODEV; - printk(KERN_INFO "md: trying to hot-add %s to %s ... \n", - __bdevname(dev, b), mdname(mddev)); - if (mddev->major_version != 0) { printk(KERN_WARNING "%s: HOT_ADD may only be used with" " version-0 superblocks.\n", @@ -2478,6 +2496,9 @@ static int set_disk_faulty(mddev_t *mdde { mdk_rdev_t *rdev; + if (mddev->pers == NULL) + return -ENODEV; + rdev = find_rdev(mddev, dev); if (!rdev) return -ENODEV; @@ -2489,7 +2510,6 @@ static int set_disk_faulty(mddev_t *mdde static int md_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - char b[BDEVNAME_SIZE]; int err = 0; void __user *argp = (void __user *)arg; struct hd_geometry __user *loc = argp; @@ -2548,8 +2568,7 @@ static int md_ioctl(struct inode *inode, } err = autostart_array(new_decode_dev(arg)); if (err) { - printk(KERN_WARNING "md: autostart %s failed!\n", - __bdevname(arg, b)); + printk(KERN_WARNING "md: autostart failed!\n"); goto abort; } goto done; @@ -2690,9 +2709,7 @@ static int md_ioctl(struct inode *inode, err = add_new_disk(mddev, &info); goto done_unlock; } - case HOT_GENERATE_ERROR: - err = hot_generate_error(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_REMOVE_DISK: err = hot_remove_disk(mddev, new_decode_dev(arg)); goto done_unlock; @@ -2876,7 +2893,7 @@ mdk_thread_t *md_register_thread(void (* return thread; } -void md_interrupt_thread(mdk_thread_t *thread) +static void md_interrupt_thread(mdk_thread_t *thread) { if (!thread->tsk) { MD_BUG(); @@ -2919,6 +2936,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t if (!mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } @@ -2951,7 +2969,11 @@ static void status_resync(struct seq_fil unsigned long max_blocks, resync, res, dt, db, rt; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; - max_blocks = mddev->size; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_blocks = mddev->resync_max_sectors >> 1; + else + max_blocks = mddev->size; /* * Should not happen. 
@@ -3187,11 +3209,6 @@ int unregister_md_personality(int pnum) return 0; } -void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) -{ - rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; -} - static int is_mddev_idle(mddev_t *mddev) { mdk_rdev_t * rdev; @@ -3204,8 +3221,12 @@ static int is_mddev_idle(mddev_t *mddev) struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; curr_events = disk_stat_read(disk, read_sectors) + disk_stat_read(disk, write_sectors) - - disk->sync_io; - if ((curr_events - rdev->last_events) > 32) { + atomic_read(&disk->sync_io); + /* Allow some slack between valud of curr_events and last_events, + * as there are some uninteresting races. + * Note: the following is an unsigned comparison. + */ + if ((curr_events - rdev->last_events + 32) > 64) { rdev->last_events = curr_events; idle = 0; } @@ -3339,7 +3360,14 @@ static void md_do_sync(mddev_t *mddev) } } while (mddev->curr_resync < 2); - max_sectors = mddev->size << 1; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + /* resync follows the size requested by the personality, + * which default to physical size, but can be virtual size + */ + max_sectors = mddev->resync_max_sectors; + else + /* recovery follows the physical size of devices */ + max_sectors = mddev->size << 1; printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" @@ -3372,10 +3400,12 @@ static void md_do_sync(mddev_t *mddev) init_waitqueue_head(&mddev->recovery_wait); last_check = 0; - if (j) + if (j>2) { printk(KERN_INFO "md: resuming recovery of %s from checkpoint.\n", mdname(mddev)); + mddev->curr_resync = j; + } while (j < max_sectors) { int sectors; @@ -3458,7 +3488,7 @@ static void md_do_sync(mddev_t *mddev) if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && mddev->curr_resync > 2 && - mddev->curr_resync > mddev->recovery_cp) { + mddev->curr_resync >= mddev->recovery_cp) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { printk(KERN_INFO "md: checkpointing recovery of %s.\n", @@ -3697,7 +3727,6 @@ void md_autodetect_dev(dev_t dev) static void autostart_arrays(int part) { - char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev; int i; @@ -3707,11 +3736,9 @@ static void autostart_arrays(int part) dev_t dev = detected_devices[i]; rdev = md_import_device(dev,0, 0); - if (IS_ERR(rdev)) { - printk(KERN_ALERT "md: could not import %s!\n", - __bdevname(dev, b)); + if (IS_ERR(rdev)) continue; - } + if (rdev->faulty) { MD_BUG(); continue; @@ -3762,7 +3789,6 @@ module_exit(md_exit) EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_sync_acct); EXPORT_SYMBOL(md_done_sync); EXPORT_SYMBOL(md_write_start); EXPORT_SYMBOL(md_write_end); @@ -3771,6 +3797,5 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_print_devices); -EXPORT_SYMBOL(md_interrupt_thread); EXPORT_SYMBOL(md_check_recovery); MODULE_LICENSE("GPL"); diff -pruN ./drivers/md.dm/multipath.c ./drivers/md/multipath.c --- ./drivers/md.dm/multipath.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/multipath.c 2006-03-17 13:16:38.000000000 +0300 @@ -99,12 +99,12 @@ static void multipath_reschedule_retry ( * operation and are ready to return a success/failure code to the buffer * cache layer. 
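Editor's note: a small illustration of the unsigned slack test added to is_mddev_idle() above. Because the subtraction wraps, (curr - last + 32) > 64 is false exactly when curr is within 32 events of last in either direction, so minor accounting races are ignored. The unsigned int types and the values below are illustrative.

#include <stdio.h>

static int busy(unsigned int curr, unsigned int last)
{
    return (curr - last + 32) > 64;
}

int main(void)
{
    printf("%d\n", busy(1000, 1000));   /* 0: no new activity          */
    printf("%d\n", busy(1010, 1000));   /* 0: inside the +/-32 window  */
    printf("%d\n", busy( 990, 1000));   /* 0: small negative drift     */
    printf("%d\n", busy(1200, 1000));   /* 1: real I/O happened        */
    return 0;
}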
*/ -static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate) +static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) { struct bio *bio = mp_bh->master_bio; multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); - bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO); + bio_endio(bio, bio->bi_size, err); mempool_free(mp_bh, conf->pool); } @@ -119,8 +119,8 @@ int multipath_end_request(struct bio *bi return 1; if (uptodate) - multipath_end_bh_io(mp_bh, uptodate); - else if ((bio->bi_rw & (1 << BIO_RW_AHEAD)) == 0) { + multipath_end_bh_io(mp_bh, 0); + else if (!bio_rw_ahead(bio)) { /* * oops, IO error: */ @@ -131,7 +131,7 @@ int multipath_end_request(struct bio *bi (unsigned long long)bio->bi_sector); multipath_reschedule_retry(mp_bh); } else - multipath_end_bh_io(mp_bh, 0); + multipath_end_bh_io(mp_bh, error); rdev_dec_pending(rdev, conf->mddev); return 0; } @@ -155,7 +155,7 @@ static void unplug_slaves(mddev_t *mddev r_queue->unplug_fn(r_queue); spin_lock_irqsave(&conf->device_lock, flags); - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -217,6 +217,31 @@ static void multipath_status (struct seq seq_printf (seq, "]"); } +static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + multipath_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; iraid_disks; i++) { + mdk_rdev_t *rdev = conf->multipaths[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} /* * Careful, this can execute in IRQ contexts as well! 
@@ -300,7 +325,7 @@ static int multipath_add_disk(mddev_t *m */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); conf->working_disks++; rdev->raid_disk = path; @@ -377,7 +402,7 @@ static void multipathd (mddev_t *mddev) " error for block %llu\n", bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_sector); - multipath_end_bh_io(mp_bh, 0); + multipath_end_bh_io(mp_bh, -EIO); } else { printk(KERN_ERR "multipath: %s: redirecting sector %llu" " to another IO path\n", @@ -435,6 +460,8 @@ static int multipath_run (mddev_t *mddev mddev->queue->unplug_fn = multipath_unplug; + mddev->queue->issue_flush_fn = multipath_issue_flush; + conf->working_disks = 0; ITERATE_RDEV(mddev,rdev,tmp) { disk_idx = rdev->raid_disk; @@ -452,7 +479,7 @@ static int multipath_run (mddev_t *mddev * a merge_bvec_fn to be involved in multipath */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!rdev->faulty) conf->working_disks++; diff -pruN ./drivers/md.dm/raid0.c ./drivers/md/raid0.c --- ./drivers/md.dm/raid0.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/raid0.c 2006-03-17 13:16:38.000000000 +0300 @@ -40,6 +40,31 @@ static void raid0_unplug(request_queue_t } } +static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t **devlist = conf->strip_zone[0].dev; + int i, ret = 0; + + for (i=0; iraid_disks; i++) { + struct block_device *bdev = devlist[i]->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret =r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + return ret; +} + + static int create_strip_zones (mddev_t *mddev) { int i, c, j; @@ -137,7 +162,7 @@ static int create_strip_zones (mddev_t * if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!smallest || (rdev1->size size)) smallest = rdev1; @@ -219,6 +244,8 @@ static int create_strip_zones (mddev_t * mddev->queue->unplug_fn = raid0_unplug; + mddev->queue->issue_flush_fn = raid0_issue_flush; + printk("raid0: done.\n"); return 0; abort: diff -pruN ./drivers/md.dm/raid10.c ./drivers/md/raid10.c --- ./drivers/md.dm/raid10.c 1970-01-01 03:00:00.000000000 +0300 +++ ./drivers/md/raid10.c 2006-03-17 13:16:38.000000000 +0300 @@ -0,0 +1,1780 @@ +/* + * raid10.c : Multiple Devices driver for Linux + * + * Copyright (C) 2000-2004 Neil Brown + * + * RAID-10 support for md. + * + * Base on code in raid1.c. See raid1.c for futher copyright information. + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +/* + * RAID10 provides a combination of RAID0 and RAID1 functionality. 
+ * The layout of data is defined by + * chunk_size + * raid_disks + * near_copies (stored in low byte of layout) + * far_copies (stored in second byte of layout) + * + * The data to be stored is divided into chunks using chunksize. + * Each device is divided into far_copies sections. + * In each section, chunks are laid out in a style similar to raid0, but + * near_copies copies of each chunk is stored (each on a different drive). + * The starting device for each section is offset near_copies from the starting + * device of the previous section. + * Thus there are (near_copies*far_copies) of each chunk, and each is on a different + * drive. + * near_copies and far_copies must be at least one, and there product is at most + * raid_disks. + */ + +/* + * Number of guaranteed r10bios in case of extreme VM load: + */ +#define NR_RAID10_BIOS 256 + +static void unplug_slaves(mddev_t *mddev); + +static void * r10bio_pool_alloc(int gfp_flags, void *data) +{ + conf_t *conf = data; + r10bio_t *r10_bio; + int size = offsetof(struct r10bio_s, devs[conf->copies]); + + /* allocate a r10bio with room for raid_disks entries in the bios array */ + r10_bio = kmalloc(size, gfp_flags); + if (r10_bio) + memset(r10_bio, 0, size); + else + unplug_slaves(conf->mddev); + + return r10_bio; +} + +static void r10bio_pool_free(void *r10_bio, void *data) +{ + kfree(r10_bio); +} + +#define RESYNC_BLOCK_SIZE (64*1024) +//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +#define RESYNC_WINDOW (2048*1024) + +/* + * When performing a resync, we need to read and compare, so + * we need as many pages are there are copies. + * When performing a recovery, we need 2 bios, one for read, + * one for write (we recover only one drive per r10buf) + * + */ +static void * r10buf_pool_alloc(int gfp_flags, void *data) +{ + conf_t *conf = data; + struct page *page; + r10bio_t *r10_bio; + struct bio *bio; + int i, j; + int nalloc; + + r10_bio = r10bio_pool_alloc(gfp_flags, conf); + if (!r10_bio) { + unplug_slaves(conf->mddev); + return NULL; + } + + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) + nalloc = conf->copies; /* resync */ + else + nalloc = 2; /* recovery */ + + /* + * Allocate bios. + */ + for (j = nalloc ; j-- ; ) { + bio = bio_alloc(gfp_flags, RESYNC_PAGES); + if (!bio) + goto out_free_bio; + r10_bio->devs[j].bio = bio; + } + /* + * Allocate RESYNC_PAGES data pages and attach them + * where needed. 
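The layout word described above is decoded exactly this way by run() near the end of this file: near_copies from the low byte, far_copies from the next byte, anything in the upper bits rejected. A small stand-alone model of that decoding and of the sanity check run() applies (plain userspace C, example values only):

    #include <stdio.h>

    int main(void)
    {
            unsigned int layout     = 0x0102;  /* example: far_copies = 1, near_copies = 2 */
            unsigned int raid_disks = 4;

            unsigned int nc     = layout & 255;         /* near copies                */
            unsigned int fc     = (layout >> 8) & 255;  /* far copies                 */
            unsigned int copies = nc * fc;              /* total copies of each chunk */

            /* run() refuses fewer than 2 copies, more copies than disks,
             * or any bits set above the low 16. */
            if (copies < 2 || copies > raid_disks || (layout >> 16))
                    printf("unsupported raid10 layout 0x%x\n", layout);
            else
                    printf("near=%u far=%u -> %u copies across %u disks\n",
                           nc, fc, copies, raid_disks);
            return 0;
    }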
+ */ + for (j = 0 ; j < nalloc; j++) { + bio = r10_bio->devs[j].bio; + for (i = 0; i < RESYNC_PAGES; i++) { + page = alloc_page(gfp_flags); + if (unlikely(!page)) + goto out_free_pages; + + bio->bi_io_vec[i].bv_page = page; + } + } + + return r10_bio; + +out_free_pages: + for ( ; i > 0 ; i--) + __free_page(bio->bi_io_vec[i-1].bv_page); + while (j--) + for (i = 0; i < RESYNC_PAGES ; i++) + __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); + j = -1; +out_free_bio: + while ( ++j < nalloc ) + bio_put(r10_bio->devs[j].bio); + r10bio_pool_free(r10_bio, conf); + return NULL; +} + +static void r10buf_pool_free(void *__r10_bio, void *data) +{ + int i; + conf_t *conf = data; + r10bio_t *r10bio = __r10_bio; + int j; + + for (j=0; j < conf->copies; j++) { + struct bio *bio = r10bio->devs[j].bio; + if (bio) { + for (i = 0; i < RESYNC_PAGES; i++) { + __free_page(bio->bi_io_vec[i].bv_page); + bio->bi_io_vec[i].bv_page = NULL; + } + bio_put(bio); + } + } + r10bio_pool_free(r10bio, conf); +} + +static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) +{ + int i; + + for (i = 0; i < conf->copies; i++) { + struct bio **bio = & r10_bio->devs[i].bio; + if (*bio) + bio_put(*bio); + *bio = NULL; + } +} + +static inline void free_r10bio(r10bio_t *r10_bio) +{ + unsigned long flags; + + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + spin_lock_irqsave(&conf->resync_lock, flags); + if (!--conf->nr_pending) { + wake_up(&conf->wait_idle); + wake_up(&conf->wait_resume); + } + spin_unlock_irqrestore(&conf->resync_lock, flags); + + put_all_bios(conf, r10_bio); + mempool_free(r10_bio, conf->r10bio_pool); +} + +static inline void put_buf(r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(r10_bio->mddev); + unsigned long flags; + + mempool_free(r10_bio, conf->r10buf_pool); + + spin_lock_irqsave(&conf->resync_lock, flags); + if (!conf->barrier) + BUG(); + --conf->barrier; + wake_up(&conf->wait_resume); + wake_up(&conf->wait_idle); + + if (!--conf->nr_pending) { + wake_up(&conf->wait_idle); + wake_up(&conf->wait_resume); + } + spin_unlock_irqrestore(&conf->resync_lock, flags); +} + +static void reschedule_retry(r10bio_t *r10_bio) +{ + unsigned long flags; + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev_to_conf(mddev); + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&r10_bio->retry_list, &conf->retry_list); + spin_unlock_irqrestore(&conf->device_lock, flags); + + md_wakeup_thread(mddev->thread); +} + +/* + * raid_end_bio_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid_end_bio_io(r10bio_t *r10_bio) +{ + struct bio *bio = r10_bio->master_bio; + + bio_endio(bio, bio->bi_size, + test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); + free_r10bio(r10_bio); +} + +/* + * Update disk head position estimator based on IRQ completion info. 
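The out_free_pages/out_free_bio labels above are the usual goto-unwind idiom: on failure, free exactly what has been allocated so far, in reverse, and report the failure to the mempool layer. A minimal runnable illustration of the same idiom outside the kernel (names and sizes are made up for the example):

    #include <stdio.h>
    #include <stdlib.h>

    #define NBUF 4

    /* Allocate NBUF buffers or nothing at all, unwinding partial work on
     * failure just as r10buf_pool_alloc() does with its out_free_* labels. */
    static char **alloc_all(size_t sz)
    {
            char **v = calloc(NBUF, sizeof(*v));
            int i;

            if (!v)
                    return NULL;
            for (i = 0; i < NBUF; i++) {
                    v[i] = malloc(sz);
                    if (!v[i])
                            goto out_free;
            }
            return v;

    out_free:
            while (i--)                     /* free only what was really allocated */
                    free(v[i]);
            free(v);
            return NULL;
    }

    int main(void)
    {
            char **v = alloc_all(4096);
            int i;

            if (v) {
                    printf("all buffers allocated\n");
                    for (i = 0; i < NBUF; i++)
                            free(v[i]);
                    free(v);
            } else {
                    printf("allocation failed, nothing leaked\n");
            }
            return 0;
    }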
+ */ +static inline void update_head_pos(int slot, r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + conf->mirrors[r10_bio->devs[slot].devnum].head_position = + r10_bio->devs[slot].addr + (r10_bio->sectors); +} + +static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + int slot, dev; + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + if (bio->bi_size) + return 1; + + slot = r10_bio->read_slot; + dev = r10_bio->devs[slot].devnum; + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error(r10_bio->mddev, conf->mirrors[dev].rdev); + else + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + set_bit(R10BIO_Uptodate, &r10_bio->state); + + update_head_pos(slot, r10_bio); + + /* + * we have only one bio on the read side + */ + if (uptodate) + raid_end_bio_io(r10_bio); + else { + /* + * oops, read error: + */ + char b[BDEVNAME_SIZE]; + if (printk_ratelimit()) + printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", + bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); + reschedule_retry(r10_bio); + } + + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + return 0; +} + +static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + int slot, dev; + conf_t *conf = mddev_to_conf(r10_bio->mddev); + + if (bio->bi_size) + return 1; + + for (slot = 0; slot < conf->copies; slot++) + if (r10_bio->devs[slot].bio == bio) + break; + dev = r10_bio->devs[slot].devnum; + + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error(r10_bio->mddev, conf->mirrors[dev].rdev); + else + /* + * Set R10BIO_Uptodate in our master bio, so that + * we will return a good error code for to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the composite IO operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' bio. + */ + set_bit(R10BIO_Uptodate, &r10_bio->state); + + update_head_pos(slot, r10_bio); + + /* + * + * Let's see if all mirrored write operations have finished + * already. + */ + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_write_end(r10_bio->mddev); + raid_end_bio_io(r10_bio); + } + + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + return 0; +} + + +/* + * RAID10 layout manager + * Aswell as the chunksize and raid_disks count, there are two + * parameters: near_copies and far_copies. + * near_copies * far_copies must be <= raid_disks. + * Normally one of these will be 1. + * If both are 1, we get raid0. + * If near_copies == raid_disks, we get raid1. + * + * Chunks are layed out in raid0 style with near_copies copies of the + * first chunk, followed by near_copies copies of the next chunk and + * so on. + * If far_copies > 1, then after 1/far_copies of the array has been assigned + * as described above, we start again with a device offset of near_copies. 
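The layout description continues below; as a concrete illustration of the rules given so far, here is where chunks C0, C1, ... land on a 4-disk array (worked out by hand from the description, so worth cross-checking against raid10_find_phys() below). With near_copies=2, far_copies=1:

            disk0  disk1  disk2  disk3
    row 0:   C0     C0     C1     C1
    row 1:   C2     C2     C3     C3
    row 2:   C4     C4     C5     C5

With near_copies=1, far_copies=2, each disk is split into two sections; the second section repeats the data with the start device rotated by near_copies:

    first section (plain raid0 layout):
            disk0  disk1  disk2  disk3
    row 0:   C0     C1     C2     C3
    row 1:   C4     C5     C6     C7

    second section (same chunks, start device shifted by 1):
    row 0:   C3     C0     C1     C2
    row 1:   C7     C4     C5     C6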
+ * So we effectively have another copy of the whole array further down all + * the drives, but with blocks on different drives. + * With this layout, and block is never stored twice on the one device. + * + * raid10_find_phys finds the sector offset of a given virtual sector + * on each device that it is on. If a block isn't on a device, + * that entry in the array is set to MaxSector. + * + * raid10_find_virt does the reverse mapping, from a device and a + * sector offset to a virtual address + */ + +static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) +{ + int n,f; + sector_t sector; + sector_t chunk; + sector_t stripe; + int dev; + + int slot = 0; + + /* now calculate first sector/dev */ + chunk = r10bio->sector >> conf->chunk_shift; + sector = r10bio->sector & conf->chunk_mask; + + chunk *= conf->near_copies; + stripe = chunk; + dev = sector_div(stripe, conf->raid_disks); + + sector += stripe << conf->chunk_shift; + + /* and calculate all the others */ + for (n=0; n < conf->near_copies; n++) { + int d = dev; + sector_t s = sector; + r10bio->devs[slot].addr = sector; + r10bio->devs[slot].devnum = d; + slot++; + + for (f = 1; f < conf->far_copies; f++) { + d += conf->near_copies; + if (d >= conf->raid_disks) + d -= conf->raid_disks; + s += conf->stride; + r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; + slot++; + } + dev++; + if (dev >= conf->raid_disks) { + dev = 0; + sector += (conf->chunk_mask + 1); + } + } + BUG_ON(slot != conf->copies); +} + +static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) +{ + sector_t offset, chunk, vchunk; + + while (sector > conf->stride) { + sector -= conf->stride; + if (dev < conf->near_copies) + dev += conf->raid_disks - conf->near_copies; + else + dev -= conf->near_copies; + } + + offset = sector & conf->chunk_mask; + chunk = sector >> conf->chunk_shift; + vchunk = chunk * conf->raid_disks + dev; + sector_div(vchunk, conf->near_copies); + return (vchunk << conf->chunk_shift) + offset; +} + +/** + * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged + * @q: request queue + * @bio: the buffer head that's been built up so far + * @biovec: the request that could be merged to it. + * + * Return amount of bytes we can accept at this offset + * If near_copies == raid_disk, there are no striping issues, + * but in that case, the function isn't called at all. + */ +static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, + struct bio_vec *bio_vec) +{ + mddev_t *mddev = q->queuedata; + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + int max; + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (max < 0) max = 0; /* bio_add cannot handle a negative return */ + if (max <= bio_vec->bv_len && bio_sectors == 0) + return bio_vec->bv_len; + else + return max; +} + +/* + * This routine returns the disk from which the requested read should + * be done. There is a per-array 'next expected sequential IO' sector + * number - if this matches on the next IO then we use the last disk. + * There is also a per-disk 'last know head position' sector that is + * maintained from IRQ contexts, both the normal and the resync IO + * completion handlers update this position correctly. If there is no + * perfect sequential match then we pick the disk whose head is closest. 
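Stripped of the far-copy handling, raid10_find_phys() above is raid0 striping over "slots", where each virtual chunk occupies near_copies consecutive slots. A small userspace model of the first-copy mapping (far_copies == 1 assumed; simplified from, not equivalent to, the kernel code):

    #include <stdio.h>

    struct mapping { unsigned int dev; unsigned long long dev_sector; };

    /* chunk_shift is log2 of the chunk size in sectors. */
    static struct mapping map_first_copy(unsigned long long virt_sector,
                                         unsigned int chunk_shift,
                                         unsigned int raid_disks,
                                         unsigned int near_copies)
    {
            unsigned long long chunk  = virt_sector >> chunk_shift;
            unsigned long long offset = virt_sector & ((1ULL << chunk_shift) - 1);
            unsigned long long slot   = chunk * near_copies;  /* raid0 slot; the chunk is
                                                               * repeated on the next
                                                               * near_copies-1 devices   */
            struct mapping m;

            m.dev        = slot % raid_disks;
            m.dev_sector = ((slot / raid_disks) << chunk_shift) + offset;
            return m;
    }

    int main(void)
    {
            /* 64 KiB chunks = 128 sectors (shift 7), 4 disks, 2 near copies */
            struct mapping m = map_first_copy(300, 7, 4, 2);

            printf("virtual sector 300 -> disk %u, sector %llu\n", m.dev, m.dev_sector);
            return 0;   /* prints disk 0, sector 172 */
    }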
+ * + * If there are 2 mirrors in the same 2 devices, performance degrades + * because position is mirror, not device based. + * + * The rdev for the device selected will have nr_pending incremented. + */ + +/* + * FIXME: possibly should rethink readbalancing and do it differently + * depending on near_copies / far_copies geometry. + */ +static int read_balance(conf_t *conf, r10bio_t *r10_bio) +{ + const unsigned long this_sector = r10_bio->sector; + int disk, slot, nslot; + const int sectors = r10_bio->sectors; + sector_t new_distance, current_distance; + + raid10_find_phys(conf, r10_bio); + spin_lock_irq(&conf->device_lock); + /* + * Check if we can balance. We can balance on the whole + * device if no resync is going on, or below the resync window. + * We take the first readable disk when above the resync window. + */ + if (conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) { + /* make sure that disk is operational */ + slot = 0; + disk = r10_bio->devs[slot].devnum; + + while (!conf->mirrors[disk].rdev || + !conf->mirrors[disk].rdev->in_sync) { + slot++; + if (slot == conf->copies) { + slot = 0; + disk = -1; + break; + } + disk = r10_bio->devs[slot].devnum; + } + goto rb_out; + } + + + /* make sure the disk is operational */ + slot = 0; + disk = r10_bio->devs[slot].devnum; + while (!conf->mirrors[disk].rdev || + !conf->mirrors[disk].rdev->in_sync) { + slot ++; + if (slot == conf->copies) { + disk = -1; + goto rb_out; + } + disk = r10_bio->devs[slot].devnum; + } + + + current_distance = abs(this_sector - conf->mirrors[disk].head_position); + + /* Find the disk whose head is closest */ + + for (nslot = slot; nslot < conf->copies; nslot++) { + int ndisk = r10_bio->devs[nslot].devnum; + + + if (!conf->mirrors[ndisk].rdev || + !conf->mirrors[ndisk].rdev->in_sync) + continue; + + if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) { + disk = ndisk; + slot = nslot; + break; + } + new_distance = abs(r10_bio->devs[nslot].addr - + conf->mirrors[ndisk].head_position); + if (new_distance < current_distance) { + current_distance = new_distance; + disk = ndisk; + slot = nslot; + } + } + +rb_out: + r10_bio->read_slot = slot; +/* conf->next_seq_sect = this_sector + sectors;*/ + + if (disk >= 0 && conf->mirrors[disk].rdev) + atomic_inc(&conf->mirrors[disk].rdev->nr_pending); + spin_unlock_irq(&conf->device_lock); + + return disk; +} + +static void unplug_slaves(mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + int i; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + for (i=0; iraid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && atomic_read(&rdev->nr_pending)) { + request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + + atomic_inc(&rdev->nr_pending); + spin_unlock_irqrestore(&conf->device_lock, flags); + + if (r_queue->unplug_fn) + r_queue->unplug_fn(r_queue); + + spin_lock_irqsave(&conf->device_lock, flags); + rdev_dec_pending(rdev, mddev); + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); +} +static void raid10_unplug(request_queue_t *q) +{ + unplug_slaves(q->queuedata); +} + +static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + unsigned long flags; + int i, ret = 0; + + spin_lock_irqsave(&conf->device_lock, flags); + for (i=0; iraid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + 
request_queue_t *r_queue = bdev_get_queue(bdev); + + if (r_queue->issue_flush_fn) { + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + return ret; +} + +/* + * Throttle resync depth, so that we can both get proper overlapping of + * requests, but are still able to handle normal requests quickly. + */ +#define RESYNC_DEPTH 32 + +static void device_barrier(conf_t *conf, sector_t sect) +{ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), + conf->resync_lock, unplug_slaves(conf->mddev)); + + if (!conf->barrier++) { + wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, + conf->resync_lock, unplug_slaves(conf->mddev)); + if (conf->nr_pending) + BUG(); + } + wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, + conf->resync_lock, unplug_slaves(conf->mddev)); + conf->next_resync = sect; + spin_unlock_irq(&conf->resync_lock); +} + +static int make_request(request_queue_t *q, struct bio * bio) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + mirror_info_t *mirror; + r10bio_t *r10_bio; + struct bio *read_bio; + int i; + int chunk_sects = conf->chunk_mask + 1; + + /* If this request crosses a chunk boundary, we need to + * split it. This will only happen for 1 PAGE (or less) requests. + */ + if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) + > chunk_sects && + conf->near_copies < conf->raid_disks)) { + struct bio_pair *bp; + /* Sanity check -- queue functions should prevent this happening */ + if (bio->bi_vcnt != 1 || + bio->bi_idx != 0) + goto bad_map; + /* This is a one page bio that upper layers + * refuse to split for us, so we need to split it. + */ + bp = bio_split(bio, bio_split_pool, + chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); + if (make_request(q, &bp->bio1)) + generic_make_request(&bp->bio1); + if (make_request(q, &bp->bio2)) + generic_make_request(&bp->bio2); + + bio_pair_release(bp); + return 0; + bad_map: + printk("raid10_make_request bug: can't convert block across chunks" + " or bigger than %dk %llu %d\n", chunk_sects/2, + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + + bio_io_error(bio, bio->bi_size); + return 0; + } + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. 
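The split size that make_request() above hands to bio_split() is simply the number of sectors left before the next chunk boundary, so the first fragment ends exactly on the boundary and the second starts on it. A quick worked example with assumed values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long bi_sector   = 1020;  /* request start, in sectors  */
            unsigned int       bio_sectors = 8;     /* request length, in sectors */
            unsigned int       chunk_sects = 128;   /* 64 KiB chunks              */

            unsigned int into_chunk = bi_sector & (chunk_sects - 1);   /* 124 */

            if (into_chunk + bio_sectors > chunk_sects) {
                    unsigned int first = chunk_sects - into_chunk;     /* 4 sectors */
                    printf("split: %u sectors now, %u sectors in the second bio\n",
                           first, bio_sectors - first);
            } else {
                    printf("request fits inside one chunk, no split\n");
            }
            return 0;
    }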
+ */ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); + conf->nr_pending++; + spin_unlock_irq(&conf->resync_lock); + + if (bio_data_dir(bio)==WRITE) { + disk_stat_inc(mddev->gendisk, writes); + disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); + } else { + disk_stat_inc(mddev->gendisk, reads); + disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); + } + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio->bi_size >> 9; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_sector; + + if (bio_data_dir(bio) == READ) { + /* + * read balancing logic: + */ + int disk = read_balance(conf, r10_bio); + int slot = r10_bio->read_slot; + if (disk < 0) { + raid_end_bio_io(r10_bio); + return 0; + } + mirror = conf->mirrors + disk; + + read_bio = bio_clone(bio, GFP_NOIO); + + r10_bio->devs[slot].bio = read_bio; + + read_bio->bi_sector = r10_bio->devs[slot].addr + + mirror->rdev->data_offset; + read_bio->bi_bdev = mirror->rdev->bdev; + read_bio->bi_end_io = raid10_end_read_request; + read_bio->bi_rw = READ; + read_bio->bi_private = r10_bio; + + generic_make_request(read_bio); + return 0; + } + + /* + * WRITE: + */ + /* first select target devices under spinlock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + */ + raid10_find_phys(conf, r10_bio); + spin_lock_irq(&conf->device_lock); + for (i = 0; i < conf->copies; i++) { + int d = r10_bio->devs[i].devnum; + if (conf->mirrors[d].rdev && + !conf->mirrors[d].rdev->faulty) { + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + r10_bio->devs[i].bio = bio; + } else + r10_bio->devs[i].bio = NULL; + } + spin_unlock_irq(&conf->device_lock); + + atomic_set(&r10_bio->remaining, 1); + md_write_start(mddev); + for (i = 0; i < conf->copies; i++) { + struct bio *mbio; + int d = r10_bio->devs[i].devnum; + if (!r10_bio->devs[i].bio) + continue; + + mbio = bio_clone(bio, GFP_NOIO); + r10_bio->devs[i].bio = mbio; + + mbio->bi_sector = r10_bio->devs[i].addr+ + conf->mirrors[d].rdev->data_offset; + mbio->bi_bdev = conf->mirrors[d].rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE; + mbio->bi_private = r10_bio; + + atomic_inc(&r10_bio->remaining); + generic_make_request(mbio); + } + + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_write_end(mddev); + raid_end_bio_io(r10_bio); + } + + return 0; +} + +static void status(struct seq_file *seq, mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + int i; + + if (conf->near_copies < conf->raid_disks) + seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); + if (conf->near_copies > 1) + seq_printf(seq, " %d near-copies", conf->near_copies); + if (conf->far_copies > 1) + seq_printf(seq, " %d far-copies", conf->far_copies); + + seq_printf(seq, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + seq_printf(seq, "%s", + conf->mirrors[i].rdev && + conf->mirrors[i].rdev->in_sync ? "U" : "_"); + seq_printf(seq, "]"); +} + +static void error(mddev_t *mddev, mdk_rdev_t *rdev) +{ + char b[BDEVNAME_SIZE]; + conf_t *conf = mddev_to_conf(mddev); + + /* + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know. + * else mark the drive as failed + */ + if (rdev->in_sync + && conf->working_disks == 1) + /* + * Don't fail the drive, just return an IO error. 
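The write path above primes r10_bio->remaining with 1 before the clone loop so the parent cannot be completed while clones are still being issued; only the final atomic_dec_and_test(), either by the last clone's end_io or by the submitter dropping its bias, ends the master bio. A stand-alone model of that counting trick, using C11 atomics in place of the kernel's atomic_t (single-threaded simulation, illustration only):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int remaining;

    static void complete_parent(void) { printf("parent bio completed\n"); }

    /* Runs once per finished clone and once for the submitter's bias. */
    static void drop_ref(void)
    {
            if (atomic_fetch_sub(&remaining, 1) == 1)   /* we held the last reference */
                    complete_parent();
    }

    int main(void)
    {
            int i, nclones = 3;

            atomic_store(&remaining, 1);        /* bias: submitter holds a reference  */

            for (i = 0; i < nclones; i++) {
                    atomic_fetch_add(&remaining, 1);
                    /* generic_make_request(clone) would go here; the clone may
                     * even finish before this loop does */
                    drop_ref();                 /* simulate that clone completing     */
            }

            drop_ref();                         /* submitter drops its bias reference */
            return 0;
    }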
+ * The test should really be more sophisticated than + * "working_disks == 1", but it isn't critical, and + * can wait until we do more sophisticated "is the drive + * really dead" tests... + */ + return; + if (rdev->in_sync) { + mddev->degraded++; + conf->working_disks--; + /* + * if recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + } + rdev->in_sync = 0; + rdev->faulty = 1; + mddev->sb_dirty = 1; + printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" + " Operation continuing on %d devices\n", + bdevname(rdev->bdev,b), conf->working_disks); +} + +static void print_conf(conf_t *conf) +{ + int i; + mirror_info_t *tmp; + + printk("RAID10 conf printout:\n"); + if (!conf) { + printk("(!conf)\n"); + return; + } + printk(" --- wd:%d rd:%d\n", conf->working_disks, + conf->raid_disks); + + for (i = 0; i < conf->raid_disks; i++) { + char b[BDEVNAME_SIZE]; + tmp = conf->mirrors + i; + if (tmp->rdev) + printk(" disk %d, wo:%d, o:%d, dev:%s\n", + i, !tmp->rdev->in_sync, !tmp->rdev->faulty, + bdevname(tmp->rdev->bdev,b)); + } +} + +static void close_sync(conf_t *conf) +{ + spin_lock_irq(&conf->resync_lock); + wait_event_lock_irq(conf->wait_resume, !conf->barrier, + conf->resync_lock, unplug_slaves(conf->mddev)); + spin_unlock_irq(&conf->resync_lock); + + if (conf->barrier) BUG(); + if (waitqueue_active(&conf->wait_idle)) BUG(); + + mempool_destroy(conf->r10buf_pool); + conf->r10buf_pool = NULL; +} + +static int raid10_spare_active(mddev_t *mddev) +{ + int i; + conf_t *conf = mddev->private; + mirror_info_t *tmp; + + spin_lock_irq(&conf->device_lock); + /* + * Find all non-in_sync disks within the RAID10 configuration + * and mark them in_sync + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if (tmp->rdev + && !tmp->rdev->faulty + && !tmp->rdev->in_sync) { + conf->working_disks++; + mddev->degraded--; + tmp->rdev->in_sync = 1; + } + } + spin_unlock_irq(&conf->device_lock); + + print_conf(conf); + return 0; +} + + +static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + conf_t *conf = mddev->private; + int found = 0; + int mirror; + mirror_info_t *p; + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is + * very different from resync + */ + return 0; + spin_lock_irq(&conf->device_lock); + for (mirror=0; mirror < mddev->raid_disks; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + p->rdev = rdev; + + blk_queue_stack_limits(mddev->queue, + rdev->bdev->bd_disk->queue); + /* as we don't honour merge_bvec_fn, we must never risk + * violating it, so limit ->max_sector to one PAGE, as + * a one page request is never in violation. 
+ */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn && + mddev->queue->max_sectors > (PAGE_SIZE>>9)) + mddev->queue->max_sectors = (PAGE_SIZE>>9); + + p->head_position = 0; + rdev->raid_disk = mirror; + found = 1; + break; + } + spin_unlock_irq(&conf->device_lock); + + print_conf(conf); + return found; +} + +static int raid10_remove_disk(mddev_t *mddev, int number) +{ + conf_t *conf = mddev->private; + int err = 1; + mirror_info_t *p = conf->mirrors+ number; + + print_conf(conf); + spin_lock_irq(&conf->device_lock); + if (p->rdev) { + if (p->rdev->in_sync || + atomic_read(&p->rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + p->rdev = NULL; + err = 0; + } + if (err) + MD_BUG(); +abort: + spin_unlock_irq(&conf->device_lock); + + print_conf(conf); + return err; +} + + +static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + conf_t *conf = mddev_to_conf(r10_bio->mddev); + int i,d; + + if (bio->bi_size) + return 1; + + for (i=0; icopies; i++) + if (r10_bio->devs[i].bio == bio) + break; + if (i == conf->copies) + BUG(); + update_head_pos(i, r10_bio); + d = r10_bio->devs[i].devnum; + if (!uptodate) + md_error(r10_bio->mddev, + conf->mirrors[d].rdev); + + /* for reconstruct, we always reschedule after a read. + * for resync, only after all reads + */ + if (test_bit(R10BIO_IsRecover, &r10_bio->state) || + atomic_dec_and_test(&r10_bio->remaining)) { + /* we have read all the blocks, + * do the comparison in process context in raid10d + */ + reschedule_retry(r10_bio); + } + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); + return 0; +} + +static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + mddev_t *mddev = r10_bio->mddev; + conf_t *conf = mddev_to_conf(mddev); + int i,d; + + if (bio->bi_size) + return 1; + + for (i = 0; i < conf->copies; i++) + if (r10_bio->devs[i].bio == bio) + break; + d = r10_bio->devs[i].devnum; + + if (!uptodate) + md_error(mddev, conf->mirrors[d].rdev); + update_head_pos(i, r10_bio); + + while (atomic_dec_and_test(&r10_bio->remaining)) { + if (r10_bio->master_bio == NULL) { + /* the primary of several recovery bios */ + md_done_sync(mddev, r10_bio->sectors, 1); + put_buf(r10_bio); + break; + } else { + r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; + put_buf(r10_bio); + r10_bio = r10_bio2; + } + } + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + return 0; +} + +/* + * Note: sync and recover and handled very differently for raid10 + * This code is for resync. + * For resync, we read through virtual addresses and read all blocks. + * If there is any error, we schedule a write. The lowest numbered + * drive is authoritative. + * However requests come for physical address, so we need to map. + * For every physical address there are raid_disks/copies virtual addresses, + * which is always are least one, but is not necessarly an integer. + * This means that a physical address can span multiple chunks, so we may + * have to submit multiple io requests for a single sync request. 
+ */ +/* + * We check if all blocks are in-sync and only write to blocks that + * aren't in sync + */ +static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(mddev); + int i, first; + struct bio *tbio, *fbio; + + atomic_set(&r10_bio->remaining, 1); + + /* find the first device with a block */ + for (i=0; icopies; i++) + if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) + break; + + if (i == conf->copies) + goto done; + + first = i; + fbio = r10_bio->devs[i].bio; + + /* now find blocks with errors */ + for (i=first+1 ; i < conf->copies ; i++) { + int vcnt, j, d; + + if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) + continue; + /* We know that the bi_io_vec layout is the same for + * both 'first' and 'i', so we just compare them. + * All vec entries are PAGE_SIZE; + */ + tbio = r10_bio->devs[i].bio; + vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); + for (j = 0; j < vcnt; j++) + if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), + page_address(tbio->bi_io_vec[j].bv_page), + PAGE_SIZE)) + break; + if (j == vcnt) + continue; + /* Ok, we need to write this bio + * First we need to fixup bv_offset, bv_len and + * bi_vecs, as the read request might have corrupted these + */ + tbio->bi_vcnt = vcnt; + tbio->bi_size = r10_bio->sectors << 9; + tbio->bi_idx = 0; + tbio->bi_phys_segments = 0; + tbio->bi_hw_segments = 0; + tbio->bi_hw_front_size = 0; + tbio->bi_hw_back_size = 0; + tbio->bi_flags &= ~(BIO_POOL_MASK - 1); + tbio->bi_flags |= 1 << BIO_UPTODATE; + tbio->bi_next = NULL; + tbio->bi_rw = WRITE; + tbio->bi_private = r10_bio; + tbio->bi_sector = r10_bio->devs[i].addr; + + for (j=0; j < vcnt ; j++) { + tbio->bi_io_vec[j].bv_offset = 0; + tbio->bi_io_vec[j].bv_len = PAGE_SIZE; + + memcpy(page_address(tbio->bi_io_vec[j].bv_page), + page_address(fbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + } + tbio->bi_end_io = end_sync_write; + + d = r10_bio->devs[i].devnum; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); + + generic_make_request(tbio); + } + +done: + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_done_sync(mddev, r10_bio->sectors, 1); + put_buf(r10_bio); + } +} + +/* + * Now for the recovery code. + * Recovery happens across physical sectors. + * We recover all non-is_sync drives by finding the virtual address of + * each, and then choose a working drive that also has that virt address. + * There is a separate r10_bio for each non-in_sync drive. + * Only the first two slots are in use. The first for reading, + * The second for writing. + * + */ + +static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) +{ + conf_t *conf = mddev_to_conf(mddev); + int i, d; + struct bio *bio, *wbio; + + + /* move the pages across to the second bio + * and submit the write request + */ + bio = r10_bio->devs[0].bio; + wbio = r10_bio->devs[1].bio; + for (i=0; i < wbio->bi_vcnt; i++) { + struct page *p = bio->bi_io_vec[i].bv_page; + bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; + wbio->bi_io_vec[i].bv_page = p; + } + d = r10_bio->devs[1].devnum; + + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); + generic_make_request(wbio); +} + + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. 
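The compare loop in sync_request_write() above walks vcnt = sectors >> (PAGE_SHIFT-9) page-sized vectors per copy. Assuming 4 KiB pages, a full 64 KiB resync buffer (RESYNC_BLOCK_SIZE, defined earlier) is 128 sectors and therefore 16 vectors, matching RESYNC_PAGES, so the memcmp() really does cover the whole buffer. The arithmetic, spelled out:

    #include <stdio.h>

    int main(void)
    {
            unsigned int resync_block = 64 * 1024;   /* RESYNC_BLOCK_SIZE */
            unsigned int page_size    = 4096;        /* assumed PAGE_SIZE */
            unsigned int page_shift   = 12;          /* log2(page_size)   */

            unsigned int sectors = resync_block >> 9;             /* 128 sectors     */
            unsigned int vcnt    = sectors >> (page_shift - 9);   /* 16 page vectors */
            unsigned int pages   = (resync_block + page_size - 1) / page_size;

            printf("sectors=%u vcnt=%u RESYNC_PAGES=%u\n", sectors, vcnt, pages);
            return 0;
    }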
+ */ + +static void raid10d(mddev_t *mddev) +{ + r10bio_t *r10_bio; + struct bio *bio; + unsigned long flags; + conf_t *conf = mddev_to_conf(mddev); + struct list_head *head = &conf->retry_list; + int unplug=0; + mdk_rdev_t *rdev; + + md_check_recovery(mddev); + md_handle_safemode(mddev); + + for (;;) { + char b[BDEVNAME_SIZE]; + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) + break; + r10_bio = list_entry(head->prev, r10bio_t, retry_list); + list_del(head->prev); + spin_unlock_irqrestore(&conf->device_lock, flags); + + mddev = r10_bio->mddev; + conf = mddev_to_conf(mddev); + if (test_bit(R10BIO_IsSync, &r10_bio->state)) { + sync_request_write(mddev, r10_bio); + unplug = 1; + } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { + recovery_request_write(mddev, r10_bio); + unplug = 1; + } else { + int mirror; + bio = r10_bio->devs[r10_bio->read_slot].bio; + r10_bio->devs[r10_bio->read_slot].bio = NULL; + bio_put(bio); + mirror = read_balance(conf, r10_bio); + if (mirror == -1) { + printk(KERN_ALERT "raid10: %s: unrecoverable I/O" + " read error for block %llu\n", + bdevname(bio->bi_bdev,b), + (unsigned long long)r10_bio->sector); + raid_end_bio_io(r10_bio); + } else { + rdev = conf->mirrors[mirror].rdev; + if (printk_ratelimit()) + printk(KERN_ERR "raid10: %s: redirecting sector %llu to" + " another mirror\n", + bdevname(rdev->bdev,b), + (unsigned long long)r10_bio->sector); + bio = bio_clone(r10_bio->master_bio, GFP_NOIO); + r10_bio->devs[r10_bio->read_slot].bio = bio; + bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_rw = READ; + bio->bi_private = r10_bio; + bio->bi_end_io = raid10_end_read_request; + unplug = 1; + generic_make_request(bio); + } + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (unplug) + unplug_slaves(mddev); +} + + +static int init_resync(conf_t *conf) +{ + int buffs; + + buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; + if (conf->r10buf_pool) + BUG(); + conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); + if (!conf->r10buf_pool) + return -ENOMEM; + conf->next_resync = 0; + return 0; +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * + * This is achieved by tracking pending requests and a 'barrier' concept + * that can be installed to exclude normal IO requests. + * + * Resync and recovery are handled very differently. + * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. + * + * For resync, we iterate over virtual addresses, read all copies, + * and update if there are differences. If only one copy is live, + * skip it. + * For recovery, we iterate over physical addresses, read a good + * value for each non-in_sync drive, and over-write. + * + * So, for recovery we may have several outstanding complex requests for a + * given address, one for each out-of-sync device. We model this by allocating + * a number of r10_bio structures, one for each out-of-sync device. + * As we setup these structures, we collect all bio's together into a list + * which we then process collectively to add pages, and then process again + * to pass to generic_make_request. + * + * The r10_bio structures are linked using a borrowed master_bio pointer. + * This link is counted in ->remaining. 
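Two details of the machinery above, made concrete. First, init_resync() reserves RESYNC_WINDOW / RESYNC_BLOCK_SIZE = 2 MiB / 64 KiB = 32 resync buffers, which happens to equal the RESYNC_DEPTH barrier limit used for throttling. Second, the borrowed master_bio chain is unwound by end_sync_write() (earlier in this file) walking parent links while each reference count hits zero; a simplified stand-alone model of that walk (hypothetical struct, single-threaded, illustration only):

    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-in for r10bio_t: 'master' borrows the master_bio
     * pointer to reach the next element up the chain; the head has
     * master == NULL and one extra reference per child hanging off it. */
    struct rbio {
            int remaining;
            struct rbio *master;
    };

    static void completed(struct rbio *r)
    {
            while (--r->remaining == 0) {
                    if (r->master == NULL) {
                            /* head of the chain: whole complex operation done
                             * (md_done_sync() + put_buf() in the real code)  */
                            printf("whole recovery operation finished\n");
                            break;
                    }
                    /* this element is done: free it (put_buf()) and drop the
                     * reference it held on its parent, then keep walking up */
                    r = r->master;
            }
    }

    int main(void)
    {
            struct rbio head  = { .remaining = 2, .master = NULL };  /* own IO + 1 child */
            struct rbio child = { .remaining = 1, .master = &head };

            completed(&child);   /* child finishes: frees child, drops one ref on head */
            completed(&head);    /* head's own IO finishes: chain complete             */
            return 0;
    }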
When the r10_bio that points to NULL + * has its remaining count decremented to 0, the whole complex operation + * is complete. + * + */ + +static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) +{ + conf_t *conf = mddev_to_conf(mddev); + r10bio_t *r10_bio; + struct bio *biolist = NULL, *bio; + sector_t max_sector, nr_sectors; + int disk; + int i; + + sector_t sectors_skipped = 0; + int chunks_skipped = 0; + + if (!conf->r10buf_pool) + if (init_resync(conf)) + return -ENOMEM; + + skipped: + max_sector = mddev->size << 1; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_sector = mddev->resync_max_sectors; + if (sector_nr >= max_sector) { + close_sync(conf); + return sectors_skipped; + } + if (chunks_skipped >= conf->raid_disks) { + /* if there has been nothing to do on any drive, + * then there is nothing to do at all.. + */ + sector_t sec = max_sector - sector_nr; + md_done_sync(mddev, sec, 1); + return sec + sectors_skipped; + } + + /* make sure whole request will fit in a chunk - if chunks + * are meaningful + */ + if (conf->near_copies < conf->raid_disks && + max_sector > (sector_nr | conf->chunk_mask)) + max_sector = (sector_nr | conf->chunk_mask) + 1; + /* + * If there is non-resync activity waiting for us then + * put in a delay to throttle resync. + */ + if (!go_faster && waitqueue_active(&conf->wait_resume)) + schedule_timeout(HZ); + device_barrier(conf, sector_nr + RESYNC_SECTORS); + + /* Again, very different code for resync and recovery. + * Both must result in an r10bio with a list of bios that + * have bi_end_io, bi_sector, bi_bdev set, + * and bi_private set to the r10bio. + * For recovery, we may actually create several r10bios + * with 2 bios in each, that correspond to the bios in the main one. + * In this case, the subordinate r10bios link back through a + * borrowed master_bio pointer, and the counter in the master + * includes a ref from each subordinate. + */ + /* First, we decide what to do and set ->bi_end_io + * To end_sync_read if we want to read, and + * end_sync_write if we will want to write. + */ + + if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* recovery... 
the complicated one */ + int i, j, k; + r10_bio = NULL; + + for (i=0 ; iraid_disks; i++) + if (conf->mirrors[i].rdev && + !conf->mirrors[i].rdev->in_sync) { + /* want to reconstruct this device */ + r10bio_t *rb2 = r10_bio; + + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + spin_lock_irq(&conf->resync_lock); + conf->nr_pending++; + if (rb2) conf->barrier++; + spin_unlock_irq(&conf->resync_lock); + atomic_set(&r10_bio->remaining, 0); + + r10_bio->master_bio = (struct bio*)rb2; + if (rb2) + atomic_inc(&rb2->remaining); + r10_bio->mddev = mddev; + set_bit(R10BIO_IsRecover, &r10_bio->state); + r10_bio->sector = raid10_find_virt(conf, sector_nr, i); + raid10_find_phys(conf, r10_bio); + for (j=0; jcopies;j++) { + int d = r10_bio->devs[j].devnum; + if (conf->mirrors[d].rdev && + conf->mirrors[d].rdev->in_sync) { + /* This is where we read from */ + bio = r10_bio->devs[0].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_read; + bio->bi_rw = 0; + bio->bi_sector = r10_bio->devs[j].addr + + conf->mirrors[d].rdev->data_offset; + bio->bi_bdev = conf->mirrors[d].rdev->bdev; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + /* and we write to 'i' */ + + for (k=0; kcopies; k++) + if (r10_bio->devs[k].devnum == i) + break; + bio = r10_bio->devs[1].bio; + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = 1; + bio->bi_sector = r10_bio->devs[k].addr + + conf->mirrors[i].rdev->data_offset; + bio->bi_bdev = conf->mirrors[i].rdev->bdev; + + r10_bio->devs[0].devnum = d; + r10_bio->devs[1].devnum = i; + + break; + } + } + if (j == conf->copies) { + BUG(); + } + } + if (biolist == NULL) { + while (r10_bio) { + r10bio_t *rb2 = r10_bio; + r10_bio = (r10bio_t*) rb2->master_bio; + rb2->master_bio = NULL; + put_buf(rb2); + } + goto giveup; + } + } else { + /* resync. 
Schedule a read for every block at this virt offset */ + int count = 0; + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + + spin_lock_irq(&conf->resync_lock); + conf->nr_pending++; + spin_unlock_irq(&conf->resync_lock); + + r10_bio->mddev = mddev; + atomic_set(&r10_bio->remaining, 0); + + r10_bio->master_bio = NULL; + r10_bio->sector = sector_nr; + set_bit(R10BIO_IsSync, &r10_bio->state); + raid10_find_phys(conf, r10_bio); + r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; + spin_lock_irq(&conf->device_lock); + for (i=0; icopies; i++) { + int d = r10_bio->devs[i].devnum; + bio = r10_bio->devs[i].bio; + bio->bi_end_io = NULL; + if (conf->mirrors[d].rdev == NULL || + conf->mirrors[d].rdev->faulty) + continue; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&r10_bio->remaining); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_read; + bio->bi_rw = 0; + bio->bi_sector = r10_bio->devs[i].addr + + conf->mirrors[d].rdev->data_offset; + bio->bi_bdev = conf->mirrors[d].rdev->bdev; + count++; + } + spin_unlock_irq(&conf->device_lock); + if (count < 2) { + for (i=0; icopies; i++) { + int d = r10_bio->devs[i].devnum; + if (r10_bio->devs[i].bio->bi_end_io) + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + } + put_buf(r10_bio); + biolist = NULL; + goto giveup; + } + } + + for (bio = biolist; bio ; bio=bio->bi_next) { + + bio->bi_flags &= ~(BIO_POOL_MASK - 1); + if (bio->bi_end_io) + bio->bi_flags |= 1 << BIO_UPTODATE; + bio->bi_vcnt = 0; + bio->bi_idx = 0; + bio->bi_phys_segments = 0; + bio->bi_hw_segments = 0; + bio->bi_size = 0; + } + + nr_sectors = 0; + do { + struct page *page; + int len = PAGE_SIZE; + disk = 0; + if (sector_nr + (len>>9) > max_sector) + len = (max_sector - sector_nr) << 9; + if (len == 0) + break; + for (bio= biolist ; bio ; bio=bio->bi_next) { + page = bio->bi_io_vec[bio->bi_vcnt].bv_page; + if (bio_add_page(bio, page, len, 0) == 0) { + /* stop here */ + struct bio *bio2; + bio->bi_io_vec[bio->bi_vcnt].bv_page = page; + for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { + /* remove last page from this bio */ + bio2->bi_vcnt--; + bio2->bi_size -= len; + bio2->bi_flags &= ~(1<< BIO_SEG_VALID); + } + goto bio_full; + } + disk = i; + } + nr_sectors += len>>9; + sector_nr += len>>9; + } while (biolist->bi_vcnt < RESYNC_PAGES); + bio_full: + r10_bio->sectors = nr_sectors; + + while (biolist) { + bio = biolist; + biolist = biolist->bi_next; + + bio->bi_next = NULL; + r10_bio = bio->bi_private; + r10_bio->sectors = nr_sectors; + + if (bio->bi_end_io == end_sync_read) { + md_sync_acct(bio->bi_bdev, nr_sectors); + generic_make_request(bio); + } + } + + return sectors_skipped + nr_sectors; + giveup: + /* There is nowhere to write, so all non-sync + * drives must be failed, so try the next chunk... + */ + { + int sec = max_sector - sector_nr; + sectors_skipped += sec; + chunks_skipped ++; + sector_nr = max_sector; + md_done_sync(mddev, sec, 1); + goto skipped; + } +} + +static int run(mddev_t *mddev) +{ + conf_t *conf; + int i, disk_idx; + mirror_info_t *disk; + mdk_rdev_t *rdev; + struct list_head *tmp; + int nc, fc; + sector_t stride, size; + + if (mddev->level != 10) { + printk(KERN_ERR "raid10: %s: raid level not set correctly... 
(%d)\n", + mdname(mddev), mddev->level); + goto out; + } + nc = mddev->layout & 255; + fc = (mddev->layout >> 8) & 255; + if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || + (mddev->layout >> 16)) { + printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", + mdname(mddev), mddev->layout); + goto out; + } + /* + * copy the already verified devices into our private RAID10 + * bookkeeping area. [whatever we allocate in run(), + * should be freed in stop()] + */ + conf = kmalloc(sizeof(conf_t), GFP_KERNEL); + mddev->private = conf; + if (!conf) { + printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out; + } + memset(conf, 0, sizeof(*conf)); + conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, + GFP_KERNEL); + if (!conf->mirrors) { + printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out_free_conf; + } + memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); + + conf->near_copies = nc; + conf->far_copies = fc; + conf->copies = nc*fc; + conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; + conf->chunk_shift = ffz(~mddev->chunk_size) - 9; + stride = mddev->size >> (conf->chunk_shift-1); + sector_div(stride, fc); + conf->stride = stride << conf->chunk_shift; + + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, + r10bio_pool_free, conf); + if (!conf->r10bio_pool) { + printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", + mdname(mddev)); + goto out_free_conf; + } + mddev->queue->unplug_fn = raid10_unplug; + + mddev->queue->issue_flush_fn = raid10_issue_flush; + + ITERATE_RDEV(mddev, rdev, tmp) { + disk_idx = rdev->raid_disk; + if (disk_idx >= mddev->raid_disks + || disk_idx < 0) + continue; + disk = conf->mirrors + disk_idx; + + disk->rdev = rdev; + + blk_queue_stack_limits(mddev->queue, + rdev->bdev->bd_disk->queue); + /* as we don't honour merge_bvec_fn, we must never risk + * violating it, so limit ->max_sector to one PAGE, as + * a one page request is never in violation. + */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn && + mddev->queue->max_sectors > (PAGE_SIZE>>9)) + mddev->queue->max_sectors = (PAGE_SIZE>>9); + + disk->head_position = 0; + if (!rdev->faulty && rdev->in_sync) + conf->working_disks++; + } + conf->raid_disks = mddev->raid_disks; + conf->mddev = mddev; + conf->device_lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&conf->retry_list); + + conf->resync_lock = SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_idle); + init_waitqueue_head(&conf->wait_resume); + + if (!conf->working_disks) { + printk(KERN_ERR "raid10: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + + mddev->degraded = 0; + for (i = 0; i < conf->raid_disks; i++) { + + disk = conf->mirrors + i; + + if (!disk->rdev) { + disk->head_position = 0; + mddev->degraded++; + } + } + + + mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); + if (!mddev->thread) { + printk(KERN_ERR + "raid10: couldn't allocate thread for %s\n", + mdname(mddev)); + goto out_free_conf; + } + + printk(KERN_INFO + "raid10: raid set %s active with %d out of %d devices\n", + mdname(mddev), mddev->raid_disks - mddev->degraded, + mddev->raid_disks); + /* + * Ok, everything is just fine now + */ + size = conf->stride * conf->raid_disks; + sector_div(size, conf->near_copies); + mddev->array_size = size/2; + mddev->resync_max_sectors = size; + + /* Calculate max read-ahead size. + * We need to readahead at least twice a whole stripe.... + * maybe... 
+ */ + { + int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; + stripe /= conf->near_copies; + if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) + mddev->queue->backing_dev_info.ra_pages = 2* stripe; + } + + if (conf->near_copies < mddev->raid_disks) + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + return 0; + +out_free_conf: + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + if (conf->mirrors) + kfree(conf->mirrors); + kfree(conf); + mddev->private = NULL; +out: + return -EIO; +} + +static int stop(mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + + md_unregister_thread(mddev->thread); + mddev->thread = NULL; + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + if (conf->mirrors) + kfree(conf->mirrors); + kfree(conf); + mddev->private = NULL; + return 0; +} + + +static mdk_personality_t raid10_personality = +{ + .name = "raid10", + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid10_add_disk, + .hot_remove_disk= raid10_remove_disk, + .spare_active = raid10_spare_active, + .sync_request = sync_request, +}; + +static int __init raid_init(void) +{ + return register_md_personality(RAID10, &raid10_personality); +} + +static void raid_exit(void) +{ + unregister_md_personality(RAID10); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("md-personality-9"); /* RAID10 */ diff -pruN ./drivers/md.dm/raid1.c ./drivers/md/raid1.c --- ./drivers/md.dm/raid1.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/raid1.c 2006-03-17 13:16:38.000000000 +0300 @@ -24,10 +24,6 @@ #include -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY - /* * Number of guaranteed r1bios in case of extreme VM load: */ @@ -44,13 +40,12 @@ static void * r1bio_pool_alloc(int gfp_f { struct pool_info *pi = data; r1bio_t *r1_bio; + int size = offsetof(r1bio_t, bios[pi->raid_disks]); /* allocate a r1bio with room for raid_disks entries in the bios array */ - r1_bio = kmalloc(sizeof(r1bio_t) + sizeof(struct bio*)*pi->raid_disks, - gfp_flags); + r1_bio = kmalloc(size, gfp_flags); if (r1_bio) - memset(r1_bio, 0, sizeof(*r1_bio) + - sizeof(struct bio*) * pi->raid_disks); + memset(r1_bio, 0, size); else unplug_slaves(pi->mddev); @@ -104,7 +99,7 @@ static void * r1buf_pool_alloc(int gfp_f bio->bi_io_vec[i].bv_page = page; } - r1_bio->master_bio = bio; + r1_bio->master_bio = NULL; return r1_bio; @@ -189,32 +184,6 @@ static inline void put_buf(r1bio_t *r1_b spin_unlock_irqrestore(&conf->resync_lock, flags); } -static int map(mddev_t *mddev, mdk_rdev_t **rdevp) -{ - conf_t *conf = mddev_to_conf(mddev); - int i, disks = conf->raid_disks; - - /* - * Later we do read balancing on the read side - * now we use the first available disk. 
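The raid1 read_balance() rework in the hunks that follow bumps rdev->nr_pending first and only then re-checks in_sync, retrying elsewhere if the device failed in between; the write path gets the same inc-then-recheck treatment for ->faulty. A stand-alone model of that "take the reference, then re-validate" pattern, using C11 atomics instead of the kernel's atomic_t (no real concurrency in this toy, it only shows the ordering):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct dev {
            atomic_int  nr_pending;   /* in-flight references                    */
            atomic_bool in_sync;      /* cleared asynchronously if 'dev' fails   */
    };

    /* Pin 'd' for an I/O.  Returns false if it failed while we were looking. */
    static bool pin_device(struct dev *d)
    {
            atomic_fetch_add(&d->nr_pending, 1);          /* publish our interest first  */
            if (!atomic_load(&d->in_sync)) {              /* ...then re-check the status */
                    atomic_fetch_sub(&d->nr_pending, 1);  /* raced with a failure: undo  */
                    return false;                         /* caller picks another device */
            }
            return true;
    }

    int main(void)
    {
            struct dev d;

            atomic_store(&d.nr_pending, 0);
            atomic_store(&d.in_sync, true);

            if (pin_device(&d))
                    printf("device pinned, nr_pending=%d\n",
                           atomic_load(&d.nr_pending));
            return 0;
    }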
- */ - - spin_lock_irq(&conf->device_lock); - for (i = 0; i < disks; i++) { - mdk_rdev_t *rdev = conf->mirrors[i].rdev; - if (rdev && rdev->in_sync) { - *rdevp = rdev; - atomic_inc(&rdev->nr_pending); - spin_unlock_irq(&conf->device_lock); - return i; - } - } - spin_unlock_irq(&conf->device_lock); - - printk(KERN_ERR "raid1_map(): huh, no more operational devices?\n"); - return -1; -} - static void reschedule_retry(r1bio_t *r1_bio) { unsigned long flags; @@ -292,8 +261,9 @@ static int raid1_end_read_request(struct * oops, read error: */ char b[BDEVNAME_SIZE]; - printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", - bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); + if (printk_ratelimit()) + printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", + bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); reschedule_retry(r1_bio); } @@ -363,12 +333,13 @@ static int raid1_end_write_request(struc * * The rdev for the device selected will have nr_pending incremented. */ -static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio) +static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const unsigned long this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; - const int sectors = bio->bi_size >> 9; + const int sectors = r1_bio->sectors; sector_t new_distance, current_distance; + mdk_rdev_t *new_rdev, *rdev; spin_lock_irq(&conf->device_lock); /* @@ -376,16 +347,17 @@ static int read_balance(conf_t *conf, st * device if no resync is going on, or below the resync window. * We take the first readable disk when above the resync window. */ + retry: if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { - /* make sure that disk is operational */ + /* Choose the first operation device, for consistancy */ new_disk = 0; - while (!conf->mirrors[new_disk].rdev || - !conf->mirrors[new_disk].rdev->in_sync) { + while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || + !new_rdev->in_sync) { new_disk++; if (new_disk == conf->raid_disks) { - new_disk = 0; + new_disk = -1; break; } } @@ -394,13 +366,13 @@ static int read_balance(conf_t *conf, st /* make sure the disk is operational */ - while (!conf->mirrors[new_disk].rdev || - !conf->mirrors[new_disk].rdev->in_sync) { + while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || + !new_rdev->in_sync) { if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) { - new_disk = conf->last_used; + new_disk = -1; goto rb_out; } } @@ -424,29 +396,38 @@ static int read_balance(conf_t *conf, st disk = conf->raid_disks; disk--; - if (!conf->mirrors[disk].rdev || - !conf->mirrors[disk].rdev->in_sync) + if ((rdev=conf->mirrors[disk].rdev) == NULL || + !rdev->in_sync) continue; - if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) { + if (!atomic_read(&rdev->nr_pending)) { new_disk = disk; + new_rdev = rdev; break; } new_distance = abs(this_sector - conf->mirrors[disk].head_position); if (new_distance < current_distance) { current_distance = new_distance; new_disk = disk; + new_rdev = rdev; } } while (disk != conf->last_used); rb_out: - r1_bio->read_disk = new_disk; - conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; - if (conf->mirrors[new_disk].rdev) - atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending); + if (new_disk >= 0) { + conf->next_seq_sect = this_sector + sectors; + conf->last_used = new_disk; + atomic_inc(&new_rdev->nr_pending); + if (!new_rdev->in_sync) { + /* cannot 
risk returning a device that failed + * before we inc'ed nr_pending + */ + atomic_dec(&new_rdev->nr_pending); + goto retry; + } + } spin_unlock_irq(&conf->device_lock); return new_disk; @@ -471,7 +452,7 @@ static void unplug_slaves(mddev_t *mddev r_queue->unplug_fn(r_queue); spin_lock_irqsave(&conf->device_lock, flags); - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -481,6 +462,32 @@ static void raid1_unplug(request_queue_t unplug_slaves(q->queuedata); } +static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + conf_t *conf = mddev_to_conf(mddev); + unsigned long flags; + int i, ret = 0; + + spin_lock_irqsave(&conf->device_lock, flags); + for (i=0; iraid_disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue = bdev_get_queue(bdev); + + if (r_queue->issue_flush_fn) { + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + return ret; +} + /* * Throttle resync depth, so that we can both get proper overlapping of * requests, but are still able to handle normal requests quickly. @@ -513,6 +520,7 @@ static int make_request(request_queue_t r1bio_t *r1_bio; struct bio *read_bio; int i, disks; + mdk_rdev_t *rdev; /* * Register the new request and wait if the reconstruction @@ -545,15 +553,26 @@ static int make_request(request_queue_t r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; + r1_bio->state = 0; + if (bio_data_dir(bio) == READ) { /* * read balancing logic: */ - mirror = conf->mirrors + read_balance(conf, bio, r1_bio); + int rdisk = read_balance(conf, r1_bio); + + if (rdisk < 0) { + /* couldn't find anywhere to read from */ + raid_end_bio_io(r1_bio); + return 0; + } + mirror = conf->mirrors + rdisk; + + r1_bio->read_disk = rdisk; read_bio = bio_clone(bio, GFP_NOIO); - r1_bio->bios[r1_bio->read_disk] = read_bio; + r1_bio->bios[rdisk] = read_bio; read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; @@ -575,10 +594,14 @@ static int make_request(request_queue_t disks = conf->raid_disks; spin_lock_irq(&conf->device_lock); for (i = 0; i < disks; i++) { - if (conf->mirrors[i].rdev && - !conf->mirrors[i].rdev->faulty) { - atomic_inc(&conf->mirrors[i].rdev->nr_pending); - r1_bio->bios[i] = bio; + if ((rdev=conf->mirrors[i].rdev) != NULL && + !rdev->faulty) { + atomic_inc(&rdev->nr_pending); + if (rdev->faulty) { + atomic_dec(&rdev->nr_pending); + r1_bio->bios[i] = NULL; + } else + r1_bio->bios[i] = bio; } else r1_bio->bios[i] = NULL; } @@ -746,7 +769,7 @@ static int raid1_add_disk(mddev_t *mddev */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; @@ -877,7 +900,7 @@ static void sync_request_write(mddev_t * atomic_inc(&conf->mirrors[i].rdev->nr_pending); atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev, wbio->bi_size >> 9); + md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); generic_make_request(wbio); } @@ -925,7 +948,7 @@ static void raid1d(mddev_t *mddev) } else { int disk; bio = r1_bio->bios[r1_bio->read_disk]; - if ((disk=map(mddev, &rdev)) == -1) { + if 
((disk=read_balance(conf, r1_bio)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" " read error for block %llu\n", bdevname(bio->bi_bdev,b), @@ -934,14 +957,20 @@ static void raid1d(mddev_t *mddev) } else { r1_bio->bios[r1_bio->read_disk] = NULL; r1_bio->read_disk = disk; + bio_put(bio); + bio = bio_clone(r1_bio->master_bio, GFP_NOIO); r1_bio->bios[r1_bio->read_disk] = bio; - printk(KERN_ERR "raid1: %s: redirecting sector %llu to" - " another mirror\n", - bdevname(rdev->bdev,b), - (unsigned long long)r1_bio->sector); - bio->bi_bdev = rdev->bdev; + rdev = conf->mirrors[disk].rdev; + if (printk_ratelimit()) + printk(KERN_ERR "raid1: %s: redirecting sector %llu to" + " another mirror\n", + bdevname(rdev->bdev,b), + (unsigned long long)r1_bio->sector); bio->bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_end_io = raid1_end_read_request; bio->bi_rw = READ; + bio->bi_private = r1_bio; unplug = 1; generic_make_request(bio); } @@ -1078,7 +1107,7 @@ static int sync_request(mddev_t *mddev, int rv = max_sector - sector_nr; md_done_sync(mddev, rv, 1); put_buf(r1_bio); - atomic_dec(&conf->mirrors[disk].rdev->nr_pending); + rdev_dec_pending(conf->mirrors[disk].rdev, mddev); return rv; } @@ -1117,7 +1146,7 @@ static int sync_request(mddev_t *mddev, bio = r1_bio->bios[disk]; r1_bio->sectors = nr_sectors; - md_sync_acct(mirror->rdev, nr_sectors); + md_sync_acct(mirror->rdev->bdev, nr_sectors); generic_make_request(bio); @@ -1168,6 +1197,7 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; + mddev->queue->issue_flush_fn = raid1_issue_flush; ITERATE_RDEV(mddev, rdev, tmp) { disk_idx = rdev->raid_disk; @@ -1186,7 +1216,7 @@ static int run(mddev_t *mddev) */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; if (!rdev->faulty && rdev->in_sync) @@ -1328,7 +1358,7 @@ static int raid1_reshape(mddev_t *mddev, if (conf->mirrors[d].rdev) return -EBUSY; - newpoolinfo = kmalloc(sizeof(newpoolinfo), GFP_KERNEL); + newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); if (!newpoolinfo) return -ENOMEM; newpoolinfo->mddev = mddev; diff -pruN ./drivers/md.dm/raid5.c ./drivers/md/raid5.c --- ./drivers/md.dm/raid5.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/raid5.c 2006-03-17 13:16:38.000000000 +0300 @@ -457,6 +457,7 @@ static void raid5_build_block (struct st bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; dev->req.bi_vcnt++; + dev->req.bi_max_vecs++; dev->vec.bv_page = dev->page; dev->vec.bv_len = STRIPE_SIZE; dev->vec.bv_offset = 0; @@ -477,8 +478,8 @@ static void error(mddev_t *mddev, mdk_rd if (!rdev->faulty) { mddev->sb_dirty = 1; - conf->working_disks--; if (rdev->in_sync) { + conf->working_disks--; mddev->degraded++; conf->failed_disks++; rdev->in_sync = 0; @@ -1071,7 +1072,8 @@ static void handle_stripe(struct stripe_ PRINTK("Reading block %d (sync=%d)\n", i, syncing); if (syncing) - md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS); + md_sync_acct(conf->disks[i].rdev->bdev, + STRIPE_SECTORS); } } } @@ -1256,7 +1258,7 @@ static void handle_stripe(struct stripe_ if (rdev) { if (test_bit(R5_Syncio, &sh->dev[i].flags)) - md_sync_acct(rdev, STRIPE_SECTORS); + md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; PRINTK("for %llu schedule op %ld on disc %d\n", @@ -1265,6 +1267,7 @@ static void handle_stripe(struct stripe_ bi->bi_sector = sh->sector + 
rdev->data_offset; bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; bi->bi_idx = 0; bi->bi_io_vec = &sh->dev[i].vec; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1316,7 +1319,7 @@ static void unplug_slaves(mddev_t *mddev r_queue->unplug_fn(r_queue); spin_lock_irqsave(&conf->device_lock, flags); - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1328,6 +1331,8 @@ static void raid5_unplug_device(request_ raid5_conf_t *conf = mddev_to_conf(mddev); unsigned long flags; + if (!conf) return; + spin_lock_irqsave(&conf->device_lock, flags); if (blk_remove_plug(q)) @@ -1339,6 +1344,39 @@ static void raid5_unplug_device(request_ unplug_slaves(mddev); } +static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<conf->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue; + + if (!bdev) + continue; + + r_queue = bdev_get_queue(bdev); + if (!r_queue) + continue; + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} + static inline void raid5_plug_device(raid5_conf_t *conf) { spin_lock_irq(&conf->device_lock); @@ -1545,6 +1583,7 @@ static int run (mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); mddev->queue->unplug_fn = raid5_unplug_device; + mddev->queue->issue_flush_fn = raid5_issue_flush; PRINTK("raid5: run(%s) called.\n", mdname(mddev)); diff -pruN ./drivers/md.dm/raid6main.c ./drivers/md/raid6main.c --- ./drivers/md.dm/raid6main.c 2006-03-17 08:57:42.000000000 +0300 +++ ./drivers/md/raid6main.c 2006-03-17 13:16:38.000000000 +0300 @@ -478,6 +478,7 @@ static void raid6_build_block (struct st bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; dev->req.bi_vcnt++; + dev->req.bi_max_vecs++; dev->vec.bv_page = dev->page; dev->vec.bv_len = STRIPE_SIZE; dev->vec.bv_offset = 0; @@ -498,8 +499,8 @@ static void error(mddev_t *mddev, mdk_rd if (!rdev->faulty) { mddev->sb_dirty = 1; - conf->working_disks--; if (rdev->in_sync) { + conf->working_disks--; mddev->degraded++; conf->failed_disks++; rdev->in_sync = 0; @@ -1208,7 +1209,8 @@ static void handle_stripe(struct stripe_ PRINTK("Reading block %d (sync=%d)\n", i, syncing); if (syncing) - md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS); + md_sync_acct(conf->disks[i].rdev->bdev, + STRIPE_SECTORS); } } } @@ -1418,7 +1420,7 @@ static void handle_stripe(struct stripe_ if (rdev) { if (test_bit(R5_Syncio, &sh->dev[i].flags)) - md_sync_acct(rdev, STRIPE_SECTORS); + md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; PRINTK("for %llu schedule op %ld on disc %d\n", @@ -1427,6 +1429,7 @@ static void handle_stripe(struct stripe_ bi->bi_sector = sh->sector + rdev->data_offset; bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; bi->bi_idx = 0; bi->bi_io_vec = &sh->dev[i].vec; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1478,7 +1481,7 @@ static void unplug_slaves(mddev_t *mddev r_queue->unplug_fn(r_queue); spin_lock_irqsave(&conf->device_lock, flags); - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1501,6 +1504,39 @@ static void raid6_unplug_device(request_ 
unplug_slaves(mddev); } +static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, + sector_t *error_sector) +{ + mddev_t *mddev = q->queuedata; + raid6_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i=0; i<conf->raid_disks; i++) { + mdk_rdev_t *rdev = conf->disks[i].rdev; + if (rdev && !rdev->faulty) { + struct block_device *bdev = rdev->bdev; + request_queue_t *r_queue; + + if (!bdev) + continue; + + r_queue = bdev_get_queue(bdev); + if (!r_queue) + continue; + + if (!r_queue->issue_flush_fn) { + ret = -EOPNOTSUPP; + break; + } + + ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector); + if (ret) + break; + } + } + return ret; +} + static inline void raid6_plug_device(raid6_conf_t *conf) { spin_lock_irq(&conf->device_lock); @@ -1708,6 +1744,7 @@ static int run (mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); mddev->queue->unplug_fn = raid6_unplug_device; + mddev->queue->issue_flush_fn = raid6_issue_flush; PRINTK("raid6: run(%s) called.\n", mdname(mddev)); --- ./include/linux/compat_ioctl.h.dm 2006-03-17 08:58:47.000000000 +0300 +++ ./include/linux/compat_ioctl.h 2006-03-17 08:16:12.000000000 +0300 @@ -102,6 +102,7 @@ COMPATIBLE_IOCTL(BLKROGET) COMPATIBLE_IOCTL(BLKRRPART) COMPATIBLE_IOCTL(BLKFLSBUF) COMPATIBLE_IOCTL(BLKSECTSET) +COMPATIBLE_IOCTL(BLKSECTGET) COMPATIBLE_IOCTL(BLKSSZGET) ULONG_IOCTL(BLKRASET) ULONG_IOCTL(BLKFRASET) @@ -141,6 +142,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR_32) COMPATIBLE_IOCTL(DM_TABLE_DEPS_32) COMPATIBLE_IOCTL(DM_TABLE_STATUS_32) COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32) +COMPATIBLE_IOCTL(DM_TARGET_MSG_32) COMPATIBLE_IOCTL(DM_VERSION) COMPATIBLE_IOCTL(DM_REMOVE_ALL) COMPATIBLE_IOCTL(DM_LIST_DEVICES) @@ -155,6 +157,7 @@ COMPATIBLE_IOCTL(DM_TABLE_CLEAR) COMPATIBLE_IOCTL(DM_TABLE_DEPS) COMPATIBLE_IOCTL(DM_TABLE_STATUS) COMPATIBLE_IOCTL(DM_LIST_VERSIONS) +COMPATIBLE_IOCTL(DM_TARGET_MSG) /* Big K */ COMPATIBLE_IOCTL(PIO_FONT) COMPATIBLE_IOCTL(GIO_FONT) @@ -387,6 +390,7 @@ COMPATIBLE_IOCTL(DVD_WRITE_STRUCT) COMPATIBLE_IOCTL(DVD_AUTH) /* Big L */ ULONG_IOCTL(LOOP_SET_FD) +ULONG_IOCTL(LOOP_CHANGE_FD) COMPATIBLE_IOCTL(LOOP_CLR_FD) COMPATIBLE_IOCTL(LOOP_GET_STATUS64) COMPATIBLE_IOCTL(LOOP_SET_STATUS64) @@ -595,13 +599,15 @@ COMPATIBLE_IOCTL(ATMTCP_CREATE) COMPATIBLE_IOCTL(ATMTCP_REMOVE) COMPATIBLE_IOCTL(ATMMPC_CTRL) COMPATIBLE_IOCTL(ATMMPC_DATA) -/* Big W */ -/* WIOC_GETSUPPORT not yet implemented -E */ +/* Watchdog */ +COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) COMPATIBLE_IOCTL(WDIOC_GETSTATUS) COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) COMPATIBLE_IOCTL(WDIOC_GETTEMP) COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) +COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) /* Big R */ COMPATIBLE_IOCTL(RNDGETENTCNT) COMPATIBLE_IOCTL(RNDADDTOENTCNT) @@ -735,3 +741,20 @@ COMPATIBLE_IOCTL(SIOCSIWRETRY) COMPATIBLE_IOCTL(SIOCGIWRETRY) COMPATIBLE_IOCTL(SIOCSIWPOWER) COMPATIBLE_IOCTL(SIOCGIWPOWER) +/* hiddev */ +COMPATIBLE_IOCTL(HIDIOCGVERSION) +COMPATIBLE_IOCTL(HIDIOCAPPLICATION) +COMPATIBLE_IOCTL(HIDIOCGDEVINFO) +COMPATIBLE_IOCTL(HIDIOCGSTRING) +COMPATIBLE_IOCTL(HIDIOCINITREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORT) +COMPATIBLE_IOCTL(HIDIOCSREPORT) +COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) +COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) +COMPATIBLE_IOCTL(HIDIOCGUSAGE) +COMPATIBLE_IOCTL(HIDIOCSUSAGE) +COMPATIBLE_IOCTL(HIDIOCGUCODE) +COMPATIBLE_IOCTL(HIDIOCGFLAG) +COMPATIBLE_IOCTL(HIDIOCSFLAG) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) +COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) --- ./include/linux/device-mapper.h.dm 2006-03-17 
08:58:56.000000000 +0300 +++ ./include/linux/device-mapper.h 2006-03-17 08:16:12.000000000 +0300 @@ -51,12 +51,15 @@ typedef int (*dm_endio_fn) (struct dm_ta struct bio *bio, int error, union map_info *map_context); -typedef void (*dm_suspend_fn) (struct dm_target *ti); +typedef void (*dm_presuspend_fn) (struct dm_target *ti); +typedef void (*dm_postsuspend_fn) (struct dm_target *ti); typedef void (*dm_resume_fn) (struct dm_target *ti); typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type, char *result, unsigned int maxlen); +typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv); + void dm_error(const char *message); /* @@ -79,9 +82,11 @@ struct target_type { dm_dtr_fn dtr; dm_map_fn map; dm_endio_fn end_io; - dm_suspend_fn suspend; + dm_presuspend_fn presuspend; + dm_postsuspend_fn postsuspend; dm_resume_fn resume; dm_status_fn status; + dm_message_fn message; }; struct io_restrictions { @@ -102,6 +107,7 @@ struct dm_target { sector_t len; /* FIXME: turn this into a mask, and merge with io_restrictions */ + /* Always a power of 2 */ sector_t split_io; /* --- ./include/linux/dm-ioctl.h.dm 2006-03-17 08:59:07.000000000 +0300 +++ ./include/linux/dm-ioctl.h 2006-03-17 08:16:12.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. + * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. * * This file is released under the LGPL. */ @@ -76,6 +77,9 @@ * * DM_TABLE_STATUS: * Return the targets status for the 'active' table. + * + * DM_TARGET_MSG: + * Pass a message string to the target at a specific offset of a device. */ /* @@ -179,6 +183,15 @@ struct dm_target_versions { }; /* + * Used to pass message to a target + */ +struct dm_target_msg { + uint64_t sector; /* Device sector */ + + char message[0]; +}; + +/* * If you change this make sure you make the corresponding change * to dm-ioctl.c:lookup_ioctl() */ @@ -204,6 +217,7 @@ enum { /* Added later */ DM_LIST_VERSIONS_CMD, + DM_TARGET_MSG_CMD, }; /* @@ -232,6 +246,7 @@ typedef char ioctl_struct[308]; #define DM_TABLE_DEPS_32 _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, ioctl_struct) #define DM_TABLE_STATUS_32 _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct) #define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct) +#define DM_TARGET_MSG_32 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct) #endif #define DM_IOCTL 0xfd @@ -254,10 +269,12 @@ typedef char ioctl_struct[308]; #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl) +#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) + #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 1 +#define DM_VERSION_MINOR 5 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2003-12-10)" +#define DM_VERSION_EXTRA "-ioctl (2005-10-04)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -283,4 +300,14 @@ typedef char ioctl_struct[308]; */ #define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ +/* + * Set this to improve performance when you aren't going to use open_count. + */ +#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */ + +/* + * Set this to avoid attempting to freeze any filesystem when suspending. 
+ */ +#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ --- ./include/linux/genhd.h.dm 2006-03-20 08:42:40.000000000 +0300 +++ ./include/linux/genhd.h 2006-03-17 13:44:40.000000000 +0300 @@ -100,7 +100,7 @@ struct gendisk { struct timer_rand_state *random; int policy; - unsigned sync_io; /* RAID */ + atomic_t sync_io; /* RAID */ unsigned long stamp, stamp_idle; int in_flight; #ifdef CONFIG_SMP diff -pruN ./include/linux/raid.dm/linear.h ./include/linux/raid/linear.h --- ./include/linux/raid.dm/linear.h 2006-03-17 13:26:03.000000000 +0300 +++ ./include/linux/raid/linear.h 2006-03-17 13:26:59.000000000 +0300 @@ -5,8 +5,8 @@ struct dev_info { mdk_rdev_t *rdev; - unsigned long size; - unsigned long offset; + sector_t size; + sector_t offset; }; typedef struct dev_info dev_info_t; diff -pruN ./include/linux/raid.dm/md.h ./include/linux/raid/md.h --- ./include/linux/raid.dm/md.h 2006-03-17 13:26:03.000000000 +0300 +++ ./include/linux/raid/md.h 2006-03-17 13:26:59.000000000 +0300 @@ -69,12 +69,10 @@ extern mdk_thread_t * md_register_thread extern void md_unregister_thread (mdk_thread_t *thread); extern void md_wakeup_thread(mdk_thread_t *thread); extern void md_check_recovery(mddev_t *mddev); -extern void md_interrupt_thread (mdk_thread_t *thread); extern void md_write_start(mddev_t *mddev); extern void md_write_end(mddev_t *mddev); extern void md_handle_safemode(mddev_t *mddev); extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors); extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev); extern void md_unplug_mddev(mddev_t *mddev); diff -pruN ./include/linux/raid.dm/md_k.h ./include/linux/raid/md_k.h --- ./include/linux/raid.dm/md_k.h 2006-03-17 13:26:03.000000000 +0300 +++ ./include/linux/raid/md_k.h 2006-03-17 13:26:59.000000000 +0300 @@ -24,7 +24,8 @@ #define HSM 6UL #define MULTIPATH 7UL #define RAID6 8UL -#define MAX_PERSONALITY 9UL +#define RAID10 9UL +#define MAX_PERSONALITY 10UL #define LEVEL_MULTIPATH (-4) #define LEVEL_LINEAR (-1) @@ -43,6 +44,7 @@ static inline int pers_to_level (int per case RAID1: return 1; case RAID5: return 5; case RAID6: return 6; + case RAID10: return 10; } BUG(); return MD_RESERVED; @@ -60,6 +62,7 @@ static inline int level_to_pers (int lev case 4: case 5: return RAID5; case 6: return RAID6; + case 10: return RAID10; } return MD_RESERVED; } @@ -216,6 +219,7 @@ struct mddev_s unsigned long resync_mark; /* a recent timestamp */ sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t resync_max_sectors; /* may be set by personality */ /* recovery/resync flags * NEEDED: we might need to start a resync/recover * RUNNING: a thread is running, or about to be started @@ -263,6 +267,11 @@ static inline void rdev_dec_pending(mdk_ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + struct mdk_personality_s { char *name; diff -pruN ./include/linux/raid.dm/raid10.h ./include/linux/raid/raid10.h --- ./include/linux/raid.dm/raid10.h 1970-01-01 03:00:00.000000000 +0300 +++ ./include/linux/raid/raid10.h 2006-03-17 13:26:59.000000000 +0300 @@ -0,0 +1,103 @@ +#ifndef _RAID10_H +#define _RAID10_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +typedef struct r10bio_s r10bio_t; + +struct r10_private_data_s { + mddev_t 
*mddev; + mirror_info_t *mirrors; + int raid_disks; + int working_disks; + spinlock_t device_lock; + + /* geometry */ + int near_copies; /* number of copies layed out raid0 style */ + int far_copies; /* number of copies layed out + * at large strides across drives + */ + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies + */ + + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + + struct list_head retry_list; + /* for use when syncing mirrors: */ + + spinlock_t resync_lock; + int nr_pending; + int barrier; + sector_t next_resync; + + wait_queue_head_t wait_idle; + wait_queue_head_t wait_resume; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; +}; + +typedef struct r10_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID10 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + */ + struct { + struct bio *bio; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* bits for r10bio.state */ +#define R10BIO_Uptodate 0 +#define R10BIO_IsSync 1 +#define R10BIO_IsRecover 2 +#endif
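
Editor's note: the three ->issue_flush_fn hooks added above (raid1_issue_flush, raid5_issue_flush, raid6_issue_flush) all follow the same shape: walk the member devices, skip missing or faulty ones, and forward the flush to each underlying queue, stopping at the first failure. The sketch below is illustrative only and not part of the patch; it assumes the 2.6-era block layer API used in the patch (request_queue_t, bdev_get_queue(), q->issue_flush_fn) and a hypothetical example_conf_t with a disks[] array.

/*
 * Illustrative sketch, not part of the patch: common shape of the
 * ->issue_flush_fn hooks added for raid1/raid5/raid6 above.
 * example_conf_t and example_issue_flush are hypothetical names.
 */
static int example_issue_flush(request_queue_t *q, struct gendisk *disk,
			       sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	example_conf_t *conf = mddev_to_conf(mddev);	/* hypothetical conf type */
	int i, ret = 0;

	for (i = 0; i < conf->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		request_queue_t *r_queue;

		if (!rdev || rdev->faulty || !rdev->bdev)
			continue;			/* skip absent/failed members */

		r_queue = bdev_get_queue(rdev->bdev);
		if (!r_queue || !r_queue->issue_flush_fn) {
			ret = -EOPNOTSUPP;		/* member cannot flush */
			break;
		}

		ret = r_queue->issue_flush_fn(r_queue, rdev->bdev->bd_disk,
					      error_sector);
		if (ret)				/* propagate first failure */
			break;
	}
	return ret;
}

The raid1 variant in the patch additionally holds conf->device_lock across the walk and silently skips members without a flush hook, while the raid5/raid6 variants return -EOPNOTSUPP in that case, as the sketch does.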
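A second recurring change in the raid1 hunks above is the nr_pending reference handling: both read_balance() (the "goto retry" path near the top of this section) and make_request() now take the reference with atomic_inc() first and then re-test ->faulty, because error() can mark the device faulty between the initial check and the increment. The annotated restatement below is only meant to make that ordering explicit; the names come from the patch context above.

	/* take the reference, then re-check ->faulty to close the race
	 * with error() marking the device failed concurrently */
	if ((rdev = conf->mirrors[i].rdev) != NULL && !rdev->faulty) {
		atomic_inc(&rdev->nr_pending);
		if (rdev->faulty) {
			/* lost the race: drop the reference, skip this device */
			atomic_dec(&rdev->nr_pending);
			r1_bio->bios[i] = NULL;
		} else
			r1_bio->bios[i] = bio;
	} else
		r1_bio->bios[i] = NULL;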
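The new DM_TARGET_MSG ioctl introduced in dm-ioctl.h above carries a struct dm_target_msg — a 64-bit device sector followed by a flexible message[] array — with the message text appended directly after the fixed header in the ioctl data area. A minimal userspace-style sketch of that layout is shown below; the helper name is hypothetical, and the surrounding dm_ioctl framing (version, data_size, data_start), normally handled by libdevmapper or dmsetup, is omitted.

/*
 * Hypothetical helper, not part of the patch: lay out a dm_target_msg
 * (device sector + NUL-terminated text) as it appears in the ioctl
 * data area.  Real callers normally go through dmsetup/libdevmapper.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct dm_target_msg {
	uint64_t sector;	/* Device sector the message is aimed at */
	char message[0];	/* message text follows the fixed header */
};

static struct dm_target_msg *pack_target_msg(uint64_t sector,
					     const char *text, size_t *len)
{
	struct dm_target_msg *msg;

	*len = sizeof(struct dm_target_msg) + strlen(text) + 1;
	msg = malloc(*len);
	if (!msg)
		return NULL;
	msg->sector = sector;
	strcpy(msg->message, text);
	return msg;
}

From the command line the same path is normally exercised with "dmsetup message <device> <sector> <message>", which targets such as multipath use for runtime control.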