--- linux-2.4.22/Documentation/Configure.help	Mon Nov 17 19:15:53 2003
+++ linux/Documentation/Configure.help	Thu Nov 20 18:29:35 2003
@@ -1856,6 +1856,20 @@
   want), say M here and read .
   The module will be called lvm-mod.o.
 
+Device-mapper support
+CONFIG_BLK_DEV_DM
+  Device-mapper is a low-level volume manager.  It works by allowing
+  people to specify mappings for ranges of logical sectors.  Various
+  mapping types are available; in addition, people may write their own
+  modules containing custom mappings if they wish.
+
+  Higher-level volume managers such as LVM2 use this driver.
+
+  If you want to compile this as a module, say M here and read
+  .  The module will be called dm-mod.o.
+
+  If unsure, say N.
+
 Multiple devices driver support (RAID and LVM)
 CONFIG_MD
   Support multiple physical spindles through a single logical device.
--- linux-2.4.22/MAINTAINERS	Mon Nov 17 19:15:45 2003
+++ linux/MAINTAINERS	Thu Nov 20 18:29:35 2003
@@ -554,6 +554,13 @@
 W:	http://www.debian.org/~dz/i8k/
 S:	Maintained
 
+DEVICE MAPPER
+P:	Joe Thornber
+M:	dm@uk.sistina.com
+L:	linux-LVM@sistina.com
+W:	http://www.sistina.com/lvm
+S:	Maintained
+
 DEVICE NUMBER REGISTRY
 P:	H. Peter Anvin
 M:	hpa@zytor.com
--- linux-2.4.22/arch/mips64/kernel/ioctl32.c	Mon Nov 17 19:16:11 2003
+++ linux/arch/mips64/kernel/ioctl32.c	Thu Nov 20 18:29:35 2003
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include <linux/dm-ioctl.h>
 #include 
 
 #undef __KERNEL__		/* This file was born to be ugly ... */
@@ -1221,6 +1222,22 @@
 	IOCTL32_DEFAULT(STOP_ARRAY_RO),
 	IOCTL32_DEFAULT(RESTART_ARRAY_RW),
 #endif /* CONFIG_MD */
+
+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
+	IOCTL32_DEFAULT(DM_VERSION),
+	IOCTL32_DEFAULT(DM_REMOVE_ALL),
+	IOCTL32_DEFAULT(DM_DEV_CREATE),
+	IOCTL32_DEFAULT(DM_DEV_REMOVE),
+	IOCTL32_DEFAULT(DM_TABLE_LOAD),
+	IOCTL32_DEFAULT(DM_DEV_SUSPEND),
+	IOCTL32_DEFAULT(DM_DEV_RENAME),
+	IOCTL32_DEFAULT(DM_TABLE_DEPS),
+	IOCTL32_DEFAULT(DM_DEV_STATUS),
+	IOCTL32_DEFAULT(DM_TABLE_STATUS),
+	IOCTL32_DEFAULT(DM_DEV_WAIT),
+	IOCTL32_DEFAULT(DM_LIST_DEVICES),
+	IOCTL32_DEFAULT(DM_TABLE_CLEAR),
+#endif /* CONFIG_BLK_DEV_DM */
 
 #ifdef CONFIG_SIBYTE_TBPROF
 	IOCTL32_DEFAULT(SBPROF_ZBSTART),
--- linux-2.4.22/arch/parisc/kernel/ioctl32.c	Mon Nov 17 19:16:11 2003
+++ linux/arch/parisc/kernel/ioctl32.c	Thu Nov 20 18:29:35 2003
@@ -55,6 +55,7 @@
 #define max max
 */
 #include 
 #endif /* LVM */
+#include <linux/dm-ioctl.h>
 #include 
 /* Ugly hack. */
@@ -3423,6 +3424,22 @@
 COMPATIBLE_IOCTL(LV_BMAP)
 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
 #endif /* LVM */
+/* Device-Mapper */
+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
+COMPATIBLE_IOCTL(DM_VERSION)
+COMPATIBLE_IOCTL(DM_REMOVE_ALL)
+COMPATIBLE_IOCTL(DM_DEV_CREATE)
+COMPATIBLE_IOCTL(DM_DEV_REMOVE)
+COMPATIBLE_IOCTL(DM_TABLE_LOAD)
+COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
+COMPATIBLE_IOCTL(DM_DEV_RENAME)
+COMPATIBLE_IOCTL(DM_TABLE_DEPS)
+COMPATIBLE_IOCTL(DM_DEV_STATUS)
+COMPATIBLE_IOCTL(DM_TABLE_STATUS)
+COMPATIBLE_IOCTL(DM_DEV_WAIT)
+COMPATIBLE_IOCTL(DM_LIST_DEVICES)
+COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
+#endif /* CONFIG_BLK_DEV_DM */
 #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE)
 COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC)
 COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID)
--- linux-2.4.22/arch/ppc64/kernel/ioctl32.c	Mon Nov 17 19:16:18 2003
+++ linux/arch/ppc64/kernel/ioctl32.c	Thu Nov 20 18:29:35 2003
@@ -66,6 +66,7 @@
 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
 #include 
 #endif /* LVM */
+#include <linux/dm-ioctl.h>
 #include 
 /* Ugly hack.
*/ @@ -4435,6 +4436,22 @@ COMPATIBLE_IOCTL(NBD_PRINT_DEBUG), COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS), COMPATIBLE_IOCTL(NBD_DISCONNECT), +/* device-mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION), +COMPATIBLE_IOCTL(DM_REMOVE_ALL), +COMPATIBLE_IOCTL(DM_DEV_CREATE), +COMPATIBLE_IOCTL(DM_DEV_REMOVE), +COMPATIBLE_IOCTL(DM_TABLE_LOAD), +COMPATIBLE_IOCTL(DM_DEV_SUSPEND), +COMPATIBLE_IOCTL(DM_DEV_RENAME), +COMPATIBLE_IOCTL(DM_TABLE_DEPS), +COMPATIBLE_IOCTL(DM_DEV_STATUS), +COMPATIBLE_IOCTL(DM_TABLE_STATUS), +COMPATIBLE_IOCTL(DM_DEV_WAIT), +COMPATIBLE_IOCTL(DM_LIST_DEVICES), +COMPATIBLE_IOCTL(DM_TABLE_CLEAR), +#endif /* CONFIG_BLK_DEV_DM */ /* Remove *PRIVATE in 2.5 */ COMPATIBLE_IOCTL(SIOCDEVPRIVATE), COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1), --- linux-2.4.22/arch/s390x/kernel/ioctl32.c Mon Nov 17 19:16:20 2003 +++ linux/arch/s390x/kernel/ioctl32.c Thu Nov 20 18:29:35 2003 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -626,6 +627,20 @@ IOCTL32_DEFAULT(VT_UNLOCKSWITCH), IOCTL32_DEFAULT(SIOCGSTAMP), + + IOCTL32_DEFAULT(DM_VERSION), + IOCTL32_DEFAULT(DM_REMOVE_ALL), + IOCTL32_DEFAULT(DM_DEV_CREATE), + IOCTL32_DEFAULT(DM_DEV_REMOVE), + IOCTL32_DEFAULT(DM_TABLE_LOAD), + IOCTL32_DEFAULT(DM_DEV_SUSPEND), + IOCTL32_DEFAULT(DM_DEV_RENAME), + IOCTL32_DEFAULT(DM_TABLE_DEPS), + IOCTL32_DEFAULT(DM_DEV_STATUS), + IOCTL32_DEFAULT(DM_TABLE_STATUS), + IOCTL32_DEFAULT(DM_DEV_WAIT), + IOCTL32_DEFAULT(DM_LIST_DEVICES), + IOCTL32_DEFAULT(DM_TABLE_CLEAR), IOCTL32_DEFAULT(LOOP_SET_FD), IOCTL32_DEFAULT(LOOP_CLR_FD), --- linux-2.4.22/arch/sparc64/kernel/ioctl32.c Mon Nov 17 19:16:23 2003 +++ linux/arch/sparc64/kernel/ioctl32.c Thu Nov 20 18:29:35 2003 @@ -56,6 +56,7 @@ #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -5086,6 +5087,22 @@ COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS) COMPATIBLE_IOCTL(NBD_DISCONNECT) +/* device-mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_TABLE_LOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_TABLE_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TABLE_STATUS) +COMPATIBLE_IOCTL(DM_DEV_WAIT) +COMPATIBLE_IOCTL(DM_LIST_DEVICES) +COMPATIBLE_IOCTL(DM_TABLE_CLEAR) +#endif /* CONFIG_BLK_DEV_DM */ /* Linux-1394 */ #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE) COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL) --- linux-2.4.22/arch/x86_64/ia32/ia32_ioctl.c Mon Nov 17 19:16:25 2003 +++ linux/arch/x86_64/ia32/ia32_ioctl.c Thu Nov 20 18:29:35 2003 @@ -67,6 +67,7 @@ #define max max #include #endif /* LVM */ +#include #include /* Ugly hack. 
*/ @@ -4047,6 +4048,22 @@ COMPATIBLE_IOCTL(LV_BMAP) COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) #endif /* LVM */ +/* Device-Mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_TABLE_LOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_TABLE_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TABLE_STATUS) +COMPATIBLE_IOCTL(DM_DEV_WAIT) +COMPATIBLE_IOCTL(DM_LIST_DEVICES) +COMPATIBLE_IOCTL(DM_TABLE_CLEAR) +#endif /* CONFIG_BLK_DEV_DM */ #ifdef CONFIG_AUTOFS_FS COMPATIBLE_IOCTL(AUTOFS_IOC_READY) COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL) --- linux-2.4.22/drivers/md/Config.in Mon Nov 17 19:16:45 2003 +++ linux/drivers/md/Config.in Thu Nov 20 18:29:35 2003 @@ -14,5 +14,7 @@ dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD +dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD +dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM endmenu --- linux-2.4.22/drivers/md/Makefile Mon Nov 17 19:16:45 2003 +++ linux/drivers/md/Makefile Thu Nov 20 18:29:35 2003 @@ -4,24 +4,41 @@ O_TARGET := mddev.o -export-objs := md.o xor.o -list-multi := lvm-mod.o +export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \ + dm-log.o dm-io.o dm.o + +list-multi := lvm-mod.o dm-mod.o dm-mirror-mod.o lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o +dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \ + dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \ + kcopyd.o dm-daemon.o dm-io.o +dm-mirror-mod-objs := dm-raid1.o dm-log.o # Note: link order is important. All raid personalities # and xor.o must come before md.o, as they each initialise # themselves, and md.o may use the personalities when it # auto-initialised. -obj-$(CONFIG_MD_LINEAR) += linear.o -obj-$(CONFIG_MD_RAID0) += raid0.o -obj-$(CONFIG_MD_RAID1) += raid1.o -obj-$(CONFIG_MD_RAID5) += raid5.o xor.o -obj-$(CONFIG_MD_MULTIPATH) += multipath.o -obj-$(CONFIG_BLK_DEV_MD) += md.o -obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o +obj-$(CONFIG_MD_LINEAR) += linear.o +obj-$(CONFIG_MD_RAID0) += raid0.o +obj-$(CONFIG_MD_RAID1) += raid1.o +obj-$(CONFIG_MD_RAID5) += raid5.o xor.o +obj-$(CONFIG_MD_MULTIPATH) += multipath.o +obj-$(CONFIG_BLK_DEV_MD) += md.o + +obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o + +obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o include $(TOPDIR)/Rules.make lvm-mod.o: $(lvm-mod-objs) $(LD) -r -o $@ $(lvm-mod-objs) + +dm-mod.o: $(dm-mod-objs) + $(LD) -r -o $@ $(dm-mod-objs) + +dm-mirror.o: $(dm-mirror-mod-objs) + $(LD) -r -o $@ $(dm-mirror-mod-objs) + --- linux-2.4.22/drivers/md/dm-daemon.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-daemon.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. 
+ */ + +#include "dm.h" +#include "dm-daemon.h" + +#include +#include + +static int daemon(void *arg) +{ + struct dm_daemon *dd = (struct dm_daemon *) arg; + DECLARE_WAITQUEUE(wq, current); + + daemonize(); + reparent_to_init(); + + /* block all signals */ + spin_lock_irq(¤t->sigmask_lock); + sigfillset(¤t->blocked); + flush_signals(current); + spin_unlock_irq(¤t->sigmask_lock); + + strcpy(current->comm, dd->name); + atomic_set(&dd->please_die, 0); + + add_wait_queue(&dd->job_queue, &wq); + + down(&dd->run_lock); + up(&dd->start_lock); + + /* + * dd->fn() could do anything, very likely it will + * suspend. So we can't set the state to + * TASK_INTERRUPTIBLE before calling it. In order to + * prevent a race with a waking thread we do this little + * dance with the dd->woken variable. + */ + while (1) { + do { + set_current_state(TASK_RUNNING); + + if (atomic_read(&dd->please_die)) + goto out; + + atomic_set(&dd->woken, 0); + dd->fn(); + yield(); + + set_current_state(TASK_INTERRUPTIBLE); + } while (atomic_read(&dd->woken)); + + schedule(); + } + + out: + remove_wait_queue(&dd->job_queue, &wq); + up(&dd->run_lock); + return 0; +} + +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void)) +{ + pid_t pid = 0; + + /* + * Initialise the dm_daemon. + */ + dd->fn = fn; + strncpy(dd->name, name, sizeof(dd->name) - 1); + sema_init(&dd->start_lock, 1); + sema_init(&dd->run_lock, 1); + init_waitqueue_head(&dd->job_queue); + + /* + * Start the new thread. + */ + down(&dd->start_lock); + pid = kernel_thread(daemon, dd, 0); + if (pid <= 0) { + DMERR("Failed to start %s thread", name); + return -EAGAIN; + } + + /* + * wait for the daemon to up this mutex. + */ + down(&dd->start_lock); + up(&dd->start_lock); + + return 0; +} + +void dm_daemon_stop(struct dm_daemon *dd) +{ + atomic_set(&dd->please_die, 1); + dm_daemon_wake(dd); + down(&dd->run_lock); + up(&dd->run_lock); +} + +void dm_daemon_wake(struct dm_daemon *dd) +{ + atomic_set(&dd->woken, 1); + wake_up_interruptible(&dd->job_queue); +} + +EXPORT_SYMBOL(dm_daemon_start); +EXPORT_SYMBOL(dm_daemon_stop); +EXPORT_SYMBOL(dm_daemon_wake); --- linux-2.4.22/drivers/md/dm-daemon.h Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-daemon.h Thu Nov 20 18:29:35 2003 @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_DAEMON_H +#define DM_DAEMON_H + +#include +#include + +struct dm_daemon { + void (*fn)(void); + char name[16]; + atomic_t please_die; + struct semaphore start_lock; + struct semaphore run_lock; + + atomic_t woken; + wait_queue_head_t job_queue; +}; + +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void)); +void dm_daemon_stop(struct dm_daemon *dd); +void dm_daemon_wake(struct dm_daemon *dd); +int dm_daemon_running(struct dm_daemon *dd); + +#endif --- linux-2.4.22/drivers/md/dm-exception-store.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-exception-store.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,673 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm-snapshot.h" +#include "dm-io.h" +#include "kcopyd.h" + +#include +#include +#include +#include + +/*----------------------------------------------------------------- + * Persistent snapshots, by persistent we mean that the snapshot + * will survive a reboot. 
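For orientation, a minimal sketch of how a client might drive the dm-daemon interface declared above; the client name, thread name and work function here are hypothetical and are not part of this patch:

#include <linux/init.h>
#include "dm-daemon.h"

static struct dm_daemon _example_daemon;	/* hypothetical client state */

/*
 * The work function is called each time the daemon is woken; it should do
 * a bounded amount of work and return, since the daemon loops around it.
 */
static void example_do_work(void)
{
	/* drain a private job list here */
}

static int __init example_init(void)
{
	/* forks a kernel thread called "exampled" and waits for it to start */
	return dm_daemon_start(&_example_daemon, "exampled", example_do_work);
}

static void example_job_arrived(void)
{
	dm_daemon_wake(&_example_daemon);	/* sets 'woken' and wakes the thread */
}

static void __exit example_exit(void)
{
	dm_daemon_stop(&_example_daemon);	/* sets 'please_die', wakes, waits for exit */
}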
+ *---------------------------------------------------------------*/ + +/* + * We need to store a record of which parts of the origin have + * been copied to the snapshot device. The snapshot code + * requires that we copy exception chunks to chunk aligned areas + * of the COW store. It makes sense therefore, to store the + * metadata in chunk size blocks. + * + * There is no backward or forward compatibility implemented, + * snapshots with different disk versions than the kernel will + * not be usable. It is expected that "lvcreate" will blank out + * the start of a fresh COW device before calling the snapshot + * constructor. + * + * The first chunk of the COW device just contains the header. + * After this there is a chunk filled with exception metadata, + * followed by as many exception chunks as can fit in the + * metadata areas. + * + * All on disk structures are in little-endian format. The end + * of the exceptions info is indicated by an exception with a + * new_chunk of 0, which is invalid since it would point to the + * header chunk. + */ + +/* + * Magic for persistent snapshots: "SnAp" - Feeble isn't it. + */ +#define SNAP_MAGIC 0x70416e53 + +/* + * The on-disk version of the metadata. + */ +#define SNAPSHOT_DISK_VERSION 1 + +struct disk_header { + uint32_t magic; + + /* + * Is this snapshot valid. There is no way of recovering + * an invalid snapshot. + */ + uint32_t valid; + + /* + * Simple, incrementing version. no backward + * compatibility. + */ + uint32_t version; + + /* In sectors */ + uint32_t chunk_size; +}; + +struct disk_exception { + uint64_t old_chunk; + uint64_t new_chunk; +}; + +struct commit_callback { + void (*callback)(void *, int success); + void *context; +}; + +/* + * The top level structure for a persistent exception store. + */ +struct pstore { + struct dm_snapshot *snap; /* up pointer to my snapshot */ + int version; + int valid; + uint32_t chunk_size; + uint32_t exceptions_per_area; + + /* + * Now that we have an asynchronous kcopyd there is no + * need for large chunk sizes, so it wont hurt to have a + * whole chunks worth of metadata in memory at once. + */ + void *area; + + /* + * Used to keep track of which metadata area the data in + * 'chunk' refers to. + */ + uint32_t current_area; + + /* + * The next free chunk for an exception. + */ + uint32_t next_free; + + /* + * The index of next free exception in the current + * metadata area. + */ + uint32_t current_committed; + + atomic_t pending_count; + uint32_t callback_count; + struct commit_callback *callbacks; +}; + +static inline unsigned int sectors_to_pages(unsigned int sectors) +{ + return sectors / (PAGE_SIZE / SECTOR_SIZE); +} + +static int alloc_area(struct pstore *ps) +{ + int r = -ENOMEM; + size_t i, len, nr_pages; + struct page *page, *last = NULL; + + len = ps->chunk_size << SECTOR_SHIFT; + + /* + * Allocate the chunk_size block of memory that will hold + * a single metadata area. + */ + ps->area = vmalloc(len); + if (!ps->area) + return r; + + nr_pages = sectors_to_pages(ps->chunk_size); + + /* + * We lock the pages for ps->area into memory since + * they'll be doing a lot of io. We also chain them + * together ready for dm-io. 
+ */ + for (i = 0; i < nr_pages; i++) { + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); + LockPage(page); + if (last) + last->list.next = &page->list; + last = page; + } + + return 0; +} + +static void free_area(struct pstore *ps) +{ + size_t i, nr_pages; + struct page *page; + + nr_pages = sectors_to_pages(ps->chunk_size); + for (i = 0; i < nr_pages; i++) { + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); + page->list.next = NULL; + UnlockPage(page); + } + + vfree(ps->area); +} + +/* + * Read or write a chunk aligned and sized block of data from a device. + */ +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) +{ + struct io_region where; + unsigned int bits; + + where.dev = ps->snap->cow->dev; + where.sector = ps->chunk_size * chunk; + where.count = ps->chunk_size; + + return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits); +} + +/* + * Read or write a metadata area. Remembering to skip the first + * chunk which holds the header. + */ +static int area_io(struct pstore *ps, uint32_t area, int rw) +{ + int r; + uint32_t chunk; + + /* convert a metadata area index to a chunk index */ + chunk = 1 + ((ps->exceptions_per_area + 1) * area); + + r = chunk_io(ps, chunk, rw); + if (r) + return r; + + ps->current_area = area; + return 0; +} + +static int zero_area(struct pstore *ps, uint32_t area) +{ + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + return area_io(ps, area, WRITE); +} + +static int read_header(struct pstore *ps, int *new_snapshot) +{ + int r; + struct disk_header *dh; + + r = chunk_io(ps, 0, READ); + if (r) + return r; + + dh = (struct disk_header *) ps->area; + + if (le32_to_cpu(dh->magic) == 0) { + *new_snapshot = 1; + + } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + ps->chunk_size = le32_to_cpu(dh->chunk_size); + + } else { + DMWARN("Invalid/corrupt snapshot"); + r = -ENXIO; + } + + return r; +} + +static int write_header(struct pstore *ps) +{ + struct disk_header *dh; + + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + + dh = (struct disk_header *) ps->area; + dh->magic = cpu_to_le32(SNAP_MAGIC); + dh->valid = cpu_to_le32(ps->valid); + dh->version = cpu_to_le32(ps->version); + dh->chunk_size = cpu_to_le32(ps->chunk_size); + + return chunk_io(ps, 0, WRITE); +} + +/* + * Access functions for the disk exceptions, these do the endian conversions. + */ +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) +{ + if (index >= ps->exceptions_per_area) + return NULL; + + return ((struct disk_exception *) ps->area) + index; +} + +static int read_exception(struct pstore *ps, + uint32_t index, struct disk_exception *result) +{ + struct disk_exception *e; + + e = get_exception(ps, index); + if (!e) + return -EINVAL; + + /* copy it */ + result->old_chunk = le64_to_cpu(e->old_chunk); + result->new_chunk = le64_to_cpu(e->new_chunk); + + return 0; +} + +static int write_exception(struct pstore *ps, + uint32_t index, struct disk_exception *de) +{ + struct disk_exception *e; + + e = get_exception(ps, index); + if (!e) + return -EINVAL; + + /* copy it */ + e->old_chunk = cpu_to_le64(de->old_chunk); + e->new_chunk = cpu_to_le64(de->new_chunk); + + return 0; +} + +/* + * Registers the exceptions that are present in the current area. + * 'full' is filled in to indicate if the area has been + * filled. 
+ */ +static int insert_exceptions(struct pstore *ps, int *full) +{ + int r; + unsigned int i; + struct disk_exception de; + + /* presume the area is full */ + *full = 1; + + for (i = 0; i < ps->exceptions_per_area; i++) { + r = read_exception(ps, i, &de); + + if (r) + return r; + + /* + * If the new_chunk is pointing at the start of + * the COW device, where the first metadata area + * is we know that we've hit the end of the + * exceptions. Therefore the area is not full. + */ + if (de.new_chunk == 0LL) { + ps->current_committed = i; + *full = 0; + break; + } + + /* + * Keep track of the start of the free chunks. + */ + if (ps->next_free <= de.new_chunk) + ps->next_free = de.new_chunk + 1; + + /* + * Otherwise we add the exception to the snapshot. + */ + r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); + if (r) + return r; + } + + return 0; +} + +static int read_exceptions(struct pstore *ps) +{ + uint32_t area; + int r, full = 1; + + /* + * Keeping reading chunks and inserting exceptions until + * we find a partially full area. + */ + for (area = 0; full; area++) { + r = area_io(ps, area, READ); + if (r) + return r; + + r = insert_exceptions(ps, &full); + if (r) + return r; + } + + return 0; +} + +static inline struct pstore *get_info(struct exception_store *store) +{ + return (struct pstore *) store->context; +} + +static void persistent_fraction_full(struct exception_store *store, + sector_t *numerator, sector_t *denominator) +{ + *numerator = get_info(store)->next_free * store->snap->chunk_size; + *denominator = get_dev_size(store->snap->cow->dev); +} + +static void persistent_destroy(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + + dm_io_put(sectors_to_pages(ps->chunk_size)); + vfree(ps->callbacks); + free_area(ps); + kfree(ps); +} + +static int persistent_read_metadata(struct exception_store *store) +{ + int r, new_snapshot; + struct pstore *ps = get_info(store); + + /* + * Read the snapshot header. + */ + r = read_header(ps, &new_snapshot); + if (r) + return r; + + /* + * Do we need to setup a new snapshot ? + */ + if (new_snapshot) { + r = write_header(ps); + if (r) { + DMWARN("write_header failed"); + return r; + } + + r = zero_area(ps, 0); + if (r) { + DMWARN("zero_area(0) failed"); + return r; + } + + } else { + /* + * Sanity checks. + */ + if (!ps->valid) { + DMWARN("snapshot is marked invalid"); + return -EINVAL; + } + + if (ps->version != SNAPSHOT_DISK_VERSION) { + DMWARN("unable to handle snapshot disk version %d", + ps->version); + return -EINVAL; + } + + /* + * Read the metadata. + */ + r = read_exceptions(ps); + if (r) + return r; + } + + return 0; +} + +static int persistent_prepare(struct exception_store *store, + struct exception *e) +{ + struct pstore *ps = get_info(store); + uint32_t stride; + sector_t size = get_dev_size(store->snap->cow->dev); + + /* Is there enough room ? */ + if (size < ((ps->next_free + 1) * store->snap->chunk_size)) + return -ENOSPC; + + e->new_chunk = ps->next_free; + + /* + * Move onto the next free pending, making sure to take + * into account the location of the metadata chunks. 
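To make the arithmetic below concrete (the numbers are purely illustrative): with a chunk size of 16 sectors (8 KiB) and 16-byte struct disk_exception entries, exceptions_per_area = 8192 / 16 = 512, so the stride is 513 chunks. Chunk 0 holds the header, chunk 1 holds metadata area 0, chunks 2-513 hold the exception data it describes, chunk 514 holds metadata area 1, and so on; next_free starts at 2 and is bumped past every chunk whose index is congruent to 1 modulo 513.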
+ */ + stride = (ps->exceptions_per_area + 1); + if ((++ps->next_free % stride) == 1) + ps->next_free++; + + atomic_inc(&ps->pending_count); + return 0; +} + +static void persistent_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + int r; + unsigned int i; + struct pstore *ps = get_info(store); + struct disk_exception de; + struct commit_callback *cb; + + de.old_chunk = e->old_chunk; + de.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &de); + + /* + * Add the callback to the back of the array. This code + * is the only place where the callback array is + * manipulated, and we know that it will never be called + * multiple times concurrently. + */ + cb = ps->callbacks + ps->callback_count++; + cb->callback = callback; + cb->context = callback_context; + + /* + * If there are no more exceptions in flight, or we have + * filled this metadata area we commit the exceptions to + * disk. + */ + if (atomic_dec_and_test(&ps->pending_count) || + (ps->current_committed == ps->exceptions_per_area)) { + r = area_io(ps, ps->current_area, WRITE); + if (r) + ps->valid = 0; + + for (i = 0; i < ps->callback_count; i++) { + cb = ps->callbacks + i; + cb->callback(cb->context, r == 0 ? 1 : 0); + } + + ps->callback_count = 0; + } + + /* + * Have we completely filled the current area ? + */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + r = zero_area(ps, ps->current_area + 1); + if (r) + ps->valid = 0; + } +} + +static void persistent_drop(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + + ps->valid = 0; + if (write_header(ps)) + DMWARN("write header failed"); +} + +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +{ + int r; + struct pstore *ps; + + r = dm_io_get(sectors_to_pages(chunk_size)); + if (r) + return r; + + /* allocate the pstore */ + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) { + r = -ENOMEM; + goto bad; + } + + ps->snap = store->snap; + ps->valid = 1; + ps->version = SNAPSHOT_DISK_VERSION; + ps->chunk_size = chunk_size; + ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / + sizeof(struct disk_exception); + ps->next_free = 2; /* skipping the header and first area */ + ps->current_committed = 0; + + r = alloc_area(ps); + if (r) + goto bad; + + /* + * Allocate space for all the callbacks. + */ + ps->callback_count = 0; + atomic_set(&ps->pending_count, 0); + ps->callbacks = vcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks)); + + if (!ps->callbacks) { + r = -ENOMEM; + goto bad; + } + + store->destroy = persistent_destroy; + store->read_metadata = persistent_read_metadata; + store->prepare_exception = persistent_prepare; + store->commit_exception = persistent_commit; + store->drop_snapshot = persistent_drop; + store->fraction_full = persistent_fraction_full; + store->context = ps; + + return 0; + + bad: + dm_io_put(sectors_to_pages(chunk_size)); + if (ps) { + if (ps->callbacks) + vfree(ps->callbacks); + + kfree(ps); + } + return r; +} + +/*----------------------------------------------------------------- + * Implementation of the store for non-persistent snapshots. 
+ *---------------------------------------------------------------*/ +struct transient_c { + sector_t next_free; +}; + +void transient_destroy(struct exception_store *store) +{ + kfree(store->context); +} + +int transient_read_metadata(struct exception_store *store) +{ + return 0; +} + +int transient_prepare(struct exception_store *store, struct exception *e) +{ + struct transient_c *tc = (struct transient_c *) store->context; + sector_t size = get_dev_size(store->snap->cow->dev); + + if (size < (tc->next_free + store->snap->chunk_size)) + return -1; + + e->new_chunk = sector_to_chunk(store->snap, tc->next_free); + tc->next_free += store->snap->chunk_size; + + return 0; +} + +void transient_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + /* Just succeed */ + callback(callback_context, 1); +} + +static void transient_fraction_full(struct exception_store *store, + sector_t *numerator, sector_t *denominator) +{ + *numerator = ((struct transient_c *) store->context)->next_free; + *denominator = get_dev_size(store->snap->cow->dev); +} + +int dm_create_transient(struct exception_store *store, + struct dm_snapshot *s, int blocksize) +{ + struct transient_c *tc; + + memset(store, 0, sizeof(*store)); + store->destroy = transient_destroy; + store->read_metadata = transient_read_metadata; + store->prepare_exception = transient_prepare; + store->commit_exception = transient_commit; + store->fraction_full = transient_fraction_full; + store->snap = s; + + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); + if (!tc) + return -ENOMEM; + + tc->next_free = 0; + store->context = tc; + + return 0; +} --- linux-2.4.22/drivers/md/dm-io.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-io.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,361 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the GPL. + */ + +#include "dm-io.h" + +#include +#include +#include +#include +#include + +/* FIXME: can we shrink this ? */ +struct io_context { + int rw; + unsigned int error; + atomic_t count; + struct task_struct *sleeper; + io_notify_fn callback; + void *context; +}; + +/* + * We maintain a pool of buffer heads for dispatching the io. + */ +static unsigned int _num_bhs; +static mempool_t *_buffer_pool; + +/* + * io contexts are only dynamically allocated for asynchronous + * io. Since async io is likely to be the majority of io we'll + * have the same number of io contexts as buffer heads ! (FIXME: + * must reduce this). + */ +mempool_t *_io_pool; + +static void *alloc_bh(int gfp_mask, void *pool_data) +{ + struct buffer_head *bh; + + bh = kmem_cache_alloc(bh_cachep, gfp_mask); + if (bh) { + bh->b_reqnext = NULL; + init_waitqueue_head(&bh->b_wait); + INIT_LIST_HEAD(&bh->b_inode_buffers); + } + + return bh; +} + +static void *alloc_io(int gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct io_context), gfp_mask); +} + +static void free_io(void *element, void *pool_data) +{ + kfree(element); +} + +static unsigned int pages_to_buffers(unsigned int pages) +{ + return 4 * pages; /* too many ? 
*/ +} + +static int resize_pool(unsigned int new_bhs) +{ + int r = 0; + + if (_buffer_pool) { + if (new_bhs == 0) { + /* free off the pools */ + mempool_destroy(_buffer_pool); + mempool_destroy(_io_pool); + _buffer_pool = _io_pool = NULL; + } else { + /* resize the pools */ + r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL); + if (!r) + r = mempool_resize(_io_pool, + new_bhs, GFP_KERNEL); + } + } else { + /* create new pools */ + _buffer_pool = mempool_create(new_bhs, alloc_bh, + mempool_free_slab, bh_cachep); + if (!_buffer_pool) + r = -ENOMEM; + + _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL); + if (!_io_pool) { + mempool_destroy(_buffer_pool); + _buffer_pool = NULL; + r = -ENOMEM; + } + } + + if (!r) + _num_bhs = new_bhs; + + return r; +} + +int dm_io_get(unsigned int num_pages) +{ + return resize_pool(_num_bhs + pages_to_buffers(num_pages)); +} + +void dm_io_put(unsigned int num_pages) +{ + resize_pool(_num_bhs - pages_to_buffers(num_pages)); +} + +/*----------------------------------------------------------------- + * We need to keep track of which region a buffer is doing io + * for. In order to save a memory allocation we store this in an + * unused field of the buffer head, and provide these access + * functions. + * + * FIXME: add compile time check that an unsigned int can fit + * into a pointer. + * + *---------------------------------------------------------------*/ +static inline void bh_set_region(struct buffer_head *bh, unsigned int region) +{ + bh->b_journal_head = (void *) region; +} + +static inline int bh_get_region(struct buffer_head *bh) +{ + return (unsigned int) bh->b_journal_head; +} + +/*----------------------------------------------------------------- + * We need an io object to keep track of the number of bhs that + * have been dispatched for a particular io. + *---------------------------------------------------------------*/ +static void dec_count(struct io_context *io, unsigned int region, int error) +{ + if (error) + set_bit(region, &io->error); + + if (atomic_dec_and_test(&io->count)) { + if (io->sleeper) + wake_up_process(io->sleeper); + + else { + int r = io->error; + io_notify_fn fn = io->callback; + void *context = io->context; + + mempool_free(io, _io_pool); + fn(r, context); + } + } +} + +static void endio(struct buffer_head *bh, int uptodate) +{ + struct io_context *io = (struct io_context *) bh->b_private; + + if (!uptodate && io->rw != WRITE) { + /* + * We need to zero this region, otherwise people + * like kcopyd may write the arbitrary contents + * of the page. + */ + memset(bh->b_data, 0, bh->b_size); + } + + dec_count((struct io_context *) bh->b_private, + bh_get_region(bh), !uptodate); + mempool_free(bh, _buffer_pool); +} + +/* + * Primitives for alignment calculations. + */ +int fls(unsigned n) +{ + return generic_fls32(n); +} + +static inline int log2_floor(unsigned n) +{ + return ffs(n) - 1; +} + +static inline int log2_align(unsigned n) +{ + return fls(n) - 1; +} + +/* + * Returns the next block for io. 
+ */ +static int do_page(kdev_t dev, sector_t *block, sector_t end_block, + unsigned int block_size, + struct page *p, unsigned int offset, + unsigned int region, struct io_context *io) +{ + struct buffer_head *bh; + sector_t b = *block; + sector_t blocks_per_page = PAGE_SIZE / block_size; + unsigned int this_size; /* holds the size of the current io */ + sector_t len; + + if (!blocks_per_page) { + DMERR("dm-io: PAGE_SIZE (%lu) < block_size (%u) unsupported", + PAGE_SIZE, block_size); + return 0; + } + + while ((offset < PAGE_SIZE) && (b != end_block)) { + bh = mempool_alloc(_buffer_pool, GFP_NOIO); + init_buffer(bh, endio, io); + bh_set_region(bh, region); + + /* + * Block size must be a power of 2 and aligned + * correctly. + */ + + len = min(end_block - b, blocks_per_page); + len = min(len, blocks_per_page - offset / block_size); + + if (!len) { + DMERR("dm-io: Invalid offset/block_size (%u/%u).", + offset, block_size); + return 0; + } + + this_size = 1 << log2_align(len); + if (b) + this_size = min(this_size, + (unsigned) 1 << log2_floor(b)); + + /* + * Add in the job offset. + */ + bh->b_blocknr = (b / this_size); + bh->b_size = block_size * this_size; + set_bh_page(bh, p, offset); + bh->b_this_page = bh; + + bh->b_dev = dev; + atomic_set(&bh->b_count, 1); + + bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) | + (1 << BH_Lock)); + + if (io->rw == WRITE) + clear_bit(BH_Dirty, &bh->b_state); + + atomic_inc(&io->count); + submit_bh(io->rw, bh); + + b += this_size; + offset += block_size * this_size; + } + + *block = b; + return (b == end_block); +} + +static void do_region(unsigned int region, struct io_region *where, + struct page *page, unsigned int offset, + struct io_context *io) +{ + unsigned int block_size = get_hardsect_size(where->dev); + unsigned int sblock_size = block_size >> 9; + sector_t block = where->sector / sblock_size; + sector_t end_block = (where->sector + where->count) / sblock_size; + + while (1) { + if (do_page(where->dev, &block, end_block, block_size, + page, offset, region, io)) + break; + + offset = 0; /* only offset the first page */ + + page = list_entry(page->list.next, struct page, list); + } +} + +static void dispatch_io(unsigned int num_regions, struct io_region *where, + struct page *pages, unsigned int offset, + struct io_context *io) +{ + int i; + + for (i = 0; i < num_regions; i++) + if (where[i].count) + do_region(i, where + i, pages, offset, io); + + /* + * Drop the extra refence that we were holding to avoid + * the io being completed too early. + */ + dec_count(io, 0, 0); +} + +/* + * Synchronous io + */ +int dm_io_sync(unsigned int num_regions, struct io_region *where, + int rw, struct page *pages, unsigned int offset, + unsigned int *error_bits) +{ + struct io_context io; + + BUG_ON(num_regions > 1 && rw != WRITE); + + io.rw = rw; + io.error = 0; + atomic_set(&io.count, 1); /* see dispatch_io() */ + io.sleeper = current; + + dispatch_io(num_regions, where, pages, offset, &io); + run_task_queue(&tq_disk); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!atomic_read(&io.count)) + break; + + schedule(); + } + set_current_state(TASK_RUNNING); + + *error_bits = io.error; + return io.error ? 
-EIO : 0; +} + +/* + * Asynchronous io + */ +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + io_notify_fn fn, void *context) +{ + struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO); + + io->rw = rw; + io->error = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->sleeper = NULL; + io->callback = fn; + io->context = context; + + dispatch_io(num_regions, where, pages, offset, io); + return 0; +} + +EXPORT_SYMBOL(dm_io_get); +EXPORT_SYMBOL(dm_io_put); +EXPORT_SYMBOL(dm_io_sync); +EXPORT_SYMBOL(dm_io_async); --- linux-2.4.22/drivers/md/dm-io.h Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-io.h Thu Nov 20 18:29:35 2003 @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the GPL. + */ + +#ifndef _DM_IO_H +#define _DM_IO_H + +#include "dm.h" + +#include + +/* Move these to bitops.h eventually */ +/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */ +/* (c) 2002, D.Phillips and Sistina Software */ +/* Licensed under Version 2 of the GPL */ + +static unsigned generic_fls8(unsigned n) +{ + return n & 0xf0 ? + n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5: + n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2); +} + +static inline unsigned generic_fls16(unsigned n) +{ + return n & 0xff00? generic_fls8(n >> 8) + 8 : generic_fls8(n); +} + +static inline unsigned generic_fls32(unsigned n) +{ + return n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n); +} + +/* FIXME make this configurable */ +#define DM_MAX_IO_REGIONS 8 + +struct io_region { + kdev_t dev; + sector_t sector; + sector_t count; +}; + + +/* + * 'error' is a bitset, with each bit indicating whether an error + * occurred doing io to the corresponding region. + */ +typedef void (*io_notify_fn)(unsigned int error, void *context); + + +/* + * Before anyone uses the IO interface they should call + * dm_io_get(), specifying roughly how many pages they are + * expecting to perform io on concurrently. + * + * This function may block. + */ +int dm_io_get(unsigned int num_pages); +void dm_io_put(unsigned int num_pages); + + +/* + * Synchronous IO. + * + * Please ensure that the rw flag in the next two functions is + * either READ or WRITE, ie. we don't take READA. Any + * regions with a zero count field will be ignored. + */ +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + unsigned int *error_bits); + + +/* + * Aynchronous IO. + * + * The 'where' array may be safely allocated on the stack since + * the function takes a copy. + */ +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + io_notify_fn fn, void *context); + +#endif --- linux-2.4.22/drivers/md/dm-ioctl.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-ioctl.c Thu Nov 20 18:29:54 2003 @@ -0,0 +1,1284 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DM_DRIVER_EMAIL "dm@uk.sistina.com" + +/*----------------------------------------------------------------- + * The ioctl interface needs to be able to look up devices by + * name or uuid. 
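Stepping back to the dm-io interface declared in dm-io.h above, a rough sketch of a synchronous read of a single region; the device, sector range and page here are placeholders and the error handling is deliberately minimal:

#include <linux/fs.h>		/* READ */
#include "dm-io.h"

/*
 * Read eight sectors from 'dev' into 'page' and wait for completion.
 * Pool space is reserved with dm_io_get() and released afterwards.
 */
static int example_read(kdev_t dev, struct page *page)
{
	struct io_region where;
	unsigned int error_bits;
	int r;

	r = dm_io_get(1);		/* roughly one page of concurrent io */
	if (r)
		return r;

	where.dev = dev;
	where.sector = 0;		/* placeholder start sector */
	where.count = 8;		/* placeholder length, in sectors */

	r = dm_io_sync(1, &where, READ, page, 0, &error_bits);

	dm_io_put(1);
	return r;
}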
+ *---------------------------------------------------------------*/ +struct hash_cell { + struct list_head name_list; + struct list_head uuid_list; + + char *name; + char *uuid; + struct mapped_device *md; + struct dm_table *new_map; + + /* I hate devfs */ + devfs_handle_t devfs_entry; +}; + +#define NUM_BUCKETS 64 +#define MASK_BUCKETS (NUM_BUCKETS - 1) +static struct list_head _name_buckets[NUM_BUCKETS]; +static struct list_head _uuid_buckets[NUM_BUCKETS]; + +static devfs_handle_t _dev_dir; +void dm_hash_remove_all(void); + +/* + * Guards access to both hash tables. + */ +static DECLARE_RWSEM(_hash_lock); + +static void init_buckets(struct list_head *buckets) +{ + unsigned int i; + + for (i = 0; i < NUM_BUCKETS; i++) + INIT_LIST_HEAD(buckets + i); +} + +int dm_hash_init(void) +{ + init_buckets(_name_buckets); + init_buckets(_uuid_buckets); + _dev_dir = devfs_mk_dir(0, DM_DIR, NULL); + return 0; +} + +void dm_hash_exit(void) +{ + dm_hash_remove_all(); + devfs_unregister(_dev_dir); +} + +/*----------------------------------------------------------------- + * Hash function: + * We're not really concerned with the str hash function being + * fast since it's only used by the ioctl interface. + *---------------------------------------------------------------*/ +static unsigned int hash_str(const char *str) +{ + const unsigned int hash_mult = 2654435387U; + unsigned int h = 0; + + while (*str) + h = (h + (unsigned int) *str++) * hash_mult; + + return h & MASK_BUCKETS; +} + +/*----------------------------------------------------------------- + * Code for looking up a device by name + *---------------------------------------------------------------*/ +static struct hash_cell *__get_name_cell(const char *str) +{ + struct list_head *tmp; + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each (tmp, _name_buckets + h) { + hc = list_entry(tmp, struct hash_cell, name_list); + if (!strcmp(hc->name, str)) + return hc; + } + + return NULL; +} + +static struct hash_cell *__get_uuid_cell(const char *str) +{ + struct list_head *tmp; + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each (tmp, _uuid_buckets + h) { + hc = list_entry(tmp, struct hash_cell, uuid_list); + if (!strcmp(hc->uuid, str)) + return hc; + } + + return NULL; +} + +/*----------------------------------------------------------------- + * Inserting, removing and renaming a device. + *---------------------------------------------------------------*/ +static inline char *kstrdup(const char *str) +{ + char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); + if (r) + strcpy(r, str); + return r; +} + +static struct hash_cell *alloc_cell(const char *name, const char *uuid, + struct mapped_device *md) +{ + struct hash_cell *hc; + + hc = kmalloc(sizeof(*hc), GFP_KERNEL); + if (!hc) + return NULL; + + hc->name = kstrdup(name); + if (!hc->name) { + kfree(hc); + return NULL; + } + + if (!uuid) + hc->uuid = NULL; + + else { + hc->uuid = kstrdup(uuid); + if (!hc->uuid) { + kfree(hc->name); + kfree(hc); + return NULL; + } + } + + INIT_LIST_HEAD(&hc->name_list); + INIT_LIST_HEAD(&hc->uuid_list); + hc->md = md; + hc->new_map = NULL; + return hc; +} + +static void free_cell(struct hash_cell *hc) +{ + if (hc) { + kfree(hc->name); + kfree(hc->uuid); + kfree(hc); + } +} + +/* + * devfs stuff. 
+ */ +static int register_with_devfs(struct hash_cell *hc) +{ + kdev_t dev = dm_kdev(hc->md); + + hc->devfs_entry = + devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER, + major(dev), minor(dev), + S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP, + &dm_blk_dops, NULL); + + return 0; +} + +static int unregister_with_devfs(struct hash_cell *hc) +{ + devfs_unregister(hc->devfs_entry); + return 0; +} + +/* + * The kdev_t and uuid of a device can never change once it is + * initially inserted. + */ +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) +{ + struct hash_cell *cell; + + /* + * Allocate the new cells. + */ + cell = alloc_cell(name, uuid, md); + if (!cell) + return -ENOMEM; + + /* + * Insert the cell into both hash tables. + */ + down_write(&_hash_lock); + if (__get_name_cell(name)) + goto bad; + + list_add(&cell->name_list, _name_buckets + hash_str(name)); + + if (uuid) { + if (__get_uuid_cell(uuid)) { + list_del(&cell->name_list); + goto bad; + } + list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); + } + register_with_devfs(cell); + dm_get(md); + up_write(&_hash_lock); + + return 0; + + bad: + up_write(&_hash_lock); + free_cell(cell); + return -EBUSY; +} + +void __hash_remove(struct hash_cell *hc) +{ + /* remove from the dev hash */ + list_del(&hc->uuid_list); + list_del(&hc->name_list); + unregister_with_devfs(hc); + dm_put(hc->md); + if (hc->new_map) + dm_table_put(hc->new_map); + free_cell(hc); +} + +void dm_hash_remove_all(void) +{ + int i; + struct hash_cell *hc; + struct list_head *tmp, *n; + + down_write(&_hash_lock); + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_safe (tmp, n, _name_buckets + i) { + hc = list_entry(tmp, struct hash_cell, name_list); + __hash_remove(hc); + } + } + up_write(&_hash_lock); +} + +int dm_hash_rename(const char *old, const char *new) +{ + char *new_name, *old_name; + struct hash_cell *hc; + + /* + * duplicate new. + */ + new_name = kstrdup(new); + if (!new_name) + return -ENOMEM; + + down_write(&_hash_lock); + + /* + * Is new free ? + */ + hc = __get_name_cell(new); + if (hc) { + DMWARN("asked to rename to an already existing name %s -> %s", + old, new); + up_write(&_hash_lock); + kfree(new_name); + return -EBUSY; + } + + /* + * Is there such a device as 'old' ? + */ + hc = __get_name_cell(old); + if (!hc) { + DMWARN("asked to rename a non existent device %s -> %s", + old, new); + up_write(&_hash_lock); + kfree(new_name); + return -ENXIO; + } + + /* + * rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + hc->name = new_name; + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + /* rename the device node in devfs */ + unregister_with_devfs(hc); + register_with_devfs(hc); + + up_write(&_hash_lock); + kfree(old_name); + return 0; +} + +/*----------------------------------------------------------------- + * Implementation of the ioctl commands + *---------------------------------------------------------------*/ +/* + * All the ioctl commands get dispatched to functions with this + * prototype. + */ +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); + +static int remove_all(struct dm_ioctl *param, size_t param_size) +{ + dm_hash_remove_all(); + param->data_size = 0; + return 0; +} + +/* + * Round up the ptr to an 8-byte boundary. 
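The alignment and offset convention introduced here is what userspace has to follow when it walks the results; a sketch of how a caller might iterate the DM_LIST_DEVICES payload built by list_devices() later in this file (the field names come from this file, the walking code itself is an assumption):

#include <stdio.h>
#include <linux/dm-ioctl.h>

/*
 * 'dmi' is the struct dm_ioctl handed to ioctl(fd, DM_LIST_DEVICES, dmi)
 * after the kernel has filled it in.
 */
static void example_walk_names(struct dm_ioctl *dmi)
{
	struct dm_name_list *nl;

	nl = (struct dm_name_list *) ((char *) dmi + dmi->data_start);
	if (!nl->dev)
		return;				/* dev == 0 flags an empty list */

	for (;;) {
		printf("%s (dev %u)\n", nl->name, (unsigned int) nl->dev);
		if (!nl->next)
			break;			/* 'next' is a byte offset; 0 terminates */
		nl = (struct dm_name_list *) ((char *) nl + nl->next);
	}
}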
+ */ +#define ALIGN_MASK 7 +static inline void *align_ptr(void *ptr) +{ + return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); +} + +/* + * Retrieves the data payload buffer from an already allocated + * struct dm_ioctl. + */ +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, + size_t *len) +{ + param->data_start = align_ptr(param + 1) - (void *) param; + + if (param->data_start < param_size) + *len = param_size - param->data_start; + else + *len = 0; + + return ((void *) param) + param->data_start; +} + +static int list_devices(struct dm_ioctl *param, size_t param_size) +{ + unsigned int i; + struct hash_cell *hc; + size_t len, needed = 0; + struct dm_name_list *nl, *old_nl = NULL; + + down_write(&_hash_lock); + + /* + * Loop through all the devices working out how much + * space we need. + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { + needed += sizeof(struct dm_name_list); + needed += strlen(hc->name); + needed += ALIGN_MASK; + } + } + + /* + * Grab our output buffer. + */ + nl = get_result_buffer(param, param_size, &len); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + goto out; + } + param->data_size = param->data_start + needed; + + nl->dev = 0; /* Flags no data */ + + /* + * Now loop through filling out the names. + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { + if (old_nl) + old_nl->next = (uint32_t) ((void *) nl - + (void *) old_nl); + + nl->dev = dm_kdev(hc->md); + nl->next = 0; + strcpy(nl->name, hc->name); + + old_nl = nl; + nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1); + } + } + + out: + up_write(&_hash_lock); + return 0; +} + +static int check_name(const char *name) +{ + if (strchr(name, '/')) { + DMWARN("invalid device name"); + return -EINVAL; + } + + return 0; +} + +/* + * Fills in a dm_ioctl structure, ready for sending back to + * userland. + */ +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) +{ + kdev_t dev = dm_kdev(md); + struct dm_table *table; + struct block_device *bdev; + + param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | + DM_ACTIVE_PRESENT_FLAG); + + if (dm_suspended(md)) + param->flags |= DM_SUSPEND_FLAG; + + param->dev = kdev_t_to_nr(dev); + + if (is_read_only(dev)) + param->flags |= DM_READONLY_FLAG; + + param->event_nr = dm_get_event_nr(md); + + table = dm_get_table(md); + if (table) { + param->flags |= DM_ACTIVE_PRESENT_FLAG; + param->target_count = dm_table_get_num_targets(table); + dm_table_put(table); + } else + param->target_count = 0; + + bdev = bdget(param->dev); + if (!bdev) + return -ENXIO; + param->open_count = bdev->bd_openers; + bdput(bdev); + + return 0; +} + +static int dev_create(struct dm_ioctl *param, size_t param_size) +{ + int r; + kdev_t dev = 0; + struct mapped_device *md; + + r = check_name(param->name); + if (r) + return r; + + if (param->flags & DM_PERSISTENT_DEV_FLAG) + dev = to_kdev_t(param->dev); + + r = dm_create(dev, &md); + if (r) + return r; + + r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); + if (r) { + dm_put(md); + return r; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(md, param); + dm_put(md); + + return r; +} + +/* + * Always use UUID for lookups if it's present, otherwise use name. + */ +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) +{ + return *param->uuid ? 
+ __get_uuid_cell(param->uuid) : __get_name_cell(param->name); +} + +static inline struct mapped_device *find_device(struct dm_ioctl *param) +{ + struct hash_cell *hc; + struct mapped_device *md = NULL; + + down_read(&_hash_lock); + hc = __find_device_hash_cell(param); + if (hc) { + md = hc->md; + + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strncpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1); + else + param->uuid[0] = '\0'; + + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + dm_get(md); + } + up_read(&_hash_lock); + + return md; +} + +static int dev_remove(struct dm_ioctl *param, size_t param_size) +{ + struct hash_cell *hc; + + down_write(&_hash_lock); + hc = __find_device_hash_cell(param); + + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + __hash_remove(hc); + up_write(&_hash_lock); + param->data_size = 0; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. + */ +static int invalid_str(char *str, void *end) +{ + while ((void *) str < end) + if (!*str++) + return 0; + + return -EINVAL; +} + +static int dev_rename(struct dm_ioctl *param, size_t param_size) +{ + int r; + char *new_name = (char *) param + param->data_start; + + if (new_name < (char *) (param + 1) || + invalid_str(new_name, (void *) param + param_size)) { + DMWARN("Invalid new logical volume name supplied."); + return -EINVAL; + } + + r = check_name(new_name); + if (r) + return r; + + param->data_size = 0; + return dm_hash_rename(param->name, new_name); +} + +static int do_suspend(struct dm_ioctl *param) +{ + int r = 0; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (!dm_suspended(md)) + r = dm_suspend(md); + + if (!r) + r = __dev_status(md, param); + + dm_put(md); + return r; +} + +static int do_resume(struct dm_ioctl *param) +{ + int r = 0; + struct hash_cell *hc; + struct mapped_device *md; + struct dm_table *new_map; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + md = hc->md; + dm_get(md); + + new_map = hc->new_map; + hc->new_map = NULL; + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + up_write(&_hash_lock); + + /* Do we need to load a new map ? */ + if (new_map) { + /* Suspend if it isn't already suspended */ + if (!dm_suspended(md)) + dm_suspend(md); + + r = dm_swap_table(md, new_map); + if (r) { + dm_put(md); + dm_table_put(new_map); + return r; + } + + if (dm_table_get_mode(new_map) & FMODE_WRITE) + set_device_ro(dm_kdev(md), 0); + else + set_device_ro(dm_kdev(md), 1); + + dm_table_put(new_map); + } + + if (dm_suspended(md)) + r = dm_resume(md); + + if (!r) + r = __dev_status(md, param); + + dm_put(md); + return r; +} + +/* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. + */ +static int dev_suspend(struct dm_ioctl *param, size_t param_size) +{ + if (param->flags & DM_SUSPEND_FLAG) + return do_suspend(param); + + return do_resume(param); +} + +/* + * Copies device info back to user space, used by + * the create and info ioctls. 
+ */ +static int dev_status(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + dm_put(md); + return r; +} + +/* + * Build up the status struct for each target + */ +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + unsigned int i, num_targets; + struct dm_target_spec *spec; + char *outbuf, *outptr; + status_type_t type; + size_t remaining, len, used = 0; + + outptr = outbuf = get_result_buffer(param, param_size, &len); + + if (param->flags & DM_STATUS_TABLE_FLAG) + type = STATUSTYPE_TABLE; + else + type = STATUSTYPE_INFO; + + /* Get all the target info */ + num_targets = dm_table_get_num_targets(table); + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + + remaining = len - (outptr - outbuf); + if (remaining < sizeof(struct dm_target_spec)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + spec = (struct dm_target_spec *) outptr; + + spec->status = 0; + spec->sector_start = ti->begin; + spec->length = ti->len; + strncpy(spec->target_type, ti->type->name, + sizeof(spec->target_type)); + + outptr += sizeof(struct dm_target_spec); + remaining = len - (outptr - outbuf); + + /* Get the status/table string from the target driver */ + if (ti->type->status) { + if (ti->type->status(ti, type, outptr, remaining)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + } else + outptr[0] = '\0'; + + outptr += strlen(outptr) + 1; + used = param->data_start + (outptr - outbuf); + + align_ptr(outptr); + spec->next = outptr - outbuf; + } + + if (used) + param->data_size = used; + + param->target_count = num_targets; +} + +/* + * Wait for a device to report an event + */ +static int dev_wait(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + struct dm_table *table; + DECLARE_WAITQUEUE(wq, current); + + md = find_device(param); + if (!md) + return -ENXIO; + + /* + * Wait for a notification event + */ + set_current_state(TASK_INTERRUPTIBLE); + if (!dm_add_wait_queue(md, &wq, param->event_nr)) { + schedule(); + dm_remove_wait_queue(md, &wq); + } + set_current_state(TASK_RUNNING); + + /* + * The userland program is going to want to know what + * changed to trigger the event, so we may as well tell + * him and save an ioctl. 
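Put differently, the userland pattern this enables is: remember the event_nr returned by the last status ioctl, pass it in with DM_DEV_WAIT, and block until the device reports a newer event (or return immediately if one has already occurred); the same call then comes back with refreshed status and table information, saving the follow-up DM_TABLE_STATUS round trip.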
+ */ + r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_status(table, param, param_size); + dm_table_put(table); + } + + out: + dm_put(md); + return r; +} + +static inline int get_mode(struct dm_ioctl *param) +{ + int mode = FMODE_READ | FMODE_WRITE; + + if (param->flags & DM_READONLY_FLAG) + mode = FMODE_READ; + + return mode; +} + +static int next_target(struct dm_target_spec *last, uint32_t next, void *end, + struct dm_target_spec **spec, char **target_params) +{ + *spec = (struct dm_target_spec *) ((unsigned char *) last + next); + *target_params = (char *) (*spec + 1); + + if (*spec < (last + 1)) + return -EINVAL; + + return invalid_str(*target_params, end); +} + +static int populate_table(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + int r; + unsigned int i = 0; + struct dm_target_spec *spec = (struct dm_target_spec *) param; + uint32_t next = param->data_start; + void *end = (void *) param + param_size; + char *target_params; + + if (!param->target_count) { + DMWARN("populate_table: no targets specified"); + return -EINVAL; + } + + for (i = 0; i < param->target_count; i++) { + + r = next_target(spec, next, end, &spec, &target_params); + if (r) { + DMWARN("unable to find target"); + return r; + } + + r = dm_table_add_target(table, spec->target_type, + (sector_t) spec->sector_start, + (sector_t) spec->length, + target_params); + if (r) { + DMWARN("error adding target to table"); + return r; + } + + next = spec->next; + } + + return dm_table_complete(table); +} + +static int table_load(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct hash_cell *hc; + struct dm_table *t; + + r = dm_table_create(&t, get_mode(param), param->target_count); + if (r) + return r; + + r = populate_table(t, param, param_size); + if (r) { + dm_table_put(t); + return r; + } + + down_write(&_hash_lock); + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + if (hc->new_map) + dm_table_put(hc->new_map); + hc->new_map = t; + param->flags |= DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(hc->md, param); + up_write(&_hash_lock); + return r; +} + +static int table_clear(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct hash_cell *hc; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + if (hc->new_map) { + dm_table_put(hc->new_map); + hc->new_map = NULL; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(hc->md, param); + up_write(&_hash_lock); + return r; +} + +/* + * Retrieves a list of devices used by a particular dm device. + */ +static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + unsigned int count = 0; + struct list_head *tmp; + size_t len, needed; + struct dm_target_deps *deps; + + deps = get_result_buffer(param, param_size, &len); + + /* + * Count the devices. + */ + list_for_each(tmp, dm_table_get_devices(table)) + count++; + + /* + * Check we have enough space. + */ + needed = sizeof(*deps) + (sizeof(*deps->dev) * count); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + return; + } + + /* + * Fill in the devices. 
+ */ + deps->count = count; + count = 0; + list_for_each(tmp, dm_table_get_devices(table)) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + deps->dev[count++] = dd->bdev->bd_dev; + } + + param->data_size = param->data_start + needed; +} + +static int table_deps(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_deps(table, param, param_size); + dm_table_put(table); + } + + out: + dm_put(md); + return r; +} + +/* + * Return the status of a device as a text string for each + * target. + */ +static int table_status(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_status(table, param, param_size); + dm_table_put(table); + } + + out: + dm_put(md); + return r; +} + +/*----------------------------------------------------------------- + * Implementation of open/close/ioctl on the special char + * device. + *---------------------------------------------------------------*/ +static ioctl_fn lookup_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ + {DM_REMOVE_ALL_CMD, remove_all}, + {DM_LIST_DEVICES_CMD, list_devices}, + + {DM_DEV_CREATE_CMD, dev_create}, + {DM_DEV_REMOVE_CMD, dev_remove}, + {DM_DEV_RENAME_CMD, dev_rename}, + {DM_DEV_SUSPEND_CMD, dev_suspend}, + {DM_DEV_STATUS_CMD, dev_status}, + {DM_DEV_WAIT_CMD, dev_wait}, + + {DM_TABLE_LOAD_CMD, table_load}, + {DM_TABLE_CLEAR_CMD, table_clear}, + {DM_TABLE_DEPS_CMD, table_deps}, + {DM_TABLE_STATUS_CMD, table_status} + }; + + return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; +} + +/* + * As well as checking the version compatibility this always + * copies the kernel interface version out. + */ +static int check_version(unsigned int cmd, struct dm_ioctl *user) +{ + uint32_t version[3]; + int r = 0; + + if (copy_from_user(version, user->version, sizeof(version))) + return -EFAULT; + + if ((DM_VERSION_MAJOR != version[0]) || + (DM_VERSION_MINOR < version[1])) { + DMWARN("ioctl interface mismatch: " + "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", + DM_VERSION_MAJOR, DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL, + version[0], version[1], version[2], cmd); + r = -EINVAL; + } + + /* + * Fill in the kernel version. 
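+ * This is done even if the check above failed, so userspace can
+ * report which interface version the kernel actually supports.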
+ */ + version[0] = DM_VERSION_MAJOR; + version[1] = DM_VERSION_MINOR; + version[2] = DM_VERSION_PATCHLEVEL; + if (copy_to_user(user->version, version, sizeof(version))) + return -EFAULT; + + return r; +} + +static void free_params(struct dm_ioctl *param) +{ + vfree(param); +} + +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param) +{ + struct dm_ioctl tmp, *dmi; + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + if (tmp.data_size < sizeof(tmp)) + return -EINVAL; + + dmi = (struct dm_ioctl *) vmalloc(tmp.data_size); + if (!dmi) + return -ENOMEM; + + if (copy_from_user(dmi, user, tmp.data_size)) { + vfree(dmi); + return -EFAULT; + } + + *param = dmi; + return 0; +} + +static int validate_params(uint cmd, struct dm_ioctl *param) +{ + /* Always clear this flag */ + param->flags &= ~DM_BUFFER_FULL_FLAG; + + /* Ignores parameters */ + if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD) + return 0; + + /* Unless creating, either name or uuid but not both */ + if (cmd != DM_DEV_CREATE_CMD) { + if ((!*param->uuid && !*param->name) || + (*param->uuid && *param->name)) { + DMWARN("one of name or uuid must be supplied, cmd(%u)", + cmd); + return -EINVAL; + } + } + + /* Ensure strings are terminated */ + param->name[DM_NAME_LEN - 1] = '\0'; + param->uuid[DM_UUID_LEN - 1] = '\0'; + + return 0; +} + +static int ctl_ioctl(struct inode *inode, struct file *file, + uint command, ulong u) +{ + int r = 0; + unsigned int cmd; + struct dm_ioctl *param; + struct dm_ioctl *user = (struct dm_ioctl *) u; + ioctl_fn fn = NULL; + size_t param_size; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(command) != DM_IOCTL) + return -ENOTTY; + + cmd = _IOC_NR(command); + + /* + * Check the interface version passed in. This also + * writes out the kernel's interface version. + */ + r = check_version(cmd, user); + if (r) + return r; + + /* + * Nothing more to do for the version command. + */ + if (cmd == DM_VERSION_CMD) + return 0; + + fn = lookup_ioctl(cmd); + if (!fn) { + DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); + return -ENOTTY; + } + + /* + * FIXME: I don't like this, we're trying to avoid low + * memory issues when a device is suspended. + */ + current->flags |= PF_MEMALLOC; + + /* + * Copy the parameters into kernel space. + */ + r = copy_params(user, ¶m); + if (r) { + current->flags &= ~PF_MEMALLOC; + return r; + } + + r = validate_params(cmd, param); + if (r) + goto out; + + param_size = param->data_size; + param->data_size = sizeof(*param); + r = fn(param, param_size); + + /* + * Copy the results back to userland. + */ + if (!r && copy_to_user(user, param, param->data_size)) + r = -EFAULT; + + out: + free_params(param); + current->flags &= ~PF_MEMALLOC; + return r; +} + +static struct file_operations _ctl_fops = { + .ioctl = ctl_ioctl, + .owner = THIS_MODULE, +}; + +static devfs_handle_t _ctl_handle; + +static struct miscdevice _dm_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DM_NAME, + .fops = &_ctl_fops +}; + +/* + * Create misc character device and link to DM_DIR/control. 
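+ * The misc device gets a dynamic minor, so the devfs symlink
+ * created below is what gives it a stable DM_DIR "/control"
+ * name pointing back at the misc device's own devfs entry.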
+ */ +int __init dm_interface_init(void) +{ + int r; + char rname[64]; + + r = dm_hash_init(); + if (r) + return r; + + r = misc_register(&_dm_misc); + if (r) { + DMERR("misc_register failed for control device"); + dm_hash_exit(); + return r; + } + + r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3, + sizeof rname - 3); + if (r == -ENOSYS) + goto done; /* devfs not present */ + + if (r < 0) { + DMERR("devfs_generate_path failed for control device"); + goto failed; + } + + strncpy(rname + r, "../", 3); + r = devfs_mk_symlink(NULL, DM_DIR "/control", + DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL); + if (r) { + DMERR("devfs_mk_symlink failed for control device"); + goto failed; + } + devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle); + + done: + DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, + DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, + DM_DRIVER_EMAIL); + return 0; + + failed: + misc_deregister(&_dm_misc); + dm_hash_exit(); + return r; +} + +void dm_interface_exit(void) +{ + if (misc_deregister(&_dm_misc) < 0) + DMERR("misc_deregister failed for control device"); + + dm_hash_exit(); +} --- linux-2.4.22/drivers/md/dm-linear.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-linear.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include + +/* + * Linear: maps a linear range of a device. + */ +struct linear_c { + struct dm_dev *dev; + sector_t start; +}; + +/* + * Construct a linear mapping: + */ +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct linear_c *lc; + + if (argc != 2) { + ti->error = "dm-linear: Invalid argument count"; + return -EINVAL; + } + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (lc == NULL) { + ti->error = "dm-linear: Cannot allocate linear context"; + return -ENOMEM; + } + + if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { + ti->error = "dm-linear: Invalid device sector"; + goto bad; + } + + if (dm_get_device(ti, argv[0], lc->start, ti->len, + dm_table_get_mode(ti->table), &lc->dev)) { + ti->error = "dm-linear: Device lookup failed"; + goto bad; + } + + ti->private = lc; + return 0; + + bad: + kfree(lc); + return -EINVAL; +} + +static void linear_dtr(struct dm_target *ti) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + dm_put_device(ti, lc->dev); + kfree(lc); +} + +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + bh->b_rdev = lc->dev->dev; + bh->b_rsector = lc->start + (bh->b_rsector - ti->begin); + + return 1; +} + +static int linear_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + kdev_t kdev; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + kdev = to_kdev_t(lc->dev->bdev->bd_dev); + snprintf(result, maxlen, "%s " SECTOR_FORMAT, + dm_kdevname(kdev), lc->start); + break; + } + return 0; +} + +static struct target_type linear_target = { + .name = "linear", + .module = THIS_MODULE, + .ctr = linear_ctr, + .dtr = linear_dtr, + .map = linear_map, + .status = linear_status, +}; + +int __init dm_linear_init(void) +{ + int r = dm_register_target(&linear_target); + + if (r < 0) + DMERR("linear: register failed %d", r); + + return r; +} + +void 
dm_linear_exit(void) +{ + int r = dm_unregister_target(&linear_target); + + if (r < 0) + DMERR("linear: unregister failed %d", r); +} --- linux-2.4.22/drivers/md/dm-log.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-log.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#include +#include +#include +#include + +#include "dm-log.h" +#include "dm-io.h" + +static LIST_HEAD(_log_types); +static spinlock_t _lock = SPIN_LOCK_UNLOCKED; + +int dm_register_dirty_log_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + type->use_count = 0; + if (type->module) + __MOD_INC_USE_COUNT(type->module); + + list_add(&type->list, &_log_types); + spin_unlock(&_lock); + + return 0; +} + +int dm_unregister_dirty_log_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + + if (type->use_count) + DMWARN("Attempt to unregister a log type that is still in use"); + else { + list_del(&type->list); + if (type->module) + __MOD_DEC_USE_COUNT(type->module); + } + + spin_unlock(&_lock); + + return 0; +} + +static struct dirty_log_type *get_type(const char *type_name) +{ + struct dirty_log_type *type; + struct list_head *tmp; + + spin_lock(&_lock); + list_for_each (tmp, &_log_types) { + type = list_entry(tmp, struct dirty_log_type, list); + if (!strcmp(type_name, type->name)) { + type->use_count++; + spin_unlock(&_lock); + return type; + } + } + + spin_unlock(&_lock); + return NULL; +} + +static void put_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + type->use_count--; + spin_unlock(&_lock); +} + +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size, + unsigned int argc, char **argv) +{ + struct dirty_log_type *type; + struct dirty_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return NULL; + + type = get_type(type_name); + if (!type) { + kfree(log); + return NULL; + } + + log->type = type; + if (type->ctr(log, dev_size, argc, argv)) { + kfree(log); + put_type(type); + return NULL; + } + + return log; +} + +void dm_destroy_dirty_log(struct dirty_log *log) +{ + log->type->dtr(log); + put_type(log->type); + kfree(log); +} + + +/*----------------------------------------------------------------- + * In core log, ie. trivial, non-persistent + * + * For now we'll keep this simple and just have 2 bitsets, one + * for clean/dirty, the other for sync/nosync. The sync bitset + * will be freed when everything is in sync. 
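+ * The constructor takes a single argument, the region size in
+ * sectors (e.g. the "1024" in a mirror table's "core 1 1024";
+ * that value is only illustrative).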
+ * + * FIXME: problems with a 64bit sector_t + *---------------------------------------------------------------*/ +struct core_log { + sector_t region_size; + unsigned int region_count; + unsigned long *clean_bits; + unsigned long *sync_bits; + unsigned long *recovering_bits; /* FIXME: this seems excessive */ + + int sync_search; +}; + +#define BYTE_SHIFT 3 + +static int core_ctr(struct dirty_log *log, sector_t dev_size, + unsigned int argc, char **argv) +{ + struct core_log *clog; + sector_t region_size; + unsigned int region_count; + size_t bitset_size; + + if (argc != 1) { + DMWARN("wrong number of arguments to core_log"); + return -EINVAL; + } + + if (sscanf(argv[0], SECTOR_FORMAT, ®ion_size) != 1) { + DMWARN("invalid region size string"); + return -EINVAL; + } + + region_count = dm_div_up(dev_size, region_size); + + clog = kmalloc(sizeof(*clog), GFP_KERNEL); + if (!clog) { + DMWARN("couldn't allocate core log"); + return -ENOMEM; + } + + clog->region_size = region_size; + clog->region_count = region_count; + + /* + * Work out how many words we need to hold the bitset. + */ + bitset_size = dm_round_up(region_count, + sizeof(*clog->clean_bits) << BYTE_SHIFT); + bitset_size >>= BYTE_SHIFT; + + clog->clean_bits = vmalloc(bitset_size); + if (!clog->clean_bits) { + DMWARN("couldn't allocate clean bitset"); + kfree(clog); + return -ENOMEM; + } + memset(clog->clean_bits, -1, bitset_size); + + clog->sync_bits = vmalloc(bitset_size); + if (!clog->sync_bits) { + DMWARN("couldn't allocate sync bitset"); + vfree(clog->clean_bits); + kfree(clog); + return -ENOMEM; + } + memset(clog->sync_bits, 0, bitset_size); + + clog->recovering_bits = vmalloc(bitset_size); + if (!clog->recovering_bits) { + DMWARN("couldn't allocate sync bitset"); + vfree(clog->sync_bits); + vfree(clog->clean_bits); + kfree(clog); + return -ENOMEM; + } + memset(clog->recovering_bits, 0, bitset_size); + clog->sync_search = 0; + log->context = clog; + return 0; +} + +static void core_dtr(struct dirty_log *log) +{ + struct core_log *clog = (struct core_log *) log->context; + vfree(clog->clean_bits); + vfree(clog->sync_bits); + vfree(clog->recovering_bits); + kfree(clog); +} + +static sector_t core_get_region_size(struct dirty_log *log) +{ + struct core_log *clog = (struct core_log *) log->context; + return clog->region_size; +} + +static int core_is_clean(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + return test_bit(region, clog->clean_bits); +} + +static int core_in_sync(struct dirty_log *log, region_t region, int block) +{ + struct core_log *clog = (struct core_log *) log->context; + + return test_bit(region, clog->sync_bits) ? 
1 : 0; +} + +static int core_flush(struct dirty_log *log) +{ + /* no op */ + return 0; +} + +static void core_mark_region(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + clear_bit(region, clog->clean_bits); +} + +static void core_clear_region(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + set_bit(region, clog->clean_bits); +} + +static int core_get_resync_work(struct dirty_log *log, region_t *region) +{ + struct core_log *clog = (struct core_log *) log->context; + + if (clog->sync_search >= clog->region_count) + return 0; + + do { + *region = find_next_zero_bit(clog->sync_bits, + clog->region_count, + clog->sync_search); + clog->sync_search = *region + 1; + + if (*region == clog->region_count) + return 0; + + } while (test_bit(*region, clog->recovering_bits)); + + set_bit(*region, clog->recovering_bits); + return 1; +} + +static void core_complete_resync_work(struct dirty_log *log, region_t region, + int success) +{ + struct core_log *clog = (struct core_log *) log->context; + + clear_bit(region, clog->recovering_bits); + if (success) + set_bit(region, clog->sync_bits); +} + +static struct dirty_log_type _core_type = { + .name = "core", + + .ctr = core_ctr, + .dtr = core_dtr, + .get_region_size = core_get_region_size, + .is_clean = core_is_clean, + .in_sync = core_in_sync, + .flush = core_flush, + .mark_region = core_mark_region, + .clear_region = core_clear_region, + .get_resync_work = core_get_resync_work, + .complete_resync_work = core_complete_resync_work +}; + +__init int dm_dirty_log_init(void) +{ + int r; + + r = dm_register_dirty_log_type(&_core_type); + if (r) + DMWARN("couldn't register core log"); + + return r; +} + +void dm_dirty_log_exit(void) +{ + dm_unregister_dirty_log_type(&_core_type); +} + +EXPORT_SYMBOL(dm_register_dirty_log_type); +EXPORT_SYMBOL(dm_unregister_dirty_log_type); +EXPORT_SYMBOL(dm_dirty_log_init); +EXPORT_SYMBOL(dm_dirty_log_exit); +EXPORT_SYMBOL(dm_create_dirty_log); +EXPORT_SYMBOL(dm_destroy_dirty_log); --- linux-2.4.22/drivers/md/dm-log.h Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-log.h Thu Nov 20 18:29:35 2003 @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_DIRTY_LOG +#define DM_DIRTY_LOG + +#include "dm.h" + +typedef sector_t region_t; + +struct dirty_log_type; + +struct dirty_log { + struct dirty_log_type *type; + void *context; +}; + +struct dirty_log_type { + struct list_head list; + const char *name; + struct module *module; + unsigned int use_count; + + int (*ctr)(struct dirty_log *log, sector_t dev_size, + unsigned int argc, char **argv); + void (*dtr)(struct dirty_log *log); + + /* + * Retrieves the smallest size of region that the log can + * deal with. + */ + sector_t (*get_region_size)(struct dirty_log *log); + + /* + * A predicate to say whether a region is clean or not. + * May block. + */ + int (*is_clean)(struct dirty_log *log, region_t region); + + /* + * Returns: 0, 1, -EWOULDBLOCK, < 0 + * + * A predicate function to check the area given by + * [sector, sector + len) is in sync. + * + * If -EWOULDBLOCK is returned the state of the region is + * unknown, typically this will result in a read being + * passed to a daemon to deal with, since a daemon is + * allowed to block. + */ + int (*in_sync)(struct dirty_log *log, region_t region, int can_block); + + /* + * Flush the current log state (eg, to disk). This + * function may block. 
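+ * (The in-core "core" log has nothing persistent to flush, so
+ * its flush method is a no-op that returns 0.)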
+ */ + int (*flush)(struct dirty_log *log); + + /* + * Mark an area as clean or dirty. These functions may + * block, though for performance reasons blocking should + * be extremely rare (eg, allocating another chunk of + * memory for some reason). + */ + void (*mark_region)(struct dirty_log *log, region_t region); + void (*clear_region)(struct dirty_log *log, region_t region); + + /* + * Returns: <0 (error), 0 (no region), 1 (region) + * + * The mirrord will need perform recovery on regions of + * the mirror that are in the NOSYNC state. This + * function asks the log to tell the caller about the + * next region that this machine should recover. + * + * Do not confuse this function with 'in_sync()', one + * tells you if an area is synchronised, the other + * assigns recovery work. + */ + int (*get_resync_work)(struct dirty_log *log, region_t *region); + + /* + * This notifies the log that the resync of an area has + * been completed. The log should then mark this region + * as CLEAN. + */ + void (*complete_resync_work)(struct dirty_log *log, + region_t region, int success); +}; + +int dm_register_dirty_log_type(struct dirty_log_type *type); +int dm_unregister_dirty_log_type(struct dirty_log_type *type); + + +/* + * Make sure you use these two functions, rather than calling + * type->constructor/destructor() directly. + */ +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size, + unsigned int argc, char **argv); +void dm_destroy_dirty_log(struct dirty_log *log); + +/* + * init/exit functions. + */ +int dm_dirty_log_init(void); +void dm_dirty_log_exit(void); + +#endif --- linux-2.4.22/drivers/md/dm-raid1.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-raid1.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,1294 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-daemon.h" +#include "dm-io.h" +#include "dm-log.h" +#include "kcopyd.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct dm_daemon _kmirrord; + +/*----------------------------------------------------------------- + * buffer lists: + * + * We play with singly linked lists of buffers, but we want to be + * careful to add new buffers to the back of the list, to avoid + * buffers being starved of attention. + *---------------------------------------------------------------*/ +struct buffer_list { + struct buffer_head *head; + struct buffer_head *tail; +}; + +static inline void buffer_list_init(struct buffer_list *bl) +{ + bl->head = bl->tail = NULL; +} + +static inline void buffer_list_add(struct buffer_list *bl, + struct buffer_head *bh) +{ + bh->b_reqnext = NULL; + + if (bl->tail) { + bl->tail->b_reqnext = bh; + bl->tail = bh; + } else + bl->head = bl->tail = bh; +} + +static struct buffer_head *buffer_list_pop(struct buffer_list *bl) +{ + struct buffer_head *bh = bl->head; + + if (bh) { + bl->head = bl->head->b_reqnext; + if (!bl->head) + bl->tail = NULL; + + bh->b_reqnext = NULL; + } + + return bh; +} + +/*----------------------------------------------------------------- + * Region hash + * + * The mirror splits itself up into discrete regions. Each + * region can be in one of three states: clean, dirty, + * nosync. There is no need to put clean regions in the hash. + * + * In addition to being present in the hash table a region _may_ + * be present on one of three lists. 
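+ * The three lists are described below; in terms of region
+ * states, a region goes RH_CLEAN -> RH_DIRTY on the first write
+ * to it, back to RH_CLEAN once its pending count drops to zero,
+ * and RH_NOSYNC -> RH_RECOVERING while it is being resynced.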
+ * + * clean_regions: Regions on this list have no io pending to + * them, they are in sync, we are no longer interested in them, + * they are dull. rh_update_states() will remove them from the + * hash table. + * + * quiesced_regions: These regions have been spun down, ready + * for recovery. rh_recovery_start() will remove regions from + * this list and hand them to kmirrord, which will schedule the + * recovery io with kcopyd. + * + * recovered_regions: Regions that kcopyd has successfully + * recovered. rh_update_states() will now schedule any delayed + * io, up the recovery_count, and remove the region from the + * hash. + * + * There are 2 locks: + * A rw spin lock 'hash_lock' protects just the hash table, + * this is never held in write mode from interrupt context, + * which I believe means that we only have to disable irqs when + * doing a write lock. + * + * An ordinary spin lock 'region_lock' that protects the three + * lists in the region_hash, with the 'state', 'list' and + * 'bhs_delayed' fields of the regions. This is used from irq + * context, so all other uses will have to suspend local irqs. + *---------------------------------------------------------------*/ +struct mirror_set; +struct region_hash { + struct mirror_set *ms; + sector_t region_size; + + /* holds persistent region state */ + struct dirty_log *log; + + /* hash table */ + rwlock_t hash_lock; + mempool_t *region_pool; + unsigned int mask; + unsigned int nr_buckets; + struct list_head *buckets; + + spinlock_t region_lock; + struct semaphore recovery_count; + struct list_head clean_regions; + struct list_head quiesced_regions; + struct list_head recovered_regions; +}; + +enum { + RH_CLEAN, + RH_DIRTY, + RH_NOSYNC, + RH_RECOVERING +}; + +struct region { + struct region_hash *rh; /* FIXME: can we get rid of this ? */ + region_t key; + int state; + + struct list_head hash_list; + struct list_head list; + + atomic_t pending; + struct buffer_head *delayed_bhs; +}; + +/* + * Conversion fns + */ +static inline region_t bh_to_region(struct region_hash *rh, + struct buffer_head *bh) +{ + return bh->b_rsector / rh->region_size; +} + +static inline sector_t region_to_sector(struct region_hash *rh, region_t region) +{ + return region * rh->region_size; +} + +/* FIXME move this */ +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw); + +static void *region_alloc(int gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct region), gfp_mask); +} + +static void region_free(void *element, void *pool_data) +{ + kfree(element); +} + +#define MIN_REGIONS 64 +#define MAX_RECOVERY 1 +static int rh_init(struct region_hash *rh, struct mirror_set *ms, + struct dirty_log *log, sector_t region_size, + region_t nr_regions) +{ + unsigned int nr_buckets, max_buckets; + size_t i; + + /* + * Calculate a suitable number of buckets for our hash + * table. 
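+ * Double from 128 until we reach nr_regions / 64, then halve
+ * once; keeping the count a power of two lets (nr_buckets - 1)
+ * serve as the hash mask below.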
+ */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + + rh->ms = ms; + rh->log = log; + rh->region_size = region_size; + rwlock_init(&rh->hash_lock); + rh->mask = nr_buckets - 1; + rh->nr_buckets = nr_buckets; + + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); + if (!rh->buckets) { + DMERR("unable to allocate region hash memory"); + return -ENOMEM; + } + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(rh->buckets + i); + + spin_lock_init(&rh->region_lock); + sema_init(&rh->recovery_count, 0); + INIT_LIST_HEAD(&rh->clean_regions); + INIT_LIST_HEAD(&rh->quiesced_regions); + INIT_LIST_HEAD(&rh->recovered_regions); + + rh->region_pool = mempool_create(MIN_REGIONS, region_alloc, + region_free, NULL); + if (!rh->region_pool) { + vfree(rh->buckets); + rh->buckets = NULL; + return -ENOMEM; + } + + return 0; +} + +static void rh_exit(struct region_hash *rh) +{ + unsigned int h; + struct region *reg; + struct list_head *tmp, *tmp2; + + BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { + list_for_each_safe (tmp, tmp2, rh->buckets + h) { + reg = list_entry(tmp, struct region, hash_list); + BUG_ON(atomic_read(®->pending)); + mempool_free(reg, rh->region_pool); + } + } + + if (rh->log) + dm_destroy_dirty_log(rh->log); + if (rh->region_pool) + mempool_destroy(rh->region_pool); + vfree(rh->buckets); +} + +#define RH_HASH_MULT 2654435387U + +static inline unsigned int rh_hash(struct region_hash *rh, region_t region) +{ + return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask; +} + +static struct region *__rh_lookup(struct region_hash *rh, region_t region) +{ + struct region *reg; + + list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list) + if (reg->key == region) + return reg; + + return NULL; +} + +static void __rh_insert(struct region_hash *rh, struct region *reg) +{ + unsigned int h = rh_hash(rh, reg->key); + list_add(®->hash_list, rh->buckets + h); +} + +static struct region *__rh_alloc(struct region_hash *rh, region_t region) +{ + struct region *reg, *nreg; + + read_unlock(&rh->hash_lock); + nreg = mempool_alloc(rh->region_pool, GFP_NOIO); + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? + RH_CLEAN : RH_NOSYNC; + nreg->rh = rh; + nreg->key = region; + + INIT_LIST_HEAD(&nreg->list); + + atomic_set(&nreg->pending, 0); + nreg->delayed_bhs = NULL; + write_lock_irq(&rh->hash_lock); + + reg = __rh_lookup(rh, region); + if (reg) + /* we lost the race */ + mempool_free(nreg, rh->region_pool); + + else { + __rh_insert(rh, nreg); + if (nreg->state == RH_CLEAN) { + spin_lock_irq(&rh->region_lock); + list_add(&nreg->list, &rh->clean_regions); + spin_unlock_irq(&rh->region_lock); + } + reg = nreg; + } + write_unlock_irq(&rh->hash_lock); + read_lock(&rh->hash_lock); + + return reg; +} + +static inline struct region *__rh_find(struct region_hash *rh, region_t region) +{ + struct region *reg; + + reg = __rh_lookup(rh, region); + if (!reg) + reg = __rh_alloc(rh, region); + + return reg; +} + +static int rh_state(struct region_hash *rh, region_t region, int may_block) +{ + int r; + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (reg) + return reg->state; + + /* + * The region wasn't in the hash, so we fall back to the + * dirty log. + */ + r = rh->log->type->in_sync(rh->log, region, may_block); + + /* + * Any error from the dirty log (eg. 
-EWOULDBLOCK) gets + * taken as a RH_NOSYNC + */ + return r == 1 ? RH_CLEAN : RH_NOSYNC; +} + +static inline int rh_in_sync(struct region_hash *rh, + region_t region, int may_block) +{ + int state = rh_state(rh, region, may_block); + return state == RH_CLEAN || state == RH_DIRTY; +} + +static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh) +{ + struct buffer_head *nbh; + + while (bh) { + nbh = bh->b_reqnext; + queue_bh(ms, bh, WRITE); + bh = nbh; + } +} + +static void rh_update_states(struct region_hash *rh) +{ + struct list_head *tmp, *tmp2; + struct region *reg; + + LIST_HEAD(clean); + LIST_HEAD(recovered); + + /* + * Quickly grab the lists. + */ + write_lock_irq(&rh->hash_lock); + spin_lock(&rh->region_lock); + if (!list_empty(&rh->clean_regions)) { + list_splice(&rh->clean_regions, &clean); + INIT_LIST_HEAD(&rh->clean_regions); + + list_for_each_entry (reg, &clean, list) { + rh->log->type->clear_region(rh->log, reg->key); + list_del(®->hash_list); + } + } + + if (!list_empty(&rh->recovered_regions)) { + list_splice(&rh->recovered_regions, &recovered); + INIT_LIST_HEAD(&rh->recovered_regions); + + list_for_each_entry (reg, &recovered, list) + list_del(®->hash_list); + } + spin_unlock(&rh->region_lock); + write_unlock_irq(&rh->hash_lock); + + /* + * All the regions on the recovered and clean lists have + * now been pulled out of the system, so no need to do + * any more locking. + */ + list_for_each_safe (tmp, tmp2, &recovered) { + reg = list_entry(tmp, struct region, list); + + rh->log->type->complete_resync_work(rh->log, reg->key, 1); + dispatch_buffers(rh->ms, reg->delayed_bhs); + up(&rh->recovery_count); + mempool_free(reg, rh->region_pool); + } + + list_for_each_safe (tmp, tmp2, &clean) { + reg = list_entry(tmp, struct region, list); + mempool_free(reg, rh->region_pool); + } +} + +static void rh_inc(struct region_hash *rh, region_t region) +{ + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + if (reg->state == RH_CLEAN) { + rh->log->type->mark_region(rh->log, reg->key); + + spin_lock_irq(&rh->region_lock); + reg->state = RH_DIRTY; + list_del_init(®->list); /* take off the clean list */ + spin_unlock_irq(&rh->region_lock); + } + + atomic_inc(®->pending); + read_unlock(&rh->hash_lock); +} + +static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers) +{ + struct buffer_head *bh; + + for (bh = buffers->head; bh; bh = bh->b_reqnext) + rh_inc(rh, bh_to_region(rh, bh)); +} + +static void rh_dec(struct region_hash *rh, region_t region) +{ + unsigned long flags; + struct region *reg; + int wake = 0; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (atomic_dec_and_test(®->pending)) { + spin_lock_irqsave(&rh->region_lock, flags); + if (reg->state == RH_RECOVERING) { + list_add_tail(®->list, &rh->quiesced_regions); + } else { + reg->state = RH_CLEAN; + list_add(®->list, &rh->clean_regions); + } + spin_unlock_irqrestore(&rh->region_lock, flags); + wake = 1; + } + + if (wake) + dm_daemon_wake(&_kmirrord); +} + +/* + * Starts quiescing a region in preparation for recovery. + */ +static int __rh_recovery_prepare(struct region_hash *rh) +{ + int r; + struct region *reg; + region_t region; + + /* + * Ask the dirty log what's next. + */ + r = rh->log->type->get_resync_work(rh->log, ®ion); + if (r <= 0) + return r; + + /* + * Get this region, and start it quiescing by setting the + * recovering flag. 
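+ * If there is no io pending, the region can go straight onto
+ * the quiesced list; otherwise rh_dec() moves it there once the
+ * last outstanding io completes.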
+ */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irq(&rh->region_lock); + reg->state = RH_RECOVERING; + + /* Already quiesced ? */ + if (atomic_read(®->pending)) + list_del_init(®->list); + + else { + list_del_init(®->list); + list_add(®->list, &rh->quiesced_regions); + } + spin_unlock_irq(&rh->region_lock); + + return 1; +} + +static void rh_recovery_prepare(struct region_hash *rh) +{ + while (!down_trylock(&rh->recovery_count)) + if (__rh_recovery_prepare(rh) <= 0) { + up(&rh->recovery_count); + break; + } +} + +/* + * Returns any quiesced regions. + */ +static struct region *rh_recovery_start(struct region_hash *rh) +{ + struct region *reg = NULL; + + spin_lock_irq(&rh->region_lock); + if (!list_empty(&rh->quiesced_regions)) { + reg = list_entry(rh->quiesced_regions.next, + struct region, list); + list_del_init(®->list); /* remove from the quiesced list */ + } + spin_unlock_irq(&rh->region_lock); + + return reg; +} + +/* FIXME: success ignored for now */ +static void rh_recovery_end(struct region *reg, int success) +{ + struct region_hash *rh = reg->rh; + + spin_lock_irq(&rh->region_lock); + list_add(®->list, ®->rh->recovered_regions); + spin_unlock_irq(&rh->region_lock); + + dm_daemon_wake(&_kmirrord); +} + +static void rh_flush(struct region_hash *rh) +{ + rh->log->type->flush(rh->log); +} + +static void rh_delay(struct region_hash *rh, struct buffer_head *bh) +{ + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, bh_to_region(rh, bh)); + bh->b_reqnext = reg->delayed_bhs; + reg->delayed_bhs = bh; + read_unlock(&rh->hash_lock); +} + +static void rh_stop_recovery(struct region_hash *rh) +{ + int i; + + /* wait for any recovering regions */ + for (i = 0; i < MAX_RECOVERY; i++) + down(&rh->recovery_count); +} + +static void rh_start_recovery(struct region_hash *rh) +{ + int i; + + for (i = 0; i < MAX_RECOVERY; i++) + up(&rh->recovery_count); + + dm_daemon_wake(&_kmirrord); +} + +/*----------------------------------------------------------------- + * Mirror set structures. + *---------------------------------------------------------------*/ +struct mirror { + atomic_t error_count; + struct dm_dev *dev; + sector_t offset; +}; + +struct mirror_set { + struct dm_target *ti; + struct list_head list; + struct region_hash rh; + struct kcopyd_client *kcopyd_client; + + spinlock_t lock; /* protects the next two lists */ + struct buffer_list reads; + struct buffer_list writes; + + /* recovery */ + region_t nr_regions; + region_t sync_count; + + unsigned int nr_mirrors; + struct mirror mirror[0]; +}; + +/* + * Every mirror should look like this one. + */ +#define DEFAULT_MIRROR 0 + +/* + * This is yucky. We squirrel the mirror_set struct away inside + * b_reqnext for write buffers. This is safe since the bh + * doesn't get submitted to the lower levels of block layer. + */ +static struct mirror_set *bh_get_ms(struct buffer_head *bh) +{ + return (struct mirror_set *) bh->b_reqnext; +} + +static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms) +{ + bh->b_reqnext = (struct buffer_head *) ms; +} + +/*----------------------------------------------------------------- + * Recovery. + * + * When a mirror is first activated we may find that some regions + * are in the no-sync state. We have to recover these by + * recopying from the default mirror to all the others. 
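+ * kcopyd does the copying a region at a time; once the last
+ * region is in sync a table event is raised, which userspace
+ * can pick up via DM_DEV_WAIT.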
+ *---------------------------------------------------------------*/ +static void recovery_complete(int read_err, unsigned int write_err, + void *context) +{ + struct region *reg = (struct region *) context; + struct mirror_set *ms = reg->rh->ms; + + /* FIXME: better error handling */ + rh_recovery_end(reg, read_err || write_err); + if (++ms->sync_count == ms->nr_regions) + /* the sync is complete */ + dm_table_event(ms->ti->table); +} + +static int recover(struct mirror_set *ms, struct region *reg) +{ + int r; + unsigned int i; + struct io_region from, to[ms->nr_mirrors - 1], *dest; + struct mirror *m; + unsigned int flags = 0; + + /* fill in the source */ + m = ms->mirror + DEFAULT_MIRROR; + from.dev = m->dev->dev; + from.sector = m->offset + region_to_sector(reg->rh, reg->key); + if (reg->key == (ms->nr_regions - 1)) { + /* + * The final region may be smaller than + * region_size. + */ + from.count = ms->ti->len & (reg->rh->region_size - 1); + if (!from.count) + from.count = reg->rh->region_size; + } else + from.count = reg->rh->region_size; + + /* fill in the destinations */ + for (i = 1; i < ms->nr_mirrors; i++) { + m = ms->mirror + i; + dest = to + (i - 1); + + dest->dev = m->dev->dev; + dest->sector = m->offset + region_to_sector(reg->rh, reg->key); + dest->count = from.count; + } + + /* hand to kcopyd */ + set_bit(KCOPYD_IGNORE_ERROR, &flags); + r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, + recovery_complete, reg); + + return r; +} + +static void do_recovery(struct mirror_set *ms) +{ + int r; + struct region *reg; + + /* + * Start quiescing some regions. + */ + rh_recovery_prepare(&ms->rh); + + /* + * Copy any already quiesced regions. + */ + while ((reg = rh_recovery_start(&ms->rh))) { + r = recover(ms, reg); + if (r) + rh_recovery_end(reg, 0); + } +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) +{ + /* FIXME: add read balancing */ + return ms->mirror + DEFAULT_MIRROR; +} + +/* + * remap a buffer to a particular mirror. + */ +static void map_buffer(struct mirror_set *ms, + struct mirror *m, struct buffer_head *bh) +{ + bh->b_rdev = m->dev->dev; + bh->b_rsector = m->offset + (bh->b_rsector - ms->ti->begin); +} + +static void do_reads(struct mirror_set *ms, struct buffer_list *reads) +{ + region_t region; + struct buffer_head *bh; + struct mirror *m; + + while ((bh = buffer_list_pop(reads))) { + region = bh_to_region(&ms->rh, bh); + + /* + * We can only read balance if the region is in sync. + */ + if (rh_in_sync(&ms->rh, region, 0)) + m = choose_mirror(ms, bh->b_rsector); + else + m = ms->mirror + DEFAULT_MIRROR; + + map_buffer(ms, m, bh); + generic_make_request(READ, bh); + } +} + +/*----------------------------------------------------------------- + * Writes. 
+ * + * We do different things with the write io depending on the + * state of the region that it's in: + * + * SYNC: increment pending, use kcopyd to write to *all* mirrors + * RECOVERING: delay the io until recovery completes + * NOSYNC: increment pending, just write to the default mirror + *---------------------------------------------------------------*/ +static void write_callback(unsigned int error, void *context) +{ + unsigned int i; + int uptodate = 1; + struct buffer_head *bh = (struct buffer_head *) context; + struct mirror_set *ms; + + ms = bh_get_ms(bh); + bh_set_ms(bh, NULL); + + /* + * NOTE: We don't decrement the pending count here, + * instead it is done by the targets endio function. + * This way we handle both writes to SYNC and NOSYNC + * regions with the same code. + */ + + if (error) { + /* + * only error the io if all mirrors failed. + * FIXME: bogus + */ + uptodate = 0; + for (i = 0; i < ms->nr_mirrors; i++) + if (!test_bit(i, &error)) { + uptodate = 1; + break; + } + } + bh->b_end_io(bh, uptodate); +} + +static void do_write(struct mirror_set *ms, struct buffer_head *bh) +{ + unsigned int i; + struct io_region io[ms->nr_mirrors]; + struct mirror *m; + + for (i = 0; i < ms->nr_mirrors; i++) { + m = ms->mirror + i; + + io[i].dev = m->dev->dev; + io[i].sector = m->offset + (bh->b_rsector - ms->ti->begin); + io[i].count = bh->b_size >> 9; + } + + bh_set_ms(bh, ms); + dm_io_async(ms->nr_mirrors, io, WRITE, bh->b_page, + (unsigned int) bh->b_data & ~PAGE_MASK, write_callback, bh); +} + +static void do_writes(struct mirror_set *ms, struct buffer_list *writes) +{ + int state; + struct buffer_head *bh; + struct buffer_list sync, nosync, recover, *this_list = NULL; + + if (!writes->head) + return; + + /* + * Classify each write. + */ + buffer_list_init(&sync); + buffer_list_init(&nosync); + buffer_list_init(&recover); + + while ((bh = buffer_list_pop(writes))) { + state = rh_state(&ms->rh, bh_to_region(&ms->rh, bh), 1); + switch (state) { + case RH_CLEAN: + case RH_DIRTY: + this_list = &sync; + break; + + case RH_NOSYNC: + this_list = &nosync; + break; + + case RH_RECOVERING: + this_list = &recover; + break; + } + + buffer_list_add(this_list, bh); + } + + /* + * Increment the pending counts for any regions that will + * be written to (writes to recover regions are going to + * be delayed). + */ + rh_inc_pending(&ms->rh, &sync); + rh_inc_pending(&ms->rh, &nosync); + rh_flush(&ms->rh); + + /* + * Dispatch io. 
+ */ + while ((bh = buffer_list_pop(&sync))) + do_write(ms, bh); + + while ((bh = buffer_list_pop(&recover))) + rh_delay(&ms->rh, bh); + + while ((bh = buffer_list_pop(&nosync))) { + map_buffer(ms, ms->mirror + DEFAULT_MIRROR, bh); + generic_make_request(WRITE, bh); + } +} + +/*----------------------------------------------------------------- + * kmirrord + *---------------------------------------------------------------*/ +static LIST_HEAD(_mirror_sets); +static DECLARE_RWSEM(_mirror_sets_lock); + +static void do_mirror(struct mirror_set *ms) +{ + struct buffer_list reads, writes; + + spin_lock(&ms->lock); + memcpy(&reads, &ms->reads, sizeof(reads)); + buffer_list_init(&ms->reads); + memcpy(&writes, &ms->writes, sizeof(writes)); + buffer_list_init(&ms->writes); + spin_unlock(&ms->lock); + + rh_update_states(&ms->rh); + do_recovery(ms); + do_reads(ms, &reads); + do_writes(ms, &writes); + run_task_queue(&tq_disk); +} + +static void do_work(void) +{ + struct mirror_set *ms; + + down_read(&_mirror_sets_lock); + list_for_each_entry (ms, &_mirror_sets, list) + do_mirror(ms); + up_read(&_mirror_sets_lock); +} + +/*----------------------------------------------------------------- + * Target functions + *---------------------------------------------------------------*/ +static struct mirror_set *alloc_context(unsigned int nr_mirrors, + sector_t region_size, + struct dm_target *ti, + struct dirty_log *dl) +{ + size_t len; + struct mirror_set *ms = NULL; + + if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors)) + return NULL; + + len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); + + ms = kmalloc(len, GFP_KERNEL); + if (!ms) { + ti->error = "dm-mirror: Cannot allocate mirror context"; + return NULL; + } + + memset(ms, 0, len); + spin_lock_init(&ms->lock); + + ms->ti = ti; + ms->nr_mirrors = nr_mirrors; + ms->nr_regions = dm_div_up(ti->len, region_size); + + if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { + ti->error = "dm-mirror: Error creating dirty region hash"; + kfree(ms); + return NULL; + } + + return ms; +} + +static void free_context(struct mirror_set *ms, struct dm_target *ti, + unsigned int m) +{ + while (m--) + dm_put_device(ti, ms->mirror[m].dev); + + rh_exit(&ms->rh); + kfree(ms); +} + +static inline int _check_region_size(struct dm_target *ti, sector_t size) +{ + return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || + size > ti->len); +} + +static int get_mirror(struct mirror_set *ms, struct dm_target *ti, + unsigned int mirror, char **argv) +{ + sector_t offset; + + if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) { + ti->error = "dm-mirror: Invalid offset"; + return -EINVAL; + } + + if (dm_get_device(ti, argv[0], offset, ti->len, + dm_table_get_mode(ti->table), + &ms->mirror[mirror].dev)) { + ti->error = "dm-mirror: Device lookup failure"; + return -ENXIO; + } + + ms->mirror[mirror].offset = offset; + + return 0; +} + +static int add_mirror_set(struct mirror_set *ms) +{ + down_write(&_mirror_sets_lock); + list_add_tail(&ms->list, &_mirror_sets); + up_write(&_mirror_sets_lock); + dm_daemon_wake(&_kmirrord); + + return 0; +} + +static void del_mirror_set(struct mirror_set *ms) +{ + down_write(&_mirror_sets_lock); + list_del(&ms->list); + up_write(&_mirror_sets_lock); +} + +/* + * Create dirty log: log_type #log_params + */ +static struct dirty_log *create_dirty_log(struct dm_target *ti, + unsigned int argc, char **argv, + unsigned int *args_used) +{ + unsigned int param_count; + struct dirty_log *dl; + + if (argc < 2) { + ti->error = "dm-mirror: 
Insufficient mirror log arguments"; + return NULL; + } + + if (sscanf(argv[1], "%u", ¶m_count) != 1 || param_count != 1) { + ti->error = "dm-mirror: Invalid mirror log argument count"; + return NULL; + } + + *args_used = 2 + param_count; + + if (argc < *args_used) { + ti->error = "dm-mirror: Insufficient mirror log arguments"; + return NULL; + } + + dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2); + if (!dl) { + ti->error = "dm-mirror: Error creating mirror dirty log"; + return NULL; + } + + if (!_check_region_size(ti, dl->type->get_region_size(dl))) { + ti->error = "dm-mirror: Invalid region size"; + dm_destroy_dirty_log(dl); + return NULL; + } + + return dl; +} + +/* + * Construct a mirror mapping: + * + * log_type #log_params + * #mirrors [mirror_path offset]{2,} + * + * For now, #log_params = 1, log_type = "core" + * + */ +#define DM_IO_PAGES 64 +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned int nr_mirrors, m, args_used; + struct mirror_set *ms; + struct dirty_log *dl; + + dl = create_dirty_log(ti, argc, argv, &args_used); + if (!dl) + return -EINVAL; + + argv += args_used; + argc -= args_used; + + if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || + nr_mirrors < 2) { + ti->error = "dm-mirror: Invalid number of mirrors"; + dm_destroy_dirty_log(dl); + return -EINVAL; + } + + argv++, argc--; + + if (argc != nr_mirrors * 2) { + ti->error = "dm-mirror: Wrong number of mirror arguments"; + dm_destroy_dirty_log(dl); + return -EINVAL; + } + + ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); + if (!ms) { + dm_destroy_dirty_log(dl); + return -ENOMEM; + } + + /* Get the mirror parameter sets */ + for (m = 0; m < nr_mirrors; m++) { + r = get_mirror(ms, ti, m, argv); + if (r) { + free_context(ms, ti, m); + return r; + } + argv += 2; + argc -= 2; + } + + ti->private = ms; + + r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); + if (r) { + free_context(ms, ti, ms->nr_mirrors); + return r; + } + + add_mirror_set(ms); + return 0; +} + +static void mirror_dtr(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + + del_mirror_set(ms); + kcopyd_client_destroy(ms->kcopyd_client); + free_context(ms, ti, ms->nr_mirrors); +} + +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw) +{ + int wake = 0; + struct buffer_list *bl; + + bl = (rw == WRITE) ? &ms->writes : &ms->reads; + spin_lock(&ms->lock); + wake = !(bl->head); + buffer_list_add(bl, bh); + spin_unlock(&ms->lock); + + if (wake) + dm_daemon_wake(&_kmirrord); +} + +/* + * Mirror mapping function + */ +static int mirror_map(struct dm_target *ti, struct buffer_head *bh, + int rw, union map_info *map_context) +{ + int r; + struct mirror *m; + struct mirror_set *ms = ti->private; + + /* FIXME: nasty hack, 32 bit sector_t only */ + map_context->ll = bh->b_rsector / ms->rh.region_size; + + if (rw == WRITE) { + queue_bh(ms, bh, rw); + return 0; + } + + r = ms->rh.log->type->in_sync(ms->rh.log, bh_to_region(&ms->rh, bh), 0); + if (r < 0 && r != -EWOULDBLOCK) + return r; + + if (r == -EWOULDBLOCK) /* FIXME: ugly */ + r = 0; + + /* + * We don't want to fast track a recovery just for a read + * ahead. So we just let it silently fail. + * FIXME: get rid of this. 
+ */ + if (!r && rw == READA) + return -EIO; + + if (!r) { + /* Pass this io over to the daemon */ + queue_bh(ms, bh, rw); + return 0; + } + + m = choose_mirror(ms, bh->b_rsector); + if (!m) + return -EIO; + + map_buffer(ms, m, bh); + return 1; +} + +static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh, + int rw, int error, union map_info *map_context) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + region_t region = map_context->ll; + + /* + * We need to dec pending if this was a write. + */ + if (rw == WRITE) + rh_dec(&ms->rh, region); + + return 0; +} + +static void mirror_suspend(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + rh_stop_recovery(&ms->rh); +} + +static void mirror_resume(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + rh_start_recovery(&ms->rh); +} + +static int mirror_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + unsigned int m, sz = 0; + struct mirror_set *ms = (struct mirror_set *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors); + + for (m = 0; m < ms->nr_mirrors; m++) + sz += snprintf(result + sz, maxlen - sz, "%s ", + dm_kdevname(ms->mirror[m].dev->dev)); + + sz += snprintf(result + sz, maxlen - sz, "%lu/%lu", + ms->sync_count, ms->nr_regions); + break; + + case STATUSTYPE_TABLE: + sz += snprintf(result + sz, maxlen - sz, + "%s 1 " SECTOR_FORMAT " %d ", + ms->rh.log->type->name, ms->rh.region_size, + ms->nr_mirrors); + + for (m = 0; m < ms->nr_mirrors; m++) + sz += snprintf(result + sz, maxlen - sz, "%s %ld ", + dm_kdevname(ms->mirror[m].dev->dev), + ms->mirror[m].offset); + } + + return 0; +} + +static struct target_type mirror_target = { + .name = "mirror", + .module = THIS_MODULE, + .ctr = mirror_ctr, + .dtr = mirror_dtr, + .map = mirror_map, + .end_io = mirror_end_io, + .suspend = mirror_suspend, + .resume = mirror_resume, + .status = mirror_status, +}; + +static int __init dm_mirror_init(void) +{ + int r; + + r = dm_dirty_log_init(); + if (r) + return r; + + r = dm_daemon_start(&_kmirrord, "kmirrord", do_work); + if (r) { + DMERR("couldn't start kmirrord"); + dm_dirty_log_exit(); + return r; + } + + r = dm_register_target(&mirror_target); + if (r < 0) { + DMERR("%s: Failed to register mirror target", + mirror_target.name); + dm_dirty_log_exit(); + dm_daemon_stop(&_kmirrord); + } + + return r; +} + +static void __exit dm_mirror_exit(void) +{ + int r; + + r = dm_unregister_target(&mirror_target); + if (r < 0) + DMERR("%s: unregister failed %d", mirror_target.name, r); + + dm_daemon_stop(&_kmirrord); + dm_dirty_log_exit(); +} + +/* Module hooks */ +module_init(dm_mirror_init); +module_exit(dm_mirror_exit); + +MODULE_DESCRIPTION(DM_NAME " mirror target"); +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); --- linux-2.4.22/drivers/md/dm-snapshot.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-snapshot.c Thu Nov 20 18:31:01 2003 @@ -0,0 +1,1235 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-snapshot.h" +#include "kcopyd.h" + +/* + * FIXME: Remove this before release. + */ +#if 0 +#define DMDEBUG(x...) DMWARN( ## x) +#else +#define DMDEBUG(x...) 
+#endif + +/* + * The percentage increment we will wake up users at + */ +#define WAKE_UP_PERCENT 5 + +/* + * kcopyd priority of snapshot operations + */ +#define SNAPSHOT_COPY_PRIORITY 2 + +/* + * Each snapshot reserves this many pages for io + * FIXME: calculate this + */ +#define SNAPSHOT_PAGES 256 + +struct pending_exception { + struct exception e; + + /* + * Origin buffers waiting for this to complete are held + * in a list (using b_reqnext). + */ + struct buffer_head *origin_bhs; + struct buffer_head *snapshot_bhs; + + /* + * Other pending_exceptions that are processing this + * chunk. When this list is empty, we know we can + * complete the origins. + */ + struct list_head siblings; + + /* Pointer back to snapshot context */ + struct dm_snapshot *snap; + + /* + * 1 indicates the exception has already been sent to + * kcopyd. + */ + int started; +}; + +/* + * Hash table mapping origin volumes to lists of snapshots and + * a lock to protect it + */ +static kmem_cache_t *exception_cache; +static kmem_cache_t *pending_cache; +static mempool_t *pending_pool; + +/* + * One of these per registered origin, held in the snapshot_origins hash + */ +struct origin { + /* The origin device */ + kdev_t dev; + + struct list_head hash_list; + + /* List of snapshots for this origin */ + struct list_head snapshots; +}; + +/* + * Size of the hash table for origin volumes. If we make this + * the size of the minors list then it should be nearly perfect + */ +#define ORIGIN_HASH_SIZE 256 +#define ORIGIN_MASK 0xFF +static struct list_head *_origins; +static struct rw_semaphore _origins_lock; + +static int init_origin_hash(void) +{ + int i; + + _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), + GFP_KERNEL); + if (!_origins) { + DMERR("Device mapper: Snapshot: unable to allocate memory"); + return -ENOMEM; + } + + for (i = 0; i < ORIGIN_HASH_SIZE; i++) + INIT_LIST_HEAD(_origins + i); + init_rwsem(&_origins_lock); + + return 0; +} + +static void exit_origin_hash(void) +{ + kfree(_origins); +} + +static inline unsigned int origin_hash(kdev_t dev) +{ + return MINOR(dev) & ORIGIN_MASK; +} + +static struct origin *__lookup_origin(kdev_t origin) +{ + struct list_head *slist; + struct list_head *ol; + struct origin *o; + + ol = &_origins[origin_hash(origin)]; + list_for_each(slist, ol) { + o = list_entry(slist, struct origin, hash_list); + + if (o->dev == origin) + return o; + } + + return NULL; +} + +static void __insert_origin(struct origin *o) +{ + struct list_head *sl = &_origins[origin_hash(o->dev)]; + list_add_tail(&o->hash_list, sl); +} + +/* + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. + */ +static int register_snapshot(struct dm_snapshot *snap) +{ + struct origin *o; + kdev_t dev = snap->origin->dev; + + down_write(&_origins_lock); + o = __lookup_origin(dev); + + if (!o) { + /* New origin */ + o = kmalloc(sizeof(*o), GFP_KERNEL); + if (!o) { + up_write(&_origins_lock); + return -ENOMEM; + } + + /* Initialise the struct */ + INIT_LIST_HEAD(&o->snapshots); + o->dev = dev; + + __insert_origin(o); + } + + list_add_tail(&snap->list, &o->snapshots); + + up_write(&_origins_lock); + return 0; +} + +static void unregister_snapshot(struct dm_snapshot *s) +{ + struct origin *o; + + down_write(&_origins_lock); + o = __lookup_origin(s->origin->dev); + + list_del(&s->list); + if (list_empty(&o->snapshots)) { + list_del(&o->hash_list); + kfree(o); + } + + up_write(&_origins_lock); +} + +/* + * Implementation of the exception hash tables. 
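+ * Each snapshot keeps two of these: 'complete' holds exceptions
+ * whose data is already on the COW device, 'pending' holds those
+ * still being copied.  Both are keyed by the old (origin) chunk
+ * number.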
+ */ +static int init_exception_table(struct exception_table *et, uint32_t size) +{ + unsigned int i; + + et->hash_mask = size - 1; + et->table = vcalloc(size, sizeof(struct list_head)); + if (!et->table) + return -ENOMEM; + + for (i = 0; i < size; i++) + INIT_LIST_HEAD(et->table + i); + + return 0; +} + +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +{ + struct list_head *slot, *entry, *temp; + struct exception *ex; + int i, size; + + size = et->hash_mask + 1; + for (i = 0; i < size; i++) { + slot = et->table + i; + + list_for_each_safe(entry, temp, slot) { + ex = list_entry(entry, struct exception, hash_list); + kmem_cache_free(mem, ex); + } + } + + vfree(et->table); +} + +/* + * FIXME: check how this hash fn is performing. + */ +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) +{ + return chunk & et->hash_mask; +} + +static void insert_exception(struct exception_table *eh, struct exception *e) +{ + struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; + list_add(&e->hash_list, l); +} + +static inline void remove_exception(struct exception *e) +{ + list_del(&e->hash_list); +} + +/* + * Return the exception data for a sector, or NULL if not + * remapped. + */ +static struct exception *lookup_exception(struct exception_table *et, + chunk_t chunk) +{ + struct list_head *slot, *el; + struct exception *e; + + slot = &et->table[exception_hash(et, chunk)]; + list_for_each(el, slot) { + e = list_entry(el, struct exception, hash_list); + if (e->old_chunk == chunk) + return e; + } + + return NULL; +} + +static inline struct exception *alloc_exception(void) +{ + struct exception *e; + + e = kmem_cache_alloc(exception_cache, GFP_NOIO); + if (!e) + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); + + return e; +} + +static inline void free_exception(struct exception *e) +{ + kmem_cache_free(exception_cache, e); +} + +static inline struct pending_exception *alloc_pending_exception(void) +{ + return mempool_alloc(pending_pool, GFP_NOIO); +} + +static inline void free_pending_exception(struct pending_exception *pe) +{ + mempool_free(pe, pending_pool); +} + +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) +{ + struct exception *e; + + e = alloc_exception(); + if (!e) + return -ENOMEM; + + e->old_chunk = old; + e->new_chunk = new; + insert_exception(&s->complete, e); + return 0; +} + +/* + * Hard coded magic. + */ +static int calc_max_buckets(void) +{ + unsigned long mem; + + mem = num_physpages << PAGE_SHIFT; + mem /= 50; + mem /= sizeof(struct list_head); + + return mem; +} + +/* + * Rounds a number down to a power of 2. + */ +static inline uint32_t round_down(uint32_t n) +{ + while (n & (n - 1)) + n &= (n - 1); + return n; +} + +/* + * Allocate room for a suitable hash table. + */ +static int init_hash_tables(struct dm_snapshot *s) +{ + sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + + /* + * Calculate based on the size of the original volume or + * the COW volume... 
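+ * (We use whichever is smaller, since that bounds the number of
+ * chunks that can ever be remapped; the result is capped by
+ * calc_max_buckets() so the bucket array stays around 2% of
+ * memory, then rounded down to a power of two.)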
+ */
+	cow_dev_size = get_dev_size(s->cow->dev);
+	origin_dev_size = get_dev_size(s->origin->dev);
+	max_buckets = calc_max_buckets();
+
+	hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
+	hash_size = min(hash_size, max_buckets);
+
+	/* Round it down to a power of 2 */
+	hash_size = round_down(hash_size);
+	if (init_exception_table(&s->complete, hash_size))
+		return -ENOMEM;
+
+	/*
+	 * Allocate hash table for in-flight exceptions
+	 * Make this smaller than the real hash table
+	 */
+	hash_size >>= 3;
+	if (!hash_size)
+		hash_size = 64;
+
+	if (init_exception_table(&s->pending, hash_size)) {
+		exit_exception_table(&s->complete, exception_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Round a number up to the nearest 'size' boundary.  size must
+ * be a power of 2.
+ */
+static inline ulong round_up(ulong n, ulong size)
+{
+	size--;
+	return (n + size) & ~size;
+}
+
+/*
+ * Construct a snapshot mapping:
+ * <origin-dev> <COW-dev> <p|n> <chunk-size>
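For orientation only, here is a user-space sketch of the chunk-size handling the constructor below performs: the value (given in 512-byte sectors) is rounded up to a whole number of pages, rejected unless it is a power of two, and then turned into the shift/mask pair used for sector-to-chunk arithmetic. The 4096-byte page and 512-byte sector sizes are assumptions of the sketch, and all names in it are invented; the real constructor additionally checks the value against the device block size and the kiovec limit.

#include <stdio.h>

#define SKETCH_PAGE_SIZE   4096UL               /* assumed */
#define SKETCH_SECTOR_SIZE 512UL                /* assumed */

/* Round n up to the next multiple of 'size' (size is a power of two). */
static unsigned long sketch_round_up(unsigned long n, unsigned long size)
{
        size--;
        return (n + size) & ~size;
}

/* Mirrors the arithmetic part of the snapshot constructor's checks. */
static int derive_chunk(unsigned long chunk_size,
                        unsigned long *mask, unsigned int *shift)
{
        unsigned long cs;

        chunk_size = sketch_round_up(chunk_size,
                                     SKETCH_PAGE_SIZE / SKETCH_SECTOR_SIZE);

        if (chunk_size & (chunk_size - 1))
                return -1;                      /* not a power of two */

        *mask = chunk_size - 1;
        for (*shift = 0, cs = chunk_size; cs > 1; cs >>= 1)
                (*shift)++;
        return 0;
}

int main(void)
{
        unsigned long mask;
        unsigned int shift;

        if (!derive_chunk(16, &mask, &shift))   /* 16 sectors = 8 KiB chunks */
                printf("chunk: shift=%u mask=0x%lx\n", shift, mask);
        return 0;
}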
+ */ +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_snapshot *s; + unsigned long chunk_size; + int r = -EINVAL; + char persistent; + char *origin_path; + char *cow_path; + char *value; + int blocksize; + + if (argc < 4) { + ti->error = "dm-snapshot: requires exactly 4 arguments"; + r = -EINVAL; + goto bad1; + } + + origin_path = argv[0]; + cow_path = argv[1]; + persistent = toupper(*argv[2]); + + if (persistent != 'P' && persistent != 'N') { + ti->error = "Persistent flag is not P or N"; + r = -EINVAL; + goto bad1; + } + + chunk_size = simple_strtoul(argv[3], &value, 10); + if (chunk_size == 0 || value == NULL) { + ti->error = "Invalid chunk size"; + r = -EINVAL; + goto bad1; + } + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) { + ti->error = "Cannot allocate snapshot context private " + "structure"; + r = -ENOMEM; + goto bad1; + } + + r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad2; + } + + /* FIXME: get cow length */ + r = dm_get_device(ti, cow_path, 0, 0, + FMODE_READ | FMODE_WRITE, &s->cow); + if (r) { + dm_put_device(ti, s->origin); + ti->error = "Cannot get COW device"; + goto bad2; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. + */ + chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE); + + /* Validate the chunk size against the device block size */ + blocksize = get_hardsect_size(s->cow->dev); + if (chunk_size % (blocksize / SECTOR_SIZE)) { + ti->error = "Chunk size is not a multiple of device blocksize"; + r = -EINVAL; + goto bad3; + } + + /* Check the sizes are small enough to fit in one kiovec */ + if (chunk_size > KIO_MAX_SECTORS) { + ti->error = "Chunk size is too big"; + r = -EINVAL; + goto bad3; + } + + /* Check chunk_size is a power of 2 */ + if (chunk_size & (chunk_size - 1)) { + ti->error = "Chunk size is not a power of 2"; + r = -EINVAL; + goto bad3; + } + + s->chunk_size = chunk_size; + s->chunk_mask = chunk_size - 1; + s->type = persistent; + for (s->chunk_shift = 0; chunk_size; + s->chunk_shift++, chunk_size >>= 1) + ; + s->chunk_shift--; + + s->valid = 1; + s->have_metadata = 0; + s->last_percent = 0; + init_rwsem(&s->lock); + s->table = ti->table; + + /* Allocate hash table for COW data */ + if (init_hash_tables(s)) { + ti->error = "Unable to allocate hash table space"; + r = -ENOMEM; + goto bad3; + } + + /* + * Check the persistent flag - done here because we need the iobuf + * to check the LV header + */ + s->store.snap = s; + + if (persistent == 'P') + r = dm_create_persistent(&s->store, s->chunk_size); + else + r = dm_create_transient(&s->store, s, blocksize); + + if (r) { + ti->error = "Couldn't create exception store"; + r = -EINVAL; + goto bad4; + } + + r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); + if (r) { + ti->error = "Could not create kcopyd client"; + goto bad5; + } + + /* Flush IO to the origin device */ + fsync_dev(s->origin->dev); + + /* Add snapshot to the list of snapshots for this origin */ + if (register_snapshot(s)) { + r = -EINVAL; + ti->error = "Cannot register snapshot origin"; + goto bad6; + } + + ti->private = s; + return 0; + + bad6: + kcopyd_client_destroy(s->kcopyd_client); + + bad5: + s->store.destroy(&s->store); + + bad4: + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + bad3: + dm_put_device(ti, s->cow); + dm_put_device(ti, s->origin); + + bad2: + kfree(s); + + 
bad1: + return r; +} + +static void snapshot_dtr(struct dm_target *ti) +{ + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + + dm_table_event(ti->table); + + unregister_snapshot(s); + + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + /* Deallocate memory used */ + s->store.destroy(&s->store); + + dm_put_device(ti, s->origin); + dm_put_device(ti, s->cow); + kcopyd_client_destroy(s->kcopyd_client); + kfree(s); +} + +/* + * We hold lists of buffer_heads, using the b_reqnext field. + */ +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh) +{ + bh->b_reqnext = *queue; + *queue = bh; +} + +/* + * FIXME: inefficient. + */ +static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs) +{ + while (*queue) + queue = &((*queue)->b_reqnext); + + *queue = bhs; +} + +/* + * Flush a list of buffers. + */ +static void flush_buffers(struct buffer_head *bh) +{ + struct buffer_head *n; + + DMDEBUG("begin flush"); + while (bh) { + n = bh->b_reqnext; + bh->b_reqnext = NULL; + DMDEBUG("flushing %p", bh); + generic_make_request(WRITE, bh); + bh = n; + } + + run_task_queue(&tq_disk); +} + +/* + * Error a list of buffers. + */ +static void error_buffers(struct buffer_head *bh) +{ + struct buffer_head *n; + + while (bh) { + n = bh->b_reqnext; + bh->b_reqnext = NULL; + buffer_IO_error(bh); + bh = n; + } +} + +static struct buffer_head *__flush_bhs(struct pending_exception *pe) +{ + struct pending_exception *sibling; + + if (list_empty(&pe->siblings)) + return pe->origin_bhs; + + sibling = list_entry(pe->siblings.next, + struct pending_exception, siblings); + + list_del(&pe->siblings); + + /* FIXME: I think there's a race on SMP machines here, add spin lock */ + queue_buffers(&sibling->origin_bhs, pe->origin_bhs); + + return NULL; +} + +static void pending_complete(struct pending_exception *pe, int success) +{ + struct exception *e; + struct dm_snapshot *s = pe->snap; + struct buffer_head *flush = NULL; + + if (success) { + e = alloc_exception(); + if (!e) { + DMWARN("Unable to allocate exception."); + down_write(&s->lock); + s->store.drop_snapshot(&s->store); + s->valid = 0; + flush = __flush_bhs(pe); + up_write(&s->lock); + + error_buffers(pe->snapshot_bhs); + goto out; + } + + /* + * Add a proper exception, and remove the + * in-flight exception from the list. 
+ */ + down_write(&s->lock); + + memcpy(e, &pe->e, sizeof(*e)); + insert_exception(&s->complete, e); + remove_exception(&pe->e); + flush = __flush_bhs(pe); + + /* Submit any pending write BHs */ + up_write(&s->lock); + + flush_buffers(pe->snapshot_bhs); + DMDEBUG("Exception completed successfully."); + + /* Notify any interested parties */ + if (s->store.fraction_full) { + sector_t numerator, denominator; + int pc; + + s->store.fraction_full(&s->store, &numerator, + &denominator); + pc = numerator * 100 / denominator; + + if (pc >= s->last_percent + WAKE_UP_PERCENT) { + dm_table_event(s->table); + s->last_percent = pc - pc % WAKE_UP_PERCENT; + } + } + + } else { + /* Read/write error - snapshot is unusable */ + down_write(&s->lock); + if (s->valid) + DMERR("Error reading/writing snapshot"); + s->store.drop_snapshot(&s->store); + s->valid = 0; + remove_exception(&pe->e); + flush = __flush_bhs(pe); + up_write(&s->lock); + + error_buffers(pe->snapshot_bhs); + + dm_table_event(s->table); + DMDEBUG("Exception failed."); + } + + out: + if (flush) + flush_buffers(flush); + + free_pending_exception(pe); +} + +static void commit_callback(void *context, int success) +{ + struct pending_exception *pe = (struct pending_exception *) context; + pending_complete(pe, success); +} + +/* + * Called when the copy I/O has finished. kcopyd actually runs + * this code so don't block. + */ +static void copy_callback(int read_err, unsigned int write_err, void *context) +{ + struct pending_exception *pe = (struct pending_exception *) context; + struct dm_snapshot *s = pe->snap; + + if (read_err || write_err) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store.commit_exception(&s->store, &pe->e, commit_callback, + pe); +} + +/* + * Dispatches the copy operation to kcopyd. + */ +static inline void start_copy(struct pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + struct io_region src, dest; + kdev_t dev = s->origin->dev; + int *sizes = blk_size[major(dev)]; + sector_t dev_size = (sector_t) -1; + + if (pe->started) + return; + + /* this is protected by snap->lock */ + pe->started = 1; + + if (sizes && sizes[minor(dev)]) + dev_size = sizes[minor(dev)] << 1; + + src.dev = dev; + src.sector = chunk_to_sector(s, pe->e.old_chunk); + src.count = min(s->chunk_size, dev_size - src.sector); + + dest.dev = s->cow->dev; + dest.sector = chunk_to_sector(s, pe->e.new_chunk); + dest.count = src.count; + + /* Hand over to kcopyd */ + kcopyd_copy(s->kcopyd_client, + &src, 1, &dest, 0, copy_callback, pe); +} + +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + */ +static struct pending_exception *find_pending_exception(struct dm_snapshot *s, + struct buffer_head *bh) +{ + struct exception *e; + struct pending_exception *pe; + chunk_t chunk = sector_to_chunk(s, bh->b_rsector); + + /* + * Is there a pending exception for this already ? 
+ */ + e = lookup_exception(&s->pending, chunk); + if (e) { + /* cast the exception to a pending exception */ + pe = list_entry(e, struct pending_exception, e); + + } else { + /* Create a new pending exception */ + pe = alloc_pending_exception(); + pe->e.old_chunk = chunk; + pe->origin_bhs = pe->snapshot_bhs = NULL; + INIT_LIST_HEAD(&pe->siblings); + pe->snap = s; + pe->started = 0; + + if (s->store.prepare_exception(&s->store, &pe->e)) { + free_pending_exception(pe); + s->valid = 0; + return NULL; + } + + insert_exception(&s->pending, &pe->e); + } + + return pe; +} + +static inline void remap_exception(struct dm_snapshot *s, struct exception *e, + struct buffer_head *bh) +{ + bh->b_rdev = s->cow->dev; + bh->b_rsector = chunk_to_sector(s, e->new_chunk) + + (bh->b_rsector & s->chunk_mask); +} + +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + struct exception *e; + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + int r = 1; + chunk_t chunk; + struct pending_exception *pe; + + chunk = sector_to_chunk(s, bh->b_rsector); + + /* Full snapshots are not usable */ + if (!s->valid) + return -1; + + /* + * Write to snapshot - higher level takes care of RW/RO + * flags so we should only get this if we are + * writeable. + */ + if (rw == WRITE) { + + down_write(&s->lock); + + /* If the block is already remapped - use that, else remap it */ + e = lookup_exception(&s->complete, chunk); + if (e) + remap_exception(s, e, bh); + + else { + pe = find_pending_exception(s, bh); + + if (!pe) { + s->store.drop_snapshot(&s->store); + s->valid = 0; + r = -EIO; + } else { + remap_exception(s, &pe->e, bh); + queue_buffer(&pe->snapshot_bhs, bh); + start_copy(pe); + r = 0; + } + } + + up_write(&s->lock); + + } else { + /* + * FIXME: this read path scares me because we + * always use the origin when we have a pending + * exception. However I can't think of a + * situation where this is wrong - ejt. + */ + + /* Do reads */ + down_read(&s->lock); + + /* See if it it has been remapped */ + e = lookup_exception(&s->complete, chunk); + if (e) + remap_exception(s, e, bh); + else + bh->b_rdev = s->origin->dev; + + up_read(&s->lock); + } + + return r; +} + +void snapshot_resume(struct dm_target *ti) +{ + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + + if (s->have_metadata) + return; + + if (s->store.read_metadata(&s->store)) { + down_write(&s->lock); + s->valid = 0; + up_write(&s->lock); + } + + s->have_metadata = 1; +} + +static int snapshot_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; + char cow[16]; + char org[16]; + + switch (type) { + case STATUSTYPE_INFO: + if (!snap->valid) + snprintf(result, maxlen, "Invalid"); + else { + if (snap->store.fraction_full) { + sector_t numerator, denominator; + snap->store.fraction_full(&snap->store, + &numerator, + &denominator); + snprintf(result, maxlen, + SECTOR_FORMAT "/" SECTOR_FORMAT, + numerator, denominator); + } + else + snprintf(result, maxlen, "Unknown"); + } + break; + + case STATUSTYPE_TABLE: + /* + * kdevname returns a static pointer so we need + * to make private copies if the output is to + * make sense. 
+ */ + strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow)); + strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org)); + snprintf(result, maxlen, "%s %s %c %ld", org, cow, + snap->type, snap->chunk_size); + break; + } + + return 0; +} + +/*----------------------------------------------------------------- + * Origin methods + *---------------------------------------------------------------*/ +static void list_merge(struct list_head *l1, struct list_head *l2) +{ + struct list_head *l1_n, *l2_p; + + l1_n = l1->next; + l2_p = l2->prev; + + l1->next = l2; + l2->prev = l1; + + l2_p->next = l1_n; + l1_n->prev = l2_p; +} + +static int __origin_write(struct list_head *snapshots, struct buffer_head *bh) +{ + int r = 1, first = 1; + struct list_head *sl; + struct dm_snapshot *snap; + struct exception *e; + struct pending_exception *pe, *last = NULL; + chunk_t chunk; + + /* Do all the snapshots on this origin */ + list_for_each(sl, snapshots) { + snap = list_entry(sl, struct dm_snapshot, list); + + /* Only deal with valid snapshots */ + if (!snap->valid) + continue; + + down_write(&snap->lock); + + /* + * Remember, different snapshots can have + * different chunk sizes. + */ + chunk = sector_to_chunk(snap, bh->b_rsector); + + /* + * Check exception table to see if block + * is already remapped in this snapshot + * and trigger an exception if not. + */ + e = lookup_exception(&snap->complete, chunk); + if (!e) { + pe = find_pending_exception(snap, bh); + if (!pe) { + snap->store.drop_snapshot(&snap->store); + snap->valid = 0; + + } else { + if (last) + list_merge(&pe->siblings, + &last->siblings); + + last = pe; + r = 0; + } + } + + up_write(&snap->lock); + } + + /* + * Now that we have a complete pe list we can start the copying. + */ + if (last) { + pe = last; + do { + down_write(&pe->snap->lock); + if (first) + queue_buffer(&pe->origin_bhs, bh); + start_copy(pe); + up_write(&pe->snap->lock); + first = 0; + pe = list_entry(pe->siblings.next, + struct pending_exception, siblings); + + } while (pe != last); + } + + return r; +} + +/* + * Called on a write from the origin driver. + */ +int do_origin(struct dm_dev *origin, struct buffer_head *bh) +{ + struct origin *o; + int r; + + down_read(&_origins_lock); + o = __lookup_origin(origin->dev); + if (!o) + BUG(); + + r = __origin_write(&o->snapshots, bh); + up_read(&_origins_lock); + + return r; +} + +/* + * Origin: maps a linear range of a device, with hooks for snapshotting. + */ + +/* + * Construct an origin mapping: + * The context for an origin is merely a 'struct dm_dev *' + * pointing to the real device. + */ +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + struct dm_dev *dev; + + if (argc != 1) { + ti->error = "dm-origin: incorrect number of arguments"; + return -EINVAL; + } + + r = dm_get_device(ti, argv[0], 0, ti->len, + dm_table_get_mode(ti->table), &dev); + if (r) { + ti->error = "Cannot get target device"; + return r; + } + + ti->private = dev; + return 0; +} + +static void origin_dtr(struct dm_target *ti) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + dm_put_device(ti, dev); +} + +static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + bh->b_rdev = dev->dev; + + /* Only tell snapshots if this is a write */ + return (rw == WRITE) ? 
do_origin(dev, bh) : 1; +} + +static int origin_status(struct dm_target *ti, status_type_t type, char *result, + unsigned int maxlen) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s", dm_kdevname(dev->dev)); + break; + } + + return 0; +} + +static struct target_type origin_target = { + name: "snapshot-origin", + module: THIS_MODULE, + ctr: origin_ctr, + dtr: origin_dtr, + map: origin_map, + status: origin_status, +}; + +static struct target_type snapshot_target = { + name: "snapshot", + module: THIS_MODULE, + ctr: snapshot_ctr, + dtr: snapshot_dtr, + map: snapshot_map, + resume: snapshot_resume, + status: snapshot_status, +}; + +int __init dm_snapshot_init(void) +{ + int r; + + r = dm_register_target(&snapshot_target); + if (r) { + DMERR("snapshot target register failed %d", r); + return r; + } + + r = dm_register_target(&origin_target); + if (r < 0) { + DMERR("Device mapper: Origin: register failed %d\n", r); + goto bad1; + } + + r = init_origin_hash(); + if (r) { + DMERR("init_origin_hash failed."); + goto bad2; + } + + exception_cache = kmem_cache_create("dm-snapshot-ex", + sizeof(struct exception), + __alignof__(struct exception), + 0, NULL, NULL); + if (!exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad3; + } + + pending_cache = + kmem_cache_create("dm-snapshot-in", + sizeof(struct pending_exception), + __alignof__(struct pending_exception), + 0, NULL, NULL); + if (!pending_cache) { + DMERR("Couldn't create pending cache."); + r = -ENOMEM; + goto bad4; + } + + pending_pool = mempool_create(128, mempool_alloc_slab, + mempool_free_slab, pending_cache); + if (!pending_pool) { + DMERR("Couldn't create pending pool."); + r = -ENOMEM; + goto bad5; + } + + return 0; + + bad5: + kmem_cache_destroy(pending_cache); + bad4: + kmem_cache_destroy(exception_cache); + bad3: + exit_origin_hash(); + bad2: + dm_unregister_target(&origin_target); + bad1: + dm_unregister_target(&snapshot_target); + return r; +} + +void dm_snapshot_exit(void) +{ + int r; + + r = dm_unregister_target(&snapshot_target); + if (r) + DMERR("snapshot unregister failed %d", r); + + r = dm_unregister_target(&origin_target); + if (r) + DMERR("origin unregister failed %d", r); + + exit_origin_hash(); + mempool_destroy(pending_pool); + kmem_cache_destroy(pending_cache); + kmem_cache_destroy(exception_cache); +} --- linux-2.4.22/drivers/md/dm-snapshot.h Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-snapshot.h Thu Nov 20 18:29:35 2003 @@ -0,0 +1,158 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#ifndef DM_SNAPSHOT_H +#define DM_SNAPSHOT_H + +#include "dm.h" +#include + +struct exception_table { + uint32_t hash_mask; + struct list_head *table; +}; + +/* + * The snapshot code deals with largish chunks of the disk at a + * time. Typically 64k - 256k. + */ +/* FIXME: can we get away with limiting these to a uint32_t ? */ +typedef sector_t chunk_t; + +/* + * An exception is used where an old chunk of data has been + * replaced by a new one. + */ +struct exception { + struct list_head hash_list; + + chunk_t old_chunk; + chunk_t new_chunk; +}; + +/* + * Abstraction to handle the meta/layout of exception stores (the + * COW device). + */ +struct exception_store { + + /* + * Destroys this object when you've finished with it. 
+ */ + void (*destroy) (struct exception_store *store); + + /* + * The target shouldn't read the COW device until this is + * called. + */ + int (*read_metadata) (struct exception_store *store); + + /* + * Find somewhere to store the next exception. + */ + int (*prepare_exception) (struct exception_store *store, + struct exception *e); + + /* + * Update the metadata with this exception. + */ + void (*commit_exception) (struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context); + + /* + * The snapshot is invalid, note this in the metadata. + */ + void (*drop_snapshot) (struct exception_store *store); + + /* + * Return how full the snapshot is. + */ + void (*fraction_full) (struct exception_store *store, + sector_t *numerator, + sector_t *denominator); + + struct dm_snapshot *snap; + void *context; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + struct dm_table *table; + + struct dm_dev *origin; + struct dm_dev *cow; + + /* List of snapshots per Origin */ + struct list_head list; + + /* Size of data blocks saved - must be a power of 2 */ + chunk_t chunk_size; + chunk_t chunk_mask; + chunk_t chunk_shift; + + /* You can't use a snapshot if this is 0 (e.g. if full) */ + int valid; + int have_metadata; + + /* Used for display of table */ + char type; + + /* The last percentage we notified */ + int last_percent; + + struct exception_table pending; + struct exception_table complete; + + /* The on disk metadata handler */ + struct exception_store store; + + struct kcopyd_client *kcopyd_client; +}; + +/* + * Used by the exception stores to load exceptions hen + * initialising. + */ +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); + +/* + * Constructor and destructor for the default persistent + * store. + */ +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); + +int dm_create_transient(struct exception_store *store, + struct dm_snapshot *s, int blocksize); + +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(kdev_t dev) +{ + int *sizes; + + sizes = blk_size[MAJOR(dev)]; + if (sizes) + return sizes[MINOR(dev)] << 1; + + return 0; +} + +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) +{ + return (sector & ~s->chunk_mask) >> s->chunk_shift; +} + +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) +{ + return chunk << s->chunk_shift; +} + +#endif --- linux-2.4.22/drivers/md/dm-stripe.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-stripe.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include + +struct stripe { + struct dm_dev *dev; + sector_t physical_start; +}; + +struct stripe_c { + uint32_t stripes; + + /* The size of this target / num. 
stripes */ + uint32_t stripe_width; + + /* stripe chunk size */ + uint32_t chunk_shift; + sector_t chunk_mask; + + struct stripe stripe[0]; +}; + +static inline struct stripe_c *alloc_context(unsigned int stripes) +{ + size_t len; + + if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), + stripes)) + return NULL; + + len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); + + return kmalloc(len, GFP_KERNEL); +} + +/* + * Parse a single pair + */ +static int get_stripe(struct dm_target *ti, struct stripe_c *sc, + unsigned int stripe, char **argv) +{ + sector_t start; + + if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) + return -EINVAL; + + if (dm_get_device(ti, argv[0], start, sc->stripe_width, + dm_table_get_mode(ti->table), + &sc->stripe[stripe].dev)) + return -ENXIO; + + sc->stripe[stripe].physical_start = start; + return 0; +} + +/* + * FIXME: Nasty function, only present because we can't link + * against __moddi3 and __divdi3. + * + * returns a == b * n + */ +static int multiple(sector_t a, sector_t b, sector_t *n) +{ + sector_t acc, prev, i; + + *n = 0; + while (a >= b) { + for (acc = b, prev = 0, i = 1; + acc <= a; + prev = acc, acc <<= 1, i <<= 1) + ; + + a -= prev; + *n += i >> 1; + } + + return a == 0; +} + +/* + * Construct a striped mapping. + * [ ]+ + */ +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct stripe_c *sc; + sector_t width; + uint32_t stripes; + uint32_t chunk_size; + char *end; + int r; + unsigned int i; + + if (argc < 2) { + ti->error = "dm-stripe: Not enough arguments"; + return -EINVAL; + } + + stripes = simple_strtoul(argv[0], &end, 10); + if (*end) { + ti->error = "dm-stripe: Invalid stripe count"; + return -EINVAL; + } + + chunk_size = simple_strtoul(argv[1], &end, 10); + if (*end) { + ti->error = "dm-stripe: Invalid chunk_size"; + return -EINVAL; + } + + /* + * chunk_size is a power of two + */ + if (!chunk_size || (chunk_size & (chunk_size - 1))) { + ti->error = "dm-stripe: Invalid chunk size"; + return -EINVAL; + } + + if (!multiple(ti->len, stripes, &width)) { + ti->error = "dm-stripe: Target length not divisable by " + "number of stripes"; + return -EINVAL; + } + + /* + * Do we have enough arguments for that many stripes ? + */ + if (argc != (2 + 2 * stripes)) { + ti->error = "dm-stripe: Not enough destinations specified"; + return -EINVAL; + } + + sc = alloc_context(stripes); + if (!sc) { + ti->error = "dm-stripe: Memory allocation for striped context " + "failed"; + return -ENOMEM; + } + + sc->stripes = stripes; + sc->stripe_width = width; + + sc->chunk_mask = ((sector_t) chunk_size) - 1; + for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) + chunk_size >>= 1; + sc->chunk_shift--; + + /* + * Get the stripe destinations. 
+ */ + for (i = 0; i < stripes; i++) { + argv += 2; + + r = get_stripe(ti, sc, i, argv); + if (r < 0) { + ti->error = "dm-stripe: Couldn't parse stripe " + "destination"; + while (i--) + dm_put_device(ti, sc->stripe[i].dev); + kfree(sc); + return r; + } + } + + ti->private = sc; + return 0; +} + +static void stripe_dtr(struct dm_target *ti) +{ + unsigned int i; + struct stripe_c *sc = (struct stripe_c *) ti->private; + + for (i = 0; i < sc->stripes; i++) + dm_put_device(ti, sc->stripe[i].dev); + + kfree(sc); +} + +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *context) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + + sector_t offset = bh->b_rsector - ti->begin; + uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); + uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ + chunk = chunk / sc->stripes; + + bh->b_rdev = sc->stripe[stripe].dev->dev; + bh->b_rsector = sc->stripe[stripe].physical_start + + (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); + return 1; +} + +static int stripe_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + int offset; + unsigned int i; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT, + sc->stripes, sc->chunk_mask + 1); + for (i = 0; i < sc->stripes; i++) { + offset += + snprintf(result + offset, maxlen - offset, + " %s " SECTOR_FORMAT, + dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)), + sc->stripe[i].physical_start); + } + break; + } + return 0; +} + +static struct target_type stripe_target = { + .name = "striped", + .module = THIS_MODULE, + .ctr = stripe_ctr, + .dtr = stripe_dtr, + .map = stripe_map, + .status = stripe_status, +}; + +int __init dm_stripe_init(void) +{ + int r; + + r = dm_register_target(&stripe_target); + if (r < 0) + DMWARN("striped target registration failed"); + + return r; +} + +void dm_stripe_exit(void) +{ + if (dm_unregister_target(&stripe_target)) + DMWARN("striped target unregistration failed"); + + return; +} --- linux-2.4.22/drivers/md/dm-table.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-table.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,679 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include + +#define MAX_DEPTH 16 +#define NODE_SIZE L1_CACHE_BYTES +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) + +struct dm_table { + atomic_t holders; + + /* btree table */ + unsigned int depth; + unsigned int counts[MAX_DEPTH]; /* in nodes */ + sector_t *index[MAX_DEPTH]; + + unsigned int num_targets; + unsigned int num_allocated; + sector_t *highs; + struct dm_target *targets; + + /* + * Indicates the rw permissions for the new logical + * device. This should be a combination of FMODE_READ + * and FMODE_WRITE. 
+ */ + int mode; + + /* a list of devices used by this table */ + struct list_head devices; + + /* events get handed up using this callback */ + void (*event_fn)(void *); + void *event_context; +}; + +/* + * Similar to ceiling(log_size(n)) + */ +static unsigned int int_log(unsigned long n, unsigned long base) +{ + int result = 0; + + while (n > 1) { + n = dm_div_up(n, base); + result++; + } + + return result; +} + +/* + * Calculate the index of the child node of the n'th node k'th key. + */ +static inline unsigned int get_child(unsigned int n, unsigned int k) +{ + return (n * CHILDREN_PER_NODE) + k; +} + +/* + * Return the n'th node of level l from table t. + */ +static inline sector_t *get_node(struct dm_table *t, unsigned int l, + unsigned int n) +{ + return t->index[l] + (n * KEYS_PER_NODE); +} + +/* + * Return the highest key that you could lookup from the n'th + * node on level l of the btree. + */ +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) +{ + for (; l < t->depth - 1; l++) + n = get_child(n, CHILDREN_PER_NODE - 1); + + if (n >= t->counts[l]) + return (sector_t) - 1; + + return get_node(t, l, n)[KEYS_PER_NODE - 1]; +} + +/* + * Fills in a level of the btree based on the highs of the level + * below it. + */ +static int setup_btree_index(unsigned int l, struct dm_table *t) +{ + unsigned int n, k; + sector_t *node; + + for (n = 0U; n < t->counts[l]; n++) { + node = get_node(t, l, n); + + for (k = 0U; k < KEYS_PER_NODE; k++) + node[k] = high(t, l + 1, get_child(n, k)); + } + + return 0; +} + + + +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) +{ + struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + + if (!t) + return -ENOMEM; + + memset(t, 0, sizeof(*t)); + INIT_LIST_HEAD(&t->devices); + atomic_set(&t->holders, 1); + + num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + + /* Allocate both the target array and offset array at once. */ + t->highs = (sector_t *) vcalloc(sizeof(struct dm_target) + + sizeof(sector_t), num_targets); + if (!t->highs) { + kfree(t); + return -ENOMEM; + } + + memset(t->highs, -1, sizeof(*t->highs) * num_targets); + + t->targets = (struct dm_target *) (t->highs + num_targets); + t->num_allocated = num_targets; + t->mode = mode; + *result = t; + return 0; +} + +static void free_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + for (tmp = devices->next; tmp != devices; tmp = next) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + next = tmp->next; + kfree(dd); + } +} + +void table_destroy(struct dm_table *t) +{ + unsigned int i; + + /* free the indexes (see dm_table_complete) */ + if (t->depth >= 2) + vfree(t->index[t->depth - 2]); + + /* free the targets */ + for (i = 0; i < t->num_targets; i++) { + struct dm_target *tgt = t->targets + i; + + if (tgt->type->dtr) + tgt->type->dtr(tgt); + + dm_put_target_type(tgt->type); + } + + vfree(t->highs); + + /* free the device list */ + if (t->devices.next != &t->devices) { + DMWARN("devices still present during destroy: " + "dm_table_remove_device calls missing"); + + free_devices(&t->devices); + } + + kfree(t); +} + +void dm_table_get(struct dm_table *t) +{ + atomic_inc(&t->holders); +} + +void dm_table_put(struct dm_table *t) +{ + if (atomic_dec_and_test(&t->holders)) + table_destroy(t); +} + +/* + * Convert a device path to a dev_t. 
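As a rough user-space sketch (not part of the patch), the index arithmetic behind this lookup structure can be exercised on its own: with KEYS_PER_NODE keys per node and one more child than keys, the depth the table code later chooses is 1 + int_log(leaf_nodes, CHILDREN_PER_NODE). The 64-byte node and 8-byte key sizes below are assumptions standing in for L1_CACHE_BYTES and sizeof(sector_t).

#include <stdio.h>

#define SKETCH_NODE_SIZE  64U                   /* assumed L1_CACHE_BYTES */
#define SKETCH_KEY_SIZE   8U                    /* assumed sizeof(sector_t) */
#define KEYS_PER_NODE     (SKETCH_NODE_SIZE / SKETCH_KEY_SIZE)
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)

/* Ceiling division, as dm_div_up() does. */
static unsigned long div_up(unsigned long n, unsigned long d)
{
        return (n + d - 1) / d;
}

/* Roughly ceil(log_base(n)), as int_log() above. */
static unsigned int int_log(unsigned long n, unsigned long base)
{
        unsigned int result = 0;

        while (n > 1) {
                n = div_up(n, base);
                result++;
        }
        return result;
}

int main(void)
{
        unsigned long targets;

        for (targets = 1; targets <= 100000; targets *= 10) {
                unsigned long leaves = div_up(targets, KEYS_PER_NODE);

                printf("%6lu targets -> %5lu leaf nodes -> btree depth %u\n",
                       targets, leaves,
                       1 + int_log(leaves, CHILDREN_PER_NODE));
        }
        return 0;
}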
+ */ +static int lookup_device(const char *path, kdev_t *dev) +{ + int r; + struct nameidata nd; + struct inode *inode; + + if (!path_init(path, LOOKUP_FOLLOW, &nd)) + return 0; + + if ((r = path_walk(path, &nd))) + goto out; + + inode = nd.dentry->d_inode; + if (!inode) { + r = -ENOENT; + goto out; + } + + if (!S_ISBLK(inode->i_mode)) { + r = -ENOTBLK; + goto out; + } + + *dev = inode->i_rdev; + + out: + path_release(&nd); + return r; +} + +/* + * See if we've already got a device in the list. + */ +static struct dm_dev *find_device(struct list_head *l, kdev_t dev) +{ + struct list_head *tmp; + + list_for_each(tmp, l) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + if (kdev_same(dd->dev, dev)) + return dd; + } + + return NULL; +} + +/* + * Open a device so we can use it as a map destination. + */ +static int open_dev(struct dm_dev *dd) +{ + if (dd->bdev) + BUG(); + + dd->bdev = bdget(kdev_t_to_nr(dd->dev)); + if (!dd->bdev) + return -ENOMEM; + + return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW); +} + +/* + * Close a device that we've been using. + */ +static void close_dev(struct dm_dev *dd) +{ + if (!dd->bdev) + return; + + blkdev_put(dd->bdev, BDEV_RAW); + dd->bdev = NULL; +} + +/* + * If possible (ie. blk_size[major] is set), this checks an area + * of a destination device is valid. + */ +static int check_device_area(kdev_t dev, sector_t start, sector_t len) +{ + int *sizes; + sector_t dev_size; + + if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)])) + /* we don't know the device details, + * so give the benefit of the doubt */ + return 1; + + /* convert to 512-byte sectors */ + dev_size <<= 1; + + return ((start < dev_size) && (len <= (dev_size - start))); +} + +/* + * This upgrades the mode on an already open dm_dev. Being + * careful to leave things as they were if we fail to reopen the + * device. + */ +static int upgrade_mode(struct dm_dev *dd, int new_mode) +{ + int r; + struct dm_dev dd_copy; + + memcpy(&dd_copy, dd, sizeof(dd_copy)); + + dd->mode |= new_mode; + dd->bdev = NULL; + r = open_dev(dd); + if (!r) + close_dev(&dd_copy); + else + memcpy(dd, &dd_copy, sizeof(dd_copy)); + + return r; +} + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result) +{ + int r; + kdev_t dev; + struct dm_dev *dd; + unsigned major, minor; + struct dm_table *t = ti->table; + + if (!t) + BUG(); + + if (sscanf(path, "%u:%u", &major, &minor) == 2) { + /* Extract the major/minor numbers */ + dev = mk_kdev(major, minor); + } else { + /* convert the path to a device */ + if ((r = lookup_device(path, &dev))) + return r; + } + + dd = find_device(&t->devices, dev); + if (!dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM; + + dd->dev = dev; + dd->mode = mode; + dd->bdev = NULL; + + if ((r = open_dev(dd))) { + kfree(dd); + return r; + } + + atomic_set(&dd->count, 0); + list_add(&dd->list, &t->devices); + + } else if (dd->mode != (mode | dd->mode)) { + r = upgrade_mode(dd, mode); + if (r) + return r; + } + atomic_inc(&dd->count); + + if (!check_device_area(dd->dev, start, len)) { + DMWARN("device %s too small for target", path); + dm_put_device(ti, dd); + return -EINVAL; + } + + *result = dd; + + return 0; +} + +/* + * Decrement a devices use count and remove it if neccessary. 
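Two details of dm_get_device() above are easy to check outside the kernel: a device may be named either as a "major:minor" pair (picked apart with the same sscanf pattern) or as a path, and the requested area must fit on the device, which is what check_device_area() tests. The sketch below is user-space only; the 1 GiB device size and the device string are assumptions.

#include <stdio.h>

typedef unsigned long long sketch_sector_t;

/* Same bounds test as check_device_area(): start and start+len on-device. */
static int area_ok(sketch_sector_t dev_size, sketch_sector_t start,
                   sketch_sector_t len)
{
        return start < dev_size && len <= dev_size - start;
}

int main(void)
{
        const char *spec = "254:3";             /* hypothetical device */
        unsigned major, minor;

        if (sscanf(spec, "%u:%u", &major, &minor) == 2)
                printf("parsed major=%u minor=%u\n", major, minor);
        else
                printf("'%s' would be treated as a path\n", spec);

        /* 1 GiB device = 2097152 sectors of 512 bytes. */
        printf("area 0+2097152 ok? %d\n", area_ok(2097152, 0, 2097152));
        printf("area 2097000+500 ok? %d\n", area_ok(2097152, 2097000, 500));
        return 0;
}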
+ */ +void dm_put_device(struct dm_target *ti, struct dm_dev *dd) +{ + if (atomic_dec_and_test(&dd->count)) { + close_dev(dd); + list_del(&dd->list); + kfree(dd); + } +} + +/* + * Checks to see if the target joins onto the end of the table. + */ +static int adjoin(struct dm_table *table, struct dm_target *ti) +{ + struct dm_target *prev; + + if (!table->num_targets) + return !ti->begin; + + prev = &table->targets[table->num_targets - 1]; + return (ti->begin == (prev->begin + prev->len)); +} + +/* + * Used to dynamically allocate the arg array. + */ +static char **realloc_argv(unsigned *array_size, char **old_argv) +{ + char **argv; + unsigned new_size; + + new_size = *array_size ? *array_size * 2 : 64; + argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (argv) { + memcpy(argv, old_argv, *array_size * sizeof(*argv)); + *array_size = new_size; + } + + kfree(old_argv); + return argv; +} + +/* + * Destructively splits up the argument list to pass to ctr. + */ +static int split_args(int *argc, char ***argvp, char *input) +{ + char *start, *end = input, *out, **argv = NULL; + unsigned array_size = 0; + + *argc = 0; + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + + while (1) { + start = end; + + /* Skip whitespace */ + while (*start && isspace(*start)) + start++; + + if (!*start) + break; /* success, we hit the end */ + + /* 'out' is used to remove any back-quotes */ + end = out = start; + while (*end) { + /* Everything apart from '\0' can be quoted */ + if (*end == '\\' && *(end + 1)) { + *out++ = *(end + 1); + end += 2; + continue; + } + + if (isspace(*end)) + break; /* end of token */ + + *out++ = *end++; + } + + /* have we already filled the array ? */ + if ((*argc + 1) > array_size) { + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + } + + /* we know this is whitespace */ + if (*end) + end++; + + /* terminate the string and put it in the array */ + *out = '\0'; + argv[*argc] = start; + (*argc)++; + } + + *argvp = argv; + return 0; +} + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params) +{ + int r = -EINVAL, argc; + char **argv; + struct dm_target *tgt; + + if (t->num_targets >= t->num_allocated) + return -ENOMEM; + + tgt = t->targets + t->num_targets; + memset(tgt, 0, sizeof(*tgt)); + + tgt->type = dm_get_target_type(type); + if (!tgt->type) { + tgt->error = "unknown target type"; + return -EINVAL; + } + + tgt->table = t; + tgt->begin = start; + tgt->len = len; + tgt->error = "Unknown error"; + + /* + * Does this target adjoin the previous one ? 
+ */ + if (!adjoin(t, tgt)) { + tgt->error = "Gap in table"; + r = -EINVAL; + goto bad; + } + + r = split_args(&argc, &argv, params); + if (r) { + tgt->error = "couldn't split parameters (insufficient memory)"; + goto bad; + } + + r = tgt->type->ctr(tgt, argc, argv); + kfree(argv); + if (r) + goto bad; + + t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + return 0; + + bad: + printk(KERN_ERR DM_NAME ": %s\n", tgt->error); + dm_put_target_type(tgt->type); + return r; +} + +static int setup_indexes(struct dm_table *t) +{ + int i; + unsigned int total = 0; + sector_t *indexes; + + /* allocate the space for *all* the indexes */ + for (i = t->depth - 2; i >= 0; i--) { + t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); + total += t->counts[i]; + } + + indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE); + if (!indexes) + return -ENOMEM; + + /* set up internal nodes, bottom-up */ + for (i = t->depth - 2, total = 0; i >= 0; i--) { + t->index[i] = indexes; + indexes += (KEYS_PER_NODE * t->counts[i]); + setup_btree_index(i, t); + } + + return 0; +} + +/* + * Builds the btree to index the map. + */ +int dm_table_complete(struct dm_table *t) +{ + int r = 0; + unsigned int leaf_nodes; + + /* how many indexes will the btree have ? */ + leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); + + /* leaf layer has already been set up */ + t->counts[t->depth - 1] = leaf_nodes; + t->index[t->depth - 1] = t->highs; + + if (t->depth >= 2) + r = setup_indexes(t); + + return r; +} + +static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED; +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context) +{ + spin_lock_irq(&_event_lock); + t->event_fn = fn; + t->event_context = context; + spin_unlock_irq(&_event_lock); +} + +void dm_table_event(struct dm_table *t) +{ + spin_lock(&_event_lock); + if (t->event_fn) + t->event_fn(t->event_context); + spin_unlock(&_event_lock); +} + +sector_t dm_table_get_size(struct dm_table *t) +{ + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; +} + +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) +{ + if (index > t->num_targets) + return NULL; + + return t->targets + index; +} + +/* + * Search the btree for the correct target. 
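The tokenizer above is self-contained enough to try in user space. The sketch below reproduces its behaviour (whitespace separates arguments, and a backslash escapes the next character so spaces can be embedded in one argument), but it uses a fixed-size argv instead of the realloc_argv() growth, purely to keep it short; the names are otherwise invented.

#include <stdio.h>
#include <ctype.h>

/* Destructively split 'input' into whitespace-separated tokens,
 * honouring backslash escapes, as split_args() does above. */
static int split_args_sketch(char *input, char **argv, int max)
{
        char *start, *end = input, *out;
        int argc = 0;

        while (argc < max) {
                start = end;
                while (*start && isspace((unsigned char) *start))
                        start++;
                if (!*start)
                        break;

                end = out = start;
                while (*end) {
                        if (*end == '\\' && *(end + 1)) {
                                *out++ = *(end + 1);
                                end += 2;
                                continue;
                        }
                        if (isspace((unsigned char) *end))
                                break;
                        *out++ = *end++;
                }
                if (*end)
                        end++;
                *out = '\0';
                argv[argc++] = start;
        }
        return argc;
}

int main(void)
{
        char line[] = "0 2048 linear /dev/with\\ space 0";
        char *argv[16];
        int argc = split_args_sketch(line, argv, 16), i;

        for (i = 0; i < argc; i++)
                printf("argv[%d] = '%s'\n", i, argv[i]);
        return 0;
}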
+ */ +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) +{ + unsigned int l, n = 0, k = 0; + sector_t *node; + + for (l = 0; l < t->depth; l++) { + n = get_child(n, k); + node = get_node(t, l, n); + + for (k = 0; k < KEYS_PER_NODE; k++) + if (node[k] >= sector) + break; + } + + return &t->targets[(KEYS_PER_NODE * n) + k]; +} + +unsigned int dm_table_get_num_targets(struct dm_table *t) +{ + return t->num_targets; +} + +struct list_head *dm_table_get_devices(struct dm_table *t) +{ + return &t->devices; +} + +int dm_table_get_mode(struct dm_table *t) +{ + return t->mode; +} + +void dm_table_suspend_targets(struct dm_table *t) +{ + int i; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (ti->type->suspend) + ti->type->suspend(ti); + } +} + +void dm_table_resume_targets(struct dm_table *t) +{ + int i; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (ti->type->resume) + ti->type->resume(ti); + } +} + +EXPORT_SYMBOL(dm_get_device); +EXPORT_SYMBOL(dm_put_device); +EXPORT_SYMBOL(dm_table_event); +EXPORT_SYMBOL(dm_table_get_mode); --- linux-2.4.22/drivers/md/dm-target.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm-target.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include + +struct tt_internal { + struct target_type tt; + + struct list_head list; + long use; +}; + +static LIST_HEAD(_targets); +static DECLARE_RWSEM(_lock); + +#define DM_MOD_NAME_SIZE 32 + +static inline struct tt_internal *__find_target_type(const char *name) +{ + struct list_head *tih; + struct tt_internal *ti; + + list_for_each(tih, &_targets) { + ti = list_entry(tih, struct tt_internal, list); + + if (!strcmp(name, ti->tt.name)) + return ti; + } + + return NULL; +} + +static struct tt_internal *get_target_type(const char *name) +{ + struct tt_internal *ti; + + down_read(&_lock); + ti = __find_target_type(name); + + if (ti) { + if (ti->use == 0 && ti->tt.module) + __MOD_INC_USE_COUNT(ti->tt.module); + ti->use++; + } + up_read(&_lock); + + return ti; +} + +static void load_module(const char *name) +{ + char module_name[DM_MOD_NAME_SIZE] = "dm-"; + + /* Length check for strcat() below */ + if (strlen(name) > (DM_MOD_NAME_SIZE - 4)) + return; + + strcat(module_name, name); + request_module(module_name); +} + +struct target_type *dm_get_target_type(const char *name) +{ + struct tt_internal *ti = get_target_type(name); + + if (!ti) { + load_module(name); + ti = get_target_type(name); + } + + return ti ? 
&ti->tt : NULL; +} + +void dm_put_target_type(struct target_type *t) +{ + struct tt_internal *ti = (struct tt_internal *) t; + + down_read(&_lock); + if (--ti->use == 0 && ti->tt.module) + __MOD_DEC_USE_COUNT(ti->tt.module); + + if (ti->use < 0) + BUG(); + up_read(&_lock); + + return; +} + +static struct tt_internal *alloc_target(struct target_type *t) +{ + struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + + if (ti) { + memset(ti, 0, sizeof(*ti)); + ti->tt = *t; + } + + return ti; +} + +int dm_register_target(struct target_type *t) +{ + int rv = 0; + struct tt_internal *ti = alloc_target(t); + + if (!ti) + return -ENOMEM; + + down_write(&_lock); + if (__find_target_type(t->name)) { + kfree(ti); + rv = -EEXIST; + } else + list_add(&ti->list, &_targets); + + up_write(&_lock); + return rv; +} + +int dm_unregister_target(struct target_type *t) +{ + struct tt_internal *ti; + + down_write(&_lock); + if (!(ti = __find_target_type(t->name))) { + up_write(&_lock); + return -EINVAL; + } + + if (ti->use) { + up_write(&_lock); + return -ETXTBSY; + } + + list_del(&ti->list); + kfree(ti); + + up_write(&_lock); + return 0; +} + +/* + * io-err: always fails an io, useful for bringing + * up LVs that have holes in them. + */ +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +{ + return 0; +} + +static void io_err_dtr(struct dm_target *ti) +{ + /* empty */ +} + +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + return -EIO; +} + +static struct target_type error_target = { + .name = "error", + .ctr = io_err_ctr, + .dtr = io_err_dtr, + .map = io_err_map, +}; + +int dm_target_init(void) +{ + return dm_register_target(&error_target); +} + +void dm_target_exit(void) +{ + if (dm_unregister_target(&error_target)) + DMWARN("error target unregistration failed"); +} + +EXPORT_SYMBOL(dm_register_target); +EXPORT_SYMBOL(dm_unregister_target); --- linux-2.4.22/drivers/md/dm.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,1115 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "kcopyd.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const char *_name = DM_NAME; +#define DEFAULT_READ_AHEAD 64 + +struct dm_io { + struct mapped_device *md; + + struct dm_target *ti; + int rw; + union map_info map_context; + void (*end_io) (struct buffer_head * bh, int uptodate); + void *context; +}; + +struct deferred_io { + int rw; + struct buffer_head *bh; + struct deferred_io *next; +}; + +/* + * Bits for the md->flags field. + */ +#define DMF_BLOCK_IO 0 +#define DMF_SUSPENDED 1 + +struct mapped_device { + struct rw_semaphore lock; + atomic_t holders; + + kdev_t dev; + unsigned long flags; + + /* + * A list of ios that arrived while we were suspended. + */ + atomic_t pending; + wait_queue_head_t wait; + struct deferred_io *deferred; + + /* + * The current mapping. + */ + struct dm_table *map; + + /* + * io objects are allocated from here. + */ + mempool_t *io_pool; + + /* + * Event handling. 
+ */ + uint32_t event_nr; + wait_queue_head_t eventq; +}; + +#define MIN_IOS 256 +static kmem_cache_t *_io_cache; + +static struct mapped_device *get_kdev(kdev_t dev); +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh); +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb); + +/*----------------------------------------------------------------- + * In order to avoid the 256 minor number limit we are going to + * register more major numbers as neccessary. + *---------------------------------------------------------------*/ +#define MAX_MINORS (1 << MINORBITS) + +struct major_details { + unsigned int major; + + int transient; + struct list_head transient_list; + + unsigned int first_free_minor; + int nr_free_minors; + + struct mapped_device *mds[MAX_MINORS]; + int blk_size[MAX_MINORS]; + int blksize_size[MAX_MINORS]; + int hardsect_size[MAX_MINORS]; +}; + +static struct rw_semaphore _dev_lock; +static struct major_details *_majors[MAX_BLKDEV]; + +/* + * This holds a list of majors that non-specified device numbers + * may be allocated from. Only majors with free minors appear on + * this list. + */ +static LIST_HEAD(_transients_free); + +static int __alloc_major(unsigned int major, struct major_details **result) +{ + int r; + unsigned int transient = !major; + struct major_details *maj; + + /* Major already allocated? */ + if (major && _majors[major]) + return 0; + + maj = kmalloc(sizeof(*maj), GFP_KERNEL); + if (!maj) + return -ENOMEM; + + memset(maj, 0, sizeof(*maj)); + INIT_LIST_HEAD(&maj->transient_list); + + maj->nr_free_minors = MAX_MINORS; + + r = register_blkdev(major, _name, &dm_blk_dops); + if (r < 0) { + DMERR("register_blkdev failed for %d", major); + kfree(maj); + return r; + } + if (r > 0) + major = r; + + maj->major = major; + + if (transient) { + maj->transient = transient; + list_add_tail(&maj->transient_list, &_transients_free); + } + + _majors[major] = maj; + + blk_size[major] = maj->blk_size; + blksize_size[major] = maj->blksize_size; + hardsect_size[major] = maj->hardsect_size; + read_ahead[major] = DEFAULT_READ_AHEAD; + + blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request); + + *result = maj; + return 0; +} + +static void __free_major(struct major_details *maj) +{ + unsigned int major = maj->major; + + list_del(&maj->transient_list); + + read_ahead[major] = 0; + blk_size[major] = NULL; + blksize_size[major] = NULL; + hardsect_size[major] = NULL; + + _majors[major] = NULL; + kfree(maj); + + if (unregister_blkdev(major, _name) < 0) + DMERR("devfs_unregister_blkdev failed"); +} + +static void free_all_majors(void) +{ + unsigned int major = ARRAY_SIZE(_majors); + + down_write(&_dev_lock); + + while (major--) + if (_majors[major]) + __free_major(_majors[major]); + + up_write(&_dev_lock); +} + +static void free_dev(kdev_t dev) +{ + unsigned int major = major(dev); + unsigned int minor = minor(dev); + struct major_details *maj; + + down_write(&_dev_lock); + + maj = _majors[major]; + if (!maj) + goto out; + + maj->mds[minor] = NULL; + maj->nr_free_minors++; + + if (maj->nr_free_minors == MAX_MINORS) { + __free_major(maj); + goto out; + } + + if (!maj->transient) + goto out; + + if (maj->nr_free_minors == 1) + list_add_tail(&maj->transient_list, &_transients_free); + + if (minor < maj->first_free_minor) + maj->first_free_minor = minor; + + out: + up_write(&_dev_lock); +} + +static void __alloc_minor(struct major_details *maj, unsigned int minor, + struct mapped_device *md) +{ + maj->mds[minor] = md; + md->dev = mk_kdev(maj->major, 
minor); + maj->nr_free_minors--; + + if (maj->transient && !maj->nr_free_minors) + list_del_init(&maj->transient_list); +} + +/* + * See if requested kdev_t is available. + */ +static int specific_dev(kdev_t dev, struct mapped_device *md) +{ + int r = 0; + unsigned int major = major(dev); + unsigned int minor = minor(dev); + struct major_details *maj; + + if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) { + DMWARN("device number requested out of range (%d, %d)", + major, minor); + return -EINVAL; + } + + down_write(&_dev_lock); + maj = _majors[major]; + + /* Register requested major? */ + if (!maj) { + r = __alloc_major(major, &maj); + if (r) + goto out; + + major = maj->major; + } + + if (maj->mds[minor]) { + r = -EBUSY; + goto out; + } + + __alloc_minor(maj, minor, md); + + out: + up_write(&_dev_lock); + + return r; +} + +/* + * Find first unused device number, requesting a new major number if required. + */ +static int first_free_dev(struct mapped_device *md) +{ + int r = 0; + struct major_details *maj; + + down_write(&_dev_lock); + + if (list_empty(&_transients_free)) { + r = __alloc_major(0, &maj); + if (r) + goto out; + } else + maj = list_entry(_transients_free.next, struct major_details, + transient_list); + + while (maj->mds[maj->first_free_minor++]) + ; + + __alloc_minor(maj, maj->first_free_minor - 1, md); + + out: + up_write(&_dev_lock); + + return r; +} + +static struct mapped_device *get_kdev(kdev_t dev) +{ + struct mapped_device *md; + struct major_details *maj; + + down_read(&_dev_lock); + maj = _majors[major(dev)]; + if (!maj) { + md = NULL; + goto out; + } + md = maj->mds[minor(dev)]; + if (md) + dm_get(md); + out: + up_read(&_dev_lock); + + return md; +} + +/*----------------------------------------------------------------- + * init/exit code + *---------------------------------------------------------------*/ + +static __init int local_init(void) +{ + init_rwsem(&_dev_lock); + + /* allocate a slab for the dm_ios */ + _io_cache = kmem_cache_create("dm io", + sizeof(struct dm_io), 0, 0, NULL, NULL); + + if (!_io_cache) + return -ENOMEM; + + return 0; +} + +static void local_exit(void) +{ + kmem_cache_destroy(_io_cache); + free_all_majors(); + + DMINFO("cleaned up"); +} + +/* + * We have a lot of init/exit functions, so it seems easier to + * store them in an array. The disposable macro 'xx' + * expands a prefix into a pair of function names. 
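The comment above describes the scheme; as a stand-alone illustration (all names invented), the same pattern in user-space C is an array of init/exit pairs walked forward at start-up and unwound in reverse order when one of them fails, which is exactly what dm_init() and dm_exit() below do with the real subsystems.

#include <stdio.h>

static int a_init(void)  { printf("a up\n");     return 0; }
static void a_exit(void) { printf("a down\n"); }
static int b_init(void)  { printf("b up\n");     return 0; }
static void b_exit(void) { printf("b down\n"); }
static int c_init(void)  { printf("c failed\n"); return -1; }
static void c_exit(void) { printf("c down\n"); }

static struct {
        int (*init)(void);
        void (*exit)(void);
} inits[] = {
        { a_init, a_exit },
        { b_init, b_exit },
        { c_init, c_exit },
};

int main(void)
{
        const int count = sizeof(inits) / sizeof(inits[0]);
        int r = 0, i;

        for (i = 0; i < count; i++) {
                r = inits[i].init();
                if (r)
                        break;
        }
        if (r)
                while (i--)                     /* unwind in reverse order */
                        inits[i].exit();
        return r ? 1 : 0;
}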
+ */ +static struct { + int (*init) (void); + void (*exit) (void); + +} _inits[] = { +#define xx(n) {n ## _init, n ## _exit}, + xx(local) + xx(kcopyd) + xx(dm_target) + xx(dm_linear) + xx(dm_stripe) + xx(dm_snapshot) + xx(dm_interface) +#undef xx +}; + +static int __init dm_init(void) +{ + const int count = ARRAY_SIZE(_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i].init(); + if (r) + goto bad; + } + + return 0; + + bad: + while (i--) + _inits[i].exit(); + + return r; +} + +static void __exit dm_exit(void) +{ + int i = ARRAY_SIZE(_inits); + + while (i--) + _inits[i].exit(); +} + +/* + * Block device functions + */ +static int dm_blk_open(struct inode *inode, struct file *file) +{ + struct mapped_device *md; + + md = get_kdev(inode->i_rdev); + if (!md) + return -ENXIO; + + return 0; +} + +static int dm_blk_close(struct inode *inode, struct file *file) +{ + struct mapped_device *md; + + md = get_kdev(inode->i_rdev); + dm_put(md); /* put the reference gained by dm_blk_open */ + dm_put(md); + return 0; +} + +static inline struct dm_io *alloc_io(struct mapped_device *md) +{ + return mempool_alloc(md->io_pool, GFP_NOIO); +} + +static inline void free_io(struct mapped_device *md, struct dm_io *io) +{ + mempool_free(io, md->io_pool); +} + +static inline struct deferred_io *alloc_deferred(void) +{ + return kmalloc(sizeof(struct deferred_io), GFP_NOIO); +} + +static inline void free_deferred(struct deferred_io *di) +{ + kfree(di); +} + +static inline sector_t volume_size(kdev_t dev) +{ + return blk_size[major(dev)][minor(dev)] << 1; +} + +/* FIXME: check this */ +static int dm_blk_ioctl(struct inode *inode, struct file *file, + unsigned int command, unsigned long a) +{ + kdev_t dev = inode->i_rdev; + long size; + + switch (command) { + case BLKROSET: + case BLKROGET: + case BLKRASET: + case BLKRAGET: + case BLKFLSBUF: + case BLKSSZGET: + //case BLKRRPART: /* Re-read partition tables */ + //case BLKPG: + case BLKELVGET: + case BLKELVSET: + case BLKBSZGET: + case BLKBSZSET: + return blk_ioctl(dev, command, a); + break; + + case BLKGETSIZE: + size = volume_size(dev); + if (copy_to_user((void *) a, &size, sizeof(long))) + return -EFAULT; + break; + + case BLKGETSIZE64: + size = volume_size(dev); + if (put_user((u64) ((u64) size) << 9, (u64 *) a)) + return -EFAULT; + break; + + case BLKRRPART: + return -ENOTTY; + + case LV_BMAP: + return dm_user_bmap(inode, (struct lv_bmap *) a); + + default: + DMWARN("unknown block ioctl 0x%x", command); + return -ENOTTY; + } + + return 0; +} + +/* + * Add the buffer to the list of deferred io. + */ +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw) +{ + struct deferred_io *di; + + di = alloc_deferred(); + if (!di) + return -ENOMEM; + + down_write(&md->lock); + + if (!test_bit(DMF_BLOCK_IO, &md->flags)) { + up_write(&md->lock); + free_deferred(di); + return 1; + } + + di->bh = bh; + di->rw = rw; + di->next = md->deferred; + md->deferred = di; + + up_write(&md->lock); + return 0; /* deferred successfully */ +} + +/* + * bh->b_end_io routine that decrements the pending count + * and then calls the original bh->b_end_io fn. + */ +static void dec_pending(struct buffer_head *bh, int uptodate) +{ + int r; + struct dm_io *io = bh->b_private; + dm_endio_fn endio = io->ti->type->end_io; + + if (endio) { + r = endio(io->ti, bh, io->rw, uptodate ? 
0 : -EIO, + &io->map_context); + if (r < 0) + uptodate = 0; + + else if (r > 0) + /* the target wants another shot at the io */ + return; + } + + if (atomic_dec_and_test(&io->md->pending)) + /* nudge anyone waiting on suspend queue */ + wake_up(&io->md->wait); + + bh->b_end_io = io->end_io; + bh->b_private = io->context; + free_io(io->md, io); + + bh->b_end_io(bh, uptodate); +} + +/* + * Do the bh mapping for a given leaf + */ +static inline int __map_buffer(struct mapped_device *md, int rw, + struct buffer_head *bh, struct dm_io *io) +{ + struct dm_target *ti; + + if (!md->map) + return -EINVAL; + + ti = dm_table_find_target(md->map, bh->b_rsector); + if (!ti->type) + return -EINVAL; + + /* hook the end io request fn */ + atomic_inc(&md->pending); + io->md = md; + io->ti = ti; + io->rw = rw; + io->end_io = bh->b_end_io; + io->context = bh->b_private; + bh->b_end_io = dec_pending; + bh->b_private = io; + + return ti->type->map(ti, bh, rw, &io->map_context); +} + +/* + * Checks to see if we should be deferring io, if so it queues it + * and returns 1. + */ +static inline int __deferring(struct mapped_device *md, int rw, + struct buffer_head *bh) +{ + int r; + + /* + * If we're suspended we have to queue this io for later. + */ + while (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_read(&md->lock); + + /* + * There's no point deferring a read ahead + * request, just drop it. + */ + if (rw == READA) { + down_read(&md->lock); + return -EIO; + } + + r = queue_io(md, bh, rw); + down_read(&md->lock); + + if (r < 0) + return r; + + if (r == 0) + return 1; /* deferred successfully */ + + } + + return 0; +} + +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh) +{ + int r; + struct dm_io *io; + struct mapped_device *md; + + md = get_kdev(bh->b_rdev); + if (!md) { + buffer_IO_error(bh); + return 0; + } + + io = alloc_io(md); + down_read(&md->lock); + + r = __deferring(md, rw, bh); + if (r < 0) + goto bad; + + else if (!r) { + /* not deferring */ + r = __map_buffer(md, rw, bh, io); + if (r < 0) + goto bad; + } else + r = 0; + + up_read(&md->lock); + dm_put(md); + return r; + + bad: + buffer_IO_error(bh); + up_read(&md->lock); + dm_put(md); + return 0; +} + +static int check_dev_size(kdev_t dev, unsigned long block) +{ + unsigned int major = major(dev); + unsigned int minor = minor(dev); + + /* FIXME: check this */ + unsigned long max_sector = (blk_size[major][minor] << 1) + 1; + unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9); + + return (sector > max_sector) ? 0 : 1; +} + +/* + * Creates a dummy buffer head and maps it (for lilo). 
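The unit juggling in check_dev_size() above is worth spelling out: blk_size[] holds the device size in 1 KiB units (hence the left shift by one to reach 512-byte sectors), while blksize_size[] holds the block size in bytes (hence the right shift by nine). A user-space sketch with assumed sizes:

#include <stdio.h>

/* Same arithmetic as check_dev_size(): is 'block' within the device? */
static int check_dev_size_sketch(unsigned long size_kib,
                                 unsigned long block_bytes,
                                 unsigned long block)
{
        unsigned long max_sector = (size_kib << 1) + 1;
        unsigned long sector = (block + 1) * (block_bytes >> 9);

        return sector > max_sector ? 0 : 1;
}

int main(void)
{
        /* Assumed: 1 MiB device (1024 KiB), 4 KiB blocks -> 256 blocks. */
        printf("block 255 ok? %d\n", check_dev_size_sketch(1024, 4096, 255));
        printf("block 256 ok? %d\n", check_dev_size_sketch(1024, 4096, 256));
        return 0;
}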
+ */ +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block, + kdev_t *r_dev, unsigned long *r_block) +{ + struct buffer_head bh; + struct dm_target *ti; + union map_info map_context; + int r; + + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + return -EPERM; + } + + if (!check_dev_size(dev, block)) { + return -EINVAL; + } + + if (!md->map) + return -EINVAL; + + /* setup dummy bh */ + memset(&bh, 0, sizeof(bh)); + bh.b_blocknr = block; + bh.b_dev = bh.b_rdev = dev; + bh.b_size = blksize_size[major(dev)][minor(dev)]; + bh.b_rsector = block * (bh.b_size >> 9); + + /* find target */ + ti = dm_table_find_target(md->map, bh.b_rsector); + + /* do the mapping */ + r = ti->type->map(ti, &bh, READ, &map_context); + ti->type->end_io(ti, &bh, READ, 0, &map_context); + + if (!r) { + *r_dev = bh.b_rdev; + *r_block = bh.b_rsector / (bh.b_size >> 9); + } + + return r; +} + +/* + * Marshals arguments and results between user and kernel space. + */ +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb) +{ + struct mapped_device *md; + unsigned long block, r_block; + kdev_t r_dev; + int r; + + if (get_user(block, &lvb->lv_block)) + return -EFAULT; + + md = get_kdev(inode->i_rdev); + if (!md) + return -ENXIO; + + down_read(&md->lock); + r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block); + up_read(&md->lock); + dm_put(md); + + if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) || + put_user(r_block, &lvb->lv_block))) + r = -EFAULT; + + return r; +} + +static void free_md(struct mapped_device *md) +{ + free_dev(md->dev); + mempool_destroy(md->io_pool); + kfree(md); +} + +/* + * Allocate and initialise a blank device with a given minor. + */ +static struct mapped_device *alloc_md(kdev_t dev) +{ + int r; + struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); + + if (!md) { + DMWARN("unable to allocate device, out of memory."); + return NULL; + } + + memset(md, 0, sizeof(*md)); + + /* Allocate suitable device number */ + if (!dev) + r = first_free_dev(md); + else + r = specific_dev(dev, md); + + if (r) { + kfree(md); + return NULL; + } + + md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, + mempool_free_slab, _io_cache); + if (!md->io_pool) { + free_md(md); + kfree(md); + return NULL; + } + + init_rwsem(&md->lock); + atomic_set(&md->holders, 1); + atomic_set(&md->pending, 0); + init_waitqueue_head(&md->wait); + init_waitqueue_head(&md->eventq); + + return md; +} + +/* + * The hardsect size for a mapped device is the largest hardsect size + * from the devices it maps onto. + */ +static int __find_hardsect_size(struct list_head *devices) +{ + int result = 512, size; + struct list_head *tmp; + + list_for_each (tmp, devices) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + size = get_hardsect_size(dd->dev); + if (size > result) + result = size; + } + + return result; +} + +/* + * Bind a table to the device. 
+ */ +static void event_callback(void *context) +{ + struct mapped_device *md = (struct mapped_device *) context; + + down_write(&md->lock); + md->event_nr++; + wake_up_interruptible(&md->eventq); + up_write(&md->lock); +} + +static int __bind(struct mapped_device *md, struct dm_table *t) +{ + unsigned int minor = minor(md->dev); + unsigned int major = major(md->dev); + md->map = t; + + /* in k */ + blk_size[major][minor] = dm_table_get_size(t) >> 1; + blksize_size[major][minor] = BLOCK_SIZE; + hardsect_size[major][minor] = + __find_hardsect_size(dm_table_get_devices(t)); + register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]); + + dm_table_event_callback(md->map, event_callback, md); + dm_table_get(t); + return 0; +} + +static void __unbind(struct mapped_device *md) +{ + unsigned int minor = minor(md->dev); + unsigned int major = major(md->dev); + + if (md->map) { + dm_table_event_callback(md->map, NULL, NULL); + dm_table_put(md->map); + md->map = NULL; + + } + + blk_size[major][minor] = 0; + blksize_size[major][minor] = 0; + hardsect_size[major][minor] = 0; +} + +/* + * Constructor for a new device. + */ +int dm_create(kdev_t dev, struct mapped_device **result) +{ + struct mapped_device *md; + + md = alloc_md(dev); + if (!md) + return -ENXIO; + + __unbind(md); /* Ensure zero device size */ + + *result = md; + return 0; +} + +void dm_get(struct mapped_device *md) +{ + atomic_inc(&md->holders); +} + +void dm_put(struct mapped_device *md) +{ + if (atomic_dec_and_test(&md->holders)) { + if (md->map) + dm_table_suspend_targets(md->map); + __unbind(md); + free_md(md); + } +} + +/* + * Requeue the deferred io by calling generic_make_request. + */ +static void flush_deferred_io(struct deferred_io *c) +{ + struct deferred_io *n; + + while (c) { + n = c->next; + generic_make_request(c->rw, c->bh); + free_deferred(c); + c = n; + } +} + +/* + * Swap in a new table (destroying old one). + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *table) +{ + int r; + + down_write(&md->lock); + + /* + * The device must be suspended, or have no table bound yet. + */ + if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) { + up_write(&md->lock); + return -EPERM; + } + + __unbind(md); + r = __bind(md, table); + if (r) + return r; + + up_write(&md->lock); + return 0; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight io and ensure that any further io gets deferred. + */ +int dm_suspend(struct mapped_device *md) +{ + int r = 0; + DECLARE_WAITQUEUE(wait, current); + + down_write(&md->lock); + + /* + * First we set the BLOCK_IO flag so no more ios will be + * mapped. + */ + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_write(&md->lock); + return -EINVAL; + } + + set_bit(DMF_BLOCK_IO, &md->flags); + add_wait_queue(&md->wait, &wait); + up_write(&md->lock); + + /* + * Then we wait for the already mapped ios to + * complete. + */ + run_task_queue(&tq_disk); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!atomic_read(&md->pending) || signal_pending(current)) + break; + + schedule(); + } + set_current_state(TASK_RUNNING); + + down_write(&md->lock); + remove_wait_queue(&md->wait, &wait); + + /* did we flush everything ? 
*/ + if (atomic_read(&md->pending)) { + clear_bit(DMF_BLOCK_IO, &md->flags); + r = -EINTR; + } else { + set_bit(DMF_SUSPENDED, &md->flags); + if (md->map) + dm_table_suspend_targets(md->map); + } + up_write(&md->lock); + + return r; +} + +int dm_resume(struct mapped_device *md) +{ + struct deferred_io *def; + + down_write(&md->lock); + if (!test_bit(DMF_SUSPENDED, &md->flags)) { + up_write(&md->lock); + return -EINVAL; + } + + if (md->map) + dm_table_resume_targets(md->map); + + clear_bit(DMF_SUSPENDED, &md->flags); + clear_bit(DMF_BLOCK_IO, &md->flags); + def = md->deferred; + md->deferred = NULL; + up_write(&md->lock); + + flush_deferred_io(def); + run_task_queue(&tq_disk); + + return 0; +} + +struct dm_table *dm_get_table(struct mapped_device *md) +{ + struct dm_table *t; + + down_read(&md->lock); + t = md->map; + if (t) + dm_table_get(t); + up_read(&md->lock); + + return t; +} + +/*----------------------------------------------------------------- + * Event notification. + *---------------------------------------------------------------*/ +uint32_t dm_get_event_nr(struct mapped_device *md) +{ + uint32_t r; + + down_read(&md->lock); + r = md->event_nr; + up_read(&md->lock); + + return r; +} + +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq, + uint32_t event_nr) +{ + down_write(&md->lock); + if (event_nr != md->event_nr) { + up_write(&md->lock); + return 1; + } + + add_wait_queue(&md->eventq, wq); + up_write(&md->lock); + + return 0; +} + +const char *dm_kdevname(kdev_t dev) +{ + static char buffer[32]; + sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev)); + return buffer; +} + +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq) +{ + down_write(&md->lock); + remove_wait_queue(&md->eventq, wq); + up_write(&md->lock); +} + +kdev_t dm_kdev(struct mapped_device *md) +{ + kdev_t dev; + + down_read(&md->lock); + dev = md->dev; + up_read(&md->lock); + + return dev; +} + +int dm_suspended(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED, &md->flags); +} + +struct block_device_operations dm_blk_dops = { + .open = dm_blk_open, + .release = dm_blk_close, + .ioctl = dm_blk_ioctl, + .owner = THIS_MODULE +}; + +/* + * module hooks + */ +module_init(dm_init); +module_exit(dm_exit); + +MODULE_DESCRIPTION(DM_NAME " driver"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(dm_kdevname); --- linux-2.4.22/drivers/md/dm.h Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/dm.h Thu Nov 20 18:29:35 2003 @@ -0,0 +1,175 @@ +/* + * Internal header file for device mapper + * + * Copyright (C) 2001, 2002 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_INTERNAL_H +#define DM_INTERNAL_H + +#include +#include +#include +#include + +#define DM_NAME "device-mapper" +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) + +/* + * FIXME: I think this should be with the definition of sector_t + * in types.h. + */ +#ifdef CONFIG_LBD +#define SECTOR_FORMAT "%Lu" +#else +#define SECTOR_FORMAT "%lu" +#endif + +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1 << SECTOR_SHIFT) + +extern struct block_device_operations dm_blk_dops; + +/* + * List of devices that a metadevice uses and should open/close. 
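The comment above dm_suspend() describes the intended calling sequence. As an illustration (not part of the patch; the ioctl layer in dm-ioctl.c is what really drives this), replacing the table of a live device is expected to look roughly like the sketch below. dm_swap_table() refuses with -EPERM unless the device is suspended or has no table yet, which is why the suspend comes first.

static int example_replace_table(struct mapped_device *md,
                                 struct dm_table *new_table)
{
        int r;

        /* Flush in-flight io and start deferring new io. */
        r = dm_suspend(md);
        if (r)
                return r;

        /* The old table (if any) is unbound and dropped by dm_swap_table(). */
        r = dm_swap_table(md, new_table);
        if (r) {
                dm_resume(md);
                return r;
        }

        /* Re-issue anything that was deferred while suspended. */
        return dm_resume(md);
}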
+ */ +struct dm_dev { + struct list_head list; + + atomic_t count; + int mode; + kdev_t dev; + struct block_device *bdev; +}; + +struct dm_table; +struct mapped_device; + +/*----------------------------------------------------------------- + * Functions for manipulating a struct mapped_device. + * Drop the reference with dm_put when you finish with the object. + *---------------------------------------------------------------*/ +int dm_create(kdev_t dev, struct mapped_device **md); + +/* + * Reference counting for md. + */ +void dm_get(struct mapped_device *md); +void dm_put(struct mapped_device *md); + +/* + * A device can still be used while suspended, but I/O is deferred. + */ +int dm_suspend(struct mapped_device *md); +int dm_resume(struct mapped_device *md); + +/* + * The device must be suspended before calling this method. + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *t); + +/* + * Drop a reference on the table when you've finished with the + * result. + */ +struct dm_table *dm_get_table(struct mapped_device *md); + +/* + * Event functions. + */ +uint32_t dm_get_event_nr(struct mapped_device *md); +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq, + uint32_t event_nr); +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq); + +/* + * Info functions. + */ +kdev_t dm_kdev(struct mapped_device *md); +int dm_suspended(struct mapped_device *md); + +/*----------------------------------------------------------------- + * Functions for manipulating a table. Tables are also reference + * counted. + *---------------------------------------------------------------*/ +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets); + +void dm_table_get(struct dm_table *t); +void dm_table_put(struct dm_table *t); + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params); +int dm_table_complete(struct dm_table *t); +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context); +void dm_table_event(struct dm_table *t); +sector_t dm_table_get_size(struct dm_table *t); +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +unsigned int dm_table_get_num_targets(struct dm_table *t); +struct list_head *dm_table_get_devices(struct dm_table *t); +int dm_table_get_mode(struct dm_table *t); +void dm_table_suspend_targets(struct dm_table *t); +void dm_table_resume_targets(struct dm_table *t); + +/*----------------------------------------------------------------- + * A registry of target types. + *---------------------------------------------------------------*/ +int dm_target_init(void); +void dm_target_exit(void); +struct target_type *dm_get_target_type(const char *name); +void dm_put_target_type(struct target_type *t); + + +/*----------------------------------------------------------------- + * Useful inlines. + *---------------------------------------------------------------*/ +static inline int array_too_big(unsigned long fixed, unsigned long obj, + unsigned long num) +{ + return (num > (ULONG_MAX - fixed) / obj); +} + +/* + * ceiling(n / size) * size + */ +static inline unsigned long dm_round_up(unsigned long n, unsigned long size) +{ + unsigned long r = n % size; + return n + (r ? 
(size - r) : 0); +} + +/* + * Ceiling(n / size) + */ +static inline unsigned long dm_div_up(unsigned long n, unsigned long size) +{ + return dm_round_up(n, size) / size; +} + +const char *dm_kdevname(kdev_t dev); + +/* + * The device-mapper can be driven through one of two interfaces; + * ioctl or filesystem, depending which patch you have applied. + */ +int dm_interface_init(void); +void dm_interface_exit(void); + +/* + * Targets for linear and striped mappings + */ +int dm_linear_init(void); +void dm_linear_exit(void); + +int dm_stripe_init(void); +void dm_stripe_exit(void); + +int dm_snapshot_init(void); +void dm_snapshot_exit(void); + +#endif --- linux-2.4.22/drivers/md/kcopyd.c Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/kcopyd.c Thu Nov 20 18:29:35 2003 @@ -0,0 +1,666 @@ +/* + * Copyright (C) 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcopyd.h" +#include "dm-daemon.h" + +/* FIXME: this is only needed for the DMERR macros */ +#include "dm.h" + +static struct dm_daemon _kcopyd; + +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE) +#define SUB_JOB_SIZE 128 +#define PAGES_PER_SUB_JOB (SUB_JOB_SIZE / SECTORS_PER_PAGE) +#define SUB_JOB_COUNT 8 + +/*----------------------------------------------------------------- + * Each kcopyd client has its own little pool of preallocated + * pages for kcopyd io. + *---------------------------------------------------------------*/ +struct kcopyd_client { + struct list_head list; + + spinlock_t lock; + struct list_head pages; + unsigned int nr_pages; + unsigned int nr_free_pages; + unsigned int max_split; +}; + +static inline void __push_page(struct kcopyd_client *kc, struct page *p) +{ + list_add(&p->list, &kc->pages); + kc->nr_free_pages++; +} + +static inline struct page *__pop_page(struct kcopyd_client *kc) +{ + struct page *p; + + p = list_entry(kc->pages.next, struct page, list); + list_del(&p->list); + kc->nr_free_pages--; + + return p; +} + +static int kcopyd_get_pages(struct kcopyd_client *kc, + unsigned int nr, struct list_head *pages) +{ + struct page *p; + INIT_LIST_HEAD(pages); + + spin_lock(&kc->lock); + if (kc->nr_free_pages < nr) { + spin_unlock(&kc->lock); + return -ENOMEM; + } + + while (nr--) { + p = __pop_page(kc); + list_add(&p->list, pages); + } + spin_unlock(&kc->lock); + + return 0; +} + +static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages) +{ + struct list_head *tmp, *tmp2; + + spin_lock(&kc->lock); + list_for_each_safe (tmp, tmp2, pages) + __push_page(kc, list_entry(tmp, struct page, list)); + spin_unlock(&kc->lock); +} + +/* + * These three functions resize the page pool. 
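A couple of worked values for the rounding helpers at the end of dm.h above; run_pages_job() further down uses dm_div_up() in exactly this way to turn a sector count into a whole number of pages.

/*
 *   dm_round_up(1000, 512) == 1024   (1000 % 512 = 488, so add 512 - 488 = 24)
 *   dm_round_up(1024, 512) == 1024   (already a multiple, nothing added)
 *   dm_div_up(1000, 512)   == 2      (ceiling of 1000 / 512)
 */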
+ */ +static void release_pages(struct list_head *pages) +{ + struct page *p; + struct list_head *tmp, *tmp2; + + list_for_each_safe (tmp, tmp2, pages) { + p = list_entry(tmp, struct page, list); + UnlockPage(p); + __free_page(p); + } +} + +static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr) +{ + unsigned int i; + struct page *p; + LIST_HEAD(new); + + for (i = 0; i < nr; i++) { + p = alloc_page(GFP_KERNEL); + if (!p) { + release_pages(&new); + return -ENOMEM; + } + + LockPage(p); + list_add(&p->list, &new); + } + + kcopyd_put_pages(kc, &new); + kc->nr_pages += nr; + kc->max_split = kc->nr_pages / PAGES_PER_SUB_JOB; + if (kc->max_split > SUB_JOB_COUNT) + kc->max_split = SUB_JOB_COUNT; + + return 0; +} + +static void client_free_pages(struct kcopyd_client *kc) +{ + BUG_ON(kc->nr_free_pages != kc->nr_pages); + release_pages(&kc->pages); + kc->nr_free_pages = kc->nr_pages = 0; +} + +/*----------------------------------------------------------------- + * kcopyd_jobs need to be allocated by the *clients* of kcopyd, + * for this reason we use a mempool to prevent the client from + * ever having to do io (which could cause a deadlock). + *---------------------------------------------------------------*/ +struct kcopyd_job { + struct kcopyd_client *kc; + struct list_head list; + unsigned int flags; + + /* + * Error state of the job. + */ + int read_err; + unsigned int write_err; + + /* + * Either READ or WRITE + */ + int rw; + struct io_region source; + + /* + * The destinations for the transfer. + */ + unsigned int num_dests; + struct io_region dests[KCOPYD_MAX_REGIONS]; + + sector_t offset; + unsigned int nr_pages; + struct list_head pages; + + /* + * Set this to ensure you are notified when the job has + * completed. 'context' is for callback to use. + */ + kcopyd_notify_fn fn; + void *context; + + /* + * These fields are only used if the job has been split + * into more manageable parts. + */ + struct semaphore lock; + atomic_t sub_jobs; + sector_t progress; +}; + +/* FIXME: this should scale with the number of pages */ +#define MIN_JOBS 512 + +static kmem_cache_t *_job_cache; +static mempool_t *_job_pool; + +/* + * We maintain three lists of jobs: + * + * i) jobs waiting for pages + * ii) jobs that have pages, and are waiting for the io to be issued. + * iii) jobs that have completed. + * + * All three of these are protected by job_lock. + */ +static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED; + +static LIST_HEAD(_complete_jobs); +static LIST_HEAD(_io_jobs); +static LIST_HEAD(_pages_jobs); + +static int jobs_init(void) +{ + INIT_LIST_HEAD(&_complete_jobs); + INIT_LIST_HEAD(&_io_jobs); + INIT_LIST_HEAD(&_pages_jobs); + + _job_cache = kmem_cache_create("kcopyd-jobs", + sizeof(struct kcopyd_job), + __alignof__(struct kcopyd_job), + 0, NULL, NULL); + if (!_job_cache) + return -ENOMEM; + + _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, + mempool_free_slab, _job_cache); + if (!_job_pool) { + kmem_cache_destroy(_job_cache); + return -ENOMEM; + } + + return 0; +} + +static void jobs_exit(void) +{ + BUG_ON(!list_empty(&_complete_jobs)); + BUG_ON(!list_empty(&_io_jobs)); + BUG_ON(!list_empty(&_pages_jobs)); + + mempool_destroy(_job_pool); + kmem_cache_destroy(_job_cache); +} + +/* + * Functions to push and pop a job onto the head of a given job + * list. 
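How the page-pool sizing above and the SUB_JOB_* constants fit together, assuming 4 KiB pages (split_job() and kcopyd_client_create() further down depend on these numbers):

/*
 * SECTORS_PER_PAGE  = 4096 / 512 = 8
 * PAGES_PER_SUB_JOB = 128 / 8    = 16    (one 64 KiB sub job)
 *
 * So kcopyd_client_create() rejects pools smaller than 16 pages, and a
 * client created with, say, 256 pages gets max_split = min(256 / 16, 8) = 8,
 * i.e. at most SUB_JOB_COUNT sub jobs in flight for one large copy.
 */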
+ */ +static inline struct kcopyd_job *pop(struct list_head *jobs) +{ + struct kcopyd_job *job = NULL; + unsigned long flags; + + spin_lock_irqsave(&_job_lock, flags); + + if (!list_empty(jobs)) { + job = list_entry(jobs->next, struct kcopyd_job, list); + list_del(&job->list); + } + spin_unlock_irqrestore(&_job_lock, flags); + + return job; +} + +static inline void push(struct list_head *jobs, struct kcopyd_job *job) +{ + unsigned long flags; + + spin_lock_irqsave(&_job_lock, flags); + list_add_tail(&job->list, jobs); + spin_unlock_irqrestore(&_job_lock, flags); +} + +/* + * These three functions process 1 item from the corresponding + * job list. + * + * They return: + * < 0: error + * 0: success + * > 0: can't process yet. + */ +static int run_complete_job(struct kcopyd_job *job) +{ + void *context = job->context; + int read_err = job->read_err; + unsigned int write_err = job->write_err; + kcopyd_notify_fn fn = job->fn; + + kcopyd_put_pages(job->kc, &job->pages); + mempool_free(job, _job_pool); + fn(read_err, write_err, context); + return 0; +} + +static void complete_io(unsigned int error, void *context) +{ + struct kcopyd_job *job = (struct kcopyd_job *) context; + + if (error) { + if (job->rw == WRITE) + job->write_err &= error; + else + job->read_err = 1; + + if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { + push(&_complete_jobs, job); + dm_daemon_wake(&_kcopyd); + return; + } + } + + if (job->rw == WRITE) + push(&_complete_jobs, job); + + else { + job->rw = WRITE; + push(&_io_jobs, job); + } + + dm_daemon_wake(&_kcopyd); +} + +/* + * Request io on as many buffer heads as we can currently get for + * a particular job. + */ +static int run_io_job(struct kcopyd_job *job) +{ + int r; + + if (job->rw == READ) + r = dm_io_async(1, &job->source, job->rw, + list_entry(job->pages.next, struct page, list), + job->offset, complete_io, job); + + else + r = dm_io_async(job->num_dests, job->dests, job->rw, + list_entry(job->pages.next, struct page, list), + job->offset, complete_io, job); + + return r; +} + +static int run_pages_job(struct kcopyd_job *job) +{ + int r; + + job->nr_pages = dm_div_up(job->dests[0].count + job->offset, + SECTORS_PER_PAGE); + r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); + if (!r) { + /* this job is ready for io */ + push(&_io_jobs, job); + return 0; + } + + if (r == -ENOMEM) + /* can't complete now */ + return 1; + + return r; +} + +/* + * Run through a list for as long as possible. Returns the count + * of successful jobs. + */ +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) +{ + struct kcopyd_job *job; + int r, count = 0; + + while ((job = pop(jobs))) { + + r = fn(job); + + if (r < 0) { + /* error this rogue job */ + if (job->rw == WRITE) + job->write_err = (unsigned int) -1; + else + job->read_err = 1; + push(&_complete_jobs, job); + break; + } + + if (r > 0) { + /* + * We couldn't service this job ATM, so + * push this job back onto the list. + */ + push(jobs, job); + break; + } + + count++; + } + + return count; +} + +/* + * kcopyd does this every time it's woken up. + */ +static void do_work(void) +{ + /* + * The order that these are called is *very* important. + * complete jobs can free some pages for pages jobs. + * Pages jobs when successful will jump onto the io jobs + * list. io jobs call wake when they complete and it all + * starts again. 
+ */ + process_jobs(&_complete_jobs, run_complete_job); + process_jobs(&_pages_jobs, run_pages_job); + process_jobs(&_io_jobs, run_io_job); + run_task_queue(&tq_disk); +} + +/* + * If we are copying a small region we just dispatch a single job + * to do the copy, otherwise the io has to be split up into many + * jobs. + */ +static void dispatch_job(struct kcopyd_job *job) +{ + push(&_pages_jobs, job); + dm_daemon_wake(&_kcopyd); +} + +static void segment_complete(int read_err, + unsigned int write_err, void *context) +{ + /* FIXME: tidy this function */ + sector_t progress = 0; + sector_t count = 0; + struct kcopyd_job *job = (struct kcopyd_job *) context; + + down(&job->lock); + + /* update the error */ + if (read_err) + job->read_err = 1; + + if (write_err) + job->write_err &= write_err; + + /* + * Only dispatch more work if there hasn't been an error. + */ + if ((!job->read_err && !job->write_err) || + test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { + /* get the next chunk of work */ + progress = job->progress; + count = job->source.count - progress; + if (count) { + if (count > SUB_JOB_SIZE) + count = SUB_JOB_SIZE; + + job->progress += count; + } + } + up(&job->lock); + + if (count) { + int i; + struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO); + + memcpy(sub_job, job, sizeof(*job)); + sub_job->source.sector += progress; + sub_job->source.count = count; + + for (i = 0; i < job->num_dests; i++) { + sub_job->dests[i].sector += progress; + sub_job->dests[i].count = count; + } + + sub_job->fn = segment_complete; + sub_job->context = job; + dispatch_job(sub_job); + + } else if (atomic_dec_and_test(&job->sub_jobs)) { + + /* + * To avoid a race we must keep the job around + * until after the notify function has completed. + * Otherwise the client may try and stop the job + * after we've completed. + */ + job->fn(read_err, write_err, job->context); + mempool_free(job, _job_pool); + } +} + +/* + * Create some little jobs that will do the move between + * them. + */ +static void split_job(struct kcopyd_job *job) +{ + int nr; + + nr = dm_div_up(job->source.count, SUB_JOB_SIZE); + if (nr > job->kc->max_split) + nr = job->kc->max_split; + + atomic_set(&job->sub_jobs, nr); + while (nr--) + segment_complete(0, 0u, job); +} + +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, + unsigned int num_dests, struct io_region *dests, + unsigned int flags, kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + /* + * Allocate a new job. + */ + job = mempool_alloc(_job_pool, GFP_NOIO); + + /* + * set up for the read. + */ + job->kc = kc; + job->flags = flags; + job->read_err = 0; + job->write_err = 0; + job->rw = READ; + + memcpy(&job->source, from, sizeof(*from)); + + job->num_dests = num_dests; + memcpy(&job->dests, dests, sizeof(*dests) * num_dests); + + job->offset = 0; + job->nr_pages = 0; + INIT_LIST_HEAD(&job->pages); + + job->fn = fn; + job->context = context; + + if (job->source.count < SUB_JOB_SIZE) + dispatch_job(job); + + else { + init_MUTEX(&job->lock); + job->progress = 0; + split_job(job); + } + + return 0; +} + +/* + * Cancels a kcopyd job, eg. someone might be deactivating a + * mirror. 
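For orientation, a minimal sketch of a kcopyd client follows. All example_* names are invented; the io_region 'dev' member is assumed from dm-io.h, which is not part of this hunk, while the sector/count fields match the usage visible above. The callback decodes read_err and write_err as documented in kcopyd.h: a boolean and a per-destination bitset respectively.

#include <linux/completion.h>

#include "kcopyd.h"
#include "dm.h"                 /* DMERR() */

static void example_copy_done(int read_err, unsigned int write_err,
                              void *context)
{
        if (read_err)
                DMERR("example: read of source region failed");
        if (write_err)
                DMERR("example: write failed, destination bitset 0x%x",
                      write_err);

        complete((struct completion *) context);
}

static int example_copy(kdev_t from_dev, kdev_t to_dev, sector_t nr_sectors)
{
        struct kcopyd_client *kc;
        struct io_region from, to;
        struct completion done;
        int r;

        r = kcopyd_client_create(32, &kc);      /* 32-page private pool */
        if (r)
                return r;

        from.dev = from_dev;
        from.sector = 0;
        from.count = nr_sectors;

        to.dev = to_dev;
        to.sector = 0;
        to.count = nr_sectors;

        init_completion(&done);
        r = kcopyd_copy(kc, &from, 1, &to, 0, example_copy_done, &done);
        if (!r)
                wait_for_completion(&done);

        kcopyd_client_destroy(kc);
        return r;
}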
+ */ +int kcopyd_cancel(struct kcopyd_job *job, int block) +{ + /* FIXME: finish */ + return -1; +} + +/*----------------------------------------------------------------- + * Unit setup + *---------------------------------------------------------------*/ +static DECLARE_MUTEX(_client_lock); +static LIST_HEAD(_clients); + +static int client_add(struct kcopyd_client *kc) +{ + down(&_client_lock); + list_add(&kc->list, &_clients); + up(&_client_lock); + return 0; +} + +static void client_del(struct kcopyd_client *kc) +{ + down(&_client_lock); + list_del(&kc->list); + up(&_client_lock); +} + +int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) +{ + int r = 0; + struct kcopyd_client *kc; + + if (nr_pages * SECTORS_PER_PAGE < SUB_JOB_SIZE) { + DMERR("kcopyd client requested %u pages: minimum is %lu", + nr_pages, SUB_JOB_SIZE / SECTORS_PER_PAGE); + return -ENOMEM; + } + + kc = kmalloc(sizeof(*kc), GFP_KERNEL); + if (!kc) + return -ENOMEM; + + kc->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&kc->pages); + kc->nr_pages = kc->nr_free_pages = 0; + r = client_alloc_pages(kc, nr_pages); + if (r) { + kfree(kc); + return r; + } + + r = dm_io_get(nr_pages); + if (r) { + client_free_pages(kc); + kfree(kc); + return r; + } + + r = client_add(kc); + if (r) { + dm_io_put(nr_pages); + client_free_pages(kc); + kfree(kc); + return r; + } + + *result = kc; + return 0; +} + +void kcopyd_client_destroy(struct kcopyd_client *kc) +{ + dm_io_put(kc->nr_pages); + client_free_pages(kc); + client_del(kc); + kfree(kc); +} + + +int __init kcopyd_init(void) +{ + int r; + + r = jobs_init(); + if (r) + return r; + + r = dm_daemon_start(&_kcopyd, "kcopyd", do_work); + if (r) + jobs_exit(); + + return r; +} + +void kcopyd_exit(void) +{ + jobs_exit(); + dm_daemon_stop(&_kcopyd); +} + +EXPORT_SYMBOL(kcopyd_client_create); +EXPORT_SYMBOL(kcopyd_client_destroy); +EXPORT_SYMBOL(kcopyd_copy); +EXPORT_SYMBOL(kcopyd_cancel); --- linux-2.4.22/drivers/md/kcopyd.h Thu Jan 1 01:00:00 1970 +++ linux/drivers/md/kcopyd.h Thu Nov 20 18:29:35 2003 @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2001 Sistina Software + * + * This file is released under the GPL. + */ + +#ifndef DM_KCOPYD_H +#define DM_KCOPYD_H + +/* + * Needed for the definition of offset_t. + */ +#include +#include + +#include "dm-io.h" + +int kcopyd_init(void); +void kcopyd_exit(void); + +/* FIXME: make this configurable */ +#define KCOPYD_MAX_REGIONS 8 + +#define KCOPYD_IGNORE_ERROR 1 + +/* + * To use kcopyd you must first create a kcopyd client object. + */ +struct kcopyd_client; +int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result); +void kcopyd_client_destroy(struct kcopyd_client *kc); + +/* + * Submit a copy job to kcopyd. This is built on top of the + * previous three fns. 
+ * + * read_err is a boolean, + * write_err is a bitset, with 1 bit for each destination region + */ +typedef void (*kcopyd_notify_fn)(int read_err, + unsigned int write_err, void *context); + +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, + unsigned int num_dests, struct io_region *dests, + unsigned int flags, kcopyd_notify_fn fn, void *context); + +#endif --- linux-2.4.22/fs/buffer.c Mon Nov 17 19:17:23 2003 +++ linux/fs/buffer.c Thu Nov 20 18:31:01 2003 @@ -756,6 +756,7 @@ bh->b_list = BUF_CLEAN; bh->b_end_io = handler; bh->b_private = private; + bh->b_journal_head = NULL; } static void end_buffer_io_async(struct buffer_head * bh, int uptodate) --- linux-2.4.22/fs/jbd/journal.c Mon Nov 17 19:17:25 2003 +++ linux/fs/jbd/journal.c Thu Nov 20 18:29:35 2003 @@ -1802,9 +1802,9 @@ if (buffer_jbd(bh)) { /* Someone did it for us! */ - J_ASSERT_BH(bh, bh->b_private != NULL); + J_ASSERT_BH(bh, bh->b_journal_head != NULL); journal_free_journal_head(jh); - jh = bh->b_private; + jh = bh->b_journal_head; } else { /* * We actually don't need jh_splice_lock when @@ -1812,7 +1812,7 @@ */ spin_lock(&jh_splice_lock); set_bit(BH_JBD, &bh->b_state); - bh->b_private = jh; + bh->b_journal_head = jh; jh->b_bh = bh; atomic_inc(&bh->b_count); spin_unlock(&jh_splice_lock); @@ -1821,7 +1821,7 @@ } jh->b_jcount++; spin_unlock(&journal_datalist_lock); - return bh->b_private; + return bh->b_journal_head; } /* @@ -1854,7 +1854,7 @@ J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); spin_lock(&jh_splice_lock); - bh->b_private = NULL; + bh->b_journal_head = NULL; jh->b_bh = NULL; /* debug, really */ clear_bit(BH_JBD, &bh->b_state); __brelse(bh); --- linux-2.4.22/include/linux/device-mapper.h Thu Jan 1 01:00:00 1970 +++ linux/include/linux/device-mapper.h Thu Nov 20 18:29:35 2003 @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the LGPL. + */ + +#ifndef _LINUX_DEVICE_MAPPER_H +#define _LINUX_DEVICE_MAPPER_H + +typedef unsigned long sector_t; + +struct dm_target; +struct dm_table; +struct dm_dev; + +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; + +union map_info { + void *ptr; + unsigned long long ll; +}; + +/* + * In the constructor the target parameter will already have the + * table, type, begin and len fields filled in. + */ +typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc, + char **argv); + +/* + * The destructor doesn't need to free the dm_target, just + * anything hidden ti->private. + */ +typedef void (*dm_dtr_fn) (struct dm_target * ti); + +/* + * The map function must return: + * < 0: error + * = 0: The target will handle the io by resubmitting it later + * > 0: simple remap complete + */ +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh, + int rw, union map_info *map_context); + +/* + * Returns: + * < 0 : error (currently ignored) + * 0 : ended successfully + * 1 : for some reason the io has still not completed (eg, + * multipath target might want to requeue a failed io). 
+ */ +typedef int (*dm_endio_fn) (struct dm_target * ti, + struct buffer_head * bh, int rw, int error, + union map_info *map_context); +typedef void (*dm_suspend_fn) (struct dm_target *ti); +typedef void (*dm_resume_fn) (struct dm_target *ti); +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type, + char *result, unsigned int maxlen); + +void dm_error(const char *message); + +/* + * Constructors should call these functions to ensure destination devices + * are opened/closed correctly. + * FIXME: too many arguments. + */ +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result); +void dm_put_device(struct dm_target *ti, struct dm_dev *d); + +/* + * Information about a target type + */ +struct target_type { + const char *name; + struct module *module; + dm_ctr_fn ctr; + dm_dtr_fn dtr; + dm_map_fn map; + dm_endio_fn end_io; + dm_suspend_fn suspend; + dm_resume_fn resume; + dm_status_fn status; +}; + +struct dm_target { + struct dm_table *table; + struct target_type *type; + + /* target limits */ + sector_t begin; + sector_t len; + + /* target specific data */ + void *private; + + /* Used to provide an error string from the ctr */ + char *error; +}; + +int dm_register_target(struct target_type *t); +int dm_unregister_target(struct target_type *t); + +#endif /* _LINUX_DEVICE_MAPPER_H */ --- linux-2.4.22/include/linux/dm-ioctl.h Thu Jan 1 01:00:00 1970 +++ linux/include/linux/dm-ioctl.h Thu Nov 20 18:29:35 2003 @@ -0,0 +1,237 @@ +/* + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. + * + * This file is released under the LGPL. + */ + +#ifndef _LINUX_DM_IOCTL_H +#define _LINUX_DM_IOCTL_H + +#include + +#define DM_DIR "mapper" /* Slashes not supported */ +#define DM_MAX_TYPE_NAME 16 +#define DM_NAME_LEN 128 +#define DM_UUID_LEN 129 + +/* + * A traditional ioctl interface for the device mapper. + * + * Each device can have two tables associated with it, an + * 'active' table which is the one currently used by io passing + * through the device, and an 'inactive' one which is a table + * that is being prepared as a replacement for the 'active' one. + * + * DM_VERSION: + * Just get the version information for the ioctl interface. + * + * DM_REMOVE_ALL: + * Remove all dm devices, destroy all tables. Only really used + * for debug. + * + * DM_LIST_DEVICES: + * Get a list of all the dm device names. + * + * DM_DEV_CREATE: + * Create a new device, neither the 'active' or 'inactive' table + * slots will be filled. The device will be in suspended state + * after creation, however any io to the device will get errored + * since it will be out-of-bounds. + * + * DM_DEV_REMOVE: + * Remove a device, destroy any tables. + * + * DM_DEV_RENAME: + * Rename a device. + * + * DM_SUSPEND: + * This performs both suspend and resume, depending which flag is + * passed in. + * Suspend: This command will not return until all pending io to + * the device has completed. Further io will be deferred until + * the device is resumed. + * Resume: It is no longer an error to issue this command on an + * unsuspended device. If a table is present in the 'inactive' + * slot, it will be moved to the active slot, then the old table + * from the active slot will be _destroyed_. Finally the device + * is resumed. + * + * DM_DEV_STATUS: + * Retrieves the status for the table in the 'active' slot. + * + * DM_DEV_WAIT: + * Wait for a significant event to occur to the device. 
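To make the target interface in device-mapper.h above concrete, here is a sketch of a minimal target that simply forwards io to another device at a fixed offset. All example_* names are invented; the real linear and striped targets elsewhere in this patch follow the same pattern, and the file is assumed to live in drivers/md so that dm.h (for dm_table_get_mode() and DMWARN()) is available. Once loaded, such a target would be referenced by name ("example") in the target_type field of a dm_target_spec.

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/device-mapper.h>

#include "dm.h"

struct example_c {
        struct dm_dev *dev;
        sector_t start;
};

/* Construct: <dev_path> <offset> */
static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct example_c *ec;

        if (argc != 2) {
                ti->error = "example: <device> <offset> required";
                return -EINVAL;
        }

        ec = kmalloc(sizeof(*ec), GFP_KERNEL);
        if (!ec) {
                ti->error = "example: out of memory";
                return -ENOMEM;
        }

        ec->start = simple_strtoul(argv[1], NULL, 10);
        if (dm_get_device(ti, argv[0], ec->start, ti->len,
                          dm_table_get_mode(ti->table), &ec->dev)) {
                ti->error = "example: device lookup failed";
                kfree(ec);
                return -ENXIO;
        }

        ti->private = ec;
        return 0;
}

static void example_dtr(struct dm_target *ti)
{
        struct example_c *ec = (struct example_c *) ti->private;

        dm_put_device(ti, ec->dev);
        kfree(ec);
}

static int example_map(struct dm_target *ti, struct buffer_head *bh, int rw,
                       union map_info *map_context)
{
        struct example_c *ec = (struct example_c *) ti->private;

        bh->b_rdev = ec->dev->dev;
        bh->b_rsector = ec->start + (bh->b_rsector - ti->begin);

        return 1;               /* simple remap complete */
}

static struct target_type example_target = {
        .name   = "example",
        .module = THIS_MODULE,
        .ctr    = example_ctr,
        .dtr    = example_dtr,
        .map    = example_map,
};

static int __init dm_example_init(void)
{
        return dm_register_target(&example_target);
}

static void __exit dm_example_exit(void)
{
        if (dm_unregister_target(&example_target))
                DMWARN("example: unregister failed");
}

module_init(dm_example_init);
module_exit(dm_example_exit);

MODULE_LICENSE("GPL");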
This + * could either be caused by an event triggered by one of the + * targets of the table in the 'active' slot, or a table change. + * + * DM_TABLE_LOAD: + * Load a table into the 'inactive' slot for the device. The + * device does _not_ need to be suspended prior to this command. + * + * DM_TABLE_CLEAR: + * Destroy any table in the 'inactive' slot (ie. abort). + * + * DM_TABLE_DEPS: + * Return a set of device dependencies for the 'active' table. + * + * DM_TABLE_STATUS: + * Return the targets status for the 'active' table. + */ + +/* + * All ioctl arguments consist of a single chunk of memory, with + * this structure at the start. If a uuid is specified any + * lookup (eg. for a DM_INFO) will be done on that, *not* the + * name. + */ +struct dm_ioctl { + /* + * The version number is made up of three parts: + * major - no backward or forward compatibility, + * minor - only backwards compatible, + * patch - both backwards and forwards compatible. + * + * All clients of the ioctl interface should fill in the + * version number of the interface that they were + * compiled with. + * + * All recognised ioctl commands (ie. those that don't + * return -ENOTTY) fill out this field, even if the + * command failed. + */ + uint32_t version[3]; /* in/out */ + uint32_t data_size; /* total size of data passed in + * including this struct */ + + uint32_t data_start; /* offset to start of data + * relative to start of this struct */ + + uint32_t target_count; /* in/out */ + int32_t open_count; /* out */ + uint32_t flags; /* in/out */ + uint32_t event_nr; /* in/out */ + uint32_t padding; + + uint64_t dev; /* in/out */ + + char name[DM_NAME_LEN]; /* device name */ + char uuid[DM_UUID_LEN]; /* unique identifier for + * the block device */ +}; + +/* + * Used to specify tables. These structures appear after the + * dm_ioctl. + */ +struct dm_target_spec { + uint64_t sector_start; + uint64_t length; + int32_t status; /* used when reading from kernel only */ + + /* + * Offset in bytes (from the start of this struct) to + * next target_spec. + */ + uint32_t next; + + char target_type[DM_MAX_TYPE_NAME]; + + /* + * Parameter string starts immediately after this object. + * Be careful to add padding after string to ensure correct + * alignment of subsequent dm_target_spec. + */ +}; + +/* + * Used to retrieve the target dependencies. + */ +struct dm_target_deps { + uint32_t count; /* Array size */ + uint32_t padding; /* unused */ + uint64_t dev[0]; /* out */ +}; + +/* + * Used to get a list of all dm devices. 
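A rough userspace sketch of how the structures above are laid out for a single-target DM_TABLE_LOAD (the command macro itself is defined just below). The buffer size and the "linear" parameter string are examples only, the device is assumed to have been created with DM_DEV_CREATE first, the control file descriptor is assumed to refer to the node created by the ioctl interface (not in this hunk), and in practice libdevmapper does this packing for you.

#include <string.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

#define ALIGN8(x) (((x) + 7) & ~7UL)

static int load_one_target(int ctl_fd, const char *name,
                           uint64_t sectors, const char *params)
{
        char buf[4096];
        struct dm_ioctl *io = (struct dm_ioctl *) buf;
        struct dm_target_spec *spec;
        char *p;

        memset(buf, 0, sizeof(buf));

        io->version[0] = DM_VERSION_MAJOR;
        io->version[1] = DM_VERSION_MINOR;
        io->version[2] = DM_VERSION_PATCHLEVEL;
        io->data_size = sizeof(buf);
        io->data_start = sizeof(struct dm_ioctl);
        io->target_count = 1;
        strncpy(io->name, name, DM_NAME_LEN - 1);

        spec = (struct dm_target_spec *) (buf + io->data_start);
        spec->sector_start = 0;
        spec->length = sectors;
        strncpy(spec->target_type, "linear", DM_MAX_TYPE_NAME - 1);

        /* The parameter string follows the spec; pad 'next' so that a
         * further dm_target_spec would stay aligned, as asked above. */
        p = (char *) (spec + 1);
        strcpy(p, params);                      /* e.g. "/dev/hda1 0" */
        spec->next = ALIGN8(sizeof(*spec) + strlen(params) + 1);

        return ioctl(ctl_fd, DM_TABLE_LOAD, io);
}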
+ */ +struct dm_name_list { + uint64_t dev; + uint32_t next; /* offset to the next record from + the _start_ of this */ + char name[0]; +}; + +/* + * If you change this make sure you make the corresponding change + * to dm-ioctl.c:lookup_ioctl() + */ +enum { + /* Top level cmds */ + DM_VERSION_CMD = 0, + DM_REMOVE_ALL_CMD, + DM_LIST_DEVICES_CMD, + + /* device level cmds */ + DM_DEV_CREATE_CMD, + DM_DEV_REMOVE_CMD, + DM_DEV_RENAME_CMD, + DM_DEV_SUSPEND_CMD, + DM_DEV_STATUS_CMD, + DM_DEV_WAIT_CMD, + + /* Table level cmds */ + DM_TABLE_LOAD_CMD, + DM_TABLE_CLEAR_CMD, + DM_TABLE_DEPS_CMD, + DM_TABLE_STATUS_CMD, +}; + +#define DM_IOCTL 0xfd + +#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) +#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl) +#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl) + +#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl) +#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl) +#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl) +#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl) +#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) +#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) + +#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) +#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) +#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl) +#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl) + +#define DM_VERSION_MAJOR 4 +#define DM_VERSION_MINOR 0 +#define DM_VERSION_PATCHLEVEL 5 +#define DM_VERSION_EXTRA "-ioctl (2003-11-18)" + +/* Status bits */ +#define DM_READONLY_FLAG (1 << 0) /* In/Out */ +#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */ +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */ + +/* + * Flag passed into ioctl STATUS command to get table information + * rather than current status. + */ +#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */ + +/* + * Flags that indicate whether a table is present in either of + * the two table slots that a device has. + */ +#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */ +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */ + +/* + * Indicates that the buffer passed in wasn't big enough for the + * results. 
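And the simplest possible client of the command set defined here: the DM_VERSION handshake that the comment on struct dm_ioctl asks every client to perform. /dev/mapper/control is the conventional node created for the ioctl interface; the interface code itself lives in dm-ioctl.c, outside this hunk.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
        struct dm_ioctl dmi;
        int fd = open("/dev/mapper/control", O_RDWR);

        if (fd < 0)
                return 1;

        memset(&dmi, 0, sizeof(dmi));
        dmi.version[0] = DM_VERSION_MAJOR;
        dmi.version[1] = DM_VERSION_MINOR;
        dmi.version[2] = DM_VERSION_PATCHLEVEL;
        dmi.data_size = sizeof(dmi);

        if (ioctl(fd, DM_VERSION, &dmi) < 0) {
                close(fd);
                return 1;
        }

        /* Recognised commands fill version[] back in, even on failure. */
        printf("dm ioctl interface %u.%u.%u\n",
               dmi.version[0], dmi.version[1], dmi.version[2]);

        close(fd);
        return 0;
}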
+ */ +#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ + +#endif /* _LINUX_DM_IOCTL_H */ --- linux-2.4.22/include/linux/fs.h Mon Nov 17 19:17:56 2003 +++ linux/include/linux/fs.h Thu Nov 20 18:31:01 2003 @@ -265,7 +265,7 @@ struct page *b_page; /* the page this bh is mapped to */ void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ void *b_private; /* reserved for b_end_io */ - + void *b_journal_head; /* ext3 journal_heads */ unsigned long b_rsector; /* Real buffer location on disk */ wait_queue_head_t b_wait; --- linux-2.4.22/include/linux/jbd.h Mon Nov 17 19:17:58 2003 +++ linux/include/linux/jbd.h Thu Nov 20 18:29:35 2003 @@ -311,7 +311,7 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) { - return bh->b_private; + return bh->b_journal_head; } #define HAVE_JOURNAL_CALLBACK_STATUS --- linux-2.4.22/include/linux/mempool.h Thu Jan 1 01:00:00 1970 +++ linux/include/linux/mempool.h Thu Nov 20 18:29:37 2003 @@ -0,0 +1,31 @@ +/* + * memory buffer pool support + */ +#ifndef _LINUX_MEMPOOL_H +#define _LINUX_MEMPOOL_H + +#include +#include + +struct mempool_s; +typedef struct mempool_s mempool_t; + +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data); +typedef void (mempool_free_t)(void *element, void *pool_data); + +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask); +extern void mempool_destroy(mempool_t *pool); +extern void * mempool_alloc(mempool_t *pool, int gfp_mask); +extern void mempool_free(void *element, mempool_t *pool); + +/* + * A mempool_alloc_t and mempool_free_t that get the memory from + * a slab that is passed in through pool_data. + */ +void *mempool_alloc_slab(int gfp_mask, void *pool_data); +void mempool_free_slab(void *element, void *pool_data); + + +#endif /* _LINUX_MEMPOOL_H */ --- linux-2.4.22/include/linux/vmalloc.h Mon Nov 17 19:18:01 2003 +++ linux/include/linux/vmalloc.h Thu Nov 20 18:29:37 2003 @@ -29,6 +29,7 @@ extern void vmfree_area_pages(unsigned long address, unsigned long size); extern int vmalloc_area_pages(unsigned long address, unsigned long size, int gfp_mask, pgprot_t prot); +extern void *vcalloc(unsigned long nmemb, unsigned long elem_size); /* * Allocate any pages --- linux-2.4.22/kernel/ksyms.c Mon Nov 17 19:18:06 2003 +++ linux/kernel/ksyms.c Thu Nov 20 18:31:01 2003 @@ -114,6 +114,7 @@ EXPORT_SYMBOL(__vmalloc); EXPORT_SYMBOL(vmap); EXPORT_SYMBOL(vmalloc_to_page); +EXPORT_SYMBOL(vcalloc); EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(remap_page_range); EXPORT_SYMBOL(max_mapnr); --- linux-2.4.22/mm/Makefile Mon Nov 17 19:18:06 2003 +++ linux/mm/Makefile Thu Nov 20 18:29:37 2003 @@ -9,12 +9,12 @@ O_TARGET := mm.o -export-objs := shmem.o filemap.o memory.o page_alloc.o +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o mempool.o obj-$(CONFIG_HIGHMEM) += highmem.o --- linux-2.4.22/mm/filemap.c Mon Nov 17 19:18:07 2003 +++ linux/mm/filemap.c Thu Nov 20 18:29:37 2003 @@ -1742,7 +1742,8 @@ } up(&inode->i_sem); up_read(&inode->i_alloc_sem); - UPDATE_ATIME(filp->f_dentry->d_inode); + if (!S_ISBLK(inode->i_mode)) + UPDATE_ATIME(filp->f_dentry->d_inode); goto out; } } @@ -3120,8 +3121,12 @@ goto out; remove_suid(inode); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - 
mark_inode_dirty_sync(inode); + + /* Don't update times for block devices using O_DIRECT */ + if (!(file->f_flags & O_DIRECT) || !S_ISBLK(inode->i_mode)) { + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + } do { unsigned long index, offset; --- linux-2.4.22/mm/mempool.c Thu Jan 1 01:00:00 1970 +++ linux/mm/mempool.c Thu Nov 20 18:29:37 2003 @@ -0,0 +1,299 @@ +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + */ + +#include +#include +#include +#include + +struct mempool_s { + spinlock_t lock; + int min_nr; /* nr of elements at *elements */ + int curr_nr; /* Current nr of elements at *elements */ + void **elements; + + void *pool_data; + mempool_alloc_t *alloc; + mempool_free_t *free; + wait_queue_head_t wait; +}; + +static void add_element(mempool_t *pool, void *element) +{ + BUG_ON(pool->curr_nr >= pool->min_nr); + pool->elements[pool->curr_nr++] = element; +} + +static void *remove_element(mempool_t *pool) +{ + BUG_ON(pool->curr_nr <= 0); + return pool->elements[--pool->curr_nr]; +} + +static void free_pool(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + kfree(pool); +} + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc and mempool_free + * functions. This function might sleep. Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc function is not called + * from IRQ contexts. + */ +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + mempool_t *pool; + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + memset(pool, 0, sizeof(*pool)); + pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); + if (!pool->elements) { + kfree(pool); + return NULL; + } + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(GFP_KERNEL, pool->pool_data); + if (unlikely(!element)) { + free_pool(pool); + return NULL; + } + add_element(pool, element); + } + return pool; +} + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * @gfp_mask: the usual allocation bitmask. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. 
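Typical use of the pool, mirroring what drivers/md/dm.c above does with its _io_cache: back the pool with a slab cache so that io structures can always be obtained on the writeout path. The example_* names are illustrative only.

#include <linux/slab.h>
#include <linux/mempool.h>

struct example_io {
        int rw;
        void *context;
};

static kmem_cache_t *example_cache;
static mempool_t *example_pool;

static int example_pool_init(void)
{
        example_cache = kmem_cache_create("example-io",
                                          sizeof(struct example_io),
                                          __alignof__(struct example_io),
                                          0, NULL, NULL);
        if (!example_cache)
                return -ENOMEM;

        example_pool = mempool_create(16, mempool_alloc_slab,
                                      mempool_free_slab, example_cache);
        if (!example_pool) {
                kmem_cache_destroy(example_cache);
                return -ENOMEM;
        }

        return 0;
}

static void example_pool_exit(void)
{
        mempool_destroy(example_pool);
        kmem_cache_destroy(example_cache);
}

/*
 * On the io path GFP_NOIO stops the allocator recursing into the block
 * layer, and the 16 preallocated elements mean mempool_alloc() sleeps
 * rather than fails in process context.
 */
static struct example_io *example_get_io(void)
{
        return mempool_alloc(example_pool, GFP_NOIO);
}

static void example_put_io(struct example_io *io)
{
        mempool_free(io, example_pool);
}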
mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. + */ +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask) +{ + void *element; + void **new_elements; + unsigned long flags; + + BUG_ON(new_min_nr <= 0); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr < pool->min_nr) { + while (pool->curr_nr > new_min_nr) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); + spin_lock_irqsave(&pool->lock, flags); + } + pool->min_nr = new_min_nr; + goto out_unlock; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* Grow the pool */ + new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + if (!new_elements) + return -ENOMEM; + + spin_lock_irqsave(&pool->lock, flags); + memcpy(new_elements, pool->elements, + pool->curr_nr * sizeof(*new_elements)); + kfree(pool->elements); + pool->elements = new_elements; + pool->min_nr = new_min_nr; + + while (pool->curr_nr < pool->min_nr) { + spin_unlock_irqrestore(&pool->lock, flags); + element = pool->alloc(gfp_mask, pool->pool_data); + if (!element) + goto out; + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) + add_element(pool, element); + else + kfree(element); /* Raced */ + } +out_unlock: + spin_unlock_irqrestore(&pool->lock, flags); +out: + return 0; +} + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. The caller + * has to guarantee that all elements have been returned to the pool (ie: + * freed) prior to calling mempool_destroy(). + */ +void mempool_destroy(mempool_t *pool) +{ + if (pool->curr_nr != pool->min_nr) + BUG(); /* There were outstanding elements */ + free_pool(pool); +} + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. (it might + * fail if called from an IRQ context.) + */ +void * mempool_alloc(mempool_t *pool, int gfp_mask) +{ + void *element; + unsigned long flags; + int curr_nr; + DECLARE_WAITQUEUE(wait, current); + int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + +repeat_alloc: + element = pool->alloc(gfp_nowait, pool->pool_data); + if (likely(element != NULL)) + return element; + + /* + * If the pool is less than 50% full then try harder + * to allocate an element: + */ + if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) { + element = pool->alloc(gfp_mask, pool->pool_data); + if (likely(element != NULL)) + return element; + } + + /* + * Kick the VM at this point. 
+ */ + wakeup_bdflush(); + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + return element; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* We must not sleep in the GFP_ATOMIC case */ + if (gfp_mask == gfp_nowait) + return NULL; + + run_task_queue(&tq_disk); + + add_wait_queue_exclusive(&pool->wait, &wait); + set_task_state(current, TASK_UNINTERRUPTIBLE); + + spin_lock_irqsave(&pool->lock, flags); + curr_nr = pool->curr_nr; + spin_unlock_irqrestore(&pool->lock, flags); + + if (!curr_nr) + schedule(); + + current->state = TASK_RUNNING; + remove_wait_queue(&pool->wait, &wait); + + goto repeat_alloc; +} + +/** + * mempool_free - return an element to the pool. + * @element: pool element pointer. + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. + */ +void mempool_free(void *element, mempool_t *pool) +{ + unsigned long flags; + + if (pool->curr_nr < pool->min_nr) { + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + spin_unlock_irqrestore(&pool->lock, flags); + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); +} + +/* + * A commonly used alloc and free fn. + */ +void *mempool_alloc_slab(int gfp_mask, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + return kmem_cache_alloc(mem, gfp_mask); +} + +void mempool_free_slab(void *element, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + kmem_cache_free(mem, element); +} + + +EXPORT_SYMBOL(mempool_create); +EXPORT_SYMBOL(mempool_resize); +EXPORT_SYMBOL(mempool_destroy); +EXPORT_SYMBOL(mempool_alloc); +EXPORT_SYMBOL(mempool_free); +EXPORT_SYMBOL(mempool_alloc_slab); +EXPORT_SYMBOL(mempool_free_slab); --- linux-2.4.22/mm/vmalloc.c Mon Nov 17 19:18:07 2003 +++ linux/mm/vmalloc.c Thu Nov 20 18:29:37 2003 @@ -374,3 +374,22 @@ read_unlock(&vmlist_lock); return buf - buf_start; } + +void *vcalloc(unsigned long nmemb, unsigned long elem_size) +{ + unsigned long size; + void *addr; + + /* + * Check that we're not going to overflow. + */ + if (nmemb > (ULONG_MAX / elem_size)) + return NULL; + + size = nmemb * elem_size; + addr = vmalloc(size); + if (addr) + memset(addr, 0, size); + + return addr; +}
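vcalloc() is exported here for the device-mapper code's benefit; its table-building code, not shown in this hunk, allocates zeroed arrays this way. A hypothetical caller, with example_entry standing in for whatever is being allocated:

#include <linux/vmalloc.h>

struct example_entry {
        unsigned long start;
        unsigned long len;
};

/* Returns NULL on allocation failure *and* on nmemb * elem_size overflow. */
static struct example_entry *alloc_entries(unsigned long n)
{
        return vcalloc(n, sizeof(struct example_entry));
}

static void free_entries(struct example_entry *e)
{
        vfree(e);
}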