From a9a8dceacc0d9b13d592cf7d39b80e17b5bda95f Mon Sep 17 00:00:00 2001 From: Alasdair Kergon Date: Thu, 20 Nov 2003 18:44:04 +0000 Subject: [PATCH] 2.4.22 patches --- patches/common/linux-2.4.22-arch64.patch | 196 + patches/common/linux-2.4.22-b_private.patch | 73 + patches/common/linux-2.4.22-config.patch | 49 + patches/common/linux-2.4.22-devmapper.patch | 9310 +++++++++++++++++++ patches/common/linux-2.4.22-makefile.patch | 53 + patches/common/linux-2.4.22-memalloc.patch | 263 + patches/common/linux-2.4.22-mempool.patch | 354 + patches/common/linux-2.4.22-o_direct.patch | 27 + patches/common/linux-2.4.22-vcalloc.patch | 45 + 9 files changed, 10370 insertions(+) create mode 100644 patches/common/linux-2.4.22-arch64.patch create mode 100644 patches/common/linux-2.4.22-b_private.patch create mode 100644 patches/common/linux-2.4.22-config.patch create mode 100644 patches/common/linux-2.4.22-devmapper.patch create mode 100644 patches/common/linux-2.4.22-makefile.patch create mode 100644 patches/common/linux-2.4.22-memalloc.patch create mode 100644 patches/common/linux-2.4.22-mempool.patch create mode 100644 patches/common/linux-2.4.22-o_direct.patch create mode 100644 patches/common/linux-2.4.22-vcalloc.patch diff --git a/patches/common/linux-2.4.22-arch64.patch b/patches/common/linux-2.4.22-arch64.patch new file mode 100644 index 0000000..fb1dd8f --- /dev/null +++ b/patches/common/linux-2.4.22-arch64.patch @@ -0,0 +1,196 @@ +--- linux-2.4.22/arch/mips64/kernel/ioctl32.c Mon Nov 17 19:16:11 2003 ++++ linux/arch/mips64/kernel/ioctl32.c Tue Nov 18 13:21:14 2003 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #undef __KERNEL__ /* This file was born to be ugly ... 
*/ +@@ -1221,6 +1222,22 @@ + IOCTL32_DEFAULT(STOP_ARRAY_RO), + IOCTL32_DEFAULT(RESTART_ARRAY_RW), + #endif /* CONFIG_MD */ ++ ++#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) ++ IOCTL32_DEFAULT(DM_VERSION), ++ IOCTL32_DEFAULT(DM_REMOVE_ALL), ++ IOCTL32_DEFAULT(DM_DEV_CREATE), ++ IOCTL32_DEFAULT(DM_DEV_REMOVE), ++ IOCTL32_DEFAULT(DM_TABLE_LOAD), ++ IOCTL32_DEFAULT(DM_DEV_SUSPEND), ++ IOCTL32_DEFAULT(DM_DEV_RENAME), ++ IOCTL32_DEFAULT(DM_TABLE_DEPS), ++ IOCTL32_DEFAULT(DM_DEV_STATUS), ++ IOCTL32_DEFAULT(DM_TABLE_STATUS), ++ IOCTL32_DEFAULT(DM_DEV_WAIT), ++ IOCTL32_DEFAULT(DM_LIST_DEVICES), ++ IOCTL32_DEFAULT(DM_TABLE_CLEAR), ++#endif /* CONFIG_BLK_DEV_DM */ + + #ifdef CONFIG_SIBYTE_TBPROF + IOCTL32_DEFAULT(SBPROF_ZBSTART), +--- linux-2.4.22/arch/parisc/kernel/ioctl32.c Mon Nov 17 19:16:11 2003 ++++ linux/arch/parisc/kernel/ioctl32.c Tue Nov 18 13:17:03 2003 +@@ -55,6 +55,7 @@ + #define max max */ + #include + #endif /* LVM */ ++#include + + #include + /* Ugly hack. */ +@@ -3423,6 +3424,22 @@ + COMPATIBLE_IOCTL(LV_BMAP) + COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) + #endif /* LVM */ ++/* Device-Mapper */ ++#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) ++COMPATIBLE_IOCTL(DM_VERSION) ++COMPATIBLE_IOCTL(DM_REMOVE_ALL) ++COMPATIBLE_IOCTL(DM_DEV_CREATE) ++COMPATIBLE_IOCTL(DM_DEV_REMOVE) ++COMPATIBLE_IOCTL(DM_TABLE_LOAD) ++COMPATIBLE_IOCTL(DM_DEV_SUSPEND) ++COMPATIBLE_IOCTL(DM_DEV_RENAME) ++COMPATIBLE_IOCTL(DM_TABLE_DEPS) ++COMPATIBLE_IOCTL(DM_DEV_STATUS) ++COMPATIBLE_IOCTL(DM_TABLE_STATUS) ++COMPATIBLE_IOCTL(DM_DEV_WAIT) ++COMPATIBLE_IOCTL(DM_LIST_DEVICES) ++COMPATIBLE_IOCTL(DM_TABLE_CLEAR) ++#endif /* CONFIG_BLK_DEV_DM */ + #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE) + COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC) + COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID) +--- linux-2.4.22/arch/ppc64/kernel/ioctl32.c Mon Nov 17 19:16:18 2003 ++++ linux/arch/ppc64/kernel/ioctl32.c Tue Nov 18 13:17:03 2003 +@@ -66,6 +66,7 @@ + #if 
defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) + #include + #endif /* LVM */ ++#include + + #include + /* Ugly hack. */ +@@ -4435,6 +4436,22 @@ + COMPATIBLE_IOCTL(NBD_PRINT_DEBUG), + COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS), + COMPATIBLE_IOCTL(NBD_DISCONNECT), ++/* device-mapper */ ++#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) ++COMPATIBLE_IOCTL(DM_VERSION), ++COMPATIBLE_IOCTL(DM_REMOVE_ALL), ++COMPATIBLE_IOCTL(DM_DEV_CREATE), ++COMPATIBLE_IOCTL(DM_DEV_REMOVE), ++COMPATIBLE_IOCTL(DM_TABLE_LOAD), ++COMPATIBLE_IOCTL(DM_DEV_SUSPEND), ++COMPATIBLE_IOCTL(DM_DEV_RENAME), ++COMPATIBLE_IOCTL(DM_TABLE_DEPS), ++COMPATIBLE_IOCTL(DM_DEV_STATUS), ++COMPATIBLE_IOCTL(DM_TABLE_STATUS), ++COMPATIBLE_IOCTL(DM_DEV_WAIT), ++COMPATIBLE_IOCTL(DM_LIST_DEVICES), ++COMPATIBLE_IOCTL(DM_TABLE_CLEAR), ++#endif /* CONFIG_BLK_DEV_DM */ + /* Remove *PRIVATE in 2.5 */ + COMPATIBLE_IOCTL(SIOCDEVPRIVATE), + COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1), +--- linux-2.4.22/arch/s390x/kernel/ioctl32.c Mon Nov 17 19:16:20 2003 ++++ linux/arch/s390x/kernel/ioctl32.c Tue Nov 18 13:22:20 2003 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -626,6 +627,20 @@ + IOCTL32_DEFAULT(VT_UNLOCKSWITCH), + + IOCTL32_DEFAULT(SIOCGSTAMP), ++ ++ IOCTL32_DEFAULT(DM_VERSION), ++ IOCTL32_DEFAULT(DM_REMOVE_ALL), ++ IOCTL32_DEFAULT(DM_DEV_CREATE), ++ IOCTL32_DEFAULT(DM_DEV_REMOVE), ++ IOCTL32_DEFAULT(DM_TABLE_LOAD), ++ IOCTL32_DEFAULT(DM_DEV_SUSPEND), ++ IOCTL32_DEFAULT(DM_DEV_RENAME), ++ IOCTL32_DEFAULT(DM_TABLE_DEPS), ++ IOCTL32_DEFAULT(DM_DEV_STATUS), ++ IOCTL32_DEFAULT(DM_TABLE_STATUS), ++ IOCTL32_DEFAULT(DM_DEV_WAIT), ++ IOCTL32_DEFAULT(DM_LIST_DEVICES), ++ IOCTL32_DEFAULT(DM_TABLE_CLEAR), + + IOCTL32_DEFAULT(LOOP_SET_FD), + IOCTL32_DEFAULT(LOOP_CLR_FD), +--- linux-2.4.22/arch/sparc64/kernel/ioctl32.c Mon Nov 17 19:16:23 2003 ++++ linux/arch/sparc64/kernel/ioctl32.c Tue Nov 18 13:17:03 2003 +@@ -56,6 +56,7 @@ + #if 
defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) + #include + #endif /* LVM */ ++#include + + #include + /* Ugly hack. */ +@@ -5086,6 +5087,22 @@ + COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) + COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS) + COMPATIBLE_IOCTL(NBD_DISCONNECT) ++/* device-mapper */ ++#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) ++COMPATIBLE_IOCTL(DM_VERSION) ++COMPATIBLE_IOCTL(DM_REMOVE_ALL) ++COMPATIBLE_IOCTL(DM_DEV_CREATE) ++COMPATIBLE_IOCTL(DM_DEV_REMOVE) ++COMPATIBLE_IOCTL(DM_TABLE_LOAD) ++COMPATIBLE_IOCTL(DM_DEV_SUSPEND) ++COMPATIBLE_IOCTL(DM_DEV_RENAME) ++COMPATIBLE_IOCTL(DM_TABLE_DEPS) ++COMPATIBLE_IOCTL(DM_DEV_STATUS) ++COMPATIBLE_IOCTL(DM_TABLE_STATUS) ++COMPATIBLE_IOCTL(DM_DEV_WAIT) ++COMPATIBLE_IOCTL(DM_LIST_DEVICES) ++COMPATIBLE_IOCTL(DM_TABLE_CLEAR) ++#endif /* CONFIG_BLK_DEV_DM */ + /* Linux-1394 */ + #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE) + COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL) +--- linux-2.4.22/arch/x86_64/ia32/ia32_ioctl.c Mon Nov 17 19:16:25 2003 ++++ linux/arch/x86_64/ia32/ia32_ioctl.c Tue Nov 18 13:17:03 2003 +@@ -67,6 +67,7 @@ + #define max max + #include + #endif /* LVM */ ++#include + + #include + /* Ugly hack. 
*/ +@@ -4047,6 +4048,22 @@ + COMPATIBLE_IOCTL(LV_BMAP) + COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) + #endif /* LVM */ ++/* Device-Mapper */ ++#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) ++COMPATIBLE_IOCTL(DM_VERSION) ++COMPATIBLE_IOCTL(DM_REMOVE_ALL) ++COMPATIBLE_IOCTL(DM_DEV_CREATE) ++COMPATIBLE_IOCTL(DM_DEV_REMOVE) ++COMPATIBLE_IOCTL(DM_TABLE_LOAD) ++COMPATIBLE_IOCTL(DM_DEV_SUSPEND) ++COMPATIBLE_IOCTL(DM_DEV_RENAME) ++COMPATIBLE_IOCTL(DM_TABLE_DEPS) ++COMPATIBLE_IOCTL(DM_DEV_STATUS) ++COMPATIBLE_IOCTL(DM_TABLE_STATUS) ++COMPATIBLE_IOCTL(DM_DEV_WAIT) ++COMPATIBLE_IOCTL(DM_LIST_DEVICES) ++COMPATIBLE_IOCTL(DM_TABLE_CLEAR) ++#endif /* CONFIG_BLK_DEV_DM */ + #ifdef CONFIG_AUTOFS_FS + COMPATIBLE_IOCTL(AUTOFS_IOC_READY) + COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL) diff --git a/patches/common/linux-2.4.22-b_private.patch b/patches/common/linux-2.4.22-b_private.patch new file mode 100644 index 0000000..77b5b51 --- /dev/null +++ b/patches/common/linux-2.4.22-b_private.patch @@ -0,0 +1,73 @@ +--- linux-2.4.22/fs/buffer.c Mon Nov 17 19:17:23 2003 ++++ linux/fs/buffer.c Tue Nov 18 13:43:32 2003 +@@ -756,6 +756,7 @@ + bh->b_list = BUF_CLEAN; + bh->b_end_io = handler; + bh->b_private = private; ++ bh->b_journal_head = NULL; + } + + static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +--- linux-2.4.22/fs/jbd/journal.c Mon Nov 17 19:17:25 2003 ++++ linux/fs/jbd/journal.c Tue Nov 18 13:43:32 2003 +@@ -1802,9 +1802,9 @@ + + if (buffer_jbd(bh)) { + /* Someone did it for us! 
*/ +- J_ASSERT_BH(bh, bh->b_private != NULL); ++ J_ASSERT_BH(bh, bh->b_journal_head != NULL); + journal_free_journal_head(jh); +- jh = bh->b_private; ++ jh = bh->b_journal_head; + } else { + /* + * We actually don't need jh_splice_lock when +@@ -1812,7 +1812,7 @@ + */ + spin_lock(&jh_splice_lock); + set_bit(BH_JBD, &bh->b_state); +- bh->b_private = jh; ++ bh->b_journal_head = jh; + jh->b_bh = bh; + atomic_inc(&bh->b_count); + spin_unlock(&jh_splice_lock); +@@ -1821,7 +1821,7 @@ + } + jh->b_jcount++; + spin_unlock(&journal_datalist_lock); +- return bh->b_private; ++ return bh->b_journal_head; + } + + /* +@@ -1854,7 +1854,7 @@ + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + spin_lock(&jh_splice_lock); +- bh->b_private = NULL; ++ bh->b_journal_head = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_bit(BH_JBD, &bh->b_state); + __brelse(bh); +--- linux-2.4.22/include/linux/fs.h Mon Nov 17 19:17:56 2003 ++++ linux/include/linux/fs.h Tue Nov 18 13:43:32 2003 +@@ -265,7 +265,7 @@ + struct page *b_page; /* the page this bh is mapped to */ + void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ + void *b_private; /* reserved for b_end_io */ +- ++ void *b_journal_head; /* ext3 journal_heads */ + unsigned long b_rsector; /* Real buffer location on disk */ + wait_queue_head_t b_wait; + +--- linux-2.4.22/include/linux/jbd.h Mon Nov 17 19:17:58 2003 ++++ linux/include/linux/jbd.h Tue Nov 18 13:43:32 2003 +@@ -311,7 +311,7 @@ + + static inline struct journal_head *bh2jh(struct buffer_head *bh) + { +- return bh->b_private; ++ return bh->b_journal_head; + } + + #define HAVE_JOURNAL_CALLBACK_STATUS diff --git a/patches/common/linux-2.4.22-config.patch b/patches/common/linux-2.4.22-config.patch new file mode 100644 index 0000000..c6e25d1 --- /dev/null +++ b/patches/common/linux-2.4.22-config.patch @@ -0,0 +1,49 @@ +--- linux-2.4.22/MAINTAINERS Mon Nov 17 19:15:45 2003 ++++ linux/MAINTAINERS Tue Nov 18 13:22:41 2003 +@@ 
-554,6 +554,13 @@ + W: http://www.debian.org/~dz/i8k/ + S: Maintained + ++DEVICE MAPPER ++P: Joe Thornber ++M: dm@uk.sistina.com ++L: linux-LVM@sistina.com ++W: http://www.sistina.com/lvm ++S: Maintained ++ + DEVICE NUMBER REGISTRY + P: H. Peter Anvin + M: hpa@zytor.com +--- linux-2.4.22/drivers/md/Config.in Mon Nov 17 19:16:45 2003 ++++ linux/drivers/md/Config.in Tue Nov 18 13:22:41 2003 +@@ -14,5 +14,7 @@ + dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD + + dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD ++dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD ++dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM + + endmenu +--- linux-2.4.22/Documentation/Configure.help Mon Nov 17 19:15:53 2003 ++++ linux/Documentation/Configure.help Tue Nov 18 13:22:41 2003 +@@ -1856,6 +1856,20 @@ + want), say M here and read . The + module will be called lvm-mod.o. + ++Device-mapper support ++CONFIG_BLK_DEV_DM ++ Device-mapper is a low level volume manager. It works by allowing ++ people to specify mappings for ranges of logical sectors. Various ++ mapping types are available, in addition people may write their own ++ modules containing custom mappings if they wish. ++ ++ Higher level volume managers such as LVM2 use this driver. ++ ++ If you want to compile this as a module, say M here and read ++ . The module will be called dm-mod.o. ++ ++ If unsure, say N. ++ + Multiple devices driver support (RAID and LVM) + CONFIG_MD + Support multiple physical spindles through a single logical device. 
diff --git a/patches/common/linux-2.4.22-devmapper.patch b/patches/common/linux-2.4.22-devmapper.patch new file mode 100644 index 0000000..78ff799 --- /dev/null +++ b/patches/common/linux-2.4.22-devmapper.patch @@ -0,0 +1,9310 @@ +--- linux-2.4.22/drivers/md/dm-daemon.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-daemon.c Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,113 @@ ++/* ++ * Copyright (C) 2003 Sistina Software ++ * ++ * This file is released under the LGPL. ++ */ ++ ++#include "dm.h" ++#include "dm-daemon.h" ++ ++#include ++#include ++ ++static int daemon(void *arg) ++{ ++ struct dm_daemon *dd = (struct dm_daemon *) arg; ++ DECLARE_WAITQUEUE(wq, current); ++ ++ daemonize(); ++ reparent_to_init(); ++ ++ /* block all signals */ ++ spin_lock_irq(&current->sigmask_lock); ++ sigfillset(&current->blocked); ++ flush_signals(current); ++ spin_unlock_irq(&current->sigmask_lock); ++ ++ strcpy(current->comm, dd->name); ++ atomic_set(&dd->please_die, 0); ++ ++ add_wait_queue(&dd->job_queue, &wq); ++ ++ down(&dd->run_lock); ++ up(&dd->start_lock); ++ ++ /* ++ * dd->fn() could do anything, very likely it will ++ * suspend. So we can't set the state to ++ * TASK_INTERRUPTIBLE before calling it. In order to ++ * prevent a race with a waking thread we do this little ++ * dance with the dd->woken variable. ++ */ ++ while (1) { ++ do { ++ set_current_state(TASK_RUNNING); ++ ++ if (atomic_read(&dd->please_die)) ++ goto out; ++ ++ atomic_set(&dd->woken, 0); ++ dd->fn(); ++ yield(); ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ } while (atomic_read(&dd->woken)); ++ ++ schedule(); ++ } ++ ++ out: ++ remove_wait_queue(&dd->job_queue, &wq); ++ up(&dd->run_lock); ++ return 0; ++} ++ ++int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void)) ++{ ++ pid_t pid = 0; ++ ++ /* ++ * Initialise the dm_daemon. 
++ */ ++ dd->fn = fn; ++ strncpy(dd->name, name, sizeof(dd->name) - 1); ++ sema_init(&dd->start_lock, 1); ++ sema_init(&dd->run_lock, 1); ++ init_waitqueue_head(&dd->job_queue); ++ ++ /* ++ * Start the new thread. ++ */ ++ down(&dd->start_lock); ++ pid = kernel_thread(daemon, dd, 0); ++ if (pid <= 0) { ++ DMERR("Failed to start %s thread", name); ++ return -EAGAIN; ++ } ++ ++ /* ++ * wait for the daemon to up this mutex. ++ */ ++ down(&dd->start_lock); ++ up(&dd->start_lock); ++ ++ return 0; ++} ++ ++void dm_daemon_stop(struct dm_daemon *dd) ++{ ++ atomic_set(&dd->please_die, 1); ++ dm_daemon_wake(dd); ++ down(&dd->run_lock); ++ up(&dd->run_lock); ++} ++ ++void dm_daemon_wake(struct dm_daemon *dd) ++{ ++ atomic_set(&dd->woken, 1); ++ wake_up_interruptible(&dd->job_queue); ++} ++ ++EXPORT_SYMBOL(dm_daemon_start); ++EXPORT_SYMBOL(dm_daemon_stop); ++EXPORT_SYMBOL(dm_daemon_wake); +--- linux-2.4.22/drivers/md/dm-daemon.h Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-daemon.h Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,29 @@ ++/* ++ * Copyright (C) 2003 Sistina Software ++ * ++ * This file is released under the LGPL. ++ */ ++ ++#ifndef DM_DAEMON_H ++#define DM_DAEMON_H ++ ++#include ++#include ++ ++struct dm_daemon { ++ void (*fn)(void); ++ char name[16]; ++ atomic_t please_die; ++ struct semaphore start_lock; ++ struct semaphore run_lock; ++ ++ atomic_t woken; ++ wait_queue_head_t job_queue; ++}; ++ ++int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void)); ++void dm_daemon_stop(struct dm_daemon *dd); ++void dm_daemon_wake(struct dm_daemon *dd); ++int dm_daemon_running(struct dm_daemon *dd); ++ ++#endif +--- linux-2.4.22/drivers/md/dm-exception-store.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-exception-store.c Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,673 @@ ++/* ++ * dm-snapshot.c ++ * ++ * Copyright (C) 2001-2002 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. 
++ */ ++ ++#include "dm-snapshot.h" ++#include "dm-io.h" ++#include "kcopyd.h" ++ ++#include ++#include ++#include ++#include ++ ++/*----------------------------------------------------------------- ++ * Persistent snapshots, by persistent we mean that the snapshot ++ * will survive a reboot. ++ *---------------------------------------------------------------*/ ++ ++/* ++ * We need to store a record of which parts of the origin have ++ * been copied to the snapshot device. The snapshot code ++ * requires that we copy exception chunks to chunk aligned areas ++ * of the COW store. It makes sense therefore, to store the ++ * metadata in chunk size blocks. ++ * ++ * There is no backward or forward compatibility implemented, ++ * snapshots with different disk versions than the kernel will ++ * not be usable. It is expected that "lvcreate" will blank out ++ * the start of a fresh COW device before calling the snapshot ++ * constructor. ++ * ++ * The first chunk of the COW device just contains the header. ++ * After this there is a chunk filled with exception metadata, ++ * followed by as many exception chunks as can fit in the ++ * metadata areas. ++ * ++ * All on disk structures are in little-endian format. The end ++ * of the exceptions info is indicated by an exception with a ++ * new_chunk of 0, which is invalid since it would point to the ++ * header chunk. ++ */ ++ ++/* ++ * Magic for persistent snapshots: "SnAp" - Feeble isn't it. ++ */ ++#define SNAP_MAGIC 0x70416e53 ++ ++/* ++ * The on-disk version of the metadata. ++ */ ++#define SNAPSHOT_DISK_VERSION 1 ++ ++struct disk_header { ++ uint32_t magic; ++ ++ /* ++ * Is this snapshot valid. There is no way of recovering ++ * an invalid snapshot. ++ */ ++ uint32_t valid; ++ ++ /* ++ * Simple, incrementing version. no backward ++ * compatibility. 
++ */ ++ uint32_t version; ++ ++ /* In sectors */ ++ uint32_t chunk_size; ++}; ++ ++struct disk_exception { ++ uint64_t old_chunk; ++ uint64_t new_chunk; ++}; ++ ++struct commit_callback { ++ void (*callback)(void *, int success); ++ void *context; ++}; ++ ++/* ++ * The top level structure for a persistent exception store. ++ */ ++struct pstore { ++ struct dm_snapshot *snap; /* up pointer to my snapshot */ ++ int version; ++ int valid; ++ uint32_t chunk_size; ++ uint32_t exceptions_per_area; ++ ++ /* ++ * Now that we have an asynchronous kcopyd there is no ++ * need for large chunk sizes, so it won't hurt to have a ++ * whole chunk's worth of metadata in memory at once. ++ */ ++ void *area; ++ ++ /* ++ * Used to keep track of which metadata area the data in ++ * 'area' refers to. ++ */ ++ uint32_t current_area; ++ ++ /* ++ * The next free chunk for an exception. ++ */ ++ uint32_t next_free; ++ ++ /* ++ * The index of the next free exception in the current ++ * metadata area. ++ */ ++ uint32_t current_committed; ++ ++ atomic_t pending_count; ++ uint32_t callback_count; ++ struct commit_callback *callbacks; ++}; ++ ++static inline unsigned int sectors_to_pages(unsigned int sectors) ++{ ++ return sectors / (PAGE_SIZE / SECTOR_SIZE); ++} ++ ++static int alloc_area(struct pstore *ps) ++{ ++ int r = -ENOMEM; ++ size_t i, len, nr_pages; ++ struct page *page, *last = NULL; ++ ++ len = ps->chunk_size << SECTOR_SHIFT; ++ ++ /* ++ * Allocate the chunk_size block of memory that will hold ++ * a single metadata area. ++ */ ++ ps->area = vmalloc(len); ++ if (!ps->area) ++ return r; ++ ++ nr_pages = sectors_to_pages(ps->chunk_size); ++ ++ /* ++ * We lock the pages for ps->area into memory since ++ * they'll be doing a lot of io. We also chain them ++ * together ready for dm-io. 
++ */ ++ for (i = 0; i < nr_pages; i++) { ++ page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); ++ LockPage(page); ++ if (last) ++ last->list.next = &page->list; ++ last = page; ++ } ++ ++ return 0; ++} ++ ++static void free_area(struct pstore *ps) ++{ ++ size_t i, nr_pages; ++ struct page *page; ++ ++ nr_pages = sectors_to_pages(ps->chunk_size); ++ for (i = 0; i < nr_pages; i++) { ++ page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); ++ page->list.next = NULL; ++ UnlockPage(page); ++ } ++ ++ vfree(ps->area); ++} ++ ++/* ++ * Read or write a chunk aligned and sized block of data from a device. ++ */ ++static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) ++{ ++ struct io_region where; ++ unsigned int bits; ++ ++ where.dev = ps->snap->cow->dev; ++ where.sector = ps->chunk_size * chunk; ++ where.count = ps->chunk_size; ++ ++ return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits); ++} ++ ++/* ++ * Read or write a metadata area. Remembering to skip the first ++ * chunk which holds the header. 
++ */ ++static int area_io(struct pstore *ps, uint32_t area, int rw) ++{ ++ int r; ++ uint32_t chunk; ++ ++ /* convert a metadata area index to a chunk index */ ++ chunk = 1 + ((ps->exceptions_per_area + 1) * area); ++ ++ r = chunk_io(ps, chunk, rw); ++ if (r) ++ return r; ++ ++ ps->current_area = area; ++ return 0; ++} ++ ++static int zero_area(struct pstore *ps, uint32_t area) ++{ ++ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); ++ return area_io(ps, area, WRITE); ++} ++ ++static int read_header(struct pstore *ps, int *new_snapshot) ++{ ++ int r; ++ struct disk_header *dh; ++ ++ r = chunk_io(ps, 0, READ); ++ if (r) ++ return r; ++ ++ dh = (struct disk_header *) ps->area; ++ ++ if (le32_to_cpu(dh->magic) == 0) { ++ *new_snapshot = 1; ++ ++ } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { ++ *new_snapshot = 0; ++ ps->valid = le32_to_cpu(dh->valid); ++ ps->version = le32_to_cpu(dh->version); ++ ps->chunk_size = le32_to_cpu(dh->chunk_size); ++ ++ } else { ++ DMWARN("Invalid/corrupt snapshot"); ++ r = -ENXIO; ++ } ++ ++ return r; ++} ++ ++static int write_header(struct pstore *ps) ++{ ++ struct disk_header *dh; ++ ++ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); ++ ++ dh = (struct disk_header *) ps->area; ++ dh->magic = cpu_to_le32(SNAP_MAGIC); ++ dh->valid = cpu_to_le32(ps->valid); ++ dh->version = cpu_to_le32(ps->version); ++ dh->chunk_size = cpu_to_le32(ps->chunk_size); ++ ++ return chunk_io(ps, 0, WRITE); ++} ++ ++/* ++ * Access functions for the disk exceptions, these do the endian conversions. 
++ */ ++static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) ++{ ++ if (index >= ps->exceptions_per_area) ++ return NULL; ++ ++ return ((struct disk_exception *) ps->area) + index; ++} ++ ++static int read_exception(struct pstore *ps, ++ uint32_t index, struct disk_exception *result) ++{ ++ struct disk_exception *e; ++ ++ e = get_exception(ps, index); ++ if (!e) ++ return -EINVAL; ++ ++ /* copy it */ ++ result->old_chunk = le64_to_cpu(e->old_chunk); ++ result->new_chunk = le64_to_cpu(e->new_chunk); ++ ++ return 0; ++} ++ ++static int write_exception(struct pstore *ps, ++ uint32_t index, struct disk_exception *de) ++{ ++ struct disk_exception *e; ++ ++ e = get_exception(ps, index); ++ if (!e) ++ return -EINVAL; ++ ++ /* copy it */ ++ e->old_chunk = cpu_to_le64(de->old_chunk); ++ e->new_chunk = cpu_to_le64(de->new_chunk); ++ ++ return 0; ++} ++ ++/* ++ * Registers the exceptions that are present in the current area. ++ * 'full' is filled in to indicate if the area has been ++ * filled. ++ */ ++static int insert_exceptions(struct pstore *ps, int *full) ++{ ++ int r; ++ unsigned int i; ++ struct disk_exception de; ++ ++ /* presume the area is full */ ++ *full = 1; ++ ++ for (i = 0; i < ps->exceptions_per_area; i++) { ++ r = read_exception(ps, i, &de); ++ ++ if (r) ++ return r; ++ ++ /* ++ * If the new_chunk is pointing at the start of ++ * the COW device, where the first metadata area ++ * is we know that we've hit the end of the ++ * exceptions. Therefore the area is not full. ++ */ ++ if (de.new_chunk == 0LL) { ++ ps->current_committed = i; ++ *full = 0; ++ break; ++ } ++ ++ /* ++ * Keep track of the start of the free chunks. ++ */ ++ if (ps->next_free <= de.new_chunk) ++ ps->next_free = de.new_chunk + 1; ++ ++ /* ++ * Otherwise we add the exception to the snapshot. 
++ */ ++ r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); ++ if (r) ++ return r; ++ } ++ ++ return 0; ++} ++ ++static int read_exceptions(struct pstore *ps) ++{ ++ uint32_t area; ++ int r, full = 1; ++ ++ /* ++ * Keep reading chunks and inserting exceptions until ++ * we find a partially full area. ++ */ ++ for (area = 0; full; area++) { ++ r = area_io(ps, area, READ); ++ if (r) ++ return r; ++ ++ r = insert_exceptions(ps, &full); ++ if (r) ++ return r; ++ } ++ ++ return 0; ++} ++ ++static inline struct pstore *get_info(struct exception_store *store) ++{ ++ return (struct pstore *) store->context; ++} ++ ++static void persistent_fraction_full(struct exception_store *store, ++ sector_t *numerator, sector_t *denominator) ++{ ++ *numerator = get_info(store)->next_free * store->snap->chunk_size; ++ *denominator = get_dev_size(store->snap->cow->dev); ++} ++ ++static void persistent_destroy(struct exception_store *store) ++{ ++ struct pstore *ps = get_info(store); ++ ++ dm_io_put(sectors_to_pages(ps->chunk_size)); ++ vfree(ps->callbacks); ++ free_area(ps); ++ kfree(ps); ++} ++ ++static int persistent_read_metadata(struct exception_store *store) ++{ ++ int r, new_snapshot; ++ struct pstore *ps = get_info(store); ++ ++ /* ++ * Read the snapshot header. ++ */ ++ r = read_header(ps, &new_snapshot); ++ if (r) ++ return r; ++ ++ /* ++ * Do we need to set up a new snapshot ? ++ */ ++ if (new_snapshot) { ++ r = write_header(ps); ++ if (r) { ++ DMWARN("write_header failed"); ++ return r; ++ } ++ ++ r = zero_area(ps, 0); ++ if (r) { ++ DMWARN("zero_area(0) failed"); ++ return r; ++ } ++ ++ } else { ++ /* ++ * Sanity checks. ++ */ ++ if (!ps->valid) { ++ DMWARN("snapshot is marked invalid"); ++ return -EINVAL; ++ } ++ ++ if (ps->version != SNAPSHOT_DISK_VERSION) { ++ DMWARN("unable to handle snapshot disk version %d", ++ ps->version); ++ return -EINVAL; ++ } ++ ++ /* ++ * Read the metadata. 
++ */ ++ r = read_exceptions(ps); ++ if (r) ++ return r; ++ } ++ ++ return 0; ++} ++ ++static int persistent_prepare(struct exception_store *store, ++ struct exception *e) ++{ ++ struct pstore *ps = get_info(store); ++ uint32_t stride; ++ sector_t size = get_dev_size(store->snap->cow->dev); ++ ++ /* Is there enough room ? */ ++ if (size < ((ps->next_free + 1) * store->snap->chunk_size)) ++ return -ENOSPC; ++ ++ e->new_chunk = ps->next_free; ++ ++ /* ++ * Move onto the next free pending, making sure to take ++ * into account the location of the metadata chunks. ++ */ ++ stride = (ps->exceptions_per_area + 1); ++ if ((++ps->next_free % stride) == 1) ++ ps->next_free++; ++ ++ atomic_inc(&ps->pending_count); ++ return 0; ++} ++ ++static void persistent_commit(struct exception_store *store, ++ struct exception *e, ++ void (*callback) (void *, int success), ++ void *callback_context) ++{ ++ int r; ++ unsigned int i; ++ struct pstore *ps = get_info(store); ++ struct disk_exception de; ++ struct commit_callback *cb; ++ ++ de.old_chunk = e->old_chunk; ++ de.new_chunk = e->new_chunk; ++ write_exception(ps, ps->current_committed++, &de); ++ ++ /* ++ * Add the callback to the back of the array. This code ++ * is the only place where the callback array is ++ * manipulated, and we know that it will never be called ++ * multiple times concurrently. ++ */ ++ cb = ps->callbacks + ps->callback_count++; ++ cb->callback = callback; ++ cb->context = callback_context; ++ ++ /* ++ * If there are no more exceptions in flight, or we have ++ * filled this metadata area we commit the exceptions to ++ * disk. ++ */ ++ if (atomic_dec_and_test(&ps->pending_count) || ++ (ps->current_committed == ps->exceptions_per_area)) { ++ r = area_io(ps, ps->current_area, WRITE); ++ if (r) ++ ps->valid = 0; ++ ++ for (i = 0; i < ps->callback_count; i++) { ++ cb = ps->callbacks + i; ++ cb->callback(cb->context, r == 0 ? 
1 : 0); ++ } ++ ++ ps->callback_count = 0; ++ } ++ ++ /* ++ * Have we completely filled the current area ? ++ */ ++ if (ps->current_committed == ps->exceptions_per_area) { ++ ps->current_committed = 0; ++ r = zero_area(ps, ps->current_area + 1); ++ if (r) ++ ps->valid = 0; ++ } ++} ++ ++static void persistent_drop(struct exception_store *store) ++{ ++ struct pstore *ps = get_info(store); ++ ++ ps->valid = 0; ++ if (write_header(ps)) ++ DMWARN("write header failed"); ++} ++ ++int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) ++{ ++ int r; ++ struct pstore *ps; ++ ++ r = dm_io_get(sectors_to_pages(chunk_size)); ++ if (r) ++ return r; ++ ++ /* allocate the pstore */ ++ ps = kmalloc(sizeof(*ps), GFP_KERNEL); ++ if (!ps) { ++ r = -ENOMEM; ++ goto bad; ++ } ++ ++ ps->snap = store->snap; ++ ps->valid = 1; ++ ps->version = SNAPSHOT_DISK_VERSION; ++ ps->chunk_size = chunk_size; ++ ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / ++ sizeof(struct disk_exception); ++ ps->next_free = 2; /* skipping the header and first area */ ++ ps->current_committed = 0; ++ ++ r = alloc_area(ps); ++ if (r) ++ goto bad; ++ ++ /* ++ * Allocate space for all the callbacks. 
++ */ ++ ps->callback_count = 0; ++ atomic_set(&ps->pending_count, 0); ++ ps->callbacks = vcalloc(ps->exceptions_per_area, ++ sizeof(*ps->callbacks)); ++ ++ if (!ps->callbacks) { ++ r = -ENOMEM; ++ goto bad; ++ } ++ ++ store->destroy = persistent_destroy; ++ store->read_metadata = persistent_read_metadata; ++ store->prepare_exception = persistent_prepare; ++ store->commit_exception = persistent_commit; ++ store->drop_snapshot = persistent_drop; ++ store->fraction_full = persistent_fraction_full; ++ store->context = ps; ++ ++ return 0; ++ ++ bad: ++ dm_io_put(sectors_to_pages(chunk_size)); ++ if (ps) { ++ if (ps->callbacks) ++ vfree(ps->callbacks); ++ ++ kfree(ps); ++ } ++ return r; ++} ++ ++/*----------------------------------------------------------------- ++ * Implementation of the store for non-persistent snapshots. ++ *---------------------------------------------------------------*/ ++struct transient_c { ++ sector_t next_free; ++}; ++ ++void transient_destroy(struct exception_store *store) ++{ ++ kfree(store->context); ++} ++ ++int transient_read_metadata(struct exception_store *store) ++{ ++ return 0; ++} ++ ++int transient_prepare(struct exception_store *store, struct exception *e) ++{ ++ struct transient_c *tc = (struct transient_c *) store->context; ++ sector_t size = get_dev_size(store->snap->cow->dev); ++ ++ if (size < (tc->next_free + store->snap->chunk_size)) ++ return -1; ++ ++ e->new_chunk = sector_to_chunk(store->snap, tc->next_free); ++ tc->next_free += store->snap->chunk_size; ++ ++ return 0; ++} ++ ++void transient_commit(struct exception_store *store, ++ struct exception *e, ++ void (*callback) (void *, int success), ++ void *callback_context) ++{ ++ /* Just succeed */ ++ callback(callback_context, 1); ++} ++ ++static void transient_fraction_full(struct exception_store *store, ++ sector_t *numerator, sector_t *denominator) ++{ ++ *numerator = ((struct transient_c *) store->context)->next_free; ++ *denominator = 
get_dev_size(store->snap->cow->dev); ++} ++ ++int dm_create_transient(struct exception_store *store, ++ struct dm_snapshot *s, int blocksize) ++{ ++ struct transient_c *tc; ++ ++ memset(store, 0, sizeof(*store)); ++ store->destroy = transient_destroy; ++ store->read_metadata = transient_read_metadata; ++ store->prepare_exception = transient_prepare; ++ store->commit_exception = transient_commit; ++ store->fraction_full = transient_fraction_full; ++ store->snap = s; ++ ++ tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); ++ if (!tc) ++ return -ENOMEM; ++ ++ tc->next_free = 0; ++ store->context = tc; ++ ++ return 0; ++} +--- linux-2.4.22/drivers/md/dm-io.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-io.c Tue Nov 18 14:01:30 2003 +@@ -0,0 +1,361 @@ ++/* ++ * Copyright (C) 2003 Sistina Software ++ * ++ * This file is released under the GPL. ++ */ ++ ++#include "dm-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* FIXME: can we shrink this ? */ ++struct io_context { ++ int rw; ++ unsigned int error; ++ atomic_t count; ++ struct task_struct *sleeper; ++ io_notify_fn callback; ++ void *context; ++}; ++ ++/* ++ * We maintain a pool of buffer heads for dispatching the io. ++ */ ++static unsigned int _num_bhs; ++static mempool_t *_buffer_pool; ++ ++/* ++ * io contexts are only dynamically allocated for asynchronous ++ * io. Since async io is likely to be the majority of io we'll ++ * have the same number of io contexts as buffer heads ! (FIXME: ++ * must reduce this). 
++ */
++mempool_t *_io_pool;
++
++/*
++ * mempool allocation callback for buffer heads: takes one from
++ * bh_cachep and initialises the fields set below.  pool_data is
++ * unused.
++ */
++static void *alloc_bh(int gfp_mask, void *pool_data)
++{
++	struct buffer_head *bh;
++
++	bh = kmem_cache_alloc(bh_cachep, gfp_mask);
++	if (bh) {
++		bh->b_reqnext = NULL;
++		init_waitqueue_head(&bh->b_wait);
++		INIT_LIST_HEAD(&bh->b_inode_buffers);
++	}
++
++	return bh;
++}
++
++/* mempool allocation callback for io contexts. */
++static void *alloc_io(int gfp_mask, void *pool_data)
++{
++	return kmalloc(sizeof(struct io_context), gfp_mask);
++}
++
++/* mempool free callback for io contexts. */
++static void free_io(void *element, void *pool_data)
++{
++	kfree(element);
++}
++
++/* Number of buffer heads reserved per page of expected io. */
++static unsigned int pages_to_buffers(unsigned int pages)
++{
++	return 4 * pages;	/* too many ? */
++}
++
++/*
++ * Create, resize or destroy the buffer-head and io-context pools
++ * so that each holds new_bhs elements.
++ *
++ * Returns 0 on success, in which case _num_bhs is updated, or a
++ * negative errno.  When creating the pools, failure leaves both
++ * pool pointers NULL so that a later call starts from scratch.
++ */
++static int resize_pool(unsigned int new_bhs)
++{
++	int r = 0;
++
++	if (_buffer_pool) {
++		if (new_bhs == 0) {
++			/* free off the pools */
++			mempool_destroy(_buffer_pool);
++			mempool_destroy(_io_pool);
++			_buffer_pool = _io_pool = NULL;
++		} else {
++			/* resize the pools */
++			r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
++			if (!r)
++				r = mempool_resize(_io_pool,
++						   new_bhs, GFP_KERNEL);
++		}
++	} else {
++		/*
++		 * create new pools
++		 *
++		 * Bail out as soon as the first pool fails: carrying on
++		 * would either strand a freshly created _io_pool (it
++		 * would be overwritten by the next call, because
++		 * _buffer_pool is still NULL) or hand a NULL pool to
++		 * mempool_destroy().
++		 */
++		_buffer_pool = mempool_create(new_bhs, alloc_bh,
++					      mempool_free_slab, bh_cachep);
++		if (!_buffer_pool)
++			return -ENOMEM;
++
++		_io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
++		if (!_io_pool) {
++			mempool_destroy(_buffer_pool);
++			_buffer_pool = NULL;
++			return -ENOMEM;
++		}
++	}
++
++	if (!r)
++		_num_bhs = new_bhs;
++
++	return r;
++}
++
++/* Grow the pools to cover num_pages more pages of concurrent io. */
++int dm_io_get(unsigned int num_pages)
++{
++	return resize_pool(_num_bhs + pages_to_buffers(num_pages));
++}
++
++/* Give back a reservation previously taken with dm_io_get(). */
++void dm_io_put(unsigned int num_pages)
++{
++	resize_pool(_num_bhs - pages_to_buffers(num_pages));
++}
++
++/*-----------------------------------------------------------------
++ * We need to keep track of which region a buffer is doing io
++ * for.  In order to save a memory allocation we store this in an
++ * unused field of the buffer head, and provide these access
++ * functions.
++ *
++ * FIXME: add compile time check that an unsigned int can fit
++ * into a pointer.
++ *
++ *---------------------------------------------------------------*/
++/*
++ * The region number is stashed in the pointer-typed b_journal_head
++ * field.  Convert through unsigned long so the integer/pointer
++ * casts stay the same width on 64-bit targets.
++ */
++static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
++{
++	bh->b_journal_head = (void *) (unsigned long) region;
++}
++
++static inline int bh_get_region(struct buffer_head *bh)
++{
++	return (unsigned int) (unsigned long) bh->b_journal_head;
++}
++
++/*-----------------------------------------------------------------
++ * We need an io object to keep track of the number of bhs that
++ * have been dispatched for a particular io.
++ *---------------------------------------------------------------*/
++/*
++ * Drop one reference on 'io', recording a failure for 'region' if
++ * 'error' is set.  When the count reaches zero the sleeper is
++ * woken (synchronous io) or the context is returned to _io_pool
++ * and the callback is run with the accumulated error bits
++ * (asynchronous io).
++ *
++ * NOTE(review): io->error is an unsigned int, whereas the kernel
++ * bitops are presumably defined on unsigned long words - confirm
++ * set_bit() is safe here on 64-bit big-endian targets.
++ */
++static void dec_count(struct io_context *io, unsigned int region, int error)
++{
++	if (error)
++		set_bit(region, &io->error);
++
++	if (atomic_dec_and_test(&io->count)) {
++		if (io->sleeper)
++			wake_up_process(io->sleeper);
++
++		else {
++			/* copy out before the context is freed */
++			unsigned int r = io->error;
++			io_notify_fn fn = io->callback;
++			void *context = io->context;
++
++			mempool_free(io, _io_pool);
++			fn(r, context);
++		}
++	}
++}
++
++/*
++ * Completion handler installed on every buffer head by do_page().
++ */
++static void endio(struct buffer_head *bh, int uptodate)
++{
++	struct io_context *io = (struct io_context *) bh->b_private;
++
++	if (!uptodate && io->rw != WRITE) {
++		/*
++		 * We need to zero this region, otherwise people
++		 * like kcopyd may write the arbitrary contents
++		 * of the page.
++		 */
++		memset(bh->b_data, 0, bh->b_size);
++	}
++
++	dec_count(io, bh_get_region(bh), !uptodate);
++	mempool_free(bh, _buffer_pool);
++}
++
++/*
++ * Primitives for alignment calculations.
++ */
++int fls(unsigned n)
++{
++	return generic_fls32(n);
++}
++
++static inline int log2_floor(unsigned n)
++{
++	return ffs(n) - 1;
++}
++
++static inline int log2_align(unsigned n)
++{
++	return fls(n) - 1;
++}
++
++/*
++ * Returns the next block for io.
++ */ ++static int do_page(kdev_t dev, sector_t *block, sector_t end_block, ++ unsigned int block_size, ++ struct page *p, unsigned int offset, ++ unsigned int region, struct io_context *io) ++{ ++ struct buffer_head *bh; ++ sector_t b = *block; ++ sector_t blocks_per_page = PAGE_SIZE / block_size; ++ unsigned int this_size; /* holds the size of the current io */ ++ sector_t len; ++ ++ if (!blocks_per_page) { ++ DMERR("dm-io: PAGE_SIZE (%lu) < block_size (%u) unsupported", ++ PAGE_SIZE, block_size); ++ return 0; ++ } ++ ++ while ((offset < PAGE_SIZE) && (b != end_block)) { ++ bh = mempool_alloc(_buffer_pool, GFP_NOIO); ++ init_buffer(bh, endio, io); ++ bh_set_region(bh, region); ++ ++ /* ++ * Block size must be a power of 2 and aligned ++ * correctly. ++ */ ++ ++ len = min(end_block - b, blocks_per_page); ++ len = min(len, blocks_per_page - offset / block_size); ++ ++ if (!len) { ++ DMERR("dm-io: Invalid offset/block_size (%u/%u).", ++ offset, block_size); ++ return 0; ++ } ++ ++ this_size = 1 << log2_align(len); ++ if (b) ++ this_size = min(this_size, ++ (unsigned) 1 << log2_floor(b)); ++ ++ /* ++ * Add in the job offset. 
++ */ ++ bh->b_blocknr = (b / this_size); ++ bh->b_size = block_size * this_size; ++ set_bh_page(bh, p, offset); ++ bh->b_this_page = bh; ++ ++ bh->b_dev = dev; ++ atomic_set(&bh->b_count, 1); ++ ++ bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) | ++ (1 << BH_Lock)); ++ ++ if (io->rw == WRITE) ++ clear_bit(BH_Dirty, &bh->b_state); ++ ++ atomic_inc(&io->count); ++ submit_bh(io->rw, bh); ++ ++ b += this_size; ++ offset += block_size * this_size; ++ } ++ ++ *block = b; ++ return (b == end_block); ++} ++ ++static void do_region(unsigned int region, struct io_region *where, ++ struct page *page, unsigned int offset, ++ struct io_context *io) ++{ ++ unsigned int block_size = get_hardsect_size(where->dev); ++ unsigned int sblock_size = block_size >> 9; ++ sector_t block = where->sector / sblock_size; ++ sector_t end_block = (where->sector + where->count) / sblock_size; ++ ++ while (1) { ++ if (do_page(where->dev, &block, end_block, block_size, ++ page, offset, region, io)) ++ break; ++ ++ offset = 0; /* only offset the first page */ ++ ++ page = list_entry(page->list.next, struct page, list); ++ } ++} ++ ++static void dispatch_io(unsigned int num_regions, struct io_region *where, ++ struct page *pages, unsigned int offset, ++ struct io_context *io) ++{ ++ int i; ++ ++ for (i = 0; i < num_regions; i++) ++ if (where[i].count) ++ do_region(i, where + i, pages, offset, io); ++ ++ /* ++ * Drop the extra refence that we were holding to avoid ++ * the io being completed too early. 
++ */ ++ dec_count(io, 0, 0); ++} ++ ++/* ++ * Synchronous io ++ */ ++int dm_io_sync(unsigned int num_regions, struct io_region *where, ++ int rw, struct page *pages, unsigned int offset, ++ unsigned int *error_bits) ++{ ++ struct io_context io; ++ ++ BUG_ON(num_regions > 1 && rw != WRITE); ++ ++ io.rw = rw; ++ io.error = 0; ++ atomic_set(&io.count, 1); /* see dispatch_io() */ ++ io.sleeper = current; ++ ++ dispatch_io(num_regions, where, pages, offset, &io); ++ run_task_queue(&tq_disk); ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ ++ if (!atomic_read(&io.count)) ++ break; ++ ++ schedule(); ++ } ++ set_current_state(TASK_RUNNING); ++ ++ *error_bits = io.error; ++ return io.error ? -EIO : 0; ++} ++ ++/* ++ * Asynchronous io ++ */ ++int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, ++ struct page *pages, unsigned int offset, ++ io_notify_fn fn, void *context) ++{ ++ struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO); ++ ++ io->rw = rw; ++ io->error = 0; ++ atomic_set(&io->count, 1); /* see dispatch_io() */ ++ io->sleeper = NULL; ++ io->callback = fn; ++ io->context = context; ++ ++ dispatch_io(num_regions, where, pages, offset, io); ++ return 0; ++} ++ ++EXPORT_SYMBOL(dm_io_get); ++EXPORT_SYMBOL(dm_io_put); ++EXPORT_SYMBOL(dm_io_sync); ++EXPORT_SYMBOL(dm_io_async); +--- linux-2.4.22/drivers/md/dm-io.h Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-io.h Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,86 @@ ++/* ++ * Copyright (C) 2003 Sistina Software ++ * ++ * This file is released under the GPL. ++ */ ++ ++#ifndef _DM_IO_H ++#define _DM_IO_H ++ ++#include "dm.h" ++ ++#include ++ ++/* Move these to bitops.h eventually */ ++/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */ ++/* (c) 2002, D.Phillips and Sistina Software */ ++/* Licensed under Version 2 of the GPL */ ++ ++static unsigned generic_fls8(unsigned n) ++{ ++ return n & 0xf0 ? ++ n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5: ++ n & 0x0c ? 
(n >> 3) + 3 : n - ((n + 1) >> 2); ++} ++ ++static inline unsigned generic_fls16(unsigned n) ++{ ++ return n & 0xff00? generic_fls8(n >> 8) + 8 : generic_fls8(n); ++} ++ ++static inline unsigned generic_fls32(unsigned n) ++{ ++ return n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n); ++} ++ ++/* FIXME make this configurable */ ++#define DM_MAX_IO_REGIONS 8 ++ ++struct io_region { ++ kdev_t dev; ++ sector_t sector; ++ sector_t count; ++}; ++ ++ ++/* ++ * 'error' is a bitset, with each bit indicating whether an error ++ * occurred doing io to the corresponding region. ++ */ ++typedef void (*io_notify_fn)(unsigned int error, void *context); ++ ++ ++/* ++ * Before anyone uses the IO interface they should call ++ * dm_io_get(), specifying roughly how many pages they are ++ * expecting to perform io on concurrently. ++ * ++ * This function may block. ++ */ ++int dm_io_get(unsigned int num_pages); ++void dm_io_put(unsigned int num_pages); ++ ++ ++/* ++ * Synchronous IO. ++ * ++ * Please ensure that the rw flag in the next two functions is ++ * either READ or WRITE, ie. we don't take READA. Any ++ * regions with a zero count field will be ignored. ++ */ ++int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, ++ struct page *pages, unsigned int offset, ++ unsigned int *error_bits); ++ ++ ++/* ++ * Aynchronous IO. ++ * ++ * The 'where' array may be safely allocated on the stack since ++ * the function takes a copy. ++ */ ++int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, ++ struct page *pages, unsigned int offset, ++ io_notify_fn fn, void *context); ++ ++#endif +--- linux-2.4.22/drivers/md/dm-ioctl.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-ioctl.c Tue Nov 18 14:45:13 2003 +@@ -0,0 +1,1284 @@ ++/* ++ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. 
++ */ ++ ++#include "dm.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define DM_DRIVER_EMAIL "dm@uk.sistina.com" ++ ++/*----------------------------------------------------------------- ++ * The ioctl interface needs to be able to look up devices by ++ * name or uuid. ++ *---------------------------------------------------------------*/ ++struct hash_cell { ++ struct list_head name_list; ++ struct list_head uuid_list; ++ ++ char *name; ++ char *uuid; ++ struct mapped_device *md; ++ struct dm_table *new_map; ++ ++ /* I hate devfs */ ++ devfs_handle_t devfs_entry; ++}; ++ ++#define NUM_BUCKETS 64 ++#define MASK_BUCKETS (NUM_BUCKETS - 1) ++static struct list_head _name_buckets[NUM_BUCKETS]; ++static struct list_head _uuid_buckets[NUM_BUCKETS]; ++ ++static devfs_handle_t _dev_dir; ++void dm_hash_remove_all(void); ++ ++/* ++ * Guards access to both hash tables. ++ */ ++static DECLARE_RWSEM(_hash_lock); ++ ++static void init_buckets(struct list_head *buckets) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NUM_BUCKETS; i++) ++ INIT_LIST_HEAD(buckets + i); ++} ++ ++int dm_hash_init(void) ++{ ++ init_buckets(_name_buckets); ++ init_buckets(_uuid_buckets); ++ _dev_dir = devfs_mk_dir(0, DM_DIR, NULL); ++ return 0; ++} ++ ++void dm_hash_exit(void) ++{ ++ dm_hash_remove_all(); ++ devfs_unregister(_dev_dir); ++} ++ ++/*----------------------------------------------------------------- ++ * Hash function: ++ * We're not really concerned with the str hash function being ++ * fast since it's only used by the ioctl interface. 
++ *---------------------------------------------------------------*/
++static unsigned int hash_str(const char *str)
++{
++	const unsigned int hash_mult = 2654435387U;
++	const char *p;
++	unsigned int acc = 0;
++
++	/* Mix each character in turn into the running value. */
++	for (p = str; *p; p++)
++		acc = (acc + (unsigned int) *p) * hash_mult;
++
++	/* Reduce the result to a bucket index. */
++	return acc & MASK_BUCKETS;
++}
++
++/*-----------------------------------------------------------------
++ * Code for looking up a device by name
++ *---------------------------------------------------------------*/
++/*
++ * Walk the name bucket that 'str' hashes to and return the cell
++ * whose name matches exactly, or NULL if there is none.
++ */
++static struct hash_cell *__get_name_cell(const char *str)
++{
++	struct list_head *bucket = &_name_buckets[hash_str(str)];
++	struct list_head *pos;
++
++	list_for_each (pos, bucket) {
++		struct hash_cell *cell =
++		    list_entry(pos, struct hash_cell, name_list);
++
++		if (strcmp(cell->name, str) == 0)
++			return cell;
++	}
++
++	return NULL;
++}
++
++/*
++ * As __get_name_cell(), but searches the uuid table.
++ */
++static struct hash_cell *__get_uuid_cell(const char *str)
++{
++	struct list_head *bucket = &_uuid_buckets[hash_str(str)];
++	struct list_head *pos;
++
++	list_for_each (pos, bucket) {
++		struct hash_cell *cell =
++		    list_entry(pos, struct hash_cell, uuid_list);
++
++		if (strcmp(cell->uuid, str) == 0)
++			return cell;
++	}
++
++	return NULL;
++}
++
++/*-----------------------------------------------------------------
++ * Inserting, removing and renaming a device.
++ *---------------------------------------------------------------*/
++/* Duplicate 'str' into kmalloc'd memory; returns NULL on failure. */
++static inline char *kstrdup(const char *str)
++{
++	char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
++	if (r)
++		strcpy(r, str);
++	return r;
++}
++
++/*
++ * Allocate a hash cell holding private copies of 'name' and, if
++ * given, 'uuid'.  Returns NULL if any allocation fails.
++ */
++static struct hash_cell *alloc_cell(const char *name, const char *uuid,
++				    struct mapped_device *md)
++{
++	struct hash_cell *hc;
++
++	hc = kmalloc(sizeof(*hc), GFP_KERNEL);
++	if (!hc)
++		return NULL;
++
++	hc->name = kstrdup(name);
++	if (!hc->name) {
++		kfree(hc);
++		return NULL;
++	}
++
++	if (!uuid)
++		hc->uuid = NULL;
++
++	else {
++		hc->uuid = kstrdup(uuid);
++		if (!hc->uuid) {
++			kfree(hc->name);
++			kfree(hc);
++			return NULL;
++		}
++	}
++
++	INIT_LIST_HEAD(&hc->name_list);
++	INIT_LIST_HEAD(&hc->uuid_list);
++	hc->md = md;
++	hc->new_map = NULL;
++	return hc;
++}
++
++static void free_cell(struct hash_cell *hc)
++{
++	if (hc) {
++		kfree(hc->name);
++		kfree(hc->uuid);
++		kfree(hc);
++	}
++}
++
++/*
++ * devfs stuff.
++ */
++static int register_with_devfs(struct hash_cell *hc)
++{
++	kdev_t dev = dm_kdev(hc->md);
++
++	hc->devfs_entry =
++	    devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
++			   major(dev), minor(dev),
++			   S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
++			   &dm_blk_dops, NULL);
++
++	return 0;
++}
++
++static int unregister_with_devfs(struct hash_cell *hc)
++{
++	devfs_unregister(hc->devfs_entry);
++	return 0;
++}
++
++/*
++ * The kdev_t and uuid of a device can never change once it is
++ * initially inserted.
++ */
++int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
++{
++	struct hash_cell *cell;
++
++	/*
++	 * Allocate the new cells.
++	 */
++	cell = alloc_cell(name, uuid, md);
++	if (!cell)
++		return -ENOMEM;
++
++	/*
++	 * Insert the cell into both hash tables.
++	 */
++	down_write(&_hash_lock);
++	if (__get_name_cell(name))
++		goto bad;
++
++	list_add(&cell->name_list, _name_buckets + hash_str(name));
++
++	if (uuid) {
++		if (__get_uuid_cell(uuid)) {
++			list_del(&cell->name_list);
++			goto bad;
++		}
++		list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
++	}
++	register_with_devfs(cell);
++	dm_get(md);
++	up_write(&_hash_lock);
++
++	return 0;
++
++      bad:
++	up_write(&_hash_lock);
++	free_cell(cell);
++	return -EBUSY;
++}
++
++void __hash_remove(struct hash_cell *hc)
++{
++	/* remove from the dev hash */
++	list_del(&hc->uuid_list);
++	list_del(&hc->name_list);
++	unregister_with_devfs(hc);
++	dm_put(hc->md);
++	if (hc->new_map)
++		dm_table_put(hc->new_map);
++	free_cell(hc);
++}
++
++void dm_hash_remove_all(void)
++{
++	int i;
++	struct hash_cell *hc;
++	struct list_head *tmp, *n;
++
++	down_write(&_hash_lock);
++	for (i = 0; i < NUM_BUCKETS; i++) {
++		list_for_each_safe (tmp, n, _name_buckets + i) {
++			hc = list_entry(tmp, struct hash_cell, name_list);
++			__hash_remove(hc);
++		}
++	}
++	up_write(&_hash_lock);
++}
++
++int dm_hash_rename(const char *old, const char *new)
++{
++	char *new_name, *old_name;
++	struct hash_cell *hc;
++
++	/*
++	 * duplicate new.
++	 */
++	new_name = kstrdup(new);
++	if (!new_name)
++		return -ENOMEM;
++
++	down_write(&_hash_lock);
++
++	/*
++	 * Is new free ?
++	 */
++	hc = __get_name_cell(new);
++	if (hc) {
++		DMWARN("asked to rename to an already existing name %s -> %s",
++		       old, new);
++		up_write(&_hash_lock);
++		kfree(new_name);
++		return -EBUSY;
++	}
++
++	/*
++	 * Is there such a device as 'old' ?
++	 */
++	hc = __get_name_cell(old);
++	if (!hc) {
++		DMWARN("asked to rename a non existent device %s -> %s",
++		       old, new);
++		up_write(&_hash_lock);
++		kfree(new_name);
++		return -ENXIO;
++	}
++
++	/*
++	 * rename and move the name cell.
++	 */
++	list_del(&hc->name_list);
++	old_name = hc->name;
++	hc->name = new_name;
++	list_add(&hc->name_list, _name_buckets + hash_str(new_name));
++
++	/* rename the device node in devfs */
++	unregister_with_devfs(hc);
++	register_with_devfs(hc);
++
++	up_write(&_hash_lock);
++	kfree(old_name);
++	return 0;
++}
++
++/*-----------------------------------------------------------------
++ * Implementation of the ioctl commands
++ *---------------------------------------------------------------*/
++/*
++ * All the ioctl commands get dispatched to functions with this
++ * prototype.
++ */
++typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
++
++static int remove_all(struct dm_ioctl *param, size_t param_size)
++{
++	dm_hash_remove_all();
++	param->data_size = 0;
++	return 0;
++}
++
++/*
++ * Round up the ptr to an 8-byte boundary.
++ */
++#define ALIGN_MASK 7
++static inline void *align_ptr(void *ptr)
++{
++	return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
++}
++
++/*
++ * Retrieves the data payload buffer from an already allocated
++ * struct dm_ioctl.
++ */
++static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
++			       size_t *len)
++{
++	param->data_start = align_ptr(param + 1) - (void *) param;
++
++	if (param->data_start < param_size)
++		*len = param_size - param->data_start;
++	else
++		*len = 0;
++
++	return ((void *) param) + param->data_start;
++}
++
++/*
++ * Write a chain of struct dm_name_list entries, one per device in
++ * the name hash, into the result buffer.  Sets DM_BUFFER_FULL_FLAG
++ * and writes nothing if the buffer is too small.  Always returns 0.
++ */
++static int list_devices(struct dm_ioctl *param, size_t param_size)
++{
++	unsigned int i;
++	struct hash_cell *hc;
++	size_t len, needed = 0;
++	struct dm_name_list *nl, *old_nl = NULL;
++
++	down_write(&_hash_lock);
++
++	/*
++	 * Loop through all the devices working out how much
++	 * space we need.  This has to match the fill loop below,
++	 * which steps over the struct, the name *and its
++	 * terminating NUL* before rounding up to the alignment.
++	 */
++	for (i = 0; i < NUM_BUCKETS; i++) {
++		list_for_each_entry (hc, _name_buckets + i, name_list) {
++			needed += sizeof(struct dm_name_list);
++			needed += strlen(hc->name) + 1;
++			needed += ALIGN_MASK;
++		}
++	}
++
++	/*
++	 * Grab our output buffer.
++	 */
++	nl = get_result_buffer(param, param_size, &len);
++	if (len < needed) {
++		param->flags |= DM_BUFFER_FULL_FLAG;
++		goto out;
++	}
++	param->data_size = param->data_start + needed;
++
++	/*
++	 * NOTE(review): with no devices 'needed' is 0, so this store
++	 * is not covered by the length check above - confirm the
++	 * buffer always has room for it.
++	 */
++	nl->dev = 0;	/* Flags no data */
++
++	/*
++	 * Now loop through filling out the names.
++	 */
++	for (i = 0; i < NUM_BUCKETS; i++) {
++		list_for_each_entry (hc, _name_buckets + i, name_list) {
++			if (old_nl)
++				old_nl->next = (uint32_t) ((void *) nl -
++							   (void *) old_nl);
++
++			nl->dev = dm_kdev(hc->md);
++			nl->next = 0;
++			strcpy(nl->name, hc->name);
++
++			old_nl = nl;
++			nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
++		}
++	}
++
++      out:
++	up_write(&_hash_lock);
++	return 0;
++}
++
++static int check_name(const char *name)
++{
++	if (strchr(name, '/')) {
++		DMWARN("invalid device name");
++		return -EINVAL;
++	}
++
++	return 0;
++}
++
++/*
++ * Fills in a dm_ioctl structure, ready for sending back to
++ * userland.
++ */
++static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
++{
++	kdev_t dev = dm_kdev(md);
++	struct dm_table *table;
++	struct block_device *bdev;
++
++	param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
++			  DM_ACTIVE_PRESENT_FLAG);
++
++	if (dm_suspended(md))
++		param->flags |= DM_SUSPEND_FLAG;
++
++	param->dev = kdev_t_to_nr(dev);
++
++	if (is_read_only(dev))
++		param->flags |= DM_READONLY_FLAG;
++
++	param->event_nr = dm_get_event_nr(md);
++
++	table = dm_get_table(md);
++	if (table) {
++		param->flags |= DM_ACTIVE_PRESENT_FLAG;
++		param->target_count = dm_table_get_num_targets(table);
++		dm_table_put(table);
++	} else
++		param->target_count = 0;
++
++	bdev = bdget(param->dev);
++	if (!bdev)
++		return -ENXIO;
++	param->open_count = bdev->bd_openers;
++	bdput(bdev);
++
++	return 0;
++}
++
++static int dev_create(struct dm_ioctl *param, size_t param_size)
++{
++	int r;
++	kdev_t dev = 0;
++	struct mapped_device *md;
++
++	r = check_name(param->name);
++	if (r)
++		return r;
++
++	if (param->flags & DM_PERSISTENT_DEV_FLAG)
++		dev = to_kdev_t(param->dev);
++
++	r = dm_create(dev, &md);
++	if (r)
++		return r;
++
++	r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
++	if (r) {
++		dm_put(md);
++		return r;
++	}
++
++	param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
++
++	r = __dev_status(md, param);
++	dm_put(md);
++
++	return r;
++}
++
++/*
++ * Always use UUID for lookups if it's present, otherwise use name.
++ */
++static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
++{
++	return *param->uuid ?
++	    __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
++}
++
++/*
++ * Look up the device named by 'param' and return it with an extra
++ * reference taken (the caller must dm_put() it), or NULL if it is
++ * not in the hash.
++ */
++static inline struct mapped_device *find_device(struct dm_ioctl *param)
++{
++	struct hash_cell *hc;
++	struct mapped_device *md = NULL;
++
++	down_read(&_hash_lock);
++	hc = __find_device_hash_cell(param);
++	if (hc) {
++		md = hc->md;
++
++		/*
++		 * Sneakily write in both the name and the uuid
++		 * while we have the cell.
++		 *
++		 * strncpy() does not terminate the destination when
++		 * the source fills it, and a renamed device's name is
++		 * not limited to the size of param->name, so
++		 * terminate both fields explicitly.
++		 */
++		strncpy(param->name, hc->name, sizeof(param->name));
++		param->name[sizeof(param->name) - 1] = '\0';
++		if (hc->uuid) {
++			strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
++			param->uuid[sizeof(param->uuid) - 1] = '\0';
++		} else
++			param->uuid[0] = '\0';
++
++		if (hc->new_map)
++			param->flags |= DM_INACTIVE_PRESENT_FLAG;
++		else
++			param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
++
++		dm_get(md);
++	}
++	up_read(&_hash_lock);
++
++	return md;
++}
++
++static int dev_remove(struct dm_ioctl *param, size_t param_size)
++{
++	struct hash_cell *hc;
++
++	down_write(&_hash_lock);
++	hc = __find_device_hash_cell(param);
++
++	if (!hc) {
++		DMWARN("device doesn't appear to be in the dev hash table.");
++		up_write(&_hash_lock);
++		return -ENXIO;
++	}
++
++	__hash_remove(hc);
++	up_write(&_hash_lock);
++	param->data_size = 0;
++	return 0;
++}
++
++/*
++ * Check a string doesn't overrun the chunk of
++ * memory we copied from userland.
++ */
++static int invalid_str(char *str, void *end)
++{
++	while ((void *) str < end)
++		if (!*str++)
++			return 0;
++
++	return -EINVAL;
++}
++
++static int dev_rename(struct dm_ioctl *param, size_t param_size)
++{
++	int r;
++	char *new_name = (char *) param + param->data_start;
++
++	if (new_name < (char *) (param + 1) ||
++	    invalid_str(new_name, (void *) param + param_size)) {
++		DMWARN("Invalid new logical volume name supplied.");
++		return -EINVAL;
++	}
++
++	r = check_name(new_name);
++	if (r)
++		return r;
++
++	param->data_size = 0;
++	return dm_hash_rename(param->name, new_name);
++}
++
++static int do_suspend(struct dm_ioctl *param)
++{
++	int r = 0;
++	struct mapped_device *md;
++
++	md = find_device(param);
++	if (!md)
++		return -ENXIO;
++
++	if (!dm_suspended(md))
++		r = dm_suspend(md);
++
++	if (!r)
++		r = __dev_status(md, param);
++
++	dm_put(md);
++	return r;
++}
++
++static int do_resume(struct dm_ioctl *param)
++{
++	int r = 0;
++	struct hash_cell *hc;
++	struct mapped_device *md;
++	struct dm_table *new_map;
++
++	down_write(&_hash_lock);
++
++	hc = __find_device_hash_cell(param);
++	if (!hc) {
++		DMWARN("device doesn't appear to be in the dev hash table.");
++		up_write(&_hash_lock);
++		return -ENXIO;
++	}
++
++	md = hc->md;
++	dm_get(md);
++
++	new_map = hc->new_map;
++	hc->new_map = NULL;
++	param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
++
++	up_write(&_hash_lock);
++
++	/* Do we need to load a new map ? */
++	if (new_map) {
++		/* Suspend if it isn't already suspended */
++		if (!dm_suspended(md))
++			dm_suspend(md);
++
++		r = dm_swap_table(md, new_map);
++		if (r) {
++			dm_put(md);
++			dm_table_put(new_map);
++			return r;
++		}
++
++		if (dm_table_get_mode(new_map) & FMODE_WRITE)
++			set_device_ro(dm_kdev(md), 0);
++		else
++			set_device_ro(dm_kdev(md), 1);
++
++		dm_table_put(new_map);
++	}
++
++	if (dm_suspended(md))
++		r = dm_resume(md);
++
++	if (!r)
++		r = __dev_status(md, param);
++
++	dm_put(md);
++	return r;
++}
++
++/*
++ * Set or unset the suspension state of a device.
++ * If the device already is in the requested state we just return its status.
++ */
++static int dev_suspend(struct dm_ioctl *param, size_t param_size)
++{
++	if (param->flags & DM_SUSPEND_FLAG)
++		return do_suspend(param);
++
++	return do_resume(param);
++}
++
++/*
++ * Copies device info back to user space, used by
++ * the create and info ioctls.
++ */
++static int dev_status(struct dm_ioctl *param, size_t param_size)
++{
++	int r;
++	struct mapped_device *md;
++
++	md = find_device(param);
++	if (!md)
++		return -ENXIO;
++
++	r = __dev_status(md, param);
++	dm_put(md);
++	return r;
++}
++
++/*
++ * Build up the status struct for each target
++ *
++ * For every target a struct dm_target_spec followed by the
++ * target's status/table string is written to the result buffer.
++ * Each spec's 'next' field holds the 8-byte aligned offset, from
++ * the start of the buffer, at which the following spec begins.
++ * DM_BUFFER_FULL_FLAG is set if the buffer runs out.
++ */
++static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
++			    size_t param_size)
++{
++	unsigned int i, num_targets;
++	struct dm_target_spec *spec;
++	char *outbuf, *outptr;
++	status_type_t type;
++	size_t remaining, len, offset, used = 0;
++
++	outptr = outbuf = get_result_buffer(param, param_size, &len);
++
++	if (param->flags & DM_STATUS_TABLE_FLAG)
++		type = STATUSTYPE_TABLE;
++	else
++		type = STATUSTYPE_INFO;
++
++	/* Get all the target info */
++	num_targets = dm_table_get_num_targets(table);
++	for (i = 0; i < num_targets; i++) {
++		struct dm_target *ti = dm_table_get_target(table, i);
++
++		/*
++		 * Rounding outptr up at the end of the previous
++		 * iteration may have stepped past the end of the
++		 * buffer, so test the offset before subtracting.
++		 * We need room for the spec plus at least one byte
++		 * for the (possibly empty) string that follows it.
++		 */
++		offset = outptr - outbuf;
++		if (offset > len ||
++		    len - offset <= sizeof(struct dm_target_spec)) {
++			param->flags |= DM_BUFFER_FULL_FLAG;
++			break;
++		}
++
++		spec = (struct dm_target_spec *) outptr;
++
++		spec->status = 0;
++		spec->sector_start = ti->begin;
++		spec->length = ti->len;
++		strncpy(spec->target_type, ti->type->name,
++			sizeof(spec->target_type));
++
++		outptr += sizeof(struct dm_target_spec);
++		remaining = len - (outptr - outbuf);
++
++		/* Get the status/table string from the target driver */
++		if (ti->type->status) {
++			if (ti->type->status(ti, type, outptr, remaining)) {
++				param->flags |= DM_BUFFER_FULL_FLAG;
++				break;
++			}
++		} else
++			outptr[0] = '\0';
++
++		outptr += strlen(outptr) + 1;
++		used = param->data_start + (outptr - outbuf);
++
++		/* align_ptr() returns the rounded pointer; keep it */
++		outptr = align_ptr(outptr);
++		spec->next = outptr - outbuf;
++	}
++
++	if (used)
++		param->data_size = used;
++
++	param->target_count = num_targets;
++}
++
++/*
++ * Wait for a device to report an event
++ */
++static int dev_wait(struct dm_ioctl *param, size_t param_size)
++{
++	int r;
++	struct mapped_device *md;
++	struct dm_table *table;
++	DECLARE_WAITQUEUE(wq, current);
++
++	md = find_device(param);
++	if (!md)
++		return -ENXIO;
++
++	/*
++	 * Wait for a notification event
++	 */
++	set_current_state(TASK_INTERRUPTIBLE);
++	if (!dm_add_wait_queue(md, &wq, param->event_nr)) {
++		schedule();
++		dm_remove_wait_queue(md, &wq);
++	}
++	set_current_state(TASK_RUNNING);
++
++	/*
++	 * The userland program is going to want to know what
++	 * changed to trigger the event, so we may as well tell
++	 * him and save an ioctl.
++	 */
++	r = __dev_status(md, param);
++	if (r)
++		goto out;
++
++	table = dm_get_table(md);
++	if (table) {
++		retrieve_status(table, param, param_size);
++		dm_table_put(table);
++	}
++
++      out:
++	dm_put(md);
++	return r;
++}
++
++static inline int get_mode(struct dm_ioctl *param)
++{
++	int mode = FMODE_READ | FMODE_WRITE;
++
++	if (param->flags & DM_READONLY_FLAG)
++		mode = FMODE_READ;
++
++	return mode;
++}
++
++static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
++		       struct dm_target_spec **spec, char **target_params)
++{
++	*spec = (struct dm_target_spec *) ((unsigned char *) last + next);
++	*target_params = (char *) (*spec + 1);
++
++	if (*spec < (last + 1))
++		return -EINVAL;
++
++	return invalid_str(*target_params, end);
++}
++
++static int populate_table(struct dm_table *table, struct dm_ioctl *param,
++			  size_t param_size)
++{
++	int r;
++	unsigned int i = 0;
++	struct dm_target_spec *spec = (struct dm_target_spec *) param;
++	uint32_t next = param->data_start;
++	void *end = (void *) param + param_size;
++	char *target_params;
++
++	if (!param->target_count) {
++		DMWARN("populate_table: no targets specified");
++		return -EINVAL;
++	}
++
++	for (i = 0; i < param->target_count; i++) {
++
++		r = next_target(spec, next, end, &spec, &target_params);
++		if (r) {
++			DMWARN("unable to find target");
++			return r;
++		}
++
++		r = dm_table_add_target(table, spec->target_type,
++					(sector_t) spec->sector_start,
++					(sector_t) spec->length,
++					target_params);
++		if (r) {
++			DMWARN("error adding target to table");
++			return r;
++		}
++
++		next = spec->next;
++	}
++
++	return dm_table_complete(table);
++}
++
++/*
++ * Build a table from the targets supplied in 'param' and park it
++ * in the device's hash cell as the inactive map, replacing any
++ * map that was already waiting there.
++ */
++static int table_load(struct dm_ioctl *param, size_t param_size)
++{
++	int r;
++	struct hash_cell *hc;
++	struct dm_table *t;
++
++	r = dm_table_create(&t, get_mode(param), param->target_count);
++	if (r)
++		return r;
++
++	r = populate_table(t, param, param_size);
++	if (r) {
++		dm_table_put(t);
++		return r;
++	}
++
++	down_write(&_hash_lock);
++	hc = __find_device_hash_cell(param);
++	if (!hc) {
++		DMWARN("device doesn't appear to be in the dev hash table.");
++		up_write(&_hash_lock);
++		/* nobody took ownership of the new table: drop it */
++		dm_table_put(t);
++		return -ENXIO;
++	}
++
++	if (hc->new_map)
++		dm_table_put(hc->new_map);
++	hc->new_map = t;
++	param->flags |= DM_INACTIVE_PRESENT_FLAG;
++
++	r = __dev_status(hc->md, param);
++	up_write(&_hash_lock);
++	return r;
++}
++
++static int table_clear(struct dm_ioctl *param, size_t param_size)
++{
++	int r;
++	struct hash_cell *hc;
++
++	down_write(&_hash_lock);
++
++	hc = __find_device_hash_cell(param);
++	if (!hc) {
++		DMWARN("device doesn't appear to be in the dev hash table.");
++		up_write(&_hash_lock);
++		return -ENXIO;
++	}
++
++	if (hc->new_map) {
++		dm_table_put(hc->new_map);
++		hc->new_map = NULL;
++	}
++
++	param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
++
++	r = __dev_status(hc->md, param);
++	up_write(&_hash_lock);
++	return r;
++}
++
++/*
++ * Retrieves a list of devices used by a particular dm device.
++ */
++static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
++			  size_t param_size)
++{
++	unsigned int count = 0;
++	struct list_head *tmp;
++	size_t len, needed;
++	struct dm_target_deps *deps;
++
++	deps = get_result_buffer(param, param_size, &len);
++
++	/*
++	 * Count the devices.
++	 */
++	list_for_each(tmp, dm_table_get_devices(table))
++		count++;
++
++	/*
++	 * Check we have enough space.
++	 */
++	needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
++	if (len < needed) {
++		param->flags |= DM_BUFFER_FULL_FLAG;
++		return;
++	}
++
++	/*
++	 * Fill in the devices.
++	 */
++	deps->count = count;
++	count = 0;
++	list_for_each(tmp, dm_table_get_devices(table)) {
++		struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
++		deps->dev[count++] = dd->bdev->bd_dev;
++	}
++
++	param->data_size = param->data_start + needed;
++}
++
++static int table_deps(struct dm_ioctl *param, size_t param_size)
++{
++	int r;
++	struct mapped_device *md;
++	struct dm_table *table;
++
++	md = find_device(param);
++	if (!md)
++		return -ENXIO;
++
++	r = __dev_status(md, param);
++	if (r)
++		goto out;
++
++	table = dm_get_table(md);
++	if (table) {
++		retrieve_deps(table, param, param_size);
++		dm_table_put(table);
++	}
++
++      out:
++	dm_put(md);
++	return r;
++}
++
++/*
++ * Return the status of a device as a text string for each
++ * target.
++ */
++static int table_status(struct dm_ioctl *param, size_t param_size)
++{
++	int r;
++	struct mapped_device *md;
++	struct dm_table *table;
++
++	md = find_device(param);
++	if (!md)
++		return -ENXIO;
++
++	r = __dev_status(md, param);
++	if (r)
++		goto out;
++
++	table = dm_get_table(md);
++	if (table) {
++		retrieve_status(table, param, param_size);
++		dm_table_put(table);
++	}
++
++      out:
++	dm_put(md);
++	return r;
++}
++
++/*-----------------------------------------------------------------
++ * Implementation of open/close/ioctl on the special char
++ * device.
++ *---------------------------------------------------------------*/ ++static ioctl_fn lookup_ioctl(unsigned int cmd) ++{ ++ static struct { ++ int cmd; ++ ioctl_fn fn; ++ } _ioctls[] = { ++ {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ ++ {DM_REMOVE_ALL_CMD, remove_all}, ++ {DM_LIST_DEVICES_CMD, list_devices}, ++ ++ {DM_DEV_CREATE_CMD, dev_create}, ++ {DM_DEV_REMOVE_CMD, dev_remove}, ++ {DM_DEV_RENAME_CMD, dev_rename}, ++ {DM_DEV_SUSPEND_CMD, dev_suspend}, ++ {DM_DEV_STATUS_CMD, dev_status}, ++ {DM_DEV_WAIT_CMD, dev_wait}, ++ ++ {DM_TABLE_LOAD_CMD, table_load}, ++ {DM_TABLE_CLEAR_CMD, table_clear}, ++ {DM_TABLE_DEPS_CMD, table_deps}, ++ {DM_TABLE_STATUS_CMD, table_status} ++ }; ++ ++ return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; ++} ++ ++/* ++ * As well as checking the version compatibility this always ++ * copies the kernel interface version out. ++ */ ++static int check_version(unsigned int cmd, struct dm_ioctl *user) ++{ ++ uint32_t version[3]; ++ int r = 0; ++ ++ if (copy_from_user(version, user->version, sizeof(version))) ++ return -EFAULT; ++ ++ if ((DM_VERSION_MAJOR != version[0]) || ++ (DM_VERSION_MINOR < version[1])) { ++ DMWARN("ioctl interface mismatch: " ++ "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", ++ DM_VERSION_MAJOR, DM_VERSION_MINOR, ++ DM_VERSION_PATCHLEVEL, ++ version[0], version[1], version[2], cmd); ++ r = -EINVAL; ++ } ++ ++ /* ++ * Fill in the kernel version. 
++	 */
++	version[0] = DM_VERSION_MAJOR;
++	version[1] = DM_VERSION_MINOR;
++	version[2] = DM_VERSION_PATCHLEVEL;
++	if (copy_to_user(user->version, version, sizeof(version)))
++		return -EFAULT;
++
++	return r;
++}
++
++/*
++ * Release a parameter block obtained from copy_params().
++ */
++static void free_params(struct dm_ioctl *param)
++{
++	vfree(param);
++}
++
++/*
++ * Copy the ioctl parameter block from userland into a vmalloc()ed
++ * kernel buffer of tmp.data_size bytes and hand it back via *param.
++ *
++ * Returns 0 on success, -EFAULT if userland could not be read (or
++ * changed under us), -EINVAL if data_size is smaller than the fixed
++ * header, -ENOMEM if the buffer could not be allocated.
++ */
++static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
++{
++	struct dm_ioctl tmp, *dmi;
++
++	if (copy_from_user(&tmp, user, sizeof(tmp)))
++		return -EFAULT;
++
++	if (tmp.data_size < sizeof(tmp))
++		return -EINVAL;
++
++	dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
++	if (!dmi)
++		return -ENOMEM;
++
++	if (copy_from_user(dmi, user, tmp.data_size)) {
++		vfree(dmi);
++		return -EFAULT;
++	}
++
++	/*
++	 * The header has now been fetched twice.  The buffer was sized
++	 * from the first fetch, but ctl_ioctl() takes the buffer length
++	 * from the copy made by the second one, so refuse the request
++	 * if data_size was changed in between.
++	 */
++	if (dmi->data_size != tmp.data_size) {
++		vfree(dmi);
++		return -EFAULT;
++	}
++
++	*param = dmi;
++	return 0;
++}
++
++/*
++ * Sanity check the copied-in parameters for command 'cmd' and force
++ * the name and uuid strings to be NUL terminated.
++ */
++static int validate_params(uint cmd, struct dm_ioctl *param)
++{
++	/* Always clear this flag */
++	param->flags &= ~DM_BUFFER_FULL_FLAG;
++
++	/* Ignores parameters */
++	if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD)
++		return 0;
++
++	/* Unless creating, either name or uuid but not both */
++	if (cmd != DM_DEV_CREATE_CMD) {
++		if ((!*param->uuid && !*param->name) ||
++		    (*param->uuid && *param->name)) {
++			DMWARN("one of name or uuid must be supplied, cmd(%u)",
++			       cmd);
++			return -EINVAL;
++		}
++	}
++
++	/* Ensure strings are terminated */
++	param->name[DM_NAME_LEN - 1] = '\0';
++	param->uuid[DM_UUID_LEN - 1] = '\0';
++
++	return 0;
++}
++
++/*
++ * ioctl entry point for the control device: checks the caller and
++ * the interface version, copies the parameters in, dispatches to the
++ * handler returned by lookup_ioctl() and copies the result back.
++ */
++static int ctl_ioctl(struct inode *inode, struct file *file,
++		     uint command, ulong u)
++{
++	int r = 0;
++	unsigned int cmd;
++	struct dm_ioctl *param;
++	struct dm_ioctl *user = (struct dm_ioctl *) u;
++	ioctl_fn fn = NULL;
++	size_t param_size;
++
++	/* only root can play with this */
++	if (!capable(CAP_SYS_ADMIN))
++		return -EACCES;
++
++	if (_IOC_TYPE(command) != DM_IOCTL)
++		return -ENOTTY;
++
++	cmd = _IOC_NR(command);
++
++	/*
++	 * Check the interface version passed in.  This also
++	 * writes out the kernel's interface version.
++	 */
++	r = check_version(cmd, user);
++	if (r)
++		return r;
++
++	/*
++	 * Nothing more to do for the version command.
++	 */
++	if (cmd == DM_VERSION_CMD)
++		return 0;
++
++	fn = lookup_ioctl(cmd);
++	if (!fn) {
++		DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
++		return -ENOTTY;
++	}
++
++	/*
++	 * FIXME: I don't like this, we're trying to avoid low
++	 * memory issues when a device is suspended.
++	 */
++	current->flags |= PF_MEMALLOC;
++
++	/*
++	 * Copy the parameters into kernel space.
++	 */
++	r = copy_params(user, &param);
++	if (r) {
++		current->flags &= ~PF_MEMALLOC;
++		return r;
++	}
++
++	r = validate_params(cmd, param);
++	if (r)
++		goto out;
++
++	/*
++	 * param_size is the size of the whole buffer; data_size is
++	 * reset to the header size before the handler runs.
++	 */
++	param_size = param->data_size;
++	param->data_size = sizeof(*param);
++	r = fn(param, param_size);
++
++	/*
++	 * Copy the results back to userland.
++	 */
++	if (!r && copy_to_user(user, param, param->data_size))
++		r = -EFAULT;
++
++ out:
++	free_params(param);
++	current->flags &= ~PF_MEMALLOC;
++	return r;
++}
++
++static struct file_operations _ctl_fops = {
++	.ioctl	 = ctl_ioctl,
++	.owner	 = THIS_MODULE,
++};
++
++static devfs_handle_t _ctl_handle;
++
++static struct miscdevice _dm_misc = {
++	.minor = MISC_DYNAMIC_MINOR,
++	.name  = DM_NAME,
++	.fops  = &_ctl_fops
++};
++
++/*
++ * Create misc character device and link to DM_DIR/control.
++ */ ++int __init dm_interface_init(void) ++{ ++ int r; ++ char rname[64]; ++ ++ r = dm_hash_init(); ++ if (r) ++ return r; ++ ++ r = misc_register(&_dm_misc); ++ if (r) { ++ DMERR("misc_register failed for control device"); ++ dm_hash_exit(); ++ return r; ++ } ++ ++ r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3, ++ sizeof rname - 3); ++ if (r == -ENOSYS) ++ goto done; /* devfs not present */ ++ ++ if (r < 0) { ++ DMERR("devfs_generate_path failed for control device"); ++ goto failed; ++ } ++ ++ strncpy(rname + r, "../", 3); ++ r = devfs_mk_symlink(NULL, DM_DIR "/control", ++ DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL); ++ if (r) { ++ DMERR("devfs_mk_symlink failed for control device"); ++ goto failed; ++ } ++ devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle); ++ ++ done: ++ DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, ++ DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, ++ DM_DRIVER_EMAIL); ++ return 0; ++ ++ failed: ++ misc_deregister(&_dm_misc); ++ dm_hash_exit(); ++ return r; ++} ++ ++void dm_interface_exit(void) ++{ ++ if (misc_deregister(&_dm_misc) < 0) ++ DMERR("misc_deregister failed for control device"); ++ ++ dm_hash_exit(); ++} +--- linux-2.4.22/drivers/md/dm-linear.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-linear.c Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,123 @@ ++/* ++ * Copyright (C) 2001 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. ++ */ ++ ++#include "dm.h" ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Linear: maps a linear range of a device. 
++ */ ++struct linear_c { ++ struct dm_dev *dev; ++ sector_t start; ++}; ++ ++/* ++ * Construct a linear mapping: ++ */ ++static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) ++{ ++ struct linear_c *lc; ++ ++ if (argc != 2) { ++ ti->error = "dm-linear: Invalid argument count"; ++ return -EINVAL; ++ } ++ ++ lc = kmalloc(sizeof(*lc), GFP_KERNEL); ++ if (lc == NULL) { ++ ti->error = "dm-linear: Cannot allocate linear context"; ++ return -ENOMEM; ++ } ++ ++ if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { ++ ti->error = "dm-linear: Invalid device sector"; ++ goto bad; ++ } ++ ++ if (dm_get_device(ti, argv[0], lc->start, ti->len, ++ dm_table_get_mode(ti->table), &lc->dev)) { ++ ti->error = "dm-linear: Device lookup failed"; ++ goto bad; ++ } ++ ++ ti->private = lc; ++ return 0; ++ ++ bad: ++ kfree(lc); ++ return -EINVAL; ++} ++ ++static void linear_dtr(struct dm_target *ti) ++{ ++ struct linear_c *lc = (struct linear_c *) ti->private; ++ ++ dm_put_device(ti, lc->dev); ++ kfree(lc); ++} ++ ++static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw, ++ union map_info *map_context) ++{ ++ struct linear_c *lc = (struct linear_c *) ti->private; ++ ++ bh->b_rdev = lc->dev->dev; ++ bh->b_rsector = lc->start + (bh->b_rsector - ti->begin); ++ ++ return 1; ++} ++ ++static int linear_status(struct dm_target *ti, status_type_t type, ++ char *result, unsigned int maxlen) ++{ ++ struct linear_c *lc = (struct linear_c *) ti->private; ++ kdev_t kdev; ++ ++ switch (type) { ++ case STATUSTYPE_INFO: ++ result[0] = '\0'; ++ break; ++ ++ case STATUSTYPE_TABLE: ++ kdev = to_kdev_t(lc->dev->bdev->bd_dev); ++ snprintf(result, maxlen, "%s " SECTOR_FORMAT, ++ dm_kdevname(kdev), lc->start); ++ break; ++ } ++ return 0; ++} ++ ++static struct target_type linear_target = { ++ .name = "linear", ++ .module = THIS_MODULE, ++ .ctr = linear_ctr, ++ .dtr = linear_dtr, ++ .map = linear_map, ++ .status = linear_status, ++}; ++ ++int __init dm_linear_init(void) 
++{ ++ int r = dm_register_target(&linear_target); ++ ++ if (r < 0) ++ DMERR("linear: register failed %d", r); ++ ++ return r; ++} ++ ++void dm_linear_exit(void) ++{ ++ int r = dm_unregister_target(&linear_target); ++ ++ if (r < 0) ++ DMERR("linear: unregister failed %d", r); ++} +--- linux-2.4.22/drivers/md/dm-log.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-log.c Tue Nov 18 14:15:58 2003 +@@ -0,0 +1,310 @@ ++/* ++ * Copyright (C) 2003 Sistina Software ++ * ++ * This file is released under the LGPL. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "dm-log.h" ++#include "dm-io.h" ++ ++static LIST_HEAD(_log_types); ++static spinlock_t _lock = SPIN_LOCK_UNLOCKED; ++ ++int dm_register_dirty_log_type(struct dirty_log_type *type) ++{ ++ spin_lock(&_lock); ++ type->use_count = 0; ++ if (type->module) ++ __MOD_INC_USE_COUNT(type->module); ++ ++ list_add(&type->list, &_log_types); ++ spin_unlock(&_lock); ++ ++ return 0; ++} ++ ++int dm_unregister_dirty_log_type(struct dirty_log_type *type) ++{ ++ spin_lock(&_lock); ++ ++ if (type->use_count) ++ DMWARN("Attempt to unregister a log type that is still in use"); ++ else { ++ list_del(&type->list); ++ if (type->module) ++ __MOD_DEC_USE_COUNT(type->module); ++ } ++ ++ spin_unlock(&_lock); ++ ++ return 0; ++} ++ ++static struct dirty_log_type *get_type(const char *type_name) ++{ ++ struct dirty_log_type *type; ++ struct list_head *tmp; ++ ++ spin_lock(&_lock); ++ list_for_each (tmp, &_log_types) { ++ type = list_entry(tmp, struct dirty_log_type, list); ++ if (!strcmp(type_name, type->name)) { ++ type->use_count++; ++ spin_unlock(&_lock); ++ return type; ++ } ++ } ++ ++ spin_unlock(&_lock); ++ return NULL; ++} ++ ++static void put_type(struct dirty_log_type *type) ++{ ++ spin_lock(&_lock); ++ type->use_count--; ++ spin_unlock(&_lock); ++} ++ ++struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size, ++ unsigned int argc, char **argv) ++{ ++ struct dirty_log_type *type; ++ struct 
dirty_log *log;
++
++	log = kmalloc(sizeof(*log), GFP_KERNEL);
++	if (!log)
++		return NULL;
++
++	type = get_type(type_name);
++	if (!type) {
++		kfree(log);
++		return NULL;
++	}
++
++	log->type = type;
++	if (type->ctr(log, dev_size, argc, argv)) {
++		kfree(log);
++		put_type(type);
++		return NULL;
++	}
++
++	return log;
++}
++
++/*
++ * Run the type's destructor, drop the reference taken on the type
++ * and free the log itself.
++ */
++void dm_destroy_dirty_log(struct dirty_log *log)
++{
++	log->type->dtr(log);
++	put_type(log->type);
++	kfree(log);
++}
++
++
++/*-----------------------------------------------------------------
++ * In core log, ie. trivial, non-persistent
++ *
++ * For now we'll keep this simple and just have 2 bitsets, one
++ * for clean/dirty, the other for sync/nosync.  The sync bitset
++ * will be freed when everything is in sync.
++ *
++ * FIXME: problems with a 64bit sector_t
++ *---------------------------------------------------------------*/
++struct core_log {
++	sector_t region_size;
++	unsigned int region_count;
++	unsigned long *clean_bits;
++	unsigned long *sync_bits;
++	unsigned long *recovering_bits;	/* FIXME: this seems excessive */
++
++	int sync_search;
++};
++
++#define BYTE_SHIFT 3
++
++/*
++ * Constructor for the "core" log.  Takes exactly one argument, the
++ * region size, and allocates three bitsets of one bit per region:
++ * clean (initially all set), sync and recovering (initially clear).
++ *
++ * Returns 0 on success, -EINVAL for bad arguments, -ENOMEM if any
++ * allocation fails (everything allocated so far is released).
++ */
++static int core_ctr(struct dirty_log *log, sector_t dev_size,
++		    unsigned int argc, char **argv)
++{
++	struct core_log *clog;
++	sector_t region_size;
++	unsigned int region_count;
++	size_t bitset_size;
++
++	if (argc != 1) {
++		DMWARN("wrong number of arguments to core_log");
++		return -EINVAL;
++	}
++
++	/*
++	 * A region size of zero is rejected as well: it is used as the
++	 * divisor for the region count below (presumably dm_div_up()
++	 * divides by its second argument).
++	 */
++	if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1 ||
++	    !region_size) {
++		DMWARN("invalid region size string");
++		return -EINVAL;
++	}
++
++	region_count = dm_div_up(dev_size, region_size);
++
++	clog = kmalloc(sizeof(*clog), GFP_KERNEL);
++	if (!clog) {
++		DMWARN("couldn't allocate core log");
++		return -ENOMEM;
++	}
++
++	clog->region_size = region_size;
++	clog->region_count = region_count;
++
++	/*
++	 * Work out how many words we need to hold the bitset.
++	 */
++	bitset_size = dm_round_up(region_count,
++				  sizeof(*clog->clean_bits) << BYTE_SHIFT);
++	bitset_size >>= BYTE_SHIFT;
++
++	clog->clean_bits = vmalloc(bitset_size);
++	if (!clog->clean_bits) {
++		DMWARN("couldn't allocate clean bitset");
++		kfree(clog);
++		return -ENOMEM;
++	}
++	memset(clog->clean_bits, -1, bitset_size);
++
++	clog->sync_bits = vmalloc(bitset_size);
++	if (!clog->sync_bits) {
++		DMWARN("couldn't allocate sync bitset");
++		vfree(clog->clean_bits);
++		kfree(clog);
++		return -ENOMEM;
++	}
++	memset(clog->sync_bits, 0, bitset_size);
++
++	clog->recovering_bits = vmalloc(bitset_size);
++	if (!clog->recovering_bits) {
++		DMWARN("couldn't allocate recovering bitset");
++		vfree(clog->sync_bits);
++		vfree(clog->clean_bits);
++		kfree(clog);
++		return -ENOMEM;
++	}
++	memset(clog->recovering_bits, 0, bitset_size);
++	clog->sync_search = 0;
++	log->context = clog;
++	return 0;
++}
++
++/*
++ * Destructor for the "core" log: frees the three bitsets and the
++ * context allocated by core_ctr().
++ */
++static void core_dtr(struct dirty_log *log)
++{
++	struct core_log *clog = (struct core_log *) log->context;
++	vfree(clog->clean_bits);
++	vfree(clog->sync_bits);
++	vfree(clog->recovering_bits);
++	kfree(clog);
++}
++
++static sector_t core_get_region_size(struct dirty_log *log)
++{
++	struct core_log *clog = (struct core_log *) log->context;
++	return clog->region_size;
++}
++
++static int core_is_clean(struct dirty_log *log, region_t region)
++{
++	struct core_log *clog = (struct core_log *) log->context;
++	return test_bit(region, clog->clean_bits);
++}
++
++static int core_in_sync(struct dirty_log *log, region_t region, int block)
++{
++	struct core_log *clog = (struct core_log *) log->context;
++
++	return test_bit(region, clog->sync_bits) ?
1 : 0; ++} ++ ++static int core_flush(struct dirty_log *log) ++{ ++ /* no op */ ++ return 0; ++} ++ ++static void core_mark_region(struct dirty_log *log, region_t region) ++{ ++ struct core_log *clog = (struct core_log *) log->context; ++ clear_bit(region, clog->clean_bits); ++} ++ ++static void core_clear_region(struct dirty_log *log, region_t region) ++{ ++ struct core_log *clog = (struct core_log *) log->context; ++ set_bit(region, clog->clean_bits); ++} ++ ++static int core_get_resync_work(struct dirty_log *log, region_t *region) ++{ ++ struct core_log *clog = (struct core_log *) log->context; ++ ++ if (clog->sync_search >= clog->region_count) ++ return 0; ++ ++ do { ++ *region = find_next_zero_bit(clog->sync_bits, ++ clog->region_count, ++ clog->sync_search); ++ clog->sync_search = *region + 1; ++ ++ if (*region == clog->region_count) ++ return 0; ++ ++ } while (test_bit(*region, clog->recovering_bits)); ++ ++ set_bit(*region, clog->recovering_bits); ++ return 1; ++} ++ ++static void core_complete_resync_work(struct dirty_log *log, region_t region, ++ int success) ++{ ++ struct core_log *clog = (struct core_log *) log->context; ++ ++ clear_bit(region, clog->recovering_bits); ++ if (success) ++ set_bit(region, clog->sync_bits); ++} ++ ++static struct dirty_log_type _core_type = { ++ .name = "core", ++ ++ .ctr = core_ctr, ++ .dtr = core_dtr, ++ .get_region_size = core_get_region_size, ++ .is_clean = core_is_clean, ++ .in_sync = core_in_sync, ++ .flush = core_flush, ++ .mark_region = core_mark_region, ++ .clear_region = core_clear_region, ++ .get_resync_work = core_get_resync_work, ++ .complete_resync_work = core_complete_resync_work ++}; ++ ++__init int dm_dirty_log_init(void) ++{ ++ int r; ++ ++ r = dm_register_dirty_log_type(&_core_type); ++ if (r) ++ DMWARN("couldn't register core log"); ++ ++ return r; ++} ++ ++void dm_dirty_log_exit(void) ++{ ++ dm_unregister_dirty_log_type(&_core_type); ++} ++ ++EXPORT_SYMBOL(dm_register_dirty_log_type); 
++EXPORT_SYMBOL(dm_unregister_dirty_log_type); ++EXPORT_SYMBOL(dm_dirty_log_init); ++EXPORT_SYMBOL(dm_dirty_log_exit); ++EXPORT_SYMBOL(dm_create_dirty_log); ++EXPORT_SYMBOL(dm_destroy_dirty_log); +--- linux-2.4.22/drivers/md/dm-log.h Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-log.h Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,112 @@ ++/* ++ * Copyright (C) 2003 Sistina Software ++ * ++ * This file is released under the LGPL. ++ */ ++ ++#ifndef DM_DIRTY_LOG ++#define DM_DIRTY_LOG ++ ++#include "dm.h" ++ ++typedef sector_t region_t; ++ ++struct dirty_log_type; ++ ++struct dirty_log { ++ struct dirty_log_type *type; ++ void *context; ++}; ++ ++struct dirty_log_type { ++ struct list_head list; ++ const char *name; ++ struct module *module; ++ unsigned int use_count; ++ ++ int (*ctr)(struct dirty_log *log, sector_t dev_size, ++ unsigned int argc, char **argv); ++ void (*dtr)(struct dirty_log *log); ++ ++ /* ++ * Retrieves the smallest size of region that the log can ++ * deal with. ++ */ ++ sector_t (*get_region_size)(struct dirty_log *log); ++ ++ /* ++ * A predicate to say whether a region is clean or not. ++ * May block. ++ */ ++ int (*is_clean)(struct dirty_log *log, region_t region); ++ ++ /* ++ * Returns: 0, 1, -EWOULDBLOCK, < 0 ++ * ++ * A predicate function to check the area given by ++ * [sector, sector + len) is in sync. ++ * ++ * If -EWOULDBLOCK is returned the state of the region is ++ * unknown, typically this will result in a read being ++ * passed to a daemon to deal with, since a daemon is ++ * allowed to block. ++ */ ++ int (*in_sync)(struct dirty_log *log, region_t region, int can_block); ++ ++ /* ++ * Flush the current log state (eg, to disk). This ++ * function may block. ++ */ ++ int (*flush)(struct dirty_log *log); ++ ++ /* ++ * Mark an area as clean or dirty. These functions may ++ * block, though for performance reasons blocking should ++ * be extremely rare (eg, allocating another chunk of ++ * memory for some reason). 
++ */ ++ void (*mark_region)(struct dirty_log *log, region_t region); ++ void (*clear_region)(struct dirty_log *log, region_t region); ++ ++ /* ++ * Returns: <0 (error), 0 (no region), 1 (region) ++ * ++ * The mirrord will need perform recovery on regions of ++ * the mirror that are in the NOSYNC state. This ++ * function asks the log to tell the caller about the ++ * next region that this machine should recover. ++ * ++ * Do not confuse this function with 'in_sync()', one ++ * tells you if an area is synchronised, the other ++ * assigns recovery work. ++ */ ++ int (*get_resync_work)(struct dirty_log *log, region_t *region); ++ ++ /* ++ * This notifies the log that the resync of an area has ++ * been completed. The log should then mark this region ++ * as CLEAN. ++ */ ++ void (*complete_resync_work)(struct dirty_log *log, ++ region_t region, int success); ++}; ++ ++int dm_register_dirty_log_type(struct dirty_log_type *type); ++int dm_unregister_dirty_log_type(struct dirty_log_type *type); ++ ++ ++/* ++ * Make sure you use these two functions, rather than calling ++ * type->constructor/destructor() directly. ++ */ ++struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size, ++ unsigned int argc, char **argv); ++void dm_destroy_dirty_log(struct dirty_log *log); ++ ++/* ++ * init/exit functions. ++ */ ++int dm_dirty_log_init(void); ++void dm_dirty_log_exit(void); ++ ++#endif +--- linux-2.4.22/drivers/md/dm-raid1.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-raid1.c Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,1294 @@ ++/* ++ * Copyright (C) 2003 Sistina Software Limited. ++ * ++ * This file is released under the GPL. 
++ */ ++ ++#include "dm.h" ++#include "dm-daemon.h" ++#include "dm-io.h" ++#include "dm-log.h" ++#include "kcopyd.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct dm_daemon _kmirrord; ++ ++/*----------------------------------------------------------------- ++ * buffer lists: ++ * ++ * We play with singly linked lists of buffers, but we want to be ++ * careful to add new buffers to the back of the list, to avoid ++ * buffers being starved of attention. ++ *---------------------------------------------------------------*/ ++struct buffer_list { ++ struct buffer_head *head; ++ struct buffer_head *tail; ++}; ++ ++static inline void buffer_list_init(struct buffer_list *bl) ++{ ++ bl->head = bl->tail = NULL; ++} ++ ++static inline void buffer_list_add(struct buffer_list *bl, ++ struct buffer_head *bh) ++{ ++ bh->b_reqnext = NULL; ++ ++ if (bl->tail) { ++ bl->tail->b_reqnext = bh; ++ bl->tail = bh; ++ } else ++ bl->head = bl->tail = bh; ++} ++ ++static struct buffer_head *buffer_list_pop(struct buffer_list *bl) ++{ ++ struct buffer_head *bh = bl->head; ++ ++ if (bh) { ++ bl->head = bl->head->b_reqnext; ++ if (!bl->head) ++ bl->tail = NULL; ++ ++ bh->b_reqnext = NULL; ++ } ++ ++ return bh; ++} ++ ++/*----------------------------------------------------------------- ++ * Region hash ++ * ++ * The mirror splits itself up into discrete regions. Each ++ * region can be in one of three states: clean, dirty, ++ * nosync. There is no need to put clean regions in the hash. ++ * ++ * In addition to being present in the hash table a region _may_ ++ * be present on one of three lists. ++ * ++ * clean_regions: Regions on this list have no io pending to ++ * them, they are in sync, we are no longer interested in them, ++ * they are dull. rh_update_states() will remove them from the ++ * hash table. ++ * ++ * quiesced_regions: These regions have been spun down, ready ++ * for recovery. 
rh_recovery_start() will remove regions from ++ * this list and hand them to kmirrord, which will schedule the ++ * recovery io with kcopyd. ++ * ++ * recovered_regions: Regions that kcopyd has successfully ++ * recovered. rh_update_states() will now schedule any delayed ++ * io, up the recovery_count, and remove the region from the ++ * hash. ++ * ++ * There are 2 locks: ++ * A rw spin lock 'hash_lock' protects just the hash table, ++ * this is never held in write mode from interrupt context, ++ * which I believe means that we only have to disable irqs when ++ * doing a write lock. ++ * ++ * An ordinary spin lock 'region_lock' that protects the three ++ * lists in the region_hash, with the 'state', 'list' and ++ * 'bhs_delayed' fields of the regions. This is used from irq ++ * context, so all other uses will have to suspend local irqs. ++ *---------------------------------------------------------------*/ ++struct mirror_set; ++struct region_hash { ++ struct mirror_set *ms; ++ sector_t region_size; ++ ++ /* holds persistent region state */ ++ struct dirty_log *log; ++ ++ /* hash table */ ++ rwlock_t hash_lock; ++ mempool_t *region_pool; ++ unsigned int mask; ++ unsigned int nr_buckets; ++ struct list_head *buckets; ++ ++ spinlock_t region_lock; ++ struct semaphore recovery_count; ++ struct list_head clean_regions; ++ struct list_head quiesced_regions; ++ struct list_head recovered_regions; ++}; ++ ++enum { ++ RH_CLEAN, ++ RH_DIRTY, ++ RH_NOSYNC, ++ RH_RECOVERING ++}; ++ ++struct region { ++ struct region_hash *rh; /* FIXME: can we get rid of this ? 
*/ ++ region_t key; ++ int state; ++ ++ struct list_head hash_list; ++ struct list_head list; ++ ++ atomic_t pending; ++ struct buffer_head *delayed_bhs; ++}; ++ ++/* ++ * Conversion fns ++ */ ++static inline region_t bh_to_region(struct region_hash *rh, ++ struct buffer_head *bh) ++{ ++ return bh->b_rsector / rh->region_size; ++} ++ ++static inline sector_t region_to_sector(struct region_hash *rh, region_t region) ++{ ++ return region * rh->region_size; ++} ++ ++/* FIXME move this */ ++static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw); ++ ++static void *region_alloc(int gfp_mask, void *pool_data) ++{ ++ return kmalloc(sizeof(struct region), gfp_mask); ++} ++ ++static void region_free(void *element, void *pool_data) ++{ ++ kfree(element); ++} ++ ++#define MIN_REGIONS 64 ++#define MAX_RECOVERY 1 ++static int rh_init(struct region_hash *rh, struct mirror_set *ms, ++ struct dirty_log *log, sector_t region_size, ++ region_t nr_regions) ++{ ++ unsigned int nr_buckets, max_buckets; ++ size_t i; ++ ++ /* ++ * Calculate a suitable number of buckets for our hash ++ * table. 
++	 */
++	max_buckets = nr_regions >> 6;
++	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
++		;
++	nr_buckets >>= 1;
++
++	rh->ms = ms;
++	rh->log = log;
++	rh->region_size = region_size;
++	rwlock_init(&rh->hash_lock);
++	rh->mask = nr_buckets - 1;
++	rh->nr_buckets = nr_buckets;
++
++	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
++	if (!rh->buckets) {
++		DMERR("unable to allocate region hash memory");
++		return -ENOMEM;
++	}
++
++	for (i = 0; i < nr_buckets; i++)
++		INIT_LIST_HEAD(rh->buckets + i);
++
++	spin_lock_init(&rh->region_lock);
++	sema_init(&rh->recovery_count, 0);
++	INIT_LIST_HEAD(&rh->clean_regions);
++	INIT_LIST_HEAD(&rh->quiesced_regions);
++	INIT_LIST_HEAD(&rh->recovered_regions);
++
++	rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
++					 region_free, NULL);
++	if (!rh->region_pool) {
++		vfree(rh->buckets);
++		rh->buckets = NULL;
++		return -ENOMEM;
++	}
++
++	return 0;
++}
++
++/*
++ * Tear down a region hash: frees every region still in the hash
++ * table, then the dirty log, the region pool and the bucket array.
++ * It is a BUG for any region to be quiesced or to have io pending.
++ */
++static void rh_exit(struct region_hash *rh)
++{
++	unsigned int h;
++	struct region *reg;
++	struct list_head *tmp, *tmp2;
++
++	BUG_ON(!list_empty(&rh->quiesced_regions));
++	for (h = 0; h < rh->nr_buckets; h++) {
++		list_for_each_safe (tmp, tmp2, rh->buckets + h) {
++			reg = list_entry(tmp, struct region, hash_list);
++			BUG_ON(atomic_read(&reg->pending));
++			mempool_free(reg, rh->region_pool);
++		}
++	}
++
++	if (rh->log)
++		dm_destroy_dirty_log(rh->log);
++	if (rh->region_pool)
++		mempool_destroy(rh->region_pool);
++	vfree(rh->buckets);
++}
++
++#define RH_HASH_MULT 2654435387U
++
++static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
++{
++	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
++}
++
++/*
++ * Find 'region' in its hash bucket; returns NULL if it is not there.
++ */
++static struct region *__rh_lookup(struct region_hash *rh, region_t region)
++{
++	struct region *reg;
++
++	list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
++		if (reg->key == region)
++			return reg;
++
++	return NULL;
++}
++
++static void __rh_insert(struct region_hash *rh, struct region *reg)
++{
++	unsigned int h = rh_hash(rh, reg->key);
++	list_add(&reg->hash_list, rh->buckets + h);
++}
++
++/*
++ * Allocate a region and add it to the hash.
++ *
++ * Entered and left with hash_lock held for reading; the read lock is
++ * dropped while the region is allocated and initialised, and the
++ * insert is done under the write lock.  If somebody else inserted
++ * the same region in the meantime the new one is freed and theirs is
++ * returned.
++ */
++static struct region *__rh_alloc(struct region_hash *rh, region_t region)
++{
++	struct region *reg, *nreg;
++
++	read_unlock(&rh->hash_lock);
++	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
++	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
++		RH_CLEAN : RH_NOSYNC;
++	nreg->rh = rh;
++	nreg->key = region;
++
++	INIT_LIST_HEAD(&nreg->list);
++
++	atomic_set(&nreg->pending, 0);
++	nreg->delayed_bhs = NULL;
++	write_lock_irq(&rh->hash_lock);
++
++	reg = __rh_lookup(rh, region);
++	if (reg)
++		/* we lost the race */
++		mempool_free(nreg, rh->region_pool);
++
++	else {
++		__rh_insert(rh, nreg);
++		if (nreg->state == RH_CLEAN) {
++			/*
++			 * irqs are already off under write_lock_irq();
++			 * the _irq variants would switch them back on
++			 * while the write lock is still held, so use
++			 * the plain lock as rh_update_states() does.
++			 */
++			spin_lock(&rh->region_lock);
++			list_add(&nreg->list, &rh->clean_regions);
++			spin_unlock(&rh->region_lock);
++		}
++		reg = nreg;
++	}
++	write_unlock_irq(&rh->hash_lock);
++	read_lock(&rh->hash_lock);
++
++	return reg;
++}
++
++/*
++ * Look a region up, allocating it if it is not in the hash yet.
++ * Call with hash_lock held for reading.
++ */
++static inline struct region *__rh_find(struct region_hash *rh, region_t region)
++{
++	struct region *reg;
++
++	reg = __rh_lookup(rh, region);
++	if (!reg)
++		reg = __rh_alloc(rh, region);
++
++	return reg;
++}
++
++static int rh_state(struct region_hash *rh, region_t region, int may_block)
++{
++	int r;
++	struct region *reg;
++
++	read_lock(&rh->hash_lock);
++	reg = __rh_lookup(rh, region);
++	read_unlock(&rh->hash_lock);
++
++	if (reg)
++		return reg->state;
++
++	/*
++	 * The region wasn't in the hash, so we fall back to the
++	 * dirty log.
++	 */
++	r = rh->log->type->in_sync(rh->log, region, may_block);
++
++	/*
++	 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
++	 * taken as a RH_NOSYNC
++	 */
++	return r == 1 ?
RH_CLEAN : RH_NOSYNC;
++}
++
++/*
++ * A region counts as in sync when its state is RH_CLEAN or RH_DIRTY.
++ */
++static inline int rh_in_sync(struct region_hash *rh,
++			     region_t region, int may_block)
++{
++	int state = rh_state(rh, region, may_block);
++	return state == RH_CLEAN || state == RH_DIRTY;
++}
++
++/*
++ * Walk a chain of buffers linked through b_reqnext and queue each
++ * one on the mirror set as a WRITE.
++ */
++static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh)
++{
++	struct buffer_head *nbh;
++
++	while (bh) {
++		nbh = bh->b_reqnext;
++		queue_bh(ms, bh, WRITE);
++		bh = nbh;
++	}
++}
++
++/*
++ * Retire the regions on the clean and recovered lists: they are
++ * unhashed under the locks, then recovered regions have their
++ * delayed buffers dispatched and a recovery slot released, and all
++ * of them are returned to the region pool.
++ */
++static void rh_update_states(struct region_hash *rh)
++{
++	struct list_head *tmp, *tmp2;
++	struct region *reg;
++
++	LIST_HEAD(clean);
++	LIST_HEAD(recovered);
++
++	/*
++	 * Quickly grab the lists.
++	 */
++	write_lock_irq(&rh->hash_lock);
++	spin_lock(&rh->region_lock);
++	if (!list_empty(&rh->clean_regions)) {
++		list_splice(&rh->clean_regions, &clean);
++		INIT_LIST_HEAD(&rh->clean_regions);
++
++		list_for_each_entry (reg, &clean, list) {
++			rh->log->type->clear_region(rh->log, reg->key);
++			list_del(&reg->hash_list);
++		}
++	}
++
++	if (!list_empty(&rh->recovered_regions)) {
++		list_splice(&rh->recovered_regions, &recovered);
++		INIT_LIST_HEAD(&rh->recovered_regions);
++
++		list_for_each_entry (reg, &recovered, list)
++			list_del(&reg->hash_list);
++	}
++	spin_unlock(&rh->region_lock);
++	write_unlock_irq(&rh->hash_lock);
++
++	/*
++	 * All the regions on the recovered and clean lists have
++	 * now been pulled out of the system, so no need to do
++	 * any more locking.
++	 */
++	list_for_each_safe (tmp, tmp2, &recovered) {
++		reg = list_entry(tmp, struct region, list);
++
++		rh->log->type->complete_resync_work(rh->log, reg->key, 1);
++		dispatch_buffers(rh->ms, reg->delayed_bhs);
++		up(&rh->recovery_count);
++		mempool_free(reg, rh->region_pool);
++	}
++
++	list_for_each_safe (tmp, tmp2, &clean) {
++		reg = list_entry(tmp, struct region, list);
++		mempool_free(reg, rh->region_pool);
++	}
++}
++
++/*
++ * Account for one more io to 'region'.  A clean region is marked in
++ * the dirty log, switched to RH_DIRTY and taken off the clean list.
++ */
++static void rh_inc(struct region_hash *rh, region_t region)
++{
++	struct region *reg;
++
++	read_lock(&rh->hash_lock);
++	reg = __rh_find(rh, region);
++	if (reg->state == RH_CLEAN) {
++		rh->log->type->mark_region(rh->log, reg->key);
++
++		spin_lock_irq(&rh->region_lock);
++		reg->state = RH_DIRTY;
++		list_del_init(&reg->list);	/* take off the clean list */
++		spin_unlock_irq(&rh->region_lock);
++	}
++
++	atomic_inc(&reg->pending);
++	read_unlock(&rh->hash_lock);
++}
++
++static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers)
++{
++	struct buffer_head *bh;
++
++	for (bh = buffers->head; bh; bh = bh->b_reqnext)
++		rh_inc(rh, bh_to_region(rh, bh));
++}
++
++/*
++ * Account for one io to 'region' completing.  When the last pending
++ * io goes away the region is put on the quiesced list (if it is
++ * being recovered) or marked clean and put on the clean list, and
++ * kmirrord is woken.
++ *
++ * NOTE(review): the lookup result is used unchecked, so this relies
++ * on the region still being hashed from the matching rh_inc().
++ */
++static void rh_dec(struct region_hash *rh, region_t region)
++{
++	unsigned long flags;
++	struct region *reg;
++	int wake = 0;
++
++	read_lock(&rh->hash_lock);
++	reg = __rh_lookup(rh, region);
++	read_unlock(&rh->hash_lock);
++
++	if (atomic_dec_and_test(&reg->pending)) {
++		spin_lock_irqsave(&rh->region_lock, flags);
++		if (reg->state == RH_RECOVERING) {
++			list_add_tail(&reg->list, &rh->quiesced_regions);
++		} else {
++			reg->state = RH_CLEAN;
++			list_add(&reg->list, &rh->clean_regions);
++		}
++		spin_unlock_irqrestore(&rh->region_lock, flags);
++		wake = 1;
++	}
++
++	if (wake)
++		dm_daemon_wake(&_kmirrord);
++}
++
++/*
++ * Starts quiescing a region in preparation for recovery.
++ */
++static int __rh_recovery_prepare(struct region_hash *rh)
++{
++	int r;
++	struct region *reg;
++	region_t region;
++
++	/*
++	 * Ask the dirty log what's next.
++	 */
++	r = rh->log->type->get_resync_work(rh->log, &region);
++	if (r <= 0)
++		return r;
++
++	/*
++	 * Get this region, and start it quiescing by setting the
++	 * recovering flag.
++	 */
++	read_lock(&rh->hash_lock);
++	reg = __rh_find(rh, region);
++	read_unlock(&rh->hash_lock);
++
++	spin_lock_irq(&rh->region_lock);
++	reg->state = RH_RECOVERING;
++
++	/* Already quiesced ? */
++	if (atomic_read(&reg->pending))
++		list_del_init(&reg->list);
++
++	else {
++		list_del_init(&reg->list);
++		list_add(&reg->list, &rh->quiesced_regions);
++	}
++	spin_unlock_irq(&rh->region_lock);
++
++	return 1;
++}
++
++/*
++ * Start quiescing regions for as long as a recovery slot can be
++ * taken; the slot is handed back when the log has no more work.
++ */
++static void rh_recovery_prepare(struct region_hash *rh)
++{
++	while (!down_trylock(&rh->recovery_count))
++		if (__rh_recovery_prepare(rh) <= 0) {
++			up(&rh->recovery_count);
++			break;
++		}
++}
++
++/*
++ * Returns any quiesced regions.
++ */
++static struct region *rh_recovery_start(struct region_hash *rh)
++{
++	struct region *reg = NULL;
++
++	spin_lock_irq(&rh->region_lock);
++	if (!list_empty(&rh->quiesced_regions)) {
++		reg = list_entry(rh->quiesced_regions.next,
++				 struct region, list);
++		list_del_init(&reg->list);	/* remove from the quiesced list */
++	}
++	spin_unlock_irq(&rh->region_lock);
++
++	return reg;
++}
++
++/* FIXME: success ignored for now */
++static void rh_recovery_end(struct region *reg, int success)
++{
++	struct region_hash *rh = reg->rh;
++
++	spin_lock_irq(&rh->region_lock);
++	list_add(&reg->list, &reg->rh->recovered_regions);
++	spin_unlock_irq(&rh->region_lock);
++
++	dm_daemon_wake(&_kmirrord);
++}
++
++static void rh_flush(struct region_hash *rh)
++{
++	rh->log->type->flush(rh->log);
++}
++
++/*
++ * Push 'bh' onto the front of its region's delayed buffer chain.
++ */
++static void rh_delay(struct region_hash *rh, struct buffer_head *bh)
++{
++	struct region *reg;
++
++	read_lock(&rh->hash_lock);
++	reg = __rh_find(rh, bh_to_region(rh, bh));
++	bh->b_reqnext = reg->delayed_bhs;
++	reg->delayed_bhs = bh;
++	read_unlock(&rh->hash_lock);
++}
++
++static void rh_stop_recovery(struct region_hash *rh)
++{
++	int i;
++
++	/* wait for any
recovering regions */ ++ for (i = 0; i < MAX_RECOVERY; i++) ++ down(&rh->recovery_count); ++} ++ ++static void rh_start_recovery(struct region_hash *rh) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_RECOVERY; i++) ++ up(&rh->recovery_count); ++ ++ dm_daemon_wake(&_kmirrord); ++} ++ ++/*----------------------------------------------------------------- ++ * Mirror set structures. ++ *---------------------------------------------------------------*/ ++struct mirror { ++ atomic_t error_count; ++ struct dm_dev *dev; ++ sector_t offset; ++}; ++ ++struct mirror_set { ++ struct dm_target *ti; ++ struct list_head list; ++ struct region_hash rh; ++ struct kcopyd_client *kcopyd_client; ++ ++ spinlock_t lock; /* protects the next two lists */ ++ struct buffer_list reads; ++ struct buffer_list writes; ++ ++ /* recovery */ ++ region_t nr_regions; ++ region_t sync_count; ++ ++ unsigned int nr_mirrors; ++ struct mirror mirror[0]; ++}; ++ ++/* ++ * Every mirror should look like this one. ++ */ ++#define DEFAULT_MIRROR 0 ++ ++/* ++ * This is yucky. We squirrel the mirror_set struct away inside ++ * b_reqnext for write buffers. This is safe since the bh ++ * doesn't get submitted to the lower levels of block layer. ++ */ ++static struct mirror_set *bh_get_ms(struct buffer_head *bh) ++{ ++ return (struct mirror_set *) bh->b_reqnext; ++} ++ ++static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms) ++{ ++ bh->b_reqnext = (struct buffer_head *) ms; ++} ++ ++/*----------------------------------------------------------------- ++ * Recovery. ++ * ++ * When a mirror is first activated we may find that some regions ++ * are in the no-sync state. We have to recover these by ++ * recopying from the default mirror to all the others. 
++ *---------------------------------------------------------------*/ ++static void recovery_complete(int read_err, unsigned int write_err, ++ void *context) ++{ ++ struct region *reg = (struct region *) context; ++ struct mirror_set *ms = reg->rh->ms; ++ ++ /* FIXME: better error handling */ ++ rh_recovery_end(reg, read_err || write_err); ++ if (++ms->sync_count == ms->nr_regions) ++ /* the sync is complete */ ++ dm_table_event(ms->ti->table); ++} ++ ++static int recover(struct mirror_set *ms, struct region *reg) ++{ ++ int r; ++ unsigned int i; ++ struct io_region from, to[ms->nr_mirrors - 1], *dest; ++ struct mirror *m; ++ unsigned int flags = 0; ++ ++ /* fill in the source */ ++ m = ms->mirror + DEFAULT_MIRROR; ++ from.dev = m->dev->dev; ++ from.sector = m->offset + region_to_sector(reg->rh, reg->key); ++ if (reg->key == (ms->nr_regions - 1)) { ++ /* ++ * The final region may be smaller than ++ * region_size. ++ */ ++ from.count = ms->ti->len & (reg->rh->region_size - 1); ++ if (!from.count) ++ from.count = reg->rh->region_size; ++ } else ++ from.count = reg->rh->region_size; ++ ++ /* fill in the destinations */ ++ for (i = 1; i < ms->nr_mirrors; i++) { ++ m = ms->mirror + i; ++ dest = to + (i - 1); ++ ++ dest->dev = m->dev->dev; ++ dest->sector = m->offset + region_to_sector(reg->rh, reg->key); ++ dest->count = from.count; ++ } ++ ++ /* hand to kcopyd */ ++ set_bit(KCOPYD_IGNORE_ERROR, &flags); ++ r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, ++ recovery_complete, reg); ++ ++ return r; ++} ++ ++static void do_recovery(struct mirror_set *ms) ++{ ++ int r; ++ struct region *reg; ++ ++ /* ++ * Start quiescing some regions. ++ */ ++ rh_recovery_prepare(&ms->rh); ++ ++ /* ++ * Copy any already quiesced regions. 
++ */ ++ while ((reg = rh_recovery_start(&ms->rh))) { ++ r = recover(ms, reg); ++ if (r) ++ rh_recovery_end(reg, 0); ++ } ++} ++ ++/*----------------------------------------------------------------- ++ * Reads ++ *---------------------------------------------------------------*/ ++static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) ++{ ++ /* FIXME: add read balancing */ ++ return ms->mirror + DEFAULT_MIRROR; ++} ++ ++/* ++ * remap a buffer to a particular mirror. ++ */ ++static void map_buffer(struct mirror_set *ms, ++ struct mirror *m, struct buffer_head *bh) ++{ ++ bh->b_rdev = m->dev->dev; ++ bh->b_rsector = m->offset + (bh->b_rsector - ms->ti->begin); ++} ++ ++static void do_reads(struct mirror_set *ms, struct buffer_list *reads) ++{ ++ region_t region; ++ struct buffer_head *bh; ++ struct mirror *m; ++ ++ while ((bh = buffer_list_pop(reads))) { ++ region = bh_to_region(&ms->rh, bh); ++ ++ /* ++ * We can only read balance if the region is in sync. ++ */ ++ if (rh_in_sync(&ms->rh, region, 0)) ++ m = choose_mirror(ms, bh->b_rsector); ++ else ++ m = ms->mirror + DEFAULT_MIRROR; ++ ++ map_buffer(ms, m, bh); ++ generic_make_request(READ, bh); ++ } ++} ++ ++/*----------------------------------------------------------------- ++ * Writes. 
++ * ++ * We do different things with the write io depending on the ++ * state of the region that it's in: ++ * ++ * SYNC: increment pending, use kcopyd to write to *all* mirrors ++ * RECOVERING: delay the io until recovery completes ++ * NOSYNC: increment pending, just write to the default mirror ++ *---------------------------------------------------------------*/ ++static void write_callback(unsigned int error, void *context) ++{ ++ unsigned int i; ++ int uptodate = 1; ++ struct buffer_head *bh = (struct buffer_head *) context; ++ struct mirror_set *ms; ++ ++ ms = bh_get_ms(bh); ++ bh_set_ms(bh, NULL); ++ ++ /* ++ * NOTE: We don't decrement the pending count here, ++ * instead it is done by the targets endio function. ++ * This way we handle both writes to SYNC and NOSYNC ++ * regions with the same code. ++ */ ++ ++ if (error) { ++ /* ++ * only error the io if all mirrors failed. ++ * FIXME: bogus ++ */ ++ uptodate = 0; ++ for (i = 0; i < ms->nr_mirrors; i++) ++ if (!test_bit(i, &error)) { ++ uptodate = 1; ++ break; ++ } ++ } ++ bh->b_end_io(bh, uptodate); ++} ++ ++static void do_write(struct mirror_set *ms, struct buffer_head *bh) ++{ ++ unsigned int i; ++ struct io_region io[ms->nr_mirrors]; ++ struct mirror *m; ++ ++ for (i = 0; i < ms->nr_mirrors; i++) { ++ m = ms->mirror + i; ++ ++ io[i].dev = m->dev->dev; ++ io[i].sector = m->offset + (bh->b_rsector - ms->ti->begin); ++ io[i].count = bh->b_size >> 9; ++ } ++ ++ bh_set_ms(bh, ms); ++ dm_io_async(ms->nr_mirrors, io, WRITE, bh->b_page, ++ (unsigned int) bh->b_data & ~PAGE_MASK, write_callback, bh); ++} ++ ++static void do_writes(struct mirror_set *ms, struct buffer_list *writes) ++{ ++ int state; ++ struct buffer_head *bh; ++ struct buffer_list sync, nosync, recover, *this_list = NULL; ++ ++ if (!writes->head) ++ return; ++ ++ /* ++ * Classify each write. 
++ */ ++ buffer_list_init(&sync); ++ buffer_list_init(&nosync); ++ buffer_list_init(&recover); ++ ++ while ((bh = buffer_list_pop(writes))) { ++ state = rh_state(&ms->rh, bh_to_region(&ms->rh, bh), 1); ++ switch (state) { ++ case RH_CLEAN: ++ case RH_DIRTY: ++ this_list = &sync; ++ break; ++ ++ case RH_NOSYNC: ++ this_list = &nosync; ++ break; ++ ++ case RH_RECOVERING: ++ this_list = &recover; ++ break; ++ } ++ ++ buffer_list_add(this_list, bh); ++ } ++ ++ /* ++ * Increment the pending counts for any regions that will ++ * be written to (writes to recover regions are going to ++ * be delayed). ++ */ ++ rh_inc_pending(&ms->rh, &sync); ++ rh_inc_pending(&ms->rh, &nosync); ++ rh_flush(&ms->rh); ++ ++ /* ++ * Dispatch io. ++ */ ++ while ((bh = buffer_list_pop(&sync))) ++ do_write(ms, bh); ++ ++ while ((bh = buffer_list_pop(&recover))) ++ rh_delay(&ms->rh, bh); ++ ++ while ((bh = buffer_list_pop(&nosync))) { ++ map_buffer(ms, ms->mirror + DEFAULT_MIRROR, bh); ++ generic_make_request(WRITE, bh); ++ } ++} ++ ++/*----------------------------------------------------------------- ++ * kmirrord ++ *---------------------------------------------------------------*/ ++static LIST_HEAD(_mirror_sets); ++static DECLARE_RWSEM(_mirror_sets_lock); ++ ++static void do_mirror(struct mirror_set *ms) ++{ ++ struct buffer_list reads, writes; ++ ++ spin_lock(&ms->lock); ++ memcpy(&reads, &ms->reads, sizeof(reads)); ++ buffer_list_init(&ms->reads); ++ memcpy(&writes, &ms->writes, sizeof(writes)); ++ buffer_list_init(&ms->writes); ++ spin_unlock(&ms->lock); ++ ++ rh_update_states(&ms->rh); ++ do_recovery(ms); ++ do_reads(ms, &reads); ++ do_writes(ms, &writes); ++ run_task_queue(&tq_disk); ++} ++ ++static void do_work(void) ++{ ++ struct mirror_set *ms; ++ ++ down_read(&_mirror_sets_lock); ++ list_for_each_entry (ms, &_mirror_sets, list) ++ do_mirror(ms); ++ up_read(&_mirror_sets_lock); ++} ++ ++/*----------------------------------------------------------------- ++ * Target functions ++ 
*---------------------------------------------------------------*/ ++static struct mirror_set *alloc_context(unsigned int nr_mirrors, ++ sector_t region_size, ++ struct dm_target *ti, ++ struct dirty_log *dl) ++{ ++ size_t len; ++ struct mirror_set *ms = NULL; ++ ++ if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors)) ++ return NULL; ++ ++ len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); ++ ++ ms = kmalloc(len, GFP_KERNEL); ++ if (!ms) { ++ ti->error = "dm-mirror: Cannot allocate mirror context"; ++ return NULL; ++ } ++ ++ memset(ms, 0, len); ++ spin_lock_init(&ms->lock); ++ ++ ms->ti = ti; ++ ms->nr_mirrors = nr_mirrors; ++ ms->nr_regions = dm_div_up(ti->len, region_size); ++ ++ if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { ++ ti->error = "dm-mirror: Error creating dirty region hash"; ++ kfree(ms); ++ return NULL; ++ } ++ ++ return ms; ++} ++ ++static void free_context(struct mirror_set *ms, struct dm_target *ti, ++ unsigned int m) ++{ ++ while (m--) ++ dm_put_device(ti, ms->mirror[m].dev); ++ ++ rh_exit(&ms->rh); ++ kfree(ms); ++} ++ ++static inline int _check_region_size(struct dm_target *ti, sector_t size) ++{ ++ return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || ++ size > ti->len); ++} ++ ++static int get_mirror(struct mirror_set *ms, struct dm_target *ti, ++ unsigned int mirror, char **argv) ++{ ++ sector_t offset; ++ ++ if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) { ++ ti->error = "dm-mirror: Invalid offset"; ++ return -EINVAL; ++ } ++ ++ if (dm_get_device(ti, argv[0], offset, ti->len, ++ dm_table_get_mode(ti->table), ++ &ms->mirror[mirror].dev)) { ++ ti->error = "dm-mirror: Device lookup failure"; ++ return -ENXIO; ++ } ++ ++ ms->mirror[mirror].offset = offset; ++ ++ return 0; ++} ++ ++static int add_mirror_set(struct mirror_set *ms) ++{ ++ down_write(&_mirror_sets_lock); ++ list_add_tail(&ms->list, &_mirror_sets); ++ up_write(&_mirror_sets_lock); ++ dm_daemon_wake(&_kmirrord); ++ ++ return 0; ++} ++ 
++static void del_mirror_set(struct mirror_set *ms) ++{ ++ down_write(&_mirror_sets_lock); ++ list_del(&ms->list); ++ up_write(&_mirror_sets_lock); ++} ++ ++/* ++ * Create dirty log: log_type #log_params ++ */ ++static struct dirty_log *create_dirty_log(struct dm_target *ti, ++ unsigned int argc, char **argv, ++ unsigned int *args_used) ++{ ++ unsigned int param_count; ++ struct dirty_log *dl; ++ ++ if (argc < 2) { ++ ti->error = "dm-mirror: Insufficient mirror log arguments"; ++ return NULL; ++ } ++ ++ if (sscanf(argv[1], "%u", ¶m_count) != 1 || param_count != 1) { ++ ti->error = "dm-mirror: Invalid mirror log argument count"; ++ return NULL; ++ } ++ ++ *args_used = 2 + param_count; ++ ++ if (argc < *args_used) { ++ ti->error = "dm-mirror: Insufficient mirror log arguments"; ++ return NULL; ++ } ++ ++ dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2); ++ if (!dl) { ++ ti->error = "dm-mirror: Error creating mirror dirty log"; ++ return NULL; ++ } ++ ++ if (!_check_region_size(ti, dl->type->get_region_size(dl))) { ++ ti->error = "dm-mirror: Invalid region size"; ++ dm_destroy_dirty_log(dl); ++ return NULL; ++ } ++ ++ return dl; ++} ++ ++/* ++ * Construct a mirror mapping: ++ * ++ * log_type #log_params ++ * #mirrors [mirror_path offset]{2,} ++ * ++ * For now, #log_params = 1, log_type = "core" ++ * ++ */ ++#define DM_IO_PAGES 64 ++static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ++{ ++ int r; ++ unsigned int nr_mirrors, m, args_used; ++ struct mirror_set *ms; ++ struct dirty_log *dl; ++ ++ dl = create_dirty_log(ti, argc, argv, &args_used); ++ if (!dl) ++ return -EINVAL; ++ ++ argv += args_used; ++ argc -= args_used; ++ ++ if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || ++ nr_mirrors < 2) { ++ ti->error = "dm-mirror: Invalid number of mirrors"; ++ dm_destroy_dirty_log(dl); ++ return -EINVAL; ++ } ++ ++ argv++, argc--; ++ ++ if (argc != nr_mirrors * 2) { ++ ti->error = "dm-mirror: Wrong number of mirror arguments"; ++ 
dm_destroy_dirty_log(dl); ++ return -EINVAL; ++ } ++ ++ ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); ++ if (!ms) { ++ dm_destroy_dirty_log(dl); ++ return -ENOMEM; ++ } ++ ++ /* Get the mirror parameter sets */ ++ for (m = 0; m < nr_mirrors; m++) { ++ r = get_mirror(ms, ti, m, argv); ++ if (r) { ++ free_context(ms, ti, m); ++ return r; ++ } ++ argv += 2; ++ argc -= 2; ++ } ++ ++ ti->private = ms; ++ ++ r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); ++ if (r) { ++ free_context(ms, ti, ms->nr_mirrors); ++ return r; ++ } ++ ++ add_mirror_set(ms); ++ return 0; ++} ++ ++static void mirror_dtr(struct dm_target *ti) ++{ ++ struct mirror_set *ms = (struct mirror_set *) ti->private; ++ ++ del_mirror_set(ms); ++ kcopyd_client_destroy(ms->kcopyd_client); ++ free_context(ms, ti, ms->nr_mirrors); ++} ++ ++static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw) ++{ ++ int wake = 0; ++ struct buffer_list *bl; ++ ++ bl = (rw == WRITE) ? &ms->writes : &ms->reads; ++ spin_lock(&ms->lock); ++ wake = !(bl->head); ++ buffer_list_add(bl, bh); ++ spin_unlock(&ms->lock); ++ ++ if (wake) ++ dm_daemon_wake(&_kmirrord); ++} ++ ++/* ++ * Mirror mapping function ++ */ ++static int mirror_map(struct dm_target *ti, struct buffer_head *bh, ++ int rw, union map_info *map_context) ++{ ++ int r; ++ struct mirror *m; ++ struct mirror_set *ms = ti->private; ++ ++ /* FIXME: nasty hack, 32 bit sector_t only */ ++ map_context->ll = bh->b_rsector / ms->rh.region_size; ++ ++ if (rw == WRITE) { ++ queue_bh(ms, bh, rw); ++ return 0; ++ } ++ ++ r = ms->rh.log->type->in_sync(ms->rh.log, bh_to_region(&ms->rh, bh), 0); ++ if (r < 0 && r != -EWOULDBLOCK) ++ return r; ++ ++ if (r == -EWOULDBLOCK) /* FIXME: ugly */ ++ r = 0; ++ ++ /* ++ * We don't want to fast track a recovery just for a read ++ * ahead. So we just let it silently fail. ++ * FIXME: get rid of this. 
++ */ ++ if (!r && rw == READA) ++ return -EIO; ++ ++ if (!r) { ++ /* Pass this io over to the daemon */ ++ queue_bh(ms, bh, rw); ++ return 0; ++ } ++ ++ m = choose_mirror(ms, bh->b_rsector); ++ if (!m) ++ return -EIO; ++ ++ map_buffer(ms, m, bh); ++ return 1; ++} ++ ++static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh, ++ int rw, int error, union map_info *map_context) ++{ ++ struct mirror_set *ms = (struct mirror_set *) ti->private; ++ region_t region = map_context->ll; ++ ++ /* ++ * We need to dec pending if this was a write. ++ */ ++ if (rw == WRITE) ++ rh_dec(&ms->rh, region); ++ ++ return 0; ++} ++ ++static void mirror_suspend(struct dm_target *ti) ++{ ++ struct mirror_set *ms = (struct mirror_set *) ti->private; ++ rh_stop_recovery(&ms->rh); ++} ++ ++static void mirror_resume(struct dm_target *ti) ++{ ++ struct mirror_set *ms = (struct mirror_set *) ti->private; ++ rh_start_recovery(&ms->rh); ++} ++ ++static int mirror_status(struct dm_target *ti, status_type_t type, ++ char *result, unsigned int maxlen) ++{ ++ unsigned int m, sz = 0; ++ struct mirror_set *ms = (struct mirror_set *) ti->private; ++ ++ switch (type) { ++ case STATUSTYPE_INFO: ++ sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors); ++ ++ for (m = 0; m < ms->nr_mirrors; m++) ++ sz += snprintf(result + sz, maxlen - sz, "%s ", ++ dm_kdevname(ms->mirror[m].dev->dev)); ++ ++ sz += snprintf(result + sz, maxlen - sz, "%lu/%lu", ++ ms->sync_count, ms->nr_regions); ++ break; ++ ++ case STATUSTYPE_TABLE: ++ sz += snprintf(result + sz, maxlen - sz, ++ "%s 1 " SECTOR_FORMAT " %d ", ++ ms->rh.log->type->name, ms->rh.region_size, ++ ms->nr_mirrors); ++ ++ for (m = 0; m < ms->nr_mirrors; m++) ++ sz += snprintf(result + sz, maxlen - sz, "%s %ld ", ++ dm_kdevname(ms->mirror[m].dev->dev), ++ ms->mirror[m].offset); ++ } ++ ++ return 0; ++} ++ ++static struct target_type mirror_target = { ++ .name = "mirror", ++ .module = THIS_MODULE, ++ .ctr = mirror_ctr, ++ .dtr = mirror_dtr, ++ 
.map = mirror_map, ++ .end_io = mirror_end_io, ++ .suspend = mirror_suspend, ++ .resume = mirror_resume, ++ .status = mirror_status, ++}; ++ ++static int __init dm_mirror_init(void) ++{ ++ int r; ++ ++ r = dm_dirty_log_init(); ++ if (r) ++ return r; ++ ++ r = dm_daemon_start(&_kmirrord, "kmirrord", do_work); ++ if (r) { ++ DMERR("couldn't start kmirrord"); ++ dm_dirty_log_exit(); ++ return r; ++ } ++ ++ r = dm_register_target(&mirror_target); ++ if (r < 0) { ++ DMERR("%s: Failed to register mirror target", ++ mirror_target.name); ++ dm_dirty_log_exit(); ++ dm_daemon_stop(&_kmirrord); ++ } ++ ++ return r; ++} ++ ++static void __exit dm_mirror_exit(void) ++{ ++ int r; ++ ++ r = dm_unregister_target(&mirror_target); ++ if (r < 0) ++ DMERR("%s: unregister failed %d", mirror_target.name, r); ++ ++ dm_daemon_stop(&_kmirrord); ++ dm_dirty_log_exit(); ++} ++ ++/* Module hooks */ ++module_init(dm_mirror_init); ++module_exit(dm_mirror_exit); ++ ++MODULE_DESCRIPTION(DM_NAME " mirror target"); ++MODULE_AUTHOR("Heinz Mauelshagen "); ++MODULE_LICENSE("GPL"); +--- linux-2.4.22/drivers/md/dm-snapshot.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-snapshot.c Tue Nov 18 13:57:29 2003 +@@ -0,0 +1,1235 @@ ++/* ++ * dm-snapshot.c ++ * ++ * Copyright (C) 2001-2002 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "dm-snapshot.h" ++#include "kcopyd.h" ++ ++/* ++ * FIXME: Remove this before release. ++ */ ++#if 0 ++#define DMDEBUG(x...) DMWARN( ## x) ++#else ++#define DMDEBUG(x...) 
++#endif ++ ++/* ++ * The percentage increment we will wake up users at ++ */ ++#define WAKE_UP_PERCENT 5 ++ ++/* ++ * kcopyd priority of snapshot operations ++ */ ++#define SNAPSHOT_COPY_PRIORITY 2 ++ ++/* ++ * Each snapshot reserves this many pages for io ++ * FIXME: calculate this ++ */ ++#define SNAPSHOT_PAGES 256 ++ ++struct pending_exception { ++ struct exception e; ++ ++ /* ++ * Origin buffers waiting for this to complete are held ++ * in a list (using b_reqnext). ++ */ ++ struct buffer_head *origin_bhs; ++ struct buffer_head *snapshot_bhs; ++ ++ /* ++ * Other pending_exceptions that are processing this ++ * chunk. When this list is empty, we know we can ++ * complete the origins. ++ */ ++ struct list_head siblings; ++ ++ /* Pointer back to snapshot context */ ++ struct dm_snapshot *snap; ++ ++ /* ++ * 1 indicates the exception has already been sent to ++ * kcopyd. ++ */ ++ int started; ++}; ++ ++/* ++ * Hash table mapping origin volumes to lists of snapshots and ++ * a lock to protect it ++ */ ++static kmem_cache_t *exception_cache; ++static kmem_cache_t *pending_cache; ++static mempool_t *pending_pool; ++ ++/* ++ * One of these per registered origin, held in the snapshot_origins hash ++ */ ++struct origin { ++ /* The origin device */ ++ kdev_t dev; ++ ++ struct list_head hash_list; ++ ++ /* List of snapshots for this origin */ ++ struct list_head snapshots; ++}; ++ ++/* ++ * Size of the hash table for origin volumes. 
If we make this ++ * the size of the minors list then it should be nearly perfect ++ */ ++#define ORIGIN_HASH_SIZE 256 ++#define ORIGIN_MASK 0xFF ++static struct list_head *_origins; ++static struct rw_semaphore _origins_lock; ++ ++static int init_origin_hash(void) ++{ ++ int i; ++ ++ _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!_origins) { ++ DMERR("Device mapper: Snapshot: unable to allocate memory"); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < ORIGIN_HASH_SIZE; i++) ++ INIT_LIST_HEAD(_origins + i); ++ init_rwsem(&_origins_lock); ++ ++ return 0; ++} ++ ++static void exit_origin_hash(void) ++{ ++ kfree(_origins); ++} ++ ++static inline unsigned int origin_hash(kdev_t dev) ++{ ++ return MINOR(dev) & ORIGIN_MASK; ++} ++ ++static struct origin *__lookup_origin(kdev_t origin) ++{ ++ struct list_head *slist; ++ struct list_head *ol; ++ struct origin *o; ++ ++ ol = &_origins[origin_hash(origin)]; ++ list_for_each(slist, ol) { ++ o = list_entry(slist, struct origin, hash_list); ++ ++ if (o->dev == origin) ++ return o; ++ } ++ ++ return NULL; ++} ++ ++static void __insert_origin(struct origin *o) ++{ ++ struct list_head *sl = &_origins[origin_hash(o->dev)]; ++ list_add_tail(&o->hash_list, sl); ++} ++ ++/* ++ * Make a note of the snapshot and its origin so we can look it ++ * up when the origin has a write on it. 
++ */ ++static int register_snapshot(struct dm_snapshot *snap) ++{ ++ struct origin *o; ++ kdev_t dev = snap->origin->dev; ++ ++ down_write(&_origins_lock); ++ o = __lookup_origin(dev); ++ ++ if (!o) { ++ /* New origin */ ++ o = kmalloc(sizeof(*o), GFP_KERNEL); ++ if (!o) { ++ up_write(&_origins_lock); ++ return -ENOMEM; ++ } ++ ++ /* Initialise the struct */ ++ INIT_LIST_HEAD(&o->snapshots); ++ o->dev = dev; ++ ++ __insert_origin(o); ++ } ++ ++ list_add_tail(&snap->list, &o->snapshots); ++ ++ up_write(&_origins_lock); ++ return 0; ++} ++ ++static void unregister_snapshot(struct dm_snapshot *s) ++{ ++ struct origin *o; ++ ++ down_write(&_origins_lock); ++ o = __lookup_origin(s->origin->dev); ++ ++ list_del(&s->list); ++ if (list_empty(&o->snapshots)) { ++ list_del(&o->hash_list); ++ kfree(o); ++ } ++ ++ up_write(&_origins_lock); ++} ++ ++/* ++ * Implementation of the exception hash tables. ++ */ ++static int init_exception_table(struct exception_table *et, uint32_t size) ++{ ++ unsigned int i; ++ ++ et->hash_mask = size - 1; ++ et->table = vcalloc(size, sizeof(struct list_head)); ++ if (!et->table) ++ return -ENOMEM; ++ ++ for (i = 0; i < size; i++) ++ INIT_LIST_HEAD(et->table + i); ++ ++ return 0; ++} ++ ++static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) ++{ ++ struct list_head *slot, *entry, *temp; ++ struct exception *ex; ++ int i, size; ++ ++ size = et->hash_mask + 1; ++ for (i = 0; i < size; i++) { ++ slot = et->table + i; ++ ++ list_for_each_safe(entry, temp, slot) { ++ ex = list_entry(entry, struct exception, hash_list); ++ kmem_cache_free(mem, ex); ++ } ++ } ++ ++ vfree(et->table); ++} ++ ++/* ++ * FIXME: check how this hash fn is performing. 
++ */ ++static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) ++{ ++ return chunk & et->hash_mask; ++} ++ ++static void insert_exception(struct exception_table *eh, struct exception *e) ++{ ++ struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; ++ list_add(&e->hash_list, l); ++} ++ ++static inline void remove_exception(struct exception *e) ++{ ++ list_del(&e->hash_list); ++} ++ ++/* ++ * Return the exception data for a sector, or NULL if not ++ * remapped. ++ */ ++static struct exception *lookup_exception(struct exception_table *et, ++ chunk_t chunk) ++{ ++ struct list_head *slot, *el; ++ struct exception *e; ++ ++ slot = &et->table[exception_hash(et, chunk)]; ++ list_for_each(el, slot) { ++ e = list_entry(el, struct exception, hash_list); ++ if (e->old_chunk == chunk) ++ return e; ++ } ++ ++ return NULL; ++} ++ ++static inline struct exception *alloc_exception(void) ++{ ++ struct exception *e; ++ ++ e = kmem_cache_alloc(exception_cache, GFP_NOIO); ++ if (!e) ++ e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); ++ ++ return e; ++} ++ ++static inline void free_exception(struct exception *e) ++{ ++ kmem_cache_free(exception_cache, e); ++} ++ ++static inline struct pending_exception *alloc_pending_exception(void) ++{ ++ return mempool_alloc(pending_pool, GFP_NOIO); ++} ++ ++static inline void free_pending_exception(struct pending_exception *pe) ++{ ++ mempool_free(pe, pending_pool); ++} ++ ++int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) ++{ ++ struct exception *e; ++ ++ e = alloc_exception(); ++ if (!e) ++ return -ENOMEM; ++ ++ e->old_chunk = old; ++ e->new_chunk = new; ++ insert_exception(&s->complete, e); ++ return 0; ++} ++ ++/* ++ * Hard coded magic. ++ */ ++static int calc_max_buckets(void) ++{ ++ unsigned long mem; ++ ++ mem = num_physpages << PAGE_SHIFT; ++ mem /= 50; ++ mem /= sizeof(struct list_head); ++ ++ return mem; ++} ++ ++/* ++ * Rounds a number down to a power of 2. 
++ */ ++static inline uint32_t round_down(uint32_t n) ++{ ++ while (n & (n - 1)) ++ n &= (n - 1); ++ return n; ++} ++ ++/* ++ * Allocate room for a suitable hash table. ++ */ ++static int init_hash_tables(struct dm_snapshot *s) ++{ ++ sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; ++ ++ /* ++ * Calculate based on the size of the original volume or ++ * the COW volume... ++ */ ++ cow_dev_size = get_dev_size(s->cow->dev); ++ origin_dev_size = get_dev_size(s->origin->dev); ++ max_buckets = calc_max_buckets(); ++ ++ hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size; ++ hash_size = min(hash_size, max_buckets); ++ ++ /* Round it down to a power of 2 */ ++ hash_size = round_down(hash_size); ++ if (init_exception_table(&s->complete, hash_size)) ++ return -ENOMEM; ++ ++ /* ++ * Allocate hash table for in-flight exceptions ++ * Make this smaller than the real hash table ++ */ ++ hash_size >>= 3; ++ if (!hash_size) ++ hash_size = 64; ++ ++ if (init_exception_table(&s->pending, hash_size)) { ++ exit_exception_table(&s->complete, exception_cache); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Round a number up to the nearest 'size' boundary. size must ++ * be a power of 2. ++ */ ++static inline ulong round_up(ulong n, ulong size) ++{ ++ size--; ++ return (n + size) & ~size; ++} ++ ++/* ++ * Construct a snapshot mapping:

++ */ ++static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ++{ ++ struct dm_snapshot *s; ++ unsigned long chunk_size; ++ int r = -EINVAL; ++ char persistent; ++ char *origin_path; ++ char *cow_path; ++ char *value; ++ int blocksize; ++ ++ if (argc < 4) { ++ ti->error = "dm-snapshot: requires exactly 4 arguments"; ++ r = -EINVAL; ++ goto bad1; ++ } ++ ++ origin_path = argv[0]; ++ cow_path = argv[1]; ++ persistent = toupper(*argv[2]); ++ ++ if (persistent != 'P' && persistent != 'N') { ++ ti->error = "Persistent flag is not P or N"; ++ r = -EINVAL; ++ goto bad1; ++ } ++ ++ chunk_size = simple_strtoul(argv[3], &value, 10); ++ if (chunk_size == 0 || value == NULL) { ++ ti->error = "Invalid chunk size"; ++ r = -EINVAL; ++ goto bad1; ++ } ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (s == NULL) { ++ ti->error = "Cannot allocate snapshot context private " ++ "structure"; ++ r = -ENOMEM; ++ goto bad1; ++ } ++ ++ r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); ++ if (r) { ++ ti->error = "Cannot get origin device"; ++ goto bad2; ++ } ++ ++ /* FIXME: get cow length */ ++ r = dm_get_device(ti, cow_path, 0, 0, ++ FMODE_READ | FMODE_WRITE, &s->cow); ++ if (r) { ++ dm_put_device(ti, s->origin); ++ ti->error = "Cannot get COW device"; ++ goto bad2; ++ } ++ ++ /* ++ * Chunk size must be multiple of page size. Silently ++ * round up if it's not. 
++ */ ++ chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE); ++ ++ /* Validate the chunk size against the device block size */ ++ blocksize = get_hardsect_size(s->cow->dev); ++ if (chunk_size % (blocksize / SECTOR_SIZE)) { ++ ti->error = "Chunk size is not a multiple of device blocksize"; ++ r = -EINVAL; ++ goto bad3; ++ } ++ ++ /* Check the sizes are small enough to fit in one kiovec */ ++ if (chunk_size > KIO_MAX_SECTORS) { ++ ti->error = "Chunk size is too big"; ++ r = -EINVAL; ++ goto bad3; ++ } ++ ++ /* Check chunk_size is a power of 2 */ ++ if (chunk_size & (chunk_size - 1)) { ++ ti->error = "Chunk size is not a power of 2"; ++ r = -EINVAL; ++ goto bad3; ++ } ++ ++ s->chunk_size = chunk_size; ++ s->chunk_mask = chunk_size - 1; ++ s->type = persistent; ++ for (s->chunk_shift = 0; chunk_size; ++ s->chunk_shift++, chunk_size >>= 1) ++ ; ++ s->chunk_shift--; ++ ++ s->valid = 1; ++ s->have_metadata = 0; ++ s->last_percent = 0; ++ init_rwsem(&s->lock); ++ s->table = ti->table; ++ ++ /* Allocate hash table for COW data */ ++ if (init_hash_tables(s)) { ++ ti->error = "Unable to allocate hash table space"; ++ r = -ENOMEM; ++ goto bad3; ++ } ++ ++ /* ++ * Check the persistent flag - done here because we need the iobuf ++ * to check the LV header ++ */ ++ s->store.snap = s; ++ ++ if (persistent == 'P') ++ r = dm_create_persistent(&s->store, s->chunk_size); ++ else ++ r = dm_create_transient(&s->store, s, blocksize); ++ ++ if (r) { ++ ti->error = "Couldn't create exception store"; ++ r = -EINVAL; ++ goto bad4; ++ } ++ ++ r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); ++ if (r) { ++ ti->error = "Could not create kcopyd client"; ++ goto bad5; ++ } ++ ++ /* Flush IO to the origin device */ ++ fsync_dev(s->origin->dev); ++ ++ /* Add snapshot to the list of snapshots for this origin */ ++ if (register_snapshot(s)) { ++ r = -EINVAL; ++ ti->error = "Cannot register snapshot origin"; ++ goto bad6; ++ } ++ ++ ti->private = s; ++ return 0; ++ ++ bad6: ++ 
kcopyd_client_destroy(s->kcopyd_client); ++ ++ bad5: ++ s->store.destroy(&s->store); ++ ++ bad4: ++ exit_exception_table(&s->pending, pending_cache); ++ exit_exception_table(&s->complete, exception_cache); ++ ++ bad3: ++ dm_put_device(ti, s->cow); ++ dm_put_device(ti, s->origin); ++ ++ bad2: ++ kfree(s); ++ ++ bad1: ++ return r; ++} ++ ++static void snapshot_dtr(struct dm_target *ti) ++{ ++ struct dm_snapshot *s = (struct dm_snapshot *) ti->private; ++ ++ dm_table_event(ti->table); ++ ++ unregister_snapshot(s); ++ ++ exit_exception_table(&s->pending, pending_cache); ++ exit_exception_table(&s->complete, exception_cache); ++ ++ /* Deallocate memory used */ ++ s->store.destroy(&s->store); ++ ++ dm_put_device(ti, s->origin); ++ dm_put_device(ti, s->cow); ++ kcopyd_client_destroy(s->kcopyd_client); ++ kfree(s); ++} ++ ++/* ++ * We hold lists of buffer_heads, using the b_reqnext field. ++ */ ++static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh) ++{ ++ bh->b_reqnext = *queue; ++ *queue = bh; ++} ++ ++/* ++ * FIXME: inefficient. ++ */ ++static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs) ++{ ++ while (*queue) ++ queue = &((*queue)->b_reqnext); ++ ++ *queue = bhs; ++} ++ ++/* ++ * Flush a list of buffers. ++ */ ++static void flush_buffers(struct buffer_head *bh) ++{ ++ struct buffer_head *n; ++ ++ DMDEBUG("begin flush"); ++ while (bh) { ++ n = bh->b_reqnext; ++ bh->b_reqnext = NULL; ++ DMDEBUG("flushing %p", bh); ++ generic_make_request(WRITE, bh); ++ bh = n; ++ } ++ ++ run_task_queue(&tq_disk); ++} ++ ++/* ++ * Error a list of buffers. 
++ */ ++static void error_buffers(struct buffer_head *bh) ++{ ++ struct buffer_head *n; ++ ++ while (bh) { ++ n = bh->b_reqnext; ++ bh->b_reqnext = NULL; ++ buffer_IO_error(bh); ++ bh = n; ++ } ++} ++ ++static struct buffer_head *__flush_bhs(struct pending_exception *pe) ++{ ++ struct pending_exception *sibling; ++ ++ if (list_empty(&pe->siblings)) ++ return pe->origin_bhs; ++ ++ sibling = list_entry(pe->siblings.next, ++ struct pending_exception, siblings); ++ ++ list_del(&pe->siblings); ++ ++ /* FIXME: I think there's a race on SMP machines here, add spin lock */ ++ queue_buffers(&sibling->origin_bhs, pe->origin_bhs); ++ ++ return NULL; ++} ++ ++static void pending_complete(struct pending_exception *pe, int success) ++{ ++ struct exception *e; ++ struct dm_snapshot *s = pe->snap; ++ struct buffer_head *flush = NULL; ++ ++ if (success) { ++ e = alloc_exception(); ++ if (!e) { ++ DMWARN("Unable to allocate exception."); ++ down_write(&s->lock); ++ s->store.drop_snapshot(&s->store); ++ s->valid = 0; ++ flush = __flush_bhs(pe); ++ up_write(&s->lock); ++ ++ error_buffers(pe->snapshot_bhs); ++ goto out; ++ } ++ ++ /* ++ * Add a proper exception, and remove the ++ * in-flight exception from the list. 
++ */ ++ down_write(&s->lock); ++ ++ memcpy(e, &pe->e, sizeof(*e)); ++ insert_exception(&s->complete, e); ++ remove_exception(&pe->e); ++ flush = __flush_bhs(pe); ++ ++ /* Submit any pending write BHs */ ++ up_write(&s->lock); ++ ++ flush_buffers(pe->snapshot_bhs); ++ DMDEBUG("Exception completed successfully."); ++ ++ /* Notify any interested parties */ ++ if (s->store.fraction_full) { ++ sector_t numerator, denominator; ++ int pc; ++ ++ s->store.fraction_full(&s->store, &numerator, ++ &denominator); ++ pc = numerator * 100 / denominator; ++ ++ if (pc >= s->last_percent + WAKE_UP_PERCENT) { ++ dm_table_event(s->table); ++ s->last_percent = pc - pc % WAKE_UP_PERCENT; ++ } ++ } ++ ++ } else { ++ /* Read/write error - snapshot is unusable */ ++ down_write(&s->lock); ++ if (s->valid) ++ DMERR("Error reading/writing snapshot"); ++ s->store.drop_snapshot(&s->store); ++ s->valid = 0; ++ remove_exception(&pe->e); ++ flush = __flush_bhs(pe); ++ up_write(&s->lock); ++ ++ error_buffers(pe->snapshot_bhs); ++ ++ dm_table_event(s->table); ++ DMDEBUG("Exception failed."); ++ } ++ ++ out: ++ if (flush) ++ flush_buffers(flush); ++ ++ free_pending_exception(pe); ++} ++ ++static void commit_callback(void *context, int success) ++{ ++ struct pending_exception *pe = (struct pending_exception *) context; ++ pending_complete(pe, success); ++} ++ ++/* ++ * Called when the copy I/O has finished. kcopyd actually runs ++ * this code so don't block. ++ */ ++static void copy_callback(int read_err, unsigned int write_err, void *context) ++{ ++ struct pending_exception *pe = (struct pending_exception *) context; ++ struct dm_snapshot *s = pe->snap; ++ ++ if (read_err || write_err) ++ pending_complete(pe, 0); ++ ++ else ++ /* Update the metadata if we are persistent */ ++ s->store.commit_exception(&s->store, &pe->e, commit_callback, ++ pe); ++} ++ ++/* ++ * Dispatches the copy operation to kcopyd. 
++ */ ++static inline void start_copy(struct pending_exception *pe) ++{ ++ struct dm_snapshot *s = pe->snap; ++ struct io_region src, dest; ++ kdev_t dev = s->origin->dev; ++ int *sizes = blk_size[major(dev)]; ++ sector_t dev_size = (sector_t) -1; ++ ++ if (pe->started) ++ return; ++ ++ /* this is protected by snap->lock */ ++ pe->started = 1; ++ ++ if (sizes && sizes[minor(dev)]) ++ dev_size = sizes[minor(dev)] << 1; ++ ++ src.dev = dev; ++ src.sector = chunk_to_sector(s, pe->e.old_chunk); ++ src.count = min(s->chunk_size, dev_size - src.sector); ++ ++ dest.dev = s->cow->dev; ++ dest.sector = chunk_to_sector(s, pe->e.new_chunk); ++ dest.count = src.count; ++ ++ /* Hand over to kcopyd */ ++ kcopyd_copy(s->kcopyd_client, ++ &src, 1, &dest, 0, copy_callback, pe); ++} ++ ++/* ++ * Looks to see if this snapshot already has a pending exception ++ * for this chunk, otherwise it allocates a new one and inserts ++ * it into the pending table. ++ */ ++static struct pending_exception *find_pending_exception(struct dm_snapshot *s, ++ struct buffer_head *bh) ++{ ++ struct exception *e; ++ struct pending_exception *pe; ++ chunk_t chunk = sector_to_chunk(s, bh->b_rsector); ++ ++ /* ++ * Is there a pending exception for this already ? 
++ */ ++ e = lookup_exception(&s->pending, chunk); ++ if (e) { ++ /* cast the exception to a pending exception */ ++ pe = list_entry(e, struct pending_exception, e); ++ ++ } else { ++ /* Create a new pending exception */ ++ pe = alloc_pending_exception(); ++ pe->e.old_chunk = chunk; ++ pe->origin_bhs = pe->snapshot_bhs = NULL; ++ INIT_LIST_HEAD(&pe->siblings); ++ pe->snap = s; ++ pe->started = 0; ++ ++ if (s->store.prepare_exception(&s->store, &pe->e)) { ++ free_pending_exception(pe); ++ s->valid = 0; ++ return NULL; ++ } ++ ++ insert_exception(&s->pending, &pe->e); ++ } ++ ++ return pe; ++} ++ ++static inline void remap_exception(struct dm_snapshot *s, struct exception *e, ++ struct buffer_head *bh) ++{ ++ bh->b_rdev = s->cow->dev; ++ bh->b_rsector = chunk_to_sector(s, e->new_chunk) + ++ (bh->b_rsector & s->chunk_mask); ++} ++ ++static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw, ++ union map_info *map_context) ++{ ++ struct exception *e; ++ struct dm_snapshot *s = (struct dm_snapshot *) ti->private; ++ int r = 1; ++ chunk_t chunk; ++ struct pending_exception *pe; ++ ++ chunk = sector_to_chunk(s, bh->b_rsector); ++ ++ /* Full snapshots are not usable */ ++ if (!s->valid) ++ return -1; ++ ++ /* ++ * Write to snapshot - higher level takes care of RW/RO ++ * flags so we should only get this if we are ++ * writeable. ++ */ ++ if (rw == WRITE) { ++ ++ down_write(&s->lock); ++ ++ /* If the block is already remapped - use that, else remap it */ ++ e = lookup_exception(&s->complete, chunk); ++ if (e) ++ remap_exception(s, e, bh); ++ ++ else { ++ pe = find_pending_exception(s, bh); ++ ++ if (!pe) { ++ s->store.drop_snapshot(&s->store); ++ s->valid = 0; ++ r = -EIO; ++ } else { ++ remap_exception(s, &pe->e, bh); ++ queue_buffer(&pe->snapshot_bhs, bh); ++ start_copy(pe); ++ r = 0; ++ } ++ } ++ ++ up_write(&s->lock); ++ ++ } else { ++ /* ++ * FIXME: this read path scares me because we ++ * always use the origin when we have a pending ++ * exception. 
However I can't think of a ++ * situation where this is wrong - ejt. ++ */ ++ ++ /* Do reads */ ++ down_read(&s->lock); ++ ++ /* See if it has been remapped */ ++ e = lookup_exception(&s->complete, chunk); ++ if (e) ++ remap_exception(s, e, bh); ++ else ++ bh->b_rdev = s->origin->dev; ++ ++ up_read(&s->lock); ++ } ++ ++ return r; ++} ++ ++void snapshot_resume(struct dm_target *ti) ++{ ++ struct dm_snapshot *s = (struct dm_snapshot *) ti->private; ++ ++ if (s->have_metadata) ++ return; ++ ++ if (s->store.read_metadata(&s->store)) { ++ down_write(&s->lock); ++ s->valid = 0; ++ up_write(&s->lock); ++ } ++ ++ s->have_metadata = 1; ++} ++ ++static int snapshot_status(struct dm_target *ti, status_type_t type, ++ char *result, unsigned int maxlen) ++{ ++ struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; ++ char cow[16]; ++ char org[16]; ++ ++ switch (type) { ++ case STATUSTYPE_INFO: ++ if (!snap->valid) ++ snprintf(result, maxlen, "Invalid"); ++ else { ++ if (snap->store.fraction_full) { ++ sector_t numerator, denominator; ++ snap->store.fraction_full(&snap->store, ++ &numerator, ++ &denominator); ++ snprintf(result, maxlen, ++ SECTOR_FORMAT "/" SECTOR_FORMAT, ++ numerator, denominator); ++ } ++ else ++ snprintf(result, maxlen, "Unknown"); ++ } ++ break; ++ ++ case STATUSTYPE_TABLE: ++ /* ++ * kdevname returns a static pointer so we need ++ * to make private copies if the output is to ++ * make sense.
++ */ ++ snprintf(cow, sizeof(cow), "%s", dm_kdevname(snap->cow->dev)); /* always NUL-terminated, unlike strncpy */ ++ snprintf(org, sizeof(org), "%s", dm_kdevname(snap->origin->dev)); ++ snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT, org, cow, ++ snap->type, snap->chunk_size); /* chunk_size is a sector_t-sized chunk_t */ ++ break; ++ } ++ ++ return 0; ++} ++ ++/*----------------------------------------------------------------- ++ * Origin methods ++ *---------------------------------------------------------------*/ ++static void list_merge(struct list_head *l1, struct list_head *l2) ++{ ++ struct list_head *l1_n, *l2_p; ++ ++ l1_n = l1->next; ++ l2_p = l2->prev; ++ ++ l1->next = l2; ++ l2->prev = l1; ++ ++ l2_p->next = l1_n; ++ l1_n->prev = l2_p; ++} ++ ++static int __origin_write(struct list_head *snapshots, struct buffer_head *bh) ++{ ++ int r = 1, first = 1; ++ struct list_head *sl; ++ struct dm_snapshot *snap; ++ struct exception *e; ++ struct pending_exception *pe, *last = NULL; ++ chunk_t chunk; ++ ++ /* Do all the snapshots on this origin */ ++ list_for_each(sl, snapshots) { ++ snap = list_entry(sl, struct dm_snapshot, list); ++ ++ /* Only deal with valid snapshots */ ++ if (!snap->valid) ++ continue; ++ ++ down_write(&snap->lock); ++ ++ /* ++ * Remember, different snapshots can have ++ * different chunk sizes. ++ */ ++ chunk = sector_to_chunk(snap, bh->b_rsector); ++ ++ /* ++ * Check exception table to see if block ++ * is already remapped in this snapshot ++ * and trigger an exception if not. ++ */ ++ e = lookup_exception(&snap->complete, chunk); ++ if (!e) { ++ pe = find_pending_exception(snap, bh); ++ if (!pe) { ++ snap->store.drop_snapshot(&snap->store); ++ snap->valid = 0; ++ ++ } else { ++ if (last) ++ list_merge(&pe->siblings, ++ &last->siblings); ++ ++ last = pe; ++ r = 0; ++ } ++ } ++ ++ up_write(&snap->lock); ++ } ++ ++ /* ++ * Now that we have a complete pe list we can start the copying.
++ */ ++ if (last) { ++ pe = last; ++ do { ++ down_write(&pe->snap->lock); ++ if (first) ++ queue_buffer(&pe->origin_bhs, bh); ++ start_copy(pe); ++ up_write(&pe->snap->lock); ++ first = 0; ++ pe = list_entry(pe->siblings.next, ++ struct pending_exception, siblings); ++ ++ } while (pe != last); ++ } ++ ++ return r; ++} ++ ++/* ++ * Called on a write from the origin driver. ++ */ ++int do_origin(struct dm_dev *origin, struct buffer_head *bh) ++{ ++ struct origin *o; ++ int r; ++ ++ down_read(&_origins_lock); ++ o = __lookup_origin(origin->dev); ++ if (!o) ++ BUG(); ++ ++ r = __origin_write(&o->snapshots, bh); ++ up_read(&_origins_lock); ++ ++ return r; ++} ++ ++/* ++ * Origin: maps a linear range of a device, with hooks for snapshotting. ++ */ ++ ++/* ++ * Construct an origin mapping: ++ * The context for an origin is merely a 'struct dm_dev *' ++ * pointing to the real device. ++ */ ++static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) ++{ ++ int r; ++ struct dm_dev *dev; ++ ++ if (argc != 1) { ++ ti->error = "dm-origin: incorrect number of arguments"; ++ return -EINVAL; ++ } ++ ++ r = dm_get_device(ti, argv[0], 0, ti->len, ++ dm_table_get_mode(ti->table), &dev); ++ if (r) { ++ ti->error = "Cannot get target device"; ++ return r; ++ } ++ ++ ti->private = dev; ++ return 0; ++} ++ ++static void origin_dtr(struct dm_target *ti) ++{ ++ struct dm_dev *dev = (struct dm_dev *) ti->private; ++ dm_put_device(ti, dev); ++} ++ ++static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw, ++ union map_info *map_context) ++{ ++ struct dm_dev *dev = (struct dm_dev *) ti->private; ++ bh->b_rdev = dev->dev; ++ ++ /* Only tell snapshots if this is a write */ ++ return (rw == WRITE) ? 
do_origin(dev, bh) : 1; ++} ++ ++static int origin_status(struct dm_target *ti, status_type_t type, char *result, ++ unsigned int maxlen) ++{ ++ struct dm_dev *dev = (struct dm_dev *) ti->private; ++ ++ switch (type) { ++ case STATUSTYPE_INFO: ++ result[0] = '\0'; ++ break; ++ ++ case STATUSTYPE_TABLE: ++ snprintf(result, maxlen, "%s", dm_kdevname(dev->dev)); ++ break; ++ } ++ ++ return 0; ++} ++ ++static struct target_type origin_target = { ++ name: "snapshot-origin", ++ module: THIS_MODULE, ++ ctr: origin_ctr, ++ dtr: origin_dtr, ++ map: origin_map, ++ status: origin_status, ++}; ++ ++static struct target_type snapshot_target = { ++ name: "snapshot", ++ module: THIS_MODULE, ++ ctr: snapshot_ctr, ++ dtr: snapshot_dtr, ++ map: snapshot_map, ++ resume: snapshot_resume, ++ status: snapshot_status, ++}; ++ ++int __init dm_snapshot_init(void) ++{ ++ int r; ++ ++ r = dm_register_target(&snapshot_target); ++ if (r) { ++ DMERR("snapshot target register failed %d", r); ++ return r; ++ } ++ ++ r = dm_register_target(&origin_target); ++ if (r < 0) { ++ DMERR("Device mapper: Origin: register failed %d\n", r); ++ goto bad1; ++ } ++ ++ r = init_origin_hash(); ++ if (r) { ++ DMERR("init_origin_hash failed."); ++ goto bad2; ++ } ++ ++ exception_cache = kmem_cache_create("dm-snapshot-ex", ++ sizeof(struct exception), ++ __alignof__(struct exception), ++ 0, NULL, NULL); ++ if (!exception_cache) { ++ DMERR("Couldn't create exception cache."); ++ r = -ENOMEM; ++ goto bad3; ++ } ++ ++ pending_cache = ++ kmem_cache_create("dm-snapshot-in", ++ sizeof(struct pending_exception), ++ __alignof__(struct pending_exception), ++ 0, NULL, NULL); ++ if (!pending_cache) { ++ DMERR("Couldn't create pending cache."); ++ r = -ENOMEM; ++ goto bad4; ++ } ++ ++ pending_pool = mempool_create(128, mempool_alloc_slab, ++ mempool_free_slab, pending_cache); ++ if (!pending_pool) { ++ DMERR("Couldn't create pending pool."); ++ r = -ENOMEM; ++ goto bad5; ++ } ++ ++ return 0; ++ ++ bad5: ++ 
kmem_cache_destroy(pending_cache); ++ bad4: ++ kmem_cache_destroy(exception_cache); ++ bad3: ++ exit_origin_hash(); ++ bad2: ++ dm_unregister_target(&origin_target); ++ bad1: ++ dm_unregister_target(&snapshot_target); ++ return r; ++} ++ ++void dm_snapshot_exit(void) ++{ ++ int r; ++ ++ r = dm_unregister_target(&snapshot_target); ++ if (r) ++ DMERR("snapshot unregister failed %d", r); ++ ++ r = dm_unregister_target(&origin_target); ++ if (r) ++ DMERR("origin unregister failed %d", r); ++ ++ exit_origin_hash(); ++ mempool_destroy(pending_pool); ++ kmem_cache_destroy(pending_cache); ++ kmem_cache_destroy(exception_cache); ++} +--- linux-2.4.22/drivers/md/dm-snapshot.h Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-snapshot.h Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,158 @@ ++/* ++ * dm-snapshot.c ++ * ++ * Copyright (C) 2001-2002 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. ++ */ ++ ++#ifndef DM_SNAPSHOT_H ++#define DM_SNAPSHOT_H ++ ++#include "dm.h" ++#include ++ ++struct exception_table { ++ uint32_t hash_mask; ++ struct list_head *table; ++}; ++ ++/* ++ * The snapshot code deals with largish chunks of the disk at a ++ * time. Typically 64k - 256k. ++ */ ++/* FIXME: can we get away with limiting these to a uint32_t ? */ ++typedef sector_t chunk_t; ++ ++/* ++ * An exception is used where an old chunk of data has been ++ * replaced by a new one. ++ */ ++struct exception { ++ struct list_head hash_list; ++ ++ chunk_t old_chunk; ++ chunk_t new_chunk; ++}; ++ ++/* ++ * Abstraction to handle the meta/layout of exception stores (the ++ * COW device). ++ */ ++struct exception_store { ++ ++ /* ++ * Destroys this object when you've finished with it. ++ */ ++ void (*destroy) (struct exception_store *store); ++ ++ /* ++ * The target shouldn't read the COW device until this is ++ * called. ++ */ ++ int (*read_metadata) (struct exception_store *store); ++ ++ /* ++ * Find somewhere to store the next exception. 
++ */ ++ int (*prepare_exception) (struct exception_store *store, ++ struct exception *e); ++ ++ /* ++ * Update the metadata with this exception. ++ */ ++ void (*commit_exception) (struct exception_store *store, ++ struct exception *e, ++ void (*callback) (void *, int success), ++ void *callback_context); ++ ++ /* ++ * The snapshot is invalid, note this in the metadata. ++ */ ++ void (*drop_snapshot) (struct exception_store *store); ++ ++ /* ++ * Return how full the snapshot is. ++ */ ++ void (*fraction_full) (struct exception_store *store, ++ sector_t *numerator, ++ sector_t *denominator); ++ ++ struct dm_snapshot *snap; ++ void *context; ++}; ++ ++struct dm_snapshot { ++ struct rw_semaphore lock; ++ struct dm_table *table; ++ ++ struct dm_dev *origin; ++ struct dm_dev *cow; ++ ++ /* List of snapshots per Origin */ ++ struct list_head list; ++ ++ /* Size of data blocks saved - must be a power of 2 */ ++ chunk_t chunk_size; ++ chunk_t chunk_mask; ++ chunk_t chunk_shift; ++ ++ /* You can't use a snapshot if this is 0 (e.g. if full) */ ++ int valid; ++ int have_metadata; ++ ++ /* Used for display of table */ ++ char type; ++ ++ /* The last percentage we notified */ ++ int last_percent; ++ ++ struct exception_table pending; ++ struct exception_table complete; ++ ++ /* The on disk metadata handler */ ++ struct exception_store store; ++ ++ struct kcopyd_client *kcopyd_client; ++}; ++ ++/* ++ * Used by the exception stores to load exceptions when ++ * initialising. ++ */ ++int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); ++ ++/* ++ * Constructors for the persistent and transient ++ * exception stores. ++ */ ++int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); ++ ++int dm_create_transient(struct exception_store *store, ++ struct dm_snapshot *s, int blocksize); ++ ++/* ++ * Return the number of sectors in the device.
++ */ ++static inline sector_t get_dev_size(kdev_t dev) ++{ ++ int *sizes; ++ ++ sizes = blk_size[MAJOR(dev)]; ++ if (sizes) ++ return sizes[MINOR(dev)] << 1; ++ ++ return 0; ++} ++ ++static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) ++{ ++ return (sector & ~s->chunk_mask) >> s->chunk_shift; ++} ++ ++static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) ++{ ++ return chunk << s->chunk_shift; ++} ++ ++#endif +--- linux-2.4.22/drivers/md/dm-stripe.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-stripe.c Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,258 @@ ++/* ++ * Copyright (C) 2001 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. ++ */ ++ ++#include "dm.h" ++ ++#include ++#include ++#include ++#include ++ ++struct stripe { ++ struct dm_dev *dev; ++ sector_t physical_start; ++}; ++ ++struct stripe_c { ++ uint32_t stripes; ++ ++ /* The size of this target / num. stripes */ ++ uint32_t stripe_width; ++ ++ /* stripe chunk size */ ++ uint32_t chunk_shift; ++ sector_t chunk_mask; ++ ++ struct stripe stripe[0]; ++}; ++ ++static inline struct stripe_c *alloc_context(unsigned int stripes) ++{ ++ size_t len; ++ ++ if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), ++ stripes)) ++ return NULL; ++ ++ len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); ++ ++ return kmalloc(len, GFP_KERNEL); ++} ++ ++/* ++ * Parse a single pair ++ */ ++static int get_stripe(struct dm_target *ti, struct stripe_c *sc, ++ unsigned int stripe, char **argv) ++{ ++ sector_t start; ++ ++ if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) ++ return -EINVAL; ++ ++ if (dm_get_device(ti, argv[0], start, sc->stripe_width, ++ dm_table_get_mode(ti->table), ++ &sc->stripe[stripe].dev)) ++ return -ENXIO; ++ ++ sc->stripe[stripe].physical_start = start; ++ return 0; ++} ++ ++/* ++ * FIXME: Nasty function, only present because we can't link ++ * against __moddi3 and __divdi3. 
++ * ++ * returns a == b * n; b must be non-zero (the loop never ends for b == 0) ++ */ ++static int multiple(sector_t a, sector_t b, sector_t *n) ++{ ++ sector_t acc, prev, i; ++ ++ *n = 0; ++ while (a >= b) { ++ for (acc = b, prev = 0, i = 1; ++ acc <= a; ++ prev = acc, acc <<= 1, i <<= 1) ++ ; ++ ++ a -= prev; ++ *n += i >> 1; ++ } ++ ++ return a == 0; ++} ++ ++/* ++ * Construct a striped mapping. ++ * <number of stripes> <chunk size> [<dev_path> <offset>]+ ++ */ ++static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ++{ ++ struct stripe_c *sc; ++ sector_t width; ++ uint32_t stripes; ++ uint32_t chunk_size; ++ char *end; ++ int r; ++ unsigned int i; ++ ++ if (argc < 2) { ++ ti->error = "dm-stripe: Not enough arguments"; ++ return -EINVAL; ++ } ++ ++ stripes = simple_strtoul(argv[0], &end, 10); ++ if (*end || !stripes) { /* zero stripes would hang multiple() below */ ++ ti->error = "dm-stripe: Invalid stripe count"; ++ return -EINVAL; ++ } ++ ++ chunk_size = simple_strtoul(argv[1], &end, 10); ++ if (*end) { ++ ti->error = "dm-stripe: Invalid chunk_size"; ++ return -EINVAL; ++ } ++ ++ /* ++ * chunk_size is a power of two ++ */ ++ if (!chunk_size || (chunk_size & (chunk_size - 1))) { ++ ti->error = "dm-stripe: Invalid chunk size"; ++ return -EINVAL; ++ } ++ ++ if (!multiple(ti->len, stripes, &width)) { ++ ti->error = "dm-stripe: Target length not divisable by " ++ "number of stripes"; ++ return -EINVAL; ++ } ++ ++ /* ++ * Do we have enough arguments for that many stripes ? ++ */ ++ if (argc != (2 + 2 * stripes)) { ++ ti->error = "dm-stripe: Not enough destinations specified"; ++ return -EINVAL; ++ } ++ ++ sc = alloc_context(stripes); ++ if (!sc) { ++ ti->error = "dm-stripe: Memory allocation for striped context " ++ "failed"; ++ return -ENOMEM; ++ } ++ ++ sc->stripes = stripes; ++ sc->stripe_width = width; ++ ++ sc->chunk_mask = ((sector_t) chunk_size) - 1; ++ for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) ++ chunk_size >>= 1; ++ sc->chunk_shift--; ++ ++ /* ++ * Get the stripe destinations.
++ */ ++ for (i = 0; i < stripes; i++) { ++ argv += 2; ++ ++ r = get_stripe(ti, sc, i, argv); ++ if (r < 0) { ++ ti->error = "dm-stripe: Couldn't parse stripe " ++ "destination"; ++ while (i--) ++ dm_put_device(ti, sc->stripe[i].dev); ++ kfree(sc); ++ return r; ++ } ++ } ++ ++ ti->private = sc; ++ return 0; ++} ++ ++static void stripe_dtr(struct dm_target *ti) ++{ ++ unsigned int i; ++ struct stripe_c *sc = (struct stripe_c *) ti->private; ++ ++ for (i = 0; i < sc->stripes; i++) ++ dm_put_device(ti, sc->stripe[i].dev); ++ ++ kfree(sc); ++} ++ ++static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw, ++ union map_info *context) ++{ ++ struct stripe_c *sc = (struct stripe_c *) ti->private; ++ ++ sector_t offset = bh->b_rsector - ti->begin; ++ uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); ++ uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ ++ chunk = chunk / sc->stripes; ++ ++ bh->b_rdev = sc->stripe[stripe].dev->dev; ++ bh->b_rsector = sc->stripe[stripe].physical_start + ++ (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); ++ return 1; ++} ++ ++static int stripe_status(struct dm_target *ti, status_type_t type, ++ char *result, unsigned int maxlen) ++{ ++ struct stripe_c *sc = (struct stripe_c *) ti->private; ++ int offset; ++ unsigned int i; ++ ++ switch (type) { ++ case STATUSTYPE_INFO: ++ result[0] = '\0'; ++ break; ++ ++ case STATUSTYPE_TABLE: ++ offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT, ++ sc->stripes, sc->chunk_mask + 1); ++ for (i = 0; i < sc->stripes; i++) { ++ offset += ++ snprintf(result + offset, maxlen - offset, ++ " %s " SECTOR_FORMAT, ++ dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)), ++ sc->stripe[i].physical_start); ++ } ++ break; ++ } ++ return 0; ++} ++ ++static struct target_type stripe_target = { ++ .name = "striped", ++ .module = THIS_MODULE, ++ .ctr = stripe_ctr, ++ .dtr = stripe_dtr, ++ .map = stripe_map, ++ .status = stripe_status, ++}; ++ ++int __init dm_stripe_init(void) ++{ 
++ int r; ++ ++ r = dm_register_target(&stripe_target); ++ if (r < 0) ++ DMWARN("striped target registration failed"); ++ ++ return r; ++} ++ ++void dm_stripe_exit(void) ++{ ++ if (dm_unregister_target(&stripe_target)) ++ DMWARN("striped target unregistration failed"); ++ ++ return; ++} +--- linux-2.4.22/drivers/md/dm-table.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-table.c Tue Nov 18 17:44:56 2003 +@@ -0,0 +1,679 @@ ++/* ++ * Copyright (C) 2001 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. ++ */ ++ ++#include "dm.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MAX_DEPTH 16 ++#define NODE_SIZE L1_CACHE_BYTES ++#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) ++#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) ++ ++struct dm_table { ++ atomic_t holders; ++ ++ /* btree table */ ++ unsigned int depth; ++ unsigned int counts[MAX_DEPTH]; /* in nodes */ ++ sector_t *index[MAX_DEPTH]; ++ ++ unsigned int num_targets; ++ unsigned int num_allocated; ++ sector_t *highs; ++ struct dm_target *targets; ++ ++ /* ++ * Indicates the rw permissions for the new logical ++ * device. This should be a combination of FMODE_READ ++ * and FMODE_WRITE. ++ */ ++ int mode; ++ ++ /* a list of devices used by this table */ ++ struct list_head devices; ++ ++ /* events get handed up using this callback */ ++ void (*event_fn)(void *); ++ void *event_context; ++}; ++ ++/* ++ * Similar to ceiling(log_size(n)) ++ */ ++static unsigned int int_log(unsigned long n, unsigned long base) ++{ ++ int result = 0; ++ ++ while (n > 1) { ++ n = dm_div_up(n, base); ++ result++; ++ } ++ ++ return result; ++} ++ ++/* ++ * Calculate the index of the child node of the n'th node k'th key. ++ */ ++static inline unsigned int get_child(unsigned int n, unsigned int k) ++{ ++ return (n * CHILDREN_PER_NODE) + k; ++} ++ ++/* ++ * Return the n'th node of level l from table t. 
++ */ ++static inline sector_t *get_node(struct dm_table *t, unsigned int l, ++ unsigned int n) ++{ ++ return t->index[l] + (n * KEYS_PER_NODE); ++} ++ ++/* ++ * Return the highest key that you could lookup from the n'th ++ * node on level l of the btree. ++ */ ++static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) ++{ ++ for (; l < t->depth - 1; l++) ++ n = get_child(n, CHILDREN_PER_NODE - 1); ++ ++ if (n >= t->counts[l]) ++ return (sector_t) - 1; ++ ++ return get_node(t, l, n)[KEYS_PER_NODE - 1]; ++} ++ ++/* ++ * Fills in a level of the btree based on the highs of the level ++ * below it. ++ */ ++static int setup_btree_index(unsigned int l, struct dm_table *t) ++{ ++ unsigned int n, k; ++ sector_t *node; ++ ++ for (n = 0U; n < t->counts[l]; n++) { ++ node = get_node(t, l, n); ++ ++ for (k = 0U; k < KEYS_PER_NODE; k++) ++ node[k] = high(t, l + 1, get_child(n, k)); ++ } ++ ++ return 0; ++} ++ ++ ++ ++int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) ++{ ++ struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL); ++ ++ if (!t) ++ return -ENOMEM; ++ ++ memset(t, 0, sizeof(*t)); ++ INIT_LIST_HEAD(&t->devices); ++ atomic_set(&t->holders, 1); ++ ++ num_targets = dm_round_up(num_targets, KEYS_PER_NODE); ++ ++ /* Allocate both the target array and offset array at once. 
*/ ++ t->highs = (sector_t *) vcalloc(sizeof(struct dm_target) + ++ sizeof(sector_t), num_targets); ++ if (!t->highs) { ++ kfree(t); ++ return -ENOMEM; ++ } ++ ++ memset(t->highs, -1, sizeof(*t->highs) * num_targets); ++ ++ t->targets = (struct dm_target *) (t->highs + num_targets); ++ t->num_allocated = num_targets; ++ t->mode = mode; ++ *result = t; ++ return 0; ++} ++ ++static void free_devices(struct list_head *devices) ++{ ++ struct list_head *tmp, *next; ++ ++ for (tmp = devices->next; tmp != devices; tmp = next) { ++ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); ++ next = tmp->next; ++ kfree(dd); ++ } ++} ++ ++void table_destroy(struct dm_table *t) ++{ ++ unsigned int i; ++ ++ /* free the indexes (see dm_table_complete) */ ++ if (t->depth >= 2) ++ vfree(t->index[t->depth - 2]); ++ ++ /* free the targets */ ++ for (i = 0; i < t->num_targets; i++) { ++ struct dm_target *tgt = t->targets + i; ++ ++ if (tgt->type->dtr) ++ tgt->type->dtr(tgt); ++ ++ dm_put_target_type(tgt->type); ++ } ++ ++ vfree(t->highs); ++ ++ /* free the device list */ ++ if (t->devices.next != &t->devices) { ++ DMWARN("devices still present during destroy: " ++ "dm_table_remove_device calls missing"); ++ ++ free_devices(&t->devices); ++ } ++ ++ kfree(t); ++} ++ ++void dm_table_get(struct dm_table *t) ++{ ++ atomic_inc(&t->holders); ++} ++ ++void dm_table_put(struct dm_table *t) ++{ ++ if (atomic_dec_and_test(&t->holders)) ++ table_destroy(t); ++} ++ ++/* ++ * Convert a device path to a dev_t. 
++ */ ++static int lookup_device(const char *path, kdev_t *dev) ++{ ++ int r; ++ struct nameidata nd; ++ struct inode *inode; ++ ++ r = path_init(path, LOOKUP_FOLLOW, &nd) ? /* 0: nd is already resolved */ ++ path_walk(path, &nd) : 0; ++ ++ if (r) /* path_walk() has already released nd on failure */ ++ return r; ++ ++ inode = nd.dentry->d_inode; ++ if (!inode) { ++ r = -ENOENT; ++ goto out; ++ } ++ ++ if (!S_ISBLK(inode->i_mode)) { ++ r = -ENOTBLK; ++ goto out; ++ } ++ ++ *dev = inode->i_rdev; ++ ++ out: ++ path_release(&nd); ++ return r; ++} ++ ++/* ++ * See if we've already got a device in the list. ++ */ ++static struct dm_dev *find_device(struct list_head *l, kdev_t dev) ++{ ++ struct list_head *tmp; ++ ++ list_for_each(tmp, l) { ++ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); ++ if (kdev_same(dd->dev, dev)) ++ return dd; ++ } ++ ++ return NULL; ++} ++ ++/* ++ * Open a device so we can use it as a map destination. ++ */ ++static int open_dev(struct dm_dev *dd) ++{ ++ if (dd->bdev) ++ BUG(); ++ ++ dd->bdev = bdget(kdev_t_to_nr(dd->dev)); ++ if (!dd->bdev) ++ return -ENOMEM; ++ ++ return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW); ++} ++ ++/* ++ * Close a device that we've been using. ++ */ ++static void close_dev(struct dm_dev *dd) ++{ ++ if (!dd->bdev) ++ return; ++ ++ blkdev_put(dd->bdev, BDEV_RAW); ++ dd->bdev = NULL; ++} ++ ++/* ++ * If possible (ie. blk_size[major] is set), this checks an area ++ * of a destination device is valid. ++ */ ++static int check_device_area(kdev_t dev, sector_t start, sector_t len) ++{ ++ int *sizes; ++ sector_t dev_size; ++ ++ if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)])) ++ /* we don't know the device details, ++ * so give the benefit of the doubt */ ++ return 1; ++ ++ /* convert to 512-byte sectors */ ++ dev_size <<= 1; ++ ++ return ((start < dev_size) && (len <= (dev_size - start))); ++} ++ ++/* ++ * This upgrades the mode on an already open dm_dev. Being ++ * careful to leave things as they were if we fail to reopen the ++ * device.
++ */ ++static int upgrade_mode(struct dm_dev *dd, int new_mode) ++{ ++ int r; ++ struct dm_dev dd_copy; ++ ++ memcpy(&dd_copy, dd, sizeof(dd_copy)); ++ ++ dd->mode |= new_mode; ++ dd->bdev = NULL; ++ r = open_dev(dd); ++ if (!r) ++ close_dev(&dd_copy); ++ else ++ memcpy(dd, &dd_copy, sizeof(dd_copy)); ++ ++ return r; ++} ++ ++/* ++ * Add a device to the list, or just increment the usage count if ++ * it's already present. ++ */ ++int dm_get_device(struct dm_target *ti, const char *path, sector_t start, ++ sector_t len, int mode, struct dm_dev **result) ++{ ++ int r; ++ kdev_t dev; ++ struct dm_dev *dd; ++ unsigned major, minor; ++ struct dm_table *t = ti->table; ++ ++ if (!t) ++ BUG(); ++ ++ if (sscanf(path, "%u:%u", &major, &minor) == 2) { ++ /* Extract the major/minor numbers */ ++ dev = mk_kdev(major, minor); ++ } else { ++ /* convert the path to a device */ ++ if ((r = lookup_device(path, &dev))) ++ return r; ++ } ++ ++ dd = find_device(&t->devices, dev); ++ if (!dd) { ++ dd = kmalloc(sizeof(*dd), GFP_KERNEL); ++ if (!dd) ++ return -ENOMEM; ++ ++ dd->dev = dev; ++ dd->mode = mode; ++ dd->bdev = NULL; ++ ++ if ((r = open_dev(dd))) { ++ kfree(dd); ++ return r; ++ } ++ ++ atomic_set(&dd->count, 0); ++ list_add(&dd->list, &t->devices); ++ ++ } else if (dd->mode != (mode | dd->mode)) { ++ r = upgrade_mode(dd, mode); ++ if (r) ++ return r; ++ } ++ atomic_inc(&dd->count); ++ ++ if (!check_device_area(dd->dev, start, len)) { ++ DMWARN("device %s too small for target", path); ++ dm_put_device(ti, dd); ++ return -EINVAL; ++ } ++ ++ *result = dd; ++ ++ return 0; ++} ++ ++/* ++ * Decrement a device's use count and remove it if necessary. ++ */ ++void dm_put_device(struct dm_target *ti, struct dm_dev *dd) ++{ ++ if (atomic_dec_and_test(&dd->count)) { ++ close_dev(dd); ++ list_del(&dd->list); ++ kfree(dd); ++ } ++} ++ ++/* ++ * Checks to see if the target joins onto the end of the table.
++ * Returns non-zero if it does.
++ */
++static int adjoin(struct dm_table *table, struct dm_target *ti)
++{
++	struct dm_target *prev;
++
++	/* With no targets yet, the new one must begin at sector 0. */
++	if (!table->num_targets)
++		return !ti->begin;
++
++	prev = &table->targets[table->num_targets - 1];
++	return (ti->begin == (prev->begin + prev->len));
++}
++
++/*
++ * Used to dynamically allocate the arg array.
++ *
++ * The new array holds 64 entries if *array_size is 0, otherwise
++ * twice as many as before.  The old entries are copied across and
++ * *array_size is updated only if the allocation succeeds; old_argv
++ * is freed either way.  Returns the new array, or NULL on failure.
++ */
++static char **realloc_argv(unsigned *array_size, char **old_argv)
++{
++	char **argv;
++	unsigned new_size;
++
++	new_size = *array_size ? *array_size * 2 : 64;
++	argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
++	if (argv) {
++		memcpy(argv, old_argv, *array_size * sizeof(*argv));
++		*array_size = new_size;
++	}
++
++	kfree(old_argv);
++	return argv;
++}
++
++/*
++ * Destructively splits up the argument list to pass to ctr.
++ *
++ * Tokens are separated by whitespace; a backslash quotes the
++ * character that follows it.  The tokens are terminated in place
++ * inside 'input', so the entries stored in *argvp point into that
++ * buffer.  Returns 0 with *argc and *argvp filled in, or -ENOMEM
++ * if the pointer array could not be allocated.
++ */
++static int split_args(int *argc, char ***argvp, char *input)
++{
++	char *start, *end = input, *out, **argv = NULL;
++	unsigned array_size = 0;
++
++	*argc = 0;
++	argv = realloc_argv(&array_size, argv);
++	if (!argv)
++		return -ENOMEM;
++
++	while (1) {
++		start = end;
++
++		/* Skip whitespace */
++		while (*start && isspace(*start))
++			start++;
++
++		if (!*start)
++			break;	/* success, we hit the end */
++
++		/* 'out' is used to remove any back-quotes */
++		end = out = start;
++		while (*end) {
++			/* Everything apart from '\0' can be quoted */
++			if (*end == '\\' && *(end + 1)) {
++				*out++ = *(end + 1);
++				end += 2;
++				continue;
++			}
++
++			if (isspace(*end))
++				break;	/* end of token */
++
++			*out++ = *end++;
++		}
++
++		/* have we already filled the array ? */
++		if ((*argc + 1) > array_size) {
++			argv = realloc_argv(&array_size, argv);
++			if (!argv)
++				return -ENOMEM;
++		}
++
++		/* we know this is whitespace */
++		if (*end)
++			end++;
++
++		/* terminate the string and put it in the array */
++		*out = '\0';
++		argv[*argc] = start;
++		(*argc)++;
++	}
++
++	*argvp = argv;
++	return 0;
++}
++
++/*
++ * Append a target of the named type covering [start, start + len)
++ * to the table, passing the split-up params to its constructor.
++ *
++ * Returns 0 on success, -ENOMEM if the table has no free slot or
++ * the params could not be split, -EINVAL for an unknown type or a
++ * gap before the new target, otherwise the constructor's error.
++ */
++int dm_table_add_target(struct dm_table *t, const char *type,
++			sector_t start, sector_t len, char *params)
++{
++	int r = -EINVAL, argc;
++	char **argv;
++	struct dm_target *tgt;
++
++	if (t->num_targets >= t->num_allocated)
++		return -ENOMEM;
++
++	tgt = t->targets + t->num_targets;
++	memset(tgt, 0, sizeof(*tgt));
++
++	tgt->type = dm_get_target_type(type);
++	if (!tgt->type) {
++		tgt->error = "unknown target type";
++		return -EINVAL;
++	}
++
++	tgt->table = t;
++	tgt->begin = start;
++	tgt->len = len;
++	tgt->error = "Unknown error";
++
++	/*
++	 * Does this target adjoin the previous one ?
++	 */
++	if (!adjoin(t, tgt)) {
++		tgt->error = "Gap in table";
++		r = -EINVAL;
++		goto bad;
++	}
++
++	r = split_args(&argc, &argv, params);
++	if (r) {
++		tgt->error = "couldn't split parameters (insufficient memory)";
++		goto bad;
++	}
++
++	r = tgt->type->ctr(tgt, argc, argv);
++	kfree(argv);
++	if (r)
++		goto bad;
++
++	/* record the last sector of this target as its btree key */
++	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
++	return 0;
++
++      bad:
++	printk(KERN_ERR DM_NAME ": %s\n", tgt->error);
++	dm_put_target_type(tgt->type);
++	return r;
++}
++
++/*
++ * Size every level above the leaves, allocate them in a single
++ * vcalloc() and fill them in from the bottom up.
++ */
++static int setup_indexes(struct dm_table *t)
++{
++	int i;
++	unsigned int total = 0;
++	sector_t *indexes;
++
++	/* allocate the space for *all* the indexes */
++	for (i = t->depth - 2; i >= 0; i--) {
++		t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
++		total += t->counts[i];
++	}
++
++	indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE);
++	if (!indexes)
++		return -ENOMEM;
++
++	/* set up internal nodes, bottom-up */
++	for (i = t->depth - 2, total = 0; i >= 0; i--) {
++		t->index[i] = indexes;
++		indexes += (KEYS_PER_NODE * t->counts[i]);
++		setup_btree_index(i, t);
++	}
++
++	return 0;
++}
++
++/*
++ * Builds the btree to index the map.
++ */
++int dm_table_complete(struct dm_table *t)
++{
++	int r = 0;
++	unsigned int leaf_nodes;
++
++	/* how many indexes will the btree have ? */
++	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
++	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
++
++	/* leaf layer has already been set up */
++	t->counts[t->depth - 1] = leaf_nodes;
++	t->index[t->depth - 1] = t->highs;
++
++	if (t->depth >= 2)
++		r = setup_indexes(t);
++
++	return r;
++}
++
++static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
++/*
++ * Install (or, with fn == NULL, remove) the function that
++ * dm_table_event() calls for this table.
++ */
++void dm_table_event_callback(struct dm_table *t,
++			     void (*fn)(void *), void *context)
++{
++	spin_lock_irq(&_event_lock);
++	t->event_fn = fn;
++	t->event_context = context;
++	spin_unlock_irq(&_event_lock);
++}
++
++/*
++ * Call the table's event function, if one is installed.
++ */
++void dm_table_event(struct dm_table *t)
++{
++	spin_lock(&_event_lock);
++	if (t->event_fn)
++		t->event_fn(t->event_context);
++	spin_unlock(&_event_lock);
++}
++
++/*
++ * Size of the table in sectors: one past the highest sector of the
++ * last target, or 0 for an empty table.
++ */
++sector_t dm_table_get_size(struct dm_table *t)
++{
++	return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
++}
++
++/*
++ * Return the target at 'index', or NULL if the index is out of
++ * range.  Valid indexes run from 0 to num_targets - 1.
++ */
++struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
++{
++	/* '>=': index == num_targets is already past the last target */
++	if (index >= t->num_targets)
++		return NULL;
++
++	return t->targets + index;
++}
++
++/*
++ * Search the btree for the correct target.
++ */ ++struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) ++{ ++ unsigned int l, n = 0, k = 0; ++ sector_t *node; ++ ++ for (l = 0; l < t->depth; l++) { ++ n = get_child(n, k); ++ node = get_node(t, l, n); ++ ++ for (k = 0; k < KEYS_PER_NODE; k++) ++ if (node[k] >= sector) ++ break; ++ } ++ ++ return &t->targets[(KEYS_PER_NODE * n) + k]; ++} ++ ++unsigned int dm_table_get_num_targets(struct dm_table *t) ++{ ++ return t->num_targets; ++} ++ ++struct list_head *dm_table_get_devices(struct dm_table *t) ++{ ++ return &t->devices; ++} ++ ++int dm_table_get_mode(struct dm_table *t) ++{ ++ return t->mode; ++} ++ ++void dm_table_suspend_targets(struct dm_table *t) ++{ ++ int i; ++ ++ for (i = 0; i < t->num_targets; i++) { ++ struct dm_target *ti = t->targets + i; ++ ++ if (ti->type->suspend) ++ ti->type->suspend(ti); ++ } ++} ++ ++void dm_table_resume_targets(struct dm_table *t) ++{ ++ int i; ++ ++ for (i = 0; i < t->num_targets; i++) { ++ struct dm_target *ti = t->targets + i; ++ ++ if (ti->type->resume) ++ ti->type->resume(ti); ++ } ++} ++ ++EXPORT_SYMBOL(dm_get_device); ++EXPORT_SYMBOL(dm_put_device); ++EXPORT_SYMBOL(dm_table_event); ++EXPORT_SYMBOL(dm_table_get_mode); +--- linux-2.4.22/drivers/md/dm-target.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm-target.c Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,188 @@ ++/* ++ * Copyright (C) 2001 Sistina Software (UK) Limited ++ * ++ * This file is released under the GPL. 
++ */ ++ ++#include "dm.h" ++ ++#include ++#include ++#include ++ ++struct tt_internal { ++ struct target_type tt; ++ ++ struct list_head list; ++ long use; ++}; ++ ++static LIST_HEAD(_targets); ++static DECLARE_RWSEM(_lock); ++ ++#define DM_MOD_NAME_SIZE 32 ++ ++static inline struct tt_internal *__find_target_type(const char *name) ++{ ++ struct list_head *tih; ++ struct tt_internal *ti; ++ ++ list_for_each(tih, &_targets) { ++ ti = list_entry(tih, struct tt_internal, list); ++ ++ if (!strcmp(name, ti->tt.name)) ++ return ti; ++ } ++ ++ return NULL; ++} ++ ++static struct tt_internal *get_target_type(const char *name) ++{ ++ struct tt_internal *ti; ++ ++ down_read(&_lock); ++ ti = __find_target_type(name); ++ ++ if (ti) { ++ if (ti->use == 0 && ti->tt.module) ++ __MOD_INC_USE_COUNT(ti->tt.module); ++ ti->use++; ++ } ++ up_read(&_lock); ++ ++ return ti; ++} ++ ++static void load_module(const char *name) ++{ ++ char module_name[DM_MOD_NAME_SIZE] = "dm-"; ++ ++ /* Length check for strcat() below */ ++ if (strlen(name) > (DM_MOD_NAME_SIZE - 4)) ++ return; ++ ++ strcat(module_name, name); ++ request_module(module_name); ++} ++ ++struct target_type *dm_get_target_type(const char *name) ++{ ++ struct tt_internal *ti = get_target_type(name); ++ ++ if (!ti) { ++ load_module(name); ++ ti = get_target_type(name); ++ } ++ ++ return ti ? 
&ti->tt : NULL; ++} ++ ++void dm_put_target_type(struct target_type *t) ++{ ++ struct tt_internal *ti = (struct tt_internal *) t; ++ ++ down_read(&_lock); ++ if (--ti->use == 0 && ti->tt.module) ++ __MOD_DEC_USE_COUNT(ti->tt.module); ++ ++ if (ti->use < 0) ++ BUG(); ++ up_read(&_lock); ++ ++ return; ++} ++ ++static struct tt_internal *alloc_target(struct target_type *t) ++{ ++ struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL); ++ ++ if (ti) { ++ memset(ti, 0, sizeof(*ti)); ++ ti->tt = *t; ++ } ++ ++ return ti; ++} ++ ++int dm_register_target(struct target_type *t) ++{ ++ int rv = 0; ++ struct tt_internal *ti = alloc_target(t); ++ ++ if (!ti) ++ return -ENOMEM; ++ ++ down_write(&_lock); ++ if (__find_target_type(t->name)) { ++ kfree(ti); ++ rv = -EEXIST; ++ } else ++ list_add(&ti->list, &_targets); ++ ++ up_write(&_lock); ++ return rv; ++} ++ ++int dm_unregister_target(struct target_type *t) ++{ ++ struct tt_internal *ti; ++ ++ down_write(&_lock); ++ if (!(ti = __find_target_type(t->name))) { ++ up_write(&_lock); ++ return -EINVAL; ++ } ++ ++ if (ti->use) { ++ up_write(&_lock); ++ return -ETXTBSY; ++ } ++ ++ list_del(&ti->list); ++ kfree(ti); ++ ++ up_write(&_lock); ++ return 0; ++} ++ ++/* ++ * io-err: always fails an io, useful for bringing ++ * up LVs that have holes in them. 
++ */ ++static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) ++{ ++ return 0; ++} ++ ++static void io_err_dtr(struct dm_target *ti) ++{ ++ /* empty */ ++} ++ ++static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw, ++ union map_info *map_context) ++{ ++ return -EIO; ++} ++ ++static struct target_type error_target = { ++ .name = "error", ++ .ctr = io_err_ctr, ++ .dtr = io_err_dtr, ++ .map = io_err_map, ++}; ++ ++int dm_target_init(void) ++{ ++ return dm_register_target(&error_target); ++} ++ ++void dm_target_exit(void) ++{ ++ if (dm_unregister_target(&error_target)) ++ DMWARN("error target unregistration failed"); ++} ++ ++EXPORT_SYMBOL(dm_register_target); ++EXPORT_SYMBOL(dm_unregister_target); +--- linux-2.4.22/drivers/md/dm.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm.c Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,1115 @@ ++/* ++ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. ++ */ ++ ++#include "dm.h" ++#include "kcopyd.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++static const char *_name = DM_NAME; ++#define DEFAULT_READ_AHEAD 64 ++ ++struct dm_io { ++ struct mapped_device *md; ++ ++ struct dm_target *ti; ++ int rw; ++ union map_info map_context; ++ void (*end_io) (struct buffer_head * bh, int uptodate); ++ void *context; ++}; ++ ++struct deferred_io { ++ int rw; ++ struct buffer_head *bh; ++ struct deferred_io *next; ++}; ++ ++/* ++ * Bits for the md->flags field. ++ */ ++#define DMF_BLOCK_IO 0 ++#define DMF_SUSPENDED 1 ++ ++struct mapped_device { ++ struct rw_semaphore lock; ++ atomic_t holders; ++ ++ kdev_t dev; ++ unsigned long flags; ++ ++ /* ++ * A list of ios that arrived while we were suspended. ++ */ ++ atomic_t pending; ++ wait_queue_head_t wait; ++ struct deferred_io *deferred; ++ ++ /* ++ * The current mapping. 
++ */ ++ struct dm_table *map; ++ ++ /* ++ * io objects are allocated from here. ++ */ ++ mempool_t *io_pool; ++ ++ /* ++ * Event handling. ++ */ ++ uint32_t event_nr; ++ wait_queue_head_t eventq; ++}; ++ ++#define MIN_IOS 256 ++static kmem_cache_t *_io_cache; ++ ++static struct mapped_device *get_kdev(kdev_t dev); ++static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh); ++static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb); ++ ++/*----------------------------------------------------------------- ++ * In order to avoid the 256 minor number limit we are going to ++ * register more major numbers as neccessary. ++ *---------------------------------------------------------------*/ ++#define MAX_MINORS (1 << MINORBITS) ++ ++struct major_details { ++ unsigned int major; ++ ++ int transient; ++ struct list_head transient_list; ++ ++ unsigned int first_free_minor; ++ int nr_free_minors; ++ ++ struct mapped_device *mds[MAX_MINORS]; ++ int blk_size[MAX_MINORS]; ++ int blksize_size[MAX_MINORS]; ++ int hardsect_size[MAX_MINORS]; ++}; ++ ++static struct rw_semaphore _dev_lock; ++static struct major_details *_majors[MAX_BLKDEV]; ++ ++/* ++ * This holds a list of majors that non-specified device numbers ++ * may be allocated from. Only majors with free minors appear on ++ * this list. ++ */ ++static LIST_HEAD(_transients_free); ++ ++static int __alloc_major(unsigned int major, struct major_details **result) ++{ ++ int r; ++ unsigned int transient = !major; ++ struct major_details *maj; ++ ++ /* Major already allocated? 
*/ ++ if (major && _majors[major]) ++ return 0; ++ ++ maj = kmalloc(sizeof(*maj), GFP_KERNEL); ++ if (!maj) ++ return -ENOMEM; ++ ++ memset(maj, 0, sizeof(*maj)); ++ INIT_LIST_HEAD(&maj->transient_list); ++ ++ maj->nr_free_minors = MAX_MINORS; ++ ++ r = register_blkdev(major, _name, &dm_blk_dops); ++ if (r < 0) { ++ DMERR("register_blkdev failed for %d", major); ++ kfree(maj); ++ return r; ++ } ++ if (r > 0) ++ major = r; ++ ++ maj->major = major; ++ ++ if (transient) { ++ maj->transient = transient; ++ list_add_tail(&maj->transient_list, &_transients_free); ++ } ++ ++ _majors[major] = maj; ++ ++ blk_size[major] = maj->blk_size; ++ blksize_size[major] = maj->blksize_size; ++ hardsect_size[major] = maj->hardsect_size; ++ read_ahead[major] = DEFAULT_READ_AHEAD; ++ ++ blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request); ++ ++ *result = maj; ++ return 0; ++} ++ ++static void __free_major(struct major_details *maj) ++{ ++ unsigned int major = maj->major; ++ ++ list_del(&maj->transient_list); ++ ++ read_ahead[major] = 0; ++ blk_size[major] = NULL; ++ blksize_size[major] = NULL; ++ hardsect_size[major] = NULL; ++ ++ _majors[major] = NULL; ++ kfree(maj); ++ ++ if (unregister_blkdev(major, _name) < 0) ++ DMERR("devfs_unregister_blkdev failed"); ++} ++ ++static void free_all_majors(void) ++{ ++ unsigned int major = ARRAY_SIZE(_majors); ++ ++ down_write(&_dev_lock); ++ ++ while (major--) ++ if (_majors[major]) ++ __free_major(_majors[major]); ++ ++ up_write(&_dev_lock); ++} ++ ++static void free_dev(kdev_t dev) ++{ ++ unsigned int major = major(dev); ++ unsigned int minor = minor(dev); ++ struct major_details *maj; ++ ++ down_write(&_dev_lock); ++ ++ maj = _majors[major]; ++ if (!maj) ++ goto out; ++ ++ maj->mds[minor] = NULL; ++ maj->nr_free_minors++; ++ ++ if (maj->nr_free_minors == MAX_MINORS) { ++ __free_major(maj); ++ goto out; ++ } ++ ++ if (!maj->transient) ++ goto out; ++ ++ if (maj->nr_free_minors == 1) ++ list_add_tail(&maj->transient_list, 
&_transients_free); ++ ++ if (minor < maj->first_free_minor) ++ maj->first_free_minor = minor; ++ ++ out: ++ up_write(&_dev_lock); ++} ++ ++static void __alloc_minor(struct major_details *maj, unsigned int minor, ++ struct mapped_device *md) ++{ ++ maj->mds[minor] = md; ++ md->dev = mk_kdev(maj->major, minor); ++ maj->nr_free_minors--; ++ ++ if (maj->transient && !maj->nr_free_minors) ++ list_del_init(&maj->transient_list); ++} ++ ++/* ++ * See if requested kdev_t is available. ++ */ ++static int specific_dev(kdev_t dev, struct mapped_device *md) ++{ ++ int r = 0; ++ unsigned int major = major(dev); ++ unsigned int minor = minor(dev); ++ struct major_details *maj; ++ ++ if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) { ++ DMWARN("device number requested out of range (%d, %d)", ++ major, minor); ++ return -EINVAL; ++ } ++ ++ down_write(&_dev_lock); ++ maj = _majors[major]; ++ ++ /* Register requested major? */ ++ if (!maj) { ++ r = __alloc_major(major, &maj); ++ if (r) ++ goto out; ++ ++ major = maj->major; ++ } ++ ++ if (maj->mds[minor]) { ++ r = -EBUSY; ++ goto out; ++ } ++ ++ __alloc_minor(maj, minor, md); ++ ++ out: ++ up_write(&_dev_lock); ++ ++ return r; ++} ++ ++/* ++ * Find first unused device number, requesting a new major number if required. 
++ */ ++static int first_free_dev(struct mapped_device *md) ++{ ++ int r = 0; ++ struct major_details *maj; ++ ++ down_write(&_dev_lock); ++ ++ if (list_empty(&_transients_free)) { ++ r = __alloc_major(0, &maj); ++ if (r) ++ goto out; ++ } else ++ maj = list_entry(_transients_free.next, struct major_details, ++ transient_list); ++ ++ while (maj->mds[maj->first_free_minor++]) ++ ; ++ ++ __alloc_minor(maj, maj->first_free_minor - 1, md); ++ ++ out: ++ up_write(&_dev_lock); ++ ++ return r; ++} ++ ++static struct mapped_device *get_kdev(kdev_t dev) ++{ ++ struct mapped_device *md; ++ struct major_details *maj; ++ ++ down_read(&_dev_lock); ++ maj = _majors[major(dev)]; ++ if (!maj) { ++ md = NULL; ++ goto out; ++ } ++ md = maj->mds[minor(dev)]; ++ if (md) ++ dm_get(md); ++ out: ++ up_read(&_dev_lock); ++ ++ return md; ++} ++ ++/*----------------------------------------------------------------- ++ * init/exit code ++ *---------------------------------------------------------------*/ ++ ++static __init int local_init(void) ++{ ++ init_rwsem(&_dev_lock); ++ ++ /* allocate a slab for the dm_ios */ ++ _io_cache = kmem_cache_create("dm io", ++ sizeof(struct dm_io), 0, 0, NULL, NULL); ++ ++ if (!_io_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static void local_exit(void) ++{ ++ kmem_cache_destroy(_io_cache); ++ free_all_majors(); ++ ++ DMINFO("cleaned up"); ++} ++ ++/* ++ * We have a lot of init/exit functions, so it seems easier to ++ * store them in an array. The disposable macro 'xx' ++ * expands a prefix into a pair of function names. 
++ */ ++static struct { ++ int (*init) (void); ++ void (*exit) (void); ++ ++} _inits[] = { ++#define xx(n) {n ## _init, n ## _exit}, ++ xx(local) ++ xx(kcopyd) ++ xx(dm_target) ++ xx(dm_linear) ++ xx(dm_stripe) ++ xx(dm_snapshot) ++ xx(dm_interface) ++#undef xx ++}; ++ ++static int __init dm_init(void) ++{ ++ const int count = ARRAY_SIZE(_inits); ++ ++ int r, i; ++ ++ for (i = 0; i < count; i++) { ++ r = _inits[i].init(); ++ if (r) ++ goto bad; ++ } ++ ++ return 0; ++ ++ bad: ++ while (i--) ++ _inits[i].exit(); ++ ++ return r; ++} ++ ++static void __exit dm_exit(void) ++{ ++ int i = ARRAY_SIZE(_inits); ++ ++ while (i--) ++ _inits[i].exit(); ++} ++ ++/* ++ * Block device functions ++ */ ++static int dm_blk_open(struct inode *inode, struct file *file) ++{ ++ struct mapped_device *md; ++ ++ md = get_kdev(inode->i_rdev); ++ if (!md) ++ return -ENXIO; ++ ++ return 0; ++} ++ ++static int dm_blk_close(struct inode *inode, struct file *file) ++{ ++ struct mapped_device *md; ++ ++ md = get_kdev(inode->i_rdev); ++ dm_put(md); /* put the reference gained by dm_blk_open */ ++ dm_put(md); ++ return 0; ++} ++ ++static inline struct dm_io *alloc_io(struct mapped_device *md) ++{ ++ return mempool_alloc(md->io_pool, GFP_NOIO); ++} ++ ++static inline void free_io(struct mapped_device *md, struct dm_io *io) ++{ ++ mempool_free(io, md->io_pool); ++} ++ ++static inline struct deferred_io *alloc_deferred(void) ++{ ++ return kmalloc(sizeof(struct deferred_io), GFP_NOIO); ++} ++ ++static inline void free_deferred(struct deferred_io *di) ++{ ++ kfree(di); ++} ++ ++static inline sector_t volume_size(kdev_t dev) ++{ ++ return blk_size[major(dev)][minor(dev)] << 1; ++} ++ ++/* FIXME: check this */ ++static int dm_blk_ioctl(struct inode *inode, struct file *file, ++ unsigned int command, unsigned long a) ++{ ++ kdev_t dev = inode->i_rdev; ++ long size; ++ ++ switch (command) { ++ case BLKROSET: ++ case BLKROGET: ++ case BLKRASET: ++ case BLKRAGET: ++ case BLKFLSBUF: ++ case BLKSSZGET: ++ 
//case BLKRRPART: /* Re-read partition tables */ ++ //case BLKPG: ++ case BLKELVGET: ++ case BLKELVSET: ++ case BLKBSZGET: ++ case BLKBSZSET: ++ return blk_ioctl(dev, command, a); ++ break; ++ ++ case BLKGETSIZE: ++ size = volume_size(dev); ++ if (copy_to_user((void *) a, &size, sizeof(long))) ++ return -EFAULT; ++ break; ++ ++ case BLKGETSIZE64: ++ size = volume_size(dev); ++ if (put_user((u64) ((u64) size) << 9, (u64 *) a)) ++ return -EFAULT; ++ break; ++ ++ case BLKRRPART: ++ return -ENOTTY; ++ ++ case LV_BMAP: ++ return dm_user_bmap(inode, (struct lv_bmap *) a); ++ ++ default: ++ DMWARN("unknown block ioctl 0x%x", command); ++ return -ENOTTY; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Add the buffer to the list of deferred io. ++ */ ++static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw) ++{ ++ struct deferred_io *di; ++ ++ di = alloc_deferred(); ++ if (!di) ++ return -ENOMEM; ++ ++ down_write(&md->lock); ++ ++ if (!test_bit(DMF_BLOCK_IO, &md->flags)) { ++ up_write(&md->lock); ++ free_deferred(di); ++ return 1; ++ } ++ ++ di->bh = bh; ++ di->rw = rw; ++ di->next = md->deferred; ++ md->deferred = di; ++ ++ up_write(&md->lock); ++ return 0; /* deferred successfully */ ++} ++ ++/* ++ * bh->b_end_io routine that decrements the pending count ++ * and then calls the original bh->b_end_io fn. ++ */ ++static void dec_pending(struct buffer_head *bh, int uptodate) ++{ ++ int r; ++ struct dm_io *io = bh->b_private; ++ dm_endio_fn endio = io->ti->type->end_io; ++ ++ if (endio) { ++ r = endio(io->ti, bh, io->rw, uptodate ? 
0 : -EIO, ++ &io->map_context); ++ if (r < 0) ++ uptodate = 0; ++ ++ else if (r > 0) ++ /* the target wants another shot at the io */ ++ return; ++ } ++ ++ if (atomic_dec_and_test(&io->md->pending)) ++ /* nudge anyone waiting on suspend queue */ ++ wake_up(&io->md->wait); ++ ++ bh->b_end_io = io->end_io; ++ bh->b_private = io->context; ++ free_io(io->md, io); ++ ++ bh->b_end_io(bh, uptodate); ++} ++ ++/* ++ * Do the bh mapping for a given leaf ++ */ ++static inline int __map_buffer(struct mapped_device *md, int rw, ++ struct buffer_head *bh, struct dm_io *io) ++{ ++ struct dm_target *ti; ++ ++ if (!md->map) ++ return -EINVAL; ++ ++ ti = dm_table_find_target(md->map, bh->b_rsector); ++ if (!ti->type) ++ return -EINVAL; ++ ++ /* hook the end io request fn */ ++ atomic_inc(&md->pending); ++ io->md = md; ++ io->ti = ti; ++ io->rw = rw; ++ io->end_io = bh->b_end_io; ++ io->context = bh->b_private; ++ bh->b_end_io = dec_pending; ++ bh->b_private = io; ++ ++ return ti->type->map(ti, bh, rw, &io->map_context); ++} ++ ++/* ++ * Checks to see if we should be deferring io, if so it queues it ++ * and returns 1. ++ */ ++static inline int __deferring(struct mapped_device *md, int rw, ++ struct buffer_head *bh) ++{ ++ int r; ++ ++ /* ++ * If we're suspended we have to queue this io for later. ++ */ ++ while (test_bit(DMF_BLOCK_IO, &md->flags)) { ++ up_read(&md->lock); ++ ++ /* ++ * There's no point deferring a read ahead ++ * request, just drop it. 
++ */ ++ if (rw == READA) { ++ down_read(&md->lock); ++ return -EIO; ++ } ++ ++ r = queue_io(md, bh, rw); ++ down_read(&md->lock); ++ ++ if (r < 0) ++ return r; ++ ++ if (r == 0) ++ return 1; /* deferred successfully */ ++ ++ } ++ ++ return 0; ++} ++ ++static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh) ++{ ++ int r; ++ struct dm_io *io; ++ struct mapped_device *md; ++ ++ md = get_kdev(bh->b_rdev); ++ if (!md) { ++ buffer_IO_error(bh); ++ return 0; ++ } ++ ++ io = alloc_io(md); ++ down_read(&md->lock); ++ ++ r = __deferring(md, rw, bh); ++ if (r < 0) ++ goto bad; ++ ++ else if (!r) { ++ /* not deferring */ ++ r = __map_buffer(md, rw, bh, io); ++ if (r < 0) ++ goto bad; ++ } else ++ r = 0; ++ ++ up_read(&md->lock); ++ dm_put(md); ++ return r; ++ ++ bad: ++ buffer_IO_error(bh); ++ up_read(&md->lock); ++ dm_put(md); ++ return 0; ++} ++ ++static int check_dev_size(kdev_t dev, unsigned long block) ++{ ++ unsigned int major = major(dev); ++ unsigned int minor = minor(dev); ++ ++ /* FIXME: check this */ ++ unsigned long max_sector = (blk_size[major][minor] << 1) + 1; ++ unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9); ++ ++ return (sector > max_sector) ? 0 : 1; ++} ++ ++/* ++ * Creates a dummy buffer head and maps it (for lilo). 
++ *
++ * Returns -EPERM while io to the device is blocked and -EINVAL if
++ * the block lies beyond the device or no table is bound; otherwise
++ * the value returned by the target's map function.  *r_dev and
++ * *r_block are only written when that value is 0.
++ */
++static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
++		  kdev_t *r_dev, unsigned long *r_block)
++{
++	struct buffer_head bh;
++	struct dm_target *ti;
++	union map_info map_context;
++	int r;
++
++	if (test_bit(DMF_BLOCK_IO, &md->flags)) {
++		return -EPERM;
++	}
++
++	if (!check_dev_size(dev, block)) {
++		return -EINVAL;
++	}
++
++	if (!md->map)
++		return -EINVAL;
++
++	/* setup dummy bh */
++	memset(&bh, 0, sizeof(bh));
++	bh.b_blocknr = block;
++	bh.b_dev = bh.b_rdev = dev;
++	bh.b_size = blksize_size[major(dev)][minor(dev)];
++	bh.b_rsector = block * (bh.b_size >> 9);
++
++	/* find target */
++	ti = dm_table_find_target(md->map, bh.b_rsector);
++
++	/* do the mapping */
++	r = ti->type->map(ti, &bh, READ, &map_context);
++
++	/*
++	 * end_io is an optional hook: dec_pending() checks it before
++	 * calling it and the built-in error target does not set it,
++	 * so it must not be called blindly here either.
++	 */
++	if (ti->type->end_io)
++		ti->type->end_io(ti, &bh, READ, 0, &map_context);
++
++	if (!r) {
++		*r_dev = bh.b_rdev;
++		*r_block = bh.b_rsector / (bh.b_size >> 9);
++	}
++
++	return r;
++}
++
++/*
++ * Marshals arguments and results between user and kernel space.
++ *
++ * Reads the block number from lvb->lv_block, maps it with __bmap()
++ * under the md read lock and, on success, writes the resulting
++ * device and block back to lvb.  Returns -EFAULT if user memory
++ * cannot be accessed and -ENXIO if the inode's device is unknown.
++ */
++static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
++{
++	struct mapped_device *md;
++	unsigned long block, r_block;
++	kdev_t r_dev;
++	int r;
++
++	if (get_user(block, &lvb->lv_block))
++		return -EFAULT;
++
++	md = get_kdev(inode->i_rdev);
++	if (!md)
++		return -ENXIO;
++
++	down_read(&md->lock);
++	r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
++	up_read(&md->lock);
++	dm_put(md);
++
++	if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
++		   put_user(r_block, &lvb->lv_block)))
++		r = -EFAULT;
++
++	return r;
++}
++
++/*
++ * Release the device number, destroy the io pool and free md.
++ * md->io_pool must have been created before this is called.
++ */
++static void free_md(struct mapped_device *md)
++{
++	free_dev(md->dev);
++	mempool_destroy(md->io_pool);
++	kfree(md);
++}
++
++/*
++ * Allocate and initialise a blank device with a given minor.
++ *
++ * A zero 'dev' takes the first free device number; any other value
++ * requests that specific number.  Returns the new device with one
++ * holder, or NULL on failure.
++ */
++static struct mapped_device *alloc_md(kdev_t dev)
++{
++	int r;
++	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
++
++	if (!md) {
++		DMWARN("unable to allocate device, out of memory.");
++		return NULL;
++	}
++
++	memset(md, 0, sizeof(*md));
++
++	/* Allocate suitable device number */
++	if (!dev)
++		r = first_free_dev(md);
++	else
++		r = specific_dev(dev, md);
++
++	if (r) {
++		kfree(md);
++		return NULL;
++	}
++
++	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
++				     mempool_free_slab, _io_cache);
++	if (!md->io_pool) {
++		/*
++		 * Only the device number has been claimed so far.
++		 * free_md() would also destroy the io pool that was
++		 * never created and free md itself, so give the
++		 * number back by hand and free md exactly once.
++		 */
++		free_dev(md->dev);
++		kfree(md);
++		return NULL;
++	}
++
++	init_rwsem(&md->lock);
++	atomic_set(&md->holders, 1);
++	atomic_set(&md->pending, 0);
++	init_waitqueue_head(&md->wait);
++	init_waitqueue_head(&md->eventq);
++
++	return md;
++}
++
++/*
++ * The hardsect size for a mapped device is the largest hardsect size
++ * from the devices it maps onto.  The result is never below 512.
++ */
++static int __find_hardsect_size(struct list_head *devices)
++{
++	int result = 512, size;
++	struct list_head *tmp;
++
++	list_for_each (tmp, devices) {
++		struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
++		size = get_hardsect_size(dd->dev);
++		if (size > result)
++			result = size;
++	}
++
++	return result;
++}
++
++/*
++ * Bind a table to the device.
++ */ ++static void event_callback(void *context) ++{ ++ struct mapped_device *md = (struct mapped_device *) context; ++ ++ down_write(&md->lock); ++ md->event_nr++; ++ wake_up_interruptible(&md->eventq); ++ up_write(&md->lock); ++} ++ ++static int __bind(struct mapped_device *md, struct dm_table *t) ++{ ++ unsigned int minor = minor(md->dev); ++ unsigned int major = major(md->dev); ++ md->map = t; ++ ++ /* in k */ ++ blk_size[major][minor] = dm_table_get_size(t) >> 1; ++ blksize_size[major][minor] = BLOCK_SIZE; ++ hardsect_size[major][minor] = ++ __find_hardsect_size(dm_table_get_devices(t)); ++ register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]); ++ ++ dm_table_event_callback(md->map, event_callback, md); ++ dm_table_get(t); ++ return 0; ++} ++ ++static void __unbind(struct mapped_device *md) ++{ ++ unsigned int minor = minor(md->dev); ++ unsigned int major = major(md->dev); ++ ++ if (md->map) { ++ dm_table_event_callback(md->map, NULL, NULL); ++ dm_table_put(md->map); ++ md->map = NULL; ++ ++ } ++ ++ blk_size[major][minor] = 0; ++ blksize_size[major][minor] = 0; ++ hardsect_size[major][minor] = 0; ++} ++ ++/* ++ * Constructor for a new device. ++ */ ++int dm_create(kdev_t dev, struct mapped_device **result) ++{ ++ struct mapped_device *md; ++ ++ md = alloc_md(dev); ++ if (!md) ++ return -ENXIO; ++ ++ __unbind(md); /* Ensure zero device size */ ++ ++ *result = md; ++ return 0; ++} ++ ++void dm_get(struct mapped_device *md) ++{ ++ atomic_inc(&md->holders); ++} ++ ++void dm_put(struct mapped_device *md) ++{ ++ if (atomic_dec_and_test(&md->holders)) { ++ if (md->map) ++ dm_table_suspend_targets(md->map); ++ __unbind(md); ++ free_md(md); ++ } ++} ++ ++/* ++ * Requeue the deferred io by calling generic_make_request. 
++ *
++ * Each list entry is freed once its buffer has been resubmitted.
++ */
++static void flush_deferred_io(struct deferred_io *c)
++{
++	struct deferred_io *n;
++
++	while (c) {
++		n = c->next;
++		generic_make_request(c->rw, c->bh);
++		free_deferred(c);
++		c = n;
++	}
++}
++
++/*
++ * Swap in a new table (destroying old one).
++ *
++ * Returns -EPERM if a table is bound and the device has not been
++ * suspended, otherwise the result of binding the new table.  The
++ * md lock is released on every return path.
++ */
++int dm_swap_table(struct mapped_device *md, struct dm_table *table)
++{
++	int r;
++
++	down_write(&md->lock);
++
++	/*
++	 * The device must be suspended, or have no table bound yet.
++	 */
++	if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) {
++		up_write(&md->lock);
++		return -EPERM;
++	}
++
++	__unbind(md);
++	r = __bind(md, table);
++
++	/* unlock before returning, whether or not the bind worked */
++	up_write(&md->lock);
++	return r;
++}
++
++/*
++ * We need to be able to change a mapping table under a mounted
++ * filesystem.  For example we might want to move some data in
++ * the background.  Before the table can be swapped with
++ * dm_swap_table, dm_suspend must be called to flush any in
++ * flight io and ensure that any further io gets deferred.
++ *
++ * Returns -EINVAL if io is already blocked, and -EINTR (with io
++ * unblocked again) if ios were still pending when the wait ended.
++ */
++int dm_suspend(struct mapped_device *md)
++{
++	int r = 0;
++	DECLARE_WAITQUEUE(wait, current);
++
++	down_write(&md->lock);
++
++	/*
++	 * First we set the BLOCK_IO flag so no more ios will be
++	 * mapped.
++	 */
++	if (test_bit(DMF_BLOCK_IO, &md->flags)) {
++		up_write(&md->lock);
++		return -EINVAL;
++	}
++
++	set_bit(DMF_BLOCK_IO, &md->flags);
++	add_wait_queue(&md->wait, &wait);
++	up_write(&md->lock);
++
++	/*
++	 * Then we wait for the already mapped ios to
++	 * complete.
++	 */
++	run_task_queue(&tq_disk);
++	while (1) {
++		set_current_state(TASK_INTERRUPTIBLE);
++
++		if (!atomic_read(&md->pending) || signal_pending(current))
++			break;
++
++		schedule();
++	}
++	set_current_state(TASK_RUNNING);
++
++	down_write(&md->lock);
++	remove_wait_queue(&md->wait, &wait);
++
++	/* did we flush everything ? */
++	if (atomic_read(&md->pending)) {
++		clear_bit(DMF_BLOCK_IO, &md->flags);
++		r = -EINTR;
++	} else {
++		set_bit(DMF_SUSPENDED, &md->flags);
++		if (md->map)
++			dm_table_suspend_targets(md->map);
++	}
++	up_write(&md->lock);
++
++	return r;
++}
++
++/*
++ * Resume a suspended device: resume the targets, clear both flags
++ * and resubmit the io that was deferred in the meantime.  Returns
++ * -EINVAL if the device is not suspended.
++ */
++int dm_resume(struct mapped_device *md)
++{
++	struct deferred_io *def;
++
++	down_write(&md->lock);
++	if (!test_bit(DMF_SUSPENDED, &md->flags)) {
++		up_write(&md->lock);
++		return -EINVAL;
++	}
++
++	if (md->map)
++		dm_table_resume_targets(md->map);
++
++	clear_bit(DMF_SUSPENDED, &md->flags);
++	clear_bit(DMF_BLOCK_IO, &md->flags);
++	def = md->deferred;
++	md->deferred = NULL;
++	up_write(&md->lock);
++
++	/* resubmit outside the lock, using the detached list */
++	flush_deferred_io(def);
++	run_task_queue(&tq_disk);
++
++	return 0;
++}
++
++/*
++ * Return the currently bound table with an extra reference taken,
++ * or NULL if no table is bound.
++ */
++struct dm_table *dm_get_table(struct mapped_device *md)
++{
++	struct dm_table *t;
++
++	down_read(&md->lock);
++	t = md->map;
++	if (t)
++		dm_table_get(t);
++	up_read(&md->lock);
++
++	return t;
++}
++
++/*-----------------------------------------------------------------
++ * Event notification.
++ *---------------------------------------------------------------*/ ++uint32_t dm_get_event_nr(struct mapped_device *md) ++{ ++ uint32_t r; ++ ++ down_read(&md->lock); ++ r = md->event_nr; ++ up_read(&md->lock); ++ ++ return r; ++} ++ ++int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq, ++ uint32_t event_nr) ++{ ++ down_write(&md->lock); ++ if (event_nr != md->event_nr) { ++ up_write(&md->lock); ++ return 1; ++ } ++ ++ add_wait_queue(&md->eventq, wq); ++ up_write(&md->lock); ++ ++ return 0; ++} ++ ++const char *dm_kdevname(kdev_t dev) ++{ ++ static char buffer[32]; ++ sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev)); ++ return buffer; ++} ++ ++void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq) ++{ ++ down_write(&md->lock); ++ remove_wait_queue(&md->eventq, wq); ++ up_write(&md->lock); ++} ++ ++kdev_t dm_kdev(struct mapped_device *md) ++{ ++ kdev_t dev; ++ ++ down_read(&md->lock); ++ dev = md->dev; ++ up_read(&md->lock); ++ ++ return dev; ++} ++ ++int dm_suspended(struct mapped_device *md) ++{ ++ return test_bit(DMF_SUSPENDED, &md->flags); ++} ++ ++struct block_device_operations dm_blk_dops = { ++ .open = dm_blk_open, ++ .release = dm_blk_close, ++ .ioctl = dm_blk_ioctl, ++ .owner = THIS_MODULE ++}; ++ ++/* ++ * module hooks ++ */ ++module_init(dm_init); ++module_exit(dm_exit); ++ ++MODULE_DESCRIPTION(DM_NAME " driver"); ++MODULE_AUTHOR("Joe Thornber "); ++MODULE_LICENSE("GPL"); ++ ++EXPORT_SYMBOL(dm_kdevname); +--- linux-2.4.22/drivers/md/dm.h Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/dm.h Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,175 @@ ++/* ++ * Internal header file for device mapper ++ * ++ * Copyright (C) 2001, 2002 Sistina Software ++ * ++ * This file is released under the LGPL. ++ */ ++ ++#ifndef DM_INTERNAL_H ++#define DM_INTERNAL_H ++ ++#include ++#include ++#include ++#include ++ ++#define DM_NAME "device-mapper" ++#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) ++#define DMERR(f, x...) 
printk(KERN_ERR DM_NAME ": " f "\n" , ## x) ++#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) ++ ++/* ++ * FIXME: I think this should be with the definition of sector_t ++ * in types.h. ++ */ ++#ifdef CONFIG_LBD ++#define SECTOR_FORMAT "%Lu" ++#else ++#define SECTOR_FORMAT "%lu" ++#endif ++ ++#define SECTOR_SHIFT 9 ++#define SECTOR_SIZE (1 << SECTOR_SHIFT) ++ ++extern struct block_device_operations dm_blk_dops; ++ ++/* ++ * List of devices that a metadevice uses and should open/close. ++ */ ++struct dm_dev { ++ struct list_head list; ++ ++ atomic_t count; ++ int mode; ++ kdev_t dev; ++ struct block_device *bdev; ++}; ++ ++struct dm_table; ++struct mapped_device; ++ ++/*----------------------------------------------------------------- ++ * Functions for manipulating a struct mapped_device. ++ * Drop the reference with dm_put when you finish with the object. ++ *---------------------------------------------------------------*/ ++int dm_create(kdev_t dev, struct mapped_device **md); ++ ++/* ++ * Reference counting for md. ++ */ ++void dm_get(struct mapped_device *md); ++void dm_put(struct mapped_device *md); ++ ++/* ++ * A device can still be used while suspended, but I/O is deferred. ++ */ ++int dm_suspend(struct mapped_device *md); ++int dm_resume(struct mapped_device *md); ++ ++/* ++ * The device must be suspended before calling this method. ++ */ ++int dm_swap_table(struct mapped_device *md, struct dm_table *t); ++ ++/* ++ * Drop a reference on the table when you've finished with the ++ * result. ++ */ ++struct dm_table *dm_get_table(struct mapped_device *md); ++ ++/* ++ * Event functions. ++ */ ++uint32_t dm_get_event_nr(struct mapped_device *md); ++int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq, ++ uint32_t event_nr); ++void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq); ++ ++/* ++ * Info functions. 
++ */ ++kdev_t dm_kdev(struct mapped_device *md); ++int dm_suspended(struct mapped_device *md); ++ ++/*----------------------------------------------------------------- ++ * Functions for manipulating a table. Tables are also reference ++ * counted. ++ *---------------------------------------------------------------*/ ++int dm_table_create(struct dm_table **result, int mode, unsigned num_targets); ++ ++void dm_table_get(struct dm_table *t); ++void dm_table_put(struct dm_table *t); ++ ++int dm_table_add_target(struct dm_table *t, const char *type, ++ sector_t start, sector_t len, char *params); ++int dm_table_complete(struct dm_table *t); ++void dm_table_event_callback(struct dm_table *t, ++ void (*fn)(void *), void *context); ++void dm_table_event(struct dm_table *t); ++sector_t dm_table_get_size(struct dm_table *t); ++struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); ++struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); ++unsigned int dm_table_get_num_targets(struct dm_table *t); ++struct list_head *dm_table_get_devices(struct dm_table *t); ++int dm_table_get_mode(struct dm_table *t); ++void dm_table_suspend_targets(struct dm_table *t); ++void dm_table_resume_targets(struct dm_table *t); ++ ++/*----------------------------------------------------------------- ++ * A registry of target types. ++ *---------------------------------------------------------------*/ ++int dm_target_init(void); ++void dm_target_exit(void); ++struct target_type *dm_get_target_type(const char *name); ++void dm_put_target_type(struct target_type *t); ++ ++ ++/*----------------------------------------------------------------- ++ * Useful inlines. 
++ *---------------------------------------------------------------*/ ++static inline int array_too_big(unsigned long fixed, unsigned long obj, ++ unsigned long num) ++{ ++ return (num > (ULONG_MAX - fixed) / obj); ++} ++ ++/* ++ * ceiling(n / size) * size ++ */ ++static inline unsigned long dm_round_up(unsigned long n, unsigned long size) ++{ ++ unsigned long r = n % size; ++ return n + (r ? (size - r) : 0); ++} ++ ++/* ++ * Ceiling(n / size) ++ */ ++static inline unsigned long dm_div_up(unsigned long n, unsigned long size) ++{ ++ return dm_round_up(n, size) / size; ++} ++ ++const char *dm_kdevname(kdev_t dev); ++ ++/* ++ * The device-mapper can be driven through one of two interfaces; ++ * ioctl or filesystem, depending which patch you have applied. ++ */ ++int dm_interface_init(void); ++void dm_interface_exit(void); ++ ++/* ++ * Targets for linear and striped mappings ++ */ ++int dm_linear_init(void); ++void dm_linear_exit(void); ++ ++int dm_stripe_init(void); ++void dm_stripe_exit(void); ++ ++int dm_snapshot_init(void); ++void dm_snapshot_exit(void); ++ ++#endif +--- linux-2.4.22/drivers/md/kcopyd.c Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/kcopyd.c Tue Nov 18 16:47:41 2003 +@@ -0,0 +1,666 @@ ++/* ++ * Copyright (C) 2002 Sistina Software (UK) Limited. ++ * ++ * This file is released under the GPL. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "kcopyd.h" ++#include "dm-daemon.h" ++ ++/* FIXME: this is only needed for the DMERR macros */ ++#include "dm.h" ++ ++static struct dm_daemon _kcopyd; ++ ++#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE) ++#define SUB_JOB_SIZE 128 ++#define PAGES_PER_SUB_JOB (SUB_JOB_SIZE / SECTORS_PER_PAGE) ++#define SUB_JOB_COUNT 8 ++ ++/*----------------------------------------------------------------- ++ * Each kcopyd client has its own little pool of preallocated ++ * pages for kcopyd io. 
++ *---------------------------------------------------------------*/ ++struct kcopyd_client { ++ struct list_head list; ++ ++ spinlock_t lock; ++ struct list_head pages; ++ unsigned int nr_pages; ++ unsigned int nr_free_pages; ++ unsigned int max_split; ++}; ++ ++static inline void __push_page(struct kcopyd_client *kc, struct page *p) ++{ ++ list_add(&p->list, &kc->pages); ++ kc->nr_free_pages++; ++} ++ ++static inline struct page *__pop_page(struct kcopyd_client *kc) ++{ ++ struct page *p; ++ ++ p = list_entry(kc->pages.next, struct page, list); ++ list_del(&p->list); ++ kc->nr_free_pages--; ++ ++ return p; ++} ++ ++static int kcopyd_get_pages(struct kcopyd_client *kc, ++ unsigned int nr, struct list_head *pages) ++{ ++ struct page *p; ++ INIT_LIST_HEAD(pages); ++ ++ spin_lock(&kc->lock); ++ if (kc->nr_free_pages < nr) { ++ spin_unlock(&kc->lock); ++ return -ENOMEM; ++ } ++ ++ while (nr--) { ++ p = __pop_page(kc); ++ list_add(&p->list, pages); ++ } ++ spin_unlock(&kc->lock); ++ ++ return 0; ++} ++ ++static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages) ++{ ++ struct list_head *tmp, *tmp2; ++ ++ spin_lock(&kc->lock); ++ list_for_each_safe (tmp, tmp2, pages) ++ __push_page(kc, list_entry(tmp, struct page, list)); ++ spin_unlock(&kc->lock); ++} ++ ++/* ++ * These three functions resize the page pool. 
++ */ ++static void release_pages(struct list_head *pages) ++{ ++ struct page *p; ++ struct list_head *tmp, *tmp2; ++ ++ list_for_each_safe (tmp, tmp2, pages) { ++ p = list_entry(tmp, struct page, list); ++ UnlockPage(p); ++ __free_page(p); ++ } ++} ++ ++static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr) ++{ ++ unsigned int i; ++ struct page *p; ++ LIST_HEAD(new); ++ ++ for (i = 0; i < nr; i++) { ++ p = alloc_page(GFP_KERNEL); ++ if (!p) { ++ release_pages(&new); ++ return -ENOMEM; ++ } ++ ++ LockPage(p); ++ list_add(&p->list, &new); ++ } ++ ++ kcopyd_put_pages(kc, &new); ++ kc->nr_pages += nr; ++ kc->max_split = kc->nr_pages / PAGES_PER_SUB_JOB; ++ if (kc->max_split > SUB_JOB_COUNT) ++ kc->max_split = SUB_JOB_COUNT; ++ ++ return 0; ++} ++ ++static void client_free_pages(struct kcopyd_client *kc) ++{ ++ BUG_ON(kc->nr_free_pages != kc->nr_pages); ++ release_pages(&kc->pages); ++ kc->nr_free_pages = kc->nr_pages = 0; ++} ++ ++/*----------------------------------------------------------------- ++ * kcopyd_jobs need to be allocated by the *clients* of kcopyd, ++ * for this reason we use a mempool to prevent the client from ++ * ever having to do io (which could cause a deadlock). ++ *---------------------------------------------------------------*/ ++struct kcopyd_job { ++ struct kcopyd_client *kc; ++ struct list_head list; ++ unsigned int flags; ++ ++ /* ++ * Error state of the job. ++ */ ++ int read_err; ++ unsigned int write_err; ++ ++ /* ++ * Either READ or WRITE ++ */ ++ int rw; ++ struct io_region source; ++ ++ /* ++ * The destinations for the transfer. ++ */ ++ unsigned int num_dests; ++ struct io_region dests[KCOPYD_MAX_REGIONS]; ++ ++ sector_t offset; ++ unsigned int nr_pages; ++ struct list_head pages; ++ ++ /* ++ * Set this to ensure you are notified when the job has ++ * completed. 'context' is for callback to use. 
++ */ ++ kcopyd_notify_fn fn; ++ void *context; ++ ++ /* ++ * These fields are only used if the job has been split ++ * into more manageable parts. ++ */ ++ struct semaphore lock; ++ atomic_t sub_jobs; ++ sector_t progress; ++}; ++ ++/* FIXME: this should scale with the number of pages */ ++#define MIN_JOBS 512 ++ ++static kmem_cache_t *_job_cache; ++static mempool_t *_job_pool; ++ ++/* ++ * We maintain three lists of jobs: ++ * ++ * i) jobs waiting for pages ++ * ii) jobs that have pages, and are waiting for the io to be issued. ++ * iii) jobs that have completed. ++ * ++ * All three of these are protected by job_lock. ++ */ ++static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED; ++ ++static LIST_HEAD(_complete_jobs); ++static LIST_HEAD(_io_jobs); ++static LIST_HEAD(_pages_jobs); ++ ++static int jobs_init(void) ++{ ++ INIT_LIST_HEAD(&_complete_jobs); ++ INIT_LIST_HEAD(&_io_jobs); ++ INIT_LIST_HEAD(&_pages_jobs); ++ ++ _job_cache = kmem_cache_create("kcopyd-jobs", ++ sizeof(struct kcopyd_job), ++ __alignof__(struct kcopyd_job), ++ 0, NULL, NULL); ++ if (!_job_cache) ++ return -ENOMEM; ++ ++ _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, ++ mempool_free_slab, _job_cache); ++ if (!_job_pool) { ++ kmem_cache_destroy(_job_cache); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static void jobs_exit(void) ++{ ++ BUG_ON(!list_empty(&_complete_jobs)); ++ BUG_ON(!list_empty(&_io_jobs)); ++ BUG_ON(!list_empty(&_pages_jobs)); ++ ++ mempool_destroy(_job_pool); ++ kmem_cache_destroy(_job_cache); ++} ++ ++/* ++ * Functions to push and pop a job onto the head of a given job ++ * list. 
++ */ ++static inline struct kcopyd_job *pop(struct list_head *jobs) ++{ ++ struct kcopyd_job *job = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&_job_lock, flags); ++ ++ if (!list_empty(jobs)) { ++ job = list_entry(jobs->next, struct kcopyd_job, list); ++ list_del(&job->list); ++ } ++ spin_unlock_irqrestore(&_job_lock, flags); ++ ++ return job; ++} ++ ++static inline void push(struct list_head *jobs, struct kcopyd_job *job) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&_job_lock, flags); ++ list_add_tail(&job->list, jobs); ++ spin_unlock_irqrestore(&_job_lock, flags); ++} ++ ++/* ++ * These three functions process 1 item from the corresponding ++ * job list. ++ * ++ * They return: ++ * < 0: error ++ * 0: success ++ * > 0: can't process yet. ++ */ ++static int run_complete_job(struct kcopyd_job *job) ++{ ++ void *context = job->context; ++ int read_err = job->read_err; ++ unsigned int write_err = job->write_err; ++ kcopyd_notify_fn fn = job->fn; ++ ++ kcopyd_put_pages(job->kc, &job->pages); ++ mempool_free(job, _job_pool); ++ fn(read_err, write_err, context); ++ return 0; ++} ++ ++static void complete_io(unsigned int error, void *context) ++{ ++ struct kcopyd_job *job = (struct kcopyd_job *) context; ++ ++ if (error) { ++ if (job->rw == WRITE) ++ job->write_err |= error; /* OR, not AND: write_err starts at 0 and is a bitset of failed destinations */ ++ else ++ job->read_err = 1; ++ ++ if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { ++ push(&_complete_jobs, job); ++ dm_daemon_wake(&_kcopyd); ++ return; ++ } ++ } ++ ++ if (job->rw == WRITE) ++ push(&_complete_jobs, job); ++ ++ else { ++ job->rw = WRITE; ++ push(&_io_jobs, job); ++ } ++ ++ dm_daemon_wake(&_kcopyd); ++} ++ ++/* ++ * Request io on as many buffer heads as we can currently get for ++ * a particular job.
++ */ ++static int run_io_job(struct kcopyd_job *job) ++{ ++ int r; ++ ++ if (job->rw == READ) ++ r = dm_io_async(1, &job->source, job->rw, ++ list_entry(job->pages.next, struct page, list), ++ job->offset, complete_io, job); ++ ++ else ++ r = dm_io_async(job->num_dests, job->dests, job->rw, ++ list_entry(job->pages.next, struct page, list), ++ job->offset, complete_io, job); ++ ++ return r; ++} ++ ++static int run_pages_job(struct kcopyd_job *job) ++{ ++ int r; ++ ++ job->nr_pages = dm_div_up(job->dests[0].count + job->offset, ++ SECTORS_PER_PAGE); ++ r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); ++ if (!r) { ++ /* this job is ready for io */ ++ push(&_io_jobs, job); ++ return 0; ++ } ++ ++ if (r == -ENOMEM) ++ /* can't complete now */ ++ return 1; ++ ++ return r; ++} ++ ++/* ++ * Run through a list for as long as possible. Returns the count ++ * of successful jobs. ++ */ ++static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) ++{ ++ struct kcopyd_job *job; ++ int r, count = 0; ++ ++ while ((job = pop(jobs))) { ++ ++ r = fn(job); ++ ++ if (r < 0) { ++ /* error this rogue job */ ++ if (job->rw == WRITE) ++ job->write_err = (unsigned int) -1; ++ else ++ job->read_err = 1; ++ push(&_complete_jobs, job); ++ break; ++ } ++ ++ if (r > 0) { ++ /* ++ * We couldn't service this job ATM, so ++ * push this job back onto the list. ++ */ ++ push(jobs, job); ++ break; ++ } ++ ++ count++; ++ } ++ ++ return count; ++} ++ ++/* ++ * kcopyd does this every time it's woken up. ++ */ ++static void do_work(void) ++{ ++ /* ++ * The order that these are called is *very* important. ++ * complete jobs can free some pages for pages jobs. ++ * Pages jobs when successful will jump onto the io jobs ++ * list. io jobs call wake when they complete and it all ++ * starts again. 
++ */ ++ process_jobs(&_complete_jobs, run_complete_job); ++ process_jobs(&_pages_jobs, run_pages_job); ++ process_jobs(&_io_jobs, run_io_job); ++ run_task_queue(&tq_disk); ++} ++ ++/* ++ * If we are copying a small region we just dispatch a single job ++ * to do the copy, otherwise the io has to be split up into many ++ * jobs. ++ */ ++static void dispatch_job(struct kcopyd_job *job) ++{ ++ push(&_pages_jobs, job); ++ dm_daemon_wake(&_kcopyd); ++} ++ ++static void segment_complete(int read_err, ++ unsigned int write_err, void *context) ++{ ++ /* FIXME: tidy this function */ ++ sector_t progress = 0; ++ sector_t count = 0; ++ struct kcopyd_job *job = (struct kcopyd_job *) context; ++ ++ down(&job->lock); ++ ++ /* update the error */ ++ if (read_err) ++ job->read_err = 1; ++ ++ if (write_err) ++ job->write_err |= write_err; /* accumulate: '&=' against the initial 0 could never record a failure */ ++ ++ /* ++ * Only dispatch more work if there hasn't been an error. ++ */ ++ if ((!job->read_err && !job->write_err) || ++ test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { ++ /* get the next chunk of work */ ++ progress = job->progress; ++ count = job->source.count - progress; ++ if (count) { ++ if (count > SUB_JOB_SIZE) ++ count = SUB_JOB_SIZE; ++ ++ job->progress += count; ++ } ++ } ++ up(&job->lock); ++ ++ if (count) { ++ int i; ++ struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO); ++ ++ memcpy(sub_job, job, sizeof(*job)); ++ sub_job->source.sector += progress; ++ sub_job->source.count = count; ++ ++ for (i = 0; i < job->num_dests; i++) { ++ sub_job->dests[i].sector += progress; ++ sub_job->dests[i].count = count; ++ } ++ ++ sub_job->fn = segment_complete; ++ sub_job->context = job; ++ dispatch_job(sub_job); ++ ++ } else if (atomic_dec_and_test(&job->sub_jobs)) { ++ ++ /* ++ * To avoid a race we must keep the job around ++ * until after the notify function has completed. ++ * Otherwise the client may try and stop the job ++ * after we've completed.
++ */ ++ job->fn(job->read_err, job->write_err, job->context); /* report errors accumulated over all sub jobs, not just the last one */ ++ mempool_free(job, _job_pool); ++ } ++} ++ ++/* ++ * Create some little jobs that will do the move between ++ * them. ++ */ ++static void split_job(struct kcopyd_job *job) ++{ ++ int nr; ++ ++ nr = dm_div_up(job->source.count, SUB_JOB_SIZE); ++ if (nr > job->kc->max_split) ++ nr = job->kc->max_split; ++ ++ atomic_set(&job->sub_jobs, nr); ++ while (nr--) ++ segment_complete(0, 0u, job); ++} ++ ++int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, ++ unsigned int num_dests, struct io_region *dests, ++ unsigned int flags, kcopyd_notify_fn fn, void *context) ++{ ++ struct kcopyd_job *job; ++ ++ /* ++ * Allocate a new job. ++ */ ++ job = mempool_alloc(_job_pool, GFP_NOIO); ++ ++ /* ++ * set up for the read. ++ */ ++ job->kc = kc; ++ job->flags = flags; ++ job->read_err = 0; ++ job->write_err = 0; ++ job->rw = READ; ++ ++ memcpy(&job->source, from, sizeof(*from)); ++ ++ job->num_dests = num_dests; ++ memcpy(&job->dests, dests, sizeof(*dests) * num_dests); ++ ++ job->offset = 0; ++ job->nr_pages = 0; ++ INIT_LIST_HEAD(&job->pages); ++ ++ job->fn = fn; ++ job->context = context; ++ ++ if (job->source.count < SUB_JOB_SIZE) ++ dispatch_job(job); ++ ++ else { ++ init_MUTEX(&job->lock); ++ job->progress = 0; ++ split_job(job); ++ } ++ ++ return 0; ++} ++ ++/* ++ * Cancels a kcopyd job, eg. someone might be deactivating a ++ * mirror.
++ */ ++int kcopyd_cancel(struct kcopyd_job *job, int block) ++{ ++ /* FIXME: finish */ ++ return -1; ++} ++ ++/*----------------------------------------------------------------- ++ * Unit setup ++ *---------------------------------------------------------------*/ ++static DECLARE_MUTEX(_client_lock); ++static LIST_HEAD(_clients); ++ ++static int client_add(struct kcopyd_client *kc) ++{ ++ down(&_client_lock); ++ list_add(&kc->list, &_clients); ++ up(&_client_lock); ++ return 0; ++} ++ ++static void client_del(struct kcopyd_client *kc) ++{ ++ down(&_client_lock); ++ list_del(&kc->list); ++ up(&_client_lock); ++} ++ ++int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) ++{ ++ int r = 0; ++ struct kcopyd_client *kc; ++ ++ if (nr_pages * SECTORS_PER_PAGE < SUB_JOB_SIZE) { ++ DMERR("kcopyd client requested %u pages: minimum is %lu", ++ nr_pages, SUB_JOB_SIZE / SECTORS_PER_PAGE); ++ return -ENOMEM; ++ } ++ ++ kc = kmalloc(sizeof(*kc), GFP_KERNEL); ++ if (!kc) ++ return -ENOMEM; ++ ++ kc->lock = SPIN_LOCK_UNLOCKED; ++ INIT_LIST_HEAD(&kc->pages); ++ kc->nr_pages = kc->nr_free_pages = 0; ++ r = client_alloc_pages(kc, nr_pages); ++ if (r) { ++ kfree(kc); ++ return r; ++ } ++ ++ r = dm_io_get(nr_pages); ++ if (r) { ++ client_free_pages(kc); ++ kfree(kc); ++ return r; ++ } ++ ++ r = client_add(kc); ++ if (r) { ++ dm_io_put(nr_pages); ++ client_free_pages(kc); ++ kfree(kc); ++ return r; ++ } ++ ++ *result = kc; ++ return 0; ++} ++ ++void kcopyd_client_destroy(struct kcopyd_client *kc) ++{ ++ dm_io_put(kc->nr_pages); ++ client_free_pages(kc); ++ client_del(kc); ++ kfree(kc); ++} ++ ++ ++int __init kcopyd_init(void) ++{ ++ int r; ++ ++ r = jobs_init(); ++ if (r) ++ return r; ++ ++ r = dm_daemon_start(&_kcopyd, "kcopyd", do_work); ++ if (r) ++ jobs_exit(); ++ ++ return r; ++} ++ ++void kcopyd_exit(void) ++{ ++ dm_daemon_stop(&_kcopyd); /* reverse of kcopyd_init(): daemon first, then the job pool its do_work uses */ ++ jobs_exit(); ++} ++ ++EXPORT_SYMBOL(kcopyd_client_create); ++EXPORT_SYMBOL(kcopyd_client_destroy);
++EXPORT_SYMBOL(kcopyd_copy); ++EXPORT_SYMBOL(kcopyd_cancel); +--- linux-2.4.22/drivers/md/kcopyd.h Thu Jan 1 01:00:00 1970 ++++ linux/drivers/md/kcopyd.h Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,47 @@ ++/* ++ * Copyright (C) 2001 Sistina Software ++ * ++ * This file is released under the GPL. ++ */ ++ ++#ifndef DM_KCOPYD_H ++#define DM_KCOPYD_H ++ ++/* ++ * Needed for the definition of offset_t. ++ */ ++#include ++#include ++ ++#include "dm-io.h" ++ ++int kcopyd_init(void); ++void kcopyd_exit(void); ++ ++/* FIXME: make this configurable */ ++#define KCOPYD_MAX_REGIONS 8 ++ ++#define KCOPYD_IGNORE_ERROR 1 ++ ++/* ++ * To use kcopyd you must first create a kcopyd client object. ++ */ ++struct kcopyd_client; ++int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result); ++void kcopyd_client_destroy(struct kcopyd_client *kc); ++ ++/* ++ * Submit a copy job to kcopyd. This is built on top of the ++ * previous three fns. ++ * ++ * read_err is a boolean, ++ * write_err is a bitset, with 1 bit for each destination region ++ */ ++typedef void (*kcopyd_notify_fn)(int read_err, ++ unsigned int write_err, void *context); ++ ++int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, ++ unsigned int num_dests, struct io_region *dests, ++ unsigned int flags, kcopyd_notify_fn fn, void *context); ++ ++#endif +--- linux-2.4.22/include/linux/device-mapper.h Thu Jan 1 01:00:00 1970 ++++ linux/include/linux/device-mapper.h Tue Nov 18 13:43:10 2003 +@@ -0,0 +1,104 @@ ++/* ++ * Copyright (C) 2001 Sistina Software (UK) Limited. ++ * ++ * This file is released under the LGPL. 
++ */ ++ ++#ifndef _LINUX_DEVICE_MAPPER_H ++#define _LINUX_DEVICE_MAPPER_H ++ ++typedef unsigned long sector_t; ++ ++struct dm_target; ++struct dm_table; ++struct dm_dev; ++ ++typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; ++ ++union map_info { ++ void *ptr; ++ unsigned long long ll; ++}; ++ ++/* ++ * In the constructor the target parameter will already have the ++ * table, type, begin and len fields filled in. ++ */ ++typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc, ++ char **argv); ++ ++/* ++ * The destructor doesn't need to free the dm_target, just ++ * anything hidden ti->private. ++ */ ++typedef void (*dm_dtr_fn) (struct dm_target * ti); ++ ++/* ++ * The map function must return: ++ * < 0: error ++ * = 0: The target will handle the io by resubmitting it later ++ * > 0: simple remap complete ++ */ ++typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh, ++ int rw, union map_info *map_context); ++ ++/* ++ * Returns: ++ * < 0 : error (currently ignored) ++ * 0 : ended successfully ++ * 1 : for some reason the io has still not completed (eg, ++ * multipath target might want to requeue a failed io). ++ */ ++typedef int (*dm_endio_fn) (struct dm_target * ti, ++ struct buffer_head * bh, int rw, int error, ++ union map_info *map_context); ++typedef void (*dm_suspend_fn) (struct dm_target *ti); ++typedef void (*dm_resume_fn) (struct dm_target *ti); ++typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type, ++ char *result, unsigned int maxlen); ++ ++void dm_error(const char *message); ++ ++/* ++ * Constructors should call these functions to ensure destination devices ++ * are opened/closed correctly. ++ * FIXME: too many arguments. 
++ */ ++int dm_get_device(struct dm_target *ti, const char *path, sector_t start, ++ sector_t len, int mode, struct dm_dev **result); ++void dm_put_device(struct dm_target *ti, struct dm_dev *d); ++ ++/* ++ * Information about a target type ++ */ ++struct target_type { ++ const char *name; ++ struct module *module; ++ dm_ctr_fn ctr; ++ dm_dtr_fn dtr; ++ dm_map_fn map; ++ dm_endio_fn end_io; ++ dm_suspend_fn suspend; ++ dm_resume_fn resume; ++ dm_status_fn status; ++}; ++ ++struct dm_target { ++ struct dm_table *table; ++ struct target_type *type; ++ ++ /* target limits */ ++ sector_t begin; ++ sector_t len; ++ ++ /* target specific data */ ++ void *private; ++ ++ /* Used to provide an error string from the ctr */ ++ char *error; ++}; ++ ++int dm_register_target(struct target_type *t); ++int dm_unregister_target(struct target_type *t); ++ ++#endif /* _LINUX_DEVICE_MAPPER_H */ +--- linux-2.4.22/include/linux/dm-ioctl.h Thu Jan 1 01:00:00 1970 ++++ linux/include/linux/dm-ioctl.h Tue Nov 18 14:34:32 2003 +@@ -0,0 +1,237 @@ ++/* ++ * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. ++ * ++ * This file is released under the LGPL. ++ */ ++ ++#ifndef _LINUX_DM_IOCTL_H ++#define _LINUX_DM_IOCTL_H ++ ++#include ++ ++#define DM_DIR "mapper" /* Slashes not supported */ ++#define DM_MAX_TYPE_NAME 16 ++#define DM_NAME_LEN 128 ++#define DM_UUID_LEN 129 ++ ++/* ++ * A traditional ioctl interface for the device mapper. ++ * ++ * Each device can have two tables associated with it, an ++ * 'active' table which is the one currently used by io passing ++ * through the device, and an 'inactive' one which is a table ++ * that is being prepared as a replacement for the 'active' one. ++ * ++ * DM_VERSION: ++ * Just get the version information for the ioctl interface. ++ * ++ * DM_REMOVE_ALL: ++ * Remove all dm devices, destroy all tables. Only really used ++ * for debug. ++ * ++ * DM_LIST_DEVICES: ++ * Get a list of all the dm device names. 
++ * ++ * DM_DEV_CREATE: ++ * Create a new device, neither the 'active' nor 'inactive' table ++ * slots will be filled. The device will be in suspended state ++ * after creation, however any io to the device will get errored ++ * since it will be out-of-bounds. ++ * ++ * DM_DEV_REMOVE: ++ * Remove a device, destroy any tables. ++ * ++ * DM_DEV_RENAME: ++ * Rename a device. ++ * ++ * DM_DEV_SUSPEND: ++ * This performs both suspend and resume, depending on which flag is ++ * passed in. ++ * Suspend: This command will not return until all pending io to ++ * the device has completed. Further io will be deferred until ++ * the device is resumed. ++ * Resume: It is no longer an error to issue this command on an ++ * unsuspended device. If a table is present in the 'inactive' ++ * slot, it will be moved to the active slot, then the old table ++ * from the active slot will be _destroyed_. Finally the device ++ * is resumed. ++ * ++ * DM_DEV_STATUS: ++ * Retrieves the status for the table in the 'active' slot. ++ * ++ * DM_DEV_WAIT: ++ * Wait for a significant event to occur to the device. This ++ * could either be caused by an event triggered by one of the ++ * targets of the table in the 'active' slot, or a table change. ++ * ++ * DM_TABLE_LOAD: ++ * Load a table into the 'inactive' slot for the device. The ++ * device does _not_ need to be suspended prior to this command. ++ * ++ * DM_TABLE_CLEAR: ++ * Destroy any table in the 'inactive' slot (ie. abort). ++ * ++ * DM_TABLE_DEPS: ++ * Return a set of device dependencies for the 'active' table. ++ * ++ * DM_TABLE_STATUS: ++ * Return the targets' status for the 'active' table. ++ */ ++ ++/* ++ * All ioctl arguments consist of a single chunk of memory, with ++ * this structure at the start. If a uuid is specified any ++ * lookup (eg. for a DM_DEV_STATUS) will be done on that, *not* the ++ * name.
++ */ ++struct dm_ioctl { ++ /* ++ * The version number is made up of three parts: ++ * major - no backward or forward compatibility, ++ * minor - only backwards compatible, ++ * patch - both backwards and forwards compatible. ++ * ++ * All clients of the ioctl interface should fill in the ++ * version number of the interface that they were ++ * compiled with. ++ * ++ * All recognised ioctl commands (ie. those that don't ++ * return -ENOTTY) fill out this field, even if the ++ * command failed. ++ */ ++ uint32_t version[3]; /* in/out */ ++ uint32_t data_size; /* total size of data passed in ++ * including this struct */ ++ ++ uint32_t data_start; /* offset to start of data ++ * relative to start of this struct */ ++ ++ uint32_t target_count; /* in/out */ ++ int32_t open_count; /* out */ ++ uint32_t flags; /* in/out */ ++ uint32_t event_nr; /* in/out */ ++ uint32_t padding; ++ ++ uint64_t dev; /* in/out */ ++ ++ char name[DM_NAME_LEN]; /* device name */ ++ char uuid[DM_UUID_LEN]; /* unique identifier for ++ * the block device */ ++}; ++ ++/* ++ * Used to specify tables. These structures appear after the ++ * dm_ioctl. ++ */ ++struct dm_target_spec { ++ uint64_t sector_start; ++ uint64_t length; ++ int32_t status; /* used when reading from kernel only */ ++ ++ /* ++ * Offset in bytes (from the start of this struct) to ++ * next target_spec. ++ */ ++ uint32_t next; ++ ++ char target_type[DM_MAX_TYPE_NAME]; ++ ++ /* ++ * Parameter string starts immediately after this object. ++ * Be careful to add padding after string to ensure correct ++ * alignment of subsequent dm_target_spec. ++ */ ++}; ++ ++/* ++ * Used to retrieve the target dependencies. ++ */ ++struct dm_target_deps { ++ uint32_t count; /* Array size */ ++ uint32_t padding; /* unused */ ++ uint64_t dev[0]; /* out */ ++}; ++ ++/* ++ * Used to get a list of all dm devices. 
++ */ ++struct dm_name_list { ++ uint64_t dev; ++ uint32_t next; /* offset to the next record from ++ the _start_ of this */ ++ char name[0]; ++}; ++ ++/* ++ * If you change this make sure you make the corresponding change ++ * to dm-ioctl.c:lookup_ioctl() ++ */ ++enum { ++ /* Top level cmds */ ++ DM_VERSION_CMD = 0, ++ DM_REMOVE_ALL_CMD, ++ DM_LIST_DEVICES_CMD, ++ ++ /* device level cmds */ ++ DM_DEV_CREATE_CMD, ++ DM_DEV_REMOVE_CMD, ++ DM_DEV_RENAME_CMD, ++ DM_DEV_SUSPEND_CMD, ++ DM_DEV_STATUS_CMD, ++ DM_DEV_WAIT_CMD, ++ ++ /* Table level cmds */ ++ DM_TABLE_LOAD_CMD, ++ DM_TABLE_CLEAR_CMD, ++ DM_TABLE_DEPS_CMD, ++ DM_TABLE_STATUS_CMD, ++}; ++ ++#define DM_IOCTL 0xfd ++ ++#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) ++#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl) ++#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl) ++ ++#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl) ++#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl) ++#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl) ++#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl) ++#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) ++#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) ++ ++#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) ++#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) ++#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl) ++#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl) ++ ++#define DM_VERSION_MAJOR 4 ++#define DM_VERSION_MINOR 0 ++#define DM_VERSION_PATCHLEVEL 5 ++#define DM_VERSION_EXTRA "-ioctl (2003-11-18)" ++ ++/* Status bits */ ++#define DM_READONLY_FLAG (1 << 0) /* In/Out */ ++#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */ ++#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */ ++ ++/* 
++ * Flag passed into ioctl STATUS command to get table information ++ * rather than current status. ++ */ ++#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */ ++ ++/* ++ * Flags that indicate whether a table is present in either of ++ * the two table slots that a device has. ++ */ ++#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */ ++#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */ ++ ++/* ++ * Indicates that the buffer passed in wasn't big enough for the ++ * results. ++ */ ++#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ ++ ++#endif /* _LINUX_DM_IOCTL_H */ diff --git a/patches/common/linux-2.4.22-makefile.patch b/patches/common/linux-2.4.22-makefile.patch new file mode 100644 index 0000000..c79e8e2 --- /dev/null +++ b/patches/common/linux-2.4.22-makefile.patch @@ -0,0 +1,53 @@ +--- linux-2.4.22/drivers/md/Makefile Mon Nov 17 19:16:45 2003 ++++ linux/drivers/md/Makefile Tue Nov 18 13:22:48 2003 +@@ -4,24 +4,41 @@ + + O_TARGET := mddev.o + +-export-objs := md.o xor.o +-list-multi := lvm-mod.o ++export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \ ++ dm-log.o dm-io.o dm.o ++ ++list-multi := lvm-mod.o dm-mod.o dm-mirror-mod.o + lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o ++dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \ ++ dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \ ++ kcopyd.o dm-daemon.o dm-io.o ++dm-mirror-mod-objs := dm-raid1.o dm-log.o + + # Note: link order is important. All raid personalities + # and xor.o must come before md.o, as they each initialise + # themselves, and md.o may use the personalities when it + # auto-initialised. 
+ +-obj-$(CONFIG_MD_LINEAR) += linear.o +-obj-$(CONFIG_MD_RAID0) += raid0.o +-obj-$(CONFIG_MD_RAID1) += raid1.o +-obj-$(CONFIG_MD_RAID5) += raid5.o xor.o +-obj-$(CONFIG_MD_MULTIPATH) += multipath.o +-obj-$(CONFIG_BLK_DEV_MD) += md.o +-obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o ++obj-$(CONFIG_MD_LINEAR) += linear.o ++obj-$(CONFIG_MD_RAID0) += raid0.o ++obj-$(CONFIG_MD_RAID1) += raid1.o ++obj-$(CONFIG_MD_RAID5) += raid5.o xor.o ++obj-$(CONFIG_MD_MULTIPATH) += multipath.o ++obj-$(CONFIG_BLK_DEV_MD) += md.o ++ ++obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o ++ ++obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o ++obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o + + include $(TOPDIR)/Rules.make + + lvm-mod.o: $(lvm-mod-objs) + $(LD) -r -o $@ $(lvm-mod-objs) ++ ++dm-mod.o: $(dm-mod-objs) ++ $(LD) -r -o $@ $(dm-mod-objs) ++ ++dm-mirror.o: $(dm-mirror-mod-objs) ++ $(LD) -r -o $@ $(dm-mirror-mod-objs) ++ diff --git a/patches/common/linux-2.4.22-memalloc.patch b/patches/common/linux-2.4.22-memalloc.patch new file mode 100644 index 0000000..043788f --- /dev/null +++ b/patches/common/linux-2.4.22-memalloc.patch @@ -0,0 +1,263 @@ +--- linux-2.4.22/drivers/md/dm-ioctl.c Wed Nov 19 13:59:45 2003 ++++ linux/drivers/md/dm-ioctl.c Wed Nov 19 14:00:03 2003 +@@ -1178,19 +1178,11 @@ + } + + /* +- * FIXME: I don't like this, we're trying to avoid low +- * memory issues when a device is suspended. +- */ +- current->flags |= PF_MEMALLOC; +- +- /* + * Copy the parameters into kernel space. 
+ */ + r = copy_params(user, &param); +- if (r) { +- current->flags &= ~PF_MEMALLOC; ++ if (r) + return r; +- } + + r = validate_params(cmd, param); + if (r) +@@ -1208,7 +1200,6 @@ + + out: + free_params(param); +- current->flags &= ~PF_MEMALLOC; + return r; + } + +--- linux-2.4.22/include/asm-alpha/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-alpha/mman.h Wed Nov 19 14:00:03 2003 +@@ -30,6 +30,7 @@ + + #define MCL_CURRENT 8192 /* lock all currently mapped pages */ + #define MCL_FUTURE 16384 /* lock all additions to address space */ ++#define MCL_MEMALLOC 32768 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0 /* no further special treatment */ + #define MADV_RANDOM 1 /* expect random page references */ +--- linux-2.4.22/include/asm-arm/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-arm/mman.h Wed Nov 19 14:00:03 2003 +@@ -24,6 +24,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-cris/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-cris/mman.h Wed Nov 19 14:00:03 2003 +@@ -26,6 +26,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-i386/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-i386/mman.h Wed Nov 19 14:00:03 2003 +@@ -24,6 +24,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior
*/ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-ia64/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-ia64/mman.h Wed Nov 19 14:00:03 2003 +@@ -32,6 +32,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-m68k/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-m68k/mman.h Wed Nov 19 14:00:03 2003 +@@ -24,6 +24,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-mips/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-mips/mman.h Wed Nov 19 14:00:03 2003 +@@ -55,6 +55,7 @@ + */ + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-mips64/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-mips64/mman.h Wed Nov 19 14:00:03 2003 +@@ -53,6 +53,7 @@ + */ + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-parisc/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-parisc/mman.h Wed Nov 19 14:00:03 2003 +@@ -24,6 +24,7 @@ + + #define 
MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0 /* no further special treatment */ + #define MADV_RANDOM 1 /* expect random page references */ +--- linux-2.4.22/include/asm-ppc/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-ppc/mman.h Wed Nov 19 14:00:03 2003 +@@ -25,6 +25,7 @@ + + #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ + #define MCL_FUTURE 0x4000 /* lock all additions to address space */ ++#define MCL_MEMALLOC 0x8000 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-ppc64/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-ppc64/mman.h Wed Nov 19 14:00:03 2003 +@@ -31,6 +31,7 @@ + + #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ + #define MCL_FUTURE 0x4000 /* lock all additions to address space */ ++#define MCL_MEMALLOC 0x8000 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-s390/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-s390/mman.h Wed Nov 19 14:00:03 2003 +@@ -32,6 +32,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-s390x/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-s390x/mman.h Wed Nov 19 14:00:03 2003 +@@ -32,6 +32,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory 
*/ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-sh/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-sh/mman.h Wed Nov 19 14:00:03 2003 +@@ -24,6 +24,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/include/asm-sparc/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-sparc/mman.h Wed Nov 19 14:00:03 2003 +@@ -30,6 +30,7 @@ + + #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ + #define MCL_FUTURE 0x4000 /* lock all additions to address space */ ++#define MCL_MEMALLOC 0x8000 /* allow allocation of reserved memory */ + + /* XXX Need to add flags to SunOS's mctl, mlockall, and madvise system + * XXX calls. +--- linux-2.4.22/include/asm-sparc64/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-sparc64/mman.h Wed Nov 19 14:00:03 2003 +@@ -30,6 +30,7 @@ + + #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ + #define MCL_FUTURE 0x4000 /* lock all additions to address space */ ++#define MCL_MEMALLOC 0x8000 /* allow allocation of reserved memory */ + + /* XXX Need to add flags to SunOS's mctl, mlockall, and madvise system + * XXX calls. 
+--- linux-2.4.22/include/asm-x86_64/mman.h Wed Nov 19 13:59:45 2003 ++++ linux/include/asm-x86_64/mman.h Wed Nov 19 14:00:03 2003 +@@ -25,6 +25,7 @@ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ ++#define MCL_MEMALLOC 4 /* allow allocation of reserved memory */ + + #define MADV_NORMAL 0x0 /* default page-in behavior */ + #define MADV_RANDOM 0x1 /* page-in minimum required */ +--- linux-2.4.22/mm/mlock.c Wed Nov 19 13:59:45 2003 ++++ linux/mm/mlock.c Wed Nov 19 14:00:03 2003 +@@ -244,6 +244,11 @@ + if (!capable(CAP_IPC_LOCK)) + return -EPERM; + ++ if (flags & MCL_MEMALLOC) ++ current->flags |= PF_MEMALLOC; ++ else ++ current->flags &= ~PF_MEMALLOC; ++ + def_flags = 0; + if (flags & MCL_FUTURE) + def_flags = VM_LOCKED; +@@ -269,7 +274,7 @@ + int ret = -EINVAL; + + down_write(&current->mm->mmap_sem); +- if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) ++ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_MEMALLOC))) + goto out; + + lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; +--- linux-2.4.22/mm/oom_kill.c Wed Nov 19 13:59:45 2003 ++++ linux/mm/oom_kill.c Wed Nov 19 14:06:52 2003 +@@ -65,6 +65,12 @@ + if (p->flags & PF_MEMDIE) + return 0; + ++ /* ++ * The system's likely doomed if we have to kill a PF_MEMALLOC process. ++ */ ++ if (p->flags & PF_MEMALLOC) ++ return 1; ++ + /* + * The memory size of the process is the basis for the badness.
+ */ +--- linux-2.4.22/mm/page_alloc.c Wed Nov 19 13:59:45 2003 ++++ linux/mm/page_alloc.c Wed Nov 19 14:00:03 2003 +@@ -254,6 +254,7 @@ + { + struct page * page = NULL; + int __freed = 0; ++ unsigned long pf_memalloc; + + if (!(gfp_mask & __GFP_WAIT)) + goto out; +@@ -261,11 +262,12 @@ + BUG(); + + current->allocation_order = order; ++ pf_memalloc = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC | PF_FREE_PAGES; + + __freed = try_to_free_pages_zone(classzone, gfp_mask); + +- current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); ++ current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES) | pf_memalloc; + + if (current->nr_local_pages) { + struct list_head * entry, * local_pages; diff --git a/patches/common/linux-2.4.22-mempool.patch b/patches/common/linux-2.4.22-mempool.patch new file mode 100644 index 0000000..f17ce00 --- /dev/null +++ b/patches/common/linux-2.4.22-mempool.patch @@ -0,0 +1,354 @@ +--- linux-2.4.22/include/linux/mempool.h Thu Jan 1 01:00:00 1970 ++++ linux/include/linux/mempool.h Tue Nov 18 13:42:25 2003 +@@ -0,0 +1,31 @@ ++/* ++ * memory buffer pool support ++ */ ++#ifndef _LINUX_MEMPOOL_H ++#define _LINUX_MEMPOOL_H ++ ++#include ++#include ++ ++struct mempool_s; ++typedef struct mempool_s mempool_t; ++ ++typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data); ++typedef void (mempool_free_t)(void *element, void *pool_data); ++ ++extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, ++ mempool_free_t *free_fn, void *pool_data); ++extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask); ++extern void mempool_destroy(mempool_t *pool); ++extern void * mempool_alloc(mempool_t *pool, int gfp_mask); ++extern void mempool_free(void *element, mempool_t *pool); ++ ++/* ++ * A mempool_alloc_t and mempool_free_t that get the memory from ++ * a slab that is passed in through pool_data. 
++ */ ++void *mempool_alloc_slab(int gfp_mask, void *pool_data); ++void mempool_free_slab(void *element, void *pool_data); ++ ++ ++#endif /* _LINUX_MEMPOOL_H */ +--- linux-2.4.22/mm/Makefile Mon Nov 17 19:18:06 2003 ++++ linux/mm/Makefile Tue Nov 18 13:42:25 2003 +@@ -9,12 +9,12 @@ + + O_TARGET := mm.o + +-export-objs := shmem.o filemap.o memory.o page_alloc.o ++export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o + + obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ + vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ + page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ +- shmem.o ++ shmem.o mempool.o + + obj-$(CONFIG_HIGHMEM) += highmem.o + +diff -rNu linux-2.4.22/mm/mempool.c linux/mm/mempool.c +--- linux-2.4.22/mm/mempool.c Thu Jan 1 01:00:00 1970 ++++ linux/mm/mempool.c Tue Nov 18 13:42:25 2003 +@@ -0,0 +1,299 @@ ++/* ++ * linux/mm/mempool.c ++ * ++ * memory buffer pool support. Such pools are mostly used ++ * for guaranteed, deadlock-free memory allocations during ++ * extreme VM load. 
++ * ++ * started by Ingo Molnar, Copyright (C) 2001 ++ */ ++ ++#include ++#include ++#include ++#include ++ ++struct mempool_s { ++ spinlock_t lock; ++ int min_nr; /* nr of elements at *elements */ ++ int curr_nr; /* Current nr of elements at *elements */ ++ void **elements; ++ ++ void *pool_data; ++ mempool_alloc_t *alloc; ++ mempool_free_t *free; ++ wait_queue_head_t wait; ++}; ++ ++static void add_element(mempool_t *pool, void *element) ++{ ++ BUG_ON(pool->curr_nr >= pool->min_nr); ++ pool->elements[pool->curr_nr++] = element; ++} ++ ++static void *remove_element(mempool_t *pool) ++{ ++ BUG_ON(pool->curr_nr <= 0); ++ return pool->elements[--pool->curr_nr]; ++} ++ ++static void free_pool(mempool_t *pool) ++{ ++ while (pool->curr_nr) { ++ void *element = remove_element(pool); ++ pool->free(element, pool->pool_data); ++ } ++ kfree(pool->elements); ++ kfree(pool); ++} ++ ++/** ++ * mempool_create - create a memory pool ++ * @min_nr: the minimum number of elements guaranteed to be ++ * allocated for this pool. ++ * @alloc_fn: user-defined element-allocation function. ++ * @free_fn: user-defined element-freeing function. ++ * @pool_data: optional private data available to the user-defined functions. ++ * ++ * this function creates and allocates a guaranteed size, preallocated ++ * memory pool. The pool can be used from the mempool_alloc and mempool_free ++ * functions. This function might sleep. Both the alloc_fn() and the free_fn() ++ * functions might sleep - as long as the mempool_alloc function is not called ++ * from IRQ contexts. 
++ */ ++mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, ++ mempool_free_t *free_fn, void *pool_data) ++{ ++ mempool_t *pool; ++ ++ pool = kmalloc(sizeof(*pool), GFP_KERNEL); ++ if (!pool) ++ return NULL; ++ memset(pool, 0, sizeof(*pool)); ++ pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); ++ if (!pool->elements) { ++ kfree(pool); ++ return NULL; ++ } ++ spin_lock_init(&pool->lock); ++ pool->min_nr = min_nr; ++ pool->pool_data = pool_data; ++ init_waitqueue_head(&pool->wait); ++ pool->alloc = alloc_fn; ++ pool->free = free_fn; ++ ++ /* ++ * First pre-allocate the guaranteed number of buffers. ++ */ ++ while (pool->curr_nr < pool->min_nr) { ++ void *element; ++ ++ element = pool->alloc(GFP_KERNEL, pool->pool_data); ++ if (unlikely(!element)) { ++ free_pool(pool); ++ return NULL; ++ } ++ add_element(pool, element); ++ } ++ return pool; ++} ++ ++/** ++ * mempool_resize - resize an existing memory pool ++ * @pool: pointer to the memory pool which was allocated via ++ * mempool_create(). ++ * @new_min_nr: the new minimum number of elements guaranteed to be ++ * allocated for this pool. ++ * @gfp_mask: the usual allocation bitmask. ++ * ++ * This function shrinks/grows the pool. In the case of growing, ++ * it cannot be guaranteed that the pool will be grown to the new ++ * size immediately, but new mempool_free() calls will refill it. ++ * ++ * Note, the caller must guarantee that no mempool_destroy is called ++ * while this function is running. mempool_alloc() & mempool_free() ++ * might be called (eg. from IRQ contexts) while this function executes. 
++ */ ++int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask) ++{ ++ void *element; ++ void **new_elements; ++ unsigned long flags; ++ ++ BUG_ON(new_min_nr <= 0); ++ ++ spin_lock_irqsave(&pool->lock, flags); ++ if (new_min_nr < pool->min_nr) { ++ /* Shrink: hand surplus elements to free_fn with the lock dropped */ ++ while (pool->curr_nr > new_min_nr) { ++ element = remove_element(pool); ++ spin_unlock_irqrestore(&pool->lock, flags); ++ pool->free(element, pool->pool_data); ++ spin_lock_irqsave(&pool->lock, flags); ++ } ++ pool->min_nr = new_min_nr; ++ goto out_unlock; ++ } ++ spin_unlock_irqrestore(&pool->lock, flags); ++ ++ /* Grow the pool */ ++ new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); ++ if (!new_elements) ++ return -ENOMEM; ++ ++ spin_lock_irqsave(&pool->lock, flags); ++ memcpy(new_elements, pool->elements, ++ pool->curr_nr * sizeof(*new_elements)); ++ kfree(pool->elements); ++ pool->elements = new_elements; ++ pool->min_nr = new_min_nr; ++ ++ /* Refill towards the new minimum; a failed element allocation still returns 0 */ ++ while (pool->curr_nr < pool->min_nr) { ++ spin_unlock_irqrestore(&pool->lock, flags); ++ element = pool->alloc(gfp_mask, pool->pool_data); ++ if (!element) ++ goto out; ++ spin_lock_irqsave(&pool->lock, flags); ++ if (pool->curr_nr < pool->min_nr) { ++ add_element(pool, element); ++ } else { ++ /* ++ * Raced: the pool filled up while the lock was dropped. ++ * The element came from pool->alloc(), so it must go back ++ * through pool->free() (not kfree()), with the lock released ++ * as in the shrink path above. ++ */ ++ spin_unlock_irqrestore(&pool->lock, flags); ++ pool->free(element, pool->pool_data); ++ goto out; ++ } ++ } ++out_unlock: ++ spin_unlock_irqrestore(&pool->lock, flags); ++out: ++ return 0; ++} ++ ++/** ++ * mempool_destroy - deallocate a memory pool ++ * @pool: pointer to the memory pool which was allocated via ++ * mempool_create(). ++ * ++ * this function only sleeps if the free_fn() function sleeps. The caller ++ * has to guarantee that all elements have been returned to the pool (ie: ++ * freed) prior to calling mempool_destroy().
++ */ ++void mempool_destroy(mempool_t *pool) ++{ ++ if (pool->curr_nr != pool->min_nr) ++ BUG(); /* There were outstanding elements */ ++ free_pool(pool); ++} ++ ++/** ++ * mempool_alloc - allocate an element from a specific memory pool ++ * @pool: pointer to the memory pool which was allocated via ++ * mempool_create(). ++ * @gfp_mask: the usual allocation bitmask. ++ * ++ * this function only sleeps if the alloc_fn function sleeps or ++ * returns NULL. Note that due to preallocation, this function ++ * *never* fails when called from process contexts. (it might ++ * fail if called from an IRQ context.) ++ */ ++void * mempool_alloc(mempool_t *pool, int gfp_mask) ++{ ++ void *element; ++ unsigned long flags; ++ int curr_nr; ++ DECLARE_WAITQUEUE(wait, current); ++ int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); ++ ++repeat_alloc: ++ element = pool->alloc(gfp_nowait, pool->pool_data); ++ if (likely(element != NULL)) ++ return element; ++ ++ /* ++ * If the pool is less than 50% full then try harder ++ * to allocate an element: ++ */ ++ if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) { ++ element = pool->alloc(gfp_mask, pool->pool_data); ++ if (likely(element != NULL)) ++ return element; ++ } ++ ++ /* ++ * Kick the VM at this point. 
++ */ ++ wakeup_bdflush(); ++ ++ spin_lock_irqsave(&pool->lock, flags); ++ if (likely(pool->curr_nr)) { ++ element = remove_element(pool); ++ spin_unlock_irqrestore(&pool->lock, flags); ++ return element; ++ } ++ spin_unlock_irqrestore(&pool->lock, flags); ++ ++ /* We must not sleep in the GFP_ATOMIC case */ ++ if (gfp_mask == gfp_nowait) ++ return NULL; ++ ++ run_task_queue(&tq_disk); ++ ++ add_wait_queue_exclusive(&pool->wait, &wait); ++ set_task_state(current, TASK_UNINTERRUPTIBLE); ++ ++ spin_lock_irqsave(&pool->lock, flags); ++ curr_nr = pool->curr_nr; ++ spin_unlock_irqrestore(&pool->lock, flags); ++ ++ if (!curr_nr) ++ schedule(); ++ ++ current->state = TASK_RUNNING; ++ remove_wait_queue(&pool->wait, &wait); ++ ++ goto repeat_alloc; ++} ++ ++/** ++ * mempool_free - return an element to the pool. ++ * @element: pool element pointer. ++ * @pool: pointer to the memory pool which was allocated via ++ * mempool_create(). ++ * ++ * this function only sleeps if the free_fn() function sleeps. ++ */ ++void mempool_free(void *element, mempool_t *pool) ++{ ++ unsigned long flags; ++ ++ if (pool->curr_nr < pool->min_nr) { ++ spin_lock_irqsave(&pool->lock, flags); ++ if (pool->curr_nr < pool->min_nr) { ++ add_element(pool, element); ++ spin_unlock_irqrestore(&pool->lock, flags); ++ wake_up(&pool->wait); ++ return; ++ } ++ spin_unlock_irqrestore(&pool->lock, flags); ++ } ++ pool->free(element, pool->pool_data); ++} ++ ++/* ++ * A commonly used alloc and free fn. 
++ */ ++void *mempool_alloc_slab(int gfp_mask, void *pool_data) ++{ ++ kmem_cache_t *mem = (kmem_cache_t *) pool_data; ++ return kmem_cache_alloc(mem, gfp_mask); ++} ++ ++void mempool_free_slab(void *element, void *pool_data) ++{ ++ kmem_cache_t *mem = (kmem_cache_t *) pool_data; ++ kmem_cache_free(mem, element); ++} ++ ++ ++EXPORT_SYMBOL(mempool_create); ++EXPORT_SYMBOL(mempool_resize); ++EXPORT_SYMBOL(mempool_destroy); ++EXPORT_SYMBOL(mempool_alloc); ++EXPORT_SYMBOL(mempool_free); ++EXPORT_SYMBOL(mempool_alloc_slab); ++EXPORT_SYMBOL(mempool_free_slab); diff --git a/patches/common/linux-2.4.22-o_direct.patch b/patches/common/linux-2.4.22-o_direct.patch new file mode 100644 index 0000000..71ad32f --- /dev/null +++ b/patches/common/linux-2.4.22-o_direct.patch @@ -0,0 +1,27 @@ +--- linux-2.4.22/mm/filemap.c Mon Nov 17 19:18:07 2003 ++++ linux/mm/filemap.c Tue Nov 18 13:28:22 2003 +@@ -1742,7 +1742,8 @@ + } + up(&inode->i_sem); + up_read(&inode->i_alloc_sem); +- UPDATE_ATIME(filp->f_dentry->d_inode); ++ if (!S_ISBLK(inode->i_mode)) ++ UPDATE_ATIME(filp->f_dentry->d_inode); + goto out; + } + } +@@ -3120,8 +3121,12 @@ + goto out; + + remove_suid(inode); +- inode->i_ctime = inode->i_mtime = CURRENT_TIME; +- mark_inode_dirty_sync(inode); ++ ++ /* Don't update times for block devices using O_DIRECT */ ++ if (!(file->f_flags & O_DIRECT) || !S_ISBLK(inode->i_mode)) { ++ inode->i_ctime = inode->i_mtime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ } + + do { + unsigned long index, offset; diff --git a/patches/common/linux-2.4.22-vcalloc.patch b/patches/common/linux-2.4.22-vcalloc.patch new file mode 100644 index 0000000..bd1f97f --- /dev/null +++ b/patches/common/linux-2.4.22-vcalloc.patch @@ -0,0 +1,45 @@ +--- linux-2.4.22/include/linux/vmalloc.h Mon Nov 17 19:18:01 2003 ++++ linux/include/linux/vmalloc.h Tue Nov 18 13:22:57 2003 +@@ -29,6 +29,7 @@ + extern void vmfree_area_pages(unsigned long address, unsigned long size); + extern int vmalloc_area_pages(unsigned long 
address, unsigned long size, + int gfp_mask, pgprot_t prot); ++extern void *vcalloc(unsigned long nmemb, unsigned long elem_size); + + /* + * Allocate any pages +--- linux-2.4.22/kernel/ksyms.c Mon Nov 17 19:18:06 2003 ++++ linux/kernel/ksyms.c Tue Nov 18 13:22:57 2003 +@@ -114,6 +114,7 @@ + EXPORT_SYMBOL(__vmalloc); + EXPORT_SYMBOL(vmap); + EXPORT_SYMBOL(vmalloc_to_page); ++EXPORT_SYMBOL(vcalloc); + EXPORT_SYMBOL(mem_map); + EXPORT_SYMBOL(remap_page_range); + EXPORT_SYMBOL(max_mapnr); +--- linux-2.4.22/mm/vmalloc.c Mon Nov 17 19:18:07 2003 ++++ linux/mm/vmalloc.c Tue Nov 18 13:22:58 2003 +@@ -374,3 +374,22 @@ + read_unlock(&vmlist_lock); + return buf - buf_start; + } ++ ++void *vcalloc(unsigned long nmemb, unsigned long elem_size) ++{ ++ unsigned long size; ++ void *addr; ++ ++ /* ++ * Reject a zero elem_size (it is the divisor below) and size overflow. ++ */ ++ if (!elem_size || nmemb > (ULONG_MAX / elem_size)) ++ return NULL; ++ ++ size = nmemb * elem_size; ++ addr = vmalloc(size); ++ if (addr) ++ memset(addr, 0, size); ++ ++ return addr; ++} -- 2.43.5