From a329ea54f0981a5bfdf911718c0df76bd58e8cb3 Mon Sep 17 00:00:00 2001
From: Joe Thornber
Date: Thu, 14 Mar 2002 16:02:50 +0000
Subject: [PATCH] o split snapshot metadata handling into its own file.

o enable the inflight exceptions to use the same hash code as complete
  exceptions.
o factor common code from dm_do_snapshot, snapshot_map into new_exception.
o remove origin when there are no more snapshots against it.
o reformat to fit the coding style used in the rest of dm.
---
 kernel/common/dm-exception-store.c |  683 +++++++++++++
 kernel/common/dm-snapshot.c        | 1528 +++++++++-------------------
 kernel/common/dm-snapshot.h        |  135 +++
 3 files changed, 1299 insertions(+), 1047 deletions(-)
 create mode 100644 kernel/common/dm-exception-store.c
 create mode 100644 kernel/common/dm-snapshot.h

diff --git a/kernel/common/dm-exception-store.c b/kernel/common/dm-exception-store.c
new file mode 100644
index 0000000..a1b5d4f
--- /dev/null
+++ b/kernel/common/dm-exception-store.c
@@ -0,0 +1,683 @@
+/*
+ * dm-exception-store.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-snapshot.h"
+
+#if 0
+/*
+ * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
+ */
+#define SNAP_MAGIC 0x70416e53
+
+/*
+ * The on-disk version of the metadata. Only applicable to
+ * persistent snapshots.
+ * There is no backward or forward compatibility implemented; snapshots
+ * with different disk versions than the kernel will not be usable. It is
+ * expected that "lvcreate" will blank out the start of the COW device
+ * before calling the snapshot constructor.
+ */
+#define SNAPSHOT_DISK_VERSION 1
+
+/*
+ * Metadata format: (please keep this up-to-date!)
+ * Persistent snapshots have a 1 block header (see below for structure) at
+ * the very start of the device. The COW metadata starts at
+ * .start_of_exceptions.
+ *
+ * COW metadata is stored in blocks that are "extent-size" sectors long, as
+ * an array of disk_exception structures in Little-Endian format.
+ * The last entry in this array has rsector_new set to 0 (this cannot be a
+ * legal redirection as the header is here), and if rsector_org has a value
+ * it is the sector number of the next COW metadata sector on the disk. If
+ * rsector_org is also zero then this is the end of the COW metadata.
+ *
+ * The metadata is written in hardblocksize lumps rather than in units of
+ * extents for efficiency, so don't expect a whole extent to be zeroed out
+ * at any time.
+ *
+ * Non-persistent snapshots simply have redirected blocks stored
+ * (in chunk_size sectors) from hard block 1 to avoid inadvertently
+ * creating a bad header.
+ */
+
+/*
+ * Internal snapshot structure
+ */
+struct persistent_info {
+	/* Size of extents used for COW blocks */
+	long extent_size;
+
+	/* Number of the next free sector for COW/data */
+	unsigned long next_free_sector;
+
+	/* Where the metadata starts */
+	unsigned long start_of_exceptions;
+
+	/* Where we are currently writing the metadata */
+	unsigned long current_metadata_sector;
+
+	/* Index into disk_cow array */
+	int current_metadata_entry;
+
+	/* Index into mythical extent array */
+	int current_metadata_number;
+
+	/* Number of metadata entries in the disk_cow array */
+	int highest_metadata_entry;
+
+	/* Number of metadata entries per hard disk block */
+	int md_entries_per_block;
+
+	/* kiobuf for doing I/O to header & metadata */
+	struct kiobuf *cow_iobuf;
+
+	/*
+	 * Disk extent with COW data in it, as an array of
+	 * exception tables.
The first one points to the next + * block of metadata or 0 if this is the last + */ + struct disk_exception *disk_cow; +}; + +/* + * An array of these is held in each disk block. LE format + */ +struct disk_exception { + uint64_t rsector_org; + uint64_t rsector_new; +}; + +/* + * Structure of a (persistent) snapshot header on disk. in LE format + */ +struct snap_disk_header { + uint32_t magic; + + /* Simple, incrementing version. no backward compatibility */ + uint32_t version; + + /* In 512 byte sectors */ + uint32_t chunk_size; + + /* In 512 byte sectors */ + uint32_t extent_size; + uint64_t start_of_exceptions; + uint32_t full; +}; + +/* + * READ or WRITE some blocks to/from a device + */ +static int do_io(int rw, struct kiobuf *iobuf, kdev_t dev, + unsigned long start, int nr_sectors) +{ + int i, sectors_per_block, nr_blocks; + int blocksize = get_hardsect_size(dev); + int status; + + sectors_per_block = blocksize / SECTOR_SIZE; + + nr_blocks = nr_sectors / sectors_per_block; + start /= sectors_per_block; + + for (i = 0; i < nr_blocks; i++) + iobuf->blocks[i] = start++; + + iobuf->length = nr_sectors << 9; + + status = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, blocksize); + return (status != (nr_sectors << 9)); +} + +/* + * Write the latest COW metadata block. + */ +static int write_metadata(struct snapshot_c *s, struct persistent_info *pi) +{ + kdev_t dev = s->cow_dev->dev; + int blocksize = get_hardsect_size(dev); + int writesize = blocksize/SECTOR_SIZE; + + if (do_io(WRITE, pi->cow_iobuf, dev, + pi->current_metadata_sector, writesize) != 0) { + DMERR("Error writing COW block"); + return -1; + } + + return 0; +} + +/* + * Allocate a kiobuf. This is the only code nicked from the old + * snapshot driver and I've changed it anyway. + */ +static int alloc_iobuf_pages(struct kiobuf *iobuf, int nr_sectors) +{ + int nr_pages, err, i; + + if (nr_sectors > KIO_MAX_SECTORS) + return -1; + + nr_pages = nr_sectors / (PAGE_SIZE/SECTOR_SIZE); + err = expand_kiobuf(iobuf, nr_pages); + if (err) goto out; + + err = -ENOMEM; + iobuf->locked = 1; + iobuf->nr_pages = 0; + for (i = 0; i < nr_pages; i++) { + struct page * page; + + page = alloc_page(GFP_KERNEL); + if (!page) goto out; + + iobuf->maplist[i] = page; + LockPage(page); + iobuf->nr_pages++; + } + iobuf->offset = 0; + + err = 0; + +out: + return err; +} + +/* + * Read on-disk COW metadata and populate the hash table. + */ +static int read_metadata(struct snapshot_c *lc, struct persistent_info *pi) +{ + int status; + int i; + int entry = 0; + int map_page = 0; + int nr_sectors = pi->extent_size; + kdev_t dev = lc->cow_dev->dev; + int blocksize = get_hardsect_size(dev); + unsigned long cur_sector = pi->start_of_exceptions; + unsigned long last_sector; + unsigned long first_free_sector = 0; + int entries_per_page = PAGE_SIZE / sizeof(struct disk_exception); + struct disk_exception *cow_block; + struct kiobuf *read_iobuf; + int err = 0; + int devsize = get_dev_size(dev); + + /* + * Allocate our own iovec for this operation 'cos the + * others are way too small. 
+ */ + if (alloc_kiovec(1, &read_iobuf)) { + DMERR("Error allocating iobuf for %s", + kdevname(dev)); + return -1; + } + + if (alloc_iobuf_pages(read_iobuf, pi->extent_size)) { + DMERR("Error allocating iobuf space for %s", + kdevname(dev)); + free_kiovec(1, &read_iobuf); + return -1; + } + cow_block = page_address(read_iobuf->maplist[0]); + + do { + /* Make sure the chain does not go off the end of + * the device, or backwards */ + if (cur_sector > devsize || cur_sector < first_free_sector) { + DMERR("COW table chain pointers are inconsistent, " + "can't activate snapshot"); + err = -1; + goto ret_free; + } + + first_free_sector = max(first_free_sector, + cur_sector + pi->extent_size); + status = do_io(READ, read_iobuf, dev, + cur_sector, nr_sectors); + if (status == 0) { + + map_page = 0; + entry = 0; + + cow_block = page_address(read_iobuf->maplist[0]); + + /* Now populate the hash table from this data */ + for (i = 0; i <= pi->highest_metadata_entry && + cow_block[entry].rsector_new != 0; i++) { + + struct exception *ex; + + ex = add_exception(lc, + le64_to_cpu(cow_block[entry].rsector_org), + le64_to_cpu(cow_block[entry].rsector_new)); + + first_free_sector = max(first_free_sector, + (unsigned long)(le64_to_cpu(cow_block[entry].rsector_new) + + lc->chunk_size)); + + /* Do we need to move onto the next page? */ + if (++entry >= entries_per_page) { + entry = 0; + cow_block = page_address(read_iobuf->maplist[++map_page]); + } + } + } + else { + DMERR("Error reading COW metadata for %s", + kdevname(dev)); + err = -1; + goto ret_free; + } + last_sector = cur_sector; + cur_sector = le64_to_cpu(cow_block[entry].rsector_org); + + } while (cur_sector != 0); + + lc->persistent = 1; + pi->current_metadata_sector = last_sector + + map_page*PAGE_SIZE/SECTOR_SIZE + + entry/(SECTOR_SIZE/sizeof(struct disk_exception)); + pi->current_metadata_entry = entry; + pi->current_metadata_number = i; + pi->next_free_sector = first_free_sector; + + /* Copy last block into cow_iobuf */ + memcpy(pi->disk_cow, (char *)((long)&cow_block[entry] - ((long)&cow_block[entry] & (blocksize-1))), blocksize); + + ret_free: + unmap_kiobuf(read_iobuf); + free_kiovec(1, &read_iobuf); + + return err; +} + +/* + * Read the snapshot volume header, returns 0 only if it read OK + * and it was valid. returns 1 if no header was found, -1 on + * error. All fields are checked against the snapshot structure + * itself to make sure we don't corrupt the data. + */ +static int read_header(struct snapshot_c *lc, struct persistent_info *pi) +{ + int status; + struct snap_disk_header *header; + kdev_t dev = lc->cow_dev->dev; + int blocksize = get_hardsect_size(dev); + unsigned long devsize; + + /* Get it */ + status = do_io(READ, pi->cow_iobuf, dev, 0L, blocksize/SECTOR_SIZE); + if (status != 0) { + DMERR("Snapshot dev %s error reading header", + kdevname(dev)); + return -1; + } + + header = (struct snap_disk_header *) page_address(pi->cow_iobuf->maplist[0]); + + /* + * Check the magic. It's OK if this fails, we just create a new snapshot header + * and start from scratch + */ + if (le32_to_cpu(header->magic) != SNAP_MAGIC) { + return 1; + } + + /* Check the version matches */ + if (le32_to_cpu(header->version) != SNAPSHOT_DISK_VERSION) { + DMWARN("Snapshot dev %s version mismatch. Stored: %d, driver: %d", + kdevname(dev), le32_to_cpu(header->version), SNAPSHOT_DISK_VERSION); + return -1; + } + + /* Check the chunk sizes match */ + if (le32_to_cpu(header->chunk_size) != lc->chunk_size) { + DMWARN("Snapshot dev %s chunk size mismatch. 
Stored: %d, requested: %d", + kdevname(dev), le32_to_cpu(header->chunk_size), lc->chunk_size); + return -1; + } + + /* Check the extent sizes match */ + if (le32_to_cpu(header->extent_size) != pi->extent_size) { + DMWARN("Snapshot dev %s extent size mismatch. Stored: %d, requested: %ld", + kdevname(dev), le32_to_cpu(header->extent_size), pi->extent_size); + return -1; + } + + /* Get the rest of the data */ + pi->start_of_exceptions = le64_to_cpu(header->start_of_exceptions); + if (header->full) { + DMWARN("Snapshot dev %s is full. It cannot be used", kdevname(dev)); + lc->full = 1; + return -1; + } + + /* Validate against the size of the volume */ + devsize = get_dev_size(dev); + if (pi->start_of_exceptions > devsize) { + DMWARN("Snapshot metadata error on %s. start exceptions > device size (%ld > %ld)", + kdevname(dev), pi->start_of_exceptions, devsize); + return -1; + } + + /* Read metadata into the hash table and update pointers */ + return read_metadata(lc, &lc->p_info); +} + +/* + * Write (or update) the header. The only time we should need to + * do an update is when the snapshot becomes full. + */ +static int write_header(struct snapshot_c *lc, struct persistent_info *pi) +{ + struct snap_disk_header *header; + struct kiobuf *head_iobuf; + kdev_t dev = lc->cow_dev->dev; + int blocksize = get_hardsect_size(dev); + int status; + + /* + * Allocate our own iobuf for this so we don't corrupt + * any of the other writes that may be going on. + */ + if (alloc_kiovec(1, &head_iobuf)) { + DMERR("Error allocating iobuf for header on %s", kdevname(dev)); + return -1; + } + + if (alloc_iobuf_pages(head_iobuf, PAGE_SIZE/SECTOR_SIZE)) { + DMERR("Error allocating iobuf space for header on %s", kdevname(dev)); + free_kiovec(1, &head_iobuf); + return -1; + } + + header = (struct snap_disk_header *) page_address(head_iobuf->maplist[0]); + + header->magic = cpu_to_le32(SNAP_MAGIC); + header->version = cpu_to_le32(SNAPSHOT_DISK_VERSION); + header->chunk_size = cpu_to_le32(lc->chunk_size); + header->extent_size = cpu_to_le32(pi->extent_size); + header->full = cpu_to_le32(lc->full); + + header->start_of_exceptions = cpu_to_le64(pi->start_of_exceptions); + + /* Must write at least a full block */ + status = do_io(WRITE, head_iobuf, dev, 0, blocksize/SECTOR_SIZE); + + unmap_kiobuf(head_iobuf); + free_kiovec(1, &head_iobuf); + return status; +} + + +static int init_persistent_snapshot(struct snapshot_c *lc, int blocksize, + unsigned long extent_size, void **context) +{ + struct persistent_info *pi = &lc->p_info; + + int status; + int i; + int cow_sectors; + + pi->extent_size = extent_size; + pi->next_free_sector = blocksize / SECTOR_SIZE; /* Leave the first block alone */ + pi->disk_cow = NULL; + + pi->highest_metadata_entry = (pi->extent_size*SECTOR_SIZE) / sizeof(struct disk_exception) - 1; + pi->md_entries_per_block = blocksize / sizeof(struct disk_exception); + + /* Allocate and set up iobuf for metadata I/O */ + *context = "Unable to allocate COW iovec"; + if (alloc_kiovec(1, &pi->cow_iobuf)) + return -1; + + /* Allocate space for the COW buffer. It should be at least PAGE_SIZE. 
*/ + cow_sectors = blocksize/SECTOR_SIZE + PAGE_SIZE/SECTOR_SIZE; + *context = "Unable to allocate COW I/O buffer space"; + if (alloc_iobuf_pages(pi->cow_iobuf, cow_sectors)) { + free_kiovec(1, &pi->cow_iobuf); + return -1; + } + + for (i=0; i < pi->cow_iobuf->nr_pages; i++) { + memset(page_address(pi->cow_iobuf->maplist[i]), 0, PAGE_SIZE); + } + + pi->disk_cow = page_address(pi->cow_iobuf->maplist[0]); + + *context = "Error in disk header"; + /* Check for a header on disk and create a new one if not */ + if ( (status = read_header(lc, &lc->p_info)) == 1) { + + /* Write a new header */ + pi->start_of_exceptions = pi->next_free_sector; + pi->next_free_sector += pi->extent_size; + pi->current_metadata_sector = pi->start_of_exceptions; + pi->current_metadata_entry = 0; + pi->current_metadata_number = 0; + + *context = "Unable to write snapshot header"; + if (write_header(lc, &lc->p_info) != 0) { + DMERR("Error writing header to snapshot volume %s", + kdevname(lc->cow_dev->dev)); + goto free_ret; + } + + /* Write a blank metadata block to the device */ + if (write_metadata(lc, &lc->p_info) != 0) { + DMERR("Error writing initial COW table to snapshot volume %s", + kdevname(lc->cow_dev->dev)); + goto free_ret; + } + } + + /* + * There is a header but it doesn't match - fail so we + * don't destroy what might be useful data on disk. If + * the user really wants to use this COW device for a + * snapshot then the first sector should be zeroed out + * first. + */ + if (status == -1) + goto free_ret; + + return 0; + + free_ret: + unmap_kiobuf(pi->cow_iobuf); + free_kiovec(1, &pi->cow_iobuf); + return -1; +} + +static void exit_persistent_snapshot(struct persistent_info *pi) +{ + unmap_kiobuf(pi->cow_iobuf); + free_kiovec(1, &pi->cow_iobuf); +} + +/* + * Finds a suitable destination for the exception. + */ +static int prepare_exception(struct snapshot_c *s, + struct inflight_exception *e) +{ + offset_t dev_size; + + /* + * Check for full snapshot. Doing the size calculation here means that + * the COW device can be resized without us being told + */ + dev_size = get_dev_size(s->cow_dev->dev); + if (s->p_info.next_free_sector + s->chunk_size >= dev_size) { + /* Snapshot is full, we can't use it */ + DMWARN("Snapshot %s is full (sec=%ld, size=%ld)", + kdevname(s->cow_dev->dev), + s->p_info.next_free_sector + s->chunk_size, dev_size); + s->full = 1; + + /* Mark it full on the device */ + if (s->persistent) + write_header(s, &s->p_info); + + return -1; + + } else { + e->rsector_new = s->p_info.next_free_sector; + s->p_info.next_free_sector += s->chunk_size; + } + + return 0; +} + +/* + * Add a new exception entry to the on-disk metadata. + */ +static int commit_exception(struct snapshot_c *sc, + unsigned long org, unsigned long new) +{ + struct persistent_info *pi = &sc->p_info; + + int i = pi->current_metadata_entry++; + unsigned long next_md_block = pi->current_metadata_sector; + + pi->current_metadata_number++; + + /* Update copy of disk COW */ + pi->disk_cow[i].rsector_org = cpu_to_le64(org); + pi->disk_cow[i].rsector_new = cpu_to_le64(new); + + /* Have we filled this extent ? 
*/ + if (pi->current_metadata_number >= pi->highest_metadata_entry) { + /* Fill in pointer to next metadata extent */ + i++; + pi->current_metadata_entry++; + + next_md_block = pi->next_free_sector; + pi->next_free_sector += pi->extent_size; + + pi->disk_cow[i].rsector_org = cpu_to_le64(next_md_block); + pi->disk_cow[i].rsector_new = 0; + } + + /* Commit to disk */ + if (write_metadata(sc, &sc->p_info)) { + sc->full = 1; /* Failed. don't try again */ + return -1; + } + + /* + * Write a new (empty) metadata block if we are at the + * end of an existing block so that read_metadata finds a + * terminating zero entry. + */ + if (pi->current_metadata_entry == pi->md_entries_per_block) { + memset(pi->disk_cow, 0, PAGE_SIZE); + pi->current_metadata_sector = next_md_block; + + /* + * If this is also the end of an extent then go + * back to the start. + */ + if (pi->current_metadata_number >= pi->highest_metadata_entry) { + pi->current_metadata_number = 0; + + } else { + int blocksize = get_hardsect_size(sc->cow_dev->dev); + pi->current_metadata_sector += blocksize/SECTOR_SIZE; + } + + pi->current_metadata_entry = 0; + if (write_metadata(sc, &sc->p_info) != 0) { + sc->full = 1; + return -1; + } + } + return 0; +} + +/* + * Sets the full flag in the metadata. A quick hack for now. + */ +static void invalidate_snapshot(struct snapshot_c *s) +{ + s->full = 1; + if (s->persistent) + write_header(s, &s->p_info); +} + + +#endif + + +struct exception_store * dm_create_persistent(struct dm_snapshot *s, + int blocksize, + offset_t extent_size, + void **error) +{ + return NULL; +} + + +/* + * Implementation of the store for non-persistent snapshots. + */ +struct transient_c { + offset_t next_free; +}; + +void destroy_transient(struct exception_store *store) +{ + kfree(store->context); + kfree(store); +} + +int prepare_transient(struct exception_store *store, struct exception *e) +{ + struct transient_c *tc = (struct transient_c *) store->context; + offset_t size = get_dev_size(store->snap->cow->dev); + + if (size < (tc->next_free + store->snap->chunk_size)) + return -1; + + e->new_chunk = sector_to_chunk(store->snap, tc->next_free); + tc->next_free += store->snap->chunk_size; + return 0; +} + +struct exception_store *dm_create_transient(struct dm_snapshot *s, + int blocksize, void **error) +{ + struct exception_store *store; + struct transient_c *tc; + + store = kmalloc(sizeof(*store), GFP_KERNEL); + if (!store) { + DMWARN("out of memory."); + return NULL; + } + + memset(store, 0, sizeof(*store)); + store->destroy = destroy_transient; + store->prepare_exception = prepare_transient; + store->snap = s; + + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); + if (!tc) { + kfree(store); + return NULL; + } + + tc->next_free = 0; + store->context = tc; + + return store; +} + diff --git a/kernel/common/dm-snapshot.c b/kernel/common/dm-snapshot.c index 132cb58..6929732 100644 --- a/kernel/common/dm-snapshot.c +++ b/kernel/common/dm-snapshot.c @@ -15,12 +15,7 @@ #include #include -#include "dm.h" - -/* - * Magic for persistent snapshots: "SnAp" - Feeble isn't it. - */ -#define SNAP_MAGIC 0x70416e53 +#include "dm-snapshot.h" /* * Hard sector size used all over the kernel @@ -32,518 +27,299 @@ */ #define SNAPSHOT_COPY_PRIORITY 2 -/* - * The on-disk version of the metadata. Only applicable to - * persistent snapshots. - * There is no backward or forward compatibility implemented, snapshots - * with different disk versions than the kernel will not be usable. 
It is - * expected that "lvcreate" will blank out the start of the COW device - * before calling the snapshot constructor. - */ -#define SNAPSHOT_DISK_VERSION 1 - -/* - * Metadata format: (please keep this up-to-date!) - * Persistent snapshots have a 1 block header (see below for structure) at - * the very start of the device. The COW metadata starts at - * .start_of_exceptions. - * - * COW metadata is stored in blocks that are "extent-size" sectors long as - * an array of disk_exception structures in Little-Endian format. - * The last entry in this array has rsector_new set to 0 (this cannot be a - * legal redirection as the header is here) and if rsector_org has a value - * it is the sector number of the next COW metadata sector on the disk. if - * rsector_org is also zero then this is the end of the COW metadata. - * - * The metadata is written in hardblocksize lumps rather than in units of - * extents for efficiency so don't expect a whole extent to be zeroed out - * at any time. - * - * Non-persistent snapshots simple have redirected blocks stored - * (in chunk_size sectors) from hard block 1 to avoid inadvertantly - * creating a bad header. - */ - -/* - * Internal snapshot structure - */ -struct snapshot_c { - /* Original device (s/b a snapshot-origin) */ - struct dm_dev *origin_dev; - - /* Device holding COW data */ - struct dm_dev *cow_dev; - - /* List of snapshots per Origin */ - struct list_head list; - - /* Size of data blocks saved - must be a power of 2 */ - unsigned int chunk_size; - - /* Chunk size-1 for & operations */ - unsigned int chunk_size_mask; - - /* Power of 2 that chunk_size is */ - unsigned int chunk_size_shift; - - /* Size of extents used for COW blocks */ - long extent_size; - - /* 1 if snapshot is full (and therefore unusable) */ - int full; - - /* 1 if snapshot is is persistent (save metadata to disk) */ - int persistent; - - /* Number of the next free sector for COW/data */ - unsigned long next_free_sector; - - /* Where the metadata starts */ - unsigned long start_of_exceptions; - - /* Where we are currently writing the metadata */ - unsigned long current_metadata_sector; - - /* Index into disk_cow array */ - int current_metadata_entry; - - /* Index into mythical extent array */ - int current_metadata_number; - - /* Number of metadata entries in the disk_cow array */ - int highest_metadata_entry; - - /* Number of metadata entries per hard disk block */ - int md_entries_per_block; - - /* kiobuf for doing I/O to header & metadata */ - struct kiobuf *cow_iobuf; - - /* Hash table for looking up COW data */ - struct list_head *hash_table; - - /* Hash table for looking up inflight COW operations */ - struct list_head *inflight_hash_table; - - /* To serialise access to the metadata */ - struct rw_semaphore lock; - - /* To help with calculating the hash function */ - uint32_t hash_mask; - uint32_t inflight_hash_mask; - uint32_t hash_size; - - /* - * Disk extent with COW data in it. as an array of - * exception tables. 
The first one points to the next - * block of metadata or 0 if this is the last - */ - struct disk_exception *disk_cow; -}; - -/* - * Exception in memory - */ -struct exception { - /* List of exceptions in this bucket */ - struct list_head list; - uint32_t rsector_org; - uint32_t rsector_new; -}; - -/* - * Inflight COW exception in memory - */ -struct inflight_exception { - /* List of inflight exceptions in this bucket */ - struct list_head list; - uint32_t rsector_org; - uint32_t rsector_new; +struct pending_exception { + struct exception e; /* Chain of WRITE buffer heads to submit when this COW has completed */ struct buffer_head *bh; /* Pointer back to snapshot context */ - struct snapshot_c *snap; + struct dm_snapshot *snap; }; /* - * An array of these is held in each disk block. LE format + * Hash table mapping origin volumes to lists of snapshots and + * a lock to protect it */ -struct disk_exception { - uint64_t rsector_org; - uint64_t rsector_new; -}; +static kmem_cache_t *exception_cachep; +static kmem_cache_t *pending_cachep; /* - * Structure of a (persistent) snapshot header on disk. in LE format + * One of these per registered origin, held in the snapshot_origins hash */ -struct snap_disk_header { - uint32_t magic; - - /* Simple, incrementing version. no backward compatibility */ - uint32_t version; +struct origin { + /* The origin device */ + kdev_t dev; - /* In 512 byte sectors */ - uint32_t chunk_size; + struct list_head hash_list; - /* In 512 byte sectors */ - uint32_t extent_size; - uint64_t start_of_exceptions; - uint32_t full; + /* List of snapshots for this origin */ + struct list_head snapshots; }; -static int write_metadata(struct snapshot_c *lc); -static int write_header(struct snapshot_c *lc); /* * Size of the hash table for origin volumes. 
If we make this * the size of the minors list then it should be nearly perfect */ #define ORIGIN_HASH_SIZE 256 #define ORIGIN_MASK 0xFF +static struct list_head *_origins; +static struct rw_semaphore _origins_lock; -/* - * Hash table mapping origin volumes to lists of snapshots and - * a lock to protect it - */ -static struct list_head *snapshot_origins = NULL; -static struct rw_semaphore origin_hash_lock; -static kmem_cache_t *exception_cachep; -static kmem_cache_t *inflight_cachep; -/* - * Hash functions - */ -static inline unsigned int origin_hash(kdev_t dev) +static int init_origin_hash(void) { - return MINOR(dev) & ORIGIN_MASK; + int i; + + _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), + GFP_KERNEL); + if (!_origins) { + DMERR("Device mapper: Snapshot: unable to allocate memory"); + return -ENOMEM; + } + + for (i = 0; i < ORIGIN_HASH_SIZE; i++) + INIT_LIST_HEAD(_origins + i); + init_rwsem(&_origins_lock); + + return 0; } -static inline uint32_t exception_hash(offset_t sector, struct snapshot_c *e) +static void exit_origin_hash(void) { - unsigned int chunk = (sector & ~e->chunk_size_mask) >> e->chunk_size_shift; - return chunk & e->hash_mask; + kfree(_origins); } -static inline uint32_t inflight_exception_hash(offset_t sector, struct snapshot_c *e) +static inline unsigned int origin_hash(kdev_t dev) { - unsigned int chunk = (sector & ~e->chunk_size_mask) >> e->chunk_size_shift; - return chunk & e->inflight_hash_mask; + return MINOR(dev) & ORIGIN_MASK; } -/* - * One of these per registered origin, held in the snapshot_origins hash - */ -struct origin_list +static struct origin *__lookup_origin(kdev_t origin) { - /* The origin device */ - kdev_t origin_dev; + struct list_head *slist; + struct list_head *ol; + struct origin *o; - /* List pointers for this list */ - struct list_head list; + ol = &_origins[origin_hash(origin)]; + list_for_each(slist, ol) { + o = list_entry(slist, struct origin, hash_list); - /* List of snapshots for this origin */ - struct list_head snap_list; -}; - - -/* - * Return the number of sectors in the device - */ -static inline int get_dev_size(kdev_t dev) -{ - int *sizes; + if (o->dev == origin) + return o; + } - sizes = blk_size[MAJOR(dev)]; - if (sizes) - return sizes[MINOR(dev)]<<1; - else - return 0; + return NULL; } -/* - * Return the list of snapshots for a given origin device. The - * origin_hash_lock must be held when calling this. - */ -static struct origin_list *__lookup_snapshot_list(kdev_t origin) +static void __insert_origin(struct origin *o) { - struct list_head *slist; - struct list_head *snapshot_list; - - snapshot_list = &snapshot_origins[origin_hash(origin)]; - list_for_each(slist, snapshot_list) { - struct origin_list *ol; - ol = list_entry(slist, struct origin_list, list); - - if (ol->origin_dev == origin) { - return ol; - } - } - return NULL; + struct list_head *sl = &_origins[origin_hash(o->dev)]; + list_add_tail(&o->hash_list, sl); } /* - * Add a new exception entry to the on-disk metadata. + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. 
*/ -static int update_metadata_block(struct snapshot_c *sc, unsigned long org, unsigned long new) +static int register_snapshot(struct dm_snapshot *snap) { - int i = sc->current_metadata_entry++; - unsigned long next_md_block = sc->current_metadata_sector; - - sc->current_metadata_number++; + struct origin *o; + kdev_t dev = snap->origin->dev; - /* Update copy of disk COW */ - sc->disk_cow[i].rsector_org = cpu_to_le64(org); - sc->disk_cow[i].rsector_new = cpu_to_le64(new); + down_write(&_origins_lock); + o = __lookup_origin(dev); - /* Have we filled this extent ? */ - if (sc->current_metadata_number >= sc->highest_metadata_entry) { - /* Fill in pointer to next metadata extent */ - i++; - sc->current_metadata_entry++; - - next_md_block = sc->next_free_sector; - sc->next_free_sector += sc->extent_size; + if (!o) { + /* New origin */ + o = kmalloc(sizeof(*o), GFP_KERNEL); + if (!o) { + up_write(&_origins_lock); + return -ENOMEM; + } - sc->disk_cow[i].rsector_org = cpu_to_le64(next_md_block); - sc->disk_cow[i].rsector_new = 0; - } + /* Initialise the struct */ + INIT_LIST_HEAD(&o->snapshots); + o->dev = dev; - /* Commit to disk */ - if (write_metadata(sc)) { - sc->full = 1; /* Failed. don't try again */ - return -1; + __insert_origin(o); } - /* - * Write a new (empty) metadata block if we are at the - * end of an existing block so that read_metadata finds a - * terminating zero entry. - */ - if (sc->current_metadata_entry == sc->md_entries_per_block) { - memset(sc->disk_cow, 0, PAGE_SIZE); - sc->current_metadata_sector = next_md_block; + list_add_tail(&snap->list, &o->snapshots); - /* - * If this is also the end of an extent then go - * back to the start. - */ - if (sc->current_metadata_number >= sc->highest_metadata_entry) { - sc->current_metadata_number = 0; - } - else { - int blocksize = get_hardsect_size(sc->cow_dev->dev); - sc->current_metadata_sector += blocksize/SECTOR_SIZE; - } - - sc->current_metadata_entry = 0; - if (write_metadata(sc) != 0) { - sc->full = 1; - return -1; - } - } + up_write(&_origins_lock); return 0; } - -/* - * Add a new exception to the list - */ -static struct exception *add_exception(struct snapshot_c *sc, unsigned long org, unsigned long new) +static void unregister_snapshot(struct dm_snapshot *s) { - struct list_head *l = &sc->hash_table[exception_hash(org, sc)]; - struct exception *new_ex; + struct origin *o; - new_ex = kmem_cache_alloc(exception_cachep, GFP_NOIO); - if (!new_ex) return NULL; + down_write(&_origins_lock); + o = __lookup_origin(s->origin->dev); - new_ex->rsector_org = org; - new_ex->rsector_new = new; - - list_add(&new_ex->list, l); + list_del(&s->list); + if (list_empty(&o->snapshots)) { + list_del(&o->hash_list); + kfree(o); + } - return new_ex; + up_write(&_origins_lock); } /* - * Called when the copy I/O has finished + * Implementation of the exception hash tables. 
*/ -static void copy_callback(copy_cb_reason_t reason, void *context, long arg) +static int init_exception_table(struct exception_table *et, uint32_t size) { - struct inflight_exception *iex = (struct inflight_exception *)context; + int i; - if (reason == COPY_CB_COMPLETE) { - struct buffer_head *bh; + et->hash_mask = size - 1; + et->table = vmalloc(sizeof(struct list_head) * (size)); + if (!et->table) + return -ENOMEM; - /* Update the metadata if we are persistent */ - if (iex->snap->persistent) - update_metadata_block(iex->snap, iex->rsector_org, iex->rsector_new); + for (i = 0; i < size; i++) + INIT_LIST_HEAD(et->table + i); - /* Add a proper exception, - and remove the inflight exception from the list */ - down_write(&iex->snap->lock); + return 0; +} - add_exception(iex->snap, iex->rsector_org, iex->rsector_new); - list_del(&iex->list); +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +{ + struct list_head *slot, *entry, *temp; + struct exception *ex; + int i, size; - /* Submit any pending write BHs */ - bh = iex->bh; - iex->bh = NULL; - up_write(&iex->snap->lock); - kmem_cache_free(inflight_cachep, iex); + size = et->hash_mask + 1; + for (i = 0; i < size; i++) { + slot = et->table + i; - while (bh) { - struct buffer_head *nextbh = bh->b_reqnext; - bh->b_reqnext = NULL; - generic_make_request(WRITE, bh); - bh = nextbh; + list_for_each_safe(entry, temp, slot) { + ex = list_entry(entry, struct exception, hash_list); + kmem_cache_free(mem, ex); } - } - /* Read/write error - snapshot is unusable */ - if (reason == COPY_CB_FAILED_WRITE || reason == COPY_CB_FAILED_READ) { - DMERR("Error reading/writing snapshot"); - iex->snap->full = 1; - if (iex->snap->persistent) - write_header(iex->snap); - list_del(&iex->list); - kmem_cache_free(inflight_cachep, iex); - } + vfree(et->table); } /* - * Make a note of the snapshot and its origin so we can look it - * up when the origin has a write on it. + * FIXME: check how this hash fn is performing. */ -static int register_snapshot(kdev_t origin_dev, struct snapshot_c *snap) +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) { - struct origin_list *ol; - - down_write(&origin_hash_lock); - ol = __lookup_snapshot_list(origin_dev); - - if (!ol) { - struct list_head *snapshot_list; - - /* New origin */ - ol = kmalloc(sizeof(*ol), GFP_KERNEL); - if (!ol) { - up_write(&origin_hash_lock); - return 0; - } - - /* Add this snapshot to the origin's list of snapshots */ - INIT_LIST_HEAD(&ol->snap_list); - - /* Initialise the struct */ - ol->origin_dev = origin_dev; - - /* Add this origin to the hash table */ - snapshot_list = &snapshot_origins[origin_hash(origin_dev)]; - list_add_tail(&ol->list, snapshot_list); - } + return chunk & et->hash_mask; +} - list_add_tail(&snap->list, &ol->snap_list); +static void insert_exception(struct exception_table *eh, struct exception *e) +{ + struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; + list_add(&e->hash_list, l); +} - up_write(&origin_hash_lock); - return 1; +static inline void remove_exception(struct exception *e) +{ + list_del(&e->hash_list); } /* * Return the exception data for a sector, or NULL if not * remapped. 
*/ -static struct exception *find_exception(struct snapshot_c *sc, uint32_t b_rsector) +static struct exception *lookup_exception(struct exception_table *et, + chunk_t chunk) { - struct list_head *l = &sc->hash_table[exception_hash(b_rsector, sc)]; - struct list_head *slist; + struct list_head *slot, *el; + struct exception *e; - list_for_each(slist, l) { - struct exception *et = list_entry(slist, struct exception, list); - - if (et->rsector_org == b_rsector - (b_rsector & sc->chunk_size_mask)) { - return et; - } + slot = &et->table[exception_hash(et, chunk)]; + list_for_each(el, slot) { + e = list_entry(el, struct exception, hash_list); + if (e->old_chunk == chunk) + return e; } + return NULL; } -/* - * Return the inflight exception data for a sector, or NULL if none active - */ -static struct inflight_exception *find_inflight_exception(struct snapshot_c *sc, uint32_t b_rsector) +static inline struct exception *alloc_exception(void) { - struct list_head *l = &sc->inflight_hash_table[inflight_exception_hash(b_rsector, sc)]; - struct list_head *slist; - - list_for_each(slist, l) { - struct inflight_exception *et = list_entry(slist, struct inflight_exception, list); - if (et->rsector_org == b_rsector - (b_rsector & sc->chunk_size_mask)) { - return et; - } - } - return NULL; + return kmem_cache_alloc(exception_cachep, GFP_NOIO); } -/* - * Add a new inflight exception to the list - */ -static struct inflight_exception *add_inflight_exception(struct snapshot_c *sc, unsigned long org, unsigned long new) +static inline struct pending_exception *alloc_pending_exception(void) { - struct list_head *l = &sc->inflight_hash_table[inflight_exception_hash(org, sc)]; - struct inflight_exception *new_ex; - - new_ex = kmem_cache_alloc(inflight_cachep, GFP_NOIO); - if (!new_ex) return NULL; - - new_ex->rsector_org = org; - new_ex->rsector_new = new; - new_ex->bh = NULL; - new_ex->snap = sc; + return kmem_cache_alloc(pending_cachep, GFP_NOIO); +} - list_add(&new_ex->list, l); +static inline void free_exception(struct exception *e) +{ + kmem_cache_free(exception_cachep, e); +} - return new_ex; +static inline void free_pending_exception(struct pending_exception *pe) +{ + kmem_cache_free(pending_cachep, pe); } /* - * Allocate a kiobuf. This is the only code nicked from the old - * snapshot driver and I've changed it anyway. + * Called when the copy I/O has finished */ -static int alloc_iobuf_pages(struct kiobuf *iobuf, int nr_sectors) +static void copy_callback(copy_cb_reason_t reason, void *context, long arg) { - int nr_pages, err, i; + struct pending_exception *pe = (struct pending_exception *) context; + struct dm_snapshot *s = pe->snap; + struct exception *e; - if (nr_sectors > KIO_MAX_SECTORS) - return -1; + if (reason == COPY_CB_COMPLETE) { + struct buffer_head *bh; - nr_pages = nr_sectors / (PAGE_SIZE/SECTOR_SIZE); - err = expand_kiobuf(iobuf, nr_pages); - if (err) goto out; + /* Update the metadata if we are persistent */ + if (s->store->commit_exception) + s->store->commit_exception(s->store, &pe->e); - err = -ENOMEM; - iobuf->locked = 1; - iobuf->nr_pages = 0; - for (i = 0; i < nr_pages; i++) { - struct page * page; + e = alloc_exception(); + if (!e) { + /* FIXME: what do we do now ? 
*/ + return; + } + + /* Add a proper exception, + and remove the inflight exception from the list */ + down_write(&pe->snap->lock); + + memcpy(e, &pe->e, sizeof(*e)); + insert_exception(&s->complete, e); + remove_exception(&pe->e); + + /* Submit any pending write BHs */ + bh = pe->bh; + pe->bh = NULL; + up_write(&pe->snap->lock); - page = alloc_page(GFP_KERNEL); - if (!page) goto out; + kmem_cache_free(pending_cachep, pe); - iobuf->maplist[i] = page; - LockPage(page); - iobuf->nr_pages++; + while (bh) { + struct buffer_head *nextbh = bh->b_reqnext; + bh->b_reqnext = NULL; + generic_make_request(WRITE, bh); + bh = nextbh; + } } - iobuf->offset = 0; - err = 0; + /* Read/write error - snapshot is unusable */ + if (reason == COPY_CB_FAILED_WRITE || reason == COPY_CB_FAILED_READ) { + DMERR("Error reading/writing snapshot"); -out: - return err; + if (pe->snap->store->drop_snapshot) + pe->snap->store->drop_snapshot(pe->snap->store); + remove_exception(&pe->e); + kmem_cache_free(pending_cachep, pe); + } } /* - * ...OK there's this too. + * Hard coded magic. */ static int calc_max_buckets(void) { @@ -556,414 +332,63 @@ static int calc_max_buckets(void) return mem; } +/* + * Rounds a number down to a power of 2. + */ +static inline uint32_t round_down(uint32_t n) +{ + while (n & (n - 1)) + n &= (n - 1); + return n; +} + /* * Allocate room for a suitable hash table. */ -static int alloc_hash_table(struct snapshot_c *sc) +static int init_hash_tables(struct dm_snapshot *s) { - int i; - int hash_size; - unsigned long cow_dev_size; - unsigned long origin_dev_size; - int max_buckets; + offset_t hash_size, cow_dev_size, origin_dev_size, max_buckets; - /* + /* * Calculate based on the size of the original volume or * the COW volume... */ - cow_dev_size = get_dev_size(sc->cow_dev->dev); - origin_dev_size = get_dev_size(sc->origin_dev->dev); + cow_dev_size = get_dev_size(s->cow->dev); + origin_dev_size = get_dev_size(s->origin->dev); max_buckets = calc_max_buckets(); - hash_size = min(origin_dev_size, cow_dev_size) / sc->chunk_size; + hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size; hash_size = min(hash_size, max_buckets); /* Round it down to a power of 2 */ - while (hash_size & (hash_size-1)) - hash_size &= (hash_size-1); - - sc->hash_mask = hash_size-1; - sc->hash_size = hash_size; - sc->hash_table = vmalloc(sizeof(struct list_head) * (hash_size)); - if (!sc->hash_table) return -1; - - for (i=0; ihash_table + i); + hash_size = round_down(hash_size); + if (init_exception_table(&s->complete, hash_size)) + return -ENOMEM; /* * Allocate hash table for in-flight exceptions * Make this smaller than the real hash table */ - hash_size >>= 1; - sc->inflight_hash_mask = sc->hash_mask >> 1; - - sc->inflight_hash_table = vmalloc(sizeof(struct list_head) * (hash_size)); - if (!sc->inflight_hash_table) return -1; - for (i=0; iinflight_hash_table + i); + hash_size >>= 3; + if (!hash_size) + hash_size = 64; - return 0; -} - - -/* - * READ or WRITE some blocks to/from a device - */ -static int do_io(int rw, struct kiobuf *iobuf, kdev_t dev, unsigned long start, int nr_sectors) -{ - int i, sectors_per_block, nr_blocks; - int blocksize = get_hardsect_size(dev); - int status; - - sectors_per_block = blocksize / SECTOR_SIZE; - - nr_blocks = nr_sectors / sectors_per_block; - start /= sectors_per_block; - - for (i = 0; i < nr_blocks; i++) - iobuf->blocks[i] = start++; - - iobuf->length = nr_sectors << 9; - - status = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, blocksize); - return (status != (nr_sectors << 9)); 
-} - -/* - * Free all the allocated exception structures. - */ -static void free_exception_table(struct snapshot_c *lc) -{ - int i; - - for (i=0; i < lc->hash_size; i++) { - struct list_head *l = &lc->hash_table[i]; - struct list_head *entry, *temp; - - if (l) { - list_for_each_safe(entry, temp, l) { - struct exception *ex; - ex = list_entry(entry, struct exception, list); - list_del(&ex->list); - kmem_cache_free(exception_cachep, ex); - } - } - } -} - -/* - * Read on-disk COW metadata and populate the hash table. - */ -static int read_metadata(struct snapshot_c *lc) -{ - int status; - int i; - int entry = 0; - int map_page = 0; - int nr_sectors = lc->extent_size; - int blocksize = get_hardsect_size(lc->cow_dev->dev); - unsigned long cur_sector = lc->start_of_exceptions; - unsigned long last_sector; - unsigned long first_free_sector = 0; - int entries_per_page = PAGE_SIZE / sizeof(struct disk_exception); - struct disk_exception *cow_block; - struct kiobuf *read_iobuf; - int err = 0; - int devsize = get_dev_size(lc->cow_dev->dev); - - /* - * Allocate our own iovec for this operation 'cos the - * others are way too small. - */ - if (alloc_kiovec(1, &read_iobuf)) { - DMERR("Error allocating iobuf for %s", kdevname(lc->cow_dev->dev)); - return -1; - } - - if (alloc_iobuf_pages(read_iobuf, lc->extent_size)) { - DMERR("Error allocating iobuf space for %s", kdevname(lc->cow_dev->dev)); - free_kiovec(1, &read_iobuf); - return -1; - } - cow_block = page_address(read_iobuf->maplist[0]); - - do - { - /* Make sure the chain does not go off the end of the device, or backwards */ - if (cur_sector > devsize || cur_sector < first_free_sector) { - DMERR("COW table chain pointers are inconsistent, can't activate snapshot"); - err = -1; - goto ret_free; - } - - first_free_sector = max(first_free_sector, cur_sector+lc->extent_size); - status = do_io(READ, read_iobuf, lc->cow_dev->dev, cur_sector, nr_sectors); - if (status == 0) { - - map_page = 0; - entry = 0; - - cow_block = page_address(read_iobuf->maplist[0]); - - /* Now populate the hash table from this data */ - for (i=0; i <= lc->highest_metadata_entry && - cow_block[entry].rsector_new != 0; i++) { - - struct exception *ex; - - ex = add_exception(lc, - le64_to_cpu(cow_block[entry].rsector_org), - le64_to_cpu(cow_block[entry].rsector_new)); - - first_free_sector = max(first_free_sector, - (unsigned long)(le64_to_cpu(cow_block[entry].rsector_new) + - lc->chunk_size)); - - /* Do we need to move onto the next page? */ - if (++entry >= entries_per_page) { - entry = 0; - cow_block = page_address(read_iobuf->maplist[++map_page]); - } - } - } - else { - DMERR("Error reading COW metadata for %s", kdevname(lc->cow_dev->dev)); - err = -1; - goto ret_free; - } - last_sector = cur_sector; - cur_sector = le64_to_cpu(cow_block[entry].rsector_org); - - } while (cur_sector != 0); - - lc->persistent = 1; - lc->current_metadata_sector = last_sector + - map_page*PAGE_SIZE/SECTOR_SIZE + - entry/(SECTOR_SIZE/sizeof(struct disk_exception)); - lc->current_metadata_entry = entry; - lc->current_metadata_number = i; - lc->next_free_sector = first_free_sector; - - /* Copy last block into cow_iobuf */ - memcpy(lc->disk_cow, (char *)((long)&cow_block[entry] - ((long)&cow_block[entry] & (blocksize-1))), blocksize); - - ret_free: - unmap_kiobuf(read_iobuf); - free_kiovec(1, &read_iobuf); - - return err; -} - -/* - * Read the snapshot volume header, returns 0 only if it read OK - * and it was valid. returns 1 if no header was found, -1 on - * error. 
All fields are checked against the snapshot structure - * itself to make sure we don't corrupt the data. - */ -static int read_header(struct snapshot_c *lc) -{ - int status; - struct snap_disk_header *header; - int blocksize = get_hardsect_size(lc->cow_dev->dev); - unsigned long devsize; - - /* Get it */ - status = do_io(READ, lc->cow_iobuf, lc->cow_dev->dev, 0L, blocksize/SECTOR_SIZE); - if (status != 0) { - DMERR("Snapshot dev %s error reading header", kdevname(lc->cow_dev->dev)); - return -1; + if (init_exception_table(&s->pending, hash_size)) { + exit_exception_table(&s->complete, exception_cachep); + return -ENOMEM; } - header = (struct snap_disk_header *)page_address(lc->cow_iobuf->maplist[0]); - - /* - * Check the magic. It's OK if this fails, we just create a new snapshot header - * and start from scratch - */ - if (le32_to_cpu(header->magic) != SNAP_MAGIC) { - return 1; - } - - /* Check the version matches */ - if (le32_to_cpu(header->version) != SNAPSHOT_DISK_VERSION) { - DMWARN("Snapshot dev %s version mismatch. Stored: %d, driver: %d", - kdevname(lc->cow_dev->dev), le32_to_cpu(header->version), SNAPSHOT_DISK_VERSION); - return -1; - } - - /* Check the chunk sizes match */ - if (le32_to_cpu(header->chunk_size) != lc->chunk_size) { - DMWARN("Snapshot dev %s chunk size mismatch. Stored: %d, requested: %d", - kdevname(lc->cow_dev->dev), le32_to_cpu(header->chunk_size), lc->chunk_size); - return -1; - } - - /* Check the extent sizes match */ - if (le32_to_cpu(header->extent_size) != lc->extent_size) { - DMWARN("Snapshot dev %s extent size mismatch. Stored: %d, requested: %ld", - kdevname(lc->cow_dev->dev), le32_to_cpu(header->extent_size), lc->extent_size); - return -1; - } - - /* Get the rest of the data */ - lc->start_of_exceptions = le64_to_cpu(header->start_of_exceptions); - if (header->full) { - DMWARN("Snapshot dev %s is full. It cannot be used", kdevname(lc->cow_dev->dev)); - lc->full = 1; - return -1; - } - - /* Validate against the size of the volume */ - devsize = get_dev_size(lc->cow_dev->dev); - if (lc->start_of_exceptions > devsize) { - DMWARN("Snapshot metadata error on %s. start exceptions > device size (%ld > %ld)", - kdevname(lc->cow_dev->dev), lc->start_of_exceptions, devsize); - return -1; - } - - /* Read metadata into the hash table and update pointers */ - return read_metadata(lc); -} - -/* - * Write (or update) the header. The only time we should need to - * do an update is when the snapshot becomes full. - */ -static int write_header(struct snapshot_c *lc) -{ - struct snap_disk_header *header; - struct kiobuf *head_iobuf; - int blocksize = get_hardsect_size(lc->cow_dev->dev); - int status; - - /* - * Allocate our own iobuf for this so we don't corrupt - * any of the other writes that may be going on. 
- */ - if (alloc_kiovec(1, &head_iobuf)) { - DMERR("Error allocating iobuf for header on %s", kdevname(lc->cow_dev->dev)); - return -1; - } - - if (alloc_iobuf_pages(head_iobuf, PAGE_SIZE/SECTOR_SIZE)) { - DMERR("Error allocating iobuf space for header on %s", kdevname(lc->cow_dev->dev)); - free_kiovec(1, &head_iobuf); - return -1; - } - - header = (struct snap_disk_header *)page_address(head_iobuf->maplist[0]); - - header->magic = cpu_to_le32(SNAP_MAGIC); - header->version = cpu_to_le32(SNAPSHOT_DISK_VERSION); - header->chunk_size = cpu_to_le32(lc->chunk_size); - header->extent_size = cpu_to_le32(lc->extent_size); - header->full = cpu_to_le32(lc->full); - - header->start_of_exceptions = cpu_to_le64(lc->start_of_exceptions); - - /* Must write at least a full block */ - status = do_io(WRITE, head_iobuf, lc->cow_dev->dev, 0, blocksize/SECTOR_SIZE); - - unmap_kiobuf(head_iobuf); - free_kiovec(1, &head_iobuf); - return status; -} - - -/* - * Write the latest COW metadata block. - */ -static int write_metadata(struct snapshot_c *lc) -{ - int blocksize = get_hardsect_size(lc->cow_dev->dev); - int writesize = blocksize/SECTOR_SIZE; - - if (do_io(WRITE, lc->cow_iobuf, lc->cow_dev->dev, lc->current_metadata_sector, writesize) != 0) { - DMERR("Error writing COW block"); - return -1; - } return 0; } -static int setup_persistent_snapshot(struct snapshot_c *lc, int blocksize, void **context) -{ - int status; - int i; - int cow_sectors; - - lc->highest_metadata_entry = (lc->extent_size*SECTOR_SIZE) / sizeof(struct disk_exception) - 1; - lc->md_entries_per_block = blocksize / sizeof(struct disk_exception); - - /* Allocate and set up iobuf for metadata I/O */ - *context = "Unable to allocate COW iovec"; - if (alloc_kiovec(1, &lc->cow_iobuf)) - return -1; - - /* Allocate space for the COW buffer. It should be at least PAGE_SIZE. */ - cow_sectors = blocksize/SECTOR_SIZE + PAGE_SIZE/SECTOR_SIZE; - *context = "Unable to allocate COW I/O buffer space"; - if (alloc_iobuf_pages(lc->cow_iobuf, cow_sectors)) { - free_kiovec(1, &lc->cow_iobuf); - return -1; - } - - for (i=0; i < lc->cow_iobuf->nr_pages; i++) { - memset(page_address(lc->cow_iobuf->maplist[i]), 0, PAGE_SIZE); - } - - lc->disk_cow = page_address(lc->cow_iobuf->maplist[0]); - - *context = "Error in disk header"; - /* Check for a header on disk and create a new one if not */ - if ( (status = read_header(lc)) == 1) { - - /* Write a new header */ - lc->start_of_exceptions = lc->next_free_sector; - lc->next_free_sector += lc->extent_size; - lc->current_metadata_sector = lc->start_of_exceptions; - lc->current_metadata_entry = 0; - lc->current_metadata_number = 0; - - *context = "Unable to write snapshot header"; - if (write_header(lc) != 0) { - DMERR("Error writing header to snapshot volume %s", - kdevname(lc->cow_dev->dev)); - goto free_ret; - } - - /* Write a blank metadata block to the device */ - if (write_metadata(lc) != 0) { - DMERR("Error writing initial COW table to snapshot volume %s", - kdevname(lc->cow_dev->dev)); - goto free_ret; - } - } - - /* - * There is a header but it doesn't match - fail so we - * don't destroy what might be useful data on disk. If - * the user really wants to use this COW device for a - * snapshot then the first sector should be zeroed out - * first. - */ - if (status == -1) - goto free_ret; - - return 0; - - free_ret: - unmap_kiobuf(lc->cow_iobuf); - free_kiovec(1, &lc->cow_iobuf); - return -1; -} - /* - * Construct a snapshot mapping:

+ * Construct a snapshot mapping:
+ * <origin_dev> <COW-dev> <p|n> <chunk_size> [<extent_size>]
+ * */ static int snapshot_ctr(struct dm_table *t, offset_t b, offset_t l, int argc, char **argv, void **context) { - struct snapshot_c *lc; + struct dm_snapshot *s; unsigned long chunk_size; unsigned long extent_size = 0L; int r = -EINVAL; @@ -975,184 +400,273 @@ static int snapshot_ctr(struct dm_table *t, offset_t b, offset_t l, if (argc < 4) { *context = "dm-snapshot: Not enough arguments"; - return -EINVAL; + r = -EINVAL; + goto bad; } origin_path = argv[0]; - cow_path = argv[1]; - persistent = argv[2]; + cow_path = argv[1]; + persistent = argv[2]; - *context = "Persistent flag is not P or N"; - if ((*persistent & 0x5f) != 'P' && - (*persistent & 0x5f) != 'N') + if ((*persistent & 0x5f) != 'P' && (*persistent & 0x5f) != 'N') { + *context = "Persistent flag is not P or N"; + r = -EINVAL; goto bad; + } chunk_size = simple_strtoul(argv[3], &value, 10); if (chunk_size == 0 || value == NULL) { *context = "Invalid chunk size"; + r = -EINVAL; goto bad; } /* Get the extent size for persistent snapshots */ if ((*persistent & 0x5f) == 'P') { - *context = "No extent size specified"; - if (argc < 5) + if (argc < 5) { + *context = "No extent size specified"; + r = -EINVAL; goto bad; + } extent_size = simple_strtoul(argv[4], &value, 10); if (extent_size == 0 || value == NULL) { *context = "Invalid extent size"; + r = -EINVAL; goto bad; } } - *context = "Cannot allocate snapshot context private structure"; - lc = kmalloc(sizeof(*lc), GFP_KERNEL); - if (lc == NULL) + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) { + *context = "Cannot allocate snapshot context private structure"; + r = -ENOMEM; goto bad; + } - *context = "Cannot get origin device"; - r = dm_table_get_device(t, origin_path, 0, 0, &lc->origin_dev); - if (r) + r = dm_table_get_device(t, origin_path, 0, 0, &s->origin); + if (r) { + *context = "Cannot get origin device"; + r = -EINVAL; goto bad_free; + } - *context = "Cannot get COW device"; - r = dm_table_get_device(t, cow_path, 0, 0, &lc->cow_dev); + r = dm_table_get_device(t, cow_path, 0, 0, &s->cow); if (r) { - dm_table_put_device(t, lc->origin_dev); + dm_table_put_device(t, s->origin); + *context = "Cannot get COW device"; + r = -EINVAL; goto bad_free; } /* Validate the extent and chunk sizes against the device block size */ - blocksize = get_hardsect_size(lc->cow_dev->dev); - if (chunk_size % (blocksize/SECTOR_SIZE)) { + blocksize = get_hardsect_size(s->cow->dev); + if (chunk_size % (blocksize / SECTOR_SIZE)) { *context = "Chunk size is not a multiple of device blocksize"; + r = -EINVAL; goto bad_putdev; } - if (extent_size % (blocksize/SECTOR_SIZE)) { + if (extent_size % (blocksize / SECTOR_SIZE)) { *context = "Extent size is not a multiple of device blocksize"; + r = -EINVAL; goto bad_putdev; } /* Check the sizes are small enough to fit in one kiovec */ if (chunk_size > KIO_MAX_SECTORS) { *context = "Chunk size is too big"; + r = -EINVAL; goto bad_putdev; } if (extent_size > KIO_MAX_SECTORS) { *context = "Extent size is too big"; + r = -EINVAL; goto bad_putdev; } /* Check chunk_size is a power of 2 */ - if (chunk_size != 1 << (ffs(chunk_size)-1)) { + if (chunk_size & (chunk_size - 1)) { *context = "Chunk size is not a power of 2"; r = -EINVAL; goto bad_putdev; } + s->chunk_size = chunk_size; + s->chunk_mask = chunk_size - 1; + for (s->chunk_shift = 0; chunk_size; + s->chunk_shift++, chunk_size >>= 1) ; - lc->chunk_size = chunk_size; - lc->chunk_size_mask = chunk_size-1; - lc->extent_size = extent_size; - lc->next_free_sector = blocksize/SECTOR_SIZE; /* Leave the first block 
alone */ - lc->full = 0; - lc->disk_cow = NULL; - init_rwsem(&lc->lock); - - /* Work out the power of 2 that it is */ - lc->chunk_size_shift = 0; - while ( (chunk_size&1) == 0) { - chunk_size >>= 1; - lc->chunk_size_shift++; - } + s->valid = 1; + init_rwsem(&s->lock); /* Allocate hash table for COW data */ - r = -ENOMEM; - *context = "Unable to allocate has table space"; - if (alloc_hash_table(lc) == -1) + if (init_hash_tables(s)) { + *context = "Unable to allocate hash table space"; + r = -ENOMEM; goto bad_putdev; + } /* * Check the persistent flag - done here because we need the iobuf * to check the LV header */ - if ((*persistent & 0x5f) == 'P') { - lc->persistent = 1; +#if 0 + if ((*persistent & 0x5f) == 'P') + s->store = dm_create_persistent(s, blocksize, + extent_size, context); + else +#endif + s->store = dm_create_transient(s, blocksize, context); - /* Allocate the COW iobuf and set associated variables */ - if (setup_persistent_snapshot(lc, blocksize, context)) - goto bad_free1; + if (!s->store) { + *context = "Couldn't create exception store"; + r = -EINVAL; + goto bad_free1; } - else { - lc->persistent = 0; + + /* Allocate the COW iobuf and set associated variables */ + if (s->store->init && + s->store->init(s->store, blocksize, extent_size, context)) { + *context = "Couldn't initialise exception store"; + r = -ENOMEM; + goto bad_free1; } /* Flush IO to the origin device */ - /* TODO: VFS lock sync too */ - fsync_dev(lc->origin_dev->dev); + /* FIXME: what does sct have against fsync_dev ? */ + fsync_dev(s->origin->dev); #if LVM_VFS_ENHANCEMENT - fsync_dev_lockfs(lc->origin_dev->dev); - unlockfs(lc->origin_dev->dev); + fsync_dev_lockfs(s->origin->dev); #endif - /* Add snapshot to the list of snapshots for this origin */ - r = -EINVAL; - *context = "Cannot register snapshot origin"; - if (!register_snapshot(lc->origin_dev->dev, lc)) - goto bad_free2; + /* Add snapshot to the list of snapshots for this origin */ + if (register_snapshot(s)) { + r = -EINVAL; + *context = "Cannot register snapshot origin"; + goto bad_free2; + } + +#if LVM_VFS_ENHANCEMENT + unlockfs(s->origin->dev); +#endif - *context = lc; + *context = s; return 0; bad_free2: - if (lc->persistent) { - unmap_kiobuf(lc->cow_iobuf); - free_kiovec(1, &lc->cow_iobuf); - } + if (s->store->destroy) + s->store->destroy(s->store); + bad_free1: - vfree(lc->hash_table); - vfree(lc->inflight_hash_table); + exit_exception_table(&s->pending, pending_cachep); + exit_exception_table(&s->complete, exception_cachep); + bad_putdev: - dm_table_put_device(t, lc->cow_dev); - dm_table_put_device(t, lc->origin_dev); + dm_table_put_device(t, s->cow); + dm_table_put_device(t, s->origin); + bad_free: - kfree(lc); + kfree(s); + bad: return r; } -static void snapshot_dtr(struct dm_table *t, void *c) +static void snapshot_dtr(struct dm_table *t, void *context) { - struct snapshot_c *lc = (struct snapshot_c *) c; + struct dm_snapshot *s = (struct dm_snapshot *) context; + + unregister_snapshot(s); - /* Unhook from the list */ - list_del(&lc->list); + exit_exception_table(&s->pending, pending_cachep); + exit_exception_table(&s->complete, exception_cachep); /* Deallocate memory used */ - free_exception_table(lc); - if (lc->persistent) { - unmap_kiobuf(lc->cow_iobuf); - free_kiovec(1, &lc->cow_iobuf); + if (s->store->destroy) + s->store->destroy(s->store); + + dm_table_put_device(t, s->origin); + dm_table_put_device(t, s->cow); + kfree(s); +} + +/* + * Performs a new copy on write. 
+
+/*
+ * Performs a new copy on write.
+ */
+static int new_exception(struct dm_snapshot *s, struct buffer_head *bh)
+{
+	struct exception *e;
+	struct pending_exception *pe;
+	chunk_t chunk;
+
+	chunk = sector_to_chunk(s, bh->b_rsector);
+
+	/*
+	 * If the exception is in flight then we just defer the
+	 * bh until this copy has completed.
+	 */
+
+	/* FIXME: great big race. */
+	e = lookup_exception(&s->pending, chunk);
+	if (e) {
+		/* cast the exception to a pending exception */
+		pe = list_entry(e, struct pending_exception, e);
+		bh->b_reqnext = pe->bh;
+		pe->bh = bh;
+		return 0;
+	}
+
+	pe = alloc_pending_exception();
+	if (!pe) {
+		DMWARN("Couldn't allocate pending_exception.");
+		return -ENOMEM;
+	}
+
+	pe->e.old_chunk = chunk;
+
+	if (s->store->prepare_exception &&
+	    s->store->prepare_exception(s->store, &pe->e)) {
+		/* FIXME: pe is leaked here */
+		s->valid = 0;
+		return -ENXIO;
 	}
-	vfree(lc->hash_table);
-	vfree(lc->inflight_hash_table);
-	dm_table_put_device(t, lc->origin_dev);
-	dm_table_put_device(t, lc->cow_dev);
-	kfree(c);
+	bh->b_reqnext = pe->bh;
+	pe->bh = bh;
+	pe->snap = s;
+
+	insert_exception(&s->pending, &pe->e);
+
+	/* Get kcopyd to do the copy */
+	dm_blockcopy(chunk_to_sector(s, pe->e.old_chunk),
+		     chunk_to_sector(s, pe->e.new_chunk),
+		     s->chunk_size,
+		     s->origin->dev,
+		     s->cow->dev, SNAPSHOT_COPY_PRIORITY, 0, copy_callback, pe);
+
+	/* The bh now belongs to the pending exception */
+	return 0;
+}
+
+static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
+				   struct buffer_head *bh)
+{
+	bh->b_rdev = s->cow->dev;
+	bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
+	    (bh->b_rsector & s->chunk_mask);
+}
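
The bh->b_reqnext chaining used by new_exception() (and by snapshot_map() below) is a plain LIFO push onto the pending exception; the copy-completion callback later walks the chain and resubmits the buffers. A user-space model of the idea — buffer_head is cut down to a stub, and defer()/flush() are invented names:

#include <stdio.h>
#include <stddef.h>

struct buffer_head {
	long b_rsector;
	struct buffer_head *b_reqnext;
};

static void defer(struct buffer_head **pending, struct buffer_head *bh)
{
	bh->b_reqnext = *pending;	/* push, exactly as in new_exception */
	*pending = bh;
}

static void flush(struct buffer_head *pending)
{
	while (pending) {
		struct buffer_head *next = pending->b_reqnext;
		pending->b_reqnext = NULL;
		printf("resubmit sector %ld\n", pending->b_rsector);
		pending = next;
	}
}

int main(void)
{
	struct buffer_head a = { 8, NULL }, b = { 16, NULL };
	struct buffer_head *pending = NULL;

	defer(&pending, &a);
	defer(&pending, &b);
	flush(pending);		/* LIFO: sector 16 comes out first */
	return 0;
}
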
 
 static int snapshot_map(struct buffer_head *bh, int rw, void *context)
 {
-	struct exception *ex;
-	struct snapshot_c *lc = (struct snapshot_c *) context;
-	int ret = 1;
+	struct exception *e;
+	struct dm_snapshot *s = (struct dm_snapshot *) context;
+	int r = 1;
+	chunk_t chunk;
+
+	chunk = sector_to_chunk(s, bh->b_rsector);
 
-	/* Full snapshots are not usable */
-	if (lc->full)
+	/* Full snapshots are not usable */
+	if (!s->valid)
 		return -1;
 
 	/*
@@ -1161,88 +675,61 @@ static int snapshot_map(struct buffer_head *bh, int rw, void *context)
 	 * writeable.
 	 */
 	if (rw == WRITE) {
-		struct inflight_exception *iex;
-		down_write(&lc->lock);
+		down_write(&s->lock);
 
 		/* If the block is already remapped - use that, else remap it */
-		ex = find_exception(context, bh->b_rsector);
-		if (ex) {
-			bh->b_rdev = lc->cow_dev->dev;
-			bh->b_rsector = ex->rsector_new + (bh->b_rsector & lc->chunk_size_mask);
+		e = lookup_exception(&s->complete, chunk);
+		if (e) {
+			remap_exception(s, e, bh);
+			up_write(&s->lock);
+			return 1;
 		}
 
-		if (!ex && (iex = find_inflight_exception(context, bh->b_rsector)) ) {
-			/* Exception has not been committed to disk - save this bh */
-			bh->b_reqnext = iex->bh;
-			iex->bh = bh;
-			up_write(&lc->lock);
+		e = lookup_exception(&s->pending, chunk);
+		if (e) {
+			struct pending_exception *pe;
+			pe = list_entry(e, struct pending_exception, e);
+
+			/*
+			 * Exception has not been committed to
+			 * disk - save this bh
+			 */
+			bh->b_reqnext = pe->bh;
+			pe->bh = bh;
+			up_write(&s->lock);
 			return 0;
 		}
 
-		if (!ex) {
-			unsigned long read_start = bh->b_rsector - (bh->b_rsector & lc->chunk_size_mask);
-			unsigned long devsize = get_dev_size(lc->cow_dev->dev);
-			unsigned long reloc_sector;
-			struct inflight_exception *iex;
-
-			/* Check there is enough space */
-			if (lc->next_free_sector + lc->chunk_size >= devsize) {
-				DMWARN("Snapshot %s is full", kdevname(lc->cow_dev->dev));
-				lc->full = 1;
-				if (lc->persistent)
-					write_header(lc);
-				up_write(&lc->lock);
-				return -1;
-			}
-
-			/* Update the inflight exception table */
-			reloc_sector = lc->next_free_sector;
-			lc->next_free_sector += lc->chunk_size;
-			iex = add_inflight_exception(lc, read_start, reloc_sector);
-			if (!iex) {
-				DMERR("Snapshot %s error adding new exception entry", kdevname(lc->cow_dev->dev));
-				/* Error here - treat it as full */
-				lc->full = 1;
-				if (lc->persistent)
-					write_header(lc);
-				up_write(&lc->lock);
-				return -1;
-			}
-
-			/* Add this bh to the list of those we need to resubmit when the COW has completed */
-			bh->b_reqnext = iex->bh;
-			iex->bh = bh;
+		if (new_exception(s, bh))
+			r = -1;
+		else
+			r = 0;
 
-			/* Get kcopyd to do the work */
-			dm_blockcopy(read_start, reloc_sector, lc->chunk_size,
-				     lc->origin_dev->dev, lc->cow_dev->dev,
-				     SNAPSHOT_COPY_PRIORITY, 0,
-				     copy_callback, iex);
+		up_write(&s->lock);
 
-			/* Tell the upper layers we have control of the BH now */
-			ret = 0;
-		}
+	} else {
+		/*
+		 * FIXME: this read path scares me because we
+		 * always use the origin when we have a pending
+		 * exception. However I can't think of a
+		 * situation where this is wrong - ejt.
+		 */
 
-		up_write(&lc->lock);
-	}
-	else { /* Do reads */
-		down_read(&lc->lock);
+		down_read(&s->lock);
 
 		/* See if it has been remapped */
-		ex = find_exception(context, bh->b_rsector);
-		if (ex) {
+		e = lookup_exception(&s->complete, chunk);
+		if (e)
+			remap_exception(s, e, bh);
+		else
+			bh->b_rdev = s->origin->dev;
 
-			bh->b_rdev = lc->cow_dev->dev;
-			bh->b_rsector = ex->rsector_new + (bh->b_rsector & lc->chunk_size_mask);
-		} else {
-			bh->b_rdev = lc->origin_dev->dev;
-		}
-		up_read(&lc->lock);
+		up_read(&s->lock);
 	}
 
-	return ret;
+	return r;
 }
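
lookup_exception() and insert_exception() are not part of this excerpt, but the exception_table declared in dm-snapshot.h (a hash_mask plus an array of list heads) suggests a power-of-two bucket array indexed by chunk number. A user-space sketch of that presumed scheme — the function name and masking strategy are assumptions, not taken from this patch:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t chunk_t;

/* assumed: hash_mask == number_of_buckets - 1 */
static unsigned exception_hash(uint32_t hash_mask, chunk_t chunk)
{
	return (unsigned) (chunk & hash_mask);
}

int main(void)
{
	uint32_t hash_mask = 64 - 1;	/* 64 buckets, a power of 2 */
	chunk_t chunks[] = { 0, 5, 64, 69 };
	int i;

	for (i = 0; i < 4; i++)
		printf("chunk %llu -> bucket %u\n",
		       (unsigned long long) chunks[i],
		       exception_hash(hash_mask, chunks[i]));
	/* chunks 0 and 64 share bucket 0; 5 and 69 share bucket 5 */
	return 0;
}
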
 
 /*
@@ -1251,155 +738,103 @@ static int snapshot_map(struct buffer_head *bh, int rw, void *context)
 int dm_do_snapshot(struct dm_dev *origin, struct buffer_head *bh)
 {
 	struct list_head *snap_list;
-	struct origin_list *ol;
-	int ret = 1;
+	struct origin *o;
+	int r = 1;
+	chunk_t chunk;
 
-	down_read(&origin_hash_lock);
-	ol = __lookup_snapshot_list(origin->dev);
-	up_read(&origin_hash_lock);
+	down_read(&_origins_lock);
+	o = __lookup_origin(origin->dev);
 
-	if (ol && !list_empty(&ol->snap_list)) {
-		struct list_head *origin_snaps = &ol->snap_list;
+	if (o) {
+		struct list_head *origin_snaps = &o->snapshots;
+		struct dm_snapshot *lock_snap;
+
+		/* Lock the metadata */
+		/* FIXME: lock_snap is assigned but never used */
+		lock_snap = list_entry(origin_snaps->next,
+				       struct dm_snapshot, list);
 
 		/* Do all the snapshots on this origin */
 		list_for_each(snap_list, origin_snaps) {
-			struct snapshot_c *snap;
-			struct exception *ex;
-			snap = list_entry(snap_list, struct snapshot_c, list);
-
-			/* Ignore full snapshots */
-			if (snap->full)
-				continue;
+			struct dm_snapshot *snap;
+			struct exception *e;
+			snap = list_entry(snap_list, struct dm_snapshot, list);
 
 			down_write(&snap->lock);
 
 			/*
-			 * Check exception table to see if block is already remapped in this
-			 * snapshot and mark the snapshot as needing a COW if not
+			 * Remember different snapshots can have
+			 * different chunk sizes.
 			 */
-			ex = find_exception(snap, bh->b_rsector);
-			if (!ex) {
-				offset_t dev_size;
-				struct inflight_exception *iex = find_inflight_exception(snap, bh->b_rsector);
-
-				/* If the exception is in flight then defer the BH -
-				   but don't add it twice! */
-				if (iex) {
-					if (ret) {
-						bh->b_reqnext = iex->bh;
-						iex->bh = bh;
-						ret = 0;
-					}
-					up_write(&snap->lock);
-					continue;
-				}
-
-				/*
-				 * Check for full snapshot. Doing the size calculation here means that
-				 * the COW device can be resized without us being told
+			chunk = sector_to_chunk(snap, bh->b_rsector);
+
+			/* Only deal with valid snapshots */
+			if (snap->valid) {
+				/*
+				 * Check exception table to see
+				 * if block is already remapped
+				 * in this snapshot and mark the
+				 * snapshot as needing a COW if
+				 * not
 				 */
-				dev_size = get_dev_size(snap->cow_dev->dev);
-				if (snap->next_free_sector + snap->chunk_size >= dev_size) {
-					/* Snapshot is full, we can't use it */
-					DMWARN("Snapshot %s is full (sec=%ld, size=%ld)",
-					       kdevname(snap->cow_dev->dev), snap->next_free_sector + snap->chunk_size, dev_size);
-					snap->full = 1;
-					/* Mark it full on the device */
-					if (snap->persistent)
-						write_header(snap);
-					up_write(&snap->lock);
-					continue;
-				}
-				else {
-					/* Update exception table */
-					unsigned long reloc_sector;
-					unsigned long read_start = bh->b_rsector - (bh->b_rsector & snap->chunk_size_mask);
-					struct inflight_exception *iex;
-
-					reloc_sector = snap->next_free_sector;
-					snap->next_free_sector += snap->chunk_size;
-					iex = add_inflight_exception(snap, read_start, reloc_sector);
-					if (!iex) {
-						DMERR("Snapshot %s error adding new exception entry",
-						      kdevname(snap->cow_dev->dev));
-						/* Error here - treat it as full */
-						snap->full = 1;
-						if (snap->persistent)
-							write_header(snap);
-						up_write(&snap->lock);
-						continue;
-					}
-
-					/* Get kcopyd to do the copy */
-					dm_blockcopy(read_start, reloc_sector, snap->chunk_size,
-						     snap->origin_dev->dev, snap->cow_dev->dev,
-						     SNAPSHOT_COPY_PRIORITY, 0,
-						     copy_callback, iex);
-					if (ret) {
-						bh->b_reqnext = iex->bh;
-						iex->bh = bh;
-						ret = 0;
-					}
-				}
+				/*
+				 * FIXME: unlike the old code, this can
+				 * defer the same bh to more than one
+				 * pending exception ("don't add it twice").
+				 */
+				e = lookup_exception(&snap->complete, chunk);
+				if (!e && !new_exception(snap, bh))
+					r = 0;
 			}
+
 			up_write(&snap->lock);
 		}
 	}
 
-	return ret;
-}
+	up_read(&_origins_lock);
+	return r;
+}
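
To make the "different chunk sizes" comment in dm_do_snapshot() concrete: one origin write can land in a different chunk number in every snapshot, so each snapshot needs its own lookup and, possibly, its own copy. A worked example in stand-alone C (the sector and chunk sizes are arbitrary illustrations):

#include <stdio.h>

int main(void)
{
	unsigned long rsector = 100;
	unsigned long chunk_sizes[] = { 8, 32 };	/* two snapshots */
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long mask = chunk_sizes[i] - 1;
		unsigned long n = chunk_sizes[i];
		int shift = 0;

		while (n >>= 1)
			shift++;

		printf("chunk_size %2lu: sector %lu -> chunk %lu\n",
		       chunk_sizes[i], rsector, (rsector & ~mask) >> shift);
	}
	/* prints chunk 12 for chunk_size 8, chunk 3 for chunk_size 32 */
	return 0;
}
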
 
 static struct target_type snapshot_target = {
-	name:	"snapshot",
-	module:	THIS_MODULE,
-	ctr:	snapshot_ctr,
-	dtr:	snapshot_dtr,
-	map:	snapshot_map,
-	err:	NULL
+	name:"snapshot",
+	module:THIS_MODULE,
+	ctr:snapshot_ctr,
+	dtr:snapshot_dtr,
+	map:snapshot_map,
+	err:NULL
 };
 
 int __init dm_snapshot_init(void)
 {
-	int r = dm_register_target(&snapshot_target);
+	int r;
 
-	if (r < 0)
-		DMERR("Device mapper: Snapshot: register failed %d", r);
-	else {
-		snapshot_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), GFP_KERNEL);
-		if (snapshot_origins == NULL) {
-			DMERR("Device mapper: Snapshot: unable to allocate memory");
-			r = -1;
-		}
-		else {
-			/* initialise the origin->snapshot hash table */
-			int i;
-			for (i = 0; i < ORIGIN_HASH_SIZE; i++)
-				INIT_LIST_HEAD(snapshot_origins + i);
-		}
-	}
-
-	return r;
-}
diff --git a/kernel/common/dm-snapshot.h b/kernel/common/dm-snapshot.h
new file mode 100644
--- /dev/null
+++ b/kernel/common/dm-snapshot.h
@@ -0,0 +1,135 @@
+/*
+ * dm-snapshot.h
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_SNAPSHOT_H
+#define DM_SNAPSHOT_H
+
+#include "dm.h"
+#include <linux/blkdev.h>
+
+struct exception_table {
+	uint32_t hash_mask;
+	struct list_head *table;
+};
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 64k - 256k.
+ */
+/* FIXME: can we get away with limiting these to a uint32_t? */
+typedef offset_t chunk_t;
+
+struct dm_snapshot {
+	struct rw_semaphore lock;
+
+	struct dm_dev *origin;
+	struct dm_dev *cow;
+
+	/* List of snapshots per Origin */
+	struct list_head list;
+
+	/* Size of data blocks saved - must be a power of 2 */
+	chunk_t chunk_size;
+	chunk_t chunk_mask;
+	chunk_t chunk_shift;
+
+	/* You can't use a snapshot if this is 0 (e.g. if full) */
+	int valid;
+
+	struct exception_table pending;
+	struct exception_table complete;
+
+	/* The on disk metadata handler */
+	struct exception_store *store;
+};
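
Some rough numbers for the uint32_t FIXME above (a stand-alone sketch with an illustrative chunk size, not a claim about the final design): even a 32-bit chunk number goes a long way, since with 4 KiB chunks it already addresses 2^32 * 4 KiB = 16 TiB of device.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 8 sectors of 512 bytes = 4 KiB per chunk */
	uint64_t bytes = (1ULL << 32) * 8 * 512;

	printf("%llu TiB addressable with a 32-bit chunk_t\n",
	       (unsigned long long) (bytes >> 40));	/* prints 16 */
	return 0;
}
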
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ */
+struct exception {
+	struct list_head hash_list;
+
+	chunk_t old_chunk;
+	chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the on-disk metadata for a snapshot;
+ * there are persistent and transient (in-core only) flavours.
+ */
+struct exception_store {
+
+	/*
+	 * Destroys this object when you've finished with it.
+	 */
+	void (*destroy)(struct exception_store *store);
+
+	/*
+	 * Read the metadata and populate the snapshot.
+	 */
+	int (*init)(struct exception_store *store,
+		    int blocksize, unsigned long extent_size, void **context);
+
+	/*
+	 * Find somewhere to store the next exception.
+	 */
+	int (*prepare_exception)(struct exception_store *store,
+				 struct exception *e);
+
+	/*
+	 * Update the metadata with this exception.
+	 */
+	int (*commit_exception)(struct exception_store *store,
+				struct exception *e);
+
+	/*
+	 * The snapshot is invalid, note this in the metadata.
+	 */
+	void (*drop_snapshot)(struct exception_store *store);
+
+	struct dm_snapshot *snap;
+	void *context;
+};
+
+/*
+ * Constructors for the persistent and transient exception stores.
+ */
+struct exception_store *dm_create_persistent(struct dm_snapshot *s,
+					     int blocksize,
+					     offset_t extent_size,
+					     void **error);
+
+struct exception_store *dm_create_transient(struct dm_snapshot *s,
+					    int blocksize, void **error);
+
+/*
+ * Return the number of sectors in the device.
+ */
+static inline offset_t get_dev_size(kdev_t dev)
+{
+	int *sizes;
+
+	sizes = blk_size[MAJOR(dev)];
+	if (sizes)
+		/* blk_size[][] is in 1k blocks; convert to 512 byte sectors */
+		return sizes[MINOR(dev)] << 1;
+
+	return 0;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_snapshot *s, offset_t sector)
+{
+	return (sector & ~s->chunk_mask) >> s->chunk_shift;
+}
+
+static inline offset_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
+{
+	return chunk << s->chunk_shift;
+}
+
+#endif
-- 
2.43.5