lvm2.git: lib/metadata/lv_manip.c
1 /*
2 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
3 * Copyright (C) 2004-2012 Red Hat, Inc. All rights reserved.
4 *
5 * This file is part of LVM2.
6 *
7 * This copyrighted material is made available to anyone wishing to use,
8 * modify, copy, or redistribute it subject to the terms and conditions
9 * of the GNU Lesser General Public License v.2.1.
10 *
11 * You should have received a copy of the GNU Lesser General Public License
12 * along with this program; if not, write to the Free Software Foundation,
13 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14 */
15
16 #include "lib.h"
17 #include "metadata.h"
18 #include "locking.h"
19 #include "pv_map.h"
20 #include "lvm-string.h"
21 #include "toolcontext.h"
22 #include "lv_alloc.h"
23 #include "pv_alloc.h"
24 #include "display.h"
25 #include "segtype.h"
26 #include "archiver.h"
27 #include "activate.h"
28 #include "str_list.h"
29 #include "defaults.h"
30
31 typedef enum {
32 	PREFERRED,	/* Area satisfies the policy constraint - reserve it immediately */
33 	USE_AREA,	/* Candidate area - store it for later sorting and selection */
34 	NEXT_PV,	/* Skip the remaining areas on this PV and move to the next PV */
35 	NEXT_AREA	/* Skip this area and try the next one on the same PV */
36 } area_use_t;
37
38 /* FIXME: remove RAID_METADATA_AREA_LEN macro after defining 'raid_log_extents' */
39 #define RAID_METADATA_AREA_LEN 1
40
41 /* FIXME These ended up getting used differently from first intended. Refactor. */
42 /* Only one of A_CONTIGUOUS_TO_LVSEG, A_CLING_TO_LVSEG, A_CLING_TO_ALLOCED may be set */
43 #define A_CONTIGUOUS_TO_LVSEG 0x01 /* Must be contiguous to an existing segment */
44 #define A_CLING_TO_LVSEG 0x02 /* Must use same disks as existing LV segment */
45 #define A_CLING_TO_ALLOCED 0x04 /* Must use same disks as already-allocated segment */
46
47 #define A_CLING_BY_TAGS 0x08 /* Must match tags against existing segment */
48 #define A_CAN_SPLIT 0x10
49
50 /*
51 * Constant parameters during a single allocation attempt.
52 */
53 struct alloc_parms {
54 alloc_policy_t alloc;
55 unsigned flags; /* Holds A_* */
56 struct lv_segment *prev_lvseg;
57 uint32_t extents_still_needed;
58 };
59
60 /*
61 * Holds varying state of each allocation attempt.
62 */
63 struct alloc_state {
64 struct pv_area_used *areas;
65 uint32_t areas_size;
66 uint32_t log_area_count_still_needed; /* Number of areas still needing to be allocated for the log */
67 uint32_t allocated; /* Total number of extents allocated so far */
68 };
69
70 struct lv_names {
71 const char *old;
72 const char *new;
73 };
74
75 int add_seg_to_segs_using_this_lv(struct logical_volume *lv,
76 struct lv_segment *seg)
77 {
78 struct seg_list *sl;
79
80 dm_list_iterate_items(sl, &lv->segs_using_this_lv) {
81 if (sl->seg == seg) {
82 sl->count++;
83 return 1;
84 }
85 }
86
87 	log_very_verbose("Adding %s:%" PRIu32 " as a user of %s",
88 seg->lv->name, seg->le, lv->name);
89
90 if (!(sl = dm_pool_zalloc(lv->vg->vgmem, sizeof(*sl)))) {
91 log_error("Failed to allocate segment list");
92 return 0;
93 }
94
95 sl->count = 1;
96 sl->seg = seg;
97 dm_list_add(&lv->segs_using_this_lv, &sl->list);
98
99 return 1;
100 }
101
102 int remove_seg_from_segs_using_this_lv(struct logical_volume *lv,
103 struct lv_segment *seg)
104 {
105 struct seg_list *sl;
106
107 dm_list_iterate_items(sl, &lv->segs_using_this_lv) {
108 if (sl->seg != seg)
109 continue;
110 if (sl->count > 1)
111 sl->count--;
112 else {
113 log_very_verbose("%s:%" PRIu32 " is no longer a user "
114 "of %s", seg->lv->name, seg->le,
115 lv->name);
116 dm_list_del(&sl->list);
117 }
118 return 1;
119 }
120
121 return 0;
122 }
123
124 /*
125 * This is a function specialized for the common case where there is
126 * only one segment which uses the LV.
127 * e.g. the LV is a layer inserted by insert_layer_for_lv().
128 *
129 * In general, walk through lv->segs_using_this_lv.
130 */
131 struct lv_segment *get_only_segment_using_this_lv(struct logical_volume *lv)
132 {
133 struct seg_list *sl;
134
135 if (dm_list_size(&lv->segs_using_this_lv) != 1) {
136 log_error("%s is expected to have only one segment using it, "
137 "while it has %d", lv->name,
138 dm_list_size(&lv->segs_using_this_lv));
139 return NULL;
140 }
141
142 dm_list_iterate_items(sl, &lv->segs_using_this_lv)
143 break; /* first item */
144
145 if (sl->count != 1) {
146 log_error("%s is expected to have only one segment using it, "
147 "while %s:%" PRIu32 " uses it %d times",
148 lv->name, sl->seg->lv->name, sl->seg->le, sl->count);
149 return NULL;
150 }
151
152 return sl->seg;
153 }
154
155 /*
156 * PVs used by a segment of an LV
157 */
158 struct seg_pvs {
159 struct dm_list list;
160
161 struct dm_list pvs; /* struct pv_list */
162
163 uint32_t le;
164 uint32_t len;
165 };
166
167 static struct seg_pvs *_find_seg_pvs_by_le(struct dm_list *list, uint32_t le)
168 {
169 struct seg_pvs *spvs;
170
171 dm_list_iterate_items(spvs, list)
172 if (le >= spvs->le && le < spvs->le + spvs->len)
173 return spvs;
174
175 return NULL;
176 }
177
178 /*
179 * Find first unused LV number.
180 */
181 uint32_t find_free_lvnum(struct logical_volume *lv)
182 {
183 int lvnum_used[MAX_RESTRICTED_LVS + 1];
184 uint32_t i = 0;
185 struct lv_list *lvl;
186 int lvnum;
187
188 memset(&lvnum_used, 0, sizeof(lvnum_used));
189
190 dm_list_iterate_items(lvl, &lv->vg->lvs) {
191 lvnum = lvnum_from_lvid(&lvl->lv->lvid);
192 if (lvnum <= MAX_RESTRICTED_LVS)
193 lvnum_used[lvnum] = 1;
194 }
195
196 while (lvnum_used[i])
197 i++;
198
199 /* FIXME What if none are free? */
200
201 return i;
202 }
203
204 /*
205 * All lv_segments get created here.
206 */
207 struct lv_segment *alloc_lv_segment(const struct segment_type *segtype,
208 struct logical_volume *lv,
209 uint32_t le, uint32_t len,
210 uint64_t status,
211 uint32_t stripe_size,
212 struct logical_volume *log_lv,
213 struct logical_volume *thin_pool_lv,
214 uint32_t area_count,
215 uint32_t area_len,
216 uint32_t chunk_size,
217 uint32_t region_size,
218 uint32_t extents_copied,
219 struct lv_segment *pvmove_source_seg)
220 {
221 struct lv_segment *seg;
222 struct dm_pool *mem = lv->vg->vgmem;
223 uint32_t areas_sz = area_count * sizeof(*seg->areas);
224
225 if (!segtype) {
226 log_error(INTERNAL_ERROR "alloc_lv_segment: Missing segtype.");
227 return NULL;
228 }
229
230 if (!(seg = dm_pool_zalloc(mem, sizeof(*seg))))
231 return_NULL;
232
233 if (!(seg->areas = dm_pool_zalloc(mem, areas_sz))) {
234 dm_pool_free(mem, seg);
235 return_NULL;
236 }
237
238 if (segtype_is_raid(segtype) &&
239 !(seg->meta_areas = dm_pool_zalloc(mem, areas_sz))) {
240 dm_pool_free(mem, seg); /* frees everything alloced since seg */
241 return_NULL;
242 }
243
244 seg->segtype = segtype;
245 seg->lv = lv;
246 seg->le = le;
247 seg->len = len;
248 seg->status = status;
249 seg->stripe_size = stripe_size;
250 seg->area_count = area_count;
251 seg->area_len = area_len;
252 seg->chunk_size = chunk_size;
253 seg->region_size = region_size;
254 seg->extents_copied = extents_copied;
255 seg->pvmove_source_seg = pvmove_source_seg;
256 dm_list_init(&seg->tags);
257 dm_list_init(&seg->thin_messages);
258
259 if (thin_pool_lv) {
260 		/* If thin_pool_lv is itself a thin volume, a thin snapshot of it is being created */
261 if (lv_is_thin_volume(thin_pool_lv)) {
262 seg->transaction_id = first_seg(first_seg(thin_pool_lv)->pool_lv)->transaction_id;
263 if (!attach_pool_lv(seg, first_seg(thin_pool_lv)->pool_lv, thin_pool_lv))
264 return_NULL;
265 } else {
266 seg->transaction_id = first_seg(thin_pool_lv)->transaction_id;
267 if (!attach_pool_lv(seg, thin_pool_lv, NULL))
268 return_NULL;
269 }
270 }
271
272 if (log_lv && !attach_mirror_log(seg, log_lv))
273 return_NULL;
274
275 return seg;
276 }
277
278 struct lv_segment *alloc_snapshot_seg(struct logical_volume *lv,
279 uint64_t status, uint32_t old_le_count)
280 {
281 struct lv_segment *seg;
282 const struct segment_type *segtype;
283
284 segtype = get_segtype_from_string(lv->vg->cmd, "snapshot");
285 if (!segtype) {
286 log_error("Failed to find snapshot segtype");
287 return NULL;
288 }
289
290 if (!(seg = alloc_lv_segment(segtype, lv, old_le_count,
291 lv->le_count - old_le_count, status, 0,
292 NULL, NULL, 0, lv->le_count - old_le_count,
293 0, 0, 0, NULL))) {
294 log_error("Couldn't allocate new snapshot segment.");
295 return NULL;
296 }
297
298 dm_list_add(&lv->segments, &seg->list);
299 lv->status |= VIRTUAL;
300
301 return seg;
302 }
303
304 void release_lv_segment_area(struct lv_segment *seg, uint32_t s,
305 uint32_t area_reduction)
306 {
307 if (seg_type(seg, s) == AREA_UNASSIGNED)
308 return;
309
310 if (seg_type(seg, s) == AREA_PV) {
311 if (release_pv_segment(seg_pvseg(seg, s), area_reduction) &&
312 seg->area_len == area_reduction)
313 seg_type(seg, s) = AREA_UNASSIGNED;
314 return;
315 }
316
317 if ((seg_lv(seg, s)->status & MIRROR_IMAGE) ||
318 (seg_lv(seg, s)->status & THIN_POOL_DATA)) {
319 if (!lv_reduce(seg_lv(seg, s), area_reduction))
320 stack; /* FIXME: any upper level reporting */
321 return;
322 }
323
324 if (seg_lv(seg, s)->status & RAID_IMAGE) {
325 /*
326 * FIXME: Use lv_reduce not lv_remove
327 * We use lv_remove for now, because I haven't figured out
328 * why lv_reduce won't remove the LV.
329 lv_reduce(seg_lv(seg, s), area_reduction);
330 */
331 if (area_reduction != seg->area_len) {
332 log_error("Unable to reduce RAID LV - operation not implemented.");
333 return;
334 } else {
335 if (!lv_remove(seg_lv(seg, s))) {
336 log_error("Failed to remove RAID image %s",
337 seg_lv(seg, s)->name);
338 return;
339 }
340 }
341
342 /* Remove metadata area if image has been removed */
343 if (area_reduction == seg->area_len) {
344 if (!lv_reduce(seg_metalv(seg, s),
345 seg_metalv(seg, s)->le_count)) {
346 log_error("Failed to remove RAID meta-device %s",
347 seg_metalv(seg, s)->name);
348 return;
349 }
350 }
351 return;
352 }
353
354 if (area_reduction == seg->area_len) {
355 log_very_verbose("Remove %s:%" PRIu32 "[%" PRIu32 "] from "
356 "the top of LV %s:%" PRIu32,
357 seg->lv->name, seg->le, s,
358 seg_lv(seg, s)->name, seg_le(seg, s));
359
360 remove_seg_from_segs_using_this_lv(seg_lv(seg, s), seg);
361 seg_lv(seg, s) = NULL;
362 seg_le(seg, s) = 0;
363 seg_type(seg, s) = AREA_UNASSIGNED;
364 }
365 }
366
367 /*
368 * Move a segment area from one segment to another
369 */
370 int move_lv_segment_area(struct lv_segment *seg_to, uint32_t area_to,
371 struct lv_segment *seg_from, uint32_t area_from)
372 {
373 struct physical_volume *pv;
374 struct logical_volume *lv;
375 uint32_t pe, le;
376
377 switch (seg_type(seg_from, area_from)) {
378 case AREA_PV:
379 pv = seg_pv(seg_from, area_from);
380 pe = seg_pe(seg_from, area_from);
381
382 release_lv_segment_area(seg_from, area_from,
383 seg_from->area_len);
384 release_lv_segment_area(seg_to, area_to, seg_to->area_len);
385
386 if (!set_lv_segment_area_pv(seg_to, area_to, pv, pe))
387 return_0;
388
389 break;
390
391 case AREA_LV:
392 lv = seg_lv(seg_from, area_from);
393 le = seg_le(seg_from, area_from);
394
395 release_lv_segment_area(seg_from, area_from,
396 seg_from->area_len);
397 release_lv_segment_area(seg_to, area_to, seg_to->area_len);
398
399 if (!set_lv_segment_area_lv(seg_to, area_to, lv, le, 0))
400 return_0;
401
402 break;
403
404 case AREA_UNASSIGNED:
405 release_lv_segment_area(seg_to, area_to, seg_to->area_len);
406 }
407
408 return 1;
409 }
410
411 /*
412 * Link part of a PV to an LV segment.
413 */
414 int set_lv_segment_area_pv(struct lv_segment *seg, uint32_t area_num,
415 struct physical_volume *pv, uint32_t pe)
416 {
417 seg->areas[area_num].type = AREA_PV;
418
419 if (!(seg_pvseg(seg, area_num) =
420 assign_peg_to_lvseg(pv, pe, seg->area_len, seg, area_num)))
421 return_0;
422
423 return 1;
424 }
425
426 /*
427 * Link one LV segment to another. Assumes sizes already match.
428 */
429 int set_lv_segment_area_lv(struct lv_segment *seg, uint32_t area_num,
430 struct logical_volume *lv, uint32_t le,
431 uint64_t status)
432 {
433 log_very_verbose("Stack %s:%" PRIu32 "[%" PRIu32 "] on LV %s:%" PRIu32,
434 seg->lv->name, seg->le, area_num, lv->name, le);
435
436 if (status & RAID_META) {
437 seg->meta_areas[area_num].type = AREA_LV;
438 seg_metalv(seg, area_num) = lv;
439 if (le) {
440 log_error(INTERNAL_ERROR "Meta le != 0");
441 return 0;
442 }
443 seg_metale(seg, area_num) = 0;
444 } else {
445 seg->areas[area_num].type = AREA_LV;
446 seg_lv(seg, area_num) = lv;
447 seg_le(seg, area_num) = le;
448 }
449 lv->status |= status;
450
451 if (!add_seg_to_segs_using_this_lv(lv, seg))
452 return_0;
453
454 return 1;
455 }
456
457 /*
458 * Prepare for adding parallel areas to an existing segment.
459 */
460 static int _lv_segment_add_areas(struct logical_volume *lv,
461 struct lv_segment *seg,
462 uint32_t new_area_count)
463 {
464 struct lv_segment_area *newareas;
465 uint32_t areas_sz = new_area_count * sizeof(*newareas);
466
467 if (!(newareas = dm_pool_zalloc(lv->vg->cmd->mem, areas_sz)))
468 return_0;
469
470 memcpy(newareas, seg->areas, seg->area_count * sizeof(*seg->areas));
471
472 seg->areas = newareas;
473 seg->area_count = new_area_count;
474
475 return 1;
476 }
477
478 /*
479 * Reduce the size of an lv_segment. New size can be zero.
480 */
481 static int _lv_segment_reduce(struct lv_segment *seg, uint32_t reduction)
482 {
483 uint32_t area_reduction, s;
484
485 /* Caller must ensure exact divisibility */
486 if (seg_is_striped(seg)) {
487 if (reduction % seg->area_count) {
488 log_error("Segment extent reduction %" PRIu32
489 " not divisible by #stripes %" PRIu32,
490 reduction, seg->area_count);
491 return 0;
492 }
493 area_reduction = (reduction / seg->area_count);
494 } else
495 area_reduction = reduction;
496
497 for (s = 0; s < seg->area_count; s++)
498 release_lv_segment_area(seg, s, area_reduction);
499
500 seg->len -= reduction;
501 seg->area_len -= area_reduction;
502
503 return 1;
504 }
505
506 /*
507 * Entry point for all LV reductions in size.
508 */
509 static int _lv_reduce(struct logical_volume *lv, uint32_t extents, int delete)
510 {
511 struct lv_segment *seg;
512 uint32_t count = extents;
513 uint32_t reduction;
514
515 dm_list_iterate_back_items(seg, &lv->segments) {
516 if (!count)
517 break;
518
519 if (seg->len <= count) {
520 /* remove this segment completely */
521 /* FIXME Check this is safe */
522 if (seg->log_lv && !lv_remove(seg->log_lv))
523 return_0;
524
525 if (seg->metadata_lv && !lv_remove(seg->metadata_lv))
526 return_0;
527
528 if (seg->pool_lv) {
529 if (!detach_pool_lv(seg))
530 return_0;
531 }
532
533 dm_list_del(&seg->list);
534 reduction = seg->len;
535 } else
536 reduction = count;
537
538 if (!_lv_segment_reduce(seg, reduction))
539 return_0;
540 count -= reduction;
541 }
542
543 lv->le_count -= extents;
544 lv->size = (uint64_t) lv->le_count * lv->vg->extent_size;
545
546 if (!delete)
547 return 1;
548
549 /* Remove the LV if it is now empty */
550 if (!lv->le_count && !unlink_lv_from_vg(lv))
551 return_0;
552 else if (lv->vg->fid->fmt->ops->lv_setup &&
553 !lv->vg->fid->fmt->ops->lv_setup(lv->vg->fid, lv))
554 return_0;
555
556 return 1;
557 }
558
559 /*
560 * Empty an LV.
561 */
562 int lv_empty(struct logical_volume *lv)
563 {
564 return _lv_reduce(lv, lv->le_count, 0);
565 }
566
567 /*
568 * Empty an LV and add error segment.
569 */
570 int replace_lv_with_error_segment(struct logical_volume *lv)
571 {
572 uint32_t len = lv->le_count;
573
574 if (len && !lv_empty(lv))
575 return_0;
576
577 /* Minimum size required for a table. */
578 if (!len)
579 len = 1;
580
581 /*
582 	 * Since we are replacing whatever was there with
583 * an error segment, we should also clear any flags
584 * that suggest it is anything other than "error".
585 */
586 lv->status &= ~(MIRRORED|PVMOVE);
587
588 /* FIXME: Should we bug if we find a log_lv attached? */
589
590 if (!lv_add_virtual_segment(lv, 0, len, get_segtype_from_string(lv->vg->cmd, "error"), NULL))
591 return_0;
592
593 return 1;
594 }
595
596 /*
597 * Remove given number of extents from LV.
598 */
599 int lv_reduce(struct logical_volume *lv, uint32_t extents)
600 {
601 return _lv_reduce(lv, extents, 1);
602 }
603
604 /*
605 * Completely remove an LV.
606 */
607 int lv_remove(struct logical_volume *lv)
608 {
609
610 if (!lv_reduce(lv, lv->le_count))
611 return_0;
612
613 return 1;
614 }
615
616 /*
617 * A set of contiguous physical extents allocated
618 */
619 struct alloced_area {
620 struct dm_list list;
621
622 struct physical_volume *pv;
623 uint32_t pe;
624 uint32_t len;
625 };
626
627 /*
628 * Details of an allocation attempt
629 */
630 struct alloc_handle {
631 struct cmd_context *cmd;
632 struct dm_pool *mem;
633
634 alloc_policy_t alloc; /* Overall policy */
635 uint32_t new_extents; /* Number of new extents required */
636 uint32_t area_count; /* Number of parallel areas */
637 uint32_t parity_count; /* Adds to area_count, but not area_multiple */
638 uint32_t area_multiple; /* seg->len = area_len * area_multiple */
639 uint32_t log_area_count; /* Number of parallel logs */
640 uint32_t metadata_area_count; /* Number of parallel metadata areas */
641 uint32_t log_len; /* Length of log/metadata_area */
642 uint32_t region_size; /* Mirror region size */
643 uint32_t total_area_len; /* Total number of parallel extents */
644
645 unsigned maximise_cling;
646 unsigned mirror_logs_separate; /* Force mirror logs on separate PVs? */
647
648 /*
649 * RAID devices require a metadata area that accompanies each
650 * device. During initial creation, it is best to look for space
651 * that is new_extents + log_len and then split that between two
652 	 * allocated areas when found. 'alloc_and_split_meta' indicates
653 	 * that this combined allocate-and-split behaviour is wanted.
654 */
655 unsigned alloc_and_split_meta;
656
657 const struct dm_config_node *cling_tag_list_cn;
658
659 struct dm_list *parallel_areas; /* PVs to avoid */
660
661 /*
662 * Contains area_count lists of areas allocated to data stripes
663 * followed by log_area_count lists of areas allocated to log stripes.
664 */
665 struct dm_list alloced_areas[0];
666 };
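
/*
 * A minimal sketch of the alloced_areas layout (values assumed for
 * illustration): for a 2-way mirror with one mirror log, area_count is 2
 * and log_area_count is 1, so alloced_areas[0] and alloced_areas[1] hold
 * the lists of areas allocated to the two data images and alloced_areas[2]
 * holds the areas allocated to the log.
 */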
667
668 static uint32_t _calc_area_multiple(const struct segment_type *segtype,
669 const uint32_t area_count, const uint32_t stripes)
670 {
671 if (!area_count)
672 return 1;
673
674 /* Striped */
675 if (segtype_is_striped(segtype))
676 return area_count;
677
678 /* Mirrored stripes */
679 if (stripes)
680 return stripes;
681
682 /* Mirrored */
683 return 1;
684 }
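
/*
 * Illustrative examples (a sketch, not exhaustive): for a striped segment
 * with 3 stripes the multiple is 3, so seg->len = 3 * area_len; for a
 * plain 2-way mirror the multiple is 1, so each image area is as long as
 * the whole segment; for mirrored stripes the 'stripes' argument is used.
 */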
685
686 /*
687 * Returns log device size in extents, algorithm from kernel code
688 */
689 #define BYTE_SHIFT 3
690 static uint32_t mirror_log_extents(uint32_t region_size, uint32_t pe_size, uint32_t area_len)
691 {
692 size_t area_size, bitset_size, log_size, region_count;
693
694 area_size = area_len * pe_size;
695 region_count = dm_div_up(area_size, region_size);
696
697 /* Work out how many "unsigned long"s we need to hold the bitset. */
698 bitset_size = dm_round_up(region_count, sizeof(uint32_t) << BYTE_SHIFT);
699 bitset_size >>= BYTE_SHIFT;
700
701 /* Log device holds both header and bitset. */
702 log_size = dm_round_up((MIRROR_LOG_OFFSET << SECTOR_SHIFT) + bitset_size, 1 << SECTOR_SHIFT);
703 log_size >>= SECTOR_SHIFT;
704 log_size = dm_div_up(log_size, pe_size);
705
706 /*
707 * Kernel requires a mirror to be at least 1 region large. So,
708 * if our mirror log is itself a mirror, it must be at least
709 * 1 region large. This restriction may not be necessary for
710 * non-mirrored logs, but we apply the rule anyway.
711 *
712 * (The other option is to make the region size of the log
713 * mirror smaller than the mirror it is acting as a log for,
714 * but that really complicates things. It's much easier to
715 * keep the region_size the same for both.)
716 */
717 return (log_size > (region_size / pe_size)) ? log_size :
718 (region_size / pe_size);
719 }
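
/*
 * Worked example (a sketch with assumed values): with 4MiB extents
 * (pe_size = 8192 sectors), a region_size of 1024 sectors (512KiB) and
 * area_len = 25600 extents (100GiB), region_count is 204800 and the
 * bitset needs 25600 bytes; assuming MIRROR_LOG_OFFSET is 2 sectors,
 * the log rounds up to 52 sectors, which fits in a single extent, and
 * since region_size / pe_size is 0 the function returns 1.
 */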
720
721 /*
722 * Preparation for a specific allocation attempt
723 * stripes and mirrors refer to the parallel areas used for data.
724 * If log_area_count > 1 it is always mirrored (not striped).
725 */
726 static struct alloc_handle *_alloc_init(struct cmd_context *cmd,
727 struct dm_pool *mem,
728 const struct segment_type *segtype,
729 alloc_policy_t alloc,
730 uint32_t new_extents,
731 uint32_t mirrors,
732 uint32_t stripes,
733 uint32_t metadata_area_count,
734 uint32_t extent_size,
735 uint32_t region_size,
736 struct dm_list *parallel_areas)
737 {
738 struct alloc_handle *ah;
739 uint32_t s, area_count, alloc_count, parity_count;
740 size_t size = 0;
741
742 /* FIXME Caller should ensure this */
743 if (mirrors && !stripes)
744 stripes = 1;
745
746 if (segtype_is_virtual(segtype))
747 area_count = 0;
748 else if (mirrors > 1)
749 area_count = mirrors * stripes;
750 else
751 area_count = stripes;
752
753 size = sizeof(*ah);
754
755 /*
756 * It is a requirement that RAID 4/5/6 are created with a number of
757 	 * stripes that is greater than the number of parity devices. (e.g.
758 * RAID4/5 must have at least 2 stripes and RAID6 must have at least
759 * 3.) It is also a constraint that, when replacing individual devices
760 * in a RAID 4/5/6 array, no more devices can be replaced than
761 * there are parity devices. (Otherwise, there would not be enough
762 * redundancy to maintain the array.) Understanding these two
763 * constraints allows us to infer whether the caller of this function
764 * is intending to allocate an entire array or just replacement
765 * component devices. In the former case, we must account for the
766 	 * necessary parity_count. In the latter case, we do not need to
767 * account for the extra parity devices because the array already
768 * exists and they only want replacement drives.
769 */
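	/*
	 * For example: creating a 3-stripe raid5 LV gives area_count 3,
	 * which exceeds parity_devs 1, so parity_count becomes 1; replacing
	 * a single raid5 image gives area_count 1 <= parity_devs 1, so
	 * parity_count stays 0.
	 */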
770 parity_count = (area_count <= segtype->parity_devs) ? 0 :
771 segtype->parity_devs;
772 alloc_count = area_count + parity_count;
773 if (segtype_is_raid(segtype) && metadata_area_count)
774 /* RAID has a meta area for each device */
775 alloc_count *= 2;
776 else
777 /* mirrors specify their exact log count */
778 alloc_count += metadata_area_count;
779
780 size += sizeof(ah->alloced_areas[0]) * alloc_count;
781
782 if (!(ah = dm_pool_zalloc(mem, size))) {
783 log_error("allocation handle allocation failed");
784 return NULL;
785 }
786
787 ah->cmd = cmd;
788
789 if (segtype_is_virtual(segtype))
790 return ah;
791
792 if (!(area_count + metadata_area_count)) {
793 log_error(INTERNAL_ERROR "_alloc_init called for non-virtual segment with no disk space.");
794 return NULL;
795 }
796
797 if (!(ah->mem = dm_pool_create("allocation", 1024))) {
798 log_error("allocation pool creation failed");
799 return NULL;
800 }
801
802 if (mirrors || stripes)
803 ah->new_extents = new_extents;
804 else
805 ah->new_extents = 0;
806 ah->area_count = area_count;
807 ah->parity_count = parity_count;
808 ah->region_size = region_size;
809 ah->alloc = alloc;
810 ah->area_multiple = _calc_area_multiple(segtype, area_count, stripes);
811 ah->mirror_logs_separate = find_config_tree_bool(cmd, "allocation/mirror_logs_require_separate_pvs",
812 DEFAULT_MIRROR_LOGS_REQUIRE_SEPARATE_PVS);
813
814 if (segtype_is_raid(segtype)) {
815 if (metadata_area_count) {
816 if (metadata_area_count != area_count)
817 log_error(INTERNAL_ERROR
818 "Bad metadata_area_count");
819 ah->metadata_area_count = area_count;
820 ah->alloc_and_split_meta = 1;
821
822 ah->log_len = RAID_METADATA_AREA_LEN;
823
824 /*
825 * We need 'log_len' extents for each
826 * RAID device's metadata_area
827 */
828 ah->new_extents += (ah->log_len * ah->area_multiple);
829 } else {
830 ah->log_area_count = 0;
831 ah->log_len = 0;
832 }
833 } else if (segtype_is_thin_pool(segtype)) {
834 ah->log_area_count = metadata_area_count;
835 /* thin_pool uses region_size to pass metadata size in extents */
836 ah->log_len = ah->region_size;
837 ah->region_size = 0;
838 ah->mirror_logs_separate =
839 find_config_tree_bool(cmd, "allocation/thin_pool_metadata_require_separate_pvs",
840 DEFAULT_THIN_POOL_METADATA_REQUIRE_SEPARATE_PVS);
841 } else {
842 ah->log_area_count = metadata_area_count;
843 ah->log_len = !metadata_area_count ? 0 :
844 mirror_log_extents(ah->region_size, extent_size,
845 new_extents / ah->area_multiple);
846 }
847
848 for (s = 0; s < alloc_count; s++)
849 dm_list_init(&ah->alloced_areas[s]);
850
851 ah->parallel_areas = parallel_areas;
852
853 ah->cling_tag_list_cn = find_config_tree_node(cmd, "allocation/cling_tag_list");
854
855 ah->maximise_cling = find_config_tree_bool(cmd, "allocation/maximise_cling", DEFAULT_MAXIMISE_CLING);
856
857 return ah;
858 }
859
860 void alloc_destroy(struct alloc_handle *ah)
861 {
862 if (ah->mem)
863 dm_pool_destroy(ah->mem);
864 }
865
866 /* Is there enough total space or should we give up immediately? */
867 static int _sufficient_pes_free(struct alloc_handle *ah, struct dm_list *pvms,
868 uint32_t allocated, uint32_t extents_still_needed)
869 {
870 uint32_t area_extents_needed = (extents_still_needed - allocated) * ah->area_count / ah->area_multiple;
871 uint32_t parity_extents_needed = (extents_still_needed - allocated) * ah->parity_count / ah->area_multiple;
872 uint32_t metadata_extents_needed = ah->metadata_area_count * RAID_METADATA_AREA_LEN; /* One each */
873 uint32_t total_extents_needed = area_extents_needed + parity_extents_needed + metadata_extents_needed;
874 uint32_t free_pes = pv_maps_size(pvms);
875
876 if (total_extents_needed > free_pes) {
877 log_error("Insufficient free space: %" PRIu32 " extents needed,"
878 " but only %" PRIu32 " available",
879 total_extents_needed, free_pes);
880 return 0;
881 }
882
883 return 1;
884 }
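
/*
 * A quick sketch with assumed numbers: a 2-way mirror (area_count 2,
 * area_multiple 1, no parity, no RAID metadata) still needing 100 extents
 * requires at least 200 free physical extents across the PV maps, so the
 * allocation is refused up front if fewer are available.
 */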
885
886 /* For striped mirrors, all the areas are counted, through the mirror layer */
887 static uint32_t _stripes_per_mimage(struct lv_segment *seg)
888 {
889 struct lv_segment *last_lvseg;
890
891 if (seg_is_mirrored(seg) && seg->area_count && seg_type(seg, 0) == AREA_LV) {
892 last_lvseg = dm_list_item(dm_list_last(&seg_lv(seg, 0)->segments), struct lv_segment);
893 if (seg_is_striped(last_lvseg))
894 return last_lvseg->area_count;
895 }
896
897 return 1;
898 }
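
/*
 * For example (a sketch): in a striped mirror whose images are themselves
 * striped LVs across 3 PVs, the last segment of the first image is striped
 * with area_count 3, so this returns 3; in all other cases it returns 1.
 */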
899
900 static void _init_alloc_parms(struct alloc_handle *ah, struct alloc_parms *alloc_parms, alloc_policy_t alloc,
901 struct lv_segment *prev_lvseg, unsigned can_split,
902 uint32_t allocated, uint32_t extents_still_needed)
903 {
904 alloc_parms->alloc = alloc;
905 alloc_parms->prev_lvseg = prev_lvseg;
906 alloc_parms->flags = 0;
907 alloc_parms->extents_still_needed = extents_still_needed;
908
909 /* Are there any preceding segments we must follow on from? */
910 if (alloc_parms->prev_lvseg) {
911 if (alloc_parms->alloc == ALLOC_CONTIGUOUS)
912 alloc_parms->flags |= A_CONTIGUOUS_TO_LVSEG;
913 else if ((alloc_parms->alloc == ALLOC_CLING) || (alloc_parms->alloc == ALLOC_CLING_BY_TAGS))
914 alloc_parms->flags |= A_CLING_TO_LVSEG;
915 } else
916 /*
917 * A cling allocation that follows a successful contiguous allocation
918 * must use the same PVs (or else fail).
919 */
920 if ((alloc_parms->alloc == ALLOC_CLING) || (alloc_parms->alloc == ALLOC_CLING_BY_TAGS))
921 alloc_parms->flags |= A_CLING_TO_ALLOCED;
922
923 if (alloc_parms->alloc == ALLOC_CLING_BY_TAGS)
924 alloc_parms->flags |= A_CLING_BY_TAGS;
925
926 /*
927 * For normal allocations, if any extents have already been found
928 * for allocation, prefer to place further extents on the same disks as
929 * have already been used.
930 */
931 if (ah->maximise_cling && alloc_parms->alloc == ALLOC_NORMAL && allocated != alloc_parms->extents_still_needed)
932 alloc_parms->flags |= A_CLING_TO_ALLOCED;
933
934 if (can_split)
935 alloc_parms->flags |= A_CAN_SPLIT;
936 }
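
/*
 * In short: when a previous segment exists, ALLOC_CONTIGUOUS demands space
 * contiguous to it and ALLOC_CLING / ALLOC_CLING_BY_TAGS demand the same
 * PVs; without one, the cling policies must reuse PVs already chosen in
 * this allocation; ALLOC_CLING_BY_TAGS additionally matches by tags, and
 * ALLOC_NORMAL with maximise_cling prefers disks that have already received
 * extents during this attempt.
 */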
937
938 static int _log_parallel_areas(struct dm_pool *mem, struct dm_list *parallel_areas)
939 {
940 struct seg_pvs *spvs;
941 struct pv_list *pvl;
942 char *pvnames;
943
944 if (!parallel_areas)
945 return 1;
946
947 dm_list_iterate_items(spvs, parallel_areas) {
948 if (!dm_pool_begin_object(mem, 256)) {
949 log_error("dm_pool_begin_object failed");
950 return 0;
951 }
952
953 dm_list_iterate_items(pvl, &spvs->pvs) {
954 if (!dm_pool_grow_object(mem, pv_dev_name(pvl->pv), strlen(pv_dev_name(pvl->pv)))) {
955 log_error("dm_pool_grow_object failed");
956 dm_pool_abandon_object(mem);
957 return 0;
958 }
959 if (!dm_pool_grow_object(mem, " ", 1)) {
960 log_error("dm_pool_grow_object failed");
961 dm_pool_abandon_object(mem);
962 return 0;
963 }
964 }
965
966 if (!dm_pool_grow_object(mem, "\0", 1)) {
967 log_error("dm_pool_grow_object failed");
968 dm_pool_abandon_object(mem);
969 return 0;
970 }
971
972 pvnames = dm_pool_end_object(mem);
973 log_debug("Parallel PVs at LE %" PRIu32 " length %" PRIu32 ": %s",
974 spvs->le, spvs->len, pvnames);
975 dm_pool_free(mem, pvnames);
976 }
977
978 return 1;
979 }
980
981 static int _setup_alloced_segment(struct logical_volume *lv, uint64_t status,
982 uint32_t area_count,
983 uint32_t stripe_size,
984 const struct segment_type *segtype,
985 struct alloced_area *aa,
986 uint32_t region_size)
987 {
988 uint32_t s, extents, area_multiple;
989 struct lv_segment *seg;
990
991 area_multiple = _calc_area_multiple(segtype, area_count, 0);
992
993 if (!(seg = alloc_lv_segment(segtype, lv, lv->le_count,
994 aa[0].len * area_multiple,
995 status, stripe_size, NULL, NULL,
996 area_count,
997 aa[0].len, 0u, region_size, 0u, NULL))) {
998 log_error("Couldn't allocate new LV segment.");
999 return 0;
1000 }
1001
1002 for (s = 0; s < area_count; s++)
1003 if (!set_lv_segment_area_pv(seg, s, aa[s].pv, aa[s].pe))
1004 return_0;
1005
1006 dm_list_add(&lv->segments, &seg->list);
1007
1008 extents = aa[0].len * area_multiple;
1009 lv->le_count += extents;
1010 lv->size += (uint64_t) extents *lv->vg->extent_size;
1011
1012 if (segtype_is_mirrored(segtype))
1013 lv->status |= MIRRORED;
1014
1015 return 1;
1016 }
1017
1018 static int _setup_alloced_segments(struct logical_volume *lv,
1019 struct dm_list *alloced_areas,
1020 uint32_t area_count,
1021 uint64_t status,
1022 uint32_t stripe_size,
1023 const struct segment_type *segtype,
1024 uint32_t region_size)
1025 {
1026 struct alloced_area *aa;
1027
1028 dm_list_iterate_items(aa, &alloced_areas[0]) {
1029 if (!_setup_alloced_segment(lv, status, area_count,
1030 stripe_size, segtype, aa,
1031 region_size))
1032 return_0;
1033 }
1034
1035 return 1;
1036 }
1037
1038 /*
1039 * This function takes a list of pv_areas and adds them to allocated_areas.
1040 * If the complete area is not needed then it gets split.
1041 * The part used is removed from the pv_map so it can't be allocated twice.
1042 */
1043 static int _alloc_parallel_area(struct alloc_handle *ah, uint32_t max_to_allocate,
1044 struct alloc_state *alloc_state, uint32_t ix_log_offset)
1045 {
1046 uint32_t area_len, len;
1047 uint32_t s;
1048 uint32_t ix_log_skip = 0; /* How many areas to skip in middle of array to reach log areas */
1049 uint32_t total_area_count;
1050 struct alloced_area *aa;
1051 struct pv_area *pva;
1052
1053 total_area_count = ah->area_count + alloc_state->log_area_count_still_needed;
1054 total_area_count += ah->parity_count;
1055 if (!total_area_count) {
1056 log_error(INTERNAL_ERROR "_alloc_parallel_area called without any allocation to do.");
1057 return 1;
1058 }
1059
1060 area_len = max_to_allocate / ah->area_multiple;
1061
1062 /* Reduce area_len to the smallest of the areas */
1063 for (s = 0; s < ah->area_count + ah->parity_count; s++)
1064 if (area_len > alloc_state->areas[s].used)
1065 area_len = alloc_state->areas[s].used;
1066
1067 len = (ah->alloc_and_split_meta) ? total_area_count * 2 : total_area_count;
1068 len *= sizeof(*aa);
1069 if (!(aa = dm_pool_alloc(ah->mem, len))) {
1070 log_error("alloced_area allocation failed");
1071 return 0;
1072 }
1073
1074 /*
1075 	 * The areas array consists of area_count areas for data stripes, then
1076 * ix_log_skip areas to skip, then log_area_count areas to use for the
1077 * log, then some areas too small for the log.
1078 */
1079 len = area_len;
1080 for (s = 0; s < total_area_count; s++) {
1081 if (s == (ah->area_count + ah->parity_count)) {
1082 ix_log_skip = ix_log_offset - ah->area_count;
1083 len = ah->log_len;
1084 }
1085
1086 pva = alloc_state->areas[s + ix_log_skip].pva;
1087 if (ah->alloc_and_split_meta) {
1088 /*
1089 * The metadata area goes at the front of the allocated
1090 * space for now, but could easily go at the end (or
1091 * middle!).
1092 *
1093 * Even though we split these two from the same
1094 * allocation, we store the images at the beginning
1095 * of the areas array and the metadata at the end.
1096 */
1097 s += ah->area_count + ah->parity_count;
1098 aa[s].pv = pva->map->pv;
1099 aa[s].pe = pva->start;
1100 aa[s].len = ah->log_len;
1101
1102 log_debug("Allocating parallel metadata area %" PRIu32
1103 " on %s start PE %" PRIu32
1104 " length %" PRIu32 ".",
1105 (s - (ah->area_count + ah->parity_count)),
1106 pv_dev_name(aa[s].pv), aa[s].pe,
1107 ah->log_len);
1108
1109 consume_pv_area(pva, ah->log_len);
1110 dm_list_add(&ah->alloced_areas[s], &aa[s].list);
1111 s -= ah->area_count + ah->parity_count;
1112 }
1113 aa[s].pv = pva->map->pv;
1114 aa[s].pe = pva->start;
1115 aa[s].len = (ah->alloc_and_split_meta) ? len - ah->log_len : len;
1116
1117 log_debug("Allocating parallel area %" PRIu32
1118 " on %s start PE %" PRIu32 " length %" PRIu32 ".",
1119 s, pv_dev_name(aa[s].pv), aa[s].pe, aa[s].len);
1120
1121 consume_pv_area(pva, aa[s].len);
1122
1123 dm_list_add(&ah->alloced_areas[s], &aa[s].list);
1124 }
1125
1126 /* Only need to alloc metadata from the first batch */
1127 ah->alloc_and_split_meta = 0;
1128
1129 ah->total_area_len += area_len;
1130
1131 alloc_state->allocated += area_len * ah->area_multiple;
1132
1133 return 1;
1134 }
1135
1136 /*
1137 * Call fn for each AREA_PV used by the LV segment at lv:le of length *max_seg_len.
1138 * If any constituent area contains more than one segment, max_seg_len is
1139 * reduced to cover only the first.
1140 * fn should return 0 on error, 1 to continue scanning or >1 to terminate without error.
1141 * In the last case, this function passes on the return code.
1142 */
1143 static int _for_each_pv(struct cmd_context *cmd, struct logical_volume *lv,
1144 uint32_t le, uint32_t len, struct lv_segment *seg,
1145 uint32_t *max_seg_len,
1146 uint32_t first_area, uint32_t max_areas,
1147 int top_level_area_index,
1148 int only_single_area_segments,
1149 int (*fn)(struct cmd_context *cmd,
1150 struct pv_segment *peg, uint32_t s,
1151 void *data),
1152 void *data)
1153 {
1154 uint32_t s;
1155 uint32_t remaining_seg_len, area_len, area_multiple;
1156 uint32_t stripes_per_mimage = 1;
1157 int r = 1;
1158
1159 if (!seg && !(seg = find_seg_by_le(lv, le))) {
1160 log_error("Failed to find segment for %s extent %" PRIu32,
1161 lv->name, le);
1162 return 0;
1163 }
1164
1165 /* Remaining logical length of segment */
1166 remaining_seg_len = seg->len - (le - seg->le);
1167
1168 if (remaining_seg_len > len)
1169 remaining_seg_len = len;
1170
1171 if (max_seg_len && *max_seg_len > remaining_seg_len)
1172 *max_seg_len = remaining_seg_len;
1173
1174 area_multiple = _calc_area_multiple(seg->segtype, seg->area_count, 0);
1175 area_len = remaining_seg_len / area_multiple ? : 1;
1176
1177 /* For striped mirrors, all the areas are counted, through the mirror layer */
1178 if (top_level_area_index == -1)
1179 stripes_per_mimage = _stripes_per_mimage(seg);
1180
1181 for (s = first_area;
1182 s < seg->area_count && (!max_areas || s <= max_areas);
1183 s++) {
1184 if (seg_type(seg, s) == AREA_LV) {
1185 if (!(r = _for_each_pv(cmd, seg_lv(seg, s),
1186 seg_le(seg, s) +
1187 (le - seg->le) / area_multiple,
1188 area_len, NULL, max_seg_len, 0,
1189 (stripes_per_mimage == 1) && only_single_area_segments ? 1U : 0U,
1190 (top_level_area_index != -1) ? top_level_area_index : (int) (s * stripes_per_mimage),
1191 only_single_area_segments, fn,
1192 data)))
1193 stack;
1194 } else if (seg_type(seg, s) == AREA_PV)
1195 if (!(r = fn(cmd, seg_pvseg(seg, s), top_level_area_index != -1 ? (uint32_t) top_level_area_index + s : s, data)))
1196 stack;
1197 if (r != 1)
1198 return r;
1199 }
1200
1201 /* FIXME only_single_area_segments used as workaround to skip log LV - needs new param? */
1202 if (!only_single_area_segments && seg_is_mirrored(seg) && seg->log_lv) {
1203 if (!(r = _for_each_pv(cmd, seg->log_lv, 0, seg->log_lv->le_count, NULL,
1204 NULL, 0, 0, 0, only_single_area_segments,
1205 fn, data)))
1206 stack;
1207 if (r != 1)
1208 return r;
1209 }
1210
1211 /* FIXME Add snapshot cow LVs etc. */
1212
1213 return 1;
1214 }
1215
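/*
 * qsort(3) comparison function: orders pv_area_used entries by 'used' in
 * descending order, so the largest areas sort first.
 */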
1216 static int _comp_area(const void *l, const void *r)
1217 {
1218 const struct pv_area_used *lhs = (const struct pv_area_used *) l;
1219 const struct pv_area_used *rhs = (const struct pv_area_used *) r;
1220
1221 if (lhs->used < rhs->used)
1222 return 1;
1223
1224 else if (lhs->used > rhs->used)
1225 return -1;
1226
1227 return 0;
1228 }
1229
1230 /*
1231 * Search for pvseg that matches condition
1232 */
1233 struct pv_match {
1234 int (*condition)(struct pv_match *pvmatch, struct pv_segment *pvseg, struct pv_area *pva);
1235
1236 struct pv_area_used *areas;
1237 struct pv_area *pva;
1238 uint32_t areas_size;
1239 const struct dm_config_node *cling_tag_list_cn;
1240 int s; /* Area index of match */
1241 };
1242
1243 /*
1244 * Is PV area on the same PV?
1245 */
1246 static int _is_same_pv(struct pv_match *pvmatch __attribute((unused)), struct pv_segment *pvseg, struct pv_area *pva)
1247 {
1248 if (pvseg->pv != pva->map->pv)
1249 return 0;
1250
1251 return 1;
1252 }
1253
1254 /*
1255 * Does PV area have a tag listed in allocation/cling_tag_list that
1256 * matches a tag of the PV of the existing segment?
1257 */
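/*
 * An illustrative lvm.conf fragment (the tag names here are arbitrary):
 *
 *     allocation {
 *         cling_tag_list = [ "@site_a", "@site_b" ]
 *     }
 *
 * With this setting, free space on a PV tagged "site_a" may be used to
 * extend a segment that already lies on a PV tagged "site_a"; the special
 * entry "@*" matches any tag against any tag.
 */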
1258 static int _pvs_have_matching_tag(const struct dm_config_node *cling_tag_list_cn, struct physical_volume *pv1, struct physical_volume *pv2)
1259 {
1260 const struct dm_config_value *cv;
1261 const char *str;
1262 const char *tag_matched;
1263
1264 for (cv = cling_tag_list_cn->v; cv; cv = cv->next) {
1265 if (cv->type != DM_CFG_STRING) {
1266 log_error("Ignoring invalid string in config file entry "
1267 "allocation/cling_tag_list");
1268 continue;
1269 }
1270 str = cv->v.str;
1271 if (!*str) {
1272 log_error("Ignoring empty string in config file entry "
1273 "allocation/cling_tag_list");
1274 continue;
1275 }
1276
1277 if (*str != '@') {
1278 log_error("Ignoring string not starting with @ in config file entry "
1279 "allocation/cling_tag_list: %s", str);
1280 continue;
1281 }
1282
1283 str++;
1284
1285 if (!*str) {
1286 log_error("Ignoring empty tag in config file entry "
1287 "allocation/cling_tag_list");
1288 continue;
1289 }
1290
1291 /* Wildcard matches any tag against any tag. */
1292 if (!strcmp(str, "*")) {
1293 if (!str_list_match_list(&pv1->tags, &pv2->tags, &tag_matched))
1294 continue;
1295 else {
1296 log_debug("Matched allocation PV tag %s on existing %s with free space on %s.",
1297 tag_matched, pv_dev_name(pv1), pv_dev_name(pv2));
1298 return 1;
1299 }
1300 }
1301
1302 if (!str_list_match_item(&pv1->tags, str) ||
1303 !str_list_match_item(&pv2->tags, str))
1304 continue;
1305 else {
1306 log_debug("Matched allocation PV tag %s on existing %s with free space on %s.",
1307 str, pv_dev_name(pv1), pv_dev_name(pv2));
1308 return 1;
1309 }
1310 }
1311
1312 return 0;
1313 }
1314
1315 static int _has_matching_pv_tag(struct pv_match *pvmatch, struct pv_segment *pvseg, struct pv_area *pva)
1316 {
1317 return _pvs_have_matching_tag(pvmatch->cling_tag_list_cn, pvseg->pv, pva->map->pv);
1318 }
1319
1320 /*
1321 * Is PV area contiguous to PV segment?
1322 */
1323 static int _is_contiguous(struct pv_match *pvmatch __attribute((unused)), struct pv_segment *pvseg, struct pv_area *pva)
1324 {
1325 if (pvseg->pv != pva->map->pv)
1326 return 0;
1327
1328 if (pvseg->pe + pvseg->len != pva->start)
1329 return 0;
1330
1331 return 1;
1332 }
1333
1334 static void _reserve_area(struct pv_area_used *area_used, struct pv_area *pva, uint32_t required,
1335 uint32_t ix_pva, uint32_t unreserved)
1336 {
1337 log_debug("%s allocation area %" PRIu32 " %s %s start PE %" PRIu32
1338 " length %" PRIu32 " leaving %" PRIu32 ".",
1339 area_used->pva ? "Changing " : "Considering",
1340 ix_pva - 1, area_used->pva ? "to" : "as",
1341 dev_name(pva->map->pv->dev), pva->start, required, unreserved);
1342
1343 area_used->pva = pva;
1344 area_used->used = required;
1345 }
1346
1347 static int _is_condition(struct cmd_context *cmd __attribute__((unused)),
1348 struct pv_segment *pvseg, uint32_t s,
1349 void *data)
1350 {
1351 struct pv_match *pvmatch = data;
1352
1353 if (pvmatch->areas[s].pva)
1354 return 1; /* Area already assigned */
1355
1356 if (!pvmatch->condition(pvmatch, pvseg, pvmatch->pva))
1357 return 1; /* Continue */
1358
1359 if (s >= pvmatch->areas_size)
1360 return 1;
1361
1362 /*
1363 * Only used for cling and contiguous policies (which only make one allocation per PV)
1364 * so it's safe to say all the available space is used.
1365 */
1366 _reserve_area(&pvmatch->areas[s], pvmatch->pva, pvmatch->pva->count, s + 1, 0);
1367
1368 return 2; /* Finished */
1369 }
1370
1371 /*
1372 * Is pva on same PV as any existing areas?
1373 */
1374 static int _check_cling(struct alloc_handle *ah,
1375 const struct dm_config_node *cling_tag_list_cn,
1376 struct lv_segment *prev_lvseg, struct pv_area *pva,
1377 struct alloc_state *alloc_state)
1378 {
1379 struct pv_match pvmatch;
1380 int r;
1381 uint32_t le, len;
1382
1383 pvmatch.condition = cling_tag_list_cn ? _has_matching_pv_tag : _is_same_pv;
1384 pvmatch.areas = alloc_state->areas;
1385 pvmatch.areas_size = alloc_state->areas_size;
1386 pvmatch.pva = pva;
1387 pvmatch.cling_tag_list_cn = cling_tag_list_cn;
1388
1389 if (ah->maximise_cling) {
1390 /* Check entire LV */
1391 le = 0;
1392 len = prev_lvseg->le + prev_lvseg->len;
1393 } else {
1394 /* Only check 1 LE at end of previous LV segment */
1395 le = prev_lvseg->le + prev_lvseg->len - 1;
1396 len = 1;
1397 }
1398
1399 /* FIXME Cope with stacks by flattening */
1400 if (!(r = _for_each_pv(ah->cmd, prev_lvseg->lv, le, len, NULL, NULL,
1401 0, 0, -1, 1,
1402 _is_condition, &pvmatch)))
1403 stack;
1404
1405 if (r != 2)
1406 return 0;
1407
1408 return 1;
1409 }
1410
1411 /*
1412 * Is pva contiguous to any existing areas or on the same PV?
1413 */
1414 static int _check_contiguous(struct cmd_context *cmd,
1415 struct lv_segment *prev_lvseg, struct pv_area *pva,
1416 struct alloc_state *alloc_state)
1417 {
1418 struct pv_match pvmatch;
1419 int r;
1420
1421 pvmatch.condition = _is_contiguous;
1422 pvmatch.areas = alloc_state->areas;
1423 pvmatch.areas_size = alloc_state->areas_size;
1424 pvmatch.pva = pva;
1425 pvmatch.cling_tag_list_cn = NULL;
1426
1427 /* FIXME Cope with stacks by flattening */
1428 if (!(r = _for_each_pv(cmd, prev_lvseg->lv,
1429 prev_lvseg->le + prev_lvseg->len - 1, 1, NULL, NULL,
1430 0, 0, -1, 1,
1431 _is_condition, &pvmatch)))
1432 stack;
1433
1434 if (r != 2)
1435 return 0;
1436
1437 return 1;
1438 }
1439
1440 /*
1441 * Is pva on same PV as any areas already used in this allocation attempt?
1442 */
1443 static int _check_cling_to_alloced(struct alloc_handle *ah, const struct dm_config_node *cling_tag_list_cn,
1444 struct pv_area *pva, struct alloc_state *alloc_state)
1445 {
1446 unsigned s;
1447 struct alloced_area *aa;
1448
1449 /*
1450 * Ignore log areas. They are always allocated whole as part of the
1451 * first allocation. If they aren't yet set, we know we've nothing to do.
1452 */
1453 if (alloc_state->log_area_count_still_needed)
1454 return 0;
1455
1456 for (s = 0; s < ah->area_count; s++) {
1457 if (alloc_state->areas[s].pva)
1458 continue; /* Area already assigned */
1459 dm_list_iterate_items(aa, &ah->alloced_areas[s]) {
1460 if ((!cling_tag_list_cn && (pva->map->pv == aa[0].pv)) ||
1461 (cling_tag_list_cn && _pvs_have_matching_tag(cling_tag_list_cn, pva->map->pv, aa[0].pv))) {
1462 _reserve_area(&alloc_state->areas[s], pva, pva->count, s + 1, 0);
1463 return 1;
1464 }
1465 }
1466 }
1467
1468 return 0;
1469 }
1470
1471 static int _pv_is_parallel(struct physical_volume *pv, struct dm_list *parallel_pvs)
1472 {
1473 struct pv_list *pvl;
1474
1475 dm_list_iterate_items(pvl, parallel_pvs)
1476 if (pv == pvl->pv)
1477 return 1;
1478
1479 return 0;
1480 }
1481
1482 /*
1483 * Decide whether or not to try allocation from supplied area pva.
1484 * alloc_state->areas may get modified.
1485 */
1486 static area_use_t _check_pva(struct alloc_handle *ah, struct pv_area *pva, uint32_t still_needed,
1487 const struct alloc_parms *alloc_parms, struct alloc_state *alloc_state,
1488 unsigned already_found_one, unsigned iteration_count, unsigned log_iteration_count)
1489 {
1490 unsigned s;
1491
1492 /* Skip fully-reserved areas (which are not currently removed from the list). */
1493 if (!pva->unreserved)
1494 return NEXT_AREA;
1495
1496 /* FIXME Should this test be removed? */
1497 if (iteration_count)
1498 /*
1499 * Don't use an area twice.
1500 */
1501 for (s = 0; s < alloc_state->areas_size; s++)
1502 if (alloc_state->areas[s].pva == pva)
1503 return NEXT_AREA;
1504
1505 /* If maximise_cling is set, perform several checks, otherwise perform exactly one. */
1506 if (!iteration_count && !log_iteration_count && alloc_parms->flags & (A_CONTIGUOUS_TO_LVSEG | A_CLING_TO_LVSEG | A_CLING_TO_ALLOCED)) {
1507 /* Contiguous? */
1508 if (((alloc_parms->flags & A_CONTIGUOUS_TO_LVSEG) || (ah->maximise_cling && alloc_parms->prev_lvseg)) &&
1509 _check_contiguous(ah->cmd, alloc_parms->prev_lvseg, pva, alloc_state))
1510 return PREFERRED;
1511
1512 /* Try next area on same PV if looking for contiguous space */
1513 if (alloc_parms->flags & A_CONTIGUOUS_TO_LVSEG)
1514 return NEXT_AREA;
1515
1516 /* Cling to prev_lvseg? */
1517 if (((alloc_parms->flags & A_CLING_TO_LVSEG) || (ah->maximise_cling && alloc_parms->prev_lvseg)) &&
1518 _check_cling(ah, NULL, alloc_parms->prev_lvseg, pva, alloc_state))
1519 /* If this PV is suitable, use this first area */
1520 return PREFERRED;
1521
1522 /* Cling_to_alloced? */
1523 if ((alloc_parms->flags & A_CLING_TO_ALLOCED) &&
1524 _check_cling_to_alloced(ah, NULL, pva, alloc_state))
1525 return PREFERRED;
1526
1527 /* Cling_by_tags? */
1528 if (!(alloc_parms->flags & A_CLING_BY_TAGS) || !ah->cling_tag_list_cn)
1529 return NEXT_PV;
1530
1531 if (alloc_parms->prev_lvseg) {
1532 if (_check_cling(ah, ah->cling_tag_list_cn, alloc_parms->prev_lvseg, pva, alloc_state))
1533 return PREFERRED;
1534 } else if (_check_cling_to_alloced(ah, ah->cling_tag_list_cn, pva, alloc_state))
1535 return PREFERRED;
1536
1537 		/* All areas on this PV give the same result, so there is no point checking any more of them */
1538 return NEXT_PV;
1539 }
1540
1541 /* Normal/Anywhere */
1542
1543 /* Is it big enough on its own? */
1544 if (pva->unreserved * ah->area_multiple < still_needed &&
1545 ((!(alloc_parms->flags & A_CAN_SPLIT) && !ah->log_area_count) ||
1546 (already_found_one && alloc_parms->alloc != ALLOC_ANYWHERE)))
1547 return NEXT_PV;
1548
1549 return USE_AREA;
1550 }
1551
1552 /*
1553 * Decide how many extents we're trying to obtain from a given area.
1554 * Removes the extents from further consideration.
1555 */
1556 static uint32_t _calc_required_extents(struct alloc_handle *ah, struct pv_area *pva, unsigned ix_pva, uint32_t max_to_allocate, alloc_policy_t alloc)
1557 {
1558 uint32_t required = max_to_allocate / ah->area_multiple;
1559
1560 /*
1561 * Update amount unreserved - effectively splitting an area
1562 * into two or more parts. If the whole stripe doesn't fit,
1563 * reduce amount we're looking for.
1564 */
1565 if (alloc == ALLOC_ANYWHERE) {
1566 if (ix_pva - 1 >= ah->area_count)
1567 required = ah->log_len;
1568 } else if (required < ah->log_len)
1569 required = ah->log_len;
1570
1571 if (required >= pva->unreserved) {
1572 required = pva->unreserved;
1573 pva->unreserved = 0;
1574 } else {
1575 pva->unreserved -= required;
1576 reinsert_changed_pv_area(pva);
1577 }
1578
1579 return required;
1580 }
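
/*
 * For example (a sketch, ignoring the log_len adjustment): with
 * area_multiple 2 and max_to_allocate 100, each area needs 50 extents;
 * if the chosen pv_area only has 30 extents unreserved, required drops
 * to 30 and the area becomes fully reserved.
 */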
1581
1582 static int _reserve_required_area(struct alloc_handle *ah, uint32_t max_to_allocate,
1583 unsigned ix_pva, struct pv_area *pva,
1584 struct alloc_state *alloc_state, alloc_policy_t alloc)
1585 {
1586 uint32_t required = _calc_required_extents(ah, pva, ix_pva, max_to_allocate, alloc);
1587 uint32_t s;
1588
1589 /* Expand areas array if needed after an area was split. */
1590 if (ix_pva > alloc_state->areas_size) {
1591 alloc_state->areas_size *= 2;
1592 if (!(alloc_state->areas = dm_realloc(alloc_state->areas, sizeof(*alloc_state->areas) * (alloc_state->areas_size)))) {
1593 log_error("Memory reallocation for parallel areas failed.");
1594 return 0;
1595 }
1596 for (s = alloc_state->areas_size / 2; s < alloc_state->areas_size; s++)
1597 alloc_state->areas[s].pva = NULL;
1598 }
1599
1600 _reserve_area(&alloc_state->areas[ix_pva - 1], pva, required, ix_pva, pva->unreserved);
1601
1602 return 1;
1603 }
1604
1605 static void _clear_areas(struct alloc_state *alloc_state)
1606 {
1607 uint32_t s;
1608
1609 for (s = 0; s < alloc_state->areas_size; s++)
1610 alloc_state->areas[s].pva = NULL;
1611 }
1612
1613 static void _reset_unreserved(struct dm_list *pvms)
1614 {
1615 struct pv_map *pvm;
1616 struct pv_area *pva;
1617
1618 dm_list_iterate_items(pvm, pvms)
1619 dm_list_iterate_items(pva, &pvm->areas)
1620 if (pva->unreserved != pva->count) {
1621 pva->unreserved = pva->count;
1622 reinsert_changed_pv_area(pva);
1623 }
1624 }
1625
1626 static void _report_needed_allocation_space(struct alloc_handle *ah,
1627 struct alloc_state *alloc_state)
1628 {
1629 const char *metadata_type;
1630 uint32_t parallel_areas_count, parallel_area_size;
1631 uint32_t metadata_count, metadata_size;
1632
1633 parallel_area_size = (ah->new_extents - alloc_state->allocated) / ah->area_multiple -
1634 ((ah->alloc_and_split_meta) ? ah->log_len : 0);
1635
1636 parallel_areas_count = ah->area_count + ah->parity_count;
1637
1638 metadata_size = ah->log_len;
1639 if (ah->alloc_and_split_meta) {
1640 metadata_type = "RAID metadata area";
1641 metadata_count = parallel_areas_count;
1642 } else {
1643 metadata_type = "mirror log";
1644 metadata_count = alloc_state->log_area_count_still_needed;
1645 }
1646
1647 log_debug("Still need %" PRIu32 " total extents:",
1648 parallel_area_size * parallel_areas_count + metadata_size * metadata_count);
1649 log_debug(" %" PRIu32 " (%" PRIu32 " data/%" PRIu32
1650 " parity) parallel areas of %" PRIu32 " extents each",
1651 parallel_areas_count, ah->area_count, ah->parity_count, parallel_area_size);
1652 log_debug(" %" PRIu32 " %ss of %" PRIu32 " extents each",
1653 metadata_count, metadata_type, metadata_size);
1654 }
1655 /*
1656 * Returns 1 regardless of whether any space was found, except on error.
1657 */
1658 static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc_parms *alloc_parms,
1659 struct dm_list *pvms, struct alloc_state *alloc_state,
1660 struct dm_list *parallel_pvs, uint32_t max_to_allocate)
1661 {
1662 unsigned ix = 0;
1663 unsigned last_ix;
1664 struct pv_map *pvm;
1665 struct pv_area *pva;
1666 unsigned preferred_count = 0;
1667 unsigned already_found_one;
1668 unsigned ix_offset = 0; /* Offset for non-preferred allocations */
1669 unsigned ix_log_offset; /* Offset to start of areas to use for log */
1670 unsigned too_small_for_log_count; /* How many too small for log? */
1671 unsigned iteration_count = 0; /* cling_to_alloced may need 2 iterations */
1672 unsigned log_iteration_count = 0; /* extra iteration for logs on data devices */
1673 struct alloced_area *aa;
1674 uint32_t s;
1675 uint32_t devices_needed = ah->area_count + ah->parity_count;
1676
1677 /* ix_offset holds the number of parallel allocations that must be contiguous/cling */
1678 /* At most one of A_CONTIGUOUS_TO_LVSEG, A_CLING_TO_LVSEG or A_CLING_TO_ALLOCED may be set */
1679 if (alloc_parms->flags & (A_CONTIGUOUS_TO_LVSEG | A_CLING_TO_LVSEG))
1680 ix_offset = _stripes_per_mimage(alloc_parms->prev_lvseg) * alloc_parms->prev_lvseg->area_count;
1681
1682 if (alloc_parms->flags & A_CLING_TO_ALLOCED)
1683 ix_offset = ah->area_count;
1684
1685 if (alloc_parms->alloc == ALLOC_NORMAL || (alloc_parms->flags & A_CLING_TO_ALLOCED))
1686 log_debug("Cling_to_allocated is %sset",
1687 alloc_parms->flags & A_CLING_TO_ALLOCED ? "" : "not ");
1688
1689 _clear_areas(alloc_state);
1690 _reset_unreserved(pvms);
1691
1692 _report_needed_allocation_space(ah, alloc_state);
1693
1694 /* ix holds the number of areas found on other PVs */
1695 do {
1696 if (log_iteration_count) {
1697 log_debug("Found %u areas for %" PRIu32 " parallel areas and %" PRIu32 " log areas so far.", ix, devices_needed, alloc_state->log_area_count_still_needed);
1698 } else if (iteration_count)
1699 log_debug("Filled %u out of %u preferred areas so far.", preferred_count, ix_offset);
1700
1701 /*
1702 * Provide for escape from the loop if no progress is made.
1703 * This should not happen: ALLOC_ANYWHERE should be able to use
1704 * all available space. (If there aren't enough extents, the code
1705 * should not reach this point.)
1706 */
1707 last_ix = ix;
1708
1709 /*
1710 * Put the smallest area of each PV that is at least the
1711 * size we need into areas array. If there isn't one
1712 * that fits completely and we're allowed more than one
1713 * LV segment, then take the largest remaining instead.
1714 */
1715 dm_list_iterate_items(pvm, pvms) {
1716 /* PV-level checks */
1717 if (dm_list_empty(&pvm->areas))
1718 continue; /* Next PV */
1719
1720 if (alloc_parms->alloc != ALLOC_ANYWHERE) {
1721 /* Don't allocate onto the log PVs */
1722 if (ah->log_area_count)
1723 dm_list_iterate_items(aa, &ah->alloced_areas[ah->area_count])
1724 for (s = 0; s < ah->log_area_count; s++)
1725 if (!aa[s].pv)
1726 goto next_pv;
1727
1728 /* FIXME Split into log and non-log parallel_pvs and only check the log ones if log_iteration? */
1729 				/* (I've temporarily disabled the check.) */
1730 /* Avoid PVs used by existing parallel areas */
1731 if (!log_iteration_count && parallel_pvs && _pv_is_parallel(pvm->pv, parallel_pvs))
1732 goto next_pv;
1733
1734 /*
1735 * Avoid PVs already set aside for log.
1736 * We only reach here if there were enough PVs for the main areas but
1737 * not enough for the logs.
1738 */
1739 if (log_iteration_count) {
1740 for (s = devices_needed; s < ix + ix_offset; s++)
1741 if (alloc_state->areas[s].pva && alloc_state->areas[s].pva->map->pv == pvm->pv)
1742 goto next_pv;
1743 /* On a second pass, avoid PVs already used in an uncommitted area */
1744 } else if (iteration_count)
1745 for (s = 0; s < devices_needed; s++)
1746 if (alloc_state->areas[s].pva && alloc_state->areas[s].pva->map->pv == pvm->pv)
1747 goto next_pv;
1748 }
1749
1750 already_found_one = 0;
1751 /* First area in each list is the largest */
1752 dm_list_iterate_items(pva, &pvm->areas) {
1753 /*
1754 * There are two types of allocations, which can't be mixed at present.
1755 * PREFERRED are stored immediately in a specific parallel slot.
1756 * USE_AREA are stored for later, then sorted and chosen from.
1757 */
1758 switch(_check_pva(ah, pva, max_to_allocate, alloc_parms,
1759 alloc_state, already_found_one, iteration_count, log_iteration_count)) {
1760
1761 case PREFERRED:
1762 preferred_count++;
1763 /* Fall through */
1764
1765 case NEXT_PV:
1766 goto next_pv;
1767
1768 case NEXT_AREA:
1769 continue;
1770
1771 case USE_AREA:
1772 /*
1773 * Except with ALLOC_ANYWHERE, replace first area with this
1774 * one which is smaller but still big enough.
1775 */
1776 if (!already_found_one ||
1777 alloc_parms->alloc == ALLOC_ANYWHERE) {
1778 ix++;
1779 already_found_one = 1;
1780 }
1781
1782 /* Reserve required amount of pva */
1783 if (!_reserve_required_area(ah, max_to_allocate, ix + ix_offset,
1784 pva, alloc_state, alloc_parms->alloc))
1785 return_0;
1786 }
1787
1788 }
1789
1790 next_pv:
1791 /* With ALLOC_ANYWHERE we ignore further PVs once we have at least enough areas */
1792 /* With cling and contiguous we stop if we found a match for *all* the areas */
1793 /* FIXME Rename these variables! */
1794 if ((alloc_parms->alloc == ALLOC_ANYWHERE &&
1795 ix + ix_offset >= devices_needed + alloc_state->log_area_count_still_needed) ||
1796 (preferred_count == ix_offset &&
1797 (ix_offset == devices_needed + alloc_state->log_area_count_still_needed)))
1798 break;
1799 }
1800 } while ((alloc_parms->alloc == ALLOC_ANYWHERE && last_ix != ix && ix < devices_needed + alloc_state->log_area_count_still_needed) ||
1801 /* With cling_to_alloced and normal, if there were gaps in the preferred areas, have a second iteration */
1802 (alloc_parms->alloc == ALLOC_NORMAL && preferred_count &&
1803 (preferred_count < ix_offset || alloc_state->log_area_count_still_needed) &&
1804 (alloc_parms->flags & A_CLING_TO_ALLOCED) && !iteration_count++) ||
1805 /* Extra iteration needed to fill log areas on PVs already used? */
1806 (alloc_parms->alloc == ALLOC_NORMAL && preferred_count == ix_offset && !ah->mirror_logs_separate &&
1807 (ix + preferred_count >= devices_needed) &&
1808 (ix + preferred_count < devices_needed + alloc_state->log_area_count_still_needed) && !log_iteration_count++));
1809
1810 if (preferred_count < ix_offset && !(alloc_parms->flags & A_CLING_TO_ALLOCED))
1811 return 1;
1812
1813 if (ix + preferred_count < devices_needed + alloc_state->log_area_count_still_needed)
1814 return 1;
1815
1816 /* Sort the areas so we allocate from the biggest */
1817 if (log_iteration_count) {
1818 if (ix > devices_needed + 1) {
1819 log_debug("Sorting %u log areas", ix - devices_needed);
1820 qsort(alloc_state->areas + devices_needed, ix - devices_needed, sizeof(*alloc_state->areas),
1821 _comp_area);
1822 }
1823 } else if (ix > 1) {
1824 log_debug("Sorting %u areas", ix);
1825 qsort(alloc_state->areas + ix_offset, ix, sizeof(*alloc_state->areas),
1826 _comp_area);
1827 }
1828
1829 /* If there are gaps in our preferred areas, fill them from the sorted part of the array */
1830 if (preferred_count && preferred_count != ix_offset) {
1831 for (s = 0; s < devices_needed; s++)
1832 if (!alloc_state->areas[s].pva) {
1833 alloc_state->areas[s].pva = alloc_state->areas[ix_offset].pva;
1834 alloc_state->areas[s].used = alloc_state->areas[ix_offset].used;
1835 alloc_state->areas[ix_offset++].pva = NULL;
1836 }
1837 }
1838
1839 /*
1840 * First time around, if there's a log, allocate it on the
1841 * smallest device that has space for it.
1842 */
1843 too_small_for_log_count = 0;
1844 ix_log_offset = 0;
1845
1846 /* FIXME This logic is due to its heritage and can be simplified! */
1847 if (alloc_state->log_area_count_still_needed) {
1848 /* How many areas are too small for the log? */
1849 while (too_small_for_log_count < ix_offset + ix &&
1850 (*(alloc_state->areas + ix_offset + ix - 1 -
1851 too_small_for_log_count)).used < ah->log_len)
1852 too_small_for_log_count++;
1853 ix_log_offset = ix_offset + ix - too_small_for_log_count - ah->log_area_count;
1854 }
1855
1856 if (ix + ix_offset < devices_needed +
1857 (alloc_state->log_area_count_still_needed ? alloc_state->log_area_count_still_needed +
1858 too_small_for_log_count : 0))
1859 return 1;
1860
1861 /*
1862 * Finally add the space identified to the list of areas to be used.
1863 */
1864 if (!_alloc_parallel_area(ah, max_to_allocate, alloc_state, ix_log_offset))
1865 return_0;
1866
1867 /*
1868 * The log is always allocated the first time.
1869 */
1870 alloc_state->log_area_count_still_needed = 0;
1871
1872 return 1;
1873 }
1874
1875 /*
1876 * Choose sets of parallel areas to use, respecting any constraints
1877 * supplied in alloc_parms.
1878 */
1879 static int _find_max_parallel_space_for_one_policy(struct alloc_handle *ah, struct alloc_parms *alloc_parms,
1880 struct dm_list *pvms, struct alloc_state *alloc_state)
1881 {
1882 uint32_t max_tmp;
1883 uint32_t max_to_allocate; /* Maximum extents to allocate this time */
1884 uint32_t old_allocated;
1885 uint32_t next_le;
1886 struct seg_pvs *spvs;
1887 struct dm_list *parallel_pvs;
1888
1889 /* FIXME This algorithm needs a lot of cleaning up! */
1890 /* FIXME anywhere doesn't find all space yet */
1891 do {
1892 parallel_pvs = NULL;
1893 max_to_allocate = alloc_parms->extents_still_needed - alloc_state->allocated;
1894
1895 /*
1896 * If there are existing parallel PVs, avoid them and reduce
1897 * the maximum we can allocate in one go accordingly.
1898 */
1899 if (ah->parallel_areas) {
1900 next_le = (alloc_parms->prev_lvseg ? alloc_parms->prev_lvseg->le + alloc_parms->prev_lvseg->len : 0) + alloc_state->allocated / ah->area_multiple;
1901 dm_list_iterate_items(spvs, ah->parallel_areas) {
1902 if (next_le >= spvs->le + spvs->len)
1903 continue;
1904
1905 max_tmp = max_to_allocate +
1906 alloc_state->allocated;
1907
1908 /*
1909 * Because a request that groups metadata and
1910 * data together will be split, we must adjust
1911 * the comparison accordingly.
1912 */
1913 if (ah->alloc_and_split_meta)
1914 max_tmp -= ah->log_len;
1915 if (max_tmp > (spvs->le + spvs->len) * ah->area_multiple) {
1916 max_to_allocate = (spvs->le + spvs->len) * ah->area_multiple - alloc_state->allocated;
1917 max_to_allocate += ah->alloc_and_split_meta ? ah->log_len : 0;
1918 }
1919 parallel_pvs = &spvs->pvs;
1920 break;
1921 }
1922 }
1923
1924 old_allocated = alloc_state->allocated;
1925
1926 if (!_find_some_parallel_space(ah, alloc_parms, pvms, alloc_state, parallel_pvs, max_to_allocate))
1927 return_0;
1928
1929 /*
1930 * If we didn't allocate anything this time with ALLOC_NORMAL and had
1931 * A_CLING_TO_ALLOCED set, try again without it.
1932 *
1933 * For ALLOC_NORMAL, if we did allocate something without the
1934 * flag set, set it and continue so that further allocations
1935 * remain on the same disks where possible.
1936 */
1937 if (old_allocated == alloc_state->allocated) {
1938 if ((alloc_parms->alloc == ALLOC_NORMAL) && (alloc_parms->flags & A_CLING_TO_ALLOCED))
1939 alloc_parms->flags &= ~A_CLING_TO_ALLOCED;
1940 else
1941 break; /* Give up */
1942 } else if (ah->maximise_cling && alloc_parms->alloc == ALLOC_NORMAL &&
1943 !(alloc_parms->flags & A_CLING_TO_ALLOCED))
1944 alloc_parms->flags |= A_CLING_TO_ALLOCED;
1945 } while ((alloc_parms->alloc != ALLOC_CONTIGUOUS) && alloc_state->allocated != alloc_parms->extents_still_needed && (alloc_parms->flags & A_CAN_SPLIT));
1946
1947 return 1;
1948 }
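/*
 * Illustrative sketch (not part of the original file): the ALLOC_NORMAL
 * retry pattern described above, isolated from the allocator internals.
 * If a pass makes no progress while A_CLING_TO_ALLOCED is set, the flag
 * is dropped and the pass repeated; if a pass does make progress without
 * the flag, it is set so later extents stay on the same disks (when
 * maximise_cling is enabled).  _try_one_pass() is a hypothetical stand-in
 * for _find_some_parallel_space(); the block is excluded from the build.
 */
#if 0
static void _cling_retry_sketch(struct alloc_handle *ah,
				struct alloc_parms *alloc_parms,
				struct alloc_state *alloc_state)
{
	uint32_t old_allocated;

	do {
		old_allocated = alloc_state->allocated;

		if (!_try_one_pass(ah, alloc_parms, alloc_state))	/* hypothetical */
			break;

		if (old_allocated == alloc_state->allocated) {
			if (!(alloc_parms->flags & A_CLING_TO_ALLOCED))
				break;	/* No progress and nothing left to relax */
			alloc_parms->flags &= ~A_CLING_TO_ALLOCED;
		} else if (!(alloc_parms->flags & A_CLING_TO_ALLOCED))
			alloc_parms->flags |= A_CLING_TO_ALLOCED;
	} while (alloc_state->allocated < alloc_parms->extents_still_needed);
}
#endif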
1949
1950 /*
1951 * Allocate several segments, each the same size, in parallel.
1952 * If mirrored_pv and mirrored_pe are supplied, they are used as
1953 * the first area, and additional areas are allocated parallel to it.
1954 */
1955 static int _allocate(struct alloc_handle *ah,
1956 struct volume_group *vg,
1957 struct logical_volume *lv,
1958 unsigned can_split,
1959 struct dm_list *allocatable_pvs)
1960 {
1961 uint32_t old_allocated;
1962 struct lv_segment *prev_lvseg = NULL;
1963 int r = 0;
1964 struct dm_list *pvms;
1965 alloc_policy_t alloc;
1966 struct alloc_parms alloc_parms;
1967 struct alloc_state alloc_state;
1968
1969 alloc_state.allocated = lv ? lv->le_count : 0;
1970
1971 if (alloc_state.allocated >= ah->new_extents && !ah->log_area_count) {
1972 log_error("_allocate called with no work to do!");
1973 return 1;
1974 }
1975
1976 if (ah->area_multiple > 1 &&
1977 (ah->new_extents - alloc_state.allocated) % ah->area_multiple) {
1978 log_error("Number of extents requested (%d) needs to be divisible by %d.",
1979 ah->new_extents - alloc_state.allocated,
1980 ah->area_multiple);
1981 return 0;
1982 }
1983
1984 alloc_state.log_area_count_still_needed = ah->log_area_count;
1985
1986 if (ah->alloc == ALLOC_CONTIGUOUS)
1987 can_split = 0;
1988
1989 if (lv && !dm_list_empty(&lv->segments))
1990 prev_lvseg = dm_list_item(dm_list_last(&lv->segments),
1991 struct lv_segment);
1992 /*
1993 * Build the sets of available areas on the pv's.
1994 */
1995 if (!(pvms = create_pv_maps(ah->mem, vg, allocatable_pvs)))
1996 return_0;
1997
1998 if (!_log_parallel_areas(ah->mem, ah->parallel_areas))
1999 stack;
2000
2001 alloc_state.areas_size = dm_list_size(pvms);
2002 if (alloc_state.areas_size &&
2003 alloc_state.areas_size < (ah->area_count + ah->parity_count + ah->log_area_count)) {
2004 if (ah->alloc != ALLOC_ANYWHERE && ah->mirror_logs_separate) {
2005 log_error("Not enough PVs with free space available "
2006 "for parallel allocation.");
2007 log_error("Consider --alloc anywhere if desperate.");
2008 return 0;
2009 }
2010 alloc_state.areas_size = ah->area_count + ah->parity_count + ah->log_area_count;
2011 }
2012
2013 /* Upper bound if none of the PVs in prev_lvseg is in pvms */
2014 /* FIXME Work size out properly */
2015 if (prev_lvseg)
2016 alloc_state.areas_size += _stripes_per_mimage(prev_lvseg) * prev_lvseg->area_count;
2017
2018 /* Allocate an array of pv_areas to hold the largest space on each PV */
2019 if (!(alloc_state.areas = dm_malloc(sizeof(*alloc_state.areas) * alloc_state.areas_size))) {
2020 log_error("Couldn't allocate areas array.");
2021 return 0;
2022 }
2023
2024 /*
2025 * cling includes implicit cling_by_tags
2026 * but it does nothing unless the lvm.conf setting is present.
2027 */
2028 if (ah->alloc == ALLOC_CLING)
2029 ah->alloc = ALLOC_CLING_BY_TAGS;
2030
2031 /* Attempt each defined allocation policy in turn */
2032 for (alloc = ALLOC_CONTIGUOUS; alloc <= ah->alloc; alloc++) {
2033 /* Skip cling_by_tags if no list defined */
2034 if (alloc == ALLOC_CLING_BY_TAGS && !ah->cling_tag_list_cn)
2035 continue;
2036 old_allocated = alloc_state.allocated;
2037 log_debug("Trying allocation using %s policy.", get_alloc_string(alloc));
2038
2039 if (!_sufficient_pes_free(ah, pvms, alloc_state.allocated, ah->new_extents))
2040 goto_out;
2041
2042 _init_alloc_parms(ah, &alloc_parms, alloc, prev_lvseg,
2043 can_split, alloc_state.allocated,
2044 ah->new_extents);
2045
2046 if (!_find_max_parallel_space_for_one_policy(ah, &alloc_parms, pvms, &alloc_state))
2047 goto_out;
2048
2049 if ((alloc_state.allocated == ah->new_extents && !alloc_state.log_area_count_still_needed) ||
2050 (!can_split && (alloc_state.allocated != old_allocated)))
2051 break;
2052 }
2053
2054 if (alloc_state.allocated != ah->new_extents) {
2055 log_error("Insufficient suitable %sallocatable extents "
2056 "for logical volume %s: %u more required",
2057 can_split ? "" : "contiguous ",
2058 lv ? lv->name : "",
2059 (ah->new_extents - alloc_state.allocated) * ah->area_count
2060 / ah->area_multiple);
2061 goto out;
2062 }
2063
2064 if (alloc_state.log_area_count_still_needed) {
2065 log_error("Insufficient free space for log allocation "
2066 "for logical volume %s.",
2067 lv ? lv->name : "");
2068 goto out;
2069 }
2070
2071 r = 1;
2072
2073 out:
2074 dm_free(alloc_state.areas);
2075 return r;
2076 }
2077
2078 int lv_add_virtual_segment(struct logical_volume *lv, uint64_t status,
2079 uint32_t extents, const struct segment_type *segtype,
2080 const char *thin_pool_name)
2081 {
2082 struct lv_segment *seg;
2083 struct logical_volume *thin_pool_lv = NULL;
2084 struct lv_list *lvl;
2085 uint32_t size;
2086
2087 if (thin_pool_name) {
2088 if (!(lvl = find_lv_in_vg(lv->vg, thin_pool_name))) {
2089 log_error("Unable to find existing pool LV %s in VG %s.",
2090 thin_pool_name, lv->vg->name);
2091 return 0;
2092 }
2093 thin_pool_lv = lvl->lv;
2094 size = first_seg(thin_pool_lv)->chunk_size;
2095 if (lv->vg->extent_size < size) {
2096 /* Align extents on chunk boundary size */
2097 size = ((uint64_t)lv->vg->extent_size * extents + size - 1) /
2098 size * size / lv->vg->extent_size;
2099 if (size != extents) {
2100 log_print("Rounding size (%d extents) up to chunk boundary "
2101 "size (%d extents).", extents, size);
2102 extents = size;
2103 }
2104 }
2105 }
2106
2107 if (!dm_list_empty(&lv->segments) &&
2108 (seg = last_seg(lv)) && (seg->segtype == segtype)) {
2109 seg->area_len += extents;
2110 seg->len += extents;
2111 } else {
2112 if (!(seg = alloc_lv_segment(segtype, lv, lv->le_count, extents,
2113 status, 0, NULL, thin_pool_lv, 0,
2114 extents, 0, 0, 0, NULL))) {
2115 log_error("Couldn't allocate new zero segment.");
2116 return 0;
2117 }
2118 lv->status |= VIRTUAL;
2119 dm_list_add(&lv->segments, &seg->list);
2120 }
2121
2122 lv->le_count += extents;
2123 lv->size += (uint64_t) extents * lv->vg->extent_size;
2124
2125 return 1;
2126 }
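/*
 * Worked example (illustrative, not part of the original file) of the
 * chunk-boundary rounding above: with an extent size of 128 sectors
 * (64KiB) and a thin pool chunk size of 512 sectors (256KiB), a request
 * for 10 extents (1280 sectors) is rounded up to a whole number of
 * chunks:
 *
 *   (128 * 10 + 512 - 1) / 512 = 3 chunks
 *   3 * 512 / 128              = 12 extents
 *
 * so 'extents' becomes 12 and "Rounding size (10 extents) up to chunk
 * boundary size (12 extents)." is printed.
 */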
2127
2128 /*
2129 * Entry point for all extent allocations.
2130 */
2131 struct alloc_handle *allocate_extents(struct volume_group *vg,
2132 struct logical_volume *lv,
2133 const struct segment_type *segtype,
2134 uint32_t stripes,
2135 uint32_t mirrors, uint32_t log_count,
2136 uint32_t region_size, uint32_t extents,
2137 struct dm_list *allocatable_pvs,
2138 alloc_policy_t alloc,
2139 struct dm_list *parallel_areas)
2140 {
2141 struct alloc_handle *ah;
2142 uint32_t new_extents;
2143
2144 if (segtype_is_virtual(segtype)) {
2145 log_error("allocate_extents does not handle virtual segments");
2146 return NULL;
2147 }
2148
2149 if (!allocatable_pvs) {
2150 log_error(INTERNAL_ERROR "Missing allocatable pvs.");
2151 return NULL;
2152 }
2153
2154 if (vg->fid->fmt->ops->segtype_supported &&
2155 !vg->fid->fmt->ops->segtype_supported(vg->fid, segtype)) {
2156 log_error("Metadata format (%s) does not support required "
2157 "LV segment type (%s).", vg->fid->fmt->name,
2158 segtype->name);
2159 log_error("Consider changing the metadata format by running "
2160 "vgconvert.");
2161 return NULL;
2162 }
2163
2164 if (alloc >= ALLOC_INHERIT)
2165 alloc = vg->alloc;
2166
2167 new_extents = (lv ? lv->le_count : 0) + extents;
2168 if (!(ah = _alloc_init(vg->cmd, vg->cmd->mem, segtype, alloc,
2169 new_extents, mirrors, stripes, log_count,
2170 vg->extent_size, region_size,
2171 parallel_areas)))
2172 return_NULL;
2173
2174 if (!_allocate(ah, vg, lv, 1, allocatable_pvs)) {
2175 alloc_destroy(ah);
2176 return_NULL;
2177 }
2178
2179 return ah;
2180 }
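/*
 * Illustrative sketch (not part of the original file): the typical
 * allocate / add-segment / destroy sequence behind a simple striped
 * extension.  lv_extend() below is the real single-step wrapper; this
 * only shows the shape of the calls.  Passing ALLOC_INHERIT makes
 * allocate_extents() fall back to the VG's allocation policy.  Excluded
 * from the build.
 */
#if 0
static int _extend_striped_sketch(struct logical_volume *lv,
				  const struct segment_type *segtype,
				  uint32_t stripes, uint32_t stripe_size,
				  uint32_t extents,
				  struct dm_list *allocatable_pvs)
{
	struct alloc_handle *ah;
	int r = 1;

	if (!(ah = allocate_extents(lv->vg, lv, segtype, stripes,
				    1 /* mirrors */, 0 /* log_count */,
				    0 /* region_size */, extents,
				    allocatable_pvs, ALLOC_INHERIT, NULL)))
		return_0;

	if (!lv_add_segment(ah, 0, ah->area_count, lv, segtype,
			    stripe_size, 0u, 0))
		r = 0;

	alloc_destroy(ah);

	return r;
}
#endif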
2181
2182 /*
2183 * Add new segments to an LV from supplied list of areas.
2184 */
2185 int lv_add_segment(struct alloc_handle *ah,
2186 uint32_t first_area, uint32_t num_areas,
2187 struct logical_volume *lv,
2188 const struct segment_type *segtype,
2189 uint32_t stripe_size,
2190 uint64_t status,
2191 uint32_t region_size)
2192 {
2193 if (!segtype) {
2194 log_error("Missing segtype in lv_add_segment().");
2195 return 0;
2196 }
2197
2198 if (segtype_is_virtual(segtype)) {
2199 log_error("lv_add_segment cannot handle virtual segments");
2200 return 0;
2201 }
2202
2203 if ((status & MIRROR_LOG) && dm_list_size(&lv->segments)) {
2204 log_error("Log segments can only be added to an empty LV");
2205 return 0;
2206 }
2207
2208 if (!_setup_alloced_segments(lv, &ah->alloced_areas[first_area],
2209 num_areas, status,
2210 stripe_size, segtype,
2211 region_size))
2212 return_0;
2213
2214 if ((segtype->flags & SEG_CAN_SPLIT) && !lv_merge_segments(lv)) {
2215 log_error("Couldn't merge segments after extending "
2216 "logical volume.");
2217 return 0;
2218 }
2219
2220 if (lv->vg->fid->fmt->ops->lv_setup &&
2221 !lv->vg->fid->fmt->ops->lv_setup(lv->vg->fid, lv))
2222 return_0;
2223
2224 return 1;
2225 }
2226
2227 /*
2228 * "mirror" segment type doesn't support split.
2229 * So, when adding mirrors to linear LV segment, first split it,
2230 * then convert it to "mirror" and add areas.
2231 */
2232 static struct lv_segment *_convert_seg_to_mirror(struct lv_segment *seg,
2233 uint32_t region_size,
2234 struct logical_volume *log_lv)
2235 {
2236 struct lv_segment *newseg;
2237 uint32_t s;
2238
2239 if (!seg_is_striped(seg)) {
2240 log_error("Can't convert non-striped segment to mirrored.");
2241 return NULL;
2242 }
2243
2244 if (seg->area_count > 1) {
2245 log_error("Can't convert striped segment with multiple areas "
2246 "to mirrored.");
2247 return NULL;
2248 }
2249
2250 if (!(newseg = alloc_lv_segment(get_segtype_from_string(seg->lv->vg->cmd, "mirror"),
2251 seg->lv, seg->le, seg->len,
2252 seg->status, seg->stripe_size,
2253 log_lv, NULL,
2254 seg->area_count, seg->area_len,
2255 seg->chunk_size, region_size,
2256 seg->extents_copied, NULL))) {
2257 log_error("Couldn't allocate converted LV segment");
2258 return NULL;
2259 }
2260
2261 for (s = 0; s < seg->area_count; s++)
2262 if (!move_lv_segment_area(newseg, s, seg, s))
2263 return_NULL;
2264
2265 seg->pvmove_source_seg = NULL; /* Not maintained after allocation */
2266
2267 dm_list_add(&seg->list, &newseg->list);
2268 dm_list_del(&seg->list);
2269
2270 return newseg;
2271 }
2272
2273 /*
2274 * Add new areas to mirrored segments
2275 */
2276 int lv_add_mirror_areas(struct alloc_handle *ah,
2277 struct logical_volume *lv, uint32_t le,
2278 uint32_t region_size)
2279 {
2280 struct alloced_area *aa;
2281 struct lv_segment *seg;
2282 uint32_t current_le = le;
2283 uint32_t s, old_area_count, new_area_count;
2284
2285 dm_list_iterate_items(aa, &ah->alloced_areas[0]) {
2286 if (!(seg = find_seg_by_le(lv, current_le))) {
2287 log_error("Failed to find segment for %s extent %"
2288 PRIu32, lv->name, current_le);
2289 return 0;
2290 }
2291
2292 /* Allocator assures aa[0].len <= seg->area_len */
2293 if (aa[0].len < seg->area_len) {
2294 if (!lv_split_segment(lv, seg->le + aa[0].len)) {
2295 log_error("Failed to split segment at %s "
2296 "extent %" PRIu32, lv->name, le);
2297 return 0;
2298 }
2299 }
2300
2301 if (!seg_is_mirrored(seg) &&
2302 (!(seg = _convert_seg_to_mirror(seg, region_size, NULL))))
2303 return_0;
2304
2305 old_area_count = seg->area_count;
2306 new_area_count = old_area_count + ah->area_count;
2307
2308 if (!_lv_segment_add_areas(lv, seg, new_area_count))
2309 return_0;
2310
2311 for (s = 0; s < ah->area_count; s++) {
2312 if (!set_lv_segment_area_pv(seg, s + old_area_count,
2313 aa[s].pv, aa[s].pe))
2314 return_0;
2315 }
2316
2317 current_le += seg->area_len;
2318 }
2319
2320 lv->status |= MIRRORED;
2321
2322 if (lv->vg->fid->fmt->ops->lv_setup &&
2323 !lv->vg->fid->fmt->ops->lv_setup(lv->vg->fid, lv))
2324 return_0;
2325
2326 return 1;
2327 }
2328
2329 /*
2330 * Add mirror image LVs to mirrored segments
2331 */
2332 int lv_add_mirror_lvs(struct logical_volume *lv,
2333 struct logical_volume **sub_lvs,
2334 uint32_t num_extra_areas,
2335 uint64_t status, uint32_t region_size)
2336 {
2337 struct lv_segment *seg;
2338 uint32_t old_area_count, new_area_count;
2339 uint32_t m;
2340 struct segment_type *mirror_segtype;
2341
2342 seg = first_seg(lv);
2343
2344 if (dm_list_size(&lv->segments) != 1 || seg_type(seg, 0) != AREA_LV) {
2345 log_error("Mirror layer must be inserted before adding mirrors");
2346 return 0;
2347 }
2348
2349 mirror_segtype = get_segtype_from_string(lv->vg->cmd, "mirror");
2350 if (seg->segtype != mirror_segtype)
2351 if (!(seg = _convert_seg_to_mirror(seg, region_size, NULL)))
2352 return_0;
2353
2354 if (region_size && region_size != seg->region_size) {
2355 log_error("Conflicting region_size");
2356 return 0;
2357 }
2358
2359 old_area_count = seg->area_count;
2360 new_area_count = old_area_count + num_extra_areas;
2361
2362 if (!_lv_segment_add_areas(lv, seg, new_area_count)) {
2363 log_error("Failed to allocate widened LV segment for %s.",
2364 lv->name);
2365 return 0;
2366 }
2367
2368 for (m = 0; m < old_area_count; m++)
2369 seg_lv(seg, m)->status |= status;
2370
2371 for (m = old_area_count; m < new_area_count; m++) {
2372 if (!set_lv_segment_area_lv(seg, m, sub_lvs[m - old_area_count],
2373 0, status))
2374 return_0;
2375 lv_set_hidden(sub_lvs[m - old_area_count]);
2376 }
2377
2378 lv->status |= MIRRORED;
2379
2380 return 1;
2381 }
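/*
 * Illustrative sketch (not part of the original file): attaching two
 * already-prepared mirror image LVs to an LV whose mirror layer has been
 * inserted.  The images are assumed to have been created and extended
 * elsewhere (as the mirror code does).  Excluded from the build.
 */
#if 0
static int _attach_mimages_sketch(struct logical_volume *lv,
				  struct logical_volume *mimage0,
				  struct logical_volume *mimage1,
				  uint32_t region_size)
{
	struct logical_volume *sub_lvs[2];

	sub_lvs[0] = mimage0;
	sub_lvs[1] = mimage1;

	return lv_add_mirror_lvs(lv, sub_lvs, 2, MIRROR_IMAGE, region_size);
}
#endif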
2382
2383 /*
2384 * Turn an empty LV into a mirror log.
2385 *
2386 * FIXME: Mirrored logs are built inefficiently.
2387 * A mirrored log currently uses the same layout that a mirror
2388 * LV uses. The mirror layer sits on top of AREA_LVs which form the
2389 * legs, rather than on AREA_PVs. This is done to allow re-use of the
2390 * various mirror functions to also handle the mirrored LV that makes
2391 * up the log.
2392 *
2393 * If we used AREA_PVs under the mirror layer of a log, we could
2394 * assemble it all at once by calling 'lv_add_segment' with the
2395 * appropriate segtype (mirror/stripe), like this:
2396 * lv_add_segment(ah, ah->area_count, ah->log_area_count,
2397 * log_lv, segtype, 0, MIRROR_LOG, 0);
2398 *
2399 * For now, we use the same mechanism to build a mirrored log as we
2400 * do for building a mirrored LV: 1) create initial LV, 2) add a
2401 * mirror layer, and 3) add the remaining copy LVs
2402 */
2403 int lv_add_log_segment(struct alloc_handle *ah, uint32_t first_area,
2404 struct logical_volume *log_lv, uint64_t status)
2405 {
2406
2407 return lv_add_segment(ah, ah->area_count + first_area, 1, log_lv,
2408 get_segtype_from_string(log_lv->vg->cmd,
2409 "striped"),
2410 0, status, 0);
2411 }
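/*
 * Illustrative sketch (not part of the original file): giving a freshly
 * created, still-empty log LV its initial segment from space set aside
 * by allocate_extents() with log_count > 0.  A real caller (the mirror
 * code) then stacks a mirror layer on top when a mirrored log is wanted,
 * as described in the comment above.  Flags and the name are only
 * examples.  Excluded from the build.
 */
#if 0
static struct logical_volume *_make_log_sketch(struct alloc_handle *ah,
						struct volume_group *vg,
						const char *log_name)
{
	struct logical_volume *log_lv;

	if (!(log_lv = lv_create_empty(log_name, NULL, LVM_READ | LVM_WRITE,
				       ALLOC_INHERIT, vg)))
		return_NULL;

	if (!lv_add_log_segment(ah, 0, log_lv, MIRROR_LOG))
		return_NULL;

	return log_lv;
}
#endif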
2412
2413 static int _lv_insert_empty_sublvs(struct logical_volume *lv,
2414 const struct segment_type *segtype,
2415 uint32_t stripe_size, uint32_t region_size,
2416 uint32_t devices)
2417 {
2418 struct logical_volume *sub_lv;
2419 uint32_t i;
2420 uint64_t sub_lv_status = 0;
2421 const char *layer_name;
2422 size_t len = strlen(lv->name) + 32;
2423 char img_name[len];
2424 struct lv_segment *mapseg;
2425
2426 if (lv->le_count || !dm_list_empty(&lv->segments)) {
2427 log_error(INTERNAL_ERROR
2428 "Non-empty LV passed to _lv_insert_empty_sublv");
2429 return 0;
2430 }
2431
2432 if (segtype_is_raid(segtype)) {
2433 lv->status |= RAID;
2434 sub_lv_status = RAID_IMAGE;
2435 layer_name = "rimage";
2436 } else if (segtype_is_mirrored(segtype)) {
2437 lv->status |= MIRRORED;
2438 sub_lv_status = MIRROR_IMAGE;
2439 layer_name = "mimage";
2440 } else
2441 return_0;
2442
2443 /*
2444 * First, create our top-level segment for our top-level LV
2445 */
2446 if (!(mapseg = alloc_lv_segment(segtype, lv, 0, 0, lv->status,
2447 stripe_size, NULL, NULL,
2448 devices, 0, 0, region_size, 0, NULL))) {
2449 log_error("Failed to create mapping segment for %s", lv->name);
2450 return 0;
2451 }
2452
2453 /*
2454 * Next, create all of our sub_lv's and link them in.
2455 */
2456 for (i = 0; i < devices; i++) {
2457 /* Data LVs */
2458 if (devices > 1) {
2459 if (dm_snprintf(img_name, len, "%s_%s_%u",
2460 lv->name, layer_name, i) < 0)
2461 return_0;
2462 } else {
2463 if (dm_snprintf(img_name, len, "%s_%s",
2464 lv->name, layer_name) < 0)
2465 return_0;
2466 }
2467
2468 /* FIXME Should use ALLOC_INHERIT here and inherit from parent LV */
2469 if (!(sub_lv = lv_create_empty(img_name, NULL,
2470 LVM_READ | LVM_WRITE,
2471 lv->alloc, lv->vg)))
2472 return_0;
2473
2474 if (!set_lv_segment_area_lv(mapseg, i, sub_lv, 0, sub_lv_status))
2475 return_0;
2476
2477 /* Metadata LVs for raid */
2478 if (segtype_is_raid(segtype)) {
2479 if (dm_snprintf(img_name, len, "%s_rmeta_%u", lv->name, i) < 0)
2480 return_0;
2481 } else
2482 continue;
2483
2484 /* FIXME Should use ALLOC_INHERIT here and inherit from parent LV */
2485 if (!(sub_lv = lv_create_empty(img_name, NULL,
2486 LVM_READ | LVM_WRITE,
2487 lv->alloc, lv->vg)))
2488 return_0;
2489
2490 if (!set_lv_segment_area_lv(mapseg, i, sub_lv, 0, RAID_META))
2491 return_0;
2492 }
2493
2494 dm_list_add(&lv->segments, &mapseg->list);
2495
2496 return 1;
2497 }
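/*
 * Illustrative example (not part of the original file) of the sub LV
 * names the function above produces for a top-level LV called "lv0":
 *
 *   raid, 3 devices:    lv0_rimage_0, lv0_rimage_1, lv0_rimage_2
 *                       lv0_rmeta_0,  lv0_rmeta_1,  lv0_rmeta_2
 *   mirror, 2 devices:  lv0_mimage_0, lv0_mimage_1
 *   single device:      lv0_rimage or lv0_mimage (no index on the image)
 */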
2498
2499 static int _lv_extend_layered_lv(struct alloc_handle *ah,
2500 struct logical_volume *lv,
2501 uint32_t extents, uint32_t first_area,
2502 uint32_t stripes, uint32_t stripe_size)
2503 {
2504 const struct segment_type *segtype;
2505 struct logical_volume *sub_lv, *meta_lv;
2506 struct lv_segment *seg;
2507 uint32_t fa, s;
2508 int clear_metadata = 0;
2509
2510 segtype = get_segtype_from_string(lv->vg->cmd, "striped");
2511
2512 /*
2513 * The component devices of a "striped" LV all go in the same
2514 * LV. However, RAID has an LV for each device - making the
2515 * 'stripes' and 'stripe_size' parameters meaningless.
2516 */
2517 if (seg_is_raid(first_seg(lv))) {
2518 stripes = 1;
2519 stripe_size = 0;
2520 }
2521
2522 seg = first_seg(lv);
2523 for (fa = first_area, s = 0; s < seg->area_count; s++) {
2524 if (is_temporary_mirror_layer(seg_lv(seg, s))) {
2525 if (!_lv_extend_layered_lv(ah, seg_lv(seg, s), extents,
2526 fa, stripes, stripe_size))
2527 return_0;
2528 fa += lv_mirror_count(seg_lv(seg, s));
2529 continue;
2530 }
2531
2532 sub_lv = seg_lv(seg, s);
2533 if (!lv_add_segment(ah, fa, stripes, sub_lv, segtype,
2534 stripe_size, sub_lv->status, 0)) {
2535 log_error("Aborting. Failed to extend %s in %s.",
2536 sub_lv->name, lv->name);
2537 return 0;
2538 }
2539
2540 /* Extend metadata LVs only on initial creation */
2541 if (seg_is_raid(seg) && !lv->le_count) {
2542 if (!seg->meta_areas) {
2543 log_error("No meta_areas for RAID type");
2544 return 0;
2545 }
2546
2547 meta_lv = seg_metalv(seg, s);
2548 if (!lv_add_segment(ah, fa + seg->area_count, 1,
2549 meta_lv, segtype, 0,
2550 meta_lv->status, 0)) {
2551 log_error("Failed to extend %s in %s.",
2552 meta_lv->name, lv->name);
2553 return 0;
2554 }
2555 lv_set_visible(meta_lv);
2556 clear_metadata = 1;
2557 }
2558
2559 fa += stripes;
2560 }
2561
2562 if (clear_metadata) {
2563 /*
2564 * We must clear the metadata areas upon creation.
2565 */
2566 if (!vg_write(lv->vg) || !vg_commit(lv->vg))
2567 return_0;
2568
2569 for (s = 0; s < seg->area_count; s++) {
2570 meta_lv = seg_metalv(seg, s);
2571 if (!activate_lv(meta_lv->vg->cmd, meta_lv)) {
2572 log_error("Failed to activate %s/%s for clearing",
2573 meta_lv->vg->name, meta_lv->name);
2574 return 0;
2575 }
2576
2577 log_verbose("Clearing metadata area of %s/%s",
2578 meta_lv->vg->name, meta_lv->name);
2579 /*
2580 * Rather than wiping the whole of meta_lv, we can simply
2581 * wipe '1' (just the start of the device) to remove the
2582 * superblock of any previous RAID devices. It is much quicker.
2583 */
2584 if (!set_lv(meta_lv->vg->cmd, meta_lv, 1, 0)) {
2585 log_error("Failed to zero %s/%s",
2586 meta_lv->vg->name, meta_lv->name);
2587 return 0;
2588 }
2589
2590 if (!deactivate_lv(meta_lv->vg->cmd, meta_lv)) {
2591 log_error("Failed to deactivate %s/%s",
2592 meta_lv->vg->name, meta_lv->name);
2593 return 0;
2594 }
2595 lv_set_hidden(meta_lv);
2596 }
2597 }
2598
2599 seg->area_len += extents;
2600 seg->len += extents;
2601 lv->le_count += extents;
2602 lv->size += (uint64_t) extents * lv->vg->extent_size;
2603
2604 return 1;
2605 }
2606
2607 /*
2608 * Entry point for single-step LV allocation + extension.
2609 */
2610 int lv_extend(struct logical_volume *lv,
2611 const struct segment_type *segtype,
2612 uint32_t stripes, uint32_t stripe_size,
2613 uint32_t mirrors, uint32_t region_size,
2614 uint32_t extents, const char *thin_pool_name,
2615 struct dm_list *allocatable_pvs, alloc_policy_t alloc)
2616 {
2617 int r = 1;
2618 int log_count = 0;
2619 struct alloc_handle *ah;
2620 uint32_t sub_lv_count;
2621
2622 log_very_verbose("Extending segment type, %s", segtype->name);
2623
2624 if (segtype_is_virtual(segtype))
2625 return lv_add_virtual_segment(lv, 0u, extents, segtype, thin_pool_name);
2626
2627 if (!lv->le_count && segtype_is_thin_pool(segtype)) {
2628 /* Thin pool allocation treats its metadata device like a mirror log. */
2629 /* FIXME Allow pool and data on same device with NORMAL */
2630 /* FIXME Support striped metadata pool */
2631 log_count = 1;
2632 } else if (segtype_is_raid(segtype) && !lv->le_count)
2633 log_count = mirrors * stripes;
2634 /* FIXME log_count should be 1 for mirrors */
2635
2636 if (!(ah = allocate_extents(lv->vg, lv, segtype, stripes, mirrors,
2637 log_count, region_size, extents,
2638 allocatable_pvs, alloc, NULL)))
2639 return_0;
2640
2641 if (segtype_is_thin_pool(segtype)) {
2642 if (!lv->le_count) {
2643 if (!(r = extend_pool(lv, segtype, ah, stripes, stripe_size)))
2644 stack;
2645 } else if (!(r = _lv_extend_layered_lv(ah, lv, extents, 0,
2646 stripes, stripe_size)))
2647 stack;
2648 } else if (!segtype_is_mirrored(segtype) && !segtype_is_raid(segtype)) {
2649 if (!(r = lv_add_segment(ah, 0, ah->area_count, lv, segtype,
2650 stripe_size, 0u, 0)))
2651 stack;
2652 } else {
2653 /*
2654 * For RAID, all the devices are AREA_LV.
2655 * However, for 'mirror on stripe' using non-RAID targets,
2656 * the mirror legs are AREA_LV while the stripes underneath
2657 * are AREA_PV.
2658 */
2659 if (segtype_is_raid(segtype))
2660 sub_lv_count = mirrors * stripes + segtype->parity_devs;
2661 else
2662 sub_lv_count = mirrors;
2663
2664 if (!lv->le_count &&
2665 !(r = _lv_insert_empty_sublvs(lv, segtype, stripe_size,
2666 region_size, sub_lv_count))) {
2667 log_error("Failed to insert layer for %s", lv->name);
2668 goto out;
2669 }
2670
2671 if (!(r = _lv_extend_layered_lv(ah, lv, extents, 0,
2672 stripes, stripe_size)))
2673 goto_out;
2674
2675 /*
2676 * If we are expanding an existing mirror, we can skip the
2677 * resync of the extension if the LV is currently in-sync
2678 * and the LV has the LV_NOTSYNCED flag set.
2679 */
2680 if ((lv->le_count != extents) &&
2681 segtype_is_mirrored(segtype) &&
2682 (lv->status & LV_NOTSYNCED)) {
2683 percent_t sync_percent = PERCENT_INVALID;
2684
2685 if (!lv_is_active(lv)) {
2686 log_print("%s/%s is not active."
2687 " Unable to get sync percent.",
2688 lv->vg->name, lv->name);
2689 if (yes_no_prompt("Do full resync of extended "
2690 "portion of %s/%s? [y/n]: ",
2691 lv->vg->name, lv->name) == 'y')
2692 goto out;
2693 r = 0;
2694 goto out;
2695 }
2696
2697 if (!(r = lv_mirror_percent(lv->vg->cmd, lv, 0,
2698 &sync_percent, NULL))) {
2699 log_error("Failed to get sync percent for %s/%s",
2700 lv->vg->name, lv->name);
2701 goto out;
2702 } else if (sync_percent == PERCENT_100) {
2703 log_verbose("Skipping initial resync for "
2704 "extended portion of %s/%s",
2705 lv->vg->name, lv->name);
2706 init_mirror_in_sync(1);
2707 lv->status |= LV_NOTSYNCED;
2708 } else {
2709 log_error("%s/%s cannot be extended while"
2710 " it is recovering.",
2711 lv->vg->name, lv->name);
2712 r = 0;
2713 goto out;
2714 }
2715 }
2716 }
2717
2718 out:
2719 alloc_destroy(ah);
2720 return r;
2721 }
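/*
 * Illustrative sketch (not part of the original file): extending an
 * existing non-mirrored LV by 10 extents as two stripes with a 128-sector
 * (64KiB) stripe size, allowing allocation from every PV in the VG.  All
 * parameter values are made up for the example.  Excluded from the build.
 */
#if 0
static int _grow_lv_sketch(struct logical_volume *lv)
{
	const struct segment_type *segtype;

	if (!(segtype = get_segtype_from_string(lv->vg->cmd, "striped")))
		return_0;

	return lv_extend(lv, segtype, 2 /* stripes */, 128 /* stripe_size */,
			 0 /* mirrors */, 0 /* region_size */,
			 10 /* extents */, NULL /* thin_pool_name */,
			 &lv->vg->pvs, ALLOC_INHERIT);
}
#endif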
2722
2723 /*
2724 * Minimal LV renaming function.
2725 * Metadata transaction should be made by caller.
2726 * Assumes new_name is allocated from cmd->mem pool.
2727 */
2728 static int _rename_single_lv(struct logical_volume *lv, char *new_name)
2729 {
2730 struct volume_group *vg = lv->vg;
2731
2732 if (find_lv_in_vg(vg, new_name)) {
2733 log_error("Logical volume \"%s\" already exists in "
2734 "volume group \"%s\"", new_name, vg->name);
2735 return 0;
2736 }
2737
2738 if (lv->status & LOCKED) {
2739 log_error("Cannot rename locked LV %s", lv->name);
2740 return 0;
2741 }
2742
2743 lv->name = new_name;
2744
2745 return 1;
2746 }
2747
2748 /*
2749 * Rename sub LV.
2750 * 'lv_name_old' and 'lv_name_new' are old and new names of the main LV.
2751 */
2752 static int _rename_sub_lv(struct cmd_context *cmd,
2753 struct logical_volume *lv,
2754 const char *lv_name_old, const char *lv_name_new)
2755 {
2756 const char *suffix;
2757 char *new_name;
2758 size_t len;
2759
2760 /*
2761 * A sub LV name starts with lv_name_old + '_'.
2762 * The suffix follows lv_name_old and includes '_'.
2763 */
2764 len = strlen(lv_name_old);
2765 if (strncmp(lv->name, lv_name_old, len) || lv->name[len] != '_') {
2766 log_error("Cannot rename \"%s\": name format not recognized "
2767 "for internal LV \"%s\"",
2768 lv_name_old, lv->name);
2769 return 0;
2770 }
2771 suffix = lv->name + len;
2772
2773 /*
2774 * Compose a new name for sub lv:
2775 * e.g. new name is "lvol1_mlog"
2776 * if the sub LV is "lvol0_mlog" and
2777 * a new name for main LV is "lvol1"
2778 */
2779 len = strlen(lv_name_new) + strlen(suffix) + 1;
2780 new_name = dm_pool_alloc(cmd->mem, len);
2781 if (!new_name) {
2782 log_error("Failed to allocate space for new name");
2783 return 0;
2784 }
2785 if (dm_snprintf(new_name, len, "%s%s", lv_name_new, suffix) < 0) {
2786 log_error("Failed to create new name");
2787 return 0;
2788 }
2789
2790 /* Rename it */
2791 return _rename_single_lv(lv, new_name);
2792 }
2793
2794 /* Callback for for_each_sub_lv */
2795 static int _rename_cb(struct cmd_context *cmd, struct logical_volume *lv,
2796 void *data)
2797 {
2798 struct lv_names *lv_names = (struct lv_names *) data;
2799
2800 return _rename_sub_lv(cmd, lv, lv_names->old, lv_names->new);
2801 }
2802
2803 /*
2804 * Loop down sub LVs and call fn for each.
2805 * fn is responsible for logging the necessary information on failure.
2806 */
2807 int for_each_sub_lv(struct cmd_context *cmd, struct logical_volume *lv,
2808 int (*fn)(struct cmd_context *cmd,
2809 struct logical_volume *lv, void *data),
2810 void *data)
2811 {
2812 struct logical_volume *org;
2813 struct lv_segment *seg;
2814 uint32_t s;
2815
2816 if (lv_is_cow(lv) && lv_is_virtual_origin(org = origin_from_cow(lv))) {
2817 if (!fn(cmd, org, data))
2818 return_0;
2819 if (!for_each_sub_lv(cmd, org, fn, data))
2820 return_0;
2821 }
2822
2823 dm_list_iterate_items(seg, &lv->segments) {
2824 if (seg->log_lv) {
2825 if (!fn(cmd, seg->log_lv, data))
2826 return_0;
2827 if (!for_each_sub_lv(cmd, seg->log_lv, fn, data))
2828 return_0;
2829 }
2830
2831 if (seg->metadata_lv) {
2832 if (!fn(cmd, seg->metadata_lv, data))
2833 return_0;
2834 if (!for_each_sub_lv(cmd, seg->metadata_lv, fn, data))
2835 return_0;
2836 }
2837
2838 for (s = 0; s < seg->area_count; s++) {
2839 if (seg_type(seg, s) != AREA_LV)
2840 continue;
2841 if (!fn(cmd, seg_lv(seg, s), data))
2842 return_0;
2843 if (!for_each_sub_lv(cmd, seg_lv(seg, s), fn, data))
2844 return_0;
2845 }
2846
2847 if (!seg_is_raid(seg))
2848 continue;
2849
2850 /* RAID has meta_areas */
2851 for (s = 0; s < seg->area_count; s++) {
2852 if (seg_metatype(seg, s) != AREA_LV)
2853 continue;
2854 if (!fn(cmd, seg_metalv(seg, s), data))
2855 return_0;
2856 if (!for_each_sub_lv(cmd, seg_metalv(seg, s), fn, data))
2857 return_0;
2858 }
2859 }
2860
2861 return 1;
2862 }
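/*
 * Illustrative sketch (not part of the original file): a minimal
 * for_each_sub_lv() callback that counts the sub LVs below an LV.  The
 * callback signature matches the fn parameter above.  Excluded from the
 * build.
 */
#if 0
static int _count_sub_lv_cb(struct cmd_context *cmd __attribute__((unused)),
			    struct logical_volume *lv __attribute__((unused)),
			    void *data)
{
	(*(uint32_t *) data)++;

	return 1;
}

static uint32_t _count_sub_lvs_sketch(struct cmd_context *cmd,
				      struct logical_volume *lv)
{
	uint32_t count = 0;

	if (!for_each_sub_lv(cmd, lv, _count_sub_lv_cb, &count))
		stack;

	return count;
}
#endif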
2863
2864
2865 /*
2866 * Core of LV renaming routine.
2867 * VG must be locked by caller.
2868 */
2869 int lv_rename(struct cmd_context *cmd, struct logical_volume *lv,
2870 const char *new_name)
2871 {
2872 struct volume_group *vg = lv->vg;
2873 struct lv_names lv_names;
2874 DM_LIST_INIT(lvs_changed);
2875 struct lv_list lvl, lvl2, *lvlp;
2876 int r = 0;
2877
2878 /* rename is not allowed on sub LVs */
2879 if (!lv_is_visible(lv)) {
2880 log_error("Cannot rename internal LV \"%s\".", lv->name);
2881 return 0;
2882 }
2883
2884 if (find_lv_in_vg(vg, new_name)) {
2885 log_error("Logical volume \"%s\" already exists in "
2886 "volume group \"%s\"", new_name, vg->name);
2887 return 0;
2888 }
2889
2890 if (lv->status & LOCKED) {
2891 log_error("Cannot rename locked LV %s", lv->name);
2892 return 0;
2893 }
2894
2895 if (!archive(vg))
2896 return 0;
2897
2898 /* rename sub LVs */
2899 lv_names.old = lv->name;
2900 lv_names.new = new_name;
2901 if (!for_each_sub_lv(cmd, lv, _rename_cb, (void *) &lv_names))
2902 return 0;
2903
2904 /* rename main LV */
2905 if (!(lv->name = dm_pool_strdup(cmd->mem, new_name))) {
2906 log_error("Failed to allocate space for new name");
2907 return 0;
2908 }
2909
2910 lvl.lv = lv;
2911 dm_list_add(&lvs_changed, &lvl.list);
2912
2913 /* rename active virtual origin too */
2914 if (lv_is_cow(lv) && lv_is_virtual_origin(lvl2.lv = origin_from_cow(lv)))
2915 dm_list_add_h(&lvs_changed, &lvl2.list);
2916
2917 log_verbose("Writing out updated volume group");
2918 if (!vg_write(vg))
2919 return 0;
2920
2921 if (!suspend_lvs(cmd, &lvs_changed, vg))
2922 goto_out;
2923
2924 if (!(r = vg_commit(vg)))
2925 stack;
2926
2927 /*
2928 * FIXME: resume LVs in reverse order to prevent memory
2929 * lock imbalance when resuming virtual snapshot origin
2930 * (resume of snapshot resumes origin too)
2931 */
2932 dm_list_iterate_back_items(lvlp, &lvs_changed)
2933 if (!resume_lv(cmd, lvlp->lv))
2934 stack;
2935 out:
2936 backup(vg);
2937 return r;
2938 }
2939
2940 char *generate_lv_name(struct volume_group *vg, const char *format,
2941 char *buffer, size_t len)
2942 {
2943 struct lv_list *lvl;
2944 int high = -1, i;
2945
2946 dm_list_iterate_items(lvl, &vg->lvs) {
2947 if (sscanf(lvl->lv->name, format, &i) != 1)
2948 continue;
2949
2950 if (i > high)
2951 high = i;
2952 }
2953
2954 if (dm_snprintf(buffer, len, format, high + 1) < 0)
2955 return NULL;
2956
2957 return buffer;
2958 }
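/*
 * Illustrative sketch (not part of the original file): picking the next
 * free "lvol%d" name in a VG, the same pattern lv_create_empty() below
 * relies on when the requested name contains "%d".  If lvol0 and lvol2
 * already exist, "lvol3" is returned in buf.  Excluded from the build.
 */
#if 0
static const char *_next_lvol_name_sketch(struct volume_group *vg,
					  char *buf, size_t len)
{
	return generate_lv_name(vg, "lvol%d", buf, len);
}
#endif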
2959
2960 int vg_max_lv_reached(struct volume_group *vg)
2961 {
2962 if (!vg->max_lv)
2963 return 0;
2964
2965 if (vg->max_lv > vg_visible_lvs(vg))
2966 return 0;
2967
2968 log_verbose("Maximum number of logical volumes (%u) reached "
2969 "in volume group %s", vg->max_lv, vg->name);
2970
2971 return 1;
2972 }
2973
2974 struct logical_volume *alloc_lv(struct dm_pool *mem)
2975 {
2976 struct logical_volume *lv;
2977
2978 if (!(lv = dm_pool_zalloc(mem, sizeof(*lv)))) {
2979 log_error("Unable to allocate logical volume structure");
2980 return NULL;
2981 }
2982
2983 lv->snapshot = NULL;
2984 dm_list_init(&lv->snapshot_segs);
2985 dm_list_init(&lv->segments);
2986 dm_list_init(&lv->tags);
2987 dm_list_init(&lv->segs_using_this_lv);
2988 dm_list_init(&lv->rsites);
2989
2990 return lv;
2991 }
2992
2993 /*
2994 * Create a new empty LV.
2995 */
2996 struct logical_volume *lv_create_empty(const char *name,
2997 union lvid *lvid,
2998 uint64_t status,
2999 alloc_policy_t alloc,
3000 struct volume_group *vg)
3001 {
3002 struct format_instance *fi = vg->fid;
3003 struct logical_volume *lv;
3004 char dname[NAME_LEN];
3005
3006 if (vg_max_lv_reached(vg))
3007 stack;
3008
3009 if (strstr(name, "%d") &&
3010 !(name = generate_lv_name(vg, name, dname, sizeof(dname)))) {
3011 log_error("Failed to generate unique name for the new "
3012 "logical volume");
3013 return NULL;
3014 } else if (find_lv_in_vg(vg, name)) {
3015 log_error("Unable to create LV %s in Volume Group %s: "
3016 "name already in use.", name, vg->name);
3017 return NULL;
3018 }
3019
3020 log_verbose("Creating logical volume %s", name);
3021
3022 if (!(lv = alloc_lv(vg->vgmem)))
3023 return_NULL;
3024
3025 if (!(lv->name = dm_pool_strdup(vg->vgmem, name)))
3026 goto_bad;
3027
3028 lv->status = status;
3029 lv->alloc = alloc;
3030 lv->read_ahead = vg->cmd->default_settings.read_ahead;
3031 lv->major = -1;
3032 lv->minor = -1;
3033 lv->size = UINT64_C(0);
3034 lv->le_count = 0;
3035
3036 if (lvid)
3037 lv->lvid = *lvid;
3038
3039 if (!link_lv_to_vg(vg, lv))
3040 goto_bad;
3041
3042 if (!lv_set_creation(lv, NULL, 0))
3043 goto_bad;
3044
3045 if (fi->fmt->ops->lv_setup && !fi->fmt->ops->lv_setup(fi, lv))
3046 goto_bad;
3047
3048 return lv;
3049 bad:
3050 dm_pool_free(vg->vgmem, lv);
3051 return NULL;
3052 }
3053
3054 static int _add_pvs(struct cmd_context *cmd, struct pv_segment *peg,
3055 uint32_t s __attribute__((unused)), void *data)
3056 {
3057 struct seg_pvs *spvs = (struct seg_pvs *) data;
3058 struct pv_list *pvl;
3059
3060 /* Don't add again if it's already on list. */
3061 if (find_pv_in_pv_list(&spvs->pvs, peg->pv))
3062 return 1;
3063
3064 if (!(pvl = dm_pool_alloc(cmd->mem, sizeof(*pvl)))) {
3065 log_error("pv_list allocation failed");
3066 return 0;
3067 }
3068
3069 pvl->pv = peg->pv;
3070
3071 dm_list_add(&spvs->pvs, &pvl->list);
3072
3073 return 1;
3074 }
3075
3076 /*
3077 * Construct dm_list of segments of LVs showing which PVs they use.
3078 * For pvmove we use the *parent* LV so we can pick up stripes & existing mirrors etc.
3079 */
3080 struct dm_list *build_parallel_areas_from_lv(struct logical_volume *lv,
3081 unsigned use_pvmove_parent_lv)
3082 {
3083 struct cmd_context *cmd = lv->vg->cmd;
3084 struct dm_list *parallel_areas;
3085 struct seg_pvs *spvs;
3086 uint32_t current_le = 0;
3087 uint32_t raid_multiple;
3088 struct lv_segment *seg = first_seg(lv);
3089
3090 if (!(parallel_areas = dm_pool_alloc(cmd->mem, sizeof(*parallel_areas)))) {
3091 log_error("parallel_areas allocation failed");
3092 return NULL;
3093 }
3094
3095 dm_list_init(parallel_areas);
3096
3097 do {
3098 if (!(spvs = dm_pool_zalloc(cmd->mem, sizeof(*spvs)))) {
3099 log_error("allocation failed");
3100 return NULL;
3101 }
3102
3103 dm_list_init(&spvs->pvs);
3104
3105 spvs->le = current_le;
3106 spvs->len = lv->le_count - current_le;
3107
3108 dm_list_add(parallel_areas, &spvs->list);
3109
3110 if (use_pvmove_parent_lv && !(seg = find_seg_by_le(lv, current_le))) {
3111 log_error("Failed to find segment for %s extent %" PRIu32,
3112 lv->name, current_le);
3113 return 0;
3114 }
3115
3116 /* Find next segment end */
3117 /* FIXME Unnecessary nesting! */
3118 if (!_for_each_pv(cmd, use_pvmove_parent_lv ? seg->pvmove_source_seg->lv : lv,
3119 use_pvmove_parent_lv ? seg->pvmove_source_seg->le : current_le,
3120 use_pvmove_parent_lv ? spvs->len * _calc_area_multiple(seg->pvmove_source_seg->segtype, seg->pvmove_source_seg->area_count, 0) : spvs->len,
3121 use_pvmove_parent_lv ? seg->pvmove_source_seg : NULL,
3122 &spvs->len,
3123 0, 0, -1, 0, _add_pvs, (void *) spvs))
3124 return_NULL;
3125
3126 current_le = spvs->le + spvs->len;
3127 raid_multiple = (seg->segtype->parity_devs) ?
3128 seg->area_count - seg->segtype->parity_devs : 1;
3129 } while ((current_le * raid_multiple) < lv->le_count);
3130
3131 /* FIXME Merge adjacent segments with identical PV lists (avoids need for contiguous allocation attempts between successful allocations) */
3132
3133 return parallel_areas;
3134 }
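/*
 * Illustrative sketch (not part of the original file): walking the list
 * returned by build_parallel_areas_from_lv() and logging which PVs each
 * extent range of the LV uses.  Excluded from the build.
 */
#if 0
static int _log_lv_pv_usage_sketch(struct logical_volume *lv)
{
	struct dm_list *parallel_areas;
	struct seg_pvs *spvs;
	struct pv_list *pvl;

	if (!(parallel_areas = build_parallel_areas_from_lv(lv, 0)))
		return_0;

	dm_list_iterate_items(spvs, parallel_areas) {
		log_debug("%s: extents %" PRIu32 "-%" PRIu32 " use:",
			  lv->name, spvs->le, spvs->le + spvs->len - 1);
		dm_list_iterate_items(pvl, &spvs->pvs)
			log_debug("  %s", pv_dev_name(pvl->pv));
	}

	return 1;
}
#endif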
3135
3136 int link_lv_to_vg(struct volume_group *vg, struct logical_volume *lv)
3137 {
3138 struct lv_list *lvl;
3139
3140 if (vg_max_lv_reached(vg))
3141 stack;
3142
3143 if (!(lvl = dm_pool_zalloc(vg->vgmem, sizeof(*lvl))))
3144 return_0;
3145
3146 lvl->lv = lv;
3147 lv->vg = vg;
3148 dm_list_add(&vg->lvs, &lvl->list);
3149
3150 return 1;
3151 }
3152
3153 int unlink_lv_from_vg(struct logical_volume *lv)
3154 {
3155 struct lv_list *lvl;
3156
3157 if (!(lvl = find_lv_in_vg(lv->vg, lv->name)))
3158 return_0;
3159
3160 dm_list_del(&lvl->list);
3161
3162 return 1;
3163 }
3164
3165 void lv_set_visible(struct logical_volume *lv)
3166 {
3167 if (lv_is_visible(lv))
3168 return;
3169
3170 lv->status |= VISIBLE_LV;
3171
3172 log_debug("LV %s in VG %s is now visible.", lv->name, lv->vg->name);
3173 }
3174
3175 void lv_set_hidden(struct logical_volume *lv)
3176 {
3177 if (!lv_is_visible(lv))
3178 return;
3179
3180 lv->status &= ~VISIBLE_LV;
3181
3182 log_debug("LV %s in VG %s is now hidden.", lv->name, lv->vg->name);
3183 }
3184
3185 int lv_remove_single(struct cmd_context *cmd, struct logical_volume *lv,
3186 const force_t force)
3187 {
3188 struct volume_group *vg;
3189 struct lvinfo info;
3190 struct logical_volume *format1_origin = NULL;
3191 int format1_reload_required = 0;
3192 int visible;
3193 struct logical_volume *pool_lv = NULL;
3194
3195 vg = lv->vg;
3196
3197 if (!vg_check_status(vg, LVM_WRITE))
3198 return_0;
3199
3200 if (lv_is_origin(lv)) {
3201 log_error("Can't remove logical volume \"%s\" under snapshot",
3202 lv->name);
3203 return 0;
3204 }
3205
3206 if (lv->status & MIRROR_IMAGE) {
3207 log_error("Can't remove logical volume %s used by a mirror",
3208 lv->name);
3209 return 0;
3210 }
3211
3212 if (lv->status & MIRROR_LOG) {
3213 log_error("Can't remove logical volume %s used as mirror log",
3214 lv->name);
3215 return 0;
3216 }
3217
3218 if (lv->status & (RAID_META | RAID_IMAGE)) {
3219 log_error("Can't remove logical volume %s used as RAID device",
3220 lv->name);
3221 return 0;
3222 }
3223
3224 if (lv_is_thin_pool_data(lv) || lv_is_thin_pool_metadata(lv)) {
3225 log_error("Can't remove logical volume %s used by a thin pool.",
3226 lv->name);
3227 return 0;
3228 } else if (lv_is_thin_volume(lv))
3229 pool_lv = first_seg(lv)->pool_lv;
3230
3231 if (lv->status & LOCKED) {
3232 log_error("Can't remove locked LV %s", lv->name);
3233 return 0;
3234 }
3235
3236 /* FIXME Ensure not referred to by another existing LVs */
3237
3238 if (lv_info(cmd, lv, 0, &info, 1, 0)) {
3239 if (!lv_check_not_in_use(cmd, lv, &info))
3240 return_0;
3241
3242 if ((force == PROMPT) &&
3243 lv_is_visible(lv) &&
3244 lv_is_active(lv) &&
3245 yes_no_prompt("Do you really want to remove active "
3246 "%slogical volume %s? [y/n]: ",
3247 vg_is_clustered(vg) ? "clustered " : "",
3248 lv->name) == 'n') {
3249 log_error("Logical volume %s not removed", lv->name);
3250 return 0;
3251 }
3252 }
3253
3254 if (!archive(vg))
3255 return 0;
3256
3257 if (lv_is_cow(lv)) {
3258 /* Old format1 code */
3259 if (!(lv->vg->fid->fmt->features & FMT_MDAS))
3260 format1_origin = origin_from_cow(lv);
3261
3262 log_verbose("Removing snapshot %s", lv->name);
3263 /* vg_remove_snapshot() will preload origin/former snapshots */
3264 if (!vg_remove_snapshot(lv))
3265 return_0;
3266 }
3267
3268 /* FIXME Review and fix the snapshot error paths! */
3269 if (!deactivate_lv(cmd, lv)) {
3270 log_error("Unable to deactivate logical volume \"%s\"",
3271 lv->name);
3272 return 0;
3273 }
3274
3275 /* Clear thin pool stacked messages */
3276 if (pool_lv && !pool_has_message(first_seg(pool_lv), lv, 0) &&
3277 !update_pool_lv(pool_lv, 1)) {
3278 log_error("Failed to update thin pool %s.", pool_lv->name);
3279 return 0;
3280 }
3281
3282 visible = lv_is_visible(lv);
3283
3284 log_verbose("Releasing logical volume \"%s\"", lv->name);
3285 if (!lv_remove(lv)) {
3286 log_error("Error releasing logical volume \"%s\"", lv->name);
3287 return 0;
3288 }
3289
3290 /*
3291 * Old format1 code: If no snapshots left reload without -real.
3292 */
3293 if (format1_origin && !lv_is_origin(format1_origin)) {
3294 log_warn("WARNING: Support for snapshots with old LVM1-style metadata is deprecated.");
3295 log_warn("WARNING: Please use lvconvert to update to lvm2 metadata at your convenience.");
3296 format1_reload_required = 1;
3297 }
3298
3299 /* store it on disks */
3300 if (!vg_write(vg))
3301 return_0;
3302
3303 /* format1 */
3304 if (format1_reload_required && !suspend_lv(cmd, format1_origin))
3305 log_error("Failed to refresh %s without snapshot.", format1_origin->name);
3306
3307 if (!vg_commit(vg))
3308 return_0;
3309
3310 /* format1 */
3311 if (format1_reload_required && !resume_lv(cmd, format1_origin)) {
3312 log_error("Failed to resume %s.", format1_origin->name);
3313 return 0;
3314 }
3315
3316 /* Release unneeded blocks in thin pool */
3317 /* TODO: defer when multiple LVs released at once */
3318 if (pool_lv && !update_pool_lv(pool_lv, 1)) {
3319 log_error("Failed to update thin pool %s.", pool_lv->name);
3320 return 0;
3321 }
3322
3323 backup(vg);
3324
3325 if (visible)
3326 log_print("Logical volume \"%s\" successfully removed", lv->name);
3327
3328 return 1;
3329 }
3330
3331 /*
3332 * Remove LVs together with their dependencies - LV leaf nodes should be removed first
3333 */
3334 int lv_remove_with_dependencies(struct cmd_context *cmd, struct logical_volume *lv,
3335 const force_t force, unsigned level)
3336 {
3337 percent_t snap_percent;
3338 struct dm_list *snh, *snht;
3339 struct seg_list *sl, *tsl;
3340 struct lvinfo info;
3341
3342 if (lv_is_cow(lv)) {
3343 /*
3344 * A merging snapshot cannot be removed directly unless
3345 * it has been invalidated or removal of the failed merge is requested.
3346 */
3347 if (lv_is_merging_cow(lv) && !level) {
3348 if (lv_info(lv->vg->cmd, lv, 0, &info, 1, 0) &&
3349 info.exists && info.live_table) {
3350 if (!lv_snapshot_percent(lv, &snap_percent)) {
3351 log_error("Failed to obtain merging snapshot progress percentage for logical volume %s.",
3352 lv->name);
3353 return 0;
3354 }
3355 if ((snap_percent != PERCENT_INVALID) &&
3356 (snap_percent != PERCENT_MERGE_FAILED)) {
3357 log_error("Can't remove merging snapshot logical volume \"%s\"",
3358 lv->name);
3359 return 0;
3360 } else if ((snap_percent == PERCENT_MERGE_FAILED) &&
3361 (force == PROMPT) &&
3362 yes_no_prompt("Removing snapshot \"%s\" that failed to merge may leave origin \"%s\" inconsistent. "
3363 "Proceed? [y/n]: ", lv->name, origin_from_cow(lv)->name) == 'n') {
3364 log_error("Logical volume %s not removed.", lv->name);
3365 return 0;
3366 }
3367 }
3368 }
3369 }
3370
3371 if (lv_is_origin(lv)) {
3372 /* Remove snapshot LVs first */
3373 if ((force == PROMPT) &&
3374 /* Active snapshot already needs to confirm each active LV */
3375 !lv_is_active(lv) &&
3376 yes_no_prompt("Removing origin %s will also remove %u "
3377 "snapshots(s). Proceed? [y/n]: ",
3378 lv->name, lv->origin_count) == 'n') {
3379 log_error("Logical volume %s not removed.", lv->name);
3380 return 0;
3381 }
3382
3383 dm_list_iterate_safe(snh, snht, &lv->snapshot_segs)
3384 if (!lv_remove_with_dependencies(cmd, dm_list_struct_base(snh, struct lv_segment,
3385 origin_list)->cow,
3386 force, level + 1))
3387 return_0;
3388 }
3389
3390 if (lv_is_used_thin_pool(lv)) {
3391 /* Remove thin LVs first */
3392 if ((force == PROMPT) &&
3393 yes_no_prompt("Removing pool %s will also remove %u "
3394 "thin volume(s). OK? [y/n]: ", lv->name,
3395 /* Note: Snapshots not included */
3396 dm_list_size(&lv->segs_using_this_lv)) == 'n') {
3397 log_error("Logical volume %s not removed.", lv->name);
3398 return 0;
3399 }
3400
3401 dm_list_iterate_items_safe(sl, tsl, &lv->segs_using_this_lv)
3402 if (!lv_remove_with_dependencies(cmd, sl->seg->lv,
3403 force, level + 1))
3404 return_0;
3405 }
3406
3407 return lv_remove_single(cmd, lv, force);
3408 }
3409
3410 /*
3411 * insert_layer_for_segments_on_pv() inserts a layer segment for a segment area.
3412 * However, layer modification could split the underlying layer segment.
3413 * This function splits the parent area as needed to keep the 1:1 relationship
3414 * between the parent area and the underlying layer segment.
3415 * Since the layer LV might have other layers below, build_parallel_areas()
3416 * is used to find the lowest-level segment boundaries.
3417 */
3418 static int _split_parent_area(struct lv_segment *seg, uint32_t s,
3419 struct dm_list *layer_seg_pvs)
3420 {
3421 uint32_t parent_area_len, parent_le, layer_le;
3422 uint32_t area_multiple;
3423 struct seg_pvs *spvs;
3424
3425 if (seg_is_striped(seg))
3426 area_multiple = seg->area_count;
3427 else
3428 area_multiple = 1;
3429
3430 parent_area_len = seg->area_len;
3431 parent_le = seg->le;
3432 layer_le = seg_le(seg, s);
3433
3434 while (parent_area_len > 0) {
3435 /* Find the layer segment pointed at */
3436 if (!(spvs = _find_seg_pvs_by_le(layer_seg_pvs, layer_le))) {
3437 log_error("layer segment for %s:%" PRIu32 " not found",
3438 seg->lv->name, parent_le);
3439 return 0;
3440 }
3441
3442 if (spvs->le != layer_le) {
3443 log_error("Incompatible layer boundary: "
3444 "%s:%" PRIu32 "[%" PRIu32 "] on %s:%" PRIu32,
3445 seg->lv->name, parent_le, s,
3446 seg_lv(seg, s)->name, layer_le);
3447 return 0;
3448 }
3449
3450 if (spvs->len < parent_area_len) {
3451 parent_le += spvs->len * area_multiple;
3452 if (!lv_split_segment(seg->lv, parent_le))
3453 return_0;
3454 }
3455
3456 parent_area_len -= spvs->len;
3457 layer_le += spvs->len;
3458 }
3459
3460 return 1;
3461 }
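/*
 * Worked example (illustrative, not part of the original file): suppose a
 * parent segment of area_len 100 maps onto a layer LV whose parallel-area
 * list has boundaries at layer LEs 0-59 and 60-99.  The loop above finds
 * the first layer segment (len 60 < 100), advances parent_le by
 * 60 * area_multiple and calls lv_split_segment() there, leaving two
 * parent segments that each map 1:1 onto one layer segment; the second
 * iteration then matches exactly and no further split is needed.
 */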
3462
3463 /*
3464 * Split the parent LV segments if the layer LV below them is split.
3465 */
3466 int split_parent_segments_for_layer(struct cmd_context *cmd,
3467 struct logical_volume *layer_lv)
3468 {
3469 struct lv_list *lvl;
3470 struct logical_volume *parent_lv;
3471 struct lv_segment *seg;
3472 uint32_t s;
3473 struct dm_list *parallel_areas;
3474
3475 if (!(parallel_areas = build_parallel_areas_from_lv(layer_lv, 0)))
3476 return_0;
3477
3478 /* Loop through all LVs except itself */
3479 dm_list_iterate_items(lvl, &layer_lv->vg->lvs) {
3480 parent_lv = lvl->lv;
3481 if (parent_lv == layer_lv)
3482 continue;
3483
3484 /* Find all segments that point at the layer LV */
3485 dm_list_iterate_items(seg, &parent_lv->segments) {
3486 for (s = 0; s < seg->area_count; s++) {
3487 if (seg_type(seg, s) != AREA_LV ||
3488 seg_lv(seg, s) != layer_lv)
3489 continue;
3490
3491 if (!_split_parent_area(seg, s, parallel_areas))
3492 return_0;
3493 }
3494 }
3495 }
3496
3497 return 1;
3498 }
3499
3500 /* Remove a layer from the LV */
3501 int remove_layers_for_segments(struct cmd_context *cmd,
3502 struct logical_volume *lv,
3503 struct logical_volume *layer_lv,
3504 uint64_t status_mask, struct dm_list *lvs_changed)
3505 {
3506 struct lv_segment *seg, *lseg;
3507 uint32_t s;
3508 int lv_changed = 0;
3509 struct lv_list *lvl;
3510
3511 log_very_verbose("Removing layer %s for segments of %s",
3512 layer_lv->name, lv->name);
3513
3514 /* Find all segments that point at the temporary mirror */
3515 dm_list_iterate_items(seg, &lv->segments) {
3516 for (s = 0; s < seg->area_count; s++) {
3517 if (seg_type(seg, s) != AREA_LV ||
3518 seg_lv(seg, s) != layer_lv)
3519 continue;
3520
3521 /* Find the layer segment pointed at */
3522 if (!(lseg = find_seg_by_le(layer_lv, seg_le(seg, s)))) {
3523 log_error("Layer segment found: %s:%" PRIu32,
3524 layer_lv->name, seg_le(seg, s));
3525 return 0;
3526 }
3527
3528 /* Check the segment params are compatible */
3529 if (!seg_is_striped(lseg) || lseg->area_count != 1) {
3530 log_error("Layer is not linear: %s:%" PRIu32,
3531 layer_lv->name, lseg->le);
3532 return 0;
3533 }
3534 if ((lseg->status & status_mask) != status_mask) {
3535 log_error("Layer status does not match: "
3536 "%s:%" PRIu32 " status: 0x%" PRIx64 "/0x%" PRIx64,
3537 layer_lv->name, lseg->le,
3538 lseg->status, status_mask);
3539 return 0;
3540 }
3541 if (lseg->le != seg_le(seg, s) ||
3542 lseg->area_len != seg->area_len) {
3543 log_error("Layer boundary mismatch: "
3544 "%s:%" PRIu32 "-%" PRIu32 " on "
3545 "%s:%" PRIu32 " / "
3546 "%" PRIu32 "-%" PRIu32 " / ",
3547 lv->name, seg->le, seg->area_len,
3548 layer_lv->name, seg_le(seg, s),
3549 lseg->le, lseg->area_len);
3550 return 0;
3551 }
3552
3553 if (!move_lv_segment_area(seg, s, lseg, 0))
3554 return_0;
3555
3556 /* Replace mirror with error segment */
3557 if (!(lseg->segtype =
3558 get_segtype_from_string(lv->vg->cmd, "error"))) {
3559 log_error("Missing error segtype");
3560 return 0;
3561 }
3562 lseg->area_count = 0;
3563
3564 /* First time, add LV to list of LVs affected */
3565 if (!lv_changed && lvs_changed) {
3566 if (!(lvl = dm_pool_alloc(cmd->mem, sizeof(*lvl)))) {
3567 log_error("lv_list alloc failed");
3568 return 0;
3569 }
3570 lvl->lv = lv;
3571 dm_list_add(lvs_changed, &lvl->list);
3572 lv_changed = 1;
3573 }
3574 }
3575 }
3576 if (lv_changed && !lv_merge_segments(lv))
3577 stack;
3578
3579 return 1;
3580 }
3581
3582 /* Remove a layer */
3583 int remove_layers_for_segments_all(struct cmd_context *cmd,
3584 struct logical_volume *layer_lv,
3585 uint64_t status_mask,
3586 struct dm_list *lvs_changed)
3587 {
3588 struct lv_list *lvl;
3589 struct logical_volume *lv1;
3590
3591 /* Loop through all LVs except the temporary mirror */
3592 dm_list_iterate_items(lvl, &layer_lv->vg->lvs) {
3593 lv1 = lvl->lv;
3594 if (lv1 == layer_lv)
3595 continue;
3596
3597 if (!remove_layers_for_segments(cmd, lv1, layer_lv,
3598 status_mask, lvs_changed))
3599 return_0;
3600 }
3601
3602 if (!lv_empty(layer_lv))
3603 return_0;
3604
3605 return 1;
3606 }
3607
3608 int move_lv_segments(struct logical_volume *lv_to,
3609 struct logical_volume *lv_from,
3610 uint64_t set_status, uint64_t reset_status)
3611 {
3612 struct lv_segment *seg;
3613
3614 dm_list_iterate_items(seg, &lv_to->segments)
3615 if (seg->origin) {
3616 log_error("Can't move snapshot segment.");
3617 return 0;
3618 }
3619
3620 dm_list_init(&lv_to->segments);
3621 dm_list_splice(&lv_to->segments, &lv_from->segments);
3622
3623 dm_list_iterate_items(seg, &lv_to->segments) {
3624 seg->lv = lv_to;
3625 seg->status &= ~reset_status;
3626 seg->status |= set_status;
3627 }
3628
3629 lv_to->le_count = lv_from->le_count;
3630 lv_to->size = lv_from->size;
3631
3632 lv_from->le_count = 0;
3633 lv_from->size = 0;
3634
3635 return 1;
3636 }
3637
3638 /* Remove a layer from the LV */
3639 int remove_layer_from_lv(struct logical_volume *lv,
3640 struct logical_volume *layer_lv)
3641 {
3642 struct logical_volume *parent;
3643 struct lv_segment *parent_seg;
3644 struct segment_type *segtype;
3645
3646 log_very_verbose("Removing layer %s for %s", layer_lv->name, lv->name);
3647
3648 if (!(parent_seg = get_only_segment_using_this_lv(layer_lv))) {
3649 log_error("Failed to find layer %s in %s",
3650 layer_lv->name, lv->name);
3651 return 0;
3652 }
3653 parent = parent_seg->lv;
3654
3655 /*
3656 * Before removal, the layer should be cleaned up,
3657 * i.e. additional segments and areas should have been removed.
3658 */
3659 if (dm_list_size(&parent->segments) != 1 ||
3660 parent_seg->area_count != 1 ||
3661 seg_type(parent_seg, 0) != AREA_LV ||
3662 layer_lv != seg_lv(parent_seg, 0) ||
3663 parent->le_count != layer_lv->le_count)
3664 return_0;
3665
3666 if (!lv_empty(parent))
3667 return_0;
3668
3669 if (!move_lv_segments(parent, layer_lv, 0, 0))
3670 return_0;
3671
3672 /* Replace the empty layer with error segment */
3673 segtype = get_segtype_from_string(lv->vg->cmd, "error");
3674 if (!lv_add_virtual_segment(layer_lv, 0, parent->le_count, segtype, NULL))
3675 return_0;
3676
3677 return 1;
3678 }
3679
3680 /*
3681 * Create and insert a linear LV "above" lv_where.
3682 * After the insertion, a new LV named lv_where->name + suffix is created
3683 * and all segments of lv_where are moved to the new LV.
3684 * lv_where will have a single segment which maps linearly to the new LV.
3685 */
3686 struct logical_volume *insert_layer_for_lv(struct cmd_context *cmd,
3687 struct logical_volume *lv_where,
3688 uint64_t status,
3689 const char *layer_suffix)
3690 {
3691 int r;
3692 char *name;
3693 size_t len;
3694 struct str_list *sl;
3695 struct logical_volume *layer_lv;
3696 struct segment_type *segtype;
3697 struct lv_segment *mapseg;
3698 unsigned exclusive = 0;
3699
3700 /* create an empty layer LV */
3701 len = strlen(lv_where->name) + 32;
3702 if (!(name = alloca(len))) {
3703 log_error("layer name allocation failed. "
3704 "Remove new LV and retry.");
3705 return NULL;
3706 }
3707
3708 if (dm_snprintf(name, len, "%s%s", lv_where->name, layer_suffix) < 0) {
3709 log_error("layer name allocation failed. "
3710 "Remove new LV and retry.");
3711 return NULL;
3712 }
3713
3714 if (!(layer_lv = lv_create_empty(name, NULL, LVM_READ | LVM_WRITE,
3715 ALLOC_INHERIT, lv_where->vg))) {
3716 log_error("Creation of layer LV failed");
3717 return NULL;
3718 }
3719
3720 if (lv_is_active_exclusive_locally(lv_where))
3721 exclusive = 1;
3722
3723 if (lv_is_active(lv_where) && strstr(name, "_mimagetmp")) {
3724 log_very_verbose("Creating transient LV %s for mirror conversion in VG %s.", name, lv_where->vg->name);
3725
3726 segtype = get_segtype_from_string(cmd, "error");
3727
3728 if (!lv_add_virtual_segment(layer_lv, 0, lv_where->le_count, segtype, NULL)) {
3729 log_error("Creation of transient LV %s for mirror conversion in VG %s failed.", name, lv_where->vg->name);
3730 return NULL;
3731 }
3732
3733 /* Temporary tags for activation of the transient LV */
3734 dm_list_iterate_items(sl, &lv_where->tags)
3735 if (!str_list_add(cmd->mem, &layer_lv->tags, sl->str)) {
3736 log_error("Aborting. Unable to tag"
3737 " transient mirror layer.");
3738 return NULL;
3739 }
3740
3741 if (!vg_write(lv_where->vg)) {
3742 log_error("Failed to write intermediate VG %s metadata for mirror conversion.", lv_where->vg->name);
3743 return NULL;
3744 }
3745
3746 if (!vg_commit(lv_where->vg)) {
3747 log_error("Failed to commit intermediate VG %s metadata for mirror conversion.", lv_where->vg->name);
3748 vg_revert(lv_where->vg);
3749 return NULL;
3750 }
3751
3752 if (exclusive)
3753 r = activate_lv_excl(cmd, layer_lv);
3754 else
3755 r = activate_lv(cmd, layer_lv);
3756
3757 if (!r) {
3758 log_error("Failed to resume transient LV"
3759 " %s for mirror conversion in VG %s.",
3760 name, lv_where->vg->name);
3761 return NULL;
3762 }
3763
3764 /* Remove the temporary tags */
3765 dm_list_iterate_items(sl, &lv_where->tags)
3766 str_list_del(&layer_lv->tags, sl->str);
3767
3768 }
3769
3770 log_very_verbose("Inserting layer %s for %s",
3771 layer_lv->name, lv_where->name);
3772
3773 if (!move_lv_segments(layer_lv, lv_where, 0, 0))
3774 return_NULL;
3775
3776 if (!(segtype = get_segtype_from_string(cmd, "striped")))
3777 return_NULL;
3778
3779 /* allocate a new linear segment */
3780 if (!(mapseg = alloc_lv_segment(segtype, lv_where, 0, layer_lv->le_count,
3781 status, 0, NULL, NULL, 1, layer_lv->le_count,
3782 0, 0, 0, NULL)))
3783 return_NULL;
3784
3785 /* map the new segment's only area to the new layer LV */
3786 if (!set_lv_segment_area_lv(mapseg, 0, layer_lv, 0, 0))
3787 return_NULL;
3788
3789 /* add the new segment to the original LV (lv_where) */
3790 dm_list_add(&lv_where->segments, &mapseg->list);
3791 lv_where->le_count = layer_lv->le_count;
3792 lv_where->size = (uint64_t) lv_where->le_count * lv_where->vg->extent_size;
3793
3794 return layer_lv;
3795 }
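/*
 * Usage sketch (illustrative, not part of the original source): insert a
 * layer above an existing LV and commit the metadata.  The "_example"
 * suffix and the zero status flag are placeholders; real callers pass
 * whatever their conversion needs.
 *
 *	struct logical_volume *layer_lv;
 *
 *	if (!(layer_lv = insert_layer_for_lv(cmd, lv, 0, "_example")))
 *		return_NULL;
 *
 *	if (!vg_write(lv->vg) || !vg_commit(lv->vg))
 *		return_NULL;
 *
 * Afterwards lv has a single striped segment whose only area is AREA_LV
 * pointing at layer_lv, and layer_lv holds the original segments.
 */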
3796
3797 /*
3798 * Extend and insert a linear layer LV beneath the source segment area.
3799 */
3800 static int _extend_layer_lv_for_segment(struct logical_volume *layer_lv,
3801 struct lv_segment *seg, uint32_t s,
3802 uint64_t status)
3803 {
3804 struct lv_segment *mapseg;
3805 struct segment_type *segtype;
3806 struct physical_volume *src_pv = seg_pv(seg, s);
3807 uint32_t src_pe = seg_pe(seg, s);
3808
3809 if (seg_type(seg, s) != AREA_PV && seg_type(seg, s) != AREA_LV)
3810 return_0;
3811
3812 if (!(segtype = get_segtype_from_string(layer_lv->vg->cmd, "striped")))
3813 return_0;
3814
3815 /* FIXME Incomplete message? Needs more context */
3816 log_very_verbose("Inserting %s:%" PRIu32 "-%" PRIu32 " of %s/%s",
3817 pv_dev_name(src_pv),
3818 src_pe, src_pe + seg->area_len - 1,
3819 seg->lv->vg->name, seg->lv->name);
3820
3821 /* allocate a new segment */
3822 if (!(mapseg = alloc_lv_segment(segtype, layer_lv, layer_lv->le_count,
3823 seg->area_len, status, 0,
3824 NULL, NULL, 1, seg->area_len, 0, 0, 0, seg)))
3825 return_0;
3826
3827 /* map the new segment to the original underlying area */
3828 if (!move_lv_segment_area(mapseg, 0, seg, s))
3829 return_0;
3830
3831 /* add the new segment to the layer LV */
3832 dm_list_add(&layer_lv->segments, &mapseg->list);
3833 layer_lv->le_count += seg->area_len;
3834 layer_lv->size += (uint64_t) seg->area_len * layer_lv->vg->extent_size;
3835
3836 /* map the original area to the new segment */
3837 if (!set_lv_segment_area_lv(seg, s, layer_lv, mapseg->le, 0))
3838 return_0;
3839
3840 return 1;
3841 }
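/*
 * Worked example (sketch): if area s of seg maps PEs 100-149 of its PV
 * (area_len 50), _extend_layer_lv_for_segment() appends a 50-extent
 * striped segment to the end of layer_lv, moves the PV mapping into that
 * new segment, and repoints area s at the newly added extents of
 * layer_lv, growing layer_lv->le_count by 50.
 */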
3842
3843 /*
3844 * Match the segment area to PEs in the pvl
3845 * (the segment area boundary should be aligned to PE ranges by
3846 * _align_segment_boundary_to_pe_range() so that there is no partial overlap.)
3847 */
3848 static int _match_seg_area_to_pe_range(struct lv_segment *seg, uint32_t s,
3849 struct pv_list *pvl)
3850 {
3851 struct pe_range *per;
3852 uint32_t pe_start, per_end;
3853
3854 if (!pvl)
3855 return 1;
3856
3857 if (seg_type(seg, s) != AREA_PV || seg_dev(seg, s) != pvl->pv->dev)
3858 return 0;
3859
3860 pe_start = seg_pe(seg, s);
3861
3862 /* Do these PEs match any of the PEs in pvl? */
3863 dm_list_iterate_items(per, pvl->pe_ranges) {
3864 per_end = per->start + per->count - 1;
3865
3866 if ((pe_start < per->start) || (pe_start > per_end))
3867 continue;
3868
3869 /* FIXME Missing context in this message - add LV/seg details */
3870 log_debug("Matched PE range %s:%" PRIu32 "-%" PRIu32 " against "
3871 "%s %" PRIu32 " len %" PRIu32, dev_name(pvl->pv->dev),
3872 per->start, per_end, dev_name(seg_dev(seg, s)),
3873 seg_pe(seg, s), seg->area_len);
3874
3875 return 1;
3876 }
3877
3878 return 0;
3879 }
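/*
 * Worked example (sketch): with a pe_range of start 20 and count 20
 * (PEs 20-39), a segment area starting at PE 30 on the same device
 * matches (20 <= 30 <= 39), while one starting at PE 40 does not.
 */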
3880
3881 /*
3882 * For each segment in lv_where that uses a PV in pvl directly,
3883 * split the segment if it spans more than one underlying PV.
3884 */
3885 static int _align_segment_boundary_to_pe_range(struct logical_volume *lv_where,
3886 struct pv_list *pvl)
3887 {
3888 struct lv_segment *seg;
3889 struct pe_range *per;
3890 uint32_t pe_start, pe_end, per_end, stripe_multiplier, s;
3891
3892 if (!pvl)
3893 return 1;
3894
3895 /* Split LV segments to match PE ranges */
3896 dm_list_iterate_items(seg, &lv_where->segments) {
3897 for (s = 0; s < seg->area_count; s++) {
3898 if (seg_type(seg, s) != AREA_PV ||
3899 seg_dev(seg, s) != pvl->pv->dev)
3900 continue;
3901
3902 /* Does this area overlap any of the PE ranges in pvl? */
3903 dm_list_iterate_items(per, pvl->pe_ranges) {
3904 pe_start = seg_pe(seg, s);
3905 pe_end = pe_start + seg->area_len - 1;
3906 per_end = per->start + per->count - 1;
3907
3908 /* No overlap? */
3909 if ((pe_end < per->start) ||
3910 (pe_start > per_end))
3911 continue;
3912
3913 if (seg_is_striped(seg))
3914 stripe_multiplier = seg->area_count;
3915 else
3916 stripe_multiplier = 1;
3917
3918 if ((per->start != pe_start &&
3919 per->start > pe_start) &&
3920 !lv_split_segment(lv_where, seg->le +
3921 (per->start - pe_start) *
3922 stripe_multiplier))
3923 return_0;
3924
3925 if ((per_end != pe_end &&
3926 per_end < pe_end) &&
3927 !lv_split_segment(lv_where, seg->le +
3928 (per_end - pe_start + 1) *
3929 stripe_multiplier))
3930 return_0;
3931 }
3932 }
3933 }
3934
3935 return 1;
3936 }
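/*
 * Worked example (sketch): a linear segment covering PEs 0-99 of the PV
 * (stripe_multiplier 1) intersected with a pe_range of PEs 20-59 is split
 * twice, at seg->le + 20 and at seg->le + 60, so every resulting segment
 * lies either wholly inside or wholly outside the range before any layer
 * is inserted.
 */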
3937
3938 /*
3939 * Scan lv_where for segments on a PV in pvl, and for each one found
3940 * append a linear segment to layer_lv and insert it between the two.
3941 *
3942 * If pvl is NULL, a layer is placed under the whole of lv_where.
3943 * If the layer is inserted, lv_where is added to lvs_changed.
3944 */
3945 int insert_layer_for_segments_on_pv(struct cmd_context *cmd,
3946 struct logical_volume *lv_where,
3947 struct logical_volume *layer_lv,
3948 uint64_t status,
3949 struct pv_list *pvl,
3950 struct dm_list *lvs_changed)
3951 {
3952 struct lv_segment *seg;
3953 struct lv_list *lvl;
3954 int lv_used = 0;
3955 uint32_t s;
3956
3957 log_very_verbose("Inserting layer %s for segments of %s on %s",
3958 layer_lv->name, lv_where->name,
3959 pvl ? pv_dev_name(pvl->pv) : "any");
3960
3961 if (!_align_segment_boundary_to_pe_range(lv_where, pvl))
3962 return_0;
3963
3964 /* Work through all segments on the supplied PV */
3965 dm_list_iterate_items(seg, &lv_where->segments) {
3966 for (s = 0; s < seg->area_count; s++) {
3967 if (!_match_seg_area_to_pe_range(seg, s, pvl))
3968 continue;
3969
3970 /* First time, add LV to list of LVs affected */
3971 if (!lv_used && lvs_changed) {
3972 if (!(lvl = dm_pool_alloc(cmd->mem, sizeof(*lvl)))) {
3973 log_error("lv_list alloc failed");
3974 return 0;
3975 }
3976 lvl->lv = lv_where;
3977 dm_list_add(lvs_changed, &lvl->list);
3978 lv_used = 1;
3979 }
3980
3981 if (!_extend_layer_lv_for_segment(layer_lv, seg, s,
3982 status)) {
3983 log_error("Failed to insert segment in layer "
3984 "LV %s under %s:%" PRIu32 "-%" PRIu32,
3985 layer_lv->name, lv_where->name,
3986 seg->le, seg->le + seg->len);
3987 return 0;
3988 }
3989 }
3990 }
3991
3992 return 1;
3993 }
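/*
 * Usage sketch (illustrative, not part of the original source): place a
 * temporary layer under every segment of lv that lives on the PV in pvl,
 * collecting the touched LVs in lvs_changed.  The name, allocation policy
 * and zero status flag below are assumptions for the example; the real
 * caller sets these up according to its own needs.
 *
 *	struct dm_list lvs_changed;
 *	struct logical_volume *layer_lv;
 *
 *	dm_list_init(&lvs_changed);
 *
 *	if (!(layer_lv = lv_create_empty("pvmove%d", NULL,
 *					 LVM_READ | LVM_WRITE,
 *					 ALLOC_INHERIT, vg)))
 *		return_0;
 *
 *	if (!insert_layer_for_segments_on_pv(cmd, lv, layer_lv, 0,
 *					     pvl, &lvs_changed))
 *		return_0;
 */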
3994
3995 /*
3996 * Initialize the LV with 'value'.
3997 */
3998 int set_lv(struct cmd_context *cmd, struct logical_volume *lv,
3999 uint64_t sectors, int value)
4000 {
4001 struct device *dev;
4002 char *name;
4003
4004 /*
4005 * FIXME:
4006 * <clausen> also, more than 4k
4007 * <clausen> say, reiserfs puts its superblock 32k in, IIRC
4008 * <ejt_> k, I'll drop a fixme to that effect
4009 * (I know the device is at least 4k, but not 32k)
4010 */
4011 if (!(name = dm_pool_alloc(cmd->mem, PATH_MAX))) {
4012 log_error("Name allocation failed - device not cleared");
4013 return 0;
4014 }
4015
4016 if (dm_snprintf(name, PATH_MAX, "%s%s/%s", cmd->dev_dir,
4017 lv->vg->name, lv->name) < 0) {
4018 log_error("Name too long - device not cleared (%s)", lv->name);
4019 return 0;
4020 }
4021
4022 sync_local_dev_names(cmd); /* Wait until devices are available */
4023
4024 log_verbose("Clearing start of logical volume \"%s\"", lv->name);
4025
4026 if (!(dev = dev_cache_get(name, NULL))) {
4027 log_error("%s: not found: device not cleared", name);
4028 return 0;
4029 }
4030
4031 if (!dev_open_quiet(dev))
4032 return_0;
4033
4034 if (!sectors)
4035 sectors = UINT64_C(4096) >> SECTOR_SHIFT;
4036
4037 if (sectors > lv->size)
4038 sectors = lv->size;
4039
4040 if (!dev_set(dev, UINT64_C(0), (size_t) sectors << SECTOR_SHIFT, value))
4041 stack;
4042
4043 dev_flush(dev);
4044
4045 if (!dev_close_immediate(dev))
4046 stack;
4047
4048 return 1;
4049 }
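/*
 * Usage sketch (illustrative, not part of the original source): wipe the
 * start of a freshly activated LV, as _lv_create_an_lv() does further down
 * in this file.  Passing 0 sectors clears the default 4KiB; a non-zero
 * value clears that many sectors instead (capped at the LV size).
 *
 *	if (!set_lv(cmd, lv, UINT64_C(0), 0))
 *		log_error("Failed to wipe start of new LV %s.", lv->name);
 */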
4050
4051 static struct logical_volume *_create_virtual_origin(struct cmd_context *cmd,
4052 struct volume_group *vg,
4053 const char *lv_name,
4054 uint32_t permission,
4055 uint64_t voriginextents)
4056 {
4057 const struct segment_type *segtype;
4058 size_t len;
4059 char *vorigin_name;
4060 struct logical_volume *lv;
4061
4062 if (!(segtype = get_segtype_from_string(cmd, "zero"))) {
4063 log_error("Zero segment type for virtual origin not found");
4064 return NULL;
4065 }
4066
4067 len = strlen(lv_name) + 32;
4068 if (!(vorigin_name = alloca(len)) ||
4069 dm_snprintf(vorigin_name, len, "%s_vorigin", lv_name) < 0) {
4070 log_error("Virtual origin name allocation failed.");
4071 return NULL;
4072 }
4073
4074 if (!(lv = lv_create_empty(vorigin_name, NULL, permission,
4075 ALLOC_INHERIT, vg)))
4076 return_NULL;
4077
4078 if (!lv_extend(lv, segtype, 1, 0, 1, 0, voriginextents,
4079 NULL, NULL, ALLOC_INHERIT))
4080 return_NULL;
4081
4082 /* store vg on disk(s) */
4083 if (!vg_write(vg) || !vg_commit(vg))
4084 return_NULL;
4085
4086 backup(vg);
4087
4088 return lv;
4089 }
4090
4091 /* Thin notes:
4092 * If lp->thin OR lp->activate is AY*, activate the pool if not already active.
4093 * If lp->thin, create thin LV within the pool - as a snapshot if lp->snapshot.
4094 * If lp->activate is AY*, activate it.
4095 * If lp->activate was AN* and the pool was originally inactive, deactivate it.
4096 */
4097 static struct logical_volume *_lv_create_an_lv(struct volume_group *vg, struct lvcreate_params *lp,
4098 const char *new_lv_name)
4099 {
4100 struct cmd_context *cmd = vg->cmd;
4101 uint32_t size_rest;
4102 uint64_t status = UINT64_C(0);
4103 struct logical_volume *lv, *org = NULL;
4104 struct logical_volume *pool_lv;
4105 struct lv_list *lvl;
4106 int origin_active = 0;
4107 struct lvinfo info;
4108
4109 if (new_lv_name && find_lv_in_vg(vg, new_lv_name)) {
4110 log_error("Logical volume \"%s\" already exists in "
4111 "volume group \"%s\"", new_lv_name, lp->vg_name);
4112 return NULL;
4113 }
4114
4115 if (vg_max_lv_reached(vg)) {
4116 log_error("Maximum number of logical volumes (%u) reached "
4117 "in volume group %s", vg->max_lv, vg->name);
4118 return NULL;
4119 }
4120
4121 if ((segtype_is_mirrored(lp->segtype) ||
4122 segtype_is_raid(lp->segtype) || segtype_is_thin(lp->segtype)) &&
4123 !(vg->fid->fmt->features & FMT_SEGMENTS)) {
4124 log_error("Metadata does not support %s segments.",
4125 lp->segtype->name);
4126 return NULL;
4127 }
4128
4129 if (lp->read_ahead != DM_READ_AHEAD_AUTO &&
4130 lp->read_ahead != DM_READ_AHEAD_NONE &&
4131 (vg->fid->fmt->features & FMT_RESTRICTED_READAHEAD) &&
4132 (lp->read_ahead < 2 || lp->read_ahead > 120)) {
4133 log_error("Metadata only supports readahead values between 2 and 120.");
4134 return NULL;
4135 }
4136
4137 if (lp->stripe_size > vg->extent_size) {
4138 log_error("Reducing requested stripe size %s to maximum, "
4139 "physical extent size %s",
4140 display_size(cmd, (uint64_t) lp->stripe_size),
4141 display_size(cmd, (uint64_t) vg->extent_size));
4142 lp->stripe_size = vg->extent_size;
4143 }
4144
4145 /* Need to check the vg's format to verify this - the cmd format isn't set up properly yet */
4146 if (lp->stripes > 1 &&
4147 !(vg->fid->fmt->features & FMT_UNLIMITED_STRIPESIZE) &&
4148 (lp->stripe_size > STRIPE_SIZE_MAX)) {
4149 log_error("Stripe size may not exceed %s",
4150 display_size(cmd, (uint64_t) STRIPE_SIZE_MAX));
4151 return NULL;
4152 }
4153
4154 if ((size_rest = lp->extents % lp->stripes)) {
4155 log_print("Rounding size (%d extents) up to stripe boundary "
4156 "size (%d extents)", lp->extents,
4157 lp->extents - size_rest + lp->stripes);
4158 lp->extents = lp->extents - size_rest + lp->stripes;
4159 }
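	/*
	 * Worked example (sketch): lp->extents = 10 with lp->stripes = 3
	 * gives size_rest = 1, so the request is rounded up to
	 * 10 - 1 + 3 = 12 extents, i.e. 4 per stripe.
	 */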
4160
4161 /* Does LV need to be zeroed? Thin handles this as a per-pool in-kernel setting. */
4162 if (lp->zero && !segtype_is_thin(lp->segtype) && !activation()) {
4163 log_error("Can't wipe start of new LV without using "
4164 "device-mapper kernel driver");
4165 return NULL;
4166 }
4167
4168 status |= lp->permission | VISIBLE_LV;
4169
4170 if (lp->snapshot && lp->thin) {
4171 if (!(org = find_lv(vg, lp->origin))) {
4172 log_error("Couldn't find origin volume '%s'.",
4173 lp->origin);
4174 return NULL;
4175 }
4176
4177 if (org->status & LOCKED) {
4178 log_error("Snapshots of locked devices are not supported.");
4179 return NULL;
4180 }
4181
4182 lp->voriginextents = org->le_count;
4183 } else if (lp->snapshot) {
4184 if (!activation()) {
4185 log_error("Can't create snapshot without using "
4186 "device-mapper kernel driver");
4187 return NULL;
4188 }
4189
4190 /* Must zero cow */
4191 status |= LVM_WRITE;
4192
4193 if (lp->voriginsize)
4194 origin_active = 1;
4195 else {
4196
4197 if (!(org = find_lv(vg, lp->origin))) {
4198 log_error("Couldn't find origin volume '%s'.",
4199 lp->origin);
4200 return NULL;
4201 }
4202 if (lv_is_virtual_origin(org)) {
4203 log_error("Can't share virtual origins. "
4204 "Use --virtualsize.");
4205 return NULL;
4206 }
4207 if (lv_is_cow(org)) {
4208 log_error("Snapshots of snapshots are not "
4209 "supported yet.");
4210 return NULL;
4211 }
4212 if (org->status & LOCKED) {
4213 log_error("Snapshots of locked devices are not "
4214 "supported yet");
4215 return NULL;
4216 }
4217 if (lv_is_merging_origin(org)) {
4218 log_error("Snapshots of an origin that has a "
4219 "merging snapshot are not supported");
4220 return NULL;
4221 }
4222
4223 if (lv_is_thin_type(org) && !lv_is_thin_volume(org)) {
4224 log_error("Snapshots of thin pool %sdevices "
4225 "are not supported.",
4226 lv_is_thin_pool_data(org) ? "data " :
4227 lv_is_thin_pool_metadata(org) ?
4228 "metadata " : "");
4229 return NULL;
4230 }
4231
4232 if (lv_is_mirror_type(org) &&
4233 !seg_is_raid(first_seg(org))) {
4234 log_error("Snapshots of \"mirror\" segment types"
4235 " are not supported");
4236 return NULL;
4237 }
4238
4239 if (!lv_info(cmd, org, 0, &info, 0, 0)) {
4240 log_error("Check for existence of active snapshot "
4241 "origin '%s' failed.", org->name);
4242 return NULL;
4243 }
4244 origin_active = info.exists;
4245
4246 if (vg_is_clustered(vg) &&
4247 !lv_is_active_exclusive_locally(org)) {
4248 log_error("%s must be active exclusively to"
4249 " create snapshot", org->name);
4250 return NULL;
4251 }
4252 }
4253 }
4254
4255 if (!seg_is_thin_volume(lp) && !lp->extents) {
4256 log_error("Unable to create new logical volume with no extents");
4257 return NULL;
4258 }
4259
4260 if (seg_is_thin_pool(lp) &&
4261 ((uint64_t)lp->extents * vg->extent_size < lp->chunk_size)) {
4262 log_error("Unable to create thin pool smaller than 1 chunk.");
4263 return NULL;
4264 }
4265
4266 if (lp->snapshot && !lp->thin && ((uint64_t)lp->extents * vg->extent_size < 2 * lp->chunk_size)) {
4267 log_error("Unable to create a snapshot smaller than 2 chunks.");
4268 return NULL;
4269 }
4270
4271 if (!seg_is_virtual(lp) &&
4272 vg->free_count < lp->extents) {
4273 log_error("Volume group \"%s\" has insufficient free space "
4274 "(%u extents): %u required.",
4275 vg->name, vg->free_count, lp->extents);
4276 return NULL;
4277 }
4278
4279 if (lp->stripes > dm_list_size(lp->pvh) && lp->alloc != ALLOC_ANYWHERE) {
4280 log_error("Number of stripes (%u) must not exceed "
4281 "number of physical volumes (%d)", lp->stripes,
4282 dm_list_size(lp->pvh));
4283 return NULL;
4284 }
4285
4286 if (!activation() &&
4287 (seg_is_mirrored(lp) ||
4288 seg_is_raid(lp) ||
4289 seg_is_thin_pool(lp))) {
4290 /*
4291 * FIXME: For thin pool add some code to allow delayed
4292 * initialization of empty thin pool volume.
4293 * i.e. using some LV flag, fake message,...
4294 * and testing for metadata pool header signature?
4295 */
4296 log_error("Can't create %s without using "
4297 "device-mapper kernel driver.",
4298 segtype_is_raid(lp->segtype) ? lp->segtype->name :
4299 segtype_is_mirrored(lp->segtype) ? "mirror" :
4300 "thin pool volume");
4301 return NULL;
4302 }
4303
4304 /* The snapshot segment gets created later */
4305 if (lp->snapshot && !lp->thin &&
4306 !(lp->segtype = get_segtype_from_string(cmd, "striped")))
4307 return_NULL;
4308
4309 if (!archive(vg))
4310 return_NULL;
4311
4312 if (!dm_list_empty(&lp->tags)) {
4313 if (!(vg->fid->fmt->features & FMT_TAGS)) {
4314 log_error("Volume group %s does not support tags",
4315 vg->name);
4316 return NULL;
4317 }
4318 }
4319
4320 if (seg_is_thin_volume(lp) &&
4321 ((lp->activate == CHANGE_AY) ||
4322 (lp->activate == CHANGE_AE) ||
4323 (lp->activate == CHANGE_ALY))) {
4324 /* Ensure all stacked messages are submitted */
4325 if (!(lvl = find_lv_in_vg(vg, lp->pool))) {
4326 log_error("Unable to find existing pool LV %s in VG %s.",
4327 lp->pool, vg->name);
4328 return NULL;
4329 }
4330 if (!update_pool_lv(lvl->lv, 1))
4331 return_NULL;
4332 }
4333
4334 if (segtype_is_mirrored(lp->segtype) || segtype_is_raid(lp->segtype)) {
4335 init_mirror_in_sync(lp->nosync);
4336
4337 if (lp->nosync) {
4338 log_warn("WARNING: New %s won't be synchronised. "
4339 "Don't read what you didn't write!",
4340 lp->segtype->name);
4341 status |= LV_NOTSYNCED;
4342 }
4343
4344 lp->region_size = adjusted_mirror_region_size(vg->extent_size,
4345 lp->extents,
4346 lp->region_size);
4347 }
4348
4349 if (!(lv = lv_create_empty(new_lv_name ? : "lvol%d", NULL,
4350 status, lp->alloc, vg)))
4351 return_NULL;
4352
4353 if (lp->read_ahead != lv->read_ahead) {
4354 log_verbose("Setting read ahead sectors");
4355 lv->read_ahead = lp->read_ahead;
4356 }
4357
4358 if (!seg_is_thin_pool(lp) && lp->minor >= 0) {
4359 lv->major = lp->major;
4360 lv->minor = lp->minor;
4361 lv->status |= FIXED_MINOR;
4362 log_verbose("Setting device number to (%d, %d)", lv->major,
4363 lv->minor);
4364 }
4365
4366 dm_list_splice(&lv->tags, &lp->tags);
4367
4368 if (!lv_extend(lv, lp->segtype,
4369 lp->stripes, lp->stripe_size,
4370 lp->mirrors,
4371 seg_is_thin_pool(lp) ? lp->poolmetadataextents : lp->region_size,
4372 seg_is_thin_volume(lp) ? lp->voriginextents : lp->extents,
4373 seg_is_thin_volume(lp) ? (org ? org->name : lp->pool) : NULL, lp->pvh, lp->alloc))
4374 return_NULL;
4375
4376 if (seg_is_thin_pool(lp)) {
4377 first_seg(lv)->zero_new_blocks = lp->zero ? 1 : 0;
4378 first_seg(lv)->chunk_size = lp->chunk_size;
4379 /* FIXME: use lowwatermark via lvm.conf global for all thinpools ? */
4380 first_seg(lv)->low_water_mark = 0;
4381 } else if (seg_is_thin_volume(lp)) {
4382 pool_lv = first_seg(lv)->pool_lv;
4383
4384 if (!(first_seg(lv)->device_id =
4385 get_free_pool_device_id(first_seg(pool_lv)))) {
4386 stack;
4387 goto revert_new_lv;
4388 }
4389
4390 if (!attach_pool_message(first_seg(pool_lv),
4391 DM_THIN_MESSAGE_CREATE_THIN, lv, 0, 0)) {
4392 stack;
4393 goto revert_new_lv;
4394 }
4395 }
4396
4397 /* FIXME Log allocation and attachment should have happened inside lv_extend. */
4398 if (lp->log_count &&
4399 !seg_is_raid(first_seg(lv)) && seg_is_mirrored(first_seg(lv))) {
4400 if (!add_mirror_log(cmd, lv, lp->log_count,
4401 first_seg(lv)->region_size,
4402 lp->pvh, lp->alloc)) {
4403 stack;
4404 goto revert_new_lv;
4405 }
4406 }
4407
4408 /* store vg on disk(s) */
4409 if (!vg_write(vg) || !vg_commit(vg))
4410 return_NULL;
4411
4412 backup(vg);
4413
4414 if (test_mode()) {
4415 log_verbose("Test mode: Skipping activation and zeroing.");
4416 goto out;
4417 }
4418
4419 if (seg_is_thin(lp)) {
4420 /* For snapshot, suspend active thin origin first */
4421 if (org && lv_is_active(org)) {
4422 if (!pool_below_threshold(first_seg(first_seg(org)->pool_lv))) {
4423 log_error("Cannot create thin snapshot. Pool %s/%s is filled "
4424 "over the autoextend threshold.",
4425 org->vg->name, first_seg(org)->pool_lv->name);
4426 goto revert_new_lv;
4427 }
4428 if (!suspend_lv_origin(cmd, org)) {
4429 log_error("Failed to suspend thin snapshot origin %s/%s.",
4430 org->vg->name, org->name);
4431 goto revert_new_lv;
4432 }
4433 if (!resume_lv_origin(cmd, org)) { /* deptree updates thin-pool */
4434 log_error("Failed to resume thin snapshot origin %s/%s.",
4435 org->vg->name, org->name);
4436 goto revert_new_lv;
4437 }
4438 /* At this point remove pool messages, snapshot is active */
4439 if (!update_pool_lv(first_seg(org)->pool_lv, 0)) {
4440 stack;
4441 goto deactivate_and_revert_new_lv;
4442 }
4443 }
4444 if (((lp->activate == CHANGE_AY) ||
4445 (lp->activate == CHANGE_AE) ||
4446 (lp->activate == CHANGE_ALY))) {
4447 /* At this point send message to kernel thin mda */
4448 pool_lv = lv_is_thin_pool(lv) ? lv : first_seg(lv)->pool_lv;
4449 if (!update_pool_lv(pool_lv, 1)) {
4450 stack;
4451 goto deactivate_and_revert_new_lv;
4452 }
4453 if (!activate_lv_excl(cmd, lv)) {
4454 log_error("Aborting. Failed to activate thin %s.",
4455 lv->name);
4456 goto deactivate_and_revert_new_lv;
4457 }
4458 }
4459 } else if (lp->snapshot) {
4460 if (!activate_lv_excl(cmd, lv)) {
4461 log_error("Aborting. Failed to activate snapshot "
4462 "exception store.");
4463 goto revert_new_lv;
4464 }
4465 } else if ((lp->activate == CHANGE_AY && !activate_lv(cmd, lv)) ||
4466 (lp->activate == CHANGE_AE && !activate_lv_excl(cmd, lv)) ||
4467 (lp->activate == CHANGE_ALY && !activate_lv_local(cmd, lv))) {
4468 log_error("Failed to activate new LV.");
4469 if (lp->zero)
4470 goto deactivate_and_revert_new_lv;
4471 return NULL;
4472 }
4473
4474 if (!seg_is_thin(lp) && !lp->zero && !lp->snapshot)
4475 log_warn("WARNING: \"%s\" not zeroed", lv->name);
4476 else if ((!seg_is_thin(lp) ||
4477 (lv_is_thin_volume(lv) &&
4478 !first_seg(first_seg(lv)->pool_lv)->zero_new_blocks)) &&
4479 !set_lv(cmd, lv, UINT64_C(0), 0)) {
4480 log_error("Aborting. Failed to wipe %s.",
4481 lp->snapshot ? "snapshot exception store" :
4482 "start of new LV");
4483 goto deactivate_and_revert_new_lv;
4484 }
4485
4486 if (lp->snapshot && !lp->thin) {
4487 /* Reset permission after zeroing */
4488 if (!(lp->permission & LVM_WRITE))
4489 lv->status &= ~LVM_WRITE;
4490
4491 /* COW area must be deactivated if origin is not active */
4492 if (!origin_active && !deactivate_lv(cmd, lv)) {
4493 log_error("Aborting. Couldn't deactivate snapshot "
4494 "COW area. Manual intervention required.");
4495 return NULL;
4496 }
4497
4498 /* A virtual origin must be activated explicitly. */
4499 if (lp->voriginsize &&
4500 (!(org = _create_virtual_origin(cmd, vg, lv->name,
4501 lp->permission,
4502 lp->voriginextents)) ||
4503 !activate_lv_excl(cmd, org))) {
4504 log_error("Couldn't create virtual origin for LV %s",
4505 lv->name);
4506 if (org && !lv_remove(org))
4507 stack;
4508 goto deactivate_and_revert_new_lv;
4509 }
4510
4511 /* cow LV remains active and becomes snapshot LV */
4512
4513 if (!vg_add_snapshot(org, lv, NULL,
4514 org->le_count, lp->chunk_size)) {
4515 log_error("Couldn't create snapshot.");
4516 goto deactivate_and_revert_new_lv;
4517 }
4518
4519 /* store vg on disk(s) */
4520 if (!vg_write(vg))
4521 return_NULL;
4522
4523 if (!suspend_lv(cmd, org)) {
4524 log_error("Failed to suspend origin %s", org->name);
4525 vg_revert(vg);
4526 return NULL;
4527 }
4528
4529 if (!vg_commit(vg))
4530 return_NULL;
4531
4532 if (!resume_lv(cmd, org)) {
4533 log_error("Problem reactivating origin %s", org->name);
4534 return NULL;
4535 }
4536 }
4537 /* FIXME out of sequence */
4538 backup(vg);
4539
4540 out:
4541 return lv;
4542
4543 deactivate_and_revert_new_lv:
4544 if (!deactivate_lv(cmd, lv)) {
4545 log_error("Unable to deactivate failed new LV. "
4546 "Manual intervention required.");
4547 return NULL;
4548 }
4549
4550 revert_new_lv:
4551 /* FIXME Better to revert to backup of metadata? */
4552 if (!lv_remove(lv) || !vg_write(vg) || !vg_commit(vg))
4553 log_error("Manual intervention may be required to remove "
4554 "abandoned LV(s) before retrying.");
4555 else
4556 backup(vg);
4557
4558 return NULL;
4559 }
4560
4561 int lv_create_single(struct volume_group *vg,
4562 struct lvcreate_params *lp)
4563 {
4564 struct logical_volume *lv;
4565
4566 /* Create thin pool first if necessary */
4567 if (lp->create_thin_pool) {
4568 if (!seg_is_thin_pool(lp) &&
4569 !(lp->segtype = get_segtype_from_string(vg->cmd, "thin-pool")))
4570 return_0;
4571
4572 if (!(lv = _lv_create_an_lv(vg, lp, lp->pool)))
4573 return_0;
4574
4575 if (!lp->thin)
4576 goto out;
4577
4578 lp->pool = lv->name;
4579
4580 if (!(lp->segtype = get_segtype_from_string(vg->cmd, "thin")))
4581 return_0;
4582 }
4583
4584 if (!(lv = _lv_create_an_lv(vg, lp, lp->lv_name)))
4585 return_0;
4586
4587 out:
4588 log_print("Logical volume \"%s\" created", lv->name);
4589
4590 return 1;
4591 }
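/*
 * Usage sketch (illustrative, not part of the original source): a minimal
 * lvcreate_params setup that creates a thin pool and then a thin LV inside
 * it, following the "Thin notes" above _lv_create_an_lv().  Only a few
 * fields are shown and the names and sizes are placeholders; a real caller
 * fills in many more (permission, chunk_size, read_ahead, ...).
 *
 *	struct lvcreate_params lp = { 0 };
 *
 *	lp.create_thin_pool = 1;
 *	lp.thin = 1;
 *	lp.pool = "pool0";
 *	lp.lv_name = "thin0";
 *	lp.extents = 100;
 *	lp.stripes = 1;
 *	lp.alloc = ALLOC_INHERIT;
 *	lp.pvh = &vg->pvs;
 *	lp.activate = CHANGE_AY;
 *	lp.segtype = get_segtype_from_string(vg->cmd, "thin");
 *
 *	if (!lv_create_single(vg, &lp))
 *		return_0;
 */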