locale/programs/ld-collate.c

   1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <errno.h>
  25 #include <error.h>
  26 #include <stdlib.h>
  27 #include <wchar.h>
  28
  29 #include "charmap.h"
  30 #include "localeinfo.h"
  31 #include "linereader.h"
  32 #include "locfile.h"
  33 #include "localedef.h"
  34
  35 /* Uncomment the following line in the production version.  */
  36 /* #define NDEBUG 1 */
  37 #include <assert.h>
  38
  39 #define obstack_chunk_alloc malloc
  40 #define obstack_chunk_free free
  41
/* Forward declaration.  */
struct element_t;

/* Data type for the list of sections.  Each node describes one section
   of the collation definition: its name, the elements which belong to
   it and the sorting rules which apply to it.  */
struct section_list
{
  /* Next section in the (singly linked) list.  */
  struct section_list *next;
  /* Name of the section.  */
  const char *name;
  /* First element of this section.  */
  struct element_t *first;
  /* Last element of this section.  */
  struct element_t *last;
  /* These are the rules for this section.  read_directions stores an
     array with one entry per sorting level here.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output file.  */
  int ruleidx;
};
  60
struct element_t;

/* The weight of one element on one sorting level: a counted array of
   pointers to the elements which make up that weight.  */
struct element_list_t
{
  /* Number of elements.  */
  int cnt;

  /* Array with CNT entries.  insert_weights stores a null pointer for
     an IGNOREd level and ELEMENT_ELLIPSIS2 for an ellipsis instead of
     a pointer to a real element.  */
  struct element_t **w;
};
  70
/* Data type for collating element.  */
struct element_t
{
  /* Symbolic name, or NULL if the element has none (new_element is
     called without a name for collating symbols).  */
  const char *name;

  /* Multibyte representation and its length in bytes; NULL and 0 if
     not (yet) known.  */
  const char *mbs;
  size_t nmbs;
  /* Zero-terminated wide character representation and the number of
     characters in it; NULL and 0 if not (yet) known.  */
  const uint32_t *wcs;
  size_t nwcs;
  /* NOTE(review): presumably the positions assigned to the element for
     the multibyte and the wide character tables; they are only reset
     in the code visible here -- confirm against the output code.  */
  int *mborder;
  int wcorder;

  /* The following is a bit mask which bits are set if this element is
     used in the appropriate level.  Interesting for the singlebyte
     weight computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* One weight list per sorting level; allocated by insert_weights
     and NULL before that.  */
  struct element_list_t *weights;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;

  /* Which section does this belong to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Next element in multibyte output list.  */
  struct element_t *mbnext;
};
 107
/* Special element value.  These are not pointers to real elements but
   small integers disguised as pointers which are stored in a weight
   list in place of an element; insert_weights uses ELEMENT_ELLIPSIS2
   to mark a weight given as an ellipsis.  They must never be
   dereferenced.  */
#define ELEMENT_ELLIPSIS2       ((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3       ((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4       ((struct element_t *) 3)
 112
/* Data type for collating symbol.  */
struct symbol_t
{
  /* Point to place in the order list.  NULL until the symbol is used
     for the first time; find_element and insert_value then create the
     element on demand.  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
 123
 124
/* The real definition of the struct for the LC_COLLATE locale.  */
struct locale_collate_t
{
  /* NOTE(review): presumably limits for the number of weights; not
     used in the part of the file visible here -- confirm.  */
  int col_weight_max;
  int cur_weight_max;

  /* List of known scripts.  */
  struct section_list *sections;
  /* Current section using definition.  New elements are assigned to
     it and read_directions stores the parsed rules in it.  */
  struct section_list *current_section;
  /* There always can be an unnamed section.  */
  struct section_list unnamed_section;
  /* To make handling of errors easier we have another section.  */
  struct section_list error_section;

  /* Number of sorting rules given in order_start line.  */
  uint32_t nrules;

  /* Start of the order list.  */
  struct element_t *start;

  /* The undefined element.  */
  struct element_t undefined;

  /* This is the cursor for `reorder_after' insertions.  insert_weights
     links every new element directly behind it and then moves it to
     the new element.  */
  struct element_t *cursor;

  /* This value is used when handling ellipsis.  */
  struct element_t ellipsis_weight;

  /* Known collating elements.  */
  hash_table elem_table;

  /* Known collating symbols.  */
  hash_table sym_table;

  /* Known collation sequences.  */
  hash_table seq_table;

  /* All sections, elements, symbols and weight lists are allocated
     from this obstack.  */
  struct obstack mempool;

  /* The LC_COLLATE category is a bit special as it is sometimes possible
     that the definitions from more than one input file contains information.
     Therefore we keep all relevant input in a list.  */
  struct locale_collate_t *next;

  /* Arrays with heads of the list for each of the leading bytes in
     the multibyte sequences.  */
  struct element_t *mbheads[256];
};
 175
 176
/* We have a few global variables which are used for reading all
   LC_COLLATE category descriptions in all files.  */

/* Number of sorting rules (levels).  It is zero until read_directions
   has parsed the first direction specification; all later
   specifications are checked against this value.  */
static int nrules;
 180
 181
 182 /* These are definitions used by some of the functions for handling
 183    UTF-8 encoding below.  */
 184 static const uint32_t encoding_mask[] =
 185 {
 186   ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
 187 };
 188
 189 static const unsigned char encoding_byte[] =
 190 {
 191   0xc0, 0xe0, 0xf0, 0xf8, 0xfc
 192 };
 193
 194
 195 /* We need UTF-8 encoding of numbers.  */
 196 static inline int
 197 utf8_encode (char *buf, int val)
 198 {
 199   char *startp = buf;
 200   int retval;
 201
 202   if (val < 0x80)
 203     {
 204       *buf++ = (char) val;
 205       retval = 1;
 206     }
 207   else
 208     {
 209       int step;
 210
 211       for (step = 2; step < 6; ++step)
 212         if ((val & encoding_mask[step - 2]) == 0)
 213           break;
 214       retval = step;
 215
 216       *buf = encoding_byte[step - 2];
 217       --step;
 218       do
 219         {
 220           buf[step] = 0x80 | (val & 0x3f);
 221           val >>= 6;
 222         }
 223       while (--step > 0);
 224       *buf |= val;
 225     }
 226
 227   return buf - startp;
 228 }
 229
 230
 231 static struct section_list *
 232 make_seclist_elem (struct locale_collate_t *collate, const char *string,
 233                    struct section_list *next)
 234 {
 235   struct section_list *newp;
 236
 237   newp = (struct section_list *) obstack_alloc (&collate->mempool,
 238                                                 sizeof (*newp));
 239   newp->next = next;
 240   newp->name = string;
 241   newp->first = NULL;
 242
 243   return newp;
 244 }
 245
 246
 247 static struct element_t *
 248 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
 249              const uint32_t *wcs, const char *name, size_t namelen)
 250 {
 251   struct element_t *newp;
 252
 253   newp = (struct element_t *) obstack_alloc (&collate->mempool,
 254                                              sizeof (*newp));
 255   newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
 256                                                     name, namelen);
 257   if (mbs != NULL)
 258     {
 259       newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
 260       newp->nmbs = mbslen;
 261     }
 262   else
 263     {
 264       newp->mbs = NULL;
 265       newp->nmbs = 0;
 266     }
 267   if (wcs != NULL)
 268     {
 269       size_t nwcs = wcslen ((wchar_t *) wcs);
 270       uint32_t zero = 0;
 271       obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
 272       obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
 273       newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
 274       newp->nwcs = nwcs;
 275     }
 276   else
 277     {
 278       newp->wcs = NULL;
 279       newp->nwcs = 0;
 280     }
 281   newp->mborder = NULL;
 282   newp->wcorder = 0;
 283   newp->used_in_level = 0;
 284
 285   /* Will be allocated later.  */
 286   newp->weights = NULL;
 287
 288   newp->file = NULL;
 289   newp->line = 0;
 290
 291   newp->section = collate->current_section;
 292
 293   newp->last = NULL;
 294   newp->next = NULL;
 295
 296   newp->mbnext = NULL;
 297
 298   return newp;
 299 }
 300
 301
 302 static struct symbol_t *
 303 new_symbol (struct locale_collate_t *collate)
 304 {
 305   struct symbol_t *newp;
 306
 307   newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
 308
 309   newp->order = NULL;
 310
 311   newp->file = NULL;
 312   newp->line = 0;
 313
 314   return newp;
 315 }
 316
 317
 318 /* Test whether this name is already defined somewhere.  */
 319 static int
 320 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
 321                  struct charmap_t *charmap, struct repertoire_t *repertoire,
 322                  const char *symbol, size_t symbol_len)
 323 {
 324   void *ignore = NULL;
 325
 326   if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
 327     {
 328       lr_error (ldfile, _("`%s' already defined in charmap"), symbol);
 329       return 1;
 330     }
 331
 332   if (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore) == 0)
 333     {
 334       lr_error (ldfile, _("`%s' already defined in repertoire"), symbol);
 335       return 1;
 336     }
 337
 338   if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
 339     {
 340       lr_error (ldfile, _("`%s' already defined as collating symbol"), symbol);
 341       return 1;
 342     }
 343
 344   if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
 345     {
 346       lr_error (ldfile, _("`%s' already defined as collating element"),
 347                 symbol);
 348       return 1;
 349     }
 350
 351   return 0;
 352 }
 353
 354
 355 /* Read the direction specification.  */
 356 static void
 357 read_directions (struct linereader *ldfile, struct token *arg,
 358                  struct charmap_t *charmap, struct repertoire_t *repertoire,
 359                  struct locale_collate_t *collate)
 360 {
 361   int cnt = 0;
 362   int max = nrules ?: 10;
 363   enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
 364   int warned = 0;
 365
 366   while (1)
 367     {
 368       int valid = 0;
 369
 370       if (arg->tok == tok_forward)
 371         {
 372           if (rules[cnt] & sort_backward)
 373             {
 374               if (! warned)
 375                 {
 376                   lr_error (ldfile, _("\
 377 %s: `forward' and `backward' are mutually excluding each other"),
 378                             "LC_COLLATE");
 379                   warned = 1;
 380                 }
 381             }
 382           else if (rules[cnt] & sort_forward)
 383             {
 384               if (! warned)
 385                 {
 386                   lr_error (ldfile, _("\
 387 %s: `%s' mentioned twice in definition of weight %d"),
 388                             "LC_COLLATE", "forward", cnt + 1);
 389                 }
 390             }
 391           else
 392             rules[cnt] |= sort_forward;
 393
 394           valid = 1;
 395         }
 396       else if (arg->tok == tok_backward)
 397         {
 398           if (rules[cnt] & sort_forward)
 399             {
 400               if (! warned)
 401                 {
 402                   lr_error (ldfile, _("\
 403 %s: `forward' and `backward' are mutually excluding each other"),
 404                             "LC_COLLATE");
 405                   warned = 1;
 406                 }
 407             }
 408           else if (rules[cnt] & sort_backward)
 409             {
 410               if (! warned)
 411                 {
 412                   lr_error (ldfile, _("\
 413 %s: `%s' mentioned twice in definition of weight %d"),
 414                             "LC_COLLATE", "backward", cnt + 1);
 415                 }
 416             }
 417           else
 418             rules[cnt] |= sort_backward;
 419
 420           valid = 1;
 421         }
 422       else if (arg->tok == tok_position)
 423         {
 424           if (rules[cnt] & sort_position)
 425             {
 426               if (! warned)
 427                 {
 428                   lr_error (ldfile, _("\
 429 %s: `%s' mentioned twice in definition of weight %d in category `%s'"),
 430                             "LC_COLLATE", "position", cnt + 1);
 431                 }
 432             }
 433           else
 434             rules[cnt] |= sort_position;
 435
 436           valid = 1;
 437         }
 438
 439       if (valid)
 440         arg = lr_token (ldfile, charmap, repertoire);
 441
 442       if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
 443           || arg->tok == tok_semicolon)
 444         {
 445           if (! valid && ! warned)
 446             {
 447               lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 448               warned = 1;
 449             }
 450
 451           /* See whether we have to increment the counter.  */
 452           if (arg->tok != tok_comma && rules[cnt] != 0)
 453             ++cnt;
 454
 455           if (arg->tok == tok_eof || arg->tok == tok_eol)
 456             /* End of line or file, so we exit the loop.  */
 457             break;
 458
 459           if (nrules == 0)
 460             {
 461               /* See whether we have enough room in the array.  */
 462               if (cnt == max)
 463                 {
 464                   max += 10;
 465                   rules = (enum coll_sort_rule *) xrealloc (rules,
 466                                                             max
 467                                                             * sizeof (*rules));
 468                   memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
 469                 }
 470             }
 471           else
 472             {
 473               if (cnt == nrules)
 474                 {
 475                   /* There must not be any more rule.  */
 476                   if (! warned)
 477                     {
 478                       lr_error (ldfile, _("\
 479 %s: too many rules; first entry only had %d"),
 480                                 "LC_COLLATE", nrules);
 481                       warned = 1;
 482                     }
 483
 484                   lr_ignore_rest (ldfile, 0);
 485                   break;
 486                 }
 487             }
 488         }
 489       else
 490         {
 491           if (! warned)
 492             {
 493               lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 494               warned = 1;
 495             }
 496         }
 497
 498       arg = lr_token (ldfile, charmap, repertoire);
 499     }
 500
 501   if (nrules == 0)
 502     {
 503       /* Now we know how many rules we have.  */
 504       nrules = cnt;
 505       rules = (enum coll_sort_rule *) xrealloc (rules,
 506                                                 nrules * sizeof (*rules));
 507     }
 508   else
 509     {
 510       if (cnt < nrules)
 511         {
 512           /* Not enough rules in this specification.  */
 513           if (! warned)
 514             lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
 515
 516           do
 517             rules[cnt] = sort_forward;
 518           while (++cnt < nrules);
 519         }
 520     }
 521
 522   collate->current_section->rules = rules;
 523 }
 524
 525
 526 static struct element_t *
 527 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
 528               const char *str, size_t len, uint32_t *wcstr)
 529 {
 530   struct element_t *result = NULL;
 531
 532   /* Search for the entries among the collation sequences already define.  */
 533   if (find_entry (&collate->seq_table, str, len, (void **) &result) != 0)
 534     {
 535       /* Nope, not define yet.  So we see whether it is a
 536          collation symbol.  */
 537       void *ptr;
 538
 539       if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
 540         {
 541           /* It's a collation symbol.  */
 542           struct symbol_t *sym = (struct symbol_t *) ptr;
 543           result = sym->order;
 544
 545           if (result == NULL)
 546             result = sym->order = new_element (collate, NULL, 0, NULL,
 547                                                NULL, 0);
 548         }
 549       else if (find_entry (&collate->elem_table, str, len,
 550                            (void **) &result) != 0)
 551         {
 552           /* It's also no collation element.  So it is a character
 553              element defined later.  */
 554           result = new_element (collate, NULL, 0, NULL, str, len);
 555           if (result != NULL)
 556             /* Insert it into the sequence table.  */
 557             insert_entry (&collate->seq_table, str, len, result);
 558         }
 559     }
 560
 561   return result;
 562 }
 563
 564
 565 static void
 566 unlink_element (struct locale_collate_t *collate)
 567 {
 568   if (collate->cursor == collate->start)
 569     {
 570       assert (collate->cursor->next == NULL);
 571       assert (collate->cursor->last == NULL);
 572       collate->cursor = NULL;
 573     }
 574   else
 575     {
 576       if (collate->cursor->next != NULL)
 577         collate->cursor->next->last = collate->cursor->last;
 578       if (collate->cursor->last != NULL)
 579         collate->cursor->last->next = collate->cursor->next;
 580       collate->cursor = collate->cursor->last;
 581     }
 582 }
 583
 584
 585 static void
 586 insert_weights (struct linereader *ldfile, struct element_t *elem,
 587                 struct charmap_t *charmap, struct repertoire_t *repertoire,
 588                 struct locale_collate_t *collate, enum token_t ellipsis)
 589 {
 590   int weight_cnt;
 591   struct token *arg;
 592
 593   /* Initialize all the fields.  */
 594   elem->file = ldfile->fname;
 595   elem->line = ldfile->lineno;
 596   elem->last = collate->cursor;
 597   elem->next = collate->cursor ? collate->cursor->next : NULL;
 598   elem->section = collate->current_section;
 599   if (collate->cursor != NULL)
 600     collate->cursor->next = elem;
 601   if (collate->start == NULL)
 602     {
 603       assert (collate->cursor == NULL);
 604       collate->start = elem;
 605     }
 606   elem->weights = (struct element_list_t *)
 607     obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
 608   memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
 609
 610   if (collate->current_section->first == NULL)
 611     collate->current_section->first = elem;
 612   if (collate->current_section->last == collate->cursor)
 613     collate->current_section->last = elem;
 614
 615   collate->cursor = elem;
 616
 617   weight_cnt = 0;
 618
 619   arg = lr_token (ldfile, charmap, repertoire);
 620   do
 621     {
 622       if (arg->tok == tok_eof || arg->tok == tok_eol)
 623         break;
 624
 625       if (arg->tok == tok_ignore)
 626         {
 627           /* The weight for this level has to be ignored.  We use the
 628              null pointer to indicate this.  */
 629           elem->weights[weight_cnt].w = (struct element_t **)
 630             obstack_alloc (&collate->mempool, sizeof (struct element_t *));
 631           elem->weights[weight_cnt].w[0] = NULL;
 632           elem->weights[weight_cnt].cnt = 1;
 633         }
 634       else if (arg->tok == tok_bsymbol)
 635         {
 636           struct element_t *val = find_element (ldfile, collate,
 637                                                 arg->val.str.startmb,
 638                                                 arg->val.str.lenmb,
 639                                                 arg->val.str.startwc);
 640
 641           if (val == NULL)
 642             break;
 643
 644           elem->weights[weight_cnt].w = (struct element_t **)
 645             obstack_alloc (&collate->mempool, sizeof (struct element_t *));
 646           elem->weights[weight_cnt].w[0] = val;
 647           elem->weights[weight_cnt].cnt = 1;
 648         }
 649       else if (arg->tok == tok_string)
 650         {
 651           /* Split the string up in the individual characters and put
 652              the element definitions in the list.  */
 653           const char *cp = arg->val.str.startmb;
 654           int cnt = 0;
 655           struct element_t *charelem;
 656           struct element_t **weights = NULL;
 657           int max = 0;
 658
 659           if (*cp == '\0')
 660             {
 661               lr_error (ldfile, _("%s: empty weight string not allowed"),
 662                         "LC_COLLATE");
 663               lr_ignore_rest (ldfile, 0);
 664               break;
 665             }
 666
 667           do
 668             {
 669               if (*cp == '<')
 670                 {
 671                   /* Ahh, it's a bsymbol.  That's what we want.  */
 672                   const char *startp = ++cp;
 673
 674                   while (*cp != '>')
 675                     {
 676                       if (*cp == ldfile->escape_char)
 677                         ++cp;
 678                       if (*cp == '\0')
 679                         /* It's a syntax error.  */
 680                         goto syntax;
 681
 682                       ++cp;
 683                     }
 684
 685                     charelem = find_element (ldfile, collate, startp,
 686                                              cp - startp, NULL);
 687                     ++cp;
 688                 }
 689               else
 690                 {
 691                   /* People really shouldn't use characters directly in
 692                      the string.  Especially since it's not really clear
 693                      what this means.  We interpret all characters in the
 694                      string as if that would be bsymbols.  Otherwise we
 695                      would have to match back to bsymbols somehow and this
 696                      is normally not what people normally expect.  */
 697                   charelem = find_element (ldfile, collate, cp++, 1, NULL);
 698                 }
 699
 700               if (charelem == NULL)
 701                 {
 702                   /* We ignore the rest of the line.  */
 703                   lr_ignore_rest (ldfile, 0);
 704                   break;
 705                 }
 706
 707               /* Add the pointer.  */
 708               if (cnt >= max)
 709                 {
 710                   struct element_t **newp;
 711                   max += 10;
 712                   newp = (struct element_t **)
 713                     alloca (max * sizeof (struct element_t *));
 714                   memcpy (newp, weights, cnt * sizeof (struct element_t *));
 715                   weights = newp;
 716                 }
 717               weights[cnt++] = charelem;
 718             }
 719           while (*cp != '\0');
 720
 721           /* Now store the information.  */
 722           elem->weights[weight_cnt].w = (struct element_t **)
 723             obstack_alloc (&collate->mempool,
 724                            cnt * sizeof (struct element_t *));
 725           memcpy (elem->weights[weight_cnt].w, weights,
 726                   cnt * sizeof (struct element_t *));
 727           elem->weights[weight_cnt].cnt = cnt;
 728
 729           /* We don't need the string anymore.  */
 730           free (arg->val.str.startmb);
 731         }
 732       else if (ellipsis != tok_none
 733                && (arg->tok == tok_ellipsis2
 734                    || arg->tok == tok_ellipsis3
 735                    || arg->tok == tok_ellipsis4))
 736         {
 737           /* It must be the same ellipsis as used in the initial column.  */
 738           if (arg->tok != ellipsis)
 739             lr_error (ldfile, _("\
 740 %s: weights must use the same ellipsis symbol as the name"),
 741                       "LC_COLLATE");
 742
 743           /* The weight for this level has to be ignored.  We use the
 744              null pointer to indicate this.  */
 745           elem->weights[weight_cnt].w = (struct element_t **)
 746             obstack_alloc (&collate->mempool, sizeof (struct element_t *));
 747           elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
 748           elem->weights[weight_cnt].cnt = 1;
 749         }
 750       else
 751         {
 752         syntax:
 753           /* It's a syntax error.  */
 754           lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 755           lr_ignore_rest (ldfile, 0);
 756           break;
 757         }
 758
 759       arg = lr_token (ldfile, charmap, repertoire);
 760       /* This better should be the end of the line or a semicolon.  */
 761       if (arg->tok == tok_semicolon)
 762         /* OK, ignore this and read the next token.  */
 763         arg = lr_token (ldfile, charmap, repertoire);
 764       else if (arg->tok != tok_eof && arg->tok != tok_eol)
 765         {
 766           /* It's a syntax error.  */
 767           lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 768           lr_ignore_rest (ldfile, 0);
 769           break;
 770         }
 771     }
 772   while (++weight_cnt < nrules);
 773
 774   if (weight_cnt < nrules)
 775     {
 776       /* This means the rest of the line uses the current element as
 777          the weight.  */
 778       do
 779         {
 780           elem->weights[weight_cnt].w = (struct element_t **)
 781             obstack_alloc (&collate->mempool, sizeof (struct element_t *));
 782           elem->weights[weight_cnt].w[0] = elem;
 783           elem->weights[weight_cnt].cnt = 1;
 784         }
 785       while (++weight_cnt < nrules);
 786     }
 787   else
 788     {
 789       if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
 790         {
 791           /* Too many rule values.  */
 792           lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
 793           lr_ignore_rest (ldfile, 0);
 794         }
 795       else
 796         lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
 797     }
 798 }
 799
 800
 801 static int
 802 insert_value (struct linereader *ldfile, struct token *arg,
 803               struct charmap_t *charmap, struct repertoire_t *repertoire,
 804               struct locale_collate_t *collate)
 805 {
 806   /* First find out what kind of symbol this is.  */
 807   struct charseq *seq;
 808   uint32_t wc;
 809   struct element_t *elem = NULL;
 810
 811   /* Try to find the character in the charmap.  */
 812   seq = charmap_find_value (charmap, arg->val.str.startmb, arg->val.str.lenmb);
 813
 814   /* Determine the wide character.  */
 815   if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
 816     {
 817       wc = repertoire_find_value (repertoire, arg->val.str.startmb,
 818                                   arg->val.str.lenmb);
 819       if (seq != NULL)
 820         seq->ucs4 = wc;
 821     }
 822   else
 823     wc = seq->ucs4;
 824
 825   if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
 826     {
 827       /* It's no character, so look through the collation elements and
 828          symbol list.  */
 829       void *result;
 830
 831       if (find_entry (&collate->sym_table, arg->val.str.startmb,
 832                       arg->val.str.lenmb, &result) == 0)
 833         {
 834           /* It's a collation symbol.  */
 835           struct symbol_t *sym = (struct symbol_t *) result;
 836           elem = sym->order;
 837
 838           if (elem == NULL)
 839             elem = sym->order = new_element (collate, NULL, 0, NULL, NULL, 0);
 840         }
 841       else if (find_entry (&collate->elem_table, arg->val.str.startmb,
 842                            arg->val.str.lenmb, (void **) &elem) != 0)
 843         {
 844           /* It's also no collation element.  Therefore ignore it.  */
 845           lr_ignore_rest (ldfile, 0);
 846           return 1;
 847         }
 848     }
 849   else
 850     {
 851       /* Otherwise the symbols stands for a character.  */
 852       if (find_entry (&collate->seq_table, arg->val.str.startmb,
 853                       arg->val.str.lenmb, (void **) &elem) != 0)
 854         {
 855           uint32_t wcs[2] = { wc, 0 };
 856
 857           /* We have to allocate an entry.  */
 858           elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
 859                               seq != NULL ? seq->nbytes : 0,
 860                               wcs, arg->val.str.startmb, arg->val.str.lenmb);
 861
 862           /* And add it to the table.  */
 863           if (insert_entry (&collate->seq_table, arg->val.str.startmb,
 864                             arg->val.str.lenmb, elem) != 0)
 865             /* This cannot happen.  */
 866             assert (! "Internal error");
 867         }
 868       else
 869         {
 870           /* Maybe the character was used before the definition.  In this case
 871              we have to insert the byte sequences now.  */
 872           if (elem->mbs == NULL && seq != NULL)
 873             {
 874               elem->mbs = obstack_copy0 (&collate->mempool,
 875                                          seq->bytes, seq->nbytes);
 876               elem->nmbs = seq->nbytes;
 877             }
 878
 879           if (elem->wcs == NULL && seq != ILLEGAL_CHAR_VALUE)
 880             {
 881               uint32_t wcs[2] = { wc, 0 };
 882
 883               elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
 884               elem->nwcs = 1;
 885             }
 886         }
 887     }
 888
 889   /* Test whether this element is not already in the list.  */
 890   if (elem->next != NULL || (collate->cursor != NULL
 891                              && elem->next == collate->cursor))
 892     {
 893       lr_error (ldfile, _("order for `%.*s' already defined at %s:%zu"),
 894                 (int) arg->val.str.lenmb, arg->val.str.startmb,
 895                 elem->file, elem->line);
 896       lr_ignore_rest (ldfile, 0);
 897       return 1;
 898     }
 899
 900   insert_weights (ldfile, elem, charmap, repertoire, collate, tok_none);
 901
 902   return 0;
 903 }
 904
 905
 906 static void
 907 handle_ellipsis (struct linereader *ldfile, struct token *arg,
 908                  enum token_t ellipsis, struct charmap_t *charmap,
 909                  struct repertoire_t *repertoire,
 910                  struct locale_collate_t *collate)
 911 {
 912   struct element_t *startp;
 913   struct element_t *endp;
 914
 915   /* Unlink the entry added for the ellipsis.  */
 916   unlink_element (collate);
 917   startp = collate->cursor;
 918
 919   /* Process and add the end-entry.  */
 920   if (arg != NULL
 921       && insert_value (ldfile, arg, charmap, repertoire, collate))
 922     /* Something went wrong with inserting the to-value.  This means
 923        we cannot process the ellipsis.  */
 924     return;
 925
 926   /* Reset the cursor.  */
 927   collate->cursor = startp;
 928
 929   /* Now we have to handle many different situations:
 930      - we have to distinguish between the three different ellipsis forms
 931      - the is the ellipsis at the beginning, in the middle, or at the end.
 932   */
 933   endp = collate->cursor->next;
 934   assert (arg == NULL || endp != NULL);
 935
 936   /* Both, the start and the end symbol, must stand for characters.  */
 937   if ((startp == NULL || startp->name == NULL)
 938       || (endp == NULL || endp->name == NULL))
 939     {
 940       lr_error (ldfile, _("\
 941 %s: the start end the end symbol of a range must stand for characters"),
 942                 "LC_COLLATE");
 943       return;
 944     }
 945
 946   if (ellipsis == tok_ellipsis3)
 947     {
 948       /* One requirement we make here: the length of the byte
 949          sequences for the first and end character must be the same.
 950          This is mainly to prevent unwanted effects and this is often
 951          not what is wanted.  */
 952       size_t len = (startp->mbs != NULL ? startp->nmbs
 953                     : (endp->mbs != NULL ? endp->nmbs : 0));
 954       char mbcnt[len + 1];
 955       char mbend[len + 1];
 956
 957       /* Well, this should be caught somewhere else already.  Just to
 958          make sure.  */
 959       assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
 960       assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
 961
 962       if (startp != NULL && endp != NULL
 963           && startp->mbs != NULL && endp->mbs != NULL
 964           && startp->nmbs != endp->nmbs)
 965         {
 966           lr_error (ldfile, _("\
 967 %s: byte sequences of first and last character must have the same length"),
 968                     "LC_COLLATE");
 969           return;
 970         }
 971
 972       /* Determine whether we have to generate multibyte sequences.  */
 973       if ((startp == NULL || startp->mbs != NULL)
 974           && (endp == NULL || endp->mbs != NULL))
 975         {
 976           int cnt;
 977           int ret;
 978
 979           /* Prepare the beginning byte sequence.  This is either from the
 980              beginning byte sequence or it is all nulls if it was an
 981              initial ellipsis.  */
 982           if (startp == NULL || startp->mbs == NULL)
 983             memset (mbcnt, '\0', len);
 984           else
 985             {
 986               memcpy (mbcnt, startp->mbs, len);
 987
 988               /* And increment it so that the value is the first one we will
 989                  try to insert.  */
 990               for (cnt = len - 1; cnt >= 0; --cnt)
 991                 if (++mbcnt[cnt] != '\0')
 992                   break;
 993             }
 994           mbcnt[len] = '\0';
 995
 996           /* And the end sequence.  */
 997           if (endp == NULL || endp->mbs == NULL)
 998             memset (mbend, '\0', len);
 999           else
1000             memcpy (mbend, endp->mbs, len);
1001           mbend[len] = '\0';
1002
1003           /* Test whether we have a correct range.  */
1004           ret = memcmp (mbcnt, mbend, len);
1005           if (ret >= 0)
1006             {
1007               if (ret > 0)
1008                 lr_error (ldfile, _("%s: byte sequence of first character of \
1009 sequence is not lower than that of the last character"), "LC_COLLATE");
1010               return;
1011             }
1012
1013           /* Generate the byte sequences data.  */
1014           while (1)
1015             {
1016               struct charseq *seq;
1017
1018               /* Quite a bit of work ahead.  We have to find the character
1019                  definition for the byte sequence and then determine the
1020                  wide character belonging to it.  */
1021               seq = charmap_find_symbol (charmap, mbcnt, len);
1022               if (seq != NULL)
1023                 {
1024                   struct element_t *elem;
1025                   size_t namelen;
1026
1027                   if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1028                     seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1029                                                        strlen (seq->name));
1030
1031                   /* I don't this this can ever happen.  */
1032                   assert (seq->name != NULL);
1033                   namelen = strlen (seq->name);
1034
1035                   /* Now we are ready to insert the new value in the
1036                      sequence.  Find out whether the element is
1037                      already known.  */
1038                   if (find_entry (&collate->seq_table, seq->name, namelen,
1039                                   (void **) &elem) != 0)
1040                     {
1041                       uint32_t wcs[2] = { seq->ucs4, 0 };
1042
1043                       /* We have to allocate an entry.  */
1044                       elem = new_element (collate, mbcnt, len, wcs, seq->name,
1045                                           namelen);
1046
1047                       /* And add it to the table.  */
1048                       if (insert_entry (&collate->seq_table, seq->name,
1049                                         namelen, elem) != 0)
1050                         /* This cannot happen.  */
1051                         assert (! "Internal error");
1052                     }
1053
1054                   /* Test whether this element is not already in the list.  */
1055                   if (elem->next != NULL || (collate->cursor != NULL
1056                                              && elem->next == collate->cursor))
1057                     {
1058                       lr_error (ldfile, _("\
1059 order for `%.*s' already defined at %s:%zu"),
1060                                 (int) namelen, seq->name,
1061                                 elem->file, elem->line);
1062                       goto increment;
1063                     }
1064
1065                   /* Enqueue the new element.  */
1066                   elem->last = collate->cursor;
1067                   if (collate->cursor != NULL)
1068                     elem->next = NULL;
1069                   else
1070                     {
1071                       elem->next = collate->cursor->next;
1072                       elem->last->next = elem;
1073                       if (elem->next != NULL)
1074                         elem->next->last = elem;
1075                     }
1076                   if (collate->start == NULL)
1077                     {
1078                       assert (collate->cursor == NULL);
1079                       collate->start = elem;
1080                     }
1081                   collate->cursor = elem;
1082
1083                  /* Add the weight value.  We take them from the
1084                     `ellipsis_weights' member of `collate'.  */
1085                   elem->weights = (struct element_list_t *)
1086                     obstack_alloc (&collate->mempool,
1087                                    nrules * sizeof (struct element_list_t));
1088                   for (cnt = 0; cnt < nrules; ++cnt)
1089                     if (collate->ellipsis_weight.weights[cnt].cnt == 1
1090                         && (collate->ellipsis_weight.weights[cnt].w[0]
1091                             == ELEMENT_ELLIPSIS2))
1092                       {
1093                         elem->weights[cnt].w = (struct element_t **)
1094                           obstack_alloc (&collate->mempool,
1095                                          sizeof (struct element_t *));
1096                         elem->weights[cnt].w[0] = elem;
1097                         elem->weights[cnt].cnt = 1;
1098                       }
1099                     else
1100                       {
1101                         /* Simly use the weight from `ellipsis_weight'.  */
1102                         elem->weights[cnt].w =
1103                           collate->ellipsis_weight.weights[cnt].w;
1104                         elem->weights[cnt].cnt =
1105                           collate->ellipsis_weight.weights[cnt].cnt;
1106                       }
1107                 }
1108
1109               /* Increment for the next round.  */
1110             increment:
1111               for (cnt = len - 1; cnt >= 0; --cnt)
1112                 if (++mbcnt[cnt] != '\0')
1113                   break;
1114
1115               /* Find out whether this was all.  */
1116               if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1117                 /* Yep, that's all.  */
1118                 break;
1119             }
1120         }
1121     }
1122   else
1123     {
1124       /* For symbolic range we naturally must have a beginning and an
1125          end specified by the user.  */
1126       if (startp == NULL)
1127         lr_error (ldfile, _("\
1128 %s: symbolic range ellipsis must not directly follow `order_start'"),
1129                   "LC_COLLATE");
1130       else if (endp == NULL)
1131         lr_error (ldfile, _("\
1132 %s: symbolic range ellipsis must not be direct followed by `order_end'"),
1133                   "LC_COLLATE");
1134       else
1135         {
1136           /* Determine the range.  To do so we have to determine the
1137              common prefix of the both names and then the numeric
1138              values of both ends.  */
1139           size_t lenfrom = strlen (startp->name);
1140           size_t lento = strlen (endp->name);
1141           char buf[lento + 1];
1142           int preflen = 0;
1143           long int from;
1144           long int to;
1145           char *cp;
1146           int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1147
1148           if (lenfrom != lento)
1149             {
1150             invalid_range:
1151               lr_error (ldfile, _("\
1152 `%s' and `%.*s' are no valid names for symbolic range"),
1153                         startp->name, (int) lento, endp->name);
1154               return;
1155             }
1156
1157           while (startp->name[preflen] == endp->name[preflen])
1158             if (startp->name[preflen] == '\0')
1159               /* Nothing to be done.  The start and end point are identical
1160                  and while inserting the end point we have already given
1161                  the user an error message.  */
1162               return;
1163             else
1164               ++preflen;
1165
1166           errno = 0;
1167           from = strtol (startp->name + preflen, &cp, base);
1168           if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1169             goto invalid_range;
1170
1171           errno = 0;
1172           to = strtol (endp->name + preflen, &cp, base);
1173           if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1174             goto invalid_range;
1175
1176           /* Copy the prefix.  */
1177           memcpy (buf, startp->name, preflen);
1178
1179           /* Loop over all values.  */
1180           for (++from; from < to; ++from)
1181             {
1182               struct element_t *elem = NULL;
1183               struct charseq *seq;
1184               uint32_t wc;
1185               int cnt;
1186
1187               /* Generate the the name.  */
1188               sprintf (buf + preflen, base == 10 ? "%d" : "%x", from);
1189
1190               /* Look whether this name is already defined.  */
1191               if (find_entry (&collate->seq_table, arg->val.str.startmb,
1192                               arg->val.str.lenmb, (void **) &elem) == 0)
1193                 {
1194                   if (elem->next != NULL || (collate->cursor != NULL
1195                                              && elem->next == collate->cursor))
1196                     {
1197                       lr_error (ldfile, _("\
1198 %s: order for `%.*s' already defined at %s:%zu"),
1199                                 "LC_COLLATE", (int) lenfrom, buf,
1200                                 elem->file, elem->line);
1201                       continue;
1202                     }
1203
1204                   if (elem->name == NULL)
1205                     {
1206                       lr_error (ldfile, _("%s: `%s' must be a charater"),
1207                                 "LC_COLLATE", buf);
1208                       continue;
1209                     }
1210                 }
1211
1212               if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1213                 {
1214                   /* Search for a character of this name.  */
1215                   seq = charmap_find_value (charmap, buf, lenfrom);
1216                   if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1217                     {
1218                       wc = repertoire_find_value (repertoire, buf, lenfrom);
1219
1220                       if (seq != NULL)
1221                         seq->ucs4 = wc;
1222                     }
1223                   else
1224                     wc = seq->ucs4;
1225
1226                   if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1227                     /* We don't know anything about a character with this
1228                        name.  XXX Should we warn?  */
1229                     continue;
1230
1231                   if (elem == NULL)
1232                     {
1233                       uint32_t wcs[2] = { wc, 0 };
1234
1235                       /* We have to allocate an entry.  */
1236                       elem = new_element (collate,
1237                                           seq != NULL ? seq->bytes : NULL,
1238                                           seq != NULL ? seq->nbytes : 0,
1239                                           wc == ILLEGAL_CHAR_VALUE
1240                                           ? NULL : wcs,
1241                                           buf, lenfrom);
1242                     }
1243                   else
1244                     {
1245                       /* Update the element.  */
1246                       if (seq != NULL)
1247                         {
1248                           elem->mbs = obstack_copy0 (&collate->mempool,
1249                                                      seq->bytes, seq->nbytes);
1250                           elem->nmbs = seq->nbytes;
1251                         }
1252
1253                       if (wc != ILLEGAL_CHAR_VALUE)
1254                         {
1255                           uint32_t zero = 0;
1256
1257                           obstack_grow (&collate->mempool,
1258                                         &wc, sizeof (uint32_t));
1259                           obstack_grow (&collate->mempool,
1260                                         &zero, sizeof (uint32_t));
1261                           elem->wcs = obstack_finish (&collate->mempool);
1262                           elem->nwcs = 1;
1263                         }
1264                     }
1265
1266                   elem->file = ldfile->fname;
1267                   elem->line = ldfile->lineno;
1268                   elem->section = collate->current_section;
1269                 }
1270
1271               /* Enqueue the new element.  */
1272               elem->last = collate->cursor;
1273               elem->next = collate->cursor->next;
1274               elem->last->next = elem;
1275               if (elem->next != NULL)
1276                 elem->next->last = elem;
1277               collate->cursor = elem;
1278
1279               /* Now add the weights.  They come from the `ellipsis_weights'
1280                  member of `collate'.  */
1281               elem->weights = (struct element_list_t *)
1282                 obstack_alloc (&collate->mempool,
1283                                nrules * sizeof (struct element_list_t));
1284               for (cnt = 0; cnt < nrules; ++cnt)
1285                 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1286                     && (collate->ellipsis_weight.weights[cnt].w[0]
1287                         == ELEMENT_ELLIPSIS2))
1288                   {
1289                     elem->weights[cnt].w = (struct element_t **)
1290                       obstack_alloc (&collate->mempool,
1291                                      sizeof (struct element_t *));
1292                     elem->weights[cnt].w[0] = elem;
1293                     elem->weights[cnt].cnt = 1;
1294                   }
1295                 else
1296                   {
1297                     /* Simly use the weight from `ellipsis_weight'.  */
1298                     elem->weights[cnt].w =
1299                       collate->ellipsis_weight.weights[cnt].w;
1300                     elem->weights[cnt].cnt =
1301                       collate->ellipsis_weight.weights[cnt].cnt;
1302                   }
1303             }
1304         }
1305     }
1306 }
1307
1308
1309 static void
1310 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1311                  struct localedef_t *copy_locale, int ignore_content)
1312 {
1313   if (!ignore_content)
1314     {
1315       struct locale_collate_t *collate;
1316
1317       if (copy_locale == NULL)
1318         {
1319           collate = locale->categories[LC_COLLATE].collate =
1320             (struct locale_collate_t *)
1321             xcalloc (1, sizeof (struct locale_collate_t));
1322
1323           /* Init the various data structures.  */
1324           init_hash (&collate->elem_table, 100);
1325           init_hash (&collate->sym_table, 100);
1326           init_hash (&collate->seq_table, 500);
1327           obstack_init (&collate->mempool);
1328
1329           collate->col_weight_max = -1;
1330         }
1331       else
1332         collate = locale->categories[LC_COLLATE].collate =
1333           copy_locale->categories[LC_COLLATE].collate;
1334     }
1335
1336   ldfile->translate_strings = 0;
1337   ldfile->return_widestr = 0;
1338 }
1339
1340
1341 void
1342 collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
1343 {
1344   /* Now is the time when we can assign the individual collation
1345      values for all the symbols.  We have possibly different values
1346      for the wide- and the multibyte-character symbols.  This is done
1347      since it might make a difference in the encoding if there is in
1348      some cases no multibyte-character but there are wide-characters.
1349      (The other way around it is not important since theencoded
1350      collation value in the wide-character case is 32 bits wide and
1351      therefore requires no encoding).
1352
1353      The lowest collation value assigned is 2.  Zero is reserved for
1354      the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1355      functions and 1 is used to separate the individual passes for the
1356      different rules.
1357
1358      We also have to construct is list with all the bytes/words which
1359      can come first in a sequence, followed by all the elements which
1360      also start with this byte/word.  The order is reverse which has
1361      among others the important effect that longer strings are located
1362      first in the list.  This is required for the output data since
1363      the algorithm used in `strcoll' etc depends on this.
1364
1365      The multibyte case is easy.  We simply sort into an array with
1366      256 elements.  */
1367   struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1368   int mbact[nrules];
1369   int wcact;
1370   struct element_t *runp;
1371   int i;
1372   int need_undefined = 0;
1373   struct section_list *sect;
1374   int ruleidx;
1375
1376   /* If this assertion is hit change the type in `element_t'.  */
1377   assert (nrules <= sizeof (runp->used_in_level) * 8);
1378
1379   /* Find out which elements are used at which level.  At the same
1380      time we find out whether we have any undefined symbols.  */
1381   runp = collate->start;
1382   while (runp != NULL)
1383     {
1384       if (runp->mbs != NULL)
1385         {
1386           for (i = 0; i < nrules; ++i)
1387             {
1388               int j;
1389
1390               for (j = 0; j < runp->weights[i].cnt; ++j)
1391                 /* A NULL pointer as the weight means IGNORE.  */
1392                 if (runp->weights[i].w[j] != NULL)
1393                   {
1394                     if (runp->weights[i].w[j]->weights == NULL)
1395                       {
1396                         error_at_line (0, 0, runp->file, runp->line,
1397                                        _("symbol `%s' not defined"),
1398                                        runp->weights[i].w[j]->name);
1399
1400                         need_undefined = 1;
1401                         runp->weights[i].w[j] = &collate->undefined;
1402                       }
1403                     else
1404                       /* Set the bit for the level.  */
1405                       runp->weights[i].w[j]->used_in_level |= 1 << i;
1406                   }
1407             }
1408         }
1409
1410       /* Up to the next entry.  */
1411       runp = runp->next;
1412     }
1413
1414   /* Walk through the list of defined sequences and assign weights.  Also
1415      create the data structure which will allow generating the single byte
1416      character based tables.
1417
1418      Since at each time only the weights for each of the rules are
1419      only compared to other weights for this rule it is possible to
1420      assign more compact weight values than simply counting all
1421      weights in sequence.  We can assign weights from 3, one for each
1422      rule individually and only for those elements, which are actually
1423      used for this rule.
1424
1425      Why is this important?  It is not for the wide char table.  But
1426      it is for the singlebyte output since here larger numbers have to
1427      be encoded to make it possible to emit the value as a byte
1428      string.  */
1429   for (i = 0; i < nrules; ++i)
1430     mbact[i] = 3;
1431   wcact = 3;
1432   runp = collate->start;
1433   while (runp != NULL)
1434     {
1435       /* Determine the order.  */
1436       if (runp->used_in_level != 0)
1437         {
1438           runp->mborder = (int *) obstack_alloc (&collate->mempool,
1439                                                  nrules * sizeof (int));
1440
1441           for (i = 0; i < nrules; ++i)
1442             if ((runp->used_in_level & (1 << i)) != 0)
1443               runp->mborder[i] = mbact[i]++;
1444             else
1445               runp->mborder[i] = 0;
1446         }
1447
1448       if (runp->mbs != NULL)
1449         {
1450           struct element_t **eptr;
1451
1452           /* Find the point where to insert in the list.  */
1453           eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1454           while (*eptr != NULL)
1455             {
1456               if ((*eptr)->nmbs < runp->nmbs)
1457                 break;
1458
1459               if ((*eptr)->nmbs == runp->nmbs)
1460                 {
1461                   int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1462
1463                   if (c == 0)
1464                     {
1465                       /* This should not happen.  It means that we have
1466                          to symbols with the same byte sequence.  It is
1467                          of course an error.  */
1468                       error_at_line (0, 0, (*eptr)->file, (*eptr)->line,
1469                                      _("symbol `%s' has same encoding as"),
1470                                      (*eptr)->name);
1471                       error_at_line (0, 0, runp->file, runp->line,
1472                                      _("symbol `%s'"), runp->name);
1473                       goto dont_insert;
1474                     }
1475                   else if (c < 0)
1476                     /* Insert it here.  */
1477                     break;
1478                 }
1479
1480               /* To the next entry.  */
1481               eptr = &(*eptr)->mbnext;
1482             }
1483
1484           /* Set the pointers.  */
1485           runp->mbnext = *eptr;
1486           *eptr = runp;
1487         dont_insert:
1488         }
1489
1490       if (runp->wcs != NULL)
1491         runp->wcorder = wcact++;
1492
1493       /* Up to the next entry.  */
1494       runp = runp->next;
1495     }
1496
1497   /* Find out whether any of the `mbheads' entries is unset.  In this
1498      case we use the UNDEFINED entry.  */
1499   for (i = 1; i < 256; ++i)
1500     if (collate->mbheads[i] == NULL)
1501       {
1502         need_undefined = 1;
1503         collate->mbheads[i] = &collate->undefined;
1504       }
1505
1506   /* Now determine whether the UNDEFINED entry is needed and if yes,
1507      whether it was defined.  */
1508   collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1509   if (need_undefined && collate->undefined.file == NULL)
1510     {
1511       error (0, 0, _("no definition of `UNDEFINED'"));
1512
1513       /* Add UNDEFINED at the end.  */
1514       collate->undefined.mborder =
1515         (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1516
1517       for (i = 0; i < nrules; ++i)
1518         collate->undefined.mborder[i] = mbact[i]++;
1519
1520       collate->undefined.wcorder = wcact++;
1521     }
1522
1523   /* Finally, try to unify the rules for the sections.  Whenever the rules
1524      for a section are the same as those for another section give the
1525      ruleset the same index.  Since there are never many section we can
1526      use an O(n^2) algorithm here.  */
1527   sect = collate->sections;
1528   assert (sect != NULL);
1529   ruleidx = 0;
1530   do
1531     {
1532       struct section_list *osect = collate->sections;
1533
1534       while (osect != sect)
1535         if (memcmp (osect->rules, sect->rules, nrules) == 0)
1536           break;
1537         else
1538           osect = osect->next;
1539
1540       if (osect == sect)
1541         sect->ruleidx = ruleidx++;
1542       else
1543         sect->ruleidx = osect->ruleidx;
1544
1545       /* Next section.  */
1546       sect = sect->next;
1547     }
1548   while (sect != NULL);
1549   /* We are currently not prepared for more than 256 rulesets.  But this
1550      should never really be a problem.  */
1551   assert (ruleidx <= 256);
1552 }
1553
1554
1555 static inline int32_t
1556 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1557                struct element_t *elem)
1558 {
1559   size_t cnt;
1560   int32_t retval;
1561
1562   /* Optimize the use of UNDEFINED.  */
1563   if (elem == &collate->undefined)
1564     /* The weights are already inserted.  */
1565     return 0;
1566
1567   /* This byte can start exactly one collation element and this is
1568      a single byte.  We can directly give the index to the weights.  */
1569   retval = obstack_object_size (pool);
1570
1571   /* Construct the weight.  */
1572   for (cnt = 0; cnt < nrules; ++cnt)
1573     {
1574       char buf[elem->weights[cnt].cnt * 7];
1575       int len = 0;
1576       int i;
1577
1578       /* Add the direction.  */
1579       obstack_1grow (pool, elem->section->rules[cnt]);
1580
1581       for (i = 0; i < elem->weights[cnt].cnt; ++i)
1582         /* Encode the weight value.  */
1583         if (elem->weights[cnt].w[i] == NULL)
1584           {
1585             /* This entry was IGNORE.  */
1586             buf[len++] = '\3';
1587           }
1588         else
1589           len += utf8_encode (&buf[len],
1590                               elem->weights[cnt].w[i]->mborder[cnt]);
1591
1592       /* And add the buffer content.  */
1593       obstack_grow (pool, buf, len);
1594     }
1595
1596   return retval;
1597 }
1598
1599
1600 void
1601 collate_output (struct localedef_t *locale, struct charmap_t *charmap,
1602                 const char *output_path)
1603 {
1604   struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1605   const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
1606   struct iovec iov[2 + nelems];
1607   struct locale_file data;
1608   uint32_t idx[nelems];
1609   size_t cnt;
1610   size_t ch;
1611   int32_t tablemb[256];
1612   struct obstack weightpool;
1613   struct obstack extrapool;
1614   struct section_list *sect;
1615   int i;
1616
1617   obstack_init (&weightpool);
1618   obstack_init (&extrapool);
1619
1620   data.magic = LIMAGIC (LC_COLLATE);
1621   data.n = nelems;
1622   iov[0].iov_base = (void *) &data;
1623   iov[0].iov_len = sizeof (data);
1624
1625   iov[1].iov_base = (void *) idx;
1626   iov[1].iov_len = sizeof (idx);
1627
1628   idx[0] = iov[0].iov_len + iov[1].iov_len;
1629   cnt = 0;
1630
1631   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
1632   iov[2 + cnt].iov_base = &collate->nrules;
1633   iov[2 + cnt].iov_len = sizeof (uint32_t);
1634   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1635   ++cnt;
1636
1637   /* Prepare the ruleset table.  */
1638   for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
1639     if (sect->ruleidx == i)
1640       {
1641         obstack_grow (&weightpool, sect->rules, nrules);
1642         ++i;
1643       }
1644   /* And align the output.  */
1645   i = (nrules * i) % __alignof__ (int32_t);
1646   if (i > 0)
1647     do
1648       obstack_1grow (&weightpool, '\0');
1649     while (++i < __alignof__ (int32_t));
1650
1651   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
1652   iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
1653   iov[2 + cnt].iov_base = obstack_finish (&weightpool);
1654   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1655   ++cnt;
1656
1657   /* Generate the 8-bit table.  Walk through the lists of sequences
1658      starting with the same byte and add them one after the other to
1659      the table.  In case we have more than one sequence starting with
1660      the same byte we have to use extra indirection.
1661
1662      First add a record for the NUL byte.  This entry will never be used
1663      so it does not matter.  */
1664   tablemb[0] = 0;
1665
1666   /* Now insert the `UNDEFINED' value if it is used.  Since this value
1667      will probably be used more than once it is good to store the
1668      weights only once.  */
1669   if (collate->undefined.used_in_level != 0)
1670     output_weight (&weightpool, collate, &collate->undefined);
1671
1672   for (ch = 1; ch < 256; ++ch)
1673     if (collate->mbheads[ch]->mbnext == NULL
1674         && collate->mbheads[ch]->nmbs == 1)
1675       {
1676         tablemb[ch] = output_weight (&weightpool, collate,
1677                                       collate->mbheads[ch]);
1678       }
1679     else
1680       {
1681         /* The entries in the list are sorted by length and then
1682            alphabetically.  This is the order in which we will add the
1683            elements to the collation table.  This allows to simply
1684            walk the table in sequence and stop at the first matching
1685            entry.  Since the longer sequences are coming first in the
1686            list they have the possibility to match first, just as it
1687            has to be.  In the worst case we are walking to the end of
1688            the list where we put, if no singlebyte sequence is defined
1689            in the locale definition, the weights for UNDEFINED.
1690
1691            To reduce the length of the search list we compress them a bit.
1692            This happens by collecting sequences of consecutive byte
1693            sequences in one entry (having and begin and end byte sequence)
1694            and add only one index into the weight table.  We can find the
1695            consecutive entries since they are also consecutive in the list.  */
1696         struct element_t *runp = collate->mbheads[ch];
1697         struct element_t *lastp;
1698
1699         tablemb[ch] = -obstack_object_size (&extrapool);
1700
1701         do
1702           {
1703             /* Store the current index in the weight table.  We know that
1704                the current position in the `extrapool' is aligned on a
1705                32-bit address.  */
1706             int32_t weightidx;
1707             int added;
1708
1709             /* Output the weight info.  */
1710             weightidx = output_weight (&weightpool, collate, runp);
1711
1712             /* Find out whether this is a single entry or we have more than
1713                one consecutive entry.  */
1714             if (runp->mbnext != NULL
1715                 && runp->nmbs == runp->mbnext->nmbs
1716                 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
1717                 && (runp->mbs[runp->nmbs - 1] + 1
1718                     == runp->mbnext->mbs[runp->nmbs - 1]))
1719               {
1720                 int i;
1721
1722                 /* More than one consecutive entry.  We mark this by having
1723                    a negative index into the weight table.  */
1724                 weightidx = -weightidx;
1725
1726                 /* Now add first the initial byte sequence.  */
1727                 added = ((sizeof (int32_t) + 1 + 1 + 2 * (runp->nmbs - 1)
1728                           + __alignof__ (int32_t) - 1)
1729                          & ~(__alignof__ (int32_t) - 1));
1730                 obstack_make_room (&extrapool, added);
1731
1732                 if (sizeof (int32_t) == sizeof (int))
1733                   obstack_int_grow_fast (&extrapool, weightidx);
1734                 else
1735                   obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
1736                 obstack_1grow_fast (&extrapool, runp->section->ruleidx);
1737                 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
1738                 for (i = 1; i < runp->nmbs; ++i)
1739                   obstack_1grow_fast (&extrapool, runp->mbs[i]);
1740
1741                 /* Now find the end of the consecutive sequence.  */
1742                 do
1743                   runp = runp->next;
1744                 while (runp->mbnext != NULL
1745                        && runp->nmbs == runp->mbnext->nmbs
1746                        && memcmp (runp->mbs, runp->mbnext->mbs,
1747                                   runp->nmbs - 1) == 0
1748                        && (runp->mbs[runp->nmbs - 1] + 1
1749                            == runp->mbnext->mbs[runp->nmbs - 1]));
1750
1751                 /* And add the end byte sequence.  Without length this time.  */
1752                 for (i = 1; i < runp->nmbs; ++i)
1753                   obstack_1grow_fast (&extrapool, runp->mbs[i]);
1754               }
1755             else
1756               {
1757                 /* A single entry.  Simply add the index and the length and
1758                    string (except for the first character which is already
1759                    tested for).  */
1760                 int i;
1761
1762                 added = ((sizeof (int32_t) + 1 + 1 + runp->nmbs - 1
1763                           + __alignof__ (int32_t) - 1)
1764                          & ~(__alignof__ (int32_t) - 1));
1765                 obstack_make_room (&extrapool, added);
1766
1767                 if (sizeof (int32_t) == sizeof (int))
1768                   obstack_int_grow_fast (&extrapool, weightidx);
1769                 else
1770                   obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
1771                 obstack_1grow_fast (&extrapool, runp->section->ruleidx);
1772                 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
1773                 for (i = 1; i < runp->nmbs; ++i)
1774                   obstack_1grow_fast (&extrapool, runp->mbs[i]);
1775               }
1776
1777             /* Add alignment bytes if necessary.  */
1778             i = added % __alignof__ (int32_t);
1779             if (i > 0)
1780               do
1781                 obstack_1grow_fast (&extrapool, '\0');
1782               while (++i != __alignof__ (int32_t));
1783
1784             /* Next entry.  */
1785             lastp = runp;
1786             runp = runp->mbnext;
1787           }
1788         while (runp != NULL);
1789
1790         /* If the final entry in the list is not a single character we
1791            add an UNDEFINED entry here.  */
1792         if (lastp->nmbs != 1)
1793           {
1794             int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t))
1795                          & ~(__alignof__ (int32_t) - 1));
1796             obstack_make_room (&extrapool, added);
1797
1798             if (sizeof (int32_t) == sizeof (int))
1799               obstack_int_grow_fast (&extrapool, 0);
1800             else
1801               {
1802                 int32_t zero = 0;
1803                 obstack_grow (&extrapool, &zero, sizeof (int32_t));
1804               }
1805             /* XXX What rule? We just pick the first.  */
1806             obstack_1grow_fast (&extrapool, 0);
1807             /* Length is zero.  */
1808             obstack_1grow_fast (&extrapool, 0);
1809
1810             /* Add alignment bytes if necessary.  */
1811             i = added % __alignof__ (int32_t);
1812             if (i > 0)
1813               do
1814                 obstack_1grow_fast (&extrapool, '\0');
1815               while (++i != __alignof__ (int32_t));
1816           }
1817       }
1818
1819   /* Now add the three tables.  */
1820   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
1821   iov[2 + cnt].iov_base = tablemb;
1822   iov[2 + cnt].iov_len = sizeof (tablemb);
1823   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1824   ++cnt;
1825
1826   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
1827   iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
1828   iov[2 + cnt].iov_base = obstack_finish (&weightpool);
1829   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1830   ++cnt;
1831
1832   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
1833   iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
1834   iov[2 + cnt].iov_base = obstack_finish (&extrapool);
1835   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1836   ++cnt;
1837
1838
1839   assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
1840
1841   write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov);
1842 }
1843
1844
1845 void
1846 collate_read (struct linereader *ldfile, struct localedef_t *result,
1847               struct charmap_t *charmap, const char *repertoire_name,
1848               int ignore_content)
1849 {
1850   struct repertoire_t *repertoire = NULL;
1851   struct locale_collate_t *collate;
1852   struct token *now;
1853   struct token *arg = NULL;
1854   enum token_t nowtok;
1855   int state = 0;
1856   enum token_t was_ellipsis = tok_none;
1857   struct localedef_t *copy_locale = NULL;
1858
1859   /* Get the repertoire we have to use.  */
1860   if (repertoire_name != NULL)
1861     repertoire = repertoire_read (repertoire_name);
1862
1863   /* The rest of the line containing `LC_COLLATE' must be free.  */
1864   lr_ignore_rest (ldfile, 1);
1865
1866   do
1867     {
1868       now = lr_token (ldfile, charmap, NULL);
1869       nowtok = now->tok;
1870     }
1871   while (nowtok == tok_eol);
1872
1873   if (nowtok == tok_copy)
1874     {
1875       state = 2;
1876       now = lr_token (ldfile, charmap, NULL);
1877       if (now->tok != tok_string)
1878         {
1879           SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
1880
1881         skip_category:
1882           do
1883             now = lr_token (ldfile, charmap, NULL);
1884           while (now->tok != tok_eof && now->tok != tok_end);
1885
1886           if (now->tok != tok_eof
1887               || (now = lr_token (ldfile, charmap, NULL), now->tok == tok_eof))
1888             lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
1889           else if (now->tok != tok_lc_collate)
1890             {
1891               lr_error (ldfile, _("\
1892 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
1893               lr_ignore_rest (ldfile, 0);
1894             }
1895           else
1896             lr_ignore_rest (ldfile, 1);
1897
1898           return;
1899         }
1900
1901       /* Get the locale definition.  */
1902       copy_locale = find_locale (LC_COLLATE, now->val.str.startmb,
1903                                  repertoire_name, charmap);
1904       if ((copy_locale->avail & COLLATE_LOCALE) == 0)
1905         {
1906           /* Not yet loaded.  So do it now.  */
1907           if (locfile_read (copy_locale, charmap) != 0)
1908             goto skip_category;
1909         }
1910
1911       lr_ignore_rest (ldfile, 1);
1912
1913       now = lr_token (ldfile, charmap, NULL);
1914       nowtok = now->tok;
1915     }
1916
1917   /* Prepare the data structures.  */
1918   collate_startup (ldfile, result, copy_locale, ignore_content);
1919   collate = result->categories[LC_COLLATE].collate;
1920
1921   while (1)
1922     {
1923       /* Of course we don't proceed beyond the end of file.  */
1924       if (nowtok == tok_eof)
1925         break;
1926
1927       /* Ignore empty lines.  */
1928       if (nowtok == tok_eol)
1929         {
1930           now = lr_token (ldfile, charmap, NULL);
1931           nowtok = now->tok;
1932           continue;
1933         }
1934
1935       switch (nowtok)
1936         {
1937         case tok_coll_weight_max:
1938           /* Ignore the rest of the line if we don't need the input of
1939              this line.  */
1940           if (ignore_content)
1941             {
1942               lr_ignore_rest (ldfile, 0);
1943               break;
1944             }
1945
1946           if (state != 0)
1947             goto err_label;
1948
1949           arg = lr_token (ldfile, charmap, NULL);
1950           if (arg->tok != tok_number)
1951             goto err_label;
1952           if (collate->col_weight_max != -1)
1953             lr_error (ldfile, _("%s: duplicate definition of `%s'"),
1954                       "LC_COLLATE", "col_weight_max");
1955           else
1956             collate->col_weight_max = arg->val.num;
1957           lr_ignore_rest (ldfile, 1);
1958           break;
1959
1960         case tok_section_symbol:
1961           /* Ignore the rest of the line if we don't need the input of
1962              this line.  */
1963           if (ignore_content)
1964             {
1965               lr_ignore_rest (ldfile, 0);
1966               break;
1967             }
1968
1969           if (state != 0)
1970             goto err_label;
1971
1972           arg = lr_token (ldfile, charmap, repertoire);
1973           if (arg->tok != tok_bsymbol)
1974             goto err_label;
1975           else if (!ignore_content)
1976             {
1977               /* Check whether this section is already known.  */
1978               struct section_list *known = collate->sections;
1979               while (known != NULL)
1980                 if (strcmp (known->name, arg->val.str.startmb) == 0)
1981                   break;
1982
1983               if (known != NULL)
1984                 {
1985                   lr_error (ldfile,
1986                             _("%s: duplicate declaration of section `%s'"),
1987                             "LC_COLLATE", arg->val.str.startmb);
1988                   free (arg->val.str.startmb);
1989                 }
1990               else
1991                 collate->sections = make_seclist_elem (collate,
1992                                                        arg->val.str.startmb,
1993                                                        collate->sections);
1994
1995               lr_ignore_rest (ldfile, known == NULL);
1996             }
1997           else
1998             {
1999               free (arg->val.str.startmb);
2000               lr_ignore_rest (ldfile, 0);
2001             }
2002           break;
2003
2004         case tok_collating_element:
2005           /* Ignore the rest of the line if we don't need the input of
2006              this line.  */
2007           if (ignore_content)
2008             {
2009               lr_ignore_rest (ldfile, 0);
2010               break;
2011             }
2012
2013           if (state != 0)
2014             goto err_label;
2015
2016           arg = lr_token (ldfile, charmap, repertoire);
2017           if (arg->tok != tok_bsymbol)
2018             goto err_label;
2019           else
2020             {
2021               const char *symbol = arg->val.str.startmb;
2022               size_t symbol_len = arg->val.str.lenmb;
2023
2024               /* Next the `from' keyword.  */
2025               arg = lr_token (ldfile, charmap, repertoire);
2026               if (arg->tok != tok_from)
2027                 {
2028                   free ((char *) symbol);
2029                   goto err_label;
2030                 }
2031
2032               ldfile->return_widestr = 1;
2033
2034               /* Finally the string with the replacement.  */
2035               arg = lr_token (ldfile, charmap, repertoire);
2036               ldfile->return_widestr = 0;
2037               if (arg->tok != tok_string)
2038                 goto err_label;
2039
2040               if (!ignore_content)
2041                 {
2042                   if (symbol == NULL)
2043                     lr_error (ldfile, _("\
2044 %s: unknown character in collating element name"),
2045                               "LC_COLLATE");
2046                   if (arg->val.str.startmb == NULL)
2047                     lr_error (ldfile, _("\
2048 %s: unknown character in collating element definition"),
2049                               "LC_COLLATE");
2050                   if (arg->val.str.startwc == NULL)
2051                     lr_error (ldfile, _("\
2052 %s: unknown wide character in collating element definition"),
2053                               "LC_COLLATE");
2054                   else if (arg->val.str.lenwc < 2)
2055                     lr_error (ldfile, _("\
2056 %s: substitution string in collating element definition must have at least two characters"),
2057                               "LC_COLLATE");
2058
2059                   if (symbol != NULL)
2060                     {
2061                       /* The name is already defined.  */
2062                       if (check_duplicate (ldfile, collate, charmap,
2063                                            repertoire, symbol, symbol_len))
2064                         goto col_elem_free;
2065
2066                       if (insert_entry (&collate->elem_table,
2067                                         symbol, symbol_len,
2068                                         new_element (collate,
2069                                                      NULL, 0, NULL, symbol,
2070                                                      symbol_len)) < 0)
2071                         lr_error (ldfile, _("\
2072 error while adding collating element"));
2073                     }
2074                   else
2075                     goto col_elem_free;
2076                 }
2077               else
2078                 {
2079                 col_elem_free:
2080                   if (symbol != NULL)
2081                     free ((char *) symbol);
2082                   if (arg->val.str.startmb != NULL)
2083                     free (arg->val.str.startmb);
2084                   if (arg->val.str.startwc != NULL)
2085                     free (arg->val.str.startwc);
2086                 }
2087               lr_ignore_rest (ldfile, 1);
2088             }
2089           break;
2090
2091         case tok_collating_symbol:
2092           /* Ignore the rest of the line if we don't need the input of
2093              this line.  */
2094           if (ignore_content)
2095             {
2096               lr_ignore_rest (ldfile, 0);
2097               break;
2098             }
2099
2100           if (state != 0)
2101             goto err_label;
2102
2103           arg = lr_token (ldfile, charmap, repertoire);
2104           if (arg->tok != tok_bsymbol)
2105             goto err_label;
2106           else
2107             {
2108               const char *symbol = arg->val.str.startmb;
2109               size_t symbol_len = arg->val.str.lenmb;
2110
2111               if (!ignore_content)
2112                 {
2113                   if (symbol == NULL)
2114                     lr_error (ldfile, _("\
2115 %s: unknown character in collating symbol name"),
2116                               "LC_COLLATE");
2117                   else
2118                     {
2119                       /* The name is already defined.  */
2120                       if (check_duplicate (ldfile, collate, charmap,
2121                                            repertoire, symbol, symbol_len))
2122                         goto col_sym_free;
2123
2124                       if (insert_entry (&collate->sym_table,
2125                                         symbol, symbol_len,
2126                                         new_symbol (collate)) < 0)
2127                         lr_error (ldfile, _("\
2128 error while adding collating symbol"));
2129                     }
2130                 }
2131               else
2132                 {
2133                 col_sym_free:
2134                   if (symbol != NULL)
2135                     free ((char *) symbol);
2136                 }
2137               lr_ignore_rest (ldfile, 1);
2138             }
2139           break;
2140
2141         case tok_symbol_equivalence:
2142           /* Ignore the rest of the line if we don't need the input of
2143              this line.  */
2144           if (ignore_content)
2145             {
2146               lr_ignore_rest (ldfile, 0);
2147               break;
2148             }
2149
2150           if (state != 0)
2151             goto err_label;
2152
2153           arg = lr_token (ldfile, charmap, repertoire);
2154           if (arg->tok != tok_bsymbol)
2155             goto err_label;
2156           else
2157             {
2158               const char *newname = arg->val.str.startmb;
2159               size_t newname_len = arg->val.str.lenmb;
2160               const char *symname;
2161               size_t symname_len;
2162               struct symbol_t *symval;
2163
2164               arg = lr_token (ldfile, charmap, repertoire);
2165               if (arg->tok != tok_bsymbol)
2166                 {
2167                   if (newname != NULL)
2168                     free ((char *) newname);
2169                   goto err_label;
2170                 }
2171
2172               symname = arg->val.str.startmb;
2173               symname_len = arg->val.str.lenmb;
2174
2175               if (!ignore_content)
2176                 {
2177                   if (newname == NULL)
2178                     {
2179                       lr_error (ldfile, _("\
2180 %s: unknown character in equivalent definition name"),
2181                                 "LC_COLLATE");
2182                       goto sym_equiv_free;
2183                     }
2184                   if (symname == NULL)
2185                     {
2186                       lr_error (ldfile, _("\
2187 %s: unknown character in equivalent definition value"),
2188                                 "LC_COLLATE");
2189                       goto sym_equiv_free;
2190                     }
2191                   /* The name is already defined.  */
2192                   if (check_duplicate (ldfile, collate, charmap,
2193                                        repertoire, symname, symname_len))
2194                     goto col_sym_free;
2195
2196                   /* See whether the symbol name is already defined.  */
2197                   if (find_entry (&collate->sym_table, symname, symname_len,
2198                                   (void **) &symval) != 0)
2199                     {
2200                       lr_error (ldfile, _("\
2201 %s: unknown symbol `%s' in equivalent definition"),
2202                                 "LC_COLLATE", symname);
2203                       goto col_sym_free;
2204                     }
2205
2206                   if (insert_entry (&collate->sym_table,
2207                                     newname, newname_len, symval) < 0)
2208                     {
2209                       lr_error (ldfile, _("\
2210 error while adding equivalent collating symbol"));
2211                       goto sym_equiv_free;
2212                     }
2213
2214                   free ((char *) symname);
2215                 }
2216               else
2217                 {
2218                 sym_equiv_free:
2219                   if (newname != NULL)
2220                     free ((char *) newname);
2221                   if (symname != NULL)
2222                     free ((char *) symname);
2223                 }
2224               lr_ignore_rest (ldfile, 1);
2225             }
2226           break;
2227
2228         case tok_order_start:
2229           /* Ignore the rest of the line if we don't need the input of
2230              this line.  */
2231           if (ignore_content)
2232             {
2233               lr_ignore_rest (ldfile, 0);
2234               break;
2235             }
2236
2237           if (state != 0 && state != 1)
2238             goto err_label;
2239           state = 1;
2240
2241           /* The 14652 draft does not specify whether all `order_start' lines
2242              must contain the same number of sort-rules, but 14651 does.  So
2243              we require this here as well.  */
2244           arg = lr_token (ldfile, charmap, repertoire);
2245           if (arg->tok == tok_bsymbol)
2246             {
2247               /* This better should be a section name.  */
2248               struct section_list *sp = collate->sections;
2249               while (sp != NULL
2250                      && strcmp (sp->name, arg->val.str.startmb) != 0)
2251                 sp = sp->next;
2252
2253               if (sp == NULL)
2254                 {
2255                   lr_error (ldfile, _("\
2256 %s: unknown section name `%s'"),
2257                             "LC_COLLATE", arg->val.str.startmb);
2258                   /* We use the error section.  */
2259                   collate->current_section = &collate->error_section;
2260
2261                   if (collate->error_section.first == NULL)
2262                     {
2263                       collate->error_section.next = collate->sections;
2264                       collate->sections = &collate->error_section;
2265                     }
2266                 }
2267               else
2268                 {
2269                   /* Remember this section.  */
2270                   collate->current_section = sp;
2271
2272                   /* One should not be allowed to open the same
2273                      section twice.  */
2274                   if (sp->first != NULL)
2275                     lr_error (ldfile, _("\
2276 %s: multiple order definitions for section `%s'"),
2277                               "LC_COLLATE", sp->name);
2278                   else
2279                     {
2280                       sp->next = collate->sections;
2281                       collate->sections = sp;
2282                     }
2283
2284                   /* Next should come the end of the line or a semicolon.  */
2285                   arg = lr_token (ldfile, charmap, repertoire);
2286                   if (arg->tok == tok_eol)
2287                     {
2288                       uint32_t cnt;
2289
2290                       /* This means we have exactly one rule: `forward'.  */
2291                       if (collate->nrules > 1)
2292                         lr_error (ldfile, _("\
2293 %s: invalid number of sorting rules"),
2294                                   "LC_COLLATE");
2295                       else
2296                         collate->nrules = 1;
2297                       sp->rules = obstack_alloc (&collate->mempool,
2298                                                  (sizeof (enum coll_sort_rule)
2299                                                   * collate->nrules));
2300                       for (cnt = 0; cnt < collate->nrules; ++cnt)
2301                         sp->rules[cnt] = sort_forward;
2302
2303                       /* Next line.  */
2304                       break;
2305                     }
2306
2307                   /* Get the next token.  */
2308                   arg = lr_token (ldfile, charmap, repertoire);
2309                 }
2310             }
2311           else
2312             {
2313               /* There is no section symbol.  Therefore we use the unnamed
2314                  section.  */
2315               collate->current_section = &collate->unnamed_section;
2316
2317               if (collate->unnamed_section.first != NULL)
2318                 lr_error (ldfile, _("\
2319 %s: multiple order definitions for unnamed section"),
2320                           "LC_COLLATE");
2321               else
2322                 {
2323                   collate->unnamed_section.next = collate->sections;
2324                   collate->sections = &collate->unnamed_section;
2325                 }
2326             }
2327
2328           /* Now read the direction names.  */
2329           read_directions (ldfile, arg, charmap, repertoire, collate);
2330
2331           /* From now on we need the strings untranslated.  */
2332           ldfile->translate_strings = 0;
2333           break;
2334
2335         case tok_order_end:
2336           /* Ignore the rest of the line if we don't need the input of
2337              this line.  */
2338           if (ignore_content)
2339             {
2340               lr_ignore_rest (ldfile, 0);
2341               break;
2342             }
2343
2344           if (state != 1)
2345             goto err_label;
2346
2347           /* Handle ellipsis at end of list.  */
2348           if (was_ellipsis != tok_none)
2349             {
2350               handle_ellipsis (ldfile, NULL, was_ellipsis, charmap, repertoire,
2351                                collate);
2352               was_ellipsis = tok_none;
2353             }
2354
2355           state = 2;
2356           lr_ignore_rest (ldfile, 1);
2357           break;
2358
2359         case tok_reorder_after:
2360           /* Ignore the rest of the line if we don't need the input of
2361              this line.  */
2362           if (ignore_content)
2363             {
2364               lr_ignore_rest (ldfile, 0);
2365               break;
2366             }
2367
2368           if (state == 1)
2369             {
2370               lr_error (ldfile, _("%s: missing `order_end' keyword"),
2371                         "LC_COLLATE");
2372               state = 2;
2373
2374               /* Handle ellipsis at end of list.  */
2375               if (was_ellipsis != tok_none)
2376                 {
2377                   handle_ellipsis (ldfile, arg, was_ellipsis, charmap,
2378                                    repertoire, collate);
2379                   was_ellipsis = tok_none;
2380                 }
2381             }
2382           else if (state != 2 && state != 3)
2383             goto err_label;
2384           state = 3;
2385
2386           arg = lr_token (ldfile, charmap, repertoire);
2387           if (arg->tok == tok_bsymbol)
2388             {
2389               /* Find this symbol in the sequence table.  */
2390               struct element_t *insp;
2391               int no_error = 1;
2392
2393               if (find_entry (&collate->seq_table, arg->val.str.startmb,
2394                               arg->val.str.lenmb, (void **) &insp) == 0)
2395                 /* Yes, the symbol exists.  Simply point the cursor
2396                    to it.  */
2397                   collate->cursor = insp;
2398               else
2399                 {
2400                   /* This is bad.  The symbol after which we have to
2401                      insert does not exist.  */
2402                   lr_error (ldfile, _("\
2403 %s: cannot reorder after %.*s: symbol not known"),
2404                             "LC_COLLATE", (int) arg->val.str.lenmb,
2405                             arg->val.str.startmb);
2406                   collate->cursor = NULL;
2407                   no_error = 0;
2408                 }
2409
2410               lr_ignore_rest (ldfile, no_error);
2411             }
2412           else
2413             /* This must not happen.  */
2414             goto err_label;
2415           break;
2416
2417         case tok_reorder_end:
2418           /* Ignore the rest of the line if we don't need the input of
2419              this line.  */
2420           if (ignore_content)
2421             break;
2422
2423           if (state != 3)
2424             goto err_label;
2425           state = 4;
2426           lr_ignore_rest (ldfile, 1);
2427           break;
2428
2429         case tok_reorder_sections_after:
2430           /* Ignore the rest of the line if we don't need the input of
2431              this line.  */
2432           if (ignore_content)
2433             {
2434               lr_ignore_rest (ldfile, 0);
2435               break;
2436             }
2437
2438           if (state == 1)
2439             {
2440               lr_error (ldfile, _("%s: missing `order_end' keyword"),
2441                         "LC_COLLATE");
2442               state = 2;
2443
2444               /* Handle ellipsis at end of list.  */
2445               if (was_ellipsis != tok_none)
2446                 {
2447                   handle_ellipsis (ldfile, NULL, was_ellipsis, charmap,
2448                                    repertoire, collate);
2449                   was_ellipsis = tok_none;
2450                 }
2451             }
2452           else if (state == 3)
2453             {
2454               error (0, 0, _("%s: missing `reorder-end' keyword"),
2455                      "LC_COLLATE");
2456               state = 4;
2457             }
2458           else if (state != 2 && state != 4)
2459             goto err_label;
2460           state = 5;
2461
2462           /* Get the name of the sections we are adding after.  */
2463           arg = lr_token (ldfile, charmap, repertoire);
2464           if (arg->tok == tok_bsymbol)
2465             {
2466               /* Now find a section with this name.  */
2467               struct section_list *runp = collate->sections;
2468
2469               while (runp != NULL)
2470                 {
2471                   if (runp->name != NULL
2472                       && strlen (runp->name) == arg->val.str.lenmb
2473                       && memcmp (runp->name, arg->val.str.startmb,
2474                                  arg->val.str.lenmb) == 0)
2475                     break;
2476
2477                   runp = runp->next;
2478                 }
2479
2480               if (runp != NULL)
2481                 collate->current_section = runp;
2482               else
2483                 {
2484                   /* This is bad.  The section after which we have to
2485                      reorder does not exist.  Therefore we cannot
2486                      process the whole rest of this reorder
2487                      specification.  */
2488                   lr_error (ldfile, _("%s: section `%.*s' not known"),
2489                             "LC_COLLATE", (int) arg->val.str.lenmb,
2490                             arg->val.str.startmb);
2491
2492                   do
2493                     {
2494                       lr_ignore_rest (ldfile, 0);
2495
2496                       now = lr_token (ldfile, charmap, NULL);
2497                     }
2498                   while (now->tok == tok_reorder_sections_after
2499                          || now->tok == tok_reorder_sections_end
2500                          || now->tok == tok_end);
2501
2502                   /* Process the token we just saw.  */
2503                   nowtok = now->tok;
2504                   continue;
2505                 }
2506             }
2507           else
2508             /* This must not happen.  */
2509             goto err_label;
2510           break;
2511
2512         case tok_reorder_sections_end:
2513           /* Ignore the rest of the line if we don't need the input of
2514              this line.  */
2515           if (ignore_content)
2516             break;
2517
2518           if (state != 5)
2519             goto err_label;
2520           state = 6;
2521           lr_ignore_rest (ldfile, 1);
2522           break;
2523
2524         case tok_bsymbol:
2525           /* Ignore the rest of the line if we don't need the input of
2526              this line.  */
2527           if (ignore_content)
2528             {
2529               lr_ignore_rest (ldfile, 0);
2530               break;
2531             }
2532
2533           if (state != 1 && state != 3)
2534             goto err_label;
2535
2536           if (state == 3)
2537             {
2538               /* It is possible that we already have this collation sequence.
2539                  In this case we move the entry.  */
2540               struct element_t *seqp;
2541
2542               /* If the symbol after which we have to insert was not found
2543                  ignore all entries.  */
2544               if (collate->cursor == NULL)
2545                 {
2546                   lr_ignore_rest (ldfile, 0);
2547                   break;
2548                 }
2549
2550               if (find_entry (&collate->seq_table, arg->val.str.startmb,
2551                               arg->val.str.lenmb, (void **) &seqp) == 0)
2552                 {
2553                   /* Remove the entry from the old position.  */
2554                   if (seqp->last == NULL)
2555                     collate->start = seqp->next;
2556                   else
2557                     seqp->last->next = seqp->next;
2558                   if (seqp->next != NULL)
2559                     seqp->next->last = seqp->last;
2560
2561                   /* We also have to check whether this entry is the
2562                      first or last of a section.  */
2563                   if (seqp->section->first == seqp)
2564                     {
2565                       if (seqp->section->first == seqp->section->last)
2566                         /* This section has no content anymore.  */
2567                         seqp->section->first = seqp->section->last = NULL;
2568                       else
2569                         seqp->section->first = seqp->next;
2570                     }
2571                   else if (seqp->section->last == seqp)
2572                     seqp->section->last = seqp->last;
2573
2574                   /* Now insert it in the new place.  */
2575                   seqp->next = collate->cursor->next;
2576                   seqp->last = collate->cursor;
2577                   collate->cursor->next = seqp;
2578                   if (seqp->next != NULL)
2579                     seqp->next->last = seqp;
2580
2581                   seqp->section = collate->cursor->section;
2582                   if (seqp->section->last == collate->cursor)
2583                     seqp->section->last = seqp;
2584
2585                   break;
2586                 }
2587
2588               /* Otherwise we just add a new entry.  */
2589             }
2590           else if (state == 5)
2591             {
2592               /* We are reordering sections.  Find the named section.  */
2593               struct section_list *runp = collate->sections;
2594               struct section_list *prevp = NULL;
2595
2596               while (runp != NULL)
2597                 {
2598                   if (runp->name != NULL
2599                       && strlen (runp->name) == arg->val.str.lenmb
2600                       && memcmp (runp->name, arg->val.str.startmb,
2601                                  arg->val.str.lenmb) == 0)
2602                     break;
2603
2604                   prevp = runp;
2605                   runp = runp->next;
2606                 }
2607
2608               if (runp == NULL)
2609                 {
2610                   lr_error (ldfile, _("%s: section `%.*s' not known"),
2611                             "LC_COLLATE", (int) arg->val.str.lenmb,
2612                             arg->val.str.startmb);
2613                   lr_ignore_rest (ldfile, 0);
2614                 }
2615               else
2616                 {
2617                   if (runp != collate->current_section)
2618                     {
2619                       /* Remove the named section from the old place and
2620                          insert it in the new one.  */
2621                       prevp->next = runp->next;
2622
2623                       runp->next = collate->current_section->next;
2624                       collate->current_section->next = runp;
2625                       collate->current_section = runp;
2626                     }
2627
2628                   /* Process the rest of the line which might change
2629                      the collation rules.  */
2630                   arg = lr_token (ldfile, charmap, repertoire);
2631                   if (arg->tok != tok_eof && arg->tok != tok_eol)
2632                     read_directions (ldfile, arg, charmap, repertoire,
2633                                      collate);
2634                 }
2635               break;
2636             }
2637           else if (was_ellipsis != tok_none)
2638             {
2639               /* Using the information in the `ellipsis_weight'
2640                  element and this and the last value we have to handle
2641                  the ellipsis now.  */
2642               assert (state == 1);
2643
2644               handle_ellipsis (ldfile, arg, was_ellipsis, charmap, repertoire,
2645                                collate);
2646
2647               /* Remember that we processed the ellipsis.  */
2648               was_ellipsis = tok_none;
2649
2650               /* And don't add the value a second time.  */
2651               break;
2652             }
2653
2654           /* Now insert in the new place.  */
2655           insert_value (ldfile, arg, charmap, repertoire, collate);
2656           break;
2657
2658         case tok_undefined:
2659           /* Ignore the rest of the line if we don't need the input of
2660              this line.  */
2661           if (ignore_content)
2662             {
2663               lr_ignore_rest (ldfile, 0);
2664               break;
2665             }
2666
2667           if (state != 1)
2668             goto err_label;
2669
2670           if (was_ellipsis != tok_none)
2671             {
2672               lr_error (ldfile,
2673                         _("%s: cannot have `%s' as end of ellipsis range"),
2674                         "LC_COLLATE", "UNDEFINED");
2675
2676               unlink_element (collate);
2677               was_ellipsis = tok_none;
2678             }
2679
2680           /* See whether UNDEFINED already appeared somewhere.  */
2681           if (collate->undefined.next != NULL
2682               || (collate->cursor != NULL
2683                   && collate->undefined.next == collate->cursor))
2684             {
2685               lr_error (ldfile,
2686                         _("%s: order for `%.*s' already defined at %s:%zu"),
2687                         "LC_COLLATE", 9, "UNDEFINED", collate->undefined.file,
2688                         collate->undefined.line);
2689               lr_ignore_rest (ldfile, 0);
2690             }
2691           else
2692             /* Parse the weights.  */
2693              insert_weights (ldfile, &collate->undefined, charmap,
2694                              repertoire, collate, tok_none);
2695           break;
2696
2697         case tok_ellipsis2:
2698         case tok_ellipsis3:
2699         case tok_ellipsis4:
2700           /* This is the symbolic (decimal or hexadecimal) or absolute
2701              ellipsis.  */
2702           if (was_ellipsis != tok_none)
2703             goto err_label;
2704
2705           if (state != 1 && state != 3)
2706             goto err_label;
2707
2708           was_ellipsis = nowtok;
2709
2710           insert_weights (ldfile, &collate->ellipsis_weight, charmap,
2711                           repertoire, collate, nowtok);
2712           break;
2713
2714         case tok_end:
2715           /* Next we assume `LC_COLLATE'.  */
2716           if (!ignore_content)
2717             {
2718               if (state == 0)
2719                 /* We must either see a copy statement or have
2720                    ordering values.  */
2721                 lr_error (ldfile,
2722                           _("%s: empty category description not allowed"),
2723                           "LC_COLLATE");
2724               else if (state == 1)
2725                 {
2726                   lr_error (ldfile, _("%s: missing `order_end' keyword"),
2727                             "LC_COLLATE");
2728
2729                   /* Handle ellipsis at end of list.  */
2730                   if (was_ellipsis != tok_none)
2731                     {
2732                       handle_ellipsis (ldfile, NULL, was_ellipsis, charmap,
2733                                        repertoire, collate);
2734                       was_ellipsis = tok_none;
2735                     }
2736                 }
2737               else if (state == 3)
2738                 error (0, 0, _("%s: missing `reorder-end' keyword"),
2739                        "LC_COLLATE");
2740               else if (state == 5)
2741                 error (0, 0, _("%s: missing `reorder-sections-end' keyword"),
2742                        "LC_COLLATE");
2743             }
2744           arg = lr_token (ldfile, charmap, NULL);
2745           if (arg->tok == tok_eof)
2746             break;
2747           if (arg->tok == tok_eol)
2748             lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
2749           else if (arg->tok != tok_lc_collate)
2750             lr_error (ldfile, _("\
2751 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2752           lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
2753           return;
2754
2755         default:
2756         err_label:
2757           SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2758         }
2759
2760       /* Prepare for the next round.  */
2761       now = lr_token (ldfile, charmap, NULL);
2762       nowtok = now->tok;
2763     }
2764
2765   /* When we come here we reached the end of the file.  */
2766   lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2767 }