locale/programs/ld-collate.c

   1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <errno.h>
  25 #include <error.h>
  26 #include <stdlib.h>
  27 #include <wchar.h>
  28
  29 #include "charmap.h"
  30 #include "localeinfo.h"
  31 #include "linereader.h"
  32 #include "locfile.h"
  33 #include "localedef.h"
  34
  35 /* Uncomment the following line in the production version.  */
  36 /* #define NDEBUG 1 */
  37 #include <assert.h>
  38
  39 #define obstack_chunk_alloc malloc
  40 #define obstack_chunk_free free
  41
  42 /* Forward declaration.  */
  43 struct element_t;
  44
/* Data type for the list of sections.  Every node describes one
   section of the collation order; the nodes are chained through the
   NEXT member.  */
struct section_list
{
  /* Next section in the list.  */
  struct section_list *next;
  /* Name of the section.  */
  const char *name;
  /* First element of this section.  */
  struct element_t *first;
  /* Last element of this section.  */
  struct element_t *last;
  /* These are the rules for this section.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output file.  */
  int ruleidx;
};
  60
/* Forward declaration; the list type below only stores pointers to
   elements.  */
struct element_t;

/* A counted array of pointers to collating elements.  struct element_t
   uses an array of these to hold its weights.  */
struct element_list_t
{
  /* Number of elements.  */
  int cnt;

  /* The CNT element pointers.  */
  struct element_t **w;
};
  70
/* Data type for collating element.  */
struct element_t
{
  /* Name of the element; NULL if the element was created without one.  */
  const char *name;

  /* Multibyte sequence and its length in bytes (NULL and 0 while it is
     not known).  */
  const char *mbs;
  size_t nmbs;
  /* Zero-terminated wide character sequence and its length (NULL and 0
     while it is not known).  */
  const uint32_t *wcs;
  size_t nwcs;
  /* Presumably the ordering information for the multibyte and the wide
     character output -- TODO confirm against the code which fills
     these in; new elements start with NULL and 0.  */
  int *mborder;
  int wcorder;

  /* The following is a bit mask which bits are set if this element is
     used in the appropriate level.  Interesting for the singlebyte
     weight computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* Array of weight lists, one entry per level; NULL until the weights
     have been read.  */
  struct element_list_t *weights;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;

  /* Which section does this belong to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Next element in multibyte output list.  */
  struct element_t *mbnext;
};
 107
/* Special element values.  These are small integers cast to element
   pointers which are stored in weight lists in place of a real element
   to mark an ellipsis; they must never be dereferenced.  */
#define ELEMENT_ELLIPSIS2       ((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3       ((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4       ((struct element_t *) 3)
 112
/* Data type for collating symbol.  */
struct symbol_t
{
  /* Point to place in the order list.  NULL until an element has been
     created for the symbol.  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
 123
 124
/* The real definition of the struct for the LC_COLLATE locale.  */
struct locale_collate_t
{
  /* NOTE(review): presumably the COLL_WEIGHTS_MAX limit and the largest
     weight count seen so far -- confirm against the users of these
     members, which are not part of this section of the file.  */
  int col_weight_max;
  int cur_weight_max;

  /* List of known scripts.  */
  struct section_list *sections;
  /* Current section using definition.  */
  struct section_list *current_section;
  /* There always can be an unnamed section.  */
  struct section_list unnamed_section;
  /* To make handling of errors easier we have another section.  */
  struct section_list error_section;

  /* Number of sorting rules given in order_start line.  Note that the
     parsing functions in this file use the file-level variable of the
     same name.  */
  uint32_t nrules;

  /* Start of the order list.  */
  struct element_t *start;

  /* The undefined element.  */
  struct element_t undefined;

  /* This is the cursor for `reorder_after' insertions.  New elements
     are linked into the order list directly behind it.  */
  struct element_t *cursor;

  /* This value is used when handling ellipsis.  */
  struct element_t ellipsis_weight;

  /* Known collating elements.  */
  hash_table elem_table;

  /* Known collating symbols.  */
  hash_table sym_table;

  /* Known collation sequences.  */
  hash_table seq_table;

  /* Obstack from which the sections, elements, symbols and weight
     arrays are allocated.  */
  struct obstack mempool;

  /* The LC_COLLATE category is a bit special as it is sometimes possible
     that the definitions from more than one input file contain
     information.  Therefore we keep all relevant input in a list.  */
  struct locale_collate_t *next;

  /* Arrays with heads of the list for each of the leading bytes in
     the multibyte sequences.  */
  struct element_t *mbheads[256];
};
 175
 176
/* We have a few global variables which are used for reading all
   LC_COLLATE category descriptions in all files.  */

/* Number of sorting rules (levels).  It is zero until read_directions
   has processed the first direction specification and stored the number
   of rules found there; later specifications are checked against it.  */
static int nrules;
 180
 181
 182 /* These are definitions used by some of the functions for handling
 183    UTF-8 encoding below.  */
 184 static const uint32_t encoding_mask[] =
 185 {
 186   ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
 187 };
 188
 189 static const unsigned char encoding_byte[] =
 190 {
 191   0xc0, 0xe0, 0xf0, 0xf8, 0xfc
 192 };
 193
 194
 195 /* We need UTF-8 encoding of numbers.  */
 196 static inline int
 197 utf8_encode (char *buf, int val)
 198 {
 199   char *startp = buf;
 200   int retval;
 201
 202   if (val < 0x80)
 203     {
 204       *buf++ = (char) val;
 205       retval = 1;
 206     }
 207   else
 208     {
 209       int step;
 210
 211       for (step = 2; step < 6; ++step)
 212         if ((val & encoding_mask[step - 2]) == 0)
 213           break;
 214       retval = step;
 215
 216       *buf = encoding_byte[step - 2];
 217       --step;
 218       do
 219         {
 220           buf[step] = 0x80 | (val & 0x3f);
 221           val >>= 6;
 222         }
 223       while (--step > 0);
 224       *buf |= val;
 225     }
 226
 227   return buf - startp;
 228 }
 229
 230
 231 static struct section_list *
 232 make_seclist_elem (struct locale_collate_t *collate, const char *string,
 233                    struct section_list *next)
 234 {
 235   struct section_list *newp;
 236
 237   newp = (struct section_list *) obstack_alloc (&collate->mempool,
 238                                                 sizeof (*newp));
 239   newp->next = next;
 240   newp->name = string;
 241   newp->first = NULL;
 242
 243   return newp;
 244 }
 245
 246
 247 static struct element_t *
 248 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
 249              const uint32_t *wcs, const char *name, size_t namelen)
 250 {
 251   struct element_t *newp;
 252
 253   newp = (struct element_t *) obstack_alloc (&collate->mempool,
 254                                              sizeof (*newp));
 255   newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
 256                                                     name, namelen);
 257   if (mbs != NULL)
 258     {
 259       newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
 260       newp->nmbs = mbslen;
 261     }
 262   else
 263     {
 264       newp->mbs = NULL;
 265       newp->nmbs = 0;
 266     }
 267   if (wcs != NULL)
 268     {
 269       size_t nwcs = wcslen ((wchar_t *) wcs);
 270       uint32_t zero = 0;
 271       obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
 272       obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
 273       newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
 274       newp->nwcs = nwcs;
 275     }
 276   else
 277     {
 278       newp->wcs = NULL;
 279       newp->nwcs = 0;
 280     }
 281   newp->mborder = NULL;
 282   newp->wcorder = 0;
 283   newp->used_in_level = 0;
 284
 285   /* Will be allocated later.  */
 286   newp->weights = NULL;
 287
 288   newp->file = NULL;
 289   newp->line = 0;
 290
 291   newp->section = collate->current_section;
 292
 293   newp->last = NULL;
 294   newp->next = NULL;
 295
 296   newp->mbnext = NULL;
 297
 298   return newp;
 299 }
 300
 301
 302 static struct symbol_t *
 303 new_symbol (struct locale_collate_t *collate)
 304 {
 305   struct symbol_t *newp;
 306
 307   newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
 308
 309   newp->order = NULL;
 310
 311   newp->file = NULL;
 312   newp->line = 0;
 313
 314   return newp;
 315 }
 316
 317
 318 /* Test whether this name is already defined somewhere.  */
 319 static int
 320 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
 321                  struct charmap_t *charmap, struct repertoire_t *repertoire,
 322                  const char *symbol, size_t symbol_len)
 323 {
 324   void *ignore = NULL;
 325
 326   if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
 327     {
 328       lr_error (ldfile, _("`%s' already defined in charmap"), symbol);
 329       return 1;
 330     }
 331
 332   if (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore) == 0)
 333     {
 334       lr_error (ldfile, _("`%s' already defined in repertoire"), symbol);
 335       return 1;
 336     }
 337
 338   if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
 339     {
 340       lr_error (ldfile, _("`%s' already defined as collating symbol"), symbol);
 341       return 1;
 342     }
 343
 344   if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
 345     {
 346       lr_error (ldfile, _("`%s' already defined as collating element"),
 347                 symbol);
 348       return 1;
 349     }
 350
 351   return 0;
 352 }
 353
 354
 355 /* Read the direction specification.  */
 356 static void
 357 read_directions (struct linereader *ldfile, struct token *arg,
 358                  struct charmap_t *charmap, struct repertoire_t *repertoire,
 359                  struct locale_collate_t *collate)
 360 {
 361   int cnt = 0;
 362   int max = nrules ?: 10;
 363   enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
 364   int warned = 0;
 365
 366   while (1)
 367     {
 368       int valid = 0;
 369
 370       if (arg->tok == tok_forward)
 371         {
 372           if (rules[cnt] & sort_backward)
 373             {
 374               if (! warned)
 375                 {
 376                   lr_error (ldfile, _("\
 377 %s: `forward' and `backward' are mutually excluding each other"),
 378                             "LC_COLLATE");
 379                   warned = 1;
 380                 }
 381             }
 382           else if (rules[cnt] & sort_forward)
 383             {
 384               if (! warned)
 385                 {
 386                   lr_error (ldfile, _("\
 387 %s: `%s' mentioned twice in definition of weight %d"),
 388                             "LC_COLLATE", "forward", cnt + 1);
 389                 }
 390             }
 391           else
 392             rules[cnt] |= sort_forward;
 393
 394           valid = 1;
 395         }
 396       else if (arg->tok == tok_backward)
 397         {
 398           if (rules[cnt] & sort_forward)
 399             {
 400               if (! warned)
 401                 {
 402                   lr_error (ldfile, _("\
 403 %s: `forward' and `backward' are mutually excluding each other"),
 404                             "LC_COLLATE");
 405                   warned = 1;
 406                 }
 407             }
 408           else if (rules[cnt] & sort_backward)
 409             {
 410               if (! warned)
 411                 {
 412                   lr_error (ldfile, _("\
 413 %s: `%s' mentioned twice in definition of weight %d"),
 414                             "LC_COLLATE", "backward", cnt + 1);
 415                 }
 416             }
 417           else
 418             rules[cnt] |= sort_backward;
 419
 420           valid = 1;
 421         }
 422       else if (arg->tok == tok_position)
 423         {
 424           if (rules[cnt] & sort_position)
 425             {
 426               if (! warned)
 427                 {
 428                   lr_error (ldfile, _("\
 429 %s: `%s' mentioned twice in definition of weight %d in category `%s'"),
 430                             "LC_COLLATE", "position", cnt + 1);
 431                 }
 432             }
 433           else
 434             rules[cnt] |= sort_position;
 435
 436           valid = 1;
 437         }
 438
 439       if (valid)
 440         arg = lr_token (ldfile, charmap, repertoire);
 441
 442       if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
 443           || arg->tok == tok_semicolon)
 444         {
 445           if (! valid && ! warned)
 446             {
 447               lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 448               warned = 1;
 449             }
 450
 451           /* See whether we have to increment the counter.  */
 452           if (arg->tok != tok_comma && rules[cnt] != 0)
 453             ++cnt;
 454
 455           if (arg->tok == tok_eof || arg->tok == tok_eol)
 456             /* End of line or file, so we exit the loop.  */
 457             break;
 458
 459           if (nrules == 0)
 460             {
 461               /* See whether we have enough room in the array.  */
 462               if (cnt == max)
 463                 {
 464                   max += 10;
 465                   rules = (enum coll_sort_rule *) xrealloc (rules,
 466                                                             max
 467                                                             * sizeof (*rules));
 468                   memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
 469                 }
 470             }
 471           else
 472             {
 473               if (cnt == nrules)
 474                 {
 475                   /* There must not be any more rule.  */
 476                   if (! warned)
 477                     {
 478                       lr_error (ldfile, _("\
 479 %s: too many rules; first entry only had %d"),
 480                                 "LC_COLLATE", nrules);
 481                       warned = 1;
 482                     }
 483
 484                   lr_ignore_rest (ldfile, 0);
 485                   break;
 486                 }
 487             }
 488         }
 489       else
 490         {
 491           if (! warned)
 492             {
 493               lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
 494               warned = 1;
 495             }
 496         }
 497
 498       arg = lr_token (ldfile, charmap, repertoire);
 499     }
 500
 501   if (nrules == 0)
 502     {
 503       /* Now we know how many rules we have.  */
 504       nrules = cnt;
 505       rules = (enum coll_sort_rule *) xrealloc (rules,
 506                                                 nrules * sizeof (*rules));
 507     }
 508   else
 509     {
 510       if (cnt < nrules)
 511         {
 512           /* Not enough rules in this specification.  */
 513           if (! warned)
 514             lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
 515
 516           do
 517             rules[cnt] = sort_forward;
 518           while (++cnt < nrules);
 519         }
 520     }
 521
 522   collate->current_section->rules = rules;
 523 }
 524
 525
 526 static struct element_t *
 527 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
 528               const char *str, size_t len, uint32_t *wcstr)
 529 {
 530   struct element_t *result = NULL;
 531
 532   /* Search for the entries among the collation sequences already define.  */
 533   if (find_entry (&collate->seq_table, str, len, (void **) &result) != 0)
 534     {
 535       /* Nope, not define yet.  So we see whether it is a
 536          collation symbol.  */
 537       void *ptr;
 538
 539       if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
 540         {
 541           /* It's a collation symbol.  */
 542           struct symbol_t *sym = (struct symbol_t *) ptr;
 543           result = sym->order;
 544
 545           if (result == NULL)
 546             result = sym->order = new_element (collate, NULL, 0, NULL,
 547                                                NULL, 0);
 548         }
 549       else if (find_entry (&collate->elem_table, str, len,
 550                            (void **) &result) != 0)
 551         {
 552           /* It's also no collation element.  So it is a character
 553              element defined later.  */
 554           result = new_element (collate, NULL, 0, NULL, str, len);
 555           if (result != NULL)
 556             /* Insert it into the sequence table.  */
 557             insert_entry (&collate->seq_table, str, len, result);
 558         }
 559     }
 560
 561   return result;
 562 }
 563
 564
 565 static void
 566 unlink_element (struct locale_collate_t *collate)
 567 {
 568   if (collate->cursor == collate->start)
 569     {
 570       assert (collate->cursor->next == NULL);
 571       assert (collate->cursor->last == NULL);
 572       collate->cursor = NULL;
 573     }
 574   else
 575     {
 576       if (collate->cursor->next != NULL)
 577         collate->cursor->next->last = collate->cursor->last;
 578       if (collate->cursor->last != NULL)
 579         collate->cursor->last->next = collate->cursor->next;
 580       collate->cursor = collate->cursor->last;
 581     }
 582 }
 583
 584
/* Link ELEM into the order list of COLLATE directly behind the cursor,
   make it the new cursor and read its weights from the rest of the
   current line of LDFILE.

   At most one weight per level (NRULES levels) is read; the weights
   are separated by semicolons.  A weight is either the IGNORE keyword,
   a symbol, a string which is split into its individual elements, or
   -- unless ELLIPSIS is tok_none -- an ellipsis token.  Levels for
   which the line gives no weight use ELEM itself as the weight.
   Errors are reported through lr_error and the rest of the line is
   skipped.  */
static void
insert_weights (struct linereader *ldfile, struct element_t *elem,
                struct charmap_t *charmap, struct repertoire_t *repertoire,
                struct locale_collate_t *collate, enum token_t ellipsis)
{
  int weight_cnt;
  struct token *arg;

  /* Initialize all the fields.  */
  elem->file = ldfile->fname;
  elem->line = ldfile->lineno;
  /* Hook the element in behind the cursor.  NOTE(review): the LAST
     pointer of the old successor of the cursor is not updated here --
     confirm whether this is handled elsewhere.  */
  elem->last = collate->cursor;
  elem->next = collate->cursor ? collate->cursor->next : NULL;
  elem->section = collate->current_section;
  if (collate->cursor != NULL)
    collate->cursor->next = elem;
  if (collate->start == NULL)
    {
      assert (collate->cursor == NULL);
      collate->start = elem;
    }
  /* One (initially empty) weight list per level.  */
  elem->weights = (struct element_list_t *)
    obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
  memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));

  /* Keep track of the first and the last element of the section.  */
  if (collate->current_section->first == NULL)
    collate->current_section->first = elem;
  if (collate->current_section->last == collate->cursor)
    collate->current_section->last = elem;

  collate->cursor = elem;

  weight_cnt = 0;

  arg = lr_token (ldfile, charmap, repertoire);
  do
    {
      if (arg->tok == tok_eof || arg->tok == tok_eol)
        break;

      if (arg->tok == tok_ignore)
        {
          /* The weight for this level has to be ignored.  We use the
             null pointer to indicate this.  */
          elem->weights[weight_cnt].w = (struct element_t **)
            obstack_alloc (&collate->mempool, sizeof (struct element_t *));
          elem->weights[weight_cnt].w[0] = NULL;
          elem->weights[weight_cnt].cnt = 1;
        }
      else if (arg->tok == tok_bsymbol)
        {
          /* A single symbol: the weight is the element it stands for.  */
          struct element_t *val = find_element (ldfile, collate,
                                                arg->val.str.startmb,
                                                arg->val.str.lenmb,
                                                arg->val.str.startwc);

          if (val == NULL)
            break;

          elem->weights[weight_cnt].w = (struct element_t **)
            obstack_alloc (&collate->mempool, sizeof (struct element_t *));
          elem->weights[weight_cnt].w[0] = val;
          elem->weights[weight_cnt].cnt = 1;
        }
      else if (arg->tok == tok_string)
        {
          /* Split the string up in the individual characters and put
             the element definitions in the list.  */
          const char *cp = arg->val.str.startmb;
          int cnt = 0;
          struct element_t *charelem;
          struct element_t **weights = NULL;
          int max = 0;

          if (*cp == '\0')
            {
              lr_error (ldfile, _("%s: empty weight string not allowed"),
                        "LC_COLLATE");
              lr_ignore_rest (ldfile, 0);
              break;
            }

          do
            {
              if (*cp == '<')
                {
                  /* Ahh, it's a bsymbol.  That's what we want.  */
                  const char *startp = ++cp;

                  /* Look for the closing `>'; a character following
                     the escape character is skipped.  */
                  while (*cp != '>')
                    {
                      if (*cp == ldfile->escape_char)
                        ++cp;
                      if (*cp == '\0')
                        /* It's a syntax error.  */
                        goto syntax;

                      ++cp;
                    }

                    charelem = find_element (ldfile, collate, startp,
                                             cp - startp, NULL);
                    ++cp;
                }
              else
                {
                  /* People really shouldn't use characters directly in
                     the string.  Especially since it's not really clear
                     what this means.  We interpret all characters in the
                     string as if that would be bsymbols.  Otherwise we
                     would have to match back to bsymbols somehow and this
                     is normally not what people normally expect.  */
                  charelem = find_element (ldfile, collate, cp++, 1, NULL);
                }

              if (charelem == NULL)
                {
                  /* We ignore the rest of the line.  */
                  lr_ignore_rest (ldfile, 0);
                  break;
                }

              /* Add the pointer.  The temporary array lives on the
                 stack; when it is full a larger one is allocated and
                 the old one is simply left behind until the function
                 returns.  NOTE(review): on the first pass WEIGHTS is
                 NULL and memcpy is called with a length of zero.  */
              if (cnt >= max)
                {
                  struct element_t **newp;
                  max += 10;
                  newp = (struct element_t **)
                    alloca (max * sizeof (struct element_t *));
                  memcpy (newp, weights, cnt * sizeof (struct element_t *));
                  weights = newp;
                }
              weights[cnt++] = charelem;
            }
          while (*cp != '\0');

          /* Now store the information.  */
          elem->weights[weight_cnt].w = (struct element_t **)
            obstack_alloc (&collate->mempool,
                           cnt * sizeof (struct element_t *));
          memcpy (elem->weights[weight_cnt].w, weights,
                  cnt * sizeof (struct element_t *));
          elem->weights[weight_cnt].cnt = cnt;

          /* We don't need the string anymore.  */
          free (arg->val.str.startmb);
        }
      else if (ellipsis != tok_none
               && (arg->tok == tok_ellipsis2
                   || arg->tok == tok_ellipsis3
                   || arg->tok == tok_ellipsis4))
        {
          /* It must be the same ellipsis as used in the initial column.  */
          if (arg->tok != ellipsis)
            lr_error (ldfile, _("\
%s: weights must use the same ellipsis symbol as the name"),
                      "LC_COLLATE");

          /* The weight for this level is marked with a special element
             value.  NOTE(review): ELEMENT_ELLIPSIS2 is stored no matter
             which of the three ellipsis tokens was read -- confirm that
             this is intended.  */
          elem->weights[weight_cnt].w = (struct element_t **)
            obstack_alloc (&collate->mempool, sizeof (struct element_t *));
          elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
          elem->weights[weight_cnt].cnt = 1;
        }
      else
        {
          /* This label is also the target for an unterminated symbol
             inside a weight string.  */
        syntax:
          /* It's a syntax error.  */
          lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
          lr_ignore_rest (ldfile, 0);
          break;
        }

      arg = lr_token (ldfile, charmap, repertoire);
      /* This better should be the end of the line or a semicolon.  */
      if (arg->tok == tok_semicolon)
        /* OK, ignore this and read the next token.  */
        arg = lr_token (ldfile, charmap, repertoire);
      else if (arg->tok != tok_eof && arg->tok != tok_eol)
        {
          /* It's a syntax error.  */
          lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
          lr_ignore_rest (ldfile, 0);
          break;
        }
    }
  while (++weight_cnt < nrules);

  if (weight_cnt < nrules)
    {
      /* This means the rest of the line uses the current element as
         the weight.  */
      do
        {
          elem->weights[weight_cnt].w = (struct element_t **)
            obstack_alloc (&collate->mempool, sizeof (struct element_t *));
          elem->weights[weight_cnt].w[0] = elem;
          elem->weights[weight_cnt].cnt = 1;
        }
      while (++weight_cnt < nrules);
    }
  else
    {
      /* All levels have a weight.  ARG is the token which follows the
         last weight.  */
      if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
        {
          /* Too many rule values.  */
          lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
          lr_ignore_rest (ldfile, 0);
        }
      else
        lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
    }
}
 799
 800
 801 static int
 802 insert_value (struct linereader *ldfile, struct token *arg,
 803               struct charmap_t *charmap, struct repertoire_t *repertoire,
 804               struct locale_collate_t *collate)
 805 {
 806   /* First find out what kind of symbol this is.  */
 807   struct charseq *seq;
 808   uint32_t wc;
 809   struct element_t *elem = NULL;
 810
 811   /* Try to find the character in the charmap.  */
 812   seq = charmap_find_value (charmap, arg->val.str.startmb, arg->val.str.lenmb);
 813
 814   /* Determine the wide character.  */
 815   if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
 816     {
 817       wc = repertoire_find_value (repertoire, arg->val.str.startmb,
 818                                   arg->val.str.lenmb);
 819       if (seq != NULL)
 820         seq->ucs4 = wc;
 821     }
 822   else
 823     wc = seq->ucs4;
 824
 825   if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
 826     {
 827       /* It's no character, so look through the collation elements and
 828          symbol list.  */
 829       void *result;
 830
 831       if (find_entry (&collate->sym_table, arg->val.str.startmb,
 832                       arg->val.str.lenmb, &result) == 0)
 833         {
 834           /* It's a collation symbol.  */
 835           struct symbol_t *sym = (struct symbol_t *) result;
 836           elem = sym->order;
 837
 838           if (elem == NULL)
 839             elem = sym->order = new_element (collate, NULL, 0, NULL, NULL, 0);
 840         }
 841       else if (find_entry (&collate->elem_table, arg->val.str.startmb,
 842                            arg->val.str.lenmb, (void **) &elem) != 0)
 843         {
 844           /* It's also no collation element.  Therefore ignore it.  */
 845           lr_ignore_rest (ldfile, 0);
 846           return 1;
 847         }
 848     }
 849   else
 850     {
 851       /* Otherwise the symbols stands for a character.  */
 852       if (find_entry (&collate->seq_table, arg->val.str.startmb,
 853                       arg->val.str.lenmb, (void **) &elem) != 0)
 854         {
 855           uint32_t wcs[2] = { wc, 0 };
 856
 857           /* We have to allocate an entry.  */
 858           elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
 859                               seq != NULL ? seq->nbytes : 0,
 860                               wcs, arg->val.str.startmb, arg->val.str.lenmb);
 861
 862           /* And add it to the table.  */
 863           if (insert_entry (&collate->seq_table, arg->val.str.startmb,
 864                             arg->val.str.lenmb, elem) != 0)
 865             /* This cannot happen.  */
 866             assert (! "Internal error");
 867         }
 868       else
 869         {
 870           /* Maybe the character was used before the definition.  In this case
 871              we have to insert the byte sequences now.  */
 872           if (elem->mbs == NULL && seq != NULL)
 873             {
 874               elem->mbs = obstack_copy0 (&collate->mempool,
 875                                          seq->bytes, seq->nbytes);
 876               elem->nmbs = seq->nbytes;
 877             }
 878
 879           if (elem->wcs == NULL && seq != ILLEGAL_CHAR_VALUE)
 880             {
 881               uint32_t wcs[2] = { wc, 0 };
 882
 883               elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
 884               elem->nwcs = 1;
 885             }
 886         }
 887     }
 888
 889   /* Test whether this element is not already in the list.  */
 890   if (elem->next != NULL || (collate->cursor != NULL
 891                              && elem->next == collate->cursor))
 892     {
 893       lr_error (ldfile, _("order for `%.*s' already defined at %s:%zu"),
 894                 (int) arg->val.str.lenmb, arg->val.str.startmb,
 895                 elem->file, elem->line);
 896       lr_ignore_rest (ldfile, 0);
 897       return 1;
 898     }
 899
 900   insert_weights (ldfile, elem, charmap, repertoire, collate, tok_none);
 901
 902   return 0;
 903 }
 904
 905
 906 static void
 907 handle_ellipsis (struct linereader *ldfile, struct token *arg,
 908                  enum token_t ellipsis, struct charmap_t *charmap,
 909                  struct repertoire_t *repertoire,
 910                  struct locale_collate_t *collate)
 911 {
 912   struct element_t *startp;
 913   struct element_t *endp;
 914
 915   /* Unlink the entry added for the ellipsis.  */
 916   unlink_element (collate);
 917   startp = collate->cursor;
 918
 919   /* Process and add the end-entry.  */
 920   if (arg != NULL
 921       && insert_value (ldfile, arg, charmap, repertoire, collate))
 922     /* Something went wrong with inserting the to-value.  This means
 923        we cannot process the ellipsis.  */
 924     return;
 925
 926   /* Reset the cursor.  */
 927   collate->cursor = startp;
 928
 929   /* Now we have to handle many different situations:
 930      - we have to distinguish between the three different ellipsis forms
 931      - the is the ellipsis at the beginning, in the middle, or at the end.
 932   */
 933   endp = collate->cursor->next;
 934   assert (arg == NULL || endp != NULL);
 935
 936   /* Both, the start and the end symbol, must stand for characters.  */
 937   if ((startp == NULL || startp->name == NULL)
 938       || (endp == NULL || endp->name == NULL))
 939     {
 940       lr_error (ldfile, _("\
 941 %s: the start end the end symbol of a range must stand for characters"),
 942                 "LC_COLLATE");
 943       return;
 944     }
 945
 946   if (ellipsis == tok_ellipsis3)
 947     {
 948       /* One requirement we make here: the length of the byte
 949          sequences for the first and end character must be the same.
 950          This is mainly to prevent unwanted effects and this is often
 951          not what is wanted.  */
 952       size_t len = (startp->mbs != NULL ? startp->nmbs
 953                     : (endp->mbs != NULL ? endp->nmbs : 0));
 954       char mbcnt[len + 1];
 955       char mbend[len + 1];
 956
 957       /* Well, this should be caught somewhere else already.  Just to
 958          make sure.  */
 959       assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
 960       assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
 961
 962       if (startp != NULL && endp != NULL
 963           && startp->mbs != NULL && endp->mbs != NULL
 964           && startp->nmbs != endp->nmbs)
 965         {
 966           lr_error (ldfile, _("\
 967 %s: byte sequences of first and last character must have the same length"),
 968                     "LC_COLLATE");
 969           return;
 970         }
 971
 972       /* Determine whether we have to generate multibyte sequences.  */
 973       if ((startp == NULL || startp->mbs != NULL)
 974           && (endp == NULL || endp->mbs != NULL))
 975         {
 976           int cnt;
 977           int ret;
 978
 979           /* Prepare the beginning byte sequence.  This is either from the
 980              beginning byte sequence or it is all nulls if it was an
 981              initial ellipsis.  */
 982           if (startp == NULL || startp->mbs == NULL)
 983             memset (mbcnt, '\0', len);
 984           else
 985             {
 986               memcpy (mbcnt, startp->mbs, len);
 987
 988               /* And increment it so that the value is the first one we will
 989                  try to insert.  */
 990               for (cnt = len - 1; cnt >= 0; --cnt)
 991                 if (++mbcnt[cnt] != '\0')
 992                   break;
 993             }
 994           mbcnt[len] = '\0';
 995
 996           /* And the end sequence.  */
 997           if (endp == NULL || endp->mbs == NULL)
 998             memset (mbend, '\0', len);
 999           else
1000             memcpy (mbend, endp->mbs, len);
1001           mbend[len] = '\0';
1002
1003           /* Test whether we have a correct range.  */
1004           ret = memcmp (mbcnt, mbend, len);
1005           if (ret >= 0)
1006             {
1007               if (ret > 0)
1008                 lr_error (ldfile, _("%s: byte sequence of first character of \
1009 sequence is not lower than that of the last character"), "LC_COLLATE");
1010               return;
1011             }
1012
1013           /* Generate the byte sequences data.  */
1014           while (1)
1015             {
1016               struct charseq *seq;
1017
1018               /* Quite a bit of work ahead.  We have to find the character
1019                  definition for the byte sequence and then determine the
1020                  wide character belonging to it.  */
1021               seq = charmap_find_symbol (charmap, mbcnt, len);
1022               if (seq != NULL)
1023                 {
1024                   struct element_t *elem;
1025                   size_t namelen;
1026
1027                   if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1028                     seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1029                                                        strlen (seq->name));
1030
1031                   /* I don't this this can ever happen.  */
1032                   assert (seq->name != NULL);
1033                   namelen = strlen (seq->name);
1034
1035                   /* Now we are ready to insert the new value in the
1036                      sequence.  Find out whether the element is
1037                      already known.  */
1038                   if (find_entry (&collate->seq_table, seq->name, namelen,
1039                                   (void **) &elem) != 0)
1040                     {
1041                       uint32_t wcs[2] = { seq->ucs4, 0 };
1042
1043                       /* We have to allocate an entry.  */
1044                       elem = new_element (collate, mbcnt, len, wcs, seq->name,
1045                                           namelen);
1046
1047                       /* And add it to the table.  */
1048                       if (insert_entry (&collate->seq_table, seq->name,
1049                                         namelen, elem) != 0)
1050                         /* This cannot happen.  */
1051                         assert (! "Internal error");
1052                     }
1053
1054                   /* Test whether this element is not already in the list.  */
1055                   if (elem->next != NULL || (collate->cursor != NULL
1056                                              && elem->next == collate->cursor))
1057                     {
1058                       lr_error (ldfile, _("\
1059 order for `%.*s' already defined at %s:%zu"),
1060                                 (int) namelen, seq->name,
1061                                 elem->file, elem->line);
1062                       goto increment;
1063                     }
1064
1065                   /* Enqueue the new element.  */
1066                   elem->last = collate->cursor;
1067                   if (collate->cursor != NULL)
1068                     elem->next = NULL;
1069                   else
1070                     {
1071                       elem->next = collate->cursor->next;
1072                       elem->last->next = elem;
1073                       if (elem->next != NULL)
1074                         elem->next->last = elem;
1075                     }
1076                   if (collate->start == NULL)
1077                     {
1078                       assert (collate->cursor == NULL);
1079                       collate->start = elem;
1080                     }
1081                   collate->cursor = elem;
1082
1083                  /* Add the weight value.  We take them from the
1084                     `ellipsis_weights' member of `collate'.  */
1085                   elem->weights = (struct element_list_t *)
1086                     obstack_alloc (&collate->mempool,
1087                                    nrules * sizeof (struct element_list_t));
1088                   for (cnt = 0; cnt < nrules; ++cnt)
1089                     if (collate->ellipsis_weight.weights[cnt].cnt == 1
1090                         && (collate->ellipsis_weight.weights[cnt].w[0]
1091                             == ELEMENT_ELLIPSIS2))
1092                       {
1093                         elem->weights[cnt].w = (struct element_t **)
1094                           obstack_alloc (&collate->mempool,
1095                                          sizeof (struct element_t *));
1096                         elem->weights[cnt].w[0] = elem;
1097                         elem->weights[cnt].cnt = 1;
1098                       }
1099                     else
1100                       {
1101                         /* Simly use the weight from `ellipsis_weight'.  */
1102                         elem->weights[cnt].w =
1103                           collate->ellipsis_weight.weights[cnt].w;
1104                         elem->weights[cnt].cnt =
1105                           collate->ellipsis_weight.weights[cnt].cnt;
1106                       }
1107                 }
1108
1109               /* Increment for the next round.  */
1110             increment:
1111               for (cnt = len - 1; cnt >= 0; --cnt)
1112                 if (++mbcnt[cnt] != '\0')
1113                   break;
1114
1115               /* Find out whether this was all.  */
1116               if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1117                 /* Yep, that's all.  */
1118                 break;
1119             }
1120         }
1121     }
1122   else
1123     {
1124       /* For symbolic range we naturally must have a beginning and an
1125          end specified by the user.  */
1126       if (startp == NULL)
1127         lr_error (ldfile, _("\
1128 %s: symbolic range ellipsis must not directly follow `order_start'"),
1129                   "LC_COLLATE");
1130       else if (endp == NULL)
1131         lr_error (ldfile, _("\
1132 %s: symbolic range ellipsis must not be direct followed by `order_end'"),
1133                   "LC_COLLATE");
1134       else
1135         {
1136           /* Determine the range.  To do so we have to determine the
1137              common prefix of the both names and then the numeric
1138              values of both ends.  */
1139           size_t lenfrom = strlen (startp->name);
1140           size_t lento = strlen (endp->name);
1141           char buf[lento + 1];
1142           int preflen = 0;
1143           long int from;
1144           long int to;
1145           char *cp;
1146           int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1147
1148           if (lenfrom != lento)
1149             {
1150             invalid_range:
1151               lr_error (ldfile, _("\
1152 `%s' and `%.*s' are no valid names for symbolic range"),
1153                         startp->name, (int) lento, endp->name);
1154               return;
1155             }
1156
1157           while (startp->name[preflen] == endp->name[preflen])
1158             if (startp->name[preflen] == '\0')
1159               /* Nothing to be done.  The start and end point are identical
1160                  and while inserting the end point we have already given
1161                  the user an error message.  */
1162               return;
1163             else
1164               ++preflen;
1165
1166           errno = 0;
1167           from = strtol (startp->name + preflen, &cp, base);
1168           if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1169             goto invalid_range;
1170
1171           errno = 0;
1172           to = strtol (endp->name + preflen, &cp, base);
1173           if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1174             goto invalid_range;
1175
1176           /* Copy the prefix.  */
1177           memcpy (buf, startp->name, preflen);
1178
1179           /* Loop over all values.  */
1180           for (++from; from < to; ++from)
1181             {
1182               struct element_t *elem = NULL;
1183               struct charseq *seq;
1184               uint32_t wc;
1185               int cnt;
1186
1187               /* Generate the the name.  */
1188               sprintf (buf + preflen, base == 10 ? "%d" : "%x", from);
1189
1190               /* Look whether this name is already defined.  */
1191               if (find_entry (&collate->seq_table, arg->val.str.startmb,
1192                               arg->val.str.lenmb, (void **) &elem) == 0)
1193                 {
1194                   if (elem->next != NULL || (collate->cursor != NULL
1195                                              && elem->next == collate->cursor))
1196                     {
1197                       lr_error (ldfile, _("\
1198 %s: order for `%.*s' already defined at %s:%zu"),
1199                                 "LC_COLLATE", (int) lenfrom, buf,
1200                                 elem->file, elem->line);
1201                       continue;
1202                     }
1203
1204                   if (elem->name == NULL)
1205                     {
1206                       lr_error (ldfile, _("%s: `%s' must be a charater"),
1207                                 "LC_COLLATE", buf);
1208                       continue;
1209                     }
1210                 }
1211
1212               if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1213                 {
1214                   /* Search for a character of this name.  */
1215                   seq = charmap_find_value (charmap, buf, lenfrom);
1216                   if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1217                     {
1218                       wc = repertoire_find_value (repertoire, buf, lenfrom);
1219
1220                       if (seq != NULL)
1221                         seq->ucs4 = wc;
1222                     }
1223                   else
1224                     wc = seq->ucs4;
1225
1226                   if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1227                     /* We don't know anything about a character with this
1228                        name.  XXX Should we warn?  */
1229                     continue;
1230
1231                   if (elem == NULL)
1232                     {
1233                       uint32_t wcs[2] = { wc, 0 };
1234
1235                       /* We have to allocate an entry.  */
1236                       elem = new_element (collate,
1237                                           seq != NULL ? seq->bytes : NULL,
1238                                           seq != NULL ? seq->nbytes : 0,
1239                                           wc == ILLEGAL_CHAR_VALUE
1240                                           ? NULL : wcs,
1241                                           buf, lenfrom);
1242                     }
1243                   else
1244                     {
1245                       /* Update the element.  */
1246                       if (seq != NULL)
1247                         {
1248                           elem->mbs = obstack_copy0 (&collate->mempool,
1249                                                      seq->bytes, seq->nbytes);
1250                           elem->nmbs = seq->nbytes;
1251                         }
1252
1253                       if (wc != ILLEGAL_CHAR_VALUE)
1254                         {
1255                           uint32_t zero = 0;
1256
1257                           obstack_grow (&collate->mempool,
1258                                         &wc, sizeof (uint32_t));
1259                           obstack_grow (&collate->mempool,
1260                                         &zero, sizeof (uint32_t));
1261                           elem->wcs = obstack_finish (&collate->mempool);
1262                           elem->nwcs = 1;
1263                         }
1264                     }
1265
1266                   elem->file = ldfile->fname;
1267                   elem->line = ldfile->lineno;
1268                   elem->section = collate->current_section;
1269                 }
1270
1271               /* Enqueue the new element.  */
1272               elem->last = collate->cursor;
1273               elem->next = collate->cursor->next;
1274               elem->last->next = elem;
1275               if (elem->next != NULL)
1276                 elem->next->last = elem;
1277               collate->cursor = elem;
1278
1279               /* Now add the weights.  They come from the `ellipsis_weights'
1280                  member of `collate'.  */
1281               elem->weights = (struct element_list_t *)
1282                 obstack_alloc (&collate->mempool,
1283                                nrules * sizeof (struct element_list_t));
1284               for (cnt = 0; cnt < nrules; ++cnt)
1285                 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1286                     && (collate->ellipsis_weight.weights[cnt].w[0]
1287                         == ELEMENT_ELLIPSIS2))
1288                   {
1289                     elem->weights[cnt].w = (struct element_t **)
1290                       obstack_alloc (&collate->mempool,
1291                                      sizeof (struct element_t *));
1292                     elem->weights[cnt].w[0] = elem;
1293                     elem->weights[cnt].cnt = 1;
1294                   }
1295                 else
1296                   {
1297                     /* Simly use the weight from `ellipsis_weight'.  */
1298                     elem->weights[cnt].w =
1299                       collate->ellipsis_weight.weights[cnt].w;
1300                     elem->weights[cnt].cnt =
1301                       collate->ellipsis_weight.weights[cnt].cnt;
1302                   }
1303             }
1304         }
1305     }
1306 }
1307
1308
1309 static void
1310 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1311                  struct localedef_t *copy_locale, int ignore_content)
1312 {
1313   if (!ignore_content)
1314     {
1315       struct locale_collate_t *collate;
1316
1317       if (copy_locale == NULL)
1318         {
1319           collate = locale->categories[LC_COLLATE].collate =
1320             (struct locale_collate_t *)
1321             xcalloc (1, sizeof (struct locale_collate_t));
1322
1323           /* Init the various data structures.  */
1324           init_hash (&collate->elem_table, 100);
1325           init_hash (&collate->sym_table, 100);
1326           init_hash (&collate->seq_table, 500);
1327           obstack_init (&collate->mempool);
1328
1329           collate->col_weight_max = -1;
1330         }
1331       else
1332         collate = locale->categories[LC_COLLATE].collate =
1333           copy_locale->categories[LC_COLLATE].collate;
1334     }
1335
1336   ldfile->translate_strings = 0;
1337   ldfile->return_widestr = 0;
1338 }
1339
1340
1341 void
1342 collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
1343 {
1344   /* Now is the time when we can assign the individual collation
1345      values for all the symbols.  We have possibly different values
1346      for the wide- and the multibyte-character symbols.  This is done
1347      since it might make a difference in the encoding if there is in
1348      some cases no multibyte-character but there are wide-characters.
1349      (The other way around it is not important since theencoded
1350      collation value in the wide-character case is 32 bits wide and
1351      therefore requires no encoding).
1352
1353      The lowest collation value assigned is 2.  Zero is reserved for
1354      the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1355      functions and 1 is used to separate the individual passes for the
1356      different rules.
1357
1358      We also have to construct is list with all the bytes/words which
1359      can come first in a sequence, followed by all the elements which
1360      also start with this byte/word.  The order is reverse which has
1361      among others the important effect that longer strings are located
1362      first in the list.  This is required for the output data since
1363      the algorithm used in `strcoll' etc depends on this.
1364
1365      The multibyte case is easy.  We simply sort into an array with
1366      256 elements.  */
1367   struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1368   int mbact[nrules];
1369   int wcact;
1370   struct element_t *runp;
1371   int i;
1372   int need_undefined = 0;
1373   struct section_list *sect;
1374   int ruleidx;
1375
1376   /* If this assertion is hit change the type in `element_t'.  */
1377   assert (nrules <= sizeof (runp->used_in_level) * 8);
1378
1379   /* Find out which elements are used at which level.  At the same
1380      time we find out whether we have any undefined symbols.  */
1381   runp = collate->start;
1382   while (runp != NULL)
1383     {
1384       if (runp->mbs != NULL)
1385         {
1386           for (i = 0; i < nrules; ++i)
1387             {
1388               int j;
1389
1390               for (j = 0; j < runp->weights[i].cnt; ++j)
1391                 /* A NULL pointer as the weight means IGNORE.  */
1392                 if (runp->weights[i].w[j] != NULL)
1393                   {
1394                     if (runp->weights[i].w[j]->weights == NULL)
1395                       {
1396                         error_at_line (0, 0, runp->file, runp->line,
1397                                        _("symbol `%s' not defined"),
1398                                        runp->weights[i].w[j]->name);
1399
1400                         need_undefined = 1;
1401                         runp->weights[i].w[j] = &collate->undefined;
1402                       }
1403                     else
1404                       /* Set the bit for the level.  */
1405                       runp->weights[i].w[j]->used_in_level |= 1 << i;
1406                   }
1407             }
1408         }
1409
1410       /* Up to the next entry.  */
1411       runp = runp->next;
1412     }
1413
1414   /* Walk through the list of defined sequences and assign weights.  Also
1415      create the data structure which will allow generating the single byte
1416      character based tables.
1417
1418      Since at each time only the weights for each of the rules are
1419      only compared to other weights for this rule it is possible to
1420      assign more compact weight values than simply counting all
1421      weights in sequence.  We can assign weights from 3, one for each
1422      rule individually and only for those elements, which are actually
1423      used for this rule.
1424
1425      Why is this important?  It is not for the wide char table.  But
1426      it is for the singlebyte output since here larger numbers have to
1427      be encoded to make it possible to emit the value as a byte
1428      string.  */
1429   for (i = 0; i < nrules; ++i)
1430     mbact[i] = 3;
1431   wcact = 3;
1432   runp = collate->start;
1433   while (runp != NULL)
1434     {
1435       /* Determine the order.  */
1436       if (runp->used_in_level != 0)
1437         {
1438           runp->mborder = (int *) obstack_alloc (&collate->mempool,
1439                                                  nrules * sizeof (int));
1440
1441           for (i = 0; i < nrules; ++i)
1442             if ((runp->used_in_level & (1 << i)) != 0)
1443               runp->mborder[i] = mbact[i]++;
1444             else
1445               runp->mborder[i] = 0;
1446         }
1447
1448       if (runp->mbs != NULL)
1449         {
1450           struct element_t **eptr;
1451
1452           /* Find the point where to insert in the list.  */
1453           eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1454           while (*eptr != NULL)
1455             {
1456               if ((*eptr)->nmbs < runp->nmbs)
1457                 break;
1458
1459               if ((*eptr)->nmbs == runp->nmbs)
1460                 {
1461                   int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1462
1463                   if (c == 0)
1464                     {
1465                       /* This should not happen.  It means that we have
1466                          to symbols with the same byte sequence.  It is
1467                          of course an error.  */
1468                       error_at_line (0, 0, (*eptr)->file, (*eptr)->line,
1469                                      _("symbol `%s' has same encoding as"),
1470                                      (*eptr)->name);
1471                       error_at_line (0, 0, runp->file, runp->line,
1472                                      _("symbol `%s'"), runp->name);
1473                       goto dont_insert;
1474                     }
1475                   else if (c < 0)
1476                     /* Insert it here.  */
1477                     break;
1478                 }
1479
1480               /* To the next entry.  */
1481               eptr = &(*eptr)->mbnext;
1482             }
1483
1484           /* Set the pointers.  */
1485           runp->mbnext = *eptr;
1486           *eptr = runp;
1487         dont_insert:
1488         }
1489
1490       if (runp->wcs != NULL)
1491         runp->wcorder = wcact++;
1492
1493       /* Up to the next entry.  */
1494       runp = runp->next;
1495     }
1496
1497   /* Find out whether any of the `mbheads' entries is unset.  In this
1498      case we use the UNDEFINED entry.  */
1499   for (i = 1; i < 256; ++i)
1500     if (collate->mbheads[i] == NULL)
1501       {
1502         need_undefined = 1;
1503         collate->mbheads[i] = &collate->undefined;
1504       }
1505
1506   /* Now determine whether the UNDEFINED entry is needed and if yes,
1507      whether it was defined.  */
1508   collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1509   if (need_undefined && collate->undefined.file == NULL)
1510     {
1511       error (0, 0, _("no definition of `UNDEFINED'"));
1512
1513       /* Add UNDEFINED at the end.  */
1514       collate->undefined.mborder =
1515         (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1516
1517       for (i = 0; i < nrules; ++i)
1518         collate->undefined.mborder[i] = mbact[i]++;
1519
1520       collate->undefined.wcorder = wcact++;
1521     }
1522
1523   /* Finally, try to unify the rules for the sections.  Whenever the rules
1524      for a section are the same as those for another section give the
1525      ruleset the same index.  Since there are never many section we can
1526      use an O(n^2) algorithm here.  */
1527   sect = collate->sections;
1528   assert (sect != NULL);
1529   ruleidx = 0;
1530   do
1531     {
1532       struct section_list *osect = collate->sections;
1533
1534       while (osect != sect)
1535         if (memcmp (osect->rules, sect->rules, nrules) == 0)
1536           break;
1537         else
1538           osect = osect->next;
1539
1540       if (osect == sect)
1541         sect->ruleidx = ruleidx++;
1542       else
1543         sect->ruleidx = osect->ruleidx;
1544
1545       /* Next section.  */
1546       sect = sect->next;
1547     }
1548   while (sect != NULL);
1549   /* We are currently not prepared for more than 256 rulesets.  But this
1550      should never really be a problem.  */
1551   assert (ruleidx <= 256);
1552 }
1553
1554
1555 static inline int32_t
1556 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1557                struct element_t *elem)
1558 {
1559   size_t cnt;
1560   int32_t retval;
1561
1562   /* Optimize the use of UNDEFINED.  */
1563   if (elem == &collate->undefined)
1564     /* The weights are already inserted.  */
1565     return 0;
1566
1567   /* This byte can start exactly one collation element and this is
1568      a single byte.  We can directly give the index to the weights.  */
1569   retval = obstack_object_size (pool);
1570
1571   /* Construct the weight.  */
1572   for (cnt = 0; cnt < nrules; ++cnt)
1573     {
1574       char buf[elem->weights[cnt].cnt * 7];
1575       int len = 0;
1576       int i;
1577
1578       /* Add the direction.  */
1579       obstack_1grow (pool, elem->section->rules[cnt]);
1580
1581       for (i = 0; i < elem->weights[cnt].cnt; ++i)
1582         /* Encode the weight value.  */
1583         if (elem->weights[cnt].w[i] == NULL)
1584           {
1585             /* This entry was IGNORE.  */
1586             buf[len++] = IGNORE_CHAR;
1587           }
1588         else
1589           len += utf8_encode (&buf[len],
1590                               elem->weights[cnt].w[i]->mborder[cnt]);
1591
1592       /* And add the buffer content.  */
1593       obstack_grow (pool, buf, len);
1594     }
1595
1596   return retval;
1597 }
1598
1599
1600 void
1601 collate_output (struct localedef_t *locale, struct charmap_t *charmap,
1602                 const char *output_path)
1603 {
1604   struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1605   const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
1606   struct iovec iov[2 + nelems];
1607   struct locale_file data;
1608   uint32_t idx[nelems];
1609   size_t cnt;
1610   size_t ch;
1611   int32_t tablemb[256];
1612   struct obstack weightpool;
1613   struct obstack extrapool;
1614   struct section_list *sect;
1615   int i;
1616
1617   obstack_init (&weightpool);
1618   obstack_init (&extrapool);
1619
1620   data.magic = LIMAGIC (LC_COLLATE);
1621   data.n = nelems;
1622   iov[0].iov_base = (void *) &data;
1623   iov[0].iov_len = sizeof (data);
1624
1625   iov[1].iov_base = (void *) idx;
1626   iov[1].iov_len = sizeof (idx);
1627
1628   idx[0] = iov[0].iov_len + iov[1].iov_len;
1629   cnt = 0;
1630
1631   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
1632   iov[2 + cnt].iov_base = &collate->nrules;
1633   iov[2 + cnt].iov_len = sizeof (uint32_t);
1634   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1635   ++cnt;
1636
1637   /* Prepare the ruleset table.  */
1638   for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
1639     if (sect->ruleidx == i)
1640       {
1641         obstack_grow (&weightpool, sect->rules, nrules);
1642         ++i;
1643       }
1644   /* And align the output.  */
1645   i = (nrules * i) % __alignof__ (int32_t);
1646   if (i > 0)
1647     do
1648       obstack_1grow (&weightpool, '\0');
1649     while (++i < __alignof__ (int32_t));
1650
1651   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
1652   iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
1653   iov[2 + cnt].iov_base = obstack_finish (&weightpool);
1654   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1655   ++cnt;
1656
1657   /* Generate the 8-bit table.  Walk through the lists of sequences
1658      starting with the same byte and add them one after the other to
1659      the table.  In case we have more than one sequence starting with
1660      the same byte we have to use extra indirection.
1661
1662      First add a record for the NUL byte.  This entry will never be used
1663      so it does not matter.  */
1664   tablemb[0] = 0;
1665
1666   /* Now insert the `UNDEFINED' value if it is used.  Since this value
1667      will probably be used more than once it is good to store the
1668      weights only once.  */
1669   if (collate->undefined.used_in_level != 0)
1670     output_weight (&weightpool, collate, &collate->undefined);
1671
1672   for (ch = 1; ch < 256; ++ch)
1673     if (collate->mbheads[ch]->mbnext == NULL
1674         && collate->mbheads[ch]->nmbs == 1)
1675       {
1676         tablemb[ch] = output_weight (&weightpool, collate,
1677                                       collate->mbheads[ch]);
1678       }
1679     else
1680       {
1681         /* The entries in the list are sorted by length and then
1682            alphabetically.  This is the order in which we will add the
1683            elements to the collation table.  This allows to simply
1684            walk the table in sequence and stop at the first matching
1685            entry.  Since the longer sequences are coming first in the
1686            list they have the possibility to match first, just as it
1687            has to be.  In the worst case we are walking to the end of
1688            the list where we put, if no singlebyte sequence is defined
1689            in the locale definition, the weights for UNDEFINED.
1690
1691            To reduce the length of the search list we compress them a bit.
1692            This happens by collecting sequences of consecutive byte
1693            sequences in one entry (having and begin and end byte sequence)
1694            and add only one index into the weight table.  We can find the
1695            consecutive entries since they are also consecutive in the list.  */
1696         struct element_t *runp = collate->mbheads[ch];
1697         struct element_t *lastp;
1698
1699         tablemb[ch] = -obstack_object_size (&extrapool);
1700
1701         do
1702           {
1703             /* Store the current index in the weight table.  We know that
1704                the current position in the `extrapool' is aligned on a
1705                32-bit address.  */
1706             int32_t weightidx;
1707             int added;
1708
1709             /* Output the weight info.  */
1710             weightidx = output_weight (&weightpool, collate, runp);
1711
1712             /* Find out whether this is a single entry or we have more than
1713                one consecutive entry.  */
1714             if (runp->mbnext != NULL
1715                 && runp->nmbs == runp->mbnext->nmbs
1716                 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
1717                 && (runp->mbs[runp->nmbs - 1] + 1
1718                     == runp->mbnext->mbs[runp->nmbs - 1]))
1719               {
1720                 int i;
1721
1722                 /* More than one consecutive entry.  We mark this by having
1723                    a negative index into the weight table.  */
1724                 weightidx = -weightidx;
1725
1726                 /* Now add first the initial byte sequence.  */
1727                 added = ((sizeof (int32_t) + 1 + 1 + 2 * (runp->nmbs - 1)
1728                           + __alignof__ (int32_t) - 1)
1729                          & ~(__alignof__ (int32_t) - 1));
1730                 obstack_make_room (&extrapool, added);
1731
1732                 if (sizeof (int32_t) == sizeof (int))
1733                   obstack_int_grow_fast (&extrapool, weightidx);
1734                 else
1735                   obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
1736                 obstack_1grow_fast (&extrapool, runp->section->ruleidx);
1737                 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
1738                 for (i = 1; i < runp->nmbs; ++i)
1739                   obstack_1grow_fast (&extrapool, runp->mbs[i]);
1740
1741                 /* Now find the end of the consecutive sequence.  */
1742                 do
1743                   runp = runp->next;
1744                 while (runp->mbnext != NULL
1745                        && runp->nmbs == runp->mbnext->nmbs
1746                        && memcmp (runp->mbs, runp->mbnext->mbs,
1747                                   runp->nmbs - 1) == 0
1748                        && (runp->mbs[runp->nmbs - 1] + 1
1749                            == runp->mbnext->mbs[runp->nmbs - 1]));
1750
1751                 /* And add the end byte sequence.  Without length this time.  */
1752                 for (i = 1; i < runp->nmbs; ++i)
1753                   obstack_1grow_fast (&extrapool, runp->mbs[i]);
1754               }
1755             else
1756               {
1757                 /* A single entry.  Simply add the index and the length and
1758                    string (except for the first character which is already
1759                    tested for).  */
1760                 int i;
1761
1762                 added = ((sizeof (int32_t) + 1 + 1 + runp->nmbs - 1
1763                           + __alignof__ (int32_t) - 1)
1764                          & ~(__alignof__ (int32_t) - 1));
1765                 obstack_make_room (&extrapool, added);
1766
1767                 if (sizeof (int32_t) == sizeof (int))
1768                   obstack_int_grow_fast (&extrapool, weightidx);
1769                 else
1770                   obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
1771                 obstack_1grow_fast (&extrapool, runp->section->ruleidx);
1772                 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
1773                 for (i = 1; i < runp->nmbs; ++i)
1774                   obstack_1grow_fast (&extrapool, runp->mbs[i]);
1775               }
1776
1777             /* Add alignment bytes if necessary.  */
1778             i = added % __alignof__ (int32_t);
1779             if (i > 0)
1780               do
1781                 obstack_1grow_fast (&extrapool, '\0');
1782               while (++i != __alignof__ (int32_t));
1783
1784             /* Next entry.  */
1785             lastp = runp;
1786             runp = runp->mbnext;
1787           }
1788         while (runp != NULL);
1789
1790         /* If the final entry in the list is not a single character we
1791            add an UNDEFINED entry here.  */
1792         if (lastp->nmbs != 1)
1793           {
1794             int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t))
1795                          & ~(__alignof__ (int32_t) - 1));
1796             obstack_make_room (&extrapool, added);
1797
1798             if (sizeof (int32_t) == sizeof (int))
1799               obstack_int_grow_fast (&extrapool, 0);
1800             else
1801               {
1802                 int32_t zero = 0;
1803                 obstack_grow (&extrapool, &zero, sizeof (int32_t));
1804               }
1805             /* XXX What rule? We just pick the first.  */
1806             obstack_1grow_fast (&extrapool, 0);
1807             /* Length is zero.  */
1808             obstack_1grow_fast (&extrapool, 0);
1809
1810             /* Add alignment bytes if necessary.  */
1811             i = added % __alignof__ (int32_t);
1812             if (i > 0)
1813               do
1814                 obstack_1grow_fast (&extrapool, '\0');
1815               while (++i != __alignof__ (int32_t));
1816           }
1817       }
1818
1819   /* Now add the three tables.  */
1820   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
1821   iov[2 + cnt].iov_base = tablemb;
1822   iov[2 + cnt].iov_len = sizeof (tablemb);
1823   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1824   ++cnt;
1825
1826   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
1827   iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
1828   iov[2 + cnt].iov_base = obstack_finish (&weightpool);
1829   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1830   ++cnt;
1831
1832   assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
1833   iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
1834   iov[2 + cnt].iov_base = obstack_finish (&extrapool);
1835   idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1836   ++cnt;
1837
1838
1839   assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
1840
1841   write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov);
1842
1843   obstack_free (&weightpool, NULL);
1844   obstack_free (&extrapool, NULL);
1845 }
1846
1847
1848 void
1849 collate_read (struct linereader *ldfile, struct localedef_t *result,
1850               struct charmap_t *charmap, const char *repertoire_name,
1851               int ignore_content)
1852 {
1853   struct repertoire_t *repertoire = NULL;
1854   struct locale_collate_t *collate;
1855   struct token *now;
1856   struct token *arg = NULL;
1857   enum token_t nowtok;
1858   int state = 0;
1859   enum token_t was_ellipsis = tok_none;
1860   struct localedef_t *copy_locale = NULL;
1861
1862   /* Get the repertoire we have to use.  */
1863   if (repertoire_name != NULL)
1864     repertoire = repertoire_read (repertoire_name);
1865
1866   /* The rest of the line containing `LC_COLLATE' must be free.  */
1867   lr_ignore_rest (ldfile, 1);
1868
1869   do
1870     {
1871       now = lr_token (ldfile, charmap, NULL);
1872       nowtok = now->tok;
1873     }
1874   while (nowtok == tok_eol);
1875
1876   if (nowtok == tok_copy)
1877     {
1878       state = 2;
1879       now = lr_token (ldfile, charmap, NULL);
1880       if (now->tok != tok_string)
1881         {
1882           SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
1883
1884         skip_category:
1885           do
1886             now = lr_token (ldfile, charmap, NULL);
1887           while (now->tok != tok_eof && now->tok != tok_end);
1888
1889           if (now->tok != tok_eof
1890               || (now = lr_token (ldfile, charmap, NULL), now->tok == tok_eof))
1891             lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
1892           else if (now->tok != tok_lc_collate)
1893             {
1894               lr_error (ldfile, _("\
1895 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
1896               lr_ignore_rest (ldfile, 0);
1897             }
1898           else
1899             lr_ignore_rest (ldfile, 1);
1900
1901           return;
1902         }
1903
1904       /* Get the locale definition.  */
1905       copy_locale = find_locale (LC_COLLATE, now->val.str.startmb,
1906                                  repertoire_name, charmap);
1907       if ((copy_locale->avail & COLLATE_LOCALE) == 0)
1908         {
1909           /* Not yet loaded.  So do it now.  */
1910           if (locfile_read (copy_locale, charmap) != 0)
1911             goto skip_category;
1912         }
1913
1914       lr_ignore_rest (ldfile, 1);
1915
1916       now = lr_token (ldfile, charmap, NULL);
1917       nowtok = now->tok;
1918     }
1919
1920   /* Prepare the data structures.  */
1921   collate_startup (ldfile, result, copy_locale, ignore_content);
1922   collate = result->categories[LC_COLLATE].collate;
1923
1924   while (1)
1925     {
1926       /* Of course we don't proceed beyond the end of file.  */
1927       if (nowtok == tok_eof)
1928         break;
1929
1930       /* Ignore empty lines.  */
1931       if (nowtok == tok_eol)
1932         {
1933           now = lr_token (ldfile, charmap, NULL);
1934           nowtok = now->tok;
1935           continue;
1936         }
1937
1938       switch (nowtok)
1939         {
1940         case tok_coll_weight_max:
1941           /* Ignore the rest of the line if we don't need the input of
1942              this line.  */
1943           if (ignore_content)
1944             {
1945               lr_ignore_rest (ldfile, 0);
1946               break;
1947             }
1948
1949           if (state != 0)
1950             goto err_label;
1951
1952           arg = lr_token (ldfile, charmap, NULL);
1953           if (arg->tok != tok_number)
1954             goto err_label;
1955           if (collate->col_weight_max != -1)
1956             lr_error (ldfile, _("%s: duplicate definition of `%s'"),
1957                       "LC_COLLATE", "col_weight_max");
1958           else
1959             collate->col_weight_max = arg->val.num;
1960           lr_ignore_rest (ldfile, 1);
1961           break;
1962
1963         case tok_section_symbol:
1964           /* Ignore the rest of the line if we don't need the input of
1965              this line.  */
1966           if (ignore_content)
1967             {
1968               lr_ignore_rest (ldfile, 0);
1969               break;
1970             }
1971
1972           if (state != 0)
1973             goto err_label;
1974
1975           arg = lr_token (ldfile, charmap, repertoire);
1976           if (arg->tok != tok_bsymbol)
1977             goto err_label;
1978           else if (!ignore_content)
1979             {
1980               /* Check whether this section is already known.  */
1981               struct section_list *known = collate->sections;
1982               while (known != NULL)
1983                 if (strcmp (known->name, arg->val.str.startmb) == 0)
1984                   break;
1985
1986               if (known != NULL)
1987                 {
1988                   lr_error (ldfile,
1989                             _("%s: duplicate declaration of section `%s'"),
1990                             "LC_COLLATE", arg->val.str.startmb);
1991                   free (arg->val.str.startmb);
1992                 }
1993               else
1994                 collate->sections = make_seclist_elem (collate,
1995                                                        arg->val.str.startmb,
1996                                                        collate->sections);
1997
1998               lr_ignore_rest (ldfile, known == NULL);
1999             }
2000           else
2001             {
2002               free (arg->val.str.startmb);
2003               lr_ignore_rest (ldfile, 0);
2004             }
2005           break;
2006
2007         case tok_collating_element:
2008           /* Ignore the rest of the line if we don't need the input of
2009              this line.  */
2010           if (ignore_content)
2011             {
2012               lr_ignore_rest (ldfile, 0);
2013               break;
2014             }
2015
2016           if (state != 0)
2017             goto err_label;
2018
2019           arg = lr_token (ldfile, charmap, repertoire);
2020           if (arg->tok != tok_bsymbol)
2021             goto err_label;
2022           else
2023             {
2024               const char *symbol = arg->val.str.startmb;
2025               size_t symbol_len = arg->val.str.lenmb;
2026
2027               /* Next the `from' keyword.  */
2028               arg = lr_token (ldfile, charmap, repertoire);
2029               if (arg->tok != tok_from)
2030                 {
2031                   free ((char *) symbol);
2032                   goto err_label;
2033                 }
2034
2035               ldfile->return_widestr = 1;
2036
2037               /* Finally the string with the replacement.  */
2038               arg = lr_token (ldfile, charmap, repertoire);
2039               ldfile->return_widestr = 0;
2040               if (arg->tok != tok_string)
2041                 goto err_label;
2042
2043               if (!ignore_content)
2044                 {
2045                   if (symbol == NULL)
2046                     lr_error (ldfile, _("\
2047 %s: unknown character in collating element name"),
2048                               "LC_COLLATE");
2049                   if (arg->val.str.startmb == NULL)
2050                     lr_error (ldfile, _("\
2051 %s: unknown character in collating element definition"),
2052                               "LC_COLLATE");
2053                   if (arg->val.str.startwc == NULL)
2054                     lr_error (ldfile, _("\
2055 %s: unknown wide character in collating element definition"),
2056                               "LC_COLLATE");
2057                   else if (arg->val.str.lenwc < 2)
2058                     lr_error (ldfile, _("\
2059 %s: substitution string in collating element definition must have at least two characters"),
2060                               "LC_COLLATE");
2061
2062                   if (symbol != NULL)
2063                     {
2064                       /* The name is already defined.  */
2065                       if (check_duplicate (ldfile, collate, charmap,
2066                                            repertoire, symbol, symbol_len))
2067                         goto col_elem_free;
2068
2069                       if (insert_entry (&collate->elem_table,
2070                                         symbol, symbol_len,
2071                                         new_element (collate,
2072                                                      NULL, 0, NULL, symbol,
2073                                                      symbol_len)) < 0)
2074                         lr_error (ldfile, _("\
2075 error while adding collating element"));
2076                     }
2077                   else
2078                     goto col_elem_free;
2079                 }
2080               else
2081                 {
2082                 col_elem_free:
2083                   if (symbol != NULL)
2084                     free ((char *) symbol);
2085                   if (arg->val.str.startmb != NULL)
2086                     free (arg->val.str.startmb);
2087                   if (arg->val.str.startwc != NULL)
2088                     free (arg->val.str.startwc);
2089                 }
2090               lr_ignore_rest (ldfile, 1);
2091             }
2092           break;
2093
2094         case tok_collating_symbol:
2095           /* Ignore the rest of the line if we don't need the input of
2096              this line.  */
2097           if (ignore_content)
2098             {
2099               lr_ignore_rest (ldfile, 0);
2100               break;
2101             }
2102
2103           if (state != 0)
2104             goto err_label;
2105
2106           arg = lr_token (ldfile, charmap, repertoire);
2107           if (arg->tok != tok_bsymbol)
2108             goto err_label;
2109           else
2110             {
2111               const char *symbol = arg->val.str.startmb;
2112               size_t symbol_len = arg->val.str.lenmb;
2113
2114               if (!ignore_content)
2115                 {
2116                   if (symbol == NULL)
2117                     lr_error (ldfile, _("\
2118 %s: unknown character in collating symbol name"),
2119                               "LC_COLLATE");
2120                   else
2121                     {
2122                       /* The name is already defined.  */
2123                       if (check_duplicate (ldfile, collate, charmap,
2124                                            repertoire, symbol, symbol_len))
2125                         goto col_sym_free;
2126
2127                       if (insert_entry (&collate->sym_table,
2128                                         symbol, symbol_len,
2129                                         new_symbol (collate)) < 0)
2130                         lr_error (ldfile, _("\
2131 error while adding collating symbol"));
2132                     }
2133                 }
2134               else
2135                 {
2136                 col_sym_free:
2137                   if (symbol != NULL)
2138                     free ((char *) symbol);
2139                 }
2140               lr_ignore_rest (ldfile, 1);
2141             }
2142           break;
2143
2144         case tok_symbol_equivalence:
2145           /* Ignore the rest of the line if we don't need the input of
2146              this line.  */
2147           if (ignore_content)
2148             {
2149               lr_ignore_rest (ldfile, 0);
2150               break;
2151             }
2152
2153           if (state != 0)
2154             goto err_label;
2155
2156           arg = lr_token (ldfile, charmap, repertoire);
2157           if (arg->tok != tok_bsymbol)
2158             goto err_label;
2159           else
2160             {
2161               const char *newname = arg->val.str.startmb;
2162               size_t newname_len = arg->val.str.lenmb;
2163               const char *symname;
2164               size_t symname_len;
2165               struct symbol_t *symval;
2166
2167               arg = lr_token (ldfile, charmap, repertoire);
2168               if (arg->tok != tok_bsymbol)
2169                 {
2170                   if (newname != NULL)
2171                     free ((char *) newname);
2172                   goto err_label;
2173                 }
2174
2175               symname = arg->val.str.startmb;
2176               symname_len = arg->val.str.lenmb;
2177
2178               if (!ignore_content)
2179                 {
2180                   if (newname == NULL)
2181                     {
2182                       lr_error (ldfile, _("\
2183 %s: unknown character in equivalent definition name"),
2184                                 "LC_COLLATE");
2185                       goto sym_equiv_free;
2186                     }
2187                   if (symname == NULL)
2188                     {
2189                       lr_error (ldfile, _("\
2190 %s: unknown character in equivalent definition value"),
2191                                 "LC_COLLATE");
2192                       goto sym_equiv_free;
2193                     }
2194                   /* The name is already defined.  */
2195                   if (check_duplicate (ldfile, collate, charmap,
2196                                        repertoire, symname, symname_len))
2197                     goto col_sym_free;
2198
2199                   /* See whether the symbol name is already defined.  */
2200                   if (find_entry (&collate->sym_table, symname, symname_len,
2201                                   (void **) &symval) != 0)
2202                     {
2203                       lr_error (ldfile, _("\
2204 %s: unknown symbol `%s' in equivalent definition"),
2205                                 "LC_COLLATE", symname);
2206                       goto col_sym_free;
2207                     }
2208
2209                   if (insert_entry (&collate->sym_table,
2210                                     newname, newname_len, symval) < 0)
2211                     {
2212                       lr_error (ldfile, _("\
2213 error while adding equivalent collating symbol"));
2214                       goto sym_equiv_free;
2215                     }
2216
2217                   free ((char *) symname);
2218                 }
2219               else
2220                 {
2221                 sym_equiv_free:
2222                   if (newname != NULL)
2223                     free ((char *) newname);
2224                   if (symname != NULL)
2225                     free ((char *) symname);
2226                 }
2227               lr_ignore_rest (ldfile, 1);
2228             }
2229           break;
2230
2231         case tok_order_start:
2232           /* Ignore the rest of the line if we don't need the input of
2233              this line.  */
2234           if (ignore_content)
2235             {
2236               lr_ignore_rest (ldfile, 0);
2237               break;
2238             }
2239
2240           if (state != 0 && state != 1)
2241             goto err_label;
2242           state = 1;
2243
2244           /* The 14652 draft does not specify whether all `order_start' lines
2245              must contain the same number of sort-rules, but 14651 does.  So
2246              we require this here as well.  */
2247           arg = lr_token (ldfile, charmap, repertoire);
2248           if (arg->tok == tok_bsymbol)
2249             {
2250               /* This had better be a section name.  */
2251               struct section_list *sp = collate->sections;
2252               while (sp != NULL
2253                      && strcmp (sp->name, arg->val.str.startmb) != 0)
2254                 sp = sp->next;
2255
2256               if (sp == NULL)
2257                 {
2258                   lr_error (ldfile, _("\
2259 %s: unknown section name `%s'"),
2260                             "LC_COLLATE", arg->val.str.startmb);
2261                   /* We use the error section.  */
2262                   collate->current_section = &collate->error_section;
2263
2264                   if (collate->error_section.first == NULL)
2265                     {
2266                       collate->error_section.next = collate->sections;
2267                       collate->sections = &collate->error_section;
2268                     }
2269                 }
2270               else
2271                 {
2272                   /* Remember this section.  */
2273                   collate->current_section = sp;
2274
2275                   /* One should not be allowed to open the same
2276                      section twice.  */
2277                   if (sp->first != NULL)
2278                     lr_error (ldfile, _("\
2279 %s: multiple order definitions for section `%s'"),
2280                               "LC_COLLATE", sp->name);
2281                   else
2282                     {
2283                       sp->next = collate->sections;
2284                       collate->sections = sp;
2285                     }
2286
2287                   /* Next should come the end of the line or a semicolon.  */
2288                   arg = lr_token (ldfile, charmap, repertoire);
2289                   if (arg->tok == tok_eol)
2290                     {
2291                       uint32_t cnt;
2292
2293                       /* This means we have exactly one rule: `forward'.  */
2294                       if (collate->nrules > 1)
2295                         lr_error (ldfile, _("\
2296 %s: invalid number of sorting rules"),
2297                                   "LC_COLLATE");
2298                       else
2299                         collate->nrules = 1;
2300                       sp->rules = obstack_alloc (&collate->mempool,
2301                                                  (sizeof (enum coll_sort_rule)
2302                                                   * collate->nrules));
2303                       for (cnt = 0; cnt < collate->nrules; ++cnt)
2304                         sp->rules[cnt] = sort_forward;
2305
2306                       /* Next line.  */
2307                       break;
2308                     }
2309
2310                   /* Get the next token.  */
2311                   arg = lr_token (ldfile, charmap, repertoire);
2312                 }
2313             }
2314           else
2315             {
2316               /* There is no section symbol.  Therefore we use the unnamed
2317                  section.  */
2318               collate->current_section = &collate->unnamed_section;
2319
2320               if (collate->unnamed_section.first != NULL)
2321                 lr_error (ldfile, _("\
2322 %s: multiple order definitions for unnamed section"),
2323                           "LC_COLLATE");
2324               else
2325                 {
2326                   collate->unnamed_section.next = collate->sections;
2327                   collate->sections = &collate->unnamed_section;
2328                 }
2329             }
2330
2331           /* Now read the direction names.  */
2332           read_directions (ldfile, arg, charmap, repertoire, collate);
2333
2334           /* From now on we need the strings untranslated.  */
2335           ldfile->translate_strings = 0;
2336           break;
2337
2338         case tok_order_end:
2339           /* Ignore the rest of the line if we don't need the input of
2340              this line.  */
2341           if (ignore_content)
2342             {
2343               lr_ignore_rest (ldfile, 0);
2344               break;
2345             }
2346
2347           if (state != 1)
2348             goto err_label;
2349
2350           /* Handle ellipsis at end of list.  */
2351           if (was_ellipsis != tok_none)
2352             {
2353               handle_ellipsis (ldfile, NULL, was_ellipsis, charmap, repertoire,
2354                                collate);
2355               was_ellipsis = tok_none;
2356             }
2357
2358           state = 2;
2359           lr_ignore_rest (ldfile, 1);
2360           break;
2361
2362         case tok_reorder_after:
2363           /* Ignore the rest of the line if we don't need the input of
2364              this line.  */
2365           if (ignore_content)
2366             {
2367               lr_ignore_rest (ldfile, 0);
2368               break;
2369             }
2370
2371           if (state == 1)
2372             {
2373               lr_error (ldfile, _("%s: missing `order_end' keyword"),
2374                         "LC_COLLATE");
2375               state = 2;
2376
2377               /* Handle ellipsis at end of list.  */
2378               if (was_ellipsis != tok_none)
2379                 {
2380                   handle_ellipsis (ldfile, arg, was_ellipsis, charmap,
2381                                    repertoire, collate);
2382                   was_ellipsis = tok_none;
2383                 }
2384             }
2385           else if (state != 2 && state != 3)
2386             goto err_label;
2387           state = 3;
2388
2389           arg = lr_token (ldfile, charmap, repertoire);
2390           if (arg->tok == tok_bsymbol)
2391             {
2392               /* Find this symbol in the sequence table.  */
2393               struct element_t *insp;
2394               int no_error = 1;
2395
2396               if (find_entry (&collate->seq_table, arg->val.str.startmb,
2397                               arg->val.str.lenmb, (void **) &insp) == 0)
2398                 /* Yes, the symbol exists.  Simply point the cursor
2399                    to it.  */
2400                   collate->cursor = insp;
2401               else
2402                 {
2403                   /* This is bad.  The symbol after which we have to
2404                      insert does not exist.  */
2405                   lr_error (ldfile, _("\
2406 %s: cannot reorder after %.*s: symbol not known"),
2407                             "LC_COLLATE", (int) arg->val.str.lenmb,
2408                             arg->val.str.startmb);
2409                   collate->cursor = NULL;
2410                   no_error = 0;
2411                 }
2412
2413               lr_ignore_rest (ldfile, no_error);
2414             }
2415           else
2416             /* This must not happen.  */
2417             goto err_label;
2418           break;
2419
2420         case tok_reorder_end:
2421           /* Ignore the rest of the line if we don't need the input of
2422              this line.  */
2423           if (ignore_content)
2424             break;
2425
2426           if (state != 3)
2427             goto err_label;
2428           state = 4;
2429           lr_ignore_rest (ldfile, 1);
2430           break;
2431
2432         case tok_reorder_sections_after:
2433           /* Ignore the rest of the line if we don't need the input of
2434              this line.  */
2435           if (ignore_content)
2436             {
2437               lr_ignore_rest (ldfile, 0);
2438               break;
2439             }
2440
2441           if (state == 1)
2442             {
2443               lr_error (ldfile, _("%s: missing `order_end' keyword"),
2444                         "LC_COLLATE");
2445               state = 2;
2446
2447               /* Handle ellipsis at end of list.  */
2448               if (was_ellipsis != tok_none)
2449                 {
2450                   handle_ellipsis (ldfile, NULL, was_ellipsis, charmap,
2451                                    repertoire, collate);
2452                   was_ellipsis = tok_none;
2453                 }
2454             }
2455           else if (state == 3)
2456             {
2457               error (0, 0, _("%s: missing `reorder-end' keyword"),
2458                      "LC_COLLATE");
2459               state = 4;
2460             }
2461           else if (state != 2 && state != 4)
2462             goto err_label;
2463           state = 5;
2464
2465           /* Get the name of the sections we are adding after.  */
2466           arg = lr_token (ldfile, charmap, repertoire);
2467           if (arg->tok == tok_bsymbol)
2468             {
2469               /* Now find a section with this name.  */
2470               struct section_list *runp = collate->sections;
2471
2472               while (runp != NULL)
2473                 {
2474                   if (runp->name != NULL
2475                       && strlen (runp->name) == arg->val.str.lenmb
2476                       && memcmp (runp->name, arg->val.str.startmb,
2477                                  arg->val.str.lenmb) == 0)
2478                     break;
2479
2480                   runp = runp->next;
2481                 }
2482
2483               if (runp != NULL)
2484                 collate->current_section = runp;
2485               else
2486                 {
2487                   /* This is bad.  The section after which we have to
2488                      reorder does not exist.  Therefore we cannot
2489                      process the whole rest of this reorder
2490                      specification.  */
2491                   lr_error (ldfile, _("%s: section `%.*s' not known"),
2492                             "LC_COLLATE", (int) arg->val.str.lenmb,
2493                             arg->val.str.startmb);
2494
2495                   do
2496                     {
2497                       lr_ignore_rest (ldfile, 0);
2498
2499                       now = lr_token (ldfile, charmap, NULL);
2500                     }
2501                   while (now->tok == tok_reorder_sections_after
2502                          || now->tok == tok_reorder_sections_end
2503                          || now->tok == tok_end);
2504
2505                   /* Process the token we just saw.  */
2506                   nowtok = now->tok;
2507                   continue;
2508                 }
2509             }
2510           else
2511             /* This must not happen.  */
2512             goto err_label;
2513           break;
2514
2515         case tok_reorder_sections_end:
2516           /* Ignore the rest of the line if we don't need the input of
2517              this line.  */
2518           if (ignore_content)
2519             break;
2520
2521           if (state != 5)
2522             goto err_label;
2523           state = 6;
2524           lr_ignore_rest (ldfile, 1);
2525           break;
2526
2527         case tok_bsymbol:
2528           /* Ignore the rest of the line if we don't need the input of
2529              this line.  */
2530           if (ignore_content)
2531             {
2532               lr_ignore_rest (ldfile, 0);
2533               break;
2534             }
2535
2536           if (state != 1 && state != 3)
2537             goto err_label;
2538
2539           if (state == 3)
2540             {
2541               /* It is possible that we already have this collation sequence.
2542                  In this case we move the entry.  */
2543               struct element_t *seqp;
2544
2545               /* If the symbol after which we have to insert was not found
2546                  ignore all entries.  */
2547               if (collate->cursor == NULL)
2548                 {
2549                   lr_ignore_rest (ldfile, 0);
2550                   break;
2551                 }
2552
2553               if (find_entry (&collate->seq_table, arg->val.str.startmb,
2554                               arg->val.str.lenmb, (void **) &seqp) == 0)
2555                 {
2556                   /* Remove the entry from the old position.  */
2557                   if (seqp->last == NULL)
2558                     collate->start = seqp->next;
2559                   else
2560                     seqp->last->next = seqp->next;
2561                   if (seqp->next != NULL)
2562                     seqp->next->last = seqp->last;
2563
2564                   /* We also have to check whether this entry is the
2565                      first or last of a section.  */
2566                   if (seqp->section->first == seqp)
2567                     {
2568                       if (seqp->section->first == seqp->section->last)
2569                         /* This section has no content anymore.  */
2570                         seqp->section->first = seqp->section->last = NULL;
2571                       else
2572                         seqp->section->first = seqp->next;
2573                     }
2574                   else if (seqp->section->last == seqp)
2575                     seqp->section->last = seqp->last;
2576
2577                   /* Now insert it in the new place.  */
2578                   seqp->next = collate->cursor->next;
2579                   seqp->last = collate->cursor;
2580                   collate->cursor->next = seqp;
2581                   if (seqp->next != NULL)
2582                     seqp->next->last = seqp;
2583
2584                   seqp->section = collate->cursor->section;
2585                   if (seqp->section->last == collate->cursor)
2586                     seqp->section->last = seqp;
2587
2588                   break;
2589                 }
2590
2591               /* Otherwise we just add a new entry.  */
2592             }
2593           else if (state == 5)
2594             {
2595               /* We are reordering sections.  Find the named section.  */
2596               struct section_list *runp = collate->sections;
2597               struct section_list *prevp = NULL;
2598
2599               while (runp != NULL)
2600                 {
2601                   if (runp->name != NULL
2602                       && strlen (runp->name) == arg->val.str.lenmb
2603                       && memcmp (runp->name, arg->val.str.startmb,
2604                                  arg->val.str.lenmb) == 0)
2605                     break;
2606
2607                   prevp = runp;
2608                   runp = runp->next;
2609                 }
2610
2611               if (runp == NULL)
2612                 {
2613                   lr_error (ldfile, _("%s: section `%.*s' not known"),
2614                             "LC_COLLATE", (int) arg->val.str.lenmb,
2615                             arg->val.str.startmb);
2616                   lr_ignore_rest (ldfile, 0);
2617                 }
2618               else
2619                 {
2620                   if (runp != collate->current_section)
2621                     {
2622                       /* Remove the named section from the old place and
2623                          insert it in the new one.  */
2624                       prevp->next = runp->next;
2625
2626                       runp->next = collate->current_section->next;
2627                       collate->current_section->next = runp;
2628                       collate->current_section = runp;
2629                     }
2630
2631                   /* Process the rest of the line which might change
2632                      the collation rules.  */
2633                   arg = lr_token (ldfile, charmap, repertoire);
2634                   if (arg->tok != tok_eof && arg->tok != tok_eol)
2635                     read_directions (ldfile, arg, charmap, repertoire,
2636                                      collate);
2637                 }
2638               break;
2639             }
2640           else if (was_ellipsis != tok_none)
2641             {
2642               /* Using the information in the `ellipsis_weight'
2643                  element and this and the last value we have to handle
2644                  the ellipsis now.  */
2645               assert (state == 1);
2646
2647               handle_ellipsis (ldfile, arg, was_ellipsis, charmap, repertoire,
2648                                collate);
2649
2650               /* Remember that we processed the ellipsis.  */
2651               was_ellipsis = tok_none;
2652
2653               /* And don't add the value a second time.  */
2654               break;
2655             }
2656
2657           /* Now insert in the new place.  */
2658           insert_value (ldfile, arg, charmap, repertoire, collate);
2659           break;
2660
2661         case tok_undefined:
2662           /* Ignore the rest of the line if we don't need the input of
2663              this line.  */
2664           if (ignore_content)
2665             {
2666               lr_ignore_rest (ldfile, 0);
2667               break;
2668             }
2669
2670           if (state != 1)
2671             goto err_label;
2672
2673           if (was_ellipsis != tok_none)
2674             {
2675               lr_error (ldfile,
2676                         _("%s: cannot have `%s' as end of ellipsis range"),
2677                         "LC_COLLATE", "UNDEFINED");
2678
2679               unlink_element (collate);
2680               was_ellipsis = tok_none;
2681             }
2682
2683           /* See whether UNDEFINED already appeared somewhere.  */
2684           if (collate->undefined.next != NULL
2685               || (collate->cursor != NULL
2686                   && collate->undefined.next == collate->cursor))
2687             {
2688               lr_error (ldfile,
2689                         _("%s: order for `%.*s' already defined at %s:%zu"),
2690                         "LC_COLLATE", 9, "UNDEFINED", collate->undefined.file,
2691                         collate->undefined.line);
2692               lr_ignore_rest (ldfile, 0);
2693             }
2694           else
2695             /* Parse the weights.  */
2696              insert_weights (ldfile, &collate->undefined, charmap,
2697                              repertoire, collate, tok_none);
2698           break;
2699
2700         case tok_ellipsis2:
2701         case tok_ellipsis3:
2702         case tok_ellipsis4:
2703           /* This is the symbolic (decimal or hexadecimal) or absolute
2704              ellipsis.  */
2705           if (was_ellipsis != tok_none)
2706             goto err_label;
2707
2708           if (state != 1 && state != 3)
2709             goto err_label;
2710
2711           was_ellipsis = nowtok;
2712
2713           insert_weights (ldfile, &collate->ellipsis_weight, charmap,
2714                           repertoire, collate, nowtok);
2715           break;
2716
2717         case tok_end:
2718           /* Next we assume `LC_COLLATE'.  */
2719           if (!ignore_content)
2720             {
2721               if (state == 0)
2722                 /* We must either see a copy statement or have
2723                    ordering values.  */
2724                 lr_error (ldfile,
2725                           _("%s: empty category description not allowed"),
2726                           "LC_COLLATE");
2727               else if (state == 1)
2728                 {
2729                   lr_error (ldfile, _("%s: missing `order_end' keyword"),
2730                             "LC_COLLATE");
2731
2732                   /* Handle ellipsis at end of list.  */
2733                   if (was_ellipsis != tok_none)
2734                     {
2735                       handle_ellipsis (ldfile, NULL, was_ellipsis, charmap,
2736                                        repertoire, collate);
2737                       was_ellipsis = tok_none;
2738                     }
2739                 }
2740               else if (state == 3)
2741                 error (0, 0, _("%s: missing `reorder-end' keyword"),
2742                        "LC_COLLATE");
2743               else if (state == 5)
2744                 error (0, 0, _("%s: missing `reorder-sections-end' keyword"),
2745                        "LC_COLLATE");
2746             }
2747           arg = lr_token (ldfile, charmap, NULL);
2748           if (arg->tok == tok_eof)
2749             break;
2750           if (arg->tok == tok_eol)
2751             lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
2752           else if (arg->tok != tok_lc_collate)
2753             lr_error (ldfile, _("\
2754 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2755           lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
2756           return;
2757
2758         default:
2759         err_label:
2760           SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2761         }
2762
2763       /* Prepare for the next round.  */
2764       now = lr_token (ldfile, charmap, NULL);
2765       nowtok = now->tok;
2766     }
2767
2768   /* When we come here we reached the end of the file.  */
2769   lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2770 }