]> sourceware.org Git - glibc.git/blob - locale/programs/ld-ctype.c
Update.
[glibc.git] / locale / programs / ld-ctype.c
1 /* Copyright (C) 1995-1999, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
23
24 #include <alloca.h>
25 #include <byteswap.h>
26 #include <endian.h>
27 #include <errno.h>
28 #include <limits.h>
29 #include <obstack.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <wchar.h>
33 #include <wctype.h>
34 #include <sys/uio.h>
35
36 #include "charmap.h"
37 #include "localeinfo.h"
38 #include "langinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
41 #include "locfile.h"
42 #include "localedef.h"
43
44 #include <assert.h>
45
46
#ifdef PREDEFINED_CLASSES
/* These are the extra bits not in wctype.h since these are not preallocated
   classes.  They occupy the three topmost bits of a 32-bit class mask.
   NOTE(review): `1 << 31' shifts into the sign bit when `int' is 32 bits
   wide; confirm that all users treat the result as an unsigned mask.  */
# define _ISwspecial1 (1 << 29)
# define _ISwspecial2 (1 << 30)
# define _ISwspecial3 (1 << 31)
#endif
54
55
56 /* The bit used for representing a special class. */
/* BITPOS yields the zero-based position of a class token relative to
   `tok_upper'.  */
#define BITPOS(class) ((class) - tok_upper)
/* Mask for CLASS built with `_ISbit' (used with `class256_collection').  */
#define BIT(class) (_ISbit (BITPOS (class)))
/* Mask for CLASS built with `_ISwbit' (used with `class_collection').  */
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Lvalue for the entry belonging to character VALUE in the growable table
   CTYPE->COLLECTION.  IDX is pasted textually after the member names, so
   it is either empty or a subscript such as `[n]'.  The lookup is done by
   `find_idx', which receives the table together with its `_max' and `_act'
   bookkeeping members.  */
#define ELEM(ctype, collection, idx, value)				      \
  *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx,    \
	     &ctype->collection##_act idx, value)
64
65
66 /* To be compatible with former implementations we for now restrict
67 the number of bits for character classes to 16. When compatibility
68 is not necessary anymore increase the number to 32. */
69 #define char_class_t uint16_t
70 #define char_class32_t uint32_t
71
72
73 /* Type to describe a transliteration action. We have a possibly
74 multiple character from-string and a set of multiple character
75 to-strings. All are 32bit values since this is what is used in
76 the gconv functions. */
/* One replacement string of a transliteration rule.  Several of these can
   be chained through `next'.  */
struct translit_to_t
{
  /* The replacement string as 32-bit values.  */
  uint32_t *str;

  /* Next entry in the chain, if any.  */
  struct translit_to_t *next;
};
83
/* One transliteration rule: a from-string plus a chain of to-strings.  */
struct translit_t
{
  /* The string to be replaced, as 32-bit values.  */
  uint32_t *from;

  /* Presumably the file and line where the rule was defined (for
     diagnostics) -- confirm against the code that fills these in.  */
  const char *fname;
  size_t lineno;

  /* Head of the chain of replacement strings.  */
  struct translit_to_t *to;

  /* Next rule in the chain, if any.  */
  struct translit_t *next;
};
95
/* Entry of the `translit_ignore' chain in `struct locale_ctype_t'.
   NOTE(review): `from', `to' and `step' look like a character range with
   a stride; confirm against the parser that creates these entries.  */
struct translit_ignore_t
{
  uint32_t from;
  uint32_t to;
  uint32_t step;

  /* Presumably the file and line of the definition -- confirm.  */
  const char *fname;
  size_t lineno;

  /* Next entry in the chain, if any.  */
  struct translit_ignore_t *next;
};
107
108
109 /* The real definition of the struct for the LC_CTYPE locale. */
/* All state collected for the LC_CTYPE category: the data gathered while
   the definition is processed, plus the arrays that `ctype_output' writes
   to the locale file.  */
struct locale_ctype_t
{
  /* Character values known so far.  `ctype_startup' seeds the first 256
     entries with the identity; entry N names the character described by
     `class_collection[N]'.  `charnames_max' is the allocated size and
     `charnames_act' the number of entries in use.  */
  uint32_t *charnames;
  size_t charnames_max;
  size_t charnames_act;

  /* Repertoire map used to translate symbolic names to UCS4 values; may
     stay NULL when no repertoire name is known.  */
  struct repertoire_t *repertoire;

  /* We will allow up to 8 * sizeof (uint32_t) character classes.  */
#define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
  /* Number of registered classes and their names, in bit-position order.  */
  size_t nr_charclass;
  const char *classnames[MAX_NR_CHARCLASS];
  uint32_t last_class_char;
  /* Class masks for the 256 single-byte values (`_ISbit' layout).  */
  uint32_t class256_collection[256];
  /* Class masks for the characters listed in `charnames' (`_ISwbit'
     layout), with allocated size and used size.  */
  uint32_t *class_collection;
  size_t class_collection_max;
  size_t class_collection_act;
  uint32_t class_done;

  /* Input digits in multibyte and wide character form.  `ctype_finish'
     keeps both arrays organized in complete groups of ten.  */
  struct charseq **mbdigits;
  size_t mbdigits_act;
  size_t mbdigits_max;
  uint32_t *wcdigits;
  size_t wcdigits_act;
  size_t wcdigits_max;

  /* Output digits, exactly ten of each form.  */
  struct charseq *mboutdigits[10];
  uint32_t wcoutdigits[10];
  size_t outdigits_act;

  /* If the following number ever turns out to be too small simply
     increase it.  But I doubt it will.  --drepper@gnu */
#define MAX_NR_CHARMAP 16
  /* Names of the registered maps and, per map, the growable mapping table
     with its allocated and used size.  `map256_collection' holds the
     single-byte tables for the first two maps.  */
  const char *mapnames[MAX_NR_CHARMAP];
  uint32_t *map_collection[MAX_NR_CHARMAP];
  uint32_t map256_collection[2][256];
  size_t map_collection_max[MAX_NR_CHARMAP];
  size_t map_collection_act[MAX_NR_CHARMAP];
  size_t map_collection_nr;
  size_t last_map_idx;
  int tomap_done[MAX_NR_CHARMAP];

  /* Transliteration information.  */
  const char *translit_copy_locale;
  const char *translit_copy_repertoire;
  struct translit_t *translit;
  struct translit_ignore_t *translit_ignore;

  uint32_t *default_missing;
  const char *default_missing_file;
  size_t default_missing_lineno;

  /* The arrays for the binary representation.  `ctype_output' emits the
     wide character tables with `plane_size * plane_cnt' entries each.  */
  uint32_t plane_size;
  uint32_t plane_cnt;
  char_class_t *ctype_b;
  char_class32_t *ctype32_b;
  uint32_t *names;
  uint32_t **map;
  uint32_t **map32;
  uint32_t *class_name_ptr;
  uint32_t *map_name_ptr;
  unsigned char *width;
  uint32_t mb_cur_max;
  /* Name of the character set, taken from the charmap in `ctype_finish'.  */
  const char *codeset_name;
  uint32_t translit_hash_size;
  uint32_t translit_hash_layers;
  uint32_t *translit_from_idx;
  uint32_t *translit_from_tbl;
  uint32_t *translit_to_idx;
  uint32_t *translit_to_tbl;
  size_t translit_idx_size;
  size_t translit_from_tbl_size;
  size_t translit_to_tbl_size;

  /* Obstack initialized by `ctype_startup'.  */
  struct obstack mempool;
};
187
188
189 #define obstack_chunk_alloc xmalloc
190 #define obstack_chunk_free free
191
192
193 /* Prototypes for local functions. */
/* Create and initialize the LC_CTYPE data of LOCALE (no-op when
   IGNORE_CONTENT is nonzero).  */
static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
			   struct charmap_t *charmap, int ignore_content);
/* Register a new character class NAME.  */
static void ctype_class_new (struct linereader *lr,
			     struct locale_ctype_t *ctype, const char *name);
/* Register a new character map NAME and allocate its table.  */
static void ctype_map_new (struct linereader *lr,
			   struct locale_ctype_t *ctype,
			   const char *name, struct charmap_t *charmap);
/* Look up (and if necessary create) the slot for character IDX.
   NOTE(review): the definition spells the last parameter `uint32_t idx';
   the two are only the same type where `uint32_t' is `unsigned int'.  */
static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
			   size_t *max, size_t *act, unsigned int idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
				struct charmap_t *charmap,
				struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
			     struct charmap_t *charmap,
			     struct repertoire_t *repertoire);
209
210
/* Spelled-out names of the ten decimal digits.  `ctype_finish' tries
   these as charmap symbol names when a digit cannot be found under its
   one-character name.  */
static const char *longnames[] =
{
  "zero", "one", "two", "three", "four",
  "five", "six", "seven", "eight", "nine"
};
/* The ten ASCII digits; used both as one-character symbol names and as
   the last-resort byte values for the default input digits.  */
static const unsigned char digits[] = "0123456789";
217
218
219 static void
220 ctype_startup (struct linereader *lr, struct localedef_t *locale,
221 struct charmap_t *charmap, int ignore_content)
222 {
223 unsigned int cnt;
224 struct locale_ctype_t *ctype;
225
226 if (!ignore_content)
227 {
228 /* Allocate the needed room. */
229 locale->categories[LC_CTYPE].ctype = ctype =
230 (struct locale_ctype_t *) xcalloc (1, sizeof (struct locale_ctype_t));
231
232 /* We have seen no names yet. */
233 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
234 ctype->charnames =
235 (unsigned int *) xmalloc (ctype->charnames_max
236 * sizeof (unsigned int));
237 for (cnt = 0; cnt < 256; ++cnt)
238 ctype->charnames[cnt] = cnt;
239 ctype->charnames_act = 256;
240
241 /* Fill character class information. */
242 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
243 /* The order of the following instructions determines the bit
244 positions! */
245 ctype_class_new (lr, ctype, "upper");
246 ctype_class_new (lr, ctype, "lower");
247 ctype_class_new (lr, ctype, "alpha");
248 ctype_class_new (lr, ctype, "digit");
249 ctype_class_new (lr, ctype, "xdigit");
250 ctype_class_new (lr, ctype, "space");
251 ctype_class_new (lr, ctype, "print");
252 ctype_class_new (lr, ctype, "graph");
253 ctype_class_new (lr, ctype, "blank");
254 ctype_class_new (lr, ctype, "cntrl");
255 ctype_class_new (lr, ctype, "punct");
256 ctype_class_new (lr, ctype, "alnum");
257 #ifdef PREDEFINED_CLASSES
258 /* The following are extensions from ISO 14652. */
259 ctype_class_new (lr, ctype, "left_to_right");
260 ctype_class_new (lr, ctype, "right_to_left");
261 ctype_class_new (lr, ctype, "num_terminator");
262 ctype_class_new (lr, ctype, "num_separator");
263 ctype_class_new (lr, ctype, "segment_separator");
264 ctype_class_new (lr, ctype, "block_separator");
265 ctype_class_new (lr, ctype, "direction_control");
266 ctype_class_new (lr, ctype, "sym_swap_layout");
267 ctype_class_new (lr, ctype, "char_shape_selector");
268 ctype_class_new (lr, ctype, "num_shape_selector");
269 ctype_class_new (lr, ctype, "non_spacing");
270 ctype_class_new (lr, ctype, "non_spacing_level3");
271 ctype_class_new (lr, ctype, "normal_connect");
272 ctype_class_new (lr, ctype, "r_connect");
273 ctype_class_new (lr, ctype, "no_connect");
274 ctype_class_new (lr, ctype, "no_connect-space");
275 ctype_class_new (lr, ctype, "vowel_connect");
276 #endif
277
278 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
279 ctype->class_collection
280 = (uint32_t *) xcalloc (sizeof (unsigned long int),
281 ctype->class_collection_max);
282 ctype->class_collection_act = 256;
283
284 /* Fill character map information. */
285 ctype->last_map_idx = MAX_NR_CHARMAP;
286 ctype_map_new (lr, ctype, "toupper", charmap);
287 ctype_map_new (lr, ctype, "tolower", charmap);
288 #ifdef PREDEFINED_CLASSES
289 ctype_map_new (lr, ctype, "tosymmetric", charmap);
290 #endif
291
292 /* Fill first 256 entries in `toXXX' arrays. */
293 for (cnt = 0; cnt < 256; ++cnt)
294 {
295 ctype->map_collection[0][cnt] = cnt;
296 ctype->map_collection[1][cnt] = cnt;
297 #ifdef PREDEFINED_CLASSES
298 ctype->map_collection[2][cnt] = cnt;
299 #endif
300 ctype->map256_collection[0][cnt] = cnt;
301 ctype->map256_collection[1][cnt] = cnt;
302 }
303
304 obstack_init (&ctype->mempool);
305 }
306 }
307
308
309 void
310 ctype_finish (struct localedef_t *locale, struct charmap_t *charmap)
311 {
312 /* See POSIX.2, table 2-6 for the meaning of the following table. */
313 #define NCLASS 12
314 static const struct
315 {
316 const char *name;
317 const char allow[NCLASS];
318 }
319 valid_table[NCLASS] =
320 {
321 /* The order is important. See token.h for more information.
322 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
323 { "upper", "--MX-XDDXXX-" },
324 { "lower", "--MX-XDDXXX-" },
325 { "alpha", "---X-XDDXXX-" },
326 { "digit", "XXX--XDDXXX-" },
327 { "xdigit", "-----XDDXXX-" },
328 { "space", "XXXXX------X" },
329 { "print", "---------X--" },
330 { "graph", "---------X--" },
331 { "blank", "XXXXXM-----X" },
332 { "cntrl", "XXXXX-XX--XX" },
333 { "punct", "XXXXX-DD-X-X" },
334 { "alnum", "-----XDDXXX-" }
335 };
336 size_t cnt;
337 int cls1, cls2;
338 uint32_t space_value;
339 struct charseq *space_seq;
340 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
341 int warned;
342
343 /* Now resolve copying and also handle completely missing definitions. */
344 if (ctype == NULL)
345 {
346 const char *repertoire_name;
347
348 /* First see whether we were supposed to copy. If yes, find the
349 actual definition. */
350 if (locale->copy_name[LC_CTYPE] != NULL)
351 {
352 /* Find the copying locale. This has to happen transitively since
353 the locale we are copying from might also copying another one. */
354 struct localedef_t *from = locale;
355
356 do
357 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
358 from->repertoire_name, charmap);
359 while (from->categories[LC_CTYPE].ctype == NULL
360 && from->copy_name[LC_CTYPE] != NULL);
361
362 ctype = locale->categories[LC_CTYPE].ctype
363 = from->categories[LC_CTYPE].ctype;
364 }
365
366 /* If there is still no definition issue an warning and create an
367 empty one. */
368 if (ctype == NULL)
369 {
370 if (! be_quiet)
371 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
372 ctype_startup (NULL, locale, charmap, 0);
373 ctype = locale->categories[LC_CTYPE].ctype;
374 }
375
376 /* Get the repertoire we have to use. */
377 repertoire_name = locale->repertoire_name ?: repertoire_global;
378 if (repertoire_name != NULL)
379 ctype->repertoire = repertoire_read (repertoire_name);
380 }
381
382 /* We need the name of the currently used 8-bit character set to
383 make correct conversion between this 8-bit representation and the
384 ISO 10646 character set used internally for wide characters. */
385 ctype->codeset_name = charmap->code_set_name;
386 if (ctype->codeset_name == NULL)
387 {
388 if (! be_quiet)
389 error (0, 0, "no character set name specified in charmap");
390 ctype->codeset_name = "//UNKNOWN//";
391 }
392
393 /* Set default value for classes not specified. */
394 set_class_defaults (ctype, charmap, ctype->repertoire);
395
396 /* Check according to table. */
397 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
398 {
399 uint32_t tmp = ctype->class_collection[cnt];
400
401 if (tmp != 0)
402 {
403 for (cls1 = 0; cls1 < NCLASS; ++cls1)
404 if ((tmp & _ISwbit (cls1)) != 0)
405 for (cls2 = 0; cls2 < NCLASS; ++cls2)
406 if (valid_table[cls1].allow[cls2] != '-')
407 {
408 int eq = (tmp & _ISwbit (cls2)) != 0;
409 switch (valid_table[cls1].allow[cls2])
410 {
411 case 'M':
412 if (!eq)
413 {
414 uint32_t value = ctype->charnames[cnt];
415
416 if (!be_quiet)
417 error (0, 0, _("\
418 character L'\\u%0*x' in class `%s' must be in class `%s'"),
419 value > 0xffff ? 8 : 4, value,
420 valid_table[cls1].name,
421 valid_table[cls2].name);
422 }
423 break;
424
425 case 'X':
426 if (eq)
427 {
428 uint32_t value = ctype->charnames[cnt];
429
430 if (!be_quiet)
431 error (0, 0, _("\
432 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
433 value > 0xffff ? 8 : 4, value,
434 valid_table[cls1].name,
435 valid_table[cls2].name);
436 }
437 break;
438
439 case 'D':
440 ctype->class_collection[cnt] |= _ISwbit (cls2);
441 break;
442
443 default:
444 error (5, 0, _("internal error in %s, line %u"),
445 __FUNCTION__, __LINE__);
446 }
447 }
448 }
449 }
450
451 for (cnt = 0; cnt < 256; ++cnt)
452 {
453 uint32_t tmp = ctype->class256_collection[cnt];
454
455 if (tmp != 0)
456 {
457 for (cls1 = 0; cls1 < NCLASS; ++cls1)
458 if ((tmp & _ISbit (cls1)) != 0)
459 for (cls2 = 0; cls2 < NCLASS; ++cls2)
460 if (valid_table[cls1].allow[cls2] != '-')
461 {
462 int eq = (tmp & _ISbit (cls2)) != 0;
463 switch (valid_table[cls1].allow[cls2])
464 {
465 case 'M':
466 if (!eq)
467 {
468 char buf[17];
469
470 snprintf (buf, sizeof buf, "\\%Zo", cnt);
471
472 if (!be_quiet)
473 error (0, 0, _("\
474 character '%s' in class `%s' must be in class `%s'"),
475 buf, valid_table[cls1].name,
476 valid_table[cls2].name);
477 }
478 break;
479
480 case 'X':
481 if (eq)
482 {
483 char buf[17];
484
485 snprintf (buf, sizeof buf, "\\%Zo", cnt);
486
487 if (!be_quiet)
488 error (0, 0, _("\
489 character '%s' in class `%s' must not be in class `%s'"),
490 buf, valid_table[cls1].name,
491 valid_table[cls2].name);
492 }
493 break;
494
495 case 'D':
496 ctype->class256_collection[cnt] |= _ISbit (cls2);
497 break;
498
499 default:
500 error (5, 0, _("internal error in %s, line %u"),
501 __FUNCTION__, __LINE__);
502 }
503 }
504 }
505 }
506
507 /* ... and now test <SP> as a special case. */
508 space_value = 32;
509 if (((cnt = BITPOS (tok_space),
510 (ELEM (ctype, class_collection, , space_value)
511 & BITw (tok_space)) == 0)
512 || (cnt = BITPOS (tok_blank),
513 (ELEM (ctype, class_collection, , space_value)
514 & BITw (tok_blank)) == 0)))
515 {
516 if (!be_quiet)
517 error (0, 0, _("<SP> character not in class `%s'"),
518 valid_table[cnt].name);
519 }
520 else if (((cnt = BITPOS (tok_punct),
521 (ELEM (ctype, class_collection, , space_value)
522 & BITw (tok_punct)) != 0)
523 || (cnt = BITPOS (tok_graph),
524 (ELEM (ctype, class_collection, , space_value)
525 & BITw (tok_graph))
526 != 0)))
527 {
528 if (!be_quiet)
529 error (0, 0, _("<SP> character must not be in class `%s'"),
530 valid_table[cnt].name);
531 }
532 else
533 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
534
535 space_seq = charmap_find_value (charmap, "SP", 2);
536 if (space_seq == NULL)
537 space_seq = charmap_find_value (charmap, "space", 5);
538 if (space_seq == NULL)
539 space_seq = charmap_find_value (charmap, "U00000020", 5);
540 if (space_seq == NULL || space_seq->nbytes != 1)
541 {
542 if (!be_quiet)
543 error (0, 0, _("character <SP> not defined in character map"));
544 }
545 else if (((cnt = BITPOS (tok_space),
546 (ctype->class256_collection[space_seq->bytes[0]]
547 & BIT (tok_space)) == 0)
548 || (cnt = BITPOS (tok_blank),
549 (ctype->class256_collection[space_seq->bytes[0]]
550 & BIT (tok_blank)) == 0)))
551 {
552 if (!be_quiet)
553 error (0, 0, _("<SP> character not in class `%s'"),
554 valid_table[cnt].name);
555 }
556 else if (((cnt = BITPOS (tok_punct),
557 (ctype->class256_collection[space_seq->bytes[0]]
558 & BIT (tok_punct)) != 0)
559 || (cnt = BITPOS (tok_graph),
560 (ctype->class256_collection[space_seq->bytes[0]]
561 & BIT (tok_graph)) != 0)))
562 {
563 if (!be_quiet)
564 error (0, 0, _("<SP> character must not be in class `%s'"),
565 valid_table[cnt].name);
566 }
567 else
568 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
569
570 /* Now that the tests are done make sure the name array contains all
571 characters which are handled in the WIDTH section of the
572 character set definition file. */
573 if (charmap->width_rules != NULL)
574 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
575 {
576 unsigned char bytes[charmap->mb_cur_max];
577 int nbytes = charmap->width_rules[cnt].from->nbytes;
578
579 /* We have the range of character for which the width is
580 specified described using byte sequences of the multibyte
581 charset. We have to convert this to UCS4 now. And we
582 cannot simply convert the beginning and the end of the
583 sequence, we have to iterate over the byte sequence and
584 convert it for every single character. */
585 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
586
587 while (nbytes < charmap->width_rules[cnt].to->nbytes
588 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
589 nbytes) <= 0)
590 {
591 /* Find the UCS value for `bytes'. */
592 int inner;
593 uint32_t wch;
594 struct charseq *seq = charmap_find_symbol (charmap, bytes, nbytes);
595
596 if (seq == NULL)
597 wch = ILLEGAL_CHAR_VALUE;
598 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
599 wch = seq->ucs4;
600 else
601 wch = repertoire_find_value (ctype->repertoire, seq->name,
602 strlen (seq->name));
603
604 if (wch != ILLEGAL_CHAR_VALUE)
605 /* We are only interested in the side-effects of the
606 `find_idx' call. It will add appropriate entries in
607 the name array if this is necessary. */
608 (void) find_idx (ctype, NULL, NULL, NULL, wch);
609
610 /* "Increment" the bytes sequence. */
611 inner = nbytes - 1;
612 while (inner >= 0 && bytes[inner] == 0xff)
613 --inner;
614
615 if (inner < 0)
616 {
617 /* We have to extend the byte sequence. */
618 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
619 break;
620
621 bytes[0] = 1;
622 memset (&bytes[1], 0, nbytes);
623 ++nbytes;
624 }
625 else
626 {
627 ++bytes[inner];
628 while (++inner < nbytes)
629 bytes[inner] = 0;
630 }
631 }
632 }
633
634 /* There must be a multiple of 10 digits. */
635 if (ctype->mbdigits_act % 10 != 0)
636 {
637 assert (ctype->mbdigits_act == ctype->wcdigits_act);
638 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
639 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
640 error (0, 0, _("`digit' category has not entries in groups of ten"));
641 }
642
643 /* Check the input digits. There must be a multiple of ten available.
644 In each group it could be that one or the other character is missing.
645 In this case the whole group must be removed. */
646 cnt = 0;
647 while (cnt < ctype->mbdigits_act)
648 {
649 size_t inner;
650 for (inner = 0; inner < 10; ++inner)
651 if (ctype->mbdigits[cnt + inner] == NULL)
652 break;
653
654 if (inner == 10)
655 cnt += 10;
656 else
657 {
658 /* Remove the group. */
659 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
660 ((ctype->wcdigits_act - cnt - 10)
661 * sizeof (ctype->mbdigits[0])));
662 ctype->mbdigits_act -= 10;
663 }
664 }
665
666 /* If no input digits are given use the default. */
667 if (ctype->mbdigits_act == 0)
668 {
669 if (ctype->mbdigits_max == 0)
670 {
671 ctype->mbdigits = obstack_alloc (&charmap->mem_pool,
672 10 * sizeof (struct charseq *));
673 ctype->mbdigits_max = 10;
674 }
675
676 for (cnt = 0; cnt < 10; ++cnt)
677 {
678 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
679 digits + cnt, 1);
680 if (ctype->mbdigits[cnt] == NULL)
681 {
682 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
683 longnames[cnt],
684 strlen (longnames[cnt]));
685 if (ctype->mbdigits[cnt] == NULL)
686 {
687 /* Hum, this ain't good. */
688 error (0, 0, _("\
689 no input digits defined and none of the standard names in the charmap"));
690
691 ctype->mbdigits[cnt] = obstack_alloc (&charmap->mem_pool,
692 sizeof (struct charseq) + 1);
693
694 /* This is better than nothing. */
695 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
696 ctype->mbdigits[cnt]->nbytes = 1;
697 }
698 }
699 }
700
701 ctype->mbdigits_act = 10;
702 }
703
704 /* Check the wide character input digits. There must be a multiple
705 of ten available. In each group it could be that one or the other
706 character is missing. In this case the whole group must be
707 removed. */
708 cnt = 0;
709 while (cnt < ctype->wcdigits_act)
710 {
711 size_t inner;
712 for (inner = 0; inner < 10; ++inner)
713 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
714 break;
715
716 if (inner == 10)
717 cnt += 10;
718 else
719 {
720 /* Remove the group. */
721 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
722 ((ctype->wcdigits_act - cnt - 10)
723 * sizeof (ctype->wcdigits[0])));
724 ctype->wcdigits_act -= 10;
725 }
726 }
727
728 /* If no input digits are given use the default. */
729 if (ctype->wcdigits_act == 0)
730 {
731 if (ctype->wcdigits_max == 0)
732 {
733 ctype->wcdigits = obstack_alloc (&charmap->mem_pool,
734 10 * sizeof (uint32_t));
735 ctype->wcdigits_max = 10;
736 }
737
738 for (cnt = 0; cnt < 10; ++cnt)
739 ctype->wcdigits[cnt] = L'0' + cnt;
740
741 ctype->mbdigits_act = 10;
742 }
743
744 /* Check the outdigits. */
745 warned = 0;
746 for (cnt = 0; cnt < 10; ++cnt)
747 if (ctype->mboutdigits[cnt] == NULL)
748 {
749 static struct charseq replace[2];
750
751 if (!warned)
752 {
753 error (0, 0, _("\
754 not all characters used in `outdigit' are available in the charmap"));
755 warned = 1;
756 }
757
758 replace[0].nbytes = 1;
759 replace[0].bytes[0] = '?';
760 replace[0].bytes[1] = '\0';
761 ctype->mboutdigits[cnt] = &replace[0];
762 }
763
764 warned = 0;
765 for (cnt = 0; cnt < 10; ++cnt)
766 if (ctype->wcoutdigits[cnt] == 0)
767 {
768 if (!warned)
769 {
770 error (0, 0, _("\
771 not all characters used in `outdigit' are available in the repertoire"));
772 warned = 1;
773 }
774
775 ctype->wcoutdigits[cnt] = L'?';
776 }
777 }
778
779
780 void
781 ctype_output (struct localedef_t *locale, struct charmap_t *charmap,
782 const char *output_path)
783 {
784 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
785 const size_t nelems = (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)
786 + (ctype->map_collection_nr - 2));
787 struct iovec iov[2 + nelems + ctype->nr_charclass
788 + ctype->map_collection_nr];
789 struct locale_file data;
790 uint32_t idx[nelems + 1];
791 size_t elem, cnt, offset, total;
792 char *cp;
793
794 /* Now prepare the output: Find the sizes of the table we can use. */
795 allocate_arrays (ctype, charmap, ctype->repertoire);
796
797 data.magic = LIMAGIC (LC_CTYPE);
798 data.n = nelems;
799 iov[0].iov_base = (void *) &data;
800 iov[0].iov_len = sizeof (data);
801
802 iov[1].iov_base = (void *) idx;
803 iov[1].iov_len = nelems * sizeof (uint32_t);
804
805 idx[0] = iov[0].iov_len + iov[1].iov_len;
806 offset = 0;
807
808 for (elem = 0; elem < nelems; ++elem)
809 {
810 if (elem < _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE))
811 switch (elem)
812 {
813 #define CTYPE_DATA(name, base, len) \
814 case _NL_ITEM_INDEX (name): \
815 iov[2 + elem + offset].iov_base = (base); \
816 iov[2 + elem + offset].iov_len = (len); \
817 if (elem + 1 < nelems) \
818 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
819 break
820
821 CTYPE_DATA (_NL_CTYPE_CLASS,
822 ctype->ctype_b,
823 (256 + 128) * sizeof (char_class_t));
824
825 CTYPE_DATA (_NL_CTYPE_TOUPPER,
826 ctype->map[0],
827 (256 + 128) * sizeof (uint32_t));
828 CTYPE_DATA (_NL_CTYPE_TOLOWER,
829 ctype->map[1],
830 (256 + 128) * sizeof (uint32_t));
831
832 CTYPE_DATA (_NL_CTYPE_TOUPPER32,
833 ctype->map32[0],
834 (ctype->plane_size * ctype->plane_cnt)
835 * sizeof (uint32_t));
836 CTYPE_DATA (_NL_CTYPE_TOLOWER32,
837 ctype->map32[1],
838 (ctype->plane_size * ctype->plane_cnt)
839 * sizeof (uint32_t));
840
841 CTYPE_DATA (_NL_CTYPE_CLASS32,
842 ctype->ctype32_b,
843 (ctype->plane_size * ctype->plane_cnt
844 * sizeof (char_class32_t)));
845
846 CTYPE_DATA (_NL_CTYPE_NAMES,
847 ctype->names, (ctype->plane_size * ctype->plane_cnt
848 * sizeof (uint32_t)));
849
850 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_SIZE,
851 &ctype->translit_hash_size, sizeof (uint32_t));
852 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_LAYERS,
853 &ctype->translit_hash_layers, sizeof (uint32_t));
854
855 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX,
856 ctype->translit_from_idx,
857 ctype->translit_idx_size);
858
859 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL,
860 ctype->translit_from_tbl,
861 ctype->translit_from_tbl_size);
862
863 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX,
864 ctype->translit_to_idx,
865 ctype->translit_idx_size);
866
867 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL,
868 ctype->translit_to_tbl, ctype->translit_to_tbl_size);
869
870 CTYPE_DATA (_NL_CTYPE_HASH_SIZE,
871 &ctype->plane_size, sizeof (uint32_t));
872 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS,
873 &ctype->plane_cnt, sizeof (uint32_t));
874
875 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
876 /* The class name array. */
877 total = 0;
878 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt, ++offset)
879 {
880 iov[2 + elem + offset].iov_base
881 = (void *) ctype->classnames[cnt];
882 iov[2 + elem + offset].iov_len
883 = strlen (ctype->classnames[cnt]) + 1;
884 total += iov[2 + elem + offset].iov_len;
885 }
886 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
887 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
888 total += 1 + (4 - ((total + 1) % 4));
889
890 idx[elem + 1] = idx[elem] + total;
891 break;
892
893 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
894 /* The class name array. */
895 total = 0;
896 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt, ++offset)
897 {
898 iov[2 + elem + offset].iov_base
899 = (void *) ctype->mapnames[cnt];
900 iov[2 + elem + offset].iov_len
901 = strlen (ctype->mapnames[cnt]) + 1;
902 total += iov[2 + elem + offset].iov_len;
903 }
904 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
905 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
906 total += 1 + (4 - ((total + 1) % 4));
907
908 idx[elem + 1] = idx[elem] + total;
909 break;
910
911 CTYPE_DATA (_NL_CTYPE_WIDTH,
912 ctype->width,
913 (ctype->plane_size * ctype->plane_cnt + 3) & ~3ul);
914
915 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX,
916 &ctype->mb_cur_max, sizeof (uint32_t));
917
918 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
919 total = strlen (ctype->codeset_name) + 1;
920 if (total % 4 == 0)
921 iov[2 + elem + offset].iov_base = (char *) ctype->codeset_name;
922 else
923 {
924 iov[2 + elem + offset].iov_base = alloca ((total + 3) & ~3);
925 memset (mempcpy (iov[2 + elem + offset].iov_base,
926 ctype->codeset_name, total),
927 '\0', 4 - (total & 3));
928 total = (total + 3) & ~3;
929 }
930 iov[2 + elem + offset].iov_len = total;
931 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
932 break;
933
934 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
935 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
936 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
937 *(uint32_t *) iov[2 + elem + offset].iov_base =
938 ctype->mbdigits_act / 10;
939 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
940 break;
941
942 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
943 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
944 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
945 *(uint32_t *) iov[2 + elem + offset].iov_base =
946 ctype->wcdigits_act / 10;
947 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
948 break;
949
950 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
951 /* Compute the length of all possible characters. For INDIGITS
952 there might be more than one. We simply concatenate all of
953 them with a NUL byte following. The NUL byte wouldn't be
954 necessary but it makes it easier for the user. */
955 total = 0;
956 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
957 cnt < ctype->mbdigits_act; cnt += 10)
958 total += ctype->mbdigits[cnt]->nbytes + 1;
959 iov[2 + elem + offset].iov_base = (char *) alloca (total);
960 iov[2 + elem + offset].iov_len = total;
961
962 cp = iov[2 + elem + offset].iov_base;
963 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
964 cnt < ctype->mbdigits_act; cnt += 10)
965 {
966 cp = mempcpy (cp, ctype->mbdigits[cnt]->bytes,
967 ctype->mbdigits[cnt]->nbytes);
968 *cp++ = '\0';
969 }
970 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
971 break;
972
973 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
974 /* Compute the length of all possible characters. For INDIGITS
975 there might be more than one. We simply concatenate all of
976 them with a NUL byte following. The NUL byte wouldn't be
977 necessary but it makes it easier for the user. */
978 cnt = elem - _NL_CTYPE_OUTDIGIT0_MB;
979 total = ctype->mboutdigits[cnt]->nbytes + 1;
980 iov[2 + elem + offset].iov_base = (char *) alloca (total);
981 iov[2 + elem + offset].iov_len = total;
982
983 *(char *) mempcpy (iov[2 + elem + offset].iov_base,
984 ctype->mbdigits[cnt]->bytes,
985 ctype->mbdigits[cnt]->nbytes) = '\0';
986 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
987 break;
988
989 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
990 total = ctype->wcdigits_act / 10;
991
992 iov[2 + elem + offset].iov_base =
993 (uint32_t *) alloca (total * sizeof (uint32_t));
994 iov[2 + elem + offset].iov_len = total * sizeof (uint32_t);
995
996 for (cnt = elem - _NL_CTYPE_INDIGITS0_WC;
997 cnt < ctype->wcdigits_act; cnt += 10)
998 ((uint32_t *) iov[2 + elem + offset].iov_base)[cnt / 10]
999 = ctype->wcdigits[cnt];
1000 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1001 break;
1002
1003 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
1004 cnt = elem - _NL_CTYPE_OUTDIGIT0_WC;
1005 iov[2 + elem + offset].iov_base = &ctype->wcoutdigits[cnt];
1006 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1007 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1008 break;
1009
1010 default:
1011 assert (! "unknown CTYPE element");
1012 }
1013 else
1014 {
1015 /* Handle extra maps. */
1016 size_t nr = (elem - _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)) + 2;
1017
1018 iov[2 + elem + offset].iov_base = ctype->map32[nr];
1019 iov[2 + elem + offset].iov_len = ((ctype->plane_size
1020 * ctype->plane_cnt)
1021 * sizeof (uint32_t));
1022
1023 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1024 }
1025 }
1026
1027 assert (2 + elem + offset == (nelems + ctype->nr_charclass
1028 + ctype->map_collection_nr + 2));
1029
1030 write_locale_data (output_path, "LC_CTYPE", 2 + elem + offset, iov);
1031 }
1032
1033
1034 /* Local functions. */
1035 static void
1036 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
1037 const char *name)
1038 {
1039 size_t cnt;
1040
1041 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1042 if (strcmp (ctype->classnames[cnt], name) == 0)
1043 break;
1044
1045 if (cnt < ctype->nr_charclass)
1046 {
1047 lr_error (lr, _("character class `%s' already defined"), name);
1048 return;
1049 }
1050
1051 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1052 /* Exit code 2 is prescribed in P1003.2b. */
1053 error (2, 0, _("\
1054 implementation limit: no more than %Zd character classes allowed"),
1055 MAX_NR_CHARCLASS);
1056
1057 ctype->classnames[ctype->nr_charclass++] = name;
1058 }
1059
1060
1061 static void
1062 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1063 const char *name, struct charmap_t *charmap)
1064 {
1065 size_t max_chars = 0;
1066 size_t cnt;
1067
1068 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1069 {
1070 if (strcmp (ctype->mapnames[cnt], name) == 0)
1071 break;
1072
1073 if (max_chars < ctype->map_collection_max[cnt])
1074 max_chars = ctype->map_collection_max[cnt];
1075 }
1076
1077 if (cnt < ctype->map_collection_nr)
1078 {
1079 lr_error (lr, _("character map `%s' already defined"), name);
1080 return;
1081 }
1082
1083 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1084 /* Exit code 2 is prescribed in P1003.2b. */
1085 error (2, 0, _("\
1086 implementation limit: no more than %d character maps allowed"),
1087 MAX_NR_CHARMAP);
1088
1089 ctype->mapnames[cnt] = name;
1090
1091 if (max_chars == 0)
1092 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1093 else
1094 ctype->map_collection_max[cnt] = max_chars;
1095
1096 ctype->map_collection[cnt] = (uint32_t *)
1097 xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]);
1098 ctype->map_collection_act[cnt] = 256;
1099
1100 ++ctype->map_collection_nr;
1101 }
1102
1103
1104 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1105 is possible if we only want to extend the name array. */
1106 static uint32_t *
1107 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1108 size_t *act, uint32_t idx)
1109 {
1110 size_t cnt;
1111
1112 if (idx < 256)
1113 return table == NULL ? NULL : &(*table)[idx];
1114
1115 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1116 if (ctype->charnames[cnt] == idx)
1117 break;
1118
1119 /* We have to distinguish two cases: the name is found or not. */
1120 if (cnt == ctype->charnames_act)
1121 {
1122 /* Extend the name array. */
1123 if (ctype->charnames_act == ctype->charnames_max)
1124 {
1125 ctype->charnames_max *= 2;
1126 ctype->charnames = (uint32_t *)
1127 xrealloc (ctype->charnames,
1128 sizeof (uint32_t) * ctype->charnames_max);
1129 }
1130 ctype->charnames[ctype->charnames_act++] = idx;
1131 }
1132
1133 if (table == NULL)
1134 /* We have done everything we are asked to do. */
1135 return NULL;
1136
1137 if (cnt >= *act)
1138 {
1139 if (cnt >= *max)
1140 {
1141 size_t old_max = *max;
1142 do
1143 *max *= 2;
1144 while (*max <= cnt);
1145
1146 *table =
1147 (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t));
1148 memset (&(*table)[old_max], '\0',
1149 (*max - old_max) * sizeof (uint32_t));
1150 }
1151
1152 *act = cnt + 1;
1153 }
1154
1155 return &(*table)[cnt];
1156 }
1157
1158
1159 static int
1160 get_character (struct token *now, struct charmap_t *charmap,
1161 struct repertoire_t *repertoire,
1162 struct charseq **seqp, uint32_t *wchp)
1163 {
1164 if (now->tok == tok_bsymbol)
1165 {
1166 /* This will hopefully be the normal case. */
1167 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1168 now->val.str.lenmb);
1169 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1170 now->val.str.lenmb);
1171 }
1172 else if (now->tok == tok_ucs4)
1173 {
1174 char utmp[10];
1175
1176 snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4);
1177 *seqp = charmap_find_value (charmap, utmp, 9);
1178
1179 if (*seqp == NULL)
1180 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1181
1182 if (*seqp == NULL)
1183 {
1184 /* Compute the value in the charmap from the UCS value. */
1185 const char *symbol = repertoire_find_symbol (repertoire,
1186 now->val.ucs4);
1187
1188 if (symbol == NULL)
1189 *seqp = NULL;
1190 else
1191 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1192
1193 if (*seqp == NULL)
1194 {
1195 if (repertoire != NULL)
1196 {
1197 /* Insert a negative entry. */
1198 static const struct charseq negative
1199 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1200 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1201 sizeof (uint32_t));
1202 *newp = now->val.ucs4;
1203
1204 insert_entry (&repertoire->seq_table, newp,
1205 sizeof (uint32_t), (void *) &negative);
1206 }
1207 }
1208 else
1209 (*seqp)->ucs4 = now->val.ucs4;
1210 }
1211 else if ((*seqp)->ucs4 != now->val.ucs4)
1212 *seqp = NULL;
1213
1214 *wchp = now->val.ucs4;
1215 }
1216 else if (now->tok == tok_charcode)
1217 {
1218 /* We must map from the byte code to UCS4. */
1219 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1220 now->val.str.lenmb);
1221
1222 if (*seqp == NULL)
1223 *wchp = ILLEGAL_CHAR_VALUE;
1224 else
1225 {
1226 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1227 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1228 strlen ((*seqp)->name));
1229 *wchp = (*seqp)->ucs4;
1230 }
1231 }
1232 else
1233 return 1;
1234
1235 return 0;
1236 }
1237
1238
1239 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1240 the .(2). counterparts. */
1241 static void
1242 charclass_symbolic_ellipsis (struct linereader *ldfile,
1243 struct locale_ctype_t *ctype,
1244 struct charmap_t *charmap,
1245 struct repertoire_t *repertoire,
1246 struct token *now,
1247 const char *last_str,
1248 unsigned long int class256_bit,
1249 unsigned long int class_bit, int base,
1250 int ignore_content, int handle_digits, int step)
1251 {
1252 const char *nowstr = now->val.str.startmb;
1253 char tmp[now->val.str.lenmb + 1];
1254 const char *cp;
1255 char *endp;
1256 unsigned long int from;
1257 unsigned long int to;
1258
1259 /* We have to compute the ellipsis values using the symbolic names. */
1260 assert (last_str != NULL);
1261
1262 if (strlen (last_str) != now->val.str.lenmb)
1263 {
1264 invalid_range:
1265 lr_error (ldfile,
1266 _("`%s' and `%.*s' are no valid names for symbolic range"),
1267 last_str, (int) now->val.str.lenmb, nowstr);
1268 return;
1269 }
1270
1271 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1272 /* Nothing to do, the names are the same. */
1273 return;
1274
1275 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1276 ;
1277
1278 errno = 0;
1279 from = strtoul (cp, &endp, base);
1280 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1281 goto invalid_range;
1282
1283 to = strtoul (nowstr + (cp - last_str), &endp, base);
1284 if ((to == UINT_MAX && errno == ERANGE)
1285 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1286 goto invalid_range;
1287
1288 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1289 if (!ignore_content)
1290 {
1291 now->val.str.startmb = tmp;
1292 while ((from += step) <= to)
1293 {
1294 struct charseq *seq;
1295 uint32_t wch;
1296
1297 sprintf (tmp, (base == 10 ? "%.*s%0*d" : "%.*s%0*X"), cp - last_str,
1298 last_str, now->val.str.lenmb - (cp - last_str), from);
1299
1300 get_character (now, charmap, repertoire, &seq, &wch);
1301
1302 if (seq != NULL && seq->nbytes == 1)
1303 /* Yep, we can store information about this byte sequence. */
1304 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1305
1306 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1307 /* We have the UCS4 position. */
1308 *find_idx (ctype, &ctype->class_collection,
1309 &ctype->class_collection_max,
1310 &ctype->class_collection_act, wch) |= class_bit;
1311
1312 if (handle_digits == 1)
1313 {
1314 /* We must store the digit values. */
1315 if (ctype->mbdigits_act == ctype->mbdigits_max)
1316 {
1317 ctype->mbdigits_max *= 2;
1318 ctype->mbdigits = xrealloc (ctype->mbdigits,
1319 (ctype->mbdigits_max
1320 * sizeof (char *)));
1321 ctype->wcdigits_max *= 2;
1322 ctype->wcdigits = xrealloc (ctype->wcdigits,
1323 (ctype->wcdigits_max
1324 * sizeof (uint32_t)));
1325 }
1326
1327 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1328 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1329 }
1330 else if (handle_digits == 2)
1331 {
1332 /* We must store the digit values. */
1333 if (ctype->outdigits_act >= 10)
1334 {
1335 lr_error (ldfile, _("\
1336 %s: field `%s' does not contain exactly ten entries"),
1337 "LC_CTYPE", "outdigit");
1338 return;
1339 }
1340
1341 ctype->mboutdigits[ctype->outdigits_act] = seq;
1342 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1343 ++ctype->outdigits_act;
1344 }
1345 }
1346 }
1347 }
1348
1349
1350 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1351 static void
1352 charclass_ucs4_ellipsis (struct linereader *ldfile,
1353 struct locale_ctype_t *ctype,
1354 struct charmap_t *charmap,
1355 struct repertoire_t *repertoire,
1356 struct token *now, uint32_t last_wch,
1357 unsigned long int class256_bit,
1358 unsigned long int class_bit, int ignore_content,
1359 int handle_digits, int step)
1360 {
1361 if (last_wch > now->val.ucs4)
1362 {
1363 lr_error (ldfile, _("\
1364 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1365 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1366 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1367 return;
1368 }
1369
1370 if (!ignore_content)
1371 while ((last_wch += step) <= now->val.ucs4)
1372 {
1373 /* We have to find out whether there is a byte sequence corresponding
1374 to this UCS4 value. */
1375 struct charseq *seq;
1376 char utmp[10];
1377
1378 snprintf (utmp, sizeof (utmp), "U%08X", last_wch);
1379 seq = charmap_find_value (charmap, utmp, 9);
1380 if (seq == NULL)
1381 {
1382 snprintf (utmp, sizeof (utmp), "U%04X", last_wch);
1383 seq = charmap_find_value (charmap, utmp, 5);
1384 }
1385
1386 if (seq == NULL)
1387 /* Try looking in the repertoire map. */
1388 seq = repertoire_find_seq (repertoire, last_wch);
1389
1390 /* If this is the first time we look for this sequence create a new
1391 entry. */
1392 if (seq == NULL)
1393 {
1394 static const struct charseq negative
1395 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1396
1397 /* Find the symbolic name for this UCS4 value. */
1398 if (repertoire != NULL)
1399 {
1400 const char *symbol = repertoire_find_symbol (repertoire,
1401 last_wch);
1402 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1403 sizeof (uint32_t));
1404 *newp = last_wch;
1405
1406 if (symbol != NULL)
1407 /* We have a name, now search the multibyte value. */
1408 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1409
1410 if (seq == NULL)
1411 /* We have to create a fake entry. */
1412 seq = (struct charseq *) &negative;
1413 else
1414 seq->ucs4 = last_wch;
1415
1416 insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t),
1417 seq);
1418 }
1419 else
1420 /* We have to create a fake entry. */
1421 seq = (struct charseq *) &negative;
1422 }
1423
1424 /* We have a name, now search the multibyte value. */
1425 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1426 /* Yep, we can store information about this byte sequence. */
1427 ctype->class256_collection[(size_t) seq->bytes[0]]
1428 |= class256_bit;
1429
1430 /* And of course we have the UCS4 position. */
1431 if (class_bit != 0)
1432 *find_idx (ctype, &ctype->class_collection,
1433 &ctype->class_collection_max,
1434 &ctype->class_collection_act, last_wch) |= class_bit;
1435
1436 if (handle_digits == 1)
1437 {
1438 /* We must store the digit values. */
1439 if (ctype->mbdigits_act == ctype->mbdigits_max)
1440 {
1441 ctype->mbdigits_max *= 2;
1442 ctype->mbdigits = xrealloc (ctype->mbdigits,
1443 (ctype->mbdigits_max
1444 * sizeof (char *)));
1445 ctype->wcdigits_max *= 2;
1446 ctype->wcdigits = xrealloc (ctype->wcdigits,
1447 (ctype->wcdigits_max
1448 * sizeof (uint32_t)));
1449 }
1450
1451 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1452 ? seq : NULL);
1453 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1454 }
1455 else if (handle_digits == 2)
1456 {
1457 /* We must store the digit values. */
1458 if (ctype->outdigits_act >= 10)
1459 {
1460 lr_error (ldfile, _("\
1461 %s: field `%s' does not contain exactly ten entries"),
1462 "LC_CTYPE", "outdigit");
1463 return;
1464 }
1465
1466 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1467 ? seq : NULL);
1468 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1469 ++ctype->outdigits_act;
1470 }
1471 }
1472 }
1473
1474
1475 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1476 static void
1477 charclass_charcode_ellipsis (struct linereader *ldfile,
1478 struct locale_ctype_t *ctype,
1479 struct charmap_t *charmap,
1480 struct repertoire_t *repertoire,
1481 struct token *now, char *last_charcode,
1482 uint32_t last_charcode_len,
1483 unsigned long int class256_bit,
1484 unsigned long int class_bit, int ignore_content,
1485 int handle_digits)
1486 {
1487 /* First check whether the to-value is larger. */
1488 if (now->val.charcode.nbytes != last_charcode_len)
1489 {
1490 lr_error (ldfile, _("\
1491 start end end character sequence of range must have the same length"));
1492 return;
1493 }
1494
1495 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1496 {
1497 lr_error (ldfile, _("\
1498 to-value character sequence is smaller than from-value sequence"));
1499 return;
1500 }
1501
1502 if (!ignore_content)
1503 {
1504 do
1505 {
1506 /* Increment the byte sequence value. */
1507 struct charseq *seq;
1508 uint32_t wch;
1509 int i;
1510
1511 for (i = last_charcode_len - 1; i >= 0; --i)
1512 if (++last_charcode[i] != 0)
1513 break;
1514
1515 if (last_charcode_len == 1)
1516 /* Of course we have the charcode value. */
1517 ctype->class256_collection[(size_t) last_charcode[0]]
1518 |= class256_bit;
1519
1520 /* Find the symbolic name. */
1521 seq = charmap_find_symbol (charmap, last_charcode,
1522 last_charcode_len);
1523 if (seq != NULL)
1524 {
1525 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1526 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1527 strlen (seq->name));
1528 wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4;
1529
1530 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1531 *find_idx (ctype, &ctype->class_collection,
1532 &ctype->class_collection_max,
1533 &ctype->class_collection_act, wch) |= class_bit;
1534 }
1535 else
1536 wch = ILLEGAL_CHAR_VALUE;
1537
1538 if (handle_digits == 1)
1539 {
1540 /* We must store the digit values. */
1541 if (ctype->mbdigits_act == ctype->mbdigits_max)
1542 {
1543 ctype->mbdigits_max *= 2;
1544 ctype->mbdigits = xrealloc (ctype->mbdigits,
1545 (ctype->mbdigits_max
1546 * sizeof (char *)));
1547 ctype->wcdigits_max *= 2;
1548 ctype->wcdigits = xrealloc (ctype->wcdigits,
1549 (ctype->wcdigits_max
1550 * sizeof (uint32_t)));
1551 }
1552
1553 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1554 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1555 seq->nbytes = last_charcode_len;
1556
1557 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1558 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1559 }
1560 else if (handle_digits == 2)
1561 {
1562 struct charseq *seq;
1563 /* We must store the digit values. */
1564 if (ctype->outdigits_act >= 10)
1565 {
1566 lr_error (ldfile, _("\
1567 %s: field `%s' does not contain exactly ten entries"),
1568 "LC_CTYPE", "outdigit");
1569 return;
1570 }
1571
1572 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1573 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1574 seq->nbytes = last_charcode_len;
1575
1576 ctype->mboutdigits[ctype->outdigits_act] = seq;
1577 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1578 ++ctype->outdigits_act;
1579 }
1580 }
1581 while (memcmp (last_charcode, now->val.charcode.bytes,
1582 last_charcode_len) != 0);
1583 }
1584 }
1585
1586
1587 /* Read one transliteration entry. */
1588 static uint32_t *
1589 read_widestring (struct linereader *ldfile, struct token *now,
1590 struct charmap_t *charmap, struct repertoire_t *repertoire)
1591 {
1592 uint32_t *wstr;
1593
1594 if (now->tok == tok_default_missing)
1595 /* The special name "" will denote this case. */
1596 wstr = ((uint32_t *) { 0 });
1597 else if (now->tok == tok_bsymbol)
1598 {
1599 /* Get the value from the repertoire. */
1600 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1601 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1602 now->val.str.lenmb);
1603 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1604 {
1605 /* We cannot proceed, we don't know the UCS4 value. */
1606 free (wstr);
1607 return NULL;
1608 }
1609
1610 wstr[1] = 0;
1611 }
1612 else if (now->tok == tok_ucs4)
1613 {
1614 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1615 wstr[0] = now->val.ucs4;
1616 wstr[1] = 0;
1617 }
1618 else if (now->tok == tok_charcode)
1619 {
1620 /* Argh, we have to convert to the symbol name first and then to the
1621 UCS4 value. */
1622 struct charseq *seq = charmap_find_symbol (charmap,
1623 now->val.str.startmb,
1624 now->val.str.lenmb);
1625 if (seq == NULL)
1626 /* Cannot find the UCS4 value. */
1627 return NULL;
1628
1629 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1630 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1631 strlen (seq->name));
1632 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1633 /* We cannot proceed, we don't know the UCS4 value. */
1634 return NULL;
1635
1636 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1637 wstr[0] = seq->ucs4;
1638 wstr[1] = 0;
1639 }
1640 else if (now->tok == tok_string)
1641 {
1642 wstr = now->val.str.startwc;
1643 if (wstr == NULL || wstr[0] == 0)
1644 return NULL;
1645 }
1646 else
1647 {
1648 if (now->tok != tok_eol && now->tok != tok_eof)
1649 lr_ignore_rest (ldfile, 0);
1650 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1651 return (uint32_t *) -1l;
1652 }
1653
1654 return wstr;
1655 }
1656
1657
1658 static void
1659 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1660 struct token *now, struct charmap_t *charmap,
1661 struct repertoire_t *repertoire)
1662 {
1663 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1664 struct translit_t *result;
1665 struct translit_to_t **top;
1666 struct obstack *ob = &ctype->mempool;
1667 int first;
1668 int ignore;
1669
1670 if (from_wstr == NULL)
1671 /* There is no valid from string. */
1672 return;
1673
1674 result = (struct translit_t *) obstack_alloc (ob,
1675 sizeof (struct translit_t));
1676 result->from = from_wstr;
1677 result->fname = ldfile->fname;
1678 result->lineno = ldfile->lineno;
1679 result->next = NULL;
1680 result->to = NULL;
1681 top = &result->to;
1682 first = 1;
1683 ignore = 0;
1684
1685 while (1)
1686 {
1687 uint32_t *to_wstr;
1688
1689 /* Next we have one or more transliterations. They are
1690 separated by semicolons. */
1691 now = lr_token (ldfile, charmap, repertoire);
1692
1693 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1694 {
1695 /* One string read. */
1696 const uint32_t zero = 0;
1697
1698 if (!ignore)
1699 {
1700 obstack_grow (ob, &zero, 4);
1701 to_wstr = obstack_finish (ob);
1702
1703 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1704 (*top)->str = to_wstr;
1705 (*top)->next = NULL;
1706 }
1707
1708 if (now->tok == tok_eol)
1709 {
1710 result->next = ctype->translit;
1711 ctype->translit = result;
1712 return;
1713 }
1714
1715 if (!ignore)
1716 top = &(*top)->next;
1717 ignore = 0;
1718 }
1719 else
1720 {
1721 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1722 if (to_wstr == (uint32_t *) -1l)
1723 {
1724 /* An error occurred. */
1725 obstack_free (ob, result);
1726 return;
1727 }
1728
1729 if (to_wstr == NULL)
1730 ignore = 1;
1731 else
1732 /* This value is usable. */
1733 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
1734
1735 first = 0;
1736 }
1737 }
1738 }
1739
1740
1741 static void
1742 read_translit_ignore_entry (struct linereader *ldfile,
1743 struct locale_ctype_t *ctype,
1744 struct charmap_t *charmap,
1745 struct repertoire_t *repertoire)
1746 {
1747 /* We expect a semicolon-separated list of characters we ignore. We are
1748 only interested in the wide character definitions. These must be
1749 single characters, possibly defining a range when an ellipsis is used. */
1750 while (1)
1751 {
1752 struct token *now = lr_token (ldfile, charmap, repertoire);
1753 struct translit_ignore_t *newp;
1754 uint32_t from;
1755
1756 if (now->tok == tok_eol || now->tok == tok_eof)
1757 {
1758 lr_error (ldfile,
1759 _("premature end of `translit_ignore' definition"));
1760 return;
1761 }
1762
1763 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
1764 {
1765 lr_error (ldfile, _("syntax error"));
1766 lr_ignore_rest (ldfile, 0);
1767 return;
1768 }
1769
1770 if (now->tok == tok_ucs4)
1771 from = now->val.ucs4;
1772 else
1773 /* Try to get the value. */
1774 from = repertoire_find_value (repertoire, now->val.str.startmb,
1775 now->val.str.lenmb);
1776
1777 if (from == ILLEGAL_CHAR_VALUE)
1778 {
1779 lr_error (ldfile, "invalid character name");
1780 newp = NULL;
1781 }
1782 else
1783 {
1784 newp = (struct translit_ignore_t *)
1785 obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t));
1786 newp->from = from;
1787 newp->to = from;
1788 newp->step = 1;
1789
1790 newp->next = ctype->translit_ignore;
1791 ctype->translit_ignore = newp;
1792 }
1793
1794 /* Now we expect either a semicolon, an ellipsis, or the end of the
1795 line. */
1796 now = lr_token (ldfile, charmap, repertoire);
1797
1798 if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2)
1799 {
1800 /* XXX Should we bother implementing `....'? `...' certainly
1801 will not be implemented. */
1802 uint32_t to;
1803 int step = now->tok == tok_ellipsis2_2 ? 2 : 1;
1804
1805 now = lr_token (ldfile, charmap, repertoire);
1806
1807 if (now->tok == tok_eol || now->tok == tok_eof)
1808 {
1809 lr_error (ldfile,
1810 _("premature end of `translit_ignore' definition"));
1811 return;
1812 }
1813
1814 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
1815 {
1816 lr_error (ldfile, _("syntax error"));
1817 lr_ignore_rest (ldfile, 0);
1818 return;
1819 }
1820
1821 if (now->tok == tok_ucs4)
1822 to = now->val.ucs4;
1823 else
1824 /* Try to get the value. */
1825 to = repertoire_find_value (repertoire, now->val.str.startmb,
1826 now->val.str.lenmb);
1827
1828 if (to == ILLEGAL_CHAR_VALUE)
1829 lr_error (ldfile, "invalid character name");
1830 else
1831 {
1832 /* Make sure the `to'-value is larger. */
1833 if (to >= from)
1834 {
1835 newp->to = to;
1836 newp->step = step;
1837 }
1838 else
1839 lr_error (ldfile, _("\
1840 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1841 (to | from) < 65536 ? 4 : 8, to,
1842 (to | from) < 65536 ? 4 : 8, from);
1843 }
1844
1845 /* And the next token. */
1846 now = lr_token (ldfile, charmap, repertoire);
1847 }
1848
1849 if (now->tok == tok_eol || now->tok == tok_eof)
1850 /* We are done. */
1851 return;
1852
1853 if (now->tok == tok_semicolon)
1854 /* Next round. */
1855 continue;
1856
1857 /* If we come here something is wrong. */
1858 lr_error (ldfile, _("syntax error"));
1859 lr_ignore_rest (ldfile, 0);
1860 return;
1861 }
1862 }
1863
1864
1865 /* The parser for the LC_CTYPE section of the locale definition. */
1866 void
1867 ctype_read (struct linereader *ldfile, struct localedef_t *result,
1868 struct charmap_t *charmap, const char *repertoire_name,
1869 int ignore_content)
1870 {
1871 struct repertoire_t *repertoire = NULL;
1872 struct locale_ctype_t *ctype;
1873 struct token *now;
1874 enum token_t nowtok;
1875 size_t cnt;
1876 struct charseq *last_seq;
1877 uint32_t last_wch = 0;
1878 enum token_t last_token;
1879 enum token_t ellipsis_token;
1880 int step;
1881 char last_charcode[16];
1882 size_t last_charcode_len = 0;
1883 const char *last_str = NULL;
1884 int mapidx;
1885
1886 /* Get the repertoire we have to use. */
1887 if (repertoire_name != NULL)
1888 repertoire = repertoire_read (repertoire_name);
1889
1890 /* The rest of the line containing `LC_CTYPE' must be free. */
1891 lr_ignore_rest (ldfile, 1);
1892
1893
1894 do
1895 {
1896 now = lr_token (ldfile, charmap, NULL);
1897 nowtok = now->tok;
1898 }
1899 while (nowtok == tok_eol);
1900
1901 /* If we see `copy' now we are almost done. */
1902 if (nowtok == tok_copy)
1903 {
1904 handle_copy (ldfile, charmap, repertoire_name, result, tok_lc_ctype,
1905 LC_CTYPE, "LC_CTYPE", ignore_content);
1906 return;
1907 }
1908
1909 /* Prepare the data structures. */
1910 ctype_startup (ldfile, result, charmap, ignore_content);
1911 ctype = result->categories[LC_CTYPE].ctype;
1912
1913 /* Remember the repertoire we use. */
1914 if (!ignore_content)
1915 ctype->repertoire = repertoire;
1916
1917 while (1)
1918 {
1919 unsigned long int class_bit = 0;
1920 unsigned long int class256_bit = 0;
1921 int handle_digits = 0;
1922
1923 /* Of course we don't proceed beyond the end of file. */
1924 if (nowtok == tok_eof)
1925 break;
1926
1927 /* Ignore empty lines. */
1928 if (nowtok == tok_eol)
1929 {
1930 now = lr_token (ldfile, charmap, NULL);
1931 nowtok = now->tok;
1932 continue;
1933 }
1934
1935 switch (nowtok)
1936 {
1937 case tok_charclass:
1938 now = lr_token (ldfile, charmap, NULL);
1939 while (now->tok == tok_ident || now->tok == tok_string)
1940 {
1941 ctype_class_new (ldfile, ctype, now->val.str.startmb);
1942 now = lr_token (ldfile, charmap, NULL);
1943 if (now->tok != tok_semicolon)
1944 break;
1945 now = lr_token (ldfile, charmap, NULL);
1946 }
1947 if (now->tok != tok_eol)
1948 SYNTAX_ERROR (_("\
1949 %s: syntax error in definition of new character class"), "LC_CTYPE");
1950 break;
1951
1952 case tok_charconv:
1953 now = lr_token (ldfile, charmap, NULL);
1954 while (now->tok == tok_ident || now->tok == tok_string)
1955 {
1956 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
1957 now = lr_token (ldfile, charmap, NULL);
1958 if (now->tok != tok_semicolon)
1959 break;
1960 now = lr_token (ldfile, charmap, NULL);
1961 }
1962 if (now->tok != tok_eol)
1963 SYNTAX_ERROR (_("\
1964 %s: syntax error in definition of new character map"), "LC_CTYPE");
1965 break;
1966
1967 case tok_class:
1968 /* Ignore the rest of the line if we don't need the input of
1969 this line. */
1970 if (ignore_content)
1971 {
1972 lr_ignore_rest (ldfile, 0);
1973 break;
1974 }
1975
1976 /* We simply forget the `class' keyword and use the following
1977 operand to determine the bit. */
1978 now = lr_token (ldfile, charmap, NULL);
1979 if (now->tok == tok_ident || now->tok == tok_string)
1980 {
1981 /* Must can be one of the predefined class names. */
1982 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1983 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
1984 break;
1985 if (cnt >= ctype->nr_charclass)
1986 {
1987 #ifdef PREDEFINED_CLASSES
1988 if (now->val.str.lenmb == 8
1989 && memcmp ("special1", now->val.str.startmb, 8) == 0)
1990 class_bit = _ISwspecial1;
1991 else if (now->val.str.lenmb == 8
1992 && memcmp ("special2", now->val.str.startmb, 8) == 0)
1993 class_bit = _ISwspecial2;
1994 else if (now->val.str.lenmb == 8
1995 && memcmp ("special3", now->val.str.startmb, 8) == 0)
1996 class_bit = _ISwspecial3;
1997 else
1998 #endif
1999 {
2000 /* OK, it's a new class. */
2001 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2002
2003 class_bit = _ISwbit (ctype->nr_charclass - 1);
2004 }
2005 }
2006 else
2007 {
2008 class_bit = _ISwbit (cnt);
2009
2010 free (now->val.str.startmb);
2011 }
2012 }
2013 else if (now->tok == tok_digit)
2014 goto handle_tok_digit;
2015 else if (now->tok < tok_upper || now->tok > tok_blank)
2016 goto err_label;
2017 else
2018 {
2019 class_bit = BITw (now->tok);
2020 class256_bit = BIT (now->tok);
2021 }
2022
2023 /* The next character must be a semicolon. */
2024 now = lr_token (ldfile, charmap, NULL);
2025 if (now->tok != tok_semicolon)
2026 goto err_label;
2027 goto read_charclass;
2028
2029 case tok_upper:
2030 case tok_lower:
2031 case tok_alpha:
2032 case tok_alnum:
2033 case tok_space:
2034 case tok_cntrl:
2035 case tok_punct:
2036 case tok_graph:
2037 case tok_print:
2038 case tok_xdigit:
2039 case tok_blank:
2040 /* Ignore the rest of the line if we don't need the input of
2041 this line. */
2042 if (ignore_content)
2043 {
2044 lr_ignore_rest (ldfile, 0);
2045 break;
2046 }
2047
2048 class_bit = BITw (now->tok);
2049 class256_bit = BIT (now->tok);
2050 handle_digits = 0;
2051 read_charclass:
2052 ctype->class_done |= class_bit;
2053 last_token = tok_none;
2054 ellipsis_token = tok_none;
2055 step = 1;
2056 now = lr_token (ldfile, charmap, NULL);
2057 while (now->tok != tok_eol && now->tok != tok_eof)
2058 {
2059 uint32_t wch;
2060 struct charseq *seq;
2061
2062 if (ellipsis_token == tok_none)
2063 {
2064 if (get_character (now, charmap, repertoire, &seq, &wch))
2065 goto err_label;
2066
2067 if (!ignore_content && seq != NULL && seq->nbytes == 1)
2068 /* Yep, we can store information about this byte
2069 sequence. */
2070 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
2071
2072 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
2073 && class_bit != 0)
2074 /* We have the UCS4 position. */
2075 *find_idx (ctype, &ctype->class_collection,
2076 &ctype->class_collection_max,
2077 &ctype->class_collection_act, wch) |= class_bit;
2078
2079 last_token = now->tok;
2080 /* Terminate the string. */
2081 if (last_token == tok_bsymbol)
2082 {
2083 now->val.str.startmb[now->val.str.lenmb] = '\0';
2084 last_str = now->val.str.startmb;
2085 }
2086 else
2087 last_str = NULL;
2088 last_seq = seq;
2089 last_wch = wch;
2090 memcpy (last_charcode, now->val.charcode.bytes, 16);
2091 last_charcode_len = now->val.charcode.nbytes;
2092
2093 if (!ignore_content && handle_digits == 1)
2094 {
2095 /* We must store the digit values. */
2096 if (ctype->mbdigits_act == ctype->mbdigits_max)
2097 {
2098 ctype->mbdigits_max += 10;
2099 ctype->mbdigits = xrealloc (ctype->mbdigits,
2100 (ctype->mbdigits_max
2101 * sizeof (char *)));
2102 ctype->wcdigits_max += 10;
2103 ctype->wcdigits = xrealloc (ctype->wcdigits,
2104 (ctype->wcdigits_max
2105 * sizeof (uint32_t)));
2106 }
2107
2108 ctype->mbdigits[ctype->mbdigits_act++] = seq;
2109 ctype->wcdigits[ctype->wcdigits_act++] = wch;
2110 }
2111 else if (!ignore_content && handle_digits == 2)
2112 {
2113 /* We must store the digit values. */
2114 if (ctype->outdigits_act >= 10)
2115 {
2116 lr_error (ldfile, _("\
2117 %s: field `%s' does not contain exactly ten entries"),
2118 "LC_CTYPE", "outdigit");
2119 goto err_label;
2120 }
2121
2122 ctype->mboutdigits[ctype->outdigits_act] = seq;
2123 ctype->wcoutdigits[ctype->outdigits_act] = wch;
2124 ++ctype->outdigits_act;
2125 }
2126 }
2127 else
2128 {
2129 /* Now it gets complicated. We have to resolve the
2130 ellipsis problem. First we must distinguish between
2131 the different kind of ellipsis and this must match the
2132 tokens we have seen. */
2133 assert (last_token != tok_none);
2134
2135 if (last_token != now->tok)
2136 {
2137 lr_error (ldfile, _("\
2138 ellipsis range must be marked by two operands of same type"));
2139 lr_ignore_rest (ldfile, 0);
2140 break;
2141 }
2142
2143 if (last_token == tok_bsymbol)
2144 {
2145 if (ellipsis_token == tok_ellipsis3)
2146 lr_error (ldfile, _("with symbolic name range values \
2147 the absolute ellipsis `...' must not be used"));
2148
2149 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
2150 repertoire, now, last_str,
2151 class256_bit, class_bit,
2152 (ellipsis_token
2153 == tok_ellipsis4
2154 ? 10 : 16),
2155 ignore_content,
2156 handle_digits, step);
2157 }
2158 else if (last_token == tok_ucs4)
2159 {
2160 if (ellipsis_token != tok_ellipsis2)
2161 lr_error (ldfile, _("\
2162 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2163
2164 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
2165 repertoire, now, last_wch,
2166 class256_bit, class_bit,
2167 ignore_content, handle_digits,
2168 step);
2169 }
2170 else
2171 {
2172 assert (last_token == tok_charcode);
2173
2174 if (ellipsis_token != tok_ellipsis3)
2175 lr_error (ldfile, _("\
2176 with character code range values one must use the absolute ellipsis `...'"));
2177
2178 charclass_charcode_ellipsis (ldfile, ctype, charmap,
2179 repertoire, now,
2180 last_charcode,
2181 last_charcode_len,
2182 class256_bit, class_bit,
2183 ignore_content,
2184 handle_digits);
2185 }
2186
2187 /* Now we have used the last value. */
2188 last_token = tok_none;
2189 }
2190
2191 /* Next we expect a semicolon or the end of the line. */
2192 now = lr_token (ldfile, charmap, NULL);
2193 if (now->tok == tok_eol || now->tok == tok_eof)
2194 break;
2195
2196 if (last_token != tok_none
2197 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2)
2198 {
2199 if (now->tok == tok_ellipsis2_2)
2200 {
2201 now->tok = tok_ellipsis2;
2202 step = 2;
2203 }
2204 else if (now->tok == tok_ellipsis4_2)
2205 {
2206 now->tok = tok_ellipsis4;
2207 step = 2;
2208 }
2209
2210 ellipsis_token = now->tok;
2211
2212 now = lr_token (ldfile, charmap, NULL);
2213 continue;
2214 }
2215
2216 if (now->tok != tok_semicolon)
2217 goto err_label;
2218
2219 /* And get the next character. */
2220 now = lr_token (ldfile, charmap, NULL);
2221
2222 ellipsis_token = tok_none;
2223 step = 1;
2224 }
2225 break;
2226
2227 case tok_digit:
2228 /* Ignore the rest of the line if we don't need the input of
2229 this line. */
2230 if (ignore_content)
2231 {
2232 lr_ignore_rest (ldfile, 0);
2233 break;
2234 }
2235
2236 handle_tok_digit:
2237 class_bit = _ISwdigit;
2238 class256_bit = _ISdigit;
2239 handle_digits = 1;
2240 goto read_charclass;
2241
2242 case tok_outdigit:
2243 /* Ignore the rest of the line if we don't need the input of
2244 this line. */
2245 if (ignore_content)
2246 {
2247 lr_ignore_rest (ldfile, 0);
2248 break;
2249 }
2250
2251 if (ctype->outdigits_act != 0)
2252 lr_error (ldfile, _("\
2253 %s: field `%s' declared more than once"),
2254 "LC_CTYPE", "outdigit");
2255 class_bit = 0;
2256 class256_bit = 0;
2257 handle_digits = 2;
2258 goto read_charclass;
2259
2260 case tok_toupper:
2261 /* Ignore the rest of the line if we don't need the input of
2262 this line. */
2263 if (ignore_content)
2264 {
2265 lr_ignore_rest (ldfile, 0);
2266 break;
2267 }
2268
2269 mapidx = 0;
2270 goto read_mapping;
2271
2272 case tok_tolower:
2273 /* Ignore the rest of the line if we don't need the input of
2274 this line. */
2275 if (ignore_content)
2276 {
2277 lr_ignore_rest (ldfile, 0);
2278 break;
2279 }
2280
2281 mapidx = 1;
2282 goto read_mapping;
2283
2284 case tok_map:
2285 /* Ignore the rest of the line if we don't need the input of
2286 this line. */
2287 if (ignore_content)
2288 {
2289 lr_ignore_rest (ldfile, 0);
2290 break;
2291 }
2292
2293 /* We simply forget the `map' keyword and use the following
2294 operand to determine the mapping. */
2295 now = lr_token (ldfile, charmap, NULL);
2296 if (now->tok == tok_ident || now->tok == tok_string)
2297 {
2298 size_t cnt;
2299
2300 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2301 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2302 break;
2303
2304 if (cnt < ctype->map_collection_nr)
2305 free (now->val.str.startmb);
2306 else
2307 /* OK, it's a new map. */
2308 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2309
2310 mapidx = cnt;
2311 }
2312 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2313 goto err_label;
2314 else
2315 mapidx = now->tok - tok_toupper;
2316
2317 now = lr_token (ldfile, charmap, NULL);
2318 /* This better should be a semicolon. */
2319 if (now->tok != tok_semicolon)
2320 goto err_label;
2321
2322 read_mapping:
2323 /* Test whether this mapping was already defined. */
2324 if (ctype->tomap_done[mapidx])
2325 {
2326 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2327 ctype->mapnames[mapidx]);
2328 lr_ignore_rest (ldfile, 0);
2329 break;
2330 }
2331 ctype->tomap_done[mapidx] = 1;
2332
2333 now = lr_token (ldfile, charmap, NULL);
2334 while (now->tok != tok_eol && now->tok != tok_eof)
2335 {
2336 struct charseq *from_seq;
2337 uint32_t from_wch;
2338 struct charseq *to_seq;
2339 uint32_t to_wch;
2340
2341 /* Every pair starts with an opening brace. */
2342 if (now->tok != tok_open_brace)
2343 goto err_label;
2344
2345 /* Next comes the from-value. */
2346 now = lr_token (ldfile, charmap, NULL);
2347 if (get_character (now, charmap, repertoire, &from_seq,
2348 &from_wch) != 0)
2349 goto err_label;
2350
2351 /* The next is a comma. */
2352 now = lr_token (ldfile, charmap, NULL);
2353 if (now->tok != tok_comma)
2354 goto err_label;
2355
2356 /* And the other value. */
2357 now = lr_token (ldfile, charmap, NULL);
2358 if (get_character (now, charmap, repertoire, &to_seq,
2359 &to_wch) != 0)
2360 goto err_label;
2361
2362 /* And the last thing is the closing brace. */
2363 now = lr_token (ldfile, charmap, NULL);
2364 if (now->tok != tok_close_brace)
2365 goto err_label;
2366
2367 if (!ignore_content)
2368 {
2369 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2370 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2371 /* We can use this value. */
2372 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2373 = to_seq->bytes[0];
2374
2375 if (from_wch != ILLEGAL_CHAR_VALUE
2376 && to_wch != ILLEGAL_CHAR_VALUE)
2377 /* Both correct values. */
2378 *find_idx (ctype, &ctype->map_collection[mapidx],
2379 &ctype->map_collection_max[mapidx],
2380 &ctype->map_collection_act[mapidx],
2381 from_wch) = to_wch;
2382 }
2383
2384 /* Now comes a semicolon or the end of the line/file. */
2385 now = lr_token (ldfile, charmap, NULL);
2386 if (now->tok == tok_semicolon)
2387 now = lr_token (ldfile, charmap, NULL);
2388 }
2389 break;
2390
2391 case tok_translit_start:
2392 /* Ignore the rest of the line if we don't need the input of
2393 this line. */
2394 if (ignore_content)
2395 {
2396 lr_ignore_rest (ldfile, 0);
2397 break;
2398 }
2399
2400 /* The rest of the line better should be empty. */
2401 lr_ignore_rest (ldfile, 1);
2402
2403 /* We count here the number of allocated entries in the `translit'
2404 array. */
2405 cnt = 0;
2406
2407 /* We proceed until we see the `translit_end' token. */
2408 while (now = lr_token (ldfile, charmap, repertoire),
2409 now->tok != tok_translit_end && now->tok != tok_eof)
2410 {
2411 if (now->tok == tok_eol)
2412 /* Ignore empty lines. */
2413 continue;
2414
2415 if (now->tok == tok_translit_end)
2416 {
2417 lr_ignore_rest (ldfile, 0);
2418 break;
2419 }
2420
2421 if (now->tok == tok_include)
2422 {
2423 /* We have to include locale. */
2424 const char *locale_name;
2425 const char *repertoire_name;
2426
2427 now = lr_token (ldfile, charmap, NULL);
2428 /* This should be a string or an identifier. In any
2429 case something to name a locale. */
2430 if (now->tok != tok_string && now->tok != tok_ident)
2431 {
2432 translit_syntax:
2433 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2434 lr_ignore_rest (ldfile, 0);
2435 continue;
2436 }
2437 locale_name = now->val.str.startmb;
2438
2439 /* Next should be a semicolon. */
2440 now = lr_token (ldfile, charmap, NULL);
2441 if (now->tok != tok_semicolon)
2442 goto translit_syntax;
2443
2444 /* Now the repertoire name. */
2445 now = lr_token (ldfile, charmap, NULL);
2446 if ((now->tok != tok_string && now->tok != tok_ident)
2447 || now->val.str.startmb == NULL)
2448 goto translit_syntax;
2449 repertoire_name = now->val.str.startmb;
2450
2451 /* We must not have more than one `include'. */
2452 if (ctype->translit_copy_locale != NULL)
2453 {
2454 lr_error (ldfile, _("\
2455 %s: only one `include' instruction allowed"), "LC_CTYPE");
2456 lr_ignore_rest (ldfile, 0);
2457 continue;
2458 }
2459
2460 ctype->translit_copy_locale = locale_name;
2461 ctype->translit_copy_repertoire = repertoire_name;
2462
2463 /* The rest of the line must be empty. */
2464 lr_ignore_rest (ldfile, 1);
2465
2466 /* Make sure the locale is read. */
2467 add_to_readlist (LC_CTYPE, ctype->translit_copy_locale,
2468 repertoire_name, 1);
2469 continue;
2470 }
2471 else if (now->tok == tok_default_missing)
2472 {
2473 uint32_t *wstr;
2474
2475 /* We expect a single character or string as the
2476 argument. */
2477 now = lr_token (ldfile, charmap, NULL);
2478 wstr = read_widestring (ldfile, now, charmap, repertoire);
2479
2480 if (wstr != NULL)
2481 {
2482 if (ctype->default_missing != NULL)
2483 {
2484 lr_error (ldfile, _("\
2485 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2486 error_at_line (0, 0, ctype->default_missing_file,
2487 ctype->default_missing_lineno,
2488 _("previous definition was here"));
2489 }
2490 else
2491 {
2492 ctype->default_missing = wstr;
2493 ctype->default_missing_file = ldfile->fname;
2494 ctype->default_missing_lineno = ldfile->lineno;
2495 }
2496 }
2497 lr_ignore_rest (ldfile, 1);
2498 continue;
2499 }
2500 else if (now->tok == tok_translit_ignore)
2501 {
2502 read_translit_ignore_entry (ldfile, ctype, charmap,
2503 repertoire);
2504 continue;
2505 }
2506
2507 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2508 }
2509 break;
2510
2511 case tok_ident:
2512 /* Ignore the rest of the line if we don't need the input of
2513 this line. */
2514 if (ignore_content)
2515 {
2516 lr_ignore_rest (ldfile, 0);
2517 break;
2518 }
2519
2520 /* This could mean one of several things. First test whether
2521 it's a character class name. */
2522 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2523 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2524 break;
2525 if (cnt < ctype->nr_charclass)
2526 {
2527 class_bit = _ISwbit (cnt);
2528 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2529 free (now->val.str.startmb);
2530 goto read_charclass;
2531 }
2532 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2533 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2534 break;
2535 if (cnt < ctype->map_collection_nr)
2536 {
2537 mapidx = cnt;
2538 free (now->val.str.startmb);
2539 goto read_mapping;
2540 }
2541 #ifdef PREDEFINED_CLASSES
2542 if (strcmp (now->val.str.startmb, "special1") == 0)
2543 {
2544 class_bit = _ISwspecial1;
2545 free (now->val.str.startmb);
2546 goto read_charclass;
2547 }
2548 if (strcmp (now->val.str.startmb, "special2") == 0)
2549 {
2550 class_bit = _ISwspecial2;
2551 free (now->val.str.startmb);
2552 goto read_charclass;
2553 }
2554 if (strcmp (now->val.str.startmb, "special3") == 0)
2555 {
2556 class_bit = _ISwspecial3;
2557 free (now->val.str.startmb);
2558 goto read_charclass;
2559 }
2560 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2561 {
2562 mapidx = 2;
2563 goto read_mapping;
2564 }
2565 #endif
2566 break;
2567
2568 case tok_end:
2569 /* Next we assume `LC_CTYPE'. */
2570 now = lr_token (ldfile, charmap, NULL);
2571 if (now->tok == tok_eof)
2572 break;
2573 if (now->tok == tok_eol)
2574 lr_error (ldfile, _("%s: incomplete `END' line"),
2575 "LC_CTYPE");
2576 else if (now->tok != tok_lc_ctype)
2577 lr_error (ldfile, _("\
2578 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2579 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2580 return;
2581
2582 default:
2583 err_label:
2584 if (now->tok != tok_eof)
2585 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2586 }
2587
2588 /* Prepare for the next round. */
2589 now = lr_token (ldfile, charmap, NULL);
2590 nowtok = now->tok;
2591 }
2592
2593 /* When we come here we reached the end of the file. */
2594 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2595 }
2596
2597
2598 static void
2599 set_class_defaults (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2600 struct repertoire_t *repertoire)
2601 {
2602 size_t cnt;
2603
2604 /* These function defines the default values for the classes and conversions
2605 according to POSIX.2 2.5.2.1.
2606 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2607 Don't move them unless you know what you do! */
2608
2609 void set_default (int bitpos, int from, int to)
2610 {
2611 char tmp[2];
2612 int ch;
2613 int bit = _ISbit (bitpos);
2614 int bitw = _ISwbit (bitpos);
2615 /* Define string. */
2616 strcpy (tmp, "?");
2617
2618 for (ch = from; ch <= to; ++ch)
2619 {
2620 struct charseq *seq;
2621 tmp[0] = ch;
2622
2623 seq = charmap_find_value (charmap, tmp, 1);
2624 if (seq == NULL)
2625 {
2626 if (!be_quiet)
2627 error (0, 0, _("\
2628 %s: character `%s' not defined in charmap while needed as default value"),
2629 "LC_CTYPE", tmp);
2630 }
2631 else if (seq->nbytes != 1)
2632 error (0, 0, _("\
2633 %s: character `%s' in charmap not representable with one byte"),
2634 "LC_CTYPE", tmp);
2635 else
2636 ctype->class256_collection[seq->bytes[0]] |= bit;
2637
2638 /* No need to search here, the ASCII value is also the Unicode
2639 value. */
2640 ELEM (ctype, class_collection, , ch) |= bitw;
2641 }
2642 }
2643
2644 /* Set default values if keyword was not present. */
2645 if ((ctype->class_done & BITw (tok_upper)) == 0)
2646 /* "If this keyword [lower] is not specified, the lowercase letters
2647 `A' through `Z', ..., shall automatically belong to this class,
2648 with implementation defined character values." [P1003.2, 2.5.2.1] */
2649 set_default (BITPOS (tok_upper), 'A', 'Z');
2650
2651 if ((ctype->class_done & BITw (tok_lower)) == 0)
2652 /* "If this keyword [lower] is not specified, the lowercase letters
2653 `a' through `z', ..., shall automatically belong to this class,
2654 with implementation defined character values." [P1003.2, 2.5.2.1] */
2655 set_default (BITPOS (tok_lower), 'a', 'z');
2656
2657 if ((ctype->class_done & BITw (tok_alpha)) == 0)
2658 {
2659 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2660 class `lower' *must* be in class `alpha'. */
2661 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
2662 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2663
2664 for (cnt = 0; cnt < 256; ++cnt)
2665 if ((ctype->class256_collection[cnt] & mask) != 0)
2666 ctype->class256_collection[cnt] |= BIT (tok_alpha);
2667
2668 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2669 if ((ctype->class_collection[cnt] & maskw) != 0)
2670 ctype->class_collection[cnt] |= BITw (tok_alpha);
2671 }
2672
2673 if ((ctype->class_done & BITw (tok_digit)) == 0)
2674 /* "If this keyword [digit] is not specified, the digits `0' through
2675 `9', ..., shall automatically belong to this class, with
2676 implementation-defined character values." [P1003.2, 2.5.2.1] */
2677 set_default (BITPOS (tok_digit), '0', '9');
2678
2679 /* "Only characters specified for the `alpha' and `digit' keyword
2680 shall be specified. Characters specified for the keyword `alpha'
2681 and `digit' are automatically included in this class. */
2682 {
2683 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
2684 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2685
2686 for (cnt = 0; cnt < 256; ++cnt)
2687 if ((ctype->class256_collection[cnt] & mask) != 0)
2688 ctype->class256_collection[cnt] |= BIT (tok_alnum);
2689
2690 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2691 if ((ctype->class_collection[cnt] & maskw) != 0)
2692 ctype->class_collection[cnt] |= BITw (tok_alnum);
2693 }
2694
2695 if ((ctype->class_done & BITw (tok_space)) == 0)
2696 /* "If this keyword [space] is not specified, the characters <space>,
2697 <form-feed>, <newline>, <carriage-return>, <tab>, and
2698 <vertical-tab>, ..., shall automatically belong to this class,
2699 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2700 {
2701 struct charseq *seq;
2702
2703 seq = charmap_find_value (charmap, "space", 5);
2704 if (seq == NULL)
2705 seq = charmap_find_value (charmap, "SP", 2);
2706 if (seq == NULL)
2707 seq = charmap_find_value (charmap, "U00000020", 9);
2708 if (seq == NULL)
2709 {
2710 if (!be_quiet)
2711 error (0, 0, _("\
2712 %s: character `%s' not defined while needed as default value"),
2713 "LC_CTYPE", "<space>");
2714 }
2715 else if (seq->nbytes != 1)
2716 error (0, 0, _("\
2717 %s: character `%s' in charmap not representable with one byte"),
2718 "LC_CTYPE", "<space>");
2719 else
2720 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2721
2722 /* No need to search. */
2723 ELEM (ctype, class_collection, , L' ') |= BITw (tok_space);
2724
2725 seq = charmap_find_value (charmap, "form-feed", 9);
2726 if (seq == NULL)
2727 seq = charmap_find_value (charmap, "U0000000C", 9);
2728 if (seq == NULL)
2729 {
2730 if (!be_quiet)
2731 error (0, 0, _("\
2732 %s: character `%s' not defined while needed as default value"),
2733 "LC_CTYPE", "<form-feed>");
2734 }
2735 else if (seq->nbytes != 1)
2736 error (0, 0, _("\
2737 %s: character `%s' in charmap not representable with one byte"),
2738 "LC_CTYPE", "<form-feed>");
2739 else
2740 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2741
2742 /* No need to search. */
2743 ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space);
2744
2745
2746 seq = charmap_find_value (charmap, "newline", 7);
2747 if (seq == NULL)
2748 seq = charmap_find_value (charmap, "U0000000A", 9);
2749 if (seq == NULL)
2750 {
2751 if (!be_quiet)
2752 error (0, 0, _("\
2753 character `%s' not defined while needed as default value"),
2754 "<newline>");
2755 }
2756 else if (seq->nbytes != 1)
2757 error (0, 0, _("\
2758 %s: character `%s' in charmap not representable with one byte"),
2759 "LC_CTYPE", "<newline>");
2760 else
2761 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2762
2763 /* No need to search. */
2764 ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space);
2765
2766
2767 seq = charmap_find_value (charmap, "carriage-return", 15);
2768 if (seq == NULL)
2769 seq = charmap_find_value (charmap, "U0000000D", 9);
2770 if (seq == NULL)
2771 {
2772 if (!be_quiet)
2773 error (0, 0, _("\
2774 %s: character `%s' not defined while needed as default value"),
2775 "LC_CTYPE", "<carriage-return>");
2776 }
2777 else if (seq->nbytes != 1)
2778 error (0, 0, _("\
2779 %s: character `%s' in charmap not representable with one byte"),
2780 "LC_CTYPE", "<carriage-return>");
2781 else
2782 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2783
2784 /* No need to search. */
2785 ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space);
2786
2787
2788 seq = charmap_find_value (charmap, "tab", 3);
2789 if (seq == NULL)
2790 seq = charmap_find_value (charmap, "U00000009", 9);
2791 if (seq == NULL)
2792 {
2793 if (!be_quiet)
2794 error (0, 0, _("\
2795 %s: character `%s' not defined while needed as default value"),
2796 "LC_CTYPE", "<tab>");
2797 }
2798 else if (seq->nbytes != 1)
2799 error (0, 0, _("\
2800 %s: character `%s' in charmap not representable with one byte"),
2801 "LC_CTYPE", "<tab>");
2802 else
2803 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2804
2805 /* No need to search. */
2806 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space);
2807
2808
2809 seq = charmap_find_value (charmap, "vertical-tab", 12);
2810 if (seq == NULL)
2811 seq = charmap_find_value (charmap, "U0000000B", 9);
2812 if (seq == NULL)
2813 {
2814 if (!be_quiet)
2815 error (0, 0, _("\
2816 %s: character `%s' not defined while needed as default value"),
2817 "LC_CTYPE", "<vertical-tab>");
2818 }
2819 else if (seq->nbytes != 1)
2820 error (0, 0, _("\
2821 %s: character `%s' in charmap not representable with one byte"),
2822 "LC_CTYPE", "<vertical-tab>");
2823 else
2824 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2825
2826 /* No need to search. */
2827 ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space);
2828 }
2829
2830 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
2831 /* "If this keyword is not specified, the digits `0' to `9', the
2832 uppercase letters `A' through `F', and the lowercase letters `a'
2833 through `f', ..., shell automatically belong to this class, with
2834 implementation defined character values." [P1003.2, 2.5.2.1] */
2835 {
2836 set_default (BITPOS (tok_xdigit), '0', '9');
2837 set_default (BITPOS (tok_xdigit), 'A', 'F');
2838 set_default (BITPOS (tok_xdigit), 'a', 'f');
2839 }
2840
2841 if ((ctype->class_done & BITw (tok_blank)) == 0)
2842 /* "If this keyword [blank] is unspecified, the characters <space> and
2843 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
2844 {
2845 struct charseq *seq;
2846
2847 seq = charmap_find_value (charmap, "space", 5);
2848 if (seq == NULL)
2849 seq = charmap_find_value (charmap, "SP", 2);
2850 if (seq == NULL)
2851 seq = charmap_find_value (charmap, "U00000020", 9);
2852 if (seq == NULL)
2853 {
2854 if (!be_quiet)
2855 error (0, 0, _("\
2856 %s: character `%s' not defined while needed as default value"),
2857 "LC_CTYPE", "<space>");
2858 }
2859 else if (seq->nbytes != 1)
2860 error (0, 0, _("\
2861 %s: character `%s' in charmap not representable with one byte"),
2862 "LC_CTYPE", "<space>");
2863 else
2864 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2865
2866 /* No need to search. */
2867 ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank);
2868
2869
2870 seq = charmap_find_value (charmap, "tab", 3);
2871 if (seq == NULL)
2872 seq = charmap_find_value (charmap, "U00000009", 9);
2873 if (seq == NULL)
2874 {
2875 if (!be_quiet)
2876 error (0, 0, _("\
2877 %s: character `%s' not defined while needed as default value"),
2878 "LC_CTYPE", "<tab>");
2879 }
2880 else if (seq->nbytes != 1)
2881 error (0, 0, _("\
2882 %s: character `%s' in charmap not representable with one byte"),
2883 "LC_CTYPE", "<tab>");
2884 else
2885 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2886
2887 /* No need to search. */
2888 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank);
2889 }
2890
2891 if ((ctype->class_done & BITw (tok_graph)) == 0)
2892 /* "If this keyword [graph] is not specified, characters specified for
2893 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
2894 shall belong to this character class." [P1003.2, 2.5.2.1] */
2895 {
2896 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2897 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2898 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
2899 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
2900 BITw (tok_punct);
2901 size_t cnt;
2902
2903 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2904 if ((ctype->class_collection[cnt] & maskw) != 0)
2905 ctype->class_collection[cnt] |= BITw (tok_graph);
2906
2907 for (cnt = 0; cnt < 256; ++cnt)
2908 if ((ctype->class256_collection[cnt] & mask) != 0)
2909 ctype->class256_collection[cnt] |= BIT (tok_graph);
2910 }
2911
2912 if ((ctype->class_done & BITw (tok_print)) == 0)
2913 /* "If this keyword [print] is not provided, characters specified for
2914 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
2915 and the <space> character shall belong to this character class."
2916 [P1003.2, 2.5.2.1] */
2917 {
2918 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2919 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2920 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
2921 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
2922 BITw (tok_punct);
2923 size_t cnt;
2924 struct charseq *seq;
2925
2926 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2927 if ((ctype->class_collection[cnt] & maskw) != 0)
2928 ctype->class_collection[cnt] |= BITw (tok_print);
2929
2930 for (cnt = 0; cnt < 256; ++cnt)
2931 if ((ctype->class256_collection[cnt] & mask) != 0)
2932 ctype->class256_collection[cnt] |= BIT (tok_print);
2933
2934
2935 seq = charmap_find_value (charmap, "space", 5);
2936 if (seq == NULL)
2937 seq = charmap_find_value (charmap, "SP", 2);
2938 if (seq == NULL)
2939 seq = charmap_find_value (charmap, "U00000020", 9);
2940 if (seq == NULL)
2941 {
2942 if (!be_quiet)
2943 error (0, 0, _("\
2944 %s: character `%s' not defined while needed as default value"),
2945 "LC_CTYPE", "<space>");
2946 }
2947 else if (seq->nbytes != 1)
2948 error (0, 0, _("\
2949 %s: character `%s' in charmap not representable with one byte"),
2950 "LC_CTYPE", "<space>");
2951 else
2952 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
2953
2954 /* No need to search. */
2955 ELEM (ctype, class_collection, , L' ') |= BITw (tok_print);
2956 }
2957
2958 if (ctype->tomap_done[0] == 0)
2959 /* "If this keyword [toupper] is not specified, the lowercase letters
2960 `a' through `z', and their corresponding uppercase letters `A' to
2961 `Z', ..., shall automatically be included, with implementation-
2962 defined character values." [P1003.2, 2.5.2.1] */
2963 {
2964 char tmp[4];
2965 int ch;
2966
2967 strcpy (tmp, "<?>");
2968
2969 for (ch = 'a'; ch <= 'z'; ++ch)
2970 {
2971 struct charseq *seq_from, *seq_to;
2972
2973 tmp[1] = (char) ch;
2974
2975 seq_from = charmap_find_value (charmap, &tmp[1], 1);
2976 if (seq_from == NULL)
2977 {
2978 if (!be_quiet)
2979 error (0, 0, _("\
2980 %s: character `%s' not defined while needed as default value"),
2981 "LC_CTYPE", tmp);
2982 }
2983 else if (seq_from->nbytes != 1)
2984 {
2985 if (!be_quiet)
2986 error (0, 0, _("\
2987 %s: character `%s' needed as default value not representable with one byte"),
2988 "LC_CTYPE", tmp);
2989 }
2990 else
2991 {
2992 /* This conversion is implementation defined. */
2993 tmp[1] = (char) (ch + ('A' - 'a'));
2994 seq_to = charmap_find_value (charmap, &tmp[1], 1);
2995 if (seq_to == NULL)
2996 {
2997 if (!be_quiet)
2998 error (0, 0, _("\
2999 %s: character `%s' not defined while needed as default value"),
3000 "LC_CTYPE", tmp);
3001 }
3002 else if (seq_to->nbytes != 1)
3003 {
3004 if (!be_quiet)
3005 error (0, 0, _("\
3006 %s: character `%s' needed as default value not representable with one byte"),
3007 "LC_CTYPE", tmp);
3008 }
3009 else
3010 /* The index [0] is determined by the order of the
3011 `ctype_map_newP' calls in `ctype_startup'. */
3012 ctype->map256_collection[0][seq_from->bytes[0]]
3013 = seq_to->bytes[0];
3014 }
3015
3016 /* No need to search. */
3017 ELEM (ctype, map_collection, [0], ch) = ch + ('A' - 'a');
3018 }
3019 }
3020
3021 if (ctype->tomap_done[1] == 0)
3022 /* "If this keyword [tolower] is not specified, the mapping shall be
3023 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3024 {
3025 for (cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
3026 if (ctype->map_collection[0][cnt] != 0)
3027 ELEM (ctype, map_collection, [1],
3028 ctype->map_collection[0][cnt])
3029 = ctype->charnames[cnt];
3030
3031 for (cnt = 0; cnt < 256; ++cnt)
3032 if (ctype->map256_collection[0][cnt] != 0)
3033 ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt;
3034 }
3035
3036 if (ctype->outdigits_act == 0)
3037 {
3038 for (cnt = 0; cnt < 10; ++cnt)
3039 {
3040 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3041 digits + cnt, 1);
3042
3043 if (ctype->mboutdigits[cnt] == NULL)
3044 {
3045 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
3046 longnames[cnt],
3047 strlen (longnames[cnt]));
3048
3049 if (ctype->mboutdigits[cnt] == NULL)
3050 {
3051 /* Provide a replacement. */
3052 error (0, 0, _("\
3053 no output digits defined and none of the standard names in the charmap"));
3054
3055 ctype->mboutdigits[cnt] = obstack_alloc (&charmap->mem_pool,
3056 sizeof (struct charseq) + 1);
3057
3058 /* This is better than nothing. */
3059 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
3060 ctype->mboutdigits[cnt]->nbytes = 1;
3061 }
3062 }
3063
3064 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
3065 digits + cnt, 1);
3066
3067 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
3068 {
3069 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
3070 longnames[cnt],
3071 strlen (longnames[cnt]));
3072
3073 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
3074 {
3075 /* Provide a replacement. */
3076 error (0, 0, _("\
3077 no output digits defined and none of the standard names in the repertoire"));
3078
3079 /* This is better than nothing. */
3080 ctype->wcoutdigits[cnt] = (uint32_t) digits[cnt];
3081 }
3082 }
3083 }
3084
3085 ctype->outdigits_act = 10;
3086 }
3087 }
3088
3089
3090 static void
3091 allocate_arrays (struct locale_ctype_t *ctype, struct charmap_t *charmap,
3092 struct repertoire_t *repertoire)
3093 {
3094 size_t idx;
3095 size_t width_table_size;
3096
3097 /* First we have to decide how we organize the arrays. It is easy
3098 for a one-byte character set. But multi-byte character set
3099 cannot be stored flat because the chars might be sparsely used.
3100 So we determine an optimal hashing function for the used
3101 characters.
3102
3103 We use a very trivial hashing function to store the sparse
3104 table. CH % TABSIZE is used as an index. To solve multiple hits
3105 we have N planes. This guarantees a fixed search time for a
3106 character [N / 2]. In the following code we determine the minimum
3107 value for TABSIZE * N, where TABSIZE >= 256.
3108
3109 Some people complained that this algorithm takes too long. Well,
3110 go on, improve it. But changing the step size is *not* an
3111 option. Some people changed this to use only sizes of prime
3112 numbers. Think again, do some math. We are looking for the
3113 optimal solution, not something which works in general. Unless
3114 somebody can provide a dynamic programming solution I think this
3115 implementation is as good as it can get. */
3116 size_t min_total = UINT_MAX;
3117 size_t act_size = 256;
3118
3119 if (!be_quiet && ctype->charnames_act > 512)
3120 fputs (_("\
3121 Computing table size for character classes might take a while..."),
3122 stderr);
3123
3124 /* While we want to have a small total size we are willing to use a
3125 little bit larger table if this reduces the number of layers.
3126 Therefore we add a little penalty to the number of planes.
3127 Maybe this constant has to be adjusted a bit. */
3128 #define PENALTY 128
3129 do
3130 {
3131 size_t cnt[act_size];
3132 size_t act_planes = 1;
3133
3134 memset (cnt, '\0', sizeof cnt);
3135
3136 for (idx = 0; idx < 256; ++idx)
3137 cnt[idx] = 1;
3138
3139 for (idx = 0; idx < ctype->charnames_act; ++idx)
3140 if (ctype->charnames[idx] >= 256)
3141 {
3142 size_t nr = ctype->charnames[idx] % act_size;
3143
3144 if (++cnt[nr] > act_planes)
3145 {
3146 act_planes = cnt[nr];
3147 if ((act_size + PENALTY) * act_planes >= min_total)
3148 break;
3149 }
3150 }
3151
3152 if ((act_size + PENALTY) * act_planes < min_total)
3153 {
3154 min_total = (act_size + PENALTY) * act_planes;
3155 ctype->plane_size = act_size;
3156 ctype->plane_cnt = act_planes;
3157 }
3158
3159 ++act_size;
3160 }
3161 while (act_size < min_total);
3162
3163 if (!be_quiet && ctype->charnames_act > 512)
3164 fputs (_(" done\n"), stderr);
3165
3166
3167 ctype->names = (uint32_t *) xcalloc (ctype->plane_size
3168 * ctype->plane_cnt,
3169 sizeof (uint32_t));
3170
3171 for (idx = 1; idx < 256; ++idx)
3172 ctype->names[idx] = idx;
3173
3174 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
3175 ctype->names[0] = 1;
3176
3177 for (idx = 256; idx < ctype->charnames_act; ++idx)
3178 {
3179 size_t nr = (ctype->charnames[idx] % ctype->plane_size);
3180 size_t depth = 0;
3181
3182 while (ctype->names[nr + depth * ctype->plane_size])
3183 ++depth;
3184 assert (depth < ctype->plane_cnt);
3185
3186 ctype->names[nr + depth * ctype->plane_size] = ctype->charnames[idx];
3187
3188 /* Now for faster access remember the index in the NAMES_B array. */
3189 ctype->charnames[idx] = nr + depth * ctype->plane_size;
3190 }
3191 ctype->names[0] = 0;
3192
3193
3194 /* You wonder about this amount of memory? This is only because some
3195 users do not manage to address the array with unsigned values or
3196 data types with range >= 256. '\200' would result in the array
3197 index -128. To help these poor people we duplicate the entries for
3198 128 up to 255 below the entry for \0. */
3199 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128,
3200 sizeof (char_class_t));
3201 ctype->ctype32_b = (char_class32_t *) xcalloc (ctype->plane_size
3202 * ctype->plane_cnt,
3203 sizeof (char_class32_t));
3204
3205 /* This is the array accessed using the multibyte string elements. */
3206 for (idx = 0; idx < 256; ++idx)
3207 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
3208
3209 /* Mirror first 127 entries. We must take care that entry -1 is not
3210 mirrored because EOF == -1. */
3211 for (idx = 0; idx < 127; ++idx)
3212 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3213
3214 /* The 32 bit array contains all characters. */
3215 for (idx = 0; idx < ctype->class_collection_act; ++idx)
3216 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
3217
3218 /* Room for table of mappings. */
3219 ctype->map = (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3220 ctype->map32 = (uint32_t **) xmalloc (ctype->map_collection_nr
3221 * sizeof (uint32_t *));
3222
3223 /* Fill in all mappings. */
3224 for (idx = 0; idx < 2; ++idx)
3225 {
3226 unsigned int idx2;
3227
3228 /* Allocate table. */
3229 ctype->map[idx] = (uint32_t *) xmalloc ((256 + 128) * sizeof (uint32_t));
3230
3231 /* Copy values from collection. */
3232 for (idx2 = 0; idx2 < 256; ++idx2)
3233 ctype->map[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
3234
3235 /* Mirror first 127 entries. We must take care not to map entry
3236 -1 because EOF == -1. */
3237 for (idx2 = 0; idx2 < 127; ++idx2)
3238 ctype->map[idx][idx2] = ctype->map[idx][256 + idx2];
3239
3240 /* EOF must map to EOF. */
3241 ctype->map[idx][127] = EOF;
3242 }
3243
3244 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3245 {
3246 unsigned int idx2;
3247
3248 /* Allocate table. */
3249 ctype->map32[idx] = (uint32_t *) xmalloc (ctype->plane_size
3250 * ctype->plane_cnt
3251 * sizeof (uint32_t));
3252
3253 /* Copy default value (identity mapping). */
3254 memcpy (ctype->map32[idx], ctype->names,
3255 ctype->plane_size * ctype->plane_cnt * sizeof (uint32_t));
3256
3257 /* Copy values from collection. */
3258 for (idx2 = 0; idx2 < 256; ++idx2)
3259 if (ctype->map_collection[idx][idx2] != 0)
3260 ctype->map32[idx][idx2] = ctype->map_collection[idx][idx2];
3261
3262 while (idx2 < ctype->map_collection_act[idx])
3263 {
3264 if (ctype->map_collection[idx][idx2] != 0)
3265 ctype->map32[idx][ctype->charnames[idx2]] =
3266 ctype->map_collection[idx][idx2];
3267 ++idx2;
3268 }
3269 }
3270
3271 /* Extra array for class and map names. */
3272 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3273 * sizeof (uint32_t));
3274 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3275 * sizeof (uint32_t));
3276
3277 /* Array for width information. Because the expected widths are very
3278 small we use only a single byte. This saves space and we need
3279 not provide the information twice with both endiannesses. */
3280 width_table_size = (ctype->plane_size * ctype->plane_cnt + 3) & ~3ul;
3281 ctype->width = (unsigned char *) xmalloc (width_table_size);
3282
3283 /* Initialize with default width value. */
3284 memset (ctype->width, charmap->width_default, width_table_size);
3285 if (charmap->width_rules != NULL)
3286 {
3287 size_t cnt;
3288
3289 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
3290 {
3291 unsigned char bytes[charmap->mb_cur_max];
3292 int nbytes = charmap->width_rules[cnt].from->nbytes;
3293
3294 /* We have the range of characters for which the width is
3295 specified described using byte sequences of the multibyte
3296 charset. We have to convert this to UCS4 now. And we
3297 cannot simply convert the beginning and the end of the
3298 sequence, we have to iterate over the byte sequence and
3299 convert it for every single character. */
3300 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3301
3302 while (nbytes < charmap->width_rules[cnt].to->nbytes
3303 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3304 nbytes) <= 0)
3305 {
3306 /* Find the UCS value for `bytes'. */
3307 int inner;
3308 uint32_t wch;
3309 struct charseq *seq =
3310 charmap_find_symbol (charmap, bytes, nbytes);
3311
3312 if (seq == NULL)
3313 wch = ILLEGAL_CHAR_VALUE;
3314 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
3315 wch = seq->ucs4;
3316 else
3317 wch = repertoire_find_value (ctype->repertoire, seq->name,
3318 strlen (seq->name));
3319
3320 if (wch != ILLEGAL_CHAR_VALUE)
3321 {
3322 /* Store the value. */
3323 size_t nr = wch % ctype->plane_size;
3324 size_t depth = 0;
3325
3326 while (ctype->names[nr + depth * ctype->plane_size] != wch)
3327 ++depth;
3328 assert (depth < ctype->plane_cnt);
3329
3330 ctype->width[nr + depth * ctype->plane_size]
3331 = charmap->width_rules[cnt].width;
3332 }
3333
3334 /* "Increment" the bytes sequence. */
3335 inner = nbytes - 1;
3336 while (inner >= 0 && bytes[inner] == 0xff)
3337 --inner;
3338
3339 if (inner < 0)
3340 {
3341 /* We have to extend the byte sequence. */
3342 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3343 break;
3344
3345 bytes[0] = 1;
3346 memset (&bytes[1], 0, nbytes);
3347 ++nbytes;
3348 }
3349 else
3350 {
3351 ++bytes[inner];
3352 while (++inner < nbytes)
3353 bytes[inner] = 0;
3354 }
3355 }
3356 }
3357 }
3358
3359 /* Set MB_CUR_MAX. */
3360 ctype->mb_cur_max = charmap->mb_cur_max;
3361
3362 /* Now determine the table for the transliteration information.
3363
3364 XXX It is not yet clear to me whether it is worth implementing a
3365 complicated algorithm which uses a hash table to locate the entries.
3366 For now I'll use a simple array which can be searched using binary
3367 search. */
3368 if (ctype->translit_copy_locale != NULL)
3369 {
3370 /* Fold in the transliteration information from the locale mentioned
3371 in the `include' statement. */
3372 struct locale_ctype_t *here = ctype;
3373
3374 do
3375 {
3376 struct localedef_t *other = find_locale (LC_CTYPE,
3377 here->translit_copy_locale,
3378 repertoire->name, charmap);
3379
3380 if (other == NULL)
3381 {
3382 error (0, 0, _("\
3383 %s: transliteration data from locale `%s' not available"),
3384 "LC_CTYPE", here->translit_copy_locale);
3385 break;
3386 }
3387
3388 here = other->categories[LC_CTYPE].ctype;
3389
3390 /* Enqueue the information if necessary. */
3391 if (here->translit != NULL)
3392 {
3393 struct translit_t *endp = here->translit;
3394 while (endp->next != NULL)
3395 endp = endp->next;
3396
3397 endp->next = ctype->translit;
3398 ctype->translit = here->translit;
3399 }
3400 }
3401 while (here->translit_copy_locale != NULL);
3402 }
3403
3404 if (ctype->translit != NULL)
3405 {
3406 /* First count how many entries we have. This is the upper limit
3407 since some entries from the included files might be overwritten. */
3408 size_t number = 0;
3409 size_t cnt;
3410 struct translit_t *runp = ctype->translit;
3411 struct translit_t **sorted;
3412 size_t from_len, to_len;
3413
3414 while (runp != NULL)
3415 {
3416 ++number;
3417 runp = runp->next;
3418 }
3419
3420 /* Next we allocate an array large enough and fill in the values. */
3421 sorted = (struct translit_t **) alloca (number
3422 * sizeof (struct translit_t **));
3423 runp = ctype->translit;
3424 number = 0;
3425 do
3426 {
3427 /* Search for the place where to insert this string.
3428 XXX Better use a real sorting algorithm later. */
3429 size_t idx = 0;
3430 int replace = 0;
3431
3432 while (idx < number)
3433 {
3434 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
3435 (const wchar_t *) runp->from);
3436 if (res == 0)
3437 {
3438 replace = 1;
3439 break;
3440 }
3441 if (res > 0)
3442 break;
3443 ++idx;
3444 }
3445
3446 if (replace)
3447 sorted[idx] = runp;
3448 else
3449 {
3450 memmove (&sorted[idx + 1], &sorted[idx],
3451 (number - idx) * sizeof (struct translit_t *));
3452 sorted[idx] = runp;
3453 ++number;
3454 }
3455
3456 runp = runp->next;
3457 }
3458 while (runp != NULL);
3459
3460 /* The next step is putting all the possible transliteration
3461 strings in one memory block so that we can write it out.
3462 We need several different blocks:
3463 - index to the from-string array
3464 - from-string array
3465 - index to the to-string array
3466 - to-string array.
3467 */
3468 from_len = to_len = 0;
3469 for (cnt = 0; cnt < number; ++cnt)
3470 {
3471 struct translit_to_t *srunp;
3472 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3473 srunp = sorted[cnt]->to;
3474 while (srunp != NULL)
3475 {
3476 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
3477 srunp = srunp->next;
3478 }
3479 /* Plus one for the extra NUL character marking the end of
3480 the list for the current entry. */
3481 ++to_len;
3482 }
3483
3484 /* We can allocate the arrays for the results. */
3485 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
3486 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
3487 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
3488 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
3489
3490 from_len = 0;
3491 to_len = 0;
3492 for (cnt = 0; cnt < number; ++cnt)
3493 {
3494 size_t len;
3495 struct translit_to_t *srunp;
3496
3497 ctype->translit_from_idx[cnt] = from_len;
3498 ctype->translit_to_idx[cnt] = to_len;
3499
3500 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3501 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
3502 (const wchar_t *) sorted[cnt]->from, len);
3503 from_len += len;
3504
3505 ctype->translit_to_idx[cnt] = to_len;
3506 srunp = sorted[cnt]->to;
3507 while (srunp != NULL)
3508 {
3509 len = wcslen ((const wchar_t *) srunp->str) + 1;
3510 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
3511 (const wchar_t *) srunp->str, len);
3512 to_len += len;
3513 srunp = srunp->next;
3514 }
3515 ctype->translit_to_tbl[to_len++] = L'\0';
3516 }
3517
3518 /* Store the information about the length. */
3519 ctype->translit_idx_size = number * sizeof (uint32_t);
3520 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
3521 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
3522 }
3523 else
3524 {
3525 /* Provide some dummy pointers since we have nothing to write out. */
3526 static uint32_t no_str = { 0 };
3527
3528 ctype->translit_from_idx = &no_str;
3529 ctype->translit_from_tbl = &no_str;
3530 ctype->translit_to_tbl = &no_str;
3531 ctype->translit_idx_size = 0;
3532 ctype->translit_from_tbl_size = 0;
3533 ctype->translit_to_tbl_size = 0;
3534 }
3535 }
This page took 0.223603 seconds and 5 git commands to generate.