]> sourceware.org Git - glibc.git/blob - libidn/nfkc.c
Move sysdeps/unix/common/tcsendbrk.c to sysdeps/unix/sysv/linux/.
[glibc.git] / libidn / nfkc.c
1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
3 *
4 * This file is part of GNU Libidn.
5 *
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #if HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include <stdlib.h>
25 #include <string.h>
26
27 #include "stringprep.h"
28
29 /* This file contains functions from GLIB, including gutf8.c and
30 * gunidecomp.c, all licensed under LGPL and copyright held by:
31 *
32 * Copyright (C) 1999, 2000 Tom Tromey
33 * Copyright 2000 Red Hat, Inc.
34 */
35
36 /* Hacks to make syncing with GLIB code easier. */
37 #define gboolean int
38 #define gchar char
39 #define guchar unsigned char
40 #define glong long
41 #define gint int
42 #define guint unsigned int
43 #define gushort unsigned short
44 #define gint16 int16_t
45 #define guint16 uint16_t
46 #define gunichar uint32_t
47 #define gsize size_t
48 #define gssize ssize_t
49 #define g_malloc malloc
50 #define g_free free
51 #define GError void
52 #define g_set_error(a,b,c,d) ((void) 0)
53 #define g_new(struct_type, n_structs) \
54 ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
55 # if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
56 # define G_STMT_START (void)(
57 # define G_STMT_END )
58 # else
59 # if (defined (sun) || defined (__sun__))
60 # define G_STMT_START if (1)
61 # define G_STMT_END else (void)0
62 # else
63 # define G_STMT_START do
64 # define G_STMT_END while (0)
65 # endif
66 # endif
67 #define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
68 #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
69 #define TRUE 1
70 #define FALSE 0
71
72 /* Code from GLIB gunicode.h starts here. */
73
/* Normalization forms accepted by the normalizer.  Every GLib-style
   name is immediately followed by its NFxx alias, which has the same
   numeric value.  */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
86
87 /* Code from GLIB gutf8.c starts here. */
88
/* Classify Char, the lead byte of a UTF-8 sequence.  On return Len
   holds the total sequence length in bytes (1..6), or -1 when Char
   cannot start a sequence, and Mask holds the bit mask selecting the
   payload bits of the lead byte.  Mask is left untouched when Len is
   set to -1.

   The body is wrapped in do/while (0) so that the macro expands to
   exactly one statement and can be used safely as the body of an
   if/else; invoke it with a trailing semicolon.  */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
122
/* Number of bytes (1..6) needed to encode code point Char in UTF-8.
   The thresholds are tested from the largest downward; Char is
   evaluated more than once, so it must have no side effects.  */
#define UTF8_LENGTH(Char)                       \
  ((Char) >= 0x4000000 ? 6                      \
   : ((Char) >= 0x200000 ? 5                    \
      : ((Char) >= 0x10000 ? 4                  \
         : ((Char) >= 0x800 ? 3                 \
            : ((Char) >= 0x80 ? 2 : 1)))))
129
130
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.
   Mask selects the payload bits of the lead byte and Count is a
   scratch integer lvalue used as the byte index.  If a byte after the
   first is not a continuation byte (10xxxxxx), Result is set to -1
   and decoding stops at that byte.

   The body is wrapped in do/while (0) so that the macro expands to
   exactly one statement; invoke it with a trailing semicolon.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len)       \
  do                                                    \
    {                                                   \
      (Result) = (Chars)[0] & (Mask);                   \
      for ((Count) = 1; (Count) < (Len); ++(Count))     \
        {                                               \
          if (((Chars)[(Count)] & 0xc0) != 0x80)        \
            {                                           \
              (Result) = -1;                            \
              break;                                    \
            }                                           \
          (Result) <<= 6;                               \
          (Result) |= ((Chars)[(Count)] & 0x3f);        \
        }                                               \
    }                                                   \
  while (0)
143
/* Non-zero when Char is an acceptable code point: below 0x110000, not
   in the surrogate block 0xD800..0xDFFF, not in 0xFDD0..0xFDEF, and
   not ending in FFFE or FFFF.  Written as the negation of the list of
   rejected ranges.  */
#define UNICODE_VALID(Char)                             \
  (!((Char) >= 0x110000                                 \
     || (((Char) & 0xFFFFF800) == 0xD800)               \
     || ((Char) >= 0xFDD0 && (Char) <= 0xFDEF)          \
     || ((Char) & 0xFFFE) == 0xFFFE))
149
150
151 static const gchar utf8_skip_data[256] = {
152 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
153 1, 1, 1, 1, 1, 1, 1,
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157 1, 1, 1, 1, 1, 1, 1,
158 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159 1, 1, 1, 1, 1, 1, 1,
160 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161 1, 1, 1, 1, 1, 1, 1,
162 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
163 1, 1, 1, 1, 1, 1, 1,
164 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
165 2, 2, 2, 2, 2, 2, 2,
166 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
167 5, 5, 5, 6, 6, 1, 1
168 };
169
170 const gchar *const g_utf8_skip = utf8_skip_data;
171
172 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
173
174 /*
175 * g_utf8_strlen:
176 * @p: pointer to the start of a UTF-8 encoded string.
177 * @max: the maximum number of bytes to examine. If @max
178 * is less than 0, then the string is assumed to be
179 * nul-terminated. If @max is 0, @p will not be examined and
180 * may be %NULL.
181 *
182 * Returns the length of the string in characters.
183 *
184 * Return value: the length of the string in characters
185 **/
186 static glong
187 g_utf8_strlen (const gchar * p, gssize max)
188 {
189 glong len = 0;
190 const gchar *start = p;
191 g_return_val_if_fail (p != NULL || max == 0, 0);
192
193 if (max < 0)
194 {
195 while (*p)
196 {
197 p = g_utf8_next_char (p);
198 ++len;
199 }
200 }
201 else
202 {
203 if (max == 0 || !*p)
204 return 0;
205
206 p = g_utf8_next_char (p);
207
208 while (p - start < max && *p)
209 {
210 ++len;
211 p = g_utf8_next_char (p);
212 }
213
214 /* only do the last len increment if we got a complete
215 * char (don't count partial chars)
216 */
217 if (p - start == max)
218 ++len;
219 }
220
221 return len;
222 }
223
224 /*
225 * g_utf8_get_char:
226 * @p: a pointer to Unicode character encoded as UTF-8
227 *
228 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
229 * If @p does not point to a valid UTF-8 encoded character, results are
230 * undefined. If you are not sure that the bytes are complete
231 * valid Unicode characters, you should use g_utf8_get_char_validated()
232 * instead.
233 *
234 * Return value: the resulting character
235 **/
236 static gunichar
237 g_utf8_get_char (const gchar * p)
238 {
239 int i, mask = 0, len;
240 gunichar result;
241 unsigned char c = (unsigned char) *p;
242
243 UTF8_COMPUTE (c, mask, len);
244 if (len == -1)
245 return (gunichar) - 1;
246 UTF8_GET (result, p, i, mask, len);
247
248 return result;
249 }
250
251 /*
252 * g_unichar_to_utf8:
253 * @c: a ISO10646 character code
254 * @outbuf: output buffer, must have at least 6 bytes of space.
255 * If %NULL, the length will be computed and returned
256 * and nothing will be written to @outbuf.
257 *
258 * Converts a single character to UTF-8.
259 *
260 * Return value: number of bytes written
261 **/
262 static int
263 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
264 {
265 guint len = 0;
266 int first;
267 int i;
268
269 if (c < 0x80)
270 {
271 first = 0;
272 len = 1;
273 }
274 else if (c < 0x800)
275 {
276 first = 0xc0;
277 len = 2;
278 }
279 else if (c < 0x10000)
280 {
281 first = 0xe0;
282 len = 3;
283 }
284 else if (c < 0x200000)
285 {
286 first = 0xf0;
287 len = 4;
288 }
289 else if (c < 0x4000000)
290 {
291 first = 0xf8;
292 len = 5;
293 }
294 else
295 {
296 first = 0xfc;
297 len = 6;
298 }
299
300 if (outbuf)
301 {
302 for (i = len - 1; i > 0; --i)
303 {
304 outbuf[i] = (c & 0x3f) | 0x80;
305 c >>= 6;
306 }
307 outbuf[0] = c | first;
308 }
309
310 return len;
311 }
312
313 /*
314 * g_utf8_to_ucs4_fast:
315 * @str: a UTF-8 encoded string
316 * @len: the maximum length of @str to use. If @len < 0, then
317 * the string is nul-terminated.
318 * @items_written: location to store the number of characters in the
319 * result, or %NULL.
320 *
321 * Convert a string from UTF-8 to a 32-bit fixed width
322 * representation as UCS-4, assuming valid UTF-8 input.
323 * This function is roughly twice as fast as g_utf8_to_ucs4()
324 * but does no error checking on the input.
325 *
326 * Return value: a pointer to a newly allocated UCS-4 string.
327 * This value must be freed with g_free().
328 **/
329 static gunichar *
330 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
331 {
332 gint j, charlen;
333 gunichar *result;
334 gint n_chars, i;
335 const gchar *p;
336
337 g_return_val_if_fail (str != NULL, NULL);
338
339 p = str;
340 n_chars = 0;
341 if (len < 0)
342 {
343 while (*p)
344 {
345 p = g_utf8_next_char (p);
346 ++n_chars;
347 }
348 }
349 else
350 {
351 while (p < str + len && *p)
352 {
353 p = g_utf8_next_char (p);
354 ++n_chars;
355 }
356 }
357
358 result = g_new (gunichar, n_chars + 1);
359 if (!result)
360 return NULL;
361
362 p = str;
363 for (i = 0; i < n_chars; i++)
364 {
365 gunichar wc = ((unsigned char *) p)[0];
366
367 if (wc < 0x80)
368 {
369 result[i] = wc;
370 p++;
371 }
372 else
373 {
374 if (wc < 0xe0)
375 {
376 charlen = 2;
377 wc &= 0x1f;
378 }
379 else if (wc < 0xf0)
380 {
381 charlen = 3;
382 wc &= 0x0f;
383 }
384 else if (wc < 0xf8)
385 {
386 charlen = 4;
387 wc &= 0x07;
388 }
389 else if (wc < 0xfc)
390 {
391 charlen = 5;
392 wc &= 0x03;
393 }
394 else
395 {
396 charlen = 6;
397 wc &= 0x01;
398 }
399
400 for (j = 1; j < charlen; j++)
401 {
402 wc <<= 6;
403 wc |= ((unsigned char *) p)[j] & 0x3f;
404 }
405
406 result[i] = wc;
407 p += charlen;
408 }
409 }
410 result[i] = 0;
411
412 if (items_written)
413 *items_written = i;
414
415 return result;
416 }
417
418 /*
419 * g_ucs4_to_utf8:
420 * @str: a UCS-4 encoded string
421 * @len: the maximum length of @str to use. If @len < 0, then
422 * the string is terminated with a 0 character.
423 * @items_read: location to store number of characters read read, or %NULL.
424 * @items_written: location to store number of bytes written or %NULL.
425 * The value here stored does not include the trailing 0
426 * byte.
427 * @error: location to store the error occuring, or %NULL to ignore
428 * errors. Any of the errors in #GConvertError other than
429 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
430 *
431 * Convert a string from a 32-bit fixed width representation as UCS-4.
432 * to UTF-8. The result will be terminated with a 0 byte.
433 *
434 * Return value: a pointer to a newly allocated UTF-8 string.
435 * This value must be freed with g_free(). If an
436 * error occurs, %NULL will be returned and
437 * @error set.
438 **/
439 static gchar *
440 g_ucs4_to_utf8 (const gunichar * str,
441 glong len,
442 glong * items_read, glong * items_written, GError ** error)
443 {
444 gint result_length;
445 gchar *result = NULL;
446 gchar *p;
447 gint i;
448
449 result_length = 0;
450 for (i = 0; len < 0 || i < len; i++)
451 {
452 if (!str[i])
453 break;
454
455 if (str[i] >= 0x80000000)
456 {
457 if (items_read)
458 *items_read = i;
459
460 g_set_error (error, G_CONVERT_ERROR,
461 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
462 _("Character out of range for UTF-8"));
463 goto err_out;
464 }
465
466 result_length += UTF8_LENGTH (str[i]);
467 }
468
469 result = g_malloc (result_length + 1);
470 if (!result)
471 return NULL;
472 p = result;
473
474 i = 0;
475 while (p < result + result_length)
476 p += g_unichar_to_utf8 (str[i++], p);
477
478 *p = '\0';
479
480 if (items_written)
481 *items_written = p - result;
482
483 err_out:
484 if (items_read)
485 *items_read = i;
486
487 return result;
488 }
489
490 /* Code from GLIB gunidecomp.c starts here. */
491
492 #include "gunidecomp.h"
493 #include "gunicomp.h"
494
/* Look up one entry through a page table: a page entry that is at
   least G_UNICODE_MAX_TABLE_INDEX encodes a single value shared by
   the whole page (stored with that offset added); a smaller entry is
   a row index into cclass_data.  */
#define CC_LOOKUP(Table, Page, Char)                            \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX)                 \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX)                \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char.  Code points up to
   G_UNICODE_LAST_CHAR_PART1 use the first page table, those from
   0xe0000 up to G_UNICODE_LAST_CHAR use the second, and everything
   else has class 0.  */
#define COMBINING_CLASS(Char)                                   \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1)                        \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff)                      \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR)      \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)       \
      : 0))
511
/* Constants for Hangul syllable [de]composition.  The *Base values
   are the code points at which each block starts, the *Count values
   the number of entries per block; NCount and SCount are derived.  */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
522
523 /*
524 * g_unicode_canonical_ordering:
525 * @string: a UCS-4 encoded string.
526 * @len: the maximum length of @string to use.
527 *
528 * Computes the canonical ordering of a string in-place.
529 * This rearranges decomposed characters in the string
530 * according to their combining classes. See the Unicode
531 * manual for more information.
532 **/
533 static void
534 g_unicode_canonical_ordering (gunichar * string, gsize len)
535 {
536 gsize i;
537 int swap = 1;
538
539 while (swap)
540 {
541 int last;
542 swap = 0;
543 last = COMBINING_CLASS (string[0]);
544 for (i = 0; i < len - 1; ++i)
545 {
546 int next = COMBINING_CLASS (string[i + 1]);
547 if (next != 0 && last > next)
548 {
549 gsize j;
550 /* Percolate item leftward through string. */
551 for (j = i + 1; j > 0; --j)
552 {
553 gunichar t;
554 if (COMBINING_CLASS (string[j - 1]) <= next)
555 break;
556 t = string[j];
557 string[j] = string[j - 1];
558 string[j - 1] = t;
559 swap = 1;
560 }
561 /* We're re-entering the loop looking at the old
562 character again. */
563 next = last;
564 }
565 last = next;
566 }
567 }
568 }
569
570 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
571 * r should be null or have sufficient space. Calling with r == NULL will
572 * only calculate the result_len; however, a buffer with space for three
573 * characters will always be big enough. */
574 static void
575 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
576 {
577 gint SIndex = s - SBase;
578
579 /* not a hangul syllable */
580 if (SIndex < 0 || SIndex >= SCount)
581 {
582 if (r)
583 r[0] = s;
584 *result_len = 1;
585 }
586 else
587 {
588 gunichar L = LBase + SIndex / NCount;
589 gunichar V = VBase + (SIndex % NCount) / TCount;
590 gunichar T = TBase + SIndex % TCount;
591
592 if (r)
593 {
594 r[0] = L;
595 r[1] = V;
596 }
597
598 if (T != TBase)
599 {
600 if (r)
601 r[2] = T;
602 *result_len = 3;
603 }
604 else
605 *result_len = 2;
606 }
607 }
608
609 /* returns a pointer to a null-terminated UTF-8 string */
610 static const gchar *
611 find_decomposition (gunichar ch, gboolean compat)
612 {
613 int start = 0;
614 int end = G_N_ELEMENTS (decomp_table);
615
616 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
617 {
618 while (TRUE)
619 {
620 int half = (start + end) / 2;
621 if (ch == decomp_table[half].ch)
622 {
623 int offset;
624
625 if (compat)
626 {
627 offset = decomp_table[half].compat_offset;
628 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
629 offset = decomp_table[half].canon_offset;
630 }
631 else
632 {
633 offset = decomp_table[half].canon_offset;
634 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
635 return NULL;
636 }
637
638 return &(decomp_expansion_string[offset]);
639 }
640 else if (half == start)
641 break;
642 else if (ch > decomp_table[half].ch)
643 start = half;
644 else
645 end = half;
646 }
647 }
648
649 return NULL;
650 }
651
652 /* L,V => LV and LV,T => LVT */
653 static gboolean
654 combine_hangul (gunichar a, gunichar b, gunichar * result)
655 {
656 gint LIndex = a - LBase;
657 gint SIndex = a - SBase;
658
659 gint VIndex = b - VBase;
660 gint TIndex = b - TBase;
661
662 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
663 {
664 *result = SBase + (LIndex * VCount + VIndex) * TCount;
665 return TRUE;
666 }
667 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
668 && 0 <= TIndex && TIndex <= TCount)
669 {
670 *result = a + TIndex;
671 return TRUE;
672 }
673
674 return FALSE;
675 }
676
/* Page-table lookup for the composition index: a page entry below
   G_UNICODE_MAX_TABLE_INDEX is a row in compose_data, any other entry
   encodes one value for the whole page (stored with that offset
   added).  */
#define CI(Page, Char)                                          \
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX)            \
   ? (compose_data[compose_table[Page]][Char])                  \
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Composition index of Char; 0 for characters whose page lies beyond
   COMPOSE_TABLE_LAST.  */
#define COMPOSE_INDEX(Char)                                     \
  ((((Char) >> 8) <= (COMPOSE_TABLE_LAST))                      \
   ? CI ((Char) >> 8, (Char) & 0xff)                            \
   : 0)
684
685 static gboolean
686 combine (gunichar a, gunichar b, gunichar * result)
687 {
688 gushort index_a, index_b;
689
690 if (combine_hangul (a, b, result))
691 return TRUE;
692
693 index_a = COMPOSE_INDEX (a);
694
695 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
696 {
697 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
698 {
699 *result =
700 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
701 return TRUE;
702 }
703 else
704 return FALSE;
705 }
706
707 index_b = COMPOSE_INDEX (b);
708
709 if (index_b >= COMPOSE_SECOND_SINGLE_START)
710 {
711 if (a ==
712 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
713 {
714 *result =
715 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
716 return TRUE;
717 }
718 else
719 return FALSE;
720 }
721
722 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
723 && index_b >= COMPOSE_SECOND_START
724 && index_b < COMPOSE_SECOND_SINGLE_START)
725 {
726 gunichar res =
727 compose_array[index_a - COMPOSE_FIRST_START][index_b -
728 COMPOSE_SECOND_START];
729
730 if (res)
731 {
732 *result = res;
733 return TRUE;
734 }
735 }
736
737 return FALSE;
738 }
739
740 static gunichar *
741 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
742 {
743 gsize n_wc;
744 gunichar *wc_buffer;
745 const char *p;
746 gsize last_start;
747 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
748 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
749
750 n_wc = 0;
751 p = str;
752 while ((max_len < 0 || p < str + max_len) && *p)
753 {
754 const gchar *decomp;
755 gunichar wc = g_utf8_get_char (p);
756
757 if (wc >= 0xac00 && wc <= 0xd7af)
758 {
759 gsize result_len;
760 decompose_hangul (wc, NULL, &result_len);
761 n_wc += result_len;
762 }
763 else
764 {
765 decomp = find_decomposition (wc, do_compat);
766
767 if (decomp)
768 n_wc += g_utf8_strlen (decomp, -1);
769 else
770 n_wc++;
771 }
772
773 p = g_utf8_next_char (p);
774 }
775
776 wc_buffer = g_new (gunichar, n_wc + 1);
777 if (!wc_buffer)
778 return NULL;
779
780 last_start = 0;
781 n_wc = 0;
782 p = str;
783 while ((max_len < 0 || p < str + max_len) && *p)
784 {
785 gunichar wc = g_utf8_get_char (p);
786 const gchar *decomp;
787 int cc;
788 gsize old_n_wc = n_wc;
789
790 if (wc >= 0xac00 && wc <= 0xd7af)
791 {
792 gsize result_len;
793 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
794 n_wc += result_len;
795 }
796 else
797 {
798 decomp = find_decomposition (wc, do_compat);
799
800 if (decomp)
801 {
802 const char *pd;
803 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
804 wc_buffer[n_wc++] = g_utf8_get_char (pd);
805 }
806 else
807 wc_buffer[n_wc++] = wc;
808 }
809
810 if (n_wc > 0)
811 {
812 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
813
814 if (cc == 0)
815 {
816 g_unicode_canonical_ordering (wc_buffer + last_start,
817 n_wc - last_start);
818 last_start = old_n_wc;
819 }
820 }
821
822 p = g_utf8_next_char (p);
823 }
824
825 if (n_wc > 0)
826 {
827 g_unicode_canonical_ordering (wc_buffer + last_start,
828 n_wc - last_start);
829 last_start = n_wc;
830 }
831
832 wc_buffer[n_wc] = 0;
833
834 /* All decomposed and reordered */
835
836 if (do_compose && n_wc > 0)
837 {
838 gsize i, j;
839 int last_cc = 0;
840 last_start = 0;
841
842 for (i = 0; i < n_wc; i++)
843 {
844 int cc = COMBINING_CLASS (wc_buffer[i]);
845
846 if (i > 0 &&
847 (last_cc == 0 || last_cc != cc) &&
848 combine (wc_buffer[last_start], wc_buffer[i],
849 &wc_buffer[last_start]))
850 {
851 for (j = i + 1; j < n_wc; j++)
852 wc_buffer[j - 1] = wc_buffer[j];
853 n_wc--;
854 i--;
855
856 if (i == last_start)
857 last_cc = 0;
858 else
859 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
860
861 continue;
862 }
863
864 if (cc == 0)
865 last_start = i;
866
867 last_cc = cc;
868 }
869 }
870
871 wc_buffer[n_wc] = 0;
872
873 return wc_buffer;
874 }
875
876 /*
877 * g_utf8_normalize:
878 * @str: a UTF-8 encoded string.
879 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
880 * @mode: the type of normalization to perform.
881 *
882 * Converts a string into canonical form, standardizing
883 * such issues as whether a character with an accent
884 * is represented as a base character and combining
885 * accent or as a single precomposed character. You
886 * should generally call g_utf8_normalize() before
887 * comparing two Unicode strings.
888 *
889 * The normalization mode %G_NORMALIZE_DEFAULT only
890 * standardizes differences that do not affect the
891 * text content, such as the above-mentioned accent
892 * representation. %G_NORMALIZE_ALL also standardizes
893 * the "compatibility" characters in Unicode, such
894 * as SUPERSCRIPT THREE to the standard forms
895 * (in this case DIGIT THREE). Formatting information
896 * may be lost but for most text operations such
897 * characters should be considered the same.
898 * For example, g_utf8_collate() normalizes
899 * with %G_NORMALIZE_ALL as its first step.
900 *
901 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
902 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
903 * but returned a result with composed forms rather
904 * than a maximally decomposed form. This is often
905 * useful if you intend to convert the string to
906 * a legacy encoding or pass it to a system with
907 * less capable Unicode handling.
908 *
909 * Return value: a newly allocated string, that is the
910 * normalized form of @str.
911 **/
912 static gchar *
913 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
914 {
915 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
916 gchar *result;
917
918 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
919 g_free (result_wc);
920
921 return result;
922 }
923
924 /* Public Libidn API starts here. */
925
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the single UTF-8 encoded character that starts at @p.
 * If @p does not point to a valid UTF-8 encoded character, results
 * are undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t decoded = g_utf8_get_char (p);

  return decoded;
}
941
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *          If %NULL, the length will be computed and returned
 *          and nothing will be written to @outbuf.
 *
 * Encodes the single character @c as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
958
959 /**
960 * stringprep_utf8_to_ucs4:
961 * @str: a UTF-8 encoded string
962 * @len: the maximum length of @str to use. If @len < 0, then
963 * the string is nul-terminated.
964 * @items_written: location to store the number of characters in the
965 * result, or %NULL.
966 *
967 * Convert a string from UTF-8 to a 32-bit fixed width
968 * representation as UCS-4, assuming valid UTF-8 input.
969 * This function does no error checking on the input.
970 *
971 * Return value: a pointer to a newly allocated UCS-4 string.
972 * This value must be freed with free().
973 **/
974 uint32_t *
975 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
976 {
977 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
978 }
979
980 /**
981 * stringprep_ucs4_to_utf8:
982 * @str: a UCS-4 encoded string
983 * @len: the maximum length of @str to use. If @len < 0, then
984 * the string is terminated with a 0 character.
985 * @items_read: location to store number of characters read read, or %NULL.
986 * @items_written: location to store number of bytes written or %NULL.
987 * The value here stored does not include the trailing 0
988 * byte.
989 *
990 * Convert a string from a 32-bit fixed width representation as UCS-4.
991 * to UTF-8. The result will be terminated with a 0 byte.
992 *
993 * Return value: a pointer to a newly allocated UTF-8 string.
994 * This value must be freed with free(). If an
995 * error occurs, %NULL will be returned and
996 * @error set.
997 **/
998 char *
999 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1000 size_t * items_read, size_t * items_written)
1001 {
1002 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1003 (glong *) items_written, NULL);
1004 }
1005
1006 /**
1007 * stringprep_utf8_nfkc_normalize:
1008 * @str: a UTF-8 encoded string.
1009 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1010 *
1011 * Converts a string into canonical form, standardizing
1012 * such issues as whether a character with an accent
1013 * is represented as a base character and combining
1014 * accent or as a single precomposed character.
1015 *
1016 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1017 * differences that do not affect the text content, such as the
1018 * above-mentioned accent representation. It standardizes the
1019 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1020 * the standard forms (in this case DIGIT THREE). Formatting
1021 * information may be lost but for most text operations such
1022 * characters should be considered the same. It returns a result with
1023 * composed forms rather than a maximally decomposed form.
1024 *
1025 * Return value: a newly allocated string, that is the
1026 * NFKC normalized form of @str.
1027 **/
1028 char *
1029 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1030 {
1031 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1032 }
1033
1034 /**
1035 * stringprep_ucs4_nfkc_normalize:
1036 * @str: a Unicode string.
1037 * @len: length of @str array, or -1 if @str is nul-terminated.
1038 *
1039 * Converts UCS4 string into UTF-8 and runs
1040 * stringprep_utf8_nfkc_normalize().
1041 *
1042 * Return value: a newly allocated Unicode string, that is the NFKC
1043 * normalized form of @str.
1044 **/
1045 uint32_t *
1046 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1047 {
1048 char *p;
1049 uint32_t *result_wc;
1050
1051 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1052 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1053 free (p);
1054
1055 return result_wc;
1056 }
This page took 0.097595 seconds and 5 git commands to generate.