]>
Commit | Line | Data |
---|---|---|
01859b1c UD |
1 | /* nfkc.c Unicode normalization utilities. |
2 | * Copyright (C) 2002, 2003 Simon Josefsson | |
3 | * | |
4 | * This file is part of GNU Libidn. | |
5 | * | |
6 | * GNU Libidn is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2.1 of the License, or (at your option) any later version. | |
10 | * | |
11 | * GNU Libidn is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
59ba27a6 | 17 | * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>. |
01859b1c UD |
18 | */ |
19 | ||
20 | #if HAVE_CONFIG_H | |
21 | # include "config.h" | |
22 | #endif | |
23 | ||
24 | #include <stdlib.h> | |
25 | #include <string.h> | |
26 | ||
27 | #include "stringprep.h" | |
28 | ||
29 | /* This file contains functions from GLIB, including gutf8.c and | |
30 | * gunidecomp.c, all licensed under LGPL and copyright held by: | |
31 | * | |
32 | * Copyright (C) 1999, 2000 Tom Tromey | |
33 | * Copyright 2000 Red Hat, Inc. | |
34 | */ | |
35 | ||
/* Map the GLIB type and helper names used below onto plain C, so the
   code borrowed from GLIB can stay close to its upstream form.  */

/* Scalar types.  */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gushort unsigned short
#define gint int
#define guint unsigned int
#define glong long
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t

/* Memory management.  */
#define g_malloc malloc
#define g_free free
#define g_new(struct_type, n_structs)                                   \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))

/* Error reporting is compiled out: GError is an opaque void and
   g_set_error discards all of its arguments.  */
#define GError void
#define g_set_error(a,b,c,d) ((void) 0)

/* Wrapper that turns a brace block into a single statement.  */
#if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
# define G_STMT_START (void)(
# define G_STMT_END )
#elif (defined (sun) || defined (__sun__))
# define G_STMT_START if (1)
# define G_STMT_END else (void)0
#else
# define G_STMT_START do
# define G_STMT_END while (0)
#endif

/* Precondition checks are no-ops here: neither EXPR nor VAL is
   evaluated.  */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END

#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
71 | ||
/* Code from GLIB gunicode.h starts here. */

/* Normalization modes.  Every GLIB-style name shares its value with
   the corresponding Unicode normalization form name.  */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,

  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,

  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,

  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
86 | ||
/* Code from GLIB gutf8.c starts here. */

/* Classify CHAR as the first byte of a UTF-8 sequence.  LEN receives
   the sequence length (1..6), or -1 if CHAR cannot start a sequence;
   MASK receives the mask selecting the payload bits of that first
   byte and is left untouched when LEN is -1.  CHAR is evaluated more
   than once.  The do/while wrapper makes the expansion a single
   statement, so the macro is safe in an unbraced if/else.  */
#define UTF8_COMPUTE(Char, Mask, Len)                   \
  do                                                    \
    {                                                   \
      if ((Char) < 128)                                 \
        {                                               \
          (Len) = 1;                                    \
          (Mask) = 0x7f;                                \
        }                                               \
      else if (((Char) & 0xe0) == 0xc0)                 \
        {                                               \
          (Len) = 2;                                    \
          (Mask) = 0x1f;                                \
        }                                               \
      else if (((Char) & 0xf0) == 0xe0)                 \
        {                                               \
          (Len) = 3;                                    \
          (Mask) = 0x0f;                                \
        }                                               \
      else if (((Char) & 0xf8) == 0xf0)                 \
        {                                               \
          (Len) = 4;                                    \
          (Mask) = 0x07;                                \
        }                                               \
      else if (((Char) & 0xfc) == 0xf8)                 \
        {                                               \
          (Len) = 5;                                    \
          (Mask) = 0x03;                                \
        }                                               \
      else if (((Char) & 0xfe) == 0xfc)                 \
        {                                               \
          (Len) = 6;                                    \
          (Mask) = 0x01;                                \
        }                                               \
      else                                              \
        (Len) = -1;                                     \
    }                                                   \
  while (0)

/* Number of bytes needed to encode code point CHAR in UTF-8.  */
#define UTF8_LENGTH(Char)                       \
  ((Char) < 0x80 ? 1 :                          \
   ((Char) < 0x800 ? 2 :                        \
    ((Char) < 0x10000 ? 3 :                     \
     ((Char) < 0x200000 ? 4 :                   \
      ((Char) < 0x4000000 ? 5 : 6)))))

/* Decode the LEN-byte sequence at CHARS into RESULT, using MASK (as
   produced by UTF8_COMPUTE) on the first byte.  COUNT is a scratch
   loop variable.  RESULT is set to -1 as soon as a byte that is not
   of the form 10xxxxxx is met.  Expands to a single statement.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len)       \
  do                                                    \
    {                                                   \
      (Result) = (Chars)[0] & (Mask);                   \
      for ((Count) = 1; (Count) < (Len); ++(Count))     \
        {                                               \
          if (((Chars)[(Count)] & 0xc0) != 0x80)        \
            {                                           \
              (Result) = -1;                            \
              break;                                    \
            }                                           \
          (Result) <<= 6;                               \
          (Result) |= ((Chars)[(Count)] & 0x3f);        \
        }                                               \
    }                                                   \
  while (0)

/* True if CHAR is below 0x110000, is not in the surrogate range
   D800..DFFF, is not in FDD0..FDEF, and its low 16 bits are neither
   FFFE nor FFFF.  */
#define UNICODE_VALID(Char)                             \
  ((Char) < 0x110000 &&                                 \
   (((Char) & 0xFFFFF800) != 0xD800) &&                 \
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&              \
   (((Char) & 0xFFFE) != 0xFFFE))
149 | ||
150 | ||
151 | static const gchar utf8_skip_data[256] = { | |
152 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
153 | 1, 1, 1, 1, 1, 1, 1, | |
154 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
155 | 1, 1, 1, 1, 1, 1, 1, | |
156 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
157 | 1, 1, 1, 1, 1, 1, 1, | |
158 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
159 | 1, 1, 1, 1, 1, 1, 1, | |
160 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
161 | 1, 1, 1, 1, 1, 1, 1, | |
162 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
163 | 1, 1, 1, 1, 1, 1, 1, | |
164 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
165 | 2, 2, 2, 2, 2, 2, 2, | |
166 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, | |
167 | 5, 5, 5, 6, 6, 1, 1 | |
168 | }; | |
169 | ||
170 | const gchar *const g_utf8_skip = utf8_skip_data; | |
171 | ||
172 | #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)]) | |
173 | ||
174 | /* | |
175 | * g_utf8_strlen: | |
176 | * @p: pointer to the start of a UTF-8 encoded string. | |
177 | * @max: the maximum number of bytes to examine. If @max | |
178 | * is less than 0, then the string is assumed to be | |
179 | * nul-terminated. If @max is 0, @p will not be examined and | |
180 | * may be %NULL. | |
181 | * | |
182 | * Returns the length of the string in characters. | |
183 | * | |
184 | * Return value: the length of the string in characters | |
185 | **/ | |
186 | static glong | |
187 | g_utf8_strlen (const gchar * p, gssize max) | |
188 | { | |
189 | glong len = 0; | |
190 | const gchar *start = p; | |
191 | g_return_val_if_fail (p != NULL || max == 0, 0); | |
192 | ||
193 | if (max < 0) | |
194 | { | |
195 | while (*p) | |
196 | { | |
197 | p = g_utf8_next_char (p); | |
198 | ++len; | |
199 | } | |
200 | } | |
201 | else | |
202 | { | |
203 | if (max == 0 || !*p) | |
204 | return 0; | |
205 | ||
206 | p = g_utf8_next_char (p); | |
207 | ||
208 | while (p - start < max && *p) | |
209 | { | |
210 | ++len; | |
211 | p = g_utf8_next_char (p); | |
212 | } | |
213 | ||
214 | /* only do the last len increment if we got a complete | |
215 | * char (don't count partial chars) | |
216 | */ | |
217 | if (p - start == max) | |
218 | ++len; | |
219 | } | |
220 | ||
221 | return len; | |
222 | } | |
223 | ||
224 | /* | |
225 | * g_utf8_get_char: | |
226 | * @p: a pointer to Unicode character encoded as UTF-8 | |
227 | * | |
228 | * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. | |
229 | * If @p does not point to a valid UTF-8 encoded character, results are | |
230 | * undefined. If you are not sure that the bytes are complete | |
231 | * valid Unicode characters, you should use g_utf8_get_char_validated() | |
232 | * instead. | |
233 | * | |
234 | * Return value: the resulting character | |
235 | **/ | |
236 | static gunichar | |
237 | g_utf8_get_char (const gchar * p) | |
238 | { | |
239 | int i, mask = 0, len; | |
240 | gunichar result; | |
241 | unsigned char c = (unsigned char) *p; | |
242 | ||
243 | UTF8_COMPUTE (c, mask, len); | |
244 | if (len == -1) | |
245 | return (gunichar) - 1; | |
246 | UTF8_GET (result, p, i, mask, len); | |
247 | ||
248 | return result; | |
249 | } | |
250 | ||
251 | /* | |
252 | * g_unichar_to_utf8: | |
253 | * @c: a ISO10646 character code | |
254 | * @outbuf: output buffer, must have at least 6 bytes of space. | |
255 | * If %NULL, the length will be computed and returned | |
256 | * and nothing will be written to @outbuf. | |
257 | * | |
258 | * Converts a single character to UTF-8. | |
259 | * | |
260 | * Return value: number of bytes written | |
261 | **/ | |
262 | static int | |
263 | g_unichar_to_utf8 (gunichar c, gchar * outbuf) | |
264 | { | |
265 | guint len = 0; | |
266 | int first; | |
267 | int i; | |
268 | ||
269 | if (c < 0x80) | |
270 | { | |
271 | first = 0; | |
272 | len = 1; | |
273 | } | |
274 | else if (c < 0x800) | |
275 | { | |
276 | first = 0xc0; | |
277 | len = 2; | |
278 | } | |
279 | else if (c < 0x10000) | |
280 | { | |
281 | first = 0xe0; | |
282 | len = 3; | |
283 | } | |
284 | else if (c < 0x200000) | |
285 | { | |
286 | first = 0xf0; | |
287 | len = 4; | |
288 | } | |
289 | else if (c < 0x4000000) | |
290 | { | |
291 | first = 0xf8; | |
292 | len = 5; | |
293 | } | |
294 | else | |
295 | { | |
296 | first = 0xfc; | |
297 | len = 6; | |
298 | } | |
299 | ||
300 | if (outbuf) | |
301 | { | |
302 | for (i = len - 1; i > 0; --i) | |
303 | { | |
304 | outbuf[i] = (c & 0x3f) | 0x80; | |
305 | c >>= 6; | |
306 | } | |
307 | outbuf[0] = c | first; | |
308 | } | |
309 | ||
310 | return len; | |
311 | } | |
312 | ||
313 | /* | |
314 | * g_utf8_to_ucs4_fast: | |
315 | * @str: a UTF-8 encoded string | |
316 | * @len: the maximum length of @str to use. If @len < 0, then | |
317 | * the string is nul-terminated. | |
318 | * @items_written: location to store the number of characters in the | |
319 | * result, or %NULL. | |
320 | * | |
321 | * Convert a string from UTF-8 to a 32-bit fixed width | |
322 | * representation as UCS-4, assuming valid UTF-8 input. | |
323 | * This function is roughly twice as fast as g_utf8_to_ucs4() | |
324 | * but does no error checking on the input. | |
325 | * | |
326 | * Return value: a pointer to a newly allocated UCS-4 string. | |
327 | * This value must be freed with g_free(). | |
328 | **/ | |
329 | static gunichar * | |
330 | g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written) | |
331 | { | |
332 | gint j, charlen; | |
333 | gunichar *result; | |
334 | gint n_chars, i; | |
335 | const gchar *p; | |
336 | ||
337 | g_return_val_if_fail (str != NULL, NULL); | |
338 | ||
339 | p = str; | |
340 | n_chars = 0; | |
341 | if (len < 0) | |
342 | { | |
343 | while (*p) | |
344 | { | |
345 | p = g_utf8_next_char (p); | |
346 | ++n_chars; | |
347 | } | |
348 | } | |
349 | else | |
350 | { | |
351 | while (p < str + len && *p) | |
352 | { | |
353 | p = g_utf8_next_char (p); | |
354 | ++n_chars; | |
355 | } | |
356 | } | |
357 | ||
358 | result = g_new (gunichar, n_chars + 1); | |
359 | if (!result) | |
360 | return NULL; | |
361 | ||
362 | p = str; | |
363 | for (i = 0; i < n_chars; i++) | |
364 | { | |
365 | gunichar wc = ((unsigned char *) p)[0]; | |
366 | ||
367 | if (wc < 0x80) | |
368 | { | |
369 | result[i] = wc; | |
370 | p++; | |
371 | } | |
372 | else | |
373 | { | |
374 | if (wc < 0xe0) | |
375 | { | |
376 | charlen = 2; | |
377 | wc &= 0x1f; | |
378 | } | |
379 | else if (wc < 0xf0) | |
380 | { | |
381 | charlen = 3; | |
382 | wc &= 0x0f; | |
383 | } | |
384 | else if (wc < 0xf8) | |
385 | { | |
386 | charlen = 4; | |
387 | wc &= 0x07; | |
388 | } | |
389 | else if (wc < 0xfc) | |
390 | { | |
391 | charlen = 5; | |
392 | wc &= 0x03; | |
393 | } | |
394 | else | |
395 | { | |
396 | charlen = 6; | |
397 | wc &= 0x01; | |
398 | } | |
399 | ||
400 | for (j = 1; j < charlen; j++) | |
401 | { | |
402 | wc <<= 6; | |
403 | wc |= ((unsigned char *) p)[j] & 0x3f; | |
404 | } | |
405 | ||
406 | result[i] = wc; | |
407 | p += charlen; | |
408 | } | |
409 | } | |
410 | result[i] = 0; | |
411 | ||
412 | if (items_written) | |
413 | *items_written = i; | |
414 | ||
415 | return result; | |
416 | } | |
417 | ||
418 | /* | |
419 | * g_ucs4_to_utf8: | |
420 | * @str: a UCS-4 encoded string | |
421 | * @len: the maximum length of @str to use. If @len < 0, then | |
422 | * the string is terminated with a 0 character. | |
423 | * @items_read: location to store number of characters read read, or %NULL. | |
424 | * @items_written: location to store number of bytes written or %NULL. | |
425 | * The value here stored does not include the trailing 0 | |
426 | * byte. | |
427 | * @error: location to store the error occuring, or %NULL to ignore | |
428 | * errors. Any of the errors in #GConvertError other than | |
429 | * %G_CONVERT_ERROR_NO_CONVERSION may occur. | |
430 | * | |
431 | * Convert a string from a 32-bit fixed width representation as UCS-4. | |
432 | * to UTF-8. The result will be terminated with a 0 byte. | |
433 | * | |
434 | * Return value: a pointer to a newly allocated UTF-8 string. | |
435 | * This value must be freed with g_free(). If an | |
436 | * error occurs, %NULL will be returned and | |
437 | * @error set. | |
438 | **/ | |
439 | static gchar * | |
440 | g_ucs4_to_utf8 (const gunichar * str, | |
441 | glong len, | |
442 | glong * items_read, glong * items_written, GError ** error) | |
443 | { | |
444 | gint result_length; | |
445 | gchar *result = NULL; | |
446 | gchar *p; | |
447 | gint i; | |
448 | ||
449 | result_length = 0; | |
450 | for (i = 0; len < 0 || i < len; i++) | |
451 | { | |
452 | if (!str[i]) | |
453 | break; | |
454 | ||
455 | if (str[i] >= 0x80000000) | |
456 | { | |
457 | if (items_read) | |
458 | *items_read = i; | |
459 | ||
460 | g_set_error (error, G_CONVERT_ERROR, | |
461 | G_CONVERT_ERROR_ILLEGAL_SEQUENCE, | |
462 | _("Character out of range for UTF-8")); | |
463 | goto err_out; | |
464 | } | |
465 | ||
466 | result_length += UTF8_LENGTH (str[i]); | |
467 | } | |
468 | ||
469 | result = g_malloc (result_length + 1); | |
470 | if (!result) | |
471 | return NULL; | |
472 | p = result; | |
473 | ||
474 | i = 0; | |
475 | while (p < result + result_length) | |
476 | p += g_unichar_to_utf8 (str[i++], p); | |
477 | ||
478 | *p = '\0'; | |
479 | ||
480 | if (items_written) | |
481 | *items_written = p - result; | |
482 | ||
483 | err_out: | |
484 | if (items_read) | |
485 | *items_read = i; | |
486 | ||
487 | return result; | |
488 | } | |
489 | ||
490 | /* Code from GLIB gunidecomp.c starts here. */ | |
491 | ||
492 | #include "gunidecomp.h" | |
493 | #include "gunicomp.h" | |
494 | ||
/* Look up offset CHAR on page PAGE of the page index TABLE.  An index
   entry at or above G_UNICODE_MAX_TABLE_INDEX stands for one value
   (entry minus G_UNICODE_MAX_TABLE_INDEX) shared by the whole page;
   any other entry selects a row of cclass_data.  */
#define CC_LOOKUP(Table, Page, Char)                            \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX)                 \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX)                \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of CHAR.  Code points up to
   G_UNICODE_LAST_CHAR_PART1 use the part-1 table, those from 0xe0000
   up to G_UNICODE_LAST_CHAR use the part-2 table, and everything else
   yields 0.  CHAR is evaluated more than once.  */
#define COMBINING_CLASS(Char)                                   \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1)                        \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff)                      \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR)      \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)       \
      : 0))
511 | ||
/* Constants for Hangul syllable [de]composition.  NCount is the
   number of vowel/trailing combinations per leading consonant and
   SCount the total number of syllables starting at SBase.  */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = (VCount * TCount),
  SCount = (LCount * NCount)
};
522 | ||
523 | /* | |
524 | * g_unicode_canonical_ordering: | |
525 | * @string: a UCS-4 encoded string. | |
526 | * @len: the maximum length of @string to use. | |
527 | * | |
528 | * Computes the canonical ordering of a string in-place. | |
529 | * This rearranges decomposed characters in the string | |
530 | * according to their combining classes. See the Unicode | |
531 | * manual for more information. | |
532 | **/ | |
533 | static void | |
534 | g_unicode_canonical_ordering (gunichar * string, gsize len) | |
535 | { | |
536 | gsize i; | |
537 | int swap = 1; | |
538 | ||
539 | while (swap) | |
540 | { | |
541 | int last; | |
542 | swap = 0; | |
543 | last = COMBINING_CLASS (string[0]); | |
544 | for (i = 0; i < len - 1; ++i) | |
545 | { | |
546 | int next = COMBINING_CLASS (string[i + 1]); | |
547 | if (next != 0 && last > next) | |
548 | { | |
549 | gsize j; | |
550 | /* Percolate item leftward through string. */ | |
551 | for (j = i + 1; j > 0; --j) | |
552 | { | |
553 | gunichar t; | |
554 | if (COMBINING_CLASS (string[j - 1]) <= next) | |
555 | break; | |
556 | t = string[j]; | |
557 | string[j] = string[j - 1]; | |
558 | string[j - 1] = t; | |
559 | swap = 1; | |
560 | } | |
561 | /* We're re-entering the loop looking at the old | |
562 | character again. */ | |
563 | next = last; | |
564 | } | |
565 | last = next; | |
566 | } | |
567 | } | |
568 | } | |
569 | ||
570 | /* http://www.unicode.org/unicode/reports/tr15/#Hangul | |
571 | * r should be null or have sufficient space. Calling with r == NULL will | |
572 | * only calculate the result_len; however, a buffer with space for three | |
573 | * characters will always be big enough. */ | |
574 | static void | |
575 | decompose_hangul (gunichar s, gunichar * r, gsize * result_len) | |
576 | { | |
577 | gint SIndex = s - SBase; | |
578 | ||
579 | /* not a hangul syllable */ | |
580 | if (SIndex < 0 || SIndex >= SCount) | |
581 | { | |
582 | if (r) | |
583 | r[0] = s; | |
584 | *result_len = 1; | |
585 | } | |
586 | else | |
587 | { | |
588 | gunichar L = LBase + SIndex / NCount; | |
589 | gunichar V = VBase + (SIndex % NCount) / TCount; | |
590 | gunichar T = TBase + SIndex % TCount; | |
591 | ||
592 | if (r) | |
593 | { | |
594 | r[0] = L; | |
595 | r[1] = V; | |
596 | } | |
597 | ||
598 | if (T != TBase) | |
599 | { | |
600 | if (r) | |
601 | r[2] = T; | |
602 | *result_len = 3; | |
603 | } | |
604 | else | |
605 | *result_len = 2; | |
606 | } | |
607 | } | |
608 | ||
609 | /* returns a pointer to a null-terminated UTF-8 string */ | |
610 | static const gchar * | |
611 | find_decomposition (gunichar ch, gboolean compat) | |
612 | { | |
613 | int start = 0; | |
614 | int end = G_N_ELEMENTS (decomp_table); | |
615 | ||
616 | if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch) | |
617 | { | |
618 | while (TRUE) | |
619 | { | |
620 | int half = (start + end) / 2; | |
621 | if (ch == decomp_table[half].ch) | |
622 | { | |
623 | int offset; | |
624 | ||
625 | if (compat) | |
626 | { | |
627 | offset = decomp_table[half].compat_offset; | |
628 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) | |
629 | offset = decomp_table[half].canon_offset; | |
630 | } | |
631 | else | |
632 | { | |
633 | offset = decomp_table[half].canon_offset; | |
634 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) | |
635 | return NULL; | |
636 | } | |
637 | ||
638 | return &(decomp_expansion_string[offset]); | |
639 | } | |
640 | else if (half == start) | |
641 | break; | |
642 | else if (ch > decomp_table[half].ch) | |
643 | start = half; | |
644 | else | |
645 | end = half; | |
646 | } | |
647 | } | |
648 | ||
649 | return NULL; | |
650 | } | |
651 | ||
652 | /* L,V => LV and LV,T => LVT */ | |
653 | static gboolean | |
654 | combine_hangul (gunichar a, gunichar b, gunichar * result) | |
655 | { | |
656 | gint LIndex = a - LBase; | |
657 | gint SIndex = a - SBase; | |
658 | ||
659 | gint VIndex = b - VBase; | |
660 | gint TIndex = b - TBase; | |
661 | ||
662 | if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount) | |
663 | { | |
664 | *result = SBase + (LIndex * VCount + VIndex) * TCount; | |
665 | return TRUE; | |
666 | } | |
667 | else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 | |
668 | && 0 <= TIndex && TIndex <= TCount) | |
669 | { | |
670 | *result = a + TIndex; | |
671 | return TRUE; | |
672 | } | |
673 | ||
674 | return FALSE; | |
675 | } | |
676 | ||
677 | #define CI(Page, Char) \ | |
678 | ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ | |
679 | ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ | |
680 | : (compose_data[compose_table[Page]][Char])) | |
681 | ||
682 | #define COMPOSE_INDEX(Char) \ | |
683 | ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff)) | |
684 | ||
685 | static gboolean | |
686 | combine (gunichar a, gunichar b, gunichar * result) | |
687 | { | |
688 | gushort index_a, index_b; | |
689 | ||
690 | if (combine_hangul (a, b, result)) | |
691 | return TRUE; | |
692 | ||
693 | index_a = COMPOSE_INDEX (a); | |
694 | ||
695 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) | |
696 | { | |
697 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) | |
698 | { | |
699 | *result = | |
700 | compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; | |
701 | return TRUE; | |
702 | } | |
703 | else | |
704 | return FALSE; | |
705 | } | |
706 | ||
707 | index_b = COMPOSE_INDEX (b); | |
708 | ||
709 | if (index_b >= COMPOSE_SECOND_SINGLE_START) | |
710 | { | |
711 | if (a == | |
712 | compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) | |
713 | { | |
714 | *result = | |
715 | compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; | |
716 | return TRUE; | |
717 | } | |
718 | else | |
719 | return FALSE; | |
720 | } | |
721 | ||
722 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START | |
723 | && index_b >= COMPOSE_SECOND_START | |
724 | && index_b < COMPOSE_SECOND_SINGLE_START) | |
725 | { | |
726 | gunichar res = | |
727 | compose_array[index_a - COMPOSE_FIRST_START][index_b - | |
728 | COMPOSE_SECOND_START]; | |
729 | ||
730 | if (res) | |
731 | { | |
732 | *result = res; | |
733 | return TRUE; | |
734 | } | |
735 | } | |
736 | ||
737 | return FALSE; | |
738 | } | |
739 | ||
740 | static gunichar * | |
741 | _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode) | |
742 | { | |
743 | gsize n_wc; | |
744 | gunichar *wc_buffer; | |
745 | const char *p; | |
746 | gsize last_start; | |
747 | gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD); | |
748 | gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC); | |
749 | ||
750 | n_wc = 0; | |
751 | p = str; | |
752 | while ((max_len < 0 || p < str + max_len) && *p) | |
753 | { | |
754 | const gchar *decomp; | |
755 | gunichar wc = g_utf8_get_char (p); | |
756 | ||
757 | if (wc >= 0xac00 && wc <= 0xd7af) | |
758 | { | |
759 | gsize result_len; | |
760 | decompose_hangul (wc, NULL, &result_len); | |
761 | n_wc += result_len; | |
762 | } | |
763 | else | |
764 | { | |
765 | decomp = find_decomposition (wc, do_compat); | |
766 | ||
767 | if (decomp) | |
768 | n_wc += g_utf8_strlen (decomp, -1); | |
769 | else | |
770 | n_wc++; | |
771 | } | |
772 | ||
773 | p = g_utf8_next_char (p); | |
774 | } | |
775 | ||
776 | wc_buffer = g_new (gunichar, n_wc + 1); | |
777 | if (!wc_buffer) | |
778 | return NULL; | |
779 | ||
780 | last_start = 0; | |
781 | n_wc = 0; | |
782 | p = str; | |
783 | while ((max_len < 0 || p < str + max_len) && *p) | |
784 | { | |
785 | gunichar wc = g_utf8_get_char (p); | |
786 | const gchar *decomp; | |
787 | int cc; | |
788 | gsize old_n_wc = n_wc; | |
789 | ||
790 | if (wc >= 0xac00 && wc <= 0xd7af) | |
791 | { | |
792 | gsize result_len; | |
793 | decompose_hangul (wc, wc_buffer + n_wc, &result_len); | |
794 | n_wc += result_len; | |
795 | } | |
796 | else | |
797 | { | |
798 | decomp = find_decomposition (wc, do_compat); | |
799 | ||
800 | if (decomp) | |
801 | { | |
802 | const char *pd; | |
803 | for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) | |
804 | wc_buffer[n_wc++] = g_utf8_get_char (pd); | |
805 | } | |
806 | else | |
807 | wc_buffer[n_wc++] = wc; | |
808 | } | |
809 | ||
810 | if (n_wc > 0) | |
811 | { | |
812 | cc = COMBINING_CLASS (wc_buffer[old_n_wc]); | |
813 | ||
814 | if (cc == 0) | |
815 | { | |
816 | g_unicode_canonical_ordering (wc_buffer + last_start, | |
817 | n_wc - last_start); | |
818 | last_start = old_n_wc; | |
819 | } | |
820 | } | |
821 | ||
822 | p = g_utf8_next_char (p); | |
823 | } | |
824 | ||
825 | if (n_wc > 0) | |
826 | { | |
827 | g_unicode_canonical_ordering (wc_buffer + last_start, | |
828 | n_wc - last_start); | |
829 | last_start = n_wc; | |
830 | } | |
831 | ||
832 | wc_buffer[n_wc] = 0; | |
833 | ||
834 | /* All decomposed and reordered */ | |
835 | ||
836 | if (do_compose && n_wc > 0) | |
837 | { | |
838 | gsize i, j; | |
839 | int last_cc = 0; | |
840 | last_start = 0; | |
841 | ||
842 | for (i = 0; i < n_wc; i++) | |
843 | { | |
844 | int cc = COMBINING_CLASS (wc_buffer[i]); | |
845 | ||
846 | if (i > 0 && | |
847 | (last_cc == 0 || last_cc != cc) && | |
848 | combine (wc_buffer[last_start], wc_buffer[i], | |
849 | &wc_buffer[last_start])) | |
850 | { | |
851 | for (j = i + 1; j < n_wc; j++) | |
852 | wc_buffer[j - 1] = wc_buffer[j]; | |
853 | n_wc--; | |
854 | i--; | |
855 | ||
856 | if (i == last_start) | |
857 | last_cc = 0; | |
858 | else | |
859 | last_cc = COMBINING_CLASS (wc_buffer[i - 1]); | |
860 | ||
861 | continue; | |
862 | } | |
863 | ||
864 | if (cc == 0) | |
865 | last_start = i; | |
866 | ||
867 | last_cc = cc; | |
868 | } | |
869 | } | |
870 | ||
871 | wc_buffer[n_wc] = 0; | |
872 | ||
873 | return wc_buffer; | |
874 | } | |
875 | ||
876 | /* | |
877 | * g_utf8_normalize: | |
878 | * @str: a UTF-8 encoded string. | |
879 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. | |
880 | * @mode: the type of normalization to perform. | |
881 | * | |
882 | * Converts a string into canonical form, standardizing | |
883 | * such issues as whether a character with an accent | |
884 | * is represented as a base character and combining | |
885 | * accent or as a single precomposed character. You | |
886 | * should generally call g_utf8_normalize() before | |
887 | * comparing two Unicode strings. | |
888 | * | |
889 | * The normalization mode %G_NORMALIZE_DEFAULT only | |
890 | * standardizes differences that do not affect the | |
891 | * text content, such as the above-mentioned accent | |
892 | * representation. %G_NORMALIZE_ALL also standardizes | |
893 | * the "compatibility" characters in Unicode, such | |
894 | * as SUPERSCRIPT THREE to the standard forms | |
895 | * (in this case DIGIT THREE). Formatting information | |
896 | * may be lost but for most text operations such | |
897 | * characters should be considered the same. | |
898 | * For example, g_utf8_collate() normalizes | |
899 | * with %G_NORMALIZE_ALL as its first step. | |
900 | * | |
901 | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE | |
902 | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, | |
903 | * but returned a result with composed forms rather | |
904 | * than a maximally decomposed form. This is often | |
905 | * useful if you intend to convert the string to | |
906 | * a legacy encoding or pass it to a system with | |
907 | * less capable Unicode handling. | |
908 | * | |
909 | * Return value: a newly allocated string, that is the | |
910 | * normalized form of @str. | |
911 | **/ | |
912 | static gchar * | |
913 | g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode) | |
914 | { | |
915 | gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); | |
916 | gchar *result; | |
917 | ||
918 | result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); | |
919 | g_free (result_wc); | |
920 | ||
921 | return result; | |
922 | } | |
923 | ||
924 | /* Public Libidn API starts here. */ | |
925 | ||
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 byte sequence starting at @p into a single
 * Unicode character.  If @p does not point to a valid UTF-8 encoded
 * character, results are undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t wc = g_utf8_get_char (p);

  return wc;
}
941 | ||
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, only the length is computed and returned, and
 *       nothing is written.
 *
 * Encodes the single character @c as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
958 | ||
959 | /** | |
960 | * stringprep_utf8_to_ucs4: | |
961 | * @str: a UTF-8 encoded string | |
962 | * @len: the maximum length of @str to use. If @len < 0, then | |
963 | * the string is nul-terminated. | |
964 | * @items_written: location to store the number of characters in the | |
965 | * result, or %NULL. | |
966 | * | |
967 | * Convert a string from UTF-8 to a 32-bit fixed width | |
968 | * representation as UCS-4, assuming valid UTF-8 input. | |
969 | * This function does no error checking on the input. | |
970 | * | |
971 | * Return value: a pointer to a newly allocated UCS-4 string. | |
972 | * This value must be freed with free(). | |
973 | **/ | |
974 | uint32_t * | |
975 | stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written) | |
976 | { | |
977 | return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written); | |
978 | } | |
979 | ||
980 | /** | |
981 | * stringprep_ucs4_to_utf8: | |
982 | * @str: a UCS-4 encoded string | |
983 | * @len: the maximum length of @str to use. If @len < 0, then | |
984 | * the string is terminated with a 0 character. | |
985 | * @items_read: location to store number of characters read read, or %NULL. | |
986 | * @items_written: location to store number of bytes written or %NULL. | |
987 | * The value here stored does not include the trailing 0 | |
988 | * byte. | |
989 | * | |
990 | * Convert a string from a 32-bit fixed width representation as UCS-4. | |
991 | * to UTF-8. The result will be terminated with a 0 byte. | |
992 | * | |
993 | * Return value: a pointer to a newly allocated UTF-8 string. | |
994 | * This value must be freed with free(). If an | |
995 | * error occurs, %NULL will be returned and | |
996 | * @error set. | |
997 | **/ | |
998 | char * | |
999 | stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len, | |
1000 | size_t * items_read, size_t * items_written) | |
1001 | { | |
1002 | return g_ucs4_to_utf8 (str, len, (glong *) items_read, | |
1003 | (glong *) items_written, NULL); | |
1004 | } | |
1005 | ||
1006 | /** | |
1007 | * stringprep_utf8_nfkc_normalize: | |
1008 | * @str: a UTF-8 encoded string. | |
1009 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. | |
1010 | * | |
1011 | * Converts a string into canonical form, standardizing | |
1012 | * such issues as whether a character with an accent | |
1013 | * is represented as a base character and combining | |
1014 | * accent or as a single precomposed character. | |
1015 | * | |
1016 | * The normalization mode is NFKC (ALL COMPOSE). It standardizes | |
1017 | * differences that do not affect the text content, such as the | |
1018 | * above-mentioned accent representation. It standardizes the | |
1019 | * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to | |
1020 | * the standard forms (in this case DIGIT THREE). Formatting | |
1021 | * information may be lost but for most text operations such | |
1022 | * characters should be considered the same. It returns a result with | |
1023 | * composed forms rather than a maximally decomposed form. | |
1024 | * | |
1025 | * Return value: a newly allocated string, that is the | |
1026 | * NFKC normalized form of @str. | |
1027 | **/ | |
1028 | char * | |
1029 | stringprep_utf8_nfkc_normalize (const char *str, ssize_t len) | |
1030 | { | |
1031 | return g_utf8_normalize (str, len, G_NORMALIZE_NFKC); | |
1032 | } | |
1033 | ||
1034 | /** | |
1035 | * stringprep_ucs4_nfkc_normalize: | |
1036 | * @str: a Unicode string. | |
1037 | * @len: length of @str array, or -1 if @str is nul-terminated. | |
1038 | * | |
1039 | * Converts UCS4 string into UTF-8 and runs | |
1040 | * stringprep_utf8_nfkc_normalize(). | |
1041 | * | |
1042 | * Return value: a newly allocated Unicode string, that is the NFKC | |
1043 | * normalized form of @str. | |
1044 | **/ | |
1045 | uint32_t * | |
1046 | stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len) | |
1047 | { | |
1048 | char *p; | |
1049 | uint32_t *result_wc; | |
1050 | ||
1051 | p = stringprep_ucs4_to_utf8 (str, len, 0, 0); | |
1052 | result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC); | |
1053 | free (p); | |
1054 | ||
1055 | return result_wc; | |
1056 | } |