]>
Commit | Line | Data |
---|---|---|
55985355 | 1 | /* Transliteration using the locale's data. |
dff8da6b | 2 | Copyright (C) 2000-2024 Free Software Foundation, Inc. |
55985355 | 3 | This file is part of the GNU C Library. |
55985355 UD |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or | |
41bdb6e2 AJ |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
55985355 UD |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
41bdb6e2 | 13 | Lesser General Public License for more details. |
55985355 | 14 | |
41bdb6e2 | 15 | You should have received a copy of the GNU Lesser General Public |
59ba27a6 | 16 | License along with the GNU C Library; if not, see |
5a82c748 | 17 | <https://www.gnu.org/licenses/>. */ |
55985355 | 18 | |
d6204268 | 19 | #include <assert.h> |
f1d5c60d | 20 | #include <dlfcn.h> |
d6204268 | 21 | #include <search.h> |
55985355 | 22 | #include <stdint.h> |
d6204268 | 23 | #include <string.h> |
7884bf47 | 24 | #include <stdlib.h> |
55985355 | 25 | |
ec999b8e | 26 | #include <libc-lock.h> |
55985355 UD |
27 | #include "gconv_int.h" |
28 | #include "../locale/localeinfo.h" | |
88f4b692 | 29 | #include <pointer_guard.h> |
55985355 UD |
30 | |
31 | ||
32 | int | |
f1d5c60d UD |
33 | __gconv_transliterate (struct __gconv_step *step, |
34 | struct __gconv_step_data *step_data, | |
35 | const unsigned char *inbufstart, | |
36 | const unsigned char **inbufp, | |
37 | const unsigned char *inbufend, | |
38 | unsigned char **outbufstart, size_t *irreversible) | |
55985355 UD |
39 | { |
40 | /* Find out about the locale's transliteration. */ | |
535e935a | 41 | uint32_t size; |
17427edd UD |
42 | const uint32_t *from_idx; |
43 | const uint32_t *from_tbl; | |
44 | const uint32_t *to_idx; | |
45 | const uint32_t *to_tbl; | |
46 | const uint32_t *winbuf; | |
47 | const uint32_t *winbufend; | |
535e935a NG |
48 | uint32_t low; |
49 | uint32_t high; | |
55985355 | 50 | |
d5055a20 | 51 | /* The input buffer. There are actually 4-byte values. */ |
17427edd UD |
52 | winbuf = (const uint32_t *) *inbufp; |
53 | winbufend = (const uint32_t *) inbufend; | |
d5055a20 | 54 | |
1911b455 | 55 | __gconv_fct fct = step->__fct; |
1911b455 UD |
56 | if (step->__shlib_handle != NULL) |
57 | PTR_DEMANGLE (fct); | |
1911b455 | 58 | |
55985355 UD |
59 | /* If there is no transliteration information in the locale don't do |
60 | anything and return the error. */ | |
04fbc779 | 61 | size = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_TAB_SIZE); |
55985355 | 62 | if (size == 0) |
1d96d74d | 63 | goto no_rules; |
55985355 | 64 | |
f1d5c60d | 65 | /* Get the rest of the values. */ |
17427edd UD |
66 | from_idx = |
67 | (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_IDX); | |
68 | from_tbl = | |
69 | (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_TBL); | |
70 | to_idx = | |
71 | (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_IDX); | |
72 | to_tbl = | |
73 | (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_TBL); | |
f1d5c60d | 74 | |
f1d5c60d UD |
75 | /* Test whether there is enough input. */ |
76 | if (winbuf + 1 > winbufend) | |
77 | return (winbuf == winbufend | |
78 | ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT); | |
79 | ||
14ef9c18 DL |
80 | /* The array starting at FROM_IDX contains indices to the string table |
81 | in FROM_TBL. The indices are sorted wrt to the strings. I.e., we | |
f1d5c60d UD |
82 | are doing binary search. */ |
83 | low = 0; | |
84 | high = size; | |
85 | while (low < high) | |
86 | { | |
535e935a | 87 | uint32_t med = (low + high) / 2; |
f1d5c60d UD |
88 | uint32_t idx; |
89 | int cnt; | |
90 | ||
91 | /* Compare the string at this index with the string at the current | |
92 | position in the input buffer. */ | |
93 | idx = from_idx[med]; | |
94 | cnt = 0; | |
95 | do | |
96 | { | |
97 | if (from_tbl[idx + cnt] != winbuf[cnt]) | |
98 | /* Does not match. */ | |
99 | break; | |
100 | ++cnt; | |
101 | } | |
102 | while (from_tbl[idx + cnt] != L'\0' && winbuf + cnt < winbufend); | |
103 | ||
104 | if (cnt > 0 && from_tbl[idx + cnt] == L'\0') | |
105 | { | |
106 | /* Found a matching input sequence. Now try to convert the | |
107 | possible replacements. */ | |
108 | uint32_t idx2 = to_idx[med]; | |
109 | ||
110 | do | |
111 | { | |
112 | /* Determine length of replacement. */ | |
535e935a | 113 | unsigned int len = 0; |
f1d5c60d UD |
114 | int res; |
115 | const unsigned char *toinptr; | |
403cb8a1 | 116 | unsigned char *outptr; |
f1d5c60d UD |
117 | |
118 | while (to_tbl[idx2 + len] != L'\0') | |
119 | ++len; | |
120 | ||
121 | /* Try this input text. */ | |
122 | toinptr = (const unsigned char *) &to_tbl[idx2]; | |
403cb8a1 | 123 | outptr = *outbufstart; |
1911b455 | 124 | res = DL_CALL_FCT (fct, |
f1d5c60d UD |
125 | (step, step_data, &toinptr, |
126 | (const unsigned char *) &to_tbl[idx2 + len], | |
403cb8a1 | 127 | &outptr, NULL, 0, 0)); |
f1d5c60d UD |
128 | if (res != __GCONV_ILLEGAL_INPUT) |
129 | { | |
130 | /* If the conversion succeeds we have to increment the | |
131 | input buffer. */ | |
132 | if (res == __GCONV_EMPTY_INPUT) | |
133 | { | |
134 | *inbufp += cnt * sizeof (uint32_t); | |
135 | ++*irreversible; | |
a8e4c924 | 136 | res = __GCONV_OK; |
f1d5c60d | 137 | } |
1b14353e UD |
138 | /* Do not increment the output pointer if we could not |
139 | store the entire output. */ | |
140 | if (res != __GCONV_FULL_OUTPUT) | |
141 | *outbufstart = outptr; | |
f1d5c60d UD |
142 | |
143 | return res; | |
144 | } | |
145 | ||
146 | /* Next replacement. */ | |
147 | idx2 += len + 1; | |
148 | } | |
149 | while (to_tbl[idx2] != L'\0'); | |
150 | ||
151 | /* Nothing found, continue searching. */ | |
152 | } | |
a8e4c924 UD |
153 | else if (cnt > 0) |
154 | /* This means that the input buffer contents matches a prefix of | |
155 | an entry. Since we cannot match it unless we get more input, | |
156 | we will tell the caller about it. */ | |
157 | return __GCONV_INCOMPLETE_INPUT; | |
f1d5c60d UD |
158 | |
159 | if (winbuf + cnt >= winbufend || from_tbl[idx + cnt] < winbuf[cnt]) | |
04fbc779 | 160 | low = med + 1; |
f1d5c60d | 161 | else |
04fbc779 | 162 | high = med; |
f1d5c60d UD |
163 | } |
164 | ||
1d96d74d | 165 | no_rules: |
a8e4c924 UD |
166 | /* Maybe the character is supposed to be ignored. */ |
167 | if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN) != 0) | |
168 | { | |
169 | int n = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN); | |
17427edd UD |
170 | const uint32_t *ranges = |
171 | (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE); | |
172 | const uint32_t wc = *(const uint32_t *) (*inbufp); | |
a8e4c924 UD |
173 | int i; |
174 | ||
175 | /* Test whether there is enough input. */ | |
176 | if (winbuf + 1 > winbufend) | |
177 | return (winbuf == winbufend | |
178 | ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT); | |
179 | ||
180 | for (i = 0; i < n; ranges += 3, ++i) | |
181 | if (ranges[0] <= wc && wc <= ranges[1] | |
182 | && (wc - ranges[0]) % ranges[2] == 0) | |
183 | { | |
184 | /* Matches the range. Ignore it. */ | |
185 | *inbufp += 4; | |
186 | ++*irreversible; | |
187 | return __GCONV_OK; | |
188 | } | |
189 | else if (wc < ranges[0]) | |
190 | /* There cannot be any other matching range since they are | |
191 | sorted. */ | |
192 | break; | |
193 | } | |
194 | ||
195 | /* One last chance: use the default replacement. */ | |
fb46e8d2 | 196 | if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN) != 0) |
1d96d74d | 197 | { |
17427edd | 198 | const uint32_t *default_missing = (const uint32_t *) |
fb46e8d2 | 199 | _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING); |
1d96d74d UD |
200 | const unsigned char *toinptr = (const unsigned char *) default_missing; |
201 | uint32_t len = _NL_CURRENT_WORD (LC_CTYPE, | |
202 | _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN); | |
403cb8a1 | 203 | unsigned char *outptr; |
1d96d74d UD |
204 | int res; |
205 | ||
a8e4c924 UD |
206 | /* Test whether there is enough input. */ |
207 | if (winbuf + 1 > winbufend) | |
208 | return (winbuf == winbufend | |
209 | ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT); | |
210 | ||
403cb8a1 | 211 | outptr = *outbufstart; |
1911b455 | 212 | res = DL_CALL_FCT (fct, |
1d96d74d UD |
213 | (step, step_data, &toinptr, |
214 | (const unsigned char *) (default_missing + len), | |
403cb8a1 | 215 | &outptr, NULL, 0, 0)); |
1d96d74d UD |
216 | |
217 | if (res != __GCONV_ILLEGAL_INPUT) | |
218 | { | |
219 | /* If the conversion succeeds we have to increment the | |
220 | input buffer. */ | |
221 | if (res == __GCONV_EMPTY_INPUT) | |
222 | { | |
a8e4c924 | 223 | /* This worked but is not reversible. */ |
1d96d74d | 224 | ++*irreversible; |
a8e4c924 UD |
225 | *inbufp += 4; |
226 | res = __GCONV_OK; | |
1d96d74d | 227 | } |
403cb8a1 | 228 | *outbufstart = outptr; |
1d96d74d UD |
229 | |
230 | return res; | |
231 | } | |
232 | } | |
233 | ||
f1d5c60d | 234 | /* Haven't found a match. */ |
55985355 UD |
235 | return __GCONV_ILLEGAL_INPUT; |
236 | } | |
ba7b4d29 | 237 | libc_hidden_def (__gconv_transliterate) |