[PATCH v4 4/4] iconv: Add UTF-7-IMAP variant in utf-7.c
Adhemerval Zanella
adhemerval.zanella@linaro.org
Mon Mar 7 12:46:38 GMT 2022
On 09/12/2021 06:31, Max Gautier via Libc-alpha wrote:
> UTF-7-IMAP differs from UTF-7 in the following ways (see RFC 3501[1]
> for reference):
>
> - The shift character is '&' instead of '+'
> - There are no "optional direct characters" and the "direct characters"
> set is different
> - ',' replaces '/' in the Modified Base64 alphabet
> - There is no implicit shift back to US-ASCII from BASE64, all BASE64
> sequences MUST be terminated with '-'
>
> [1]: https://datatracker.ietf.org/doc/html/rfc3501#section-5.1.3
>
> Signed-off-by: Max Gautier <mg@max.gautier.name>
Patch looks OK; there are some minor style issues (as in the other parts of the series).
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
> ---
> iconvdata/TESTS | 1 +
> iconvdata/gconv-modules | 4 ++++
> iconvdata/testdata/UTF-7-IMAP | 1 +
> iconvdata/testdata/UTF-7-IMAP..UTF8 | 32 +++++++++++++++++++++++++++++
> iconvdata/utf-7.c | 28 ++++++++++++++++++++-----
> 5 files changed, 61 insertions(+), 5 deletions(-)
> create mode 100644 iconvdata/testdata/UTF-7-IMAP
> create mode 100644 iconvdata/testdata/UTF-7-IMAP..UTF8
>
> diff --git a/iconvdata/TESTS b/iconvdata/TESTS
> index a0157c3350..3cc043c21b 100644
> --- a/iconvdata/TESTS
> +++ b/iconvdata/TESTS
> @@ -94,6 +94,7 @@ EUC-TW EUC-TW Y UTF8
> GBK GBK Y UTF8
> BIG5HKSCS BIG5HKSCS Y UTF8
> UTF-7 UTF-7 N UTF8
> +UTF-7-IMAP UTF-7-IMAP N UTF8
> IBM856 IBM856 N UTF8
> IBM922 IBM922 Y UTF8
> IBM930 IBM930 N UTF8
Ok.
> diff --git a/iconvdata/gconv-modules b/iconvdata/gconv-modules
> index 4acbba062f..d120699394 100644
> --- a/iconvdata/gconv-modules
> +++ b/iconvdata/gconv-modules
> @@ -113,3 +113,7 @@ module INTERNAL UTF-32BE// UTF-32 1
> alias UTF7// UTF-7//
> module UTF-7// INTERNAL UTF-7 1
> module INTERNAL UTF-7// UTF-7 1
> +
> +# from to module cost
> +module UTF-7-IMAP// INTERNAL UTF-7 1
> +module INTERNAL UTF-7-IMAP// UTF-7 1
Ok.
> diff --git a/iconvdata/testdata/UTF-7-IMAP b/iconvdata/testdata/UTF-7-IMAP
> new file mode 100644
> index 0000000000..6b5dada63c
> --- /dev/null
> +++ b/iconvdata/testdata/UTF-7-IMAP
> @@ -0,0 +1 @@
> +&EqASGxItEps- Amharic&AAoBDQ-esky Czech&AAo-Dansk Danish&AAo-English English&AAo-Suomi Finnish&AAo-Fran&AOc-ais French&AAo-Deutsch German&AAoDlQO7A7sDtwO9A7kDugOs- Greek&AAoF4gXRBegF2QXq- Hebrew&AAo-Italiano Italian&AAo-Norsk Norwegian&AAoEIARDBEEEQQQ6BDgEOQ- Russian&AAo-Espa&APE-ol Spanish&AAo-Svenska Swedish&AAoOIA4yDikOMg5EDhcOIg- Thai&AAo-T&APw-rk&AOc-e Turkish&AAo-Ti&Hr8-ng Vi&Hsc-t Vietnamese&AApl5Wcsip4- Japanese&AApOLWWH- Chinese&AArVXK4A- Korean&AAoACg-// Checking for correct handling of shift characters ('&-', '-') after base64 sequences&AArVXK4A-&-&AArVXK4A--&AAoACg-// Checking for correct handling of litteral '&-' and '-'&AAo----&-&--&AAoACg-// The last line of this file is missing the end-of-line terminator&AAo-// on purpose, in order to test that the conversion empties the bit buffer&AAo-// and shifts back to the initial state at the end of the conversion.&AAo-A&ImIDkQ-
> \ No newline at end of file
Ok.
> diff --git a/iconvdata/testdata/UTF-7-IMAP..UTF8 b/iconvdata/testdata/UTF-7-IMAP..UTF8
> new file mode 100644
> index 0000000000..8b9add3670
> --- /dev/null
> +++ b/iconvdata/testdata/UTF-7-IMAP..UTF8
> @@ -0,0 +1,32 @@
> +አማርኛ Amharic
> +česky Czech
> +Dansk Danish
> +English English
> +Suomi Finnish
> +Français French
> +Deutsch German
> +Ελληνικά Greek
> +עברית Hebrew
> +Italiano Italian
> +Norsk Norwegian
> +Русский Russian
> +Español Spanish
> +Svenska Swedish
> +ภาษาไทย Thai
> +Türkçe Turkish
> +Tiếng Việt Vietnamese
> +日本語 Japanese
> +中文 Chinese
> +한글 Korean
> +
> +// Checking for correct handling of shift characters ('&', '-') after base64 sequences
> +한글&
> +한글-
> +
> +// Checking for correct handling of litteral '&' and '-'
> +---&&-
> +
> +// The last line of this file is missing the end-of-line terminator
> +// on purpose, in order to test that the conversion empties the bit buffer
> +// and shifts back to the initial state at the end of the conversion.
> +A≢Α
> \ No newline at end of file
Ok.
> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
> index 965d4220f1..553636e324 100644
> --- a/iconvdata/utf-7.c
> +++ b/iconvdata/utf-7.c
> @@ -32,11 +32,13 @@
> enum variant
> {
> UTF7,
> + UTF_7_IMAP
> };
>
> /* Must be in the same order as enum variant above. */
> static const char names[] =
> "UTF-7//\0"
> + "UTF-7-IMAP//\0"
> "\0";
>
> static uint32_t
> @@ -44,6 +46,8 @@ shift_character(enum variant const var)
> {
> if (var == UTF7)
> return '+';
> + else if (var == UTF_7_IMAP)
> + return '&';
> else
> abort();
> }
> @@ -58,6 +62,9 @@ between(uint32_t const ch,
> /* The set of "direct characters":
> FOR UTF-7
> A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
> + FOR UTF-7-IMAP
> + A-Z a-z 0-9 ' ( ) , - . / : ? space
> + ! " # $ % + * ; < = > @ [ \ ] ^ _ ` { | } ~
> */
>
> static int
> @@ -71,6 +78,8 @@ isdirect (uint32_t ch, enum variant var)
> || between(ch, ',', '/')
> || ch == ':' || ch == '?'
> || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
> + else if (var == UTF_7_IMAP)
> + return (ch != '&' && between(ch, ' ', '~'));
> abort();
> }
>
Some style issues as before (e.g. GNU style puts a space before the opening parenthesis: 'between (ch, ...)', 'abort ();').
> @@ -127,6 +136,8 @@ base64 (unsigned int i, enum variant var)
> return '+';
> else if (i == 63 && var == UTF7)
> return '/';
> + else if (i == 63 && var == UTF_7_IMAP)
> + return ',';
> else
> abort ();
> }
> @@ -313,7 +324,8 @@ gconv_end (struct __gconv_step *data)
> i = ch - '0' + 52; \
> else if (ch == '+') \
> i = 62; \
> - else if (ch == '/') \
> + else if ((var == UTF7 && ch == '/') \
> + || (var == UTF_7_IMAP && ch == ',')) \
> i = 63; \
> else \
> { \
> @@ -321,8 +333,10 @@ gconv_end (struct __gconv_step *data)
> \
> /* If accumulated data is nonzero, the input is invalid. */ \
> /* Also, partial UTF-16 characters are invalid. */ \
> + /* In IMAP variant, must be terminated by '-'. */ \
> if (__builtin_expect (statep->__value.__wch != 0, 0) \
> - || __builtin_expect ((statep->__count >> 3) <= 26, 0)) \
> + || __builtin_expect ((statep->__count >> 3) <= 26, 0) \
> + || __builtin_expect (var == UTF_7_IMAP && ch != '-', 0)) \
Use __glibc_unlikely (the equivalent of __builtin_expect (..., 0)).
> { \
> STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1)); \
> } \
> @@ -479,13 +493,15 @@ gconv_end (struct __gconv_step *data)
> else \
> { \
> /* base64 encoding active */ \
> - if (isdirect (ch, var)) \
> + if ((var == UTF_7_IMAP && ch == '&') || isdirect (ch, var)) \
> { \
> /* deactivate base64 encoding */ \
> size_t count; \
> \
> count = ((statep->__count & 0x18) >= 0x10) \
> - + needs_explicit_shift (ch) + 1; \
> + + (var == UTF_7_IMAP || needs_explicit_shift (ch)) \
> + + (var == UTF_7_IMAP && ch == '&') \
> + + 1; \
> if (__glibc_unlikely (outptr + count > outend)) \
> { \
> result = __GCONV_FULL_OUTPUT; \
> @@ -494,9 +510,11 @@ gconv_end (struct __gconv_step *data)
> \
> if ((statep->__count & 0x18) >= 0x10) \
> *outptr++ = base64 ((statep->__count >> 3) & ~3, var); \
> - if (needs_explicit_shift (ch)) \
> + if (var == UTF_7_IMAP || needs_explicit_shift (ch)) \
> *outptr++ = '-'; \
> *outptr++ = (unsigned char) ch; \
> + if (var == UTF_7_IMAP && ch == '&') \
> + *outptr++ = '-'; \
> statep->__count = 0; \
> } \
> else \
More information about the Libc-alpha
mailing list