From 161211d186a16e4f090b8b3c63040f0b9aee25d4 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Tue, 24 Mar 2009 12:18:34 +0000 Subject: [PATCH] * ctype.cc (_CTYPE_DATA_0_127): Add _B class to TAB character. (__ctype_default): New character class array for default ASCII character set. (__ctype_iso): New array of character class arrays for ISO charsets. (__ctype_cp): Ditto for singlebyte Windows codepages. (tolower): Implement as distinct function to support any singlebyte charset. (toupper): Ditto. (__set_ctype): New function to copy singlebyte character classes corresponding to current charset to ctype_b array. Align copyright text to upstream. * dcrt0.cc (dll_crt0_1): Reset current locale to "C" per POSIX. * environ.cc (set_file_api_mode): Remove. (codepage_init): Remove. (parse_thing): Remove "codepage" setting. (environ_init): Set locale according to environment settings, or to current codepage, before converting environment to multibyte. * fhandler.h (fhandler_console::write_replacement_char): Drop argument. * fhandler_console.cc (dev_console::str_to_con): Call sys_cp_mbstowcs rather than MultiByteToWideChar. (fhandler_console::write_replacement_char): Always print a funny half filled square if a character isn't in the current charset. (fhandler_console::write_normal): Convert to using __mbtowc rather than next_char. * fork.cc (frok::child): Drop call to set_file_api_mode. * globals.cc (enum codepage_type): Remove. (current_codepage): Remove. * miscfuncs.cc (cygwin_wcslwr): Unused, dangerous. Remove. (cygwin_wcsupr): Ditto. (is_cp_multibyte): Remove. (next_char): Remove. * miscfuncs.h (is_cp_multibyte): Drop declaration. (next_char): Ditto. * strfuncs.cc (get_cp): Remove. (__db_wctomb): New function to implement _wctomb_r functionality for doublebyte charsets using WideCharToMultiByte. (__sjis_wctomb): New function to replace unusable newlib function. (__jis_wctomb): Ditto. (__eucjp_wctomb): Ditto. (__gbk_wctomb): New function. (__kr_wctomb): Ditto. 
(__big5_wctomb): Ditto. (__db_mbtowc): New function to implement _mbtowc_r functionality for doublebyte charsets using MultiByteToWideChar. (__sjis_mbtowc): New function to replace unusable newlib function. (__jis_mbtowc): Ditto. (__eucjp_mbtowc): Ditto. (__gbk_mbtowc): New function. (__kr_mbtowc): New function. (__big5_mbtowc): New function. (__set_charset_from_codepage): New function. (sys_wcstombs): Reimplement, basically using same wide char to multibyte conversion as newlib's application level functions. Plus extras. Add lengthy comment to explain. Change return type to size_t. (sys_wcstombs_alloc): Just use sys_wcstombs. Change return type to size_t. (sys_cp_mbstowcs): Replace sys_mbstowcs, take additional codepage argument. Explain why. Change return type to size_t. (sys_mbstowcs_alloc): Just use sys_mbstowcs. Change return type to size_t. * wchar.h: Declare internal functions implemented in strfuncs.cc. (wcscasecmp): Remove. (wcsncasecmp): Remove. (wcslwr): Remove. (wcsupr): Remove. * winsup.h (codepage_init): Remove declaration. (get_cp): Ditto. (sys_wcstombs): Align declaration to new implementation. (sys_wcstombs_alloc): Ditto. (sys_cp_mbstowcs): Add declaration. (sys_mbstowcs): Define as inline function. (sys_mbstowcs_alloc): Align declaration to new implementation. (set_file_api_mode): Remove declaration. * include/ctype.h (isblank): Redefine to use _B character class. (toupper): Remove ASCII-only definition. (tolower): Ditto. 
--- winsup/cygwin/ChangeLog | 81 +++- winsup/cygwin/ctype.cc | 724 +++++++++++++++++++++++++++++- winsup/cygwin/dcrt0.cc | 3 + winsup/cygwin/environ.cc | 69 +-- winsup/cygwin/fhandler.h | 2 +- winsup/cygwin/fhandler_console.cc | 88 ++-- winsup/cygwin/fork.cc | 2 - winsup/cygwin/globals.cc | 2 - winsup/cygwin/include/ctype.h | 8 +- winsup/cygwin/miscfuncs.cc | 132 ------ winsup/cygwin/miscfuncs.h | 5 - winsup/cygwin/strfuncs.cc | 489 +++++++++++++++++--- winsup/cygwin/wchar.h | 27 +- winsup/cygwin/winsup.h | 19 +- 14 files changed, 1336 insertions(+), 315 deletions(-) diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog index 79076ce1b..dea7f3d22 100644 --- a/winsup/cygwin/ChangeLog +++ b/winsup/cygwin/ChangeLog @@ -1,4 +1,83 @@ -2009-03-23 Corinna Vinschen +2009-03-24 Corinna Vinschen + + * ctype.cc (_CTYPE_DATA_0_127): Add _B class to TAB character. + (__ctype_default): New character class array for default ASCII + character set. + (__ctype_iso): New array of character class array for ISO charsets. + (__ctype_cp): Ditto for singlebyte Windows codepages. + (tolower): Implement as distinct function to support any singlebyte + charset. + (toupper): Ditto. + (__set_ctype): New function to copy singlebyte character classes + corresponding to current charset to ctype_b array. + Align copyright text to upstream. + * dcrt0.cc (dll_crt0_1): Reset current locale to "C" per POSIX. + * environ.cc (set_file_api_mode): Remove. + (codepage_init): Remove. + (parse_thing): Remove "codepage" setting. + (environ_init): Set locale according to environment settings, or + to current codepage, before converting environment to multibyte. + * fhandler.h (fhandler_console::write_replacement_char): Drop argument. + * fhandler_console.cc (dev_console::str_to_con): Call sys_cp_mbstowcs + rather than MultiByteToWideChar. + (fhandler_console::write_replacement_char): Always print a funny + half filled square if a character isn't in the current charset. 
+ (fhandler_console::write_normal): Convert to using __mbtowc + rather than next_char. + * fork.cc (frok::child): Drop call to set_file_api_mode. + * globals.cc (enum codepage_type) Remove. + (current_codepage): Remove. + * miscfuncs.cc (cygwin_wcslwr): Unused, dangerous. Remove. + (cygwin_wcsupr): Ditto. + (is_cp_multibyte): Remove. + (next_char): Remove. + * miscfuncs.h (is_cp_multibyte): Drop declaration. + (next_char): Ditto. + * strfuncs.cc (get_cp): Remove. + (__db_wctomb): New function to implement _wctomb_r functionality for + doublebyte charsets using WideCharToMultiByte. + (__sjis_wctomb): New function to replace unusable newlib function. + (__jis_wctomb): Ditto. + (__eucjp_wctomb): Ditto. + (__gbk_wctomb): New function. + (__kr_wctomb): Ditto. + (__big5_wctomb): Ditto. + (__db_mbtowc): New function to implement _mbtowc_r functionality for + doublebyte charsets using MultiByteToWideChar. + (__sjis_mbtowc): New function to replace unusable newlib function. + (__jis_mbtowc): Ditto. + (__eucjp_mbtowc): Ditto. + (__gbk_mbtowc): New function. + (__kr_mbtowc): New function + (__big5_mbtowc): New function + (__set_charset_from_codepage): New function. + (sys_wcstombs): Reimplement, basically using same wide char to multibyte + conversion as newlib's application level functions. Plus extras. + Add lengthy comment to explain. Change return type to size_t. + (sys_wcstombs_alloc): Just use sys_wcstombs. Change return type to + size_t. + (sys_cp_mbstowcs): Replace sys_mbstowcs, take additional codepage + argument. Explain why. Change return type to size_t. + (sys_mbstowcs_alloc): Just use sys_mbstowcs. Change return type to + size_t. + * wchar.h: Declare internal functions implemented in strfuncs.cc. + (wcscasecmp): Remove. + (wcsncasecmp): Remove. + (wcslwr): Remove. + (wcsupr): Remove. + * winsup.h (codepage_init): Remove declaration. + (get_cp): Ditto. + (sys_wcstombs): Align declaration to new implementation. + (sys_wcstombs_alloc): Ditto. 
+ (sys_cp_mbstowcs): Add declaration. + (sys_mbstowcs): Define as inline function. + (sys_mbstowcs_alloc): Align declaration to new implementation. + (set_file_api_mode): Remove declaration. + * include/ctype.h (isblank): Redefine to use _B character class. + (toupper): Remove ASCII-only definition. + (tolower): Ditto. + +2009-03-24 Corinna Vinschen * sec_auth.cc (str2buf2uni): Remove. * security.h (str2buf2uni): Remove declaration. diff --git a/winsup/cygwin/ctype.cc b/winsup/cygwin/ctype.cc index b656d3e08..5d70b6118 100644 --- a/winsup/cygwin/ctype.cc +++ b/winsup/cygwin/ctype.cc @@ -1,10 +1,12 @@ #include "winsup.h" extern "C" { #include +#include +#include #define _CTYPE_DATA_0_127 \ _C, _C, _C, _C, _C, _C, _C, _C, \ - _C, _C|_S, _C|_S, _C|_S, _C|_S, _C|_S, _C, _C, \ + _C, _B|_C|_S, _C|_S, _C|_S, _C|_S, _C|_S, _C, _C, \ _C, _C, _C, _C, _C, _C, _C, _C, \ _C, _C, _C, _C, _C, _C, _C, _C, \ _S|_B, _P, _P, _P, _P, _P, _P, _P, \ @@ -36,7 +38,645 @@ extern "C" { 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, \ 0, 0, 0, 0, 0, 0, 0, 0, \ - 0, 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 0, 0, 0, 0, 0 + +/* FIXME: These tables should rather be defined in newlib and we should + switch to the newer __ctype_ptr method from newlib for new applications. 
*/ + +static char __ctype_default[128] = { _CTYPE_DATA_128_256 }; +static char __ctype_iso[15][128] = { + /* ISO-8859-1 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* ISO-8859-2 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _U, _P, _U, _P, _U, _U, _P, + _P, _U, _U, _U, _U, _P, _U, _U, + _P, _L, _P, _L, _P, _L, _L, _P, + _P, _L, _L, _L, _L, _P, _L, _L, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* ISO-8859-3 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _U, _P, _P, _P, 0, _U, _P, + _P, _U, _U, _U, _U, _P, 0, _U, + _P, _L, _P, _P, _P, _L, _L, _P, + _P, _L, _L, _L, _L, _P, 0, _L, + _U, _U, _U, 0, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + 0, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, 0, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + 0, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _P }, + /* ISO-8859-4 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _U, _L, _U, _P, _U, _U, 
_P, + _P, _U, _U, _U, _U, _P, _U, _P, + _P, _L, _P, _L, _P, _L, _L, _P, + _P, _L, _L, _L, _L, _P, _L, _L, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* ISO-8859-5 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _P, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _P, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _P, _L, _L }, + /* ISO-8859-6 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, 0, 0, 0, _P, 0, 0, 0, + 0, 0, 0, 0, _P, _P, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, _P, 0, 0, 0, _P, + 0, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + 0, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, 0, 0, 0, 0, 0, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _P, _P, _P, _P, _P, + _P, _P, _P, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }, + /* ISO-8859-7 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _U, _P, + _U, _U, _U, _P, _U, _P, _U, _U, + _L, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, 
_L, _L, + _L, _L, _L, _L, _L, _L, _L, _P }, + /* ISO-8859-8 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, 0, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, 0, 0, _P, _P, 0 }, + /* ISO-8859-9 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* ISO-8859-10 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _P, _U, _U, + _P, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _P, _L, _L, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* ISO-8859-11 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, 
_L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _P, _L, _L, _P, _P, _P, _P, + _P, _P, _P, 0, 0, 0, 0, _P, + _L, _L, _L, _L, _L, _L, _L, _P, + _P, _P, _P, _P, _P, _P, _P, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, 0, 0, 0, 0 }, + /* ISO-8859-13 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _U, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _L, _P, _P, _P, _P, _P, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _P }, + /* ISO-8859-14 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _U, _L, _P, _U, _L, _U, _P, + _U, _P, _U, _L, _U, _P, _P, _U, + _U, _L, _U, _L, _U, _L, _P, _U, + _L, _L, _L, _U, _L, _U, _L, _L, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* ISO-8859-15 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* 
ISO-8859-16 */ + { _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _C, _C, _C, _C, _C, _C, _C, _C, + _S|_B, _U, _L, _U, _P, _P, _U, _P, + _L, _P, _U, _P, _U, _P, _L, _U, + _P, _P, _U, _U, _U, _P, _P, _P, + _L, _L, _L, _P, _U, _L, _U, _L, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L } +}; +static char __ctype_cp[22][128] = { + /* CP437 */ + { _U, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _U, _U, + _U, _L, _U, _L, _L, _L, _L, _L, + _L, _U, _U, _P, _P, _P, _P, _P, + _L, _L, _L, _L, _L, _L, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _L, _U, _L, _U, _L, _P, _L, + _U, _U, _U, _L, _P, _L, _L, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP720 */ + { 0, 0, _L, _L, 0, _L, 0, _L, + _L, _L, _L, _L, _L, 0, 0, 0, + 0, _P, _P, _L, _P, _P, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + 0, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP737 */ + { _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, 
_P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _U, _U, _U, _U, _U, _U, + _U, _P, _P, _P, _P, _U, _U, _P, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP775 */ + { _U, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _U, _L, _L, _U, _U, _U, + _U, _L, _U, _L, _L, _U, _P, _U, + _L, _U, _U, _P, _P, _P, _P, _P, + _U, _U, _L, _U, _L, _L, _P, _P, + _P, _P, _P, _P, _P, _U, _P, _P, + _P, _P, _P, _P, _P, _U, _U, _U, + _U, _P, _P, _P, _P, _U, _U, _P, + _P, _P, _P, _P, _P, _P, _U, _U, + _P, _P, _P, _P, _P, _P, _P, _U, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _P, _P, _P, _P, _P, _P, _P, + _U, _L, _U, _U, _L, _U, _L, _L, + _U, _L, _U, _L, _L, _U, _U, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP850 */ + { _U, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _U, _U, + _U, _L, _U, _L, _L, _L, _L, _L, + _L, _U, _U, _L, _P, _U, _P, _P, + _L, _L, _L, _L, _L, _U, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _U, _U, _U, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _L, _U, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _U, _U, _U, _U, _L, _U, _U, + _U, _P, _P, _P, _P, _P, _U, _P, + _U, _L, _U, _U, _L, _U, _L, _U, + _L, _U, _U, _U, _L, _U, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP852 */ + { _U, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _U, _L, _L, _U, _U, _U, + _U, _U, _L, _L, _L, _U, _L, _U, + _L, _U, _U, _U, _L, _U, _P, _L, + _L, _L, _L, _L, _U, _L, _U, _L, + _U, _L, _P, _L, _U, _L, _P, _P, + _P, _P, _P, _P, _P, _U, _U, _U, + _U, _P, _P, _P, _P, _U, _L, _P, + _P, _P, _P, _P, _P, _P, _U, _L, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _U, _U, _U, _L, _U, _U, _U, + _L, _P, _P, _P, _P, _U, _U, _P, + _U, _L, _U, _U, _L, _L, _U, _L, + _U, _U, _L, _U, _L, _U, _L, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _L, _U, _L, _P, _S|_B }, + /* CP855 
*/ + { _L, _U, _L, _U, _L, _U, _L, _U, + _L, _U, _L, _U, _L, _U, _L, _U, + _L, _U, _L, _U, _L, _U, _L, _U, + _L, _U, _L, _U, _L, _U, _L, _U, + _L, _U, _L, _U, _L, _U, _L, _U, + _L, _U, _L, _U, _L, _U, _P, _P, + _P, _P, _P, _P, _P, _L, _U, _L, + _U, _P, _P, _P, _P, _L, _U, _P, + _P, _P, _P, _P, _P, _P, _L, _U, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _U, _L, _U, _L, _U, _L, _U, + _L, _P, _P, _P, _P, _U, _L, _P, + _U, _L, _U, _L, _U, _L, _U, _L, + _U, _L, _U, _L, _U, _L, _U, _P, + _P, _L, _U, _L, _U, _L, _U, _L, + _U, _L, _U, _L, _U, _P, _P, _S|_B }, + /* CP857 */ + { _U, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _U, _U, + _U, _L, _U, _L, _L, _L, _L, _L, + _U, _U, _U, _L, _P, _U, _U, _L, + _L, _L, _L, _L, _L, _U, _U, _L, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _U, _U, _U, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _L, _U, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _U, _U, _U, _L, _U, _U, + _U, _P, _P, _P, _P, _P, _U, _P, + _U, _L, _U, _U, _L, _U, _L, _L, + _P, _U, _U, _U, _L, _L, _P, _P, + _P, _P, _L, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP858 */ + { _U, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _U, _U, + _U, _L, _U, _L, _L, _L, _L, _L, + _L, _U, _U, _L, _P, _U, _P, _P, + _L, _L, _L, _L, _L, _U, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _U, _U, _U, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _L, _U, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _U, _U, _U, _U, _P, _U, _U, + _U, _P, _P, _P, _P, _P, _U, _P, + _U, _L, _U, _U, _L, _U, _L, _U, + _L, _U, _U, _U, _L, _U, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP862 */ + { _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _P, _P, _P, _P, _P, + _L, _L, _L, _L, _L, _U, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, 
_P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _L, _U, _L, _U, _L, _P, _L, + _U, _U, _U, _L, _P, _L, _L, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP866 */ + { _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _U, _L, _U, _L, _U, _L, _U, _L, + _P, _P, _P, _P, _P, _P, _P, _S|_B }, + /* CP874 */ + { _P, 0, 0, 0, 0, _P, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, _P, _P, _P, _P, _P, _P, _P, + 0, 0, 0, 0, 0, 0, 0, 0, + _S|_B, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, 0, 0, 0, 0, _P, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _L, _L, 0, 0, 0, 0 }, + /* CP1125 */ + { _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _U, _L, _U, _L, _U, _L, _U, _L, + _U, _L, _P, _P, _P, _P, _P, _S|_B }, + /* CP1250 */ + { _P, 0, _P, 0, _P, _P, _P, _P, + 0, _P, _U, _P, _U, _U, _U, _U, + 
0, _P, _P, _P, _P, _P, _P, _P, + 0, _P, _L, _P, _L, _L, _L, _L, + _S|_B, _P, _P, _U, _P, _U, _P, _P, + _P, _P, _U, _P, _P, _P, _P, _U, + _P, _P, _P, _L, _P, _P, _P, _P, + _P, _L, _L, _P, _U, _P, _L, _L, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _P }, + /* CP1251 */ + { _U, _U, _P, _L, _P, _P, _P, _P, + _P, _P, _U, _P, _U, _U, _U, _U, + _L, _P, _P, _P, _P, _P, _P, _P, + _L, _P, _L, _L, _L, _L, _P, _U, + _S|_B, _U, _L, _U, _P, _U, _P, _P, + _U, _P, _U, _P, _P, _P, _P, _U, + _P, _P, _U, _L, _L, _P, _P, _P, + _L, _P, _L, _P, _L, _U, _L, _L, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* CP1252 */ + { _P, 0, _P, _L, _P, _P, _P, _P, + _P, _P, _U, _P, _U, _U, 0, 0, + 0, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _L, _P, _L, 0, _L, _U, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* CP1253 */ + { _P, 0, _P, _L, _P, _P, _P, _P, + 0, _P, 0, _P, 0, 0, 0, 0, + 0, _P, _P, _P, _P, _P, _P, _P, + 0, _P, _P, 0, 0, 0, 0, 0, + _S|_B, _P, _U, _P, _P, _P, _P, _P, + _P, _P, 0, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _U, _U, _U, _P, _U, _P, _U, _U, + _L, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, 
_U, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* CP1254 */ + { _P, 0, _P, _L, _P, _P, _P, _P, + _P, _P, _U, _P, _U, 0, 0, 0, + 0, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _L, _P, _L, 0, 0, _U, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _L }, + /* CP1255 */ + { _P, 0, _P, _L, _P, _P, _P, _P, + _P, _P, 0, _P, 0, 0, 0, 0, + 0, _P, _P, _P, _P, _P, _P, _P, + _P, _P, 0, _P, 0, 0, 0, 0, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, 0, 0, 0, 0, 0, 0, 0, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, 0, 0, _P, _P, 0 }, + /* CP1256 */ + { _P, _L, _P, _L, _P, _P, _P, _P, + _P, _P, _L, _P, _U, _L, _L, _L, + _L, _P, _P, _P, _P, _P, _P, _P, + _L, _P, _L, _P, _L, _P, _P, _L, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _L, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _P, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _P, _P, _P, _P, _L, _P, _P, _P, + _P, _L, _P, _L, _L, _P, _P, _L }, + /* CP1257 */ + { _P, 0, _P, 0, _P, _P, _P, _P, + 0, _P, 0, _P, 0, _P, _P, _P, + 0, _P, _P, _P, _P, _P, _P, _P, + 0, _P, 0, _P, 0, _P, _P, 0, + _S|_B, 0, _P, _P, _P, 0, 
_P, _P, + _U, _P, _U, _P, _P, _P, _P, _U, + _P, _P, _P, _P, _P, _P, _P, _P, + _L, _P, _L, _P, _P, _P, _P, _L, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _U, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _L, _P }, + /* CP1258 */ + { _P, 0, _P, _L, _P, _P, _P, _P, + _P, _P, 0, _P, _U, 0, 0, 0, + 0, _P, _P, _P, _P, _P, _P, _P, + _P, _P, 0, _P, _L, 0, 0, _U, + _S|_B, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _P, _P, _P, _P, _P, _P, _P, _P, + _U, _U, _U, _U, _U, _U, _U, _U, + _U, _U, _U, _U, _P, _U, _U, _U, + _U, _U, _P, _U, _U, _U, _U, _P, + _U, _U, _U, _U, _U, _U, _P, _L, + _L, _L, _L, _L, _L, _L, _L, _L, + _L, _L, _L, _L, _P, _L, _L, _L, + _L, _L, _P, _L, _L, _L, _L, _P, + _L, _L, _L, _L, _L, _L, _P, _L } +}; char ctype_b[128 + 256] = { _CTYPE_DATA_128_256, @@ -70,12 +710,81 @@ makefunc(ispunct) makefunc(isspace) makefunc(isupper) makefunc(isxdigit) -makefunc(tolower) -makefunc(toupper) makefunc(isblank) makefunc(isascii) makefunc(toascii) + +static int __cdecl +c_tolower (int c) +{ + if ((unsigned char) c <= 0x7f) + return isupper (c) ? c + 0x20 : c; + + char s[8] = { c, '\0' }; + wchar_t wc; + if (mbtowc (&wc, s, 1) >= 0 + && wctomb (s, (wchar_t) towlower ((wint_t) wc)) == 1) + c = s[0]; + return c; +} +EXPORT_ALIAS(c_tolower, tolower) + +static int __cdecl +c_toupper (int c) +{ + if ((unsigned char) c <= 0x7f) + return islower (c) ? c - 0x20 : c; + + char s[8] = { c, '\0' }; + wchar_t wc; + if (mbtowc (&wc, s, 1) >= 0 + && wctomb (s, (wchar_t) towupper ((wint_t) wc)) == 1) + c = s[0]; + return c; } +EXPORT_ALIAS(c_toupper, toupper) + +/* Called from newlib's setlocale(). What we do here is to copy the + 128 bytes of charset specific ctype data into the array at _ctype_b. 
+ Given that the functionality is usually implemented locally in the + application, that's the only backward compatible way to do it. + Setlocale is usually only called once in an application, so this isn't + time-critical anyway. */ +int __iso_8859_index (const char *charset_ext); /* Newlib */ +int __cp_index (const char *charset_ext); /* Newlib */ + +void +__set_ctype (const char *charset) +{ + int idx; + + switch (*charset) + { + case 'I': + idx = __iso_8859_index (charset + 9); + /* Our ctype table has a leading ISO-8859-1 element. */ + if (idx < 0) + idx = 0; + else + ++idx; + memcpy (ctype_b, __ctype_iso[idx], 128); + memcpy (ctype_b + 256, __ctype_iso[idx], 128); + return; + case 'C': + idx = __cp_index (charset + 2); + if (idx < 0) + break; + memcpy (ctype_b, __ctype_cp[idx], 128); + memcpy (ctype_b + 256, __ctype_cp[idx], 128); + return; + default: + break; + } + memcpy (ctype_b, __ctype_default, 128); + memcpy (ctype_b + 256, __ctype_default, 128); +} + +} /* extern "C" */ /* * Copyright (c) 1989 The Regents of the University of California. @@ -89,13 +798,6 @@ makefunc(toascii) * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/winsup/cygwin/dcrt0.cc b/winsup/cygwin/dcrt0.cc index 53a3228c7..70d2650d9 100644 --- a/winsup/cygwin/dcrt0.cc +++ b/winsup/cygwin/dcrt0.cc @@ -16,6 +16,7 @@ details. */ #include #include "glob.h" #include +#include #include "environ.h" #include "sigproc.h" #include "pinfo.h" @@ -921,6 +922,8 @@ dll_crt0_1 (void *) do this for noncygwin case since the signal thread is blocked due to LoadLibrary serialization. */ ld_preload (); + /* Reset current locale to "C" per POSIX */ + _setlocale_r (_GLOBAL_REENT, LC_CTYPE, "C"); if (user_data->main) cygwin_exit (user_data->main (__argc, __argv, *user_data->envptr)); __asm__ (" \n\ diff --git a/winsup/cygwin/environ.cc b/winsup/cygwin/environ.cc index fa4441de6..41b122b53 100644 --- a/winsup/cygwin/environ.cc +++ b/winsup/cygwin/environ.cc @@ -13,6 +13,7 @@ details. */ #include #include #include +#include #include #include #include @@ -552,48 +553,6 @@ glob_init (const char *buf) } } -void -set_file_api_mode (codepage_type cp) -{ - if (cp == oem_cp) - { - SetFileApisToOEM (); - debug_printf ("File APIs set to OEM"); - } - else - { - SetFileApisToANSI (); - debug_printf ("File APIs set to ANSI"); - } -} - -void -codepage_init (const char *buf) -{ - if (!buf) - buf = "ansi"; - - if (ascii_strcasematch (buf, "oem")) - { - current_codepage = oem_cp; - active_codepage = GetOEMCP (); - } - else if (ascii_strcasematch (buf, "utf8")) - { - current_codepage = utf8_cp; - active_codepage = CP_UTF8; - } - else - { - if (!ascii_strcasematch (buf, "ansi")) - debug_printf ("Wrong codepage name: %s", buf); - /* Fallback to ANSI */ - current_codepage = ansi_cp; - active_codepage = GetACP (); - } - set_file_api_mode (current_codepage); -} - static void set_chunksize (const char *buf) { @@ -629,7 +588,6 @@ static struct parse_thing } values[2]; } known[] NO_COPY = { - {"codepage", {func: 
&codepage_init}, isfunc, NULL, {{0}, {0}}}, {"dosfilewarning", {&dos_file_warning}, justset, NULL, {{false}, {true}}}, {"envcache", {&envcache}, justset, NULL, {{true}, {false}}}, {"error_start", {func: &error_start_init}, isfunc, NULL, {{0}, {0}}}, @@ -774,6 +732,8 @@ environ_init (char **envp, int envc) static char NO_COPY cygterm[] = "TERM=cygwin"; myfault efault; tmp_pathbuf tp; + bool got_lc = false; + static const char *lc_arr[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL }; if (efault.faulted ()) api_fatal ("internal error reading the windows environment - too many environment variables?"); @@ -818,10 +778,27 @@ environ_init (char **envp, int envc) /* Allocate space for environment + trailing NULL + CYGWIN env. */ lastenviron = envp = (char **) malloc ((4 + (envc = 100)) * sizeof (char *)); - /* We need the CYGWIN variable content before we can loop through + /* We need the locale variables' content before we can loop through the whole environment, so that the wide-char to multibyte conversion - can be done according to the "codepage" setting, as well as the - uppercasing according to the "upcaseenv" setting. */ + can be done according to the $LC_ALL/$LC_CTYPE/$LANG/current_codepage + setting, as well as the uppercasing according to the "upcaseenv" + setting. Note that we have to reset the LC_CTYPE setting to "C" + before calling main() for POSIX compatibility. */ + for (int lc = 0; lc_arr[lc]; ++lc) + { + if ((i = GetEnvironmentVariableA (lc_arr[lc], NULL, 0))) + { + char *buf = (char *) alloca (i); + GetEnvironmentVariableA (lc_arr[lc], buf, i); + if (_setlocale_r (_GLOBAL_REENT, LC_CTYPE, buf)) + got_lc = true; + } + } + /* No matching POSIX environment variable, use current codepage. */ + if (!got_lc) + _setlocale_r (_GLOBAL_REENT, LC_CTYPE, "en_US"); + /* We also need the CYGWIN variable early to know the value of the + CYGWIN=upcaseenv setting for the below loop. 
*/ if ((i = GetEnvironmentVariableA ("CYGWIN", NULL, 0))) { char *buf = (char *) alloca (i); diff --git a/winsup/cygwin/fhandler.h b/winsup/cygwin/fhandler.h index 0fc075d83..d526f529d 100644 --- a/winsup/cygwin/fhandler.h +++ b/winsup/cygwin/fhandler.h @@ -927,7 +927,7 @@ class fhandler_console: public fhandler_termios void cursor_set (bool, int, int); void cursor_get (int *, int *); void cursor_rel (int, int); - void write_replacement_char (const unsigned char *); + void write_replacement_char (); const unsigned char *write_normal (unsigned const char*, unsigned const char *); void char_command (char); bool set_raw_win32_keyboard_mode (bool); diff --git a/winsup/cygwin/fhandler_console.cc b/winsup/cygwin/fhandler_console.cc index 11cf70639..68fb71ea9 100644 --- a/winsup/cygwin/fhandler_console.cc +++ b/winsup/cygwin/fhandler_console.cc @@ -13,6 +13,7 @@ details. */ #include "miscfuncs.h" #include #include +#include #include #include #include @@ -133,13 +134,13 @@ dev_console::con_to_str (char *d, int dlen, WCHAR w) inline UINT dev_console::get_console_cp () { - return alternate_charset_active ? GetConsoleOutputCP () : get_cp (); + return alternate_charset_active ? GetConsoleOutputCP () : 0; } inline DWORD dev_console::str_to_con (PWCHAR d, const char *s, DWORD sz) { - return MultiByteToWideChar (get_console_cp (), 0, s, sz, d, CONVERT_LIMIT); + return sys_cp_mbstowcs (get_console_cp (), d, CONVERT_LIMIT, s, sz); } bool @@ -1400,22 +1401,15 @@ beep () MessageBeep (MB_OK); } -/* This gets called when we found an invalid UTF-8 character. We try with - the default ANSI codepage. If that fails we just print a question mark. - Looks ugly but is a neat and alomst sane fallback for many languages. */ +/* This gets called when we found an invalid input character. We just + print a half filled square (UTF 0x2592). We have no chance to figure + out the "meaning" of the input char anyway. 
*/ void -fhandler_console::write_replacement_char (const unsigned char *char_p) +fhandler_console::write_replacement_char () { - int n; - WCHAR def_cp_chars[2]; + static const wchar_t replacement_char = 0x2592; /* Half filled square */ DWORD done; - - n = MultiByteToWideChar (GetACP (), 0, (const CHAR *) char_p, 1, - def_cp_chars, 2); - if (n) - WriteConsoleW (get_output_handle (), def_cp_chars, n, &done, 0); - else - WriteConsoleW (get_output_handle (), L"?", 1, &done, 0); + WriteConsoleW (get_output_handle (), &replacement_char, 1, &done, 0); } const unsigned char * @@ -1426,22 +1420,46 @@ fhandler_console::write_normal (const unsigned char *src, DWORD done; DWORD buf_len; const unsigned char *found = src; - const unsigned char *nfound; + size_t ret; + mbstate_t ps; UINT cp = dev_state->get_console_cp (); + char charsetbuf[32]; + char *charset = __locale_charset (); + mbtowc_p f_mbtowc = __mbtowc; + + if (cp) + f_mbtowc = __set_charset_from_codepage (cp, charset = charsetbuf); /* First check if we have cached lead bytes of a former try to write a truncated multibyte sequence. If so, process it. */ if (trunc_buf.len) { + const unsigned char *nfound; int cp_len = min (end - src, 4 - trunc_buf.len); memcpy (trunc_buf.buf + trunc_buf.len, src, cp_len); - nfound = next_char (cp, trunc_buf.buf, - trunc_buf.buf + trunc_buf.len + cp_len); - /* Still truncated multibyte sequence? Keep in trunc_buf. */ - if (nfound == trunc_buf.buf) + memset (&ps, 0, sizeof ps); + switch (ret = f_mbtowc (_REENT, NULL, (const char *) trunc_buf.buf, + trunc_buf.len + cp_len, charset, &ps)) { + case -2: + /* Still truncated multibyte sequence? Keep in trunc_buf. */ trunc_buf.len += cp_len; return end; + case -1: + /* Give up, print replacement chars for trunc_buf... */ + for (int i = 0; i < trunc_buf.len; ++i) + write_replacement_char (); + /* ... mark trunc_buf as unused... */ + trunc_buf.len = 0; + /* ... and proceed. 
*/ + nfound = NULL; + break; + case 0: + nfound = trunc_buf.buf + 1; + break; + default: + nfound = trunc_buf.buf + ret; + break; } /* Valid multibyte sequence? Process. */ if (nfound) @@ -1454,28 +1472,32 @@ fhandler_console::write_normal (const unsigned char *src, trunc_buf.len = 0; return found; } - /* Give up, print replacement chars for trunc_buf... */ - for (int i = 0; i < trunc_buf.len; ++i) - write_replacement_char (trunc_buf.buf + i); - /* ... mark trunc_buf as unused... */ - trunc_buf.len = 0; - /* ... and proceed. */ } + memset (&ps, 0, sizeof ps); while (found < end && found - src < CONVERT_LIMIT && base_chars[*found] == NOR) { - nfound = next_char (cp, found, end); - if (!nfound) /* Invalid multibyte sequence. */ - break; - if (nfound == found) /* Truncated multibyte sequence. */ - { /* Stick to it until the next write. */ + switch (ret = f_mbtowc (_REENT, NULL, (const char *) found, + end - found, charset, &ps)) + { + case -2: + /* Truncated multibyte sequence. Stick to it until the next write. */ trunc_buf.len = end - found; memcpy (trunc_buf.buf, found, trunc_buf.len); return end; + case -1: + break; + case 0: + found++; + break; + default: + found += ret; + break; } - found = nfound; + if (ret == (size_t) -1) /* Invalid multibyte sequence. 
*/ + break; } /* Print all the base ones out */ @@ -1558,7 +1580,7 @@ fhandler_console::write_normal (const unsigned char *src, cursor_set (false, 8 * (x / 8 + 1), y); break; case NOR: - write_replacement_char (found); + write_replacement_char (); break; } found++; diff --git a/winsup/cygwin/fork.cc b/winsup/cygwin/fork.cc index 19f8950be..6887448df 100644 --- a/winsup/cygwin/fork.cc +++ b/winsup/cygwin/fork.cc @@ -205,8 +205,6 @@ frok::child (volatile char * volatile here) } #endif - set_file_api_mode (current_codepage); - MALLOC_CHECK; /* Incredible but true: If we use sockets and SYSV IPC shared memory, diff --git a/winsup/cygwin/globals.cc b/winsup/cygwin/globals.cc index b87b070ef..38f6460c3 100644 --- a/winsup/cygwin/globals.cc +++ b/winsup/cygwin/globals.cc @@ -25,8 +25,6 @@ HMODULE NO_COPY cygwin_hmodule; HANDLE hExeced; /* Codepage and multibyte string specific stuff. */ -enum codepage_type {ansi_cp, oem_cp, utf8_cp}; -codepage_type current_codepage = ansi_cp; UINT active_codepage; /* program exit the program */ diff --git a/winsup/cygwin/include/ctype.h b/winsup/cygwin/include/ctype.h index 343bafbd5..f3341822b 100644 --- a/winsup/cygwin/include/ctype.h +++ b/winsup/cygwin/include/ctype.h @@ -49,7 +49,7 @@ extern const __declspec(dllimport) char _ctype_[]; #if !defined(__cplusplus) || defined(__INSIDE_CYGWIN__) #define isalpha(c) ((_ctype_+1)[(unsigned)(c)]&(_U|_L)) -#define isblank(c) ((c) == ' ' || (c) == '\t') +#define isblank(c) ((_ctype_+1)[(unsigned)(c)]&_B) #define isupper(c) ((_ctype_+1)[(unsigned)(c)]&_U) #define islower(c) ((_ctype_+1)[(unsigned)(c)]&_L) #define isdigit(c) ((_ctype_+1)[(unsigned)(c)]&_N) @@ -60,12 +60,6 @@ extern const __declspec(dllimport) char _ctype_[]; #define isprint(c) ((_ctype_+1)[(unsigned)(c)]&(_P|_U|_L|_N|_B)) #define isgraph(c) ((_ctype_+1)[(unsigned)(c)]&(_P|_U|_L|_N)) #define iscntrl(c) ((_ctype_+1)[(unsigned)(c)]&_C) -/* Non-gcc versions will get the library versions, and will be - slightly slower */ -# define 
toupper(c) \ - __extension__ ({ int __x = (c); islower(__x) ? (__x - 'a' + 'A') : __x;}) -# define tolower(c) \ - __extension__ ({ int __x = (c); isupper(__x) ? (__x - 'A' + 'a') : __x;}) #endif /* !__cplusplus */ #if !defined(__STRICT_ANSI__) || defined(__INSIDE_CYGWIN__) diff --git a/winsup/cygwin/miscfuncs.cc b/winsup/cygwin/miscfuncs.cc index 845575edf..29b8159ad 100644 --- a/winsup/cygwin/miscfuncs.cc +++ b/winsup/cygwin/miscfuncs.cc @@ -141,26 +141,6 @@ cygwin_strncasecmp (const char *cs, const char *ct, size_t n) return RtlCompareUnicodeString (&us, &ut, TRUE); } -extern "C" wchar_t * __stdcall -cygwin_wcslwr (wchar_t *string) -{ - UNICODE_STRING us; - - RtlInitUnicodeString (&us, string); - RtlDowncaseUnicodeString (&us, &us, FALSE); - return string; -} - -extern "C" wchar_t * __stdcall -cygwin_wcsupr (wchar_t *string) -{ - UNICODE_STRING us; - - RtlInitUnicodeString (&us, string); - RtlUpcaseUnicodeString (&us, &us, FALSE); - return string; -} - extern "C" char * __stdcall cygwin_strlwr (char *string) { @@ -189,118 +169,6 @@ cygwin_strupr (char *string) return string; } -/* FIXME? We only support standard ANSI/OEM codepages according to - http://www.microsoft.com/globaldev/reference/cphome.mspx as well - as UTF-8 and codepage 1361, which is also mentioned as valid - doublebyte codepage in MSDN man pages (e.g. IsDBCSLeadByteEx). - Everything else will be hosed. */ - -bool -is_cp_multibyte (UINT cp) -{ - switch (cp) - { - case 932: - case 936: - case 949: - case 950: - case 1361: - case 65001: - return true; - } - return false; -} - -/* OMYGOD! CharNextExA is not UTF-8 aware! It only works fine with - double byte charsets. So we have to do it ourselves for UTF-8. - - While being at it, we do more. If a double-byte or multibyte - sequence is truncated due to an early end, we need a way to recognize - it. The reason is that multiple buffered write statements might - accidentally stop and start in the middle of a single character byte - sequence. 
If we have to interpret the byte sequences (as in - fhandler_console), we would print wrong output in these cases. - - So we have four possible return values here: - - ret = end if str >= end - ret = NULL if we encounter an invalid byte sequence - ret = str if we encounter the start byte of a truncated byte sequence - ret = str + n if we encounter a vaild byte sequence -*/ - -const unsigned char * -next_char (UINT cp, const unsigned char *str, const unsigned char *end) -{ - const unsigned char *ret = NULL; - - if (str >= end) - return end; - - switch (cp) - { - case 932: - case 936: - case 949: - case 950: - case 1361: - if (*str <= 0x7f) - ret = str + 1; - else if (str == end - 1 && IsDBCSLeadByteEx (cp, *str)) - ret = str; - else - ret = (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0); - break; - case CP_UTF8: - switch (str[0] >> 4) - { - case 0x0 ... 0x7: /* One byte character. */ - ret = str + 1; - break; - case 0x8 ... 0xb: /* Followup byte. Invalid as first byte. */ - ret = NULL; - break; - case 0xc ... 0xd: /* Two byte character. */ - /* Check followup bytes for validity. */ - if (str >= end - 1) - ret = str; - else if (str[1] <= 0xbf) - ret = str + 2; - else - ret = NULL; - break; - case 0xe: /* Three byte character. */ - if (str >= end - 2) - ret = str; - else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80 - && (str[0] != 0xe0 || str[1] >= 0xa0) - && (str[0] != 0xed || str[1] <= 0x9f)) - ret = str + 3; - else - ret = NULL; - break; - case 0xf: /* Four byte character. 
*/ - if (str[0] >= 0xf8) - ret = NULL; - else if (str >= end - 3) - ret = str; - else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80 - && (str[3] & 0xc0) == 0x80 - && (str[0] == 0xf0 || str[1] >= 0x90) - && (str[0] == 0xf4 || str[1] <= 0x8f)) - ret = str + 4; - else - ret = NULL; - break; - } - break; - default: - ret = str + 1; - break; - } - return ret; -} - int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) { diff --git a/winsup/cygwin/miscfuncs.h b/winsup/cygwin/miscfuncs.h index d75c9afcc..4755d724a 100644 --- a/winsup/cygwin/miscfuncs.h +++ b/winsup/cygwin/miscfuncs.h @@ -25,11 +25,6 @@ void backslashify (const char *, char *, bool); void slashify (const char *, char *, bool); #define isslash(c) ((c) == '/') -/* multibyte stuff */ -bool is_cp_multibyte (UINT cp); -const unsigned char *next_char (UINT cp, const unsigned char *str, - const unsigned char *end); - /* Memory checking */ int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) __attribute__ ((regparm(2))); diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc index e051cd7b4..5a9dd7d14 100644 --- a/winsup/cygwin/strfuncs.cc +++ b/winsup/cygwin/strfuncs.cc @@ -20,45 +20,356 @@ details. */ #include "fhandler.h" #include "dtable.h" #include "cygheap.h" +#include "tls_pbuf.h" -UINT -get_cp () +/* The SJIS, JIS and EUCJP conversion in newlib does not use UTF as + wchar_t character representation. That's unfortunate for us since + we require UTF for the OS. What we do here is to have our own + implementation of the base functions for the conversion using + the MulitByteToWideChar/WideCharToMultiByte functions. */ + +/* GBK, CP949, and Big5 conversions are not available so far in newlib. */ + +static int +__db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp) +{ + if (s == NULL) + return 0; + + if (wchar < 0x80) + { + *s = (char) wchar; + return 1; + } + + BOOL def_used = false; + int ret = WideCharToMultiByte (cp, cp > 50000 ? 
0 : WC_NO_BEST_FIT_CHARS, + &wchar, 1, s, MB_CUR_MAX, NULL, &def_used); + if (ret > 0 && !def_used) + return ret; + + r->_errno = EILSEQ; + return -1; +} + +extern "C" int +__sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) { - if (!active_codepage) - codepage_init ("ansi"); - return active_codepage; + return __db_wctomb (r,s, wchar, 932); } -/* tlen is always treated as the maximum buffer size, including the '\0' - character. sys_wcstombs will always return a 0-terminated result, no - matter what. */ -int __stdcall -sys_wcstombs (char *tgt, int tlen, const PWCHAR src, int slen) +extern "C" int +__jis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) { + return __db_wctomb (r,s, wchar, 50220); +} + +extern "C" int +__eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 51932); +} + +extern "C" int +__gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 936); +} + +extern "C" int +__kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 949); +} + +extern "C" int +__big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 950); +} + +static int +__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + UINT cp, mbstate_t *state) +{ + wchar_t dummy; + char buf[2]; int ret; + + if (pwc == NULL) + pwc = &dummy; - /* Convert UNICODE private use area. Reverse functionality (only for - path names) is transform_chars in path.cc. 
*/ - if (slen < 0) - slen = wcslen (src) + 1; - WCHAR sbuf[slen]; - memcpy (sbuf, src, slen * sizeof (WCHAR)); - const unsigned char *end = (unsigned char *) (sbuf + slen); - for (unsigned char *s = ((unsigned char *) sbuf) + 1; s < end; - s += sizeof (WCHAR)) - if (*s == 0xf0) - *s = 0; - ret = WideCharToMultiByte (get_cp (), 0, sbuf, slen, tgt, tlen, NULL, NULL); - if (ret && tgt) + if (s == NULL) + return 0; /* not state-dependent */ + + if (n == 0) + return -2; + + if (state->__count == 0) + { + if (*(unsigned char *) s < 0x80) + { + *pwc = *(unsigned char *) s; + return *s ? 1 : 0; + } + ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS, + s, 2, pwc, 1); + if (ret) + return *s ? 2 : 0; + if (n == 1) + { + state->__count = 1; + state->__value.__wchb[0] = *s; + return -2; + } + else + { + /* These Win32 functions are really crappy. Assuming n is 2 + but the first byte is a singlebyte charcode, the function + does not convert that byte and return 1, rather it just + returns 0. So, what we do here is to check if the first + byte returns a valid value... */ + ret = MultiByteToWideChar (cp, + cp > 50000 ? 0 : MB_ERR_INVALID_CHARS, + s, 1, pwc, 1); + if (ret) + return *s ? 1 : 0; + } + r->_errno = EILSEQ; + return -1; + } + if (!*s) + return -2; + buf[0] = state->__value.__wchb[0]; + buf[1] = *s; + ret = MultiByteToWideChar (cp, cp > 50000 ? 0 : MB_ERR_INVALID_CHARS, + buf, 2, pwc, 1); + if (!ret) { - ret = (ret < tlen) ? 
ret : tlen - 1; - tgt[ret] = '\0'; + r->_errno = EILSEQ; + return -1; } return ret; } +extern "C" int +__sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 932, state); +} + +extern "C" int +__jis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 50220, state); +} + +extern "C" int +__eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 51932, state); +} + +extern "C" int +__gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 936, state); +} + +extern "C" int +__kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 949, state); +} + +extern "C" int +__big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 950, state); +} + +/* Convert Windows codepage to a setlocale compatible character set code. + Called from newlib's setlocale() with the current ANSI codepage, if the + charset isn't given explicitely in the POSIX compatible locale specifier. + The function also returns a pointer to the corresponding _mbtowc_r + function. This is used below in the sys_cp_mbstowcs function which + is called directly from fhandler_console if the "Alternate Charset" has + been switched on by an escape sequence. 
*/ +extern "C" mbtowc_p +__set_charset_from_codepage (UINT cp, char *charset) +{ + switch (cp) + { + case 437: + case 720: + case 737: + case 775: + case 850: + case 852: + case 855: + case 857: + case 858: + case 862: + case 866: + case 874: + case 1125: + case 1250: + case 1251: + case 1252: + case 1253: + case 1254: + case 1255: + case 1256: + case 1257: + case 1258: + __small_sprintf (charset, "CP%u", cp); + return __cp_mbtowc; + case 28591: + case 28592: + case 28593: + case 28594: + case 28595: + case 28596: + case 28597: + case 28598: + case 28599: + case 28603: + case 28605: + __small_sprintf (charset, "ISO-8859-%u", cp - 28590); + return __iso_mbtowc; + case 932: + strcpy (charset, "SJIS"); + return __sjis_mbtowc; + case 936: + strcpy (charset, "GBK"); + return __gbk_mbtowc; + case 949: + strcpy (charset, "CP949"); + return __kr_mbtowc; + case 950: + strcpy (charset, "BIG5"); + return __big5_mbtowc; + case 50220: + strcpy (charset, "JIS"); + return __jis_mbtowc; + case 51932: + strcpy (charset, "EUCJP"); + return __eucjp_mbtowc; + case 65001: + strcpy (charset, "UTF-8"); + return __utf8_mbtowc; + default: + break; + } + strcpy (charset, "ASCII"); + return __ascii_mbtowc; +} + +/* Our own sys_wcstombs/sys_mbstowcs functions differ from the + wcstombs/mbstowcs API in three ways: + + - The UNICODE private use area is used in filenames to specify + characters not allowed in Windows filenames ('*', '?', etc). + The sys_wcstombs converts characters in the private use area + back to the corresponding ASCII chars. + + - If a wide character in a filename has no representation in the current + multibyte charset, then usually you wouldn't be able to access the + file. To fix this problem, sys_wcstombs creates a replacement multibyte + sequences for the non-representable wide-char. The sequence starts with + an ASCII SO (0x0e, Ctrl-N), followed by the UTF-8 representation of the + character. 
The sys_(cp_)mbstowcs function detects ASCII SO characters + in the input multibyte string and converts the following multibyte + sequence in by treating it as an UTF-8 char. If that fails, the ASCII + SO was probably standalone and it gets just copied over as ASCII SO. + + - The functions always create 0-terminated results, no matter what. + If the result is truncated due to buffer size, it's a bug in Cygwin + and the buffer in the calling function should be raised. */ +size_t __stdcall +sys_wcstombs (char *dst, size_t len, const PWCHAR src, size_t nwc) +{ + char buf[10]; + char *ptr = dst; + wchar_t *pwcs = (wchar_t *) src; + size_t n = 0; + mbstate_t ps; + + memset (&ps, 0, sizeof ps); + if (dst == NULL) + len = (size_t) -1; + while (n < len && nwc-- > 0) + { + wchar_t pw = *pwcs; + /* Convert UNICODE private use area. Reverse functionality (only for + path names) is transform_chars in path.cc. */ + if ((pw & 0xff00) == 0xf000) + pw &= 0xff; + int bytes = _wctomb_r (_REENT, buf, pw, &ps); + /* Convert chars invalid in the current codepage to a sequence + ASCII SO; UTF-8 representation of invalid char. + Do the same for ASCII SO itself. */ + if ((bytes == -1 || pw == 0x0e) && *__locale_charset () != 'U'/*TF-8*/) + { + buf[0] = 0x0e; /* ASCII SO */ + bytes = __utf8_wctomb (_REENT, buf + 1, pw, __locale_charset (), &ps); + if (bytes == -1) + { + ++pwcs; + ps.__count = 0; + continue; + } + ++bytes; /* Add the ASCII SO to the byte count. */ + if (ps.__count == -4) /* First half of a surrogate pair. */ + { + ++pwcs; + if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */ + { + ++pwcs; + ps.__count = 0; + continue; + } + bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, + __locale_charset (), &ps); + } + } + if (n + bytes <= len) + { + n += bytes; + if (dst) + { + for (int i = 0; i < bytes; ++i) + *ptr++ = buf[i]; + } + if (*pwcs++ == 0x00) + break; + } + else + break; + } + if (n && dst) + { + n = (n < len) ? 
n : len - 1; + dst[n] = '\0'; + } + + return n; +} + /* Allocate a buffer big enough for the string, always including the - terminating '\0'. The buffer pointer is returned in *tgt_p, the return + terminating '\0'. The buffer pointer is returned in *dst_p, the return value is the number of bytes written to the buffer, as usual. The "type" argument determines where the resulting buffer is stored. It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP". @@ -67,57 +378,129 @@ sys_wcstombs (char *tgt, int tlen, const PWCHAR src, int slen) Note that this code is shared by cygserver (which requires it via __small_vsprintf) and so when built there plain calloc is the only choice. */ -int __stdcall -sys_wcstombs_alloc (char **tgt_p, int type, const PWCHAR src, int slen) +size_t __stdcall +sys_wcstombs_alloc (char **dst_p, int type, const PWCHAR src, size_t nwc) { - int ret; + size_t ret; - ret = WideCharToMultiByte (get_cp (), 0, src, slen, NULL, 0 ,NULL, NULL); - if (ret) + ret = sys_wcstombs (NULL, (size_t) -1, src, nwc); + if (ret > 0) { - size_t tlen = (slen == -1) ? ret : ret + 1; + size_t dlen = ret + 1; if (type == HEAP_NOTHEAP) - *tgt_p = (char *) calloc (tlen, sizeof (char)); + *dst_p = (char *) calloc (dlen, sizeof (char)); else - *tgt_p = (char *) ccalloc ((cygheap_types) type, tlen, sizeof (char)); - if (!*tgt_p) + *dst_p = (char *) ccalloc ((cygheap_types) type, dlen, sizeof (char)); + if (!*dst_p) return 0; - ret = sys_wcstombs (*tgt_p, tlen, src, slen); + ret = sys_wcstombs (*dst_p, dlen, src, nwc); } return ret; } -int __stdcall -sys_mbstowcs (PWCHAR tgt, int tlen, const char *src, int slen) +/* sys_cp_mbstowcs is actually most of the time called as sys_mbstowcs with + a 0 codepage. If cp is not 0, the codepage is evaluated and used for the + conversion. This is so that fhandler_console can switch to an alternate + charset, which is the charset returned by GetConsoleCP (). Most of the + time this is used for box and line drawing characters. 
*/ +size_t __stdcall +sys_cp_mbstowcs (UINT cp, PWCHAR dst, size_t dlen, const char *src, size_t nms) { - int ret = MultiByteToWideChar (get_cp (), 0, src, slen, tgt, tlen); - if (ret && tgt) + wchar_t *ptr = dst; + char *pmbs = (char *) src; + size_t count = 0; + size_t len = dlen; + int bytes; + mbstate_t ps; + char charsetbuf[32]; + char *charset = __locale_charset (); + mbtowc_p f_mbtowc = __mbtowc; + + if (cp) + f_mbtowc = __set_charset_from_codepage (cp, charset = charsetbuf); + + memset (&ps, 0, sizeof ps); + if (dst == NULL) + len = (size_t)-1; + while (len > 0) { - ret = (ret < tlen) ? ret : tlen - 1; - tgt[ret] = L'\0'; + /* ASCII SO. Convert following UTF-8 sequence (if not UTF-8 anyway). */ + if (*pmbs == 0x0e && *charset != 'U'/*TF-8*/) + { + pmbs++; + bytes = __utf8_mbtowc (_REENT, ptr, pmbs, nms, charset, &ps); + if (bytes < 0) + { + /* Invalid UTF-8 sequence? Treat the ASCII SO character as + stand-alone ASCII SO char. */ + bytes = 1; + if (dst) + *ptr = 0x0e; + memset (&ps, 0, sizeof ps); + break; + } + if (bytes == 0) + break; + if (ps.__count == 4) /* First half of a surrogate. */ + { + wchar_t *ptr2 = dst ? ptr + 1 : NULL; + int bytes2 = __utf8_mbtowc (_REENT, ptr2, pmbs + bytes, + nms - bytes, charset, &ps); + if (bytes2 < 0) + break; + pmbs += bytes2; + nms -= bytes2; + ++count; + ptr = dst ? ptr + 1 : NULL; + --len; + } + } + else + bytes = f_mbtowc (_REENT, ptr, pmbs, nms, charset, &ps); + if (bytes > 0) + { + pmbs += bytes; + nms -= bytes; + ++count; + ptr = dst ? ptr + 1 : NULL; + --len; + } + else + { + if (bytes == 0) + ++count; + break; + } } - return ret; + + if (count && dst) + { + count = (count < dlen) ? count : dlen - 1; + dst[count] = L'\0'; + } + + return count; } /* Same as sys_wcstombs_alloc, just backwards. 
*/ -int __stdcall -sys_mbstowcs_alloc (PWCHAR *tgt_p, int type, const char *src, int slen) +size_t __stdcall +sys_mbstowcs_alloc (PWCHAR *dst_p, int type, const char *src, size_t nms) { - int ret; + size_t ret; - ret = MultiByteToWideChar (get_cp (), 0, src, slen, NULL, 0); - if (ret) + ret = sys_mbstowcs (NULL, (size_t) -1, src, nms); + if (ret > 0) { - size_t tlen = (slen == -1 ? ret : ret + 1); + size_t dlen = ret + 1; if (type == HEAP_NOTHEAP) - *tgt_p = (PWCHAR) calloc (tlen, sizeof (WCHAR)); + *dst_p = (PWCHAR) calloc (dlen, sizeof (WCHAR)); else - *tgt_p = (PWCHAR) ccalloc ((cygheap_types) type, tlen, sizeof (WCHAR)); - if (!*tgt_p) + *dst_p = (PWCHAR) ccalloc ((cygheap_types) type, dlen, sizeof (WCHAR)); + if (!*dst_p) return 0; - ret = sys_mbstowcs (*tgt_p, tlen, src, slen); + ret = sys_mbstowcs (*dst_p, dlen, src, nms); } return ret; } diff --git a/winsup/cygwin/wchar.h b/winsup/cygwin/wchar.h index b03d5110f..2ad83e126 100644 --- a/winsup/cygwin/wchar.h +++ b/winsup/cygwin/wchar.h @@ -1,6 +1,6 @@ /* wchar.h: Extra wchar defs - Copyright 2007 Red Hat, Inc. + Copyright 2007, 2009 Red Hat, Inc. This file is part of Cygwin. @@ -17,21 +17,22 @@ details. 
*/ extern "C" { #endif -#undef wcscasecmp -#define wcscasecmp cygwin_wcscasecmp -int __stdcall cygwin_wcscasecmp (const wchar_t *, const wchar_t *); +extern "C" int __utf8_wctomb (struct _reent *, char *, wchar_t, + const char *, mbstate_t *); -#undef wcsncasecmp -#define wcsncasecmp cygwin_wcsncasecmp -int __stdcall cygwin_wcsncasecmp (const wchar_t *, const wchar_t *, size_t); +typedef int mbtowc_f (struct _reent *, wchar_t *, const char *, size_t, + const char *, mbstate_t *); +typedef mbtowc_f *mbtowc_p; -#undef wcslwr -#define wcslwr cygwin_wcslwr -wchar_t * __stdcall cygwin_wcslwr (wchar_t *); +extern "C" mbtowc_p __mbtowc; +extern "C" mbtowc_f __ascii_mbtowc; +extern "C" mbtowc_f __utf8_mbtowc; +extern "C" mbtowc_f __iso_mbtowc; +extern "C" mbtowc_f __cp_mbtowc; -#undef wcsupr -#define wcsupr cygwin_wcsupr -wchar_t * __stdcall cygwin_wcsupr (wchar_t *); +extern "C" char *__locale_charset (); + +extern "C" mbtowc_p __set_charset_from_codepage (UINT cp, char *charset); #ifdef __cplusplus } diff --git a/winsup/cygwin/winsup.h b/winsup/cygwin/winsup.h index 97980eafd..44f7caa37 100644 --- a/winsup/cygwin/winsup.h +++ b/winsup/cygwin/winsup.h @@ -104,21 +104,24 @@ extern const char case_folded_upper[]; /* The one function we use from winuser.h most of the time */ extern "C" DWORD WINAPI GetLastError (void); -void codepage_init (const char *buf); -UINT get_cp (); - /* Used as type by sys_wcstombs_alloc and sys_mbstowcs_alloc. For a description see there. 
*/ #define HEAP_NOTHEAP -1 -int __stdcall sys_wcstombs (char *, int, const PWCHAR, int = -1) +size_t __stdcall sys_wcstombs (char *, size_t, const PWCHAR, size_t = (size_t) -1) __attribute__ ((regparm(3))); -int __stdcall sys_wcstombs_alloc (char **, int, const PWCHAR, int = -1) +size_t __stdcall sys_wcstombs_alloc (char **, int, const PWCHAR, size_t = (size_t) -1) __attribute__ ((regparm(3))); -int __stdcall sys_mbstowcs (PWCHAR, int, const char *, int = -1) +size_t __stdcall sys_cp_mbstowcs (UINT, PWCHAR, size_t, const char *, size_t = (size_t) -1) __attribute__ ((regparm(3))); -int __stdcall sys_mbstowcs_alloc (PWCHAR *, int, const char *, int = -1) +inline size_t +sys_mbstowcs (PWCHAR dst, size_t dlen, const char *src, + size_t nms = (size_t) -1) +{ + return sys_cp_mbstowcs (0, dst, dlen, src, nms); +} +size_t __stdcall sys_mbstowcs_alloc (PWCHAR *, int, const char *, size_t = (size_t) -1) __attribute__ ((regparm(3))); /* Used to check if Cygwin DLL is dynamically loaded. */ @@ -333,8 +336,6 @@ inline void clear_procimptoken () CloseHandle (old_procimp); } } - -void set_file_api_mode (codepage_type); #endif #endif /* defined __cplusplus */ -- 2.43.5