[newlib-cygwin] Cygwin: pty: move codepage evaluation to nlsfuncs.cc
Corinna Vinschen
corinna@sourceware.org
Tue Sep 8 08:36:22 GMT 2020
https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=eaed594d736259c42affa7cf58f12492cbc5fb67
commit eaed594d736259c42affa7cf58f12492cbc5fb67
Author: Corinna Vinschen <corinna@vinschen.de>
Date: Tue Sep 8 10:36:04 2020 +0200
Cygwin: pty: move codepage evaluation to nlsfuncs.cc
The new function __eval_codepage_from_internal_charset
is a simplified version of the former code in
fhandler_tty.cc. It probably needs some extension,
but the gist is to use knowledge of internals to
be as quick as possible.
Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
Diff:
---
winsup/cygwin/fhandler_tty.cc | 191 ++----------------------------------------
winsup/cygwin/nlsfuncs.cc | 48 +++++++++++
2 files changed, 57 insertions(+), 182 deletions(-)
diff --git a/winsup/cygwin/fhandler_tty.cc b/winsup/cygwin/fhandler_tty.cc
index 8bf39c3e6..6de591d9b 100644
--- a/winsup/cygwin/fhandler_tty.cc
+++ b/winsup/cygwin/fhandler_tty.cc
@@ -1614,8 +1614,8 @@ fhandler_pty_master::write (const void *ptr, size_t len)
if (to_be_read_from_pcon () && get_ttyp ()->h_pseudo_console)
{
size_t nlen;
- char *buf = convert_mb_str
- (CP_UTF8, &nlen, get_ttyp ()->term_code_page, (const char *) ptr, len);
+ char *buf = convert_mb_str (CP_UTF8, &nlen, get_ttyp ()->term_code_page,
+ (const char *) ptr, len);
WaitForSingleObject (input_mutex, INFINITE);
@@ -1782,183 +1782,13 @@ fhandler_pty_common::set_close_on_exec (bool val)
close_on_exec (val);
}
-/* This table is borrowed from mintty: charset.c */
-static const struct {
- UINT cp;
- const char *name;
-}
-cs_names[] = {
- { CP_UTF8, "UTF-8"},
- { CP_UTF8, "UTF8"},
- { 20127, "ASCII"},
- { 20127, "US-ASCII"},
- { 20127, "ANSI_X3.4-1968"},
- { 20866, "KOI8-R"},
- { 20866, "KOI8R"},
- { 20866, "KOI8"},
- { 21866, "KOI8-U"},
- { 21866, "KOI8U"},
- { 20932, "EUCJP"},
- { 20932, "EUC-JP"},
- { 874, "TIS620"},
- { 874, "TIS-620"},
- { 932, "SJIS"},
- { 936, "GBK"},
- { 936, "GB2312"},
- { 936, "EUCCN"},
- { 936, "EUC-CN"},
- { 949, "EUCKR"},
- { 949, "EUC-KR"},
- { 950, "BIG5"},
- { 0, "NULL"}
-};
-
-static void
-get_locale_from_env (char *locale)
-{
- const char *env = NULL;
- char lang[ENCODING_LEN + 1] = {0, }, country[ENCODING_LEN + 1] = {0, };
- env = getenv ("LC_ALL");
- if (env == NULL || !*env)
- env = getenv ("LC_CTYPE");
- if (env == NULL || !*env)
- env = getenv ("LANG");
- if (env == NULL || !*env)
- {
- if (GetLocaleInfo (LOCALE_CUSTOM_UI_DEFAULT,
- LOCALE_SISO639LANGNAME,
- lang, sizeof (lang)))
- GetLocaleInfo (LOCALE_CUSTOM_UI_DEFAULT,
- LOCALE_SISO3166CTRYNAME,
- country, sizeof (country));
- else if (GetLocaleInfo (LOCALE_CUSTOM_DEFAULT,
- LOCALE_SISO639LANGNAME,
- lang, sizeof (lang)))
- GetLocaleInfo (LOCALE_CUSTOM_DEFAULT,
- LOCALE_SISO3166CTRYNAME,
- country, sizeof (country));
- else if (GetLocaleInfo (LOCALE_USER_DEFAULT,
- LOCALE_SISO639LANGNAME,
- lang, sizeof (lang)))
- GetLocaleInfo (LOCALE_USER_DEFAULT,
- LOCALE_SISO3166CTRYNAME,
- country, sizeof (country));
- else if (GetLocaleInfo (LOCALE_SYSTEM_DEFAULT,
- LOCALE_SISO639LANGNAME,
- lang, sizeof (lang)))
- GetLocaleInfo (LOCALE_SYSTEM_DEFAULT,
- LOCALE_SISO3166CTRYNAME,
- country, sizeof (country));
- if (strlen (lang) && strlen (country))
- __small_sprintf (lang + strlen (lang), "_%s.UTF-8", country);
- else
- strcpy (lang , "C.UTF-8");
- env = lang;
- }
- strcpy (locale, env);
-}
-
-static void
-get_langinfo (char *locale_out, char *charset_out)
-{
- /* Get locale from environment */
- char new_locale[ENCODING_LEN + 1];
- get_locale_from_env (new_locale);
-
- __locale_t loc;
- memset (&loc, 0, sizeof (loc));
- const char *locale = __loadlocale (&loc, LC_CTYPE, new_locale);
- if (!locale)
- locale = "C";
-
- const char *charset;
- struct lc_ctype_T *lc_ctype = (struct lc_ctype_T *) loc.lc_cat[LC_CTYPE].ptr;
- if (!lc_ctype)
- charset = "ASCII";
- else
- charset = lc_ctype->codeset;
-
- /* The following code is borrowed from nl_langinfo()
- in newlib/libc/locale/nl_langinfo.c */
- /* Convert charset to Linux compatible codeset string. */
- if (charset[0] == 'A'/*SCII*/)
- charset = "ANSI_X3.4-1968";
- else if (charset[0] == 'E')
- {
- if (strcmp (charset, "EUCJP") == 0)
- charset = "EUC-JP";
- else if (strcmp (charset, "EUCKR") == 0)
- charset = "EUC-KR";
- else if (strcmp (charset, "EUCCN") == 0)
- charset = "GB2312";
- }
- else if (charset[0] == 'C'/*Pxxxx*/)
- {
- if (strcmp (charset + 2, "874") == 0)
- charset = "TIS-620";
- else if (strcmp (charset + 2, "20866") == 0)
- charset = "KOI8-R";
- else if (strcmp (charset + 2, "21866") == 0)
- charset = "KOI8-U";
- else if (strcmp (charset + 2, "101") == 0)
- charset = "GEORGIAN-PS";
- else if (strcmp (charset + 2, "102") == 0)
- charset = "PT154";
- }
- else if (charset[0] == 'S'/*JIS*/)
- {
- /* Cygwin uses MSFT's implementation of SJIS, which differs
- in some codepoints from the real thing, especially
- 0x5c: yen sign instead of backslash,
- 0x7e: overline instead of tilde.
- We can't use the real SJIS since otherwise Win32
- pathnames would become invalid. OTOH, if we return
- "SJIS" here, then libiconv will do mb<->wc conversion
- differently to our internal functions. Therefore we
- return what we really implement, CP932. This is handled
- fine by libiconv. */
- charset = "CP932";
- }
-
- /* Set results */
- strcpy (locale_out, new_locale);
- strcpy (charset_out, charset);
-}
-
void
fhandler_pty_slave::setup_locale (void)
{
- if (get_ttyp ()->term_code_page != 0)
- return;
+ extern UINT __eval_codepage_from_internal_charset ();
- char locale[ENCODING_LEN + 1] = "C";
- char charset[ENCODING_LEN + 1] = "ASCII";
- get_langinfo (locale, charset);
-
- /* Set terminal code page from locale */
- /* This code is borrowed from mintty: charset.c */
- get_ttyp ()->term_code_page = 20127; /* Default ASCII */
- char charset_u[ENCODING_LEN + 1] = {0, };
- for (int i=0; charset[i] && i<ENCODING_LEN; i++)
- charset_u[i] = toupper (charset[i]);
- unsigned int iso;
- UINT cp = 20127; /* Default for fallback */
- if (sscanf (charset_u, "ISO-8859-%u", &iso) == 1
- || sscanf (charset_u, "ISO8859-%u", &iso) == 1
- || sscanf (charset_u, "ISO8859%u", &iso) == 1)
- {
- if (iso && iso <= 16 && iso !=12)
- get_ttyp ()->term_code_page = 28590 + iso;
- }
- else if (sscanf (charset_u, "CP%u", &cp) == 1)
- get_ttyp ()->term_code_page = cp;
- else
- for (int i=0; cs_names[i].cp; i++)
- if (strcasecmp (charset_u, cs_names[i].name) == 0)
- {
- get_ttyp ()->term_code_page = cs_names[i].cp;
- break;
- }
+ if (!get_ttyp ()->term_code_page)
+ get_ttyp ()->term_code_page = __eval_codepage_from_internal_charset ();
}
void
@@ -1977,9 +1807,6 @@ fhandler_pty_slave::fixup_after_exec ()
if (!close_on_exec ())
fixup_after_fork (NULL); /* No parent handle required. */
- /* Set locale */
- setup_locale ();
-
/* Hook Console API */
#define DO_HOOK(module, name) \
if (!name##_Orig) \
@@ -2205,8 +2032,8 @@ fhandler_pty_master::pty_master_fwd_thread ()
state = 0;
size_t nlen;
- char *buf = convert_mb_str
- (get_ttyp ()->term_code_page, &nlen, CP_UTF8, ptr, wlen);
+ char *buf = convert_mb_str (get_ttyp ()->term_code_page,
+ &nlen, CP_UTF8, ptr, wlen);
ptr = buf;
wlen = rlen = nlen;
@@ -2228,8 +2055,8 @@ fhandler_pty_master::pty_master_fwd_thread ()
continue;
}
size_t nlen;
- char *buf = convert_mb_str
- (get_ttyp ()->term_code_page, &nlen, GetConsoleOutputCP (), ptr, wlen);
+ char *buf = convert_mb_str (get_ttyp ()->term_code_page, &nlen,
+ GetConsoleOutputCP (), ptr, wlen);
ptr = buf;
wlen = rlen = nlen;
diff --git a/winsup/cygwin/nlsfuncs.cc b/winsup/cygwin/nlsfuncs.cc
index 668d7eb9e..a518d2be3 100644
--- a/winsup/cygwin/nlsfuncs.cc
+++ b/winsup/cygwin/nlsfuncs.cc
@@ -1448,6 +1448,54 @@ __set_charset_from_locale (const char *locale, char *charset)
stpcpy (charset, cs);
}
+/* Called from fhandler_tty::setup_locale. Set a codepage which reflects the
+ internal charset setting. This is *not* necessarily the Windows
+ codepage connected to a locale by default, so we have to set this
+ up explicitly. */
+UINT
+__eval_codepage_from_internal_charset ()
+{
+ const char *charset = __locale_charset (__get_global_locale ());
+ UINT codepage = CP_UTF8; /* Default UTF8 */
+
+ /* The internal charset names are well defined, so we can use shortcuts. */
+ switch (charset[0])
+ {
+ case 'B': /* BIG5 */
+ codepage = 950;
+ break;
+ case 'C': /* CPxxx */
+ codepage = strtoul (charset + 2, NULL, 10);
+ break;
+ case 'E': /* EUCxx */
+ switch (charset[3])
+ {
+ case 'J': /* EUCJP */
+ codepage = 20932;
+ break;
+ case 'K': /* EUCKR */
+ codepage = 949;
+ break;
+ case 'C': /* EUCCN */
+ codepage = 936;
+ break;
+ }
+ break;
+ case 'G': /* GBK/GB2312 */
+ codepage = 936;
+ break;
+ case 'I': /* ISO-8859-x */
+ codepage = strtoul (charset + 9, NULL, 10) + 28590;
+ break;
+ case 'S': /* SJIS */
+ codepage = 932;
+ break;
+ default: /* All set to UTF8 already */
+ break;
+ }
+ return codepage;
+}
+
/* This function is called from newlib's loadlocale if the locale identifier
was invalid, one way or the other. It looks for the file
More information about the Cygwin-cvs
mailing list