[glibc.git] / libidn / toutf8.c

/* toutf8.c	Convert strings from system locale into UTF-8.
 * Copyright (C) 2002, 2003, 2004  Simon Josefsson
 *
 * This file is part of GNU Libidn.
 *
 * GNU Libidn is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * GNU Libidn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GNU Libidn; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#if HAVE_CONFIG_H
# include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include "stringprep.h"

#ifdef _LIBC
# define HAVE_ICONV 1
# define LOCALE_WORKS 1
# define ICONV_CONST
#endif

#ifdef HAVE_ICONV
# include <iconv.h>

# if LOCALE_WORKS
#  include <langinfo.h>
#  include <locale.h>
# endif

# ifndef _LIBC
/* Work out the system locale's charset without consulting the cache.
   Order of preference: a non-empty CHARSET environment variable, then
   (only when LOCALE_WORKS) the codeset reported for the environment's
   LC_CTYPE locale, and finally the literal "ASCII".  Never returns
   NULL or an empty string.  */
static const char *
stringprep_locale_charset_slow (void)
{
  const char *charset = getenv ("CHARSET");	/* flawfinder: ignore */

  if (charset && *charset)
    return charset;

#  if LOCALE_WORKS
  {
    /* The result is handed back to the caller (which caches it), so it
       has to survive later setlocale()/nl_langinfo() calls; keep a
       private copy instead of returning nl_langinfo()'s own storage.  */
    static char codeset_copy[64];
    const char *current = setlocale (LC_CTYPE, NULL);
    char *saved = NULL;

    /* The string returned by setlocale() may be overwritten by the
       next setlocale() call, so duplicate it before switching.  */
    if (current)
      {
	saved = malloc (strlen (current) + 1);
	if (saved)
	  strcpy (saved, current);
      }

    /* Without a saved name the program locale could not be restored,
       so in that case do not touch the locale at all.  */
    if (saved)
      {
	size_t len = 0;

	setlocale (LC_CTYPE, "");

	charset = nl_langinfo (CODESET);
	if (charset)
	  {
	    len = strlen (charset);
	    if (len < sizeof codeset_copy)
	      memcpy (codeset_copy, charset, len + 1);
	    else
	      len = 0;		/* Name too long to keep; use fallback.  */
	  }

	setlocale (LC_CTYPE, saved);
	free (saved);

	if (len > 0)
	  return codeset_copy;
      }
  }
#  endif

  return "ASCII";
}

/* Result of the first stringprep_locale_charset_slow() call; NULL
   until stringprep_locale_charset() has filled it in.  */
static const char *stringprep_locale_charset_cache;
# endif

/**
 * stringprep_locale_charset:
 *
 * Find out system locale charset.
 *
 * Note that this function returns what it believes the SYSTEM is using
 * as a locale, not what locale the program is currently in (modified,
 * e.g., by a setlocale(LC_CTYPE, "ISO-8859-1")).  The reason is that
 * data read from argv[], stdin, etc. comes from the system, and is more
 * likely to be encoded using the system locale than the program
 * locale.
 *
 * You can set the environment variable CHARSET to override the value
 * returned.  Note that this function caches the result, so you will
 * have to modify CHARSET before calling (even indirectly) any
 * stringprep functions, e.g., by setting it when invoking the
 * application.
 *
 * Return value: Returns the character set used by the system locale.
 *   It never returns NULL; "ASCII" is used as a fallback.
 **/
# ifdef _LIBC
#  define stringprep_locale_charset() nl_langinfo (CODESET)
# else
/* Return the cached charset name, computing it with
   stringprep_locale_charset_slow() on the first call.  */
const char *
stringprep_locale_charset (void)
{
  const char *name = stringprep_locale_charset_cache;

  if (name != NULL)
    return name;

  name = stringprep_locale_charset_slow ();
  stringprep_locale_charset_cache = name;

  return name;
}
# endif

/**
 * stringprep_convert:
 * @str: input zero-terminated string.
 * @to_codeset: name of destination character set.
 * @from_codeset: name of origin character set, as used by @str.
 *
 * Convert the string from one character set to another using the
 * system's iconv() function.  If @to_codeset and @from_codeset are
 * the same string, @str is simply duplicated without using iconv().
 *
 * Return value: Returns newly allocated zero-terminated string which
 *   is @str transcoded into to_codeset, or NULL if the conversion
 *   descriptor cannot be opened, memory cannot be allocated, or @str
 *   cannot be converted completely.  The caller must free() the
 *   returned string.
 **/
char *
stringprep_convert (const char *str,
		    const char *to_codeset, const char *from_codeset)
{
  iconv_t cd;
  char *dest = NULL;
  char *outp;
  ICONV_CONST char *p;
  size_t inbytes_remaining;
  size_t outbytes_remaining;
  size_t outbuf_size;
  size_t expansion;
  int flushing = 0;
  int have_error = 0;

  if (strcmp (to_codeset, from_codeset) == 0)
    {
#if defined HAVE_STRDUP || defined _LIBC
      return strdup (str);
#else
      char *copy = malloc (strlen (str) + 1);
      if (!copy)
	return NULL;
      return strcpy (copy, str);
#endif
    }

  cd = iconv_open (to_codeset, from_codeset);

  if (cd == (iconv_t) - 1)
    return NULL;

  p = (ICONV_CONST char *) str;

  inbytes_remaining = strlen (p);
  /* Guess the maximum length the output string can have, refusing
     inputs so long that the guess would wrap around size_t.  */
  expansion = MAX (7, MB_CUR_MAX);
  if (inbytes_remaining >= (size_t) - 1 / expansion)
    goto out;
  outbuf_size = (inbytes_remaining + 1) * expansion;

  outp = dest = malloc (outbuf_size);
  if (dest == NULL)
    goto out;
  outbytes_remaining = outbuf_size - 1;	/* -1 for NUL */

  /* First pass converts the input; once that succeeds a second call
     with a NULL input flushes the converter so that stateful target
     encodings get their return-to-initial-state sequence.  Either
     call may need a bigger buffer (E2BIG), in which case it is
     retried after growing.  */
  for (;;)
    {
      size_t err;

      if (flushing)
	err = iconv (cd, NULL, NULL, &outp, &outbytes_remaining);
      else
	err = iconv (cd, (ICONV_CONST char **) &p, &inbytes_remaining,
		     &outp, &outbytes_remaining);

      if (err != (size_t) - 1)
	{
	  if (flushing)
	    break;
	  flushing = 1;
	  continue;
	}

      if (errno != E2BIG)
	{
	  /* EILSEQ, EINVAL (truncated multibyte input) or anything
	     else: the string cannot be converted completely.  */
	  have_error = 1;
	  break;
	}

      {
	size_t used = outp - dest;
	char *newdest;

	if (outbuf_size > (size_t) - 1 / 2)
	  {
	    have_error = 1;
	    break;
	  }
	outbuf_size *= 2;

	/* Keep DEST valid on failure so it can still be freed.  */
	newdest = realloc (dest, outbuf_size);
	if (newdest == NULL)
	  {
	    have_error = 1;
	    break;
	  }
	dest = newdest;

	outp = dest + used;
	outbytes_remaining = outbuf_size - used - 1; /* -1 for NUL */
      }
    }

  if (!have_error)
    {
      *outp = '\0';

      /* Unconsumed input means the conversion was incomplete.  */
      if (*p != '\0')
	have_error = 1;
    }

 out:
  iconv_close (cd);

  if (have_error)
    {
      free (dest);
      dest = NULL;
    }

  return dest;
}

#else /* HAVE_ICONV */

/* Without iconv no charset detection is attempted: "ASCII" is always
   reported.  */
const char *
stringprep_locale_charset (void)
{
  return "ASCII";
}

/* Fallback used when iconv is unavailable.  No transcoding is done:
   the result is a newly allocated copy of STR (to be freed by the
   caller), or NULL if memory cannot be allocated.  A warning is
   written to stderr unless TO_CODESET and FROM_CODESET name the same
   charset, in which case a plain copy is the correct result.  */
char *
stringprep_convert (const char *str,
		    const char *to_codeset, const char *from_codeset)
{
  size_t size;
  char *copy;

  /* Missing names are treated as "differs", so they never reach
     strcmp() and still produce the warning.  */
  if (!to_codeset || !from_codeset
      || strcmp (to_codeset, from_codeset) != 0)
    fprintf (stderr, "libidn: warning: libiconv not installed, cannot "
	     "convert data to UTF-8\n");

  size = strlen (str) + 1;
  copy = malloc (size);
  if (!copy)
    return NULL;
  return memcpy (copy, str, size);
}

#endif /* HAVE_ICONV */

/**
 * stringprep_locale_to_utf8:
 * @str: input zero terminated string.
 *
 * Transcode @str from the character set reported by
 * stringprep_locale_charset() into UTF-8, by calling
 * stringprep_convert().
 *
 * Return value: Returns whatever stringprep_convert() returns for
 *   this conversion: a newly allocated zero-terminated UTF-8 version
 *   of @str.
 **/
char *
stringprep_locale_to_utf8 (const char *str)
{
  const char *locale_codeset = stringprep_locale_charset ();

  return stringprep_convert (str, "UTF-8", locale_codeset);
}

/**
 * stringprep_utf8_to_locale:
 * @str: input zero terminated string.
 *
 * Transcode @str from UTF-8 into the character set reported by
 * stringprep_locale_charset(), by calling stringprep_convert().
 *
 * Return value: Returns whatever stringprep_convert() returns for
 *   this conversion: a newly allocated zero-terminated version of
 *   @str in the locale's character set.
 **/
char *
stringprep_utf8_to_locale (const char *str)
{
  const char *locale_codeset = stringprep_locale_charset ();

  return stringprep_convert (str, locale_codeset, "UTF-8");
}
Commit	Line	Data
01859b1c UD	1	/* toutf8.c Convert strings from system locale into UTF-8.
	2	* Copyright (C) 2002, 2003, 2004 Simon Josefsson
	3	*
	4	* This file is part of GNU Libidn.
	5	*
	6	* GNU Libidn is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Lesser General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2.1 of the License, or (at your option) any later version.
	10	*
	11	* GNU Libidn is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Lesser General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Lesser General Public
	17	* License along with GNU Libidn; if not, write to the Free Software
	18	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	19	*
	20	*/
	21
	22	#if HAVE_CONFIG_H
	23	# include "config.h"
	24	#endif
	25
	26	#include <stdio.h>
	27	#include <stdlib.h>
	28	#include <string.h>
	29	#include <errno.h>
	30
	31	#include "stringprep.h"
	32
	33	#ifdef _LIBC
	34	# define HAVE_ICONV 1
	35	# define LOCALE_WORKS 1
	36	# define ICONV_CONST
	37	#endif
	38
	39	#ifdef HAVE_ICONV
	40	# include <iconv.h>
	41
	42	# if LOCALE_WORKS
	43	# include <langinfo.h>
	44	# include <locale.h>
	45	# endif
	46
	47	# ifndef _LIBC
	48	static const char *
	49	stringprep_locale_charset_slow (void)
	50	{
	51	return nl_langinfo (CODESET);
	52	const char charset = getenv ("CHARSET"); / flawfinder: ignore */
	53
	54	if (charset && *charset)
	55	return charset;
	56
	57	# ifdef LOCALE_WORKS
	58	{
	59	char *p;
	60
	61	p = setlocale (LC_CTYPE, NULL);
	62	setlocale (LC_CTYPE, "");
	63
	64	charset = nl_langinfo (CODESET);
65
66	setlocale (LC_CTYPE, p);
67
68	if (charset && *charset)
69	return charset;
70	}
71	# endif
72
73	return "ASCII";
74	}
75
76	static const char *stringprep_locale_charset_cache;
77	# endif
78
79	/**
80	* stringprep_locale_charset:
81	*
82	* Find out system locale charset.
83	*
84	* Note that this function return what it believe the SYSTEM is using
85	* as a locale, not what locale the program is currently in (modified,
86	* e.g., by a setlocale(LC_CTYPE, "ISO-8859-1")). The reason is that
87	* data read from argv[], stdin etc comes from the system, and is more
88	* likely to be encoded using the system locale than the program
89	* locale.
90	*
91	* You can set the environment variable CHARSET to override the value
92	* returned. Note that this function caches the result, so you will
93	* have to modify CHARSET before calling (even indirectly) any
94	* stringprep functions, e.g., by setting it when invoking the
95	* application.
96	*
97	* Return value: Return the character set used by the system locale.
98	* It will never return NULL, but use "ASCII" as a fallback.
99	**/
100	# ifdef _LIBC
101	# define stringprep_locale_charset() nl_langinfo (CODESET)
102	# else
103	const char *
104	stringprep_locale_charset (void)
105	{
106	if (!stringprep_locale_charset_cache)
107	stringprep_locale_charset_cache = stringprep_locale_charset_slow ();
108
109	return stringprep_locale_charset_cache;
110	}
111	# endif
112
113	/**
114	* stringprep_convert:
115	* @str: input zero-terminated string.
116	* @to_codeset: name of destination character set.
117	* @from_codeset: name of origin character set, as used by @str.
118	*
119	* Convert the string from one character set to another using the
120	* system's iconv() function.
121	*
122	* Return value: Returns newly allocated zero-terminated string which
123	* is @str transcoded into to_codeset.
124	**/
125	char *
126	stringprep_convert (const char *str,
127	const char to_codeset, const char from_codeset)
128	{
129	iconv_t cd;
130	char *dest;
131	char *outp;
132	ICONV_CONST char *p;
133	size_t inbytes_remaining;
134	size_t outbytes_remaining;
135	size_t err;
136	size_t outbuf_size;
137	int have_error = 0;
138
139	if (strcmp (to_codeset, from_codeset) == 0)
140	{
141	#if defined HAVE_STRDUP \|\| defined _LIBC
142	return strdup (str);
143	#else
144	char *p;
145	p = malloc (strlen (str) + 1);
146	if (!p)
147	return NULL;
148	return strcpy (p, str);
149	#endif
150	}
151
152	cd = iconv_open (to_codeset, from_codeset);
153
154	if (cd == (iconv_t) - 1)
155	return NULL;
156
157	p = (ICONV_CONST char *) str;
158
159	inbytes_remaining = strlen (p);
160	/* Guess the maximum length the output string can have. */
161	outbuf_size = (inbytes_remaining + 1) * MAX (7, MB_CUR_MAX);
162
163	outp = dest = malloc (outbuf_size);
164	if (dest == NULL)
165	goto out;
166	outbytes_remaining = outbuf_size - 1; /* -1 for NUL */
167
168	again:
169
170	err = iconv (cd, (ICONV_CONST char **) &p, &inbytes_remaining,
171	&outp, &outbytes_remaining);
172
173	if (err == (size_t) - 1)
174	{
175	switch (errno)
176	{
177	case EINVAL:
178	/* Incomplete text, do not report an error */
179	break;
180
181	case E2BIG:
182	{
183	size_t used = outp - dest;
184	char *newdest;
185
186	outbuf_size *= 2;
187	newdest = realloc (dest, outbuf_size);
188	if (newdest == NULL)
189	{
190	have_error = 1;
191	goto out;
192	}
193	dest = newdest;
194
195	outp = dest + used;
196	outbytes_remaining = outbuf_size - used - 1; /* -1 for NUL */
197
198	goto again;
199	}
200	break;
201
202	case EILSEQ:
203	have_error = 1;
204	break;
205
206	default:
207	have_error = 1;
208	break;
209	}
210	}
211
212	*outp = '\0';
213
214	if (*p != '\0')
215	have_error = 1;
216
217	out:
218	iconv_close (cd);
219
220	if (have_error)
221	{
222	free (dest);
223	dest = NULL;
224	}
225
226	return dest;
227	}
228
229	#else /* HAVE_ICONV */
230
231	const char *
232	stringprep_locale_charset ()
233	{
234	return "ASCII";
235	}
236
237	char *
238	stringprep_convert (const char *str,
239	const char to_codeset, const char from_codeset)
240	{
241	char *p;
242	fprintf (stderr, "libidn: warning: libiconv not installed, cannot "
243	"convert data to UTF-8\n");
244	p = malloc (strlen (str) + 1);
245	if (!p)
246	return NULL;
247	strcpy (p, str);
248	return p;
249	}
250
251	#endif /* HAVE_ICONV */
252
253	/**
254	* stringprep_locale_to_utf8:
255	* @str: input zero terminated string.
256	*
257	* Convert string encoded in the locale's character set into UTF-8 by
258	* using stringprep_convert().
259	*
260	* Return value: Returns newly allocated zero-terminated string which
261	* is @str transcoded into UTF-8.
262	**/
263	char *
264	stringprep_locale_to_utf8 (const char *str)
265	{
266	return stringprep_convert (str, "UTF-8", stringprep_locale_charset ());
267	}
268
269	/**
270	* stringprep_utf8_to_locale:
271	* @str: input zero terminated string.
272	*
273	* Convert string encoded in UTF-8 into the locale's character set by
274	* using stringprep_convert().
275	*
276	* Return value: Returns newly allocated zero-terminated string which
277	* is @str transcoded into the locale's character set.
278	**/
279	char *
280	stringprep_utf8_to_locale (const char *str)
281	{
282	return stringprep_convert (str, stringprep_locale_charset (), "UTF-8");
283	}