shrinking /usr/lib/locale

Bruno Haible
Tue Jun 5 06:32:00 GMT 2001


The entire /usr/lib/locale hierarchy currently takes up 42.6 MB on disk.
But many of the files are identical, and using hard links for those files
that are equal can reduce this to 14.6 MB. The script I used is the following:

========================= localecompact ====================================
# Usage: localecompact /usr/lib/locale
#
# Replace identical locale category files in different locale directories
# by hard links to a single copy.

localedir="$1"
if test -z "$localedir" ; then
  echo "Usage: localecompact /usr/lib/locale" 1>&2
  exit 1
fi

# The per-category files found in each compiled locale directory.
facets="LC_ADDRESS LC_COLLATE LC_CTYPE LC_IDENTIFICATION LC_MEASUREMENT LC_MESSAGES/SYS_LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER LC_TELEPHONE LC_TIME"

# Never touch files relative to the wrong directory.
cd "$localedir" || exit 1

# Collect the locale directories.
locales=
for d in * ; do
  if test -d "$d" ; then
    locales="$locales $d"
  fi
done

# ll is the list of locales already processed; every file of a later
# locale is compared against the same file in each of them.
ll=
for l in $locales; do
  for f in $facets; do
    for o in $ll; do
      if cmp -s "$l/$f" "$o/$f" 2>/dev/null ; then
        echo "Linking $l/$f"
        rm -f "$l/$f"
        ln "$o/$f" "$l/$f"
        # One link is enough; don't relink against further locales.
        break
      fi
    done
  done
  ll="$ll $l"
done

Note this also improves memory usage on machines where several locales
are in use simultaneously (because mmapping the same file under different
hardlinked names maps it only once in RAM).

But I think this compactification should not need to be done by an extra
script; it should be done by localedef itself. So here is a patch to make
localedef use a hard link to an existing locale file with same contents,
if possible and safe.

2001-06-04  Bruno Haible  <>

	* locale/programs/locfile.c (siblings_uncached, siblings, full_read,
	compare_files): New functions.
	(write_locale_data): Use xmalloc. Compare the file with the locale
	data files for the same category in other locales, and reuse it if
	possible and safe.

--- glibc-20010430/locale/programs/locfile.c.tmp	Mon Jun  4 22:22:30 2001
+++ glibc-20010430/locale/programs/locfile.c	Mon Jun  4 22:22:33 2001
@@ -21,9 +21,11 @@
 # include <config.h>
+#include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdlib.h>
+#include <string.h>
 #include <unistd.h>
 #include <sys/param.h>
 #include <sys/stat.h>
@@ -260,6 +262,8 @@
+/* Semantic checking of locale specifications.  */
 static void (*const check_funcs[]) (struct localedef_t *,
 				    struct charmap_t *) =
@@ -277,7 +281,6 @@
   [LC_IDENTIFICATION] = identification_finish
 check_all_categories (struct localedef_t *definitions,
 		      struct charmap_t *charmap)
@@ -290,6 +293,8 @@
+/* Writing the locale data files.  All files use the same output_path.  */
 static void (*const write_funcs[]) (struct localedef_t *, struct charmap_t *,
 				    const char *) =
@@ -307,7 +312,6 @@
   [LC_IDENTIFICATION] = identification_output
 write_all_categories (struct localedef_t *definitions,
 		      struct charmap_t *charmap,
@@ -320,7 +324,189 @@
       write_funcs[cnt] (definitions, charmap, output_path);
+/* Return a NULL terminated list of the directories next to output_path
+   that have the same owner, group, permissions and device as output_path.
+   Every entry is allocated with xmalloc and ends in a slash; the list
+   itself is allocated with xrealloc.  Since output_path's own directory
+   matches itself, it is part of the list as well.  Return NULL if
+   output_path has no parent component, is not a directory, its parent
+   cannot be opened, or no matching directory was found.  */
+static const char **
+siblings_uncached (const char *output_path)
+{
+  size_t len;
+  char *base, *p;
+  struct stat output_stat;
+  DIR *dirp;
+  int nelems;
+  const char **elems;
+
+  /* Remove trailing slashes and trailing pathname component.
+     base is not NUL terminated until the final store through p below.  */
+  len = strlen (output_path);
+  base = (char *) alloca (len);
+  memcpy (base, output_path, len);
+  p = base + len;
+  while (p > base && p[-1] == '/')
+    p--;
+  if (p == base)
+    return NULL;
+  do
+    p--;
+  while (p > base && p[-1] != '/');
+  if (p == base)
+    return NULL;
+  *--p = '\0';
+  len = p - base;
+
+  /* Get the properties of output_path.  */
+  if (lstat (output_path, &output_stat) < 0 || !S_ISDIR (output_stat.st_mode))
+    return NULL;
+
+  /* Iterate through the directories in base directory.  */
+  dirp = opendir (base);
+  if (dirp == NULL)
+    return NULL;
+  nelems = 0;
+  elems = NULL;
+  for (;;)
+    {
+      struct dirent *other_dentry;
+      const char *other_name;
+      char *other_path;
+      struct stat other_stat;
+
+      other_dentry = readdir (dirp);
+      if (other_dentry == NULL)
+	break;
+
+      other_name = other_dentry->d_name;
+      if (strcmp (other_name, ".") == 0 || strcmp (other_name, "..") == 0)
+	continue;
+
+      /* Room for base, '/', the name, a trailing '/' and the NUL.  */
+      other_path = (char *) xmalloc (len + 1 + strlen (other_name) + 2);
+      memcpy (other_path, base, len);
+      other_path[len] = '/';
+      strcpy (other_path + len + 1, other_name);
+
+      if (lstat (other_path, &other_stat) >= 0
+	  && S_ISDIR (other_stat.st_mode)
+	  && other_stat.st_uid == output_stat.st_uid
+	  && other_stat.st_gid == output_stat.st_gid
+	  && other_stat.st_mode == output_stat.st_mode
+	  && other_stat.st_dev == output_stat.st_dev)
+	{
+	  /* Found a subdirectory.  Add a trailing slash and store it.  */
+	  p = other_path + len + 1 + strlen (other_name);
+	  *p++ = '/';
+	  *p = '\0';
+	  /* Keep one spare slot for the terminating NULL.  */
+	  elems = (const char **) xrealloc ((char *) elems,
+					    (nelems + 2) * sizeof (const char *));
+	  elems[nelems++] = other_path;
+	}
+      else
+	free (other_path);
+    }
+  closedir (dirp);
+
+  if (elems != NULL)
+    elems[nelems] = NULL;
+  return elems;
+}
+
+/* Return a NULL terminated list of the directories next to output_path
+   that have the same owner, group, permissions and device as output_path.
+   Cache the result for future calls.  The cache is keyed on the pointer
+   value of output_path, not on the string contents, so a caller passing
+   the same pointer repeatedly gets the cached list.  The returned list is
+   owned by this function and is freed on the next call with a different
+   pointer; callers must not free it.  */
+static const char **
+siblings (const char *output_path)
+{
+  static const char *last_output_path;
+  static const char **last_result;
+
+  if (output_path != last_output_path)
+    {
+      /* Discard the previously cached list and its strings.  */
+      if (last_result != NULL)
+	{
+	  const char **p;
+
+	  for (p = last_result; *p != NULL; p++)
+	    free ((char *) *p);
+	  free (last_result);
+	}
+
+      last_output_path = output_path;
+      last_result = siblings_uncached (output_path);
+    }
+  return last_result;
+}
+
+/* Read as many bytes from a file descriptor as possible.
+   Keep calling read until NBYTE bytes have been stored in BUFAREA or end
+   of file is reached, restarting reads that fail with EINTR.  Return the
+   number of bytes read, which is less than NBYTE only at end of file, or
+   the negative result of read if it fails with any other error.  */
+static ssize_t
+full_read (int fd, void *bufarea, size_t nbyte)
+{
+  char *buf = (char *) bufarea;
+
+  while (nbyte > 0)
+    {
+      ssize_t retval = read (fd, buf, nbyte);
+
+      if (retval == 0)
+	break;
+      else if (retval > 0)
+	{
+	  buf += retval;
+	  nbyte -= retval;
+	}
+      else if (errno != EINTR)
+	return retval;
+    }
+  return buf - (char *) bufarea;
+}
+
+/* Compare the contents of two regular files of the same size.  Return 0
+   if they are equal, 1 if they are different, or -1 if an error occurs.
+   SIZE is the number of bytes to compare; BLOCKSIZE is the number of
+   bytes read from each file per step.  A file that turns out to be
+   shorter than SIZE counts as an error.  */
+static int
+compare_files (const char *filename1, const char *filename2, size_t size,
+	       size_t blocksize)
+{
+  int fd1, fd2;
+  int ret = -1;
+
+  fd1 = open (filename1, O_RDONLY);
+  if (fd1 >= 0)
+    {
+      fd2 = open (filename2, O_RDONLY);
+      if (fd2 >= 0)
+	{
+	  /* One allocation holds both read buffers.  */
+	  char *buf1 = (char *) xmalloc (2 * blocksize);
+	  char *buf2 = buf1 + blocksize;
+
+	  ret = 0;
+	  while (size > 0)
+	    {
+	      size_t bytes = (size < blocksize ? size : blocksize);
+
+	      if (full_read (fd1, buf1, bytes) < (ssize_t) bytes)
+		{
+		  ret = -1;
+		  break;
+		}
+	      if (full_read (fd2, buf2, bytes) < (ssize_t) bytes)
+		{
+		  ret = -1;
+		  break;
+		}
+	      if (memcmp (buf1, buf2, bytes) != 0)
+		{
+		  ret = 1;
+		  break;
+		}
+	      size -= bytes;
+	    }
+
+	  free (buf1);
+	  close (fd2);
+	}
+      close (fd1);
+    }
+  return ret;
+}
+
+/* Write a locale file, with contents given by N_ELEM and VEC.  */
 write_locale_data (const char *output_path, const char *category,
 		   size_t n_elem, struct iovec *vec)
@@ -328,10 +514,9 @@
   size_t cnt, step, maxiov;
   int fd;
   char *fname;
+  const char **other_paths;
-  fname = malloc (strlen (output_path) + 2 * strlen (category) + 7);
-  if (fname == NULL)
-    error (5, errno, _("memory exhausted"));
+  fname = xmalloc (strlen (output_path) + 2 * strlen (category) + 7);
   /* Normally we write to the directory pointed to by the OUTPUT_PATH.
      But for LC_MESSAGES we have to take care for the translation
@@ -359,7 +544,8 @@
   /* Create the locale file with nlinks == 1; this avoids crashing processes
-     which currently use the locale.  */
+     which currently use the locale and damaging files belonging to other
+     locales as well.  */
   if (fd == -2)
       unlink (fname);
@@ -389,7 +575,6 @@
-  free (fname);
 #ifdef UIO_MAXIOV
   maxiov = UIO_MAXIOV;
@@ -415,4 +600,116 @@
   close (fd);
+  /* Compare the file with the locale data files for the same category in
+     other locales, and see if we can reuse it, to save disk space.  */
+  other_paths = siblings (output_path);
+  if (other_paths != NULL)
+    {
+      struct stat fname_stat;
+
+      if (lstat (fname, &fname_stat) >= 0
+	  && S_ISREG (fname_stat.st_mode))
+	{
+	  /* The part of fname below output_path; appended to each sibling
+	     directory to name the corresponding file there.  */
+	  const char *fname_tail = fname + strlen (output_path);
+	  const char **other_p;
+	  int seen_count;
+	  ino_t *seen_inodes;
+
+	  /* At most one inode per sibling directory can be recorded.  */
+	  seen_count = 0;
+	  for (other_p = other_paths; *other_p; other_p++)
+	    seen_count++;
+	  seen_inodes = (ino_t *) xmalloc (seen_count * sizeof (ino_t));
+	  seen_count = 0;
+
+	  for (other_p = other_paths; *other_p; other_p++)
+	    {
+	      const char *other_path = *other_p;
+	      size_t other_path_len = strlen (other_path);
+	      char *other_fname;
+	      struct stat other_fname_stat;
+
+	      other_fname =
+		(char *) xmalloc (other_path_len + strlen (fname_tail) + 1);
+	      memcpy (other_fname, other_path, other_path_len);
+	      strcpy (other_fname + other_path_len, fname_tail);
+
+	      if (lstat (other_fname, &other_fname_stat) >= 0
+		  && S_ISREG (other_fname_stat.st_mode)
+		  /* Consider only files on the same device.
+		     Otherwise hard linking won't work anyway.  */
+		  && other_fname_stat.st_dev == fname_stat.st_dev
+		  /* Consider only files with the same permissions.
+		     Otherwise there are security risks.  */
+		  && other_fname_stat.st_uid == fname_stat.st_uid
+		  && other_fname_stat.st_gid == fname_stat.st_gid
+		  && other_fname_stat.st_mode == fname_stat.st_mode
+		  /* Don't compare fname with itself.  */
+		  && other_fname_stat.st_ino != fname_stat.st_ino
+		  /* Files must have the same size, otherwise they
+		     cannot be the same.  */
+		  && other_fname_stat.st_size == fname_stat.st_size)
+		{
+		  /* Skip this file if we have already read it (under a
+		     different name).  */
+		  int i;
+
+		  for (i = seen_count - 1; i >= 0; i--)
+		    if (seen_inodes[i] == other_fname_stat.st_ino)
+		      break;
+		  if (i < 0)
+		    {
+		      /* Now compare fname and other_fname for real.  */
+		      blksize_t blocksize;
+
+		      /* Use the file system's preferred block size where
+			 struct stat provides it, capped at 8 KiB;
+			 otherwise use a fixed 8 KiB.  */
+#ifdef _STATBUF_ST_BLKSIZE
+		      blocksize = MAX (fname_stat.st_blksize,
+				       other_fname_stat.st_blksize);
+		      if (blocksize > 8 * 1024)
+			blocksize = 8 * 1024;
+#else
+		      blocksize = 8 * 1024;
+#endif
+
+		      if (compare_files (fname, other_fname,
+					 fname_stat.st_size, blocksize) == 0)
+			{
+			  /* Found! other_fname is identical to fname.  */
+			  /* Link other_fname to fname.  But use a temporary
+			     file, in case hard links don't work on the
+			     particular filesystem.  */
+			  char * tmp_fname =
+			    (char *) xmalloc (strlen (fname) + 4 + 1);
+
+			  strcpy (tmp_fname, fname);
+			  strcat (tmp_fname, ".tmp");
+
+			  if (link (other_fname, tmp_fname) >= 0)
+			    {
+			      /* rename replaces fname in one step, so the
+				 file just written is still in place if the
+				 rename fails.  */
+			      if (rename (tmp_fname, fname) < 0)
+				{
+				  if (!be_quiet)
+				    error (0, errno, _("\
+cannot create output file `%s' for category `%s'"),
+					   fname, category);
+				  /* Don't leave the temporary link behind.  */
+				  unlink (tmp_fname);
+				}
+			      free (tmp_fname);
+			      free (other_fname);
+			      break;
+			    }
+			  free (tmp_fname);
+			}
+
+		      /* Don't compare with this file a second time.  */
+		      seen_inodes[seen_count++] = other_fname_stat.st_ino;
+		    }
+		}
+	      free (other_fname);
+	    }
+	  free (seen_inodes);
+	}
+    }
+
+  free (fname);

More information about the Libc-alpha mailing list