Note on choosing string hash functions
Pedro Alves
palves@redhat.com
Wed Nov 22 02:25:00 GMT 2017
On 11/22/2017 02:10 AM, Pedro Alves wrote:
> On 11/17/2017 01:42 PM, Pedro Alves wrote:
>
> Then, I experimented with Ada/gnat, using both Latin-1 and UTF-8 source
> files (the latter with "pragma Wide_Character_Encoding (UTF8)"), and
> what I discovered was that Ada's encoding/mangling guarantees that only
> ASCII characters end up in mangled names. From gcc/ada/namet.ads:
>
> ~~~
> -- Identifiers Stored with upper case letters folded to lower case.
> -- Upper half (16#80# bit set) and wide characters are
> -- stored in an encoded form (Uhh for upper half char,
> -- Whhhh for wide characters, WWhhhhhhhh as provided by
> -- the routine Append_Encoded, where hh are hex
> -- digits for the character code using lower case a-f).
> -- Normally the use of U or W in other internal names is
> -- avoided, but these letters may be used in internal
> -- names (without this special meaning), if they appear
> -- as the last character of the name, or they are
> -- followed by an upper case letter (other than the WW
> -- sequence), or an underscore.
> ~~~
>
> Funnily enough, GDB doesn't grok this Uhh/WWhhhhhhhh encoding today.
> (I wrote a quick patch to teach GDB about it, to help convince myself,
> though as is, it only works when gdb's charset/locale is UTF-8.)
For the record, here's what that patch looks like.
From 710bde831ed78641e175046e0711a35d5061d7ee Mon Sep 17 00:00:00 2001
From: Pedro Alves <palves@redhat.com>
Date: Tue, 21 Nov 2017 20:05:42 +0000
Subject: [PATCH] Ada: Support Uhh encoding, UTF-8
An attempt at checking whether TOLOWER for minsyms makes a difference
over tolower...
It doesn't: Ada's encoding encodes "upper half char"s as Uhh, so
non-ASCII characters don't appear in the mangled names...
The Ada lexer change is necessary so that it's possible to input UTF-8
in expressions.
This assumes the host encoding is UTF-8 as is... I wonder... maybe
GDB should always use UTF-8 internally, and translate host-encoding ->
UTF-8 at the readline -> GDB boundary.
Yes, the test passes. :-)
---
gdb/ada-lang.c | 30 +++++++++++++++++++++
gdb/ada-lex.l | 2 +-
gdb/common/rsp-low.c | 2 +-
gdb/common/rsp-low.h | 4 +++
gdb/testsuite/gdb.ada/utf8.exp | 53 ++++++++++++++++++++++++++++++++++++++
gdb/testsuite/gdb.ada/utf8/foo.adb | 25 ++++++++++++++++++
gdb/testsuite/gdb.ada/utf8/pck.adb | 26 +++++++++++++++++++
gdb/testsuite/gdb.ada/utf8/pck.ads | 22 ++++++++++++++++
8 files changed, 162 insertions(+), 2 deletions(-)
create mode 100644 gdb/testsuite/gdb.ada/utf8.exp
create mode 100644 gdb/testsuite/gdb.ada/utf8/foo.adb
create mode 100644 gdb/testsuite/gdb.ada/utf8/pck.adb
create mode 100644 gdb/testsuite/gdb.ada/utf8/pck.ads
diff --git a/gdb/ada-lang.c b/gdb/ada-lang.c
index 33c4e8e..d0fb06d 100644
--- a/gdb/ada-lang.c
+++ b/gdb/ada-lang.c
@@ -63,6 +63,7 @@
#include "common/function-view.h"
#include "common/byte-vector.h"
#include <algorithm>
+#include "common/rsp-low.h"
/* Define whether or not the C operator '/' truncates towards zero for
differently signed operands (truncation direction is undefined in C).
@@ -1007,6 +1008,19 @@ ada_encode_1 (const char *decoded, bool throw_errors)
encoding_buffer[k] = encoding_buffer[k + 1] = '_';
k += 2;
}
+ else if (((unsigned char) *p & 0xe0) == 0xc0)
+ {
+ /* UTF-8 character (2-byte sequence) -> "Uhh" Ada encoding. */
+
+ unsigned char c1 = p[0];
+ unsigned char c2 = p[1];
+ unsigned char c = (c1 << 6) | (c2 & (0xff >> 2));
+ p += 1;
+
+ encoding_buffer[k] = 'U';
+ pack_hex_byte (&encoding_buffer[k + 1], c);
+ k += 3;
+ }
else if (*p == '"')
{
const struct ada_opname_map *mapping;
@@ -1355,6 +1369,8 @@ ada_decode (const char *encoded)
i++;
}
+ std::pair<int, int> nibbles;
+
if (encoded[i] == 'X' && i != 0 && isalnum (encoded[i - 1]))
{
/* This is a X[bn]* sequence not separated from the previous
@@ -1378,6 +1394,20 @@ ada_decode (const char *encoded)
i += 2;
j += 1;
}
+ else if (len0 - i > 3
+ && encoded[i] == 'U'
+ && ishex (encoded[i + 1], &nibbles.first)
+ && ishex (encoded[i + 2], &nibbles.second))
+ {
+ /* Convert Ada upper half char encoding to UTF-8 character
+ (2 bytes code point). */
+ unsigned char c = nibbles.first << 4 | nibbles.second;
+
+ decoded[j] = 0xc0 | c >> 6;
+ decoded[j + 1] = 0x80 | (c & 0x03f);
+ i += 3;
+ j += 2;
+ }
else
{
/* It's a character part of the decoded name, so just copy it
diff --git a/gdb/ada-lex.l b/gdb/ada-lex.l
index 63137bd..41b0582 100644
--- a/gdb/ada-lex.l
+++ b/gdb/ada-lex.l
@@ -29,7 +29,7 @@ NUM10 ({DIG}({DIG}|_)*)
HEXDIG [0-9a-f]
NUM16 ({HEXDIG}({HEXDIG}|_)*)
OCTDIG [0-7]
-LETTER [a-z_]
+LETTER [a-z_\x80-\xff]
ID ({LETTER}({LETTER}|{DIG})*|"<"{LETTER}({LETTER}|{DIG})*">")
WHITE [ \t\n]
TICK ("'"{WHITE}*)
diff --git a/gdb/common/rsp-low.c b/gdb/common/rsp-low.c
index 85987f7..3209693 100644
--- a/gdb/common/rsp-low.c
+++ b/gdb/common/rsp-low.c
@@ -50,7 +50,7 @@ tohex (int nib)
static const char hexchars[] = "0123456789abcdef";
-static int
+int
ishex (int ch, int *val)
{
if ((ch >= 'a') && (ch <= 'f'))
diff --git a/gdb/common/rsp-low.h b/gdb/common/rsp-low.h
index 99dc93f..947ee20 100644
--- a/gdb/common/rsp-low.h
+++ b/gdb/common/rsp-low.h
@@ -20,6 +20,10 @@
#ifndef COMMON_RSP_LOW_H
#define COMMON_RSP_LOW_H
+/* FIXME: comment. */
+
+extern int ishex (int ch, int *val);
+
/* Convert hex digit A to a number, or throw an exception. */
extern int fromhex (int a);
diff --git a/gdb/testsuite/gdb.ada/utf8.exp b/gdb/testsuite/gdb.ada/utf8.exp
new file mode 100644
index 0000000..4e5fc01
--- /dev/null
+++ b/gdb/testsuite/gdb.ada/utf8.exp
@@ -0,0 +1,53 @@
+# -*-mode: tcl; coding: utf-8;-*-
+#
+# Copyright 2017 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Test GDB's support for symbols with UTF-8 multi-byte symbol names.
+
+# Actually, we're only testing "Uhh"-encoded (Latin-1 page) names,
+# i.e., upper-half characters. Wider characters have a different
+# Ada encoding which we don't support yet.
+
+load_lib "ada.exp"
+
+# Enable basic use of UTF-8. This is restored automatically for every
+# testcase.
+setenv LC_ALL C.UTF-8
+
+standard_ada_testfile foo
+
+if {[gdb_compile_ada "${srcfile}" "${binfile}" executable {debug}] != "" } {
+ return -1
+}
+
+clean_restart ${testfile}
+
+if ![runto_main] then {
+ perror "Couldn't run ${testfile}"
+ return
+}
+
+# Check printing an expression involving a UTF-8 symbol name.
+gdb_test "print &pck.funcáx" \
+ " = \\(access function \\(a1: integer\\) return integer\\) $hex <pck.funcáx>"
+
+# Check setting a breakpoint in a function with a UTF-8 symbol name.
+gdb_test "b pck.funcáx" "Breakpoint $decimal .*"
+
+# Test running to the breakpoint, confirm GDB prints the function name
+# correctly.
+gdb_test "continue" "Breakpoint $decimal, pck.funcáx \\(i=1\\).*"
+
diff --git a/gdb/testsuite/gdb.ada/utf8/foo.adb b/gdb/testsuite/gdb.ada/utf8/foo.adb
new file mode 100644
index 0000000..f49ab49
--- /dev/null
+++ b/gdb/testsuite/gdb.ada/utf8/foo.adb
@@ -0,0 +1,25 @@
+-- -*-mode: Ada; coding: utf-8;-*-
+
+-- Copyright 2017 Free Software Foundation, Inc.
+--
+-- This program is free software; you can redistribute it and/or modify
+-- it under the terms of the GNU General Public License as published by
+-- the Free Software Foundation; either version 3 of the License, or
+-- (at your option) any later version.
+--
+-- This program is distributed in the hope that it will be useful,
+-- but WITHOUT ANY WARRANTY; without even the implied warranty of
+-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+-- GNU General Public License for more details.
+--
+-- You should have received a copy of the GNU General Public License
+-- along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+pragma Wide_Character_Encoding (UTF8);
+
+with Pck; use Pck;
+procedure Foo is
+ I : Integer := 1;
+begin
+ FuncÁx (I);
+end Foo;
diff --git a/gdb/testsuite/gdb.ada/utf8/pck.adb b/gdb/testsuite/gdb.ada/utf8/pck.adb
new file mode 100644
index 0000000..a4a4962
--- /dev/null
+++ b/gdb/testsuite/gdb.ada/utf8/pck.adb
@@ -0,0 +1,26 @@
+-- -*-mode: Ada; coding: utf-8;-*-
+
+-- Copyright 2017 Free Software Foundation, Inc.
+--
+-- This program is free software; you can redistribute it and/or modify
+-- it under the terms of the GNU General Public License as published by
+-- the Free Software Foundation; either version 3 of the License, or
+-- (at your option) any later version.
+--
+-- This program is distributed in the hope that it will be useful,
+-- but WITHOUT ANY WARRANTY; without even the implied warranty of
+-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+-- GNU General Public License for more details.
+--
+-- You should have received a copy of the GNU General Public License
+-- along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+pragma Wide_Character_Encoding (UTF8);
+
+package body Pck is
+ procedure FuncÁx (I: in out Integer) is
+ begin
+ I := I + 1;
+ end FuncÁx;
+
+end Pck;
diff --git a/gdb/testsuite/gdb.ada/utf8/pck.ads b/gdb/testsuite/gdb.ada/utf8/pck.ads
new file mode 100644
index 0000000..3978ba4
--- /dev/null
+++ b/gdb/testsuite/gdb.ada/utf8/pck.ads
@@ -0,0 +1,22 @@
+-- -*-mode: Ada; coding: utf-8;-*-
+
+-- Copyright 2017 Free Software Foundation, Inc.
+--
+-- This program is free software; you can redistribute it and/or modify
+-- it under the terms of the GNU General Public License as published by
+-- the Free Software Foundation; either version 3 of the License, or
+-- (at your option) any later version.
+--
+-- This program is distributed in the hope that it will be useful,
+-- but WITHOUT ANY WARRANTY; without even the implied warranty of
+-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+-- GNU General Public License for more details.
+--
+-- You should have received a copy of the GNU General Public License
+-- along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+pragma Wide_Character_Encoding (UTF8);
+
+package Pck is
+ procedure FuncÁx (I: in out Integer);
+end Pck;
--
2.5.5
More information about the Gdb
mailing list