This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch hjl/x86/optimize created. glibc-2.25-327-g70eb638


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/x86/optimize, has been created
        at  70eb6382837254746b63abfc99b062118abb6c90 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=70eb6382837254746b63abfc99b062118abb6c90

commit 70eb6382837254746b63abfc99b062118abb6c90
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri May 12 09:37:28 2017 -0700

    Test only a subset of memcpy

diff --git a/benchtests/memcpy_benchmark2.cc b/benchtests/memcpy_benchmark2.cc
index 42d534e..8c5ccf9 100644
--- a/benchtests/memcpy_benchmark2.cc
+++ b/benchtests/memcpy_benchmark2.cc
@@ -122,11 +122,34 @@ std::map<std::string, std::function<void(impl_t *, int, int)>> schemes =
    {"No Cache", BM_memcpy_nocache},
    {"Read Cache", BM_memcpy_readcache}};
 
+const char *checks[]=
+{
+  "__memcpy_avx_unaligned",
+  "__memcpy_avx_unaligned_erms",
+  "__memcpy_sse2_unaligned",
+  "__memcpy_sse2_unaligned_erms",
+  "__memcpy_sse2_unaligned_2_19",
+  "__memcpy_erms",
+  NULL,
+};
+
+bool
+match (const char *name)
+{
+  int i;
+  for (i = 0; checks[i] != NULL; i++)
+    if (strcmp (checks[i], name) == 0)
+      return true;
+  return false;
+}
+
 void test() {
   std::cout << "Size(bytes) Alignment(src/dest) BW(Gbytes/sec)" << std::endl;
   bool first = true;
   FOR_EACH_IMPL (impl, 0)
     {
+      if (!match (impl->name))
+	continue;
       if (!first)
 	std::cout << " ";
       std::cout << impl->name;
@@ -139,6 +162,8 @@ void test() {
       first = true;
       FOR_EACH_IMPL (impl, 0)
 	{
+	  if (!match (impl->name))
+	    continue;
 	  int time = do_timing(scheme.second, impl, size);
 	  if (first)
 	    {

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0e1236cb62c8a5d9ea3119bf14a26365de13d609

commit 0e1236cb62c8a5d9ea3119bf14a26365de13d609
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri May 12 09:26:37 2017 -0700

    Improve output for memcpy_benchmark2.cc

diff --git a/benchtests/memcpy_benchmark2.cc b/benchtests/memcpy_benchmark2.cc
index b160da7..42d534e 100644
--- a/benchtests/memcpy_benchmark2.cc
+++ b/benchtests/memcpy_benchmark2.cc
@@ -124,13 +124,19 @@ std::map<std::string, std::function<void(impl_t *, int, int)>> schemes =
 
 void test() {
   std::cout << "Size(bytes) Alignment(src/dest) BW(Gbytes/sec)" << std::endl;
+  bool first = true;
   FOR_EACH_IMPL (impl, 0)
-    std::cout << impl->name << ",";
+    {
+      if (!first)
+	std::cout << " ";
+      std::cout << impl->name;
+      first = false;
+    }
   std::cout << std::endl;
   for (auto scheme : schemes) {
     std::cout << scheme.first << std::endl;
     for (auto size : size_list) {
-      bool first = true;
+      first = true;
       FOR_EACH_IMPL (impl, 0)
 	{
 	  int time = do_timing(scheme.second, impl, size);

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=950859238238cf2950e92feac53678976acb19b8

commit 950859238238cf2950e92feac53678976acb19b8
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri May 12 09:13:31 2017 -0700

    Add memcpy_benchmark2.cc from memcpy_benchmark.cc
    
    For each scheme, run all implementations on each size.

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 79fab64..09cd7c8 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -103,6 +103,9 @@ ifneq (,$(CXX))
 binaries-bench += $(objpfx)memcpy_benchmark
 CFLAGS-memcpy_benchmark.cc = -fpermissive -Wno-error
 LDLIBS-memcpy_benchmark = -lstdc++
+binaries-bench += $(objpfx)memcpy_benchmark2
+CFLAGS-memcpy_benchmark2.cc = -fpermissive -Wno-error
+LDLIBS-memcpy_benchmark2 = -lstdc++
 endif
 
 # The default duration: 10 seconds.
@@ -130,6 +133,7 @@ cpp-srcs-left := $(binaries-benchset:=.c) $(binaries-bench:=.c) \
 		 $(binaries-bench-malloc:=.c)
 ifneq (,$(CXX))
 cpp-srcs-left += memcpy_benchmark.cc
+cpp-srcs-left += memcpy_benchmark2.cc
 endif
 lib := nonlib
 include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
diff --git a/benchtests/memcpy_benchmark2.cc b/benchtests/memcpy_benchmark2.cc
new file mode 100644
index 0000000..b160da7
--- /dev/null
+++ b/benchtests/memcpy_benchmark2.cc
@@ -0,0 +1,159 @@
+/* Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ======================================================================*/
+
+#if !defined DO_STRING_INLINES
+#undef __USE_STRING_INLINES
+#endif
+
+#include <string.h>
+#include <chrono>
+#include <iostream>
+#include <functional>
+#include <map>
+#include <string>
+
+#define TEST_MAIN
+#define TEST_NAME "memcpy"
+#define TIMEOUT (60 * 60)
+#include "bench-string.h"
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+IMPL (memcpy, 1)
+
+std::chrono::time_point<std::chrono::high_resolution_clock> start;
+std::chrono::time_point<std::chrono::high_resolution_clock> stop;
+size_t bytes;
+
+#define MAX_ALIGN 128
+int src_align, dest_align;
+
+void start_timing() { start = std::chrono::high_resolution_clock::now(); }
+void stop_timing() { stop = std::chrono::high_resolution_clock::now(); }
+
+int size_list[] = {1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19,
+                   1 << 20, 1 << 21, 1 << 22, 1 << 23, 1 << 24, 1 << 25, 1 << 26};
+size_t buffer_size = 1 << 28;
+
+void BM_memcpy_readwritecache(impl_t *impl, int iters, int size) {
+  unsigned char * buf1 = new unsigned char [size];
+  unsigned char * buf2 = new unsigned char [size];
+
+  src_align  = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+  dest_align  = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
+  memset (buf1, 0xa5, size); memset (buf2, 0x5a, size);
+
+  start_timing();
+  for (int i = 0; i < iters; ++i) {
+    CALL(impl, buf2, buf1, size);
+  }
+  stop_timing();
+
+  delete[] buf1; delete[] buf2;
+}
+
+void BM_memcpy_nocache(impl_t *impl, int iters, int size) {
+  unsigned char * buf1 = new unsigned char [buffer_size];
+  unsigned char * buf2 = new unsigned char [buffer_size];
+
+  src_align  = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+  dest_align  = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
+  memset (buf1, 0xa5, buffer_size); memset (buf2, 0x5a, buffer_size);
+
+  size_t offset = 0;
+  start_timing();
+  for (int i = 0; i < iters; ++i) {
+    CALL(impl, buf2 + offset, buf1 + offset, size);
+    offset += std::max(4097, size + 1);
+    if (offset >= buffer_size - size) offset = 0;
+  }
+  stop_timing();
+
+  delete[] buf1; delete[] buf2;
+}
+
+void BM_memcpy_readcache(impl_t *impl, int iters, int size) {
+  unsigned char * buf1 = new unsigned char [size];
+  unsigned char * buf2 = new unsigned char [buffer_size];
+
+  src_align  = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+  dest_align  = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
+  memset (buf1, 0xa5, size); memset (buf2, 0x5a, buffer_size);
+
+  size_t offset = 0;
+  start_timing();
+  for (int i = 0; i < iters; ++i) {
+    CALL(impl, buf2 + offset, buf1, size);
+    offset += std::max(4097, size + 1);
+    if (offset >= buffer_size - size) offset = 0;
+  }
+  stop_timing();
+
+  delete[] buf1; delete[] buf2;
+}
+
+double do_timing(std::function<void(impl_t *, int, int)> &fn, impl_t *impl, int size) {
+  int iters = 2; double time = 0;
+  while (time < 500) {
+    iters *= 3;
+    fn(impl, iters, size);
+    time = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
+    bytes = (2UL * iters * size);
+  }
+  return time;
+}
+
+std::map<std::string, std::function<void(impl_t *, int, int)>> schemes =
+  {{"Read and Write Cache", BM_memcpy_readwritecache},
+   {"No Cache", BM_memcpy_nocache},
+   {"Read Cache", BM_memcpy_readcache}};
+
+void test() {
+  std::cout << "Size(bytes) Alignment(src/dest) BW(Gbytes/sec)" << std::endl;
+  FOR_EACH_IMPL (impl, 0)
+    std::cout << impl->name << ",";
+  std::cout << std::endl;
+  for (auto scheme : schemes) {
+    std::cout << scheme.first << std::endl;
+    for (auto size : size_list) {
+      bool first = true;
+      FOR_EACH_IMPL (impl, 0)
+	{
+	  int time = do_timing(scheme.second, impl, size);
+	  if (first)
+	    {
+	      first = false;
+	      printf("%d %d/%-d %.2f",
+		     size, src_align, dest_align,
+		     (bytes * 1000L / time) / 1e9);
+	    }
+	  else
+	    printf(" %.2f",
+		   (bytes * 1000L / time) / 1e9);
+	}
+      printf ("\n");
+    }
+    std::cout << "----------------\n";
+  }
+}
+
+int test_main(void) {
+  test_init ();
+  test ();
+  return 0;
+}
+#include <support/test-driver.c>

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=bf77d312bb479c9182863d193cfdea55e98ecf12

commit bf77d312bb479c9182863d193cfdea55e98ecf12
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu May 11 10:45:25 2017 -0700

    memcpy_benchmark.cc: Set TIMEOUT to 1 hour

diff --git a/benchtests/memcpy_benchmark.cc b/benchtests/memcpy_benchmark.cc
index fb1d28a..f85414e 100644
--- a/benchtests/memcpy_benchmark.cc
+++ b/benchtests/memcpy_benchmark.cc
@@ -26,6 +26,7 @@
 
 #define TEST_MAIN
 #define TEST_NAME "memcpy"
+#define TIMEOUT (60 * 60)
 #include "bench-string.h"
 
 typedef char *(*proto_t) (char *, const char *, size_t);

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3155c0d0a1787c1fb70ec130ce00c2dc334b049d

commit 3155c0d0a1787c1fb70ec130ce00c2dc334b049d
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu May 11 09:01:15 2017 -0700

    Print alignments of source and destination

diff --git a/benchtests/memcpy_benchmark.cc b/benchtests/memcpy_benchmark.cc
index 979373c..fb1d28a 100644
--- a/benchtests/memcpy_benchmark.cc
+++ b/benchtests/memcpy_benchmark.cc
@@ -35,6 +35,9 @@ std::chrono::time_point<std::chrono::high_resolution_clock> start;
 std::chrono::time_point<std::chrono::high_resolution_clock> stop;
 size_t bytes;
 
+#define MAX_ALIGN 128
+int src_align, dest_align;
+
 void start_timing() { start = std::chrono::high_resolution_clock::now(); }
 void stop_timing() { stop = std::chrono::high_resolution_clock::now(); }
 
@@ -46,6 +49,9 @@ void BM_memcpy_readwritecache(impl_t *impl, int iters, int size) {
   unsigned char * buf1 = new unsigned char [size];
   unsigned char * buf2 = new unsigned char [size];
 
+  src_align  = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+  dest_align  = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
   memset (buf1, 0xa5, size); memset (buf2, 0x5a, size);
 
   start_timing();
@@ -61,6 +67,9 @@ void BM_memcpy_nocache(impl_t *impl, int iters, int size) {
   unsigned char * buf1 = new unsigned char [buffer_size];
   unsigned char * buf2 = new unsigned char [buffer_size];
 
+  src_align  = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+  dest_align  = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
   memset (buf1, 0xa5, buffer_size); memset (buf2, 0x5a, buffer_size);
 
   size_t offset = 0;
@@ -79,6 +88,9 @@ void BM_memcpy_readcache(impl_t *impl, int iters, int size) {
   unsigned char * buf1 = new unsigned char [size];
   unsigned char * buf2 = new unsigned char [buffer_size];
 
+  src_align  = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+  dest_align  = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
   memset (buf1, 0xa5, size); memset (buf2, 0x5a, buffer_size);
 
   size_t offset = 0;
@@ -110,12 +122,14 @@ std::map<std::string, std::function<void(impl_t *, int, int)>> schemes =
    {"Read Cache", BM_memcpy_readcache}};
 
 void test(impl_t *impl) {
-  std::cout << "      Size (bytes) Time (msec) BW (Gbytes/sec)" << std::endl;
+  std::cout << "      Size (bytes) Alignment (src/dest) Time (msec) BW (Gbytes/sec)" << std::endl;
   for (auto scheme : schemes) {
     std::cout << scheme.first << std::endl;
     for (auto size : size_list) {
       int time = do_timing(scheme.second, impl, size);
-      printf("%12d %10d %10.2f\n", size, time, (bytes * 1000L / time) / 1e9);
+      printf("%12d %15d/%-7d %10d %10.2f\n",
+	     size, src_align, dest_align, time,
+	     (bytes * 1000L / time) / 1e9);
     }
     std::cout << "----------------\n";
   }

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fdf979dd403186a3bc8129a4c5b1d9bf2399f74b

commit fdf979dd403186a3bc8129a4c5b1d9bf2399f74b
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed May 10 16:05:09 2017 -0700

    Integrate memcpy_benchmark.cc with glibc benchtests

diff --git a/benchtests/memcpy_benchmark.cc b/benchtests/memcpy_benchmark.cc
index 51dff26..979373c 100644
--- a/benchtests/memcpy_benchmark.cc
+++ b/benchtests/memcpy_benchmark.cc
@@ -24,6 +24,13 @@
 #include <map>
 #include <string>
 
+#define TEST_MAIN
+#define TEST_NAME "memcpy"
+#include "bench-string.h"
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+IMPL (memcpy, 1)
+
 std::chrono::time_point<std::chrono::high_resolution_clock> start;
 std::chrono::time_point<std::chrono::high_resolution_clock> stop;
 size_t bytes;
@@ -35,7 +42,7 @@ int size_list[] = {1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19,
                    1 << 20, 1 << 21, 1 << 22, 1 << 23, 1 << 24, 1 << 25, 1 << 26};
 size_t buffer_size = 1 << 28;
 
-void BM_memcpy_readwritecache(int iters, int size) {
+void BM_memcpy_readwritecache(impl_t *impl, int iters, int size) {
   unsigned char * buf1 = new unsigned char [size];
   unsigned char * buf2 = new unsigned char [size];
 
@@ -43,14 +50,14 @@ void BM_memcpy_readwritecache(int iters, int size) {
 
   start_timing();
   for (int i = 0; i < iters; ++i) {
-    memcpy(buf2, buf1, size);
+    CALL(impl, buf2, buf1, size);
   }
   stop_timing();
 
   delete[] buf1; delete[] buf2;
 }
 
-void BM_memcpy_nocache(int iters, int size) {
+void BM_memcpy_nocache(impl_t *impl, int iters, int size) {
   unsigned char * buf1 = new unsigned char [buffer_size];
   unsigned char * buf2 = new unsigned char [buffer_size];
 
@@ -59,7 +66,7 @@ void BM_memcpy_nocache(int iters, int size) {
   size_t offset = 0;
   start_timing();
   for (int i = 0; i < iters; ++i) {
-    memcpy(buf2 + offset, buf1 + offset, size);
+    CALL(impl, buf2 + offset, buf1 + offset, size);
     offset += std::max(4097, size + 1);
     if (offset >= buffer_size - size) offset = 0;
   }
@@ -68,7 +75,7 @@ void BM_memcpy_nocache(int iters, int size) {
   delete[] buf1; delete[] buf2;
 }
 
-void BM_memcpy_readcache(int iters, int size) {
+void BM_memcpy_readcache(impl_t *impl, int iters, int size) {
   unsigned char * buf1 = new unsigned char [size];
   unsigned char * buf2 = new unsigned char [buffer_size];
 
@@ -77,7 +84,7 @@ void BM_memcpy_readcache(int iters, int size) {
   size_t offset = 0;
   start_timing();
   for (int i = 0; i < iters; ++i) {
-    memcpy(buf2 + offset, buf1, size);
+    CALL(impl, buf2 + offset, buf1, size);
     offset += std::max(4097, size + 1);
     if (offset >= buffer_size - size) offset = 0;
   }
@@ -86,30 +93,42 @@ void BM_memcpy_readcache(int iters, int size) {
   delete[] buf1; delete[] buf2;
 }
 
-double do_timing(std::function<void(int, int)> &fn, int size) {
+double do_timing(std::function<void(impl_t *, int, int)> &fn, impl_t *impl, int size) {
   int iters = 2; double time = 0;
   while (time < 500) {
     iters *= 3;
-    fn(iters, size);
+    fn(impl, iters, size);
     time = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
     bytes = (2UL * iters * size);
   }
   return time;
 }
 
-std::map<std::string, std::function<void(int, int)>> schemes =
+std::map<std::string, std::function<void(impl_t *, int, int)>> schemes =
   {{"Read and Write Cache", BM_memcpy_readwritecache},
    {"No Cache", BM_memcpy_nocache},
    {"Read Cache", BM_memcpy_readcache}};
 
-int main(void) {
+void test(impl_t *impl) {
   std::cout << "      Size (bytes) Time (msec) BW (Gbytes/sec)" << std::endl;
   for (auto scheme : schemes) {
     std::cout << scheme.first << std::endl;
     for (auto size : size_list) {
-      int time = do_timing(scheme.second, size);
+      int time = do_timing(scheme.second, impl, size);
       printf("%12d %10d %10.2f\n", size, time, (bytes * 1000L / time) / 1e9);
     }
     std::cout << "----------------\n";
   }
-}
\ No newline at end of file
+  return 0;
+}
+
+int test_main(void) {
+  test_init ();
+  FOR_EACH_IMPL (impl, 0)
+    {
+      std::cout << impl->name << std::endl;
+      test (impl);
+    }
+  return 0;
+}
+#include <support/test-driver.c>

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=61d082c6a5d6d52cb3b22379f5a80e09bf62cb29

commit 61d082c6a5d6d52cb3b22379f5a80e09bf62cb29
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed May 10 15:25:54 2017 -0700

    Build memcpy_benchmark in benchtests
    
    Compile memcpy_benchmark.cc with -fpermissive -Wno-error to silence GCC.

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 7f5fda5..79fab64 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -99,6 +99,12 @@ binaries-bench := $(addprefix $(objpfx)bench-,$(bench))
 binaries-benchset := $(addprefix $(objpfx)bench-,$(benchset))
 binaries-bench-malloc := $(addprefix $(objpfx)bench-,$(bench-malloc))
 
+ifneq (,$(CXX))
+binaries-bench += $(objpfx)memcpy_benchmark
+CFLAGS-memcpy_benchmark.cc = -fpermissive -Wno-error
+LDLIBS-memcpy_benchmark = -lstdc++
+endif
+
 # The default duration: 10 seconds.
 ifndef BENCH_DURATION
 BENCH_DURATION := 10
@@ -122,6 +128,9 @@ endif
 # for all these modules.
 cpp-srcs-left := $(binaries-benchset:=.c) $(binaries-bench:=.c) \
 		 $(binaries-bench-malloc:=.c)
+ifneq (,$(CXX))
+cpp-srcs-left += memcpy_benchmark.cc
+endif
 lib := nonlib
 include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
 
diff --git a/string/memcpy_benchmark.cc b/benchtests/memcpy_benchmark.cc
similarity index 100%
rename from string/memcpy_benchmark.cc
rename to benchtests/memcpy_benchmark.cc

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d6eb34753e751501727dd2063a2ba82c4e3f6916

commit d6eb34753e751501727dd2063a2ba82c4e3f6916
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed May 10 14:30:08 2017 -0700

    Import memcpy_benchmark.cc
    
    From
    
    https://gist.github.com/ekelsen/b66cc085eb39f0495b57679cdb1874fa

diff --git a/string/memcpy_benchmark.cc b/string/memcpy_benchmark.cc
new file mode 100644
index 0000000..51dff26
--- /dev/null
+++ b/string/memcpy_benchmark.cc
@@ -0,0 +1,115 @@
+/* Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ======================================================================*/
+
+#if !defined DO_STRING_INLINES
+#undef __USE_STRING_INLINES
+#endif
+
+#include <string.h>
+#include <chrono>
+#include <iostream>
+#include <functional>
+#include <map>
+#include <string>
+
+std::chrono::time_point<std::chrono::high_resolution_clock> start;
+std::chrono::time_point<std::chrono::high_resolution_clock> stop;
+size_t bytes;
+
+void start_timing() { start = std::chrono::high_resolution_clock::now(); }
+void stop_timing() { stop = std::chrono::high_resolution_clock::now(); }
+
+int size_list[] = {1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19,
+                   1 << 20, 1 << 21, 1 << 22, 1 << 23, 1 << 24, 1 << 25, 1 << 26};
+size_t buffer_size = 1 << 28;
+
+void BM_memcpy_readwritecache(int iters, int size) {
+  unsigned char * buf1 = new unsigned char [size];
+  unsigned char * buf2 = new unsigned char [size];
+
+  memset (buf1, 0xa5, size); memset (buf2, 0x5a, size);
+
+  start_timing();
+  for (int i = 0; i < iters; ++i) {
+    memcpy(buf2, buf1, size);
+  }
+  stop_timing();
+
+  delete[] buf1; delete[] buf2;
+}
+
+void BM_memcpy_nocache(int iters, int size) {
+  unsigned char * buf1 = new unsigned char [buffer_size];
+  unsigned char * buf2 = new unsigned char [buffer_size];
+
+  memset (buf1, 0xa5, buffer_size); memset (buf2, 0x5a, buffer_size);
+
+  size_t offset = 0;
+  start_timing();
+  for (int i = 0; i < iters; ++i) {
+    memcpy(buf2 + offset, buf1 + offset, size);
+    offset += std::max(4097, size + 1);
+    if (offset >= buffer_size - size) offset = 0;
+  }
+  stop_timing();
+
+  delete[] buf1; delete[] buf2;
+}
+
+void BM_memcpy_readcache(int iters, int size) {
+  unsigned char * buf1 = new unsigned char [size];
+  unsigned char * buf2 = new unsigned char [buffer_size];
+
+  memset (buf1, 0xa5, size); memset (buf2, 0x5a, buffer_size);
+
+  size_t offset = 0;
+  start_timing();
+  for (int i = 0; i < iters; ++i) {
+    memcpy(buf2 + offset, buf1, size);
+    offset += std::max(4097, size + 1);
+    if (offset >= buffer_size - size) offset = 0;
+  }
+  stop_timing();
+
+  delete[] buf1; delete[] buf2;
+}
+
+double do_timing(std::function<void(int, int)> &fn, int size) {
+  int iters = 2; double time = 0;
+  while (time < 500) {
+    iters *= 3;
+    fn(iters, size);
+    time = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
+    bytes = (2UL * iters * size);
+  }
+  return time;
+}
+
+std::map<std::string, std::function<void(int, int)>> schemes =
+  {{"Read and Write Cache", BM_memcpy_readwritecache},
+   {"No Cache", BM_memcpy_nocache},
+   {"Read Cache", BM_memcpy_readcache}};
+
+int main(void) {
+  std::cout << "      Size (bytes) Time (msec) BW (Gbytes/sec)" << std::endl;
+  for (auto scheme : schemes) {
+    std::cout << scheme.first << std::endl;
+    for (auto size : size_list) {
+      int time = do_timing(scheme.second, size);
+      printf("%12d %10d %10.2f\n", size, time, (bytes * 1000L / time) / 1e9);
+    }
+    std::cout << "----------------\n";
+  }
+}
\ No newline at end of file

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=54fc2ab399320df089e5e98bd2b4ffc5556eaace

commit 54fc2ab399320df089e5e98bd2b4ffc5556eaace
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed May 10 10:21:08 2017 -0700

    x86-64: Restore memcpy-sse2-unaligned.S from glibc 2.19

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2a30538..5ed4e74 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memmove-avx512-unaligned-erms \
 		   memset-avx2-unaligned-erms \
 		   memset-avx512-unaligned-erms \
-		   strlen-sse4
+		   strlen-sse4 memcpy-sse2-unaligned
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1604678..653716e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -353,6 +353,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
 			      __memcpy_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+			      __memcpy_sse2_unaligned_2_19)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000..1d05c2c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,171 @@
+/* memcpy with unaligned loads
+   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+
+ENTRY(__memcpy_sse2_unaligned_2_19)
+	movq	%rsi, %rax
+	leaq	(%rdx,%rdx), %rcx
+	subq	%rdi, %rax
+	subq	%rdx, %rax
+	cmpq	%rcx, %rax
+	jb	L(overlapping)
+	cmpq	$16, %rdx
+	jbe	L(less_16)
+	movdqu	(%rsi), %xmm8
+	cmpq	$32, %rdx
+	movdqu	%xmm8, (%rdi)
+	movdqu	-16(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -16(%rdi,%rdx)
+	ja	.L31
+L(return):
+	movq	%rdi, %rax
+	ret
+	.p2align 4,,10
+	.p2align 4
+.L31:
+	movdqu	16(%rsi), %xmm8
+	cmpq	$64, %rdx
+	movdqu	%xmm8, 16(%rdi)
+	movdqu	-32(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -32(%rdi,%rdx)
+	jbe	L(return)
+	movdqu	32(%rsi), %xmm8
+	cmpq	$128, %rdx
+	movdqu	%xmm8, 32(%rdi)
+	movdqu	-48(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -48(%rdi,%rdx)
+	movdqu	48(%rsi), %xmm8
+	movdqu	%xmm8, 48(%rdi)
+	movdqu	-64(%rsi,%rdx), %xmm8
+	movdqu	%xmm8, -64(%rdi,%rdx)
+	jbe	L(return)
+	leaq	64(%rdi), %rcx
+	addq	%rdi, %rdx
+	andq	$-64, %rdx
+	andq	$-64, %rcx
+	movq	%rcx, %rax
+	subq	%rdi, %rax
+	addq	%rax, %rsi
+	cmpq	%rdx, %rcx
+	je	L(return)
+	movq	%rsi, %r10
+	subq	%rcx, %r10
+	leaq	16(%r10), %r9
+	leaq	32(%r10), %r8
+	leaq	48(%r10), %rax
+	.p2align 4,,10
+	.p2align 4
+L(loop):
+	movdqu	(%rcx,%r10), %xmm8
+	movdqa	%xmm8, (%rcx)
+	movdqu	(%rcx,%r9), %xmm8
+	movdqa	%xmm8, 16(%rcx)
+	movdqu	(%rcx,%r8), %xmm8
+	movdqa	%xmm8, 32(%rcx)
+	movdqu	(%rcx,%rax), %xmm8
+	movdqa	%xmm8, 48(%rcx)
+	addq	$64, %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+	jmp	L(return)
+L(overlapping):
+	cmpq	%rsi, %rdi
+	jae	.L3
+	testq	%rdx, %rdx
+	.p2align 4,,5
+	je	L(return)
+	movq	%rdx, %r9
+	leaq	16(%rsi), %rcx
+	leaq	16(%rdi), %r8
+	shrq	$4, %r9
+	movq	%r9, %rax
+	salq	$4, %rax
+	cmpq	%rcx, %rdi
+	setae	%cl
+	cmpq	%r8, %rsi
+	setae	%r8b
+	orl	%r8d, %ecx
+	cmpq	$15, %rdx
+	seta	%r8b
+	testb	%r8b, %cl
+	je	.L16
+	testq	%rax, %rax
+	je	.L16
+	xorl	%ecx, %ecx
+	xorl	%r8d, %r8d
+.L7:
+	movdqu	(%rsi,%rcx), %xmm8
+	addq	$1, %r8
+	movdqu	%xmm8, (%rdi,%rcx)
+	addq	$16, %rcx
+	cmpq	%r8, %r9
+	ja	.L7
+	cmpq	%rax, %rdx
+	je	L(return)
+.L21:
+	movzbl	(%rsi,%rax), %ecx
+	movb	%cl, (%rdi,%rax)
+	addq	$1, %rax
+	cmpq	%rax, %rdx
+	ja	.L21
+	jmp	L(return)
+L(less_16):
+	testb	$24, %dl
+	jne	L(between_9_16)
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(between_5_8)
+	testq	%rdx, %rdx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%rsi), %eax
+	testb	$2, %dl
+	movb	%al, (%rdi)
+	je	L(return)
+	movzwl	-2(%rsi,%rdx), %eax
+	movw	%ax, -2(%rdi,%rdx)
+	jmp	L(return)
+.L3:
+	leaq	-1(%rdx), %rax
+	.p2align 4,,10
+	.p2align 4
+.L11:
+	movzbl	(%rsi,%rax), %edx
+	movb	%dl, (%rdi,%rax)
+	subq	$1, %rax
+	jmp	.L11
+L(between_9_16):
+	movq	(%rsi), %rax
+	movq	%rax, (%rdi)
+	movq	-8(%rsi,%rdx), %rax
+	movq	%rax, -8(%rdi,%rdx)
+	jmp	L(return)
+.L16:
+	xorl	%eax, %eax
+	jmp	.L21
+L(between_5_8):
+	movl	(%rsi), %eax
+	movl	%eax, (%rdi)
+	movl	-4(%rsi,%rdx), %eax
+	movl	%eax, -4(%rdi,%rdx)
+	jmp	L(return)
+END(__memcpy_sse2_unaligned_2_19)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b3c4a1de7cf883fcb1e96ed6c4519cd758e0f846

commit b3c4a1de7cf883fcb1e96ed6c4519cd758e0f846
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon May 1 08:32:22 2017 -0700

    x86-64: Restore the old SSE4 strlen

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3736f54..2a30538 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -22,7 +22,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memmove-avx-unaligned-erms \
 		   memmove-avx512-unaligned-erms \
 		   memset-avx2-unaligned-erms \
-		   memset-avx512-unaligned-erms
+		   memset-avx512-unaligned-erms \
+		   strlen-sse4
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d..1604678 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -410,6 +410,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
 
+  /* Support sysdeps/x86_64/multiarch/strlen.S.  */
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE4_2),
+			      __strlen_sse42)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, strlen))
+
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
 	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
new file mode 100644
index 0000000..8d685df
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse4.S
@@ -0,0 +1,84 @@
+/* strlen with SSE4
+   Copyright (C) 2009-2013 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)
+	pxor	%xmm1, %xmm1
+	movl	%edi, %ecx
+	movq	%rdi, %r8
+	andq	$~15, %rdi
+	xor	%edi, %ecx
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb %xmm1, %edx
+	shrl	%cl, %edx
+	shll	%cl, %edx
+	andl	%edx, %edx
+	jnz	L(less16bytes)
+	pxor	%xmm1, %xmm1
+
+	.p2align 4
+L(more64bytes_loop):
+	pcmpistri $0x08, 16(%rdi), %xmm1
+	jz	L(more32bytes)
+
+	pcmpistri $0x08, 32(%rdi), %xmm1
+	jz	L(more48bytes)
+
+	pcmpistri $0x08, 48(%rdi), %xmm1
+	jz	L(more64bytes)
+
+	add	$64, %rdi
+	pcmpistri $0x08, (%rdi), %xmm1
+	jnz	L(more64bytes_loop)
+	leaq	(%rdi,%rcx), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more32bytes):
+	leaq	16(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more48bytes):
+	leaq	32(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more64bytes):
+	leaq	48(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(less16bytes):
+	subq	%r8, %rdi
+	bsfl	%edx, %eax
+	addq	%rdi, %rax
+	ret
+
+END (__strlen_sse42)
+
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f81cb4625de7ef359acd870a42a21984e7a2691f

commit f81cb4625de7ef359acd870a42a21984e7a2691f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed May 10 16:02:56 2017 -0700

    Add __BEGIN_DECLS and __END_DECLS for C++
    
    Add __BEGIN_DECLS and __END_DECLS to support C++.  IFUNC_IMPL_ADD and
    IFUNC_IMPL are used internally in libc.  They shouldn't be used in any
    programs.
    
    	* include/ifunc-impl-list.h: Add __BEGIN_DECLS and __END_DECLS.
    	(IFUNC_IMPL_ADD, IFUNC_IMPL): Define only if __cplusplus isn't
    	defined.

diff --git a/include/ifunc-impl-list.h b/include/ifunc-impl-list.h
index 22ca05f..7d53f11 100644
--- a/include/ifunc-impl-list.h
+++ b/include/ifunc-impl-list.h
@@ -22,6 +22,8 @@
 #include <stdbool.h>
 #include <stddef.h>
 
+__BEGIN_DECLS
+
 struct libc_ifunc_impl
 {
   /* The name of function to be tested.  */
@@ -32,20 +34,25 @@ struct libc_ifunc_impl
   bool usable;
 };
 
+#ifndef __cplusplus
+/* NB: IFUNC_IMPL_ADD and IFUNC_IMPL are used internally in libc.  They
+   shouldn't be used in any programs.  */
+
 /* Add an IFUNC implementation, IMPL, for function FUNC, to ARRAY with
    USABLE at index I and advance I by one.  */
-#define IFUNC_IMPL_ADD(array, i, func, usable, impl) \
+# define IFUNC_IMPL_ADD(array, i, func, usable, impl) \
   extern __typeof (func) impl attribute_hidden; \
   (array)[i++] = (struct libc_ifunc_impl) { #impl, (void (*) (void)) impl, (usable) };
 
 /* Return the number of IFUNC implementations, N, for function FUNC if
    string NAME matches FUNC.  */
-#define IFUNC_IMPL(n, name, func, ...) \
+# define IFUNC_IMPL(n, name, func, ...) \
   if (strcmp (name, #func) == 0) \
     { \
       __VA_ARGS__; \
       return n; \
     }
+#endif /* __cplusplus  */
 
 /* Fill ARRAY of MAX elements with IFUNC implementations for function
    NAME and return the number of valid entries.  */
@@ -53,4 +60,6 @@ extern size_t __libc_ifunc_impl_list (const char *name,
 				      struct libc_ifunc_impl *array,
 				      size_t max);
 
+__END_DECLS
+
 #endif /* ifunc-impl-list.h */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=251138cde183f23924716f8ad983766a004633e2

commit 251138cde183f23924716f8ad983766a004633e2
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed May 10 14:54:22 2017 -0700

    Check __cplusplus in addition to _ISOMAC
    
    When compiling for C++, only include <wctype/wctype.h> and nothing else.
    
    	* include/wctype.h: Check __cplusplus in addition to _ISOMAC.

diff --git a/include/wctype.h b/include/wctype.h
index a71b103..74f9f47 100644
--- a/include/wctype.h
+++ b/include/wctype.h
@@ -1,6 +1,6 @@
 #ifndef _WCTYPE_H
 
-#ifndef _ISOMAC
+#if !defined _ISOMAC && !defined __cplusplus
 /* We try to get wint_t from <stddef.h>, but not all GCC versions define it
    there.  So define it ourselves if it remains undefined.  */
 # define __need_wint_t
@@ -38,7 +38,7 @@ libc_hidden_proto (towupper)
 
 #include <wctype/wctype.h>
 
-#ifndef _ISOMAC
+#if !defined _ISOMAC && !defined __cplusplus
 /* Internal interfaces.  */
 extern int __iswspace (wint_t __wc);
 extern int __iswctype (wint_t __wc, wctype_t __desc);

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]