This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

memcmp-sse4.S EqualHappy bug


Hello everyone,

last week I run into some problem because of an erratic behavior of
memcmp/bcmp that returns "0" even thought the two regions are never
equal at any given time if using the memcmp-sse4.S version (the
default in most recent Linux on x86-64 hardware with sse4).

My bug was that I was getting the zeropage sometime erratically mapped
(that was a bug in the testsuite in userland but I didn't fix that
yet) so I added a memcmp(page, zeropage, page_size) in the code to
detect when it would happen. Unfortunately this memcmp started to
report all zero even when the "page" was not a zeropage, and it kept
reporting it even after I actually fixed such a bug in the
testsuite.

The original testcase was here (not guaranteed permalink but it should
work for a long while):

https://git.kernel.org/cgit/linux/kernel/git/andrea/aa.git/tree/tools/testing/selftests/vm/userfaultfd.c?h=userfault21

I had a contended pthread_mutex_t at the start of the page given as
parameter to memcmp (that was changing all the time), immediately
followed by an unsigned long long counter which was never zero at any
given time.

Now I reduced the testcase to the minimum and I appended it at the end
of this email.

By the C standard I assume this is not a bug because the C language
assumes everything is single threaded (and if I put a mutex lock
around the memcmp to prevent the memory to change under it, of course
it works correctly then). However having memcmp returning 0 when the
two areas can never zero at any given time (no matter part of the
memory compared is changing) looks risky. In my case I was testing
userfaultfd so I had to first think at all sort of tlb flushes or race
conditions in the kernel before I considered the possibility of memcmp
being "buggy" (buggy not by C standard terms but still...). I started
to consider it was memcmp failing after I checked the counter by hand
to be non zero before starting the memcmp.

The problem is that the unrolled loop using sse4 only do ptest so they
can't return positive negative values directly. When the unrolled loop
breaks out it jumps to an offset that repeat the test re-reading the
memory (but this time it will get an equal copy) and it will return 0
without continuing comparing the rest of the data.

I think it's possible to fix this without having to alter the fast
path of the sse4 unrolled loops. The rsi/rdi pointers of the two areas
being compared are already adjusted perfectly when jumping to the
right offset, to prepare for the non-sse4 final comparison.

In order to fix this problem it should be enough to adjust rdx length
argument as well in addition to the rsi/rdi pointers and to check if
rdx is going down to zero with that last final comparison that was
equal. If rdx isn't at the end, it should be enough to start of memcmp
again from scratch. So it will continue. There's no need to extract
the "different" value read in the two xmm with pextrq, just letting it
continue comparing if it found equal data in the last comparison
without sse4 is enough. If the data has changed under it, it doesn't
matter whatever garbage it read in the sse4 unrolled loop.

L(8bytes):
	mov	-8(%rdi), %rax
	mov	-8(%rsi), %rcx
	cmp	%rax, %rcx
	jne	L(diffin8bytes)
	^^^^^^^^ here check %rdx if at the end and restart if not
	xor	%eax, %eax
	ret

There are other places like "8bytes" above to adjust.

The propagation of rdx along rdi/rsi should be something along these
lines (broken/incomplete):

diff --git a/equalhappybug/memcmp-sse4.S b/equalhappybug/memcmp-sse4.S
index 0de80df..5812792 100644
--- a/equalhappybug/memcmp-sse4.S
+++ b/equalhappybug/memcmp-sse4.S
@@ -205,7 +205,6 @@ L(less32bytesin128):
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
 
 L(less512bytes):
-	sub	$256, %rdx
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
 	ptest	%xmm2, %xmm0
@@ -712,34 +706,41 @@ L(L2_L3_aligned_128bytes_loop):
 
 	.p2align 4
 L(64bytesormore_loop_end):
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 	ptest	%xmm2, %xmm0
 	jnc	L(16bytes)
 
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 	ptest	%xmm3, %xmm0
 	jnc	L(16bytes)
 
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 	ptest	%xmm4, %xmm0
 	jnc	L(16bytes)
 
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 	jmp	L(16bytes)
 
 L(256bytesin256):
+	sub	$256, %rdx
 	add	$256, %rdi
 	add	$256, %rsi
 	jmp	L(16bytes)
 L(240bytesin256):
+	sub	$240, %rdx
 	add	$240, %rdi
 	add	$240, %rsi
 	jmp	L(16bytes)
 L(224bytesin256):
+	sub	$224, %rdx
 	add	$224, %rdi
 	add	$224, %rsi
 	jmp	L(16bytes)
@@ -748,45 +749,56 @@ L(208bytesin256):
 	add	$208, %rsi
 	jmp	L(16bytes)
 L(192bytesin256):
+	sub	$192, %rdx
 	add	$192, %rdi
 	add	$192, %rsi
 	jmp	L(16bytes)
 L(176bytesin256):
+	sub	$176, %rdx
 	add	$176, %rdi
 	add	$176, %rsi
 	jmp	L(16bytes)
 L(160bytesin256):
+	sub	$160, %rdx
 	add	$160, %rdi
 	add	$160, %rsi
 	jmp	L(16bytes)
 L(144bytesin256):
+	sub	$144, %rdx
 	add	$144, %rdi
 	add	$144, %rsi
 	jmp	L(16bytes)
 L(128bytesin256):
+	sub	$128, %rdx
 	add	$128, %rdi
 	add	$128, %rsi
 	jmp	L(16bytes)
 L(112bytesin256):
+	sub	$112, %rdx
 	add	$112, %rdi
 	add	$112, %rsi
 	jmp	L(16bytes)
 L(96bytesin256):
+	sub	$96, %rdx
 	add	$96, %rdi
 	add	$96, %rsi
 	jmp	L(16bytes)
 L(80bytesin256):
+	sub	$80, %rdx
 	add	$80, %rdi
 	add	$80, %rsi
 	jmp	L(16bytes)
 L(64bytesin256):
+	sub	$64, %rdx
 	add	$64, %rdi
 	add	$64, %rsi
 	jmp	L(16bytes)
 L(48bytesin256):
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 L(32bytesin256):
+	sub	$16, %rdx
 	add	$16, %rdi
 	add	$16, %rsi
 L(16bytesin256):


Before doing any more work on this I'd first like some confirmation
that it is good idea to fix this (i.e. that the current unexpected
behavior is not intentional). And if yes, help would be appreciated.

Thanks,
Andrea

==
/*
 * EqualHappy memcmp/bcmp bug reproducer
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 *
 *  gcc -Wall -O2 -o equalhappybug equalhappybug.c -lpthread
 *  gcc -DWORKAROUND -Wall -O2 -o equalhappybug equalhappybug.c -lpthread
 */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

static long nr_cpus, page_size;
#define NON_ZERO_OFFSET 16U
static char *page, *zeropage;
volatile int finished;

static int my_bcmp(char *str1, char *str2, unsigned long n)
{
	unsigned long i;
	for (i = 0; i < n; i++)
		if (str1[i] != str2[i])
			return 1;
	return 0;
}

static void *locking_thread(void *arg)
{
	char val;

	while (!finished) {
		val = page[NON_ZERO_OFFSET];
		if (val != 1)
			fprintf(stderr,
				"wrong value %d\n", val), exit(1);

#ifdef WORKAROUND
		if (!my_bcmp(page, zeropage, page_size))
			fprintf(stderr,
				"page all zero thread %p\n", page), exit(1);
#else
		unsigned long loops = 0;
		//while (!bcmp(page, zeropage, page_size))
		while (!memcmp(page, zeropage, page_size)) {
			loops += 1;
			asm volatile("" : : : "memory");
		}
		if (loops)
			fprintf(stderr,
				"page all zero thread %p %lu\n"
				"EqualHappy bug found\n",
				page, loops), exit(1);
#endif

		*page = 0;
		asm volatile("" : : : "memory");
		*page = 1;
	}

	return NULL;
}

static int stress(void)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];

	if (!my_bcmp(page, zeropage, page_size))
		fprintf(stderr,
			"page all zero thread %p\n", page), exit(1);

	printf("Test started\n");

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_create(&locking_threads[cpu], NULL,
				   locking_thread, NULL))
			return 1;

	sleep(10);

	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	printf("ZeroHappy bug not found\n");
	return 0;
}

static int userfaultfd_stress(void)
{
	void *tmp_area;

	if (posix_memalign(&tmp_area, page_size, page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	page = tmp_area;
	memset(page, 0xff, page_size);
	bzero(page, NON_ZERO_OFFSET);
	page[NON_ZERO_OFFSET] = 1;

	if (posix_memalign(&tmp_area, page_size, page_size)) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	zeropage = tmp_area;
	bzero(zeropage, page_size);

	return stress();
}

int main(int argc, char **argv)
{
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (nr_cpus < 0)
		perror("_SC_NPROCESSORS_ONLN"), exit(1);
	page_size = sysconf(_SC_PAGE_SIZE);
	if (page_size < 0)
		perror("_SC_NPROCESSORS_ONLN"), exit(1);
	if (NON_ZERO_OFFSET >= page_size)
		fprintf(stderr, "Impossible to run this test\n"), exit(1);
	return userfaultfd_stress();
}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]