Patch attached to bug 25933 (-138 / +8 lines), shown as a raw unified diff:
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -489,144 +489,6 @@ L(test_3_vec):
 	VZEROUPPER
 	ret
 
-	.p2align 4
-L(loop_cross_page):
-	xorl	%r10d, %r10d
-	movq	%rdx, %rcx
-	/* Align load via RDX.  We load the extra ECX bytes which should
-	   be ignored.  */
-	andl	$((VEC_SIZE * 4) - 1), %ecx
-	/* R10 is -RCX.  */
-	subq	%rcx, %r10
-
-	/* This works only if VEC_SIZE * 2 == 64. */
-# if (VEC_SIZE * 2) != 64
-#  error (VEC_SIZE * 2) != 64
-# endif
-
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
-	cmpl	$(VEC_SIZE * 2), %ecx
-	jge	L(loop_cross_page_2_vec)
-
-	vmovdqu	(%rax, %r10), %ymm2
-	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
-	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
-	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
-	VPMINU	%ymm2, %ymm0, %ymm0
-	VPMINU	%ymm3, %ymm1, %ymm1
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
-
-	vpmovmskb %ymm0, %edi
-	vpmovmskb %ymm1, %esi
-
-	salq	$32, %rsi
-	xorq	%rsi, %rdi
-
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
-	shrq	%cl, %rdi
-
-	testq	%rdi, %rdi
-	je	L(loop_cross_page_2_vec)
-	tzcntq	%rdi, %rcx
-# ifdef USE_AS_STRNCMP
-	cmpq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(loop_cross_page_2_vec):
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
-	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
-	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
-	VPMINU	%ymm2, %ymm5, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
-	VPMINU	%ymm3, %ymm6, %ymm6
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
-
-	vpmovmskb %ymm5, %edi
-	vpmovmskb %ymm6, %esi
-
-	salq	$32, %rsi
-	xorq	%rsi, %rdi
-
-	xorl	%r8d, %r8d
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
-	subl	$(VEC_SIZE * 2), %ecx
-	jle	1f
-	/* Skip ECX bytes.  */
-	shrq	%cl, %rdi
-	/* R8 has number of bytes skipped.  */
-	movl	%ecx, %r8d
-1:
-	/* Before jumping back to the loop, set ESI to the number of
-	   VEC_SIZE * 4 blocks before page crossing.  */
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
-
-	testq	%rdi, %rdi
-	je	L(back_to_loop)
-	tzcntq	%rdi, %rcx
-	addq	%r10, %rcx
-	/* Adjust for number of bytes skipped.  */
-	addq	%r8, %rcx
-# ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 2), %rcx
-	subq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# endif
-	VZEROUPPER
-	ret
-
 	.p2align 4
 L(cross_page_loop):
 	/* Check one byte/dword at a time.  */
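
[Editorial aside, not part of the patch: the deleted block handles a page-crossing compare by backing both pointers up to a common alignment, comparing full vectors, and then discarding the over-read bytes from the combined mismatch mask with a variable shift (the shrq %cl, %rdi above). A minimal C sketch of that technique with AVX2 intrinsics follows; mismatch_or_null_mask and skip are illustrative names, VEC_SIZE is fixed at 32 as in the ymm path, and s1/s2 stand for the aligned-down bases (%rax, %r10) and (%rdx, %r10).]

/* Compile with: gcc -mavx2 -c sketch.c  */
#include <immintrin.h>
#include <stdint.h>

static inline uint64_t
mismatch_or_null_mask (const uint8_t *s1, const uint8_t *s2, unsigned skip)
{
  /* Two 32-byte loads per string, mirroring the ymm2/ymm3 pair.  */
  __m256i a0 = _mm256_loadu_si256 ((const __m256i *) s1);
  __m256i a1 = _mm256_loadu_si256 ((const __m256i *) (s1 + 32));
  __m256i b0 = _mm256_loadu_si256 ((const __m256i *) s2);
  __m256i b1 = _mm256_loadu_si256 ((const __m256i *) (s2 + 32));

  /* VPCMPEQ: 0xff where the bytes are equal, 0x00 at a mismatch.  */
  __m256i eq0 = _mm256_cmpeq_epi8 (a0, b0);
  __m256i eq1 = _mm256_cmpeq_epi8 (a1, b1);

  /* VPMINU folds in the terminator check: min (data, eq) is zero
     exactly at a mismatch (eq byte is 0) or at a null byte.  */
  __m256i m0 = _mm256_cmpeq_epi8 (_mm256_min_epu8 (a0, eq0),
				  _mm256_setzero_si256 ());
  __m256i m1 = _mm256_cmpeq_epi8 (_mm256_min_epu8 (a1, eq1),
				  _mm256_setzero_si256 ());

  /* vpmovmskb + salq $32 + xorq: one bit per byte, 64 bits total.  */
  uint64_t mask = (uint32_t) _mm256_movemask_epi8 (m0)
		  | ((uint64_t) (uint32_t) _mm256_movemask_epi8 (m1) << 32);

  /* shrq %cl, %rdi: drop the bits of the skip over-read bytes
     (skip < 64, as the cmpl/jge guard above guarantees).  */
  return mask >> skip;
}

[A nonzero result reproduces the testq/tzcntq pair: __builtin_ctzll (mask) is the offset, relative to the aligned base, of the first mismatch or terminator past the skipped bytes.]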
@@ -722,6 +584,14 @@ L(last_vector):
 	VZEROUPPER
 	ret
 
+	.p2align 4
+L(loop_cross_page):
+	movq	%rax, %rdi
+	mov	%rdx, %rsi
+	orl	%edx, %eax
+	xorq	%rdx, %rdx
+	andl	$(PAGE_SIZE - 1), %eax
+
 	/* Comparing on page boundary region requires special treatment:
 	   It must done one vector at the time, starting with the wider
 	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
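
[Editorial aside, not part of the patch: the added prologue ORs the two addresses before masking with PAGE_SIZE - 1. Because (a | b) & (PAGE_SIZE - 1) is at least as large as either address's own page offset, a single comparison against the combined offset conservatively tells whether both strings can be read with a full vector before a page boundary. A minimal C sketch under that reading; can_load_vec, vec_size, and the 4096-byte PAGE_SIZE are illustrative assumptions.]

#include <stdint.h>

#define PAGE_SIZE 4096	/* assumed, the usual x86-64 page size */

/* Illustrative helper, not glibc's: decide whether a vec_size-byte
   load from both s1 and s2 stays within their current pages.  */
static int
can_load_vec (const char *s1, const char *s2, unsigned vec_size)
{
  /* orl %edx, %eax; andl $(PAGE_SIZE - 1), %eax: the OR-ed offset
     is >= each individual offset, so it bounds both at once.  */
  uintptr_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1);
  return off <= PAGE_SIZE - vec_size;
}

[The context comment above suggests this offset is then used to pick the step width, trying the 32-byte ymm case before falling back to 16-byte xmm loads.]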
