This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] avoid initialization overhead in strcoll


Hi Siddhesh,


Am 04.06.2015 um 20:38 schrieb Siddhesh Poyarekar:
> On Thu, May 28, 2015 at 05:39:01PM +0200, Leonhard Holz wrote:
>> This patch removes some initialization overhead in the hot path of strcoll.
>> It improves the file listing benchmark by 20% on x86.
> 
> You've removed a memset call which seems to be the reason for the 20% 
> overhead; I can't see anything else that could have done this, but I haven't
> looked hard enough.
> 

I think so too.

> This change however should be split up to only include the useful stuff.
> 
>> diff --git a/string/strcoll_l.c b/string/strcoll_l.c index 0fa005f..14bfb8b
>> 100644 --- a/string/strcoll_l.c +++ b/string/strcoll_l.c @@ -62,7 +62,6 @@
>> typedef struct int len;			/* Length of the current sequence.  */ size_t
>> val;			/* Position of the sequence relative to the previous non-ignored
>> sequence.  */ -  size_t idxnow;		/* Current index in sequences.  */
> 
> This should go in as a separate change since it is valid without any 
> performance justification.  idxnow is unused.  Please post this as a separate
> patch.

Ok.

> 
>> size_t idxmax;		/* Maximum index in sequences.  */ size_t idxcnt;		/*
>> Current count of indices.  */ size_t backw;			/* Current Backward sequence
>> index.  */ @@ -313,20 +312,22 @@ STRCOLL (const STRING_TYPE *s1, const
>> STRING_TYPE *s2, __locale_t l) assert (((uintptr_t) extra) % __alignof__
>> (extra[0]) == 0); assert (((uintptr_t) indirect) % __alignof__
>> (indirect[0]) == 0);
>> 
>> -  int result = 0, rule = 0; - +  int result = 0; coll_seq seq1, seq2; -
>> memset (&seq1, 0, sizeof (seq1)); -  seq2 = seq1; +  seq1.len = 0; +
>> seq1.idxmax = 0; +  seq1.rule = 0; +  seq2.len = 0; +  seq2.idxmax = 0;
> 
> I'm surprised that this actually makes a difference.  The memset should have
> been optimized out and replaced with movs.  The change to remove the RULE
> variable seems unrelated to me and should just be avoided if it does not
> result in any interesting code change..

It is not merely a replacement of memset with assignments, after the patch only
the properties that need to be set to zero are set and the rest is left
uninitialized. They will be set by get_next_seq() anyway. Removing the RULE var
is included because it saves an initialization.

> 
> Can you please post a more detailed justification of what caused the 
> performance improvement?

Maybe it is platform dependent, but my (x86) assembly output shows a rep stos
that is removed due to the patch.

Before:

319	  memset (&seq1, 0, sizeof (seq1));
   0x000000ef <__strcoll_l+175>:	lea    0x90(%esp),%edx
   0x000000f6 <__strcoll_l+182>:	mov    %ebx,%eax
   0x000000f8 <__strcoll_l+184>:	mov    $0xc,%ecx
   0x000000fd <__strcoll_l+189>:	movl   $0x0,0x20(%esp)
   0x00000105 <__strcoll_l+197>:	movl   $0x0,0x74(%esp)
   0x0000010d <__strcoll_l+205>:	mov    %edx,%edi
   0x0000010f <__strcoll_l+207>:	movl   $0x0,0x7c(%esp)
   0x00000117 <__strcoll_l+215>:	movl   $0x0,0x3c(%esp)
   0x0000011f <__strcoll_l+223>:	rep stos %eax,%es:(%edi)
   0x00000121 <__strcoll_l+225>:	xor    %eax,%eax

After (bigger snippet because compiler moved code):

308	    current->values[_NL_ITEM_INDEX
(CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
309	
310	  assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
   0x000000c7 <__strcoll_l+135>:	and    $0x3,%esi
   0x000000d2 <__strcoll_l+146>:	jne    0x11f5 <__strcoll_l+4533>

311	  assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
312	  assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
313	  assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
   0x000000d8 <__strcoll_l+152>:	testb  $0x3,0x38(%esp)
   0x000000dd <__strcoll_l+157>:	jne    0x11dc <__strcoll_l+4508>
   0x000000e3 <__strcoll_l+163>:	movl   $0x0,0x4c(%esp)
   0x000000eb <__strcoll_l+171>:	movb   $0x0,0x9d(%esp)
   0x000000f3 <__strcoll_l+179>:	movl   $0x0,0x20(%esp)
   0x000000fb <__strcoll_l+187>:	movl   $0x0,0x3c(%esp)

314	
315	  int result = 0;
316	  coll_seq seq1, seq2;
317	  seq1.len = 0;
318	  seq1.idxmax = 0;
319	  seq1.rule = 0;
320	  seq2.len = 0;
321	  seq2.idxmax = 0;
322	
323	  for (int pass = 0; pass < nrules; ++pass)
324	    {
325	      seq1.idxcnt = 0;
   0x00000163 <__strcoll_l+291>:	movl   $0x0,0x60(%esp)

Leonhard


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]