[ECOS] TCP/IP checksum routine performance
Grant Edwards
grante@visi.com
Fri Apr 28 08:41:00 GMT 2000
On Fri, Apr 28, 2000 at 09:25:26AM +0200, Andrew Lunn wrote:
> > I've been playing with various system performance issues for
> > the past couple days. According to the results of running
> > nc_test_slave, the checksum routine (in_cksum.c) is taking more
> > time than all of the other instrumented routines combined.
>
> Once you have finished your assembly code could you contribute it
> back please? I would be interested in using it on my ARM
> platform.
I don't think that should be a problem. I'm going to play with
it a little more first -- I've made a possibly dangerous
assumption that the only time an mbuf will contain an odd
number of bytes is at the end of a chain (IOW, a 16-bit word is
never split across mbuf boundaries). So far, experiments and
source code examination _seem_ to support that assumption. If
not, I'll need to add some special-case code to handle that.
> It may also be worth looking at the Linux implementation. The
> ARM CRC routines are in assembly. It may give you some ideas
> for optimisation.
I've also got a link to the BSD ARM code (the non-portable
version with asm() in-lines) that I'm going to take a look at.
> > So, the executive summary for ARM users is: you can probably
> > improve your TCP/IP performance noticeably by compiling
> > in_cksum.c with optimization turned off.
>
> :-)
>
> Would you guess this is to do with endianness? Is the assembly
> doing lots of word order changes?
Nope, it doesn't seem to be an endian issue. It looks like the
optimizer tries to postpone calculations (the "add"
instructions in the example below). This leaves a lot of needed
data in registers. When it does decide to do the operations it
runs out of registers and has to shuffle stuff around (is
"register thrashing" a good term?). For instance, here is the
code for the inner checksum loop which sums 32 bytes per
iteration.
Code generated by -O0 (39 instructions) is on the left, -O3 (56
instructions) is on the right. Most of the extra instructions
access external memory (like the ldr/str rX,[r11,-#XX] lines),
so the timing penalty is worse than the instruction count would
indicate. The 5 lines at the top of the loop on the left could
be optimized a little bit, but the rest of the loop is pretty
much optimal as is unless you recognize that the values being
added are in contiguous memory and use a "ldmia" instruction to
burst them into a block of registers (which is what my assembly
language routine does).
while ((mlen -= 32) >= 0) { while ((mlen -= 32) >= 0) {
sub r8, r8, #32 ; 0x20 ldr r1, [r11, -#56]
mov r3, r8 subs r1, r1, #32 ; 0x20
cmp r3, #0 ; 0x0 str r1, [r11, -#56]
bge 220 <in_cksum+0x220> ldr r4, [r4]
b 2ac <in_cksum+0x2ac> str r4, [r11, -#92]
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; bmi 330 <in_cksum+0x330>
ldrh r3, [r6] sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
add r7, r7, r3 ldrh r3, [r9]
ldrh r3, [r6, #2] ldrh r2, [r9, #2]
add r7, r7, r3 ldrh r1, [r9, #4]
ldrh r3, [r6, #4] ldrh r0, [r9, #6]
add r7, r7, r3 sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
ldrh r3, [r6, #6] ldrh r12, [r9, #8]
add r7, r7, r3 ldrh lr, [r9, #10]
sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; add r10, r10, r3
ldrh r3, [r6, #8] str lr, [r11, -#100]
add r7, r7, r3 add r10, r10, r2
ldrh r3, [r6, #10] sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
add r7, r7, r3 ldrh lr, [r9, #22]
ldrh r3, [r6, #12] add r10, r10, r1
add r7, r7, r3 ldr r1, [r11, -#100]
ldrh r3, [r6, #14] ldrh r4, [r9, #12]
add r7, r7, r3 ldrh r5, [r9, #14]
sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; ldrh r6, [r9, #16]
ldrh r3, [r6, #16] ldrh r7, [r9, #18]
add r7, r7, r3 ldrh r8, [r9, #20]
ldrh r3, [r6, #18] str lr, [r11, -#72]
add r7, r7, r3 sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
ldrh r3, [r6, #20] ldrh lr, [r9, #24]
add r7, r7, r3 ldr r2, [r11, -#72]
ldrh r3, [r6, #22] add r10, r10, r0
add r7, r7, r3 str lr, [r11, -#76]
sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15] add r10, r10, r12
ldrh r3, [r6, #24] ldrh lr, [r9, #26]
add r7, r7, r3 add r10, r10, r1
ldrh r3, [r6, #26] ldr r3, [r11, -#76]
add r7, r7, r3 add r10, r10, r4
ldrh r3, [r6, #28] str lr, [r11, -#80]
add r7, r7, r3 add r10, r10, r5
ldrh r3, [r6, #30] ldrh lr, [r9, #28]
add r7, r7, r3 add r10, r10, r6
w += 16; str lr, [r11, -#84]
add r6, r6, #32 ; 0x20 add r10, r10, r7
} ldrh lr, [r9, #30]
b 298 <in_cksum+0x298> add r10, r10, r8
str lr, [r11, -#88]
add r10, r10, r2
w += 16;
}
ldr lr, [r11, -#56]
add r10, r10, r3
ldr r12, [r11, -#80]
subs lr, lr, #32 ; 0x20
str lr, [r11, -#56]
add r9, r9, #32 ; 0x20
ldr lr, [r11, -#84]
add r10, r10, r12
ldr r1, [r11, -#88]
add r10, r10, lr
add r10, r10, r1
bpl 330 <in_cksum+0x330>
> Maybe this is a general issue for the ARM compiler.
All of the other code I've timed improved with optimization
turned on. There's just something about this particular source
code sequence that puts the optimizer into a tailspin.
> It could be interesting to run tm_basic with different levels
> of optimisation and see what happens.
I don't remember the exact numbers but tm_basic shows a
definite improvement with optimization turned on, as does my
mostly-assembly version (the outside loop is still in C).
--
Grant Edwards
grante@visi.com
More information about the Ecos-discuss
mailing list