horrible disk throughput on itanium
Jakub Jelinek
jakub@redhat.com
Sat Dec 8 05:36:00 GMT 2001
On Fri, Dec 07, 2001 at 06:17:46PM -0800, Geoff Keating wrote:
> > Date: Fri, 07 Dec 2001 16:07:27 -0800
> > From: Andrew Morton <akpm@zip.com.au>
>
> > In answer to Geoff Keating's question, this program:
> >
> > #include <stdio.h>
> >
> > main()
> > {
> > int i;
> >
> > for (i = 0; i < 100*1000*1000; i++)
> > putc(0, stdout);
> > exit(0);
> > }
> >
> > when directed to /dev/null takes 7.5 seconds. But only 1.3 seconds
> > with putc_unlocked.
>
> I well believe that, since putc_unlocked is a macro which can be
> inlined and then further optimised, but I was referring to the
> proposed change to putc(), which doesn't make it a macro and actually
> adds more code.
It's not just that. I've tried Andrew's example code slightly modified
to test:
out-of-line putc, inlined putc_unlocked, non-inlined putc_unlocked
and two variants of what I was talking about yesterday (done on separate
structure, it would be somewhere in pad of FILE instead).
Here are results which show that inlining putc_unlocked is about half of the
win and simplified putc_unlocked is the other one.
TEST==4 (i.e. inlined putc with locking if -lpthread is linked in) can be
about as fast as inlined putc_unlocked.
Here the locksetup function is needed for compatibility, because e.g.
pre-gcc3 libstdc++ will have stdin/stdout/stderr with zeros in the pad.
lockme and unlockme fn pointers with -lpthread linked in would point to
functions which would do:
/* Sketch of the lock hook that would be installed when -lpthread is linked
   in.  It passes buf, _IO_funlockfile and fp to _pthread_cleanup_push_defer
   and then calls _IO_flockfile on the stream.
   NOTE(review): presumably buf is the storage for the cleanup record and
   _IO_funlockfile(fp) is the handler run on cancellation -- confirm against
   the glibc internals.  */
void lockme(FILE *fp, FILB *buf)
{
/* Register the cleanup before taking the lock.  */
_pthread_cleanup_push_defer (buf, _IO_funlockfile, fp);
_IO_flockfile (fp);
}
and
/* Sketch of the unlock hook that would be installed when -lpthread is linked
   in.  It only calls _pthread_cleanup_pop_restore on buf; fp is not used
   directly here.
   NOTE(review): the second argument 1 presumably requests that the
   registered cleanup handler be executed, which is what would release the
   stream lock -- confirm against the glibc internals.  */
void unlockme(FILE *fp, FILB *buf)
{
_pthread_cleanup_pop_restore (buf, 1);
}
Results for test 0, 1, 2, 3, 4 (note that -freorder-blocks is now default at
-O2):
real 0m9.559s
user 0m9.530s
sys 0m0.030s
real 0m1.876s
user 0m1.860s
sys 0m0.020s
real 0m4.706s
user 0m4.680s
sys 0m0.030s
real 0m4.315s
user 0m4.320s
sys 0m0.000s
real 0m1.883s
user 0m1.840s
sys 0m0.040s
#include <stdio.h>
/* Declares my_putc_unlocked with the assembler name "putc_unlocked" (GCC asm
   label), so calls through this name reference the putc_unlocked symbol as
   an ordinary external function.  Used by the TEST == 2 variant.  */
extern int my_putc_unlocked (int __c, FILE *__stream) asm ("putc_unlocked");
/* FILB: 64-byte scratch buffer whose address is handed to the lock/unlock
   hooks.  */
typedef struct FILB { char buf[64]; } FILB;
/* FIL: stand-in structure holding the two hook pointers.  Each hook takes
   the FIL itself and a FILB scratch buffer.
   NOTE(review): the accompanying text says these fields would live in the
   pad of FILE in a real implementation.  */
typedef struct FIL { void (*lockme) (struct FIL *, FILB *); void (*unlockme)
(struct FIL *, FILB *); } FIL;
/* No-op lock hook: accepts the hook arguments and does nothing.  */
void
lockme_dummy (struct FIL *fp, FILB *buf)
{
  (void) fp;
  (void) buf;
}
/* No-op unlock hook: accepts the hook arguments and does nothing.  */
void
unlockme_dummy (struct FIL *fp, FILB *buf)
{
  (void) fp;
  (void) buf;
}
/* Fill in both hook pointers of *fp and return the value stored in
   fp->lockme.

   When compiled with TEST == 4 both pointers are set to the sentinel
   (void *) 1 and that sentinel is returned; otherwise they are set to
   lockme_dummy / unlockme_dummy and lockme_dummy is returned.  */
void *locksetup (FIL *fp)
{
  void *installed;

#if TEST == 4
  installed = (void *) 1;
  fp->lockme = installed;
  fp->unlockme = installed;
#else
  fp->lockme = lockme_dummy;
  fp->unlockme = unlockme_dummy;
  installed = lockme_dummy;
#endif

  return installed;
}
/* The single FIL instance used by the benchmark and a pointer to it.  fpx
   has static storage duration, so its hook pointers start out as NULL; the
   TEST == 3 and TEST == 4 loops check fp->lockme against NULL and call
   locksetup() in that case.  */
FIL fpx, *fp = &fpx;
/* Benchmark driver: writes 100*1000*1000 zero bytes to stdout using the
   variant selected at compile time with -DTEST=n:

     0  putc
     1  putc_unlocked
     2  my_putc_unlocked
     3  putc_unlocked bracketed by unconditional calls through
        fp->lockme and fp->unlockme
     4  like 3, but each hook call is skipped while the pointer holds the
        sentinel value (void *) 1

   An undefined TEST evaluates as 0 in #if and therefore selects variant 0.
   Any other value is rejected at compile time; without that check the loop
   would have no body of its own and the statement following it would
   silently become the loop body.

   Returns 0.  Returning from main is equivalent to calling exit with the
   same value, and avoids calling exit without a declaration in scope.  */
int main (void)
{
  int i;

  for (i = 0; i < 100*1000*1000; i++)
    {
#if TEST == 0
      putc (0, stdout);
#elif TEST == 1
      putc_unlocked (0, stdout);
#elif TEST == 2
      my_putc_unlocked (0, stdout);
#elif TEST == 3
      FILB __buf;
      void (*lockme) (FIL *, FILB *) = fp->lockme;

      /* First iteration only: hooks are still NULL, so install them.  */
      if (__builtin_expect (lockme == NULL, 0))
        lockme = locksetup (fp);
      lockme (fp, &__buf);
      putc_unlocked (0, stdout);
      fp->unlockme (fp, &__buf);
#elif TEST == 4
      FILB __buf;
      void (*lockme) (FIL *, FILB *) = fp->lockme;

      /* First iteration only: hooks are still NULL, so install them.  */
      if (__builtin_expect (lockme == NULL, 0))
        lockme = locksetup (fp);
      /* (void *) 1 marks "no locking needed"; only call a real hook.  */
      if (__builtin_expect (lockme != (void *) 1, 0))
        lockme (fp, &__buf);
      putc_unlocked (0, stdout);
      if (__builtin_expect (fp->unlockme != (void *) 1, 0))
        fp->unlockme (fp, &__buf);
#else
# error "TEST must be 0, 1, 2, 3 or 4"
#endif
    }

  return 0;
}
for i in 0 1 2 3 4; do gcc -O2 -o /tmp/y$i -DTEST=$i /tmp/y.c -freorder-blocks -mcpu=i686; done
for i in 0 1 2 3 4; do time /tmp/y$i > /dev/null; done
Jakub
More information about the Libc-alpha
mailing list