This is the mail archive of the
binutils@sourceware.org
mailing list for the binutils project.
Re: SPU overlay update
- From: Alan Modra <amodra at bigpond dot net dot au>
- To: binutils at sourceware dot org
- Date: Thu, 7 Feb 2008 11:58:09 +1030
- Subject: Re: SPU overlay update
- References: <20080128055644.GA9974@bubble.grove.modra.org>
On Mon, Jan 28, 2008 at 04:26:44PM +1030, Alan Modra wrote:
> compile time, a more compact stub. Double size of _ovly_buf_table
> so that low bit of _ovly_table.buf can be used as a "present" bit.
Using the low bit of _ovly_table.buf as a "present" bit turns out to
be a bad idea, as it can confuse gdb. As far as I'm aware, gdb doesn't
use _ovly_table.size and oprofile doesn't read _ovly_table from target
memory, so using the low bit of .size is a better choice.
bfd/
* elf32-spu.c (spu_elf_size_stubs): Revert 2008-01-28 doubling
of _ovly_buf_table size.
(spu_elf_build_stubs): Use low bit of .size as "present" bit.
Adjust initialisations relating to _ovly_buf_table.
ld/
* emultempl/spu_ovl.S: Use low bit of _ovly_table.size as
a "present" bit rather than low bit of .buf. Correct indexing
into _ovly_buf_table. Use relative loads and stores to access
overlay manager local vars.
* emultempl/spu_ovl.o: Regenerate.
Index: bfd/elf32-spu.c
===================================================================
RCS file: /cvs/src/src/bfd/elf32-spu.c,v
retrieving revision 1.28
diff -u -p -r1.28 elf32-spu.c
--- bfd/elf32-spu.c 4 Feb 2008 01:13:38 -0000 1.28
+++ bfd/elf32-spu.c 7 Feb 2008 00:15:53 -0000
@@ -1202,7 +1202,7 @@ spu_elf_size_stubs (bfd *output_bfd,
|| !bfd_set_section_alignment (ibfd, htab->ovtab, 4))
return 0;
- htab->ovtab->size = htab->num_overlays * 16 + 16 + htab->num_buf * 2 * 4;
+ htab->ovtab->size = htab->num_overlays * 16 + 16 + htab->num_buf * 4;
(*place_spu_section) (htab->ovtab, NULL, ".data");
htab->toe = bfd_make_section_anyway_with_flags (ibfd, ".toe", SEC_ALLOC);
@@ -1373,8 +1373,8 @@ spu_elf_build_stubs (struct bfd_link_inf
/* Write out _ovly_table. */
p = htab->ovtab->contents;
- /* set low bit of .buf to mark non-overlay area as present. */
- p[15] = 1;
+ /* set low bit of .size to mark non-overlay area as present. */
+ p[7] = 1;
for (s = obfd->sections; s != NULL; s = s->next)
{
unsigned int ovl_index = spu_elf_section_data (s)->u.o.ovl_index;
@@ -1387,7 +1387,7 @@ spu_elf_build_stubs (struct bfd_link_inf
bfd_put_32 (htab->ovtab->owner, s->vma, p + off);
bfd_put_32 (htab->ovtab->owner, (s->size + 15) & -16, p + off + 4);
/* file_off written later in spu_elf_modify_program_headers. */
- bfd_put_32 (htab->ovtab->owner, ovl_buf * 2, p + off + 12);
+ bfd_put_32 (htab->ovtab->owner, ovl_buf, p + off + 12);
}
}
@@ -1407,12 +1407,12 @@ spu_elf_build_stubs (struct bfd_link_inf
if (h == NULL)
return FALSE;
h->root.u.def.value = htab->num_overlays * 16 + 16;
- h->size = htab->num_buf * 2 * 4;
+ h->size = htab->num_buf * 4;
h = define_ovtab_symbol (htab, "_ovly_buf_table_end");
if (h == NULL)
return FALSE;
- h->root.u.def.value = htab->num_overlays * 16 + 16 + htab->num_buf * 2 * 4;
+ h->root.u.def.value = htab->num_overlays * 16 + 16 + htab->num_buf * 4;
h->size = 0;
h = define_ovtab_symbol (htab, "_EAR_");
Index: ld/emultempl/spu_ovl.S
===================================================================
RCS file: /cvs/src/src/ld/emultempl/spu_ovl.S,v
retrieving revision 1.8
diff -u -p -r1.8 spu_ovl.S
--- ld/emultempl/spu_ovl.S 28 Jan 2008 05:59:24 -0000 1.8
+++ ld/emultempl/spu_ovl.S 7 Feb 2008 01:14:47 -0000
@@ -46,12 +46,13 @@
#define cgbits reserved2
#define off3 reserved2
#define off4 reserved2
+#define addr4 reserved2
#define off5 reserved2
#define tagstat reserved2
#define reserved3 $77
-#define buf1 reserved3
-#define buf2 reserved3
+#define size1 reserved3
+#define size2 reserved3
#define rv3 reserved3
#define ealo reserved3
#define cmd reserved3
@@ -145,18 +146,18 @@ __ovly_return:
#nop; lnop
#nop; lnop
#nop
- rotqbyi buf1, vma, 12 # 1,4 14
+ rotqbyi size1, vma, 4 # 1,4 14
#nop
stqd save3, -48($sp) # 1,6 15
#nop
stqd save2, -32($sp) # 1,6 16
#nop
stqd save1, -16($sp) # 1,6 17
- andi present1, buf1, 1 # 0,2 18
- stqd ovl, (__ovly_current - __ovly_return)($lr) # 1,6 18
+ andi present1, size1, 1 # 0,2 18
+ stqr ovl, __ovly_current # 1,6 18
#nop; lnop
#nop
- brz present1, __ovly_load_event # 1,4 20
+ brz present1, do_load # 1,4 20
ovly_ret9:
#nop
bi target # 1,4 21
@@ -197,11 +198,11 @@ __ovly_load:
#lnop
#nop; lnop
#nop
- lqd cur, (__ovly_current - __ovly_return)(rv1) # 1,6 2
+ lqr cur, __ovly_current # 1,6 2
shli off2, ovl, 4 # 0,4 3
- stqd ovl, (__ovly_current - __ovly_return)(rv1) # 1,6 3
+ stqr ovl, __ovly_current # 1,6 3
ceq rv2, $lr, rv1 # 0,2 4
- lqd rv3, (__rv_pattern - __ovly_return)(rv1) # 1,6 4
+ lqr rv3, __rv_pattern # 1,6 4
#nop; lnop
#nop; lnop
#nop
@@ -214,11 +215,11 @@ __ovly_load:
ila rv1, __ovly_return # 0,2 1
stqd save2, -32($sp) # 1,6 1
shli off2, ovl, 4 # 0,4 2
- lqa cur, __ovly_current # 1,6 2
+ lqr cur, __ovly_current # 1,6 2
nop
- stqa ovl, __ovly_current # 1,6 3
+ stqr ovl, __ovly_current # 1,6 3
ceq rv2, $lr, rv1 # 0,2 4
- lqd rv3, (__rv_pattern - __ovly_return)(rv1) # 1,6 4
+ lqr rv3, __rv_pattern # 1,6 4
#nop
hbr ovly_load9, target # 1,15 5
#nop
@@ -237,18 +238,18 @@ __ovly_load:
#nop
rotqmbyi rv6, $lr, -8 # 1,4 12
#nop
- rotqbyi buf2, vma, 12 # 1,4 13
+ rotqbyi size2, vma, 4 # 1,4 13
#nop
lqd save3, -48($sp) # 1,6 14
#nop; lnop
or rv7, rv4, rv6 # 0,2 16
lqd save2, -32($sp) # 1,6 16
- andi present2, buf2, 1 # 0,2 17
+ andi present2, size2, 1 # 0,2 17
lnop # 1,0 17
selb $lr, rv7, $lr, rv5 # 0,2 18
lqd save1, -16($sp) # 1,6 18
#nop
- brz present2, __ovly_load_event # 1,4 19
+ brz present2, do_load # 1,4 19
ovly_load9:
#nop
bi target # 1,4 20
@@ -266,6 +267,7 @@ ovly_load9:
.global __ovly_load_event
.type __ovly_load_event, @function
__ovly_load_event:
+do_load:
#nop
rotqbyi sz, vma, 8 # 1,4 0
#nop
@@ -273,7 +275,7 @@ __ovly_load_event:
#nop
lqa ea64, _EAR_ # 1,6 2
#nop
- lqd cgshuf, (__cg_pattern - __ovly_return)($lr) # 1,6 3
+ lqr cgshuf, __cg_pattern # 1,6 3
/* We could predict the branch at the end of this loop by adding a few
instructions, and there are plenty of free cycles to do so without
@@ -316,13 +318,13 @@ __ovly_xfer_loop:
brnz osize, __ovly_xfer_loop # 1,4 24
Now update our data structures while waiting for DMA to complete.
- Low bit of .buf needs to be cleared on the _ovly_table entry
+ Low bit of .size needs to be cleared on the _ovly_table entry
corresponding to the evicted overlay, and set on the entry for the
newly loaded overlay. Note that no overlay may in fact be evicted
- as _ovly_buf_table[] starts with all zeros. Don't zap .buf entry
+ as _ovly_buf_table[] starts with all zeros. Don't zap .size entry
for zero index! Also of course update the _ovly_buf_table entry. */
#nop
- lqd newovl, (__ovly_current - __ovly_return)($lr) # 1,6 25
+ lqr newovl, __ovly_current # 1,6 25
#nop; lnop
#nop; lnop
#nop; lnop
@@ -333,7 +335,7 @@ __ovly_xfer_loop:
ila tab3, _ovly_table - 16 # 0,2 32
#lnop
#nop
- fsmbi pbyte, 1 # 1,4 33
+ fsmbi pbyte, 0x100 # 1,4 33
#nop; lnop
#nop
lqx vma, tab3, off3 # 1,6 35
@@ -351,7 +353,7 @@ __ovly_xfer_loop:
#nop; lnop
shli off4, buf3, 2 # 1,4 45
#lnop
- ila tab4, _ovly_buf_table # 0,2 46
+ ila tab4, _ovly_buf_table - 4 # 0,2 46
#lnop
#nop; lnop
#nop; lnop
@@ -359,13 +361,14 @@ __ovly_xfer_loop:
lqx map, tab4, off4 # 1,6 49
#nop
cwx genwi, tab4, off4 # 1,4 50
-#nop; lnop
+ a addr4, tab4, off4 # 0,2 51
+#lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
- rotqby oldovl, map, off4 # 1,4 55
- nop
+ rotqby oldovl, map, addr4 # 1,4 55
+#nop
shufb newmap, newovl, map, genwi # 0,4 56
#if MFC_TAG_ID < 16
ila newmask, 1 << MFC_TAG_ID # 0,2 57
@@ -375,7 +378,7 @@ __ovly_xfer_loop:
#lnop
#nop; lnop
#nop; lnop
- stqx newmap, tab4, off4 # 1,6 60
+ stqd newmap, 0(addr4) # 1,6 60
/* Save app's tagmask, wait for DMA complete, restore mask. */
ila tagstat, MFC_TAG_UPDATE_ALL # 0,2 61
--
Alan Modra
Australia Development Lab, IBM