[PATCH,MIPS 2/3] Enable reorder for crt0.S

Matthew Fortune Matthew.Fortune@imgtec.com
Tue Nov 18 12:14:00 GMT 2014


As part of a long term plan to reduce the amount of hand written .set noreorder
code, I have reworked the crt0.S file so that the assembler can fill delay
slots instead of them being explicitly filled.  The reason for doing this is
to enable future auto-conversion of delay slot branches to 'compact' branches
present in the R6 architecture. Auto-conversion is not possible in a .set
noreorder block as any delay slot branch with a non-NOP delay slot would have
to be reordered, which noreorder forbids, to convert to a compact branch without a delay slot.
Writing code in a natural linear order is (subjectively) also much simpler to
digest and maintain.

One ugly piece of code had to be reworked in the zerobss loop which was using
a pseudo-instruction BLTU as the branch. The structure of the old loop was
clearly aiming to produce a tight loop with one instruction and the delay slot
filled but the expansion of the BLTU would have undone this anyway. This has
been reworked to create the kind of loop originally intended and have the
assembler fill the delay slot. The precise behaviour of the loop is subtly
different from before in the following ways:

1) When the _fbss and _end symbols have the same value then the old loop would
   have written zero to every address from _fbss to the end of memory (or an
   exception occurred). The new loop is skipped if the two symbols are the same.
2) The old loop wrote zero to address of _end which is past the end of the
   bss range. The new loop does not do this.
3) When _fbss is greater than _end at the start then the old loop would have
   written one element and exited. The new loop will attempt to write zero
   to every address from _fbss to the end of memory, wrap and continue to
   _end (or hit an exception). This change in behaviour is fine as the
   scenario is invalid anyway.
4) The _end marker is now aligned to 4-bytes to ensure that the last element
   written does not reach beyond the address of _end. This is also necessary
   as the termination condition is an equality test instead of an ordered
   test so (_end - _fbss) must be a multiple of 4-bytes.

Delay slot filling will occur when libgloss is built with GCC and an
optimisation level greater than zero. This gets translated to an assembler
optimisation level of '2'.

All instances of JAL <reg> have been changed to JALR <reg> as there is no
special handling in the JAL macro in binutils for a register operand and
JALR is the real underlying instruction.

This change is primarily verified by code inspection but has also been run
through some small test programs.



	* mips/crt0.S: Remove .set noreorder throughout.  Change JAL <reg> to
	JALR <reg> throughout.
	(zerobss): Open code the bltu macro instruction so that the
	zero-loop does not have a NOP in the branch delay slot.
 libgloss/mips/crt0.S | 53 ++++++++++++++++++----------------------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/libgloss/mips/crt0.S b/libgloss/mips/crt0.S
index 599e79c..f66ef1b 100644
--- a/libgloss/mips/crt0.S
+++ b/libgloss/mips/crt0.S
@@ -57,13 +57,14 @@
 	.globl	_start
 	.ent	_start
-	.set	noreorder
 #ifdef __mips_embedded_pic
 #define PICBASE start_PICBASE
+	.set	noreorder
 	PICBASE = .+8
         bal	PICBASE
 	move	s0,$31
+	.set	reorder
 #if __mips<3
 #  define STATUS_MASK (SR_CU1|SR_PE)
@@ -89,9 +90,7 @@ _start:
 	/* Avoid hazard from FPU enable and other SR changes.  */
 	LA (t0, hardware_hazard_hook)
 	beq	t0,zero,1f
-	nop
-	jal	t0
-	nop
+	jalr	t0
 /* Check for FPU presence.  Don't check if we know that soft_float is
@@ -105,11 +104,8 @@ _start:
 	mfc1	t1,fp1
 	bne	t0,t2,1f	/* check for match */
-	nop
 	bne	t1,zero,1f	/* double check */
-	nop
 	j	2f		/* FPU is present. */
-	nop
 	/* FPU is not present.  Set status register to say that. */
@@ -119,9 +115,7 @@ _start:
 	/* Avoid hazard from FPU disable.  */
 	LA (t0, hardware_hazard_hook)
 	beq	t0,zero,2f
-	nop
-	jal	t0
-	nop
+	jalr	t0
@@ -129,7 +123,6 @@ _start:
    doesn't get confused.  */
 	LA (v0, 3f)
 	jr	v0
-	nop
 	LA (gp, _gp)				# set the global data pointer
 	.end _start
@@ -145,21 +138,20 @@ _start:
 	LA (v0, _fbss)
 	LA (v1, _end)
-	sw	zero,0(v0)
-	bltu	v0,v1,3b
-	addiu	v0,v0,4				# executed in delay slot
+	beq	v0,v1,2f
+	addiu	v0,v0,4
+	sw	zero,-4(v0)
+	bne	v0,v1,1b
 	la	t0, __lstack			# make a small stack so we
 	addiu	sp, t0, STARTUP_STACK_SIZE	# can run some C code
 	la	a0, __memsize			# get the usable memory size
 	jal	get_mem_info
-	nop
 	/* setup the stack pointer */
 	LA (t0, __stack)			# is __stack set ?
 	bne	t0,zero,4f
-	nop
 	/* NOTE: a0[0] contains the amount of memory available, and
 	         not the last memory address. */
@@ -189,19 +181,14 @@ zerobss:
 	LA (t9, hardware_init_hook)		# init the hardware if needed
 	beq	t9,zero,6f
-	nop
-	jal	t9
-	nop
+	jalr	t9
 	LA (t9, software_init_hook)		# init the hardware if needed
 	beq	t9,zero,7f
-	nop
-	jal	t9
-	nop
+	jalr	t9
 	LA (a0, _fini)
 	jal	atexit
-	nop
 #ifdef GCRT0
 	.globl	_ftext
@@ -209,12 +196,10 @@ init:
 	LA (a0, _ftext)
 	LA (a1, _etext)
 	jal	monstartup
-	nop
 	jal	_init				# run global constructors
-	nop
 	addiu	a1,sp,32			# argv = sp + 32
 	addiu	a2,sp,40			# envp = sp + 40
@@ -225,13 +210,13 @@ init:
 	sw	zero,(a1)
 	sw	zero,(a2)
-	jal	main				# call the program start function
 	move	a0,zero				# set argc to 0
+	jal	main				# call the program start function
 	# fall through to the "exit" routine
+	move	a0,v0				# pass through the exit code
 	jal	exit				# call libc exit to run the G++
 						# destructors
-	move	a0,v0				# pass through the exit code
 	.end	init
@@ -257,27 +242,25 @@ _exit:
 	/* Need to reinit PICBASE, since we might be called via exit()
 	   rather than via a return path which would restore old s0.  */
 #define PICBASE exit_PICBASE
+	.set	noreorder
 	PICBASE = .+8
 	move	s0,$31
+	.set	reorder
 #ifdef GCRT0
 	LA (t0, _mcleanup)
-	jal	t0
-	nop
+	jalr	t0
 	LA (t0, hardware_exit_hook)
 	beq	t0,zero,1f
-	nop
-	jal	t0
-	nop
+	jalr	t0
 	# break instruction can cope with 0xfffff, but GAS limits the range:
 	break	1023
 	b	7b				# but loop back just in-case
-	nop
 	.end _exit
 /* Assume the PICBASE set up above is no longer valid below here.  */

More information about the Newlib mailing list