This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH,MIPS 2/3] Enable reorder for crt0.S


Hi,

Matthew Fortune <Matthew.Fortune@imgtec.com> writes:
> Hi,
>
> As part of a long term plan to reduce the amount of hand written .set noreorder
> code, I have reworked the crt0.S file so that the assembler can fill delay
> slots instead of them being explicitly filled.  The reason for doing this is
> to enable future auto-conversion of delay slot branches to 'compact' branches
> present in the R6 architecture. Auto-conversion is not possible in a .set
> noreorder block as any delay slot branch with a non-NOP delay slot would have
> to be reordered!!! to convert to a compact branch without a delay slot.
> Writing code in a natural linear order is (subjectively) also much simpler to
> digest and maintain.
>
> One ugly piece of code had to be reworked in the zerobss loop which was using
> a pseudo-instruction BLTU as the branch. The structure of the old-loop was
> clearly aiming to produce a tight loop with one instruction and the delay slot
> filled but the expansion of the BLTU would have undone this anyway. This has
> been reworked to create the kind of loop originally intended and have the
> assembler fill the delay slot. The precise behaviour of the loop is subtly
> different from before for two reasons:
>
> 1) When the _fbss and _end symbols have the same value then the old loop would
>    have written zero to every address from _fbss to the end of memory (or an
>    exception occurred). The new loop is skipped if the two symbols are the same.
> 2) The old loop wrote zero to address of _end which is past the end of the
>    bss range. The new loop does not do this.
> 3) When _fbss is greater then _end at the start then the old loop would have
>    written one element and exited. The new loop will attempt to write zero
>    to every address from _fbss to the end of memory, wrap and continue to
>    _end (or hit an exception). This change in behaviour is fine as the
>    scenario is invalid anyway.
> 4) The _end marker is now aligned to 4-bytes to ensure that the last element
>    written does not reach beyond the address of _end. This is also necessary
>    as the termination condition is an equality test instead of an ordered
>    test so (_end - _fbss) must be a multiple of 4-bytes.

Sorry to jump on this old patch, but I couldn't see anything that did
(4).  I was trying to test mipsisa64-elf with idt64.ld and many tests
end up with an _end that isn't four-byte aligned, leading to an
"infinite" zeroing loop.

I guess we should add:

  . = ALIGN(4);

in front of:

   PROVIDE (end = .);
   _end = .;

for each script that doesn't already align to 4 or beyond.

Thanks,
Richard

>
> Delay slot filling will occur when libgloss is built with GCC and an
> optimisation level greater than zero. This gets translated to an assembler
> optimisation level of '2'.
>
> All instance of JAL <reg> have been changed to JALR <reg> as there is no
> special handling in the JAL macro in binutils for a register operand and
> JALR is the real underlying instruction.
>
> This change is primarily verified by code inspection but has also been run
> through some small test programs.
>
> Thanks,
> Matthew
>
> libgloss/
>
> 	* mips/crt0.S: Remove .set noreorder throughout.  Change JAL <reg> to
> 	JALR <reg> throughout.
> 	(zerobss): Open code the bltu macro instruction so that the
> 	zero-loop does not have a NOP in the branch delay slot.
> ---
>  libgloss/mips/crt0.S | 53 ++++++++++++++++++----------------------------------
>  1 file changed, 18 insertions(+), 35 deletions(-)
>
> diff --git a/libgloss/mips/crt0.S b/libgloss/mips/crt0.S
> index 599e79c..f66ef1b 100644
> --- a/libgloss/mips/crt0.S
> +++ b/libgloss/mips/crt0.S
> @@ -57,13 +57,14 @@
>  	.globl	_start
>  	.ent	_start
>  _start:
> -	.set	noreorder
>  #ifdef __mips_embedded_pic
>  #define PICBASE start_PICBASE
> +	.set	noreorder
>  	PICBASE = .+8
>          bal	PICBASE
>  	nop
>  	move	s0,$31
> +	.set	reorder
>  #endif
>  #if __mips<3
>  #  define STATUS_MASK (SR_CU1|SR_PE)
> @@ -89,9 +90,7 @@ _start:
>  	/* Avoid hazard from FPU enable and other SR changes.  */
>  	LA (t0, hardware_hazard_hook)
>  	beq	t0,zero,1f
> -	nop
> -	jal	t0
> -	nop
> +	jalr	t0
>  1:
>  
>  /* Check for FPU presence.  Don't check if we know that soft_float is
> @@ -105,11 +104,8 @@ _start:
>  	mfc1	t1,fp1
>  	nop
>  	bne	t0,t2,1f	/* check for match */
> -	nop
>  	bne	t1,zero,1f	/* double check */
> -	nop
>  	j	2f		/* FPU is present. */
> -	nop
>  #endif
>  1:
>  	/* FPU is not present.  Set status register to say that. */
> @@ -119,9 +115,7 @@ _start:
>  	/* Avoid hazard from FPU disable.  */
>  	LA (t0, hardware_hazard_hook)
>  	beq	t0,zero,2f
> -	nop
> -	jal	t0
> -	nop
> +	jalr	t0
>  2:
>  
>  
> @@ -129,7 +123,6 @@ _start:
>     doesn't get confused.  */
>  	LA (v0, 3f)
>  	jr	v0
> -	nop
>  3:
>  	LA (gp, _gp)				# set the global data pointer
>  	.end _start
> @@ -145,21 +138,20 @@ _start:
>  zerobss:
>  	LA (v0, _fbss)
>  	LA (v1, _end)
> -3:
> -	sw	zero,0(v0)
> -	bltu	v0,v1,3b
> -	addiu	v0,v0,4				# executed in delay slot
> -
> +	beq	v0,v1,2f
> +1:
> +	addiu	v0,v0,4
> +	sw	zero,-4(v0)
> +	bne	v0,v1,1b
> +2:
>  	la	t0, __lstack			# make a small stack so we
>  	addiu	sp, t0, STARTUP_STACK_SIZE	# can run some C code
>  	la	a0, __memsize			# get the usable memory size
>  	jal	get_mem_info
> -	nop
>  
>  	/* setup the stack pointer */
>  	LA (t0, __stack)			# is __stack set ?
>  	bne	t0,zero,4f
> -	nop
>  
>  	/* NOTE: a0[0] contains the amount of memory available, and
>  	         not the last memory address. */
> @@ -189,19 +181,14 @@ zerobss:
>  init:
>  	LA (t9, hardware_init_hook)		# init the hardware if needed
>  	beq	t9,zero,6f
> -	nop
> -	jal	t9
> -	nop
> +	jalr	t9
>  6:
>  	LA (t9, software_init_hook)		# init the hardware if needed
>  	beq	t9,zero,7f
> -	nop
> -	jal	t9
> -	nop
> +	jalr	t9
>  7:
>  	LA (a0, _fini)
>  	jal	atexit
> -	nop
>  
>  #ifdef GCRT0
>  	.globl	_ftext
> @@ -209,12 +196,10 @@ init:
>  	LA (a0, _ftext)
>  	LA (a1, _etext)
>  	jal	monstartup
> -	nop
>  #endif
>  
>  
>  	jal	_init				# run global constructors
> -	nop
>  
>  	addiu	a1,sp,32			# argv = sp + 32
>  	addiu	a2,sp,40			# envp = sp + 40
> @@ -225,13 +210,13 @@ init:
>  	sw	zero,(a1)
>  	sw	zero,(a2)
>  #endif
> -	jal	main				# call the program start function
>  	move	a0,zero				# set argc to 0
> +	jal	main				# call the program start function
>  
>  	# fall through to the "exit" routine
> +	move	a0,v0				# pass through the exit code
>  	jal	exit				# call libc exit to run the G++
>  						# destructors
> -	move	a0,v0				# pass through the exit code
>  	.end	init
>  
>   
> @@ -257,27 +242,25 @@ _exit:
>  	/* Need to reinit PICBASE, since we might be called via exit()
>  	   rather than via a return path which would restore old s0.  */
>  #define PICBASE exit_PICBASE
> +	.set	noreorder
>  	PICBASE = .+8
>  	bal	PICBASE
>  	nop
>  	move	s0,$31
> +	.set	reorder
>  #endif
>  #ifdef GCRT0
>  	LA (t0, _mcleanup)
> -	jal	t0
> -	nop
> +	jalr	t0
>  #endif
>  	LA (t0, hardware_exit_hook)
>  	beq	t0,zero,1f
> -	nop
> -	jal	t0
> -	nop
> +	jalr	t0
>  1:
>  
>  	# break instruction can cope with 0xfffff, but GAS limits the range:
>  	break	1023
>  	b	7b				# but loop back just in-case
> -	nop
>  	.end _exit
>   
>  /* Assume the PICBASE set up above is no longer valid below here.  */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]