This is the mail archive of the binutils@sourceware.org mailing list for the binutils project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 3/5] i386: Align branches within a fixed boundary


On 12.11.2019 17:19,  H.J. Lu  wrote:
> @@ -632,6 +652,31 @@ static enum check_kind
>    }
>  sse_check, operand_check = check_warning;
>  
> +/* Non-zero if branches should be aligned within power of 2 boundary.  */
> +static int align_branch_power = 0;
> +
> +/* Types of branches to align.  */
> +enum align_branch_kind
> +  {
> +    align_branch_none = 0,
> +    align_branch_jcc = 1 << 0,
> +    align_branch_fused = 1 << 1,
> +    align_branch_jmp = 1 << 2,
> +    align_branch_call = 1 << 3,
> +    align_branch_indirect = 1 << 4,
> +    align_branch_ret = 1 << 5
> +  };
> +
> +static unsigned int align_branch = (align_branch_jcc
> +				    | align_branch_fused
> +				    | align_branch_jmp);
> +
> +/* The maximum padding size for fused jcc.  */
> +#define MAX_FUSED_JCC_PADDING_SIZE 20

What is this number derived from?

> @@ -3012,6 +3070,11 @@ md_begin (void)
>        x86_dwarf2_return_column = 8;
>        x86_cie_data_alignment = -4;
>      }
> +
> +  /* NB: FUSED_JCC_PADDING frag must have sufficient room so that it
> +     can be turned into BRANCH_PREFIX frag.  */
> +  if (align_branch_prefix_size > MAX_FUSED_JCC_PADDING_SIZE)
> +    abort ();

Wouldn't such a value better be rejected at command line option
handling time?

> @@ -8178,11 +8252,181 @@ encoding_length (const fragS *start_frag, offsetT start_off,
>    return len - start_off + (frag_now_ptr - frag_now->fr_literal);
>  }
>  
> +/* Return 1 for test, and, cmp, add, sub, inc and dec which may
> +   be macro-fused with conditional jumps.  */
> +
> +static int
> +maybe_fused_with_jcc_p (void)
> +{
> +  /* No RIP address.  */
> +  if (i.base_reg && i.base_reg->reg_num == RegIP)
> +    return 0;
> +
> +  /* and, add, sub with destination register.  */
> +  if (!strcmp (i.tm.name, "and")
> +      || !strcmp (i.tm.name, "add")
> +      || !strcmp (i.tm.name, "sub"))
> +    return i.types[1].bitfield.class == Reg;
> +
> +  /* test, cmp with any register.  */
> +  if (!strcmp (i.tm.name, "test") || !strcmp (i.tm.name, "cmp"))
> +    return (i.types[0].bitfield.class == Reg
> +	    || i.types[1].bitfield.class == Reg);
> +
> +  /* inc, dec with 16/32/64-bit register.   */
> +  if (!strcmp (i.tm.name, "inc") || !strcmp (i.tm.name, "dec"))
> +    return i.types[0].bitfield.class == Reg;

Code and comment don't match up: Either you also mean 8-bit
registers and comment is wrong, or you need to extend the
condition / return value expression.

Furthermore with this being executed quite frequently, I
think using strcmp() isn't a very performant approach here.
Like elsewhere I think you want to use i.tm checks here. This
would also eliminate my worry of the code not catching cases
where optimization changes the opcode in i.tm without also
changing the name.

> +static int
> +add_fused_jcc_padding_frag_p (void)
> +{
> +  if (!align_branch_power
> +      || now_seg == absolute_section
> +      || !cpu_arch_flags.bitfield.cpui386

Why the i386 dependency?

> +      || !(align_branch & align_branch_fused))
> +    return 0;
> +
> +  if (maybe_fused_with_jcc_p ())
> +    {
> +      if (last_insn.kind != last_insn_other
> +	  && last_insn.seg == now_seg)
> +	{
> +	  if (flag_debug)
> +	    as_warn_where (last_insn.file, last_insn.line,
> +			   _("`%s` skips -malign-branch-boundary on `%s`"),
> +			   last_insn.name, i.tm.name);
> +	  return 0;
> +	}
> +      return 1;
> +    }
> +
> +  return 0;

Maybe slightly better (less indentation and a folded return) as

  if (maybe_fused_with_jcc_p ())
    {
      if (last_insn.kind == last_insn_other
	  || last_insn.seg != now_seg)
	return 1;
      if (flag_debug)
	as_warn_where (last_insn.file, last_insn.line,
		       _("`%s` skips -malign-branch-boundary on `%s`"),
		       last_insn.name, i.tm.name);
    }

  return 0;

?

> +}
> +
> +/* Return 1 if a BRANCH_PREFIX frag should be generated.  */
> +
> +static int
> +add_branch_prefix_frag_p (void)
> +{
> +  if (!align_branch_power
> +      || now_seg == absolute_section
> +      || i.tm.cpu_flags.bitfield.cpupadlock
> +      || !cpu_arch_flags.bitfield.cpui386)

Why the PadLock and i386 dependencies?

> +    return 0;
> +
> +  /* Don't add prefix if it is a prefix or there is no operand.  */
> +  if (!i.operands || i.tm.opcode_modifier.isprefix)
> +    return 0;
> +
> +  if (last_insn.kind != last_insn_other
> +      && last_insn.seg == now_seg)
> +    {
> +      if (flag_debug)
> +	as_warn_where (last_insn.file, last_insn.line,
> +		       _("`%s` skips -malign-branch-boundary on `%s`"),
> +		       last_insn.name, i.tm.name);
> +      return 0;
> +    }
> +
> +  return 1;

Like above this too can be had with less indentation.

> +static int
> +add_branch_padding_frag_p (enum align_branch_kind *branch_p)
> +{
> +  int add_padding;
> +
> +  if (!align_branch_power
> +      || now_seg == absolute_section
> +      || !cpu_arch_flags.bitfield.cpui386)

Same question as above.

> +    return 0;
> +
> +  add_padding = 0;
> +
> +  /* Check for jcc and direct jmp.  */
> +  if (i.tm.opcode_modifier.jump)
> +    {
> +      if (i.tm.base_opcode == JUMP_PC_RELATIVE)
> +	{
> +	  *branch_p = align_branch_jmp;
> +	  add_padding = align_branch & align_branch_jmp;
> +	}
> +      else
> +	{
> +	  *branch_p = align_branch_jcc;
> +	  if ((align_branch & align_branch_jcc))
> +	    add_padding = 1;
> +	}
> +    }
> +  else if (i.tm.base_opcode == 0xc2
> +	   || i.tm.base_opcode == 0xc3
> +	   || i.tm.base_opcode == 0xca
> +	   || i.tm.base_opcode == 0xcb)

These will also match various VEX/XOP/EVEX encoded templates. Perhaps
exclude any such right away in the first if()?

Also you handle near and far returns here, but ...

> +    {
> +      *branch_p = align_branch_ret;
> +      if ((align_branch & align_branch_ret))
> +	add_padding = 1;
> +    }
> +  else
> +    {
> +      if (i.tm.base_opcode == 0xe8)
> +	{
> +	  *branch_p = align_branch_call;
> +	  if ((align_branch & align_branch_call))
> +	    add_padding = 1;
> +	}
> +      else if (i.tm.base_opcode == 0xff
> +	       && (i.rm.reg == 2 || i.rm.reg == 4))

... only near indirect call/jmp here?

It would also seem more consistent to be if you used
i.tm.extension_opcode instead of i.rm.reg for the checks here.

> +	{
> +	  *branch_p = align_branch_indirect;
> +	  if ((align_branch & align_branch_indirect))
> +	    add_padding = 1;
> +	}
> +
> +      /* Check for indirect jmp, direct and indirect calls.  */

I guess this comment really belongs several lines up from here (and
a similar one is missing ahead of the RET related conditional)?

> @@ -8279,6 +8523,30 @@ output_insn (void)
>    insn_start_frag = frag_now;
>    insn_start_off = frag_now_fix ();
>  
> +  if (add_branch_padding_frag_p (&branch))
> +    {
> +      char *p;
> +      unsigned int max_branch_padding_size = 14;

Where is this number coming from? Like above (and I think also in
one more case below) any such seemingly arbitrary numbers could do
with a comment.

> @@ -8317,6 +8585,44 @@ output_insn (void)
>  	  i.prefix[LOCK_PREFIX] = 0;
>  	}
>  
> +      if (branch)
> +	/* Skip if this is a branch.  */
> +	;
> +      else if (add_fused_jcc_padding_frag_p ())
> +	{
> +	  unsigned int max_fused_padding_size
> +	    = MAX_FUSED_JCC_PADDING_SIZE;

Why the local variable? You could as well use the manifest constant
at all the use sites below as it seems. If it was just for the
identifier length, then I guess the variable could do with further
shortening.

> @@ -8461,12 +8767,81 @@ output_insn (void)
>        if (now_seg != absolute_section)
>  	{
>  	  j = encoding_length (insn_start_frag, insn_start_off, frag_more (0));
> -	  if (j > 15)
> +	  if (fragP)
> +	    {
> +	      /* NB: Don't add prefix with GOTPC relocation since
> +		 output_disp() above depends on the fixed encoding
> +		 length.  */
> +	      unsigned int max = i.has_gotpc_reloc ? 0 : 15 - j;
> +	      /* Prefix count on the current instruction.  */
> +	      unsigned int count = !!is_any_vex_encoding (&i.tm);
> +	      unsigned int k;
> +	      for (k = 0; k < ARRAY_SIZE (i.prefix); k++)
> +		if (i.prefix[k])
> +		  count++;
> +
> +	      /* NB: prefix count + instruction size must be <= 15.  */
> +	      if (j > 15)
> +		as_fatal (_("instruction length of %u bytes exceeds the limit of 15"),
> +			  j);

I think you'd better keep the if (j > 15) as the first check above
and make the new code an else-if to it. You shouldn't fail assembly
in this case, whereas the warning implying no padding attempt to
get done seems quite reasonable a behavior to me.

> +	      if (TYPE_FROM_RELAX_STATE (fragP->fr_subtype)
> +		  == BRANCH_PREFIX)
> +		{
> +		  /* Set the maximum prefix size in BRANCH_PREFIX
> +		     frag.  */
> +		  if (fragP->tc_frag_data.max_bytes > max)
> +		    fragP->tc_frag_data.max_bytes = max;
> +		  if (fragP->tc_frag_data.max_bytes > count)
> +		    fragP->tc_frag_data.max_bytes -= count;
> +		  else
> +		    fragP->tc_frag_data.max_bytes = 0;
> +		}
> +	      else
> +		{
> +		  /* Remember the maximum prefix size in FUSED_JCC_PADDING
> +		     frag.  */
> +		  unsigned int max_prefix_size;
> +		  if (align_branch_prefix_size > max)
> +		    max_prefix_size = max;
> +		  else
> +		    max_prefix_size = align_branch_prefix_size;
> +		  if (max_prefix_size > count)
> +		    fragP->tc_frag_data.max_prefix_length
> +		      = max_prefix_size - count;
> +		}
> +
> +	      /* Use existing segment prefix if possible.  Use CS
> +		 segment prefix in 64-bit mode.  In 32-bit mode, use SS
> +		 segment prefix with ESP/EBP base register and use DS
> +		 segment prefix without ESP/EBP base register.  */

Is there a reason for the preference of CS: in 64-bit mode?

> +static void
> +i386_classify_machine_dependent_frag (fragS *fragP)
> +{
> +  fragS *cmp_fragP;
> +  fragS *pad_fragP;
> +  fragS *branch_fragP;
> +  fragS *next_fragP;
> +  unsigned int max_prefix_length;
> +
> +  if (fragP->tc_frag_data.classified)
> +    return;
> +
> +  /* First scan for BRANCH_PADDING and FUSED_JCC_PADDING.  Convert
> +     FUSED_JCC_PADDING and merge BRANCH_PADDING.  */
> +  for (next_fragP = fragP;
> +       next_fragP != NULL;
> +       next_fragP = next_fragP->fr_next)
> +    {
> +      next_fragP->tc_frag_data.classified = 1;
> +      if (next_fragP->fr_type == rs_machine_dependent)
> +	switch (TYPE_FROM_RELAX_STATE (next_fragP->fr_subtype))
> +	  {
> +	  case BRANCH_PADDING:
> +	    /* The BRANCH_PADDING frag must be followed by a branch
> +	       frag.  */
> +	    branch_fragP = i386_next_non_empty_frag (next_fragP);
> +	    next_fragP->tc_frag_data.u.branch_fragP = branch_fragP;
> +	    break;
> +	  case FUSED_JCC_PADDING:
> +	    /* Check if this is a fused jcc:
> +	       FUSED_JCC_PADDING
> +	       CMP

Here and in a number of other places I wonder why you say CMP when
the set of macro-fusion candidate insns is quite a bit larger.

> +	       BRANCH_PADDING
> +	       COND_JUMP
> +	       */
> +	    cmp_fragP = i386_next_non_empty_frag (next_fragP);
> +	    pad_fragP = i386_next_non_empty_frag (cmp_fragP);
> +	    branch_fragP = i386_next_jcc_frag (pad_fragP);
> +	    if (branch_fragP)
> +	      {
> +		/* The BRANCH_PADDING frag is merged with the
> +		   FUSED_JCC_PADDING frag.  */
> +		next_fragP->tc_frag_data.u.branch_fragP = branch_fragP;
> +		/* CMP instruction size.  */
> +		next_fragP->tc_frag_data.cmp_size = cmp_fragP->fr_fix;
> +		frag_wane (pad_fragP);
> +		/* Skip to branch_fragP.  */
> +		next_fragP = branch_fragP;
> +	      }
> +	    else if (next_fragP->tc_frag_data.max_prefix_length)
> +	      {
> +		/* Turn FUSED_JCC_PADDING into BRANCH_PREFIX if it isn't
> +		   a fused jcc.  */
> +		next_fragP->fr_subtype
> +		  = ENCODE_RELAX_STATE (BRANCH_PREFIX, 0);
> +		next_fragP->tc_frag_data.max_bytes
> +		  = next_fragP->tc_frag_data.max_prefix_length;
> +		/* This will be updated in the BRANCH_PREFIX scan.  */
> +		next_fragP->tc_frag_data.max_prefix_length = 0;
> +	      }
> +	    else
> +	      frag_wane (next_fragP);
> +	    break;
> +	  }
> +    }
> +
> +  /* Scan for BRANCH_PREFIX.  */
> +  for (; fragP != NULL; fragP = fragP->fr_next)
> +    if (fragP->fr_type == rs_machine_dependent
> +	&& (TYPE_FROM_RELAX_STATE (fragP->fr_subtype)
> +	    == BRANCH_PREFIX))
> +      {
> +	/* Count all BRANCH_PREFIX frags before BRANCH_PADDING and
> +	   COND_JUMP_PREFIX.  */
> +	max_prefix_length = 0;
> +	for (next_fragP = fragP;
> +	     next_fragP != NULL;
> +	     next_fragP = next_fragP->fr_next)
> +	  {
> +	    if (next_fragP->fr_type == rs_fill)
> +	      /* Skip rs_fill frags.  */
> +	      ;
> +	    else if (next_fragP->fr_type == rs_machine_dependent)
> +	      {
> +		if (TYPE_FROM_RELAX_STATE (next_fragP->fr_subtype)
> +		    == BRANCH_PREFIX)
> +		  {
> +		    /* Count BRANCH_PREFIX frags.  */
> +		    if (max_prefix_length >= MAX_FUSED_JCC_PADDING_SIZE)
> +		      {
> +			max_prefix_length = MAX_FUSED_JCC_PADDING_SIZE;
> +			frag_wane (next_fragP);
> +		      }
> +		    else
> +		      max_prefix_length
> +			+= next_fragP->tc_frag_data.max_bytes;
> +		  }
> +		else if ((TYPE_FROM_RELAX_STATE (next_fragP->fr_subtype)
> +			  == BRANCH_PADDING)
> +			 || (TYPE_FROM_RELAX_STATE (next_fragP->fr_subtype)
> +			     == FUSED_JCC_PADDING))
> +		  {
> +		    /* Stop at BRANCH_PADDING and FUSED_JCC_PADDING.  */
> +		    fragP->tc_frag_data.u.padding_fragP = next_fragP;
> +		    break;
> +		  }
> +		else
> +		  /* Stop for other rs_machine_dependent frags.  */
> +		  break;
> +	      }
> +	    else
> +	      /* Stop for all other frags.  */
> +	      break;

Some indentation depth could again be saved by re-arranging the code.

> +static int
> +i386_branch_padding_size (fragS *fragP, offsetT address)
> +{
> +  unsigned int offset, size, padding_size;
> +  fragS *branch_fragP = fragP->tc_frag_data.u.branch_fragP;
> +
> +  /* The start address of the BRANCH_PADDING or FUSED_JCC_PADDING frag.  */
> +  if (!address)
> +    address = fragP->fr_address;
> +  address += fragP->fr_fix;
> +
> +  /* CMP instrunction size.  */
> +  size = fragP->tc_frag_data.cmp_size;
> +
> +  /* The base size of the branch frag.  */
> +  size += branch_fragP->fr_fix;
> +
> +  /* Add opcode and displacement bytes for the rs_machine_dependent
> +     branch frag.  */
> +  if (branch_fragP->fr_type == rs_machine_dependent)
> +    size += md_relax_table[branch_fragP->fr_subtype].rlx_length;
> +
> +  /* Check if branch is within boundary and doesn't end at the last
> +     byte.  */
> +  offset = address & ((1U << align_branch_power) - 1);
> +  if ((offset + size) >= (1U << align_branch_power))
> +    /* Padding needed to avoid crossing boundary.  */
> +    padding_size = (1 << align_branch_power) - offset;

Shouldn't command line option handling make sure the shifts here
won't become undefined due to too large a shift count? Also isn't
the last 1 missing a U suffix?

> +  else
> +    /* No padding needed.  */
> +    padding_size = 0;
> +
> +  if (!fits_in_signed_byte (padding_size))
> +    abort ();

Why? The function's return type is int (should probably be
unsigned int).

> @@ -11483,6 +12323,78 @@ md_parse_option (int c, const char *arg)
>          as_fatal (_("invalid -mrelax-relocations= option: `%s'"), arg);
>        break;
>  
> +    case OPTION_MALIGN_BRANCH_BOUNDARY:
> +      {
> +	char *end;
> +	int align = strtoul (arg, &end, 0);
> +	if (*end == '\0')
> +	  {
> +	    if (align == 0)
> +	      {
> +		align_branch_power = 0;
> +		break;
> +	      }
> +	    else if (align >= 32)

Why the enforcement of 32 as the lower boundary? I.e. why would 16
not be an option as well?

> +	      {
> +		int align_power;
> +		for (align_power = 0;
> +		     (align & 1) == 0;
> +		     align >>= 1, align_power++)
> +		  continue;
> +		if (align == 1)
> +		  {
> +		    align_branch_power = align_power;
> +		    break;
> +		  }
> +	      }
> +	  }
> +	as_fatal (_("invalid -malign-branch-boundary= value: %s"), arg);
> +      }
> +      break;
> +
> +    case OPTION_MALIGN_BRANCH_PREFIX_SIZE:
> +      {
> +	char *end;
> +	int align = strtoul (arg, &end, 0);
> +	if (*end == '\0' && align >= 0 && align < 6)

Similarly here - why is 5 the upper boundary on the accepted values?

> +	  {
> +	    align_branch_prefix_size = align;
> +	    break;
> +	  }
> +	as_fatal (_("invalid -malign-branch-prefix-size= value: %s"),
> +		  arg);
> +      }
> +      break;
> +
> +    case OPTION_MALIGN_BRANCH:
> +      align_branch = 0;
> +      saved = xstrdup (arg);
> +      type = saved;
> +      do
> +	{
> +	  next = strchr (type, '+');
> +	  if (next)
> +	    *next++ = '\0';
> +	  if (strcasecmp (type, "jcc") == 0)
> +	    align_branch |= align_branch_jcc;
> +	  else if (strcasecmp (type, "fused") == 0)
> +	    align_branch |= align_branch_fused;
> +	  else if (strcasecmp (type, "jmp") == 0)
> +	    align_branch |= align_branch_jmp;
> +	  else if (strcasecmp (type, "call") == 0)
> +	    align_branch |= align_branch_call;
> +	  else if (strcasecmp (type, "ret") == 0)
> +	    align_branch |= align_branch_ret;
> +	  else if (strcasecmp (type, "indirect") == 0)
> +	    align_branch |= align_branch_indirect;

Aren't command line options generally case sensitive, i.e. wouldn't
you better use strcmp() here?

> @@ -11943,6 +12875,21 @@ s_bss (int ignore ATTRIBUTE_UNUSED)
>  
>  #endif
>  
> +/* Remember constant diretive.  */
> +
> +void
> +i386_cons_worker (int ignore ATTRIBUTE_UNUSED)
> +{
> +  if (last_insn.kind != last_insn_directive
> +      && (bfd_section_flags (now_seg) & SEC_CODE))
> +    {
> +      last_insn.seg = now_seg;
> +      last_insn.kind = last_insn_directive;
> +      last_insn.name = "constant diretive";

Nit: Missing 'c' in directive.

> @@ -250,10 +257,24 @@ extern i386_cpu_flags cpu_arch_isa_flags;
>  
>  struct i386_tc_frag_data
>  {
> +  union
> +    {
> +      fragS *padding_fragP;
> +      fragS *branch_fragP;
> +    } u;
> +  addressT padding_address;
>    enum processor_type isa;
>    i386_cpu_flags isa_flags;
>    enum processor_type tune;
>    unsigned int max_bytes;
> +  signed char length;
> +  signed char last_length;
> +  signed char max_prefix_length;
> +  signed char prefix_length;
> +  signed char default_prefix;
> +  signed char cmp_size;

Why signed char for all of these? Most if not all look to be
holding unsigned quantities only.

> +  unsigned int classified : 1;
> +  unsigned int branch_type : 7;

This doesn't require 7 bits, does it?

Jan


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]