[PATCH, 4.7] Add float/double vector reductions to VSX
Michael Meissner
meissner@linux.vnet.ibm.com
Wed Mar 23 17:43:00 GMT 2011
Pat Haugen noticed that benchmarks doing vector reductions were using stores
to extract the final float element, so I decided to take a look. This patch
implements the vector reductions for float vectors without doing stores, and
eliminates a vector shift for double vectors.
I suspect there are more opportunities for improving vector extract and insert
with VSX.
I did a bootstrap and make check with no regressions. Is this ok to install on
the trunk?
[gcc]
2011-03-23 Michael Meissner <meissner@linux.vnet.ibm.com>
PR target/48258
* config/rs6000/vector.md (UNSPEC_REDUC): New unspec for vector
reduction.
(VEC_reduc): New code iterator and splitters for vector reduction.
(VEC_reduc_name): Ditto.
(VEC_reduc_rtx): Ditto.
(reduc_<VEC_reduc_name>_v2df): Vector reduction expanders for VSX.
(reduc_<VEC_reduc_name>_v4sf): Ditto.
* config/rs6000/rs6000.c (rs6000_expand_vector_extract): Add
support for extracting SF on VSX.
* config/rs6000/vsx.md (vsx_xscvspdp_scalar2): New insn for
generating xscvspdp.
(vsx_extract_v4sf): New insn to extract SF from V4SF vector.
(vsx_reduc_<VEC_reduc_name>_v2df): New insns and splitters for
double add, minimum, maximum vector reduction.
(vsx_reduc_<VEC_reduc_name>_v4sf): Ditto.
(vsx_reduc_<VEC_reduc_name>_v2df_scalar): New combiner insn to
optimize double vector reduction.
(vsx_reduc_<VEC_reduc_name>_v4sf_scalar): Ditto.
[gcc/testsuite]
2011-03-23 Michael Meissner <meissner@linux.vnet.ibm.com>
PR target/48258
* gcc.target/powerpc/pr48258-1.c: New file.
* gcc.target/powerpc/pr48258-2.c: Ditto.
--
Michael Meissner, IBM
5 Technology Place Drive, M/S 2757, Westford, MA 01886-3141, USA
meissner@linux.vnet.ibm.com fax +1 (978) 399-6899
-------------- next part --------------
Index: gcc/config/rs6000/vector.md
===================================================================
--- gcc/config/rs6000/vector.md (revision 171306)
+++ gcc/config/rs6000/vector.md (working copy)
@@ -74,7 +74,19 @@ (define_mode_attr VEC_INT [(V4SF "V4SI"
(V2DF "V2DI")])
;; constants for unspec
-(define_c_enum "unspec" [UNSPEC_PREDICATE])
+(define_c_enum "unspec" [UNSPEC_PREDICATE
+ UNSPEC_REDUC])
+
+;; Vector reduction code iterators
+(define_code_iterator VEC_reduc [plus smin smax])
+
+(define_code_attr VEC_reduc_name [(plus "splus")
+ (smin "smin")
+ (smax "smax")])
+
+(define_code_attr VEC_reduc_rtx [(plus "add")
+ (smin "smin")
+ (smax "smax")])
;; Vector move instructions.
@@ -991,6 +1003,41 @@ (define_expand "vashr<mode>3"
"TARGET_ALTIVEC"
"")
+;; Vector reduction expanders for VSX
+
+(define_expand "reduc_<VEC_reduc_name>_v2df"
+ [(parallel [(set (match_operand:V2DF 0 "vfloat_operand" "")
+ (VEC_reduc:V2DF
+ (vec_concat:V2DF
+ (vec_select:DF
+ (match_operand:V2DF 1 "vfloat_operand" "")
+ (parallel [(const_int 1)]))
+ (vec_select:DF
+ (match_dup 1)
+ (parallel [(const_int 0)])))
+ (match_dup 1)))
+ (clobber (match_scratch:V2DF 2 ""))])]
+ "VECTOR_UNIT_VSX_P (V2DFmode)"
+ "")
+
+; The (VEC_reduc:V4SF
+; (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+; (op1))
+;
+; is to allow us to use a code iterator, but not completely list all of the
+; vector rotates, etc. to prevent canonicalization
+
+(define_expand "reduc_<VEC_reduc_name>_v4sf"
+ [(parallel [(set (match_operand:V4SF 0 "vfloat_operand" "")
+ (VEC_reduc:V4SF
+ (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+ (match_operand:V4SF 1 "vfloat_operand" "")))
+ (clobber (match_scratch:V4SF 2 ""))
+ (clobber (match_scratch:V4SF 3 ""))])]
+ "VECTOR_UNIT_VSX_P (V4SFmode)"
+ "")
+
+
;;; Expanders for vector insn patterns shared between the SPE and TARGET_PAIRED systems.
(define_expand "absv2sf2"
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c (revision 171306)
+++ gcc/config/rs6000/rs6000.c (working copy)
@@ -5492,12 +5492,22 @@ rs6000_expand_vector_extract (rtx target
enum machine_mode inner_mode = GET_MODE_INNER (mode);
rtx mem;
- if (VECTOR_MEM_VSX_P (mode) && (mode == V2DFmode || mode == V2DImode))
+ if (VECTOR_MEM_VSX_P (mode))
{
- rtx (*extract_func) (rtx, rtx, rtx)
- = ((mode == V2DFmode) ? gen_vsx_extract_v2df : gen_vsx_extract_v2di);
- emit_insn (extract_func (target, vec, GEN_INT (elt)));
- return;
+ switch (mode)
+ {
+ default:
+ break;
+ case V2DFmode:
+ emit_insn (gen_vsx_extract_v2df (target, vec, GEN_INT (elt)));
+ return;
+ case V2DImode:
+ emit_insn (gen_vsx_extract_v2di (target, vec, GEN_INT (elt)));
+ return;
+ case V4SFmode:
+ emit_insn (gen_vsx_extract_v4sf (target, vec, GEN_INT (elt)));
+ return;
+ }
}
/* Allocate mode-sized buffer. */
Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md (revision 171306)
+++ gcc/config/rs6000/vsx.md (working copy)
@@ -829,6 +829,15 @@ (define_insn "vsx_xscvdpsp_scalar"
"xscvdpsp %x0,%x1"
[(set_attr "type" "fp")])
+;; Same as vsx_xscvspdp, but use SF as the type
+(define_insn "vsx_xscvspdp_scalar2"
+ [(set (match_operand:SF 0 "vsx_register_operand" "=f")
+ (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+ UNSPEC_VSX_CVSPDP))]
+ "VECTOR_UNIT_VSX_P (DFmode)"
+ "xscvspdp %x0,%x1"
+ [(set_attr "type" "fp")])
+
;; Convert from 64-bit to 32-bit types
;; Note, favor the Altivec registers since the usual use of these instructions
;; is in vector converts and we need to use the Altivec vperm instruction.
@@ -1039,6 +1048,43 @@ (define_insn "*vsx_extract_<mode>_zero"
[(set_attr "type" "fpload")
(set_attr "length" "4")])
+;; Extract a SF element from V4SF
+(define_insn_and_split "vsx_extract_v4sf"
+ [(set (match_operand:SF 0 "vsx_register_operand" "=f,f")
+ (vec_select:SF
+ (match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
+ (parallel [(match_operand:QI 2 "u5bit_cint_operand" "O,i")])))
+ (clobber (match_scratch:V4SF 3 "=X,0"))]
+ "VECTOR_UNIT_VSX_P (V4SFmode)"
+ "@
+ xscvspdp %x0,%x1
+ #"
+ ""
+ [(const_int 0)]
+ "
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+ rtx op3 = operands[3];
+ rtx tmp;
+ HOST_WIDE_INT ele = INTVAL (op2);
+
+ if (ele == 0)
+ tmp = op1;
+ else
+ {
+ if (GET_CODE (op3) == SCRATCH)
+ op3 = gen_reg_rtx (V4SFmode);
+ emit_insn (gen_vsx_xxsldwi_v4sf (op3, op1, op1, op2));
+ tmp = op3;
+ }
+ emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp));
+ DONE;
+}"
+ [(set_attr "length" "4,8")
+ (set_attr "type" "fp")])
+
;; General double word oriented permute, allow the other vector types for
;; optimizing the permute instruction.
(define_insn "vsx_xxpermdi_<mode>"
@@ -1076,7 +1122,7 @@ (define_insn "*vsx_xxpermdi2_<mode>"
(define_insn "vsx_splat_<mode>"
[(set (match_operand:VSX_D 0 "vsx_register_operand" "=wd,wd,wd,?wa,?wa,?wa")
(vec_duplicate:VSX_D
- (match_operand:<VS_scalar> 1 "splat_input_operand" "ws,f,Z,wa,wa,Z")))]
+ (match_operand:<VS_scalar> 1 "input_operand" "ws,f,Z,wa,wa,Z")))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
"@
xxpermdi %x0,%x1,%x1,0
@@ -1150,3 +1196,153 @@ (define_insn "vsx_xxsldwi_<mode>"
"VECTOR_MEM_VSX_P (<MODE>mode)"
"xxsldwi %x0,%x1,%x2,%3"
[(set_attr "type" "vecperm")])
+
+
+;; Vector reduction insns and splitters
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df"
+ [(set (match_operand:V2DF 0 "vfloat_operand" "=&wd,&?wa,wd,?wa")
+ (VEC_reduc:V2DF
+ (vec_concat:V2DF
+ (vec_select:DF
+ (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
+ (parallel [(const_int 1)]))
+ (vec_select:DF
+ (match_dup 1)
+ (parallel [(const_int 0)])))
+ (match_dup 1)))
+ (clobber (match_scratch:V2DF 2 "=0,0,&wd,&wa"))]
+ "VECTOR_UNIT_VSX_P (V2DFmode)"
+ "#"
+ ""
+ [(const_int 0)]
+ "
+{
+ rtx tmp = (GET_CODE (operands[2]) == SCRATCH)
+ ? gen_reg_rtx (V2DFmode)
+ : operands[2];
+ emit_insn (gen_vsx_xxsldwi_v2df (tmp, operands[1], operands[1], const2_rtx));
+ emit_insn (gen_<VEC_reduc_rtx>v2df3 (operands[0], tmp, operands[1]));
+ DONE;
+}"
+ [(set_attr "length" "8")
+ (set_attr "type" "veccomplex")])
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf"
+ [(set (match_operand:V4SF 0 "vfloat_operand" "=wf,?wa")
+ (VEC_reduc:V4SF
+ (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+ (match_operand:V4SF 1 "vfloat_operand" "wf,wa")))
+ (clobber (match_scratch:V4SF 2 "=&wf,&wa"))
+ (clobber (match_scratch:V4SF 3 "=&wf,&wa"))]
+ "VECTOR_UNIT_VSX_P (V4SFmode)"
+ "#"
+ ""
+ [(const_int 0)]
+ "
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx tmp2, tmp3, tmp4;
+
+ if (can_create_pseudo_p ())
+ {
+ tmp2 = gen_reg_rtx (V4SFmode);
+ tmp3 = gen_reg_rtx (V4SFmode);
+ tmp4 = gen_reg_rtx (V4SFmode);
+ }
+ else
+ {
+ tmp2 = operands[2];
+ tmp3 = operands[3];
+ tmp4 = tmp2;
+ }
+
+ emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
+ emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
+ emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
+ emit_insn (gen_<VEC_reduc_rtx>v4sf3 (op0, tmp4, tmp3));
+ DONE;
+}"
+ [(set_attr "length" "16")
+ (set_attr "type" "veccomplex")])
+
+;; Combiner patterns with the vector reduction patterns that know we can get
+;; to the top element of the V2DF array without doing an extract.
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df_scalar"
+ [(set (match_operand:DF 0 "vfloat_operand" "=&ws,&?wa,ws,?wa")
+ (vec_select:DF
+ (VEC_reduc:V2DF
+ (vec_concat:V2DF
+ (vec_select:DF
+ (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
+ (parallel [(const_int 1)]))
+ (vec_select:DF
+ (match_dup 1)
+ (parallel [(const_int 0)])))
+ (match_dup 1))
+ (parallel [(const_int 1)])))
+ (clobber (match_scratch:DF 2 "=0,0,&wd,&wa"))]
+ "VECTOR_UNIT_VSX_P (V2DFmode)"
+ "#"
+ ""
+ [(const_int 0)]
+ "
+{
+ rtx hi = gen_highpart (DFmode, operands[1]);
+ rtx lo = (GET_CODE (operands[2]) == SCRATCH)
+ ? gen_reg_rtx (DFmode)
+ : operands[2];
+
+ emit_insn (gen_vsx_extract_v2df (lo, operands[1], const1_rtx));
+ emit_insn (gen_<VEC_reduc_rtx>df3 (operands[0], hi, lo));
+ DONE;
+}"
+ [(set_attr "length" "8")
+ (set_attr "type" "veccomplex")])
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf_scalar"
+ [(set (match_operand:SF 0 "vfloat_operand" "=f,?f")
+ (vec_select:SF
+ (VEC_reduc:V4SF
+ (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+ (match_operand:V4SF 1 "vfloat_operand" "wf,wa"))
+ (parallel [(const_int 3)])))
+ (clobber (match_scratch:V4SF 2 "=&wf,&wa"))
+ (clobber (match_scratch:V4SF 3 "=&wf,&wa"))
+ (clobber (match_scratch:V4SF 4 "=0,0"))]
+ "VECTOR_UNIT_VSX_P (V4SFmode)"
+ "#"
+ ""
+ [(const_int 0)]
+ "
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx tmp2, tmp3, tmp4, tmp5;
+
+ if (can_create_pseudo_p ())
+ {
+ tmp2 = gen_reg_rtx (V4SFmode);
+ tmp3 = gen_reg_rtx (V4SFmode);
+ tmp4 = gen_reg_rtx (V4SFmode);
+ tmp5 = gen_reg_rtx (V4SFmode);
+ }
+ else
+ {
+ tmp2 = operands[2];
+ tmp3 = operands[3];
+ tmp4 = tmp2;
+ tmp5 = operands[4];
+ }
+
+ emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
+ emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
+ emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
+ emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp5, tmp4, tmp3));
+ emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp5));
+ DONE;
+}"
+ [(set_attr "length" "20")
+ (set_attr "type" "veccomplex")])
More information about the Gcc-patches
mailing list