--- /dev/null
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; {{{ Vector iterators
+
+; Vector modes for one vector register
+(define_mode_iterator VEC_1REG_MODE
+ [V64QI V64HI V64SI V64HF V64SF])
+(define_mode_iterator VEC_1REG_ALT
+ [V64QI V64HI V64SI V64HF V64SF])
+
+(define_mode_iterator VEC_1REG_INT_MODE
+ [V64QI V64HI V64SI])
+(define_mode_iterator VEC_1REG_INT_ALT
+ [V64QI V64HI V64SI])
+
+; Vector modes for two vector registers
+(define_mode_iterator VEC_2REG_MODE
+ [V64DI V64DF])
+
+; All of the above
+(define_mode_iterator VEC_REG_MODE
+ [V64QI V64HI V64SI V64HF V64SF ; Single reg
+ V64DI V64DF]) ; Double reg
+
+(define_mode_attr scalar_mode
+ [(V64QI "qi") (V64HI "hi") (V64SI "si")
+ (V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
+
+(define_mode_attr SCALAR_MODE
+ [(V64QI "QI") (V64HI "HI") (V64SI "SI")
+ (V64HF "HF") (V64SF "SF") (V64DI "DI") (V64DF "DF")])
+
+;; }}}
+;; {{{ Substitutions
+
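+; The define_substs below create "_exec" variants of any pattern that uses
+; one of these attributes (e.g. "<exec>") in its name.  The variant wraps
+; the result in a vec_merge with the previous register contents (constraint
+; "U0"), controlled by an EXEC-register operand, so that only the enabled
+; lanes are written.  (A descriptive note; the definitions below are
+; authoritative.)
+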
+(define_subst_attr "exec" "vec_merge"
+ "" "_exec")
+(define_subst_attr "exec_clobber" "vec_merge_with_clobber"
+ "" "_exec")
+(define_subst_attr "exec_vcc" "vec_merge_with_vcc"
+ "" "_exec")
+(define_subst_attr "exec_scatter" "scatter_store"
+ "" "_exec")
+
+(define_subst "vec_merge"
+ [(set (match_operand:VEC_REG_MODE 0)
+ (match_operand:VEC_REG_MODE 1))]
+ ""
+ [(set (match_dup 0)
+ (vec_merge:VEC_REG_MODE
+ (match_dup 1)
+ (match_operand:VEC_REG_MODE 3 "gcn_register_or_unspec_operand" "U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" "e")))])
+
+(define_subst "vec_merge_with_clobber"
+ [(set (match_operand:VEC_REG_MODE 0)
+ (match_operand:VEC_REG_MODE 1))
+ (clobber (match_operand 2))]
+ ""
+ [(set (match_dup 0)
+ (vec_merge:VEC_REG_MODE
+ (match_dup 1)
+ (match_operand:VEC_REG_MODE 3 "gcn_register_or_unspec_operand" "U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" "e")))
+ (clobber (match_dup 2))])
+
+(define_subst "vec_merge_with_vcc"
+ [(set (match_operand:VEC_REG_MODE 0)
+ (match_operand:VEC_REG_MODE 1))
+ (set (match_operand:DI 2)
+ (match_operand:DI 3))]
+ ""
+ [(parallel
+ [(set (match_dup 0)
+ (vec_merge:VEC_REG_MODE
+ (match_dup 1)
+ (match_operand:VEC_REG_MODE 4
+ "gcn_register_or_unspec_operand" "U0")
+ (match_operand:DI 5 "gcn_exec_reg_operand" "e")))
+ (set (match_dup 2)
+ (and:DI (match_dup 3)
+ (reg:DI EXEC_REG)))])])
+
+(define_subst "scatter_store"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand 0)
+ (match_operand 1)
+ (match_operand 2)
+ (match_operand 3)]
+ UNSPEC_SCATTER))]
+ ""
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_dup 0)
+ (match_dup 1)
+ (match_dup 2)
+ (match_dup 3)
+ (match_operand:DI 4 "gcn_exec_reg_operand" "e")]
+ UNSPEC_SCATTER))])
+
+;; }}}
+;; {{{ Vector moves
+
+; This is the entry point for all vector register moves. Memory accesses can
+; come this way also, but will more usually use the reload_in/out,
+; gather/scatter, maskload/store, etc.
+
+(define_expand "mov<mode>"
+ [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+ (match_operand:VEC_REG_MODE 1 "general_operand"))]
+ ""
+ {
+ if (MEM_P (operands[0]) && !lra_in_progress && !reload_completed)
+ {
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+ rtx scratch = gen_rtx_SCRATCH (V64DImode);
+ rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+ rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+ rtx expr = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+ operands[0],
+ scratch);
+ emit_insn (gen_scatter<mode>_expr (expr, operands[1], a, v));
+ DONE;
+ }
+ else if (MEM_P (operands[1]) && !lra_in_progress && !reload_completed)
+ {
+ rtx scratch = gen_rtx_SCRATCH (V64DImode);
+ rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+ rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+ rtx expr = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+ operands[1],
+ scratch);
+ emit_insn (gen_gather<mode>_expr (operands[0], expr, a, v));
+ DONE;
+ }
+ else if ((MEM_P (operands[0]) || MEM_P (operands[1])))
+ {
+ gcc_assert (!reload_completed);
+ rtx scratch = gen_reg_rtx (V64DImode);
+ emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], scratch));
+ DONE;
+ }
+ })
+
+; A pseudo instruction that helps LRA use the "U0" constraint.
+
+(define_insn "mov<mode>_unspec"
+ [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand" "=v")
+ (match_operand:VEC_REG_MODE 1 "gcn_unspec_operand" " U"))]
+ ""
+ ""
+ [(set_attr "type" "unknown")
+ (set_attr "length" "0")])
+
+(define_insn "*mov<mode>"
+ [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v")
+ (match_operand:VEC_1REG_MODE 1 "general_operand" "vA,B"))]
+ ""
+ "v_mov_b32\t%0, %1"
+ [(set_attr "type" "vop1,vop1")
+ (set_attr "length" "4,8")])
+
+(define_insn "mov<mode>_exec"
+ [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand"
+ "=v, v, v, v, v, m")
+ (vec_merge:VEC_1REG_MODE
+ (match_operand:VEC_1REG_MODE 1 "general_operand"
+ "vA, B, v,vA, m, v")
+ (match_operand:VEC_1REG_MODE 3 "gcn_alu_or_unspec_operand"
+ "U0,U0,vA,vA,U0,U0")
+ (match_operand:DI 2 "register_operand" " e, e,cV,Sv, e, e")))
+ (clobber (match_scratch:V64DI 4 "=X, X, X, X,&v,&v"))]
+ "!MEM_P (operands[0]) || REG_P (operands[1])"
+ "@
+ v_mov_b32\t%0, %1
+ v_mov_b32\t%0, %1
+ v_cndmask_b32\t%0, %3, %1, vcc
+ v_cndmask_b32\t%0, %3, %1, %2
+ #
+ #"
+ [(set_attr "type" "vop1,vop1,vop2,vop3a,*,*")
+ (set_attr "length" "4,8,4,8,16,16")])
+
+; This variant does not accept an unspec, but does permit MEM
+; read/modify/write which is necessary for maskstore.
+
+;(define_insn "*mov<mode>_exec_match"
+; [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v, v, m")
+; (vec_merge:VEC_1REG_MODE
+; (match_operand:VEC_1REG_MODE 1 "general_operand" "vA,B, m, v")
+; (match_dup 0)
+; (match_operand:DI 2 "gcn_exec_reg_operand" " e,e, e, e")))
+; (clobber (match_scratch:V64DI 3 "=X,X,&v,&v"))]
+; "!MEM_P (operands[0]) || REG_P (operands[1])"
+; "@
+; v_mov_b32\t%0, %1
+; v_mov_b32\t%0, %1
+; #
+; #"
+; [(set_attr "type" "vop1,vop1,*,*")
+; (set_attr "length" "4,8,16,16")])
+
+(define_insn "*mov<mode>"
+ [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "=v")
+ (match_operand:VEC_2REG_MODE 1 "general_operand" "vDB"))]
+ ""
+ {
+ if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
+ return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
+ else
+ return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")])
+
+(define_insn "mov<mode>_exec"
+ [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand"
+ "= v, v, v, v, m")
+ (vec_merge:VEC_2REG_MODE
+ (match_operand:VEC_2REG_MODE 1 "general_operand"
+ "vDB, v0, v0, m, v")
+ (match_operand:VEC_2REG_MODE 3 "gcn_alu_or_unspec_operand"
+ " U0,vDA0,vDA0,U0,U0")
+ (match_operand:DI 2 "register_operand" " e, cV, Sv, e, e")))
+ (clobber (match_scratch:V64DI 4 "= X, X, X,&v,&v"))]
+ "!MEM_P (operands[0]) || REG_P (operands[1])"
+ {
+ if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
+ switch (which_alternative)
+ {
+ case 0:
+ return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
+ case 1:
+ return "v_cndmask_b32\t%L0, %L3, %L1, vcc\;"
+ "v_cndmask_b32\t%H0, %H3, %H1, vcc";
+ case 2:
+ return "v_cndmask_b32\t%L0, %L3, %L1, %2\;"
+ "v_cndmask_b32\t%H0, %H3, %H1, %2";
+ }
+ else
+ switch (which_alternative)
+ {
+ case 0:
+ return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
+ case 1:
+ return "v_cndmask_b32\t%H0, %H3, %H1, vcc\;"
+ "v_cndmask_b32\t%L0, %L3, %L1, vcc";
+ case 2:
+ return "v_cndmask_b32\t%H0, %H3, %H1, %2\;"
+ "v_cndmask_b32\t%L0, %L3, %L1, %2";
+ }
+
+ return "#";
+ }
+ [(set_attr "type" "vmult,vmult,vmult,*,*")
+ (set_attr "length" "16,16,16,16,16")])
+
+; This variant does not accept an unspec, but does permit MEM
+; read/modify/write which is necessary for maskstore.
+
+;(define_insn "*mov<mode>_exec_match"
+; [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "=v, v, m")
+; (vec_merge:VEC_2REG_MODE
+; (match_operand:VEC_2REG_MODE 1 "general_operand" "vDB, m, v")
+; (match_dup 0)
+; (match_operand:DI 2 "gcn_exec_reg_operand" " e, e, e")))
+; (clobber (match_scratch:V64DI 3 "=X,&v,&v"))]
+; "!MEM_P (operands[0]) || REG_P (operands[1])"
+; "@
+; * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+; return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+; else \
+; return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+; #
+; #"
+; [(set_attr "type" "vmult,*,*")
+; (set_attr "length" "16,16,16")])
+
+; An SGPR-base load looks like:
+; <load> v, Sv
+;
+; There's no hardware instruction that corresponds to this, but vector base
+; addresses are placed in an SGPR because it is easier to add to a vector.
+; We also use a temporary vT, and the vector v1, which holds the lane numbers.
+;
+; Rewrite as:
+; vT = v1 << log2(element-size)
+; vT += Sv
+; flat_load v, vT
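+;
+; For example, for V64SImode (4-byte elements) the expansion is roughly the
+; following sequence (an illustrative sketch only, assuming the lane numbers
+; are live in v1 and vT names the V64DI scratch pair):
+;
+; v_lshlrev_b32 vT, 2, v1 ; vT = lane-id * element-size
+; v_add_u32 vT, vcc, Sv, vT ; add the scalar base, low half
+; v_addc_u32 vT+1, vcc, Sv+1, 0, vcc ; high half takes the carry
+; flat_load_dword v, vT ; one lane-addressed load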
+
+(define_insn "mov<mode>_sgprbase"
+ [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "= v, v, v, m")
+ (unspec:VEC_1REG_MODE
+ [(match_operand:VEC_1REG_MODE 1 "general_operand" " vA,vB, m, v")]
+ UNSPEC_SGPRBASE))
+ (clobber (match_operand:V64DI 2 "register_operand" "=&v,&v,&v,&v"))]
+ "lra_in_progress || reload_completed"
+ "@
+ v_mov_b32\t%0, %1
+ v_mov_b32\t%0, %1
+ #
+ #"
+ [(set_attr "type" "vop1,vop1,*,*")
+ (set_attr "length" "4,8,12,12")])
+
+(define_insn "mov<mode>_sgprbase"
+ [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "= v, v, m")
+ (unspec:VEC_2REG_MODE
+ [(match_operand:VEC_2REG_MODE 1 "general_operand" "vDB, m, v")]
+ UNSPEC_SGPRBASE))
+ (clobber (match_operand:V64DI 2 "register_operand" "=&v,&v,&v"))]
+ "lra_in_progress || reload_completed"
+ "@
+ * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+ return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+ else \
+ return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+ #
+ #"
+ [(set_attr "type" "vmult,*,*")
+ (set_attr "length" "8,12,12")])
+
+; reload_in was once a standard name, but here it's only referenced by
+; gcn_secondary_reload. It allows a reload with a scratch register.
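+; (In this scheme gcn_secondary_reload is expected to hand back this
+; pattern's instruction code through the secondary_reload_info structure, so
+; that LRA emits it together with the V64DI scratch.  A descriptive note
+; only; see the TARGET_SECONDARY_RELOAD hook in gcn.c for the authoritative
+; logic.)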
+
+(define_expand "reload_in<mode>"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "= v")
+ (match_operand:VEC_REG_MODE 1 "memory_operand" " m"))
+ (clobber (match_operand:V64DI 2 "register_operand" "=&v"))]
+ ""
+ {
+ emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], operands[2]));
+ DONE;
+ })
+
+; reload_out is similar to reload_in, above.
+
+(define_expand "reload_out<mode>"
+ [(set (match_operand:VEC_REG_MODE 0 "memory_operand" "= m")
+ (match_operand:VEC_REG_MODE 1 "register_operand" " v"))
+ (clobber (match_operand:V64DI 2 "register_operand" "=&v"))]
+ ""
+ {
+ emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], operands[2]));
+ DONE;
+ })
+
+; Expand scalar addresses into gather/scatter patterns
+
+(define_split
+ [(set (match_operand:VEC_REG_MODE 0 "memory_operand")
+ (unspec:VEC_REG_MODE
+ [(match_operand:VEC_REG_MODE 1 "general_operand")]
+ UNSPEC_SGPRBASE))
+ (clobber (match_scratch:V64DI 2))]
+ ""
+ [(set (mem:BLK (scratch))
+ (unspec:BLK [(match_dup 5) (match_dup 1) (match_dup 6) (match_dup 7)]
+ UNSPEC_SCATTER))]
+ {
+ operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+ operands[0],
+ operands[2]);
+ operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+ operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+ })
+
+(define_split
+ [(set (match_operand:VEC_REG_MODE 0 "memory_operand")
+ (vec_merge:VEC_REG_MODE
+ (match_operand:VEC_REG_MODE 1 "general_operand")
+ (match_operand:VEC_REG_MODE 2 "")
+ (match_operand:DI 3 "gcn_exec_reg_operand")))
+ (clobber (match_scratch:V64DI 4))]
+ ""
+ [(set (mem:BLK (scratch))
+ (unspec:BLK [(match_dup 5) (match_dup 1)
+ (match_dup 6) (match_dup 7) (match_dup 3)]
+ UNSPEC_SCATTER))]
+ {
+ operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode,
+ operands[3],
+ operands[0],
+ operands[4]);
+ operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+ operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+ })
+
+(define_split
+ [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+ (unspec:VEC_REG_MODE
+ [(match_operand:VEC_REG_MODE 1 "memory_operand")]
+ UNSPEC_SGPRBASE))
+ (clobber (match_scratch:V64DI 2))]
+ ""
+ [(set (match_dup 0)
+ (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7)
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER))]
+ {
+ operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+ operands[1],
+ operands[2]);
+ operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+ operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+ })
+
+(define_split
+ [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+ (vec_merge:VEC_REG_MODE
+ (match_operand:VEC_REG_MODE 1 "memory_operand")
+ (match_operand:VEC_REG_MODE 2 "")
+ (match_operand:DI 3 "gcn_exec_reg_operand")))
+ (clobber (match_scratch:V64DI 4))]
+ ""
+ [(set (match_dup 0)
+ (vec_merge:VEC_REG_MODE
+ (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7)
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER)
+ (match_dup 2)
+ (match_dup 3)))]
+ {
+ operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode,
+ operands[3],
+ operands[1],
+ operands[4]);
+ operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+ operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+ })
+
+; TODO: Add zero/sign extending variants.
+
+;; }}}
+;; {{{ Lane moves
+
+; v_writelane and v_readlane work regardless of exec flags.
+; We allow the source to be scratch.
+;
+; FIXME: these should take A immediates.
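+;
+; In these patterns the vec_merge mask is (ashift 1 lane) (or the equivalent
+; power-of-two constant in the *_1 variants): RTL vec_merge takes element N
+; from its first input when bit N of the mask is set, so a mask of 1<<lane
+; replaces exactly that one lane.  For example, lane 5 gives a mask of 0x20,
+; so only element 5 comes from the vec_duplicate.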
+
+(define_insn "*vec_set<mode>"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "= v")
+ (vec_merge:VEC_1REG_MODE
+ (vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" " Sv"))
+ (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+ " U0")
+ (ashift (const_int 1)
+ (match_operand:SI 2 "gcn_alu_operand" "SvB"))))]
+ ""
+ "v_writelane_b32 %0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")
+ (set_attr "exec" "none")
+ (set_attr "laneselect" "yes")])
+
+; FIXME: 64-bit operations really should be splitters, but I am not sure how
+; to represent vertical subregs.
+(define_insn "*vec_set<mode>"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "= v")
+ (vec_merge:VEC_2REG_MODE
+ (vec_duplicate:VEC_2REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" " Sv"))
+ (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+ " U0")
+ (ashift (const_int 1)
+ (match_operand:SI 2 "gcn_alu_operand" "SvB"))))]
+ ""
+ "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2"
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")
+ (set_attr "exec" "none")
+ (set_attr "laneselect" "yes")])
+
+(define_expand "vec_set<mode>"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand")
+ (vec_merge:VEC_REG_MODE
+ (vec_duplicate:VEC_REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand"))
+ (match_dup 0)
+ (ashift (const_int 1) (match_operand:SI 2 "gcn_alu_operand"))))]
+ "")
+
+(define_insn "*vec_set<mode>_1"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v")
+ (vec_merge:VEC_1REG_MODE
+ (vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" "Sv"))
+ (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+ "U0")
+ (match_operand:SI 2 "const_int_operand" " i")))]
+ "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)"
+ {
+ operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
+ return "v_writelane_b32 %0, %1, %2";
+ }
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")
+ (set_attr "exec" "none")
+ (set_attr "laneselect" "yes")])
+
+(define_insn "*vec_set<mode>_1"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "=v")
+ (vec_merge:VEC_2REG_MODE
+ (vec_duplicate:VEC_2REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "register_operand" "Sv"))
+ (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+ "U0")
+ (match_operand:SI 2 "const_int_operand" " i")))]
+ "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)"
+ {
+ operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
+ return "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2";
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")
+ (set_attr "exec" "none")
+ (set_attr "laneselect" "yes")])
+
+(define_insn "vec_duplicate<mode><exec>"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v")
+ (vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvB")))]
+ ""
+ "v_mov_b32\t%0, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "vec_duplicate<mode><exec>"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "= v")
+ (vec_duplicate:VEC_2REG_MODE
+ (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvDB")))]
+ ""
+ "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "16")])
+
+(define_insn "vec_extract<mode><scalar_mode>"
+ [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg")
+ (vec_select:<SCALAR_MODE>
+ (match_operand:VEC_1REG_MODE 1 "register_operand" " v")
+ (parallel [(match_operand:SI 2 "gcn_alu_operand" "SvB")])))]
+ ""
+ "v_readlane_b32 %0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")
+ (set_attr "exec" "none")
+ (set_attr "laneselect" "yes")])
+
+(define_insn "vec_extract<mode><scalar_mode>"
+ [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg")
+ (vec_select:<SCALAR_MODE>
+ (match_operand:VEC_2REG_MODE 1 "register_operand" " v")
+ (parallel [(match_operand:SI 2 "gcn_alu_operand" "SvB")])))]
+ ""
+ "v_readlane_b32 %L0, %L1, %2\;v_readlane_b32 %H0, %H1, %2"
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")
+ (set_attr "exec" "none")
+ (set_attr "laneselect" "yes")])
+
+(define_expand "vec_init<mode><scalar_mode>"
+ [(match_operand:VEC_REG_MODE 0 "register_operand")
+ (match_operand 1)]
+ ""
+ {
+ gcn_expand_vector_init (operands[0], operands[1]);
+ DONE;
+ })
+
+;; }}}
+;; {{{ Scatter / Gather
+
+;; GCN does not have an instruction for loading a vector from contiguous
+;; memory, so *all* loads and stores are eventually converted to scatter
+;; or gather.
+;;
+;; GCC does not permit MEM to hold vectors of addresses, so we must use an
+;; unspec. The unspec formats are as follows:
+;;
+;; (unspec:V64??
+;; [(<address expression>)
+;; (<addr_space_t>)
+;; (<use_glc>)
+;; (mem:BLK (scratch))]
+;; UNSPEC_GATHER)
+;;
+;; (unspec:BLK
+;; [(<address expression>)
+;; (<source register>)
+;; (<addr_space_t>)
+;; (<use_glc>)
+;; (<exec>)]
+;; UNSPEC_SCATTER)
+;;
+;; - Loads are expected to be wrapped in a vec_merge, so do not need <exec>.
+;; - The mem:BLK does not contain any real information, but indicates that an
+;; unknown memory read is taking place. Stores are expected to use a similar
+;; mem:BLK outside the unspec.
+;; - The address space and glc (volatile) fields are there to replace the
+;; fields normally found in a MEM.
+;; - Multiple forms of address expression are supported, below.
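+;;
+;; For example (a sketch of the two main forms; the insns below are
+;; authoritative), the single-offset form used by gather<mode>_insn_1offset
+;; takes a vector of 64-bit addresses plus a constant offset:
+;;
+;;   (plus:V64DI <vector of addresses>
+;;               (vec_duplicate:V64DI (const_int <offset>)))
+;;
+;; while the two-offset form used by gather<mode>_insn_2offsets combines a
+;; scalar base address, a vector of 32-bit offsets, and a constant offset:
+;;
+;;   (plus:V64DI (plus:V64DI (vec_duplicate:V64DI <scalar base>)
+;;                           (sign_extend:V64DI <vector of offsets>))
+;;               (vec_duplicate:V64DI (const_int <offset>)))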
+
+(define_expand "gather_load<mode>"
+ [(match_operand:VEC_REG_MODE 0 "register_operand")
+ (match_operand:DI 1 "register_operand")
+ (match_operand 2 "register_operand")
+ (match_operand 3 "immediate_operand")
+ (match_operand:SI 4 "gcn_alu_operand")]
+ ""
+ {
+ rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1],
+ operands[2], operands[4],
+ INTVAL (operands[3]), NULL);
+
+ if (GET_MODE (addr) == V64DImode)
+ emit_insn (gen_gather<mode>_insn_1offset (operands[0], addr, const0_rtx,
+ const0_rtx, const0_rtx));
+ else
+ emit_insn (gen_gather<mode>_insn_2offsets (operands[0], operands[1],
+ addr, const0_rtx, const0_rtx,
+ const0_rtx));
+ DONE;
+ })
+
+(define_expand "gather<mode>_exec"
+ [(match_operand:VEC_REG_MODE 0 "register_operand")
+ (match_operand:DI 1 "register_operand")
+ (match_operand:V64SI 2 "register_operand")
+ (match_operand 3 "immediate_operand")
+ (match_operand:SI 4 "gcn_alu_operand")
+ (match_operand:DI 5 "gcn_exec_reg_operand")]
+ ""
+ {
+ rtx undefmode = gcn_gen_undef (<MODE>mode);
+
+ rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1],
+ operands[2], operands[4],
+ INTVAL (operands[3]), operands[5]);
+
+ if (GET_MODE (addr) == V64DImode)
+ emit_insn (gen_gather<mode>_insn_1offset_exec (operands[0], addr,
+ const0_rtx, const0_rtx,
+ const0_rtx, undefmode,
+ operands[5]));
+ else
+ emit_insn (gen_gather<mode>_insn_2offsets_exec (operands[0], operands[1],
+ addr, const0_rtx,
+ const0_rtx, const0_rtx,
+ undefmode, operands[5]));
+ DONE;
+ })
+
+; Allow any address expression
+(define_expand "gather<mode>_expr<exec>"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand")
+ (unspec:VEC_REG_MODE
+ [(match_operand 1 "")
+ (match_operand 2 "immediate_operand")
+ (match_operand 3 "immediate_operand")
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER))]
+ ""
+ {})
+
+(define_insn "gather<mode>_insn_1offset<exec>"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v")
+ (unspec:VEC_REG_MODE
+ [(plus:V64DI (match_operand:V64DI 1 "register_operand" " v")
+ (vec_duplicate:V64DI
+ (match_operand 2 "immediate_operand" " n")))
+ (match_operand 3 "immediate_operand" " n")
+ (match_operand 4 "immediate_operand" " n")
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER))]
+ "(AS_FLAT_P (INTVAL (operands[3]))
+ && ((TARGET_GCN3 && INTVAL(operands[2]) == 0)
+ || ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x1000)))
+ || (AS_GLOBAL_P (INTVAL (operands[3]))
+ && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
+ {
+ addr_space_t as = INTVAL (operands[3]);
+ const char *glc = INTVAL (operands[4]) ? " glc" : "";
+
+ static char buf[200];
+ if (AS_FLAT_P (as))
+ {
+ if (TARGET_GCN5_PLUS)
+ sprintf (buf, "flat_load%%s0\t%%0, %%1 offset:%%2%s\;s_waitcnt\t0",
+ glc);
+ else
+ sprintf (buf, "flat_load%%s0\t%%0, %%1%s\;s_waitcnt\t0", glc);
+ }
+ else if (AS_GLOBAL_P (as))
+ sprintf (buf, "global_load%%s0\t%%0, %%1, off offset:%%2%s\;"
+ "s_waitcnt\tvmcnt(0)", glc);
+ else
+ gcc_unreachable ();
+
+ return buf;
+ }
+ [(set_attr "type" "flat")
+ (set_attr "length" "12")])
+
+(define_insn "gather<mode>_insn_1offset_ds<exec>"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v")
+ (unspec:VEC_REG_MODE
+ [(plus:V64SI (match_operand:V64SI 1 "register_operand" " v")
+ (vec_duplicate:V64SI
+ (match_operand 2 "immediate_operand" " n")))
+ (match_operand 3 "immediate_operand" " n")
+ (match_operand 4 "immediate_operand" " n")
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER))]
+ "(AS_ANY_DS_P (INTVAL (operands[3]))
+ && ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x10000))"
+ {
+ addr_space_t as = INTVAL (operands[3]);
+ static char buf[200];
+ sprintf (buf, "ds_read%%b0\t%%0, %%1 offset:%%2%s\;s_waitcnt\tlgkmcnt(0)",
+ (AS_GDS_P (as) ? " gds" : ""));
+ return buf;
+ }
+ [(set_attr "type" "ds")
+ (set_attr "length" "12")])
+
+(define_insn "gather<mode>_insn_2offsets<exec>"
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "=v")
+ (unspec:VEC_REG_MODE
+ [(plus:V64DI
+ (plus:V64DI
+ (vec_duplicate:V64DI
+ (match_operand:DI 1 "register_operand" "Sv"))
+ (sign_extend:V64DI
+ (match_operand:V64SI 2 "register_operand" " v")))
+ (vec_duplicate:V64DI (match_operand 3 "immediate_operand" " n")))
+ (match_operand 4 "immediate_operand" " n")
+ (match_operand 5 "immediate_operand" " n")
+ (mem:BLK (scratch))]
+ UNSPEC_GATHER))]
+ "(AS_GLOBAL_P (INTVAL (operands[4]))
+ && (((unsigned HOST_WIDE_INT)INTVAL(operands[3]) + 0x1000) < 0x2000))"
+ {
+ addr_space_t as = INTVAL (operands[4]);
+ const char *glc = INTVAL (operands[5]) ? " glc" : "";
+
+ static char buf[200];
+ if (AS_GLOBAL_P (as))
+ {
+ /* Work around assembler bug in which a 64-bit register is expected,
+ but a 32-bit value would be correct. */
+ int reg = REGNO (operands[2]) - FIRST_VGPR_REG;
+ sprintf (buf, "global_load%%s0\t%%0, v[%d:%d], %%1 offset:%%3%s\;"
+ "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc);
+ }
+ else
+ gcc_unreachable ();
+
+ return buf;
+ }
+ [(set_attr "type" "flat")
+ (set_attr "length" "12")])
+
+(define_expand "scatter_store<mode>"
+ [(match_operand:DI 0 "register_operand")
+ (match_operand 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand:SI 3 "gcn_alu_operand")
+ (match_operand:VEC_REG_MODE 4 "register_operand")]
+ ""
+ {
+ rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
+ operands[1], operands[3],
+ INTVAL (operands[2]), NULL);
+
+ if (GET_MODE (addr) == V64DImode)
+ emit_insn (gen_scatter<mode>_insn_1offset (addr, const0_rtx, operands[4],
+ const0_rtx, const0_rtx));
+ else
+ emit_insn (gen_scatter<mode>_insn_2offsets (operands[0], addr,
+ const0_rtx, operands[4],
+ const0_rtx, const0_rtx));
+ DONE;
+ })
+
+(define_expand "scatter<mode>_exec"
+ [(match_operand:DI 0 "register_operand")
+ (match_operand 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand:SI 3 "gcn_alu_operand")
+ (match_operand:VEC_REG_MODE 4 "register_operand")
+ (match_operand:DI 5 "gcn_exec_reg_operand")]
+ ""
+ {
+ operands[5] = force_reg (DImode, operands[5]);
+
+ rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
+ operands[1], operands[3],
+ INTVAL (operands[2]), operands[5]);
+
+ if (GET_MODE (addr) == V64DImode)
+ emit_insn (gen_scatter<mode>_insn_1offset_exec (addr, const0_rtx,
+ operands[4], const0_rtx,
+ const0_rtx,
+ operands[5]));
+ else
+ emit_insn (gen_scatter<mode>_insn_2offsets_exec (operands[0], addr,
+ const0_rtx, operands[4],
+ const0_rtx, const0_rtx,
+ operands[5]));
+ DONE;
+ })
+
+; Allow any address expression
+(define_expand "scatter<mode>_expr<exec_scatter>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:V64DI 0 "")
+ (match_operand:VEC_REG_MODE 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand 3 "immediate_operand")]
+ UNSPEC_SCATTER))]
+ ""
+ {})
+
+(define_insn "scatter<mode>_insn_1offset<exec_scatter>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(plus:V64DI (match_operand:V64DI 0 "register_operand" "v")
+ (vec_duplicate:V64DI
+ (match_operand 1 "immediate_operand" "n")))
+ (match_operand:VEC_REG_MODE 2 "register_operand" "v")
+ (match_operand 3 "immediate_operand" "n")
+ (match_operand 4 "immediate_operand" "n")]
+ UNSPEC_SCATTER))]
+ "(AS_FLAT_P (INTVAL (operands[3]))
+ && (INTVAL(operands[1]) == 0
+ || (TARGET_GCN5_PLUS
+ && (unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x1000)))
+ || (AS_GLOBAL_P (INTVAL (operands[3]))
+ && (((unsigned HOST_WIDE_INT)INTVAL(operands[1]) + 0x1000) < 0x2000))"
+ {
+ addr_space_t as = INTVAL (operands[3]);
+ const char *glc = INTVAL (operands[4]) ? " glc" : "";
+
+ static char buf[200];
+ if (AS_FLAT_P (as))
+ {
+ if (TARGET_GCN5_PLUS)
+ sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;"
+ "s_waitcnt\texpcnt(0)", glc);
+ else
+ sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\texpcnt(0)",
+ glc);
+ }
+ else if (AS_GLOBAL_P (as))
+ sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;"
+ "s_waitcnt\texpcnt(0)", glc);
+ else
+ gcc_unreachable ();
+
+ return buf;
+ }
+ [(set_attr "type" "flat")
+ (set_attr "length" "12")])
+
+(define_insn "scatter<mode>_insn_1offset_ds<exec_scatter>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(plus:V64SI (match_operand:V64SI 0 "register_operand" "v")
+ (vec_duplicate:V64SI
+ (match_operand 1 "immediate_operand" "n")))
+ (match_operand:VEC_REG_MODE 2 "register_operand" "v")
+ (match_operand 3 "immediate_operand" "n")
+ (match_operand 4 "immediate_operand" "n")]
+ UNSPEC_SCATTER))]
+ "(AS_ANY_DS_P (INTVAL (operands[3]))
+ && ((unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x10000))"
+ {
+ addr_space_t as = INTVAL (operands[3]);
+ static char buf[200];
+ sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\texpcnt(0)",
+ (AS_GDS_P (as) ? " gds" : ""));
+ return buf;
+ }
+ [(set_attr "type" "ds")
+ (set_attr "length" "12")])
+
+(define_insn "scatter<mode>_insn_2offsets<exec_scatter>"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(plus:V64DI
+ (plus:V64DI
+ (vec_duplicate:V64DI
+ (match_operand:DI 0 "register_operand" "Sv"))
+ (sign_extend:V64DI
+ (match_operand:V64SI 1 "register_operand" " v")))
+ (vec_duplicate:V64DI (match_operand 2 "immediate_operand"
+ " n")))
+ (match_operand:VEC_REG_MODE 3 "register_operand" " v")
+ (match_operand 4 "immediate_operand" " n")
+ (match_operand 5 "immediate_operand" " n")]
+ UNSPEC_SCATTER))]
+ "(AS_GLOBAL_P (INTVAL (operands[4]))
+ && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
+ {
+ addr_space_t as = INTVAL (operands[4]);
+ const char *glc = INTVAL (operands[5]) ? " glc" : "";
+
+ static char buf[200];
+ if (AS_GLOBAL_P (as))
+ {
+ /* Work around assembler bug in which a 64-bit register is expected,
+ but a 32-bit value would be correct. */
+ int reg = REGNO (operands[1]) - FIRST_VGPR_REG;
+ sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;"
+ "s_waitcnt\texpcnt(0)", reg, reg + 1, glc);
+ }
+ else
+ gcc_unreachable ();
+
+ return buf;
+ }
+ [(set_attr "type" "flat")
+ (set_attr "length" "12")])
+
+;; }}}
+;; {{{ Permutations
+
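+; ds_bpermute_b32 is a "backwards" permute: each lane reads the source VGPR
+; of the lane selected by its per-lane address operand (byte address divided
+; by four).  The data is routed through the LDS hardware, hence the lgkmcnt
+; wait afterwards.  (Descriptive note on the hardware behaviour.)
+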
+(define_insn "ds_bpermute<mode>"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v")
+ (unspec:VEC_1REG_MODE
+ [(match_operand:VEC_1REG_MODE 2 "register_operand" " v")
+ (match_operand:V64SI 1 "register_operand" " v")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")]
+ UNSPEC_BPERMUTE))]
+ ""
+ "ds_bpermute_b32\t%0, %1, %2\;s_waitcnt\tlgkmcnt(0)"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "12")])
+
+(define_insn_and_split "ds_bpermute<mode>"
+ [(set (match_operand:VEC_2REG_MODE 0 "register_operand" "=&v")
+ (unspec:VEC_2REG_MODE
+ [(match_operand:VEC_2REG_MODE 2 "register_operand" " v0")
+ (match_operand:V64SI 1 "register_operand" " v")
+ (match_operand:DI 3 "gcn_exec_reg_operand" " e")]
+ UNSPEC_BPERMUTE))]
+ ""
+ "#"
+ "reload_completed"
+ [(set (match_dup 4) (unspec:V64SI [(match_dup 6) (match_dup 1) (match_dup 3)]
+ UNSPEC_BPERMUTE))
+ (set (match_dup 5) (unspec:V64SI [(match_dup 7) (match_dup 1) (match_dup 3)]
+ UNSPEC_BPERMUTE))]
+ {
+ operands[4] = gcn_operand_part (<MODE>mode, operands[0], 0);
+ operands[5] = gcn_operand_part (<MODE>mode, operands[0], 1);
+ operands[6] = gcn_operand_part (<MODE>mode, operands[2], 0);
+ operands[7] = gcn_operand_part (<MODE>mode, operands[2], 1);
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "24")])
+
+;; }}}
+;; {{{ ALU special case: add/sub
+
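+; The 32-bit add patterns below always use the carry-writing form of the
+; hardware add, so even a plain vector add must clobber VCC.  (Descriptive
+; note: the "%^" output modifier appears to select the newer "_co" mnemonic
+; spelling where the assembler requires it.)
+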
+(define_insn "addv64si3<exec_clobber>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (plus:V64SI
+ (match_operand:V64SI 1 "register_operand" "% v")
+ (match_operand:V64SI 2 "gcn_alu_operand" "vSvB")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+(define_insn "addv64si3_dup<exec_clobber>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (plus:V64SI
+ (vec_duplicate:V64SI
+ (match_operand:SI 2 "gcn_alu_operand" "SvB"))
+ (match_operand:V64SI 1 "register_operand" " v")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+(define_insn "addv64si3_vcc<exec_vcc>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v")
+ (plus:V64SI
+ (match_operand:V64SI 1 "register_operand" "% v, v")
+ (match_operand:V64SI 2 "gcn_alu_operand" "vSvB,vSvB")))
+ (set (match_operand:DI 3 "register_operand" "= cV, Sg")
+ (ltu:DI (plus:V64SI (match_dup 1) (match_dup 2))
+ (match_dup 1)))]
+ ""
+ "v_add%^_u32\t%0, %3, %2, %1"
+ [(set_attr "type" "vop2,vop3b")
+ (set_attr "length" "8")])
+
+(define_insn "addv64si3_vcc_dup<exec_vcc>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v")
+ (plus:V64SI
+ (vec_duplicate:V64SI
+ (match_operand:SI 1 "gcn_alu_operand" "SvB,SvB"))
+ (match_operand:V64SI 2 "register_operand" " v, v")))
+ (set (match_operand:DI 3 "register_operand" "=cV, Sg")
+ (ltu:DI (plus:V64SI (vec_duplicate:V64SI (match_dup 1))
+ (match_dup 2))
+ (vec_duplicate:V64SI (match_dup 1))))]
+ ""
+ "v_add%^_u32\t%0, %3, %2, %1"
+ [(set_attr "type" "vop2,vop3b")
+ (set_attr "length" "8,8")])
+
+; This pattern only changes the VCC bits when the corresponding lane is
+; enabled, so the set must be described as an ior.
+;
+; It does not accept an SGPR operand because the VCC read already counts as
+; an SGPR use and the number of SGPR operands is limited to 1.
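+;
+; The carry-out expression follows the usual decomposition of a + b + cin
+; evaluated as (cin + a) + b: a carry occurs if the outer addition of b
+; wrapped (the first ltu) or the inner addition of cin to a wrapped (the
+; second ltu), hence the ior of the two comparisons.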
+
+(define_insn "addcv64si3<exec_vcc>"
+ [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+ (plus:V64SI
+ (plus:V64SI
+ (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_operand:DI 3 "register_operand" " cV,Sv"))
+ (match_operand:V64SI 1 "gcn_alu_operand" "%vA,vA"))
+ (match_operand:V64SI 2 "gcn_alu_operand" " vB,vB")))
+ (set (match_operand:DI 4 "register_operand" "=cV,Sg")
+ (ior:DI (ltu:DI (plus:V64SI
+ (plus:V64SI
+ (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_dup 3))
+ (match_dup 1))
+ (match_dup 2))
+ (match_dup 2))
+ (ltu:DI (plus:V64SI
+ (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_dup 3))
+ (match_dup 1))
+ (match_dup 1))))]
+ ""
+ "v_addc%^_u32\t%0, %4, %1, %2, %3"
+ [(set_attr "type" "vop2,vop3b")
+ (set_attr "length" "4,8")])
+
+(define_insn "addcv64si3_dup<exec_vcc>"
+ [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+ (plus:V64SI
+ (plus:V64SI
+ (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_operand:DI 3 "register_operand" " cV, Sv"))
+ (match_operand:V64SI 1 "gcn_alu_operand" "%vA, vA"))
+ (vec_duplicate:V64SI
+ (match_operand:SI 2 "gcn_alu_operand" "SvB,SvB"))))
+ (set (match_operand:DI 4 "register_operand" "=cV, Sg")
+ (ior:DI (ltu:DI (plus:V64SI (plus:V64SI
+ (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_dup 3))
+ (match_dup 1))
+ (vec_duplicate:V64SI
+ (match_dup 2)))
+ (vec_duplicate:V64SI
+ (match_dup 2)))
+ (ltu:DI (plus:V64SI (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_dup 3))
+ (match_dup 1))
+ (match_dup 1))))]
+ ""
+ "v_addc%^_u32\t%0, %4, %1, %2, %3"
+ [(set_attr "type" "vop2,vop3b")
+ (set_attr "length" "4,8")])
+
+(define_insn "subv64si3<exec_clobber>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v")
+ (minus:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" "vSvB, v")
+ (match_operand:V64SI 2 "gcn_alu_operand" " v,vSvB")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "@
+ v_sub%^_u32\t%0, vcc, %1, %2
+ v_subrev%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8,8")])
+
+(define_insn "subv64si3_vcc<exec_vcc>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v, v, v")
+ (minus:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" "vSvB,vSvB, v, v")
+ (match_operand:V64SI 2 "gcn_alu_operand" " v, v,vSvB,vSvB")))
+ (set (match_operand:DI 3 "register_operand" "= cV, Sg, cV, Sg")
+ (gtu:DI (minus:V64SI (match_dup 1) (match_dup 2))
+ (match_dup 1)))]
+ ""
+ "@
+ v_sub%^_u32\t%0, %3, %1, %2
+ v_sub%^_u32\t%0, %3, %1, %2
+ v_subrev%^_u32\t%0, %3, %2, %1
+ v_subrev%^_u32\t%0, %3, %2, %1"
+ [(set_attr "type" "vop2,vop3b,vop2,vop3b")
+ (set_attr "length" "8")])
+
+; This pattern does not accept an SGPR operand because the VCC read already
+; counts as an SGPR use and the number of SGPR operands is limited to 1.
+
+(define_insn "subcv64si3<exec_vcc>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v, v, v, v")
+ (minus:V64SI
+ (minus:V64SI
+ (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_operand:DI 3 "gcn_alu_operand" " cV,Sv,cV,Sv"))
+ (match_operand:V64SI 1 "gcn_alu_operand" " vA,vA,vB,vB"))
+ (match_operand:V64SI 2 "gcn_alu_operand" " vB,vB,vA,vA")))
+ (set (match_operand:DI 4 "register_operand" "=cV,Sg,cV,Sg")
+ (ior:DI (gtu:DI (minus:V64SI (minus:V64SI
+ (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_dup 3))
+ (match_dup 1))
+ (match_dup 2))
+ (match_dup 2))
+ (ltu:DI (minus:V64SI (vec_merge:V64SI
+ (vec_duplicate:V64SI (const_int 1))
+ (vec_duplicate:V64SI (const_int 0))
+ (match_dup 3))
+ (match_dup 1))
+ (match_dup 1))))]
+ ""
+ "@
+ v_subb%^_u32\t%0, %4, %1, %2, %3
+ v_subb%^_u32\t%0, %4, %1, %2, %3
+ v_subbrev%^_u32\t%0, %4, %2, %1, %3
+ v_subbrev%^_u32\t%0, %4, %2, %1, %3"
+ [(set_attr "type" "vop2,vop3b,vop2,vop3b")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (plus:V64DI
+ (match_operand:V64DI 1 "register_operand" "% v0")
+ (match_operand:V64DI 2 "gcn_alu_operand" "vSvB0")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ vcc));
+ emit_insn (gen_addcv64si3
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ vcc, vcc));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (match_operand:V64DI 1 "register_operand" "% v0")
+ (match_operand:V64DI 2 "gcn_alu_operand" "vSvB0"))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[4])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc_exec
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ vcc,
+ gcn_operand_part (V64DImode, operands[3], 0),
+ operands[4]));
+ emit_insn (gen_addcv64si3_exec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ vcc, vcc,
+ gcn_operand_part (V64DImode, operands[3], 1),
+ operands[4]));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "subv64di3"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v, &v")
+ (minus:V64DI
+ (match_operand:V64DI 1 "gcn_alu_operand" "vSvB0, v0")
+ (match_operand:V64DI 2 "gcn_alu_operand" " v0,vSvB0")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_subv64si3_vcc
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ vcc));
+ emit_insn (gen_subcv64si3
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ vcc, vcc));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8,8")])
+
+(define_insn_and_split "subv64di3_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v, &v")
+ (vec_merge:V64DI
+ (minus:V64DI
+ (match_operand:V64DI 1 "gcn_alu_operand" "vSvB0, v0")
+ (match_operand:V64DI 2 "gcn_alu_operand" " v0,vSvB0"))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand"
+ " U0, U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e, e")))
+ (clobber (reg:DI VCC_REG))]
+ "register_operand (operands[1], VOIDmode)
+ || register_operand (operands[2], VOIDmode)"
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[3])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_subv64si3_vcc_exec
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ vcc,
+ gcn_operand_part (V64DImode, operands[3], 0),
+ operands[4]));
+ emit_insn (gen_subcv64si3_exec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ vcc, vcc,
+ gcn_operand_part (V64DImode, operands[3], 1),
+ operands[4]));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8,8")])
+
+(define_insn_and_split "addv64di3_dup"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (plus:V64DI
+ (match_operand:V64DI 1 "register_operand" " v0")
+ (vec_duplicate:V64DI
+ (match_operand:DI 2 "gcn_alu_operand" "SvDB"))))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc_dup
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ vcc));
+ emit_insn (gen_addcv64si3_dup
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (DImode, operands[2], 1),
+ vcc, vcc));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_dup_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (match_operand:V64DI 1 "register_operand" " v0")
+ (vec_duplicate:V64DI
+ (match_operand:DI 2 "gcn_alu_operand" "SvDB")))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[1])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[3])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc_dup_exec
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ gcn_operand_part (V64DImode, operands[1], 0),
+ vcc,
+ gcn_operand_part (V64DImode, operands[3], 0),
+ operands[4]));
+ emit_insn (gen_addcv64si3_dup_exec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[1], 1),
+ gcn_operand_part (DImode, operands[2], 1),
+ vcc, vcc,
+ gcn_operand_part (V64DImode, operands[3], 1),
+ operands[4]));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_zext"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v,&v")
+ (plus:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" "0vA,0vB"))
+ (match_operand:V64DI 2 "gcn_alu_operand" "0vB,0vA")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[2])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ operands[1],
+ gcn_operand_part (V64DImode, operands[2], 0),
+ vcc));
+ emit_insn (gen_addcv64si3
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ const0_rtx, vcc, vcc));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8,8")])
+
+(define_insn_and_split "addv64di3_zext_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v,&v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" "0vA,0vB"))
+ (match_operand:V64DI 2 "gcn_alu_operand" "0vB,0vA"))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0, U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e, e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[3])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc_exec
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ operands[1],
+ gcn_operand_part (V64DImode, operands[2], 0),
+ vcc,
+ gcn_operand_part (V64DImode, operands[3], 0),
+ operands[4]));
+ emit_insn (gen_addcv64si3_exec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ const0_rtx, vcc, vcc,
+ gcn_operand_part (V64DImode, operands[3], 1),
+ operands[4]));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8,8")])
+
+(define_insn_and_split "addv64di3_zext_dup"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (plus:V64DI
+ (zero_extend:V64DI
+ (vec_duplicate:V64SI
+ (match_operand:SI 1 "gcn_alu_operand" "BSv")))
+ (match_operand:V64DI 2 "gcn_alu_operand" "vA0")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[2])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc_dup
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ vcc));
+ emit_insn (gen_addcv64si3
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ const0_rtx, vcc, vcc));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_zext_dup_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (zero_extend:V64DI
+ (vec_duplicate:V64SI
+ (match_operand:SI 1 "gcn_alu_operand" "BSv")))
+ (match_operand:V64DI 2 "gcn_alu_operand" "vA0"))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[2])
+ && gcn_can_split_p (V64DImode, operands[3])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc_dup_exec
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[1], 0),
+ gcn_operand_part (V64DImode, operands[2], 0),
+ vcc,
+ gcn_operand_part (V64DImode, operands[3], 0),
+ operands[4]));
+ emit_insn (gen_addcv64si3_exec
+ (gcn_operand_part (V64DImode, operands[0], 1),
+ gcn_operand_part (V64DImode, operands[2], 1),
+ const0_rtx, vcc, vcc,
+ gcn_operand_part (V64DImode, operands[3], 1),
+ operands[4]));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_zext_dup2"
+ [(set (match_operand:V64DI 0 "register_operand" "= v")
+ (plus:V64DI
+ (zero_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand" " vA"))
+ (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv"))))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc_dup
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ operands[1],
+ vcc));
+ rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+ emit_insn (gen_vec_duplicatev64si
+ (dsthi, gcn_operand_part (DImode, operands[2], 1)));
+ emit_insn (gen_addcv64si3 (dsthi, dsthi, const0_rtx, vcc, vcc));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_zext_dup2_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "= v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (zero_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand"
+ " vA"))
+ (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv")))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[3])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_addv64si3_vcc_dup_exec
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ operands[1],
+ vcc,
+ gcn_operand_part (V64DImode, operands[3], 0),
+ operands[4]));
+ rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+ emit_insn (gen_vec_duplicatev64si_exec
+ (dsthi, gcn_operand_part (DImode, operands[2], 1),
+ gcn_gen_undef (V64SImode), operands[4]));
+ emit_insn (gen_addcv64si3_exec
+ (dsthi, dsthi, const0_rtx, vcc, vcc,
+ gcn_operand_part (V64DImode, operands[3], 1),
+ operands[4]));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_sext_dup2"
+ [(set (match_operand:V64DI 0 "register_operand" "= v")
+ (plus:V64DI
+ (sign_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand" " vA"))
+ (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv"))))
+ (clobber (match_scratch:V64SI 3 "=&v"))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_ashrv64si3 (operands[3], operands[1], GEN_INT (31)));
+ emit_insn (gen_addv64si3_vcc_dup
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ operands[1],
+ vcc));
+ rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+ emit_insn (gen_vec_duplicatev64si
+ (dsthi, gcn_operand_part (DImode, operands[2], 1)));
+ emit_insn (gen_addcv64si3 (dsthi, dsthi, operands[3], vcc, vcc));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_sext_dup2_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "= v")
+ (vec_merge:V64DI
+ (plus:V64DI
+ (sign_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand"
+ " vA"))
+ (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv")))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:V64SI 5 "=&v"))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "gcn_can_split_p (V64DImode, operands[0])
+ && gcn_can_split_p (V64DImode, operands[3])"
+ [(const_int 0)]
+ {
+ rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+ emit_insn (gen_ashrv64si3_exec (operands[5], operands[1], GEN_INT (31),
+ gcn_gen_undef (V64SImode), operands[4]));
+ emit_insn (gen_addv64si3_vcc_dup_exec
+ (gcn_operand_part (V64DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ operands[1],
+ vcc,
+ gcn_operand_part (V64DImode, operands[3], 0),
+ operands[4]));
+ rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+ emit_insn (gen_vec_duplicatev64si_exec
+ (dsthi, gcn_operand_part (DImode, operands[2], 1),
+ gcn_gen_undef (V64SImode), operands[4]));
+ emit_insn (gen_addcv64si3_exec
+ (dsthi, dsthi, operands[5], vcc, vcc,
+ gcn_operand_part (V64DImode, operands[3], 1),
+ operands[4]));
+ DONE;
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ DS memory ALU: add/sub
+
+(define_mode_iterator DS_ARITH_MODE [V64SI V64SF V64DI])
+(define_mode_iterator DS_ARITH_SCALAR_MODE [SI SF DI])
+
+;; FIXME: the vector patterns probably need RD expanded to a vector of
+;; addresses. For now, the only way a vector can get into LDS is
+;; if the user puts it there manually.
+;;
+;; FIXME: the scalar patterns are probably fine in themselves, but need to be
+;; checked to see if anything can ever use them.
+
+(define_insn "add<mode>3_ds<exec>"
+ [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (plus:DS_ARITH_MODE
+ (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" "%RD")
+ (match_operand:DS_ARITH_MODE 2 "register_operand" " v")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_add%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+(define_insn "add<mode>3_ds_scalar"
+ [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (plus:DS_ARITH_SCALAR_MODE
+ (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+ "%RD")
+ (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_add%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+(define_insn "sub<mode>3_ds<exec>"
+ [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (minus:DS_ARITH_MODE
+ (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" " RD")
+ (match_operand:DS_ARITH_MODE 2 "register_operand" " v")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_sub%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+(define_insn "sub<mode>3_ds_scalar"
+ [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (minus:DS_ARITH_SCALAR_MODE
+ (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+ " RD")
+ (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_sub%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+(define_insn "subr<mode>3_ds<exec>"
+ [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (minus:DS_ARITH_MODE
+ (match_operand:DS_ARITH_MODE 2 "register_operand" " v")
+ (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" " RD")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_rsub%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+(define_insn "subr<mode>3_ds_scalar"
+ [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+ (minus:DS_ARITH_SCALAR_MODE
+ (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand" " v")
+ (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+ " RD")))]
+ "rtx_equal_p (operands[0], operands[1])"
+ "ds_rsub%u0\t%A0, %2%O0"
+ [(set_attr "type" "ds")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU special case: mult
+
+(define_insn "<su>mulv64si3_highpart<exec>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (truncate:V64SI
+ (lshiftrt:V64DI
+ (mult:V64DI
+ (any_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" " %v"))
+ (any_extend:V64DI
+ (match_operand:V64SI 2 "gcn_alu_operand" "vSvA")))
+ (const_int 32))))]
+ ""
+ "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "mulv64si3<exec>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (mult:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" "%vSvA")
+ (match_operand:V64SI 2 "gcn_alu_operand" " vSvA")))]
+ ""
+ "v_mul_lo_u32\t%0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "mulv64si3_dup<exec>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (mult:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" "%vSvA")
+ (vec_duplicate:V64SI
+ (match_operand:SI 2 "gcn_alu_operand" " SvA"))))]
+ ""
+ "v_mul_lo_u32\t%0, %1, %2"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "mulv64di3"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (mult:V64DI
+ (match_operand:V64DI 1 "gcn_alu_operand" "% v")
+ (match_operand:V64DI 2 "gcn_alu_operand" "vDA")))
+ (clobber (match_scratch:V64SI 3 "=&v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left_lo = gcn_operand_part (V64DImode, operands[1], 0);
+ rtx left_hi = gcn_operand_part (V64DImode, operands[1], 1);
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx tmp = operands[3];
+
+ emit_insn (gen_mulv64si3 (out_lo, left_lo, right_lo));
+ emit_insn (gen_umulv64si3_highpart (out_hi, left_lo, right_lo));
+ emit_insn (gen_mulv64si3 (tmp, left_hi, right_lo));
+ emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+ emit_insn (gen_mulv64si3 (tmp, left_lo, right_hi));
+ emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+ emit_insn (gen_mulv64si3 (tmp, left_hi, right_hi));
+ emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+ DONE;
+ })
+
+(define_insn_and_split "mulv64di3_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (vec_merge:V64DI
+ (mult:V64DI
+ (match_operand:V64DI 1 "gcn_alu_operand" "% v")
+ (match_operand:V64DI 2 "gcn_alu_operand" "vDA"))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:V64SI 5 "=&v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left_lo = gcn_operand_part (V64DImode, operands[1], 0);
+ rtx left_hi = gcn_operand_part (V64DImode, operands[1], 1);
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx exec = operands[4];
+ rtx tmp = operands[5];
+
+ rtx old_lo, old_hi;
+ if (GET_CODE (operands[3]) == UNSPEC)
+ {
+ old_lo = old_hi = gcn_gen_undef (V64SImode);
+ }
+ else
+ {
+ old_lo = gcn_operand_part (V64DImode, operands[3], 0);
+ old_hi = gcn_operand_part (V64DImode, operands[3], 1);
+ }
+
+ rtx undef = gcn_gen_undef (V64SImode);
+
+ emit_insn (gen_mulv64si3_exec (out_lo, left_lo, right_lo, old_lo, exec));
+ emit_insn (gen_umulv64si3_highpart_exec (out_hi, left_lo, right_lo,
+ old_hi, exec));
+ emit_insn (gen_mulv64si3_exec (tmp, left_hi, right_lo, undef, exec));
+ emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+ emit_insn (gen_mulv64si3_exec (tmp, left_lo, right_hi, undef, exec));
+ emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+ emit_insn (gen_mulv64si3_exec (tmp, left_hi, right_hi, undef, exec));
+ emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+ DONE;
+ })
+
+(define_insn_and_split "mulv64di3_zext"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (mult:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v"))
+ (match_operand:V64DI 2 "gcn_alu_operand" "vDA")))
+ (clobber (match_scratch:V64SI 3 "=&v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left = operands[1];
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx tmp = operands[3];
+
+ emit_insn (gen_mulv64si3 (out_lo, left, right_lo));
+ emit_insn (gen_umulv64si3_highpart (out_hi, left, right_lo));
+ emit_insn (gen_mulv64si3 (tmp, left, right_hi));
+ emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+ DONE;
+ })
+
+(define_insn_and_split "mulv64di3_zext_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (vec_merge:V64DI
+ (mult:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v"))
+ (match_operand:V64DI 2 "gcn_alu_operand" "vDA"))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:V64SI 5 "=&v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left = operands[1];
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx exec = operands[4];
+ rtx tmp = operands[5];
+
+ rtx old_lo, old_hi;
+ if (GET_CODE (operands[3]) == UNSPEC)
+ {
+ old_lo = old_hi = gcn_gen_undef (V64SImode);
+ }
+ else
+ {
+ old_lo = gcn_operand_part (V64DImode, operands[3], 0);
+ old_hi = gcn_operand_part (V64DImode, operands[3], 1);
+ }
+
+ rtx undef = gcn_gen_undef (V64SImode);
+
+ emit_insn (gen_mulv64si3_exec (out_lo, left, right_lo, old_lo, exec));
+ emit_insn (gen_umulv64si3_highpart_exec (out_hi, left, right_lo,
+ old_hi, exec));
+ emit_insn (gen_mulv64si3_exec (tmp, left, right_hi, undef, exec));
+ emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+ DONE;
+ })
+
+(define_insn_and_split "mulv64di3_zext_dup2"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (mult:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v"))
+ (vec_duplicate:V64DI
+ (match_operand:DI 2 "gcn_alu_operand" "SvDA"))))
+ (clobber (match_scratch:V64SI 3 "= &v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left = operands[1];
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx tmp = operands[3];
+
+ emit_insn (gen_mulv64si3 (out_lo, left, right_lo));
+ emit_insn (gen_umulv64si3_highpart (out_hi, left, right_lo));
+ emit_insn (gen_mulv64si3 (tmp, left, right_hi));
+ emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+ DONE;
+ })
+
+(define_insn_and_split "mulv64di3_zext_dup2_exec"
+ [(set (match_operand:V64DI 0 "register_operand" "= &v")
+ (vec_merge:V64DI
+ (mult:V64DI
+ (zero_extend:V64DI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v"))
+ (vec_duplicate:V64DI
+ (match_operand:DI 2 "gcn_alu_operand" "SvDA")))
+ (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e")))
+ (clobber (match_scratch:V64SI 5 "= &v"))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
+ rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+ rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+ rtx left = operands[1];
+ rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+ rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+ rtx exec = operands[4];
+ rtx tmp = operands[5];
+
+ rtx old_lo, old_hi;
+ if (GET_CODE (operands[3]) == UNSPEC)
+ {
+ old_lo = old_hi = gcn_gen_undef (V64SImode);
+ }
+ else
+ {
+ old_lo = gcn_operand_part (V64DImode, operands[3], 0);
+ old_hi = gcn_operand_part (V64DImode, operands[3], 1);
+ }
+
+ rtx undef = gcn_gen_undef (V64SImode);
+
+ emit_insn (gen_mulv64si3_exec (out_lo, left, right_lo, old_lo, exec));
+ emit_insn (gen_umulv64si3_highpart_exec (out_hi, left, right_lo,
+ old_hi, exec));
+ emit_insn (gen_mulv64si3_exec (tmp, left, right_hi, undef, exec));
+ emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+ DONE;
+ })
+
+;; }}}
+;; {{{ ALU generic case
+
+(define_mode_iterator VEC_INT_MODE [V64QI V64HI V64SI V64DI])
+
+(define_code_iterator bitop [and ior xor])
+(define_code_iterator shiftop [ashift lshiftrt ashiftrt])
+(define_code_iterator minmaxop [smin smax umin umax])
+
+(define_insn "<expander><mode>2<exec>"
+ [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v")
+ (bitunop:VEC_1REG_INT_MODE
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand" "vSvB")))]
+ ""
+ "v_<mnemonic>0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "<expander><mode>3<exec>"
+ [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v,RD")
+ (bitop:VEC_1REG_INT_MODE
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+ "% v, 0")
+ (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+ "vSvB, v")))]
+ ""
+ "@
+ v_<mnemonic>0\t%0, %2, %1
+ ds_<mnemonic>0\t%A0, %2%O0"
+ [(set_attr "type" "vop2,ds")
+ (set_attr "length" "8,8")])
+
+(define_insn_and_split "<expander>v64di3"
+ [(set (match_operand:V64DI 0 "gcn_valu_dst_operand" "=&v,RD")
+ (bitop:V64DI
+ (match_operand:V64DI 1 "gcn_valu_src0_operand" "% v,RD")
+ (match_operand:V64DI 2 "gcn_valu_src1com_operand" "vSvB, v")))]
+ ""
+ "@
+ #
+ ds_<mnemonic>0\t%A0, %2%O0"
+ "(reload_completed && !gcn_ds_memory_operand (operands[0], V64DImode))"
+ [(set (match_dup 3)
+ (bitop:V64SI (match_dup 5) (match_dup 7)))
+ (set (match_dup 4)
+ (bitop:V64SI (match_dup 6) (match_dup 8)))]
+ {
+ operands[3] = gcn_operand_part (V64DImode, operands[0], 0);
+ operands[4] = gcn_operand_part (V64DImode, operands[0], 1);
+ operands[5] = gcn_operand_part (V64DImode, operands[1], 0);
+ operands[6] = gcn_operand_part (V64DImode, operands[1], 1);
+ operands[7] = gcn_operand_part (V64DImode, operands[2], 0);
+ operands[8] = gcn_operand_part (V64DImode, operands[2], 1);
+ }
+ [(set_attr "type" "vmult,ds")
+ (set_attr "length" "16,8")])
+
+(define_insn_and_split "<expander>v64di3_exec"
+ [(set (match_operand:V64DI 0 "gcn_valu_dst_operand" "=&v,RD")
+ (vec_merge:V64DI
+ (bitop:V64DI
+ (match_operand:V64DI 1 "gcn_valu_src0_operand" "% v,RD")
+ (match_operand:V64DI 2 "gcn_valu_src1com_operand" "vSvB, v"))
+ (match_operand:V64DI 3 "gcn_register_ds_or_unspec_operand"
+ " U0,U0")
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e, e")))]
+ "!memory_operand (operands[0], VOIDmode)
+ || (rtx_equal_p (operands[0], operands[1])
+ && register_operand (operands[2], VOIDmode))"
+ "@
+ #
+ ds_<mnemonic>0\t%A0, %2%O0"
+ "(reload_completed && !gcn_ds_memory_operand (operands[0], V64DImode))"
+ [(set (match_dup 5)
+ (vec_merge:V64SI
+ (bitop:V64SI (match_dup 7) (match_dup 9))
+ (match_dup 11)
+ (match_dup 4)))
+ (set (match_dup 6)
+ (vec_merge:V64SI
+ (bitop:V64SI (match_dup 8) (match_dup 10))
+ (match_dup 12)
+ (match_dup 4)))]
+ {
+ operands[5] = gcn_operand_part (V64DImode, operands[0], 0);
+ operands[6] = gcn_operand_part (V64DImode, operands[0], 1);
+ operands[7] = gcn_operand_part (V64DImode, operands[1], 0);
+ operands[8] = gcn_operand_part (V64DImode, operands[1], 1);
+ operands[9] = gcn_operand_part (V64DImode, operands[2], 0);
+ operands[10] = gcn_operand_part (V64DImode, operands[2], 1);
+ operands[11] = gcn_operand_part (V64DImode, operands[3], 0);
+ operands[12] = gcn_operand_part (V64DImode, operands[3], 1);
+ }
+ [(set_attr "type" "vmult,ds")
+ (set_attr "length" "16,8")])
+
+(define_insn "<expander>v64si3<exec>"
+ [(set (match_operand:V64SI 0 "register_operand" "= v")
+ (shiftop:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v")
+ (vec_duplicate:V64SI
+ (match_operand:SI 2 "gcn_alu_operand" "SvB"))))]
+ ""
+ "v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+(define_insn "v<expander>v64si3<exec>"
+ [(set (match_operand:V64SI 0 "register_operand" "=v")
+ (shiftop:V64SI
+ (match_operand:V64SI 1 "gcn_alu_operand" " v")
+ (match_operand:V64SI 2 "gcn_alu_operand" "vB")))]
+ ""
+ "v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+(define_insn "<expander><mode>3<exec>"
+ [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "= v,RD")
+ (minmaxop:VEC_1REG_INT_MODE
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+ "% v, 0")
+ (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+ "vSvB, v")))]
+ ""
+ "@
+ v_<mnemonic>0\t%0, %2, %1
+ ds_<mnemonic>0\t%A0, %2%O0"
+ [(set_attr "type" "vop2,ds")
+ (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP binops - special cases
+
+; GCN does not directly provide a DFmode subtract instruction, so we do it by
+; adding the negated second operand to the first.
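+; Per lane this is just the identity a - b == a + (-b); the negation source
+; modifier on the second operand keeps it a single instruction, e.g.
+;   v_add_f64  dst, src1, -src2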
+
+(define_insn "subv64df3<exec>"
+ [(set (match_operand:V64DF 0 "register_operand" "= v, v")
+ (minus:V64DF
+ (match_operand:V64DF 1 "gcn_alu_operand" "vSvB, v")
+ (match_operand:V64DF 2 "gcn_alu_operand" " v,vSvB")))]
+ ""
+ "@
+ v_add_f64\t%0, %1, -%2
+ v_add_f64\t%0, -%2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8,8")])
+
+(define_insn "subdf"
+ [(set (match_operand:DF 0 "register_operand" "= v, v")
+ (minus:DF
+ (match_operand:DF 1 "gcn_alu_operand" "vSvB, v")
+ (match_operand:DF 2 "gcn_alu_operand" " v,vSvB")))]
+ ""
+ "@
+ v_add_f64\t%0, %1, -%2
+ v_add_f64\t%0, -%2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP binops - generic
+
+(define_mode_iterator VEC_FP_MODE [V64HF V64SF V64DF])
+(define_mode_iterator VEC_FP_1REG_MODE [V64HF V64SF])
+(define_mode_iterator FP_MODE [HF SF DF])
+(define_mode_iterator FP_1REG_MODE [HF SF])
+
+(define_code_iterator comm_fp [plus mult smin smax])
+(define_code_iterator nocomm_fp [minus])
+(define_code_iterator all_fp [plus mult minus smin smax])
+
+(define_insn "<expander><mode>3<exec>"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v")
+ (comm_fp:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "% v")
+ (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" "vSvB")))]
+ ""
+ "v_<mnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8")])
+
+(define_insn "<expander><mode>3"
+ [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand" "= v, RL")
+ (comm_fp:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_valu_src0_operand" "% v, 0")
+ (match_operand:FP_MODE 2 "gcn_valu_src1_operand" "vSvB,vSvB")))]
+ ""
+ "@
+ v_<mnemonic>0\t%0, %2, %1
+ v_<mnemonic>0\t%0, %1%O0"
+ [(set_attr "type" "vop2,ds")
+ (set_attr "length" "8")])
+
+(define_insn "<expander><mode>3<exec>"
+ [(set (match_operand:VEC_FP_1REG_MODE 0 "register_operand" "= v, v")
+ (nocomm_fp:VEC_FP_1REG_MODE
+ (match_operand:VEC_FP_1REG_MODE 1 "gcn_alu_operand" "vSvB, v")
+ (match_operand:VEC_FP_1REG_MODE 2 "gcn_alu_operand" " v,vSvB")))]
+ ""
+ "@
+ v_<mnemonic>0\t%0, %1, %2
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8,8")])
+
+(define_insn "<expander><mode>3"
+ [(set (match_operand:FP_1REG_MODE 0 "register_operand" "= v, v")
+ (nocomm_fp:FP_1REG_MODE
+ (match_operand:FP_1REG_MODE 1 "gcn_alu_operand" "vSvB, v")
+ (match_operand:FP_1REG_MODE 2 "gcn_alu_operand" " v,vSvB")))]
+ ""
+ "@
+ v_<mnemonic>0\t%0, %1, %2
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "vop2")
+ (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP unops
+
+(define_insn "abs<mode>2"
+ [(set (match_operand:FP_MODE 0 "register_operand" "=v")
+ (abs:FP_MODE (match_operand:FP_MODE 1 "register_operand" " v")))]
+ ""
+ "v_add%i0\t%0, 0, |%1|"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "abs<mode>2<exec>"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "=v")
+ (abs:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "register_operand" " v")))]
+ ""
+ "v_add%i0\t%0, 0, |%1|"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "neg<mode>2<exec>"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "=v")
+ (neg:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "register_operand" " v")))]
+ ""
+ "v_add%i0\t%0, 0, -%1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "sqrt<mode>2<exec>"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v")
+ (sqrt:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "vSvB")))]
+ "flag_unsafe_math_optimizations"
+ "v_sqrt%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "sqrt<mode>2"
+ [(set (match_operand:FP_MODE 0 "register_operand" "= v")
+ (sqrt:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_alu_operand" "vSvB")))]
+ "flag_unsafe_math_optimizations"
+ "v_sqrt%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ FP fused multiply and add
+
+(define_insn "fma<mode>4<exec>"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v, v")
+ (fma:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" " vA,vSvA")
+ (match_operand:VEC_FP_MODE 3 "gcn_alu_operand" "vSvA, vA")))]
+ ""
+ "v_fma%i0\t%0, %1, %2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fma<mode>4_negop2<exec>"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v, v, v")
+ (fma:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" " vA, vA,vSvA")
+ (neg:VEC_FP_MODE
+ (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" " vA,vSvA, vA"))
+ (match_operand:VEC_FP_MODE 3 "gcn_alu_operand" "vSvA, vA, vA")))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fma<mode>4"
+ [(set (match_operand:FP_MODE 0 "register_operand" "= v, v")
+ (fma:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:FP_MODE 2 "gcn_alu_operand" " vA,vSvA")
+ (match_operand:FP_MODE 3 "gcn_alu_operand" "vSvA, vA")))]
+ ""
+ "v_fma%i0\t%0, %1, %2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fma<mode>4_negop2"
+ [(set (match_operand:FP_MODE 0 "register_operand" "= v, v, v")
+ (fma:FP_MODE
+ (match_operand:FP_MODE 1 "gcn_alu_operand" " vA, vA,vSvA")
+ (neg:FP_MODE
+ (match_operand:FP_MODE 2 "gcn_alu_operand" " vA,vSvA, vA"))
+ (match_operand:FP_MODE 3 "gcn_alu_operand" "vSvA, vA, vA")))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ FP division
+
+(define_insn "recip<mode>2<exec>"
+ [(set (match_operand:VEC_FP_MODE 0 "register_operand" "= v")
+ (div:VEC_FP_MODE
+ (vec_duplicate:VEC_FP_MODE (float:<SCALAR_MODE> (const_int 1)))
+ (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "vSvB")))]
+ ""
+ "v_rcp%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "recip<mode>2"
+ [(set (match_operand:FP_MODE 0 "register_operand" "= v")
+ (div:FP_MODE
+ (float:FP_MODE (const_int 1))
+ (match_operand:FP_MODE 1 "gcn_alu_operand" "vSvB")))]
+ ""
+ "v_rcp%i0\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+;; Do division via a = b * 1/c
+;; The v_rcp_* instructions are not sufficiently accurate on their own,
+;; so we use two v_fma_* instructions to do one round of Newton-Raphson
+;; refinement, which the ISA manual says is enough to improve the
+;; reciprocal accuracy.
+;;
+;; FIXME: This does not handle denormals, NaNs, division-by-zero etc.
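+;;
+;; A rough C model of what the expander emits (approx_rcp is a stand-in for
+;; the v_rcp instruction, not a real library function):
+;;
+;;   float divf (float b, float c)
+;;   {
+;;     float r0 = approx_rcp (c);       /* recip<mode>2                 */
+;;     float e  = fmaf (r0, -c, 2.0f);  /* fma<mode>4_negop2: 2 - r0*c  */
+;;     float r1 = r0 * e;               /* one Newton-Raphson step      */
+;;     return b * r1;                   /* b * 1/c                      */
+;;   }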
+
+(define_expand "div<mode>3"
+ [(match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+ (match_operand:VEC_FP_MODE 1 "gcn_valu_src0_operand")
+ (match_operand:VEC_FP_MODE 2 "gcn_valu_src0_operand")]
+ "flag_reciprocal_math"
+ {
+ rtx two = gcn_vec_constant (<MODE>mode,
+ const_double_from_real_value (dconst2, <SCALAR_MODE>mode));
+ rtx initrcp = gen_reg_rtx (<MODE>mode);
+ rtx fma = gen_reg_rtx (<MODE>mode);
+ rtx rcp;
+
+ bool is_rcp = (GET_CODE (operands[1]) == CONST_VECTOR
+ && real_identical
+ (CONST_DOUBLE_REAL_VALUE
+ (CONST_VECTOR_ELT (operands[1], 0)), &dconstm1));
+
+ if (is_rcp)
+ rcp = operands[0];
+ else
+ rcp = gen_reg_rtx (<MODE>mode);
+
+ emit_insn (gen_recip<mode>2 (initrcp, operands[2]));
+ emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, operands[2], two));
+ emit_insn (gen_mul<mode>3 (rcp, initrcp, fma));
+
+ if (!is_rcp)
+ emit_insn (gen_mul<mode>3 (operands[0], operands[1], rcp));
+
+ DONE;
+ })
+
+(define_expand "div<mode>3"
+ [(match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+ (match_operand:FP_MODE 1 "gcn_valu_src0_operand")
+ (match_operand:FP_MODE 2 "gcn_valu_src0_operand")]
+ "flag_reciprocal_math"
+ {
+ rtx two = const_double_from_real_value (dconst2, <MODE>mode);
+ rtx initrcp = gen_reg_rtx (<MODE>mode);
+ rtx fma = gen_reg_rtx (<MODE>mode);
+ rtx rcp;
+
+ bool is_rcp = (GET_CODE (operands[1]) == CONST_DOUBLE
+ && real_identical (CONST_DOUBLE_REAL_VALUE (operands[1]),
+ &dconstm1));
+
+ if (is_rcp)
+ rcp = operands[0];
+ else
+ rcp = gen_reg_rtx (<MODE>mode);
+
+ emit_insn (gen_recip<mode>2 (initrcp, operands[2]));
+ emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, operands[2], two));
+ emit_insn (gen_mul<mode>3 (rcp, initrcp, fma));
+
+ if (!is_rcp)
+ emit_insn (gen_mul<mode>3 (operands[0], operands[1], rcp));
+
+ DONE;
+ })
+
+;; }}}
+;; {{{ Int/FP conversions
+
+(define_mode_iterator CVT_FROM_MODE [HI SI HF SF DF])
+(define_mode_iterator CVT_TO_MODE [HI SI HF SF DF])
+
+(define_mode_iterator VCVT_FROM_MODE [V64HI V64SI V64HF V64SF V64DF])
+(define_mode_iterator VCVT_TO_MODE [V64HI V64SI V64HF V64SF V64DF])
+
+(define_code_iterator cvt_op [fix unsigned_fix
+ float unsigned_float
+ float_extend float_truncate])
+(define_code_attr cvt_name [(fix "fix_trunc") (unsigned_fix "fixuns_trunc")
+ (float "float") (unsigned_float "floatuns")
+ (float_extend "extend") (float_truncate "trunc")])
+(define_code_attr cvt_operands [(fix "%i0%i1") (unsigned_fix "%u0%i1")
+ (float "%i0%i1") (unsigned_float "%i0%u1")
+ (float_extend "%i0%i1")
+ (float_truncate "%i0%i1")])
+
+(define_insn "<cvt_name><CVT_FROM_MODE:mode><CVT_TO_MODE:mode>2"
+ [(set (match_operand:CVT_TO_MODE 0 "register_operand" "= v")
+ (cvt_op:CVT_TO_MODE
+ (match_operand:CVT_FROM_MODE 1 "gcn_alu_operand" "vSvB")))]
+ "gcn_valid_cvt_p (<CVT_FROM_MODE:MODE>mode, <CVT_TO_MODE:MODE>mode,
+ <cvt_name>_cvt)"
+ "v_cvt<cvt_operands>\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+(define_insn "<cvt_name><VCVT_FROM_MODE:mode><VCVT_TO_MODE:mode>2<exec>"
+ [(set (match_operand:VCVT_TO_MODE 0 "register_operand" "= v")
+ (cvt_op:VCVT_TO_MODE
+ (match_operand:VCVT_FROM_MODE 1 "gcn_alu_operand" "vSvB")))]
+ "gcn_valid_cvt_p (<VCVT_FROM_MODE:MODE>mode, <VCVT_TO_MODE:MODE>mode,
+ <cvt_name>_cvt)"
+ "v_cvt<cvt_operands>\t%0, %1"
+ [(set_attr "type" "vop1")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Int/int conversions
+
+;; GCC can already do these for scalar types, but not for vector types.
+;; Unfortunately you can't just do SUBREG on a vector to select the low part,
+;; so a few tricks are needed here.
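+;;
+;; The trick used below: a V64DI value occupies a pair of V64SI registers,
+;; so the low half can be taken directly with gcn_operand_part (..., 0) and
+;; copied.  Per lane that is simply (a sketch):
+;;   uint32_t lo = (uint32_t) x;   /* drop the high 32 bits */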
+
+(define_insn_and_split "vec_truncatev64div64si"
+ [(set (match_operand:V64SI 0 "register_operand" "=v,&v")
+ (truncate:V64SI
+ (match_operand:V64DI 1 "register_operand" " 0, v")))]
+ ""
+ "#"
+ "reload_completed"
+ [(set (match_dup 0) (match_dup 1))]
+ {
+ operands[1] = gcn_operand_part (V64SImode, operands[1], 0);
+ }
+ [(set_attr "type" "vop2")
+ (set_attr "length" "0,4")])
+
+(define_insn_and_split "vec_truncatev64div64si_exec"
+ [(set (match_operand:V64SI 0 "register_operand" "=v,&v")
+ (vec_merge:V64SI
+ (truncate:V64SI
+ (match_operand:V64DI 1 "register_operand" " 0, v"))
+ (match_operand:V64SI 2 "gcn_alu_or_unspec_operand" "U0,U0")
+ (match_operand:DI 3 "gcn_exec_operand" " e, e")))]
+ ""
+ "#"
+ "reload_completed"
+ [(parallel [(set (match_dup 0)
+ (vec_merge:V64SI (match_dup 1) (match_dup 2) (match_dup 3)))
+ (clobber (scratch:V64DI))])]
+ {
+ operands[1] = gcn_operand_part (V64SImode, operands[1], 0);
+ }
+ [(set_attr "type" "vop2")
+ (set_attr "length" "0,4")])
+
+;; }}}
+;; {{{ Vector comparison/merge
+
+(define_insn "vec_cmp<mode>di"
+ [(set (match_operand:DI 0 "register_operand" "=cV,cV, e, e,Sg,Sg")
+ (match_operator 1 "comparison_operator"
+ [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand"
+ "vSv, B,vSv, B, v,vA")
+ (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+ " v, v, v, v,vA, v")]))
+ (clobber (match_scratch:DI 4 "= X, X, cV,cV, X, X"))]
+ ""
+ "@
+ v_cmp%E1\tvcc, %2, %3
+ v_cmp%E1\tvcc, %2, %3
+ v_cmpx%E1\tvcc, %2, %3
+ v_cmpx%E1\tvcc, %2, %3
+ v_cmp%E1\t%0, %2, %3
+ v_cmp%E1\t%0, %2, %3"
+ [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a")
+ (set_attr "length" "4,8,4,8,8,8")])
+
+(define_expand "vec_cmpu<mode>di"
+ [(match_operand:DI 0 "register_operand")
+ (match_operator 1 "comparison_operator"
+ [(match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+ (match_operand:VEC_1REG_INT_MODE 3 "gcn_vop3_operand")])]
+ ""
+ {
+ /* Unsigned comparisons use the same patterns as signed comparisons,
+ except that they use unsigned operators (e.g. LTU vs LT).
+ The '%E1' directive then does the Right Thing. */
+ emit_insn (gen_vec_cmp<mode>di (operands[0], operands[1], operands[2],
+ operands[3]));
+ DONE;
+ })
+
+(define_insn "vec_cmp<mode>di_exec"
+ [(set (match_operand:DI 0 "register_operand" "=cV,cV, e, e,Sg,Sg")
+ (and:DI
+ (match_operator 1 "comparison_operator"
+ [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand"
+ "vSv, B,vSv, B, v,vA")
+ (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+ " v, v, v, v,vA, v")])
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e, e, e, e, e, e")))
+ (clobber (match_scratch:DI 5 "= X, X, cV,cV, X, X"))]
+ ""
+ "@
+ v_cmp%E1\tvcc, %2, %3
+ v_cmp%E1\tvcc, %2, %3
+ v_cmpx%E1\tvcc, %2, %3
+ v_cmpx%E1\tvcc, %2, %3
+ v_cmp%E1\t%0, %2, %3
+ v_cmp%E1\t%0, %2, %3"
+ [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a")
+ (set_attr "length" "4,8,4,8,8,8")])
+
+(define_insn "vec_cmp<mode>di_dup"
+ [(set (match_operand:DI 0 "register_operand" "=cV,cV, e,e,Sg")
+ (match_operator 1 "comparison_operator"
+ [(vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand"
+ " Sv, B,Sv,B, A"))
+ (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+ " v, v, v,v, v")]))
+ (clobber (match_scratch:DI 4 "= X,X,cV,cV, X"))]
+ ""
+ "@
+ v_cmp%E1\tvcc, %2, %3
+ v_cmp%E1\tvcc, %2, %3
+ v_cmpx%E1\tvcc, %2, %3
+ v_cmpx%E1\tvcc, %2, %3
+ v_cmp%E1\t%0, %2, %3"
+ [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a")
+ (set_attr "length" "4,8,4,8,8")])
+
+(define_insn "vec_cmp<mode>di_dup_exec"
+ [(set (match_operand:DI 0 "register_operand" "=cV,cV, e,e,Sg")
+ (and:DI
+ (match_operator 1 "comparison_operator"
+ [(vec_duplicate:VEC_1REG_MODE
+ (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand"
+ " Sv, B,Sv,B, A"))
+ (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+ " v, v, v,v, v")])
+ (match_operand:DI 4 "gcn_exec_reg_operand" " e, e, e,e, e")))
+ (clobber (match_scratch:DI 5 "= X,X,cV,cV, X"))]
+ ""
+ "@
+ v_cmp%E1\tvcc, %2, %3
+ v_cmp%E1\tvcc, %2, %3
+ v_cmpx%E1\tvcc, %2, %3
+ v_cmpx%E1\tvcc, %2, %3
+ v_cmp%E1\t%0, %2, %3"
+ [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a")
+ (set_attr "length" "4,8,4,8,8")])
+
+(define_expand "vcond_mask_<mode>di"
+ [(parallel
+ [(set (match_operand:VEC_REG_MODE 0 "register_operand" "")
+ (vec_merge:VEC_REG_MODE
+ (match_operand:VEC_REG_MODE 1 "gcn_vop3_operand" "")
+ (match_operand:VEC_REG_MODE 2 "gcn_alu_operand" "")
+ (match_operand:DI 3 "register_operand" "")))
+ (clobber (scratch:V64DI))])]
+ ""
+ "")
+
+(define_expand "vcond<VEC_1REG_MODE:mode><VEC_1REG_ALT:mode>"
+ [(match_operand:VEC_1REG_MODE 0 "register_operand")
+ (match_operand:VEC_1REG_MODE 1 "gcn_vop3_operand")
+ (match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+ (match_operator 3 "comparison_operator"
+ [(match_operand:VEC_1REG_ALT 4 "gcn_alu_operand")
+ (match_operand:VEC_1REG_ALT 5 "gcn_vop3_operand")])]
+ ""
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_vec_cmp<mode>di (tmp, operands[3], operands[4],
+ operands[5]));
+ emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+ tmp));
+ DONE;
+ })
+
+(define_expand "vcond<VEC_1REG_MODE:mode><VEC_1REG_ALT:mode>_exec"
+ [(match_operand:VEC_1REG_MODE 0 "register_operand")
+ (match_operand:VEC_1REG_MODE 1 "gcn_vop3_operand")
+ (match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+ (match_operator 3 "comparison_operator"
+ [(match_operand:VEC_1REG_ALT 4 "gcn_alu_operand")
+ (match_operand:VEC_1REG_ALT 5 "gcn_vop3_operand")])
+ (match_operand:DI 6 "gcn_exec_reg_operand" "e")]
+ ""
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_vec_cmp<mode>di_exec (tmp, operands[3], operands[4],
+ operands[5], operands[6]));
+ emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+ tmp));
+ DONE;
+ })
+
+(define_expand "vcondu<VEC_1REG_INT_MODE:mode><VEC_1REG_INT_ALT:mode>"
+ [(match_operand:VEC_1REG_INT_MODE 0 "register_operand")
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_vop3_operand")
+ (match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+ (match_operator 3 "comparison_operator"
+ [(match_operand:VEC_1REG_INT_ALT 4 "gcn_alu_operand")
+ (match_operand:VEC_1REG_INT_ALT 5 "gcn_vop3_operand")])]
+ ""
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_vec_cmp<mode>di (tmp, operands[3], operands[4],
+ operands[5]));
+ emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+ tmp));
+ DONE;
+ })
+
+(define_expand "vcondu<VEC_1REG_INT_MODE:mode><VEC_1REG_INT_ALT:mode>_exec"
+ [(match_operand:VEC_1REG_INT_MODE 0 "register_operand")
+ (match_operand:VEC_1REG_INT_MODE 1 "gcn_vop3_operand")
+ (match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+ (match_operator 3 "comparison_operator"
+ [(match_operand:VEC_1REG_INT_ALT 4 "gcn_alu_operand")
+ (match_operand:VEC_1REG_INT_ALT 5 "gcn_vop3_operand")])
+ (match_operand:DI 6 "gcn_exec_reg_operand" "e")]
+ ""
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_vec_cmp<mode>di_exec (tmp, operands[3], operands[4],
+ operands[5], operands[6]));
+ emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+ tmp));
+ DONE;
+ })
+
+;; }}}
+;; {{{ Fully masked loop support
+
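+; while_ult produces a DI lane mask in which bit L is set when
+; operands[1] + L < operands[2].  When both bounds are compile-time
+; constants this collapses to a contiguous low mask; a worked case of the
+; code below, with start 0 and end 5:
+;   diff = 5 - 0 = 5;   mask = ~(~0ULL << 5) = 0x1f;   /* lanes 0..4 */
+; With 64 or more iterations remaining the mask is all ones.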
+(define_expand "while_ultsidi"
+ [(match_operand:DI 0 "register_operand")
+ (match_operand:SI 1 "")
+ (match_operand:SI 2 "")]
+ ""
+ {
+ if (GET_CODE (operands[1]) != CONST_INT
+ || GET_CODE (operands[2]) != CONST_INT)
+ {
+ rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+ rtx tmp = _0_1_2_3;
+ if (GET_CODE (operands[1]) != CONST_INT
+ || INTVAL (operands[1]) != 0)
+ {
+ tmp = gen_reg_rtx (V64SImode);
+ emit_insn (gen_addv64si3_dup (tmp, _0_1_2_3, operands[1]));
+ }
+ emit_insn (gen_vec_cmpv64sidi_dup (operands[0],
+ gen_rtx_GT (VOIDmode, 0, 0),
+ operands[2], tmp));
+ }
+ else
+ {
+ HOST_WIDE_INT diff = INTVAL (operands[2]) - INTVAL (operands[1]);
+ HOST_WIDE_INT mask = (diff >= 64 ? -1
+ : ~((unsigned HOST_WIDE_INT)-1 << diff));
+ emit_move_insn (operands[0], gen_rtx_CONST_INT (VOIDmode, mask));
+ }
+ DONE;
+ })
+
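+; maskload/maskstore expand to the gather/scatter patterns with the mask
+; used directly as the EXEC value.  Per-lane semantics, roughly:
+;   load:  if ((mask >> L) & 1) dst[L] = mem[L];  else dst[L] is undefined
+;   store: if ((mask >> L) & 1) mem[L] = src[L];  /* inactive lanes untouched */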
+(define_expand "maskload<mode>di"
+ [(match_operand:VEC_REG_MODE 0 "register_operand")
+ (match_operand:VEC_REG_MODE 1 "memory_operand")
+ (match_operand 2 "")]
+ ""
+ {
+ rtx exec = force_reg (DImode, operands[2]);
+ rtx addr = gcn_expand_scalar_to_vector_address
+ (<MODE>mode, exec, operands[1], gen_rtx_SCRATCH (V64DImode));
+ rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+ rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+ rtx undef = gcn_gen_undef (<MODE>mode);
+ emit_insn (gen_gather<mode>_expr_exec (operands[0], addr, as, v, undef,
+ exec));
+ DONE;
+ })
+
+(define_expand "maskstore<mode>di"
+ [(match_operand:VEC_REG_MODE 0 "memory_operand")
+ (match_operand:VEC_REG_MODE 1 "register_operand")
+ (match_operand 2 "")]
+ ""
+ {
+ rtx exec = force_reg (DImode, operands[2]);
+ rtx addr = gcn_expand_scalar_to_vector_address
+ (<MODE>mode, exec, operands[0], gen_rtx_SCRATCH (V64DImode));
+ rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+ rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+ emit_insn (gen_scatter<mode>_expr_exec (addr, operands[1], as, v, exec));
+ DONE;
+ })
+
+(define_expand "mask_gather_load<mode>"
+ [(match_operand:VEC_REG_MODE 0 "register_operand")
+ (match_operand:DI 1 "register_operand")
+ (match_operand 2 "register_operand")
+ (match_operand 3 "immediate_operand")
+ (match_operand:SI 4 "gcn_alu_operand")
+ (match_operand:DI 5 "")]
+ ""
+ {
+ rtx exec = force_reg (DImode, operands[5]);
+
+ /* TODO: more conversions will be needed when more types are vectorized. */
+ if (GET_MODE (operands[2]) == V64DImode)
+ {
+ rtx tmp = gen_reg_rtx (V64SImode);
+ emit_insn (gen_vec_truncatev64div64si_exec (tmp, operands[2],
+ gcn_gen_undef (V64SImode),
+ exec));
+ operands[2] = tmp;
+ }
+
+ emit_insn (gen_gather<mode>_exec (operands[0], operands[1], operands[2],
+ operands[3], operands[4], exec));
+ DONE;
+ })
+
+(define_expand "mask_scatter_store<mode>"
+ [(match_operand:DI 0 "register_operand")
+ (match_operand 1 "register_operand")
+ (match_operand 2 "immediate_operand")
+ (match_operand:SI 3 "gcn_alu_operand")
+ (match_operand:VEC_REG_MODE 4 "register_operand")
+ (match_operand:DI 5 "")]
+ ""
+ {
+ rtx exec = force_reg (DImode, operands[5]);
+
+ /* TODO: more conversions will be needed when more types are vectorized. */
+ if (GET_MODE (operands[1]) == V64DImode)
+ {
+ rtx tmp = gen_reg_rtx (V64SImode);
+ emit_insn (gen_vec_truncatev64div64si_exec (tmp, operands[1],
+ gcn_gen_undef (V64SImode),
+ exec));
+ operands[1] = tmp;
+ }
+
+ emit_insn (gen_scatter<mode>_exec (operands[0], operands[1], operands[2],
+ operands[3], operands[4], exec));
+ DONE;
+ })
+
+; FIXME this should be VEC_REG_MODE, but not all dependencies are implemented.
+(define_mode_iterator COND_MODE [V64SI V64DI V64SF V64DF])
+(define_mode_iterator COND_INT_MODE [V64SI V64DI])
+
+(define_code_iterator cond_op [plus minus])
+
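+; The cond_* expanders implement conditional (predicated) operations by
+; reusing the _exec patterns: operand 1 is the lane mask and operand 4
+; supplies the value for inactive lanes.  Per lane, roughly:
+;   dst[L] = ((mask >> L) & 1) ? src2[L] OP src3[L] : src4[L];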
+(define_expand "cond_<expander><mode>"
+ [(match_operand:COND_MODE 0 "register_operand")
+ (match_operand:DI 1 "register_operand")
+ (cond_op:COND_MODE
+ (match_operand:COND_MODE 2 "gcn_alu_operand")
+ (match_operand:COND_MODE 3 "gcn_alu_operand"))
+ (match_operand:COND_MODE 4 "register_operand")]
+ ""
+ {
+ operands[1] = force_reg (DImode, operands[1]);
+ operands[2] = force_reg (<MODE>mode, operands[2]);
+
+ emit_insn (gen_<expander><mode>3_exec (operands[0], operands[2],
+ operands[3], operands[4],
+ operands[1]));
+ DONE;
+ })
+
+(define_code_iterator cond_bitop [and ior xor])
+
+(define_expand "cond_<expander><mode>"
+ [(match_operand:COND_INT_MODE 0 "register_operand")
+ (match_operand:DI 1 "register_operand")
+ (cond_bitop:COND_INT_MODE
+ (match_operand:COND_INT_MODE 2 "gcn_alu_operand")
+ (match_operand:COND_INT_MODE 3 "gcn_alu_operand"))
+ (match_operand:COND_INT_MODE 4 "register_operand")]
+ ""
+ {
+ operands[1] = force_reg (DImode, operands[1]);
+ operands[2] = force_reg (<MODE>mode, operands[2]);
+
+ emit_insn (gen_<expander><mode>3_exec (operands[0], operands[2],
+ operands[3], operands[4],
+ operands[1]));
+ DONE;
+ })
+
+;; }}}
+;; {{{ Vector reductions
+
+(define_int_iterator REDUC_UNSPEC [UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+ UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+ UNSPEC_PLUS_DPP_SHR
+ UNSPEC_AND_DPP_SHR
+ UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR])
+
+(define_int_iterator REDUC_2REG_UNSPEC [UNSPEC_PLUS_DPP_SHR
+ UNSPEC_AND_DPP_SHR
+ UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR])
+
+; FIXME: Isn't there a better way of doing this?
+(define_int_attr reduc_unspec [(UNSPEC_SMIN_DPP_SHR "UNSPEC_SMIN_DPP_SHR")
+ (UNSPEC_SMAX_DPP_SHR "UNSPEC_SMAX_DPP_SHR")
+ (UNSPEC_UMIN_DPP_SHR "UNSPEC_UMIN_DPP_SHR")
+ (UNSPEC_UMAX_DPP_SHR "UNSPEC_UMAX_DPP_SHR")
+ (UNSPEC_PLUS_DPP_SHR "UNSPEC_PLUS_DPP_SHR")
+ (UNSPEC_AND_DPP_SHR "UNSPEC_AND_DPP_SHR")
+ (UNSPEC_IOR_DPP_SHR "UNSPEC_IOR_DPP_SHR")
+ (UNSPEC_XOR_DPP_SHR "UNSPEC_XOR_DPP_SHR")])
+
+(define_int_attr reduc_op [(UNSPEC_SMIN_DPP_SHR "smin")
+ (UNSPEC_SMAX_DPP_SHR "smax")
+ (UNSPEC_UMIN_DPP_SHR "umin")
+ (UNSPEC_UMAX_DPP_SHR "umax")
+ (UNSPEC_PLUS_DPP_SHR "plus")
+ (UNSPEC_AND_DPP_SHR "and")
+ (UNSPEC_IOR_DPP_SHR "ior")
+ (UNSPEC_XOR_DPP_SHR "xor")])
+
+(define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0")
+ (UNSPEC_SMAX_DPP_SHR "v_max%i0")
+ (UNSPEC_UMIN_DPP_SHR "v_min%u0")
+ (UNSPEC_UMAX_DPP_SHR "v_max%u0")
+ (UNSPEC_PLUS_DPP_SHR "v_add%u0")
+ (UNSPEC_AND_DPP_SHR "v_and%b0")
+ (UNSPEC_IOR_DPP_SHR "v_or%b0")
+ (UNSPEC_XOR_DPP_SHR "v_xor%b0")])
+
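+; gcn_expand_reduc_scalar builds a log2(64)-step shift-and-combine sequence
+; from the DPP patterns below, leaving the reduction of all 64 lanes in
+; lane 63.  A scalar C model of the idea, shown for the plus case (the real
+; code uses DPP row-shift/broadcast controls rather than an explicit loop):
+;
+;   uint32_t v[64];                        /* one value per lane           */
+;   for (int shift = 1; shift <= 32; shift <<= 1)
+;     for (int i = 63; i >= shift; i--)
+;       v[i] += v[i - shift];              /* one UNSPEC_PLUS_DPP_SHR step */
+;   uint32_t result = v[63];               /* read out via mov_from_lane63 */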
+(define_expand "reduc_<reduc_op>_scal_<mode>"
+ [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
+ (unspec:<SCALAR_MODE>
+ [(match_operand:VEC_1REG_MODE 1 "register_operand")]
+ REDUC_UNSPEC))]
+ ""
+ {
+ rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
+ <reduc_unspec>);
+
+ /* The result of the reduction is in lane 63 of tmp. */
+ emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
+
+ DONE;
+ })
+
+(define_expand "reduc_<reduc_op>_scal_v64di"
+ [(set (match_operand:DI 0 "register_operand")
+ (unspec:DI
+ [(match_operand:V64DI 1 "register_operand")]
+ REDUC_2REG_UNSPEC))]
+ ""
+ {
+ rtx tmp = gcn_expand_reduc_scalar (V64DImode, operands[1],
+ <reduc_unspec>);
+
+ /* The result of the reduction is in lane 63 of tmp. */
+ emit_insn (gen_mov_from_lane63_v64di (operands[0], tmp));
+
+ DONE;
+ })
+
+(define_insn "*<reduc_op>_dpp_shr_<mode>"
+ [(set (match_operand:VEC_1REG_MODE 0 "register_operand" "=v")
+ (unspec:VEC_1REG_MODE
+ [(match_operand:VEC_1REG_MODE 1 "register_operand" "v")
+ (match_operand:VEC_1REG_MODE 2 "register_operand" "v")
+ (match_operand:SI 3 "const_int_operand" "n")]
+ REDUC_UNSPEC))]
+ "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
+ && <reduc_unspec> == UNSPEC_PLUS_DPP_SHR)"
+ {
+ return gcn_expand_dpp_shr_insn (<MODE>mode, "<reduc_insn>",
+ <reduc_unspec>, INTVAL (operands[3]));
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "*<reduc_op>_dpp_shr_v64di"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (unspec:V64DI
+ [(match_operand:V64DI 1 "register_operand" "v0")
+ (match_operand:V64DI 2 "register_operand" "v0")
+ (match_operand:SI 3 "const_int_operand" "n")]
+ REDUC_2REG_UNSPEC))]
+ ""
+ "#"
+ "reload_completed"
+ [(set (match_dup 4)
+ (unspec:V64SI
+ [(match_dup 6) (match_dup 8) (match_dup 3)] REDUC_2REG_UNSPEC))
+ (set (match_dup 5)
+ (unspec:V64SI
+ [(match_dup 7) (match_dup 9) (match_dup 3)] REDUC_2REG_UNSPEC))]
+ {
+ operands[4] = gcn_operand_part (V64DImode, operands[0], 0);
+ operands[5] = gcn_operand_part (V64DImode, operands[0], 1);
+ operands[6] = gcn_operand_part (V64DImode, operands[1], 0);
+ operands[7] = gcn_operand_part (V64DImode, operands[1], 1);
+ operands[8] = gcn_operand_part (V64DImode, operands[2], 0);
+ operands[9] = gcn_operand_part (V64DImode, operands[2], 1);
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")])
+
+; Special cases for addition.
+
+(define_insn "*plus_carry_dpp_shr_<mode>"
+ [(set (match_operand:VEC_1REG_INT_MODE 0 "register_operand" "=v")
+ (unspec:VEC_1REG_INT_MODE
+ [(match_operand:VEC_1REG_INT_MODE 1 "register_operand" "v")
+ (match_operand:VEC_1REG_INT_MODE 2 "register_operand" "v")
+ (match_operand:SI 3 "const_int_operand" "n")]
+ UNSPEC_PLUS_CARRY_DPP_SHR))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ {
+ const char *insn = TARGET_GCN3 ? "v_add%u0" : "v_add_co%u0";
+ return gcn_expand_dpp_shr_insn (<MODE>mode, insn,
+ UNSPEC_PLUS_CARRY_DPP_SHR,
+ INTVAL (operands[3]));
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "8")])
+
+(define_insn "*plus_carry_in_dpp_shr_v64si"
+ [(set (match_operand:V64SI 0 "register_operand" "=v")
+ (unspec:V64SI
+ [(match_operand:V64SI 1 "register_operand" "v")
+ (match_operand:V64SI 2 "register_operand" "v")
+ (match_operand:SI 3 "const_int_operand" "n")
+ (match_operand:DI 4 "register_operand" "cV")]
+ UNSPEC_PLUS_CARRY_IN_DPP_SHR))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ {
+ const char *insn = TARGET_GCN3 ? "v_addc%u0" : "v_addc_co%u0";
+ return gcn_expand_dpp_shr_insn (V64SImode, insn,
+ UNSPEC_PLUS_CARRY_IN_DPP_SHR,
+ INTVAL (operands[3]));
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "8")])
+
+(define_insn_and_split "*plus_carry_dpp_shr_v64di"
+ [(set (match_operand:V64DI 0 "register_operand" "=&v")
+ (unspec:V64DI
+ [(match_operand:V64DI 1 "register_operand" "v0")
+ (match_operand:V64DI 2 "register_operand" "v0")
+ (match_operand:SI 3 "const_int_operand" "n")]
+ UNSPEC_PLUS_CARRY_DPP_SHR))
+ (clobber (reg:DI VCC_REG))]
+ ""
+ "#"
+ "reload_completed"
+ [(parallel [(set (match_dup 4)
+ (unspec:V64SI
+ [(match_dup 6) (match_dup 8) (match_dup 3)]
+ UNSPEC_PLUS_CARRY_DPP_SHR))
+ (clobber (reg:DI VCC_REG))])
+ (parallel [(set (match_dup 5)
+ (unspec:V64SI
+ [(match_dup 7) (match_dup 9) (match_dup 3) (reg:DI VCC_REG)]
+ UNSPEC_PLUS_CARRY_IN_DPP_SHR))
+ (clobber (reg:DI VCC_REG))])]
+ {
+ operands[4] = gcn_operand_part (V64DImode, operands[0], 0);
+ operands[5] = gcn_operand_part (V64DImode, operands[0], 1);
+ operands[6] = gcn_operand_part (V64DImode, operands[1], 0);
+ operands[7] = gcn_operand_part (V64DImode, operands[1], 1);
+ operands[8] = gcn_operand_part (V64DImode, operands[2], 0);
+ operands[9] = gcn_operand_part (V64DImode, operands[2], 1);
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")])
+
+; Instructions to move a scalar value from lane 63 of a vector register.
+(define_insn "mov_from_lane63_<mode>"
+ [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
+ (unspec:<SCALAR_MODE>
+ [(match_operand:VEC_1REG_MODE 1 "register_operand" "v,v")]
+ UNSPEC_MOV_FROM_LANE63))]
+ ""
+ "@
+ v_readlane_b32\t%0, %1, 63
+ v_mov_b32\t%0, %1 wave_ror:1"
+ [(set_attr "type" "vop3a,vop_dpp")
+ (set_attr "exec" "none,*")
+ (set_attr "length" "8")])
+
+(define_insn "mov_from_lane63_v64di"
+ [(set (match_operand:DI 0 "register_operand" "=Sg,v")
+ (unspec:DI
+ [(match_operand:V64DI 1 "register_operand" "v,v")]
+ UNSPEC_MOV_FROM_LANE63))]
+ ""
+ "@
+ v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
+ * if (REGNO (operands[0]) <= REGNO (operands[1])) \
+ return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\" \
+ \"v_mov_b32\t%H0, %H1 wave_ror:1\"; \
+ else \
+ return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\" \
+ \"v_mov_b32\t%L0, %L1 wave_ror:1\";"
+ [(set_attr "type" "vop3a,vop_dpp")
+ (set_attr "exec" "none,*")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Miscellaneous
+
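+; vec_series builds the vector {base, base+step, base+2*step, ...} by
+; multiplying the lane-index register v1 (the register the while_ultsidi
+; expander above calls _0_1_2_3) by the step and adding the base.  Per lane:
+;   dst[L] = operands[1] + L * operands[2]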
+(define_expand "vec_seriesv64si"
+ [(match_operand:V64SI 0 "register_operand")
+ (match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")]
+ ""
+ {
+ rtx tmp = gen_reg_rtx (V64SImode);
+ rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+
+ emit_insn (gen_mulv64si3_dup (tmp, v1, operands[2]));
+ emit_insn (gen_addv64si3_dup (operands[0], tmp, operands[1]));
+ DONE;
+ })
+
+(define_expand "vec_seriesv64di"
+ [(match_operand:V64DI 0 "register_operand")
+ (match_operand:DI 1 "gcn_alu_operand")
+ (match_operand:DI 2 "gcn_alu_operand")]
+ ""
+ {
+ rtx tmp = gen_reg_rtx (V64DImode);
+ rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+
+ emit_insn (gen_mulv64di3_zext_dup2 (tmp, v1, operands[2]));
+ emit_insn (gen_addv64di3_dup (operands[0], tmp, operands[1]));
+ DONE;
+ })
+
+;; }}}
--- /dev/null
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+;;- See file "rtl.def" for documentation on define_insn, match_*, et. al.
+
+(include "predicates.md")
+(include "constraints.md")
+
+;; {{{ Constants and enums
+
+; Named registers
+(define_constants
+ [(FIRST_SGPR_REG 0)
+ (LAST_SGPR_REG 101)
+ (FLAT_SCRATCH_REG 102)
+ (FLAT_SCRATCH_LO_REG 102)
+ (FLAT_SCRATCH_HI_REG 103)
+ (XNACK_MASK_REG 104)
+ (XNACK_MASK_LO_REG 104)
+ (XNACK_MASK_HI_REG 105)
+ (VCC_REG 106)
+ (VCC_LO_REG 106)
+ (VCC_HI_REG 107)
+ (VCCZ_REG 108)
+ (TBA_REG 109)
+ (TBA_LO_REG 109)
+ (TBA_HI_REG 110)
+ (TMA_REG 111)
+ (TMA_LO_REG 111)
+ (TMA_HI_REG 112)
+ (TTMP0_REG 113)
+ (TTMP11_REG 124)
+ (M0_REG 125)
+ (EXEC_REG 126)
+ (EXEC_LO_REG 126)
+ (EXEC_HI_REG 127)
+ (EXECZ_REG 128)
+ (SCC_REG 129)
+ (FIRST_VGPR_REG 160)
+ (LAST_VGPR_REG 415)])
+
+(define_constants
+ [(SP_REGNUM 16)
+ (LR_REGNUM 18)
+ (AP_REGNUM 416)
+ (FP_REGNUM 418)])
+
+(define_c_enum "unspecv" [
+ UNSPECV_PROLOGUE_USE
+ UNSPECV_KERNEL_RETURN
+ UNSPECV_BARRIER
+ UNSPECV_ATOMIC
+ UNSPECV_ICACHE_INV])
+
+(define_c_enum "unspec" [
+ UNSPEC_VECTOR
+ UNSPEC_BPERMUTE
+ UNSPEC_SGPRBASE
+ UNSPEC_MEMORY_BARRIER
+ UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+ UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+ UNSPEC_PLUS_DPP_SHR
+ UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
+ UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
+ UNSPEC_MOV_FROM_LANE63
+ UNSPEC_GATHER
+ UNSPEC_SCATTER])
+
+;; }}}
+;; {{{ Attributes
+
+; Instruction type (encoding) as described in the ISA specification.
+; The following table summarizes possible operands of individual instruction
+; types and corresponding constraints.
+;
+; sop2 - scalar, two inputs, one output
+; ssrc0/ssrc1: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+; vccz,execz,scc,inline immediate,fp inline immediate
+; sdst: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+;
+; Constraints "=SD, SD", "SSA,SSB","SSB,SSA"
+;
+; sopk - scalar, inline constant input, one output
+; simm16: 16bit inline constant
+; sdst: same as sop2/ssrc0
+;
+; Constraints "=SD", "J"
+;
+; sop1 - scalar, one input, one output
+; ssrc0: same as sop2/ssrc0. FIXME: the manual omits VCCZ
+; sdst: same as sop2/sdst
+;
+; Constraints "=SD", "SSA"
+;
+; sopc - scalar, two inputs, one comparison
+; ssrc0: same as sop2/ssrc0.
+;
+; Constraints "SSI,SSA","SSA,SSI"
+;
+; sopp - scalar, one constant input, one special
+; simm16
+;
+; smem - scalar memory
+; sbase: aligned pair of sgprs. Specify {size[15:0], base[47:0]} in
+; dwords
+; sdata: sgpr0-102, flat_scratch, xnack, vcc, tba, tma
+; offset: sgpr or 20bit unsigned byte offset
+;
+; vop2 - vector, two inputs, one output
+; vsrc0: sgpr0-102,flat_scratch,xnack,vcc,tba,ttmp0-11,m0,exec,
+; inline constant -16 to -64, fp inline immediate, vccz, execz,
+; scc, lds, literal constant, vgpr0-255
+; vsrc1: vgpr0-255
+; vdst: vgpr0-255
+; Limitations: At most one SGPR, at most one constant
+; if constant is used, SGPR must be M0
+; Only SRC0 can be LDS_DIRECT
+;
+; constraints: "=v", "vBSv", "v"
+;
+; vop1 - vector, one input, one output
+; vsrc0: same as vop2/src0
+; vdst: vgpr0-255
+;
+; constraints: "=v", "vBSv"
+;
+; vopc - vector, two inputs, one comparison output
+; vsrc0: same as vop2/src0
+; vsrc1: vgpr0-255
+; vdst:
+;
+; constraints: "vASv", "v"
+;
+; vop3a - vector, three inputs, one output
+; vdst: vgpr0-255, for v_cmp sgpr or vcc
+; abs,clamp
+; vsrc0: sgpr0-102,vcc,tba,ttmp0-11,m0,exec,
+; inline constant -16 to -64, fp inline immediate, vccz, execz,
+; scc, lds_direct
+; FIXME: is 1/pi really missing? are there really 104 SGPRs?
+;
+; vop3b - vector, three inputs, one vector output, one scalar output
+; vsrc0,vsrc1,vsrc2: same as vop3a vsrc0
+; vdst: vgpr0-255
+; sdst: sgpr0-103/vcc/tba/tma/ttmp0-11
+;
+; vop_sdwa - second dword for vop1/vop2/vopc for specifying sub-dword address
+; src0: vgpr0-255
+; dst_sel: BYTE_0-3, WORD_0-1, DWORD
+; dst_unused: UNUSED_PAD, UNUSED_SEXT, UNUSED_PRESERVE
+; clamp: true/false
+; src0_sel: BYTE_0-3, WORD_0-1, DWORD
+; flags: src0_sext, src0_neg, src0_abs, src1_sel, src1_sext, src1_neg,
+; src1_abs
+;
+; vop_dpp - second dword for vop1/vop2/vopc for specifying data-parallel ops
+; src0: vgpr0-255
+; dpp_ctrl: quad_perm, row_sl0-15, row_sr0-15, row_rr0-15, wf_sl1,
+; wf_rl1, wf_sr1, wf_rr1, row_mirror, row_half_mirror,
+; bcast15, bcast31
+; flags: src0_neg, src0_abs, src1_neg, src1_abs
+; bank_mask: 4-bit mask
+; row_mask: 4-bit mask
+;
+; ds - Local and global data share instructions.
+; offset0: 8-bit constant
+; offset1: 8-bit constant
+; flag: gds
+; addr: vgpr0-255
+; data0: vgpr0-255
+; data1: vgpr0-255
+; vdst: vgpr0-255
+;
+; mubuf - Untyped memory buffer operation. First word with LDS, second word
+; non-LDS.
+; offset: 12-bit constant
+; vaddr: vgpr0-255
+; vdata: vgpr0-255
+; srsrc: sgpr0-102
+; soffset: sgpr0-102
+; flags: offen, idxen, glc, lds, slc, tfe
+;
+; mtbuf - Typed memory buffer operation. Two words
+; offset: 12-bit constant
+; dfmt: 4-bit constant
+; nfmt: 3-bit constant
+; vaddr: vgpr0-255
+; vdata: vgpr0-255
+; srsrc: sgpr0-102
+; soffset: sgpr0-102
+; flags: offen, idxen, glc, lds, slc, tfe
+;
+; flat - flat or global memory operations
+; flags: glc, slc
+; addr: vgpr0-255
+; data: vgpr0-255
+; vdst: vgpr0-255
+;
+; mult - expands to multiple instructions (pseudo encoding)
+;
+; vmult - as mult, when a vector instruction is used.
+
+(define_attr "type"
+ "unknown,sop1,sop2,sopk,sopc,sopp,smem,ds,vop2,vop1,vopc,
+ vop3a,vop3b,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,vmult"
+ (const_string "unknown"))
+
+; Set if instruction is executed in scalar or vector unit
+
+(define_attr "unit" "unknown,scalar,vector"
+ (cond [(eq_attr "type" "sop1,sop2,sopk,sopc,sopp,smem,mult")
+ (const_string "scalar")
+ (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,
+ vop_sdwa,vop_dpp,flat,vmult")
+ (const_string "vector")]
+ (const_string "unknown")))
+
+; All vector instructions run as 64 threads, predicated by the EXEC
+; register. Scalar operations in vector registers require a single lane
+; enabled, vector moves require a full set of lanes enabled, and most vector
+; operations handle the lane masking themselves.
+; The md_reorg pass is responsible for ensuring that EXEC is set appropriately
+; according to the following settings:
+; auto - md_reorg will inspect def/use to determine what to do.
+; none - exec is not needed.
+; single - disable all but lane zero.
+; full - enable all lanes.
+
+(define_attr "exec" "auto,none,single,full"
+ (const_string "auto"))
+
+; Infer the (worst-case) length from the instruction type by default. Many
+; types can have an optional immediate word following, which we include here.
+; "Multiple" types are counted as two 64-bit instructions. This is just a
+; default fallback: it can be overridden per-alternative in insn patterns for
+; greater accuracy.
+
+(define_attr "length" ""
+ (cond [(eq_attr "type" "sop1") (const_int 8)
+ (eq_attr "type" "sop2") (const_int 8)
+ (eq_attr "type" "sopk") (const_int 8)
+ (eq_attr "type" "sopc") (const_int 8)
+ (eq_attr "type" "sopp") (const_int 4)
+ (eq_attr "type" "smem") (const_int 8)
+ (eq_attr "type" "ds") (const_int 8)
+ (eq_attr "type" "vop1") (const_int 8)
+ (eq_attr "type" "vop2") (const_int 8)
+ (eq_attr "type" "vopc") (const_int 8)
+ (eq_attr "type" "vop3a") (const_int 8)
+ (eq_attr "type" "vop3b") (const_int 8)
+ (eq_attr "type" "vop_sdwa") (const_int 8)
+ (eq_attr "type" "vop_dpp") (const_int 8)
+ (eq_attr "type" "flat") (const_int 8)
+ (eq_attr "type" "mult") (const_int 16)
+ (eq_attr "type" "vmult") (const_int 16)]
+ (const_int 4)))
+
+; Disable alternatives that only apply to specific ISA variants.
+
+(define_attr "gcn_version" "gcn3,gcn5" (const_string "gcn3"))
+
+(define_attr "enabled" ""
+ (cond [(eq_attr "gcn_version" "gcn3") (const_int 1)
+ (and (eq_attr "gcn_version" "gcn5")
+ (ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
+ (const_int 1)]
+ (const_int 0)))
+
+; We need to be able to identify v_readlane and v_writelane with
+; SGPR lane selection in order to handle "Manually Inserted Wait States".
+
+(define_attr "laneselect" "yes,no" (const_string "no"))
+
+;; }}}
+;; {{{ Iterators useful across the whole machine description
+
+(define_mode_iterator SIDI [SI DI])
+(define_mode_iterator SFDF [SF DF])
+(define_mode_iterator SISF [SI SF])
+(define_mode_iterator QIHI [QI HI])
+(define_mode_iterator DIDF [DI DF])
+
+;; }}}
+;; {{{ Attributes.
+
+; Translate RTX code into GCN instruction mnemonics with and without
+; suffixes such as _b32, etc.
+
+(define_code_attr mnemonic
+ [(minus "sub%i")
+ (plus "add%i")
+ (ashift "lshl%b")
+ (lshiftrt "lshr%b")
+ (ashiftrt "ashr%i")
+ (and "and%B")
+ (ior "or%B")
+ (xor "xor%B")
+ (mult "mul%i")
+ (smin "min%i")
+ (smax "max%i")
+ (umin "min%u")
+ (umax "max%u")
+ (not "not%b")
+ (popcount "bcnt_u32%b")])
+
+(define_code_attr bare_mnemonic
+ [(plus "add")
+ (minus "sub")
+ (and "and")
+ (ior "or")
+ (xor "xor")])
+
+(define_code_attr s_mnemonic
+ [(not "not%b")
+ (popcount "bcnt1_i32%b")])
+
+(define_code_attr revmnemonic
+ [(minus "subrev%i")
+ (ashift "lshlrev%b")
+ (lshiftrt "lshrrev%b")
+ (ashiftrt "ashrrev%i")])
+
+; Translate RTX code into corresponding expander name.
+
+(define_code_attr expander
+ [(and "and")
+ (ior "ior")
+ (xor "xor")
+ (plus "add")
+ (minus "sub")
+ (ashift "ashl")
+ (lshiftrt "lshr")
+ (ashiftrt "ashr")
+ (mult "mul")
+ (smin "smin")
+ (smax "smax")
+ (umin "umin")
+ (umax "umax")
+ (not "one_cmpl")
+ (popcount "popcount")])
+
+;; }}}
+;; {{{ Miscellaneous instructions
+
+(define_insn "nop"
+ [(const_int 0)]
+ ""
+ "s_nop\t0x0"
+ [(set_attr "type" "sopp")])
+
+; FIXME: What should the value of the immediate be? Zero is disallowed, so
+; pick 1 for now.
+(define_insn "trap"
+ [(trap_if (const_int 1) (const_int 0))]
+ ""
+ "s_trap\t1"
+ [(set_attr "type" "sopp")])
+
+;; }}}
+;; {{{ Moves
+
+;; All scalar modes we support moves in.
+(define_mode_iterator MOV_MODE [BI QI HI SI DI TI SF DF])
+
+; This is the entry point for creating all kinds of scalar moves,
+; including reloads and symbols.
+
+(define_expand "mov<mode>"
+ [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
+ (match_operand:MOV_MODE 1 "general_operand"))]
+ ""
+ {
+ if (MEM_P (operands[0]))
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+
+ if (!lra_in_progress && !reload_completed
+ && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1]))
+ {
+ /* Something is probably trying to generate a move
+ which can only work indirectly,
+ e.g. a move from LDS memory to an SGPR hard register,
+ or from MEM:QI to an SGPR. */
+ rtx tmpreg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mov<mode> (tmpreg, operands[1]));
+ emit_insn (gen_mov<mode> (operands[0], tmpreg));
+ DONE;
+ }
+
+ if (<MODE>mode == DImode
+ && (GET_CODE (operands[1]) == SYMBOL_REF
+ || GET_CODE (operands[1]) == LABEL_REF))
+ {
+ emit_insn (gen_movdi_symbol (operands[0], operands[1]));
+ DONE;
+ }
+ })
+
+; Split invalid moves into two valid moves
+
+(define_split
+ [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
+ (match_operand:MOV_MODE 1 "general_operand"))]
+ "!reload_completed && !lra_in_progress
+ && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0) (match_dup 2))]
+ {
+ operands[2] = gen_reg_rtx (<MODE>mode);
+ })
+
+; We need BImode move so we can reload flags registers.
+
+(define_insn "*movbi"
+ [(set (match_operand:BI 0 "nonimmediate_operand"
+ "=Sg, v,Sg,cs,cV,cV,Sm,RS, v,RF, v,RM")
+ (match_operand:BI 1 "gcn_load_operand"
+ "SSA,vSvA, v,SS, v,SS,RS,Sm,RF, v,RM, v"))]
+ ""
+ {
+ /* SCC as an operand is currently not accepted by the LLVM assembler, so
+ we emit bytes directly as a workaround. */
+ switch (which_alternative) {
+ case 0:
+ if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+ return "; s_mov_b32\t%0,%1 is not supported by the assembler.\;"
+ ".byte\t0xfd\;"
+ ".byte\t0x0\;"
+ ".byte\t0x80|%R0\;"
+ ".byte\t0xbe";
+ else
+ return "s_mov_b32\t%0, %1";
+ case 1:
+ if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+ return "; v_mov_b32\t%0, %1\;"
+ ".byte\t0xfd\;"
+ ".byte\t0x2\;"
+ ".byte\t((%V0<<1)&0xff)\;"
+ ".byte\t0x7e|(%V0>>7)";
+ else
+ return "v_mov_b32\t%0, %1";
+ case 2:
+ return "v_readlane_b32\t%0, %1, 0";
+ case 3:
+ return "s_cmpk_lg_u32\t%1, 0";
+ case 4:
+ return "v_cmp_ne_u32\tvcc, 0, %1";
+ case 5:
+ if (REGNO (operands[1]) == SCC_REG)
+ return "; s_mov_b32\t%0, %1 is not supported by the assembler.\;"
+ ".byte\t0xfd\;"
+ ".byte\t0x0\;"
+ ".byte\t0xea\;"
+ ".byte\t0xbe\;"
+ "s_mov_b32\tvcc_hi, 0";
+ else
+ return "s_mov_b32\tvcc_lo, %1\;"
+ "s_mov_b32\tvcc_hi, 0";
+ case 6:
+ return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
+ case 7:
+ return "s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)";
+ case 8:
+ return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
+ case 9:
+ return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+ case 10:
+ return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
+ case 11:
+ return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+ default:
+ gcc_unreachable ();
+ }
+ }
+ [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,flat,flat,
+ flat,flat")
+ (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*")
+ (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12")])
+
+; 32bit move pattern
+
+(define_insn "*mov<mode>_insn"
+ [(set (match_operand:SISF 0 "nonimmediate_operand"
+ "=SD,SD,SD,SD,RB,Sm,RS,v,Sg, v, v,RF,v,RLRG, v,SD, v,RM")
+ (match_operand:SISF 1 "gcn_load_operand"
+ "SSA, J, B,RB,Sm,RS,Sm,v, v,Sv,RF, v,B, v,RLRG, Y,RM, v"))]
+ ""
+ "@
+ s_mov_b32\t%0, %1
+ s_movk_i32\t%0, %1
+ s_mov_b32\t%0, %1
+ s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+ s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\texpcnt(0)
+ s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)
+ v_mov_b32\t%0, %1
+ v_readlane_b32\t%0, %1, 0
+ v_writelane_b32\t%0, %1, 0
+ flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+ flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+ v_mov_b32\t%0, %1
+ ds_write_b32\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+ ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ s_mov_b32\t%0, %1
+ global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+ [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
+ flat,vop1,ds,ds,sop1,flat,flat")
+ (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*")
+ (set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")])
+
+; 8/16bit move pattern
+
+(define_insn "*mov<mode>_insn"
+ [(set (match_operand:QIHI 0 "nonimmediate_operand"
+ "=SD,SD,SD,v,Sg, v, v,RF,v,RLRG, v, v,RM")
+ (match_operand:QIHI 1 "gcn_load_operand"
+ "SSA, J, B,v, v,Sv,RF, v,B, v,RLRG,RM, v"))]
+ "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
+ "@
+ s_mov_b32\t%0, %1
+ s_movk_i32\t%0, %1
+ s_mov_b32\t%0, %1
+ v_mov_b32\t%0, %1
+ v_readlane_b32\t%0, %1, 0
+ v_writelane_b32\t%0, %1, 0
+ flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+ flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+ v_mov_b32\t%0, %1
+ ds_write%b0\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+ ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+ [(set_attr "type"
+ "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
+ (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*")
+ (set_attr "length" "4,4,8,4,4,4,12,12,8,12,12,12,12")])
+
+; 64-bit move pattern
+
+(define_insn_and_split "*mov<mode>_insn"
+ [(set (match_operand:DIDF 0 "nonimmediate_operand"
+ "=SD,SD,SD,RS,Sm,v, v,Sg, v, v,RF,RLRG, v, v,RM")
+ (match_operand:DIDF 1 "general_operand"
+ "SSA, C,DB,Sm,RS,v,DB, v,Sv,RF, v, v,RLRG,RM, v"))]
+ "GET_CODE(operands[1]) != SYMBOL_REF"
+ "@
+ s_mov_b64\t%0, %1
+ s_mov_b64\t%0, %1
+ #
+ s_store_dwordx2\t%1, %A0\;s_waitcnt\texpcnt(0)
+ s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ #
+ #
+ #
+ #
+ flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+ flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+ ds_write_b64\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+ ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+ "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1])
+ && !gcn_sgpr_move_p (operands[0], operands[1]))
+ || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))]
+ {
+ rtx inlo = gen_lowpart (SImode, operands[1]);
+ rtx inhi = gen_highpart_mode (SImode, <MODE>mode, operands[1]);
+ rtx outlo = gen_lowpart (SImode, operands[0]);
+ rtx outhi = gen_highpart_mode (SImode, <MODE>mode, operands[0]);
+
+ /* Ensure that overlapping registers aren't corrupted. */
+ if (REGNO (outlo) == REGNO (inhi))
+ {
+ operands[0] = outhi;
+ operands[1] = inhi;
+ operands[2] = outlo;
+ operands[3] = inlo;
+ }
+ else
+ {
+ operands[0] = outlo;
+ operands[1] = inlo;
+ operands[2] = outhi;
+ operands[3] = inhi;
+ }
+ }
+ [(set_attr "type" "sop1,sop1,mult,smem,smem,vmult,vmult,vmult,vmult,flat,
+ flat,ds,ds,flat,flat")
+ (set_attr "length" "4,8,*,12,12,*,*,*,*,12,12,12,12,12,12")])
+
+; 128-bit move.
+
+(define_insn_and_split "*movti_insn"
+ [(set (match_operand:TI 0 "nonimmediate_operand"
+ "=SD,RS,Sm,RF, v,v, v,SD,RM, v,RL, v")
+ (match_operand:TI 1 "general_operand"
+ "SSB,Sm,RS, v,RF,v,Sv, v, v,RM, v,RL"))]
+ ""
+ "@
+ #
+ s_store_dwordx4\t%1, %A0\;s_waitcnt\texpcnt(0)
+ s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+ flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+ #
+ #
+ #
+ global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+ global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ ds_write_b128\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+ ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
+ "reload_completed
+ && REG_P (operands[0])
+ && (REG_P (operands[1]) || GET_CODE (operands[1]) == CONST_INT)"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))
+ (set (match_dup 4) (match_dup 5))
+ (set (match_dup 6) (match_dup 7))]
+ {
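+ /* Write operands 0 and 1 (the lowest part) last: gcn_operand_part
+ reads the original TImode values, so they must not be overwritten
+ until all the other parts have been extracted. */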
+ operands[6] = gcn_operand_part (TImode, operands[0], 3);
+ operands[7] = gcn_operand_part (TImode, operands[1], 3);
+ operands[4] = gcn_operand_part (TImode, operands[0], 2);
+ operands[5] = gcn_operand_part (TImode, operands[1], 2);
+ operands[2] = gcn_operand_part (TImode, operands[0], 1);
+ operands[3] = gcn_operand_part (TImode, operands[1], 1);
+ operands[0] = gcn_operand_part (TImode, operands[0], 0);
+ operands[1] = gcn_operand_part (TImode, operands[1], 0);
+ }
+ [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
+ ds,ds")
+ (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
+
+;; }}}
+;; {{{ Prologue/Epilogue
+
+(define_insn "prologue_use"
+ [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)]
+ ""
+ ""
+ [(set_attr "length" "0")])
+
+(define_expand "prologue"
+ [(const_int 0)]
+ ""
+ {
+ gcn_expand_prologue ();
+ DONE;
+ })
+
+(define_expand "epilogue"
+ [(const_int 0)]
+ ""
+ {
+ gcn_expand_epilogue ();
+ DONE;
+ })
+
+;; }}}
+;; {{{ Control flow
+
+; This pattern must satisfy simplejump_p, which means it cannot be a parallel
+; that clobbers SCC. Thus, we must preserve SCC if we're generating a long
+; branch sequence.
+
+(define_insn "jump"
+ [(set (pc)
+ (label_ref (match_operand 0)))]
+ ""
+ {
+ if (get_attr_length (insn) == 4)
+ return "s_branch\t%0";
+ else
+ /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG. */
+ return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
+ ".long\t0xbe9600fd\;"
+ "s_getpc_b64\ts[20:21]\;"
+ "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+ "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+ "s_cmpk_lg_u32\ts22, 0\;"
+ "s_setpc_b64\ts[20:21]";
+ }
+ [(set_attr "type" "sopp")
+ (set (attr "length")
+ (if_then_else (and (ge (minus (match_dup 0) (pc))
+ (const_int -131072))
+ (lt (minus (match_dup 0) (pc))
+ (const_int 131072)))
+ (const_int 4)
+ (const_int 32)))])
+
+(define_insn "indirect_jump"
+ [(set (pc)
+ (match_operand:DI 0 "register_operand" "Sg"))]
+ ""
+ "s_setpc_b64\t%0"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "4")])
+
+(define_insn "cjump"
+ [(set (pc)
+ (if_then_else
+ (match_operator:BI 1 "gcn_conditional_operator"
+ [(match_operand:BI 2 "gcn_conditional_register_operand" "ca,cV")
+ (const_int 0)])
+ (label_ref (match_operand 0))
+ (pc)))]
+ ""
+ {
+ if (get_attr_length (insn) == 4)
+ return "s_cbranch%C1\t%0";
+ else
+ {
+ /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG but
+ restores SCC. */
+ if (REGNO (operands[2]) == SCC_REG)
+ {
+ if (GET_CODE (operands[1]) == EQ)
+ return "s_cbranch%c1\t.Lskip%=\;"
+ "s_getpc_b64\ts[20:21]\;"
+ "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+ "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+ "s_cmp_lg_u32\t0, 0\;"
+ "s_setpc_b64\ts[20:21]\n"
+ ".Lskip%=:";
+ else
+ return "s_cbranch%c1\t.Lskip%=\;"
+ "s_getpc_b64\ts[20:21]\;"
+ "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+ "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+ "s_cmp_eq_u32\t0, 0\;"
+ "s_setpc_b64\ts[20:21]\n"
+ ".Lskip%=:";
+ }
+ else
+ return "s_cbranch%c1\t.Lskip%=\;"
+ "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
+ ".byte\t0xfd\;"
+ ".byte\t0x0\;"
+ ".byte\t0x80|22\;"
+ ".byte\t0xbe\;"
+ "s_getpc_b64\ts[20:21]\;"
+ "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+ "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+ "s_cmpk_lg_u32\ts22, 0\;"
+ "s_setpc_b64\ts[20:21]\n"
+ ".Lskip%=:";
+ }
+ }
+ [(set_attr "type" "sopp")
+ (set (attr "length")
+ (if_then_else (and (ge (minus (match_dup 0) (pc))
+ (const_int -131072))
+ (lt (minus (match_dup 0) (pc))
+ (const_int 131072)))
+ (const_int 4)
+ (const_int 36)))])
+
+; Returning from a normal function is different from returning from a
+; kernel function.
+
+(define_insn "gcn_return"
+ [(return)]
+ ""
+ {
+ if (cfun && cfun->machine && cfun->machine->normal_function)
+ return "s_setpc_b64\ts[18:19]";
+ else
+ return "s_dcache_wb\;s_endpgm";
+ }
+ [(set_attr "type" "sop1")
+ (set_attr "length" "8")])
+
+(define_expand "call"
+ [(parallel [(call (match_operand 0 "")
+ (match_operand 1 ""))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 2))])]
+ ""
+ {})
+
+(define_insn "gcn_simple_call"
+ [(call (mem (match_operand 0 "immediate_operand" "Y,B"))
+ (match_operand 1 "const_int_operand"))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 2 "=&Sg,X"))]
+ ""
+ "@
+ s_getpc_b64\t%2\;s_add_u32\t%L2, %L2, %0@rel32@lo+4\;s_addc_u32\t%H2, %H2, %0@rel32@hi+4\;s_swappc_b64\ts[18:19], %2
+ s_swappc_b64\ts[18:19], %0"
+ [(set_attr "type" "mult,sop1")
+ (set_attr "length" "24,4")])
+
+(define_insn "movdi_symbol"
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg")
+ (match_operand:DI 1 "general_operand" "Y"))
+ (clobber (reg:BI SCC_REG))]
+ "GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF"
+ {
+ if (SYMBOL_REF_P (operands[1])
+ && SYMBOL_REF_WEAK (operands[1]))
+ return "s_getpc_b64\t%0\;"
+ "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;"
+ "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;"
+ "s_load_dwordx2\t%0, %0\;"
+ "s_waitcnt\tlgkmcnt(0)";
+
+ return "s_getpc_b64\t%0\;"
+ "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;"
+ "s_addc_u32\t%H0, %H0, %1@rel32@hi+4";
+ }
+ [(set_attr "type" "mult")
+ (set_attr "length" "32")])
+
+(define_insn "gcn_indirect_call"
+ [(call (mem (match_operand:DI 0 "register_operand" "Sg"))
+ (match_operand 1 "" ""))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 2 "=X"))]
+ ""
+ "s_swappc_b64\ts[18:19], %0"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "4")])
+
+(define_expand "call_value"
+ [(parallel [(set (match_operand 0 "")
+ (call (match_operand 1 "")
+ (match_operand 2 "")))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 3))])]
+ ""
+ {})
+
+(define_insn "gcn_call_value"
+ [(set (match_operand 0 "register_operand" "=Sg,Sg")
+ (call (mem (match_operand 1 "immediate_operand" "Y,B"))
+ (match_operand 2 "const_int_operand")))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 3 "=&Sg,X"))]
+ ""
+ "@
+ s_getpc_b64\t%3\;s_add_u32\t%L3, %L3, %1@rel32@lo+4\;s_addc_u32\t%H3, %H3, %1@rel32@hi+4\;s_swappc_b64\ts[18:19], %3
+ s_swappc_b64\ts[18:19], %1"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "24")])
+
+(define_insn "gcn_call_value_indirect"
+ [(set (match_operand 0 "register_operand" "=Sg")
+ (call (mem (match_operand:DI 1 "register_operand" "Sg"))
+ (match_operand 2 "" "")))
+ (clobber (reg:DI LR_REGNUM))
+ (clobber (match_scratch:DI 3 "=X"))]
+ ""
+ "s_swappc_b64\ts[18:19], %1"
+ [(set_attr "type" "sop1")
+ (set_attr "length" "4")])
+
+; GCN does not have an instruction to clear only part of the instruction
+; cache, so the operands are ignored.
+
+(define_insn "clear_icache"
+ [(unspec_volatile
+ [(match_operand 0 "") (match_operand 1 "")]
+ UNSPECV_ICACHE_INV)]
+ ""
+ "s_icache_inv"
+ [(set_attr "type" "sopp")
+ (set_attr "length" "4")])
+
+;; }}}
+;; {{{ Conditionals
+
+; 32-bit compare, scalar unit only
+
+(define_insn "cstoresi4"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand"
+ "=cs, cs, cs, cs")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:SI 2 "gcn_alu_operand" "SSA,SSA,SSB, SS")
+ (match_operand:SI 3 "gcn_alu_operand" "SSA,SSL, SS,SSB")]))]
+ ""
+ "@
+ s_cmp%D1\t%2, %3
+ s_cmpk%D1\t%2, %3
+ s_cmp%D1\t%2, %3
+ s_cmp%D1\t%2, %3"
+ [(set_attr "type" "sopc,sopk,sopk,sopk")
+ (set_attr "length" "4,4,8,8")])
+
+(define_expand "cbranchsi4"
+ [(match_operator 0 "gcn_compare_operator"
+ [(match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
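+ /* Expand as a compare into a BImode condition register, followed by a
+ conditional jump taken when that register is non-zero. */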
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstoresi4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+; 64-bit compare; either unit, but scalar allows limited operators
+
+(define_expand "cstoredi4"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:DI 2 "gcn_alu_operand")
+ (match_operand:DI 3 "gcn_alu_operand")]))]
+ ""
+ {})
+
+(define_insn "cstoredi4_vec_and_scalar"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cs, cV")
+ (match_operator:BI 1 "gcn_compare_64bit_operator"
+ [(match_operand:DI 2 "gcn_alu_operand" "%SSA,vSvC")
+ (match_operand:DI 3 "gcn_alu_operand" " SSC, v")]))]
+ ""
+ "@
+ s_cmp%D1\t%2, %3
+ v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "sopc,vopc")
+ (set_attr "length" "8")])
+
+(define_insn "cstoredi4_vector"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cV")
+ (match_operator:BI 1 "gcn_compare_operator"
+ [(match_operand:DI 2 "gcn_alu_operand" "vSvB")
+ (match_operand:DI 3 "gcn_alu_operand" " v")]))]
+ ""
+ "v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "vopc")
+ (set_attr "length" "8")])
+
+(define_expand "cbranchdi4"
+ [(match_operator 0 "gcn_compare_operator"
+ [(match_operand:DI 1 "gcn_alu_operand")
+ (match_operand:DI 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstoredi4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+; FP compare; vector unit only
+
+(define_insn "cstore<mode>4"
+ [(set (match_operand:BI 0 "gcn_conditional_register_operand" "=cV")
+ (match_operator:BI 1 "gcn_fp_compare_operator"
+ [(match_operand:SFDF 2 "gcn_alu_operand" "vB")
+ (match_operand:SFDF 3 "gcn_alu_operand" "v")]))]
+ ""
+ "v_cmp%E1\tvcc, %2, %3"
+ [(set_attr "type" "vopc")
+ (set_attr "length" "8")])
+
+(define_expand "cbranch<mode>4"
+ [(match_operator 0 "gcn_fp_compare_operator"
+ [(match_operand:SFDF 1 "gcn_alu_operand")
+ (match_operand:SFDF 2 "gcn_alu_operand")])
+ (match_operand 3)]
+ ""
+ {
+ rtx cc = gen_reg_rtx (BImode);
+ emit_insn (gen_cstore<mode>4 (cc, operands[0], operands[1], operands[2]));
+ emit_jump_insn (gen_cjump (operands[3],
+ gen_rtx_NE (BImode, cc, const0_rtx), cc));
+ DONE;
+ })
+
+;; }}}
+;; {{{ ALU special cases: Plus
+
+(define_insn "addsi3"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, Sg, Sg, v")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ, B,vBSv")))
+ (clobber (match_scratch:BI 3 "= cs, cs, cs, X"))
+ (clobber (match_scratch:DI 4 "= X, X, X, cV"))]
+ ""
+ "@
+ s_add_i32\t%0, %1, %2
+ s_addk_i32\t%0, %2
+ s_add_i32\t%0, %1, %2
+ v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "sop2,sopk,sop2,vop2")
+ (set_attr "length" "4,4,8,8")])
+
+(define_expand "addsi3_scc"
+ [(parallel [(set (match_operand:SI 0 "register_operand")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand")
+ (match_operand:SI 2 "gcn_alu_operand")))
+ (clobber (reg:BI SCC_REG))
+ (clobber (scratch:DI))])]
+ ""
+ {})
+
+; Implementing this as an insn_and_split keeps DImode adds together through
+; some RTL optimisation passes, and means the CC reg we set does not depend
+; on which constraint alternative is chosen (making it depend on the
+; alternative does not seem to work well).
+
+; The early clobber is needed for cases like "v[0:1]=v[1:2]+?";
+; "v[0:1]=v[0:1]+?" is fine (as is "v[1:2]=v[0:1]+?", but that case is
+; trickier).
+
+; If v_addc_u32 is used to add with carry, a 32-bit literal constant cannot be
+; used as an operand due to the read of VCC, so we restrict constants to the
+; inlinable range for that alternative.
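+
+; A rough C sketch (illustration only, not part of this port) of the
+; lowering the splitter below performs: add the low halves, capture the
+; unsigned carry (SCC for the scalar ALU, VCC for the vector ALU), then
+; add the high halves plus that carry.
+;
+;   #include <stdint.h>
+;
+;   static uint64_t
+;   add64_via_carry (uint64_t a, uint64_t b)
+;   {
+;     uint32_t lo = (uint32_t) a + (uint32_t) b;    /* addsi3_scalar_carry */
+;     uint32_t carry = lo < (uint32_t) a;           /* carry out           */
+;     uint32_t hi = (uint32_t) (a >> 32)
+;                   + (uint32_t) (b >> 32) + carry; /* addcsi3_scalar/_zero */
+;     return ((uint64_t) hi << 32) | lo;
+;   }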
+
+(define_insn_and_split "adddi3"
+ [(set (match_operand:DI 0 "register_operand"
+ "=&Sg,&Sg,&Sg,&Sg,&v,&v,&v,&v")
+ (plus:DI (match_operand:DI 1 "register_operand"
+ " Sg, 0, 0, Sg, v, 0, 0, v")
+ (match_operand:DI 2 "nonmemory_operand"
+ " 0,SgB, 0,SgB, 0,vA, 0,vA")))
+ (clobber (match_scratch:BI 3 "= cs, cs, cs, cs, X, X, X, X"))
+ (clobber (match_scratch:DI 4 "= X, X, X, X,cV,cV,cV,cV"))]
+ ""
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+ {
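+ /* The vector ALU carries through VCC; the scalar ALU carries through
+ SCC. Pick whichever matches the register class chosen by reload. */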
+ rtx cc = gen_rtx_REG (BImode, gcn_vgpr_register_operand (operands[1],
+ DImode)
+ ? VCC_REG : SCC_REG);
+
+ emit_insn (gen_addsi3_scalar_carry
+ (gcn_operand_part (DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[1], 0),
+ gcn_operand_part (DImode, operands[2], 0),
+ cc));
+ rtx val = gcn_operand_part (DImode, operands[2], 1);
+ if (val != const0_rtx)
+ emit_insn (gen_addcsi3_scalar
+ (gcn_operand_part (DImode, operands[0], 1),
+ gcn_operand_part (DImode, operands[1], 1),
+ gcn_operand_part (DImode, operands[2], 1),
+ cc, cc));
+ else
+ emit_insn (gen_addcsi3_scalar_zero
+ (gcn_operand_part (DImode, operands[0], 1),
+ gcn_operand_part (DImode, operands[1], 1),
+ cc));
+ DONE;
+ }
+ [(set_attr "type" "mult,mult,mult,mult,vmult,vmult,vmult,vmult")
+ (set_attr "length" "8")])
+
+(define_expand "adddi3_scc"
+ [(parallel [(set (match_operand:DI 0 "register_operand")
+ (plus:DI (match_operand:DI 1 "register_operand")
+ (match_operand:DI 2 "nonmemory_operand")))
+ (clobber (reg:BI SCC_REG))
+ (clobber (scratch:DI))])]
+ ""
+ {})
+
+;; Add with carry.
+
+(define_insn "addsi3_scalar_carry"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, v")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgB,vB")))
+ (set (match_operand:BI 3 "register_operand" "= cs,cV")
+ (ltu:BI (plus:SI (match_dup 1)
+ (match_dup 2))
+ (match_dup 1)))]
+ ""
+ "@
+ s_add_u32\t%0, %1, %2
+ v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "8,8")])
+
+(define_insn "addsi3_scalar_carry_cst"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, v")
+ (plus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA, v")
+ (match_operand:SI 2 "const_int_operand" " n, n")))
+ (set (match_operand:BI 4 "register_operand" "=cs,cV")
+ (geu:BI (plus:SI (match_dup 1)
+ (match_dup 2))
+ (match_operand:SI 3 "const_int_operand" " n, n")))]
+ "INTVAL (operands[2]) == -INTVAL (operands[3])"
+ "@
+ s_add_u32\t%0, %1, %2
+ v_add%^_u32\t%0, vcc, %2, %1"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "4")])
+
+(define_insn "addcsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "= Sg, v")
+ (plus:SI (plus:SI (zero_extend:SI
+ (match_operand:BI 3 "register_operand" "= cs,cV"))
+ (match_operand:SI 1 "gcn_alu_operand" "%SgA, v"))
+ (match_operand:SI 2 "gcn_alu_operand" " SgB,vA")))
+ (set (match_operand:BI 4 "register_operand" "= 3, 3")
+ (ior:BI (ltu:BI (plus:SI
+ (plus:SI
+ (zero_extend:SI (match_dup 3))
+ (match_dup 1))
+ (match_dup 2))
+ (match_dup 2))
+ (ltu:BI (plus:SI (zero_extend:SI (match_dup 3)) (match_dup 1))
+ (match_dup 1))))]
+ ""
+ "@
+ s_addc_u32\t%0, %1, %2
+ v_addc%^_u32\t%0, vcc, %2, %1, vcc"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "8,4")])
+
+(define_insn "addcsi3_scalar_zero"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, v")
+ (plus:SI (zero_extend:SI
+ (match_operand:BI 2 "register_operand" "=cs,cV"))
+ (match_operand:SI 1 "gcn_alu_operand" "SgA, v")))
+ (set (match_dup 2)
+ (ltu:BI (plus:SI (zero_extend:SI (match_dup 2))
+ (match_dup 1))
+ (match_dup 1)))]
+ ""
+ "@
+ s_addc_u32\t%0, %1, 0
+ v_addc%^_u32\t%0, vcc, 0, %1, vcc"
+ [(set_attr "type" "sop2,vop2")
+ (set_attr "length" "4")])
+
+; "addptr" is the same as "add" except that it must not write to VCC or SCC
+; as a side-effect. Unfortunately GCN does not have a suitable instruction
+; for this, so we use a custom VOP3 add with CC_SAVE_REG as a temp.
+; Note that it is not safe to save/clobber/restore SCC because doing so will
+; break data-flow analysis, so this must use vector registers.
+
+(define_insn "addptrdi3"
+ [(set (match_operand:DI 0 "register_operand" "= &v")
+ (plus:DI (match_operand:DI 1 "register_operand" " v0")
+ (match_operand:DI 2 "nonmemory_operand" "vDA0")))]
+ ""
+ {
+ rtx new_operands[4] = { operands[0], operands[1], operands[2],
+ gen_rtx_REG (DImode, CC_SAVE_REG) };
+
+ output_asm_insn ("v_add%^_u32 %L0, %3, %L2, %L1", new_operands);
+ output_asm_insn ("v_addc%^_u32 %H0, %3, %H2, %H1, %3", new_operands);
+
+ return "";
+ }
+ [(set_attr "type" "vmult")
+ (set_attr "length" "16")])
+
+;; }}}
+;; {{{ ALU special cases: Minus
+
+(define_insn "subsi3"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, v, v")
+ (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA, v,vBSv")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA, B, vBSv, v")))
+ (clobber (match_scratch:BI 3 "=cs, cs, X, X"))
+ (clobber (match_scratch:DI 4 "= X, X, cV, cV"))]
+ ""
+ "@
+ s_sub_i32\t%0, %1, %2
+ s_sub_i32\t%0, %1, %2
+ v_subrev%^_u32\t%0, vcc, %2, %1
+ v_sub%^_u32\t%0, vcc, %1, %2"
+ [(set_attr "type" "sop2,sop2,vop2,vop2")
+ (set_attr "length" "4,8,8,8")])
+
+(define_insn_and_split "subdi3"
+ [(set (match_operand:DI 0 "register_operand" "=Sg, Sg")
+ (minus:DI
+ (match_operand:DI 1 "gcn_alu_operand" "SgA,SgB")
+ (match_operand:DI 2 "gcn_alu_operand" "SgB,SgA")))
+ (clobber (reg:BI SCC_REG))]
+ ""
+ "#"
+ "reload_completed"
+ [(const_int 0)]
+ {
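+ /* Mirror the adddi3 split above: subtract the low halves first (setting
+ SCC on borrow), then subtract-with-borrow on the high halves. */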
+ emit_insn (gen_subsi3_scalar_carry
+ (gcn_operand_part (DImode, operands[0], 0),
+ gcn_operand_part (DImode, operands[1], 0),
+ gcn_operand_part (DImode, operands[2], 0)));
+ rtx val = gcn_operand_part (DImode, operands[2], 1);
+ if (val != const0_rtx)
+ emit_insn (gen_subcsi3_scalar
+ (gcn_operand_part (DImode, operands[0], 1),
+ gcn_operand_part (DImode, operands[1], 1),
+ gcn_operand_part (DImode, operands[2], 1)));
+ else
+ emit_insn (gen_subcsi3_scalar_zero
+ (gcn_operand_part (DImode, operands[0], 1),
+ gcn_operand_part (DImode, operands[1], 1)));
+ DONE;
+ }
+ [(set_attr "length" "8")])
+
+(define_insn "subsi3_scalar_carry"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg")
+ (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB")
+ (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA")))
+ (set (reg:BI SCC_REG)
+ (gtu:BI (minus:SI (match_dup 1)
+ (match_dup 2))
+ (match_dup 1)))]
+ ""
+ "s_sub_u32\t%0, %1, %2"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "8")])
+
+(define_insn "subsi3_scalar_carry_cst"
+ [(set (match_operand:SI 0 "register_operand" "=Sg")
+ (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA")
+ (match_operand:SI 2 "const_int_operand" " n")))
+ (set (reg:BI SCC_REG)
+ (leu:BI (minus:SI (match_dup 1)
+ (match_dup 2))
+ (match_operand:SI 3 "const_int_operand" " n")))]
+ "INTVAL (operands[2]) == -INTVAL (operands[3])"
+ "s_sub_u32\t%0, %1, %2"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "4")])
+
+(define_insn "subcsi3_scalar"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg")
+ (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+ (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB"))
+ (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA")))
+ (set (reg:BI SCC_REG)
+ (ior:BI (gtu:BI (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+ (match_dup 1))
+ (match_dup 2))
+ (match_dup 1))
+ (gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+ (match_dup 1))
+ (match_dup 1))))]
+ ""
+ "s_subb_u32\t%0, %1, %2"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "8")])
+
+(define_insn "subcsi3_scalar_zero"
+ [(set (match_operand:SI 0 "register_operand" "=Sg")
+ (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+ (match_operand:SI 1 "gcn_alu_operand" "SgA")))
+ (set (reg:BI SCC_REG)
+ (gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) (match_dup 1))
+ (match_dup 1)))]
+ ""
+ "s_subb_u32\t%0, %1, 0"
+ [(set_attr "type" "sop2")
+ (set_attr "length" "4")])
+
+;; }}}
+;; {{{ ALU: mult
+
+; Vector multiply has vop3a encoding, but no corresponding vop2a, so no long
+; immediate.
+(define_insn "mulsi3"
+ [(set (match_operand:SI 0 "register_operand" "= Sg,Sg, Sg, v")
+ (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" " SgA, J, B,vASv")))]
+ ""
+ "@
+ s_mul_i32\t%0, %1, %2
+ s_mulk_i32\t%0, %2
+ s_mul_i32\t%0, %1, %2
+ v_mul_lo_i32\t%0, %1, %2"
+ [(set_attr "type" "sop2,sopk,sop2,vop3a")
+ (set_attr "length" "4,4,8,4")])
+
+(define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr sgnsuffix [(sign_extend "%i") (zero_extend "%u")])
+(define_code_attr su [(sign_extend "s") (zero_extend "u")])
+(define_code_attr u [(sign_extend "") (zero_extend "u")])
+(define_code_attr iu [(sign_extend "i") (zero_extend "u")])
+(define_code_attr e [(sign_extend "e") (zero_extend "")])
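+
+; The highpart patterns below return the upper 32 bits of the widened
+; 64-bit product; an equivalent C formulation (illustration only) is:
+;
+;   #include <stdint.h>
+;
+;   static uint32_t
+;   umulhi32 (uint32_t a, uint32_t b)
+;   {
+;     return (uint32_t) (((uint64_t) a * b) >> 32);  /* umulsi3_highpart */
+;   }
+;
+;   static int32_t
+;   smulhi32 (int32_t a, int32_t b)
+;   {
+;     return (int32_t) (((int64_t) a * b) >> 32);    /* smulsi3_highpart */
+;   }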
+
+(define_insn "<su>mulsi3_highpart"
+ [(set (match_operand:SI 0 "register_operand" "= v")
+ (truncate:SI
+ (lshiftrt:DI
+ (mult:DI
+ (any_extend:DI
+ (match_operand:SI 1 "register_operand" "% v"))
+ (any_extend:DI
+ (match_operand:SI 2 "register_operand" "vSv")))
+ (const_int 32))))]
+ ""
+ "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "<u>mulhisi3"
+ [(set (match_operand:SI 0 "register_operand" "=v")
+ (mult:SI
+ (any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
+ (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))]
+ ""
+ "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 src1_sel:WORD_0"
+ [(set_attr "type" "vop_sdwa")
+ (set_attr "length" "8")])
+
+(define_insn "<u>mulqihi3_scalar"
+ [(set (match_operand:HI 0 "register_operand" "=v")
+ (mult:HI
+ (any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
+ (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))]
+ ""
+ "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 src1_sel:BYTE_0"
+ [(set_attr "type" "vop_sdwa")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit unop
+
+(define_code_iterator bitunop [not popcount])
+(define_code_attr popcount_extra_op [(not "") (popcount ", 0")])
+
+(define_insn "<expander>si2"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, v")
+ (bitunop:SI
+ (match_operand:SI 1 "gcn_alu_operand" "SgB,vSvB")))
+ (clobber (match_scratch:BI 2 "=cs, X"))]
+ ""
+ "@
+ s_<s_mnemonic>0\t%0, %1
+ v_<mnemonic>0\t%0, %1<popcount_extra_op>"
+ [(set_attr "type" "sop1,vop1")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit binop
+
+; Plus and mult are not included here: they have variants with a 16-bit
+; immediate, and so are defined separately above.
+(define_code_iterator binop [and ior xor smin smax umin umax
+ ashift lshiftrt ashiftrt])
+(define_code_iterator vec_and_scalar_com [and ior xor smin smax umin umax])
+(define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt])
+
+(define_insn "<expander>si3"
+ [(set (match_operand:SI 0 "gcn_valu_dst_operand" "= Sg, v,RD")
+ (vec_and_scalar_com:SI
+ (match_operand:SI 1 "gcn_valu_src0_operand" "%SgA,vSvB, 0")
+ (match_operand:SI 2 "gcn_alu_operand" " SgB, v, v")))
+ (clobber (match_scratch:BI 3 "= cs, X, X"))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ v_<mnemonic>0\t%0, %1, %2
+ ds_<mnemonic>0\t%A0, %2%O0"
+ [(set_attr "type" "sop2,vop2,ds")
+ (set_attr "length" "8")])
+
+(define_insn "<expander>si3"
+ [(set (match_operand:SI 0 "register_operand" "=Sg, Sg, v")
+ (vec_and_scalar_nocom:SI
+ (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB,vSvB")))
+ (clobber (match_scratch:BI 3 "=cs, cs, X"))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ s_<mnemonic>0\t%0, %1, %2
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "sop2,sop2,vop2")
+ (set_attr "length" "8")])
+
+(define_expand "<expander>si3_scc"
+ [(parallel [(set (match_operand:SI 0 "gcn_valu_dst_operand")
+ (binop:SI
+ (match_operand:SI 1 "gcn_valu_src0_operand")
+ (match_operand:SI 2 "gcn_alu_operand")))
+ (clobber (reg:BI SCC_REG))])]
+ ""
+ {})
+
+;; }}}
+;; {{{ ALU: generic 64-bit
+
+(define_code_iterator vec_and_scalar64_com [and ior xor])
+
+(define_insn_and_split "<expander>di3"
+ [(set (match_operand:DI 0 "register_operand" "= Sg, &v, &v")
+ (vec_and_scalar64_com:DI
+ (match_operand:DI 1 "gcn_alu_operand" "%SgA,vSvDB,vSvDB")
+ (match_operand:DI 2 "gcn_alu_operand" " SgC, v, 0")))
+ (clobber (match_scratch:BI 3 "= cs, X, X"))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ #
+ #"
+ "reload_completed && gcn_vgpr_register_operand (operands[0], DImode)"
+ [(parallel [(set (match_dup 4)
+ (vec_and_scalar64_com:SI (match_dup 5) (match_dup 6)))
+ (clobber (match_dup 3))])
+ (parallel [(set (match_dup 7)
+ (vec_and_scalar64_com:SI (match_dup 8) (match_dup 9)))
+ (clobber (match_dup 3))])]
+ {
+ operands[4] = gcn_operand_part (DImode, operands[0], 0);
+ operands[5] = gcn_operand_part (DImode, operands[1], 0);
+ operands[6] = gcn_operand_part (DImode, operands[2], 0);
+ operands[7] = gcn_operand_part (DImode, operands[0], 1);
+ operands[8] = gcn_operand_part (DImode, operands[1], 1);
+ operands[9] = gcn_operand_part (DImode, operands[2], 1);
+ }
+ [(set_attr "type" "sop2,vop2,vop2")
+ (set_attr "length" "8")])
+
+(define_insn "<expander>di3"
+ [(set (match_operand:DI 0 "register_operand" "=Sg, Sg, v")
+ (vec_and_scalar_nocom:DI
+ (match_operand:DI 1 "gcn_alu_operand" "SgC,SgA, v")
+ (match_operand:SI 2 "gcn_alu_operand" "SgA,SgC,vSvC")))
+ (clobber (match_scratch:BI 3 "=cs, cs, X"))]
+ ""
+ "@
+ s_<mnemonic>0\t%0, %1, %2
+ s_<mnemonic>0\t%0, %1, %2
+ v_<revmnemonic>0\t%0, %2, %1"
+ [(set_attr "type" "sop2,sop2,vop2")
+ (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Atomics
+
+; Each compute unit has its own L1 cache. The L2 cache is shared between
+; all the compute units. Any load or store instruction can skip L1 and
+; access L2 directly using the "glc" flag. Atomic instructions also skip
+; L1. The L1 cache can be flushed and invalidated with dedicated
+; instructions.
+;
+; Therefore, in order for "acquire" and "release" atomic modes to work
+; correctly across compute units, we must flush before each "release"
+; and invalidate the cache after each "acquire". It might seem like
+; invalidation could be safely done before an "acquire", but since each
+; compute unit can run up to 40 threads simultaneously, all reading values
+; into the L1 cache, this is not actually safe.
+;
+; Additionally, scalar flat instructions access L2 via a different cache
+; (the "constant cache"), so they have separate constrol instructions. We
+; do not attempt to invalidate both caches at once; instead, atomics
+; operating on scalar flat pointers will flush the constant cache, and
+; atomics operating on flat or global pointers will flush L1. It is up to
+; the programmer to get this right.
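+
+; For reference, a C fragment of the sort these patterns back (illustration
+; only; the memory-model operand of each pattern below selects the required
+; flush/invalidate sequence):
+;
+;   #include <stdatomic.h>
+;
+;   extern atomic_int flag;
+;   extern int data;
+;
+;   void producer (int v)
+;   {
+;     data = v;
+;     /* Release store: the cache is flushed before the store.  */
+;     atomic_store_explicit (&flag, 1, memory_order_release);
+;   }
+;
+;   int consumer (void)
+;   {
+;     /* Acquire load: the cache is invalidated after the load.  */
+;     while (!atomic_load_explicit (&flag, memory_order_acquire))
+;       ;
+;     return data;
+;   }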
+
+(define_code_iterator atomicops [plus minus and ior xor])
+(define_mode_attr X [(SI "") (DI "_X2")])
+
+;; TODO: compare_and_swap, test_and_set, inc, dec
+;; Hardware also supports min and max, but GCC does not.
+
+(define_expand "memory_barrier"
+ [(set (match_dup 0)
+ (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+ ""
+ {
+ operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[0]) = 1;
+ })
+
+(define_insn "*memory_barrier"
+ [(set (match_operand:BLK 0)
+ (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+ ""
+ "buffer_wbinvl1_vol"
+ [(set_attr "type" "mubuf")
+ (set_attr "length" "4")])
+
+; FIXME: These patterns have been disabled as they do not seem to work
+; reliably - they can cause hangs or incorrect results.
+; TODO: flush caches according to memory model
+(define_insn "atomic_fetch_<bare_mnemonic><mode>"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(atomicops:SIDI
+ (match_dup 1)
+ (match_operand:SIDI 2 "register_operand" " Sm, v, v"))]
+ UNSPECV_ATOMIC))
+ (use (match_operand 3 "const_int_operand"))]
+ "0 /* Disabled. */"
+ "@
+ s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\t0
+ global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; FIXME: These patterns are disabled because the instructions don't
+; seem to work as advertised. Specifically, OMP "team distribute"
+; reductions apparently "lose" some of the writes, similar to what
+; you might expect from a concurrent non-atomic read-modify-write.
+; TODO: flush caches according to memory model
+(define_insn "atomic_<bare_mnemonic><mode>"
+ [(set (match_operand:SIDI 0 "memory_operand" "+RS,RF,RM")
+ (unspec_volatile:SIDI
+ [(atomicops:SIDI
+ (match_dup 0)
+ (match_operand:SIDI 1 "register_operand" " Sm, v, v"))]
+ UNSPECV_ATOMIC))
+ (use (match_operand 2 "const_int_operand"))]
+ "0 /* Disabled. */"
+ "@
+ s_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
+ global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_mode_attr x2 [(SI "DI") (DI "TI")])
+(define_mode_attr size [(SI "4") (DI "8")])
+(define_mode_attr bitsize [(SI "32") (DI "64")])
+
+(define_expand "sync_compare_and_swap<mode>"
+ [(match_operand:SIDI 0 "register_operand")
+ (match_operand:SIDI 1 "memory_operand")
+ (match_operand:SIDI 2 "register_operand")
+ (match_operand:SIDI 3 "register_operand")]
+ ""
+ {
+ if (MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_LDS)
+ {
+ emit_insn (gen_sync_compare_and_swap<mode>_lds_insn (operands[0],
+ operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+ }
+
+ /* Operands 2 and 3 must be placed in consecutive registers, and passed
+ as a combined value. */
+ rtx src_cmp = gen_reg_rtx (<x2>mode);
+ emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, 0), operands[3]);
+ emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, <size>), operands[2]);
+ emit_insn (gen_sync_compare_and_swap<mode>_insn (operands[0],
+ operands[1],
+ src_cmp));
+ DONE;
+ })
+
+(define_insn "sync_compare_and_swap<mode>_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:<x2> 2 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))]
+ ""
+ "@
+ s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+ flat_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\t0
+ global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "sync_compare_and_swap<mode>_lds_insn"
+ [(set (match_operand:SIDI 0 "register_operand" "= v")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "memory_operand" "+RL")]
+ UNSPECV_ATOMIC))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 2 "register_operand" " v")
+ (match_operand:SIDI 3 "register_operand" " v")]
+ UNSPECV_ATOMIC))]
+ ""
+ "ds_cmpst_rtn_b<bitsize> %0, %1, %2, %3\;s_waitcnt\tlgkmcnt(0)"
+ [(set_attr "type" "ds")
+ (set_attr "length" "12")])
+
+(define_insn "atomic_load<mode>"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "memory_operand" " RS,RF,RM")]
+ UNSPECV_ATOMIC))
+ (use (match_operand:SIDI 2 "immediate_operand" " i, i, i"))]
+ ""
+ {
+ switch (INTVAL (operands[2]))
+ {
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ case MEMMODEL_CONSUME:
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
+ "s_dcache_wb_vol";
+ case 1:
+ return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
+ "buffer_wbinvl1_vol";
+ case 2:
+ return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
+ "buffer_wbinvl1_vol";
+ }
+ break;
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
+ "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
+ "s_waitcnt\t0\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "atomic_store<mode>"
+ [(set (match_operand:SIDI 0 "memory_operand" "=RS,RF,RM")
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 1 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))
+ (use (match_operand:SIDI 2 "immediate_operand" " i, i, i"))]
+ ""
+ {
+ switch (INTVAL (operands[2]))
+ {
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_store%o1\t%1, %A0 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_store%o1\t%A0, %1%O0 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_store%o1\t%A0, %1%O0 glc\;s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+ "s_waitcnt\texpcnt(0)";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\texpcnt(0)";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\texpcnt(0)";
+ }
+ break;
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+ "s_waitcnt\texpcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+ "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "atomic_exchange<mode>"
+ [(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
+ (match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
+ (set (match_dup 1)
+ (unspec_volatile:SIDI
+ [(match_operand:SIDI 2 "register_operand" " Sm, v, v")]
+ UNSPECV_ATOMIC))
+ (use (match_operand 3 "immediate_operand"))]
+ ""
+ {
+ switch (INTVAL (operands[3]))
+ {
+ case MEMMODEL_RELAXED:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0";
+ case 2:
+ return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ case MEMMODEL_CONSUME:
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_SYNC_ACQUIRE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)\;"
+ "s_dcache_wb_vol\;s_dcache_inv_vol";
+ case 1:
+ return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
+ "buffer_wbinvl1_vol";
+ case 2:
+ return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_SYNC_RELEASE:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\tlgkmcnt(0)";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\t0";
+ case 2:
+ return "buffer_wbinvl1_vol\;"
+ "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)";
+ }
+ break;
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ case MEMMODEL_SYNC_SEQ_CST:
+ switch (which_alternative)
+ {
+ case 0:
+ return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+ case 1:
+ return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+ "s_waitcnt\t0\;buffer_wbinvl1_vol";
+ case 2:
+ return "buffer_wbinvl1_vol\;"
+ "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+ "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+ }
+ break;
+ }
+ gcc_unreachable ();
+ }
+ [(set_attr "type" "smem,flat,flat")
+ (set_attr "length" "20")
+ (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+;; }}}
+;; {{{ OpenACC / OpenMP
+
+(define_expand "oacc_dim_size"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")]
+ ""
+ {
+ rtx tmp = gcn_oacc_dim_size (INTVAL (operands[1]));
+ emit_move_insn (operands[0], gen_lowpart (SImode, tmp));
+ DONE;
+ })
+
+(define_expand "oacc_dim_pos"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")]
+ ""
+ {
+ emit_move_insn (operands[0], gcn_oacc_dim_pos (INTVAL (operands[1])));
+ DONE;
+ })
+
+(define_expand "gcn_wavefront_barrier"
+ [(set (match_dup 0)
+ (unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+ ""
+ {
+ operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+ MEM_VOLATILE_P (operands[0]) = 1;
+ })
+
+(define_insn "*gcn_wavefront_barrier"
+ [(set (match_operand:BLK 0 "")
+ (unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+ ""
+ "s_barrier"
+ [(set_attr "type" "sopp")])
+
+(define_expand "oacc_fork"
+ [(set (match_operand:SI 0 "")
+ (match_operand:SI 1 ""))
+ (use (match_operand:SI 2 ""))]
+ ""
+ {
+ /* We need to have oacc_fork/oacc_join named patterns as a pair,
+ but the fork isn't actually used. */
+ gcc_unreachable ();
+ })
+
+(define_expand "oacc_join"
+ [(set (match_operand:SI 0 "")
+ (match_operand:SI 1 ""))
+ (use (match_operand:SI 2 ""))]
+ ""
+ {
+ emit_insn (gen_gcn_wavefront_barrier ());
+ DONE;
+ })
+
+;; }}}
+
+(include "gcn-valu.md")