GCN machine description
author    Andrew Stubbs <ams@codesourcery.com>
Thu, 17 Jan 2019 12:31:28 +0000 (12:31 +0000)
committer Andrew Stubbs <ams@gcc.gnu.org>
Thu, 17 Jan 2019 12:31:28 +0000 (12:31 +0000)
This patch contains the machine description portion of the GCN back-end.  I've
broken it out mainly to avoid the mailing list size limit.

2019-01-17  Andrew Stubbs  <ams@codesourcery.com>
    Kwok Cheung Yeung  <kcy@codesourcery.com>
    Julian Brown  <julian@codesourcery.com>
    Tom de Vries  <tom@codesourcery.com>
    Jan Hubicka  <hubicka@ucw.cz>
    Martin Jambor  <mjambor@suse.cz>

gcc/
* config/gcn/constraints.md: New file.
* config/gcn/gcn-valu.md: New file.
* config/gcn/gcn.md: New file.
* config/gcn/predicates.md: New file.

Co-Authored-By: Jan Hubicka <hubicka@ucw.cz>
Co-Authored-By: Julian Brown <julian@codesourcery.com>
Co-Authored-By: Kwok Cheung Yeung <kcy@codesourcery.com>
Co-Authored-By: Martin Jambor <mjambor@suse.cz>
Co-Authored-By: Tom de Vries <tom@codesourcery.com>
From-SVN: r268022

gcc/ChangeLog
gcc/config/gcn/constraints.md [new file with mode: 0644]
gcc/config/gcn/gcn-valu.md [new file with mode: 0644]
gcc/config/gcn/gcn.md [new file with mode: 0644]
gcc/config/gcn/predicates.md [new file with mode: 0644]

index d5dde3f90d1d1694439fdd5484abc77f84e954f4..12489cdf8389dc4c3febf42936e42e5723bd7176 100644
@@ -1,3 +1,15 @@
+2019-01-17  Andrew Stubbs  <ams@codesourcery.com>
+           Kwok Cheung Yeung  <kcy@codesourcery.com>
+           Julian Brown  <julian@codesourcery.com>
+           Tom de Vries  <tom@codesourcery.com>
+           Jan Hubicka  <hubicka@ucw.cz>
+           Martin Jambor  <mjambor@suse.cz>
+
+       * config/gcn/constraints.md: New file.
+       * config/gcn/gcn-valu.md: New file.
+       * config/gcn/gcn.md: New file.
+       * config/gcn/predicates.md: New file.
+
 2019-01-17  Eric Botcazou  <ebotcazou@adacore.com>
 
        * gimple-ssa-isolate-paths.c (stmt_uses_name_in_undefined_way): Replace
diff --git a/gcc/config/gcn/constraints.md b/gcc/config/gcn/constraints.md
new file mode 100644
index 0000000..08ba76a
--- /dev/null
@@ -0,0 +1,139 @@
+;; Constraint definitions for GCN.
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_constraint "I"
+  "Inline integer constant"
+  (and (match_code "const_int")
+       (match_test "ival >= -16 && ival <= 64")))
+
+(define_constraint "J"
+  "Signed integer 16-bit inline constant"
+  (and (match_code "const_int")
+       (match_test "((unsigned HOST_WIDE_INT) ival + 0x8000) < 0x10000")))
+
+(define_constraint "Kf"
+  "Immediate constant -1"
+  (and (match_code "const_int")
+       (match_test "ival == -1")))
+
+(define_constraint "L"
+  "Unsigned integer 15-bit constant"
+  (and (match_code "const_int")
+       (match_test "((unsigned HOST_WIDE_INT) ival) < 0x8000")))
+
+(define_constraint "A"
+  "Inline immediate parameter"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_inline_constant_p (op)")))
+
+(define_constraint "B"
+  "Immediate 32-bit parameter"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_constant_p (op)")))
+
+(define_constraint "C"
+  "Immediate 32-bit parameter zero-extended to 64-bits"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_constant64_p (op)")))
+
+(define_constraint "DA"
+  "Splittable inline immediate 64-bit parameter"
+  (and (match_code "const_int,const_double,const_vector")
+       (match_test "gcn_inline_constant64_p (op)")))
+
+(define_constraint "DB"
+  "Splittable immediate 64-bit parameter"
+  (match_code "const_int,const_double,const_vector"))
+
+(define_constraint "U"
+  "unspecified value"
+  (match_code "unspec"))
+
+(define_constraint "Y"
+  "Symbol or label for relative calls"
+  (match_code "symbol_ref,label_ref"))
+
+(define_register_constraint "v" "VGPR_REGS"
+  "VGPR registers")
+
+(define_register_constraint "Sg" "SGPR_REGS"
+  "SGPR registers")
+
+(define_register_constraint "SD" "SGPR_DST_REGS"
+  "registers usable as a destination of a scalar operation")
+
+(define_register_constraint "SS" "SGPR_SRC_REGS"
+  "registers usable as a source of a scalar operation")
+
+(define_register_constraint "Sm" "SGPR_MEM_SRC_REGS"
+  "registers usable as a source of a scalar memory operation")
+
+(define_register_constraint "Sv" "SGPR_VOP_SRC_REGS"
+  "registers usable as a source of a VOP3A instruction")
+
+(define_register_constraint "ca" "ALL_CONDITIONAL_REGS"
+  "SCC, VCCZ or EXECZ")
+
+(define_register_constraint "cs" "SCC_CONDITIONAL_REG"
+  "SCC")
+
+(define_register_constraint "cV" "VCC_CONDITIONAL_REG"
+  "VCC")
+
+(define_register_constraint "e" "EXEC_MASK_REG"
+  "EXEC")
+
+(define_special_memory_constraint "RB"
+  "Buffer memory address to scratch memory."
+  (and (match_code "mem")
+       (match_test "AS_SCRATCH_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RF"
+  "Buffer memory address to flat memory."
+  (and (match_code "mem")
+       (match_test "AS_FLAT_P (MEM_ADDR_SPACE (op))
+                   && gcn_flat_address_p (XEXP (op, 0), mode)")))
+
+(define_special_memory_constraint "RS"
+  "Buffer memory address to scalar flat memory."
+  (and (match_code "mem")
+       (match_test "AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op))
+                   && gcn_scalar_flat_mem_p (op)")))
+
+(define_special_memory_constraint "RL"
+  "Buffer memory address to LDS memory."
+  (and (match_code "mem")
+       (match_test "AS_LDS_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RG"
+  "Buffer memory address to GDS memory."
+  (and (match_code "mem")
+       (match_test "AS_GDS_P (MEM_ADDR_SPACE (op))")))
+
+(define_special_memory_constraint "RD"
+  "Buffer memory address to GDS or LDS memory."
+  (and (match_code "mem")
+       (ior (match_test "AS_GDS_P (MEM_ADDR_SPACE (op))")
+           (match_test "AS_LDS_P (MEM_ADDR_SPACE (op))"))))
+
+(define_special_memory_constraint "RM"
+  "Memory address to global (main) memory."
+  (and (match_code "mem")
+       (match_test "AS_GLOBAL_P (MEM_ADDR_SPACE (op))
+                   && gcn_global_address_p (XEXP (op, 0))")))
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
new file mode 100644
index 0000000..3cc59dd
--- /dev/null
@@ -0,0 +1,3049 @@
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; {{{ Vector iterators
+
+; Vector modes for one vector register
+(define_mode_iterator VEC_1REG_MODE
+                     [V64QI V64HI V64SI V64HF V64SF])
+(define_mode_iterator VEC_1REG_ALT
+                     [V64QI V64HI V64SI V64HF V64SF])
+
+(define_mode_iterator VEC_1REG_INT_MODE
+                     [V64QI V64HI V64SI])
+(define_mode_iterator VEC_1REG_INT_ALT
+                     [V64QI V64HI V64SI])
+
+; Vector modes for two vector registers
+(define_mode_iterator VEC_2REG_MODE
+                     [V64DI V64DF])
+
+; All of the above
+(define_mode_iterator VEC_REG_MODE
+                     [V64QI V64HI V64SI V64HF V64SF    ; Single reg
+                      V64DI V64DF])                    ; Double reg
+
+(define_mode_attr scalar_mode
+  [(V64QI "qi") (V64HI "hi") (V64SI "si")
+   (V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
+
+(define_mode_attr SCALAR_MODE
+  [(V64QI "QI") (V64HI "HI") (V64SI "SI")
+   (V64HF "HF") (V64SF "SF") (V64DI "DI") (V64DF "DF")])
+
+;; }}}
+;; {{{ Substitutions
+
+(define_subst_attr "exec" "vec_merge"
+                  "" "_exec")
+(define_subst_attr "exec_clobber" "vec_merge_with_clobber"
+                  "" "_exec")
+(define_subst_attr "exec_vcc" "vec_merge_with_vcc"
+                  "" "_exec")
+(define_subst_attr "exec_scatter" "scatter_store"
+                  "" "_exec")
+
+(define_subst "vec_merge"
+  [(set (match_operand:VEC_REG_MODE 0)
+       (match_operand:VEC_REG_MODE 1))]
+  ""
+  [(set (match_dup 0)
+       (vec_merge:VEC_REG_MODE
+         (match_dup 1)
+         (match_operand:VEC_REG_MODE 3 "gcn_register_or_unspec_operand" "U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand" "e")))])
+
+(define_subst "vec_merge_with_clobber"
+  [(set (match_operand:VEC_REG_MODE 0)
+       (match_operand:VEC_REG_MODE 1))
+   (clobber (match_operand 2))]
+  ""
+  [(set (match_dup 0)
+       (vec_merge:VEC_REG_MODE
+         (match_dup 1)
+         (match_operand:VEC_REG_MODE 3 "gcn_register_or_unspec_operand" "U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand" "e")))
+   (clobber (match_dup 2))])
+
+(define_subst "vec_merge_with_vcc"
+  [(set (match_operand:VEC_REG_MODE 0)
+       (match_operand:VEC_REG_MODE 1))
+   (set (match_operand:DI 2)
+       (match_operand:DI 3))]
+  ""
+  [(parallel
+     [(set (match_dup 0)
+          (vec_merge:VEC_REG_MODE
+            (match_dup 1)
+            (match_operand:VEC_REG_MODE 4
+                                        "gcn_register_or_unspec_operand" "U0")
+            (match_operand:DI 5 "gcn_exec_reg_operand" "e")))
+      (set (match_dup 2)
+          (and:DI (match_dup 3)
+                  (reg:DI EXEC_REG)))])])
+
+(define_subst "scatter_store"
+  [(set (mem:BLK (scratch))
+       (unspec:BLK
+         [(match_operand 0)
+          (match_operand 1)
+          (match_operand 2)
+          (match_operand 3)]
+         UNSPEC_SCATTER))]
+  ""
+  [(set (mem:BLK (scratch))
+       (unspec:BLK
+         [(match_dup 0)
+          (match_dup 1)
+          (match_dup 2)
+          (match_dup 3)
+          (match_operand:DI 4 "gcn_exec_reg_operand" "e")]
+         UNSPEC_SCATTER))])
+
+;; }}}
+;; {{{ Vector moves
+
+; This is the entry point for all vector register moves.  Memory accesses can
+; come this way also, but will more usually use the reload_in/out,
+; gather/scatter, maskload/store, etc.
+
+(define_expand "mov<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+       (match_operand:VEC_REG_MODE 1 "general_operand"))]
+  ""
+  {
+    if (MEM_P (operands[0]) && !lra_in_progress && !reload_completed)
+      {
+       operands[1] = force_reg (<MODE>mode, operands[1]);
+       rtx scratch = gen_rtx_SCRATCH (V64DImode);
+       rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+       rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+       rtx expr = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+                                                       operands[0],
+                                                       scratch);
+       emit_insn (gen_scatter<mode>_expr (expr, operands[1], a, v));
+       DONE;
+      }
+    else if (MEM_P (operands[1]) && !lra_in_progress && !reload_completed)
+      {
+       rtx scratch = gen_rtx_SCRATCH (V64DImode);
+       rtx a = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+       rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+       rtx expr = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+                                                       operands[1],
+                                                       scratch);
+       emit_insn (gen_gather<mode>_expr (operands[0], expr, a, v));
+       DONE;
+      }
+    else if ((MEM_P (operands[0]) || MEM_P (operands[1])))
+      {
+        gcc_assert (!reload_completed);
+       rtx scratch = gen_reg_rtx (V64DImode);
+       emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], scratch));
+       DONE;
+      }
+  })
+
+; A pseudo instruction that helps LRA use the "U0" constraint.
+
+(define_insn "mov<mode>_unspec"
+  [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand" "=v")
+       (match_operand:VEC_REG_MODE 1 "gcn_unspec_operand"   " U"))]
+  ""
+  ""
+  [(set_attr "type" "unknown")
+   (set_attr "length" "0")])
+
+(define_insn "*mov<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v")
+       (match_operand:VEC_1REG_MODE 1 "general_operand"      "vA,B"))]
+  ""
+  "v_mov_b32\t%0, %1"
+  [(set_attr "type" "vop1,vop1")
+   (set_attr "length" "4,8")])
+
+(define_insn "mov<mode>_exec"
+  [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand"
+                                                        "=v, v, v, v, v, m")
+       (vec_merge:VEC_1REG_MODE
+         (match_operand:VEC_1REG_MODE 1 "general_operand"
+                                                        "vA, B, v,vA, m, v")
+         (match_operand:VEC_1REG_MODE 3 "gcn_alu_or_unspec_operand"
+                                                        "U0,U0,vA,vA,U0,U0")
+         (match_operand:DI 2 "register_operand"         " e, e,cV,Sv, e, e")))
+   (clobber (match_scratch:V64DI 4                      "=X, X, X, X,&v,&v"))]
+  "!MEM_P (operands[0]) || REG_P (operands[1])"
+  "@
+   v_mov_b32\t%0, %1
+   v_mov_b32\t%0, %1
+   v_cndmask_b32\t%0, %3, %1, vcc
+   v_cndmask_b32\t%0, %3, %1, %2
+   #
+   #"
+  [(set_attr "type" "vop1,vop1,vop2,vop3a,*,*")
+   (set_attr "length" "4,8,4,8,16,16")])
+
+; This variant does not accept an unspec, but does permit MEM
+; read/modify/write which is necessary for maskstore.
+
+;(define_insn "*mov<mode>_exec_match"
+;  [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "=v,v, v, m")
+;      (vec_merge:VEC_1REG_MODE
+;        (match_operand:VEC_1REG_MODE 1 "general_operand"    "vA,B, m, v")
+;        (match_dup 0)
+;        (match_operand:DI 2 "gcn_exec_reg_operand"          " e,e, e, e")))
+;   (clobber (match_scratch:V64DI 3                          "=X,X,&v,&v"))]
+;  "!MEM_P (operands[0]) || REG_P (operands[1])"
+;  "@
+;  v_mov_b32\t%0, %1
+;  v_mov_b32\t%0, %1
+;  #
+;  #"
+;  [(set_attr "type" "vop1,vop1,*,*")
+;   (set_attr "length" "4,8,16,16")])
+
+(define_insn "*mov<mode>"
+  [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand"  "=v")
+       (match_operand:VEC_2REG_MODE 1 "general_operand"      "vDB"))]
+  ""
+  {
+    if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
+      return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
+    else
+      return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")])
+
+(define_insn "mov<mode>_exec"
+  [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand"
+                                                      "= v,   v,   v, v, m")
+       (vec_merge:VEC_2REG_MODE
+         (match_operand:VEC_2REG_MODE 1 "general_operand"
+                                                      "vDB,  v0,  v0, m, v")
+         (match_operand:VEC_2REG_MODE 3 "gcn_alu_or_unspec_operand"
+                                                      " U0,vDA0,vDA0,U0,U0")
+         (match_operand:DI 2 "register_operand"       "  e,  cV,  Sv, e, e")))
+   (clobber (match_scratch:V64DI 4                    "= X,   X,   X,&v,&v"))]
+  "!MEM_P (operands[0]) || REG_P (operands[1])"
+  {
+    if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
+      switch (which_alternative)
+       {
+       case 0:
+         return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
+       case 1:
+         return "v_cndmask_b32\t%L0, %L3, %L1, vcc\;"
+                "v_cndmask_b32\t%H0, %H3, %H1, vcc";
+       case 2:
+         return "v_cndmask_b32\t%L0, %L3, %L1, %2\;"
+                "v_cndmask_b32\t%H0, %H3, %H1, %2";
+       }
+    else
+      switch (which_alternative)
+       {
+       case 0:
+         return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
+       case 1:
+         return "v_cndmask_b32\t%H0, %H3, %H1, vcc\;"
+                "v_cndmask_b32\t%L0, %L3, %L1, vcc";
+       case 2:
+         return "v_cndmask_b32\t%H0, %H3, %H1, %2\;"
+                "v_cndmask_b32\t%L0, %L3, %L1, %2";
+       }
+
+    return "#";
+  }
+  [(set_attr "type" "vmult,vmult,vmult,*,*")
+   (set_attr "length" "16,16,16,16,16")])
+
+; This variant does not accept an unspec, but does permit MEM
+; read/modify/write which is necessary for maskstore.
+
+;(define_insn "*mov<mode>_exec_match"
+;  [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "=v, v, m")
+;      (vec_merge:VEC_2REG_MODE
+;        (match_operand:VEC_2REG_MODE 1 "general_operand"   "vDB, m, v")
+;        (match_dup 0)
+;        (match_operand:DI 2 "gcn_exec_reg_operand"          " e, e, e")))
+;   (clobber (match_scratch:V64DI 3                          "=X,&v,&v"))]
+;  "!MEM_P (operands[0]) || REG_P (operands[1])"
+;  "@
+;   * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+;       return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+;     else \
+;       return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+;   #
+;   #"
+;  [(set_attr "type" "vmult,*,*")
+;   (set_attr "length" "16,16,16")])
+
+; An SGPR-base load looks like:
+;   <load> v, Sv
+;
+; There's no hardware instruction that corresponds to this, but vector base
+; addresses are placed in an SGPR because it is easier to add to a vector.
+; We also have a temporary vT, and the vector v1 holding numbered lanes.
+;
+; Rewrite as:
+;   vT = v1 << log2(element-size)
+;   vT += Sv
+;   flat_load v, vT
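+;
+; For example, with V64SImode (4-byte elements) the sequence above would
+; amount to, roughly:
+;   vT = v1 << 2         ; log2(4)
+;   vT += Sv
+;   flat_load v, vT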
+
+(define_insn "mov<mode>_sgprbase"
+  [(set (match_operand:VEC_1REG_MODE 0 "nonimmediate_operand" "= v, v, v, m")
+       (unspec:VEC_1REG_MODE
+         [(match_operand:VEC_1REG_MODE 1 "general_operand"   " vA,vB, m, v")]
+         UNSPEC_SGPRBASE))
+   (clobber (match_operand:V64DI 2 "register_operand"        "=&v,&v,&v,&v"))]
+  "lra_in_progress || reload_completed"
+  "@
+   v_mov_b32\t%0, %1
+   v_mov_b32\t%0, %1
+   #
+   #"
+  [(set_attr "type" "vop1,vop1,*,*")
+   (set_attr "length" "4,8,12,12")])
+
+(define_insn "mov<mode>_sgprbase"
+  [(set (match_operand:VEC_2REG_MODE 0 "nonimmediate_operand" "= v, v, m")
+       (unspec:VEC_2REG_MODE
+         [(match_operand:VEC_2REG_MODE 1 "general_operand"   "vDB, m, v")]
+         UNSPEC_SGPRBASE))
+   (clobber (match_operand:V64DI 2 "register_operand"        "=&v,&v,&v"))]
+  "lra_in_progress || reload_completed"
+  "@
+   * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+       return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+     else \
+       return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+   #
+   #"
+  [(set_attr "type" "vmult,*,*")
+   (set_attr "length" "8,12,12")])
+
+; reload_in was once a standard name, but here it's only referenced by
+; gcn_secondary_reload.  It allows a reload with a scratch register.
+
+(define_expand "reload_in<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand" "= v")
+       (match_operand:VEC_REG_MODE 1 "memory_operand"   "  m"))
+   (clobber (match_operand:V64DI 2 "register_operand"    "=&v"))]
+  ""
+  {
+    emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], operands[2]));
+    DONE;
+  })
+
+; reload_out is similar to reload_in, above.
+
+(define_expand "reload_out<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "memory_operand"   "= m")
+       (match_operand:VEC_REG_MODE 1 "register_operand" "  v"))
+   (clobber (match_operand:V64DI 2 "register_operand"    "=&v"))]
+  ""
+  {
+    emit_insn (gen_mov<mode>_sgprbase (operands[0], operands[1], operands[2]));
+    DONE;
+  })
+
+; Expand scalar addresses into gather/scatter patterns
+
+(define_split
+  [(set (match_operand:VEC_REG_MODE 0 "memory_operand")
+       (unspec:VEC_REG_MODE
+         [(match_operand:VEC_REG_MODE 1 "general_operand")]
+         UNSPEC_SGPRBASE))
+   (clobber (match_scratch:V64DI 2))]
+  ""
+  [(set (mem:BLK (scratch))
+       (unspec:BLK [(match_dup 5) (match_dup 1) (match_dup 6) (match_dup 7)]
+                   UNSPEC_SCATTER))]
+  {
+    operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+                                                      operands[0],
+                                                      operands[2]);
+    operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+    operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+  })
+
+(define_split
+  [(set (match_operand:VEC_REG_MODE 0 "memory_operand")
+       (vec_merge:VEC_REG_MODE
+         (match_operand:VEC_REG_MODE 1 "general_operand")
+         (match_operand:VEC_REG_MODE 2 "")
+         (match_operand:DI 3 "gcn_exec_reg_operand")))
+   (clobber (match_scratch:V64DI 4))]
+  ""
+  [(set (mem:BLK (scratch))
+       (unspec:BLK [(match_dup 5) (match_dup 1)
+                    (match_dup 6) (match_dup 7) (match_dup 3)]
+                   UNSPEC_SCATTER))]
+  {
+    operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode,
+                                                      operands[3],
+                                                      operands[0],
+                                                      operands[4]);
+    operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+    operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+  })
+
+(define_split
+  [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+       (unspec:VEC_REG_MODE
+         [(match_operand:VEC_REG_MODE 1 "memory_operand")]
+         UNSPEC_SGPRBASE))
+   (clobber (match_scratch:V64DI 2))]
+  ""
+  [(set (match_dup 0)
+       (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7)
+                             (mem:BLK (scratch))]
+                            UNSPEC_GATHER))]
+  {
+    operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode, NULL,
+                                                      operands[1],
+                                                      operands[2]);
+    operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+    operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+  })
+
+(define_split
+  [(set (match_operand:VEC_REG_MODE 0 "nonimmediate_operand")
+       (vec_merge:VEC_REG_MODE
+         (match_operand:VEC_REG_MODE 1 "memory_operand")
+         (match_operand:VEC_REG_MODE 2 "")
+         (match_operand:DI 3 "gcn_exec_reg_operand")))
+   (clobber (match_scratch:V64DI 4))]
+  ""
+  [(set (match_dup 0)
+       (vec_merge:VEC_REG_MODE
+         (unspec:VEC_REG_MODE [(match_dup 5) (match_dup 6) (match_dup 7)
+                               (mem:BLK (scratch))]
+                              UNSPEC_GATHER)
+         (match_dup 2)
+         (match_dup 3)))]
+  {
+    operands[5] = gcn_expand_scalar_to_vector_address (<MODE>mode,
+                                                      operands[3],
+                                                      operands[1],
+                                                      operands[4]);
+    operands[6] = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+    operands[7] = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+  })
+
+; TODO: Add zero/sign extending variants.
+
+;; }}}
+;; {{{ Lane moves
+
+; v_writelane and v_readlane work regardless of exec flags.
+; We allow source to be scratch.
+;
+; FIXME these should take A immediates
+
+(define_insn "*vec_set<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"            "= v")
+       (vec_merge:VEC_1REG_MODE
+         (vec_duplicate:VEC_1REG_MODE
+           (match_operand:<SCALAR_MODE> 1 "register_operand"        " Sv"))
+         (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+                                                                    " U0")
+         (ashift (const_int 1)
+                 (match_operand:SI 2 "gcn_alu_operand"              "SvB"))))]
+  ""
+  "v_writelane_b32 %0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")
+   (set_attr "exec" "none")
+   (set_attr "laneselect" "yes")])
+
+; FIXME: 64bit operations really should be splitters, but I am not sure how
+; to represent vertical subregs.
+(define_insn "*vec_set<mode>"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"           "= v")
+       (vec_merge:VEC_2REG_MODE
+         (vec_duplicate:VEC_2REG_MODE
+           (match_operand:<SCALAR_MODE> 1 "register_operand"        " Sv"))
+         (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+                                                                    " U0")
+         (ashift (const_int 1)
+                 (match_operand:SI 2 "gcn_alu_operand"              "SvB"))))]
+  ""
+  "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2"
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")
+   (set_attr "exec" "none")
+   (set_attr "laneselect" "yes")])
+
+(define_expand "vec_set<mode>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand")
+       (vec_merge:VEC_REG_MODE
+         (vec_duplicate:VEC_REG_MODE
+           (match_operand:<SCALAR_MODE> 1 "register_operand"))
+         (match_dup 0)
+         (ashift (const_int 1) (match_operand:SI 2 "gcn_alu_operand"))))]
+  "")
+
+(define_insn "*vec_set<mode>_1"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"             "=v")
+       (vec_merge:VEC_1REG_MODE
+         (vec_duplicate:VEC_1REG_MODE
+           (match_operand:<SCALAR_MODE> 1 "register_operand"          "Sv"))
+         (match_operand:VEC_1REG_MODE 3 "gcn_register_or_unspec_operand"
+                                                                      "U0")
+         (match_operand:SI 2 "const_int_operand"                      " i")))]
+  "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)"
+  {
+    operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
+    return "v_writelane_b32 %0, %1, %2";
+  }
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")
+   (set_attr "exec" "none")
+   (set_attr "laneselect" "yes")])
+
+(define_insn "*vec_set<mode>_1"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"             "=v")
+       (vec_merge:VEC_2REG_MODE
+         (vec_duplicate:VEC_2REG_MODE
+           (match_operand:<SCALAR_MODE> 1 "register_operand"          "Sv"))
+         (match_operand:VEC_2REG_MODE 3 "gcn_register_or_unspec_operand"
+                                                                      "U0")
+         (match_operand:SI 2 "const_int_operand"                      " i")))]
+  "((unsigned) exact_log2 (INTVAL (operands[2])) < 64)"
+  {
+    operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
+    return "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2";
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")
+   (set_attr "exec" "none")
+   (set_attr "laneselect" "yes")])
+
+(define_insn "vec_duplicate<mode><exec>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"  "=v")
+       (vec_duplicate:VEC_1REG_MODE
+         (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvB")))]
+  ""
+  "v_mov_b32\t%0, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "vec_duplicate<mode><exec>"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"  "=  v")
+       (vec_duplicate:VEC_2REG_MODE
+         (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvDB")))]
+  ""
+  "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "16")])
+
+(define_insn "vec_extract<mode><scalar_mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand"   "=Sg")
+       (vec_select:<SCALAR_MODE>
+         (match_operand:VEC_1REG_MODE 1 "register_operand" "  v")
+         (parallel [(match_operand:SI 2 "gcn_alu_operand"  "SvB")])))]
+  ""
+  "v_readlane_b32 %0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")
+   (set_attr "exec" "none")
+   (set_attr "laneselect" "yes")])
+
+(define_insn "vec_extract<mode><scalar_mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand"   "=Sg")
+       (vec_select:<SCALAR_MODE>
+         (match_operand:VEC_2REG_MODE 1 "register_operand" "  v")
+         (parallel [(match_operand:SI 2 "gcn_alu_operand"  "SvB")])))]
+  ""
+  "v_readlane_b32 %L0, %L1, %2\;v_readlane_b32 %H0, %H1, %2"
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")
+   (set_attr "exec" "none")
+   (set_attr "laneselect" "yes")])
+
+(define_expand "vec_init<mode><scalar_mode>"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand 1)]
+  ""
+  {
+    gcn_expand_vector_init (operands[0], operands[1]);
+    DONE;
+  })
+
+;; }}}
+;; {{{ Scatter / Gather
+
+;; GCN does not have an instruction for loading a vector from contiguous
+;; memory so *all* loads and stores are eventually converted to scatter
+;; or gather.
+;;
+;; GCC does not permit MEM to hold vectors of addresses, so we must use an
+;; unspec.  The unspec formats are as follows:
+;;
+;;     (unspec:V64??
+;;      [(<address expression>)
+;;       (<addr_space_t>)
+;;       (<use_glc>)
+;;       (mem:BLK (scratch))]
+;;      UNSPEC_GATHER)
+;;
+;;     (unspec:BLK
+;;       [(<address expression>)
+;;        (<source register>)
+;;        (<addr_space_t>)
+;;        (<use_glc>)
+;;        (<exec>)]
+;;       UNSPEC_SCATTER)
+;;
+;; - Loads are expected to be wrapped in a vec_merge, so do not need <exec>.
+;; - The mem:BLK does not contain any real information, but indicates that an
+;;   unknown memory read is taking place.  Stores are expected to use a similar
+;;   mem:BLK outside the unspec.
+;; - The address space and glc (volatile) fields are there to replace the
+;;   fields normally found in a MEM.
+;; - Multiple forms of address expression are supported, below.
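+;;
+;; As a rough illustration (a sketch following the "1offset" pattern below;
+;; the register and address-space values are placeholders), a V64SI gather
+;; from a vector of addresses plus a 16-byte constant offset would be
+;; represented along these lines:
+;;
+;;     (unspec:V64SI
+;;      [(plus:V64DI (reg:V64DI <addrs>)
+;;                   (vec_duplicate:V64DI (const_int 16)))
+;;       (const_int <as>)     ; addr_space_t
+;;       (const_int 0)        ; glc (volatile) flag clear
+;;       (mem:BLK (scratch))]
+;;      UNSPEC_GATHER)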
+
+(define_expand "gather_load<mode>"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand 2 "register_operand")
+   (match_operand 3 "immediate_operand")
+   (match_operand:SI 4 "gcn_alu_operand")]
+  ""
+  {
+    rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1],
+                                         operands[2], operands[4],
+                                         INTVAL (operands[3]), NULL);
+
+    if (GET_MODE (addr) == V64DImode)
+      emit_insn (gen_gather<mode>_insn_1offset (operands[0], addr, const0_rtx,
+                                               const0_rtx, const0_rtx));
+    else
+      emit_insn (gen_gather<mode>_insn_2offsets (operands[0], operands[1],
+                                                addr, const0_rtx, const0_rtx,
+                                                const0_rtx));
+    DONE;
+  })
+
+(define_expand "gather<mode>_exec"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand:V64SI 2 "register_operand")
+   (match_operand 3 "immediate_operand")
+   (match_operand:SI 4 "gcn_alu_operand")
+   (match_operand:DI 5 "gcn_exec_reg_operand")]
+  ""
+  {
+    rtx undefmode = gcn_gen_undef (<MODE>mode);
+
+    rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1],
+                                         operands[2], operands[4],
+                                         INTVAL (operands[3]), operands[5]);
+
+    if (GET_MODE (addr) == V64DImode)
+      emit_insn (gen_gather<mode>_insn_1offset_exec (operands[0], addr,
+                                                    const0_rtx, const0_rtx,
+                                                    const0_rtx, undefmode,
+                                                    operands[5]));
+    else
+      emit_insn (gen_gather<mode>_insn_2offsets_exec (operands[0], operands[1],
+                                                     addr, const0_rtx,
+                                                     const0_rtx, const0_rtx,
+                                                     undefmode, operands[5]));
+    DONE;
+  })
+
+; Allow any address expression
+(define_expand "gather<mode>_expr<exec>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand")
+       (unspec:VEC_REG_MODE
+         [(match_operand 1 "")
+          (match_operand 2 "immediate_operand")
+          (match_operand 3 "immediate_operand")
+          (mem:BLK (scratch))]
+         UNSPEC_GATHER))]
+    ""
+    {})
+
+(define_insn "gather<mode>_insn_1offset<exec>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand"        "=v")
+       (unspec:VEC_REG_MODE
+         [(plus:V64DI (match_operand:V64DI 1 "register_operand" " v")
+                      (vec_duplicate:V64DI
+                        (match_operand 2 "immediate_operand"    " n")))
+          (match_operand 3 "immediate_operand"                  " n")
+          (match_operand 4 "immediate_operand"                  " n")
+          (mem:BLK (scratch))]
+         UNSPEC_GATHER))]
+  "(AS_FLAT_P (INTVAL (operands[3]))
+    && ((TARGET_GCN3 && INTVAL(operands[2]) == 0)
+       || ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x1000)))
+    || (AS_GLOBAL_P (INTVAL (operands[3]))
+       && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
+  {
+    addr_space_t as = INTVAL (operands[3]);
+    const char *glc = INTVAL (operands[4]) ? " glc" : "";
+
+    static char buf[200];
+    if (AS_FLAT_P (as))
+      {
+       if (TARGET_GCN5_PLUS)
+         sprintf (buf, "flat_load%%s0\t%%0, %%1 offset:%%2%s\;s_waitcnt\t0",
+                  glc);
+       else
+         sprintf (buf, "flat_load%%s0\t%%0, %%1%s\;s_waitcnt\t0", glc);
+      }
+    else if (AS_GLOBAL_P (as))
+      sprintf (buf, "global_load%%s0\t%%0, %%1, off offset:%%2%s\;"
+              "s_waitcnt\tvmcnt(0)", glc);
+    else
+      gcc_unreachable ();
+
+    return buf;
+  }
+  [(set_attr "type" "flat")
+   (set_attr "length" "12")])
+
+(define_insn "gather<mode>_insn_1offset_ds<exec>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand"        "=v")
+       (unspec:VEC_REG_MODE
+         [(plus:V64SI (match_operand:V64SI 1 "register_operand" " v")
+                      (vec_duplicate:V64SI
+                        (match_operand 2 "immediate_operand"    " n")))
+          (match_operand 3 "immediate_operand"                  " n")
+          (match_operand 4 "immediate_operand"                  " n")
+          (mem:BLK (scratch))]
+         UNSPEC_GATHER))]
+  "(AS_ANY_DS_P (INTVAL (operands[3]))
+    && ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x10000))"
+  {
+    addr_space_t as = INTVAL (operands[3]);
+    static char buf[200];
+    sprintf (buf, "ds_read%%b0\t%%0, %%1 offset:%%2%s\;s_waitcnt\tlgkmcnt(0)",
+            (AS_GDS_P (as) ? " gds" : ""));
+    return buf;
+  }
+  [(set_attr "type" "ds")
+   (set_attr "length" "12")])
+
+(define_insn "gather<mode>_insn_2offsets<exec>"
+  [(set (match_operand:VEC_REG_MODE 0 "register_operand"              "=v")
+       (unspec:VEC_REG_MODE
+         [(plus:V64DI
+            (plus:V64DI
+              (vec_duplicate:V64DI
+                (match_operand:DI 1 "register_operand"                "Sv"))
+              (sign_extend:V64DI
+                (match_operand:V64SI 2 "register_operand"             " v")))
+            (vec_duplicate:V64DI (match_operand 3 "immediate_operand" " n")))
+          (match_operand 4 "immediate_operand"                        " n")
+          (match_operand 5 "immediate_operand"                        " n")
+          (mem:BLK (scratch))]
+         UNSPEC_GATHER))]
+  "(AS_GLOBAL_P (INTVAL (operands[4]))
+    && (((unsigned HOST_WIDE_INT)INTVAL(operands[3]) + 0x1000) < 0x2000))"
+  {
+    addr_space_t as = INTVAL (operands[4]);
+    const char *glc = INTVAL (operands[5]) ? " glc" : "";
+
+    static char buf[200];
+    if (AS_GLOBAL_P (as))
+      {
+       /* Work around assembler bug in which a 64-bit register is expected,
+       but a 32-bit value would be correct.  */
+       int reg = REGNO (operands[2]) - FIRST_VGPR_REG;
+       sprintf (buf, "global_load%%s0\t%%0, v[%d:%d], %%1 offset:%%3%s\;"
+                     "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc);
+      }
+    else
+      gcc_unreachable ();
+      
+    return buf;
+  }
+  [(set_attr "type" "flat")
+   (set_attr "length" "12")])
+
+(define_expand "scatter_store<mode>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:VEC_REG_MODE 4 "register_operand")]
+  ""
+  {
+    rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
+                                         operands[1], operands[3],
+                                         INTVAL (operands[2]), NULL);
+
+    if (GET_MODE (addr) == V64DImode)
+      emit_insn (gen_scatter<mode>_insn_1offset (addr, const0_rtx, operands[4],
+                                                const0_rtx, const0_rtx));
+    else
+      emit_insn (gen_scatter<mode>_insn_2offsets (operands[0], addr,
+                                                 const0_rtx, operands[4],
+                                                 const0_rtx, const0_rtx));
+    DONE;
+  })
+
+(define_expand "scatter<mode>_exec"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:VEC_REG_MODE 4 "register_operand")
+   (match_operand:DI 5 "gcn_exec_reg_operand")]
+  ""
+  {
+    operands[5] = force_reg (DImode, operands[5]);
+
+    rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
+                                         operands[1], operands[3],
+                                         INTVAL (operands[2]), operands[5]);
+
+    if (GET_MODE (addr) == V64DImode)
+      emit_insn (gen_scatter<mode>_insn_1offset_exec (addr, const0_rtx,
+                                                     operands[4], const0_rtx,
+                                                     const0_rtx,
+                                                     operands[5]));
+    else
+      emit_insn (gen_scatter<mode>_insn_2offsets_exec (operands[0], addr,
+                                                      const0_rtx, operands[4],
+                                                      const0_rtx, const0_rtx,
+                                                      operands[5]));
+    DONE;
+  })
+
+; Allow any address expression
+(define_expand "scatter<mode>_expr<exec_scatter>"
+  [(set (mem:BLK (scratch))
+       (unspec:BLK
+         [(match_operand:V64DI 0 "")
+          (match_operand:VEC_REG_MODE 1 "register_operand")
+          (match_operand 2 "immediate_operand")
+          (match_operand 3 "immediate_operand")]
+         UNSPEC_SCATTER))]
+  ""
+  {})
+
+(define_insn "scatter<mode>_insn_1offset<exec_scatter>"
+  [(set (mem:BLK (scratch))
+       (unspec:BLK
+         [(plus:V64DI (match_operand:V64DI 0 "register_operand" "v")
+                      (vec_duplicate:V64DI
+                        (match_operand 1 "immediate_operand"    "n")))
+          (match_operand:VEC_REG_MODE 2 "register_operand"      "v")
+          (match_operand 3 "immediate_operand"                  "n")
+          (match_operand 4 "immediate_operand"                  "n")]
+         UNSPEC_SCATTER))]
+  "(AS_FLAT_P (INTVAL (operands[3]))
+    && (INTVAL(operands[1]) == 0
+       || (TARGET_GCN5_PLUS
+           && (unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x1000)))
+    || (AS_GLOBAL_P (INTVAL (operands[3]))
+       && (((unsigned HOST_WIDE_INT)INTVAL(operands[1]) + 0x1000) < 0x2000))"
+  {
+    addr_space_t as = INTVAL (operands[3]);
+    const char *glc = INTVAL (operands[4]) ? " glc" : "";
+
+    static char buf[200];
+    if (AS_FLAT_P (as))
+      {
+       if (TARGET_GCN5_PLUS)
+         sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;"
+                  "s_waitcnt\texpcnt(0)", glc);
+       else
+         sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\texpcnt(0)",
+                  glc);
+      }
+    else if (AS_GLOBAL_P (as))
+      sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;"
+              "s_waitcnt\texpcnt(0)", glc);
+    else
+      gcc_unreachable ();
+
+    return buf;
+  }
+  [(set_attr "type" "flat")
+   (set_attr "length" "12")])
+
+(define_insn "scatter<mode>_insn_1offset_ds<exec_scatter>"
+  [(set (mem:BLK (scratch))
+       (unspec:BLK
+         [(plus:V64SI (match_operand:V64SI 0 "register_operand" "v")
+                      (vec_duplicate:V64SI
+                        (match_operand 1 "immediate_operand"    "n")))
+          (match_operand:VEC_REG_MODE 2 "register_operand"      "v")
+          (match_operand 3 "immediate_operand"                  "n")
+          (match_operand 4 "immediate_operand"                  "n")]
+         UNSPEC_SCATTER))]
+  "(AS_ANY_DS_P (INTVAL (operands[3]))
+    && ((unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x10000))"
+  {
+    addr_space_t as = INTVAL (operands[3]);
+    static char buf[200];
+    sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\texpcnt(0)",
+            (AS_GDS_P (as) ? " gds" : ""));
+    return buf;
+  }
+  [(set_attr "type" "ds")
+   (set_attr "length" "12")])
+
+(define_insn "scatter<mode>_insn_2offsets<exec_scatter>"
+  [(set (mem:BLK (scratch))
+       (unspec:BLK
+         [(plus:V64DI
+            (plus:V64DI
+              (vec_duplicate:V64DI
+                (match_operand:DI 0 "register_operand"             "Sv"))
+              (sign_extend:V64DI
+                (match_operand:V64SI 1 "register_operand"          " v")))
+            (vec_duplicate:V64DI (match_operand 2 "immediate_operand"
+                                                                   " n")))
+          (match_operand:VEC_REG_MODE 3 "register_operand"         " v")
+          (match_operand 4 "immediate_operand"                     " n")
+          (match_operand 5 "immediate_operand"                     " n")]
+         UNSPEC_SCATTER))]
+  "(AS_GLOBAL_P (INTVAL (operands[4]))
+    && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
+  {
+    addr_space_t as = INTVAL (operands[4]);
+    const char *glc = INTVAL (operands[5]) ? " glc" : "";
+
+    static char buf[200];
+    if (AS_GLOBAL_P (as))
+      {
+       /* Work around assembler bug in which a 64-bit register is expected,
+       but a 32-bit value would be correct.  */
+       int reg = REGNO (operands[1]) - FIRST_VGPR_REG;
+       sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;"
+                     "s_waitcnt\texpcnt(0)", reg, reg + 1, glc);
+      }
+    else
+      gcc_unreachable ();
+
+    return buf;
+  }
+  [(set_attr "type" "flat")
+   (set_attr "length" "12")])
+
+;; }}}
+;; {{{ Permutations
+
+(define_insn "ds_bpermute<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"    "=v")
+       (unspec:VEC_1REG_MODE
+         [(match_operand:VEC_1REG_MODE 2 "register_operand" " v")
+          (match_operand:V64SI 1 "register_operand"         " v")
+          (match_operand:DI 3 "gcn_exec_reg_operand"        " e")]
+         UNSPEC_BPERMUTE))]
+  ""
+  "ds_bpermute_b32\t%0, %1, %2\;s_waitcnt\tlgkmcnt(0)"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "12")])
+
+(define_insn_and_split "ds_bpermute<mode>"
+  [(set (match_operand:VEC_2REG_MODE 0 "register_operand"    "=&v")
+       (unspec:VEC_2REG_MODE
+         [(match_operand:VEC_2REG_MODE 2 "register_operand" " v0")
+          (match_operand:V64SI 1 "register_operand"         "  v")
+          (match_operand:DI 3 "gcn_exec_reg_operand"        "  e")]
+         UNSPEC_BPERMUTE))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 4) (unspec:V64SI [(match_dup 6) (match_dup 1) (match_dup 3)]
+                                   UNSPEC_BPERMUTE))
+   (set (match_dup 5) (unspec:V64SI [(match_dup 7) (match_dup 1) (match_dup 3)]
+                                   UNSPEC_BPERMUTE))]
+  {
+    operands[4] = gcn_operand_part (<MODE>mode, operands[0], 0);
+    operands[5] = gcn_operand_part (<MODE>mode, operands[0], 1);
+    operands[6] = gcn_operand_part (<MODE>mode, operands[2], 0);
+    operands[7] = gcn_operand_part (<MODE>mode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "24")])
+
+;; }}}
+;; {{{ ALU special case: add/sub
+
+(define_insn "addv64si3<exec_clobber>"
+  [(set (match_operand:V64SI 0 "register_operand"   "=  v")
+       (plus:V64SI
+         (match_operand:V64SI 1 "register_operand" "%  v")
+         (match_operand:V64SI 2 "gcn_alu_operand"  "vSvB")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+(define_insn "addv64si3_dup<exec_clobber>"
+  [(set (match_operand:V64SI 0 "register_operand"   "= v")
+       (plus:V64SI
+         (vec_duplicate:V64SI
+           (match_operand:SI 2 "gcn_alu_operand"   "SvB"))
+         (match_operand:V64SI 1 "register_operand" "  v")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+(define_insn "addv64si3_vcc<exec_vcc>"
+  [(set (match_operand:V64SI 0 "register_operand"   "=  v,   v")
+       (plus:V64SI
+         (match_operand:V64SI 1 "register_operand" "%  v,   v")
+         (match_operand:V64SI 2 "gcn_alu_operand"  "vSvB,vSvB")))
+   (set (match_operand:DI 3 "register_operand"     "= cV,  Sg")
+       (ltu:DI (plus:V64SI (match_dup 1) (match_dup 2))
+               (match_dup 1)))]
+  ""
+  "v_add%^_u32\t%0, %3, %2, %1"
+  [(set_attr "type" "vop2,vop3b")
+   (set_attr "length" "8")])
+
+; This pattern only changes the VCC bits when the corresponding lane is
+; enabled, so the set must be described as an ior.
+
+(define_insn "addv64si3_vcc_dup<exec_vcc>"
+  [(set (match_operand:V64SI 0 "register_operand"   "= v,  v")
+       (plus:V64SI
+         (vec_duplicate:V64SI
+           (match_operand:SI 1 "gcn_alu_operand"   "SvB,SvB"))
+         (match_operand:V64SI 2 "register_operand" "  v,  v")))
+   (set (match_operand:DI 3 "register_operand"     "=cV, Sg")
+       (ltu:DI (plus:V64SI (vec_duplicate:V64SI (match_dup 2))
+                           (match_dup 1))
+               (vec_duplicate:V64SI (match_dup 2))))]
+  ""
+  "v_add%^_u32\t%0, %3, %2, %1"
+  [(set_attr "type" "vop2,vop3b")
+   (set_attr "length" "8,8")])
+
+; This pattern does not accept SGPR because VCC read already counts as an
+; SGPR use and number of SGPR operands is limited to 1.
+
+(define_insn "addcv64si3<exec_vcc>"
+  [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+       (plus:V64SI
+         (plus:V64SI
+           (vec_merge:V64SI
+             (vec_duplicate:V64SI (const_int 1))
+             (vec_duplicate:V64SI (const_int 0))
+             (match_operand:DI 3 "register_operand" " cV,Sv"))
+           (match_operand:V64SI 1 "gcn_alu_operand" "%vA,vA"))
+         (match_operand:V64SI 2 "gcn_alu_operand"   " vB,vB")))
+   (set (match_operand:DI 4 "register_operand"      "=cV,Sg")
+       (ior:DI (ltu:DI (plus:V64SI
+                         (plus:V64SI
+                           (vec_merge:V64SI
+                             (vec_duplicate:V64SI (const_int 1))
+                             (vec_duplicate:V64SI (const_int 0))
+                             (match_dup 3))
+                           (match_dup 1))
+                         (match_dup 2))
+                       (match_dup 2))
+               (ltu:DI (plus:V64SI
+                         (vec_merge:V64SI
+                           (vec_duplicate:V64SI (const_int 1))
+                           (vec_duplicate:V64SI (const_int 0))
+                           (match_dup 3))
+                         (match_dup 1))
+                       (match_dup 1))))]
+  ""
+  "v_addc%^_u32\t%0, %4, %1, %2, %3"
+  [(set_attr "type" "vop2,vop3b")
+   (set_attr "length" "4,8")])
+
+(define_insn "addcv64si3_dup<exec_vcc>"
+  [(set (match_operand:V64SI 0 "register_operand" "=v,v")
+       (plus:V64SI
+         (plus:V64SI
+           (vec_merge:V64SI
+             (vec_duplicate:V64SI (const_int 1))
+             (vec_duplicate:V64SI (const_int 0))
+             (match_operand:DI 3 "register_operand" " cV, Sv"))
+           (match_operand:V64SI 1 "gcn_alu_operand" "%vA, vA"))
+         (vec_duplicate:V64SI
+           (match_operand:SI 2 "gcn_alu_operand"    "SvB,SvB"))))
+   (set (match_operand:DI 4 "register_operand"  "=cV, Sg")
+       (ior:DI (ltu:DI (plus:V64SI (plus:V64SI
+                                     (vec_merge:V64SI
+                                       (vec_duplicate:V64SI (const_int 1))
+                                       (vec_duplicate:V64SI (const_int 0))
+                                       (match_dup 3))
+                                     (match_dup 1))
+                                   (vec_duplicate:V64SI
+                                     (match_dup 2)))
+                       (vec_duplicate:V64SI
+                         (match_dup 2)))
+               (ltu:DI (plus:V64SI (vec_merge:V64SI
+                                     (vec_duplicate:V64SI (const_int 1))
+                                     (vec_duplicate:V64SI (const_int 0))
+                                     (match_dup 3))
+                                   (match_dup 1))
+                       (match_dup 1))))]
+  ""
+  "v_addc%^_u32\t%0, %4, %1, %2, %3"
+  [(set_attr "type" "vop2,vop3b")
+   (set_attr "length" "4,8")])
+
+(define_insn "subv64si3<exec_clobber>"
+  [(set (match_operand:V64SI 0 "register_operand"  "=  v,   v")
+       (minus:V64SI
+         (match_operand:V64SI 1 "gcn_alu_operand" "vSvB,   v")
+         (match_operand:V64SI 2 "gcn_alu_operand" "   v,vSvB")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "@
+   v_sub%^_u32\t%0, vcc, %1, %2
+   v_subrev%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8,8")])
+
+(define_insn "subv64si3_vcc<exec_vcc>"
+  [(set (match_operand:V64SI 0 "register_operand"  "=  v,   v,   v,   v")
+       (minus:V64SI
+         (match_operand:V64SI 1 "gcn_alu_operand" "vSvB,vSvB,   v,   v")
+         (match_operand:V64SI 2 "gcn_alu_operand" "   v,   v,vSvB,vSvB")))
+   (set (match_operand:DI 3 "register_operand"    "= cV,  Sg,  cV,  Sg")
+       (gtu:DI (minus:V64SI (match_dup 1) (match_dup 2))
+               (match_dup 1)))]
+  ""
+  "@
+   v_sub%^_u32\t%0, %3, %1, %2
+   v_sub%^_u32\t%0, %3, %1, %2
+   v_subrev%^_u32\t%0, %3, %2, %1
+   v_subrev%^_u32\t%0, %3, %2, %1"
+  [(set_attr "type" "vop2,vop3b,vop2,vop3b")
+   (set_attr "length" "8")])
+
+; This pattern does not accept SGPR because VCC read already counts
+; as a SGPR use and number of SGPR operands is limited to 1.
+
+(define_insn "subcv64si3<exec_vcc>"
+  [(set (match_operand:V64SI 0 "register_operand"    "= v, v, v, v")
+       (minus:V64SI
+         (minus:V64SI
+           (vec_merge:V64SI
+             (vec_duplicate:V64SI (const_int 1))
+             (vec_duplicate:V64SI (const_int 0))
+             (match_operand:DI 3 "gcn_alu_operand"  " cV,Sv,cV,Sv"))
+           (match_operand:V64SI 1 "gcn_alu_operand" " vA,vA,vB,vB"))
+         (match_operand:V64SI 2 "gcn_alu_operand"   " vB,vB,vA,vA")))
+   (set (match_operand:DI 4 "register_operand"      "=cV,Sg,cV,Sg")
+       (ior:DI (gtu:DI (minus:V64SI (minus:V64SI
+                                      (vec_merge:V64SI
+                                        (vec_duplicate:V64SI (const_int 1))
+                                        (vec_duplicate:V64SI (const_int 0))
+                                        (match_dup 3))
+                                      (match_dup 1))
+                                    (match_dup 2))
+                       (match_dup 2))
+               (ltu:DI (minus:V64SI (vec_merge:V64SI
+                                      (vec_duplicate:V64SI (const_int 1))
+                                      (vec_duplicate:V64SI (const_int 0))
+                                      (match_dup 3))
+                                    (match_dup 1))
+                       (match_dup 1))))]
+  ""
+  "@
+   v_subb%^_u32\t%0, %4, %1, %2, %3
+   v_subb%^_u32\t%0, %4, %1, %2, %3
+   v_subbrev%^_u32\t%0, %4, %2, %1, %3
+   v_subbrev%^_u32\t%0, %4, %2, %1, %3"
+  [(set_attr "type" "vop2,vop3b,vop2,vop3b")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3"
+  [(set (match_operand:V64DI 0 "register_operand"   "=  &v")
+       (plus:V64DI
+         (match_operand:V64DI 1 "register_operand" "%  v0")
+         (match_operand:V64DI 2 "gcn_alu_operand"  "vSvB0")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (V64DImode, operands[1], 0),
+                gcn_operand_part (V64DImode, operands[2], 0),
+                vcc));
+    emit_insn (gen_addcv64si3
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[1], 1),
+                gcn_operand_part (V64DImode, operands[2], 1),
+                vcc, vcc));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                "=  &v")
+       (vec_merge:V64DI
+         (plus:V64DI
+           (match_operand:V64DI 1 "register_operand"             "%  v0")
+           (match_operand:V64DI 2 "gcn_alu_operand"              "vSvB0"))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand" "   U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"              "    e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc_exec
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (V64DImode, operands[1], 0),
+                gcn_operand_part (V64DImode, operands[2], 0),
+                vcc,
+                gcn_operand_part (V64DImode, operands[3], 0),
+                operands[4]));
+    emit_insn (gen_addcv64si3_exec
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[1], 1),
+                gcn_operand_part (V64DImode, operands[2], 1),
+                vcc, vcc,
+                gcn_operand_part (V64DImode, operands[3], 1),
+                operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "subv64di3"
+  [(set (match_operand:V64DI 0 "register_operand"  "=  &v,   &v")
+       (minus:V64DI
+         (match_operand:V64DI 1 "gcn_alu_operand" "vSvB0,   v0")
+         (match_operand:V64DI 2 "gcn_alu_operand" "   v0,vSvB0")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_subv64si3_vcc
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (V64DImode, operands[1], 0),
+                gcn_operand_part (V64DImode, operands[2], 0),
+                vcc));
+    emit_insn (gen_subcv64si3
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[1], 1),
+                gcn_operand_part (V64DImode, operands[2], 1),
+                vcc, vcc));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8,8")])
+
+(define_insn_and_split "subv64di3_exec"
+  [(set (match_operand:V64DI 0 "register_operand"             "=  &v,   &v")
+       (vec_merge:V64DI
+         (minus:V64DI
+           (match_operand:V64DI 1 "gcn_alu_operand"           "vSvB0,   v0")
+           (match_operand:V64DI 2 "gcn_alu_operand"           "   v0,vSvB0"))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand"
+                                                              "   U0,   U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"           "    e,    e")))
+   (clobber (reg:DI VCC_REG))]
+  "register_operand (operands[1], VOIDmode)
+   || register_operand (operands[2], VOIDmode)"
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[3])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_subv64si3_vcc_exec
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (V64DImode, operands[1], 0),
+                gcn_operand_part (V64DImode, operands[2], 0),
+                vcc,
+                gcn_operand_part (V64DImode, operands[3], 0),
+                operands[4]));
+    emit_insn (gen_subcv64si3_exec
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[1], 1),
+                gcn_operand_part (V64DImode, operands[2], 1),
+                vcc, vcc,
+                gcn_operand_part (V64DImode, operands[3], 1),
+                operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8,8")])
+
+(define_insn_and_split "addv64di3_dup"
+  [(set (match_operand:V64DI 0 "register_operand"   "= &v")
+       (plus:V64DI
+         (match_operand:V64DI 1 "register_operand" "  v0")
+         (vec_duplicate:V64DI
+           (match_operand:DI 2 "gcn_alu_operand"   "SvDB"))))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc_dup
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (DImode, operands[2], 0),
+                gcn_operand_part (V64DImode, operands[1], 0),
+                vcc));
+    emit_insn (gen_addcv64si3_dup
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[1], 1),
+                gcn_operand_part (DImode, operands[2], 1),
+                vcc, vcc));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_dup_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                "= &v")
+       (vec_merge:V64DI
+         (plus:V64DI
+           (match_operand:V64DI 1 "register_operand"             "  v0")
+           (vec_duplicate:V64DI
+             (match_operand:DI 2 "gcn_alu_operand"               "SvDB")))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand" "  U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"              "   e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[1])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[3])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc_dup_exec
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (DImode, operands[2], 0),
+                gcn_operand_part (V64DImode, operands[1], 0),
+                vcc,
+                gcn_operand_part (V64DImode, operands[3], 0),
+                operands[4]));
+    emit_insn (gen_addcv64si3_dup_exec
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[1], 1),
+                gcn_operand_part (DImode, operands[2], 1),
+                vcc, vcc,
+                gcn_operand_part (V64DImode, operands[3], 1),
+                operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_zext"
+  [(set (match_operand:V64DI 0 "register_operand"    "=&v,&v")
+       (plus:V64DI
+         (zero_extend:V64DI
+           (match_operand:V64SI 1 "gcn_alu_operand" "0vA,0vB"))
+         (match_operand:V64DI 2 "gcn_alu_operand"   "0vB,0vA")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[2])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                operands[1],
+                gcn_operand_part (V64DImode, operands[2], 0),
+                vcc));
+    emit_insn (gen_addcv64si3
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[2], 1),
+                const0_rtx, vcc, vcc));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8,8")])
+
+(define_insn_and_split "addv64di3_zext_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                "=&v,&v")
+       (vec_merge:V64DI
+         (plus:V64DI
+           (zero_extend:V64DI
+             (match_operand:V64SI 1 "gcn_alu_operand"            "0vA,0vB"))
+           (match_operand:V64DI 2 "gcn_alu_operand"              "0vB,0vA"))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0, U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"              "  e,  e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[3])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc_exec
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                operands[1],
+                gcn_operand_part (V64DImode, operands[2], 0),
+                vcc,
+                gcn_operand_part (V64DImode, operands[3], 0),
+                operands[4]));
+    emit_insn (gen_addcv64si3_exec
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[2], 1),
+                const0_rtx, vcc, vcc,
+                gcn_operand_part (V64DImode, operands[3], 1),
+                operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8,8")])
+
+(define_insn_and_split "addv64di3_zext_dup"
+  [(set (match_operand:V64DI 0 "register_operand"   "=&v")
+       (plus:V64DI
+         (zero_extend:V64DI
+           (vec_duplicate:V64SI
+             (match_operand:SI 1 "gcn_alu_operand" "BSv")))
+         (match_operand:V64DI 2 "gcn_alu_operand"  "vA0")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[2])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc_dup
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (DImode, operands[1], 0),
+                gcn_operand_part (V64DImode, operands[2], 0),
+                vcc));
+    emit_insn (gen_addcv64si3
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[2], 1),
+                const0_rtx, vcc, vcc));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_zext_dup_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                "=&v")
+       (vec_merge:V64DI
+         (plus:V64DI
+           (zero_extend:V64DI
+             (vec_duplicate:V64SI
+               (match_operand:SI 1 "gcn_alu_operand"             "BSv")))
+           (match_operand:V64DI 2 "gcn_alu_operand"              "vA0"))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"              "  e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[2])
+   && gcn_can_split_p (V64DImode, operands[3])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc_dup_exec
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (DImode, operands[1], 0),
+                gcn_operand_part (V64DImode, operands[2], 0),
+                vcc,
+                gcn_operand_part (V64DImode, operands[3], 0),
+                operands[4]));
+    emit_insn (gen_addcv64si3_exec
+               (gcn_operand_part (V64DImode, operands[0], 1),
+                gcn_operand_part (V64DImode, operands[2], 1),
+                const0_rtx, vcc, vcc,
+                gcn_operand_part (V64DImode, operands[3], 1),
+                operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_zext_dup2"
+  [(set (match_operand:V64DI 0 "register_operand"                    "= v")
+       (plus:V64DI
+         (zero_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand" " vA"))
+         (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand"  "BSv"))))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc_dup
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (DImode, operands[2], 0),
+                operands[1],
+                vcc));
+    rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+    emit_insn (gen_vec_duplicatev64si
+               (dsthi, gcn_operand_part (DImode, operands[2], 1)));
+    emit_insn (gen_addcv64si3 (dsthi, dsthi, const0_rtx, vcc, vcc));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_zext_dup2_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                     "= v")
+       (vec_merge:V64DI
+         (plus:V64DI
+           (zero_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand"
+                                                                      " vA"))
+           (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv")))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand"      " U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"                   "  e")))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[3])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_addv64si3_vcc_dup_exec
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (DImode, operands[2], 0),
+                operands[1],
+                vcc,
+                gcn_operand_part (V64DImode, operands[3], 0),
+                operands[4]));
+    rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+    emit_insn (gen_vec_duplicatev64si_exec
+               (dsthi, gcn_operand_part (DImode, operands[2], 1),
+                gcn_gen_undef (V64SImode), operands[4]));
+    emit_insn (gen_addcv64si3_exec
+               (dsthi, dsthi, const0_rtx, vcc, vcc,
+                gcn_operand_part (V64DImode, operands[3], 1),
+                operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_sext_dup2"
+  [(set (match_operand:V64DI 0 "register_operand"                    "= v")
+       (plus:V64DI
+         (sign_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand" " vA"))
+         (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand"  "BSv"))))
+   (clobber (match_scratch:V64SI 3                                   "=&v"))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_ashrv64si3 (operands[3], operands[1], GEN_INT (31)));
+    emit_insn (gen_addv64si3_vcc_dup
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (DImode, operands[2], 0),
+                operands[1],
+                vcc));
+    rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+    emit_insn (gen_vec_duplicatev64si
+               (dsthi, gcn_operand_part (DImode, operands[2], 1)));
+    emit_insn (gen_addcv64si3 (dsthi, dsthi, operands[3], vcc, vcc));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "addv64di3_sext_dup2_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                     "= v")
+       (vec_merge:V64DI
+         (plus:V64DI
+           (sign_extend:V64DI (match_operand:V64SI 1 "gcn_alu_operand"
+                                                                      " vA"))
+           (vec_duplicate:V64DI (match_operand:DI 2 "gcn_alu_operand" "BSv")))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand"      " U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"                   "  e")))
+   (clobber (match_scratch:V64SI 5                                    "=&v"))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "gcn_can_split_p  (V64DImode, operands[0])
+   && gcn_can_split_p (V64DImode, operands[3])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_ashrv64si3_exec (operands[5], operands[1], GEN_INT (31),
+                                   gcn_gen_undef (V64SImode), operands[4]));
+    emit_insn (gen_addv64si3_vcc_dup_exec
+               (gcn_operand_part (V64DImode, operands[0], 0),
+                gcn_operand_part (DImode, operands[2], 0),
+                operands[1],
+                vcc,
+                gcn_operand_part (V64DImode, operands[3], 0),
+                operands[4]));
+    rtx dsthi = gcn_operand_part (V64DImode, operands[0], 1);
+    emit_insn (gen_vec_duplicatev64si_exec
+               (dsthi, gcn_operand_part (DImode, operands[2], 1),
+                gcn_gen_undef (V64SImode), operands[4]));
+    emit_insn (gen_addcv64si3_exec
+               (dsthi, dsthi, operands[5], vcc, vcc,
+                gcn_operand_part (V64DImode, operands[3], 1),
+                operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ DS memory ALU: add/sub
+
+(define_mode_iterator DS_ARITH_MODE [V64SI V64SF V64DI])
+(define_mode_iterator DS_ARITH_SCALAR_MODE [SI SF DI])
+
+;; FIXME: the vector patterns probably need RD expanded to a vector of
+;;        addresses.  For now, the only way a vector can get into LDS is
+;;        if the user puts it there manually.
+;;
+;; FIXME: the scalar patterns are probably fine in themselves, but need to be
+;;        checked to see if anything can ever use them.
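+;;
+;; Note that these patterns operate directly on LDS memory: the destination
+;; must be the same DS location as the first source operand (hence the
+;; rtx_equal_p insn conditions), with the other operand coming from a VGPR.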
+
+(define_insn "add<mode>3_ds<exec>"
+  [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand"  "=RD")
+       (plus:DS_ARITH_MODE
+         (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" "%RD")
+         (match_operand:DS_ARITH_MODE 2 "register_operand"      "  v")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_add%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+(define_insn "add<mode>3_ds_scalar"
+  [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+       (plus:DS_ARITH_SCALAR_MODE
+         (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+                                                                     "%RD")
+         (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand"    "  v")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_add%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+(define_insn "sub<mode>3_ds<exec>"
+  [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand"  "=RD")
+       (minus:DS_ARITH_MODE
+         (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" " RD")
+         (match_operand:DS_ARITH_MODE 2 "register_operand"      "  v")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_sub%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+(define_insn "sub<mode>3_ds_scalar"
+  [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+       (minus:DS_ARITH_SCALAR_MODE
+         (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand"
+                                                                     " RD")
+         (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand"    "  v")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_sub%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+(define_insn "subr<mode>3_ds<exec>"
+  [(set (match_operand:DS_ARITH_MODE 0 "gcn_ds_memory_operand"  "=RD")
+       (minus:DS_ARITH_MODE
+         (match_operand:DS_ARITH_MODE 2 "register_operand"      "  v")
+         (match_operand:DS_ARITH_MODE 1 "gcn_ds_memory_operand" " RD")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_rsub%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+(define_insn "subr<mode>3_ds_scalar"
+  [(set (match_operand:DS_ARITH_SCALAR_MODE 0 "gcn_ds_memory_operand" "=RD")
+       (minus:DS_ARITH_SCALAR_MODE
+         (match_operand:DS_ARITH_SCALAR_MODE 2 "register_operand"    "  v")
+         (match_operand:DS_ARITH_SCALAR_MODE 1 "gcn_ds_memory_operand" 
+                                                                     " RD")))]
+  "rtx_equal_p (operands[0], operands[1])"
+  "ds_rsub%u0\t%A0, %2%O0"
+  [(set_attr "type" "ds")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU special case: mult
+
+(define_insn "<su>mulv64si3_highpart<exec>"
+  [(set (match_operand:V64SI 0 "register_operand"       "=  v")
+       (truncate:V64SI
+         (lshiftrt:V64DI
+           (mult:V64DI
+             (any_extend:V64DI
+               (match_operand:V64SI 1 "gcn_alu_operand" "  %v"))
+             (any_extend:V64DI
+               (match_operand:V64SI 2 "gcn_alu_operand" "vSvA")))
+           (const_int 32))))]
+  ""
+  "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "mulv64si3<exec>"
+  [(set (match_operand:V64SI 0 "register_operand"  "=   v")
+       (mult:V64SI
+         (match_operand:V64SI 1 "gcn_alu_operand" "%vSvA")
+         (match_operand:V64SI 2 "gcn_alu_operand" " vSvA")))]
+  ""
+  "v_mul_lo_u32\t%0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "mulv64si3_dup<exec>"
+  [(set (match_operand:V64SI 0 "register_operand"  "=   v")
+       (mult:V64SI
+         (match_operand:V64SI 1 "gcn_alu_operand" "%vSvA")
+         (vec_duplicate:V64SI
+           (match_operand:SI 2 "gcn_alu_operand"  "  SvA"))))]
+  ""
+  "v_mul_lo_u32\t%0, %1, %2"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "mulv64di3"
+  [(set (match_operand:V64DI 0 "register_operand"  "=&v")
+       (mult:V64DI
+         (match_operand:V64DI 1 "gcn_alu_operand" "% v")
+         (match_operand:V64DI 2 "gcn_alu_operand" "vDA")))
+   (clobber (match_scratch:V64SI 3                "=&v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx left_lo = gcn_operand_part (V64DImode, operands[1], 0);
+    rtx left_hi = gcn_operand_part (V64DImode, operands[1], 1);
+    rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx tmp = operands[3];
+
+    emit_insn (gen_mulv64si3 (out_lo, left_lo, right_lo));
+    emit_insn (gen_umulv64si3_highpart (out_hi, left_lo, right_lo));
+    emit_insn (gen_mulv64si3 (tmp, left_hi, right_lo));
+    emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+    emit_insn (gen_mulv64si3 (tmp, left_lo, right_hi));
+    emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+    emit_insn (gen_mulv64si3 (tmp, left_hi, right_hi));
+    emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+    DONE;
+  })
+
+(define_insn_and_split "mulv64di3_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                "=&v")
+       (vec_merge:V64DI
+         (mult:V64DI
+           (match_operand:V64DI 1 "gcn_alu_operand"              "% v")
+           (match_operand:V64DI 2 "gcn_alu_operand"              "vDA"))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"              "  e")))
+   (clobber (match_scratch:V64SI 5                                "=&v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx left_lo = gcn_operand_part (V64DImode, operands[1], 0);
+    rtx left_hi = gcn_operand_part (V64DImode, operands[1], 1);
+    rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx exec = operands[4];
+    rtx tmp = operands[5];
+
+    rtx old_lo, old_hi;
+    if (GET_CODE (operands[3]) == UNSPEC)
+      {
+       old_lo = old_hi = gcn_gen_undef (V64SImode);
+      }
+    else
+      {
+       old_lo = gcn_operand_part (V64DImode, operands[3], 0);
+       old_hi = gcn_operand_part (V64DImode, operands[3], 1);
+      }
+
+    rtx undef = gcn_gen_undef (V64SImode);
+
+    emit_insn (gen_mulv64si3_exec (out_lo, left_lo, right_lo, old_lo, exec));
+    emit_insn (gen_umulv64si3_highpart_exec (out_hi, left_lo, right_lo,
+                                            old_hi, exec));
+    emit_insn (gen_mulv64si3_exec (tmp, left_hi, right_lo, undef, exec));
+    emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+    emit_insn (gen_mulv64si3_exec (tmp, left_lo, right_hi, undef, exec));
+    emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+    emit_insn (gen_mulv64si3_exec (tmp, left_hi, right_hi, undef, exec));
+    emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+    DONE;
+  })
+
+(define_insn_and_split "mulv64di3_zext"
+  [(set (match_operand:V64DI 0 "register_operand"    "=&v")
+       (mult:V64DI
+         (zero_extend:V64DI
+           (match_operand:V64SI 1 "gcn_alu_operand" "  v"))
+         (match_operand:V64DI 2 "gcn_alu_operand"   "vDA")))
+   (clobber (match_scratch:V64SI 3                  "=&v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx left = operands[1];
+    rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx tmp = operands[3];
+
+    emit_insn (gen_mulv64si3 (out_lo, left, right_lo));
+    emit_insn (gen_umulv64si3_highpart (out_hi, left, right_lo));
+    emit_insn (gen_mulv64si3 (tmp, left, right_hi));
+    emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+    DONE;
+  })
+
+(define_insn_and_split "mulv64di3_zext_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                "=&v")
+       (vec_merge:V64DI
+         (mult:V64DI
+           (zero_extend:V64DI
+             (match_operand:V64SI 1 "gcn_alu_operand"            "  v"))
+           (match_operand:V64DI 2 "gcn_alu_operand"              "vDA"))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand" " U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"              "  e")))
+   (clobber (match_scratch:V64SI 5                                "=&v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx left = operands[1];
+    rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx exec = operands[4];
+    rtx tmp = operands[5];
+
+    rtx old_lo, old_hi;
+    if (GET_CODE (operands[3]) == UNSPEC)
+      {
+       old_lo = old_hi = gcn_gen_undef (V64SImode);
+      }
+    else
+      {
+       old_lo = gcn_operand_part (V64DImode, operands[3], 0);
+       old_hi = gcn_operand_part (V64DImode, operands[3], 1);
+      }
+
+    rtx undef = gcn_gen_undef (V64SImode);
+
+    emit_insn (gen_mulv64si3_exec (out_lo, left, right_lo, old_lo, exec));
+    emit_insn (gen_umulv64si3_highpart_exec (out_hi, left, right_lo,
+                                            old_hi, exec));
+    emit_insn (gen_mulv64si3_exec (tmp, left, right_hi, undef, exec));
+    emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+    DONE;
+  })
+
+(define_insn_and_split "mulv64di3_zext_dup2"
+  [(set (match_operand:V64DI 0 "register_operand"    "= &v")
+       (mult:V64DI
+         (zero_extend:V64DI
+           (match_operand:V64SI 1 "gcn_alu_operand" "   v"))
+         (vec_duplicate:V64DI
+           (match_operand:DI 2 "gcn_alu_operand"    "SvDA"))))
+   (clobber (match_scratch:V64SI 3                  "= &v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx left = operands[1];
+    rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx tmp = operands[3];
+
+    emit_insn (gen_mulv64si3 (out_lo, left, right_lo));
+    emit_insn (gen_umulv64si3_highpart (out_hi, left, right_lo));
+    emit_insn (gen_mulv64si3 (tmp, left, right_hi));
+    emit_insn (gen_addv64si3 (out_hi, out_hi, tmp));
+    DONE;
+  })
+
+(define_insn_and_split "mulv64di3_zext_dup2_exec"
+  [(set (match_operand:V64DI 0 "register_operand"                "= &v")
+       (vec_merge:V64DI
+         (mult:V64DI
+           (zero_extend:V64DI
+             (match_operand:V64SI 1 "gcn_alu_operand"            "   v"))
+           (vec_duplicate:V64DI
+             (match_operand:DI 2 "gcn_alu_operand"               "SvDA")))
+         (match_operand:V64DI 3 "gcn_register_or_unspec_operand" "  U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"              "   e")))
+   (clobber (match_scratch:V64SI 5                                "= &v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (V64DImode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (V64DImode, operands[0], 1);
+    rtx left = operands[1];
+    rtx right_lo = gcn_operand_part (V64DImode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (V64DImode, operands[2], 1);
+    rtx exec = operands[4];
+    rtx tmp = operands[5];
+
+    rtx old_lo, old_hi;
+    if (GET_CODE (operands[3]) == UNSPEC)
+      {
+       old_lo = old_hi = gcn_gen_undef (V64SImode);
+      }
+    else
+      {
+       old_lo = gcn_operand_part (V64DImode, operands[3], 0);
+       old_hi = gcn_operand_part (V64DImode, operands[3], 1);
+      }
+
+    rtx undef = gcn_gen_undef (V64SImode);
+
+    emit_insn (gen_mulv64si3_exec (out_lo, left, right_lo, old_lo, exec));
+    emit_insn (gen_umulv64si3_highpart_exec (out_hi, left, right_lo,
+                                            old_hi, exec));
+    emit_insn (gen_mulv64si3_exec (tmp, left, right_hi, undef, exec));
+    emit_insn (gen_addv64si3_exec (out_hi, out_hi, tmp, out_hi, exec));
+    DONE;
+  })
+
+;; }}}
+;; {{{ ALU generic case
+
+(define_mode_iterator VEC_INT_MODE [V64QI V64HI V64SI V64DI])
+
+(define_code_iterator bitop [and ior xor])
+(define_code_iterator shiftop [ashift lshiftrt ashiftrt])
+(define_code_iterator minmaxop [smin smax umin umax])
+
+(define_insn "<expander><mode>2<exec>"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand"    "=  v")
+       (bitunop:VEC_1REG_INT_MODE
+         (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand" "vSvB")))]
+  ""
+  "v_<mnemonic>0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+(define_insn "<expander><mode>3<exec>"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "=  v,RD")
+       (bitop:VEC_1REG_INT_MODE
+         (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+                                                                 "%  v, 0")
+         (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+                                                                 "vSvB, v")))]
+  ""
+  "@
+   v_<mnemonic>0\t%0, %2, %1
+   ds_<mnemonic>0\t%A0, %2%O0"
+  [(set_attr "type" "vop2,ds")
+   (set_attr "length" "8,8")])
+
+(define_insn_and_split "<expander>v64di3"
+  [(set (match_operand:V64DI 0 "gcn_valu_dst_operand" "=&v,RD")
+       (bitop:V64DI
+         (match_operand:V64DI 1 "gcn_valu_src0_operand"          "%  v,RD")
+         (match_operand:V64DI 2 "gcn_valu_src1com_operand"       "vSvB, v")))]
+  ""
+  "@
+   #
+   ds_<mnemonic>0\t%A0, %2%O0"
+  "(reload_completed && !gcn_ds_memory_operand (operands[0], V64DImode))"
+  [(set (match_dup 3)
+       (bitop:V64SI (match_dup 5) (match_dup 7)))
+   (set (match_dup 4)
+       (bitop:V64SI (match_dup 6) (match_dup 8)))]
+  {
+    operands[3] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[4] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[5] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[6] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[7] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[8] = gcn_operand_part (V64DImode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult,ds")
+   (set_attr "length" "16,8")])
+
+(define_insn_and_split "<expander>v64di3_exec"
+  [(set (match_operand:V64DI 0 "gcn_valu_dst_operand" "=&v,RD")
+       (vec_merge:V64DI
+         (bitop:V64DI
+           (match_operand:V64DI 1 "gcn_valu_src0_operand"        "%  v,RD")
+           (match_operand:V64DI 2 "gcn_valu_src1com_operand"     "vSvB, v"))
+         (match_operand:V64DI 3 "gcn_register_ds_or_unspec_operand"
+                                                                 "  U0,U0")
+         (match_operand:DI 4 "gcn_exec_reg_operand"              "   e, e")))]
+  "!memory_operand (operands[0], VOIDmode)
+   || (rtx_equal_p (operands[0], operands[1])
+       && register_operand (operands[2], VOIDmode))"
+  "@
+   #
+   ds_<mnemonic>0\t%A0, %2%O0"
+  "(reload_completed && !gcn_ds_memory_operand (operands[0], V64DImode))"
+  [(set (match_dup 5)
+       (vec_merge:V64SI
+         (bitop:V64SI (match_dup 7) (match_dup 9))
+         (match_dup 11)
+         (match_dup 4)))
+   (set (match_dup 6)
+       (vec_merge:V64SI
+         (bitop:V64SI (match_dup 8) (match_dup 10))
+         (match_dup 12)
+         (match_dup 4)))]
+  {
+    operands[5] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[6] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[7] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[8] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[9] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[10] = gcn_operand_part (V64DImode, operands[2], 1);
+    operands[11] = gcn_operand_part (V64DImode, operands[3], 0);
+    operands[12] = gcn_operand_part (V64DImode, operands[3], 1);
+  }
+  [(set_attr "type" "vmult,ds")
+   (set_attr "length" "16,8")])
+
+(define_insn "<expander>v64si3<exec>"
+  [(set (match_operand:V64SI 0 "register_operand"  "= v")
+       (shiftop:V64SI
+         (match_operand:V64SI 1 "gcn_alu_operand" "  v")
+         (vec_duplicate:V64SI
+           (match_operand:SI 2 "gcn_alu_operand"  "SvB"))))]
+  ""
+  "v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+(define_insn "v<expander>v64si3<exec>"
+  [(set (match_operand:V64SI 0 "register_operand"  "=v")
+       (shiftop:V64SI
+         (match_operand:V64SI 1 "gcn_alu_operand" " v")
+         (match_operand:V64SI 2 "gcn_alu_operand" "vB")))]
+  ""
+  "v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+(define_insn "<expander><mode>3<exec>"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "gcn_valu_dst_operand" "=  v,RD")
+       (minmaxop:VEC_1REG_INT_MODE
+         (match_operand:VEC_1REG_INT_MODE 1 "gcn_valu_src0_operand"
+                                                                 "%  v, 0")
+         (match_operand:VEC_1REG_INT_MODE 2 "gcn_valu_src1com_operand"
+                                                                 "vSvB, v")))]
+  ""
+  "@
+   v_<mnemonic>0\t%0, %2, %1
+   ds_<mnemonic>0\t%A0, %2%O0"
+  [(set_attr "type" "vop2,ds")
+   (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP binops - special cases
+
+; GCN does not directly provide a DFmode subtract instruction, so we do it by
+; adding the negated second operand to the first.
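+; For example, x = a - b is emitted as the single instruction
+;   v_add_f64 x, a, -b
+; using the source negation modifier rather than a separate negation insn.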
+
+(define_insn "subv64df3<exec>"
+  [(set (match_operand:V64DF 0 "register_operand"  "=  v,   v")
+       (minus:V64DF
+         (match_operand:V64DF 1 "gcn_alu_operand" "vSvB,   v")
+         (match_operand:V64DF 2 "gcn_alu_operand" "   v,vSvB")))]
+  ""
+  "@
+   v_add_f64\t%0, %1, -%2
+   v_add_f64\t%0, -%2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8,8")])
+
+(define_insn "subdf"
+  [(set (match_operand:DF 0 "register_operand"  "=  v,   v")
+       (minus:DF
+         (match_operand:DF 1 "gcn_alu_operand" "vSvB,   v")
+         (match_operand:DF 2 "gcn_alu_operand" "   v,vSvB")))]
+  ""
+  "@
+   v_add_f64\t%0, %1, -%2
+   v_add_f64\t%0, -%2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP binops - generic
+
+(define_mode_iterator VEC_FP_MODE [V64HF V64SF V64DF])
+(define_mode_iterator VEC_FP_1REG_MODE [V64HF V64SF])
+(define_mode_iterator FP_MODE [HF SF DF])
+(define_mode_iterator FP_1REG_MODE [HF SF])
+
+(define_code_iterator comm_fp [plus mult smin smax])
+(define_code_iterator nocomm_fp [minus])
+(define_code_iterator all_fp [plus mult minus smin smax])
+
+(define_insn "<expander><mode>3<exec>"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"  "=  v")
+       (comm_fp:VEC_FP_MODE
+         (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "%  v")
+         (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" "vSvB")))]
+  ""
+  "v_<mnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8")])
+
+(define_insn "<expander><mode>3"
+  [(set (match_operand:FP_MODE 0 "gcn_valu_dst_operand"    "=  v,  RL")
+       (comm_fp:FP_MODE
+         (match_operand:FP_MODE 1 "gcn_valu_src0_operand" "%  v,   0")
+         (match_operand:FP_MODE 2 "gcn_valu_src1_operand" "vSvB,vSvB")))]
+  ""
+  "@
+  v_<mnemonic>0\t%0, %2, %1
+  v_<mnemonic>0\t%0, %1%O0"
+  [(set_attr "type" "vop2,ds")
+   (set_attr "length" "8")])
+
+(define_insn "<expander><mode>3<exec>"
+  [(set (match_operand:VEC_FP_1REG_MODE 0 "register_operand"  "=  v,   v")
+       (nocomm_fp:VEC_FP_1REG_MODE
+         (match_operand:VEC_FP_1REG_MODE 1 "gcn_alu_operand" "vSvB,   v")
+         (match_operand:VEC_FP_1REG_MODE 2 "gcn_alu_operand" "   v,vSvB")))]
+  ""
+  "@
+   v_<mnemonic>0\t%0, %1, %2
+   v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8,8")])
+
+(define_insn "<expander><mode>3"
+  [(set (match_operand:FP_1REG_MODE 0 "register_operand"  "=  v,   v")
+       (nocomm_fp:FP_1REG_MODE
+         (match_operand:FP_1REG_MODE 1 "gcn_alu_operand" "vSvB,   v")
+         (match_operand:FP_1REG_MODE 2 "gcn_alu_operand" "   v,vSvB")))]
+  ""
+  "@
+   v_<mnemonic>0\t%0, %1, %2
+   v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "vop2")
+   (set_attr "length" "8,8")])
+
+;; }}}
+;; {{{ FP unops
+
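+;; FP abs and neg are implemented as an add of zero with the |...| or
+;; negation source modifier applied to the input operand, so no dedicated
+;; instruction is required.
+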
+(define_insn "abs<mode>2"
+  [(set (match_operand:FP_MODE 0 "register_operand"             "=v")
+       (abs:FP_MODE (match_operand:FP_MODE 1 "register_operand" " v")))]
+  ""
+  "v_add%i0\t%0, 0, |%1|"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "abs<mode>2<exec>"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"          "=v")
+       (abs:VEC_FP_MODE
+         (match_operand:VEC_FP_MODE 1 "register_operand" " v")))]
+  ""
+  "v_add%i0\t%0, 0, |%1|"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "neg<mode>2<exec>"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"          "=v")
+       (neg:VEC_FP_MODE
+         (match_operand:VEC_FP_MODE 1 "register_operand" " v")))]
+  ""
+  "v_add%i0\t%0, 0, -%1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "sqrt<mode>2<exec>"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"         "=  v")
+       (sqrt:VEC_FP_MODE
+         (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "vSvB")))]
+  "flag_unsafe_math_optimizations"
+  "v_sqrt%i0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+(define_insn "sqrt<mode>2"
+  [(set (match_operand:FP_MODE 0 "register_operand"  "=  v")
+       (sqrt:FP_MODE
+         (match_operand:FP_MODE 1 "gcn_alu_operand" "vSvB")))]
+  "flag_unsafe_math_optimizations"
+  "v_sqrt%i0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ FP fused multiply and add
+
+(define_insn "fma<mode>4<exec>"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"         "=  v,   v")
+       (fma:VEC_FP_MODE
+         (match_operand:VEC_FP_MODE 1 "gcn_alu_operand" "% vA,  vA")
+         (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" "  vA,vSvA")
+         (match_operand:VEC_FP_MODE 3 "gcn_alu_operand" "vSvA,  vA")))]
+  ""
+  "v_fma%i0\t%0, %1, %2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "fma<mode>4_negop2<exec>"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"           "=  v,   v,   v")
+       (fma:VEC_FP_MODE
+         (match_operand:VEC_FP_MODE 1 "gcn_alu_operand"   "  vA,  vA,vSvA")
+         (neg:VEC_FP_MODE
+           (match_operand:VEC_FP_MODE 2 "gcn_alu_operand" "  vA,vSvA,  vA"))
+         (match_operand:VEC_FP_MODE 3 "gcn_alu_operand"   "vSvA,  vA,  vA")))]
+  ""
+  "v_fma%i0\t%0, %1, -%2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "fma<mode>4"
+  [(set (match_operand:FP_MODE 0 "register_operand"  "=  v,   v")
+       (fma:FP_MODE
+         (match_operand:FP_MODE 1 "gcn_alu_operand" "% vA,  vA")
+         (match_operand:FP_MODE 2 "gcn_alu_operand" "  vA,vSvA")
+         (match_operand:FP_MODE 3 "gcn_alu_operand" "vSvA,  vA")))]
+  ""
+  "v_fma%i0\t%0, %1, %2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+(define_insn "fma<mode>4_negop2"
+  [(set (match_operand:FP_MODE 0 "register_operand"    "=  v,   v,   v")
+       (fma:FP_MODE
+         (match_operand:FP_MODE 1 "gcn_alu_operand"   "  vA,  vA,vSvA")
+         (neg:FP_MODE
+           (match_operand:FP_MODE 2 "gcn_alu_operand" "  vA,vSvA,  vA"))
+         (match_operand:FP_MODE 3 "gcn_alu_operand"   "vSvA,  vA,  vA")))]
+  ""
+  "v_fma%i0\t%0, %1, -%2, %3"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ FP division
+
+(define_insn "recip<mode>2<exec>"
+  [(set (match_operand:VEC_FP_MODE 0 "register_operand"           "=  v")
+       (div:VEC_FP_MODE
+         (vec_duplicate:VEC_FP_MODE (float:<SCALAR_MODE> (const_int 1)))
+         (match_operand:VEC_FP_MODE 1 "gcn_alu_operand"   "vSvB")))]
+  ""
+  "v_rcp%i0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+(define_insn "recip<mode>2"
+  [(set (match_operand:FP_MODE 0 "register_operand"     "=  v")
+       (div:FP_MODE
+         (float:FP_MODE (const_int 1))
+         (match_operand:FP_MODE 1 "gcn_alu_operand"     "vSvB")))]
+  ""
+  "v_rcp%i0\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; Do division as a = b * (1/c).
+;; The v_rcp_* instructions are not sufficiently accurate on their own, so we
+;; use one v_fma_* and one v_mul_* to do a round of Newton-Raphson refinement,
+;; which the ISA manual says is enough to improve the reciprocal's accuracy.
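+;;
+;; Roughly, for a = b / c the expansion emits (in scalar pseudocode):
+;;   r0 = v_rcp (c)           ; initial approximation of 1/c
+;;   e  = fma (r0, -c, 2.0)   ; correction term 2 - r0*c
+;;   r  = r0 * e              ; one Newton-Raphson refinement of 1/c
+;;   a  = b * r               ; final multiply (omitted in the is_rcp case)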
+;;
+;; FIXME: This does not handle denormals, NaNs, division-by-zero etc.
+
+(define_expand "div<mode>3"
+  [(match_operand:VEC_FP_MODE 0 "gcn_valu_dst_operand")
+   (match_operand:VEC_FP_MODE 1 "gcn_valu_src0_operand")
+   (match_operand:VEC_FP_MODE 2 "gcn_valu_src0_operand")]
+  "flag_reciprocal_math"
+  {
+    rtx two = gcn_vec_constant (<MODE>mode,
+                 const_double_from_real_value (dconst2, <SCALAR_MODE>mode));
+    rtx initrcp = gen_reg_rtx (<MODE>mode);
+    rtx fma = gen_reg_rtx (<MODE>mode);
+    rtx rcp;
+
+    bool is_rcp = (GET_CODE (operands[1]) == CONST_VECTOR
+                  && real_identical
+                       (CONST_DOUBLE_REAL_VALUE
+                         (CONST_VECTOR_ELT (operands[1], 0)), &dconstm1));
+
+    if (is_rcp)
+      rcp = operands[0];
+    else
+      rcp = gen_reg_rtx (<MODE>mode);
+
+    emit_insn (gen_recip<mode>2 (initrcp, operands[2]));
+    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, operands[2], two));
+    emit_insn (gen_mul<mode>3 (rcp, initrcp, fma));
+
+    if (!is_rcp)
+      emit_insn (gen_mul<mode>3 (operands[0], operands[1], rcp));
+
+    DONE;
+  })
+
+(define_expand "div<mode>3"
+  [(match_operand:FP_MODE 0 "gcn_valu_dst_operand")
+   (match_operand:FP_MODE 1 "gcn_valu_src0_operand")
+   (match_operand:FP_MODE 2 "gcn_valu_src0_operand")]
+  "flag_reciprocal_math"
+  {
+    rtx two = const_double_from_real_value (dconst2, <MODE>mode);
+    rtx initrcp = gen_reg_rtx (<MODE>mode);
+    rtx fma = gen_reg_rtx (<MODE>mode);
+    rtx rcp;
+
+    bool is_rcp = (GET_CODE (operands[1]) == CONST_DOUBLE
+                  && real_identical (CONST_DOUBLE_REAL_VALUE (operands[1]),
+                                     &dconstm1));
+
+    if (is_rcp)
+      rcp = operands[0];
+    else
+      rcp = gen_reg_rtx (<MODE>mode);
+
+    emit_insn (gen_recip<mode>2 (initrcp, operands[2]));
+    emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, operands[2], two));
+    emit_insn (gen_mul<mode>3 (rcp, initrcp, fma));
+
+    if (!is_rcp)
+      emit_insn (gen_mul<mode>3 (operands[0], operands[1], rcp));
+
+    DONE;
+  })
+
+;; }}}
+;; {{{ Int/FP conversions
+
+(define_mode_iterator CVT_FROM_MODE [HI SI HF SF DF])
+(define_mode_iterator CVT_TO_MODE [HI SI HF SF DF])
+
+(define_mode_iterator VCVT_FROM_MODE [V64HI V64SI V64HF V64SF V64DF])
+(define_mode_iterator VCVT_TO_MODE [V64HI V64SI V64HF V64SF V64DF])
+
+(define_code_iterator cvt_op [fix unsigned_fix
+                             float unsigned_float
+                             float_extend float_truncate])
+(define_code_attr cvt_name [(fix "fix_trunc") (unsigned_fix "fixuns_trunc")
+                           (float "float") (unsigned_float "floatuns")
+                           (float_extend "extend") (float_truncate "trunc")])
+(define_code_attr cvt_operands [(fix "%i0%i1") (unsigned_fix "%u0%i1")
+                               (float "%i0%i1") (unsigned_float "%i0%u1")
+                               (float_extend "%i0%i1")
+                               (float_truncate "%i0%i1")])
+
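+;; The cvt_operands attribute supplies the destination and source type
+;; suffixes via the %i/%u output modifiers, so the templates below expand to
+;; instructions such as v_cvt_f32_i32 or v_cvt_f64_f32.
+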
+(define_insn "<cvt_name><CVT_FROM_MODE:mode><CVT_TO_MODE:mode>2"
+  [(set (match_operand:CVT_TO_MODE 0 "register_operand"           "=  v")
+       (cvt_op:CVT_TO_MODE
+         (match_operand:CVT_FROM_MODE 1 "gcn_alu_operand" "vSvB")))]
+  "gcn_valid_cvt_p (<CVT_FROM_MODE:MODE>mode, <CVT_TO_MODE:MODE>mode,
+                   <cvt_name>_cvt)"
+  "v_cvt<cvt_operands>\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+(define_insn "<cvt_name><VCVT_FROM_MODE:mode><VCVT_TO_MODE:mode>2<exec>"
+  [(set (match_operand:VCVT_TO_MODE 0 "register_operand"    "=  v")
+       (cvt_op:VCVT_TO_MODE
+         (match_operand:VCVT_FROM_MODE 1 "gcn_alu_operand" "vSvB")))]
+  "gcn_valid_cvt_p (<VCVT_FROM_MODE:MODE>mode, <VCVT_TO_MODE:MODE>mode,
+                   <cvt_name>_cvt)"
+  "v_cvt<cvt_operands>\t%0, %1"
+  [(set_attr "type" "vop1")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Int/int conversions
+
+;; GCC can already do these for scalar types, but not for vector types.
+;; Unfortunately you can't just do SUBREG on a vector to select the low part,
+;; so a few tricks are needed here.
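+;;
+;; For example, truncating V64DI to V64SI below just selects the low 32-bit
+;; register of each lane pair via gcn_operand_part, which amounts to a plain
+;; vector move (or to no code at all when the registers already overlap).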
+
+(define_insn_and_split "vec_truncatev64div64si"
+  [(set (match_operand:V64SI 0 "register_operand"   "=v,&v")
+       (truncate:V64SI
+         (match_operand:V64DI 1 "register_operand" " 0, v")))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  {
+    operands[1] = gcn_operand_part (V64SImode, operands[1], 0);
+  }
+  [(set_attr "type" "vop2")
+   (set_attr "length" "0,4")])
+
+(define_insn_and_split "vec_truncatev64div64si_exec"
+  [(set (match_operand:V64SI 0 "register_operand"           "=v,&v")
+       (vec_merge:V64SI
+         (truncate:V64SI
+           (match_operand:V64DI 1 "register_operand"        " 0, v"))
+         (match_operand:V64SI 2 "gcn_alu_or_unspec_operand" "U0,U0")
+         (match_operand:DI 3 "gcn_exec_operand"             " e, e")))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+                  (vec_merge:V64SI (match_dup 1) (match_dup 2) (match_dup 3)))
+             (clobber (scratch:V64DI))])]
+  {
+    operands[1] = gcn_operand_part (V64SImode, operands[1], 0);
+  }
+  [(set_attr "type" "vop2")
+   (set_attr "length" "0,4")])
+
+;; }}}
+;; {{{ Vector comparison/merge
+
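+;; The result of a vector comparison is a 64-bit lane mask.  Depending on
+;; the alternative it is written to VCC, to the EXEC register (using
+;; v_cmpx), or to a scalar register pair (using the VOP3a encoding).
+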
+(define_insn "vec_cmp<mode>di"
+  [(set (match_operand:DI 0 "register_operand"       "=cV,cV,  e, e,Sg,Sg")
+       (match_operator 1 "comparison_operator"
+         [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand"
+                                                     "vSv, B,vSv, B, v,vA")
+          (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+                                                     "  v, v,  v, v,vA, v")]))
+   (clobber (match_scratch:DI 4                              "= X, X, cV,cV, X, X"))]
+  ""
+  "@
+   v_cmp%E1\tvcc, %2, %3
+   v_cmp%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmp%E1\t%0, %2, %3
+   v_cmp%E1\t%0, %2, %3"
+  [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a")
+   (set_attr "length" "4,8,4,8,8,8")])
+
+(define_expand "vec_cmpu<mode>di"
+  [(match_operand:DI 0 "register_operand")
+   (match_operator 1 "comparison_operator"
+     [(match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+      (match_operand:VEC_1REG_INT_MODE 3 "gcn_vop3_operand")])]
+  ""
+  {
+    /* Unsigned comparisons use the same patterns as signed comparisons,
+       except that they use unsigned operators (e.g. LTU vs LT).
+       The '%E1' directive then does the Right Thing.  */
+    emit_insn (gen_vec_cmp<mode>di (operands[0], operands[1], operands[2],
+                                   operands[3]));
+    DONE;
+  })
+
+(define_insn "vec_cmp<mode>di_exec"
+  [(set (match_operand:DI 0 "register_operand"        "=cV,cV,  e, e,Sg,Sg")
+       (and:DI
+         (match_operator 1 "comparison_operator"
+           [(match_operand:VEC_1REG_MODE 2 "gcn_alu_operand"
+                                                      "vSv, B,vSv, B, v,vA")
+            (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+                                                      "  v, v,  v, v,vA, v")])
+         (match_operand:DI 4 "gcn_exec_reg_operand"   "  e, e,  e, e, e, e")))
+   (clobber (match_scratch:DI 5                               "= X, X, cV,cV, X, X"))]
+  ""
+  "@
+   v_cmp%E1\tvcc, %2, %3
+   v_cmp%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmp%E1\t%0, %2, %3
+   v_cmp%E1\t%0, %2, %3"
+  [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a")
+   (set_attr "length" "4,8,4,8,8,8")])
+
+(define_insn "vec_cmp<mode>di_dup"
+  [(set (match_operand:DI 0 "register_operand"            "=cV,cV, e,e,Sg")
+       (match_operator 1 "comparison_operator"
+         [(vec_duplicate:VEC_1REG_MODE
+            (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand"
+                                                          " Sv, B,Sv,B, A"))
+          (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+                                                          "  v, v, v,v, v")]))
+   (clobber (match_scratch:DI 4                                   "= X,X,cV,cV, X"))]
+  ""
+  "@
+   v_cmp%E1\tvcc, %2, %3
+   v_cmp%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmp%E1\t%0, %2, %3"
+  [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a")
+   (set_attr "length" "4,8,4,8,8")])
+
+(define_insn "vec_cmp<mode>di_dup_exec"
+  [(set (match_operand:DI 0 "register_operand"             "=cV,cV, e,e,Sg")
+       (and:DI
+         (match_operator 1 "comparison_operator"
+           [(vec_duplicate:VEC_1REG_MODE
+              (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand"
+                                                           " Sv, B,Sv,B, A"))
+            (match_operand:VEC_1REG_MODE 3 "gcn_vop3_operand"
+                                                           "  v, v, v,v, v")])
+         (match_operand:DI 4 "gcn_exec_reg_operand"        "  e, e, e,e, e")))
+   (clobber (match_scratch:DI 5                                    "= X,X,cV,cV, X"))]
+  ""
+  "@
+   v_cmp%E1\tvcc, %2, %3
+   v_cmp%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmpx%E1\tvcc, %2, %3
+   v_cmp%E1\t%0, %2, %3"
+  [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a")
+   (set_attr "length" "4,8,4,8,8")])
+
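+;; vcond_mask simply merges the two value operands under the DImode lane
+;; mask; the vcond and vcondu expanders below first build that mask with
+;; vec_cmp and then reuse vcond_mask.
+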
+(define_expand "vcond_mask_<mode>di"
+  [(parallel
+    [(set (match_operand:VEC_REG_MODE 0 "register_operand" "")
+         (vec_merge:VEC_REG_MODE
+           (match_operand:VEC_REG_MODE 1 "gcn_vop3_operand" "")
+           (match_operand:VEC_REG_MODE 2 "gcn_alu_operand" "")
+           (match_operand:DI 3 "register_operand" "")))
+     (clobber (scratch:V64DI))])]
+  ""
+  "")
+
+(define_expand "vcond<VEC_1REG_MODE:mode><VEC_1REG_ALT:mode>"
+  [(match_operand:VEC_1REG_MODE 0 "register_operand")
+   (match_operand:VEC_1REG_MODE 1 "gcn_vop3_operand")
+   (match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+   (match_operator 3 "comparison_operator"
+     [(match_operand:VEC_1REG_ALT 4 "gcn_alu_operand")
+      (match_operand:VEC_1REG_ALT 5 "gcn_vop3_operand")])]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (DImode);
+    emit_insn (gen_vec_cmp<mode>di (tmp, operands[3], operands[4],
+                                   operands[5]));
+    emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+                                       tmp));
+    DONE;
+  })
+
+(define_expand "vcond<VEC_1REG_MODE:mode><VEC_1REG_ALT:mode>_exec"
+  [(match_operand:VEC_1REG_MODE 0 "register_operand")
+   (match_operand:VEC_1REG_MODE 1 "gcn_vop3_operand")
+   (match_operand:VEC_1REG_MODE 2 "gcn_alu_operand")
+   (match_operator 3 "comparison_operator"
+     [(match_operand:VEC_1REG_ALT 4 "gcn_alu_operand")
+      (match_operand:VEC_1REG_ALT 5 "gcn_vop3_operand")])
+   (match_operand:DI 6 "gcn_exec_reg_operand" "e")]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (DImode);
+    emit_insn (gen_vec_cmp<mode>di_exec (tmp, operands[3], operands[4],
+                                        operands[5], operands[6]));
+    emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+                                       tmp));
+    DONE;
+  })
+
+(define_expand "vcondu<VEC_1REG_INT_MODE:mode><VEC_1REG_INT_ALT:mode>"
+  [(match_operand:VEC_1REG_INT_MODE 0 "register_operand")
+   (match_operand:VEC_1REG_INT_MODE 1 "gcn_vop3_operand")
+   (match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+   (match_operator 3 "comparison_operator"
+     [(match_operand:VEC_1REG_INT_ALT 4 "gcn_alu_operand")
+      (match_operand:VEC_1REG_INT_ALT 5 "gcn_vop3_operand")])]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (DImode);
+    emit_insn (gen_vec_cmp<mode>di (tmp, operands[3], operands[4],
+                                   operands[5]));
+    emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+                                       tmp));
+    DONE;
+  })
+
+(define_expand "vcondu<VEC_1REG_INT_MODE:mode><VEC_1REG_INT_ALT:mode>_exec"
+  [(match_operand:VEC_1REG_INT_MODE 0 "register_operand")
+   (match_operand:VEC_1REG_INT_MODE 1 "gcn_vop3_operand")
+   (match_operand:VEC_1REG_INT_MODE 2 "gcn_alu_operand")
+   (match_operator 3 "comparison_operator"
+     [(match_operand:VEC_1REG_INT_ALT 4 "gcn_alu_operand")
+      (match_operand:VEC_1REG_INT_ALT 5 "gcn_vop3_operand")])
+   (match_operand:DI 6 "gcn_exec_reg_operand" "e")]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (DImode);
+    emit_insn (gen_vec_cmp<mode>di_exec (tmp, operands[3], operands[4],
+                                        operands[5], operands[6]));
+    emit_insn (gen_vcond_mask_<mode>di (operands[0], operands[1], operands[2],
+                                       tmp));
+    DONE;
+  })
+
+;; }}}
+;; {{{ Fully masked loop support
+
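+;; while_ult computes the mask of lanes for which op1 + lane < op2.  With
+;; constant bounds this reduces to an immediate mask; otherwise it is done
+;; with a vector compare against VGPR 1, used here as the vector of lane
+;; numbers 0, 1, 2, ... 63.
+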
+(define_expand "while_ultsidi"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:SI 1 "")
+   (match_operand:SI 2 "")]
+  ""
+  {
+    if (GET_CODE (operands[1]) != CONST_INT
+       || GET_CODE (operands[2]) != CONST_INT)
+      {
+       rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+       rtx tmp = _0_1_2_3;
+       if (GET_CODE (operands[1]) != CONST_INT
+           || INTVAL (operands[1]) != 0)
+         {
+           tmp = gen_reg_rtx (V64SImode);
+           emit_insn (gen_addv64si3_dup (tmp, _0_1_2_3, operands[1]));
+         }
+       emit_insn (gen_vec_cmpv64sidi_dup (operands[0],
+                                          gen_rtx_GT (VOIDmode, 0, 0),
+                                          operands[2], tmp));
+      }
+    else
+      {
+       HOST_WIDE_INT diff = INTVAL (operands[2]) - INTVAL (operands[1]);
+       HOST_WIDE_INT mask = (diff >= 64 ? -1
+                             : ~((unsigned HOST_WIDE_INT)-1 << diff));
+       emit_move_insn (operands[0], gen_rtx_CONST_INT (VOIDmode, mask));
+      }
+    DONE;
+  })
+
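+;; Masked loads and stores are implemented as gathers and scatters: the
+;; scalar base address is expanded to a vector of per-lane addresses and the
+;; mask is used as the EXEC value for the memory access.
+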
+(define_expand "maskload<mode>di"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:VEC_REG_MODE 1 "memory_operand")
+   (match_operand 2 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, operands[2]);
+    rtx addr = gcn_expand_scalar_to_vector_address
+               (<MODE>mode, exec, operands[1], gen_rtx_SCRATCH (V64DImode));
+    rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
+    rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
+    rtx undef = gcn_gen_undef (<MODE>mode);
+    emit_insn (gen_gather<mode>_expr_exec (operands[0], addr, as, v, undef,
+                                          exec));
+    DONE;
+  })
+
+(define_expand "maskstore<mode>di"
+  [(match_operand:VEC_REG_MODE 0 "memory_operand")
+   (match_operand:VEC_REG_MODE 1 "register_operand")
+   (match_operand 2 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, operands[2]);
+    rtx addr = gcn_expand_scalar_to_vector_address
+               (<MODE>mode, exec, operands[0], gen_rtx_SCRATCH (V64DImode));
+    rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[0]));
+    rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[0]));
+    emit_insn (gen_scatter<mode>_expr_exec (addr, operands[1], as, v, exec));
+    DONE;
+  })
+
+(define_expand "mask_gather_load<mode>"
+  [(match_operand:VEC_REG_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand 2 "register_operand")
+   (match_operand 3 "immediate_operand")
+   (match_operand:SI 4 "gcn_alu_operand")
+   (match_operand:DI 5 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, operands[5]);
+
+    /* TODO: more conversions will be needed when more types are vectorized. */
+    if (GET_MODE (operands[2]) == V64DImode)
+      {
+       rtx tmp = gen_reg_rtx (V64SImode);
+       emit_insn (gen_vec_truncatev64div64si_exec (tmp, operands[2],
+                                                   gcn_gen_undef (V64SImode),
+                                                   exec));
+       operands[2] = tmp;
+      }
+
+    emit_insn (gen_gather<mode>_exec (operands[0], operands[1], operands[2],
+                                     operands[3], operands[4], exec));
+    DONE;
+  })
+
+(define_expand "mask_scatter_store<mode>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:VEC_REG_MODE 4 "register_operand")
+   (match_operand:DI 5 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, operands[5]);
+
+    /* TODO: more conversions will be needed when more types are
+       vectorized.  */
+    if (GET_MODE (operands[1]) == V64DImode)
+      {
+       rtx tmp = gen_reg_rtx (V64SImode);
+       emit_insn (gen_vec_truncatev64div64si_exec (tmp, operands[1],
+                                                   gcn_gen_undef (V64SImode),
+                                                   exec));
+       operands[1] = tmp;
+      }
+
+    emit_insn (gen_scatter<mode>_exec (operands[0], operands[1], operands[2],
+                                      operands[3], operands[4], exec));
+    DONE;
+  })
+
+; FIXME this should be VEC_REG_MODE, but not all dependencies are implemented.
+(define_mode_iterator COND_MODE [V64SI V64DI V64SF V64DF])
+(define_mode_iterator COND_INT_MODE [V64SI V64DI])
+
+(define_code_iterator cond_op [plus minus])
+
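+; Conditional (predicated) operations, following the cond_* optab layout:
+; operand 1 is the predicate mask, operands 2 and 3 are the inputs, and
+; operand 4 supplies the value for lanes where the predicate is false.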
+(define_expand "cond_<expander><mode>"
+  [(match_operand:COND_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (cond_op:COND_MODE
+     (match_operand:COND_MODE 2 "gcn_alu_operand")
+     (match_operand:COND_MODE 3 "gcn_alu_operand"))
+   (match_operand:COND_MODE 4 "register_operand")]
+  ""
+  {
+    operands[1] = force_reg (DImode, operands[1]);
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+
+    emit_insn (gen_<expander><mode>3_exec (operands[0], operands[2],
+                                          operands[3], operands[4],
+                                          operands[1]));
+    DONE;
+  })
+
+(define_code_iterator cond_bitop [and ior xor])
+
+(define_expand "cond_<expander><mode>"
+  [(match_operand:COND_INT_MODE 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (cond_bitop:COND_INT_MODE
+     (match_operand:COND_INT_MODE 2 "gcn_alu_operand")
+     (match_operand:COND_INT_MODE 3 "gcn_alu_operand"))
+   (match_operand:COND_INT_MODE 4 "register_operand")]
+  ""
+  {
+    operands[1] = force_reg (DImode, operands[1]);
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+
+    emit_insn (gen_<expander><mode>3_exec (operands[0], operands[2],
+                                          operands[3], operands[4],
+                                          operands[1]));
+    DONE;
+  })
+
+;; }}}
+;; {{{ Vector reductions
+
+(define_int_iterator REDUC_UNSPEC [UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+                                  UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+                                  UNSPEC_PLUS_DPP_SHR
+                                  UNSPEC_AND_DPP_SHR
+                                  UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR])
+
+(define_int_iterator REDUC_2REG_UNSPEC [UNSPEC_PLUS_DPP_SHR
+                                       UNSPEC_AND_DPP_SHR
+                                       UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR])
+
+; FIXME: Isn't there a better way of doing this?
+(define_int_attr reduc_unspec [(UNSPEC_SMIN_DPP_SHR "UNSPEC_SMIN_DPP_SHR")
+                              (UNSPEC_SMAX_DPP_SHR "UNSPEC_SMAX_DPP_SHR")
+                              (UNSPEC_UMIN_DPP_SHR "UNSPEC_UMIN_DPP_SHR")
+                              (UNSPEC_UMAX_DPP_SHR "UNSPEC_UMAX_DPP_SHR")
+                              (UNSPEC_PLUS_DPP_SHR "UNSPEC_PLUS_DPP_SHR")
+                              (UNSPEC_AND_DPP_SHR "UNSPEC_AND_DPP_SHR")
+                              (UNSPEC_IOR_DPP_SHR "UNSPEC_IOR_DPP_SHR")
+                              (UNSPEC_XOR_DPP_SHR "UNSPEC_XOR_DPP_SHR")])
+
+(define_int_attr reduc_op [(UNSPEC_SMIN_DPP_SHR "smin")
+                          (UNSPEC_SMAX_DPP_SHR "smax")
+                          (UNSPEC_UMIN_DPP_SHR "umin")
+                          (UNSPEC_UMAX_DPP_SHR "umax")
+                          (UNSPEC_PLUS_DPP_SHR "plus")
+                          (UNSPEC_AND_DPP_SHR "and")
+                          (UNSPEC_IOR_DPP_SHR "ior")
+                          (UNSPEC_XOR_DPP_SHR "xor")])
+
+(define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0")
+                            (UNSPEC_SMAX_DPP_SHR "v_max%i0")
+                            (UNSPEC_UMIN_DPP_SHR "v_min%u0")
+                            (UNSPEC_UMAX_DPP_SHR "v_max%u0")
+                            (UNSPEC_PLUS_DPP_SHR "v_add%u0")
+                            (UNSPEC_AND_DPP_SHR  "v_and%b0")
+                            (UNSPEC_IOR_DPP_SHR  "v_or%b0")
+                            (UNSPEC_XOR_DPP_SHR  "v_xor%b0")])
+
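+; The reduction expanders hand off to gcn_expand_reduc_scalar, which is
+; expected to combine lanes using the *_dpp_shr patterns below with
+; successively larger shift amounts, accumulating the result in lane 63.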
+(define_expand "reduc_<reduc_op>_scal_<mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
+       (unspec:<SCALAR_MODE>
+         [(match_operand:VEC_1REG_MODE 1 "register_operand")]
+         REDUC_UNSPEC))]
+  ""
+  {
+    rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
+                                      <reduc_unspec>);
+
+    /* The result of the reduction is in lane 63 of tmp.  */
+    emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
+
+    DONE;
+  })
+
+(define_expand "reduc_<reduc_op>_scal_v64di"
+  [(set (match_operand:DI 0 "register_operand")
+       (unspec:DI
+         [(match_operand:V64DI 1 "register_operand")]
+         REDUC_2REG_UNSPEC))]
+  ""
+  {
+    rtx tmp = gcn_expand_reduc_scalar (V64DImode, operands[1],
+                                      <reduc_unspec>);
+
+    /* The result of the reduction is in lane 63 of tmp.  */
+    emit_insn (gen_mov_from_lane63_v64di (operands[0], tmp));
+
+    DONE;
+  })
+
+(define_insn "*<reduc_op>_dpp_shr_<mode>"
+  [(set (match_operand:VEC_1REG_MODE 0 "register_operand"   "=v")
+       (unspec:VEC_1REG_MODE
+         [(match_operand:VEC_1REG_MODE 1 "register_operand" "v")
+          (match_operand:VEC_1REG_MODE 2 "register_operand" "v")
+          (match_operand:SI 3 "const_int_operand"           "n")]
+         REDUC_UNSPEC))]
+  "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
+     && <reduc_unspec> == UNSPEC_PLUS_DPP_SHR)"
+  {
+    return gcn_expand_dpp_shr_insn (<MODE>mode, "<reduc_insn>",
+                                   <reduc_unspec>, INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "*<reduc_op>_dpp_shr_v64di"
+  [(set (match_operand:V64DI 0 "register_operand"   "=&v")
+       (unspec:V64DI
+         [(match_operand:V64DI 1 "register_operand" "v0")
+          (match_operand:V64DI 2 "register_operand" "v0")
+          (match_operand:SI 3 "const_int_operand"    "n")]
+         REDUC_2REG_UNSPEC))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 4)
+       (unspec:V64SI
+         [(match_dup 6) (match_dup 8) (match_dup 3)] REDUC_2REG_UNSPEC))
+   (set (match_dup 5)
+       (unspec:V64SI
+         [(match_dup 7) (match_dup 9) (match_dup 3)] REDUC_2REG_UNSPEC))]
+  {
+    operands[4] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[6] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[7] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[8] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[9] = gcn_operand_part (V64DImode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")])
+
+; Special cases for addition.
+
+(define_insn "*plus_carry_dpp_shr_<mode>"
+  [(set (match_operand:VEC_1REG_INT_MODE 0 "register_operand"   "=v")
+       (unspec:VEC_1REG_INT_MODE
+         [(match_operand:VEC_1REG_INT_MODE 1 "register_operand" "v")
+          (match_operand:VEC_1REG_INT_MODE 2 "register_operand" "v")
+          (match_operand:SI 3 "const_int_operand"               "n")]
+         UNSPEC_PLUS_CARRY_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  {
+    const char *insn = TARGET_GCN3 ? "v_add%u0" : "v_add_co%u0";
+    return gcn_expand_dpp_shr_insn (<MODE>mode, insn,
+                                   UNSPEC_PLUS_CARRY_DPP_SHR,
+                                   INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "8")])
+
+(define_insn "*plus_carry_in_dpp_shr_v64si"
+  [(set (match_operand:V64SI 0 "register_operand"   "=v")
+       (unspec:V64SI
+         [(match_operand:V64SI 1 "register_operand" "v")
+          (match_operand:V64SI 2 "register_operand" "v")
+          (match_operand:SI 3 "const_int_operand"   "n")
+          (match_operand:DI 4 "register_operand"   "cV")]
+         UNSPEC_PLUS_CARRY_IN_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  {
+    const char *insn = TARGET_GCN3 ? "v_addc%u0" : "v_addc_co%u0";
+    return gcn_expand_dpp_shr_insn (V64SImode, insn,
+                                   UNSPEC_PLUS_CARRY_IN_DPP_SHR,
+                                   INTVAL (operands[3]));
+  }
+  [(set_attr "type" "vop_dpp")
+   (set_attr "length" "8")])
+
+(define_insn_and_split "*plus_carry_dpp_shr_v64di"
+  [(set (match_operand:V64DI 0 "register_operand"   "=&v")
+       (unspec:V64DI
+         [(match_operand:V64DI 1 "register_operand" "v0")
+          (match_operand:V64DI 2 "register_operand" "v0")
+          (match_operand:SI 3 "const_int_operand"    "n")]
+         UNSPEC_PLUS_CARRY_DPP_SHR))
+   (clobber (reg:DI VCC_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 4)
+               (unspec:V64SI
+                 [(match_dup 6) (match_dup 8) (match_dup 3)]
+                 UNSPEC_PLUS_CARRY_DPP_SHR))
+             (clobber (reg:DI VCC_REG))])
+   (parallel [(set (match_dup 5)
+               (unspec:V64SI
+                 [(match_dup 7) (match_dup 9) (match_dup 3) (reg:DI VCC_REG)]
+                 UNSPEC_PLUS_CARRY_IN_DPP_SHR))
+             (clobber (reg:DI VCC_REG))])]
+  {
+    operands[4] = gcn_operand_part (V64DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (V64DImode, operands[0], 1);
+    operands[6] = gcn_operand_part (V64DImode, operands[1], 0);
+    operands[7] = gcn_operand_part (V64DImode, operands[1], 1);
+    operands[8] = gcn_operand_part (V64DImode, operands[2], 0);
+    operands[9] = gcn_operand_part (V64DImode, operands[2], 1);
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")])
+
+; Instructions to move a scalar value from lane 63 of a vector register.
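+; The SGPR alternative reads the lane directly with v_readlane_b32; the
+; VGPR alternative uses a DPP wavefront rotate, which is assumed to place
+; lane 63's value into lane 0 of the destination.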
+(define_insn "mov_from_lane63_<mode>"
+  [(set (match_operand:<SCALAR_MODE> 0 "register_operand"  "=Sg,v")
+       (unspec:<SCALAR_MODE>
+         [(match_operand:VEC_1REG_MODE 1 "register_operand" "v,v")]
+         UNSPEC_MOV_FROM_LANE63))]
+  ""
+  "@
+   v_readlane_b32\t%0, %1, 63
+   v_mov_b32\t%0, %1 wave_ror:1"
+  [(set_attr "type" "vop3a,vop_dpp")
+   (set_attr "exec" "none,*")
+   (set_attr "length" "8")])
+
+(define_insn "mov_from_lane63_v64di"
+  [(set (match_operand:DI 0 "register_operand"      "=Sg,v")
+       (unspec:DI
+         [(match_operand:V64DI 1 "register_operand"   "v,v")]
+         UNSPEC_MOV_FROM_LANE63))]
+  ""
+  "@
+   v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
+   * if (REGNO (operands[0]) <= REGNO (operands[1]))   \
+       return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\"     \
+             \"v_mov_b32\t%H0, %H1 wave_ror:1\";       \
+     else                                              \
+       return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\"     \
+             \"v_mov_b32\t%L0, %L1 wave_ror:1\";"
+  [(set_attr "type" "vop3a,vop_dpp")
+   (set_attr "exec" "none,*")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Miscellaneous
+
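+; vec_series computes lane I = operands[1] + I * operands[2].  VGPR1 is
+; assumed to be pre-loaded with the lane indices {0, 1, 2, ..., 63}, so the
+; series is formed with one multiply and one add per lane.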
+(define_expand "vec_seriesv64si"
+  [(match_operand:V64SI 0 "register_operand")
+   (match_operand:SI 1 "gcn_alu_operand")
+   (match_operand:SI 2 "gcn_alu_operand")]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (V64SImode);
+    rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+
+    emit_insn (gen_mulv64si3_dup (tmp, v1, operands[2]));
+    emit_insn (gen_addv64si3_dup (operands[0], tmp, operands[1]));
+    DONE;
+  })
+
+(define_expand "vec_seriesv64di"
+  [(match_operand:V64DI 0 "register_operand")
+   (match_operand:DI 1 "gcn_alu_operand")
+   (match_operand:DI 2 "gcn_alu_operand")]
+  ""
+  {
+    rtx tmp = gen_reg_rtx (V64DImode);
+    rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+
+    emit_insn (gen_mulv64di3_zext_dup2 (tmp, v1, operands[2]));
+    emit_insn (gen_addv64di3_dup (operands[0], tmp, operands[1]));
+    DONE;
+  })
+
+;; }}}
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
new file mode 100644 (file)
index 0000000..4573a4c
--- /dev/null
@@ -0,0 +1,1857 @@
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+
+;; This file is free software; you can redistribute it and/or modify it under
+;; the terms of the GNU General Public License as published by the Free
+;; Software Foundation; either version 3 of the License, or (at your option)
+;; any later version.
+
+;; This file is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;; for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;;- See file "rtl.def" for documentation on define_insn, match_*, et al.
+
+(include "predicates.md")
+(include "constraints.md")
+
+;; {{{ Constants and enums
+
+; Named registers
+(define_constants
+  [(FIRST_SGPR_REG              0)
+   (LAST_SGPR_REG               101)
+   (FLAT_SCRATCH_REG            102)
+   (FLAT_SCRATCH_LO_REG                 102)
+   (FLAT_SCRATCH_HI_REG                 103)
+   (XNACK_MASK_REG              104)
+   (XNACK_MASK_LO_REG           104)
+   (XNACK_MASK_HI_REG           105)
+   (VCC_REG                     106)
+   (VCC_LO_REG                  106)
+   (VCC_HI_REG                  107)
+   (VCCZ_REG                    108)
+   (TBA_REG                     109)
+   (TBA_LO_REG                  109)
+   (TBA_HI_REG                  110)
+   (TMA_REG                     111)
+   (TMA_LO_REG                  111)
+   (TMA_HI_REG                  112)
+   (TTMP0_REG                   113)
+   (TTMP11_REG                  124)
+   (M0_REG                      125)
+   (EXEC_REG                    126)
+   (EXEC_LO_REG                         126)
+   (EXEC_HI_REG                         127)
+   (EXECZ_REG                   128)
+   (SCC_REG                     129)
+   (FIRST_VGPR_REG              160)
+   (LAST_VGPR_REG               415)])
+
+(define_constants
+  [(SP_REGNUM 16)
+   (LR_REGNUM 18)
+   (AP_REGNUM 416)
+   (FP_REGNUM 418)])
+
+(define_c_enum "unspecv" [
+  UNSPECV_PROLOGUE_USE
+  UNSPECV_KERNEL_RETURN
+  UNSPECV_BARRIER
+  UNSPECV_ATOMIC
+  UNSPECV_ICACHE_INV])
+
+(define_c_enum "unspec" [
+  UNSPEC_VECTOR
+  UNSPEC_BPERMUTE
+  UNSPEC_SGPRBASE
+  UNSPEC_MEMORY_BARRIER
+  UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
+  UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
+  UNSPEC_PLUS_DPP_SHR
+  UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
+  UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
+  UNSPEC_MOV_FROM_LANE63
+  UNSPEC_GATHER
+  UNSPEC_SCATTER])
+
+;; }}}
+;; {{{ Attributes
+
+; Instruction type (encoding) as described in the ISA specification.
+; The following table summarizes possible operands of individual instruction
+; types and corresponding constraints.
+;
+; sop2 - scalar, two inputs, one output
+;       ssrc0/ssrc1: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec,
+;                    vccz,execz,scc,inline immediate,fp inline immediate
+;       sdst: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
+;
+;       Constraints "=SD, SD", "SSA,SSB","SSB,SSA"
+;
+; sopk - scalar, inline constant input, one output
+;       simm16: 16bit inline constant
+;       sdst: same as sop2/ssrc0
+;
+;       Constraints "=SD", "J"
+;
+; sop1 - scalar, one input, one output
+;       ssrc0: same as sop2/ssrc0.  FIXME: the manual omits VCCZ
+;       sdst: same as sop2/sdst
+;
+;       Constraints "=SD", "SSA"
+;
+; sopc - scalar, two inputs, one comparison
+;       ssrc0: same as sop2/ssrc0.
+;
+;       Constraints "SSI,SSA","SSA,SSI"
+;
+; sopp - scalar, one constant input, one special
+;       simm16
+;
+; smem - scalar memory
+;       sbase: aligned pair of sgprs.  Specify {size[15:0], base[47:0]} in
+;               dwords
+;       sdata: sgpr0-102, flat_scratch, xnack, vcc, tba, tma
+;       offset: sgpr or 20bit unsigned byte offset
+;
+; vop2 - vector, two inputs, one output
+;       vsrc0: sgpr0-102,flat_scratch,xnack,vcc,tba,ttmp0-11,m0,exec,
+;              inline constant -16 to 64, fp inline immediate, vccz, execz,
+;              scc, lds, literal constant, vgpr0-255
+;       vsrc1: vgpr0-255
+;       vdst: vgpr0-255
+;       Limitations: At most one SGPR, at most one constant
+;                    if constant is used, SGPR must be M0
+;                    Only SRC0 can be LDS_DIRECT
+;
+;       constraints: "=v", "vBSv", "v"
+;
+; vop1 - vector, one input, one output
+;       vsrc0: same as vop2/src0
+;       vdst: vgpr0-255
+;
+;       constraints: "=v", "vBSv"
+;
+; vopc - vector, two inputs, one comparison output
+;       vsrc0: same as vop2/src0
+;       vsrc1: vgpr0-255
+;       vdst:
+;
+;       constraints: "vASv", "v"
+;
+; vop3a - vector, three inputs, one output
+;       vdst: vgpr0-255, for v_cmp sgpr or vcc
+;       abs,clamp
+;       vsrc0: sgpr0-102,vcc,tba,ttmp0-11,m0,exec,
+;              inline constant -16 to 64, fp inline immediate, vccz, execz,
+;              scc, lds_direct
+;              FIXME: really missing 1/pi? really 104 SGPRs
+;
+; vop3b - vector, three inputs, one vector output, one scalar output
+;       vsrc0,vsrc1,vsrc2: same as vop3a vsrc0
+;       vdst: vgpr0-255
+;       sdst: sgpr0-103/vcc/tba/tma/ttmp0-11
+;
+; vop_sdwa - second dword for vop1/vop2/vopc specifying sub-dword addressing
+;       src0: vgpr0-255
+;       dst_sel: BYTE_0-3, WORD_0-1, DWORD
+;       dst_unused: UNUSED_PAD, UNUSED_SEXT, UNUSED_PRESERVE
+;       clamp: true/false
+;       src0_sel: BYTE_0-3, WORD_0-1, DWORD
+;       flags: src0_sext, src0_neg, src0_abs, src1_sel, src1_sext, src1_neg,
+;              src1_abs
+;
+; vop_dpp - second dword for vop1/vop2/vopc for specifying data-parallel ops
+;       src0: vgpr0-255
+;       dpp_ctrl: quad_perm, row_sl0-15, row_sr0-15, row_rr0-15, wf_sl1,
+;                wf_rl1, wf_sr1, wf_rr1, row_mirror, row_half_mirror,
+;                bcast15, bcast31
+;       flags: src0_neg, src0_abs, src1_neg, src1_abs
+;       bank_mask: 4-bit mask
+;       row_mask: 4-bit mask
+;
+; ds - Local and global data share instructions.
+;       offset0: 8-bit constant
+;       offset1: 8-bit constant
+;       flag: gds
+;       addr: vgpr0-255
+;       data0: vgpr0-255
+;       data1: vgpr0-255
+;       vdst: vgpr0-255
+;
+; mubuf - Untyped memory buffer operation. First word with LDS, second word
+;        non-LDS.
+;       offset: 12-bit constant
+;       vaddr: vgpr0-255
+;       vdata: vgpr0-255
+;       srsrc: sgpr0-102
+;       soffset: sgpr0-102
+;       flags: offen, idxen, glc, lds, slc, tfe
+;
+; mtbuf - Typed memory buffer operation. Two words
+;       offset: 12-bit constant
+;       dfmt: 4-bit constant
+;       nfmt: 3-bit constant
+;       vaddr: vgpr0-255
+;       vdata: vgpr0-255
+;       srsrc: sgpr0-102
+;       soffset: sgpr0-102
+;       flags: offen, idxen, glc, lds, slc, tfe
+;
+; flat - flat or global memory operations
+;       flags: glc, slc
+;       addr: vgpr0-255
+;       data: vgpr0-255
+;       vdst: vgpr0-255
+;
+; mult - expands to multiple instructions (pseudo encoding)
+;
+; vmult - as mult, when a vector instruction is used.
+
+(define_attr "type"
+            "unknown,sop1,sop2,sopk,sopc,sopp,smem,ds,vop2,vop1,vopc,
+             vop3a,vop3b,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,vmult"
+            (const_string "unknown"))
+
+; Says whether the instruction is executed in the scalar or vector unit.
+
+(define_attr "unit" "unknown,scalar,vector"
+  (cond [(eq_attr "type" "sop1,sop2,sopk,sopc,sopp,smem,mult")
+           (const_string "scalar")
+        (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,
+                         vop_sdwa,vop_dpp,flat,vmult")
+           (const_string "vector")]
+        (const_string "unknown")))
+
+; All vector instructions run as 64 threads, predicated by the EXEC
+; register.  Scalar operations in vector registers require a single lane
+; enabled, vector moves require a full set of lanes enabled, and most vector
+; operations handle the lane masking themselves.
+; The md_reorg pass is responsible for ensuring that EXEC is set appropriately
+; according to the following settings:
+;   auto   - md_reorg will inspect def/use to determine what to do.
+;   none   - exec is not needed.
+;   single - disable all but lane zero.
+;   full   - enable all lanes.
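+; For example, the v_readlane/v_writelane alternatives in the move patterns
+; below use "none", since they take an explicit lane number rather than
+; relying on EXEC.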
+
+(define_attr "exec" "auto,none,single,full"
+   (const_string "auto"))
+
+; Infer the (worst-case) length from the instruction type by default.  Many
+; types can have an optional immediate word following, which we include here.
+; "Multiple" types are counted as two 64-bit instructions.  This is just a
+; default fallback: it can be overridden per-alternative in insn patterns for
+; greater accuracy.
+
+(define_attr "length" ""
+  (cond [(eq_attr "type" "sop1") (const_int 8)
+        (eq_attr "type" "sop2") (const_int 8)
+        (eq_attr "type" "sopk") (const_int 8)
+        (eq_attr "type" "sopc") (const_int 8)
+        (eq_attr "type" "sopp") (const_int 4)
+        (eq_attr "type" "smem") (const_int 8)
+        (eq_attr "type" "ds")   (const_int 8)
+        (eq_attr "type" "vop1") (const_int 8)
+        (eq_attr "type" "vop2") (const_int 8)
+        (eq_attr "type" "vopc") (const_int 8)
+        (eq_attr "type" "vop3a") (const_int 8)
+        (eq_attr "type" "vop3b") (const_int 8)
+        (eq_attr "type" "vop_sdwa") (const_int 8)
+        (eq_attr "type" "vop_dpp") (const_int 8)
+        (eq_attr "type" "flat") (const_int 8)
+        (eq_attr "type" "mult") (const_int 16)
+        (eq_attr "type" "vmult") (const_int 16)]
+       (const_int 4)))
+
+; Disable alternatives that only apply to specific ISA variants.
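+; An alternative marked "gcn5" is only enabled when TARGET_GCN5_PLUS holds;
+; everything else defaults to "gcn3", which is always enabled.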
+
+(define_attr "gcn_version" "gcn3,gcn5" (const_string "gcn3"))
+
+(define_attr "enabled" ""
+  (cond [(eq_attr "gcn_version" "gcn3") (const_int 1)
+        (and (eq_attr "gcn_version" "gcn5")
+             (ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
+          (const_int 1)]
+       (const_int 0)))
+
+; We need to be able to identify v_readlane and v_writelane with
+; SGPR lane selection in order to handle "Manually Inserted Wait States".
+
+(define_attr "laneselect" "yes,no" (const_string "no"))
+
+;; }}}
+;; {{{ Iterators useful across the whole machine description
+
+(define_mode_iterator SIDI [SI DI])
+(define_mode_iterator SFDF [SF DF])
+(define_mode_iterator SISF [SI SF])
+(define_mode_iterator QIHI [QI HI])
+(define_mode_iterator DIDF [DI DF])
+
+;; }}}
+;; {{{ Attributes.
+
+; Translate RTX code into GCN instruction mnemonics with and without
+; suffixes such as _b32, etc.
+
+(define_code_attr mnemonic
+  [(minus "sub%i")
+   (plus "add%i")
+   (ashift "lshl%b")
+   (lshiftrt "lshr%b")
+   (ashiftrt "ashr%i")
+   (and "and%B")
+   (ior "or%B")
+   (xor "xor%B")
+   (mult "mul%i")
+   (smin "min%i")
+   (smax "max%i")
+   (umin "min%u")
+   (umax "max%u")
+   (not "not%b")
+   (popcount "bcnt_u32%b")])
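+
+; The %i/%u/%b/%B modifiers above are assumed to expand to mode-dependent
+; type-size suffixes (e.g. _i32, _u32 and _b32 for 32-bit operands), so an
+; SImode plus becomes "add_i32" after the modifier is applied.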
+
+(define_code_attr bare_mnemonic
+  [(plus "add")
+   (minus "sub")
+   (and "and")
+   (ior "or")
+   (xor "xor")])
+
+(define_code_attr s_mnemonic
+  [(not "not%b")
+   (popcount "bcnt1_i32%b")])
+
+(define_code_attr revmnemonic
+  [(minus "subrev%i")
+   (ashift "lshlrev%b")
+   (lshiftrt "lshrrev%b")
+   (ashiftrt "ashrrev%i")])
+
+; Translate RTX code into corresponding expander name.
+
+(define_code_attr expander
+  [(and "and")
+   (ior "ior")
+   (xor "xor")
+   (plus "add")
+   (minus "sub")
+   (ashift "ashl")
+   (lshiftrt "lshr")
+   (ashiftrt "ashr")
+   (mult "mul")
+   (smin "smin")
+   (smax "smax")
+   (umin "umin")
+   (umax "umax")
+   (not "one_cmpl")
+   (popcount "popcount")])
+
+;; }}}
+;; {{{ Miscellaneous instructions
+
+(define_insn "nop"
+  [(const_int 0)]
+  ""
+  "s_nop\t0x0"
+  [(set_attr "type" "sopp")])
+
+; FIXME: What should the value of the immediate be? Zero is disallowed, so
+; pick 1 for now.
+(define_insn "trap"
+  [(trap_if (const_int 1) (const_int 0))]
+  ""
+  "s_trap\t1"
+  [(set_attr "type" "sopp")])
+
+;; }}}
+;; {{{ Moves
+
+;; All scalar modes we support moves in.
+(define_mode_iterator MOV_MODE [BI QI HI SI DI TI SF DF])
+
+; This is the entry point for creating all kinds of scalar moves,
+; including reloads and symbols.
+
+(define_expand "mov<mode>"
+  [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
+       (match_operand:MOV_MODE 1 "general_operand"))]
+  ""
+  {
+    if (MEM_P (operands[0]))
+      operands[1] = force_reg (<MODE>mode, operands[1]);
+
+    if (!lra_in_progress && !reload_completed
+       && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1]))
+      {
+       /* Something is probably trying to generate a move
+          which can only work indirectly.
+          E.g. Move from LDS memory to SGPR hardreg
+            or MEM:QI to SGPR.  */
+       rtx tmpreg = gen_reg_rtx (<MODE>mode);
+       emit_insn (gen_mov<mode> (tmpreg, operands[1]));
+       emit_insn (gen_mov<mode> (operands[0], tmpreg));
+       DONE;
+      }
+
+    if (<MODE>mode == DImode
+       && (GET_CODE (operands[1]) == SYMBOL_REF
+           || GET_CODE (operands[1]) == LABEL_REF))
+      {
+       emit_insn (gen_movdi_symbol (operands[0], operands[1]));
+       DONE;
+      }
+  })
+
+; Split invalid moves into two valid moves
+
+(define_split
+  [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
+       (match_operand:MOV_MODE 1 "general_operand"))]
+  "!reload_completed && !lra_in_progress
+   && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))]
+  {
+    operands[2] = gen_reg_rtx (<MODE>mode);
+  })
+
+; We need BImode move so we can reload flags registers.
+
+(define_insn "*movbi"
+  [(set (match_operand:BI 0 "nonimmediate_operand"
+                                   "=Sg,   v,Sg,cs,cV,cV,Sm,RS, v,RF, v,RM")
+       (match_operand:BI 1 "gcn_load_operand"
+                                   "SSA,vSvA, v,SS, v,SS,RS,Sm,RF, v,RM, v"))]
+  ""
+  {
+    /* SCC as an operand is currently not accepted by the LLVM assembler, so
+       we emit bytes directly as a workaround.  */
+    switch (which_alternative) {
+    case 0:
+      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+       return "; s_mov_b32\t%0,%1 is not supported by the assembler.\;"
+              ".byte\t0xfd\;"
+              ".byte\t0x0\;"
+              ".byte\t0x80|%R0\;"
+              ".byte\t0xbe";
+      else
+       return "s_mov_b32\t%0, %1";
+    case 1:
+      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
+       return "; v_mov_b32\t%0, %1\;"
+              ".byte\t0xfd\;"
+              ".byte\t0x2\;"
+              ".byte\t((%V0<<1)&0xff)\;"
+              ".byte\t0x7e|(%V0>>7)";
+      else
+       return "v_mov_b32\t%0, %1";
+    case 2:
+      return "v_readlane_b32\t%0, %1, 0";
+    case 3:
+      return "s_cmpk_lg_u32\t%1, 0";
+    case 4:
+      return "v_cmp_ne_u32\tvcc, 0, %1";
+    case 5:
+      if (REGNO (operands[1]) == SCC_REG)
+       return "; s_mov_b32\t%0, %1 is not supported by the assembler.\;"
+              ".byte\t0xfd\;"
+              ".byte\t0x0\;"
+              ".byte\t0xea\;"
+              ".byte\t0xbe\;"
+              "s_mov_b32\tvcc_hi, 0";
+      else
+       return "s_mov_b32\tvcc_lo, %1\;"
+              "s_mov_b32\tvcc_hi, 0";
+    case 6:
+      return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
+    case 7:
+      return "s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)";
+    case 8:
+      return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
+    case 9:
+      return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+    case 10:
+      return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
+    case 11:
+      return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)";
+    default:
+      gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,flat,flat,
+                    flat,flat")
+   (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*")
+   (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12")])
+
+; 32bit move pattern
+
+(define_insn "*mov<mode>_insn"
+  [(set (match_operand:SISF 0 "nonimmediate_operand"
+                 "=SD,SD,SD,SD,RB,Sm,RS,v,Sg, v, v,RF,v,RLRG,   v,SD, v,RM")
+       (match_operand:SISF 1 "gcn_load_operand"
+                 "SSA, J, B,RB,Sm,RS,Sm,v, v,Sv,RF, v,B,   v,RLRG, Y,RM, v"))]
+  ""
+  "@
+  s_mov_b32\t%0, %1
+  s_movk_i32\t%0, %1
+  s_mov_b32\t%0, %1
+  s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+  s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\texpcnt(0)
+  s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+  s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)
+  v_mov_b32\t%0, %1
+  v_readlane_b32\t%0, %1, 0
+  v_writelane_b32\t%0, %1, 0
+  flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+  flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  v_mov_b32\t%0, %1
+  ds_write_b32\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  s_mov_b32\t%0, %1
+  global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
+                    flat,vop1,ds,ds,sop1,flat,flat")
+   (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*")
+   (set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")])
+
+; 8/16bit move pattern
+
+(define_insn "*mov<mode>_insn"
+  [(set (match_operand:QIHI 0 "nonimmediate_operand"
+                                "=SD,SD,SD,v,Sg, v, v,RF,v,RLRG,   v, v,RM")
+       (match_operand:QIHI 1 "gcn_load_operand"
+                                "SSA, J, B,v, v,Sv,RF, v,B,   v,RLRG,RM, v"))]
+  "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
+  "@
+  s_mov_b32\t%0, %1
+  s_movk_i32\t%0, %1
+  s_mov_b32\t%0, %1
+  v_mov_b32\t%0, %1
+  v_readlane_b32\t%0, %1, 0
+  v_writelane_b32\t%0, %1, 0
+  flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+  flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  v_mov_b32\t%0, %1
+  ds_write%b0\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  [(set_attr "type"
+            "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
+   (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*")
+   (set_attr "length" "4,4,8,4,4,4,12,12,8,12,12,12,12")])
+
+; 64bit move pattern
+
+(define_insn_and_split "*mov<mode>_insn"
+  [(set (match_operand:DIDF 0 "nonimmediate_operand"
+                         "=SD,SD,SD,RS,Sm,v, v,Sg, v, v,RF,RLRG,   v, v,RM")
+       (match_operand:DIDF 1 "general_operand"
+                         "SSA, C,DB,Sm,RS,v,DB, v,Sv,RF, v,   v,RLRG,RM, v"))]
+  "GET_CODE(operands[1]) != SYMBOL_REF"
+  "@
+  s_mov_b64\t%0, %1
+  s_mov_b64\t%0, %1
+  #
+  s_store_dwordx2\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+  #
+  #
+  #
+  #
+  flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+  flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  ds_write_b64\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"
+  "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1])
+    && !gcn_sgpr_move_p (operands[0], operands[1]))
+   || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))]
+  {
+    rtx inlo = gen_lowpart (SImode, operands[1]);
+    rtx inhi = gen_highpart_mode (SImode, <MODE>mode, operands[1]);
+    rtx outlo = gen_lowpart (SImode, operands[0]);
+    rtx outhi = gen_highpart_mode (SImode, <MODE>mode, operands[0]);
+
+    /* Ensure that overlapping registers aren't corrupted.  */
+    if (REGNO (outlo) == REGNO (inhi))
+      {
+       operands[0] = outhi;
+       operands[1] = inhi;
+       operands[2] = outlo;
+       operands[3] = inlo;
+      }
+    else
+      {
+       operands[0] = outlo;
+       operands[1] = inlo;
+       operands[2] = outhi;
+       operands[3] = inhi;
+      }
+  }
+  [(set_attr "type" "sop1,sop1,mult,smem,smem,vmult,vmult,vmult,vmult,flat,
+                    flat,ds,ds,flat,flat")
+   (set_attr "length" "4,8,*,12,12,*,*,*,*,12,12,12,12,12,12")])
+
+; 128-bit move.
+
+(define_insn_and_split "*movti_insn"
+  [(set (match_operand:TI 0 "nonimmediate_operand"
+                                     "=SD,RS,Sm,RF, v,v, v,SD,RM, v,RL, v")
+       (match_operand:TI 1 "general_operand"  
+                                     "SSB,Sm,RS, v,RF,v,Sv, v, v,RM, v,RL"))]
+  ""
+  "@
+  #
+  s_store_dwordx4\t%1, %A0\;s_waitcnt\texpcnt(0)
+  s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+  flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+  #
+  #
+  #
+  global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)
+  global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  ds_write_b128\t%A0, %1%O0\;s_waitcnt\texpcnt(0)
+  ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
+  "reload_completed
+   && REG_P (operands[0])
+   && (REG_P (operands[1]) || GET_CODE (operands[1]) == CONST_INT)"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))
+   (set (match_dup 4) (match_dup 5))
+   (set (match_dup 6) (match_dup 7))]
+  {
+    operands[6] = gcn_operand_part (TImode, operands[0], 3);
+    operands[7] = gcn_operand_part (TImode, operands[1], 3);
+    operands[4] = gcn_operand_part (TImode, operands[0], 2);
+    operands[5] = gcn_operand_part (TImode, operands[1], 2);
+    operands[2] = gcn_operand_part (TImode, operands[0], 1);
+    operands[3] = gcn_operand_part (TImode, operands[1], 1);
+    operands[0] = gcn_operand_part (TImode, operands[0], 0);
+    operands[1] = gcn_operand_part (TImode, operands[1], 0);
+  }
+  [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
+                    ds,ds")
+   (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
+
+;; }}}
+;; {{{ Prologue/Epilogue
+
+(define_insn "prologue_use"
+  [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)]
+  ""
+  ""
+  [(set_attr "length" "0")])
+
+(define_expand "prologue"
+  [(const_int 0)]
+  ""
+  {
+    gcn_expand_prologue ();
+    DONE;
+  })
+
+(define_expand "epilogue"
+  [(const_int 0)]
+  ""
+  {
+    gcn_expand_epilogue ();
+    DONE;
+  })
+
+;; }}}
+;; {{{ Control flow
+
+; This pattern must satisfy simplejump_p, which means it cannot be a parallel
+; that clobbers SCC.  Thus, we must preserve SCC if we're generating a long
+; branch sequence.
+
+(define_insn "jump"
+  [(set (pc)
+       (label_ref (match_operand 0)))]
+  ""
+  {
+    if (get_attr_length (insn) == 4)
+      return "s_branch\t%0";
+    else
+      /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG.  */
+      return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
+            ".long\t0xbe9600fd\;"
+            "s_getpc_b64\ts[20:21]\;"
+            "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+            "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+            "s_cmpk_lg_u32\ts22, 0\;"
+            "s_setpc_b64\ts[20:21]";
+  }
+  [(set_attr "type" "sopp")
+   (set (attr "length")
+       (if_then_else (and (ge (minus (match_dup 0) (pc))
+                              (const_int -131072))
+                          (lt (minus (match_dup 0) (pc))
+                              (const_int 131072)))
+                     (const_int 4)
+                     (const_int 32)))])
+
+(define_insn "indirect_jump"
+  [(set (pc)
+       (match_operand:DI 0 "register_operand" "Sg"))]
+  ""
+  "s_setpc_b64\t%0"
+  [(set_attr "type" "sop1")
+   (set_attr "length" "4")])
+
+(define_insn "cjump"
+  [(set (pc)
+       (if_then_else
+         (match_operator:BI 1 "gcn_conditional_operator"
+           [(match_operand:BI 2 "gcn_conditional_register_operand" "ca,cV")
+            (const_int 0)])
+         (label_ref (match_operand 0))
+         (pc)))]
+  ""
+  {
+    if (get_attr_length (insn) == 4)
+      return "s_cbranch%C1\t%0";
+    else
+      {
+       /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG but
+              restores SCC.  */
+       if (REGNO (operands[2]) == SCC_REG)
+         {
+           if (GET_CODE (operands[1]) == EQ)
+             return "s_cbranch%c1\t.Lskip%=\;"
+                    "s_getpc_b64\ts[20:21]\;"
+                    "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+                    "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+                    "s_cmp_lg_u32\t0, 0\;"
+                    "s_setpc_b64\ts[20:21]\n"
+                    ".Lskip%=:";
+           else
+             return "s_cbranch%c1\t.Lskip%=\;"
+                    "s_getpc_b64\ts[20:21]\;"
+                    "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+                    "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+                    "s_cmp_eq_u32\t0, 0\;"
+                    "s_setpc_b64\ts[20:21]\n"
+                    ".Lskip%=:";
+         }
+       else
+         return "s_cbranch%c1\t.Lskip%=\;"
+                "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
+                ".byte\t0xfd\;"
+                ".byte\t0x0\;"
+                ".byte\t0x80|22\;"
+                ".byte\t0xbe\;"
+                "s_getpc_b64\ts[20:21]\;"
+                "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
+                "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
+                "s_cmpk_lg_u32\ts22, 0\;"
+                "s_setpc_b64\ts[20:21]\n"
+                ".Lskip%=:";
+      }
+  }
+  [(set_attr "type" "sopp")
+   (set (attr "length")
+       (if_then_else (and (ge (minus (match_dup 0) (pc))
+                              (const_int -131072))
+                          (lt (minus (match_dup 0) (pc))
+                              (const_int 131072)))
+                     (const_int 4)
+                     (const_int 36)))])
+
+; Returning from a normal function is different to returning from a
+; kernel function.
+
+(define_insn "gcn_return"
+  [(return)]
+  ""
+  {
+    if (cfun && cfun->machine && cfun->machine->normal_function)
+      return "s_setpc_b64\ts[18:19]";
+    else
+      return "s_dcache_wb\;s_endpgm";
+  }
+  [(set_attr "type" "sop1")
+   (set_attr "length" "8")])
+
+(define_expand "call"
+  [(parallel [(call (match_operand 0 "")
+                   (match_operand 1 ""))
+             (clobber (reg:DI LR_REGNUM))
+             (clobber (match_scratch:DI 2))])]
+  ""
+  {})
+
+(define_insn "gcn_simple_call"
+  [(call (mem (match_operand 0 "immediate_operand" "Y,B"))
+        (match_operand 1 "const_int_operand"))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (match_scratch:DI 2 "=&Sg,X"))]
+  ""
+  "@
+  s_getpc_b64\t%2\;s_add_u32\t%L2, %L2, %0@rel32@lo+4\;s_addc_u32\t%H2, %H2, %0@rel32@hi+4\;s_swappc_b64\ts[18:19], %2
+  s_swappc_b64\ts[18:19], %0"
+  [(set_attr "type" "mult,sop1")
+   (set_attr "length" "24,4")])
+
+(define_insn "movdi_symbol"
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg")
+       (match_operand:DI 1 "general_operand" "Y"))
+  (clobber (reg:BI SCC_REG))]
+ "GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF"
+  {
+    if (SYMBOL_REF_P (operands[1])
+       && SYMBOL_REF_WEAK (operands[1]))
+       return "s_getpc_b64\t%0\;"
+              "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;"
+              "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;"
+              "s_load_dwordx2\t%0, %0\;"
+              "s_waitcnt\tlgkmcnt(0)";
+
+    return "s_getpc_b64\t%0\;"
+          "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;"
+          "s_addc_u32\t%H0, %H0, %1@rel32@hi+4";
+  }
+ [(set_attr "type" "mult")
+  (set_attr "length" "32")])
+
+(define_insn "gcn_indirect_call"
+  [(call (mem (match_operand:DI 0 "register_operand" "Sg"))
+        (match_operand 1 "" ""))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (match_scratch:DI 2 "=X"))]
+  ""
+  "s_swappc_b64\ts[18:19], %0"
+  [(set_attr "type" "sop1")
+   (set_attr "length" "4")])
+
+(define_expand "call_value"
+  [(parallel [(set (match_operand 0 "")
+                  (call (match_operand 1 "")
+                        (match_operand 2 "")))
+             (clobber (reg:DI LR_REGNUM))
+             (clobber (match_scratch:DI 3))])]
+  ""
+  {})
+
+(define_insn "gcn_call_value"
+  [(set (match_operand 0 "register_operand" "=Sg,Sg")
+       (call (mem (match_operand 1 "immediate_operand" "Y,B"))
+             (match_operand 2 "const_int_operand")))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (match_scratch:DI 3 "=&Sg,X"))]
+  ""
+  "@
+  s_getpc_b64\t%3\;s_add_u32\t%L3, %L3, %1@rel32@lo+4\;s_addc_u32\t%H3, %H3, %1@rel32@hi+4\;s_swappc_b64\ts[18:19], %3
+  s_swappc_b64\ts[18:19], %1"
+  [(set_attr "type" "mult,sop1")
+   (set_attr "length" "24,4")])
+
+(define_insn "gcn_call_value_indirect"
+  [(set (match_operand 0 "register_operand" "=Sg")
+       (call (mem (match_operand:DI 1 "register_operand" "Sg"))
+             (match_operand 2 "" "")))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (match_scratch:DI 3 "=X"))]
+  ""
+  "s_swappc_b64\ts[18:19], %1"
+  [(set_attr "type" "sop1")
+   (set_attr "length" "4")])
+
+; GCN does not have an instruction to clear only part of the instruction
+; cache, so the operands are ignored.
+
+(define_insn "clear_icache"
+  [(unspec_volatile
+    [(match_operand 0 "") (match_operand 1 "")]
+    UNSPECV_ICACHE_INV)]
+  ""
+  "s_icache_inv"
+  [(set_attr "type" "sopp")
+   (set_attr "length" "4")])
+
+;; }}}
+;; {{{ Conditionals
+
+; 32-bit compare, scalar unit only
+
+(define_insn "cstoresi4"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand"
+                                                        "=cs, cs, cs, cs")
+       (match_operator:BI 1 "gcn_compare_operator"
+         [(match_operand:SI 2 "gcn_alu_operand"         "SSA,SSA,SSB, SS")
+          (match_operand:SI 3 "gcn_alu_operand"         "SSA,SSL, SS,SSB")]))]
+  ""
+  "@
+   s_cmp%D1\t%2, %3
+   s_cmpk%D1\t%2, %3
+   s_cmp%D1\t%2, %3
+   s_cmp%D1\t%2, %3"
+  [(set_attr "type" "sopc,sopk,sopk,sopk")
+   (set_attr "length" "4,4,8,8")])
+
+(define_expand "cbranchsi4"
+  [(match_operator 0 "gcn_compare_operator"
+     [(match_operand:SI 1 "gcn_alu_operand")
+      (match_operand:SI 2 "gcn_alu_operand")])
+   (match_operand 3)]
+  ""
+  {
+    rtx cc = gen_reg_rtx (BImode);
+    emit_insn (gen_cstoresi4 (cc, operands[0], operands[1], operands[2]));
+    emit_jump_insn (gen_cjump (operands[3],
+                              gen_rtx_NE (BImode, cc, const0_rtx), cc));
+    DONE;
+  })
+
+; 64-bit compare; either unit, but scalar allows limited operators
+
+(define_expand "cstoredi4"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand")
+       (match_operator:BI 1 "gcn_compare_operator"
+                          [(match_operand:DI 2 "gcn_alu_operand")
+                           (match_operand:DI 3 "gcn_alu_operand")]))]
+  ""
+  {})
+
+(define_insn "cstoredi4_vec_and_scalar"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cs,  cV")
+       (match_operator:BI 1 "gcn_compare_64bit_operator"
+         [(match_operand:DI 2 "gcn_alu_operand"               "%SSA,vSvC")
+          (match_operand:DI 3 "gcn_alu_operand"               " SSC,   v")]))]
+  ""
+  "@
+   s_cmp%D1\t%2, %3
+   v_cmp%E1\tvcc, %2, %3"
+  [(set_attr "type" "sopc,vopc")
+   (set_attr "length" "8")])
+
+(define_insn "cstoredi4_vector"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cV")
+       (match_operator:BI 1 "gcn_compare_operator"
+          [(match_operand:DI 2 "gcn_alu_operand"              "vSvB")
+          (match_operand:DI 3 "gcn_alu_operand"               "   v")]))]
+  ""
+  "v_cmp%E1\tvcc, %2, %3"
+  [(set_attr "type" "vopc")
+   (set_attr "length" "8")])
+
+(define_expand "cbranchdi4"
+  [(match_operator 0 "gcn_compare_operator"
+     [(match_operand:DI 1 "gcn_alu_operand")
+      (match_operand:DI 2 "gcn_alu_operand")])
+   (match_operand 3)]
+  ""
+  {
+    rtx cc = gen_reg_rtx (BImode);
+    emit_insn (gen_cstoredi4 (cc, operands[0], operands[1], operands[2]));
+    emit_jump_insn (gen_cjump (operands[3],
+                              gen_rtx_NE (BImode, cc, const0_rtx), cc));
+    DONE;
+  })
+
+; FP compare; vector unit only
+
+(define_insn "cstore<mode>4"
+  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "=cV")
+       (match_operator:BI 1 "gcn_fp_compare_operator"
+         [(match_operand:SFDF 2 "gcn_alu_operand"              "vB")
+          (match_operand:SFDF 3 "gcn_alu_operand"               "v")]))]
+  ""
+  "v_cmp%E1\tvcc, %2, %3"
+  [(set_attr "type" "vopc")
+   (set_attr "length" "8")])
+
+(define_expand "cbranch<mode>4"
+  [(match_operator 0 "gcn_fp_compare_operator"
+     [(match_operand:SFDF 1 "gcn_alu_operand")
+      (match_operand:SFDF 2 "gcn_alu_operand")])
+   (match_operand 3)]
+  ""
+  {
+    rtx cc = gen_reg_rtx (BImode);
+    emit_insn (gen_cstore<mode>4 (cc, operands[0], operands[1], operands[2]));
+    emit_jump_insn (gen_cjump (operands[3],
+                              gen_rtx_NE (BImode, cc, const0_rtx), cc));
+    DONE;
+  })
+
+;; }}}
+;; {{{ ALU special cases: Plus
+
+(define_insn "addsi3"
+  [(set (match_operand:SI 0 "register_operand"         "= Sg, Sg, Sg,   v")
+        (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA,  0,SgA,   v")
+                (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ,  B,vBSv")))
+   (clobber (match_scratch:BI 3                               "= cs, cs, cs,   X"))
+   (clobber (match_scratch:DI 4                               "=  X,  X,  X,  cV"))]
+  ""
+  "@
+   s_add_i32\t%0, %1, %2
+   s_addk_i32\t%0, %2
+   s_add_i32\t%0, %1, %2
+   v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "sop2,sopk,sop2,vop2")
+   (set_attr "length" "4,4,8,8")])
+
+(define_expand "addsi3_scc"
+  [(parallel [(set (match_operand:SI 0 "register_operand")
+                  (plus:SI (match_operand:SI 1 "gcn_alu_operand")
+                           (match_operand:SI 2 "gcn_alu_operand")))
+             (clobber (reg:BI SCC_REG))
+             (clobber (scratch:DI))])]
+  ""
+  {})
+
+; Having this as an insn_and_split allows us to keep DImode adds together
+; through some RTL optimisation passes, and means the CC reg we set isn't
+; dependent on the constraint alternative (which doesn't seem to work well).
+
+; There's an early clobber in the case where "v[0:1]=v[1:2]+?" but
+; "v[0:1]=v[0:1]+?" is fine (as is "v[1:2]=v[0:1]+?", but that's trickier).
+
+; If v_addc_u32 is used to add with carry, a 32-bit literal constant cannot be
+; used as an operand due to the read of VCC, so we restrict constants to the
+; inlinable range for that alternative.
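+
+; The split emits a low-part add that sets the carry flag (SCC or VCC,
+; depending on whether vector registers are involved) and a high-part
+; add-with-carry that consumes it.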
+
+(define_insn_and_split "adddi3"
+  [(set (match_operand:DI 0 "register_operand"         
+                                             "=&Sg,&Sg,&Sg,&Sg,&v,&v,&v,&v")
+       (plus:DI (match_operand:DI 1 "register_operand" 
+                                             "  Sg,  0,  0, Sg, v, 0, 0, v")
+                (match_operand:DI 2 "nonmemory_operand"
+                                             "   0,SgB,  0,SgB, 0,vA, 0,vA")))
+   (clobber (match_scratch:BI 3                      "= cs, cs, cs, cs, X, X, X, X"))
+   (clobber (match_scratch:DI 4                      "=  X,  X,  X,  X,cV,cV,cV,cV"))]
+  ""
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    rtx cc = gen_rtx_REG (BImode, gcn_vgpr_register_operand (operands[1],
+                                                            DImode)
+                         ? VCC_REG : SCC_REG);
+
+    emit_insn (gen_addsi3_scalar_carry
+              (gcn_operand_part (DImode, operands[0], 0),
+               gcn_operand_part (DImode, operands[1], 0),
+               gcn_operand_part (DImode, operands[2], 0),
+               cc));
+    rtx val = gcn_operand_part (DImode, operands[2], 1);
+    if (val != const0_rtx)
+      emit_insn (gen_addcsi3_scalar
+                (gcn_operand_part (DImode, operands[0], 1),
+                 gcn_operand_part (DImode, operands[1], 1),
+                 gcn_operand_part (DImode, operands[2], 1),
+                 cc, cc));
+    else
+      emit_insn (gen_addcsi3_scalar_zero
+                (gcn_operand_part (DImode, operands[0], 1),
+                 gcn_operand_part (DImode, operands[1], 1),
+                 cc));
+    DONE;
+  }
+  [(set_attr "type" "mult,mult,mult,mult,vmult,vmult,vmult,vmult")
+   (set_attr "length" "8")])
+
+(define_expand "adddi3_scc"
+  [(parallel [(set (match_operand:DI 0 "register_operand")
+                  (plus:DI (match_operand:DI 1 "register_operand")
+                           (match_operand:DI 2 "nonmemory_operand")))
+             (clobber (reg:BI SCC_REG))
+             (clobber (scratch:DI))])]
+  ""
+  {})
+
+;; Add with carry.
+
+(define_insn "addsi3_scalar_carry"
+  [(set (match_operand:SI 0 "register_operand"        "= Sg, v")
+       (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, v")
+                (match_operand:SI 2 "gcn_alu_operand" " SgB,vB")))
+   (set (match_operand:BI 3 "register_operand"        "= cs,cV")
+       (ltu:BI (plus:SI (match_dup 1)
+                        (match_dup 2))
+               (match_dup 1)))]
+  ""
+  "@
+   s_add_u32\t%0, %1, %2
+   v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "8,8")])
+
+(define_insn "addsi3_scalar_carry_cst"
+  [(set (match_operand:SI 0 "register_operand"           "=Sg, v")
+        (plus:SI (match_operand:SI 1 "gcn_alu_operand"   "SgA, v")
+                (match_operand:SI 2 "const_int_operand" "  n, n")))
+   (set (match_operand:BI 4 "register_operand"           "=cs,cV")
+       (geu:BI (plus:SI (match_dup 1)
+                        (match_dup 2))
+               (match_operand:SI 3 "const_int_operand"  "  n, n")))]
+  "INTVAL (operands[2]) == -INTVAL (operands[3])"
+  "@
+   s_add_u32\t%0, %1, %2
+   v_add%^_u32\t%0, vcc, %2, %1"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "4")])
+
+(define_insn "addcsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"                    "= Sg, v")
+       (plus:SI (plus:SI (zero_extend:SI
+                           (match_operand:BI 3 "register_operand" "= cs,cV"))
+                         (match_operand:SI 1 "gcn_alu_operand"    "%SgA, v"))
+                (match_operand:SI 2 "gcn_alu_operand"             " SgB,vA")))
+   (set (match_operand:BI 4 "register_operand"                    "=  3, 3")
+       (ior:BI (ltu:BI (plus:SI
+                         (plus:SI
+                           (zero_extend:SI (match_dup 3))
+                           (match_dup 1))
+                         (match_dup 2))
+                       (match_dup 2))
+               (ltu:BI (plus:SI (zero_extend:SI (match_dup 3)) (match_dup 1))
+                       (match_dup 1))))]
+  ""
+  "@
+   s_addc_u32\t%0, %1, %2
+   v_addc%^_u32\t%0, vcc, %2, %1, vcc"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "8,4")])
+
+(define_insn "addcsi3_scalar_zero"
+  [(set (match_operand:SI 0 "register_operand"           "=Sg, v")
+        (plus:SI (zero_extend:SI
+                  (match_operand:BI 2 "register_operand" "=cs,cV"))
+                (match_operand:SI 1 "gcn_alu_operand"    "SgA, v")))
+   (set (match_dup 2)
+       (ltu:BI (plus:SI (zero_extend:SI (match_dup 2))
+                        (match_dup 1))
+               (match_dup 1)))]
+  ""
+  "@
+   s_addc_u32\t%0, %1, 0
+   v_addc%^_u32\t%0, vcc, 0, %1, vcc"
+  [(set_attr "type" "sop2,vop2")
+   (set_attr "length" "4")])
+
+; "addptr" is the same as "add" except that it must not write to VCC or SCC
+; as a side-effect.  Unfortunately GCN does not have a suitable instruction
+; for this, so we use a custom VOP3 add with CC_SAVE_REG as a temp.
+; Note that it is not safe to save/clobber/restore SCC because doing so will
+; break data-flow analysis, so this must use vector registers.
+
+(define_insn "addptrdi3"
+  [(set (match_operand:DI 0 "register_operand"          "= &v")
+       (plus:DI (match_operand:DI 1 "register_operand"  "  v0")
+                (match_operand:DI 2 "nonmemory_operand" "vDA0")))]
+  ""
+  {
+    rtx new_operands[4] = { operands[0], operands[1], operands[2],
+                           gen_rtx_REG (DImode, CC_SAVE_REG) };
+
+    output_asm_insn ("v_add%^_u32 %L0, %3, %L2, %L1", new_operands);
+    output_asm_insn ("v_addc%^_u32 %H0, %3, %H2, %H1, %3", new_operands);
+
+    return "";
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "16")])
+
+;; }}}
+;; {{{ ALU special cases: Minus
+
+(define_insn "subsi3"
+  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg,    v,   v")
+       (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA,    v,vBSv")
+                 (match_operand:SI 2 "gcn_alu_operand" "SgA,  B, vBSv,   v")))
+   (clobber (match_scratch:BI 3                                "=cs, cs,    X,   X"))
+   (clobber (match_scratch:DI 4                                "= X,  X,   cV,  cV"))]
+  ""
+  "@
+   s_sub_i32\t%0, %1, %2
+   s_sub_i32\t%0, %1, %2
+   v_subrev%^_u32\t%0, vcc, %2, %1
+   v_sub%^_u32\t%0, vcc, %1, %2"
+  [(set_attr "type" "sop2,sop2,vop2,vop2")
+   (set_attr "length" "4,8,8,8")])
+
+(define_insn_and_split "subdi3"
+  [(set (match_operand:DI 0 "register_operand"        "=Sg, Sg")
+       (minus:DI
+               (match_operand:DI 1 "gcn_alu_operand" "SgA,SgB")
+               (match_operand:DI 2 "gcn_alu_operand" "SgB,SgA")))
+   (clobber (reg:BI SCC_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    emit_insn (gen_subsi3_scalar_carry
+              (gcn_operand_part (DImode, operands[0], 0),
+               gcn_operand_part (DImode, operands[1], 0),
+               gcn_operand_part (DImode, operands[2], 0)));
+    rtx val = gcn_operand_part (DImode, operands[2], 1);
+    if (val != const0_rtx)
+      emit_insn (gen_subcsi3_scalar
+                (gcn_operand_part (DImode, operands[0], 1),
+                 gcn_operand_part (DImode, operands[1], 1),
+                 gcn_operand_part (DImode, operands[2], 1)));
+    else
+      emit_insn (gen_subcsi3_scalar_zero
+                (gcn_operand_part (DImode, operands[0], 1),
+                 gcn_operand_part (DImode, operands[1], 1)));
+    DONE;
+  }
+  [(set_attr "length" "8")])
+
+(define_insn "subsi3_scalar_carry"
+  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg")
+        (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB")
+                 (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA")))
+   (set (reg:BI SCC_REG)
+       (gtu:BI (minus:SI (match_dup 1)
+                         (match_dup 2))
+               (match_dup 1)))]
+  ""
+  "s_sub_u32\t%0, %1, %2"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "8")])
+
+(define_insn "subsi3_scalar_carry_cst"
+  [(set (match_operand:SI 0 "register_operand"           "=Sg")
+        (minus:SI (match_operand:SI 1 "gcn_alu_operand"  "SgA")
+                (match_operand:SI 2 "const_int_operand" "  n")))
+   (set (reg:BI SCC_REG)
+       (leu:BI (minus:SI (match_dup 1)
+                        (match_dup 2))
+               (match_operand:SI 3 "const_int_operand"  "  n")))]
+  "INTVAL (operands[2]) == -INTVAL (operands[3])"
+  "s_sub_u32\t%0, %1, %2"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "4")])
+
+(define_insn "subcsi3_scalar"
+  [(set (match_operand:SI 0 "register_operand"                    "=Sg, Sg")
+        (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+                           (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB"))
+                (match_operand:SI 2 "gcn_alu_operand"            "SgB,SgA")))
+   (set (reg:BI SCC_REG)
+       (ior:BI (gtu:BI (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+                                           (match_dup 1))
+                                (match_dup 2))
+                       (match_dup 1))
+               (gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+                                 (match_dup 1))
+                       (match_dup 1))))]
+  ""
+  "s_subb_u32\t%0, %1, %2"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "8")])
+
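+; As above, for the case where the high word of the subtrahend is zero.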
+(define_insn "subcsi3_scalar_zero"
+  [(set (match_operand:SI 0 "register_operand"         "=Sg")
+        (minus:SI (zero_extend:SI (reg:BI SCC_REG))
+                 (match_operand:SI 1 "gcn_alu_operand" "SgA")))
+   (set (reg:BI SCC_REG)
+       (gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) (match_dup 1))
+               (match_dup 1)))]
+  ""
+  "s_subb_u32\t%0, %1, 0"
+  [(set_attr "type" "sop2")
+   (set_attr "length" "4")])
+
+;; }}}
+;; {{{ ALU: mult
+
+; Vector multiply has a vop3a encoding, but no corresponding vop2 encoding,
+; so no long immediate is available.
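+; The alternatives expand to, roughly (register numbers invented):
+;   s_mul_i32    s0, s1, s2      ; SGPR sources or a 32-bit immediate
+;   s_mulk_i32   s0, 0x64        ; 16-bit immediate; destination is also a source
+;   v_mul_lo_i32 v0, v1, v2      ; VGPR multiply, low 32 bits of the product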
+(define_insn "mulsi3"
+  [(set (match_operand:SI 0 "register_operand"        "= Sg,Sg, Sg,   v")
+        (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA,   v")
+                (match_operand:SI 2 "gcn_alu_operand" " SgA, J,  B,vASv")))]
+  ""
+  "@
+   s_mul_i32\t%0, %1, %2
+   s_mulk_i32\t%0, %2
+   s_mul_i32\t%0, %1, %2
+   v_mul_lo_i32\t%0, %1, %2"
+  [(set_attr "type" "sop2,sopk,sop2,vop3a")
+   (set_attr "length" "4,4,8,4")])
+
+(define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr sgnsuffix [(sign_extend "%i") (zero_extend "%u")])
+(define_code_attr su [(sign_extend "s") (zero_extend "u")])
+(define_code_attr u [(sign_extend "") (zero_extend "u")])
+(define_code_attr iu [(sign_extend "i") (zero_extend "u")])
+(define_code_attr e [(sign_extend "e") (zero_extend "")])
+
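+; High part of a widening 32x32->64-bit multiply.  For example,
+; v_mul_hi_u32 v0, v2, v1 leaves the upper 32 bits of the unsigned
+; product of v1 and v2 in v0.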
+(define_insn "<su>mulsi3_highpart"
+  [(set (match_operand:SI 0 "register_operand"        "= v")
+       (truncate:SI
+         (lshiftrt:DI
+           (mult:DI
+             (any_extend:DI
+               (match_operand:SI 1 "register_operand" "% v"))
+             (any_extend:DI
+               (match_operand:SI 2 "register_operand" "vSv")))
+           (const_int 32))))]
+  ""
+  "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
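+; 16x16->32-bit multiply, implemented with the 24-bit multiply
+; instructions: a 16-bit input always fits in 24 bits, so the product is
+; exact, and SDWA operand selection picks out the low word of each
+; source register.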
+(define_insn "<u>mulhisi3"
+  [(set (match_operand:SI 0 "register_operand"                 "=v")
+       (mult:SI
+         (any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
+         (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))]
+  ""
+  "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 src1_sel:WORD_0"
+  [(set_attr "type" "vop_sdwa")
+   (set_attr "length" "8")])
+
+(define_insn "<u>mulqihi3_scalar"
+  [(set (match_operand:HI 0 "register_operand"                 "=v")
+       (mult:HI
+         (any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
+         (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))]
+  ""
+  "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 src1_sel:BYTE_0"
+  [(set_attr "type" "vop_sdwa")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit unop
+
+(define_code_iterator bitunop [not popcount])
+(define_code_attr popcount_extra_op [(not "") (popcount ", 0")])
+
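+; NOT and POPCOUNT.  The VALU bit-count instruction adds its result to a
+; second source operand, hence the extra ", 0" appended in the popcount
+; case.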
+(define_insn "<expander>si2"
+  [(set (match_operand:SI 0 "register_operand"  "=Sg,   v")
+        (bitunop:SI
+         (match_operand:SI 1 "gcn_alu_operand" "SgB,vSvB")))
+   (clobber (match_scratch:BI 2                        "=cs,   X"))]
+  ""
+  "@
+   s_<s_mnemonic>0\t%0, %1
+   v_<mnemonic>0\t%0, %1<popcount_extra_op>"
+  [(set_attr "type" "sop1,vop1")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ ALU: generic 32-bit binop
+
+; No plus or mult here - they have variants with a 16-bit immediate
+; and so are defined separately.
+(define_code_iterator binop [and ior xor smin smax umin umax
+                                ashift lshiftrt ashiftrt])
+(define_code_iterator vec_and_scalar_com [and ior xor smin smax umin umax])
+(define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt])
+
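+; Commutative operations.  The third alternative is a read-modify-write
+; directly on LDS/GDS memory.  For example, roughly:
+;   s_and_b32  s0, s1, s2
+;   v_and_b32  v0, v1, v2
+;   ds_and_b32 v1, v2        ; operand 1 is tied to the memory destination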
+(define_insn "<expander>si3"
+  [(set (match_operand:SI 0 "gcn_valu_dst_operand"    "= Sg,   v,RD")
+        (vec_and_scalar_com:SI
+         (match_operand:SI 1 "gcn_valu_src0_operand" "%SgA,vSvB, 0")
+         (match_operand:SI 2 "gcn_alu_operand"       " SgB,   v, v")))
+   (clobber (match_scratch:BI 3                              "= cs,   X, X"))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   v_<mnemonic>0\t%0, %1, %2
+   ds_<mnemonic>0\t%A0, %2%O0"
+  [(set_attr "type" "sop2,vop2,ds")
+   (set_attr "length" "8")])
+
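+; Shifts are not commutative: the VALU shift instructions take the shift
+; amount as the first source, so the "rev" mnemonic is used with the
+; operands swapped (e.g. v_lshlrev_b32 v0, v2, v1 computes v1 << v2).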
+(define_insn "<expander>si3"
+  [(set (match_operand:SI 0 "register_operand"  "=Sg, Sg,   v")
+        (vec_and_scalar_nocom:SI
+         (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA,   v")
+         (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB,vSvB")))
+   (clobber (match_scratch:BI 3                        "=cs, cs,   X"))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   s_<mnemonic>0\t%0, %1, %2
+   v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "sop2,sop2,vop2")
+   (set_attr "length" "8")])
+
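+; Named expander so that a binary operation can be emitted with an
+; explicit SCC clobber; presumably used by other expanders and splitters
+; in the port.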
+(define_expand "<expander>si3_scc"
+  [(parallel [(set (match_operand:SI 0 "gcn_valu_dst_operand")
+                  (binop:SI
+                    (match_operand:SI 1 "gcn_valu_src0_operand")
+                    (match_operand:SI 2 "gcn_alu_operand")))
+             (clobber (reg:BI SCC_REG))])]
+  ""
+  {})
+
+;; }}}
+;; {{{ ALU: generic 64-bit
+
+(define_code_iterator vec_and_scalar64_com [and ior xor])
+
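+; 64-bit bitwise operations: a single scalar instruction for SGPRs,
+; while VGPR destinations are split after reload into two 32-bit
+; operations, one per half.  For example, roughly:
+;   v_and_b32 v0, v2, v4     ; low 32 bits
+;   v_and_b32 v1, v3, v5     ; high 32 bits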
+(define_insn_and_split "<expander>di3"
+   [(set (match_operand:DI 0 "register_operand"  "= Sg,   &v,   &v")
+        (vec_and_scalar64_com:DI
+         (match_operand:DI 1 "gcn_alu_operand"  "%SgA,vSvDB,vSvDB")
+          (match_operand:DI 2 "gcn_alu_operand" " SgC,    v,    0")))
+   (clobber (match_scratch:BI 3                         "= cs,    X,    X"))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   #
+   #"
+  "reload_completed && gcn_vgpr_register_operand (operands[0], DImode)"
+  [(parallel [(set (match_dup 4)
+                  (vec_and_scalar64_com:SI (match_dup 5) (match_dup 6)))
+             (clobber (match_dup 3))])
+   (parallel [(set (match_dup 7)
+                  (vec_and_scalar64_com:SI (match_dup 8) (match_dup 9)))
+             (clobber (match_dup 3))])]
+  {
+    operands[4] = gcn_operand_part (DImode, operands[0], 0);
+    operands[5] = gcn_operand_part (DImode, operands[1], 0);
+    operands[6] = gcn_operand_part (DImode, operands[2], 0);
+    operands[7] = gcn_operand_part (DImode, operands[0], 1);
+    operands[8] = gcn_operand_part (DImode, operands[1], 1);
+    operands[9] = gcn_operand_part (DImode, operands[2], 1);
+  }
+  [(set_attr "type" "sop2,vop2,vop2")
+   (set_attr "length" "8")])
+
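+; 64-bit shifts.  The shift amount (operand 2) is always SImode; the
+; VALU form again uses the "rev" mnemonic with the operands swapped.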
+(define_insn "<expander>di3"
+  [(set (match_operand:DI 0 "register_operand"   "=Sg, Sg,   v")
+       (vec_and_scalar_nocom:DI
+         (match_operand:DI 1 "gcn_alu_operand"  "SgC,SgA,   v")
+         (match_operand:SI 2 "gcn_alu_operand"  "SgA,SgC,vSvC")))
+   (clobber (match_scratch:BI 3                         "=cs, cs,   X"))]
+  ""
+  "@
+   s_<mnemonic>0\t%0, %1, %2
+   s_<mnemonic>0\t%0, %1, %2
+   v_<revmnemonic>0\t%0, %2, %1"
+  [(set_attr "type" "sop2,sop2,vop2")
+   (set_attr "length" "8")])
+
+;; }}}
+;; {{{ Atomics
+
+; Each compute unit has its own L1 cache.  The L2 cache is shared between
+; all the compute units.  Any load or store instruction can skip L1 and
+; access L2 directly using the "glc" flag.  Atomic instructions also skip
+; L1.  The L1 cache can be flushed and invalidated using instructions.
+;
+; Therefore, in order for "acquire" and "release" atomic modes to work
+; correctly across compute units we must flush before each "release"
+; and invalidate the cache after each "acquire".  It might seem like
+; invalidation could be safely done before an "acquire", but since each
+; compute unit can run up to 40 threads simultaneously, all reading values
+; into the L1 cache, this is not actually safe.
+;
+; Additionally, scalar flat instructions access L2 via a different cache
+; (the "constant cache"), so they have separate control instructions.  We
+; do not attempt to invalidate both caches at once; instead, atomics
+; operating on scalar flat pointers will flush the constant cache, and
+; atomics operating on flat or global pointers will flush L1.  It is up to
+; the programmer to get this right.
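+;
+; For example, an "acquire" flat load is expected to expand to roughly:
+;   flat_load_dword v0, v[1:2] glc
+;   s_waitcnt       0
+;   buffer_wbinvl1_vol
+; whereas a "release" operation issues the cache write-back/invalidate
+; before the memory access instead.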
+
+(define_code_iterator atomicops [plus minus and ior xor])
+(define_mode_attr X [(SI "") (DI "_X2")])
+
+;; TODO compare_and_swap test_and_set inc dec
+;; Hardware also supports min and max, but GCC does not.
+
+(define_expand "memory_barrier"
+  [(set (match_dup 0)
+       (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+  ""
+  {
+    operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+    MEM_VOLATILE_P (operands[0]) = 1;
+  })
+
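+; buffer_wbinvl1_vol writes back and invalidates the L1 cache lines that
+; are marked volatile.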
+(define_insn "*memory_barrier"
+  [(set (match_operand:BLK 0)
+       (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+  ""
+  "buffer_wbinvl1_vol"
+  [(set_attr "type" "mubuf")
+   (set_attr "length" "4")])
+
+; FIXME: These patterns have been disabled as they do not seem to work
+; reliably - they can cause hangs or incorrect results.
+; TODO: flush caches according to memory model
+(define_insn "atomic_fetch_<bare_mnemonic><mode>"
+  [(set (match_operand:SIDI 0 "register_operand"     "=Sm, v, v")
+       (match_operand:SIDI 1 "memory_operand"       "+RS,RF,RM"))
+   (set (match_dup 1)
+       (unspec_volatile:SIDI
+         [(atomicops:SIDI
+           (match_dup 1)
+           (match_operand:SIDI 2 "register_operand" " Sm, v, v"))]
+          UNSPECV_ATOMIC))
+   (use (match_operand 3 "const_int_operand"))]
+  "0 /* Disabled.  */"
+  "@
+   s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+   flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\t0
+   global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "12")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+; FIXME: These patterns are disabled because the instructions don't
+; seem to work as advertised.  Specifically, OMP "team distribute"
+; reductions apparently "lose" some of the writes, similar to what
+; you might expect from a concurrent non-atomic read-modify-write.
+; TODO: flush caches according to memory model
+(define_insn "atomic_<bare_mnemonic><mode>"
+  [(set (match_operand:SIDI 0 "memory_operand"       "+RS,RF,RM")
+       (unspec_volatile:SIDI
+         [(atomicops:SIDI
+           (match_dup 0)
+           (match_operand:SIDI 1 "register_operand" " Sm, v, v"))]
+         UNSPECV_ATOMIC))
+   (use (match_operand 2 "const_int_operand"))]
+  "0 /* Disabled.  */"
+  "@
+   s_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\tlgkmcnt(0)
+   flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
+   global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "12")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_mode_attr x2 [(SI "DI") (DI "TI")])
+(define_mode_attr size [(SI "4") (DI "8")])
+(define_mode_attr bitsize [(SI "32") (DI "64")])
+
+(define_expand "sync_compare_and_swap<mode>"
+  [(match_operand:SIDI 0 "register_operand")
+   (match_operand:SIDI 1 "memory_operand")
+   (match_operand:SIDI 2 "register_operand")
+   (match_operand:SIDI 3 "register_operand")]
+  ""
+  {
+    if (MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_LDS)
+      {
+       emit_insn (gen_sync_compare_and_swap<mode>_lds_insn (operands[0],
+                                                            operands[1],
+                                                            operands[2],
+                                                            operands[3]));
+       DONE;
+      }
+
+    /* Operands 2 and 3 must be placed in consecutive registers, and passed
+       as a combined value.  */
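+    /* Operand 3 (the value to store) is placed in the low-order half and
+       operand 2 (the expected value) in the high-order half.  */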
+    rtx src_cmp = gen_reg_rtx (<x2>mode);
+    emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, 0), operands[3]);
+    emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, <size>), operands[2]);
+    emit_insn (gen_sync_compare_and_swap<mode>_insn (operands[0],
+                                                    operands[1],
+                                                    src_cmp));
+    DONE;
+  })
+
+(define_insn "sync_compare_and_swap<mode>_insn"
+  [(set (match_operand:SIDI 0 "register_operand"    "=Sm, v, v")
+       (match_operand:SIDI 1 "memory_operand"      "+RS,RF,RM"))
+   (set (match_dup 1)
+       (unspec_volatile:SIDI
+         [(match_operand:<x2> 2 "register_operand" " Sm, v, v")]
+         UNSPECV_ATOMIC))]
+  ""
+  "@
+   s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
+   flat_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\t0
+   global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "12")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "sync_compare_and_swap<mode>_lds_insn"
+  [(set (match_operand:SIDI 0 "register_operand"    "= v")
+       (unspec_volatile:SIDI
+         [(match_operand:SIDI 1 "memory_operand"   "+RL")]
+         UNSPECV_ATOMIC))
+   (set (match_dup 1)
+       (unspec_volatile:SIDI
+         [(match_operand:SIDI 2 "register_operand" "  v")
+          (match_operand:SIDI 3 "register_operand" "  v")]
+         UNSPECV_ATOMIC))]
+  ""
+  "ds_cmpst_rtn_b<bitsize> %0, %1, %2, %3\;s_waitcnt\tlgkmcnt(0)"
+  [(set_attr "type" "ds")
+   (set_attr "length" "12")])
+
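+; Atomic loads.  Operand 2 carries the memory model and selects how much
+; cache maintenance is emitted around the load; the scalar and global
+; alternatives are GCN5-only.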
+(define_insn "atomic_load<mode>"
+  [(set (match_operand:SIDI 0 "register_operand"  "=Sm, v, v")
+       (unspec_volatile:SIDI
+         [(match_operand:SIDI 1 "memory_operand" " RS,RF,RM")]
+         UNSPECV_ATOMIC))
+   (use (match_operand:SIDI 2 "immediate_operand" "  i, i, i"))]
+  ""
+  {
+    switch (INTVAL (operands[2]))
+      {
+      case MEMMODEL_RELAXED:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
+         case 1:
+           return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
+         case 2:
+           return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
+         }
+       break;
+      case MEMMODEL_CONSUME:
+      case MEMMODEL_ACQUIRE:
+      case MEMMODEL_SYNC_ACQUIRE:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
+                  "s_dcache_wb_vol";
+         case 1:
+           return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
+                  "buffer_wbinvl1_vol";
+         case 2:
+           return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
+                  "buffer_wbinvl1_vol";
+         }
+       break;
+      case MEMMODEL_ACQ_REL:
+      case MEMMODEL_SEQ_CST:
+      case MEMMODEL_SYNC_SEQ_CST:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
+                  "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+         case 1:
+           return "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
+                  "s_waitcnt\t0\;buffer_wbinvl1_vol";
+         case 2:
+           return "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
+                  "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+         }
+       break;
+      }
+    gcc_unreachable ();
+  }
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "20")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "atomic_store<mode>"
+  [(set (match_operand:SIDI 0 "memory_operand"      "=RS,RF,RM")
+       (unspec_volatile:SIDI
+         [(match_operand:SIDI 1 "register_operand" " Sm, v, v")]
+         UNSPECV_ATOMIC))
+  (use (match_operand:SIDI 2 "immediate_operand"    "  i, i, i"))]
+  ""
+  {
+    switch (INTVAL (operands[2]))
+      {
+      case MEMMODEL_RELAXED:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_store%o1\t%1, %A0 glc\;s_waitcnt\tlgkmcnt(0)";
+         case 1:
+           return "flat_store%o1\t%A0, %1%O0 glc\;s_waitcnt\t0";
+         case 2:
+           return "global_store%o1\t%A0, %1%O0 glc\;s_waitcnt\tvmcnt(0)";
+         }
+       break;
+      case MEMMODEL_RELEASE:
+      case MEMMODEL_SYNC_RELEASE:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+                  "s_waitcnt\texpcnt(0)";
+         case 1:
+           return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+                  "s_waitcnt\texpcnt(0)";
+         case 2:
+           return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+                  "s_waitcnt\texpcnt(0)";
+         }
+       break;
+      case MEMMODEL_ACQ_REL:
+      case MEMMODEL_SEQ_CST:
+      case MEMMODEL_SYNC_SEQ_CST:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
+                  "s_waitcnt\texpcnt(0)\;s_dcache_inv_vol";
+         case 1:
+           return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
+                  "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+         case 2:
+           return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
+                  "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol";
+         }
+       break;
+      }
+    gcc_unreachable ();
+  }
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "20")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+(define_insn "atomic_exchange<mode>"
+  [(set (match_operand:SIDI 0 "register_operand"    "=Sm, v, v")
+        (match_operand:SIDI 1 "memory_operand"     "+RS,RF,RM"))
+   (set (match_dup 1)
+       (unspec_volatile:SIDI
+         [(match_operand:SIDI 2 "register_operand" " Sm, v, v")]
+         UNSPECV_ATOMIC))
+   (use (match_operand 3 "immediate_operand"))]
+  ""
+  {
+    switch (INTVAL (operands[3]))
+      {
+      case MEMMODEL_RELAXED:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)";
+         case 1:
+           return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0";
+         case 2:
+           return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+                  "s_waitcnt\tvmcnt(0)";
+         }
+       break;
+      case MEMMODEL_CONSUME:
+      case MEMMODEL_ACQUIRE:
+      case MEMMODEL_SYNC_ACQUIRE:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)\;"
+                  "s_dcache_wb_vol\;s_dcache_inv_vol";
+         case 1:
+           return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
+                  "buffer_wbinvl1_vol";
+         case 2:
+           return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+                  "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+         }
+       break;
+      case MEMMODEL_RELEASE:
+      case MEMMODEL_SYNC_RELEASE:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+                  "s_waitcnt\tlgkmcnt(0)";
+         case 1:
+           return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+                  "s_waitcnt\t0";
+         case 2:
+           return "buffer_wbinvl1_vol\;"
+                  "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+                  "s_waitcnt\tvmcnt(0)";
+         }
+       break;
+      case MEMMODEL_ACQ_REL:
+      case MEMMODEL_SEQ_CST:
+      case MEMMODEL_SYNC_SEQ_CST:
+       switch (which_alternative)
+         {
+         case 0:
+           return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
+                  "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
+         case 1:
+           return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+                  "s_waitcnt\t0\;buffer_wbinvl1_vol";
+         case 2:
+           return "buffer_wbinvl1_vol\;"
+                  "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
+                  "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
+         }
+       break;
+      }
+    gcc_unreachable ();
+  }
+  [(set_attr "type" "smem,flat,flat")
+   (set_attr "length" "20")
+   (set_attr "gcn_version" "gcn5,*,gcn5")])
+
+;; }}}
+;; {{{ OpenACC / OpenMP
+
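+; The oacc_dim_size and oacc_dim_pos expanders map OpenACC dimension
+; queries onto values computed by the gcn_oacc_dim_size and
+; gcn_oacc_dim_pos helpers in the main back-end code.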
+(define_expand "oacc_dim_size"
+  [(match_operand:SI 0 "register_operand")
+   (match_operand:SI 1 "const_int_operand")]
+  ""
+  {
+    rtx tmp = gcn_oacc_dim_size (INTVAL (operands[1]));
+    emit_move_insn (operands[0], gen_lowpart (SImode, tmp));
+    DONE;
+  })
+
+(define_expand "oacc_dim_pos"
+  [(match_operand:SI 0 "register_operand")
+   (match_operand:SI 1 "const_int_operand")]
+  ""
+  {
+    emit_move_insn (operands[0], gcn_oacc_dim_pos (INTVAL (operands[1])));
+    DONE;
+  })
+
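+; s_barrier synchronises all wavefronts in the current work-group; it is
+; used to implement oacc_join below.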
+(define_expand "gcn_wavefront_barrier"
+  [(set (match_dup 0)
+       (unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+  ""
+  {
+    operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+    MEM_VOLATILE_P (operands[0]) = 1;
+  })
+
+(define_insn "*gcn_wavefront_barrier"
+  [(set (match_operand:BLK 0 "")
+       (unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
+  ""
+  "s_barrier"
+  [(set_attr "type" "sopp")])
+
+(define_expand "oacc_fork"
+  [(set (match_operand:SI 0 "")
+       (match_operand:SI 1 ""))
+   (use (match_operand:SI 2 ""))]
+  ""
+  {
+    /* We need to have oacc_fork/oacc_join named patterns as a pair,
+       but the fork isn't actually used.  */
+    gcc_unreachable ();
+  })
+
+(define_expand "oacc_join"
+  [(set (match_operand:SI 0 "")
+       (match_operand:SI 1 ""))
+   (use (match_operand:SI 2 ""))]
+  ""
+  {
+    emit_insn (gen_gcn_wavefront_barrier ());
+    DONE;
+  })
+
+;; }}}
+
+(include "gcn-valu.md")
diff --git a/gcc/config/gcn/predicates.md b/gcc/config/gcn/predicates.md
new file mode 100644 (file)
index 0000000..5b54f49
--- /dev/null
@@ -0,0 +1,199 @@
+;; Predicate definitions for GCN.
+;; Copyright (C) 2016-2019 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
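+; Match only the condition-like hard registers (VCCZ, VCC, SCC, EXECZ),
+; or a BImode pseudo before reload.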
+(define_predicate "gcn_conditional_register_operand"
+  (match_operand 0 "register_operand")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+
+  if (!REG_P (op) || GET_MODE (op) != BImode)
+    return 0;
+
+  return REGNO (op) == VCCZ_REG
+        || REGNO (op) == VCC_REG   /* Implied VCCZ.  */
+        || REGNO (op) == SCC_REG
+        || REGNO (op) == EXECZ_REG
+        || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_ssrc_register_operand"
+  (match_operand 0 "register_operand")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+
+  if (!REG_P (op))
+    return false;
+
+  return SSRC_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_sdst_register_operand"
+  (match_operand 0 "register_operand")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+
+  if (!REG_P (op))
+    return false;
+
+  return SDST_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_vgpr_register_operand"
+  (match_operand 0 "register_operand")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+
+  if (!REG_P (op))
+    return false;
+
+  return VGPR_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
+(define_predicate "gcn_inline_immediate_operand"
+  (match_code "const_int,const_double,const_vector")
+{
+  return gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vop3_operand"
+  (ior (match_operand 0 "gcn_inline_immediate_operand")
+       (match_operand 0 "register_operand")))
+
+(define_predicate "gcn_vec0_operand"
+  (match_code "const_vector")
+{
+  return CONST_VECTOR_ELT (op, 0) == const0_rtx && gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vec1_operand"
+  (match_code "const_vector")
+{
+  return CONST_VECTOR_ELT (op, 0) == const1_rtx && gcn_inline_constant_p (op);
+})
+
+(define_predicate "gcn_vec1d_operand"
+  (match_code "const_vector")
+{
+  if (!gcn_inline_constant_p (op))
+    return false;
+
+  rtx elem = CONST_VECTOR_ELT (op, 0);
+  if (!CONST_DOUBLE_P (elem))
+    return false;
+  return real_identical (CONST_DOUBLE_REAL_VALUE (elem), &dconst1);
+})
+
+(define_predicate "gcn_const1d_operand"
+  (match_code "const_double")
+{
+  return gcn_inline_constant_p (op)
+      && real_identical (CONST_DOUBLE_REAL_VALUE (op), &dconst1);
+})
+
+(define_predicate "gcn_32bit_immediate_operand"
+  (match_code "const_int,const_double,const_vector,symbol_ref,label_ref")
+{
+  return gcn_constant_p (op);
+})
+
+; LRA works more smoothly when exec values are immediate constants
+; prior to register allocation.
+(define_predicate "gcn_exec_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_code "const_int")))
+
+(define_predicate "gcn_exec_reg_operand"
+  (match_operand 0 "register_operand"))
+
+(define_predicate "gcn_load_operand"
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_operand 0 "gcn_32bit_immediate_operand")))
+
+(define_predicate "gcn_alu_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "gcn_32bit_immediate_operand")))
+
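+; The destination and source 0 of a VALU operation may also be LDS/GDS
+; memory; this is how the DS read-modify-write instructions are
+; described.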
+(define_predicate "gcn_ds_memory_operand"
+  (and (match_code "mem")
+       (and (match_test "AS_ANY_DS_P (MEM_ADDR_SPACE (op))")
+           (match_operand 0 "memory_operand"))))
+
+(define_predicate "gcn_valu_dst_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "gcn_ds_memory_operand")))
+
+(define_predicate "gcn_valu_src0_operand"
+  (ior (match_operand 0 "register_operand")
+       (ior (match_operand 0 "gcn_32bit_immediate_operand")
+           (match_operand 0 "gcn_ds_memory_operand"))))
+
+(define_predicate "gcn_valu_src1_operand"
+  (match_operand 0 "register_operand"))
+
+(define_predicate "gcn_valu_src1com_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "gcn_32bit_immediate_operand")))
+
+(define_predicate "gcn_conditional_operator"
+  (match_code "eq,ne"))
+
+(define_predicate "gcn_compare_64bit_operator"
+  (match_code "eq,ne"))
+
+(define_predicate "gcn_compare_operator"
+  (match_code "eq,ne,gt,ge,lt,le,gtu,geu,ltu,leu"))
+
+(define_predicate "gcn_fp_compare_operator"
+  (match_code "eq,ne,gt,ge,lt,le,gtu,geu,ltu,leu,ordered,unordered"))
+
+(define_predicate "unary_operator"
+  (match_code "not,popcount"))
+
+(define_predicate "binary_operator"
+  (match_code "and,ior,xor,ashift,lshiftrt,ashiftrt,smin,smax,umin,umax"))
+
+(define_predicate "gcn_unspec_operand"
+  (and (match_code "unspec")
+       (match_test "XINT (op, 1) == UNSPEC_VECTOR")))
+
+(define_predicate "general_or_unspec_operand"
+  (ior (match_operand 0 "general_operand")
+       (and (match_code "unspec")
+            (match_test "XINT (op, 1) == UNSPEC_VECTOR"))))
+
+(define_predicate "gcn_register_or_unspec_operand"
+  (ior (match_operand 0 "register_operand")
+       (and (match_code "unspec")
+            (match_test "XINT (op, 1) == UNSPEC_VECTOR"))))
+
+(define_predicate "gcn_alu_or_unspec_operand"
+  (ior (match_operand 0 "gcn_alu_operand")
+       (and (match_code "unspec")
+            (match_test "XINT (op, 1) == UNSPEC_VECTOR"))))
+
+(define_predicate "gcn_register_ds_or_unspec_operand"
+  (ior (match_operand 0 "register_operand")
+       (ior (match_operand 0 "gcn_ds_memory_operand")
+           (and (match_code "unspec")
+              (match_test "XINT (op, 1) == UNSPEC_VECTOR")))))