[BRIGFE] phsa-specific optimizations
authorPekka Jääskeläinen <visit0r@gcc.gnu.org>
Fri, 4 May 2018 19:43:57 +0000 (19:43 +0000)
committerPekka Jääskeläinen <visit0r@gcc.gnu.org>
Fri, 4 May 2018 19:43:57 +0000 (19:43 +0000)
Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
is given, these optimizations are disabled.

With this flag, gccbrig can generate GENERIC that assumes we are
targeting a phsa-runtime based implementation, which allows us
to expose the work-item context accesses to retrieve WI IDs etc.
which helps optimizers.

First optimization that takes advantage of this is to get rid of
the setworkitemid calls whenever we have non-inlined calls that
use IDs internally.

Other optimizations added in this commit:

- expand absoluteid to similar level of simplicity as workitemid.
At the moment absoluteid is the best indexing ID to end up with
WG vectorization.
- propagate ID variables closer to their uses. This is mainly
to avoid known useless casts, which confuse at least scalar
evolution analysis.
- use signed long long for storing IDs. Unsigned integers have
defined wraparound semantics, which confuse at least scalar
evolution analysis, leading to unvectorizable WI loops.
- also refactor some BRIG function generation helpers to brig_function.
- no point in having the wi-loop as a for-loop. It's really
a do...while and SCEV can analyze it just fine still.
- add consts to ptrs etc. in BRIG builtin defs.
Improves optimization opportunities.
- add qualifiers to generated function parameters.
Const and restrict on the hidden local/private pointers,
the arg buffer and the context pointer help some optimizations.

From-SVN: r259957

25 files changed:
gcc/ChangeLog
gcc/brig-builtins.def
gcc/brig/ChangeLog
gcc/brig/brigfrontend/brig-basic-inst-handler.cc
gcc/brig/brigfrontend/brig-branch-inst-handler.cc
gcc/brig/brigfrontend/brig-cmp-inst-handler.cc
gcc/brig/brigfrontend/brig-code-entry-handler.cc
gcc/brig/brigfrontend/brig-code-entry-handler.h
gcc/brig/brigfrontend/brig-control-handler.cc
gcc/brig/brigfrontend/brig-cvt-inst-handler.cc
gcc/brig/brigfrontend/brig-function-handler.cc
gcc/brig/brigfrontend/brig-function.cc
gcc/brig/brigfrontend/brig-function.h
gcc/brig/brigfrontend/brig-label-handler.cc
gcc/brig/brigfrontend/brig-lane-inst-handler.cc
gcc/brig/brigfrontend/brig-mem-inst-handler.cc
gcc/brig/brigfrontend/phsa.h
gcc/brig/lang.opt
gcc/builtin-types.def
gcc/testsuite/ChangeLog
gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail
libhsail-rt/ChangeLog
libhsail-rt/include/internal/phsa-rt.h
libhsail-rt/include/internal/workitems.h
libhsail-rt/rt/workitems.c

index 150639cb474c2750f2bc811c87c84138686f8981..47862a80a0dc62d8337a073fd56f3ca9b2f52448 100644 (file)
@@ -1,3 +1,9 @@
+2018-05-04  Pekka Jääskeläinen  <pekka.jaaskelainen@parmance.com>
+
+       * brig-builtins.def: Add consts to ptrs etc. in BRIG builtin defs.
+       To improve optimization opportunities.
+       * builtin-types.def: The new needed builtin types for the above.
+
 2018-05-04  Richard Biener  <rguenther@suse.de>
 
        * bb-reorder.c (sanitize_hot_paths): Release hot_bbs_to_check.
index f94f7e62bb25f55808ac65594b391e07eedf924b..c2e8d2c034d7b538b4a826a4641ce2034a6b4c33 100644 (file)
@@ -45,25 +45,25 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_GRIDSIZE, BRIG_OPCODE_GRIDSIZE,
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U32,
                  BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U32,
-                 "__hsail_workitemflatabsid_u32", BT_FN_UINT_PTR,
-                 ATTR_NOTHROW_LEAF_LIST)
+                 "__hsail_workitemflatabsid_u32", BT_FN_UINT_CONST_PTR,
+                 ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U64,
                  BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U64,
-                 "__hsail_workitemflatabsid_u64", BT_FN_ULONG_PTR,
-                 ATTR_NOTHROW_LEAF_LIST)
+                 "__hsail_workitemflatabsid_u64", BT_FN_ULONG_CONST_PTR,
+                 ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATID, BRIG_OPCODE_WORKITEMFLATID,
-                 BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_PTR,
-                 ATTR_NOTHROW_LEAF_LIST)
+                 BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_CONST_PTR,
+                 ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMID, BRIG_OPCODE_WORKITEMID,
-                 BRIG_TYPE_U32, "__hsail_workitemid", BT_FN_UINT_UINT_PTR,
-                 ATTR_NOTHROW_LEAF_LIST)
+                 BRIG_TYPE_U32, "__hsail_workitemid",
+                 BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPID, BRIG_OPCODE_WORKGROUPID,
-                 BRIG_TYPE_U32, "__hsail_workgroupid", BT_FN_UINT_UINT_PTR,
-                 ATTR_PURE_NOTHROW_LEAF_LIST)
+                 BRIG_TYPE_U32, "__hsail_workgroupid",
+                 BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKITEMFLATID,
                  BRIG_OPCODE_CURRENTWORKITEMFLATID,
@@ -90,11 +90,12 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_PACKETCOMPLETIONSIG_SIG32,
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE,
                  BRIG_OPCODE_CURRENTWORKGROUPSIZE, BRIG_TYPE_U32,
-                 "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_PTR,
+                 "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_CONST_PTR,
                  ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPSIZE, BRIG_OPCODE_WORKGROUPSIZE,
-                 BRIG_TYPE_U32, "__hsail_workgroupsize", BT_FN_UINT_UINT_PTR,
+                 BRIG_TYPE_U32, "__hsail_workgroupsize",
+                 BT_FN_UINT_UINT_CONST_PTR,
                  ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_DIM, BRIG_OPCODE_DIM,
@@ -565,7 +566,7 @@ DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_SETWORKITEMID, "__hsail_setworkitemid",
 
 DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_WG_FUNC,
                       "__hsail_launch_wg_function",
-                      BT_FN_VOID_PTR_PTR_PTR, ATTR_NOTHROW_LEAF_LIST)
+                      BT_FN_VOID_PTR_PTR_UINT32, ATTR_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_KERNEL,
                       "__hsail_launch_kernel",
index 732696420584efb0ba06afbe97fe72c1631dc58b..ce4aea615eb54cb6acaa2f8922d4755f2031faf4 100644 (file)
@@ -1,3 +1,46 @@
+2018-05-04  Pekka Jääskeläinen  <pekka.jaaskelainen@parmance.com>
+
+       Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
+       is given, these optimizations are disabled.  With this flag, gccbrig
+       can generate GENERIC that assumes we are targeting a phsa-runtime
+       based implementation, which allows us to expose the work-item context
+       accesses to retrieve WI IDs etc.  which helps optimizers.
+       First optimization that takes advantage of this is to get rid of
+       the setworkitemid calls whenever we have non-inlined calls that
+       use IDs internally.  Other optimizations added in this commit:
+       - expand absoluteid to similar level of simplicity as workitemid.
+       At the moment absoluteid is the best indexing ID to end up with
+       WG vectorization.
+       - propagate ID variables closer to their uses. This is mainly
+       to avoid known useless casts, which confuse at least scalar
+       evolution analysis.
+       - use signed long long for storing IDs. Unsigned integers have
+       defined wraparound semantics, which confuse at least scalar
+       evolution analysis, leading to unvectorizable WI loops.
+       - also refactor some BRIG function generation helpers to brig_function.
+       - no point in having the wi-loop as a for-loop. It's really
+       a do...while and SCEV can analyze it just fine still.
+       - add consts to ptrs etc. in BRIG builtin defs.
+       Improves optimization opportunities.
+       - add qualifiers to generated function parameters.
+       Const and restrict on the hidden local/private pointers,
+       the arg buffer and the context pointer help some optimizations.
+       * brig/brigfrontend/brig-basic-inst-handler.cc: See above.
+       * brig/brigfrontend/brig-branch-inst-handler.cc: See above.
+       * brig/brigfrontend/brig-cmp-inst-handler.cc: See above.
+       * brig/brigfrontend/brig-code-entry-handler.cc: See above.
+       * brig/brigfrontend/brig-code-entry-handler.h: See above.
+       * brig/brigfrontend/brig-control-handler.cc: See above.
+       * brig/brigfrontend/brig-cvt-inst-handler.cc: See above.
+       * brig/brigfrontend/brig-function-handler.cc: See above.
+       * brig/brigfrontend/brig-function.cc: See above.
+       * brig/brigfrontend/brig-function.h: See above.
+       * brig/brigfrontend/brig-label-handler.cc: See above.
+       * brig/brigfrontend/brig-lane-inst-handler.cc: See above.
+       * brig/brigfrontend/brig-mem-inst-handler.cc: See above.
+       * brig/brigfrontend/phsa.h: See above.
+       * brig/lang.opt: See above.
+
 2018-05-04  Pekka Jääskeläinen  <pekka.jaaskelainen@parmance.com>
 
        * brig/brigfrontend/brig-function-handler.cc: Skip multiple forward
index 283da7ac80ea93291e11fb9aa63dcb6e1c079b58..c8224ae6a51209c7b7f4b76983a0a3f2e46f6823 100644 (file)
@@ -105,7 +105,8 @@ brig_basic_inst_handler::build_shuffle (tree arith_type,
   /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
      from which to construct the mask vector as understood by
      VEC_PERM_EXPR.  */
-  tree mask_operand = add_temp_var ("shuffle_mask", operands[2]);
+  tree mask_operand
+    = m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);
 
   tree mask_element_type
     = build_nonstandard_integer_type (input_mask_element_size, true);
@@ -219,10 +220,11 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
   tree wide_type = build_nonstandard_integer_type (vecsize, 1);
 
   tree src_vect = build_resize_convert_view (wide_type, operands[0]);
-  src_vect = add_temp_var ("src_vect", src_vect);
+  src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);
 
   tree scalar = operands[1];
-  scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar));
+  scalar = m_parent.m_cf->add_temp_var ("scalar",
+                                       convert_to_integer (wide_type, scalar));
 
   tree pos = operands[2];
 
@@ -230,21 +232,22 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
      Zero them for well-defined semantics.  */
   tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
                   build_int_cstu (TREE_TYPE (pos), ecount - 1));
-  pos = add_temp_var ("pos", convert (wide_type, t));
+  pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));
 
   tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
   size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
   tree ewidth = build_int_cstu (wide_type, element_width);
 
   tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
-  bitoffset = add_temp_var ("offset", bitoffset);
+  bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);
 
   uint64_t mask_int
     = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
 
   tree mask = build_int_cstu (wide_type, mask_int);
 
-  mask = add_temp_var ("mask", convert_to_integer (wide_type, mask));
+  mask = m_parent.m_cf->add_temp_var ("mask",
+                                     convert_to_integer (wide_type, mask));
 
   tree clearing_mask
     = build1 (BIT_NOT_EXPR, wide_type,
@@ -311,7 +314,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
                                          tree arith_type,
                                          tree_stl_vec &operands)
 {
-  tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
+  tree_code opcode
+    = brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
 
   BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
 
@@ -388,8 +392,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
             on which cannot be used in general to remain HSAIL compliant.
             Perhaps a builtin call would be better option here.  */
          return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
-                        expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type,
-                                                arith_type, operands));
+                        m_parent.m_cf->expand_or_call_builtin
+                        (BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
        }
       else if (brig_opcode == BRIG_OPCODE_NRCP)
        {
@@ -410,8 +414,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
        gcc_unreachable ();
     }
   else if (opcode == CALL_EXPR)
-    return expand_or_call_builtin (brig_opcode, brig_type, arith_type,
-                                  operands);
+    return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
+                                                 arith_type, operands);
   else if (output_count == 1)
     {
       if (input_count == 1)
@@ -520,7 +524,8 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
     in_operands[0] = build_lower_element_broadcast (in_operands[0]);
 
   tree_code opcode
-    = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type);
+    = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
+                                                  brig_inst_type);
 
   if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
     {
@@ -566,11 +571,11 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
       */
       tree_stl_vec operand0_elements;
       if (input_count > 0)
-       unpack (in_operands[0], operand0_elements);
+       m_parent.m_cf->unpack (in_operands[0], operand0_elements);
 
       tree_stl_vec operand1_elements;
       if (input_count > 1)
-       unpack (in_operands[1], operand1_elements);
+       m_parent.m_cf->unpack (in_operands[1], operand1_elements);
 
       tree_stl_vec result_elements;
 
@@ -617,7 +622,7 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
 
          result_elements.push_back (convert (scalar_type, scalar_expr));
        }
-      instr_expr = pack (result_elements);
+      instr_expr = m_parent.m_cf->pack (result_elements);
     }
   else
     {
@@ -728,140 +733,3 @@ brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
                 vec_operand, mask);
 }
 
-/* Returns the tree code that should be used to implement the given
-   HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
-   (BRIG_TYPE).  In case the opcode cannot be mapped to a TREE node directly,
-   returns TREE_LIST (if it can be emulated with a simple chain of tree
-   nodes) or CALL_EXPR if the opcode should be implemented using a builtin
-   call.  */
-
-tree_code
-brig_basic_inst_handler::get_tree_code_for_hsa_opcode
-  (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
-{
-  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
-  switch (brig_opcode)
-    {
-    case BRIG_OPCODE_NOP:
-      return NOP_EXPR;
-    case BRIG_OPCODE_ADD:
-      return PLUS_EXPR;
-    case BRIG_OPCODE_CMOV:
-      if (brig_inner_type == brig_type)
-       return COND_EXPR;
-      else
-       return VEC_COND_EXPR;
-    case BRIG_OPCODE_SUB:
-      return MINUS_EXPR;
-    case BRIG_OPCODE_MUL:
-    case BRIG_OPCODE_MUL24:
-      return MULT_EXPR;
-    case BRIG_OPCODE_MULHI:
-    case BRIG_OPCODE_MUL24HI:
-      return MULT_HIGHPART_EXPR;
-    case BRIG_OPCODE_DIV:
-      if (gccbrig_is_float_type (brig_inner_type))
-       return RDIV_EXPR;
-      else
-       return TRUNC_DIV_EXPR;
-    case BRIG_OPCODE_NEG:
-      return NEGATE_EXPR;
-    case BRIG_OPCODE_MIN:
-      if (gccbrig_is_float_type (brig_inner_type))
-       return CALL_EXPR;
-      else
-       return MIN_EXPR;
-    case BRIG_OPCODE_MAX:
-      if (gccbrig_is_float_type (brig_inner_type))
-       return CALL_EXPR;
-      else
-       return MAX_EXPR;
-    case BRIG_OPCODE_FMA:
-      return FMA_EXPR;
-    case BRIG_OPCODE_ABS:
-      return ABS_EXPR;
-    case BRIG_OPCODE_SHL:
-      return LSHIFT_EXPR;
-    case BRIG_OPCODE_SHR:
-      return RSHIFT_EXPR;
-    case BRIG_OPCODE_OR:
-      return BIT_IOR_EXPR;
-    case BRIG_OPCODE_XOR:
-      return BIT_XOR_EXPR;
-    case BRIG_OPCODE_AND:
-      return BIT_AND_EXPR;
-    case BRIG_OPCODE_NOT:
-      return BIT_NOT_EXPR;
-    case BRIG_OPCODE_RET:
-      return RETURN_EXPR;
-    case BRIG_OPCODE_MOV:
-    case BRIG_OPCODE_LDF:
-      return MODIFY_EXPR;
-    case BRIG_OPCODE_LD:
-    case BRIG_OPCODE_ST:
-      return MEM_REF;
-    case BRIG_OPCODE_BR:
-      return GOTO_EXPR;
-    case BRIG_OPCODE_REM:
-      if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
-       return TRUNC_MOD_EXPR;
-      else
-       return CALL_EXPR;
-    case BRIG_OPCODE_NRCP:
-    case BRIG_OPCODE_NRSQRT:
-      /* Implement as 1/f (x).  gcc should pattern detect that and
-        use a native instruction, if available, for it.  */
-      return TREE_LIST;
-    case BRIG_OPCODE_FLOOR:
-    case BRIG_OPCODE_CEIL:
-    case BRIG_OPCODE_SQRT:
-    case BRIG_OPCODE_NSQRT:
-    case BRIG_OPCODE_RINT:
-    case BRIG_OPCODE_TRUNC:
-    case BRIG_OPCODE_POPCOUNT:
-    case BRIG_OPCODE_COPYSIGN:
-    case BRIG_OPCODE_NCOS:
-    case BRIG_OPCODE_NSIN:
-    case BRIG_OPCODE_NLOG2:
-    case BRIG_OPCODE_NEXP2:
-    case BRIG_OPCODE_NFMA:
-      /* Class has type B1 regardless of the float type, thus
-        the below builtin map search cannot find it.  */
-    case BRIG_OPCODE_CLASS:
-    case BRIG_OPCODE_WORKITEMABSID:
-      return CALL_EXPR;
-    default:
-
-      /* Some BRIG opcodes can use the same builtins for unsigned and
-        signed types.  Force these cases to unsigned types.
-      */
-
-      if (brig_opcode == BRIG_OPCODE_BORROW
-         || brig_opcode == BRIG_OPCODE_CARRY
-         || brig_opcode == BRIG_OPCODE_LASTBIT
-         || brig_opcode == BRIG_OPCODE_BITINSERT)
-       {
-         if (brig_type == BRIG_TYPE_S32)
-           brig_type = BRIG_TYPE_U32;
-         else if (brig_type == BRIG_TYPE_S64)
-           brig_type = BRIG_TYPE_U64;
-       }
-
-
-      builtin_map::const_iterator i
-       = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
-      if (i != s_custom_builtins.end ())
-       return CALL_EXPR;
-      else if (s_custom_builtins.find
-              (std::make_pair (brig_opcode, brig_inner_type))
-              != s_custom_builtins.end ())
-       return CALL_EXPR;
-      if (brig_inner_type == BRIG_TYPE_F16
-         && s_custom_builtins.find
-         (std::make_pair (brig_opcode, BRIG_TYPE_F32))
-         != s_custom_builtins.end ())
-       return CALL_EXPR;
-      break;
-    }
-  return TREE_LIST; /* Emulate using a chain of nodes.  */
-}
index 1340b74dd35f442728184c3f8ffe8e5e36911a73..b6baf13711b88243aba0b4289019701687205c1d 100644 (file)
@@ -119,10 +119,11 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
         memory.  */
 
       tree group_local_offset
-       = add_temp_var ("group_local_offset",
-                       build_int_cst
-                       (uint32_type_node,
-                        m_parent.m_cf->m_local_group_variables.size()));
+       = m_parent.m_cf->add_temp_var ("group_local_offset",
+                                      build_int_cst
+                                      (uint32_type_node,
+                                       m_parent.m_cf->
+                                       m_local_group_variables.size()));
 
       /* TODO: ensure the callee's frame is aligned!  */
 
@@ -152,6 +153,7 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
       m_parent.m_cf->m_called_functions.push_back (func_ref);
       if (DECL_EXTERNAL (func_ref))
        m_parent.add_decl_call (call);
+      m_parent.m_cf->start_new_bb ();
 
       return base->byteCount;
     }
@@ -216,18 +218,21 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
         ensure the barrier won't be duplicated or moved out of loops etc.
         Like the 'noduplicate' of LLVM.  Same goes for fbarriers.  */
       m_parent.m_cf->append_statement
-       (expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE, NULL_TREE,
-                                call_operands));
+       (m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
+                                               BRIG_TYPE_NONE, NULL_TREE,
+                                               call_operands));
     }
   else if (brig_inst->opcode >= BRIG_OPCODE_ARRIVEFBAR
           && brig_inst->opcode <= BRIG_OPCODE_WAITFBAR)
     {
       m_parent.m_cf->m_has_barriers = true;
       m_parent.m_cf->append_statement
-       (expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE,
-                                uint32_type_node, operands));
+       (m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
+                                               BRIG_TYPE_NONE,
+                                               uint32_type_node, operands));
     }
   else
     gcc_unreachable ();
+  m_parent.m_cf->start_new_bb ();
   return base->byteCount;
 }
index 1155ead9c079d1ccbed11f55b2325e2d7c3458c6..729e3fd0b226b6eccf58007ac69bea805c2513b8 100644 (file)
@@ -180,17 +180,17 @@ brig_cmp_inst_handler::operator () (const BrigBase *base)
         results, we must now truncate the result vector to S16s so it
         fits to the destination register.  We can build the target vector
         type from the f16 storage type (unsigned ints).  */
-      expr = add_temp_var ("wide_cmp_result", expr);
+      expr = m_parent.m_cf->add_temp_var ("wide_cmp_result", expr);
       tree_stl_vec wide_elements;
       tree_stl_vec shrunk_elements;
-      unpack (expr, wide_elements);
+      m_parent.m_cf->unpack (expr, wide_elements);
       for (size_t i = 0; i < wide_elements.size (); ++i)
        {
          tree wide = wide_elements.at (i);
          shrunk_elements.push_back
            (convert_to_integer (short_integer_type_node, wide));
        }
-      expr = pack (shrunk_elements);
+      expr = m_parent.m_cf->pack (shrunk_elements);
     }
   build_output_assignment (*inst_base, operands[0], expr);
 
index 36a8deb403d1a84168b78afd28c04a53d1c22c2e..4fa37fd7a4b0940dc7434691a94e26a741168ee6 100644 (file)
 #include "brig-builtins.h"
 #include "fold-const.h"
 
-brig_code_entry_handler::builtin_map brig_code_entry_handler::s_custom_builtins;
-
 brig_code_entry_handler::brig_code_entry_handler (brig_to_generic &parent)
   : brig_entry_handler (parent)
 {
-  if (s_custom_builtins.size () > 0) return;
-
-  /* Populate the builtin index.  */
-#undef DEF_HSAIL_ATOMIC_BUILTIN
-#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
-#undef DEF_HSAIL_INTR_BUILTIN
-#undef DEF_HSAIL_SAT_BUILTIN
-#undef DEF_HSAIL_BUILTIN
-#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \
-  s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)]         \
-    = builtin_decl_explicit (ENUM);
-
-#include "brig-builtins.def"
 }
 
 /* Build a tree operand which is a reference to a piece of code.  REF is the
@@ -301,18 +286,18 @@ brig_code_entry_handler::build_address_operand
 
          tree local_size
            = build2 (MULT_EXPR, uint32_type_node,
-                     expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
-                                             BRIG_TYPE_U32,
-                                             uint32_type_node, uint32_0),
-                     expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
-                                             BRIG_TYPE_U32,
-                                             uint32_type_node, uint32_1));
+                     m_parent.m_cf->expand_or_call_builtin
+                     (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+                      uint32_type_node, uint32_0),
+                     m_parent.m_cf->expand_or_call_builtin
+                     (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+                      uint32_type_node, uint32_1));
 
          local_size
            = build2 (MULT_EXPR, uint32_type_node,
-                     expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
-                                             BRIG_TYPE_U32,
-                                             uint32_type_node, uint32_2),
+                     m_parent.m_cf->expand_or_call_builtin
+                     (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+                      uint32_type_node, uint32_2),
                      local_size);
 
          tree var_region
@@ -324,9 +309,9 @@ brig_code_entry_handler::build_address_operand
            = build2 (MULT_EXPR, uint32_type_node,
                      build_int_cst (uint32_type_node,
                                     m_parent.private_variable_size (var_name)),
-                     expand_or_call_builtin (BRIG_OPCODE_WORKITEMFLATID,
-                                             BRIG_TYPE_U32,
-                                             uint32_type_node, operands));
+                     m_parent.m_cf->expand_or_call_builtin
+                     (BRIG_OPCODE_WORKITEMFLATID, BRIG_TYPE_U32,
+                      uint32_type_node, operands));
 
          tree var_offset
            = build2 (PLUS_EXPR, uint32_type_node, var_region, pos);
@@ -336,8 +321,9 @@ brig_code_entry_handler::build_address_operand
             offset to a flat address by adding it as an offset to a (private
             or group) base pointer later on.  Same applies to group_var_offset.  */
          symbol_base
-           = add_temp_var ("priv_var_offset",
-                           convert (size_type_node, var_offset));
+           = m_parent.m_cf->add_temp_var ("priv_var_offset",
+                                          convert (size_type_node,
+                                                   var_offset));
        }
       else if (segment == BRIG_SEGMENT_ARG)
        {
@@ -699,138 +685,6 @@ brig_code_entry_handler::get_tree_expr_type_for_hsa_type
     return gccbrig_tree_type_for_hsa_type (brig_type);
 }
 
-/* In case the HSA instruction must be implemented using a builtin,
-   this function is called to get the correct builtin function.
-   TYPE is the instruction tree type, BRIG_OPCODE the opcode of the
-   brig instruction and BRIG_TYPE the brig instruction's type.  */
-
-tree
-brig_code_entry_handler::get_builtin_for_hsa_opcode
-  (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
-{
-  tree builtin = NULL_TREE;
-  tree builtin_type = type;
-
-  /* For vector types, first find the scalar version of the builtin.  */
-  if (type != NULL_TREE && VECTOR_TYPE_P (type))
-    builtin_type = TREE_TYPE (type);
-  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
-
-  /* Some BRIG opcodes can use the same builtins for unsigned and
-     signed types.  Force these cases to unsigned types.  */
-
-  if (brig_opcode == BRIG_OPCODE_BORROW
-      || brig_opcode == BRIG_OPCODE_CARRY
-      || brig_opcode == BRIG_OPCODE_LASTBIT
-      || brig_opcode == BRIG_OPCODE_BITINSERT)
-    {
-      if (brig_type == BRIG_TYPE_S32)
-       brig_type = BRIG_TYPE_U32;
-      else if (brig_type == BRIG_TYPE_S64)
-       brig_type = BRIG_TYPE_U64;
-    }
-
-  switch (brig_opcode)
-    {
-    case BRIG_OPCODE_FLOOR:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_FLOOR);
-      break;
-    case BRIG_OPCODE_CEIL:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_CEIL);
-      break;
-    case BRIG_OPCODE_SQRT:
-    case BRIG_OPCODE_NSQRT:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_SQRT);
-      break;
-    case BRIG_OPCODE_RINT:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_RINT);
-      break;
-    case BRIG_OPCODE_TRUNC:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_TRUNC);
-      break;
-    case BRIG_OPCODE_COPYSIGN:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_COPYSIGN);
-      break;
-    case BRIG_OPCODE_NSIN:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_SIN);
-      break;
-    case BRIG_OPCODE_NLOG2:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_LOG2);
-      break;
-    case BRIG_OPCODE_NEXP2:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_EXP2);
-      break;
-    case BRIG_OPCODE_NFMA:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_FMA);
-      break;
-    case BRIG_OPCODE_NCOS:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_COS);
-      break;
-    case BRIG_OPCODE_POPCOUNT:
-      /* Popcount should be typed by its argument type (the return value
-        is always u32).  Let's use a b64 version for also for b32 for now.  */
-      return builtin_decl_explicit (BUILT_IN_POPCOUNTL);
-    case BRIG_OPCODE_BORROW:
-      /* Borrow uses the same builtin for unsigned and signed types.  */
-      if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
-       return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U32);
-      else
-       return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U64);
-    case BRIG_OPCODE_CARRY:
-      /* Carry also uses the same builtin for unsigned and signed types.  */
-      if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
-       return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U32);
-      else
-       return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U64);
-    default:
-
-      /* Use our builtin index for finding a proper builtin for the BRIG
-        opcode and BRIG type.  This takes care most of the builtin cases,
-        the special cases are handled in the separate 'case' statements
-        above.  */
-      builtin_map::const_iterator i
-       = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
-      if (i != s_custom_builtins.end ())
-       return (*i).second;
-
-      if (brig_inner_type != brig_type)
-       {
-         /* Try to find a scalar built-in we could use.  */
-         i = s_custom_builtins.find
-           (std::make_pair (brig_opcode, brig_inner_type));
-         if (i != s_custom_builtins.end ())
-           return (*i).second;
-       }
-
-      /* In case this is an fp16 operation that is promoted to fp32,
-        try to find a fp32 scalar built-in.  */
-      if (brig_inner_type == BRIG_TYPE_F16)
-       {
-         i = s_custom_builtins.find
-           (std::make_pair (brig_opcode, BRIG_TYPE_F32));
-         if (i != s_custom_builtins.end ())
-           return (*i).second;
-       }
-      gcc_unreachable ();
-    }
-
-  if (VECTOR_TYPE_P (type) && builtin != NULL_TREE)
-    {
-      /* Try to find a vectorized version of the built-in.
-        TODO: properly assert that builtin is a mathfn builtin? */
-      tree vec_builtin
-       = targetm.vectorize.builtin_vectorized_function
-       (builtin_mathfn_code (builtin), type, type);
-      if (vec_builtin != NULL_TREE)
-       return vec_builtin;
-      else
-       return builtin;
-    }
-  if (builtin == NULL_TREE)
-    gcc_unreachable ();
-  return builtin;
-}
-
 /* Return the correct GENERIC type for storing comparison results
    of operand with the type given in SOURCE_TYPE.  */
 
@@ -848,264 +702,6 @@ brig_code_entry_handler::get_comparison_result_type (tree source_type)
     return gccbrig_tree_type_for_hsa_type (BRIG_TYPE_B1);
 }
 
-/* Returns true in case the given opcode needs to know about work-item context
-   data.  In such case the context data is passed as a pointer to a work-item
-   context object, as the last argument in the builtin call.  */
-
-bool
-brig_code_entry_handler::needs_workitem_context_data
-  (BrigOpcode16_t brig_opcode) const
-{
-  switch (brig_opcode)
-    {
-    case BRIG_OPCODE_WORKITEMABSID:
-    case BRIG_OPCODE_WORKITEMFLATABSID:
-    case BRIG_OPCODE_WORKITEMFLATID:
-    case BRIG_OPCODE_CURRENTWORKITEMFLATID:
-    case BRIG_OPCODE_WORKITEMID:
-    case BRIG_OPCODE_WORKGROUPID:
-    case BRIG_OPCODE_WORKGROUPSIZE:
-    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
-    case BRIG_OPCODE_GRIDGROUPS:
-    case BRIG_OPCODE_GRIDSIZE:
-    case BRIG_OPCODE_DIM:
-    case BRIG_OPCODE_PACKETID:
-    case BRIG_OPCODE_PACKETCOMPLETIONSIG:
-    case BRIG_OPCODE_BARRIER:
-    case BRIG_OPCODE_WAVEBARRIER:
-    case BRIG_OPCODE_ARRIVEFBAR:
-    case BRIG_OPCODE_INITFBAR:
-    case BRIG_OPCODE_JOINFBAR:
-    case BRIG_OPCODE_LEAVEFBAR:
-    case BRIG_OPCODE_RELEASEFBAR:
-    case BRIG_OPCODE_WAITFBAR:
-    case BRIG_OPCODE_CUID:
-    case BRIG_OPCODE_MAXCUID:
-    case BRIG_OPCODE_DEBUGTRAP:
-    case BRIG_OPCODE_GROUPBASEPTR:
-    case BRIG_OPCODE_KERNARGBASEPTR:
-    case BRIG_OPCODE_ALLOCA:
-      return true;
-    default:
-      return false;
-    };
-}
-
-/* Returns true in case the given opcode that would normally be generated
-   as a builtin call can be expanded to tree nodes.  */
-
-bool
-brig_code_entry_handler::can_expand_builtin (BrigOpcode16_t brig_opcode) const
-{
-  switch (brig_opcode)
-    {
-    case BRIG_OPCODE_WORKITEMFLATABSID:
-    case BRIG_OPCODE_WORKITEMFLATID:
-    case BRIG_OPCODE_WORKITEMABSID:
-    case BRIG_OPCODE_WORKGROUPSIZE:
-    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
-      /* TODO: expand more builtins.  */
-      return true;
-    default:
-      return false;
-    };
-}
-
-/* Try to expand the given builtin call to reuse a previously generated
-   variable, if possible.  If not, just call the given builtin.
-   BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type,
-   ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's
-   input operands.  */
-
-tree
-brig_code_entry_handler::expand_or_call_builtin (BrigOpcode16_t brig_opcode,
-                                                BrigType16_t brig_type,
-                                                tree arith_type,
-                                                tree_stl_vec &operands)
-{
-  if (m_parent.m_cf->m_is_kernel && can_expand_builtin (brig_opcode))
-    return expand_builtin (brig_opcode, operands);
-
-  tree built_in
-    = get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type);
-
-  if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in)))
-      && arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type)
-      && brig_opcode != BRIG_OPCODE_LERP
-      && brig_opcode != BRIG_OPCODE_PACKCVT
-      && brig_opcode != BRIG_OPCODE_SAD
-      && brig_opcode != BRIG_OPCODE_SADHI)
-    {
-      /* Call the scalar built-in for all elements in the vector.  */
-      tree_stl_vec operand0_elements;
-      if (operands.size () > 0)
-       unpack (operands[0], operand0_elements);
-
-      tree_stl_vec operand1_elements;
-      if (operands.size () > 1)
-       unpack (operands[1], operand1_elements);
-
-      tree_stl_vec result_elements;
-
-      size_t element_count = gccbrig_type_vector_subparts (arith_type);
-      for (size_t i = 0; i < element_count; ++i)
-       {
-         tree_stl_vec call_operands;
-         if (operand0_elements.size () > 0)
-           call_operands.push_back (operand0_elements.at (i));
-
-         if (operand1_elements.size () > 0)
-           call_operands.push_back (operand1_elements.at (i));
-
-         result_elements.push_back
-           (expand_or_call_builtin (brig_opcode, brig_type,
-                                    TREE_TYPE (arith_type),
-                                    call_operands));
-       }
-      return pack (result_elements);
-    }
-
-  tree_stl_vec call_operands;
-  tree_stl_vec operand_types;
-
-  tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in));
-
-  for (size_t i = 0; i < operands.size (); ++i)
-    {
-      tree operand_type = TREE_VALUE (arg_type_chain);
-      call_operands.push_back (convert (operand_type, operands[i]));
-      operand_types.push_back (operand_type);
-      arg_type_chain = TREE_CHAIN (arg_type_chain);
-    }
-
-  if (needs_workitem_context_data (brig_opcode))
-    {
-      call_operands.push_back (m_parent.m_cf->m_context_arg);
-      operand_types.push_back (ptr_type_node);
-      m_parent.m_cf->m_has_unexpanded_dp_builtins = true;
-    }
-
-  size_t operand_count = call_operands.size ();
-
-  call_operands.resize (4, NULL_TREE);
-  operand_types.resize (4, NULL_TREE);
-  for (size_t i = 0; i < operand_count; ++i)
-    call_operands.at (i) = build_resize_convert_view (operand_types.at (i),
-                                                     call_operands.at (i));
-
-  tree fnptr = build_fold_addr_expr (built_in);
-  return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr,
-                          operand_count, &call_operands[0]);
-}
-
-/* Instead of calling a built-in, reuse a previously returned value known to
-   be still valid.  This is beneficial especially for the work-item
-   identification related builtins as not having them as calls can lead to
-   more easily vectorizable parallel loops for multi work-item work-groups.
-   BRIG_OPCODE identifies the builtin and OPERANDS store the operands.  */
-
-tree
-brig_code_entry_handler::expand_builtin (BrigOpcode16_t brig_opcode,
-                                        tree_stl_vec &operands)
-{
-  tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0));
-
-  tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1));
-
-  tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2));
-
-  if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID)
-    {
-      tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0);
-      id0 = convert (uint64_type_node, id0);
-
-      tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1);
-      id1 = convert (uint64_type_node, id1);
-
-      tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2);
-      id2 = convert (uint64_type_node, id2);
-
-      tree max0 = convert (uint64_type_node,
-                          m_parent.m_cf->m_grid_size_vars[0]);
-      tree max1 = convert (uint64_type_node,
-                          m_parent.m_cf->m_grid_size_vars[1]);
-
-      tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0);
-      id2_x_max0_x_max1
-       = build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1);
-
-      tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0);
-
-      tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0);
-      sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1);
-
-      return add_temp_var ("workitemflatabsid", sum);
-    }
-  else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID)
-    {
-      HOST_WIDE_INT dim = int_constant_value (operands[0]);
-
-      tree local_id_var = m_parent.m_cf->m_local_id_vars[dim];
-      tree wg_id_var = m_parent.m_cf->m_wg_id_vars[dim];
-      tree wg_size_var = m_parent.m_cf->m_wg_size_vars[dim];
-
-      tree wg_id_x_wg_size = build2 (MULT_EXPR, uint32_type_node,
-                                    convert (uint32_type_node, wg_id_var),
-                                    convert (uint32_type_node, wg_size_var));
-      tree sum
-       = build2 (PLUS_EXPR, uint32_type_node, wg_id_x_wg_size, local_id_var);
-
-      return add_temp_var (std::string ("workitemabsid_")
-                          + (char) ((int) 'x' + dim), sum);
-    }
-  else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID)
-    {
-      tree z_x_wgsx_wgsy
-       = build2 (MULT_EXPR, uint32_type_node,
-                 m_parent.m_cf->m_local_id_vars[2],
-                 m_parent.m_cf->m_wg_size_vars[0]);
-      z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy,
-                             m_parent.m_cf->m_wg_size_vars[1]);
-
-      tree y_x_wgsx
-       = build2 (MULT_EXPR, uint32_type_node,
-                 m_parent.m_cf->m_local_id_vars[1],
-                 m_parent.m_cf->m_wg_size_vars[0]);
-
-      tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy);
-      sum = build2 (PLUS_EXPR, uint32_type_node,
-                   m_parent.m_cf->m_local_id_vars[0],
-                   sum);
-      return add_temp_var ("workitemflatid", sum);
-    }
-  else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE)
-    {
-      HOST_WIDE_INT dim = int_constant_value (operands[0]);
-      return m_parent.m_cf->m_wg_size_vars[dim];
-    }
-  else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE)
-    {
-      HOST_WIDE_INT dim = int_constant_value (operands[0]);
-      return m_parent.m_cf->m_cur_wg_size_vars[dim];
-    }
-  else
-    gcc_unreachable ();
-
-  return NULL_TREE;
-}
-
-/* Appends and returns a new temp variable and an accompanying assignment
-   statement that stores the value of the given EXPR and has the given NAME.  */
-
-tree
-brig_code_entry_handler::add_temp_var (std::string name, tree expr)
-{
-  tree temp_var = create_tmp_var (TREE_TYPE (expr), name.c_str ());
-  tree assign = build2 (MODIFY_EXPR, TREE_TYPE (temp_var), temp_var, expr);
-  m_parent.m_cf->append_statement (assign);
-  return temp_var;
-}
-
 /* Creates a FP32 to FP16 conversion call, assuming the source and destination
    are FP32 type variables.  */
 
@@ -1387,7 +983,6 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
      variable type (can be any type; see get_m_var_declfor_reg @
      brig-function.cc).  */
   tree output_type = TREE_TYPE (output);
-  tree input_type = TREE_TYPE (inst_expr);
   bool is_fp16 = (brig_inst.type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
                 && brig_inst.base.kind != BRIG_KIND_INST_MEM
                 && !gccbrig_is_bit_operation (brig_inst.opcode);
@@ -1396,6 +991,13 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
   bool ftz = false;
   const BrigBase *base = &brig_inst.base;
 
+  if (m_parent.m_cf->is_id_val (inst_expr))
+    inst_expr = m_parent.m_cf->id_val (inst_expr);
+
+  tree input_type = TREE_TYPE (inst_expr);
+
+  m_parent.m_cf->add_reg_var_update (output, inst_expr);
+
   if (base->kind == BRIG_KIND_INST_MOD)
     {
       const BrigInstMod *mod = (const BrigInstMod *) base;
@@ -1418,13 +1020,13 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
     {
       /* Ensure we don't duplicate the arithmetics to the arguments of the bit
         field reference operators.  */
-      inst_expr = add_temp_var ("before_ftz", inst_expr);
+      inst_expr = m_parent.m_cf->add_temp_var ("before_ftz", inst_expr);
       inst_expr = flush_to_zero (is_fp16) (*this, inst_expr);
     }
 
   if (is_fp16)
     {
-      inst_expr = add_temp_var ("before_f2h", inst_expr);
+      inst_expr = m_parent.m_cf->add_temp_var ("before_f2h", inst_expr);
       tree f2h_output = build_f2h_conversion (inst_expr);
       tree conv = build_resize_convert_view (output_type, f2h_output);
       tree assign = build2 (MODIFY_EXPR, output_type, output, conv);
@@ -1486,62 +1088,6 @@ brig_code_entry_handler::append_statement (tree stmt)
   m_parent.m_cf->append_statement (stmt);
 }
 
-/* Unpacks the elements of the vector in VALUE to scalars (bit field
-   references) in ELEMENTS.  */
-
-void
-brig_code_entry_handler::unpack (tree value, tree_stl_vec &elements)
-{
-  size_t vec_size = int_size_in_bytes (TREE_TYPE (value));
-  size_t element_size
-    = int_size_in_bytes (TREE_TYPE (TREE_TYPE (value))) * BITS_PER_UNIT;
-  size_t element_count
-    = vec_size * BITS_PER_UNIT / element_size;
-
-  tree input_element_type = TREE_TYPE (TREE_TYPE (value));
-
-  value = add_temp_var ("unpack_input", value);
-
-  for (size_t i = 0; i < element_count; ++i)
-    {
-      tree element
-       = build3 (BIT_FIELD_REF, input_element_type, value,
-                 TYPE_SIZE (input_element_type),
-                 bitsize_int(i * element_size));
-
-      element = add_temp_var ("scalar", element);
-      elements.push_back (element);
-    }
-}
-
-/* Pack the elements of the scalars in ELEMENTS to the returned vector.  */
-
-tree
-brig_code_entry_handler::pack (tree_stl_vec &elements)
-{
-  size_t element_count = elements.size ();
-
-  gcc_assert (element_count > 1);
-
-  tree output_element_type = TREE_TYPE (elements.at (0));
-
-  vec<constructor_elt, va_gc> *constructor_vals = NULL;
-  for (size_t i = 0; i < element_count; ++i)
-    CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, elements.at (i));
-
-  tree vec_type = build_vector_type (output_element_type, element_count);
-
-  /* build_constructor creates a vector type which is not a vector_cst
-     that requires compile time constant elements.  */
-  tree vec = build_constructor (vec_type, constructor_vals);
-
-  /* Add a temp variable for readability.  */
-  tree tmp_var = create_tmp_var (vec_type, "vec_out");
-  tree vec_tmp_assign = build2 (MODIFY_EXPR, TREE_TYPE (tmp_var), tmp_var, vec);
-  m_parent.m_cf->append_statement (vec_tmp_assign);
-  return tmp_var;
-}
-
 /* Visits the element(s) in the OPERAND, calling HANDLER to each of them.  */
 
 tree
@@ -1757,4 +1303,3 @@ brig_code_entry_handler::int_constant_value (tree node)
     n = TREE_OPERAND (n, 0);
   return int_cst_value (n);
 }
-
index 3aa4d9eaa362a11bc49a98411258bb4913bcb0d1..1e082c436c60f18e0974a5ff65145cf6b905c92b 100644 (file)
@@ -35,8 +35,6 @@ class tree_element_unary_visitor;
 class brig_code_entry_handler : public brig_entry_handler
 {
 public:
-  typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
-
   brig_code_entry_handler (brig_to_generic &parent);
 
   /* Handles the brig_code data at the given pointer and adds it to the
@@ -51,8 +49,6 @@ protected:
   tree get_tree_expr_type_for_hsa_type (BrigType16_t brig_type) const;
   tree get_tree_cst_for_hsa_operand (const BrigOperandConstantBytes *brigConst,
                                     tree type) const;
-  tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
-                                  BrigType16_t brig_type) const;
   tree get_comparison_result_type (tree source_type);
 
   tree build_code_ref (const BrigBase &ref);
@@ -73,16 +69,6 @@ protected:
 
   bool needs_workitem_context_data (BrigOpcode16_t brig_opcode) const;
 
-  void unpack (tree value, tree_stl_vec &elements);
-  tree pack (tree_stl_vec &elements);
-
-  bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
-  tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
-
-  tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
-                              BrigType16_t brig_type, tree arith_type,
-                              tree_stl_vec &operands);
-
   tree add_temp_var (std::string name, tree expr);
 
   tree build_f2h_conversion (tree source);
@@ -100,10 +86,6 @@ protected:
 
   tree extend_int (tree input, tree dest_type, tree src_type);
 
-  /* HSAIL-specific builtin functions not yet integrated to gcc.  */
-
-  static builtin_map s_custom_builtins;
-
 private:
 
   tree_stl_vec build_or_analyze_operands (const BrigInstBase &brig_inst,
@@ -299,9 +281,6 @@ private:
 
   tree build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode, tree arith_type,
                              tree_stl_vec &operands);
-
-  tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
-                                         BrigType16_t brig_type) const;
 };
 
 class brig_cvt_inst_handler : public brig_inst_mod_handler
index b7e07226028f4456d8f47363673cec3cf14afc3a..82189e149f521be09a845b7378f2010608b41c89 100644 (file)
@@ -53,45 +53,45 @@ brig_directive_control_handler::operator () (const BrigBase *base)
     case BRIG_CONTROL_MAXDYNAMICGROUPSIZE:
       {
        m_parent.m_cf->m_descriptor.max_dynamic_group_size
-         = int_constant_value (operands.at (0));
+         = brig_function::int_constant_value (operands.at (0));
        break;
       }
     case BRIG_CONTROL_MAXFLATGRIDSIZE:
       {
        m_parent.m_cf->m_descriptor.max_flat_grid_size
-         = int_constant_value (operands.at (0));
+         = brig_function::int_constant_value (operands.at (0));
        break;
       }
     case BRIG_CONTROL_MAXFLATWORKGROUPSIZE:
       {
        m_parent.m_cf->m_descriptor.max_flat_workgroup_size
-         = int_constant_value (operands.at (0));
+         = brig_function::int_constant_value (operands.at (0));
        break;
       }
     case BRIG_CONTROL_REQUIREDDIM:
       {
        m_parent.m_cf->m_descriptor.required_dim
-         = int_constant_value (operands.at (0));
+         = brig_function::int_constant_value (operands.at (0));
        break;
       }
     case BRIG_CONTROL_REQUIREDGRIDSIZE:
       {
        m_parent.m_cf->m_descriptor.required_grid_size[0]
-         = int_constant_value (operands.at (0));
+         = brig_function::int_constant_value (operands.at (0));
        m_parent.m_cf->m_descriptor.required_grid_size[1]
-         = int_constant_value (operands.at (1));
+         = brig_function::int_constant_value (operands.at (1));
        m_parent.m_cf->m_descriptor.required_grid_size[2]
-         = int_constant_value (operands.at (2));
+         = brig_function::int_constant_value (operands.at (2));
        break;
       }
     case BRIG_CONTROL_REQUIREDWORKGROUPSIZE:
       {
        m_parent.m_cf->m_descriptor.required_workgroup_size[0]
-         = int_constant_value (operands.at (0));
+         = brig_function::int_constant_value (operands.at (0));
        m_parent.m_cf->m_descriptor.required_workgroup_size[1]
-         = int_constant_value (operands.at (1));
+         = brig_function::int_constant_value (operands.at (1));
        m_parent.m_cf->m_descriptor.required_workgroup_size[2]
-         = int_constant_value (operands.at (2));
+         = brig_function::int_constant_value (operands.at (2));
        break;
       }
     case BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS:
index e5ac799cdf27da09896bbf9090c1cbd1def45d1e..3b8c9ea01df54d29a7f1e28ff1e75f70f75eace1 100644 (file)
@@ -83,6 +83,12 @@ brig_cvt_inst_handler::generate (const BrigBase *base)
   tree &input = operands.at (1);
   tree &output = operands.at (0);
 
+  if (m_parent.m_cf->is_id_val (input))
+    {
+      input = m_parent.m_cf->id_val (input);
+      src_type = TREE_TYPE (input);
+    }
+
   size_t conv_src_size = int_size_in_bytes (src_type);
   size_t conv_dst_size = int_size_in_bytes (dest_type);
   size_t src_reg_size = int_size_in_bytes (TREE_TYPE (input));
index d64135db7f25e4b1878680942f6d6c64d26c6e00..f22f065c45ceee7a48542d70555f433663163337 100644 (file)
@@ -93,6 +93,25 @@ brig_directive_function_handler::operator () (const BrigBase *base)
      represent HSAIL registers.  */
   tree bind_expr = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, NULL);
 
+  tree restrict_char_ptr
+    = build_qualified_type (build_pointer_type (char_type_node),
+                           TYPE_QUAL_RESTRICT);
+  tree restrict_void_ptr
+    = build_qualified_type (build_pointer_type (void_type_node),
+                           TYPE_QUAL_RESTRICT);
+
+  tree restrict_const_char_ptr
+    = build_qualified_type (build_pointer_type
+                           (build_qualified_type (char_type_node,
+                                                  TYPE_QUAL_CONST)),
+                           TYPE_QUAL_RESTRICT);
+
+  tree restrict_const_void_ptr
+    = build_qualified_type (build_pointer_type
+                           (build_qualified_type (void_type_node,
+                                                  TYPE_QUAL_CONST)),
+                           TYPE_QUAL_RESTRICT);
+
   if (is_kernel)
     {
       tree name_identifier
@@ -107,12 +126,11 @@ brig_directive_function_handler::operator () (const BrigBase *base)
         3) a void* parameter that contains the first flat address of the group
         region allocated to the current work-group.  */
 
-      tree char_ptr_type_node = build_pointer_type (char_type_node);
       fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
                           build_function_type_list (void_type_node,
-                                                    char_ptr_type_node,
-                                                    ptr_type_node,
-                                                    ptr_type_node, NULL_TREE));
+                                                    restrict_const_char_ptr,
+                                                    restrict_void_ptr,
+                                                    restrict_char_ptr, NULL_TREE));
 
       SET_DECL_ASSEMBLER_NAME (fndecl, name_identifier);
 
@@ -125,9 +143,10 @@ brig_directive_function_handler::operator () (const BrigBase *base)
        = gccbrig_get_target_addr_space_id (BRIG_SEGMENT_KERNARG);
 
       tree arg_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-                                get_identifier ("__args"), char_ptr_type_node);
+                                get_identifier ("__args"),
+                                restrict_const_char_ptr);
       DECL_ARGUMENTS (fndecl) = arg_arg;
-      DECL_ARG_TYPE (arg_arg) = char_ptr_type_node;
+      DECL_ARG_TYPE (arg_arg) = restrict_const_char_ptr;
       DECL_CONTEXT (arg_arg) = fndecl;
       DECL_ARTIFICIAL (arg_arg) = 1;
       TREE_READONLY (arg_arg) = 1;
@@ -189,7 +208,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
              if (arg_decls == NULL_TREE)
                arg_decls = arg_var;
              else
-               chainon (arg_decls, arg_var);
+               arg_decls = chainon (arg_decls, arg_var);
 
              m_parent.m_cf->add_arg_variable (brigVar, arg_var);
 
@@ -230,18 +249,13 @@ brig_directive_function_handler::operator () (const BrigBase *base)
              vec_safe_push (args, TREE_TYPE (arg_var));
 
              m_parent.m_cf->add_arg_variable (brigVar, arg_var);
-
-             if (arg_decls == NULL_TREE)
-               arg_decls = arg_var;
-             else
-               chainon (arg_decls, arg_var);
+             arg_decls = chainon (arg_decls, arg_var);
            }
        }
-
-      vec_safe_push (args, ptr_type_node);
-      vec_safe_push (args, ptr_type_node);
-      vec_safe_push (args, ptr_type_node);
-      vec_safe_push (args, ptr_type_node);
+      vec_safe_push (args, restrict_void_ptr);
+      vec_safe_push (args, restrict_char_ptr);
+      vec_safe_push (args, uint32_type_node);
+      vec_safe_push (args, restrict_char_ptr);
 
       fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
                           build_function_type_vec (ret_type, args));
@@ -254,26 +268,30 @@ brig_directive_function_handler::operator () (const BrigBase *base)
 
   /* All functions need the hidden __context argument passed on
      because they might call WI-specific functions which need
-     the context info.  */
+     the context info.  Only kernels can write it, if they need
+     to update the local ids in the work-item loop.  */
+
+  tree context_arg_type
+    = true ? restrict_void_ptr : restrict_const_void_ptr;
   tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-                                get_identifier ("__context"), ptr_type_node);
-  if (DECL_ARGUMENTS (fndecl) == NULL_TREE)
-    DECL_ARGUMENTS (fndecl) = context_arg;
-  else
-    chainon (DECL_ARGUMENTS (fndecl), context_arg);
+                                get_identifier ("__context"),
+                                context_arg_type);
+  DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), context_arg);
   DECL_CONTEXT (context_arg) = fndecl;
-  DECL_ARG_TYPE (context_arg) = ptr_type_node;
+  DECL_ARG_TYPE (context_arg) = context_arg_type;
   DECL_ARTIFICIAL (context_arg) = 1;
   TREE_READONLY (context_arg) = 1;
   TREE_USED (context_arg) = 1;
+  m_parent.m_cf->m_context_arg = context_arg;
 
   /* They can also access group memory, so we need to pass the
      group pointer along too.  */
   tree group_base_arg
     = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-                 get_identifier ("__group_base_addr"), ptr_type_node);
-  chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
-  DECL_ARG_TYPE (group_base_arg) = ptr_type_node;
+                 get_identifier ("__group_base_addr"),
+                 restrict_char_ptr);
+  DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
+  DECL_ARG_TYPE (group_base_arg) = restrict_char_ptr;
   DECL_CONTEXT (group_base_arg) = fndecl;
   DECL_ARTIFICIAL (group_base_arg) = 1;
   TREE_READONLY (group_base_arg) = 1;
@@ -288,7 +306,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
   tree group_local_offset_arg
     = build_decl (UNKNOWN_LOCATION, PARM_DECL,
                  get_identifier ("__group_local_offset"), uint32_type_node);
-  chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
+  DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
   DECL_ARG_TYPE (group_local_offset_arg) = uint32_type_node;
   DECL_CONTEXT (group_local_offset_arg) = fndecl;
   DECL_ARTIFICIAL (group_local_offset_arg) = 1;
@@ -299,24 +317,25 @@ brig_directive_function_handler::operator () (const BrigBase *base)
   /* Same for private.  */
   tree private_base_arg
     = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-                 get_identifier ("__private_base_addr"), ptr_type_node);
-  chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
-  DECL_ARG_TYPE (private_base_arg) = ptr_type_node;
+                 get_identifier ("__private_base_addr"), restrict_char_ptr);
+  DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
+  DECL_ARG_TYPE (private_base_arg) = restrict_char_ptr;
   DECL_CONTEXT (private_base_arg) = fndecl;
   DECL_ARTIFICIAL (private_base_arg) = 1;
   TREE_READONLY (private_base_arg) = 1;
   TREE_USED (private_base_arg) = 1;
+  m_parent.m_cf->m_private_base_arg = private_base_arg;
 
   DECL_SAVED_TREE (fndecl) = bind_expr;
 
-  set_externally_visible (fndecl);
-
   if (base->kind == BRIG_KIND_DIRECTIVE_FUNCTION)
     {
       TREE_STATIC (fndecl) = 0;
       TREE_PUBLIC (fndecl) = 1;
       DECL_EXTERNAL (fndecl) = 0;
       DECL_DECLARED_INLINE_P (fndecl) = 1;
+      set_inline (fndecl);
+      set_externally_visible (fndecl);
     }
   else if (base->kind == BRIG_KIND_DIRECTIVE_KERNEL)
     {
@@ -330,6 +349,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
       TREE_STATIC (fndecl) = 0;
       TREE_PUBLIC (fndecl) = 1;
       DECL_EXTERNAL (fndecl) = 1;
+      set_inline (fndecl);
     }
   else if (base->kind == BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION)
     {
@@ -371,11 +391,8 @@ brig_directive_function_handler::operator () (const BrigBase *base)
     }
 
   m_parent.start_function (fndecl);
-
   m_parent.m_cf->m_func_decl = fndecl;
   m_parent.m_cf->m_current_bind_expr = bind_expr;
-  m_parent.m_cf->m_context_arg = context_arg;
-  m_parent.m_cf->m_private_base_arg = private_base_arg;
 
   if (ret_value != NULL_TREE && TREE_TYPE (ret_value) != void_type_node)
     {
index e1a14da8b72efec7b2477ec62844e0e35ddbc49a..f0c499d47f6d11419a30af67f9485b50fe8811bd 100644 (file)
 #include "function.h"
 #include "brig-to-generic.h"
 #include "brig-builtins.h"
+#include "options.h"
+#include "fold-const.h"
+#include "target.h"
+#include "builtins.h"
+
+brig_function::builtin_map brig_function::s_custom_builtins;
 
 brig_function::brig_function (const BrigDirectiveExecutable *exec,
                              brig_to_generic *parent)
@@ -60,6 +66,20 @@ brig_function::brig_function (const BrigDirectiveExecutable *exec,
   memset (m_regs, 0,
          BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT * sizeof (BrigOperandRegister *));
   memset (&m_descriptor, 0, sizeof (phsa_descriptor));
+
+  if (s_custom_builtins.size () > 0) return;
+
+  /* Populate the builtin index.  */
+#undef DEF_HSAIL_ATOMIC_BUILTIN
+#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
+#undef DEF_HSAIL_INTR_BUILTIN
+#undef DEF_HSAIL_SAT_BUILTIN
+#undef DEF_HSAIL_BUILTIN
+#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \
+  s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)]         \
+    = builtin_decl_explicit (ENUM);
+
+#include "brig-builtins.def"
 }
 
 brig_function::~brig_function ()
@@ -158,8 +178,7 @@ brig_function::add_id_variables ()
   tree stmts = BIND_EXPR_BODY (bind_expr);
 
   /* Initialize the WG limits and local ids.  */
-
-  tree_stmt_iterator entry = tsi_start (stmts);
+  m_kernel_entry = tsi_start (stmts);
 
   for (int i = 0; i < 3; ++i)
     {
@@ -169,7 +188,7 @@ brig_function::add_id_variables ()
         to avoid unnecessary casts (the ID functions are 32b).  */
       m_local_id_vars[i]
        = add_local_variable (std::string ("__local_") + dim_char,
-                             uint32_type_node);
+                             long_long_integer_type_node);
 
       tree workitemid_call
        = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKITEMID), 2,
@@ -178,54 +197,88 @@ brig_function::add_id_variables ()
                        m_context_arg);
 
       tree id_init = build2 (MODIFY_EXPR, TREE_TYPE (m_local_id_vars[i]),
-                            m_local_id_vars[i], workitemid_call);
+                            m_local_id_vars[i],
+                            convert (TREE_TYPE (m_local_id_vars[i]),
+                                     workitemid_call));
 
-      tsi_link_after (&entry, id_init, TSI_NEW_STMT);
+      append_statement (id_init);
 
       m_cur_wg_size_vars[i]
        = add_local_variable (std::string ("__cur_wg_size_") + dim_char,
-                             uint32_type_node);
+                             long_long_integer_type_node);
 
-      tree cwgz_call
-       = call_builtin
-       (builtin_decl_explicit (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE),
-        2, uint32_type_node, uint32_type_node,
-        build_int_cst (uint32_type_node, i), ptr_type_node, m_context_arg);
+      tree cwgz_call;
+      if (flag_assume_phsa)
+       {
+         tree_stl_vec operands
+           = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+         cwgz_call
+           = expand_or_call_builtin (BRIG_OPCODE_CURRENTWORKGROUPSIZE,
+                                     BRIG_TYPE_U32, uint32_type_node,
+                                     operands);
+       }
+      else
+       cwgz_call = call_builtin
+         (builtin_decl_explicit (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE),
+          2, uint32_type_node, uint32_type_node,
+          build_int_cst (uint32_type_node, i), ptr_type_node, m_context_arg);
 
       tree limit_init = build2 (MODIFY_EXPR, TREE_TYPE (m_cur_wg_size_vars[i]),
-                               m_cur_wg_size_vars[i], cwgz_call);
+                               m_cur_wg_size_vars[i],
+                               convert (TREE_TYPE (m_cur_wg_size_vars[i]),
+                                        cwgz_call));
 
-      tsi_link_after (&entry, limit_init, TSI_NEW_STMT);
+      append_statement (limit_init);
 
       m_wg_id_vars[i]
        = add_local_variable (std::string ("__workgroupid_") + dim_char,
                              uint32_type_node);
 
-      tree wgid_call
-       = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPID),
-                       2, uint32_type_node, uint32_type_node,
-                       build_int_cst (uint32_type_node, i), ptr_type_node,
-                       m_context_arg);
+      tree wgid_call;
+      if (flag_assume_phsa)
+       {
+         tree_stl_vec operands
+           = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+         wgid_call
+           = expand_or_call_builtin (BRIG_OPCODE_WORKGROUPID, BRIG_TYPE_U32,
+                                     uint32_type_node, operands);
+       }
+      else
+       wgid_call
+         = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPID),
+                         2, uint32_type_node, uint32_type_node,
+                         build_int_cst (uint32_type_node, i), ptr_type_node,
+                         m_context_arg);
 
       tree wgid_init = build2 (MODIFY_EXPR, TREE_TYPE (m_wg_id_vars[i]),
                               m_wg_id_vars[i], wgid_call);
 
-      tsi_link_after (&entry, wgid_init, TSI_NEW_STMT);
+      append_statement (wgid_init);
 
       m_wg_size_vars[i]
        = add_local_variable (std::string ("__workgroupsize_") + dim_char,
                              uint32_type_node);
 
-      tree wgsize_call
-       = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPSIZE),
-                       2, uint32_type_node, uint32_type_node,
-                       build_int_cst (uint32_type_node, i), ptr_type_node,
-                       m_context_arg);
+      tree wgsize_call;
+      if (flag_assume_phsa)
+       {
+         tree_stl_vec operands
+           = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+         wgsize_call
+           = expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+                                     uint32_type_node, operands);
+       }
+      else
+       wgsize_call
+         = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPSIZE),
+                         2, uint32_type_node, uint32_type_node,
+                         build_int_cst (uint32_type_node, i), ptr_type_node,
+                         m_context_arg);
 
       tree wgsize_init = build2 (MODIFY_EXPR, TREE_TYPE (m_wg_size_vars[i]),
                                 m_wg_size_vars[i], wgsize_call);
 
-      tsi_link_after (&entry, wgsize_init, TSI_NEW_STMT);
+      append_statement (wgsize_init);
 
       m_grid_size_vars[i]
        = add_local_variable (std::string ("__gridsize_") + dim_char,
@@ -240,10 +293,34 @@ brig_function::add_id_variables ()
       tree gridsize_init = build2 (MODIFY_EXPR, TREE_TYPE (m_grid_size_vars[i]),
                                   m_grid_size_vars[i], gridsize_call);
 
-      tsi_link_after (&entry, gridsize_init, TSI_NEW_STMT);
+      append_statement (gridsize_init);
+
+      m_abs_id_base_vars[i]
+       = add_local_variable (std::string ("__abs_id_base_") + dim_char,
+                             long_long_integer_type_node);
+
+      m_abs_id_vars[i]
+       = add_local_variable (std::string ("__abs_id_") + dim_char,
+                             long_long_integer_type_node);
+
+      tree abs_id_base
+       = build2 (MULT_EXPR, long_long_integer_type_node,
+                 convert (long_long_integer_type_node, m_wg_id_vars[i]),
+                 convert (long_long_integer_type_node, m_wg_size_vars[i]));
+      tree abs_id
+       = build2 (PLUS_EXPR, long_long_integer_type_node, abs_id_base,
+                 convert (long_long_integer_type_node, m_local_id_vars[i]));
+
+      tree abs_id_base_init
+       = build2 (MODIFY_EXPR, TREE_TYPE (m_abs_id_base_vars[i]),
+                 m_abs_id_base_vars[i], abs_id_base);
+      append_statement (abs_id_base_init);
+
+      tree abs_id_init = build2 (MODIFY_EXPR,
+                                TREE_TYPE (m_abs_id_vars[i]),
+                                m_abs_id_vars[i], abs_id);
+      append_statement (abs_id_init);
     }
-
-  m_kernel_entry = entry;
 }
 
 /* Creates a new local variable with the given NAME and given GENERIC
@@ -359,6 +436,8 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry,
                            tree_stmt_iterator *branch_after)
 {
   tree ivar = m_local_id_vars[dim];
+  tree abs_id_base_var = m_abs_id_base_vars[dim];
+  tree abs_id_var = m_abs_id_vars[dim];
   tree ivar_max = m_cur_wg_size_vars[dim];
   tree_stmt_iterator entry = *header_entry;
 
@@ -371,6 +450,12 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry,
                           build_zero_cst (TREE_TYPE (ivar)));
   tsi_link_after (&entry, ivar_init, TSI_NEW_STMT);
 
+  tree abs_id_var_init = build2 (MODIFY_EXPR, TREE_TYPE (abs_id_var),
+                                abs_id_var,
+                                convert (TREE_TYPE (abs_id_var),
+                                         abs_id_base_var));
+  tsi_link_after (&entry, abs_id_var_init, TSI_NEW_STMT);
+
   tree loop_body_label
     = label (std::string ("__wi_loop_") + (char) ((int) 'x' + dim));
   tree loop_body_label_stmt = build_stmt (LABEL_EXPR, loop_body_label);
@@ -379,16 +464,30 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry,
 
   if (m_has_unexpanded_dp_builtins)
     {
-      tree id_set_builtin
-       = builtin_decl_explicit (BUILT_IN_HSAIL_SETWORKITEMID);
-      /* Set the local ID to the current wi-loop iteration variable value to
-        ensure the builtins see the correct values.  */
-      tree id_set_call
-       = call_builtin (id_set_builtin, 3,
-                       void_type_node, uint32_type_node,
-                       build_int_cst (uint32_type_node, dim), uint32_type_node,
-                       ivar, ptr_type_node, m_context_arg);
-      tsi_link_after (&entry, id_set_call, TSI_NEW_STMT);
+      if (!flag_assume_phsa)
+       {
+         tree id_set_builtin
+           = builtin_decl_explicit (BUILT_IN_HSAIL_SETWORKITEMID);
+         /* Set the local ID to the current wi-loop iteration variable value
+            to ensure the builtins see the correct values.  */
+         tree id_set_call
+           = call_builtin (id_set_builtin, 3,
+                           void_type_node, uint32_type_node,
+                           build_int_cst (uint32_type_node, dim),
+                           uint32_type_node, convert (uint32_type_node, ivar),
+                           ptr_type_node, m_context_arg);
+         tsi_link_after (&entry, id_set_call, TSI_NEW_STMT);
+       }
+      else
+       {
+         tree ptr_type = build_pointer_type (uint32_type_node);
+         tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+                            build_int_cst (ptr_type, dim * 4));
+         tree assign = build2 (MODIFY_EXPR, uint32_type_node, ctx,
+                               convert (uint32_type_node, ivar));
+
+         tsi_link_after (&entry, assign, TSI_NEW_STMT);
+       }
     }
 
   /* Increment the WI iteration variable.  */
@@ -397,6 +496,13 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry,
 
   tsi_link_after (branch_after, incr, TSI_NEW_STMT);
 
+  /* ...and the abs id variable.  */
+  tree abs_id_incr = build2 (PREINCREMENT_EXPR, TREE_TYPE (abs_id_var),
+                            abs_id_var,
+                            build_one_cst (TREE_TYPE (abs_id_var)));
+
+  tsi_link_after (branch_after, abs_id_incr, TSI_NEW_STMT);
+
   /* Append the predicate check with the back edge goto.  */
   tree condition = build2 (LT_EXPR, TREE_TYPE (ivar), ivar, ivar_max);
   tree target_goto = build1 (GOTO_EXPR, void_type_node, loop_body_label);
@@ -549,29 +655,36 @@ brig_function::emit_launcher_and_metadata ()
   tree name_identifier
     = get_identifier_with_length (kern_name.c_str (), kern_name.size ());
 
+  tree restrict_void_ptr
+    = build_qualified_type (build_pointer_type (void_type_node),
+                           TYPE_QUAL_RESTRICT);
+  tree restrict_char_ptr
+    = build_qualified_type (build_pointer_type (char_type_node),
+                           TYPE_QUAL_RESTRICT);
   tree launcher
     = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
-                 build_function_type_list (void_type_node, ptr_type_node,
-                                           ptr_type_node, NULL_TREE));
+                 build_function_type_list (void_type_node, restrict_void_ptr,
+                                           restrict_char_ptr, NULL_TREE));
 
   TREE_USED (launcher) = 1;
   DECL_ARTIFICIAL (launcher) = 1;
 
   tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-                                get_identifier ("__context"), ptr_type_node);
+                                get_identifier ("__context"),
+                                restrict_void_ptr);
 
   DECL_ARGUMENTS (launcher) = context_arg;
-  DECL_ARG_TYPE (context_arg) = ptr_type_node;
+  DECL_ARG_TYPE (context_arg) = restrict_void_ptr;
   DECL_CONTEXT (context_arg) = launcher;
   TREE_USED (context_arg) = 1;
   DECL_ARTIFICIAL (context_arg) = 1;
 
   tree group_base_addr_arg
     = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-                 get_identifier ("__group_base_addr"), ptr_type_node);
+                 get_identifier ("__group_base_addr"), restrict_char_ptr);
 
   chainon (DECL_ARGUMENTS (launcher), group_base_addr_arg);
-  DECL_ARG_TYPE (group_base_addr_arg) = ptr_type_node;
+  DECL_ARG_TYPE (group_base_addr_arg) = restrict_char_ptr;
   DECL_CONTEXT (group_base_addr_arg) = launcher;
   TREE_USED (group_base_addr_arg) = 1;
   DECL_ARTIFICIAL (group_base_addr_arg) = 1;
@@ -618,15 +731,15 @@ brig_function::emit_launcher_and_metadata ()
     phsail_launch_kernel_call
       = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_LAUNCH_WG_FUNC),
                      4, void_type_node,
-                     ptr_type_node, kernel_func_ptr, ptr_type_node,
-                     context_arg, ptr_type_node, group_base_addr_arg,
+                     ptr_type_node, kernel_func_ptr, restrict_void_ptr,
+                     context_arg, restrict_char_ptr, group_base_addr_arg,
                      uint32_type_node, group_local_offset_arg);
   else
     phsail_launch_kernel_call
       = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_LAUNCH_KERNEL),
                      4, void_type_node,
-                     ptr_type_node, kernel_func_ptr, ptr_type_node,
-                     context_arg, ptr_type_node, group_base_addr_arg,
+                     ptr_type_node, kernel_func_ptr, restrict_void_ptr,
+                     context_arg, restrict_char_ptr, group_base_addr_arg,
                      uint32_type_node, group_local_offset_arg);
 
   append_to_statement_list_force (phsail_launch_kernel_call, &stmt_list);
@@ -771,3 +884,719 @@ brig_function::group_variable_segment_offset (const std::string &name) const
   gcc_assert (m_parent->m_module_group_variables.has_variable (name));
   return m_parent->m_module_group_variables.segment_offset (name);
 }
+
+/* Try to expand the given builtin call to reuse a previously generated
+   variable, if possible.  If not, just call the given builtin.
+   BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type,
+   ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's
+   input operands.  */
+
+tree
+brig_function::expand_or_call_builtin (BrigOpcode16_t brig_opcode,
+                                      BrigType16_t brig_type,
+                                      tree arith_type,
+                                      tree_stl_vec &operands)
+{
+  if (needs_workitem_context_data (brig_opcode))
+    m_has_unexpanded_dp_builtins = true;
+
+  if (can_expand_builtin (brig_opcode))
+    return expand_builtin (brig_opcode, operands);
+
+  tree built_in
+    = get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type);
+
+  if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in)))
+      && arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type)
+      && brig_opcode != BRIG_OPCODE_LERP
+      && brig_opcode != BRIG_OPCODE_PACKCVT
+      && brig_opcode != BRIG_OPCODE_SAD
+      && brig_opcode != BRIG_OPCODE_SADHI)
+    {
+      /* Call the scalar built-in for all elements in the vector.  */
+      tree_stl_vec operand0_elements;
+      if (operands.size () > 0)
+       unpack (operands[0], operand0_elements);
+
+      tree_stl_vec operand1_elements;
+      if (operands.size () > 1)
+       unpack (operands[1], operand1_elements);
+
+      tree_stl_vec result_elements;
+
+      size_t element_count = gccbrig_type_vector_subparts (arith_type);
+      for (size_t i = 0; i < element_count; ++i)
+       {
+         tree_stl_vec call_operands;
+         if (operand0_elements.size () > 0)
+           call_operands.push_back (operand0_elements.at (i));
+
+         if (operand1_elements.size () > 0)
+           call_operands.push_back (operand1_elements.at (i));
+
+         result_elements.push_back
+           (expand_or_call_builtin (brig_opcode, brig_type,
+                                    TREE_TYPE (arith_type),
+                                    call_operands));
+       }
+      return pack (result_elements);
+    }
+
+  tree_stl_vec call_operands;
+  tree_stl_vec operand_types;
+
+  tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in));
+
+  for (size_t i = 0; i < operands.size (); ++i)
+    {
+      tree operand_type = TREE_VALUE (arg_type_chain);
+      call_operands.push_back (convert (operand_type, operands[i]));
+      operand_types.push_back (operand_type);
+      arg_type_chain = TREE_CHAIN (arg_type_chain);
+    }
+
+  if (needs_workitem_context_data (brig_opcode))
+    {
+      call_operands.push_back (m_context_arg);
+      operand_types.push_back (ptr_type_node);
+    }
+
+  size_t operand_count = call_operands.size ();
+
+  call_operands.resize (4, NULL_TREE);
+  operand_types.resize (4, NULL_TREE);
+  for (size_t i = 0; i < operand_count; ++i)
+    call_operands.at (i) = build_resize_convert_view (operand_types.at (i),
+                                                     call_operands.at (i));
+
+  tree fnptr = build_fold_addr_expr (built_in);
+  return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr,
+                          operand_count, &call_operands[0]);
+}
+
+/* Instead of calling a built-in function, use a more efficient mechanism
+   such as reuse a previously returned value known to be still valid, or
+   access the work-item context struct directly.  This is beneficial especially
+   for the work-item identification related builtins as not having them as
+   unanalyzable black box calls can lead to more easily vectorizable parallel
+   loops for multi work-item work-groups.  BRIG_OPCODE identifies the builtin
+   and OPERANDS store the operands.  */
+
+tree
+brig_function::expand_builtin (BrigOpcode16_t brig_opcode,
+                              tree_stl_vec &operands)
+{
+  tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0));
+
+  tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1));
+
+  tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2));
+
+  if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID)
+    {
+      tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0);
+      id0 = convert (uint64_type_node, id0);
+
+      tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1);
+      id1 = convert (uint64_type_node, id1);
+
+      tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2);
+      id2 = convert (uint64_type_node, id2);
+
+      tree max0 = convert (uint64_type_node, m_grid_size_vars[0]);
+      tree max1 = convert (uint64_type_node, m_grid_size_vars[1]);
+
+      tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0);
+      id2_x_max0_x_max1
+       = build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1);
+
+      tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0);
+
+      tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0);
+      sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1);
+
+      return add_temp_var ("workitemflatabsid", sum);
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      return m_abs_id_vars[dim];
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID)
+    {
+
+      tree wg_size_x = expand_builtin (BRIG_OPCODE_WORKGROUPSIZE, uint32_0);
+      tree wg_size_y = expand_builtin (BRIG_OPCODE_WORKGROUPSIZE, uint32_1);
+      tree z_x_wgsx_wgsy
+       = build2 (MULT_EXPR, uint32_type_node,
+                 convert (uint32_type_node,
+                          expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_2)),
+                 wg_size_x);
+      z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy,
+                             wg_size_y);
+
+      tree y_x_wgsx
+       = build2 (MULT_EXPR, uint32_type_node,
+                 convert (uint32_type_node,
+                          expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_1)),
+                 wg_size_x);
+
+      tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy);
+      sum = build2 (PLUS_EXPR, uint32_type_node,
+                   convert (uint32_type_node,
+                            expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_0)),
+                   sum);
+      return add_temp_var ("workitemflatid", sum);
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      if (flag_assume_phsa)
+       {
+         tree ptr_type = build_pointer_type (uint32_type_node);
+         tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+                            build_int_cst (ptr_type,
+                                           PHSA_CONTEXT_WG_SIZES
+                                           + dim * 4));
+         std::string name ("wgsize_x");
+         name [name.length() - 1] += dim;
+         return add_temp_var (name.c_str(), ctx);
+       }
+      else if (m_is_kernel)
+       {
+         /* For kernels without phsa we generate certain temps before
+            the WI loop, which means we don't need to rely on LICM to get
+            them moved out.  */
+         return m_wg_size_vars[dim];
+       }
+      else
+       gcc_unreachable ();
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKITEMID)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      if (m_is_kernel)
+       {
+         return m_local_id_vars [dim];
+       }
+      else if (flag_assume_phsa)
+       {
+         tree ptr_type = build_pointer_type (uint32_type_node);
+         tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+                            build_int_cst (ptr_type,
+                                           PHSA_CONTEXT_OFFS_WI_IDS
+                                           + dim * 4));
+         std::string name ("wiid_x");
+         name [name.length() - 1] += dim;
+         return add_temp_var (name.c_str(), ctx);
+       }
+      else
+       gcc_unreachable ();
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKGROUPID)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      if (flag_assume_phsa)
+       {
+         tree ptr_type = build_pointer_type (uint32_type_node);
+         tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+                            build_int_cst (ptr_type,
+                                           PHSA_CONTEXT_OFFS_WG_IDS
+                                           + dim * 4));
+         std::string name ("wgid_x");
+         name [name.length() - 1] += dim;
+         return add_temp_var (name.c_str(), ctx);
+       } else if (m_is_kernel)
+       return m_wg_id_vars [dim];
+      else
+       gcc_unreachable ();
+    }
+  else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      if (flag_assume_phsa)
+       {
+         tree ptr_type = build_pointer_type (uint32_type_node);
+         tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+                            build_int_cst (ptr_type,
+                                           PHSA_CONTEXT_CURRENT_WG_SIZES
+                                           + dim * 4));
+         std::string name ("curwgsize_x");
+         name [name.length() - 1] += dim;
+         return add_temp_var (name.c_str(), ctx);
+       } else if (m_is_kernel)
+       return m_cur_wg_size_vars[dim];
+      else
+       gcc_unreachable ();
+    }
+  else
+    gcc_unreachable ();
+
+  return NULL_TREE;
+}
+
+/* Returns true in case the given opcode that would normally be generated
+   as a builtin call can be expanded to tree nodes.  */
+
+bool
+brig_function::can_expand_builtin (BrigOpcode16_t brig_opcode) const
+{
+  switch (brig_opcode)
+    {
+    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
+    case BRIG_OPCODE_WORKITEMFLATID:
+    case BRIG_OPCODE_WORKITEMID:
+    case BRIG_OPCODE_WORKGROUPID:
+    case BRIG_OPCODE_WORKGROUPSIZE:
+      return m_is_kernel || flag_assume_phsa;
+    case BRIG_OPCODE_WORKITEMFLATABSID:
+    case BRIG_OPCODE_WORKITEMABSID:
+      return m_is_kernel;
+    default:
+      return false;
+    };
+}
+
+/* In case the HSA instruction must be implemented using a builtin,
+   this function is called to get the correct builtin function.
+   TYPE is the instruction tree type, BRIG_OPCODE the opcode of the
+   brig instruction and BRIG_TYPE the brig instruction's type.  */
+
+tree
+brig_function::get_builtin_for_hsa_opcode
+  (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
+{
+  tree builtin = NULL_TREE;
+  tree builtin_type = type;
+
+  /* For vector types, first find the scalar version of the builtin.  */
+  if (type != NULL_TREE && VECTOR_TYPE_P (type))
+    builtin_type = TREE_TYPE (type);
+  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+
+  /* Some BRIG opcodes can use the same builtins for unsigned and
+     signed types.  Force these cases to unsigned types.  */
+
+  if (brig_opcode == BRIG_OPCODE_BORROW
+      || brig_opcode == BRIG_OPCODE_CARRY
+      || brig_opcode == BRIG_OPCODE_LASTBIT
+      || brig_opcode == BRIG_OPCODE_BITINSERT)
+    {
+      if (brig_type == BRIG_TYPE_S32)
+       brig_type = BRIG_TYPE_U32;
+      else if (brig_type == BRIG_TYPE_S64)
+       brig_type = BRIG_TYPE_U64;
+    }
+
+  switch (brig_opcode)
+    {
+    case BRIG_OPCODE_FLOOR:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_FLOOR);
+      break;
+    case BRIG_OPCODE_CEIL:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_CEIL);
+      break;
+    case BRIG_OPCODE_SQRT:
+    case BRIG_OPCODE_NSQRT:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_SQRT);
+      break;
+    case BRIG_OPCODE_RINT:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_RINT);
+      break;
+    case BRIG_OPCODE_TRUNC:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_TRUNC);
+      break;
+    case BRIG_OPCODE_COPYSIGN:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_COPYSIGN);
+      break;
+    case BRIG_OPCODE_NSIN:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_SIN);
+      break;
+    case BRIG_OPCODE_NLOG2:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_LOG2);
+      break;
+    case BRIG_OPCODE_NEXP2:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_EXP2);
+      break;
+    case BRIG_OPCODE_NFMA:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_FMA);
+      break;
+    case BRIG_OPCODE_NCOS:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_COS);
+      break;
+    case BRIG_OPCODE_POPCOUNT:
+      /* Popcount should be typed by its argument type (the return value
+        is always u32).  Let's use a b64 version for also for b32 for now.  */
+      return builtin_decl_explicit (BUILT_IN_POPCOUNTL);
+    case BRIG_OPCODE_BORROW:
+      /* Borrow uses the same builtin for unsigned and signed types.  */
+      if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
+       return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U32);
+      else
+       return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U64);
+    case BRIG_OPCODE_CARRY:
+      /* Carry also uses the same builtin for unsigned and signed types.  */
+      if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
+       return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U32);
+      else
+       return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U64);
+    default:
+
+      /* Use our builtin index for finding a proper builtin for the BRIG
+        opcode and BRIG type.  This takes care most of the builtin cases,
+        the special cases are handled in the separate 'case' statements
+        above.  */
+      builtin_map::const_iterator i
+       = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
+      if (i != s_custom_builtins.end ())
+       return (*i).second;
+
+      if (brig_inner_type != brig_type)
+       {
+         /* Try to find a scalar built-in we could use.  */
+         i = s_custom_builtins.find
+           (std::make_pair (brig_opcode, brig_inner_type));
+         if (i != s_custom_builtins.end ())
+           return (*i).second;
+       }
+
+      /* In case this is an fp16 operation that is promoted to fp32,
+        try to find a fp32 scalar built-in.  */
+      if (brig_inner_type == BRIG_TYPE_F16)
+       {
+         i = s_custom_builtins.find
+           (std::make_pair (brig_opcode, BRIG_TYPE_F32));
+         if (i != s_custom_builtins.end ())
+           return (*i).second;
+       }
+      gcc_unreachable ();
+    }
+
+  if (VECTOR_TYPE_P (type) && builtin != NULL_TREE)
+    {
+      /* Try to find a vectorized version of the built-in.
+        TODO: properly assert that builtin is a mathfn builtin? */
+      tree vec_builtin
+       = targetm.vectorize.builtin_vectorized_function
+       (builtin_mathfn_code (builtin), type, type);
+      if (vec_builtin != NULL_TREE)
+       return vec_builtin;
+      else
+       return builtin;
+    }
+  if (builtin == NULL_TREE)
+    gcc_unreachable ();
+  return builtin;
+}
+
+/* Unpacks the elements of the vector in VALUE to scalars (bit field
+   references) in ELEMENTS.  */
+
+void
+brig_function::unpack (tree value, tree_stl_vec &elements)
+{
+  size_t vec_size = int_size_in_bytes (TREE_TYPE (value));
+  size_t element_size
+    = int_size_in_bytes (TREE_TYPE (TREE_TYPE (value))) * BITS_PER_UNIT;
+  size_t element_count
+    = vec_size * BITS_PER_UNIT / element_size;
+
+  tree input_element_type = TREE_TYPE (TREE_TYPE (value));
+
+  value = add_temp_var ("unpack_input", value);
+
+  for (size_t i = 0; i < element_count; ++i)
+    {
+      tree element
+       = build3 (BIT_FIELD_REF, input_element_type, value,
+                 TYPE_SIZE (input_element_type),
+                 bitsize_int(i * element_size));
+
+      element = add_temp_var ("scalar", element);
+      elements.push_back (element);
+    }
+}
+
+/* Pack the elements of the scalars in ELEMENTS to the returned vector.  */
+
+tree
+brig_function::pack (tree_stl_vec &elements)
+{
+  size_t element_count = elements.size ();
+
+  gcc_assert (element_count > 1);
+
+  tree output_element_type = TREE_TYPE (elements.at (0));
+
+  vec<constructor_elt, va_gc> *constructor_vals = NULL;
+  for (size_t i = 0; i < element_count; ++i)
+    CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, elements.at (i));
+
+  tree vec_type = build_vector_type (output_element_type, element_count);
+
+  /* build_constructor creates a vector type which is not a vector_cst
+     that requires compile time constant elements.  */
+  tree vec = build_constructor (vec_type, constructor_vals);
+
+  /* Add a temp variable for readability.  */
+  tree tmp_var = create_tmp_var (vec_type, "vec_out");
+  tree vec_tmp_assign = build2 (MODIFY_EXPR, TREE_TYPE (tmp_var), tmp_var, vec);
+  append_statement (vec_tmp_assign);
+  return tmp_var;
+}
+
+/* Returns true in case the given opcode needs to know about work-item context
+   data.  In such case the context data is passed as a pointer to a work-item
+   context object, as the last argument in the builtin call.  */
+
+bool
+brig_function::needs_workitem_context_data
+(BrigOpcode16_t brig_opcode)
+{
+  switch (brig_opcode)
+    {
+    case BRIG_OPCODE_WORKITEMABSID:
+    case BRIG_OPCODE_WORKITEMFLATABSID:
+    case BRIG_OPCODE_WORKITEMFLATID:
+    case BRIG_OPCODE_CURRENTWORKITEMFLATID:
+    case BRIG_OPCODE_WORKITEMID:
+    case BRIG_OPCODE_WORKGROUPID:
+    case BRIG_OPCODE_WORKGROUPSIZE:
+    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
+    case BRIG_OPCODE_GRIDGROUPS:
+    case BRIG_OPCODE_GRIDSIZE:
+    case BRIG_OPCODE_DIM:
+    case BRIG_OPCODE_PACKETID:
+    case BRIG_OPCODE_PACKETCOMPLETIONSIG:
+    case BRIG_OPCODE_BARRIER:
+    case BRIG_OPCODE_WAVEBARRIER:
+    case BRIG_OPCODE_ARRIVEFBAR:
+    case BRIG_OPCODE_INITFBAR:
+    case BRIG_OPCODE_JOINFBAR:
+    case BRIG_OPCODE_LEAVEFBAR:
+    case BRIG_OPCODE_RELEASEFBAR:
+    case BRIG_OPCODE_WAITFBAR:
+    case BRIG_OPCODE_CUID:
+    case BRIG_OPCODE_MAXCUID:
+    case BRIG_OPCODE_DEBUGTRAP:
+    case BRIG_OPCODE_GROUPBASEPTR:
+    case BRIG_OPCODE_KERNARGBASEPTR:
+    case BRIG_OPCODE_ALLOCA:
+      return true;
+    default:
+      return false;
+    };
+}
+
+/* Appends and returns a new temp variable and an accompanying assignment
+   statement that stores the value of the given EXPR and has the given NAME.  */
+
+tree
+brig_function::add_temp_var (std::string name, tree expr)
+{
+  tree temp_var = create_tmp_var (TREE_TYPE (expr), name.c_str ());
+  tree assign = build2 (MODIFY_EXPR, TREE_TYPE (temp_var), temp_var, expr);
+  append_statement (assign);
+  return temp_var;
+}
+
+/* Returns the integer constant value of the given node.
+   If it's a cast, looks into the source of the cast.  */
+
+HOST_WIDE_INT
+brig_function::int_constant_value (tree node)
+{
+  tree n = node;
+  if (TREE_CODE (n) == VIEW_CONVERT_EXPR)
+    n = TREE_OPERAND (n, 0);
+  return int_cst_value (n);
+}
+
+/* Returns the tree code that should be used to implement the given
+   HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
+   (BRIG_TYPE).  In case the opcode cannot be mapped to a TREE node directly,
+   returns TREE_LIST (if it can be emulated with a simple chain of tree
+   nodes) or CALL_EXPR if the opcode should be implemented using a builtin
+   call.  */
+
+tree_code
+brig_function::get_tree_code_for_hsa_opcode
+  (BrigOpcode16_t brig_opcode, BrigType16_t brig_type)
+{
+  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+  switch (brig_opcode)
+    {
+    case BRIG_OPCODE_NOP:
+      return NOP_EXPR;
+    case BRIG_OPCODE_ADD:
+      return PLUS_EXPR;
+    case BRIG_OPCODE_CMOV:
+      if (brig_inner_type == brig_type)
+       return COND_EXPR;
+      else
+       return VEC_COND_EXPR;
+    case BRIG_OPCODE_SUB:
+      return MINUS_EXPR;
+    case BRIG_OPCODE_MUL:
+    case BRIG_OPCODE_MUL24:
+      return MULT_EXPR;
+    case BRIG_OPCODE_MULHI:
+    case BRIG_OPCODE_MUL24HI:
+      return MULT_HIGHPART_EXPR;
+    case BRIG_OPCODE_DIV:
+      if (gccbrig_is_float_type (brig_inner_type))
+       return RDIV_EXPR;
+      else
+       return TRUNC_DIV_EXPR;
+    case BRIG_OPCODE_NEG:
+      return NEGATE_EXPR;
+    case BRIG_OPCODE_MIN:
+      if (gccbrig_is_float_type (brig_inner_type))
+       return CALL_EXPR;
+      else
+       return MIN_EXPR;
+    case BRIG_OPCODE_MAX:
+      if (gccbrig_is_float_type (brig_inner_type))
+       return CALL_EXPR;
+      else
+       return MAX_EXPR;
+    case BRIG_OPCODE_FMA:
+      return FMA_EXPR;
+    case BRIG_OPCODE_ABS:
+      return ABS_EXPR;
+    case BRIG_OPCODE_SHL:
+      return LSHIFT_EXPR;
+    case BRIG_OPCODE_SHR:
+      return RSHIFT_EXPR;
+    case BRIG_OPCODE_OR:
+      return BIT_IOR_EXPR;
+    case BRIG_OPCODE_XOR:
+      return BIT_XOR_EXPR;
+    case BRIG_OPCODE_AND:
+      return BIT_AND_EXPR;
+    case BRIG_OPCODE_NOT:
+      return BIT_NOT_EXPR;
+    case BRIG_OPCODE_RET:
+      return RETURN_EXPR;
+    case BRIG_OPCODE_MOV:
+    case BRIG_OPCODE_LDF:
+      return MODIFY_EXPR;
+    case BRIG_OPCODE_LD:
+    case BRIG_OPCODE_ST:
+      return MEM_REF;
+    case BRIG_OPCODE_BR:
+      return GOTO_EXPR;
+    case BRIG_OPCODE_REM:
+      if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
+       return TRUNC_MOD_EXPR;
+      else
+       return CALL_EXPR;
+    case BRIG_OPCODE_NRCP:
+    case BRIG_OPCODE_NRSQRT:
+      /* Implement as 1/f (x).  gcc should pattern detect that and
+        use a native instruction, if available, for it.  */
+      return TREE_LIST;
+    case BRIG_OPCODE_FLOOR:
+    case BRIG_OPCODE_CEIL:
+    case BRIG_OPCODE_SQRT:
+    case BRIG_OPCODE_NSQRT:
+    case BRIG_OPCODE_RINT:
+    case BRIG_OPCODE_TRUNC:
+    case BRIG_OPCODE_POPCOUNT:
+    case BRIG_OPCODE_COPYSIGN:
+    case BRIG_OPCODE_NCOS:
+    case BRIG_OPCODE_NSIN:
+    case BRIG_OPCODE_NLOG2:
+    case BRIG_OPCODE_NEXP2:
+    case BRIG_OPCODE_NFMA:
+      /* Class has type B1 regardless of the float type, thus
+        the below builtin map search cannot find it.  */
+    case BRIG_OPCODE_CLASS:
+    case BRIG_OPCODE_WORKITEMABSID:
+      return CALL_EXPR;
+    default:
+
+      /* Some BRIG opcodes can use the same builtins for unsigned and
+        signed types.  Force these cases to unsigned types.
+      */
+
+      if (brig_opcode == BRIG_OPCODE_BORROW
+         || brig_opcode == BRIG_OPCODE_CARRY
+         || brig_opcode == BRIG_OPCODE_LASTBIT
+         || brig_opcode == BRIG_OPCODE_BITINSERT)
+       {
+         if (brig_type == BRIG_TYPE_S32)
+           brig_type = BRIG_TYPE_U32;
+         else if (brig_type == BRIG_TYPE_S64)
+           brig_type = BRIG_TYPE_U64;
+       }
+
+
+      builtin_map::const_iterator i
+       = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
+      if (i != s_custom_builtins.end ())
+       return CALL_EXPR;
+      else if (s_custom_builtins.find
+              (std::make_pair (brig_opcode, brig_inner_type))
+              != s_custom_builtins.end ())
+       return CALL_EXPR;
+      if (brig_inner_type == BRIG_TYPE_F16
+         && s_custom_builtins.find
+         (std::make_pair (brig_opcode, BRIG_TYPE_F32))
+         != s_custom_builtins.end ())
+       return CALL_EXPR;
+      break;
+    }
+  return TREE_LIST; /* Emulate using a chain of nodes.  */
+}
+
+/* Inform of an update to the REG_VAR.  */
+
+void
+brig_function::add_reg_var_update (tree reg_var, tree var)
+{
+  if (var == m_abs_id_vars[0] || var == m_abs_id_vars[1]
+      || var == m_abs_id_vars[2] || var == m_local_id_vars[0]
+      || var == m_local_id_vars[1] || var == m_local_id_vars[2])
+    m_id_val_defs [reg_var] = var;
+  else
+    {
+      /* Possible overwrite of an ID value.  */
+
+      id_val_map::iterator i = m_id_val_defs.find (reg_var);
+      if (i != m_id_val_defs.end())
+       m_id_val_defs.erase (i);
+    }
+}
+
+/* If the REG_VAR is known to contain an ID value at this point in
+   the basic block, return true.  */
+
+bool
+brig_function::is_id_val (tree reg_var)
+{
+  id_val_map::iterator i = m_id_val_defs.find (reg_var);
+  return i != m_id_val_defs.end();
+}
+
+/* Return an ID value for the given REG_VAR if its known to contain
+   one at this point in the BB, NULL_TREE otherwise.  */
+
+tree
+brig_function::id_val (tree reg_var)
+{
+  id_val_map::iterator i = m_id_val_defs.find (reg_var);
+  if (i != m_id_val_defs.end())
+    return (*i).second;
+  else
+    return NULL_TREE;
+}
+
+/* Informs of starting a new basic block.  Called when generating
+   a label, a call, a jump, or a return.  */
+
+void
+brig_function::start_new_bb ()
+{
+  m_id_val_defs.clear ();
+}
index 6149719fc4527a0e33a1209a2eaf3bb3eb71a6d4..8fde3a5bfa3135a9db545d2db93e87a6df811065 100644 (file)
@@ -105,6 +105,30 @@ public:
 
   void analyze_calls ();
 
+  tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
+
+  tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
+                              BrigType16_t brig_type, tree arith_type,
+                              tree_stl_vec &operands);
+  bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
+
+  tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
+                                  BrigType16_t brig_type) const;
+
+  void unpack (tree value, tree_stl_vec &elements);
+  tree pack (tree_stl_vec &elements);
+  tree add_temp_var (std::string name, tree expr);
+
+  static bool needs_workitem_context_data (BrigOpcode16_t brig_opcode);
+  static HOST_WIDE_INT int_constant_value (tree node);
+  static tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
+                                                BrigType16_t brig_type);
+
+  void start_new_bb ();
+  void add_reg_var_update (tree reg_var, tree val);
+  bool is_id_val (tree reg_var);
+  tree id_val (tree reg_var);
+
   const BrigDirectiveExecutable *m_brig_def;
 
   bool m_is_kernel;
@@ -183,6 +207,11 @@ public:
   tree m_wg_id_vars[3];
   tree m_wg_size_vars[3];
   tree m_grid_size_vars[3];
+  /* Explicitly computed WG base for the absolute IDs which is used
+     as the initial value when looping that dimension.   We update
+     the abs id with ++ to make it easy for the vectorizer.  */
+  tree m_abs_id_base_vars[3];
+  tree m_abs_id_vars[3];
 
   /* Set to true in case the kernel contains at least one dispatch packet
      (work-item ID-related) builtin call that could not be expanded to
@@ -219,6 +248,20 @@ private:
   /* Bookkeeping for the different HSA registers and their tree declarations
      for the currently generated function.  */
   reg_decl_index_entry *m_regs[BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT];
+
+  /* Map for keeping book reads of ID variables, which can be propagated
+     to uses in address expressions to produce cleaner indexing functions
+     with unnecessary casts stripped off, etc.  */
+  typedef std::map<tree, tree> id_val_map;
+
+  /* Keeps track of ID values alive in registers in the currently
+     processed BB.  */
+  id_val_map m_id_val_defs;
+
+  /* HSAIL-specific builtin functions not yet integrated to gcc.  */
+  typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
+
+  static builtin_map s_custom_builtins;
 };
 
 #endif
index 7605b76f7db66a3a215dcee943a1f8c4d1f111f0..938df82b03aca176bbea8d70295ed3d512119299 100644 (file)
@@ -31,7 +31,10 @@ brig_directive_label_handler::operator () (const BrigBase *base)
   std::string label_str ((const char *) (label_name->bytes),
                         label_name->byteCount);
 
+  m_parent.m_cf->start_new_bb ();
+
   tree stmt = build_stmt (LABEL_EXPR, m_parent.m_cf->label (label_str));
   m_parent.m_cf->append_statement (stmt);
+
   return base->byteCount;
 }
index 1da0bc0fa84448fb7c3c65897a28b01e65595e56..385da33f0893f339e889c154aeebd102bed6d0f2 100644 (file)
@@ -59,7 +59,7 @@ brig_lane_inst_handler::operator () (const BrigBase *base)
       elements.push_back (zero_cst);
       elements.push_back (zero_cst);
 
-      expr = pack (elements);
+      expr = m_parent.m_cf->pack (elements);
     }
   else if (inst.base.opcode == BRIG_OPCODE_ACTIVELANEPERMUTE)
     {
index 350516f64044534c6d36cbb5b2be830d08385118..d8374f232fb3e3d725b13609b19c269560c604c9 100644 (file)
@@ -63,7 +63,7 @@ brig_mem_inst_handler::build_mem_access (const BrigInstBase *brig_inst,
     {
       /* Add a temporary variable so there won't be multiple
         reads in case of vector unpack.  */
-      mem_ref = add_temp_var ("mem_read", mem_ref);
+      mem_ref = m_parent.m_cf->add_temp_var ("mem_read", mem_ref);
       return build_output_assignment (*brig_inst, data, mem_ref);
     }
   else
@@ -95,8 +95,9 @@ brig_mem_inst_handler::operator () (const BrigBase *base)
       inputs.push_back (operands[1]);
       inputs.push_back (align_opr);
       tree builtin_call
-       = expand_or_call_builtin (BRIG_OPCODE_ALLOCA, BRIG_TYPE_U32,
-                                 uint32_type_node, inputs);
+       = m_parent.m_cf->expand_or_call_builtin (BRIG_OPCODE_ALLOCA,
+                                                BRIG_TYPE_U32,
+                                                uint32_type_node, inputs);
       build_output_assignment (*brig_inst, operands[0], builtin_call);
       m_parent.m_cf->m_has_allocas = true;
       return base->byteCount;
index d224752084220600546d5851911fd603443c5e4f..fe0b9a59406db245976f287b73c8c2f5aaa532d5 100644 (file)
@@ -58,13 +58,22 @@ typedef struct __attribute__((__packed__))
 
 /* The prefix to use in the ELF section containing descriptor for
    a function.  */
+
 #define PHSA_DESC_SECTION_PREFIX "phsa.desc."
 #define PHSA_HOST_DEF_PTR_PREFIX "__phsa.host_def."
 
 /* The frontend error messages are parsed by the host runtime.  Known
    prefix strings are used to separate the different runtime error
    codes.  */
+
 #define PHSA_ERROR_PREFIX_INCOMPATIBLE_MODULE "Incompatible module: "
 #define PHSA_ERROR_PREFIX_CORRUPTED_MODULE "Corrupted module: "
 
+/* Offsets of attributes in the PHSA context structs.
+   Used by -fphsa-wi-context-opt.  */
+#define PHSA_CONTEXT_OFFS_WI_IDS 0
+#define PHSA_CONTEXT_OFFS_WG_IDS (PHSA_CONTEXT_OFFS_WI_IDS + 3 * 4)
+#define PHSA_CONTEXT_WG_SIZES (PHSA_CONTEXT_OFFS_WG_IDS + 3 * 4)
+#define PHSA_CONTEXT_CURRENT_WG_SIZES (PHSA_CONTEXT_WG_SIZES + 3 * 4)
+
 #endif
index 1c83f5f8d543dd83ff43dc3bf398f92dffeab6fd..2cc6cb9c987ff56713b1c0813721d89fec467d86 100644 (file)
@@ -31,6 +31,11 @@ BRIG Separate Alias(d)
 -dump=
 BRIG Joined Alias(d)
 
+fassume-phsa
+BRIG Report Var(flag_assume_phsa) Init(1) Optimization
+Assume we are finalizing for phsa and its libhsail-rt.  Enables additional
+phsa-specific optimizations (default).
+
 L
 BRIG Joined Separate
 ; Not documented
index 8f3d796bcfa1aafc8beb4413019d76707617b4e2..5365befd3518e2936f8372b7ee5700c919138a67 100644 (file)
@@ -283,7 +283,9 @@ DEF_FUNCTION_TYPE_1 (BT_FN_UINT_INT, BT_UINT, BT_INT)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_ULONG, BT_UINT, BT_ULONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_LONG, BT_UINT, BT_LONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_PTR, BT_UINT, BT_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_UINT_CONST_PTR, BT_UINT, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_PTR, BT_ULONG, BT_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_CONST_PTR, BT_ULONG, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_ULONG, BT_ULONG, BT_ULONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_ULONGLONG_ULONGLONG, BT_ULONGLONG, BT_ULONGLONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_INT8_FLOAT, BT_INT8, BT_FLOAT)
@@ -480,6 +482,7 @@ DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_SIZE_CONST_VPTR, BT_BOOL, BT_SIZE,
 DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL)
 DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT)
 DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_PTR, BT_UINT, BT_UINT, BT_PTR)
+DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_CONST_PTR, BT_UINT, BT_UINT, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_SIZE, BT_PTR, BT_CONST_PTR, BT_SIZE)
 DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_CONST_PTR, BT_PTR, BT_CONST_PTR, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTRPTR_CONST_PTR, BT_VOID, BT_PTR_PTR, BT_CONST_PTR)
@@ -569,6 +572,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_DOUBLE_DOUBLEPTR_DOUBLEPTR,
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_LONGDOUBLE_LONGDOUBLEPTR_LONGDOUBLEPTR,
                     BT_VOID, BT_LONGDOUBLE, BT_LONGDOUBLE_PTR, BT_LONGDOUBLE_PTR)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_PTR, BT_VOID, BT_PTR, BT_PTR, BT_PTR)
+DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_UINT32, BT_VOID, BT_PTR, BT_PTR, BT_UINT32)
 DEF_FUNCTION_TYPE_3 (BT_FN_INT_CONST_STRING_PTR_CONST_STRING_PTR_CONST_STRING,
                     BT_INT, BT_CONST_STRING, BT_PTR_CONST_STRING, BT_PTR_CONST_STRING)
 DEF_FUNCTION_TYPE_3 (BT_FN_INT_INT_CONST_STRING_VALIST_ARG,
index f4de7e52a5e692c3d89aadc0cadf70a49eb24694..bba9294c2d6430608d50e3a4d3b6ab59e83155a0 100644 (file)
@@ -1,4 +1,9 @@
-2018-05-04 Carl Love  <cel@us.ibm.com>
+2018-05-04  Pekka Jääskeläinen  <pekka.jaaskelainen@parmance.com>
+
+       * testsuite/brig.dg/test/gimple/smoke_test.hsail: Fix the test
+       to match the currently produced gimple.
+
+2018-05-04  Carl Love  <cel@us.ibm.com>
        * gcc.target/powerpc/vsx-vector-6.h (foo): Add test for vec_max,
        vec_trunc.
        * gcc.target/powerpc/vsx-vector-6-le.c (dg-final): Update xvcmpeqdp,
index 1f36ddc41811ea66ecc6e457ae3ea46e388ef4ec..6e2326391daad66f92af1765c77ccfe9b6f4a04d 100644 (file)
@@ -41,15 +41,15 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
 };
 
 /* The kernel function itself should have a fingerprint as follows */
-/* _Kernel (unsigned char * __args, void * __context, void * __group_base_addr, void * __private_base_addr) */
-/* { dg-final { scan-tree-dump "_Kernel \\\(unsigned char \\\* __args, void \\\* __context, void \\\* __group_base_addr, unsigned int __group_local_offset, void \\\* __private_base_addr\\\)" "gimple"} } */
+/* _Kernel (const unsigned char * restrict __args, void * restrict __context, unsigned char * restrict __group_base_addr, unsigned int __group_local_offset, unsigned char * restrict __private_base_addr) */
+/* { dg-final { scan-tree-dump "_Kernel \\\(const unsigned char \\\* restrict __args, void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr, unsigned int __group_local_offset, unsigned char \\\* restrict __private_base_addr\\\)" "gimple"} } */
 
 /* ld_kernarg: mem_read.0 = MEM[(unsigned long *)__args]; */
 /* { dg-final { scan-tree-dump "mem_read.\[0-9\] = MEM\\\[\\\(unsigned long \\\*\\\)__args\\\];" "gimple"} } */
 
 /* The latter ld_global_u32 should be visible as a pointer dereference (after pointer arithmetics on a temporary var): */
 /* mem_read.2 = *D.1691; */
-/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = \\\*\[_0-9\]+;" "gimple"} } */
+/* { dg-final { scan-tree-dump "mem_read.\[0-9\]+ = \\\*\[_0-9\]+;" "gimple"} } */
 
 /* add_u32s should generate +operators */
 /* { dg-final { scan-tree-dump "s2 = s0 \\\+ s1;" "gimple"} } */
@@ -71,8 +71,8 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
 /* { dg-final { scan-tree-dump "if \\\(__local_z < __cur_wg_size_z\\\) goto __wi_loop_z; else goto" "gimple"} } */
 
 /* The launcher should call __hsail_launch_wg_function in this case: */
-/* Kernel (void * __context, void * __group_base_addr) */
-/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* __context, void \\\* __group_base_addr\\\)" "gimple"} } */
+/* Kernel (void * restrict __context, unsigned char * restrict __group_base_addr) */
+/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr\\\)" "gimple"} } */
 /* { dg-final { scan-tree-dump "__hsail_launch_wg_function \\\(_Kernel, __context, __group_base_addr, group_local_offset.*\\\);" "gimple"} }*/
 
 /* The kernel should have the magic metadata section injected to the ELF. */
index 5ab9e8515f00573deec1274933a8b119e86edbf1..17aeb6e490c9f91632215a2eb5d7703099db1f6c 100644 (file)
@@ -1,3 +1,10 @@
+2018-05-04  Pekka Jääskeläinen  <pekka.jaaskelainen@parmance.com>
+
+       * include/internal/phsa-rt.h: Whitespace cleanup.
+       * include/internal/workitems.h: Store work item ID data to easily
+       accessible locations.
+       * rt/workitems.c: Same.
+
 2018-05-04  Pekka Jääskeläinen  <pekka.jaaskelainen@parmance.com>
 
        * rt/workitems.c: Fix an alloca stack underflow.
index d9db56ca08ec7c014fa19143496593d2d833f4a4..c09f18d0095e0d5181329a2c95327b1361aca3c3 100644 (file)
@@ -54,7 +54,6 @@ typedef void (*gccbrigKernelFunc) (unsigned char *, void *, void *, uint32_t,
 */
 typedef struct
 {
-
   /* Data set by the HSA Runtime's kernel launcher.  */
   hsa_kernel_dispatch_packet_t *dp;
 
index 73add287d8d4a3ff7299d830917b49677ed32c86..0839853ff122d19d95654a59cb0d2211969e8c59 100644 (file)
 
 typedef struct
 {
-  /* The group id of the currently executed WG.  */
-  size_t x;
-  size_t y;
-  size_t z;
-
   /* This is 1 in case there are more work groups to execute.
      If 0, the work-item threads should finish themselves.  */
   int more_wgs;
@@ -89,6 +84,16 @@ typedef struct
      stack frame.  Initialized to point outside the private segment.  */
   uint32_t alloca_frame_p;
 
+  /* The group id of the currently executed WG.  This is for fiber based
+     execution.  The group ids are duplicated also to the per WI context
+     struct for simplified single pointer access in the GCCBRIG produced
+     code.
+   */
+
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+
 } PHSAWorkGroup;
 
 /* Data identifying a single work-item, passed to the work-item thread in case
@@ -96,17 +101,42 @@ typedef struct
 
 typedef struct
 {
+  /* NOTE: These members STARTing here should not be moved as they are
+     accessed directly by code emitted by BRIG FE.   */
+
+  /* The local id of the current WI. */
+
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+
+  /* The group id of the currently executed WG.  */
+
+  uint32_t group_x;
+  uint32_t group_y;
+  uint32_t group_z;
+
+  /* The local size of a complete WG.  */
+
+  uint32_t wg_size_x;
+  uint32_t wg_size_y;
+  uint32_t wg_size_z;
+
+  /* The local size of the current WG.  */
+
+  uint32_t cur_wg_size_x;
+  uint32_t cur_wg_size_y;
+  uint32_t cur_wg_size_z;
+
+  /* NOTE: Fixed members END here.  */
+
   PHSAKernelLaunchData *launch_data;
   /* Identifies and keeps book of the currently executed WG of the WI swarm.  */
   volatile PHSAWorkGroup *wg;
-  /* The local id of the current WI.  */
-  size_t x;
-  size_t y;
-  size_t z;
 #ifdef HAVE_FIBERS
   fiber_t fiber;
 #endif
-} PHSAWorkItem;
+} __attribute__((packed)) PHSAWorkItem;
 
 
 #endif
index 36c91691a71450c186a50ac1346560cf4ef51b38..c846350e1cdb5d22d26fa1b569912cca50cd9e03 100644 (file)
@@ -107,11 +107,20 @@ phsa_work_item_thread (int arg0, int arg1)
         the current_work_group_* is set to point to the WG executed next.  */
       if (!wi->wg->more_wgs)
        break;
+
+      wi->group_x = wg->x;
+      wi->group_y = wg->y;
+      wi->group_z = wg->z;
+
+      wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+      wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+      wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
+
 #ifdef DEBUG_PHSA_RT
       printf (
        "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
-       wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x,
-       l_data->wg_max_y, l_data->wg_max_z);
+       wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
+       l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
 #endif
 
       if (wi->x < __hsail_currentworkgroupsize (0, wi)
@@ -180,6 +189,13 @@ phsa_work_item_thread (int arg0, int arg1)
          else
            wg->x++;
 #endif
+         wi->group_x = wg->x;
+         wi->group_y = wg->y;
+         wi->group_z = wg->z;
+
+         wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+         wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+         wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
 
          /* Reinitialize the work-group barrier according to the new WG's
             size, which might not be the same as the previous ones, due
@@ -233,6 +249,7 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
   PHSAWorkItem *wi_threads = NULL;
   PHSAWorkGroup wg;
   size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
+  uint32_t group_x, group_y, group_z;
   fiber_barrier_t wg_start_barrier;
   fiber_barrier_t wg_completion_barrier;
   fiber_barrier_t wg_sync_barrier;
@@ -257,13 +274,13 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
   wg.initial_group_offset = group_local_offset;
 
 #ifdef EXECUTE_WGS_BACKWARDS
-  wg.x = context->wg_max_x - 1;
-  wg.y = context->wg_max_y - 1;
-  wg.z = context->wg_max_z - 1;
+  group_x = context->wg_max_x - 1;
+  group_y = context->wg_max_y - 1;
+  group_z = context->wg_max_z - 1;
 #else
-  wg.x = context->wg_min_x;
-  wg.y = context->wg_min_y;
-  wg.z = context->wg_min_z;
+  group_x = context->wg_min_x;
+  group_y = context->wg_min_y;
+  group_z = context->wg_min_z;
 #endif
 
   fiber_barrier_init (&wg_sync_barrier, wg_size);
@@ -290,6 +307,19 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
          PHSAWorkItem *wi = &wi_threads[flat_wi_id];
          wi->launch_data = context;
          wi->wg = &wg;
+
+         wg.x = wi->group_x = group_x;
+         wg.y = wi->group_y = group_y;
+         wg.z = wi->group_z = group_z;
+
+         wi->wg_size_x = context->dp->workgroup_size_x;
+         wi->wg_size_y = context->dp->workgroup_size_y;
+         wi->wg_size_z = context->dp->workgroup_size_z;
+
+         wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+         wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+         wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
+
          wi->x = x;
          wi->y = y;
          wi->z = z;
@@ -467,9 +497,17 @@ phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
     for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
       for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
        {
-         wi.wg->x = wg_x;
-         wi.wg->y = wg_y;
-         wi.wg->z = wg_z;
+         wi.group_x = wg_x;
+         wi.group_y = wg_y;
+         wi.group_z = wg_z;
+
+         wi.wg_size_x = context->dp->workgroup_size_x;
+         wi.wg_size_y = context->dp->workgroup_size_y;
+         wi.wg_size_z = context->dp->workgroup_size_z;
+
+         wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
+         wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
+         wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
 
          context->kernel (context->kernarg_addr, &wi, group_base_ptr,
                           group_local_offset, private_base_ptr);
@@ -564,15 +602,15 @@ __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
     default:
     case 0:
       /* Overflow semantics in the case of WG dim > grid dim.  */
-      id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
+      id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
           % dp->grid_size_x;
       break;
     case 1:
-      id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
+      id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
           % dp->grid_size_y;
       break;
     case 2:
-      id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
+      id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
           % dp->grid_size_z;
       break;
     }
@@ -590,15 +628,15 @@ __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
     default:
     case 0:
       /* Overflow semantics in the case of WG dim > grid dim.  */
-      id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
+      id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
           % dp->grid_size_x;
       break;
     case 1:
-      id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
+      id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
           % dp->grid_size_y;
       break;
     case 2:
-      id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
+      id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
           % dp->grid_size_z;
       break;
     }
@@ -738,19 +776,19 @@ __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
     {
     default:
     case 0:
-      if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x)
+      if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
        wg_size = dp->workgroup_size_x; /* Full WG.  */
       else
        wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG.  */
       break;
     case 1:
-      if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y)
+      if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
        wg_size = dp->workgroup_size_y; /* Full WG.  */
       else
        wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG.  */
       break;
     case 2:
-      if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z)
+      if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
        wg_size = dp->workgroup_size_z; /* Full WG.  */
       else
        wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG.  */
@@ -798,11 +836,11 @@ __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
     {
     default:
     case 0:
-      return wi->wg->x;
+      return wi->group_x;
     case 1:
-      return wi->wg->y;
+      return wi->group_y;
     case 2:
-      return wi->wg->z;
+      return wi->group_z;
     }
 }