+2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
+
+ * brig-builtins.def: Add consts to ptrs etc. in BRIG builtin defs.
+ To improve optimization opportunities.
+ * builtin-types.def: The new needed builtin types for the above.
+
2018-05-04 Richard Biener <rguenther@suse.de>
* bb-reorder.c (sanitize_hot_paths): Release hot_bbs_to_check.
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U32,
BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U32,
- "__hsail_workitemflatabsid_u32", BT_FN_UINT_PTR,
- ATTR_NOTHROW_LEAF_LIST)
+ "__hsail_workitemflatabsid_u32", BT_FN_UINT_CONST_PTR,
+ ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U64,
BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U64,
- "__hsail_workitemflatabsid_u64", BT_FN_ULONG_PTR,
- ATTR_NOTHROW_LEAF_LIST)
+ "__hsail_workitemflatabsid_u64", BT_FN_ULONG_CONST_PTR,
+ ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATID, BRIG_OPCODE_WORKITEMFLATID,
- BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_PTR,
- ATTR_NOTHROW_LEAF_LIST)
+ BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_CONST_PTR,
+ ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMID, BRIG_OPCODE_WORKITEMID,
- BRIG_TYPE_U32, "__hsail_workitemid", BT_FN_UINT_UINT_PTR,
- ATTR_NOTHROW_LEAF_LIST)
+ BRIG_TYPE_U32, "__hsail_workitemid",
+ BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPID, BRIG_OPCODE_WORKGROUPID,
- BRIG_TYPE_U32, "__hsail_workgroupid", BT_FN_UINT_UINT_PTR,
- ATTR_PURE_NOTHROW_LEAF_LIST)
+ BRIG_TYPE_U32, "__hsail_workgroupid",
+ BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKITEMFLATID,
BRIG_OPCODE_CURRENTWORKITEMFLATID,
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE,
BRIG_OPCODE_CURRENTWORKGROUPSIZE, BRIG_TYPE_U32,
- "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_PTR,
+ "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPSIZE, BRIG_OPCODE_WORKGROUPSIZE,
- BRIG_TYPE_U32, "__hsail_workgroupsize", BT_FN_UINT_UINT_PTR,
+ BRIG_TYPE_U32, "__hsail_workgroupsize",
+ BT_FN_UINT_UINT_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_DIM, BRIG_OPCODE_DIM,
DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_WG_FUNC,
"__hsail_launch_wg_function",
- BT_FN_VOID_PTR_PTR_PTR, ATTR_NOTHROW_LEAF_LIST)
+ BT_FN_VOID_PTR_PTR_UINT32, ATTR_NOTHROW_LEAF_LIST)
DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_KERNEL,
"__hsail_launch_kernel",
+2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
+
+ Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
+ is given, these optimizations are disabled. With this flag, gccbrig
+ can generate GENERIC that assumes we are targeting a phsa-runtime
+ based implementation, which allows us to expose the work-item context
+ accesses to retrieve WI IDs etc. which helps optimizers.
+ First optimization that takes advantage of this is to get rid of
+ the setworkitemid calls whenever we have non-inlined calls that
+ use IDs internally. Other optimizations added in this commit:
+ - expand absoluteid to similar level of simplicity as workitemid.
+ At the moment absoluteid is the best indexing ID to end up with
+ WG vectorization.
+ - propagate ID variables closer to their uses. This is mainly
+ to avoid known useless casts, which confuse at least scalar
+ evolution analysis.
+ - use signed long long for storing IDs. Unsigned integers have
+ defined wraparound semantics, which confuse at least scalar
+ evolution analysis, leading to unvectorizable WI loops.
+ - also refactor some BRIG function generation helpers to brig_function.
+ - no point in having the wi-loop as a for-loop. It's really
+ a do...while and SCEV can analyze it just fine still.
+ - add consts to ptrs etc. in BRIG builtin defs.
+ Improves optimization opportunities.
+ - add qualifiers to generated function parameters.
+ Const and restrict on the hidden local/private pointers,
+ the arg buffer and the context pointer help some optimizations.
+ * brig/brigfrontend/brig-basic-inst-handler.cc: See above.
+ * brig/brigfrontend/brig-branch-inst-handler.cc: See above.
+ * brig/brigfrontend/brig-cmp-inst-handler.cc: See above.
+ * brig/brigfrontend/brig-code-entry-handler.cc: See above.
+ * brig/brigfrontend/brig-code-entry-handler.h: See above.
+ * brig/brigfrontend/brig-control-handler.cc: See above.
+ * brig/brigfrontend/brig-cvt-inst-handler.cc: See above.
+ * brig/brigfrontend/brig-function-handler.cc: See above.
+ * brig/brigfrontend/brig-function.cc: See above.
+ * brig/brigfrontend/brig-function.h: See above.
+ * brig/brigfrontend/brig-label-handler.cc: See above.
+ * brig/brigfrontend/brig-lane-inst-handler.cc: See above.
+ * brig/brigfrontend/brig-mem-inst-handler.cc: See above.
+ * brig/brigfrontend/phsa.h: See above.
+ * brig/lang.opt: See above.
+
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* brig/brigfrontend/brig-function-handler.cc: Skip multiple forward
/* Unpack the tightly packed mask elements to BIT_FIELD_REFs
from which to construct the mask vector as understood by
VEC_PERM_EXPR. */
- tree mask_operand = add_temp_var ("shuffle_mask", operands[2]);
+ tree mask_operand
+ = m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);
tree mask_element_type
= build_nonstandard_integer_type (input_mask_element_size, true);
tree wide_type = build_nonstandard_integer_type (vecsize, 1);
tree src_vect = build_resize_convert_view (wide_type, operands[0]);
- src_vect = add_temp_var ("src_vect", src_vect);
+ src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);
tree scalar = operands[1];
- scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar));
+ scalar = m_parent.m_cf->add_temp_var ("scalar",
+ convert_to_integer (wide_type, scalar));
tree pos = operands[2];
Zero them for well-defined semantics. */
tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
build_int_cstu (TREE_TYPE (pos), ecount - 1));
- pos = add_temp_var ("pos", convert (wide_type, t));
+ pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));
tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
tree ewidth = build_int_cstu (wide_type, element_width);
tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
- bitoffset = add_temp_var ("offset", bitoffset);
+ bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);
uint64_t mask_int
= element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
tree mask = build_int_cstu (wide_type, mask_int);
- mask = add_temp_var ("mask", convert_to_integer (wide_type, mask));
+ mask = m_parent.m_cf->add_temp_var ("mask",
+ convert_to_integer (wide_type, mask));
tree clearing_mask
= build1 (BIT_NOT_EXPR, wide_type,
tree arith_type,
tree_stl_vec &operands)
{
- tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
+ tree_code opcode
+ = brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
on which cannot be used in general to remain HSAIL compliant.
Perhaps a builtin call would be better option here. */
return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
- expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type,
- arith_type, operands));
+ m_parent.m_cf->expand_or_call_builtin
+ (BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
}
else if (brig_opcode == BRIG_OPCODE_NRCP)
{
gcc_unreachable ();
}
else if (opcode == CALL_EXPR)
- return expand_or_call_builtin (brig_opcode, brig_type, arith_type,
- operands);
+ return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
+ arith_type, operands);
else if (output_count == 1)
{
if (input_count == 1)
in_operands[0] = build_lower_element_broadcast (in_operands[0]);
tree_code opcode
- = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type);
+ = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
+ brig_inst_type);
if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
{
*/
tree_stl_vec operand0_elements;
if (input_count > 0)
- unpack (in_operands[0], operand0_elements);
+ m_parent.m_cf->unpack (in_operands[0], operand0_elements);
tree_stl_vec operand1_elements;
if (input_count > 1)
- unpack (in_operands[1], operand1_elements);
+ m_parent.m_cf->unpack (in_operands[1], operand1_elements);
tree_stl_vec result_elements;
result_elements.push_back (convert (scalar_type, scalar_expr));
}
- instr_expr = pack (result_elements);
+ instr_expr = m_parent.m_cf->pack (result_elements);
}
else
{
vec_operand, mask);
}
-/* Returns the tree code that should be used to implement the given
- HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
- (BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly,
- returns TREE_LIST (if it can be emulated with a simple chain of tree
- nodes) or CALL_EXPR if the opcode should be implemented using a builtin
- call. */
-
-tree_code
-brig_basic_inst_handler::get_tree_code_for_hsa_opcode
- (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
-{
- BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
- switch (brig_opcode)
- {
- case BRIG_OPCODE_NOP:
- return NOP_EXPR;
- case BRIG_OPCODE_ADD:
- return PLUS_EXPR;
- case BRIG_OPCODE_CMOV:
- if (brig_inner_type == brig_type)
- return COND_EXPR;
- else
- return VEC_COND_EXPR;
- case BRIG_OPCODE_SUB:
- return MINUS_EXPR;
- case BRIG_OPCODE_MUL:
- case BRIG_OPCODE_MUL24:
- return MULT_EXPR;
- case BRIG_OPCODE_MULHI:
- case BRIG_OPCODE_MUL24HI:
- return MULT_HIGHPART_EXPR;
- case BRIG_OPCODE_DIV:
- if (gccbrig_is_float_type (brig_inner_type))
- return RDIV_EXPR;
- else
- return TRUNC_DIV_EXPR;
- case BRIG_OPCODE_NEG:
- return NEGATE_EXPR;
- case BRIG_OPCODE_MIN:
- if (gccbrig_is_float_type (brig_inner_type))
- return CALL_EXPR;
- else
- return MIN_EXPR;
- case BRIG_OPCODE_MAX:
- if (gccbrig_is_float_type (brig_inner_type))
- return CALL_EXPR;
- else
- return MAX_EXPR;
- case BRIG_OPCODE_FMA:
- return FMA_EXPR;
- case BRIG_OPCODE_ABS:
- return ABS_EXPR;
- case BRIG_OPCODE_SHL:
- return LSHIFT_EXPR;
- case BRIG_OPCODE_SHR:
- return RSHIFT_EXPR;
- case BRIG_OPCODE_OR:
- return BIT_IOR_EXPR;
- case BRIG_OPCODE_XOR:
- return BIT_XOR_EXPR;
- case BRIG_OPCODE_AND:
- return BIT_AND_EXPR;
- case BRIG_OPCODE_NOT:
- return BIT_NOT_EXPR;
- case BRIG_OPCODE_RET:
- return RETURN_EXPR;
- case BRIG_OPCODE_MOV:
- case BRIG_OPCODE_LDF:
- return MODIFY_EXPR;
- case BRIG_OPCODE_LD:
- case BRIG_OPCODE_ST:
- return MEM_REF;
- case BRIG_OPCODE_BR:
- return GOTO_EXPR;
- case BRIG_OPCODE_REM:
- if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
- return TRUNC_MOD_EXPR;
- else
- return CALL_EXPR;
- case BRIG_OPCODE_NRCP:
- case BRIG_OPCODE_NRSQRT:
- /* Implement as 1/f (x). gcc should pattern detect that and
- use a native instruction, if available, for it. */
- return TREE_LIST;
- case BRIG_OPCODE_FLOOR:
- case BRIG_OPCODE_CEIL:
- case BRIG_OPCODE_SQRT:
- case BRIG_OPCODE_NSQRT:
- case BRIG_OPCODE_RINT:
- case BRIG_OPCODE_TRUNC:
- case BRIG_OPCODE_POPCOUNT:
- case BRIG_OPCODE_COPYSIGN:
- case BRIG_OPCODE_NCOS:
- case BRIG_OPCODE_NSIN:
- case BRIG_OPCODE_NLOG2:
- case BRIG_OPCODE_NEXP2:
- case BRIG_OPCODE_NFMA:
- /* Class has type B1 regardless of the float type, thus
- the below builtin map search cannot find it. */
- case BRIG_OPCODE_CLASS:
- case BRIG_OPCODE_WORKITEMABSID:
- return CALL_EXPR;
- default:
-
- /* Some BRIG opcodes can use the same builtins for unsigned and
- signed types. Force these cases to unsigned types.
- */
-
- if (brig_opcode == BRIG_OPCODE_BORROW
- || brig_opcode == BRIG_OPCODE_CARRY
- || brig_opcode == BRIG_OPCODE_LASTBIT
- || brig_opcode == BRIG_OPCODE_BITINSERT)
- {
- if (brig_type == BRIG_TYPE_S32)
- brig_type = BRIG_TYPE_U32;
- else if (brig_type == BRIG_TYPE_S64)
- brig_type = BRIG_TYPE_U64;
- }
-
-
- builtin_map::const_iterator i
- = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
- if (i != s_custom_builtins.end ())
- return CALL_EXPR;
- else if (s_custom_builtins.find
- (std::make_pair (brig_opcode, brig_inner_type))
- != s_custom_builtins.end ())
- return CALL_EXPR;
- if (brig_inner_type == BRIG_TYPE_F16
- && s_custom_builtins.find
- (std::make_pair (brig_opcode, BRIG_TYPE_F32))
- != s_custom_builtins.end ())
- return CALL_EXPR;
- break;
- }
- return TREE_LIST; /* Emulate using a chain of nodes. */
-}
memory. */
tree group_local_offset
- = add_temp_var ("group_local_offset",
- build_int_cst
- (uint32_type_node,
- m_parent.m_cf->m_local_group_variables.size()));
+ = m_parent.m_cf->add_temp_var ("group_local_offset",
+ build_int_cst
+ (uint32_type_node,
+ m_parent.m_cf->
+ m_local_group_variables.size()));
/* TODO: ensure the callee's frame is aligned! */
m_parent.m_cf->m_called_functions.push_back (func_ref);
if (DECL_EXTERNAL (func_ref))
m_parent.add_decl_call (call);
+ m_parent.m_cf->start_new_bb ();
return base->byteCount;
}
ensure the barrier won't be duplicated or moved out of loops etc.
Like the 'noduplicate' of LLVM. Same goes for fbarriers. */
m_parent.m_cf->append_statement
- (expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE, NULL_TREE,
- call_operands));
+ (m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
+ BRIG_TYPE_NONE, NULL_TREE,
+ call_operands));
}
else if (brig_inst->opcode >= BRIG_OPCODE_ARRIVEFBAR
&& brig_inst->opcode <= BRIG_OPCODE_WAITFBAR)
{
m_parent.m_cf->m_has_barriers = true;
m_parent.m_cf->append_statement
- (expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE,
- uint32_type_node, operands));
+ (m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
+ BRIG_TYPE_NONE,
+ uint32_type_node, operands));
}
else
gcc_unreachable ();
+ m_parent.m_cf->start_new_bb ();
return base->byteCount;
}
results, we must now truncate the result vector to S16s so it
fits to the destination register. We can build the target vector
type from the f16 storage type (unsigned ints). */
- expr = add_temp_var ("wide_cmp_result", expr);
+ expr = m_parent.m_cf->add_temp_var ("wide_cmp_result", expr);
tree_stl_vec wide_elements;
tree_stl_vec shrunk_elements;
- unpack (expr, wide_elements);
+ m_parent.m_cf->unpack (expr, wide_elements);
for (size_t i = 0; i < wide_elements.size (); ++i)
{
tree wide = wide_elements.at (i);
shrunk_elements.push_back
(convert_to_integer (short_integer_type_node, wide));
}
- expr = pack (shrunk_elements);
+ expr = m_parent.m_cf->pack (shrunk_elements);
}
build_output_assignment (*inst_base, operands[0], expr);
#include "brig-builtins.h"
#include "fold-const.h"
-brig_code_entry_handler::builtin_map brig_code_entry_handler::s_custom_builtins;
-
brig_code_entry_handler::brig_code_entry_handler (brig_to_generic &parent)
: brig_entry_handler (parent)
{
- if (s_custom_builtins.size () > 0) return;
-
- /* Populate the builtin index. */
-#undef DEF_HSAIL_ATOMIC_BUILTIN
-#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
-#undef DEF_HSAIL_INTR_BUILTIN
-#undef DEF_HSAIL_SAT_BUILTIN
-#undef DEF_HSAIL_BUILTIN
-#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \
- s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)] \
- = builtin_decl_explicit (ENUM);
-
-#include "brig-builtins.def"
}
/* Build a tree operand which is a reference to a piece of code. REF is the
tree local_size
= build2 (MULT_EXPR, uint32_type_node,
- expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
- BRIG_TYPE_U32,
- uint32_type_node, uint32_0),
- expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
- BRIG_TYPE_U32,
- uint32_type_node, uint32_1));
+ m_parent.m_cf->expand_or_call_builtin
+ (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+ uint32_type_node, uint32_0),
+ m_parent.m_cf->expand_or_call_builtin
+ (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+ uint32_type_node, uint32_1));
local_size
= build2 (MULT_EXPR, uint32_type_node,
- expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
- BRIG_TYPE_U32,
- uint32_type_node, uint32_2),
+ m_parent.m_cf->expand_or_call_builtin
+ (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+ uint32_type_node, uint32_2),
local_size);
tree var_region
= build2 (MULT_EXPR, uint32_type_node,
build_int_cst (uint32_type_node,
m_parent.private_variable_size (var_name)),
- expand_or_call_builtin (BRIG_OPCODE_WORKITEMFLATID,
- BRIG_TYPE_U32,
- uint32_type_node, operands));
+ m_parent.m_cf->expand_or_call_builtin
+ (BRIG_OPCODE_WORKITEMFLATID, BRIG_TYPE_U32,
+ uint32_type_node, operands));
tree var_offset
= build2 (PLUS_EXPR, uint32_type_node, var_region, pos);
offset to a flat address by adding it as an offset to a (private
or group) base pointer later on. Same applies to group_var_offset. */
symbol_base
- = add_temp_var ("priv_var_offset",
- convert (size_type_node, var_offset));
+ = m_parent.m_cf->add_temp_var ("priv_var_offset",
+ convert (size_type_node,
+ var_offset));
}
else if (segment == BRIG_SEGMENT_ARG)
{
return gccbrig_tree_type_for_hsa_type (brig_type);
}
-/* In case the HSA instruction must be implemented using a builtin,
- this function is called to get the correct builtin function.
- TYPE is the instruction tree type, BRIG_OPCODE the opcode of the
- brig instruction and BRIG_TYPE the brig instruction's type. */
-
-tree
-brig_code_entry_handler::get_builtin_for_hsa_opcode
- (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
-{
- tree builtin = NULL_TREE;
- tree builtin_type = type;
-
- /* For vector types, first find the scalar version of the builtin. */
- if (type != NULL_TREE && VECTOR_TYPE_P (type))
- builtin_type = TREE_TYPE (type);
- BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
-
- /* Some BRIG opcodes can use the same builtins for unsigned and
- signed types. Force these cases to unsigned types. */
-
- if (brig_opcode == BRIG_OPCODE_BORROW
- || brig_opcode == BRIG_OPCODE_CARRY
- || brig_opcode == BRIG_OPCODE_LASTBIT
- || brig_opcode == BRIG_OPCODE_BITINSERT)
- {
- if (brig_type == BRIG_TYPE_S32)
- brig_type = BRIG_TYPE_U32;
- else if (brig_type == BRIG_TYPE_S64)
- brig_type = BRIG_TYPE_U64;
- }
-
- switch (brig_opcode)
- {
- case BRIG_OPCODE_FLOOR:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_FLOOR);
- break;
- case BRIG_OPCODE_CEIL:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_CEIL);
- break;
- case BRIG_OPCODE_SQRT:
- case BRIG_OPCODE_NSQRT:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_SQRT);
- break;
- case BRIG_OPCODE_RINT:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_RINT);
- break;
- case BRIG_OPCODE_TRUNC:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_TRUNC);
- break;
- case BRIG_OPCODE_COPYSIGN:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_COPYSIGN);
- break;
- case BRIG_OPCODE_NSIN:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_SIN);
- break;
- case BRIG_OPCODE_NLOG2:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_LOG2);
- break;
- case BRIG_OPCODE_NEXP2:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_EXP2);
- break;
- case BRIG_OPCODE_NFMA:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_FMA);
- break;
- case BRIG_OPCODE_NCOS:
- builtin = mathfn_built_in (builtin_type, BUILT_IN_COS);
- break;
- case BRIG_OPCODE_POPCOUNT:
- /* Popcount should be typed by its argument type (the return value
- is always u32). Let's use a b64 version for also for b32 for now. */
- return builtin_decl_explicit (BUILT_IN_POPCOUNTL);
- case BRIG_OPCODE_BORROW:
- /* Borrow uses the same builtin for unsigned and signed types. */
- if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
- return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U32);
- else
- return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U64);
- case BRIG_OPCODE_CARRY:
- /* Carry also uses the same builtin for unsigned and signed types. */
- if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
- return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U32);
- else
- return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U64);
- default:
-
- /* Use our builtin index for finding a proper builtin for the BRIG
- opcode and BRIG type. This takes care most of the builtin cases,
- the special cases are handled in the separate 'case' statements
- above. */
- builtin_map::const_iterator i
- = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
- if (i != s_custom_builtins.end ())
- return (*i).second;
-
- if (brig_inner_type != brig_type)
- {
- /* Try to find a scalar built-in we could use. */
- i = s_custom_builtins.find
- (std::make_pair (brig_opcode, brig_inner_type));
- if (i != s_custom_builtins.end ())
- return (*i).second;
- }
-
- /* In case this is an fp16 operation that is promoted to fp32,
- try to find a fp32 scalar built-in. */
- if (brig_inner_type == BRIG_TYPE_F16)
- {
- i = s_custom_builtins.find
- (std::make_pair (brig_opcode, BRIG_TYPE_F32));
- if (i != s_custom_builtins.end ())
- return (*i).second;
- }
- gcc_unreachable ();
- }
-
- if (VECTOR_TYPE_P (type) && builtin != NULL_TREE)
- {
- /* Try to find a vectorized version of the built-in.
- TODO: properly assert that builtin is a mathfn builtin? */
- tree vec_builtin
- = targetm.vectorize.builtin_vectorized_function
- (builtin_mathfn_code (builtin), type, type);
- if (vec_builtin != NULL_TREE)
- return vec_builtin;
- else
- return builtin;
- }
- if (builtin == NULL_TREE)
- gcc_unreachable ();
- return builtin;
-}
-
/* Return the correct GENERIC type for storing comparison results
of operand with the type given in SOURCE_TYPE. */
return gccbrig_tree_type_for_hsa_type (BRIG_TYPE_B1);
}
-/* Returns true in case the given opcode needs to know about work-item context
- data. In such case the context data is passed as a pointer to a work-item
- context object, as the last argument in the builtin call. */
-
-bool
-brig_code_entry_handler::needs_workitem_context_data
- (BrigOpcode16_t brig_opcode) const
-{
- switch (brig_opcode)
- {
- case BRIG_OPCODE_WORKITEMABSID:
- case BRIG_OPCODE_WORKITEMFLATABSID:
- case BRIG_OPCODE_WORKITEMFLATID:
- case BRIG_OPCODE_CURRENTWORKITEMFLATID:
- case BRIG_OPCODE_WORKITEMID:
- case BRIG_OPCODE_WORKGROUPID:
- case BRIG_OPCODE_WORKGROUPSIZE:
- case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
- case BRIG_OPCODE_GRIDGROUPS:
- case BRIG_OPCODE_GRIDSIZE:
- case BRIG_OPCODE_DIM:
- case BRIG_OPCODE_PACKETID:
- case BRIG_OPCODE_PACKETCOMPLETIONSIG:
- case BRIG_OPCODE_BARRIER:
- case BRIG_OPCODE_WAVEBARRIER:
- case BRIG_OPCODE_ARRIVEFBAR:
- case BRIG_OPCODE_INITFBAR:
- case BRIG_OPCODE_JOINFBAR:
- case BRIG_OPCODE_LEAVEFBAR:
- case BRIG_OPCODE_RELEASEFBAR:
- case BRIG_OPCODE_WAITFBAR:
- case BRIG_OPCODE_CUID:
- case BRIG_OPCODE_MAXCUID:
- case BRIG_OPCODE_DEBUGTRAP:
- case BRIG_OPCODE_GROUPBASEPTR:
- case BRIG_OPCODE_KERNARGBASEPTR:
- case BRIG_OPCODE_ALLOCA:
- return true;
- default:
- return false;
- };
-}
-
-/* Returns true in case the given opcode that would normally be generated
- as a builtin call can be expanded to tree nodes. */
-
-bool
-brig_code_entry_handler::can_expand_builtin (BrigOpcode16_t brig_opcode) const
-{
- switch (brig_opcode)
- {
- case BRIG_OPCODE_WORKITEMFLATABSID:
- case BRIG_OPCODE_WORKITEMFLATID:
- case BRIG_OPCODE_WORKITEMABSID:
- case BRIG_OPCODE_WORKGROUPSIZE:
- case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
- /* TODO: expand more builtins. */
- return true;
- default:
- return false;
- };
-}
-
-/* Try to expand the given builtin call to reuse a previously generated
- variable, if possible. If not, just call the given builtin.
- BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type,
- ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's
- input operands. */
-
-tree
-brig_code_entry_handler::expand_or_call_builtin (BrigOpcode16_t brig_opcode,
- BrigType16_t brig_type,
- tree arith_type,
- tree_stl_vec &operands)
-{
- if (m_parent.m_cf->m_is_kernel && can_expand_builtin (brig_opcode))
- return expand_builtin (brig_opcode, operands);
-
- tree built_in
- = get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type);
-
- if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in)))
- && arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type)
- && brig_opcode != BRIG_OPCODE_LERP
- && brig_opcode != BRIG_OPCODE_PACKCVT
- && brig_opcode != BRIG_OPCODE_SAD
- && brig_opcode != BRIG_OPCODE_SADHI)
- {
- /* Call the scalar built-in for all elements in the vector. */
- tree_stl_vec operand0_elements;
- if (operands.size () > 0)
- unpack (operands[0], operand0_elements);
-
- tree_stl_vec operand1_elements;
- if (operands.size () > 1)
- unpack (operands[1], operand1_elements);
-
- tree_stl_vec result_elements;
-
- size_t element_count = gccbrig_type_vector_subparts (arith_type);
- for (size_t i = 0; i < element_count; ++i)
- {
- tree_stl_vec call_operands;
- if (operand0_elements.size () > 0)
- call_operands.push_back (operand0_elements.at (i));
-
- if (operand1_elements.size () > 0)
- call_operands.push_back (operand1_elements.at (i));
-
- result_elements.push_back
- (expand_or_call_builtin (brig_opcode, brig_type,
- TREE_TYPE (arith_type),
- call_operands));
- }
- return pack (result_elements);
- }
-
- tree_stl_vec call_operands;
- tree_stl_vec operand_types;
-
- tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in));
-
- for (size_t i = 0; i < operands.size (); ++i)
- {
- tree operand_type = TREE_VALUE (arg_type_chain);
- call_operands.push_back (convert (operand_type, operands[i]));
- operand_types.push_back (operand_type);
- arg_type_chain = TREE_CHAIN (arg_type_chain);
- }
-
- if (needs_workitem_context_data (brig_opcode))
- {
- call_operands.push_back (m_parent.m_cf->m_context_arg);
- operand_types.push_back (ptr_type_node);
- m_parent.m_cf->m_has_unexpanded_dp_builtins = true;
- }
-
- size_t operand_count = call_operands.size ();
-
- call_operands.resize (4, NULL_TREE);
- operand_types.resize (4, NULL_TREE);
- for (size_t i = 0; i < operand_count; ++i)
- call_operands.at (i) = build_resize_convert_view (operand_types.at (i),
- call_operands.at (i));
-
- tree fnptr = build_fold_addr_expr (built_in);
- return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr,
- operand_count, &call_operands[0]);
-}
-
-/* Instead of calling a built-in, reuse a previously returned value known to
- be still valid. This is beneficial especially for the work-item
- identification related builtins as not having them as calls can lead to
- more easily vectorizable parallel loops for multi work-item work-groups.
- BRIG_OPCODE identifies the builtin and OPERANDS store the operands. */
-
-tree
-brig_code_entry_handler::expand_builtin (BrigOpcode16_t brig_opcode,
- tree_stl_vec &operands)
-{
- tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0));
-
- tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1));
-
- tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2));
-
- if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID)
- {
- tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0);
- id0 = convert (uint64_type_node, id0);
-
- tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1);
- id1 = convert (uint64_type_node, id1);
-
- tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2);
- id2 = convert (uint64_type_node, id2);
-
- tree max0 = convert (uint64_type_node,
- m_parent.m_cf->m_grid_size_vars[0]);
- tree max1 = convert (uint64_type_node,
- m_parent.m_cf->m_grid_size_vars[1]);
-
- tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0);
- id2_x_max0_x_max1
- = build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1);
-
- tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0);
-
- tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0);
- sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1);
-
- return add_temp_var ("workitemflatabsid", sum);
- }
- else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID)
- {
- HOST_WIDE_INT dim = int_constant_value (operands[0]);
-
- tree local_id_var = m_parent.m_cf->m_local_id_vars[dim];
- tree wg_id_var = m_parent.m_cf->m_wg_id_vars[dim];
- tree wg_size_var = m_parent.m_cf->m_wg_size_vars[dim];
-
- tree wg_id_x_wg_size = build2 (MULT_EXPR, uint32_type_node,
- convert (uint32_type_node, wg_id_var),
- convert (uint32_type_node, wg_size_var));
- tree sum
- = build2 (PLUS_EXPR, uint32_type_node, wg_id_x_wg_size, local_id_var);
-
- return add_temp_var (std::string ("workitemabsid_")
- + (char) ((int) 'x' + dim), sum);
- }
- else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID)
- {
- tree z_x_wgsx_wgsy
- = build2 (MULT_EXPR, uint32_type_node,
- m_parent.m_cf->m_local_id_vars[2],
- m_parent.m_cf->m_wg_size_vars[0]);
- z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy,
- m_parent.m_cf->m_wg_size_vars[1]);
-
- tree y_x_wgsx
- = build2 (MULT_EXPR, uint32_type_node,
- m_parent.m_cf->m_local_id_vars[1],
- m_parent.m_cf->m_wg_size_vars[0]);
-
- tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy);
- sum = build2 (PLUS_EXPR, uint32_type_node,
- m_parent.m_cf->m_local_id_vars[0],
- sum);
- return add_temp_var ("workitemflatid", sum);
- }
- else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE)
- {
- HOST_WIDE_INT dim = int_constant_value (operands[0]);
- return m_parent.m_cf->m_wg_size_vars[dim];
- }
- else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE)
- {
- HOST_WIDE_INT dim = int_constant_value (operands[0]);
- return m_parent.m_cf->m_cur_wg_size_vars[dim];
- }
- else
- gcc_unreachable ();
-
- return NULL_TREE;
-}
-
-/* Appends and returns a new temp variable and an accompanying assignment
- statement that stores the value of the given EXPR and has the given NAME. */
-
-tree
-brig_code_entry_handler::add_temp_var (std::string name, tree expr)
-{
- tree temp_var = create_tmp_var (TREE_TYPE (expr), name.c_str ());
- tree assign = build2 (MODIFY_EXPR, TREE_TYPE (temp_var), temp_var, expr);
- m_parent.m_cf->append_statement (assign);
- return temp_var;
-}
-
/* Creates a FP32 to FP16 conversion call, assuming the source and destination
are FP32 type variables. */
variable type (can be any type; see get_m_var_declfor_reg @
brig-function.cc). */
tree output_type = TREE_TYPE (output);
- tree input_type = TREE_TYPE (inst_expr);
bool is_fp16 = (brig_inst.type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
&& brig_inst.base.kind != BRIG_KIND_INST_MEM
&& !gccbrig_is_bit_operation (brig_inst.opcode);
bool ftz = false;
const BrigBase *base = &brig_inst.base;
+ if (m_parent.m_cf->is_id_val (inst_expr))
+ inst_expr = m_parent.m_cf->id_val (inst_expr);
+
+ tree input_type = TREE_TYPE (inst_expr);
+
+ m_parent.m_cf->add_reg_var_update (output, inst_expr);
+
if (base->kind == BRIG_KIND_INST_MOD)
{
const BrigInstMod *mod = (const BrigInstMod *) base;
{
/* Ensure we don't duplicate the arithmetics to the arguments of the bit
field reference operators. */
- inst_expr = add_temp_var ("before_ftz", inst_expr);
+ inst_expr = m_parent.m_cf->add_temp_var ("before_ftz", inst_expr);
inst_expr = flush_to_zero (is_fp16) (*this, inst_expr);
}
if (is_fp16)
{
- inst_expr = add_temp_var ("before_f2h", inst_expr);
+ inst_expr = m_parent.m_cf->add_temp_var ("before_f2h", inst_expr);
tree f2h_output = build_f2h_conversion (inst_expr);
tree conv = build_resize_convert_view (output_type, f2h_output);
tree assign = build2 (MODIFY_EXPR, output_type, output, conv);
m_parent.m_cf->append_statement (stmt);
}
-/* Unpacks the elements of the vector in VALUE to scalars (bit field
- references) in ELEMENTS. */
-
-void
-brig_code_entry_handler::unpack (tree value, tree_stl_vec &elements)
-{
- size_t vec_size = int_size_in_bytes (TREE_TYPE (value));
- size_t element_size
- = int_size_in_bytes (TREE_TYPE (TREE_TYPE (value))) * BITS_PER_UNIT;
- size_t element_count
- = vec_size * BITS_PER_UNIT / element_size;
-
- tree input_element_type = TREE_TYPE (TREE_TYPE (value));
-
- value = add_temp_var ("unpack_input", value);
-
- for (size_t i = 0; i < element_count; ++i)
- {
- tree element
- = build3 (BIT_FIELD_REF, input_element_type, value,
- TYPE_SIZE (input_element_type),
- bitsize_int(i * element_size));
-
- element = add_temp_var ("scalar", element);
- elements.push_back (element);
- }
-}
-
-/* Pack the elements of the scalars in ELEMENTS to the returned vector. */
-
-tree
-brig_code_entry_handler::pack (tree_stl_vec &elements)
-{
- size_t element_count = elements.size ();
-
- gcc_assert (element_count > 1);
-
- tree output_element_type = TREE_TYPE (elements.at (0));
-
- vec<constructor_elt, va_gc> *constructor_vals = NULL;
- for (size_t i = 0; i < element_count; ++i)
- CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, elements.at (i));
-
- tree vec_type = build_vector_type (output_element_type, element_count);
-
- /* build_constructor creates a vector type which is not a vector_cst
- that requires compile time constant elements. */
- tree vec = build_constructor (vec_type, constructor_vals);
-
- /* Add a temp variable for readability. */
- tree tmp_var = create_tmp_var (vec_type, "vec_out");
- tree vec_tmp_assign = build2 (MODIFY_EXPR, TREE_TYPE (tmp_var), tmp_var, vec);
- m_parent.m_cf->append_statement (vec_tmp_assign);
- return tmp_var;
-}
-
/* Visits the element(s) in the OPERAND, calling HANDLER to each of them. */
tree
n = TREE_OPERAND (n, 0);
return int_cst_value (n);
}
-
class brig_code_entry_handler : public brig_entry_handler
{
public:
- typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
-
brig_code_entry_handler (brig_to_generic &parent);
/* Handles the brig_code data at the given pointer and adds it to the
tree get_tree_expr_type_for_hsa_type (BrigType16_t brig_type) const;
tree get_tree_cst_for_hsa_operand (const BrigOperandConstantBytes *brigConst,
tree type) const;
- tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
- BrigType16_t brig_type) const;
tree get_comparison_result_type (tree source_type);
tree build_code_ref (const BrigBase &ref);
bool needs_workitem_context_data (BrigOpcode16_t brig_opcode) const;
- void unpack (tree value, tree_stl_vec &elements);
- tree pack (tree_stl_vec &elements);
-
- bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
- tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
-
- tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
- BrigType16_t brig_type, tree arith_type,
- tree_stl_vec &operands);
-
tree add_temp_var (std::string name, tree expr);
tree build_f2h_conversion (tree source);
tree extend_int (tree input, tree dest_type, tree src_type);
- /* HSAIL-specific builtin functions not yet integrated to gcc. */
-
- static builtin_map s_custom_builtins;
-
private:
tree_stl_vec build_or_analyze_operands (const BrigInstBase &brig_inst,
tree build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode, tree arith_type,
tree_stl_vec &operands);
-
- tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
- BrigType16_t brig_type) const;
};
class brig_cvt_inst_handler : public brig_inst_mod_handler
case BRIG_CONTROL_MAXDYNAMICGROUPSIZE:
{
m_parent.m_cf->m_descriptor.max_dynamic_group_size
- = int_constant_value (operands.at (0));
+ = brig_function::int_constant_value (operands.at (0));
break;
}
case BRIG_CONTROL_MAXFLATGRIDSIZE:
{
m_parent.m_cf->m_descriptor.max_flat_grid_size
- = int_constant_value (operands.at (0));
+ = brig_function::int_constant_value (operands.at (0));
break;
}
case BRIG_CONTROL_MAXFLATWORKGROUPSIZE:
{
m_parent.m_cf->m_descriptor.max_flat_workgroup_size
- = int_constant_value (operands.at (0));
+ = brig_function::int_constant_value (operands.at (0));
break;
}
case BRIG_CONTROL_REQUIREDDIM:
{
m_parent.m_cf->m_descriptor.required_dim
- = int_constant_value (operands.at (0));
+ = brig_function::int_constant_value (operands.at (0));
break;
}
case BRIG_CONTROL_REQUIREDGRIDSIZE:
{
m_parent.m_cf->m_descriptor.required_grid_size[0]
- = int_constant_value (operands.at (0));
+ = brig_function::int_constant_value (operands.at (0));
m_parent.m_cf->m_descriptor.required_grid_size[1]
- = int_constant_value (operands.at (1));
+ = brig_function::int_constant_value (operands.at (1));
m_parent.m_cf->m_descriptor.required_grid_size[2]
- = int_constant_value (operands.at (2));
+ = brig_function::int_constant_value (operands.at (2));
break;
}
case BRIG_CONTROL_REQUIREDWORKGROUPSIZE:
{
m_parent.m_cf->m_descriptor.required_workgroup_size[0]
- = int_constant_value (operands.at (0));
+ = brig_function::int_constant_value (operands.at (0));
m_parent.m_cf->m_descriptor.required_workgroup_size[1]
- = int_constant_value (operands.at (1));
+ = brig_function::int_constant_value (operands.at (1));
m_parent.m_cf->m_descriptor.required_workgroup_size[2]
- = int_constant_value (operands.at (2));
+ = brig_function::int_constant_value (operands.at (2));
break;
}
case BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS:
tree &input = operands.at (1);
tree &output = operands.at (0);
+ if (m_parent.m_cf->is_id_val (input))
+ {
+ input = m_parent.m_cf->id_val (input);
+ src_type = TREE_TYPE (input);
+ }
+
size_t conv_src_size = int_size_in_bytes (src_type);
size_t conv_dst_size = int_size_in_bytes (dest_type);
size_t src_reg_size = int_size_in_bytes (TREE_TYPE (input));
represent HSAIL registers. */
tree bind_expr = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, NULL);
+ tree restrict_char_ptr
+ = build_qualified_type (build_pointer_type (char_type_node),
+ TYPE_QUAL_RESTRICT);
+ tree restrict_void_ptr
+ = build_qualified_type (build_pointer_type (void_type_node),
+ TYPE_QUAL_RESTRICT);
+
+ tree restrict_const_char_ptr
+ = build_qualified_type (build_pointer_type
+ (build_qualified_type (char_type_node,
+ TYPE_QUAL_CONST)),
+ TYPE_QUAL_RESTRICT);
+
+ tree restrict_const_void_ptr
+ = build_qualified_type (build_pointer_type
+ (build_qualified_type (void_type_node,
+ TYPE_QUAL_CONST)),
+ TYPE_QUAL_RESTRICT);
+
if (is_kernel)
{
tree name_identifier
3) a void* parameter that contains the first flat address of the group
region allocated to the current work-group. */
- tree char_ptr_type_node = build_pointer_type (char_type_node);
fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
build_function_type_list (void_type_node,
- char_ptr_type_node,
- ptr_type_node,
- ptr_type_node, NULL_TREE));
+ restrict_const_char_ptr,
+ restrict_void_ptr,
+ restrict_char_ptr, NULL_TREE));
SET_DECL_ASSEMBLER_NAME (fndecl, name_identifier);
= gccbrig_get_target_addr_space_id (BRIG_SEGMENT_KERNARG);
tree arg_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
- get_identifier ("__args"), char_ptr_type_node);
+ get_identifier ("__args"),
+ restrict_const_char_ptr);
DECL_ARGUMENTS (fndecl) = arg_arg;
- DECL_ARG_TYPE (arg_arg) = char_ptr_type_node;
+ DECL_ARG_TYPE (arg_arg) = restrict_const_char_ptr;
DECL_CONTEXT (arg_arg) = fndecl;
DECL_ARTIFICIAL (arg_arg) = 1;
TREE_READONLY (arg_arg) = 1;
if (arg_decls == NULL_TREE)
arg_decls = arg_var;
else
- chainon (arg_decls, arg_var);
+ arg_decls = chainon (arg_decls, arg_var);
m_parent.m_cf->add_arg_variable (brigVar, arg_var);
vec_safe_push (args, TREE_TYPE (arg_var));
m_parent.m_cf->add_arg_variable (brigVar, arg_var);
-
- if (arg_decls == NULL_TREE)
- arg_decls = arg_var;
- else
- chainon (arg_decls, arg_var);
+ arg_decls = chainon (arg_decls, arg_var);
}
}
-
- vec_safe_push (args, ptr_type_node);
- vec_safe_push (args, ptr_type_node);
- vec_safe_push (args, ptr_type_node);
- vec_safe_push (args, ptr_type_node);
+ vec_safe_push (args, restrict_void_ptr);
+ vec_safe_push (args, restrict_char_ptr);
+ vec_safe_push (args, uint32_type_node);
+ vec_safe_push (args, restrict_char_ptr);
fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
build_function_type_vec (ret_type, args));
/* All functions need the hidden __context argument passed on
because they might call WI-specific functions which need
- the context info. */
+ the context info. Only kernels can write it, if they need
+ to update the local ids in the work-item loop. */
+
+ tree context_arg_type
+ = true ? restrict_void_ptr : restrict_const_void_ptr;
tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
- get_identifier ("__context"), ptr_type_node);
- if (DECL_ARGUMENTS (fndecl) == NULL_TREE)
- DECL_ARGUMENTS (fndecl) = context_arg;
- else
- chainon (DECL_ARGUMENTS (fndecl), context_arg);
+ get_identifier ("__context"),
+ context_arg_type);
+ DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), context_arg);
DECL_CONTEXT (context_arg) = fndecl;
- DECL_ARG_TYPE (context_arg) = ptr_type_node;
+ DECL_ARG_TYPE (context_arg) = context_arg_type;
DECL_ARTIFICIAL (context_arg) = 1;
TREE_READONLY (context_arg) = 1;
TREE_USED (context_arg) = 1;
+ m_parent.m_cf->m_context_arg = context_arg;
/* They can also access group memory, so we need to pass the
group pointer along too. */
tree group_base_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL,
- get_identifier ("__group_base_addr"), ptr_type_node);
- chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
- DECL_ARG_TYPE (group_base_arg) = ptr_type_node;
+ get_identifier ("__group_base_addr"),
+ restrict_char_ptr);
+ DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
+ DECL_ARG_TYPE (group_base_arg) = restrict_char_ptr;
DECL_CONTEXT (group_base_arg) = fndecl;
DECL_ARTIFICIAL (group_base_arg) = 1;
TREE_READONLY (group_base_arg) = 1;
tree group_local_offset_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__group_local_offset"), uint32_type_node);
- chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
+ DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
DECL_ARG_TYPE (group_local_offset_arg) = uint32_type_node;
DECL_CONTEXT (group_local_offset_arg) = fndecl;
DECL_ARTIFICIAL (group_local_offset_arg) = 1;
/* Same for private. */
tree private_base_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL,
- get_identifier ("__private_base_addr"), ptr_type_node);
- chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
- DECL_ARG_TYPE (private_base_arg) = ptr_type_node;
+ get_identifier ("__private_base_addr"), restrict_char_ptr);
+ DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
+ DECL_ARG_TYPE (private_base_arg) = restrict_char_ptr;
DECL_CONTEXT (private_base_arg) = fndecl;
DECL_ARTIFICIAL (private_base_arg) = 1;
TREE_READONLY (private_base_arg) = 1;
TREE_USED (private_base_arg) = 1;
+ m_parent.m_cf->m_private_base_arg = private_base_arg;
DECL_SAVED_TREE (fndecl) = bind_expr;
- set_externally_visible (fndecl);
-
if (base->kind == BRIG_KIND_DIRECTIVE_FUNCTION)
{
TREE_STATIC (fndecl) = 0;
TREE_PUBLIC (fndecl) = 1;
DECL_EXTERNAL (fndecl) = 0;
DECL_DECLARED_INLINE_P (fndecl) = 1;
+ set_inline (fndecl);
+ set_externally_visible (fndecl);
}
else if (base->kind == BRIG_KIND_DIRECTIVE_KERNEL)
{
TREE_STATIC (fndecl) = 0;
TREE_PUBLIC (fndecl) = 1;
DECL_EXTERNAL (fndecl) = 1;
+ set_inline (fndecl);
}
else if (base->kind == BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION)
{
}
m_parent.start_function (fndecl);
-
m_parent.m_cf->m_func_decl = fndecl;
m_parent.m_cf->m_current_bind_expr = bind_expr;
- m_parent.m_cf->m_context_arg = context_arg;
- m_parent.m_cf->m_private_base_arg = private_base_arg;
if (ret_value != NULL_TREE && TREE_TYPE (ret_value) != void_type_node)
{
#include "function.h"
#include "brig-to-generic.h"
#include "brig-builtins.h"
+#include "options.h"
+#include "fold-const.h"
+#include "target.h"
+#include "builtins.h"
+
+brig_function::builtin_map brig_function::s_custom_builtins;
brig_function::brig_function (const BrigDirectiveExecutable *exec,
brig_to_generic *parent)
memset (m_regs, 0,
BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT * sizeof (BrigOperandRegister *));
memset (&m_descriptor, 0, sizeof (phsa_descriptor));
+
+ if (s_custom_builtins.size () > 0) return;
+
+ /* Populate the builtin index. */
+#undef DEF_HSAIL_ATOMIC_BUILTIN
+#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
+#undef DEF_HSAIL_INTR_BUILTIN
+#undef DEF_HSAIL_SAT_BUILTIN
+#undef DEF_HSAIL_BUILTIN
+#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \
+ s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)] \
+ = builtin_decl_explicit (ENUM);
+
+#include "brig-builtins.def"
}
brig_function::~brig_function ()
tree stmts = BIND_EXPR_BODY (bind_expr);
/* Initialize the WG limits and local ids. */
-
- tree_stmt_iterator entry = tsi_start (stmts);
+ m_kernel_entry = tsi_start (stmts);
for (int i = 0; i < 3; ++i)
{
to avoid unnecessary casts (the ID functions are 32b). */
m_local_id_vars[i]
= add_local_variable (std::string ("__local_") + dim_char,
- uint32_type_node);
+ long_long_integer_type_node);
tree workitemid_call
= call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKITEMID), 2,
m_context_arg);
tree id_init = build2 (MODIFY_EXPR, TREE_TYPE (m_local_id_vars[i]),
- m_local_id_vars[i], workitemid_call);
+ m_local_id_vars[i],
+ convert (TREE_TYPE (m_local_id_vars[i]),
+ workitemid_call));
- tsi_link_after (&entry, id_init, TSI_NEW_STMT);
+ append_statement (id_init);
m_cur_wg_size_vars[i]
= add_local_variable (std::string ("__cur_wg_size_") + dim_char,
- uint32_type_node);
+ long_long_integer_type_node);
- tree cwgz_call
- = call_builtin
- (builtin_decl_explicit (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE),
- 2, uint32_type_node, uint32_type_node,
- build_int_cst (uint32_type_node, i), ptr_type_node, m_context_arg);
+ tree cwgz_call;
+ if (flag_assume_phsa)
+ {
+ tree_stl_vec operands
+ = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+ cwgz_call
+ = expand_or_call_builtin (BRIG_OPCODE_CURRENTWORKGROUPSIZE,
+ BRIG_TYPE_U32, uint32_type_node,
+ operands);
+ }
+ else
+ cwgz_call = call_builtin
+ (builtin_decl_explicit (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE),
+ 2, uint32_type_node, uint32_type_node,
+ build_int_cst (uint32_type_node, i), ptr_type_node, m_context_arg);
tree limit_init = build2 (MODIFY_EXPR, TREE_TYPE (m_cur_wg_size_vars[i]),
- m_cur_wg_size_vars[i], cwgz_call);
+ m_cur_wg_size_vars[i],
+ convert (TREE_TYPE (m_cur_wg_size_vars[i]),
+ cwgz_call));
- tsi_link_after (&entry, limit_init, TSI_NEW_STMT);
+ append_statement (limit_init);
m_wg_id_vars[i]
= add_local_variable (std::string ("__workgroupid_") + dim_char,
uint32_type_node);
- tree wgid_call
- = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPID),
- 2, uint32_type_node, uint32_type_node,
- build_int_cst (uint32_type_node, i), ptr_type_node,
- m_context_arg);
+ tree wgid_call;
+ if (flag_assume_phsa)
+ {
+ tree_stl_vec operands
+ = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+ wgid_call
+ = expand_or_call_builtin (BRIG_OPCODE_WORKGROUPID, BRIG_TYPE_U32,
+ uint32_type_node, operands);
+ }
+ else
+ wgid_call
+ = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPID),
+ 2, uint32_type_node, uint32_type_node,
+ build_int_cst (uint32_type_node, i), ptr_type_node,
+ m_context_arg);
tree wgid_init = build2 (MODIFY_EXPR, TREE_TYPE (m_wg_id_vars[i]),
m_wg_id_vars[i], wgid_call);
- tsi_link_after (&entry, wgid_init, TSI_NEW_STMT);
+ append_statement (wgid_init);
m_wg_size_vars[i]
= add_local_variable (std::string ("__workgroupsize_") + dim_char,
uint32_type_node);
- tree wgsize_call
- = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPSIZE),
- 2, uint32_type_node, uint32_type_node,
- build_int_cst (uint32_type_node, i), ptr_type_node,
- m_context_arg);
+ tree wgsize_call;
+ if (flag_assume_phsa)
+ {
+ tree_stl_vec operands
+ = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+ wgsize_call
+ = expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+ uint32_type_node, operands);
+ }
+ else
+ wgsize_call
+ = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPSIZE),
+ 2, uint32_type_node, uint32_type_node,
+ build_int_cst (uint32_type_node, i), ptr_type_node,
+ m_context_arg);
tree wgsize_init = build2 (MODIFY_EXPR, TREE_TYPE (m_wg_size_vars[i]),
m_wg_size_vars[i], wgsize_call);
- tsi_link_after (&entry, wgsize_init, TSI_NEW_STMT);
+ append_statement (wgsize_init);
m_grid_size_vars[i]
= add_local_variable (std::string ("__gridsize_") + dim_char,
tree gridsize_init = build2 (MODIFY_EXPR, TREE_TYPE (m_grid_size_vars[i]),
m_grid_size_vars[i], gridsize_call);
- tsi_link_after (&entry, gridsize_init, TSI_NEW_STMT);
+ append_statement (gridsize_init);
+
+ m_abs_id_base_vars[i]
+ = add_local_variable (std::string ("__abs_id_base_") + dim_char,
+ long_long_integer_type_node);
+
+ m_abs_id_vars[i]
+ = add_local_variable (std::string ("__abs_id_") + dim_char,
+ long_long_integer_type_node);
+
+ tree abs_id_base
+ = build2 (MULT_EXPR, long_long_integer_type_node,
+ convert (long_long_integer_type_node, m_wg_id_vars[i]),
+ convert (long_long_integer_type_node, m_wg_size_vars[i]));
+ tree abs_id
+ = build2 (PLUS_EXPR, long_long_integer_type_node, abs_id_base,
+ convert (long_long_integer_type_node, m_local_id_vars[i]));
+
+ tree abs_id_base_init
+ = build2 (MODIFY_EXPR, TREE_TYPE (m_abs_id_base_vars[i]),
+ m_abs_id_base_vars[i], abs_id_base);
+ append_statement (abs_id_base_init);
+
+ tree abs_id_init = build2 (MODIFY_EXPR,
+ TREE_TYPE (m_abs_id_vars[i]),
+ m_abs_id_vars[i], abs_id);
+ append_statement (abs_id_init);
}
-
- m_kernel_entry = entry;
}
/* Creates a new local variable with the given NAME and given GENERIC
tree_stmt_iterator *branch_after)
{
tree ivar = m_local_id_vars[dim];
+ tree abs_id_base_var = m_abs_id_base_vars[dim];
+ tree abs_id_var = m_abs_id_vars[dim];
tree ivar_max = m_cur_wg_size_vars[dim];
tree_stmt_iterator entry = *header_entry;
build_zero_cst (TREE_TYPE (ivar)));
tsi_link_after (&entry, ivar_init, TSI_NEW_STMT);
+ tree abs_id_var_init = build2 (MODIFY_EXPR, TREE_TYPE (abs_id_var),
+ abs_id_var,
+ convert (TREE_TYPE (abs_id_var),
+ abs_id_base_var));
+ tsi_link_after (&entry, abs_id_var_init, TSI_NEW_STMT);
+
tree loop_body_label
= label (std::string ("__wi_loop_") + (char) ((int) 'x' + dim));
tree loop_body_label_stmt = build_stmt (LABEL_EXPR, loop_body_label);
if (m_has_unexpanded_dp_builtins)
{
- tree id_set_builtin
- = builtin_decl_explicit (BUILT_IN_HSAIL_SETWORKITEMID);
- /* Set the local ID to the current wi-loop iteration variable value to
- ensure the builtins see the correct values. */
- tree id_set_call
- = call_builtin (id_set_builtin, 3,
- void_type_node, uint32_type_node,
- build_int_cst (uint32_type_node, dim), uint32_type_node,
- ivar, ptr_type_node, m_context_arg);
- tsi_link_after (&entry, id_set_call, TSI_NEW_STMT);
+ if (!flag_assume_phsa)
+ {
+ tree id_set_builtin
+ = builtin_decl_explicit (BUILT_IN_HSAIL_SETWORKITEMID);
+ /* Set the local ID to the current wi-loop iteration variable value
+ to ensure the builtins see the correct values. */
+ tree id_set_call
+ = call_builtin (id_set_builtin, 3,
+ void_type_node, uint32_type_node,
+ build_int_cst (uint32_type_node, dim),
+ uint32_type_node, convert (uint32_type_node, ivar),
+ ptr_type_node, m_context_arg);
+ tsi_link_after (&entry, id_set_call, TSI_NEW_STMT);
+ }
+ else
+ {
+ tree ptr_type = build_pointer_type (uint32_type_node);
+ tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+ build_int_cst (ptr_type, dim * 4));
+ tree assign = build2 (MODIFY_EXPR, uint32_type_node, ctx,
+ convert (uint32_type_node, ivar));
+
+ tsi_link_after (&entry, assign, TSI_NEW_STMT);
+ }
}
/* Increment the WI iteration variable. */
tsi_link_after (branch_after, incr, TSI_NEW_STMT);
+ /* ...and the abs id variable. */
+ tree abs_id_incr = build2 (PREINCREMENT_EXPR, TREE_TYPE (abs_id_var),
+ abs_id_var,
+ build_one_cst (TREE_TYPE (abs_id_var)));
+
+ tsi_link_after (branch_after, abs_id_incr, TSI_NEW_STMT);
+
/* Append the predicate check with the back edge goto. */
tree condition = build2 (LT_EXPR, TREE_TYPE (ivar), ivar, ivar_max);
tree target_goto = build1 (GOTO_EXPR, void_type_node, loop_body_label);
tree name_identifier
= get_identifier_with_length (kern_name.c_str (), kern_name.size ());
+ tree restrict_void_ptr
+ = build_qualified_type (build_pointer_type (void_type_node),
+ TYPE_QUAL_RESTRICT);
+ tree restrict_char_ptr
+ = build_qualified_type (build_pointer_type (char_type_node),
+ TYPE_QUAL_RESTRICT);
tree launcher
= build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
- build_function_type_list (void_type_node, ptr_type_node,
- ptr_type_node, NULL_TREE));
+ build_function_type_list (void_type_node, restrict_void_ptr,
+ restrict_char_ptr, NULL_TREE));
TREE_USED (launcher) = 1;
DECL_ARTIFICIAL (launcher) = 1;
tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
- get_identifier ("__context"), ptr_type_node);
+ get_identifier ("__context"),
+ restrict_void_ptr);
DECL_ARGUMENTS (launcher) = context_arg;
- DECL_ARG_TYPE (context_arg) = ptr_type_node;
+ DECL_ARG_TYPE (context_arg) = restrict_void_ptr;
DECL_CONTEXT (context_arg) = launcher;
TREE_USED (context_arg) = 1;
DECL_ARTIFICIAL (context_arg) = 1;
tree group_base_addr_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL,
- get_identifier ("__group_base_addr"), ptr_type_node);
+ get_identifier ("__group_base_addr"), restrict_char_ptr);
chainon (DECL_ARGUMENTS (launcher), group_base_addr_arg);
- DECL_ARG_TYPE (group_base_addr_arg) = ptr_type_node;
+ DECL_ARG_TYPE (group_base_addr_arg) = restrict_char_ptr;
DECL_CONTEXT (group_base_addr_arg) = launcher;
TREE_USED (group_base_addr_arg) = 1;
DECL_ARTIFICIAL (group_base_addr_arg) = 1;
phsail_launch_kernel_call
= call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_LAUNCH_WG_FUNC),
4, void_type_node,
- ptr_type_node, kernel_func_ptr, ptr_type_node,
- context_arg, ptr_type_node, group_base_addr_arg,
+ ptr_type_node, kernel_func_ptr, restrict_void_ptr,
+ context_arg, restrict_char_ptr, group_base_addr_arg,
uint32_type_node, group_local_offset_arg);
else
phsail_launch_kernel_call
= call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_LAUNCH_KERNEL),
4, void_type_node,
- ptr_type_node, kernel_func_ptr, ptr_type_node,
- context_arg, ptr_type_node, group_base_addr_arg,
+ ptr_type_node, kernel_func_ptr, restrict_void_ptr,
+ context_arg, restrict_char_ptr, group_base_addr_arg,
uint32_type_node, group_local_offset_arg);
append_to_statement_list_force (phsail_launch_kernel_call, &stmt_list);
gcc_assert (m_parent->m_module_group_variables.has_variable (name));
return m_parent->m_module_group_variables.segment_offset (name);
}
+
+/* Try to expand the given builtin call to reuse a previously generated
+ variable, if possible. If not, just call the given builtin.
+ BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type,
+ ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's
+ input operands. */
+
+tree
+brig_function::expand_or_call_builtin (BrigOpcode16_t brig_opcode,
+ BrigType16_t brig_type,
+ tree arith_type,
+ tree_stl_vec &operands)
+{
+ if (needs_workitem_context_data (brig_opcode))
+ m_has_unexpanded_dp_builtins = true;
+
+ if (can_expand_builtin (brig_opcode))
+ return expand_builtin (brig_opcode, operands);
+
+ tree built_in
+ = get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type);
+
+ if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in)))
+ && arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type)
+ && brig_opcode != BRIG_OPCODE_LERP
+ && brig_opcode != BRIG_OPCODE_PACKCVT
+ && brig_opcode != BRIG_OPCODE_SAD
+ && brig_opcode != BRIG_OPCODE_SADHI)
+ {
+ /* Call the scalar built-in for all elements in the vector. */
+ tree_stl_vec operand0_elements;
+ if (operands.size () > 0)
+ unpack (operands[0], operand0_elements);
+
+ tree_stl_vec operand1_elements;
+ if (operands.size () > 1)
+ unpack (operands[1], operand1_elements);
+
+ tree_stl_vec result_elements;
+
+ size_t element_count = gccbrig_type_vector_subparts (arith_type);
+ for (size_t i = 0; i < element_count; ++i)
+ {
+ tree_stl_vec call_operands;
+ if (operand0_elements.size () > 0)
+ call_operands.push_back (operand0_elements.at (i));
+
+ if (operand1_elements.size () > 0)
+ call_operands.push_back (operand1_elements.at (i));
+
+ result_elements.push_back
+ (expand_or_call_builtin (brig_opcode, brig_type,
+ TREE_TYPE (arith_type),
+ call_operands));
+ }
+ return pack (result_elements);
+ }
+
+ tree_stl_vec call_operands;
+ tree_stl_vec operand_types;
+
+ tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in));
+
+ for (size_t i = 0; i < operands.size (); ++i)
+ {
+ tree operand_type = TREE_VALUE (arg_type_chain);
+ call_operands.push_back (convert (operand_type, operands[i]));
+ operand_types.push_back (operand_type);
+ arg_type_chain = TREE_CHAIN (arg_type_chain);
+ }
+
+ if (needs_workitem_context_data (brig_opcode))
+ {
+ call_operands.push_back (m_context_arg);
+ operand_types.push_back (ptr_type_node);
+ }
+
+ size_t operand_count = call_operands.size ();
+
+ call_operands.resize (4, NULL_TREE);
+ operand_types.resize (4, NULL_TREE);
+ for (size_t i = 0; i < operand_count; ++i)
+ call_operands.at (i) = build_resize_convert_view (operand_types.at (i),
+ call_operands.at (i));
+
+ tree fnptr = build_fold_addr_expr (built_in);
+ return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr,
+ operand_count, &call_operands[0]);
+}
+
+/* Instead of calling a built-in function, use a more efficient mechanism
+ such as reuse a previously returned value known to be still valid, or
+ access the work-item context struct directly. This is beneficial especially
+ for the work-item identification related builtins as not having them as
+ unanalyzable black box calls can lead to more easily vectorizable parallel
+ loops for multi work-item work-groups. BRIG_OPCODE identifies the builtin
+ and OPERANDS store the operands. */
+
+tree
+brig_function::expand_builtin (BrigOpcode16_t brig_opcode,
+ tree_stl_vec &operands)
+{
+ tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0));
+
+ tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1));
+
+ tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2));
+
+ if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID)
+ {
+ tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0);
+ id0 = convert (uint64_type_node, id0);
+
+ tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1);
+ id1 = convert (uint64_type_node, id1);
+
+ tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2);
+ id2 = convert (uint64_type_node, id2);
+
+ tree max0 = convert (uint64_type_node, m_grid_size_vars[0]);
+ tree max1 = convert (uint64_type_node, m_grid_size_vars[1]);
+
+ tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0);
+ id2_x_max0_x_max1
+ = build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1);
+
+ tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0);
+
+ tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0);
+ sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1);
+
+ return add_temp_var ("workitemflatabsid", sum);
+ }
+ else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID)
+ {
+ HOST_WIDE_INT dim = int_constant_value (operands[0]);
+ return m_abs_id_vars[dim];
+ }
+ else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID)
+ {
+
+ tree wg_size_x = expand_builtin (BRIG_OPCODE_WORKGROUPSIZE, uint32_0);
+ tree wg_size_y = expand_builtin (BRIG_OPCODE_WORKGROUPSIZE, uint32_1);
+ tree z_x_wgsx_wgsy
+ = build2 (MULT_EXPR, uint32_type_node,
+ convert (uint32_type_node,
+ expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_2)),
+ wg_size_x);
+ z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy,
+ wg_size_y);
+
+ tree y_x_wgsx
+ = build2 (MULT_EXPR, uint32_type_node,
+ convert (uint32_type_node,
+ expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_1)),
+ wg_size_x);
+
+ tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy);
+ sum = build2 (PLUS_EXPR, uint32_type_node,
+ convert (uint32_type_node,
+ expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_0)),
+ sum);
+ return add_temp_var ("workitemflatid", sum);
+ }
+ else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE)
+ {
+ HOST_WIDE_INT dim = int_constant_value (operands[0]);
+ if (flag_assume_phsa)
+ {
+ tree ptr_type = build_pointer_type (uint32_type_node);
+ tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+ build_int_cst (ptr_type,
+ PHSA_CONTEXT_WG_SIZES
+ + dim * 4));
+ std::string name ("wgsize_x");
+ name [name.length() - 1] += dim;
+ return add_temp_var (name.c_str(), ctx);
+ }
+ else if (m_is_kernel)
+ {
+ /* For kernels without phsa we generate certain temps before
+ the WI loop, which means we don't need to rely on LICM to get
+ them moved out. */
+ return m_wg_size_vars[dim];
+ }
+ else
+ gcc_unreachable ();
+ }
+ else if (brig_opcode == BRIG_OPCODE_WORKITEMID)
+ {
+ HOST_WIDE_INT dim = int_constant_value (operands[0]);
+ if (m_is_kernel)
+ {
+ return m_local_id_vars [dim];
+ }
+ else if (flag_assume_phsa)
+ {
+ tree ptr_type = build_pointer_type (uint32_type_node);
+ tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+ build_int_cst (ptr_type,
+ PHSA_CONTEXT_OFFS_WI_IDS
+ + dim * 4));
+ std::string name ("wiid_x");
+ name [name.length() - 1] += dim;
+ return add_temp_var (name.c_str(), ctx);
+ }
+ else
+ gcc_unreachable ();
+ }
+ else if (brig_opcode == BRIG_OPCODE_WORKGROUPID)
+ {
+ HOST_WIDE_INT dim = int_constant_value (operands[0]);
+ if (flag_assume_phsa)
+ {
+ tree ptr_type = build_pointer_type (uint32_type_node);
+ tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+ build_int_cst (ptr_type,
+ PHSA_CONTEXT_OFFS_WG_IDS
+ + dim * 4));
+ std::string name ("wgid_x");
+ name [name.length() - 1] += dim;
+ return add_temp_var (name.c_str(), ctx);
+ } else if (m_is_kernel)
+ return m_wg_id_vars [dim];
+ else
+ gcc_unreachable ();
+ }
+ else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE)
+ {
+ HOST_WIDE_INT dim = int_constant_value (operands[0]);
+ if (flag_assume_phsa)
+ {
+ tree ptr_type = build_pointer_type (uint32_type_node);
+ tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+ build_int_cst (ptr_type,
+ PHSA_CONTEXT_CURRENT_WG_SIZES
+ + dim * 4));
+ std::string name ("curwgsize_x");
+ name [name.length() - 1] += dim;
+ return add_temp_var (name.c_str(), ctx);
+ } else if (m_is_kernel)
+ return m_cur_wg_size_vars[dim];
+ else
+ gcc_unreachable ();
+ }
+ else
+ gcc_unreachable ();
+
+ return NULL_TREE;
+}
+
+/* Returns true in case the given opcode that would normally be generated
+ as a builtin call can be expanded to tree nodes. */
+
+bool
+brig_function::can_expand_builtin (BrigOpcode16_t brig_opcode) const
+{
+ switch (brig_opcode)
+ {
+ case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
+ case BRIG_OPCODE_WORKITEMFLATID:
+ case BRIG_OPCODE_WORKITEMID:
+ case BRIG_OPCODE_WORKGROUPID:
+ case BRIG_OPCODE_WORKGROUPSIZE:
+ return m_is_kernel || flag_assume_phsa;
+ case BRIG_OPCODE_WORKITEMFLATABSID:
+ case BRIG_OPCODE_WORKITEMABSID:
+ return m_is_kernel;
+ default:
+ return false;
+ };
+}
+
+/* In case the HSA instruction must be implemented using a builtin,
+ this function is called to get the correct builtin function.
+ TYPE is the instruction tree type, BRIG_OPCODE the opcode of the
+ brig instruction and BRIG_TYPE the brig instruction's type. */
+
+tree
+brig_function::get_builtin_for_hsa_opcode
+ (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
+{
+ tree builtin = NULL_TREE;
+ tree builtin_type = type;
+
+ /* For vector types, first find the scalar version of the builtin. */
+ if (type != NULL_TREE && VECTOR_TYPE_P (type))
+ builtin_type = TREE_TYPE (type);
+ BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+
+ /* Some BRIG opcodes can use the same builtins for unsigned and
+ signed types. Force these cases to unsigned types. */
+
+ if (brig_opcode == BRIG_OPCODE_BORROW
+ || brig_opcode == BRIG_OPCODE_CARRY
+ || brig_opcode == BRIG_OPCODE_LASTBIT
+ || brig_opcode == BRIG_OPCODE_BITINSERT)
+ {
+ if (brig_type == BRIG_TYPE_S32)
+ brig_type = BRIG_TYPE_U32;
+ else if (brig_type == BRIG_TYPE_S64)
+ brig_type = BRIG_TYPE_U64;
+ }
+
+ switch (brig_opcode)
+ {
+ case BRIG_OPCODE_FLOOR:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_FLOOR);
+ break;
+ case BRIG_OPCODE_CEIL:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_CEIL);
+ break;
+ case BRIG_OPCODE_SQRT:
+ case BRIG_OPCODE_NSQRT:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_SQRT);
+ break;
+ case BRIG_OPCODE_RINT:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_RINT);
+ break;
+ case BRIG_OPCODE_TRUNC:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_TRUNC);
+ break;
+ case BRIG_OPCODE_COPYSIGN:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_COPYSIGN);
+ break;
+ case BRIG_OPCODE_NSIN:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_SIN);
+ break;
+ case BRIG_OPCODE_NLOG2:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_LOG2);
+ break;
+ case BRIG_OPCODE_NEXP2:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_EXP2);
+ break;
+ case BRIG_OPCODE_NFMA:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_FMA);
+ break;
+ case BRIG_OPCODE_NCOS:
+ builtin = mathfn_built_in (builtin_type, BUILT_IN_COS);
+ break;
+ case BRIG_OPCODE_POPCOUNT:
+ /* Popcount should be typed by its argument type (the return value
+ is always u32). Let's use a b64 version for also for b32 for now. */
+ return builtin_decl_explicit (BUILT_IN_POPCOUNTL);
+ case BRIG_OPCODE_BORROW:
+ /* Borrow uses the same builtin for unsigned and signed types. */
+ if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
+ return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U32);
+ else
+ return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U64);
+ case BRIG_OPCODE_CARRY:
+ /* Carry also uses the same builtin for unsigned and signed types. */
+ if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
+ return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U32);
+ else
+ return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U64);
+ default:
+
+ /* Use our builtin index for finding a proper builtin for the BRIG
+ opcode and BRIG type. This takes care most of the builtin cases,
+ the special cases are handled in the separate 'case' statements
+ above. */
+ builtin_map::const_iterator i
+ = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
+ if (i != s_custom_builtins.end ())
+ return (*i).second;
+
+ if (brig_inner_type != brig_type)
+ {
+ /* Try to find a scalar built-in we could use. */
+ i = s_custom_builtins.find
+ (std::make_pair (brig_opcode, brig_inner_type));
+ if (i != s_custom_builtins.end ())
+ return (*i).second;
+ }
+
+ /* In case this is an fp16 operation that is promoted to fp32,
+ try to find a fp32 scalar built-in. */
+ if (brig_inner_type == BRIG_TYPE_F16)
+ {
+ i = s_custom_builtins.find
+ (std::make_pair (brig_opcode, BRIG_TYPE_F32));
+ if (i != s_custom_builtins.end ())
+ return (*i).second;
+ }
+ gcc_unreachable ();
+ }
+
+ if (VECTOR_TYPE_P (type) && builtin != NULL_TREE)
+ {
+ /* Try to find a vectorized version of the built-in.
+ TODO: properly assert that builtin is a mathfn builtin? */
+ tree vec_builtin
+ = targetm.vectorize.builtin_vectorized_function
+ (builtin_mathfn_code (builtin), type, type);
+ if (vec_builtin != NULL_TREE)
+ return vec_builtin;
+ else
+ return builtin;
+ }
+ if (builtin == NULL_TREE)
+ gcc_unreachable ();
+ return builtin;
+}
+
+/* Unpacks the elements of the vector in VALUE to scalars (bit field
+ references) in ELEMENTS. */
+
+void
+brig_function::unpack (tree value, tree_stl_vec &elements)
+{
+ size_t vec_size = int_size_in_bytes (TREE_TYPE (value));
+ size_t element_size
+ = int_size_in_bytes (TREE_TYPE (TREE_TYPE (value))) * BITS_PER_UNIT;
+ size_t element_count
+ = vec_size * BITS_PER_UNIT / element_size;
+
+ tree input_element_type = TREE_TYPE (TREE_TYPE (value));
+
+ value = add_temp_var ("unpack_input", value);
+
+ for (size_t i = 0; i < element_count; ++i)
+ {
+ tree element
+ = build3 (BIT_FIELD_REF, input_element_type, value,
+ TYPE_SIZE (input_element_type),
+ bitsize_int(i * element_size));
+
+ element = add_temp_var ("scalar", element);
+ elements.push_back (element);
+ }
+}
+
+/* Pack the elements of the scalars in ELEMENTS to the returned vector. */
+
+tree
+brig_function::pack (tree_stl_vec &elements)
+{
+ size_t element_count = elements.size ();
+
+ gcc_assert (element_count > 1);
+
+ tree output_element_type = TREE_TYPE (elements.at (0));
+
+ vec<constructor_elt, va_gc> *constructor_vals = NULL;
+ for (size_t i = 0; i < element_count; ++i)
+ CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, elements.at (i));
+
+ tree vec_type = build_vector_type (output_element_type, element_count);
+
+ /* build_constructor creates a vector type which is not a vector_cst
+ that requires compile time constant elements. */
+ tree vec = build_constructor (vec_type, constructor_vals);
+
+ /* Add a temp variable for readability. */
+ tree tmp_var = create_tmp_var (vec_type, "vec_out");
+ tree vec_tmp_assign = build2 (MODIFY_EXPR, TREE_TYPE (tmp_var), tmp_var, vec);
+ append_statement (vec_tmp_assign);
+ return tmp_var;
+}
+
+/* Returns true in case the given opcode needs to know about work-item context
+ data. In such case the context data is passed as a pointer to a work-item
+ context object, as the last argument in the builtin call. */
+
+bool
+brig_function::needs_workitem_context_data
+(BrigOpcode16_t brig_opcode)
+{
+ switch (brig_opcode)
+ {
+ case BRIG_OPCODE_WORKITEMABSID:
+ case BRIG_OPCODE_WORKITEMFLATABSID:
+ case BRIG_OPCODE_WORKITEMFLATID:
+ case BRIG_OPCODE_CURRENTWORKITEMFLATID:
+ case BRIG_OPCODE_WORKITEMID:
+ case BRIG_OPCODE_WORKGROUPID:
+ case BRIG_OPCODE_WORKGROUPSIZE:
+ case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
+ case BRIG_OPCODE_GRIDGROUPS:
+ case BRIG_OPCODE_GRIDSIZE:
+ case BRIG_OPCODE_DIM:
+ case BRIG_OPCODE_PACKETID:
+ case BRIG_OPCODE_PACKETCOMPLETIONSIG:
+ case BRIG_OPCODE_BARRIER:
+ case BRIG_OPCODE_WAVEBARRIER:
+ case BRIG_OPCODE_ARRIVEFBAR:
+ case BRIG_OPCODE_INITFBAR:
+ case BRIG_OPCODE_JOINFBAR:
+ case BRIG_OPCODE_LEAVEFBAR:
+ case BRIG_OPCODE_RELEASEFBAR:
+ case BRIG_OPCODE_WAITFBAR:
+ case BRIG_OPCODE_CUID:
+ case BRIG_OPCODE_MAXCUID:
+ case BRIG_OPCODE_DEBUGTRAP:
+ case BRIG_OPCODE_GROUPBASEPTR:
+ case BRIG_OPCODE_KERNARGBASEPTR:
+ case BRIG_OPCODE_ALLOCA:
+ return true;
+ default:
+ return false;
+ };
+}
+
+/* Appends and returns a new temp variable and an accompanying assignment
+ statement that stores the value of the given EXPR and has the given NAME. */
+
+tree
+brig_function::add_temp_var (std::string name, tree expr)
+{
+ tree temp_var = create_tmp_var (TREE_TYPE (expr), name.c_str ());
+ tree assign = build2 (MODIFY_EXPR, TREE_TYPE (temp_var), temp_var, expr);
+ append_statement (assign);
+ return temp_var;
+}
+
+/* Returns the integer constant value of the given node.
+ If it's a cast, looks into the source of the cast. */
+
+HOST_WIDE_INT
+brig_function::int_constant_value (tree node)
+{
+ tree n = node;
+ if (TREE_CODE (n) == VIEW_CONVERT_EXPR)
+ n = TREE_OPERAND (n, 0);
+ return int_cst_value (n);
+}
+
+/* Returns the tree code that should be used to implement the given
+ HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
+ (BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly,
+ returns TREE_LIST (if it can be emulated with a simple chain of tree
+ nodes) or CALL_EXPR if the opcode should be implemented using a builtin
+ call. */
+
+tree_code
+brig_function::get_tree_code_for_hsa_opcode
+ (BrigOpcode16_t brig_opcode, BrigType16_t brig_type)
+{
+ BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+ switch (brig_opcode)
+ {
+ case BRIG_OPCODE_NOP:
+ return NOP_EXPR;
+ case BRIG_OPCODE_ADD:
+ return PLUS_EXPR;
+ case BRIG_OPCODE_CMOV:
+ if (brig_inner_type == brig_type)
+ return COND_EXPR;
+ else
+ return VEC_COND_EXPR;
+ case BRIG_OPCODE_SUB:
+ return MINUS_EXPR;
+ case BRIG_OPCODE_MUL:
+ case BRIG_OPCODE_MUL24:
+ return MULT_EXPR;
+ case BRIG_OPCODE_MULHI:
+ case BRIG_OPCODE_MUL24HI:
+ return MULT_HIGHPART_EXPR;
+ case BRIG_OPCODE_DIV:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return RDIV_EXPR;
+ else
+ return TRUNC_DIV_EXPR;
+ case BRIG_OPCODE_NEG:
+ return NEGATE_EXPR;
+ case BRIG_OPCODE_MIN:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return CALL_EXPR;
+ else
+ return MIN_EXPR;
+ case BRIG_OPCODE_MAX:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return CALL_EXPR;
+ else
+ return MAX_EXPR;
+ case BRIG_OPCODE_FMA:
+ return FMA_EXPR;
+ case BRIG_OPCODE_ABS:
+ return ABS_EXPR;
+ case BRIG_OPCODE_SHL:
+ return LSHIFT_EXPR;
+ case BRIG_OPCODE_SHR:
+ return RSHIFT_EXPR;
+ case BRIG_OPCODE_OR:
+ return BIT_IOR_EXPR;
+ case BRIG_OPCODE_XOR:
+ return BIT_XOR_EXPR;
+ case BRIG_OPCODE_AND:
+ return BIT_AND_EXPR;
+ case BRIG_OPCODE_NOT:
+ return BIT_NOT_EXPR;
+ case BRIG_OPCODE_RET:
+ return RETURN_EXPR;
+ case BRIG_OPCODE_MOV:
+ case BRIG_OPCODE_LDF:
+ return MODIFY_EXPR;
+ case BRIG_OPCODE_LD:
+ case BRIG_OPCODE_ST:
+ return MEM_REF;
+ case BRIG_OPCODE_BR:
+ return GOTO_EXPR;
+ case BRIG_OPCODE_REM:
+ if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
+ return TRUNC_MOD_EXPR;
+ else
+ return CALL_EXPR;
+ case BRIG_OPCODE_NRCP:
+ case BRIG_OPCODE_NRSQRT:
+ /* Implement as 1/f (x). gcc should pattern detect that and
+ use a native instruction, if available, for it. */
+ return TREE_LIST;
+ case BRIG_OPCODE_FLOOR:
+ case BRIG_OPCODE_CEIL:
+ case BRIG_OPCODE_SQRT:
+ case BRIG_OPCODE_NSQRT:
+ case BRIG_OPCODE_RINT:
+ case BRIG_OPCODE_TRUNC:
+ case BRIG_OPCODE_POPCOUNT:
+ case BRIG_OPCODE_COPYSIGN:
+ case BRIG_OPCODE_NCOS:
+ case BRIG_OPCODE_NSIN:
+ case BRIG_OPCODE_NLOG2:
+ case BRIG_OPCODE_NEXP2:
+ case BRIG_OPCODE_NFMA:
+ /* Class has type B1 regardless of the float type, thus
+ the below builtin map search cannot find it. */
+ case BRIG_OPCODE_CLASS:
+ case BRIG_OPCODE_WORKITEMABSID:
+ return CALL_EXPR;
+ default:
+
+ /* Some BRIG opcodes can use the same builtins for unsigned and
+ signed types. Force these cases to unsigned types.
+ */
+
+ if (brig_opcode == BRIG_OPCODE_BORROW
+ || brig_opcode == BRIG_OPCODE_CARRY
+ || brig_opcode == BRIG_OPCODE_LASTBIT
+ || brig_opcode == BRIG_OPCODE_BITINSERT)
+ {
+ if (brig_type == BRIG_TYPE_S32)
+ brig_type = BRIG_TYPE_U32;
+ else if (brig_type == BRIG_TYPE_S64)
+ brig_type = BRIG_TYPE_U64;
+ }
+
+
+ builtin_map::const_iterator i
+ = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
+ if (i != s_custom_builtins.end ())
+ return CALL_EXPR;
+ else if (s_custom_builtins.find
+ (std::make_pair (brig_opcode, brig_inner_type))
+ != s_custom_builtins.end ())
+ return CALL_EXPR;
+ if (brig_inner_type == BRIG_TYPE_F16
+ && s_custom_builtins.find
+ (std::make_pair (brig_opcode, BRIG_TYPE_F32))
+ != s_custom_builtins.end ())
+ return CALL_EXPR;
+ break;
+ }
+ return TREE_LIST; /* Emulate using a chain of nodes. */
+}
+
+/* Inform of an update to the REG_VAR. */
+
+void
+brig_function::add_reg_var_update (tree reg_var, tree var)
+{
+ if (var == m_abs_id_vars[0] || var == m_abs_id_vars[1]
+ || var == m_abs_id_vars[2] || var == m_local_id_vars[0]
+ || var == m_local_id_vars[1] || var == m_local_id_vars[2])
+ m_id_val_defs [reg_var] = var;
+ else
+ {
+ /* Possible overwrite of an ID value. */
+
+ id_val_map::iterator i = m_id_val_defs.find (reg_var);
+ if (i != m_id_val_defs.end())
+ m_id_val_defs.erase (i);
+ }
+}
+
+/* If the REG_VAR is known to contain an ID value at this point in
+ the basic block, return true. */
+
+bool
+brig_function::is_id_val (tree reg_var)
+{
+ id_val_map::iterator i = m_id_val_defs.find (reg_var);
+ return i != m_id_val_defs.end();
+}
+
+/* Return an ID value for the given REG_VAR if its known to contain
+ one at this point in the BB, NULL_TREE otherwise. */
+
+tree
+brig_function::id_val (tree reg_var)
+{
+ id_val_map::iterator i = m_id_val_defs.find (reg_var);
+ if (i != m_id_val_defs.end())
+ return (*i).second;
+ else
+ return NULL_TREE;
+}
+
+/* Informs of starting a new basic block. Called when generating
+ a label, a call, a jump, or a return. */
+
+void
+brig_function::start_new_bb ()
+{
+ m_id_val_defs.clear ();
+}
void analyze_calls ();
+ tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
+
+ tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
+ BrigType16_t brig_type, tree arith_type,
+ tree_stl_vec &operands);
+ bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
+
+ tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
+ BrigType16_t brig_type) const;
+
+ void unpack (tree value, tree_stl_vec &elements);
+ tree pack (tree_stl_vec &elements);
+ tree add_temp_var (std::string name, tree expr);
+
+ static bool needs_workitem_context_data (BrigOpcode16_t brig_opcode);
+ static HOST_WIDE_INT int_constant_value (tree node);
+ static tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
+ BrigType16_t brig_type);
+
+ void start_new_bb ();
+ void add_reg_var_update (tree reg_var, tree val);
+ bool is_id_val (tree reg_var);
+ tree id_val (tree reg_var);
+
const BrigDirectiveExecutable *m_brig_def;
bool m_is_kernel;
tree m_wg_id_vars[3];
tree m_wg_size_vars[3];
tree m_grid_size_vars[3];
+ /* Explicitly computed WG base for the absolute IDs which is used
+ as the initial value when looping that dimension. We update
+ the abs id with ++ to make it easy for the vectorizer. */
+ tree m_abs_id_base_vars[3];
+ tree m_abs_id_vars[3];
/* Set to true in case the kernel contains at least one dispatch packet
(work-item ID-related) builtin call that could not be expanded to
/* Bookkeeping for the different HSA registers and their tree declarations
for the currently generated function. */
reg_decl_index_entry *m_regs[BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT];
+
+ /* Map for keeping book reads of ID variables, which can be propagated
+ to uses in address expressions to produce cleaner indexing functions
+ with unnecessary casts stripped off, etc. */
+ typedef std::map<tree, tree> id_val_map;
+
+ /* Keeps track of ID values alive in registers in the currently
+ processed BB. */
+ id_val_map m_id_val_defs;
+
+ /* HSAIL-specific builtin functions not yet integrated to gcc. */
+ typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
+
+ static builtin_map s_custom_builtins;
};
#endif
std::string label_str ((const char *) (label_name->bytes),
label_name->byteCount);
+ m_parent.m_cf->start_new_bb ();
+
tree stmt = build_stmt (LABEL_EXPR, m_parent.m_cf->label (label_str));
m_parent.m_cf->append_statement (stmt);
+
return base->byteCount;
}
elements.push_back (zero_cst);
elements.push_back (zero_cst);
- expr = pack (elements);
+ expr = m_parent.m_cf->pack (elements);
}
else if (inst.base.opcode == BRIG_OPCODE_ACTIVELANEPERMUTE)
{
{
/* Add a temporary variable so there won't be multiple
reads in case of vector unpack. */
- mem_ref = add_temp_var ("mem_read", mem_ref);
+ mem_ref = m_parent.m_cf->add_temp_var ("mem_read", mem_ref);
return build_output_assignment (*brig_inst, data, mem_ref);
}
else
inputs.push_back (operands[1]);
inputs.push_back (align_opr);
tree builtin_call
- = expand_or_call_builtin (BRIG_OPCODE_ALLOCA, BRIG_TYPE_U32,
- uint32_type_node, inputs);
+ = m_parent.m_cf->expand_or_call_builtin (BRIG_OPCODE_ALLOCA,
+ BRIG_TYPE_U32,
+ uint32_type_node, inputs);
build_output_assignment (*brig_inst, operands[0], builtin_call);
m_parent.m_cf->m_has_allocas = true;
return base->byteCount;
/* The prefix to use in the ELF section containing descriptor for
a function. */
+
#define PHSA_DESC_SECTION_PREFIX "phsa.desc."
#define PHSA_HOST_DEF_PTR_PREFIX "__phsa.host_def."
/* The frontend error messages are parsed by the host runtime. Known
prefix strings are used to separate the different runtime error
codes. */
+
#define PHSA_ERROR_PREFIX_INCOMPATIBLE_MODULE "Incompatible module: "
#define PHSA_ERROR_PREFIX_CORRUPTED_MODULE "Corrupted module: "
+/* Offsets of attributes in the PHSA context structs.
+ Used by -fphsa-wi-context-opt. */
+#define PHSA_CONTEXT_OFFS_WI_IDS 0
+#define PHSA_CONTEXT_OFFS_WG_IDS (PHSA_CONTEXT_OFFS_WI_IDS + 3 * 4)
+#define PHSA_CONTEXT_WG_SIZES (PHSA_CONTEXT_OFFS_WG_IDS + 3 * 4)
+#define PHSA_CONTEXT_CURRENT_WG_SIZES (PHSA_CONTEXT_WG_SIZES + 3 * 4)
+
#endif
-dump=
BRIG Joined Alias(d)
+fassume-phsa
+BRIG Report Var(flag_assume_phsa) Init(1) Optimization
+Assume we are finalizing for phsa and its libhsail-rt. Enables additional
+phsa-specific optimizations (default).
+
L
BRIG Joined Separate
; Not documented
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_ULONG, BT_UINT, BT_ULONG)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_LONG, BT_UINT, BT_LONG)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_PTR, BT_UINT, BT_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_UINT_CONST_PTR, BT_UINT, BT_CONST_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_PTR, BT_ULONG, BT_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_CONST_PTR, BT_ULONG, BT_CONST_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_ULONG, BT_ULONG, BT_ULONG)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONGLONG_ULONGLONG, BT_ULONGLONG, BT_ULONGLONG)
DEF_FUNCTION_TYPE_1 (BT_FN_INT8_FLOAT, BT_INT8, BT_FLOAT)
DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL)
DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT)
DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_PTR, BT_UINT, BT_UINT, BT_PTR)
+DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_CONST_PTR, BT_UINT, BT_UINT, BT_CONST_PTR)
DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_SIZE, BT_PTR, BT_CONST_PTR, BT_SIZE)
DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_CONST_PTR, BT_PTR, BT_CONST_PTR, BT_CONST_PTR)
DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTRPTR_CONST_PTR, BT_VOID, BT_PTR_PTR, BT_CONST_PTR)
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_LONGDOUBLE_LONGDOUBLEPTR_LONGDOUBLEPTR,
BT_VOID, BT_LONGDOUBLE, BT_LONGDOUBLE_PTR, BT_LONGDOUBLE_PTR)
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_PTR, BT_VOID, BT_PTR, BT_PTR, BT_PTR)
+DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_UINT32, BT_VOID, BT_PTR, BT_PTR, BT_UINT32)
DEF_FUNCTION_TYPE_3 (BT_FN_INT_CONST_STRING_PTR_CONST_STRING_PTR_CONST_STRING,
BT_INT, BT_CONST_STRING, BT_PTR_CONST_STRING, BT_PTR_CONST_STRING)
DEF_FUNCTION_TYPE_3 (BT_FN_INT_INT_CONST_STRING_VALIST_ARG,
-2018-05-04 Carl Love <cel@us.ibm.com>
+2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
+
+ * testsuite/brig.dg/test/gimple/smoke_test.hsail: Fix the test
+ to match the currently produced gimple.
+
+2018-05-04 Carl Love <cel@us.ibm.com>
* gcc.target/powerpc/vsx-vector-6.h (foo): Add test for vec_max,
vec_trunc.
* gcc.target/powerpc/vsx-vector-6-le.c (dg-final): Update xvcmpeqdp,
};
/* The kernel function itself should have a fingerprint as follows */
-/* _Kernel (unsigned char * __args, void * __context, void * __group_base_addr, void * __private_base_addr) */
-/* { dg-final { scan-tree-dump "_Kernel \\\(unsigned char \\\* __args, void \\\* __context, void \\\* __group_base_addr, unsigned int __group_local_offset, void \\\* __private_base_addr\\\)" "gimple"} } */
+/* _Kernel (const unsigned char * restrict __args, void * restrict __context, unsigned char * restrict __group_base_addr, unsigned int __group_local_offset, unsigned char * restrict __private_base_addr) */
+/* { dg-final { scan-tree-dump "_Kernel \\\(const unsigned char \\\* restrict __args, void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr, unsigned int __group_local_offset, unsigned char \\\* restrict __private_base_addr\\\)" "gimple"} } */
/* ld_kernarg: mem_read.0 = MEM[(unsigned long *)__args]; */
/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = MEM\\\[\\\(unsigned long \\\*\\\)__args\\\];" "gimple"} } */
/* The latter ld_global_u32 should be visible as a pointer dereference (after pointer arithmetics on a temporary var): */
/* mem_read.2 = *D.1691; */
-/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = \\\*\[_0-9\]+;" "gimple"} } */
+/* { dg-final { scan-tree-dump "mem_read.\[0-9\]+ = \\\*\[_0-9\]+;" "gimple"} } */
/* add_u32s should generate +operators */
/* { dg-final { scan-tree-dump "s2 = s0 \\\+ s1;" "gimple"} } */
/* { dg-final { scan-tree-dump "if \\\(__local_z < __cur_wg_size_z\\\) goto __wi_loop_z; else goto" "gimple"} } */
/* The launcher should call __hsail_launch_wg_function in this case: */
-/* Kernel (void * __context, void * __group_base_addr) */
-/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* __context, void \\\* __group_base_addr\\\)" "gimple"} } */
+/* Kernel (void * restrict __context, unsigned char * restrict __group_base_addr) */
+/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr\\\)" "gimple"} } */
/* { dg-final { scan-tree-dump "__hsail_launch_wg_function \\\(_Kernel, __context, __group_base_addr, group_local_offset.*\\\);" "gimple"} }*/
/* The kernel should have the magic metadata section injected to the ELF. */
+2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
+
+ * include/internal/phsa-rt.h: Whitespace cleanup.
+ * include/internal/workitems.h: Store work item ID data to easily
+ accessible locations.
+ * rt/workitems.c: Same.
+
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* rt/workitems.c: Fix an alloca stack underflow.
*/
typedef struct
{
-
/* Data set by the HSA Runtime's kernel launcher. */
hsa_kernel_dispatch_packet_t *dp;
typedef struct
{
- /* The group id of the currently executed WG. */
- size_t x;
- size_t y;
- size_t z;
-
/* This is 1 in case there are more work groups to execute.
If 0, the work-item threads should finish themselves. */
int more_wgs;
stack frame. Initialized to point outside the private segment. */
uint32_t alloca_frame_p;
+ /* The group id of the currently executed WG. This is for fiber based
+ execution. The group ids are duplicated also to the per WI context
+ struct for simplified single pointer access in the GCCBRIG produced
+ code.
+ */
+
+ uint32_t x;
+ uint32_t y;
+ uint32_t z;
+
} PHSAWorkGroup;
/* Data identifying a single work-item, passed to the work-item thread in case
typedef struct
{
+ /* NOTE: These members STARTing here should not be moved as they are
+ accessed directly by code emitted by BRIG FE. */
+
+ /* The local id of the current WI. */
+
+ uint32_t x;
+ uint32_t y;
+ uint32_t z;
+
+ /* The group id of the currently executed WG. */
+
+ uint32_t group_x;
+ uint32_t group_y;
+ uint32_t group_z;
+
+ /* The local size of a complete WG. */
+
+ uint32_t wg_size_x;
+ uint32_t wg_size_y;
+ uint32_t wg_size_z;
+
+ /* The local size of the current WG. */
+
+ uint32_t cur_wg_size_x;
+ uint32_t cur_wg_size_y;
+ uint32_t cur_wg_size_z;
+
+ /* NOTE: Fixed members END here. */
+
PHSAKernelLaunchData *launch_data;
/* Identifies and keeps book of the currently executed WG of the WI swarm. */
volatile PHSAWorkGroup *wg;
- /* The local id of the current WI. */
- size_t x;
- size_t y;
- size_t z;
#ifdef HAVE_FIBERS
fiber_t fiber;
#endif
-} PHSAWorkItem;
+} __attribute__((packed)) PHSAWorkItem;
#endif
the current_work_group_* is set to point to the WG executed next. */
if (!wi->wg->more_wgs)
break;
+
+ wi->group_x = wg->x;
+ wi->group_y = wg->y;
+ wi->group_z = wg->z;
+
+ wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+ wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+ wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
+
#ifdef DEBUG_PHSA_RT
printf (
"Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
- wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x,
- l_data->wg_max_y, l_data->wg_max_z);
+ wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
+ l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
#endif
if (wi->x < __hsail_currentworkgroupsize (0, wi)
else
wg->x++;
#endif
+ wi->group_x = wg->x;
+ wi->group_y = wg->y;
+ wi->group_z = wg->z;
+
+ wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+ wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+ wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
/* Reinitialize the work-group barrier according to the new WG's
size, which might not be the same as the previous ones, due
PHSAWorkItem *wi_threads = NULL;
PHSAWorkGroup wg;
size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
+ uint32_t group_x, group_y, group_z;
fiber_barrier_t wg_start_barrier;
fiber_barrier_t wg_completion_barrier;
fiber_barrier_t wg_sync_barrier;
wg.initial_group_offset = group_local_offset;
#ifdef EXECUTE_WGS_BACKWARDS
- wg.x = context->wg_max_x - 1;
- wg.y = context->wg_max_y - 1;
- wg.z = context->wg_max_z - 1;
+ group_x = context->wg_max_x - 1;
+ group_y = context->wg_max_y - 1;
+ group_z = context->wg_max_z - 1;
#else
- wg.x = context->wg_min_x;
- wg.y = context->wg_min_y;
- wg.z = context->wg_min_z;
+ group_x = context->wg_min_x;
+ group_y = context->wg_min_y;
+ group_z = context->wg_min_z;
#endif
fiber_barrier_init (&wg_sync_barrier, wg_size);
PHSAWorkItem *wi = &wi_threads[flat_wi_id];
wi->launch_data = context;
wi->wg = &wg;
+
+ wg.x = wi->group_x = group_x;
+ wg.y = wi->group_y = group_y;
+ wg.z = wi->group_z = group_z;
+
+ wi->wg_size_x = context->dp->workgroup_size_x;
+ wi->wg_size_y = context->dp->workgroup_size_y;
+ wi->wg_size_z = context->dp->workgroup_size_z;
+
+ wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+ wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+ wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
+
wi->x = x;
wi->y = y;
wi->z = z;
for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
{
- wi.wg->x = wg_x;
- wi.wg->y = wg_y;
- wi.wg->z = wg_z;
+ wi.group_x = wg_x;
+ wi.group_y = wg_y;
+ wi.group_z = wg_z;
+
+ wi.wg_size_x = context->dp->workgroup_size_x;
+ wi.wg_size_y = context->dp->workgroup_size_y;
+ wi.wg_size_z = context->dp->workgroup_size_z;
+
+ wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
+ wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
+ wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
context->kernel (context->kernarg_addr, &wi, group_base_ptr,
group_local_offset, private_base_ptr);
default:
case 0:
/* Overflow semantics in the case of WG dim > grid dim. */
- id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
+ id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
% dp->grid_size_x;
break;
case 1:
- id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
+ id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
% dp->grid_size_y;
break;
case 2:
- id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
+ id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
% dp->grid_size_z;
break;
}
default:
case 0:
/* Overflow semantics in the case of WG dim > grid dim. */
- id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
+ id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
% dp->grid_size_x;
break;
case 1:
- id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
+ id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
% dp->grid_size_y;
break;
case 2:
- id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
+ id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
% dp->grid_size_z;
break;
}
{
default:
case 0:
- if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x)
+ if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
wg_size = dp->workgroup_size_x; /* Full WG. */
else
wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
break;
case 1:
- if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y)
+ if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
wg_size = dp->workgroup_size_y; /* Full WG. */
else
wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
break;
case 2:
- if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z)
+ if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
wg_size = dp->workgroup_size_z; /* Full WG. */
else
wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
{
default:
case 0:
- return wi->wg->x;
+ return wi->group_x;
case 1:
- return wi->wg->y;
+ return wi->group_y;
case 2:
- return wi->wg->z;
+ return wi->group_z;
}
}