From 080dc24383a602a5a4095eb05b04100f15ba1ad4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Fri, 4 May 2018 19:43:57 +0000 Subject: [PATCH] [BRIGFE] phsa-specific optimizations Add flag -fassume-phsa that is on by default. If -fno-assume-phsa is given, these optimizations are disabled. With this flag, gccbrig can generate GENERIC that assumes we are targeting a phsa-runtime based implementation, which allows us to expose the work-item context accesses to retrieve WI IDs etc. which helps optimizers. First optimization that takes advantage of this is to get rid of the setworkitemid calls whenever we have non-inlined calls that use IDs internally. Other optimizations added in this commit: - expand absoluteid to similar level of simplicity as workitemid. At the moment absoluteid is the best indexing ID to end up with WG vectorization. - propagate ID variables closer to their uses. This is mainly to avoid known useless casts, which confuse at least scalar evolution analysis. - use signed long long for storing IDs. Unsigned integers have defined wraparound semantics, which confuse at least scalar evolution analysis, leading to unvectorizable WI loops. - also refactor some BRIG function generation helpers to brig_function. - no point in having the wi-loop as a for-loop. It's really a do...while and SCEV can analyze it just fine still. - add consts to ptrs etc. in BRIG builtin defs. Improves optimization opportunities. - add qualifiers to generated function parameters. Const and restrict on the hidden local/private pointers, the arg buffer and the context pointer help some optimizations. From-SVN: r259957 --- gcc/ChangeLog | 6 + gcc/brig-builtins.def | 27 +- gcc/brig/ChangeLog | 43 + .../brigfrontend/brig-basic-inst-handler.cc | 172 +--- .../brigfrontend/brig-branch-inst-handler.cc | 21 +- .../brigfrontend/brig-cmp-inst-handler.cc | 6 +- .../brigfrontend/brig-code-entry-handler.cc | 503 +--------- .../brigfrontend/brig-code-entry-handler.h | 21 - gcc/brig/brigfrontend/brig-control-handler.cc | 20 +- .../brigfrontend/brig-cvt-inst-handler.cc | 6 + .../brigfrontend/brig-function-handler.cc | 89 +- gcc/brig/brigfrontend/brig-function.cc | 925 +++++++++++++++++- gcc/brig/brigfrontend/brig-function.h | 43 + gcc/brig/brigfrontend/brig-label-handler.cc | 3 + .../brigfrontend/brig-lane-inst-handler.cc | 2 +- .../brigfrontend/brig-mem-inst-handler.cc | 7 +- gcc/brig/brigfrontend/phsa.h | 9 + gcc/brig/lang.opt | 5 + gcc/builtin-types.def | 4 + gcc/testsuite/ChangeLog | 7 +- .../brig.dg/test/gimple/smoke_test.hsail | 10 +- libhsail-rt/ChangeLog | 7 + libhsail-rt/include/internal/phsa-rt.h | 1 - libhsail-rt/include/internal/workitems.h | 50 +- libhsail-rt/rt/workitems.c | 84 +- 25 files changed, 1257 insertions(+), 814 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 150639cb474..47862a80a0d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2018-05-04 Pekka Jääskeläinen + + * brig-builtins.def: Add consts to ptrs etc. in BRIG builtin defs. + To improve optimization opportunities. + * builtin-types.def: The new needed builtin types for the above. + 2018-05-04 Richard Biener * bb-reorder.c (sanitize_hot_paths): Release hot_bbs_to_check. diff --git a/gcc/brig-builtins.def b/gcc/brig-builtins.def index f94f7e62bb2..c2e8d2c034d 100644 --- a/gcc/brig-builtins.def +++ b/gcc/brig-builtins.def @@ -45,25 +45,25 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_GRIDSIZE, BRIG_OPCODE_GRIDSIZE, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U32, BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U32, - "__hsail_workitemflatabsid_u32", BT_FN_UINT_PTR, - ATTR_NOTHROW_LEAF_LIST) + "__hsail_workitemflatabsid_u32", BT_FN_UINT_CONST_PTR, + ATTR_PURE_NOTHROW_LEAF_LIST) DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U64, BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U64, - "__hsail_workitemflatabsid_u64", BT_FN_ULONG_PTR, - ATTR_NOTHROW_LEAF_LIST) + "__hsail_workitemflatabsid_u64", BT_FN_ULONG_CONST_PTR, + ATTR_PURE_NOTHROW_LEAF_LIST) DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATID, BRIG_OPCODE_WORKITEMFLATID, - BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_PTR, - ATTR_NOTHROW_LEAF_LIST) + BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_CONST_PTR, + ATTR_PURE_NOTHROW_LEAF_LIST) DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMID, BRIG_OPCODE_WORKITEMID, - BRIG_TYPE_U32, "__hsail_workitemid", BT_FN_UINT_UINT_PTR, - ATTR_NOTHROW_LEAF_LIST) + BRIG_TYPE_U32, "__hsail_workitemid", + BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST) DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPID, BRIG_OPCODE_WORKGROUPID, - BRIG_TYPE_U32, "__hsail_workgroupid", BT_FN_UINT_UINT_PTR, - ATTR_PURE_NOTHROW_LEAF_LIST) + BRIG_TYPE_U32, "__hsail_workgroupid", + BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST) DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKITEMFLATID, BRIG_OPCODE_CURRENTWORKITEMFLATID, @@ -90,11 +90,12 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_PACKETCOMPLETIONSIG_SIG32, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE, BRIG_OPCODE_CURRENTWORKGROUPSIZE, BRIG_TYPE_U32, - "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_PTR, + "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST) DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPSIZE, BRIG_OPCODE_WORKGROUPSIZE, - BRIG_TYPE_U32, "__hsail_workgroupsize", BT_FN_UINT_UINT_PTR, + BRIG_TYPE_U32, "__hsail_workgroupsize", + BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST) DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_DIM, BRIG_OPCODE_DIM, @@ -565,7 +566,7 @@ DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_SETWORKITEMID, "__hsail_setworkitemid", DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_WG_FUNC, "__hsail_launch_wg_function", - BT_FN_VOID_PTR_PTR_PTR, ATTR_NOTHROW_LEAF_LIST) + BT_FN_VOID_PTR_PTR_UINT32, ATTR_NOTHROW_LEAF_LIST) DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_KERNEL, "__hsail_launch_kernel", diff --git a/gcc/brig/ChangeLog b/gcc/brig/ChangeLog index 73269642058..ce4aea615eb 100644 --- a/gcc/brig/ChangeLog +++ b/gcc/brig/ChangeLog @@ -1,3 +1,46 @@ +2018-05-04 Pekka Jääskeläinen + + Add flag -fassume-phsa that is on by default. If -fno-assume-phsa + is given, these optimizations are disabled. With this flag, gccbrig + can generate GENERIC that assumes we are targeting a phsa-runtime + based implementation, which allows us to expose the work-item context + accesses to retrieve WI IDs etc. which helps optimizers. + First optimization that takes advantage of this is to get rid of + the setworkitemid calls whenever we have non-inlined calls that + use IDs internally. Other optimizations added in this commit: + - expand absoluteid to similar level of simplicity as workitemid. + At the moment absoluteid is the best indexing ID to end up with + WG vectorization. + - propagate ID variables closer to their uses. This is mainly + to avoid known useless casts, which confuse at least scalar + evolution analysis. + - use signed long long for storing IDs. Unsigned integers have + defined wraparound semantics, which confuse at least scalar + evolution analysis, leading to unvectorizable WI loops. + - also refactor some BRIG function generation helpers to brig_function. + - no point in having the wi-loop as a for-loop. It's really + a do...while and SCEV can analyze it just fine still. + - add consts to ptrs etc. in BRIG builtin defs. + Improves optimization opportunities. + - add qualifiers to generated function parameters. + Const and restrict on the hidden local/private pointers, + the arg buffer and the context pointer help some optimizations. + * brig/brigfrontend/brig-basic-inst-handler.cc: See above. + * brig/brigfrontend/brig-branch-inst-handler.cc: See above. + * brig/brigfrontend/brig-cmp-inst-handler.cc: See above. + * brig/brigfrontend/brig-code-entry-handler.cc: See above. + * brig/brigfrontend/brig-code-entry-handler.h: See above. + * brig/brigfrontend/brig-control-handler.cc: See above. + * brig/brigfrontend/brig-cvt-inst-handler.cc: See above. + * brig/brigfrontend/brig-function-handler.cc: See above. + * brig/brigfrontend/brig-function.cc: See above. + * brig/brigfrontend/brig-function.h: See above. + * brig/brigfrontend/brig-label-handler.cc: See above. + * brig/brigfrontend/brig-lane-inst-handler.cc: See above. + * brig/brigfrontend/brig-mem-inst-handler.cc: See above. + * brig/brigfrontend/phsa.h: See above. + * brig/lang.opt: See above. + 2018-05-04 Pekka Jääskeläinen * brig/brigfrontend/brig-function-handler.cc: Skip multiple forward diff --git a/gcc/brig/brigfrontend/brig-basic-inst-handler.cc b/gcc/brig/brigfrontend/brig-basic-inst-handler.cc index 283da7ac80e..c8224ae6a51 100644 --- a/gcc/brig/brigfrontend/brig-basic-inst-handler.cc +++ b/gcc/brig/brigfrontend/brig-basic-inst-handler.cc @@ -105,7 +105,8 @@ brig_basic_inst_handler::build_shuffle (tree arith_type, /* Unpack the tightly packed mask elements to BIT_FIELD_REFs from which to construct the mask vector as understood by VEC_PERM_EXPR. */ - tree mask_operand = add_temp_var ("shuffle_mask", operands[2]); + tree mask_operand + = m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]); tree mask_element_type = build_nonstandard_integer_type (input_mask_element_size, true); @@ -219,10 +220,11 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands) tree wide_type = build_nonstandard_integer_type (vecsize, 1); tree src_vect = build_resize_convert_view (wide_type, operands[0]); - src_vect = add_temp_var ("src_vect", src_vect); + src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect); tree scalar = operands[1]; - scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar)); + scalar = m_parent.m_cf->add_temp_var ("scalar", + convert_to_integer (wide_type, scalar)); tree pos = operands[2]; @@ -230,21 +232,22 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands) Zero them for well-defined semantics. */ tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2], build_int_cstu (TREE_TYPE (pos), ecount - 1)); - pos = add_temp_var ("pos", convert (wide_type, t)); + pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t)); tree element_type = TREE_TYPE (TREE_TYPE (operands[0])); size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT; tree ewidth = build_int_cstu (wide_type, element_width); tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos); - bitoffset = add_temp_var ("offset", bitoffset); + bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset); uint64_t mask_int = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1; tree mask = build_int_cstu (wide_type, mask_int); - mask = add_temp_var ("mask", convert_to_integer (wide_type, mask)); + mask = m_parent.m_cf->add_temp_var ("mask", + convert_to_integer (wide_type, mask)); tree clearing_mask = build1 (BIT_NOT_EXPR, wide_type, @@ -311,7 +314,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode, tree arith_type, tree_stl_vec &operands) { - tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type); + tree_code opcode + = brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type); BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK; @@ -388,8 +392,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode, on which cannot be used in general to remain HSAIL compliant. Perhaps a builtin call would be better option here. */ return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type), - expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type, - arith_type, operands)); + m_parent.m_cf->expand_or_call_builtin + (BRIG_OPCODE_SQRT, brig_type, arith_type, operands)); } else if (brig_opcode == BRIG_OPCODE_NRCP) { @@ -410,8 +414,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode, gcc_unreachable (); } else if (opcode == CALL_EXPR) - return expand_or_call_builtin (brig_opcode, brig_type, arith_type, - operands); + return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type, + arith_type, operands); else if (output_count == 1) { if (input_count == 1) @@ -520,7 +524,8 @@ brig_basic_inst_handler::operator () (const BrigBase *base) in_operands[0] = build_lower_element_broadcast (in_operands[0]); tree_code opcode - = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type); + = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode, + brig_inst_type); if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT) { @@ -566,11 +571,11 @@ brig_basic_inst_handler::operator () (const BrigBase *base) */ tree_stl_vec operand0_elements; if (input_count > 0) - unpack (in_operands[0], operand0_elements); + m_parent.m_cf->unpack (in_operands[0], operand0_elements); tree_stl_vec operand1_elements; if (input_count > 1) - unpack (in_operands[1], operand1_elements); + m_parent.m_cf->unpack (in_operands[1], operand1_elements); tree_stl_vec result_elements; @@ -617,7 +622,7 @@ brig_basic_inst_handler::operator () (const BrigBase *base) result_elements.push_back (convert (scalar_type, scalar_expr)); } - instr_expr = pack (result_elements); + instr_expr = m_parent.m_cf->pack (result_elements); } else { @@ -728,140 +733,3 @@ brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand) vec_operand, mask); } -/* Returns the tree code that should be used to implement the given - HSA instruction opcode (BRIG_OPCODE) for the given type of instruction - (BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly, - returns TREE_LIST (if it can be emulated with a simple chain of tree - nodes) or CALL_EXPR if the opcode should be implemented using a builtin - call. */ - -tree_code -brig_basic_inst_handler::get_tree_code_for_hsa_opcode - (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const -{ - BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK; - switch (brig_opcode) - { - case BRIG_OPCODE_NOP: - return NOP_EXPR; - case BRIG_OPCODE_ADD: - return PLUS_EXPR; - case BRIG_OPCODE_CMOV: - if (brig_inner_type == brig_type) - return COND_EXPR; - else - return VEC_COND_EXPR; - case BRIG_OPCODE_SUB: - return MINUS_EXPR; - case BRIG_OPCODE_MUL: - case BRIG_OPCODE_MUL24: - return MULT_EXPR; - case BRIG_OPCODE_MULHI: - case BRIG_OPCODE_MUL24HI: - return MULT_HIGHPART_EXPR; - case BRIG_OPCODE_DIV: - if (gccbrig_is_float_type (brig_inner_type)) - return RDIV_EXPR; - else - return TRUNC_DIV_EXPR; - case BRIG_OPCODE_NEG: - return NEGATE_EXPR; - case BRIG_OPCODE_MIN: - if (gccbrig_is_float_type (brig_inner_type)) - return CALL_EXPR; - else - return MIN_EXPR; - case BRIG_OPCODE_MAX: - if (gccbrig_is_float_type (brig_inner_type)) - return CALL_EXPR; - else - return MAX_EXPR; - case BRIG_OPCODE_FMA: - return FMA_EXPR; - case BRIG_OPCODE_ABS: - return ABS_EXPR; - case BRIG_OPCODE_SHL: - return LSHIFT_EXPR; - case BRIG_OPCODE_SHR: - return RSHIFT_EXPR; - case BRIG_OPCODE_OR: - return BIT_IOR_EXPR; - case BRIG_OPCODE_XOR: - return BIT_XOR_EXPR; - case BRIG_OPCODE_AND: - return BIT_AND_EXPR; - case BRIG_OPCODE_NOT: - return BIT_NOT_EXPR; - case BRIG_OPCODE_RET: - return RETURN_EXPR; - case BRIG_OPCODE_MOV: - case BRIG_OPCODE_LDF: - return MODIFY_EXPR; - case BRIG_OPCODE_LD: - case BRIG_OPCODE_ST: - return MEM_REF; - case BRIG_OPCODE_BR: - return GOTO_EXPR; - case BRIG_OPCODE_REM: - if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32) - return TRUNC_MOD_EXPR; - else - return CALL_EXPR; - case BRIG_OPCODE_NRCP: - case BRIG_OPCODE_NRSQRT: - /* Implement as 1/f (x). gcc should pattern detect that and - use a native instruction, if available, for it. */ - return TREE_LIST; - case BRIG_OPCODE_FLOOR: - case BRIG_OPCODE_CEIL: - case BRIG_OPCODE_SQRT: - case BRIG_OPCODE_NSQRT: - case BRIG_OPCODE_RINT: - case BRIG_OPCODE_TRUNC: - case BRIG_OPCODE_POPCOUNT: - case BRIG_OPCODE_COPYSIGN: - case BRIG_OPCODE_NCOS: - case BRIG_OPCODE_NSIN: - case BRIG_OPCODE_NLOG2: - case BRIG_OPCODE_NEXP2: - case BRIG_OPCODE_NFMA: - /* Class has type B1 regardless of the float type, thus - the below builtin map search cannot find it. */ - case BRIG_OPCODE_CLASS: - case BRIG_OPCODE_WORKITEMABSID: - return CALL_EXPR; - default: - - /* Some BRIG opcodes can use the same builtins for unsigned and - signed types. Force these cases to unsigned types. - */ - - if (brig_opcode == BRIG_OPCODE_BORROW - || brig_opcode == BRIG_OPCODE_CARRY - || brig_opcode == BRIG_OPCODE_LASTBIT - || brig_opcode == BRIG_OPCODE_BITINSERT) - { - if (brig_type == BRIG_TYPE_S32) - brig_type = BRIG_TYPE_U32; - else if (brig_type == BRIG_TYPE_S64) - brig_type = BRIG_TYPE_U64; - } - - - builtin_map::const_iterator i - = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type)); - if (i != s_custom_builtins.end ()) - return CALL_EXPR; - else if (s_custom_builtins.find - (std::make_pair (brig_opcode, brig_inner_type)) - != s_custom_builtins.end ()) - return CALL_EXPR; - if (brig_inner_type == BRIG_TYPE_F16 - && s_custom_builtins.find - (std::make_pair (brig_opcode, BRIG_TYPE_F32)) - != s_custom_builtins.end ()) - return CALL_EXPR; - break; - } - return TREE_LIST; /* Emulate using a chain of nodes. */ -} diff --git a/gcc/brig/brigfrontend/brig-branch-inst-handler.cc b/gcc/brig/brigfrontend/brig-branch-inst-handler.cc index 1340b74dd35..b6baf13711b 100644 --- a/gcc/brig/brigfrontend/brig-branch-inst-handler.cc +++ b/gcc/brig/brigfrontend/brig-branch-inst-handler.cc @@ -119,10 +119,11 @@ brig_branch_inst_handler::operator () (const BrigBase *base) memory. */ tree group_local_offset - = add_temp_var ("group_local_offset", - build_int_cst - (uint32_type_node, - m_parent.m_cf->m_local_group_variables.size())); + = m_parent.m_cf->add_temp_var ("group_local_offset", + build_int_cst + (uint32_type_node, + m_parent.m_cf-> + m_local_group_variables.size())); /* TODO: ensure the callee's frame is aligned! */ @@ -152,6 +153,7 @@ brig_branch_inst_handler::operator () (const BrigBase *base) m_parent.m_cf->m_called_functions.push_back (func_ref); if (DECL_EXTERNAL (func_ref)) m_parent.add_decl_call (call); + m_parent.m_cf->start_new_bb (); return base->byteCount; } @@ -216,18 +218,21 @@ brig_branch_inst_handler::operator () (const BrigBase *base) ensure the barrier won't be duplicated or moved out of loops etc. Like the 'noduplicate' of LLVM. Same goes for fbarriers. */ m_parent.m_cf->append_statement - (expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE, NULL_TREE, - call_operands)); + (m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode, + BRIG_TYPE_NONE, NULL_TREE, + call_operands)); } else if (brig_inst->opcode >= BRIG_OPCODE_ARRIVEFBAR && brig_inst->opcode <= BRIG_OPCODE_WAITFBAR) { m_parent.m_cf->m_has_barriers = true; m_parent.m_cf->append_statement - (expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE, - uint32_type_node, operands)); + (m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode, + BRIG_TYPE_NONE, + uint32_type_node, operands)); } else gcc_unreachable (); + m_parent.m_cf->start_new_bb (); return base->byteCount; } diff --git a/gcc/brig/brigfrontend/brig-cmp-inst-handler.cc b/gcc/brig/brigfrontend/brig-cmp-inst-handler.cc index 1155ead9c07..729e3fd0b22 100644 --- a/gcc/brig/brigfrontend/brig-cmp-inst-handler.cc +++ b/gcc/brig/brigfrontend/brig-cmp-inst-handler.cc @@ -180,17 +180,17 @@ brig_cmp_inst_handler::operator () (const BrigBase *base) results, we must now truncate the result vector to S16s so it fits to the destination register. We can build the target vector type from the f16 storage type (unsigned ints). */ - expr = add_temp_var ("wide_cmp_result", expr); + expr = m_parent.m_cf->add_temp_var ("wide_cmp_result", expr); tree_stl_vec wide_elements; tree_stl_vec shrunk_elements; - unpack (expr, wide_elements); + m_parent.m_cf->unpack (expr, wide_elements); for (size_t i = 0; i < wide_elements.size (); ++i) { tree wide = wide_elements.at (i); shrunk_elements.push_back (convert_to_integer (short_integer_type_node, wide)); } - expr = pack (shrunk_elements); + expr = m_parent.m_cf->pack (shrunk_elements); } build_output_assignment (*inst_base, operands[0], expr); diff --git a/gcc/brig/brigfrontend/brig-code-entry-handler.cc b/gcc/brig/brigfrontend/brig-code-entry-handler.cc index 36a8deb403d..4fa37fd7a4b 100644 --- a/gcc/brig/brigfrontend/brig-code-entry-handler.cc +++ b/gcc/brig/brigfrontend/brig-code-entry-handler.cc @@ -41,24 +41,9 @@ #include "brig-builtins.h" #include "fold-const.h" -brig_code_entry_handler::builtin_map brig_code_entry_handler::s_custom_builtins; - brig_code_entry_handler::brig_code_entry_handler (brig_to_generic &parent) : brig_entry_handler (parent) { - if (s_custom_builtins.size () > 0) return; - - /* Populate the builtin index. */ -#undef DEF_HSAIL_ATOMIC_BUILTIN -#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN -#undef DEF_HSAIL_INTR_BUILTIN -#undef DEF_HSAIL_SAT_BUILTIN -#undef DEF_HSAIL_BUILTIN -#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \ - s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)] \ - = builtin_decl_explicit (ENUM); - -#include "brig-builtins.def" } /* Build a tree operand which is a reference to a piece of code. REF is the @@ -301,18 +286,18 @@ brig_code_entry_handler::build_address_operand tree local_size = build2 (MULT_EXPR, uint32_type_node, - expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE, - BRIG_TYPE_U32, - uint32_type_node, uint32_0), - expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE, - BRIG_TYPE_U32, - uint32_type_node, uint32_1)); + m_parent.m_cf->expand_or_call_builtin + (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32, + uint32_type_node, uint32_0), + m_parent.m_cf->expand_or_call_builtin + (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32, + uint32_type_node, uint32_1)); local_size = build2 (MULT_EXPR, uint32_type_node, - expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE, - BRIG_TYPE_U32, - uint32_type_node, uint32_2), + m_parent.m_cf->expand_or_call_builtin + (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32, + uint32_type_node, uint32_2), local_size); tree var_region @@ -324,9 +309,9 @@ brig_code_entry_handler::build_address_operand = build2 (MULT_EXPR, uint32_type_node, build_int_cst (uint32_type_node, m_parent.private_variable_size (var_name)), - expand_or_call_builtin (BRIG_OPCODE_WORKITEMFLATID, - BRIG_TYPE_U32, - uint32_type_node, operands)); + m_parent.m_cf->expand_or_call_builtin + (BRIG_OPCODE_WORKITEMFLATID, BRIG_TYPE_U32, + uint32_type_node, operands)); tree var_offset = build2 (PLUS_EXPR, uint32_type_node, var_region, pos); @@ -336,8 +321,9 @@ brig_code_entry_handler::build_address_operand offset to a flat address by adding it as an offset to a (private or group) base pointer later on. Same applies to group_var_offset. */ symbol_base - = add_temp_var ("priv_var_offset", - convert (size_type_node, var_offset)); + = m_parent.m_cf->add_temp_var ("priv_var_offset", + convert (size_type_node, + var_offset)); } else if (segment == BRIG_SEGMENT_ARG) { @@ -699,138 +685,6 @@ brig_code_entry_handler::get_tree_expr_type_for_hsa_type return gccbrig_tree_type_for_hsa_type (brig_type); } -/* In case the HSA instruction must be implemented using a builtin, - this function is called to get the correct builtin function. - TYPE is the instruction tree type, BRIG_OPCODE the opcode of the - brig instruction and BRIG_TYPE the brig instruction's type. */ - -tree -brig_code_entry_handler::get_builtin_for_hsa_opcode - (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const -{ - tree builtin = NULL_TREE; - tree builtin_type = type; - - /* For vector types, first find the scalar version of the builtin. */ - if (type != NULL_TREE && VECTOR_TYPE_P (type)) - builtin_type = TREE_TYPE (type); - BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK; - - /* Some BRIG opcodes can use the same builtins for unsigned and - signed types. Force these cases to unsigned types. */ - - if (brig_opcode == BRIG_OPCODE_BORROW - || brig_opcode == BRIG_OPCODE_CARRY - || brig_opcode == BRIG_OPCODE_LASTBIT - || brig_opcode == BRIG_OPCODE_BITINSERT) - { - if (brig_type == BRIG_TYPE_S32) - brig_type = BRIG_TYPE_U32; - else if (brig_type == BRIG_TYPE_S64) - brig_type = BRIG_TYPE_U64; - } - - switch (brig_opcode) - { - case BRIG_OPCODE_FLOOR: - builtin = mathfn_built_in (builtin_type, BUILT_IN_FLOOR); - break; - case BRIG_OPCODE_CEIL: - builtin = mathfn_built_in (builtin_type, BUILT_IN_CEIL); - break; - case BRIG_OPCODE_SQRT: - case BRIG_OPCODE_NSQRT: - builtin = mathfn_built_in (builtin_type, BUILT_IN_SQRT); - break; - case BRIG_OPCODE_RINT: - builtin = mathfn_built_in (builtin_type, BUILT_IN_RINT); - break; - case BRIG_OPCODE_TRUNC: - builtin = mathfn_built_in (builtin_type, BUILT_IN_TRUNC); - break; - case BRIG_OPCODE_COPYSIGN: - builtin = mathfn_built_in (builtin_type, BUILT_IN_COPYSIGN); - break; - case BRIG_OPCODE_NSIN: - builtin = mathfn_built_in (builtin_type, BUILT_IN_SIN); - break; - case BRIG_OPCODE_NLOG2: - builtin = mathfn_built_in (builtin_type, BUILT_IN_LOG2); - break; - case BRIG_OPCODE_NEXP2: - builtin = mathfn_built_in (builtin_type, BUILT_IN_EXP2); - break; - case BRIG_OPCODE_NFMA: - builtin = mathfn_built_in (builtin_type, BUILT_IN_FMA); - break; - case BRIG_OPCODE_NCOS: - builtin = mathfn_built_in (builtin_type, BUILT_IN_COS); - break; - case BRIG_OPCODE_POPCOUNT: - /* Popcount should be typed by its argument type (the return value - is always u32). Let's use a b64 version for also for b32 for now. */ - return builtin_decl_explicit (BUILT_IN_POPCOUNTL); - case BRIG_OPCODE_BORROW: - /* Borrow uses the same builtin for unsigned and signed types. */ - if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32) - return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U32); - else - return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U64); - case BRIG_OPCODE_CARRY: - /* Carry also uses the same builtin for unsigned and signed types. */ - if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32) - return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U32); - else - return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U64); - default: - - /* Use our builtin index for finding a proper builtin for the BRIG - opcode and BRIG type. This takes care most of the builtin cases, - the special cases are handled in the separate 'case' statements - above. */ - builtin_map::const_iterator i - = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type)); - if (i != s_custom_builtins.end ()) - return (*i).second; - - if (brig_inner_type != brig_type) - { - /* Try to find a scalar built-in we could use. */ - i = s_custom_builtins.find - (std::make_pair (brig_opcode, brig_inner_type)); - if (i != s_custom_builtins.end ()) - return (*i).second; - } - - /* In case this is an fp16 operation that is promoted to fp32, - try to find a fp32 scalar built-in. */ - if (brig_inner_type == BRIG_TYPE_F16) - { - i = s_custom_builtins.find - (std::make_pair (brig_opcode, BRIG_TYPE_F32)); - if (i != s_custom_builtins.end ()) - return (*i).second; - } - gcc_unreachable (); - } - - if (VECTOR_TYPE_P (type) && builtin != NULL_TREE) - { - /* Try to find a vectorized version of the built-in. - TODO: properly assert that builtin is a mathfn builtin? */ - tree vec_builtin - = targetm.vectorize.builtin_vectorized_function - (builtin_mathfn_code (builtin), type, type); - if (vec_builtin != NULL_TREE) - return vec_builtin; - else - return builtin; - } - if (builtin == NULL_TREE) - gcc_unreachable (); - return builtin; -} - /* Return the correct GENERIC type for storing comparison results of operand with the type given in SOURCE_TYPE. */ @@ -848,264 +702,6 @@ brig_code_entry_handler::get_comparison_result_type (tree source_type) return gccbrig_tree_type_for_hsa_type (BRIG_TYPE_B1); } -/* Returns true in case the given opcode needs to know about work-item context - data. In such case the context data is passed as a pointer to a work-item - context object, as the last argument in the builtin call. */ - -bool -brig_code_entry_handler::needs_workitem_context_data - (BrigOpcode16_t brig_opcode) const -{ - switch (brig_opcode) - { - case BRIG_OPCODE_WORKITEMABSID: - case BRIG_OPCODE_WORKITEMFLATABSID: - case BRIG_OPCODE_WORKITEMFLATID: - case BRIG_OPCODE_CURRENTWORKITEMFLATID: - case BRIG_OPCODE_WORKITEMID: - case BRIG_OPCODE_WORKGROUPID: - case BRIG_OPCODE_WORKGROUPSIZE: - case BRIG_OPCODE_CURRENTWORKGROUPSIZE: - case BRIG_OPCODE_GRIDGROUPS: - case BRIG_OPCODE_GRIDSIZE: - case BRIG_OPCODE_DIM: - case BRIG_OPCODE_PACKETID: - case BRIG_OPCODE_PACKETCOMPLETIONSIG: - case BRIG_OPCODE_BARRIER: - case BRIG_OPCODE_WAVEBARRIER: - case BRIG_OPCODE_ARRIVEFBAR: - case BRIG_OPCODE_INITFBAR: - case BRIG_OPCODE_JOINFBAR: - case BRIG_OPCODE_LEAVEFBAR: - case BRIG_OPCODE_RELEASEFBAR: - case BRIG_OPCODE_WAITFBAR: - case BRIG_OPCODE_CUID: - case BRIG_OPCODE_MAXCUID: - case BRIG_OPCODE_DEBUGTRAP: - case BRIG_OPCODE_GROUPBASEPTR: - case BRIG_OPCODE_KERNARGBASEPTR: - case BRIG_OPCODE_ALLOCA: - return true; - default: - return false; - }; -} - -/* Returns true in case the given opcode that would normally be generated - as a builtin call can be expanded to tree nodes. */ - -bool -brig_code_entry_handler::can_expand_builtin (BrigOpcode16_t brig_opcode) const -{ - switch (brig_opcode) - { - case BRIG_OPCODE_WORKITEMFLATABSID: - case BRIG_OPCODE_WORKITEMFLATID: - case BRIG_OPCODE_WORKITEMABSID: - case BRIG_OPCODE_WORKGROUPSIZE: - case BRIG_OPCODE_CURRENTWORKGROUPSIZE: - /* TODO: expand more builtins. */ - return true; - default: - return false; - }; -} - -/* Try to expand the given builtin call to reuse a previously generated - variable, if possible. If not, just call the given builtin. - BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type, - ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's - input operands. */ - -tree -brig_code_entry_handler::expand_or_call_builtin (BrigOpcode16_t brig_opcode, - BrigType16_t brig_type, - tree arith_type, - tree_stl_vec &operands) -{ - if (m_parent.m_cf->m_is_kernel && can_expand_builtin (brig_opcode)) - return expand_builtin (brig_opcode, operands); - - tree built_in - = get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type); - - if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in))) - && arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type) - && brig_opcode != BRIG_OPCODE_LERP - && brig_opcode != BRIG_OPCODE_PACKCVT - && brig_opcode != BRIG_OPCODE_SAD - && brig_opcode != BRIG_OPCODE_SADHI) - { - /* Call the scalar built-in for all elements in the vector. */ - tree_stl_vec operand0_elements; - if (operands.size () > 0) - unpack (operands[0], operand0_elements); - - tree_stl_vec operand1_elements; - if (operands.size () > 1) - unpack (operands[1], operand1_elements); - - tree_stl_vec result_elements; - - size_t element_count = gccbrig_type_vector_subparts (arith_type); - for (size_t i = 0; i < element_count; ++i) - { - tree_stl_vec call_operands; - if (operand0_elements.size () > 0) - call_operands.push_back (operand0_elements.at (i)); - - if (operand1_elements.size () > 0) - call_operands.push_back (operand1_elements.at (i)); - - result_elements.push_back - (expand_or_call_builtin (brig_opcode, brig_type, - TREE_TYPE (arith_type), - call_operands)); - } - return pack (result_elements); - } - - tree_stl_vec call_operands; - tree_stl_vec operand_types; - - tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in)); - - for (size_t i = 0; i < operands.size (); ++i) - { - tree operand_type = TREE_VALUE (arg_type_chain); - call_operands.push_back (convert (operand_type, operands[i])); - operand_types.push_back (operand_type); - arg_type_chain = TREE_CHAIN (arg_type_chain); - } - - if (needs_workitem_context_data (brig_opcode)) - { - call_operands.push_back (m_parent.m_cf->m_context_arg); - operand_types.push_back (ptr_type_node); - m_parent.m_cf->m_has_unexpanded_dp_builtins = true; - } - - size_t operand_count = call_operands.size (); - - call_operands.resize (4, NULL_TREE); - operand_types.resize (4, NULL_TREE); - for (size_t i = 0; i < operand_count; ++i) - call_operands.at (i) = build_resize_convert_view (operand_types.at (i), - call_operands.at (i)); - - tree fnptr = build_fold_addr_expr (built_in); - return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr, - operand_count, &call_operands[0]); -} - -/* Instead of calling a built-in, reuse a previously returned value known to - be still valid. This is beneficial especially for the work-item - identification related builtins as not having them as calls can lead to - more easily vectorizable parallel loops for multi work-item work-groups. - BRIG_OPCODE identifies the builtin and OPERANDS store the operands. */ - -tree -brig_code_entry_handler::expand_builtin (BrigOpcode16_t brig_opcode, - tree_stl_vec &operands) -{ - tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0)); - - tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1)); - - tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2)); - - if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID) - { - tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0); - id0 = convert (uint64_type_node, id0); - - tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1); - id1 = convert (uint64_type_node, id1); - - tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2); - id2 = convert (uint64_type_node, id2); - - tree max0 = convert (uint64_type_node, - m_parent.m_cf->m_grid_size_vars[0]); - tree max1 = convert (uint64_type_node, - m_parent.m_cf->m_grid_size_vars[1]); - - tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0); - id2_x_max0_x_max1 - = build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1); - - tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0); - - tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0); - sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1); - - return add_temp_var ("workitemflatabsid", sum); - } - else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID) - { - HOST_WIDE_INT dim = int_constant_value (operands[0]); - - tree local_id_var = m_parent.m_cf->m_local_id_vars[dim]; - tree wg_id_var = m_parent.m_cf->m_wg_id_vars[dim]; - tree wg_size_var = m_parent.m_cf->m_wg_size_vars[dim]; - - tree wg_id_x_wg_size = build2 (MULT_EXPR, uint32_type_node, - convert (uint32_type_node, wg_id_var), - convert (uint32_type_node, wg_size_var)); - tree sum - = build2 (PLUS_EXPR, uint32_type_node, wg_id_x_wg_size, local_id_var); - - return add_temp_var (std::string ("workitemabsid_") - + (char) ((int) 'x' + dim), sum); - } - else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID) - { - tree z_x_wgsx_wgsy - = build2 (MULT_EXPR, uint32_type_node, - m_parent.m_cf->m_local_id_vars[2], - m_parent.m_cf->m_wg_size_vars[0]); - z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy, - m_parent.m_cf->m_wg_size_vars[1]); - - tree y_x_wgsx - = build2 (MULT_EXPR, uint32_type_node, - m_parent.m_cf->m_local_id_vars[1], - m_parent.m_cf->m_wg_size_vars[0]); - - tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy); - sum = build2 (PLUS_EXPR, uint32_type_node, - m_parent.m_cf->m_local_id_vars[0], - sum); - return add_temp_var ("workitemflatid", sum); - } - else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE) - { - HOST_WIDE_INT dim = int_constant_value (operands[0]); - return m_parent.m_cf->m_wg_size_vars[dim]; - } - else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE) - { - HOST_WIDE_INT dim = int_constant_value (operands[0]); - return m_parent.m_cf->m_cur_wg_size_vars[dim]; - } - else - gcc_unreachable (); - - return NULL_TREE; -} - -/* Appends and returns a new temp variable and an accompanying assignment - statement that stores the value of the given EXPR and has the given NAME. */ - -tree -brig_code_entry_handler::add_temp_var (std::string name, tree expr) -{ - tree temp_var = create_tmp_var (TREE_TYPE (expr), name.c_str ()); - tree assign = build2 (MODIFY_EXPR, TREE_TYPE (temp_var), temp_var, expr); - m_parent.m_cf->append_statement (assign); - return temp_var; -} - /* Creates a FP32 to FP16 conversion call, assuming the source and destination are FP32 type variables. */ @@ -1387,7 +983,6 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst, variable type (can be any type; see get_m_var_declfor_reg @ brig-function.cc). */ tree output_type = TREE_TYPE (output); - tree input_type = TREE_TYPE (inst_expr); bool is_fp16 = (brig_inst.type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16 && brig_inst.base.kind != BRIG_KIND_INST_MEM && !gccbrig_is_bit_operation (brig_inst.opcode); @@ -1396,6 +991,13 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst, bool ftz = false; const BrigBase *base = &brig_inst.base; + if (m_parent.m_cf->is_id_val (inst_expr)) + inst_expr = m_parent.m_cf->id_val (inst_expr); + + tree input_type = TREE_TYPE (inst_expr); + + m_parent.m_cf->add_reg_var_update (output, inst_expr); + if (base->kind == BRIG_KIND_INST_MOD) { const BrigInstMod *mod = (const BrigInstMod *) base; @@ -1418,13 +1020,13 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst, { /* Ensure we don't duplicate the arithmetics to the arguments of the bit field reference operators. */ - inst_expr = add_temp_var ("before_ftz", inst_expr); + inst_expr = m_parent.m_cf->add_temp_var ("before_ftz", inst_expr); inst_expr = flush_to_zero (is_fp16) (*this, inst_expr); } if (is_fp16) { - inst_expr = add_temp_var ("before_f2h", inst_expr); + inst_expr = m_parent.m_cf->add_temp_var ("before_f2h", inst_expr); tree f2h_output = build_f2h_conversion (inst_expr); tree conv = build_resize_convert_view (output_type, f2h_output); tree assign = build2 (MODIFY_EXPR, output_type, output, conv); @@ -1486,62 +1088,6 @@ brig_code_entry_handler::append_statement (tree stmt) m_parent.m_cf->append_statement (stmt); } -/* Unpacks the elements of the vector in VALUE to scalars (bit field - references) in ELEMENTS. */ - -void -brig_code_entry_handler::unpack (tree value, tree_stl_vec &elements) -{ - size_t vec_size = int_size_in_bytes (TREE_TYPE (value)); - size_t element_size - = int_size_in_bytes (TREE_TYPE (TREE_TYPE (value))) * BITS_PER_UNIT; - size_t element_count - = vec_size * BITS_PER_UNIT / element_size; - - tree input_element_type = TREE_TYPE (TREE_TYPE (value)); - - value = add_temp_var ("unpack_input", value); - - for (size_t i = 0; i < element_count; ++i) - { - tree element - = build3 (BIT_FIELD_REF, input_element_type, value, - TYPE_SIZE (input_element_type), - bitsize_int(i * element_size)); - - element = add_temp_var ("scalar", element); - elements.push_back (element); - } -} - -/* Pack the elements of the scalars in ELEMENTS to the returned vector. */ - -tree -brig_code_entry_handler::pack (tree_stl_vec &elements) -{ - size_t element_count = elements.size (); - - gcc_assert (element_count > 1); - - tree output_element_type = TREE_TYPE (elements.at (0)); - - vec *constructor_vals = NULL; - for (size_t i = 0; i < element_count; ++i) - CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, elements.at (i)); - - tree vec_type = build_vector_type (output_element_type, element_count); - - /* build_constructor creates a vector type which is not a vector_cst - that requires compile time constant elements. */ - tree vec = build_constructor (vec_type, constructor_vals); - - /* Add a temp variable for readability. */ - tree tmp_var = create_tmp_var (vec_type, "vec_out"); - tree vec_tmp_assign = build2 (MODIFY_EXPR, TREE_TYPE (tmp_var), tmp_var, vec); - m_parent.m_cf->append_statement (vec_tmp_assign); - return tmp_var; -} - /* Visits the element(s) in the OPERAND, calling HANDLER to each of them. */ tree @@ -1757,4 +1303,3 @@ brig_code_entry_handler::int_constant_value (tree node) n = TREE_OPERAND (n, 0); return int_cst_value (n); } - diff --git a/gcc/brig/brigfrontend/brig-code-entry-handler.h b/gcc/brig/brigfrontend/brig-code-entry-handler.h index 3aa4d9eaa36..1e082c436c6 100644 --- a/gcc/brig/brigfrontend/brig-code-entry-handler.h +++ b/gcc/brig/brigfrontend/brig-code-entry-handler.h @@ -35,8 +35,6 @@ class tree_element_unary_visitor; class brig_code_entry_handler : public brig_entry_handler { public: - typedef std::map, tree> builtin_map; - brig_code_entry_handler (brig_to_generic &parent); /* Handles the brig_code data at the given pointer and adds it to the @@ -51,8 +49,6 @@ protected: tree get_tree_expr_type_for_hsa_type (BrigType16_t brig_type) const; tree get_tree_cst_for_hsa_operand (const BrigOperandConstantBytes *brigConst, tree type) const; - tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode, - BrigType16_t brig_type) const; tree get_comparison_result_type (tree source_type); tree build_code_ref (const BrigBase &ref); @@ -73,16 +69,6 @@ protected: bool needs_workitem_context_data (BrigOpcode16_t brig_opcode) const; - void unpack (tree value, tree_stl_vec &elements); - tree pack (tree_stl_vec &elements); - - bool can_expand_builtin (BrigOpcode16_t brig_opcode) const; - tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands); - - tree expand_or_call_builtin (BrigOpcode16_t brig_opcode, - BrigType16_t brig_type, tree arith_type, - tree_stl_vec &operands); - tree add_temp_var (std::string name, tree expr); tree build_f2h_conversion (tree source); @@ -100,10 +86,6 @@ protected: tree extend_int (tree input, tree dest_type, tree src_type); - /* HSAIL-specific builtin functions not yet integrated to gcc. */ - - static builtin_map s_custom_builtins; - private: tree_stl_vec build_or_analyze_operands (const BrigInstBase &brig_inst, @@ -299,9 +281,6 @@ private: tree build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode, tree arith_type, tree_stl_vec &operands); - - tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode, - BrigType16_t brig_type) const; }; class brig_cvt_inst_handler : public brig_inst_mod_handler diff --git a/gcc/brig/brigfrontend/brig-control-handler.cc b/gcc/brig/brigfrontend/brig-control-handler.cc index b7e07226028..82189e149f5 100644 --- a/gcc/brig/brigfrontend/brig-control-handler.cc +++ b/gcc/brig/brigfrontend/brig-control-handler.cc @@ -53,45 +53,45 @@ brig_directive_control_handler::operator () (const BrigBase *base) case BRIG_CONTROL_MAXDYNAMICGROUPSIZE: { m_parent.m_cf->m_descriptor.max_dynamic_group_size - = int_constant_value (operands.at (0)); + = brig_function::int_constant_value (operands.at (0)); break; } case BRIG_CONTROL_MAXFLATGRIDSIZE: { m_parent.m_cf->m_descriptor.max_flat_grid_size - = int_constant_value (operands.at (0)); + = brig_function::int_constant_value (operands.at (0)); break; } case BRIG_CONTROL_MAXFLATWORKGROUPSIZE: { m_parent.m_cf->m_descriptor.max_flat_workgroup_size - = int_constant_value (operands.at (0)); + = brig_function::int_constant_value (operands.at (0)); break; } case BRIG_CONTROL_REQUIREDDIM: { m_parent.m_cf->m_descriptor.required_dim - = int_constant_value (operands.at (0)); + = brig_function::int_constant_value (operands.at (0)); break; } case BRIG_CONTROL_REQUIREDGRIDSIZE: { m_parent.m_cf->m_descriptor.required_grid_size[0] - = int_constant_value (operands.at (0)); + = brig_function::int_constant_value (operands.at (0)); m_parent.m_cf->m_descriptor.required_grid_size[1] - = int_constant_value (operands.at (1)); + = brig_function::int_constant_value (operands.at (1)); m_parent.m_cf->m_descriptor.required_grid_size[2] - = int_constant_value (operands.at (2)); + = brig_function::int_constant_value (operands.at (2)); break; } case BRIG_CONTROL_REQUIREDWORKGROUPSIZE: { m_parent.m_cf->m_descriptor.required_workgroup_size[0] - = int_constant_value (operands.at (0)); + = brig_function::int_constant_value (operands.at (0)); m_parent.m_cf->m_descriptor.required_workgroup_size[1] - = int_constant_value (operands.at (1)); + = brig_function::int_constant_value (operands.at (1)); m_parent.m_cf->m_descriptor.required_workgroup_size[2] - = int_constant_value (operands.at (2)); + = brig_function::int_constant_value (operands.at (2)); break; } case BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS: diff --git a/gcc/brig/brigfrontend/brig-cvt-inst-handler.cc b/gcc/brig/brigfrontend/brig-cvt-inst-handler.cc index e5ac799cdf2..3b8c9ea01df 100644 --- a/gcc/brig/brigfrontend/brig-cvt-inst-handler.cc +++ b/gcc/brig/brigfrontend/brig-cvt-inst-handler.cc @@ -83,6 +83,12 @@ brig_cvt_inst_handler::generate (const BrigBase *base) tree &input = operands.at (1); tree &output = operands.at (0); + if (m_parent.m_cf->is_id_val (input)) + { + input = m_parent.m_cf->id_val (input); + src_type = TREE_TYPE (input); + } + size_t conv_src_size = int_size_in_bytes (src_type); size_t conv_dst_size = int_size_in_bytes (dest_type); size_t src_reg_size = int_size_in_bytes (TREE_TYPE (input)); diff --git a/gcc/brig/brigfrontend/brig-function-handler.cc b/gcc/brig/brigfrontend/brig-function-handler.cc index d64135db7f2..f22f065c45c 100644 --- a/gcc/brig/brigfrontend/brig-function-handler.cc +++ b/gcc/brig/brigfrontend/brig-function-handler.cc @@ -93,6 +93,25 @@ brig_directive_function_handler::operator () (const BrigBase *base) represent HSAIL registers. */ tree bind_expr = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, NULL); + tree restrict_char_ptr + = build_qualified_type (build_pointer_type (char_type_node), + TYPE_QUAL_RESTRICT); + tree restrict_void_ptr + = build_qualified_type (build_pointer_type (void_type_node), + TYPE_QUAL_RESTRICT); + + tree restrict_const_char_ptr + = build_qualified_type (build_pointer_type + (build_qualified_type (char_type_node, + TYPE_QUAL_CONST)), + TYPE_QUAL_RESTRICT); + + tree restrict_const_void_ptr + = build_qualified_type (build_pointer_type + (build_qualified_type (void_type_node, + TYPE_QUAL_CONST)), + TYPE_QUAL_RESTRICT); + if (is_kernel) { tree name_identifier @@ -107,12 +126,11 @@ brig_directive_function_handler::operator () (const BrigBase *base) 3) a void* parameter that contains the first flat address of the group region allocated to the current work-group. */ - tree char_ptr_type_node = build_pointer_type (char_type_node); fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier, build_function_type_list (void_type_node, - char_ptr_type_node, - ptr_type_node, - ptr_type_node, NULL_TREE)); + restrict_const_char_ptr, + restrict_void_ptr, + restrict_char_ptr, NULL_TREE)); SET_DECL_ASSEMBLER_NAME (fndecl, name_identifier); @@ -125,9 +143,10 @@ brig_directive_function_handler::operator () (const BrigBase *base) = gccbrig_get_target_addr_space_id (BRIG_SEGMENT_KERNARG); tree arg_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, - get_identifier ("__args"), char_ptr_type_node); + get_identifier ("__args"), + restrict_const_char_ptr); DECL_ARGUMENTS (fndecl) = arg_arg; - DECL_ARG_TYPE (arg_arg) = char_ptr_type_node; + DECL_ARG_TYPE (arg_arg) = restrict_const_char_ptr; DECL_CONTEXT (arg_arg) = fndecl; DECL_ARTIFICIAL (arg_arg) = 1; TREE_READONLY (arg_arg) = 1; @@ -189,7 +208,7 @@ brig_directive_function_handler::operator () (const BrigBase *base) if (arg_decls == NULL_TREE) arg_decls = arg_var; else - chainon (arg_decls, arg_var); + arg_decls = chainon (arg_decls, arg_var); m_parent.m_cf->add_arg_variable (brigVar, arg_var); @@ -230,18 +249,13 @@ brig_directive_function_handler::operator () (const BrigBase *base) vec_safe_push (args, TREE_TYPE (arg_var)); m_parent.m_cf->add_arg_variable (brigVar, arg_var); - - if (arg_decls == NULL_TREE) - arg_decls = arg_var; - else - chainon (arg_decls, arg_var); + arg_decls = chainon (arg_decls, arg_var); } } - - vec_safe_push (args, ptr_type_node); - vec_safe_push (args, ptr_type_node); - vec_safe_push (args, ptr_type_node); - vec_safe_push (args, ptr_type_node); + vec_safe_push (args, restrict_void_ptr); + vec_safe_push (args, restrict_char_ptr); + vec_safe_push (args, uint32_type_node); + vec_safe_push (args, restrict_char_ptr); fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier, build_function_type_vec (ret_type, args)); @@ -254,26 +268,30 @@ brig_directive_function_handler::operator () (const BrigBase *base) /* All functions need the hidden __context argument passed on because they might call WI-specific functions which need - the context info. */ + the context info. Only kernels can write it, if they need + to update the local ids in the work-item loop. */ + + tree context_arg_type + = true ? restrict_void_ptr : restrict_const_void_ptr; tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, - get_identifier ("__context"), ptr_type_node); - if (DECL_ARGUMENTS (fndecl) == NULL_TREE) - DECL_ARGUMENTS (fndecl) = context_arg; - else - chainon (DECL_ARGUMENTS (fndecl), context_arg); + get_identifier ("__context"), + context_arg_type); + DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), context_arg); DECL_CONTEXT (context_arg) = fndecl; - DECL_ARG_TYPE (context_arg) = ptr_type_node; + DECL_ARG_TYPE (context_arg) = context_arg_type; DECL_ARTIFICIAL (context_arg) = 1; TREE_READONLY (context_arg) = 1; TREE_USED (context_arg) = 1; + m_parent.m_cf->m_context_arg = context_arg; /* They can also access group memory, so we need to pass the group pointer along too. */ tree group_base_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, - get_identifier ("__group_base_addr"), ptr_type_node); - chainon (DECL_ARGUMENTS (fndecl), group_base_arg); - DECL_ARG_TYPE (group_base_arg) = ptr_type_node; + get_identifier ("__group_base_addr"), + restrict_char_ptr); + DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_base_arg); + DECL_ARG_TYPE (group_base_arg) = restrict_char_ptr; DECL_CONTEXT (group_base_arg) = fndecl; DECL_ARTIFICIAL (group_base_arg) = 1; TREE_READONLY (group_base_arg) = 1; @@ -288,7 +306,7 @@ brig_directive_function_handler::operator () (const BrigBase *base) tree group_local_offset_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, get_identifier ("__group_local_offset"), uint32_type_node); - chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg); + DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg); DECL_ARG_TYPE (group_local_offset_arg) = uint32_type_node; DECL_CONTEXT (group_local_offset_arg) = fndecl; DECL_ARTIFICIAL (group_local_offset_arg) = 1; @@ -299,24 +317,25 @@ brig_directive_function_handler::operator () (const BrigBase *base) /* Same for private. */ tree private_base_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, - get_identifier ("__private_base_addr"), ptr_type_node); - chainon (DECL_ARGUMENTS (fndecl), private_base_arg); - DECL_ARG_TYPE (private_base_arg) = ptr_type_node; + get_identifier ("__private_base_addr"), restrict_char_ptr); + DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), private_base_arg); + DECL_ARG_TYPE (private_base_arg) = restrict_char_ptr; DECL_CONTEXT (private_base_arg) = fndecl; DECL_ARTIFICIAL (private_base_arg) = 1; TREE_READONLY (private_base_arg) = 1; TREE_USED (private_base_arg) = 1; + m_parent.m_cf->m_private_base_arg = private_base_arg; DECL_SAVED_TREE (fndecl) = bind_expr; - set_externally_visible (fndecl); - if (base->kind == BRIG_KIND_DIRECTIVE_FUNCTION) { TREE_STATIC (fndecl) = 0; TREE_PUBLIC (fndecl) = 1; DECL_EXTERNAL (fndecl) = 0; DECL_DECLARED_INLINE_P (fndecl) = 1; + set_inline (fndecl); + set_externally_visible (fndecl); } else if (base->kind == BRIG_KIND_DIRECTIVE_KERNEL) { @@ -330,6 +349,7 @@ brig_directive_function_handler::operator () (const BrigBase *base) TREE_STATIC (fndecl) = 0; TREE_PUBLIC (fndecl) = 1; DECL_EXTERNAL (fndecl) = 1; + set_inline (fndecl); } else if (base->kind == BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION) { @@ -371,11 +391,8 @@ brig_directive_function_handler::operator () (const BrigBase *base) } m_parent.start_function (fndecl); - m_parent.m_cf->m_func_decl = fndecl; m_parent.m_cf->m_current_bind_expr = bind_expr; - m_parent.m_cf->m_context_arg = context_arg; - m_parent.m_cf->m_private_base_arg = private_base_arg; if (ret_value != NULL_TREE && TREE_TYPE (ret_value) != void_type_node) { diff --git a/gcc/brig/brigfrontend/brig-function.cc b/gcc/brig/brigfrontend/brig-function.cc index e1a14da8b72..f0c499d47f6 100644 --- a/gcc/brig/brigfrontend/brig-function.cc +++ b/gcc/brig/brigfrontend/brig-function.cc @@ -44,6 +44,12 @@ #include "function.h" #include "brig-to-generic.h" #include "brig-builtins.h" +#include "options.h" +#include "fold-const.h" +#include "target.h" +#include "builtins.h" + +brig_function::builtin_map brig_function::s_custom_builtins; brig_function::brig_function (const BrigDirectiveExecutable *exec, brig_to_generic *parent) @@ -60,6 +66,20 @@ brig_function::brig_function (const BrigDirectiveExecutable *exec, memset (m_regs, 0, BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT * sizeof (BrigOperandRegister *)); memset (&m_descriptor, 0, sizeof (phsa_descriptor)); + + if (s_custom_builtins.size () > 0) return; + + /* Populate the builtin index. */ +#undef DEF_HSAIL_ATOMIC_BUILTIN +#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN +#undef DEF_HSAIL_INTR_BUILTIN +#undef DEF_HSAIL_SAT_BUILTIN +#undef DEF_HSAIL_BUILTIN +#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \ + s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)] \ + = builtin_decl_explicit (ENUM); + +#include "brig-builtins.def" } brig_function::~brig_function () @@ -158,8 +178,7 @@ brig_function::add_id_variables () tree stmts = BIND_EXPR_BODY (bind_expr); /* Initialize the WG limits and local ids. */ - - tree_stmt_iterator entry = tsi_start (stmts); + m_kernel_entry = tsi_start (stmts); for (int i = 0; i < 3; ++i) { @@ -169,7 +188,7 @@ brig_function::add_id_variables () to avoid unnecessary casts (the ID functions are 32b). */ m_local_id_vars[i] = add_local_variable (std::string ("__local_") + dim_char, - uint32_type_node); + long_long_integer_type_node); tree workitemid_call = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKITEMID), 2, @@ -178,54 +197,88 @@ brig_function::add_id_variables () m_context_arg); tree id_init = build2 (MODIFY_EXPR, TREE_TYPE (m_local_id_vars[i]), - m_local_id_vars[i], workitemid_call); + m_local_id_vars[i], + convert (TREE_TYPE (m_local_id_vars[i]), + workitemid_call)); - tsi_link_after (&entry, id_init, TSI_NEW_STMT); + append_statement (id_init); m_cur_wg_size_vars[i] = add_local_variable (std::string ("__cur_wg_size_") + dim_char, - uint32_type_node); + long_long_integer_type_node); - tree cwgz_call - = call_builtin - (builtin_decl_explicit (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE), - 2, uint32_type_node, uint32_type_node, - build_int_cst (uint32_type_node, i), ptr_type_node, m_context_arg); + tree cwgz_call; + if (flag_assume_phsa) + { + tree_stl_vec operands + = tree_stl_vec (1, build_int_cst (uint32_type_node, i)); + cwgz_call + = expand_or_call_builtin (BRIG_OPCODE_CURRENTWORKGROUPSIZE, + BRIG_TYPE_U32, uint32_type_node, + operands); + } + else + cwgz_call = call_builtin + (builtin_decl_explicit (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE), + 2, uint32_type_node, uint32_type_node, + build_int_cst (uint32_type_node, i), ptr_type_node, m_context_arg); tree limit_init = build2 (MODIFY_EXPR, TREE_TYPE (m_cur_wg_size_vars[i]), - m_cur_wg_size_vars[i], cwgz_call); + m_cur_wg_size_vars[i], + convert (TREE_TYPE (m_cur_wg_size_vars[i]), + cwgz_call)); - tsi_link_after (&entry, limit_init, TSI_NEW_STMT); + append_statement (limit_init); m_wg_id_vars[i] = add_local_variable (std::string ("__workgroupid_") + dim_char, uint32_type_node); - tree wgid_call - = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPID), - 2, uint32_type_node, uint32_type_node, - build_int_cst (uint32_type_node, i), ptr_type_node, - m_context_arg); + tree wgid_call; + if (flag_assume_phsa) + { + tree_stl_vec operands + = tree_stl_vec (1, build_int_cst (uint32_type_node, i)); + wgid_call + = expand_or_call_builtin (BRIG_OPCODE_WORKGROUPID, BRIG_TYPE_U32, + uint32_type_node, operands); + } + else + wgid_call + = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPID), + 2, uint32_type_node, uint32_type_node, + build_int_cst (uint32_type_node, i), ptr_type_node, + m_context_arg); tree wgid_init = build2 (MODIFY_EXPR, TREE_TYPE (m_wg_id_vars[i]), m_wg_id_vars[i], wgid_call); - tsi_link_after (&entry, wgid_init, TSI_NEW_STMT); + append_statement (wgid_init); m_wg_size_vars[i] = add_local_variable (std::string ("__workgroupsize_") + dim_char, uint32_type_node); - tree wgsize_call - = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPSIZE), - 2, uint32_type_node, uint32_type_node, - build_int_cst (uint32_type_node, i), ptr_type_node, - m_context_arg); + tree wgsize_call; + if (flag_assume_phsa) + { + tree_stl_vec operands + = tree_stl_vec (1, build_int_cst (uint32_type_node, i)); + wgsize_call + = expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32, + uint32_type_node, operands); + } + else + wgsize_call + = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPSIZE), + 2, uint32_type_node, uint32_type_node, + build_int_cst (uint32_type_node, i), ptr_type_node, + m_context_arg); tree wgsize_init = build2 (MODIFY_EXPR, TREE_TYPE (m_wg_size_vars[i]), m_wg_size_vars[i], wgsize_call); - tsi_link_after (&entry, wgsize_init, TSI_NEW_STMT); + append_statement (wgsize_init); m_grid_size_vars[i] = add_local_variable (std::string ("__gridsize_") + dim_char, @@ -240,10 +293,34 @@ brig_function::add_id_variables () tree gridsize_init = build2 (MODIFY_EXPR, TREE_TYPE (m_grid_size_vars[i]), m_grid_size_vars[i], gridsize_call); - tsi_link_after (&entry, gridsize_init, TSI_NEW_STMT); + append_statement (gridsize_init); + + m_abs_id_base_vars[i] + = add_local_variable (std::string ("__abs_id_base_") + dim_char, + long_long_integer_type_node); + + m_abs_id_vars[i] + = add_local_variable (std::string ("__abs_id_") + dim_char, + long_long_integer_type_node); + + tree abs_id_base + = build2 (MULT_EXPR, long_long_integer_type_node, + convert (long_long_integer_type_node, m_wg_id_vars[i]), + convert (long_long_integer_type_node, m_wg_size_vars[i])); + tree abs_id + = build2 (PLUS_EXPR, long_long_integer_type_node, abs_id_base, + convert (long_long_integer_type_node, m_local_id_vars[i])); + + tree abs_id_base_init + = build2 (MODIFY_EXPR, TREE_TYPE (m_abs_id_base_vars[i]), + m_abs_id_base_vars[i], abs_id_base); + append_statement (abs_id_base_init); + + tree abs_id_init = build2 (MODIFY_EXPR, + TREE_TYPE (m_abs_id_vars[i]), + m_abs_id_vars[i], abs_id); + append_statement (abs_id_init); } - - m_kernel_entry = entry; } /* Creates a new local variable with the given NAME and given GENERIC @@ -359,6 +436,8 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry, tree_stmt_iterator *branch_after) { tree ivar = m_local_id_vars[dim]; + tree abs_id_base_var = m_abs_id_base_vars[dim]; + tree abs_id_var = m_abs_id_vars[dim]; tree ivar_max = m_cur_wg_size_vars[dim]; tree_stmt_iterator entry = *header_entry; @@ -371,6 +450,12 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry, build_zero_cst (TREE_TYPE (ivar))); tsi_link_after (&entry, ivar_init, TSI_NEW_STMT); + tree abs_id_var_init = build2 (MODIFY_EXPR, TREE_TYPE (abs_id_var), + abs_id_var, + convert (TREE_TYPE (abs_id_var), + abs_id_base_var)); + tsi_link_after (&entry, abs_id_var_init, TSI_NEW_STMT); + tree loop_body_label = label (std::string ("__wi_loop_") + (char) ((int) 'x' + dim)); tree loop_body_label_stmt = build_stmt (LABEL_EXPR, loop_body_label); @@ -379,16 +464,30 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry, if (m_has_unexpanded_dp_builtins) { - tree id_set_builtin - = builtin_decl_explicit (BUILT_IN_HSAIL_SETWORKITEMID); - /* Set the local ID to the current wi-loop iteration variable value to - ensure the builtins see the correct values. */ - tree id_set_call - = call_builtin (id_set_builtin, 3, - void_type_node, uint32_type_node, - build_int_cst (uint32_type_node, dim), uint32_type_node, - ivar, ptr_type_node, m_context_arg); - tsi_link_after (&entry, id_set_call, TSI_NEW_STMT); + if (!flag_assume_phsa) + { + tree id_set_builtin + = builtin_decl_explicit (BUILT_IN_HSAIL_SETWORKITEMID); + /* Set the local ID to the current wi-loop iteration variable value + to ensure the builtins see the correct values. */ + tree id_set_call + = call_builtin (id_set_builtin, 3, + void_type_node, uint32_type_node, + build_int_cst (uint32_type_node, dim), + uint32_type_node, convert (uint32_type_node, ivar), + ptr_type_node, m_context_arg); + tsi_link_after (&entry, id_set_call, TSI_NEW_STMT); + } + else + { + tree ptr_type = build_pointer_type (uint32_type_node); + tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg, + build_int_cst (ptr_type, dim * 4)); + tree assign = build2 (MODIFY_EXPR, uint32_type_node, ctx, + convert (uint32_type_node, ivar)); + + tsi_link_after (&entry, assign, TSI_NEW_STMT); + } } /* Increment the WI iteration variable. */ @@ -397,6 +496,13 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry, tsi_link_after (branch_after, incr, TSI_NEW_STMT); + /* ...and the abs id variable. */ + tree abs_id_incr = build2 (PREINCREMENT_EXPR, TREE_TYPE (abs_id_var), + abs_id_var, + build_one_cst (TREE_TYPE (abs_id_var))); + + tsi_link_after (branch_after, abs_id_incr, TSI_NEW_STMT); + /* Append the predicate check with the back edge goto. */ tree condition = build2 (LT_EXPR, TREE_TYPE (ivar), ivar, ivar_max); tree target_goto = build1 (GOTO_EXPR, void_type_node, loop_body_label); @@ -549,29 +655,36 @@ brig_function::emit_launcher_and_metadata () tree name_identifier = get_identifier_with_length (kern_name.c_str (), kern_name.size ()); + tree restrict_void_ptr + = build_qualified_type (build_pointer_type (void_type_node), + TYPE_QUAL_RESTRICT); + tree restrict_char_ptr + = build_qualified_type (build_pointer_type (char_type_node), + TYPE_QUAL_RESTRICT); tree launcher = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier, - build_function_type_list (void_type_node, ptr_type_node, - ptr_type_node, NULL_TREE)); + build_function_type_list (void_type_node, restrict_void_ptr, + restrict_char_ptr, NULL_TREE)); TREE_USED (launcher) = 1; DECL_ARTIFICIAL (launcher) = 1; tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, - get_identifier ("__context"), ptr_type_node); + get_identifier ("__context"), + restrict_void_ptr); DECL_ARGUMENTS (launcher) = context_arg; - DECL_ARG_TYPE (context_arg) = ptr_type_node; + DECL_ARG_TYPE (context_arg) = restrict_void_ptr; DECL_CONTEXT (context_arg) = launcher; TREE_USED (context_arg) = 1; DECL_ARTIFICIAL (context_arg) = 1; tree group_base_addr_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, - get_identifier ("__group_base_addr"), ptr_type_node); + get_identifier ("__group_base_addr"), restrict_char_ptr); chainon (DECL_ARGUMENTS (launcher), group_base_addr_arg); - DECL_ARG_TYPE (group_base_addr_arg) = ptr_type_node; + DECL_ARG_TYPE (group_base_addr_arg) = restrict_char_ptr; DECL_CONTEXT (group_base_addr_arg) = launcher; TREE_USED (group_base_addr_arg) = 1; DECL_ARTIFICIAL (group_base_addr_arg) = 1; @@ -618,15 +731,15 @@ brig_function::emit_launcher_and_metadata () phsail_launch_kernel_call = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_LAUNCH_WG_FUNC), 4, void_type_node, - ptr_type_node, kernel_func_ptr, ptr_type_node, - context_arg, ptr_type_node, group_base_addr_arg, + ptr_type_node, kernel_func_ptr, restrict_void_ptr, + context_arg, restrict_char_ptr, group_base_addr_arg, uint32_type_node, group_local_offset_arg); else phsail_launch_kernel_call = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_LAUNCH_KERNEL), 4, void_type_node, - ptr_type_node, kernel_func_ptr, ptr_type_node, - context_arg, ptr_type_node, group_base_addr_arg, + ptr_type_node, kernel_func_ptr, restrict_void_ptr, + context_arg, restrict_char_ptr, group_base_addr_arg, uint32_type_node, group_local_offset_arg); append_to_statement_list_force (phsail_launch_kernel_call, &stmt_list); @@ -771,3 +884,719 @@ brig_function::group_variable_segment_offset (const std::string &name) const gcc_assert (m_parent->m_module_group_variables.has_variable (name)); return m_parent->m_module_group_variables.segment_offset (name); } + +/* Try to expand the given builtin call to reuse a previously generated + variable, if possible. If not, just call the given builtin. + BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type, + ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's + input operands. */ + +tree +brig_function::expand_or_call_builtin (BrigOpcode16_t brig_opcode, + BrigType16_t brig_type, + tree arith_type, + tree_stl_vec &operands) +{ + if (needs_workitem_context_data (brig_opcode)) + m_has_unexpanded_dp_builtins = true; + + if (can_expand_builtin (brig_opcode)) + return expand_builtin (brig_opcode, operands); + + tree built_in + = get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type); + + if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in))) + && arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type) + && brig_opcode != BRIG_OPCODE_LERP + && brig_opcode != BRIG_OPCODE_PACKCVT + && brig_opcode != BRIG_OPCODE_SAD + && brig_opcode != BRIG_OPCODE_SADHI) + { + /* Call the scalar built-in for all elements in the vector. */ + tree_stl_vec operand0_elements; + if (operands.size () > 0) + unpack (operands[0], operand0_elements); + + tree_stl_vec operand1_elements; + if (operands.size () > 1) + unpack (operands[1], operand1_elements); + + tree_stl_vec result_elements; + + size_t element_count = gccbrig_type_vector_subparts (arith_type); + for (size_t i = 0; i < element_count; ++i) + { + tree_stl_vec call_operands; + if (operand0_elements.size () > 0) + call_operands.push_back (operand0_elements.at (i)); + + if (operand1_elements.size () > 0) + call_operands.push_back (operand1_elements.at (i)); + + result_elements.push_back + (expand_or_call_builtin (brig_opcode, brig_type, + TREE_TYPE (arith_type), + call_operands)); + } + return pack (result_elements); + } + + tree_stl_vec call_operands; + tree_stl_vec operand_types; + + tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in)); + + for (size_t i = 0; i < operands.size (); ++i) + { + tree operand_type = TREE_VALUE (arg_type_chain); + call_operands.push_back (convert (operand_type, operands[i])); + operand_types.push_back (operand_type); + arg_type_chain = TREE_CHAIN (arg_type_chain); + } + + if (needs_workitem_context_data (brig_opcode)) + { + call_operands.push_back (m_context_arg); + operand_types.push_back (ptr_type_node); + } + + size_t operand_count = call_operands.size (); + + call_operands.resize (4, NULL_TREE); + operand_types.resize (4, NULL_TREE); + for (size_t i = 0; i < operand_count; ++i) + call_operands.at (i) = build_resize_convert_view (operand_types.at (i), + call_operands.at (i)); + + tree fnptr = build_fold_addr_expr (built_in); + return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr, + operand_count, &call_operands[0]); +} + +/* Instead of calling a built-in function, use a more efficient mechanism + such as reuse a previously returned value known to be still valid, or + access the work-item context struct directly. This is beneficial especially + for the work-item identification related builtins as not having them as + unanalyzable black box calls can lead to more easily vectorizable parallel + loops for multi work-item work-groups. BRIG_OPCODE identifies the builtin + and OPERANDS store the operands. */ + +tree +brig_function::expand_builtin (BrigOpcode16_t brig_opcode, + tree_stl_vec &operands) +{ + tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0)); + + tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1)); + + tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2)); + + if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID) + { + tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0); + id0 = convert (uint64_type_node, id0); + + tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1); + id1 = convert (uint64_type_node, id1); + + tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2); + id2 = convert (uint64_type_node, id2); + + tree max0 = convert (uint64_type_node, m_grid_size_vars[0]); + tree max1 = convert (uint64_type_node, m_grid_size_vars[1]); + + tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0); + id2_x_max0_x_max1 + = build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1); + + tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0); + + tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0); + sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1); + + return add_temp_var ("workitemflatabsid", sum); + } + else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID) + { + HOST_WIDE_INT dim = int_constant_value (operands[0]); + return m_abs_id_vars[dim]; + } + else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID) + { + + tree wg_size_x = expand_builtin (BRIG_OPCODE_WORKGROUPSIZE, uint32_0); + tree wg_size_y = expand_builtin (BRIG_OPCODE_WORKGROUPSIZE, uint32_1); + tree z_x_wgsx_wgsy + = build2 (MULT_EXPR, uint32_type_node, + convert (uint32_type_node, + expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_2)), + wg_size_x); + z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy, + wg_size_y); + + tree y_x_wgsx + = build2 (MULT_EXPR, uint32_type_node, + convert (uint32_type_node, + expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_1)), + wg_size_x); + + tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy); + sum = build2 (PLUS_EXPR, uint32_type_node, + convert (uint32_type_node, + expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_0)), + sum); + return add_temp_var ("workitemflatid", sum); + } + else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE) + { + HOST_WIDE_INT dim = int_constant_value (operands[0]); + if (flag_assume_phsa) + { + tree ptr_type = build_pointer_type (uint32_type_node); + tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg, + build_int_cst (ptr_type, + PHSA_CONTEXT_WG_SIZES + + dim * 4)); + std::string name ("wgsize_x"); + name [name.length() - 1] += dim; + return add_temp_var (name.c_str(), ctx); + } + else if (m_is_kernel) + { + /* For kernels without phsa we generate certain temps before + the WI loop, which means we don't need to rely on LICM to get + them moved out. */ + return m_wg_size_vars[dim]; + } + else + gcc_unreachable (); + } + else if (brig_opcode == BRIG_OPCODE_WORKITEMID) + { + HOST_WIDE_INT dim = int_constant_value (operands[0]); + if (m_is_kernel) + { + return m_local_id_vars [dim]; + } + else if (flag_assume_phsa) + { + tree ptr_type = build_pointer_type (uint32_type_node); + tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg, + build_int_cst (ptr_type, + PHSA_CONTEXT_OFFS_WI_IDS + + dim * 4)); + std::string name ("wiid_x"); + name [name.length() - 1] += dim; + return add_temp_var (name.c_str(), ctx); + } + else + gcc_unreachable (); + } + else if (brig_opcode == BRIG_OPCODE_WORKGROUPID) + { + HOST_WIDE_INT dim = int_constant_value (operands[0]); + if (flag_assume_phsa) + { + tree ptr_type = build_pointer_type (uint32_type_node); + tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg, + build_int_cst (ptr_type, + PHSA_CONTEXT_OFFS_WG_IDS + + dim * 4)); + std::string name ("wgid_x"); + name [name.length() - 1] += dim; + return add_temp_var (name.c_str(), ctx); + } else if (m_is_kernel) + return m_wg_id_vars [dim]; + else + gcc_unreachable (); + } + else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE) + { + HOST_WIDE_INT dim = int_constant_value (operands[0]); + if (flag_assume_phsa) + { + tree ptr_type = build_pointer_type (uint32_type_node); + tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg, + build_int_cst (ptr_type, + PHSA_CONTEXT_CURRENT_WG_SIZES + + dim * 4)); + std::string name ("curwgsize_x"); + name [name.length() - 1] += dim; + return add_temp_var (name.c_str(), ctx); + } else if (m_is_kernel) + return m_cur_wg_size_vars[dim]; + else + gcc_unreachable (); + } + else + gcc_unreachable (); + + return NULL_TREE; +} + +/* Returns true in case the given opcode that would normally be generated + as a builtin call can be expanded to tree nodes. */ + +bool +brig_function::can_expand_builtin (BrigOpcode16_t brig_opcode) const +{ + switch (brig_opcode) + { + case BRIG_OPCODE_CURRENTWORKGROUPSIZE: + case BRIG_OPCODE_WORKITEMFLATID: + case BRIG_OPCODE_WORKITEMID: + case BRIG_OPCODE_WORKGROUPID: + case BRIG_OPCODE_WORKGROUPSIZE: + return m_is_kernel || flag_assume_phsa; + case BRIG_OPCODE_WORKITEMFLATABSID: + case BRIG_OPCODE_WORKITEMABSID: + return m_is_kernel; + default: + return false; + }; +} + +/* In case the HSA instruction must be implemented using a builtin, + this function is called to get the correct builtin function. + TYPE is the instruction tree type, BRIG_OPCODE the opcode of the + brig instruction and BRIG_TYPE the brig instruction's type. */ + +tree +brig_function::get_builtin_for_hsa_opcode + (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const +{ + tree builtin = NULL_TREE; + tree builtin_type = type; + + /* For vector types, first find the scalar version of the builtin. */ + if (type != NULL_TREE && VECTOR_TYPE_P (type)) + builtin_type = TREE_TYPE (type); + BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK; + + /* Some BRIG opcodes can use the same builtins for unsigned and + signed types. Force these cases to unsigned types. */ + + if (brig_opcode == BRIG_OPCODE_BORROW + || brig_opcode == BRIG_OPCODE_CARRY + || brig_opcode == BRIG_OPCODE_LASTBIT + || brig_opcode == BRIG_OPCODE_BITINSERT) + { + if (brig_type == BRIG_TYPE_S32) + brig_type = BRIG_TYPE_U32; + else if (brig_type == BRIG_TYPE_S64) + brig_type = BRIG_TYPE_U64; + } + + switch (brig_opcode) + { + case BRIG_OPCODE_FLOOR: + builtin = mathfn_built_in (builtin_type, BUILT_IN_FLOOR); + break; + case BRIG_OPCODE_CEIL: + builtin = mathfn_built_in (builtin_type, BUILT_IN_CEIL); + break; + case BRIG_OPCODE_SQRT: + case BRIG_OPCODE_NSQRT: + builtin = mathfn_built_in (builtin_type, BUILT_IN_SQRT); + break; + case BRIG_OPCODE_RINT: + builtin = mathfn_built_in (builtin_type, BUILT_IN_RINT); + break; + case BRIG_OPCODE_TRUNC: + builtin = mathfn_built_in (builtin_type, BUILT_IN_TRUNC); + break; + case BRIG_OPCODE_COPYSIGN: + builtin = mathfn_built_in (builtin_type, BUILT_IN_COPYSIGN); + break; + case BRIG_OPCODE_NSIN: + builtin = mathfn_built_in (builtin_type, BUILT_IN_SIN); + break; + case BRIG_OPCODE_NLOG2: + builtin = mathfn_built_in (builtin_type, BUILT_IN_LOG2); + break; + case BRIG_OPCODE_NEXP2: + builtin = mathfn_built_in (builtin_type, BUILT_IN_EXP2); + break; + case BRIG_OPCODE_NFMA: + builtin = mathfn_built_in (builtin_type, BUILT_IN_FMA); + break; + case BRIG_OPCODE_NCOS: + builtin = mathfn_built_in (builtin_type, BUILT_IN_COS); + break; + case BRIG_OPCODE_POPCOUNT: + /* Popcount should be typed by its argument type (the return value + is always u32). Let's use a b64 version for also for b32 for now. */ + return builtin_decl_explicit (BUILT_IN_POPCOUNTL); + case BRIG_OPCODE_BORROW: + /* Borrow uses the same builtin for unsigned and signed types. */ + if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32) + return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U32); + else + return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U64); + case BRIG_OPCODE_CARRY: + /* Carry also uses the same builtin for unsigned and signed types. */ + if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32) + return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U32); + else + return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U64); + default: + + /* Use our builtin index for finding a proper builtin for the BRIG + opcode and BRIG type. This takes care most of the builtin cases, + the special cases are handled in the separate 'case' statements + above. */ + builtin_map::const_iterator i + = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type)); + if (i != s_custom_builtins.end ()) + return (*i).second; + + if (brig_inner_type != brig_type) + { + /* Try to find a scalar built-in we could use. */ + i = s_custom_builtins.find + (std::make_pair (brig_opcode, brig_inner_type)); + if (i != s_custom_builtins.end ()) + return (*i).second; + } + + /* In case this is an fp16 operation that is promoted to fp32, + try to find a fp32 scalar built-in. */ + if (brig_inner_type == BRIG_TYPE_F16) + { + i = s_custom_builtins.find + (std::make_pair (brig_opcode, BRIG_TYPE_F32)); + if (i != s_custom_builtins.end ()) + return (*i).second; + } + gcc_unreachable (); + } + + if (VECTOR_TYPE_P (type) && builtin != NULL_TREE) + { + /* Try to find a vectorized version of the built-in. + TODO: properly assert that builtin is a mathfn builtin? */ + tree vec_builtin + = targetm.vectorize.builtin_vectorized_function + (builtin_mathfn_code (builtin), type, type); + if (vec_builtin != NULL_TREE) + return vec_builtin; + else + return builtin; + } + if (builtin == NULL_TREE) + gcc_unreachable (); + return builtin; +} + +/* Unpacks the elements of the vector in VALUE to scalars (bit field + references) in ELEMENTS. */ + +void +brig_function::unpack (tree value, tree_stl_vec &elements) +{ + size_t vec_size = int_size_in_bytes (TREE_TYPE (value)); + size_t element_size + = int_size_in_bytes (TREE_TYPE (TREE_TYPE (value))) * BITS_PER_UNIT; + size_t element_count + = vec_size * BITS_PER_UNIT / element_size; + + tree input_element_type = TREE_TYPE (TREE_TYPE (value)); + + value = add_temp_var ("unpack_input", value); + + for (size_t i = 0; i < element_count; ++i) + { + tree element + = build3 (BIT_FIELD_REF, input_element_type, value, + TYPE_SIZE (input_element_type), + bitsize_int(i * element_size)); + + element = add_temp_var ("scalar", element); + elements.push_back (element); + } +} + +/* Pack the elements of the scalars in ELEMENTS to the returned vector. */ + +tree +brig_function::pack (tree_stl_vec &elements) +{ + size_t element_count = elements.size (); + + gcc_assert (element_count > 1); + + tree output_element_type = TREE_TYPE (elements.at (0)); + + vec *constructor_vals = NULL; + for (size_t i = 0; i < element_count; ++i) + CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, elements.at (i)); + + tree vec_type = build_vector_type (output_element_type, element_count); + + /* build_constructor creates a vector type which is not a vector_cst + that requires compile time constant elements. */ + tree vec = build_constructor (vec_type, constructor_vals); + + /* Add a temp variable for readability. */ + tree tmp_var = create_tmp_var (vec_type, "vec_out"); + tree vec_tmp_assign = build2 (MODIFY_EXPR, TREE_TYPE (tmp_var), tmp_var, vec); + append_statement (vec_tmp_assign); + return tmp_var; +} + +/* Returns true in case the given opcode needs to know about work-item context + data. In such case the context data is passed as a pointer to a work-item + context object, as the last argument in the builtin call. */ + +bool +brig_function::needs_workitem_context_data +(BrigOpcode16_t brig_opcode) +{ + switch (brig_opcode) + { + case BRIG_OPCODE_WORKITEMABSID: + case BRIG_OPCODE_WORKITEMFLATABSID: + case BRIG_OPCODE_WORKITEMFLATID: + case BRIG_OPCODE_CURRENTWORKITEMFLATID: + case BRIG_OPCODE_WORKITEMID: + case BRIG_OPCODE_WORKGROUPID: + case BRIG_OPCODE_WORKGROUPSIZE: + case BRIG_OPCODE_CURRENTWORKGROUPSIZE: + case BRIG_OPCODE_GRIDGROUPS: + case BRIG_OPCODE_GRIDSIZE: + case BRIG_OPCODE_DIM: + case BRIG_OPCODE_PACKETID: + case BRIG_OPCODE_PACKETCOMPLETIONSIG: + case BRIG_OPCODE_BARRIER: + case BRIG_OPCODE_WAVEBARRIER: + case BRIG_OPCODE_ARRIVEFBAR: + case BRIG_OPCODE_INITFBAR: + case BRIG_OPCODE_JOINFBAR: + case BRIG_OPCODE_LEAVEFBAR: + case BRIG_OPCODE_RELEASEFBAR: + case BRIG_OPCODE_WAITFBAR: + case BRIG_OPCODE_CUID: + case BRIG_OPCODE_MAXCUID: + case BRIG_OPCODE_DEBUGTRAP: + case BRIG_OPCODE_GROUPBASEPTR: + case BRIG_OPCODE_KERNARGBASEPTR: + case BRIG_OPCODE_ALLOCA: + return true; + default: + return false; + }; +} + +/* Appends and returns a new temp variable and an accompanying assignment + statement that stores the value of the given EXPR and has the given NAME. */ + +tree +brig_function::add_temp_var (std::string name, tree expr) +{ + tree temp_var = create_tmp_var (TREE_TYPE (expr), name.c_str ()); + tree assign = build2 (MODIFY_EXPR, TREE_TYPE (temp_var), temp_var, expr); + append_statement (assign); + return temp_var; +} + +/* Returns the integer constant value of the given node. + If it's a cast, looks into the source of the cast. */ + +HOST_WIDE_INT +brig_function::int_constant_value (tree node) +{ + tree n = node; + if (TREE_CODE (n) == VIEW_CONVERT_EXPR) + n = TREE_OPERAND (n, 0); + return int_cst_value (n); +} + +/* Returns the tree code that should be used to implement the given + HSA instruction opcode (BRIG_OPCODE) for the given type of instruction + (BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly, + returns TREE_LIST (if it can be emulated with a simple chain of tree + nodes) or CALL_EXPR if the opcode should be implemented using a builtin + call. */ + +tree_code +brig_function::get_tree_code_for_hsa_opcode + (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) +{ + BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK; + switch (brig_opcode) + { + case BRIG_OPCODE_NOP: + return NOP_EXPR; + case BRIG_OPCODE_ADD: + return PLUS_EXPR; + case BRIG_OPCODE_CMOV: + if (brig_inner_type == brig_type) + return COND_EXPR; + else + return VEC_COND_EXPR; + case BRIG_OPCODE_SUB: + return MINUS_EXPR; + case BRIG_OPCODE_MUL: + case BRIG_OPCODE_MUL24: + return MULT_EXPR; + case BRIG_OPCODE_MULHI: + case BRIG_OPCODE_MUL24HI: + return MULT_HIGHPART_EXPR; + case BRIG_OPCODE_DIV: + if (gccbrig_is_float_type (brig_inner_type)) + return RDIV_EXPR; + else + return TRUNC_DIV_EXPR; + case BRIG_OPCODE_NEG: + return NEGATE_EXPR; + case BRIG_OPCODE_MIN: + if (gccbrig_is_float_type (brig_inner_type)) + return CALL_EXPR; + else + return MIN_EXPR; + case BRIG_OPCODE_MAX: + if (gccbrig_is_float_type (brig_inner_type)) + return CALL_EXPR; + else + return MAX_EXPR; + case BRIG_OPCODE_FMA: + return FMA_EXPR; + case BRIG_OPCODE_ABS: + return ABS_EXPR; + case BRIG_OPCODE_SHL: + return LSHIFT_EXPR; + case BRIG_OPCODE_SHR: + return RSHIFT_EXPR; + case BRIG_OPCODE_OR: + return BIT_IOR_EXPR; + case BRIG_OPCODE_XOR: + return BIT_XOR_EXPR; + case BRIG_OPCODE_AND: + return BIT_AND_EXPR; + case BRIG_OPCODE_NOT: + return BIT_NOT_EXPR; + case BRIG_OPCODE_RET: + return RETURN_EXPR; + case BRIG_OPCODE_MOV: + case BRIG_OPCODE_LDF: + return MODIFY_EXPR; + case BRIG_OPCODE_LD: + case BRIG_OPCODE_ST: + return MEM_REF; + case BRIG_OPCODE_BR: + return GOTO_EXPR; + case BRIG_OPCODE_REM: + if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32) + return TRUNC_MOD_EXPR; + else + return CALL_EXPR; + case BRIG_OPCODE_NRCP: + case BRIG_OPCODE_NRSQRT: + /* Implement as 1/f (x). gcc should pattern detect that and + use a native instruction, if available, for it. */ + return TREE_LIST; + case BRIG_OPCODE_FLOOR: + case BRIG_OPCODE_CEIL: + case BRIG_OPCODE_SQRT: + case BRIG_OPCODE_NSQRT: + case BRIG_OPCODE_RINT: + case BRIG_OPCODE_TRUNC: + case BRIG_OPCODE_POPCOUNT: + case BRIG_OPCODE_COPYSIGN: + case BRIG_OPCODE_NCOS: + case BRIG_OPCODE_NSIN: + case BRIG_OPCODE_NLOG2: + case BRIG_OPCODE_NEXP2: + case BRIG_OPCODE_NFMA: + /* Class has type B1 regardless of the float type, thus + the below builtin map search cannot find it. */ + case BRIG_OPCODE_CLASS: + case BRIG_OPCODE_WORKITEMABSID: + return CALL_EXPR; + default: + + /* Some BRIG opcodes can use the same builtins for unsigned and + signed types. Force these cases to unsigned types. + */ + + if (brig_opcode == BRIG_OPCODE_BORROW + || brig_opcode == BRIG_OPCODE_CARRY + || brig_opcode == BRIG_OPCODE_LASTBIT + || brig_opcode == BRIG_OPCODE_BITINSERT) + { + if (brig_type == BRIG_TYPE_S32) + brig_type = BRIG_TYPE_U32; + else if (brig_type == BRIG_TYPE_S64) + brig_type = BRIG_TYPE_U64; + } + + + builtin_map::const_iterator i + = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type)); + if (i != s_custom_builtins.end ()) + return CALL_EXPR; + else if (s_custom_builtins.find + (std::make_pair (brig_opcode, brig_inner_type)) + != s_custom_builtins.end ()) + return CALL_EXPR; + if (brig_inner_type == BRIG_TYPE_F16 + && s_custom_builtins.find + (std::make_pair (brig_opcode, BRIG_TYPE_F32)) + != s_custom_builtins.end ()) + return CALL_EXPR; + break; + } + return TREE_LIST; /* Emulate using a chain of nodes. */ +} + +/* Inform of an update to the REG_VAR. */ + +void +brig_function::add_reg_var_update (tree reg_var, tree var) +{ + if (var == m_abs_id_vars[0] || var == m_abs_id_vars[1] + || var == m_abs_id_vars[2] || var == m_local_id_vars[0] + || var == m_local_id_vars[1] || var == m_local_id_vars[2]) + m_id_val_defs [reg_var] = var; + else + { + /* Possible overwrite of an ID value. */ + + id_val_map::iterator i = m_id_val_defs.find (reg_var); + if (i != m_id_val_defs.end()) + m_id_val_defs.erase (i); + } +} + +/* If the REG_VAR is known to contain an ID value at this point in + the basic block, return true. */ + +bool +brig_function::is_id_val (tree reg_var) +{ + id_val_map::iterator i = m_id_val_defs.find (reg_var); + return i != m_id_val_defs.end(); +} + +/* Return an ID value for the given REG_VAR if its known to contain + one at this point in the BB, NULL_TREE otherwise. */ + +tree +brig_function::id_val (tree reg_var) +{ + id_val_map::iterator i = m_id_val_defs.find (reg_var); + if (i != m_id_val_defs.end()) + return (*i).second; + else + return NULL_TREE; +} + +/* Informs of starting a new basic block. Called when generating + a label, a call, a jump, or a return. */ + +void +brig_function::start_new_bb () +{ + m_id_val_defs.clear (); +} diff --git a/gcc/brig/brigfrontend/brig-function.h b/gcc/brig/brigfrontend/brig-function.h index 6149719fc45..8fde3a5bfa3 100644 --- a/gcc/brig/brigfrontend/brig-function.h +++ b/gcc/brig/brigfrontend/brig-function.h @@ -105,6 +105,30 @@ public: void analyze_calls (); + tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands); + + tree expand_or_call_builtin (BrigOpcode16_t brig_opcode, + BrigType16_t brig_type, tree arith_type, + tree_stl_vec &operands); + bool can_expand_builtin (BrigOpcode16_t brig_opcode) const; + + tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode, + BrigType16_t brig_type) const; + + void unpack (tree value, tree_stl_vec &elements); + tree pack (tree_stl_vec &elements); + tree add_temp_var (std::string name, tree expr); + + static bool needs_workitem_context_data (BrigOpcode16_t brig_opcode); + static HOST_WIDE_INT int_constant_value (tree node); + static tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode, + BrigType16_t brig_type); + + void start_new_bb (); + void add_reg_var_update (tree reg_var, tree val); + bool is_id_val (tree reg_var); + tree id_val (tree reg_var); + const BrigDirectiveExecutable *m_brig_def; bool m_is_kernel; @@ -183,6 +207,11 @@ public: tree m_wg_id_vars[3]; tree m_wg_size_vars[3]; tree m_grid_size_vars[3]; + /* Explicitly computed WG base for the absolute IDs which is used + as the initial value when looping that dimension. We update + the abs id with ++ to make it easy for the vectorizer. */ + tree m_abs_id_base_vars[3]; + tree m_abs_id_vars[3]; /* Set to true in case the kernel contains at least one dispatch packet (work-item ID-related) builtin call that could not be expanded to @@ -219,6 +248,20 @@ private: /* Bookkeeping for the different HSA registers and their tree declarations for the currently generated function. */ reg_decl_index_entry *m_regs[BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT]; + + /* Map for keeping book reads of ID variables, which can be propagated + to uses in address expressions to produce cleaner indexing functions + with unnecessary casts stripped off, etc. */ + typedef std::map id_val_map; + + /* Keeps track of ID values alive in registers in the currently + processed BB. */ + id_val_map m_id_val_defs; + + /* HSAIL-specific builtin functions not yet integrated to gcc. */ + typedef std::map, tree> builtin_map; + + static builtin_map s_custom_builtins; }; #endif diff --git a/gcc/brig/brigfrontend/brig-label-handler.cc b/gcc/brig/brigfrontend/brig-label-handler.cc index 7605b76f7db..938df82b03a 100644 --- a/gcc/brig/brigfrontend/brig-label-handler.cc +++ b/gcc/brig/brigfrontend/brig-label-handler.cc @@ -31,7 +31,10 @@ brig_directive_label_handler::operator () (const BrigBase *base) std::string label_str ((const char *) (label_name->bytes), label_name->byteCount); + m_parent.m_cf->start_new_bb (); + tree stmt = build_stmt (LABEL_EXPR, m_parent.m_cf->label (label_str)); m_parent.m_cf->append_statement (stmt); + return base->byteCount; } diff --git a/gcc/brig/brigfrontend/brig-lane-inst-handler.cc b/gcc/brig/brigfrontend/brig-lane-inst-handler.cc index 1da0bc0fa84..385da33f089 100644 --- a/gcc/brig/brigfrontend/brig-lane-inst-handler.cc +++ b/gcc/brig/brigfrontend/brig-lane-inst-handler.cc @@ -59,7 +59,7 @@ brig_lane_inst_handler::operator () (const BrigBase *base) elements.push_back (zero_cst); elements.push_back (zero_cst); - expr = pack (elements); + expr = m_parent.m_cf->pack (elements); } else if (inst.base.opcode == BRIG_OPCODE_ACTIVELANEPERMUTE) { diff --git a/gcc/brig/brigfrontend/brig-mem-inst-handler.cc b/gcc/brig/brigfrontend/brig-mem-inst-handler.cc index 350516f6404..d8374f232fb 100644 --- a/gcc/brig/brigfrontend/brig-mem-inst-handler.cc +++ b/gcc/brig/brigfrontend/brig-mem-inst-handler.cc @@ -63,7 +63,7 @@ brig_mem_inst_handler::build_mem_access (const BrigInstBase *brig_inst, { /* Add a temporary variable so there won't be multiple reads in case of vector unpack. */ - mem_ref = add_temp_var ("mem_read", mem_ref); + mem_ref = m_parent.m_cf->add_temp_var ("mem_read", mem_ref); return build_output_assignment (*brig_inst, data, mem_ref); } else @@ -95,8 +95,9 @@ brig_mem_inst_handler::operator () (const BrigBase *base) inputs.push_back (operands[1]); inputs.push_back (align_opr); tree builtin_call - = expand_or_call_builtin (BRIG_OPCODE_ALLOCA, BRIG_TYPE_U32, - uint32_type_node, inputs); + = m_parent.m_cf->expand_or_call_builtin (BRIG_OPCODE_ALLOCA, + BRIG_TYPE_U32, + uint32_type_node, inputs); build_output_assignment (*brig_inst, operands[0], builtin_call); m_parent.m_cf->m_has_allocas = true; return base->byteCount; diff --git a/gcc/brig/brigfrontend/phsa.h b/gcc/brig/brigfrontend/phsa.h index d2247520842..fe0b9a59406 100644 --- a/gcc/brig/brigfrontend/phsa.h +++ b/gcc/brig/brigfrontend/phsa.h @@ -58,13 +58,22 @@ typedef struct __attribute__((__packed__)) /* The prefix to use in the ELF section containing descriptor for a function. */ + #define PHSA_DESC_SECTION_PREFIX "phsa.desc." #define PHSA_HOST_DEF_PTR_PREFIX "__phsa.host_def." /* The frontend error messages are parsed by the host runtime. Known prefix strings are used to separate the different runtime error codes. */ + #define PHSA_ERROR_PREFIX_INCOMPATIBLE_MODULE "Incompatible module: " #define PHSA_ERROR_PREFIX_CORRUPTED_MODULE "Corrupted module: " +/* Offsets of attributes in the PHSA context structs. + Used by -fphsa-wi-context-opt. */ +#define PHSA_CONTEXT_OFFS_WI_IDS 0 +#define PHSA_CONTEXT_OFFS_WG_IDS (PHSA_CONTEXT_OFFS_WI_IDS + 3 * 4) +#define PHSA_CONTEXT_WG_SIZES (PHSA_CONTEXT_OFFS_WG_IDS + 3 * 4) +#define PHSA_CONTEXT_CURRENT_WG_SIZES (PHSA_CONTEXT_WG_SIZES + 3 * 4) + #endif diff --git a/gcc/brig/lang.opt b/gcc/brig/lang.opt index 1c83f5f8d54..2cc6cb9c987 100644 --- a/gcc/brig/lang.opt +++ b/gcc/brig/lang.opt @@ -31,6 +31,11 @@ BRIG Separate Alias(d) -dump= BRIG Joined Alias(d) +fassume-phsa +BRIG Report Var(flag_assume_phsa) Init(1) Optimization +Assume we are finalizing for phsa and its libhsail-rt. Enables additional +phsa-specific optimizations (default). + L BRIG Joined Separate ; Not documented diff --git a/gcc/builtin-types.def b/gcc/builtin-types.def index 8f3d796bcfa..5365befd351 100644 --- a/gcc/builtin-types.def +++ b/gcc/builtin-types.def @@ -283,7 +283,9 @@ DEF_FUNCTION_TYPE_1 (BT_FN_UINT_INT, BT_UINT, BT_INT) DEF_FUNCTION_TYPE_1 (BT_FN_UINT_ULONG, BT_UINT, BT_ULONG) DEF_FUNCTION_TYPE_1 (BT_FN_UINT_LONG, BT_UINT, BT_LONG) DEF_FUNCTION_TYPE_1 (BT_FN_UINT_PTR, BT_UINT, BT_PTR) +DEF_FUNCTION_TYPE_1 (BT_FN_UINT_CONST_PTR, BT_UINT, BT_CONST_PTR) DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_PTR, BT_ULONG, BT_PTR) +DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_CONST_PTR, BT_ULONG, BT_CONST_PTR) DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_ULONG, BT_ULONG, BT_ULONG) DEF_FUNCTION_TYPE_1 (BT_FN_ULONGLONG_ULONGLONG, BT_ULONGLONG, BT_ULONGLONG) DEF_FUNCTION_TYPE_1 (BT_FN_INT8_FLOAT, BT_INT8, BT_FLOAT) @@ -480,6 +482,7 @@ DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_SIZE_CONST_VPTR, BT_BOOL, BT_SIZE, DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL) DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT) DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_PTR, BT_UINT, BT_UINT, BT_PTR) +DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_CONST_PTR, BT_UINT, BT_UINT, BT_CONST_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_SIZE, BT_PTR, BT_CONST_PTR, BT_SIZE) DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_CONST_PTR, BT_PTR, BT_CONST_PTR, BT_CONST_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTRPTR_CONST_PTR, BT_VOID, BT_PTR_PTR, BT_CONST_PTR) @@ -569,6 +572,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_DOUBLE_DOUBLEPTR_DOUBLEPTR, DEF_FUNCTION_TYPE_3 (BT_FN_VOID_LONGDOUBLE_LONGDOUBLEPTR_LONGDOUBLEPTR, BT_VOID, BT_LONGDOUBLE, BT_LONGDOUBLE_PTR, BT_LONGDOUBLE_PTR) DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_PTR, BT_VOID, BT_PTR, BT_PTR, BT_PTR) +DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_UINT32, BT_VOID, BT_PTR, BT_PTR, BT_UINT32) DEF_FUNCTION_TYPE_3 (BT_FN_INT_CONST_STRING_PTR_CONST_STRING_PTR_CONST_STRING, BT_INT, BT_CONST_STRING, BT_PTR_CONST_STRING, BT_PTR_CONST_STRING) DEF_FUNCTION_TYPE_3 (BT_FN_INT_INT_CONST_STRING_VALIST_ARG, diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index f4de7e52a5e..bba9294c2d6 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,4 +1,9 @@ -2018-05-04 Carl Love +2018-05-04 Pekka Jääskeläinen + + * testsuite/brig.dg/test/gimple/smoke_test.hsail: Fix the test + to match the currently produced gimple. + +2018-05-04 Carl Love * gcc.target/powerpc/vsx-vector-6.h (foo): Add test for vec_max, vec_trunc. * gcc.target/powerpc/vsx-vector-6-le.c (dg-final): Update xvcmpeqdp, diff --git a/gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail b/gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail index 1f36ddc4181..6e2326391da 100644 --- a/gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail +++ b/gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail @@ -41,15 +41,15 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr) }; /* The kernel function itself should have a fingerprint as follows */ -/* _Kernel (unsigned char * __args, void * __context, void * __group_base_addr, void * __private_base_addr) */ -/* { dg-final { scan-tree-dump "_Kernel \\\(unsigned char \\\* __args, void \\\* __context, void \\\* __group_base_addr, unsigned int __group_local_offset, void \\\* __private_base_addr\\\)" "gimple"} } */ +/* _Kernel (const unsigned char * restrict __args, void * restrict __context, unsigned char * restrict __group_base_addr, unsigned int __group_local_offset, unsigned char * restrict __private_base_addr) */ +/* { dg-final { scan-tree-dump "_Kernel \\\(const unsigned char \\\* restrict __args, void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr, unsigned int __group_local_offset, unsigned char \\\* restrict __private_base_addr\\\)" "gimple"} } */ /* ld_kernarg: mem_read.0 = MEM[(unsigned long *)__args]; */ /* { dg-final { scan-tree-dump "mem_read.\[0-9\] = MEM\\\[\\\(unsigned long \\\*\\\)__args\\\];" "gimple"} } */ /* The latter ld_global_u32 should be visible as a pointer dereference (after pointer arithmetics on a temporary var): */ /* mem_read.2 = *D.1691; */ -/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = \\\*\[_0-9\]+;" "gimple"} } */ +/* { dg-final { scan-tree-dump "mem_read.\[0-9\]+ = \\\*\[_0-9\]+;" "gimple"} } */ /* add_u32s should generate +operators */ /* { dg-final { scan-tree-dump "s2 = s0 \\\+ s1;" "gimple"} } */ @@ -71,8 +71,8 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr) /* { dg-final { scan-tree-dump "if \\\(__local_z < __cur_wg_size_z\\\) goto __wi_loop_z; else goto" "gimple"} } */ /* The launcher should call __hsail_launch_wg_function in this case: */ -/* Kernel (void * __context, void * __group_base_addr) */ -/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* __context, void \\\* __group_base_addr\\\)" "gimple"} } */ +/* Kernel (void * restrict __context, unsigned char * restrict __group_base_addr) */ +/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr\\\)" "gimple"} } */ /* { dg-final { scan-tree-dump "__hsail_launch_wg_function \\\(_Kernel, __context, __group_base_addr, group_local_offset.*\\\);" "gimple"} }*/ /* The kernel should have the magic metadata section injected to the ELF. */ diff --git a/libhsail-rt/ChangeLog b/libhsail-rt/ChangeLog index 5ab9e8515f0..17aeb6e490c 100644 --- a/libhsail-rt/ChangeLog +++ b/libhsail-rt/ChangeLog @@ -1,3 +1,10 @@ +2018-05-04 Pekka Jääskeläinen + + * include/internal/phsa-rt.h: Whitespace cleanup. + * include/internal/workitems.h: Store work item ID data to easily + accessible locations. + * rt/workitems.c: Same. + 2018-05-04 Pekka Jääskeläinen * rt/workitems.c: Fix an alloca stack underflow. diff --git a/libhsail-rt/include/internal/phsa-rt.h b/libhsail-rt/include/internal/phsa-rt.h index d9db56ca08e..c09f18d0095 100644 --- a/libhsail-rt/include/internal/phsa-rt.h +++ b/libhsail-rt/include/internal/phsa-rt.h @@ -54,7 +54,6 @@ typedef void (*gccbrigKernelFunc) (unsigned char *, void *, void *, uint32_t, */ typedef struct { - /* Data set by the HSA Runtime's kernel launcher. */ hsa_kernel_dispatch_packet_t *dp; diff --git a/libhsail-rt/include/internal/workitems.h b/libhsail-rt/include/internal/workitems.h index 73add287d8d..0839853ff12 100644 --- a/libhsail-rt/include/internal/workitems.h +++ b/libhsail-rt/include/internal/workitems.h @@ -45,11 +45,6 @@ typedef struct { - /* The group id of the currently executed WG. */ - size_t x; - size_t y; - size_t z; - /* This is 1 in case there are more work groups to execute. If 0, the work-item threads should finish themselves. */ int more_wgs; @@ -89,6 +84,16 @@ typedef struct stack frame. Initialized to point outside the private segment. */ uint32_t alloca_frame_p; + /* The group id of the currently executed WG. This is for fiber based + execution. The group ids are duplicated also to the per WI context + struct for simplified single pointer access in the GCCBRIG produced + code. + */ + + uint32_t x; + uint32_t y; + uint32_t z; + } PHSAWorkGroup; /* Data identifying a single work-item, passed to the work-item thread in case @@ -96,17 +101,42 @@ typedef struct typedef struct { + /* NOTE: These members STARTing here should not be moved as they are + accessed directly by code emitted by BRIG FE. */ + + /* The local id of the current WI. */ + + uint32_t x; + uint32_t y; + uint32_t z; + + /* The group id of the currently executed WG. */ + + uint32_t group_x; + uint32_t group_y; + uint32_t group_z; + + /* The local size of a complete WG. */ + + uint32_t wg_size_x; + uint32_t wg_size_y; + uint32_t wg_size_z; + + /* The local size of the current WG. */ + + uint32_t cur_wg_size_x; + uint32_t cur_wg_size_y; + uint32_t cur_wg_size_z; + + /* NOTE: Fixed members END here. */ + PHSAKernelLaunchData *launch_data; /* Identifies and keeps book of the currently executed WG of the WI swarm. */ volatile PHSAWorkGroup *wg; - /* The local id of the current WI. */ - size_t x; - size_t y; - size_t z; #ifdef HAVE_FIBERS fiber_t fiber; #endif -} PHSAWorkItem; +} __attribute__((packed)) PHSAWorkItem; #endif diff --git a/libhsail-rt/rt/workitems.c b/libhsail-rt/rt/workitems.c index 36c91691a71..c846350e1cd 100644 --- a/libhsail-rt/rt/workitems.c +++ b/libhsail-rt/rt/workitems.c @@ -107,11 +107,20 @@ phsa_work_item_thread (int arg0, int arg1) the current_work_group_* is set to point to the WG executed next. */ if (!wi->wg->more_wgs) break; + + wi->group_x = wg->x; + wi->group_y = wg->y; + wi->group_z = wg->z; + + wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); + wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); + wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); + #ifdef DEBUG_PHSA_RT printf ( "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n", - wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x, - l_data->wg_max_y, l_data->wg_max_z); + wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z, + l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z); #endif if (wi->x < __hsail_currentworkgroupsize (0, wi) @@ -180,6 +189,13 @@ phsa_work_item_thread (int arg0, int arg1) else wg->x++; #endif + wi->group_x = wg->x; + wi->group_y = wg->y; + wi->group_z = wg->z; + + wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); + wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); + wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); /* Reinitialize the work-group barrier according to the new WG's size, which might not be the same as the previous ones, due @@ -233,6 +249,7 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, PHSAWorkItem *wi_threads = NULL; PHSAWorkGroup wg; size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z; + uint32_t group_x, group_y, group_z; fiber_barrier_t wg_start_barrier; fiber_barrier_t wg_completion_barrier; fiber_barrier_t wg_sync_barrier; @@ -257,13 +274,13 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, wg.initial_group_offset = group_local_offset; #ifdef EXECUTE_WGS_BACKWARDS - wg.x = context->wg_max_x - 1; - wg.y = context->wg_max_y - 1; - wg.z = context->wg_max_z - 1; + group_x = context->wg_max_x - 1; + group_y = context->wg_max_y - 1; + group_z = context->wg_max_z - 1; #else - wg.x = context->wg_min_x; - wg.y = context->wg_min_y; - wg.z = context->wg_min_z; + group_x = context->wg_min_x; + group_y = context->wg_min_y; + group_z = context->wg_min_z; #endif fiber_barrier_init (&wg_sync_barrier, wg_size); @@ -290,6 +307,19 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, PHSAWorkItem *wi = &wi_threads[flat_wi_id]; wi->launch_data = context; wi->wg = &wg; + + wg.x = wi->group_x = group_x; + wg.y = wi->group_y = group_y; + wg.z = wi->group_z = group_z; + + wi->wg_size_x = context->dp->workgroup_size_x; + wi->wg_size_y = context->dp->workgroup_size_y; + wi->wg_size_z = context->dp->workgroup_size_z; + + wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); + wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); + wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); + wi->x = x; wi->y = y; wi->z = z; @@ -467,9 +497,17 @@ phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr, for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y) for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x) { - wi.wg->x = wg_x; - wi.wg->y = wg_y; - wi.wg->z = wg_z; + wi.group_x = wg_x; + wi.group_y = wg_y; + wi.group_z = wg_z; + + wi.wg_size_x = context->dp->workgroup_size_x; + wi.wg_size_y = context->dp->workgroup_size_y; + wi.wg_size_z = context->dp->workgroup_size_z; + + wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi); + wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi); + wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi); context->kernel (context->kernarg_addr, &wi, group_base_ptr, group_local_offset, private_base_ptr); @@ -564,15 +602,15 @@ __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context) default: case 0: /* Overflow semantics in the case of WG dim > grid dim. */ - id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x) + id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x) % dp->grid_size_x; break; case 1: - id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y) + id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y) % dp->grid_size_y; break; case 2: - id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z) + id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z) % dp->grid_size_z; break; } @@ -590,15 +628,15 @@ __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context) default: case 0: /* Overflow semantics in the case of WG dim > grid dim. */ - id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x) + id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x) % dp->grid_size_x; break; case 1: - id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y) + id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y) % dp->grid_size_y; break; case 2: - id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z) + id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z) % dp->grid_size_z; break; } @@ -738,19 +776,19 @@ __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi) { default: case 0: - if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x) + if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x) wg_size = dp->workgroup_size_x; /* Full WG. */ else wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */ break; case 1: - if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y) + if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y) wg_size = dp->workgroup_size_y; /* Full WG. */ else wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */ break; case 2: - if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z) + if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z) wg_size = dp->workgroup_size_z; /* Full WG. */ else wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */ @@ -798,11 +836,11 @@ __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi) { default: case 0: - return wi->wg->x; + return wi->group_x; case 1: - return wi->wg->y; + return wi->group_y; case 2: - return wi->wg->z; + return wi->group_z; } } -- 2.30.2