From 56b1c60e412fcf1245b4780871553cbdebb956a3 Mon Sep 17 00:00:00 2001 From: Martin Jambor Date: Wed, 23 Nov 2016 15:51:02 +0100 Subject: [PATCH] backport: hsa-builtins.def: New file. Merge from HSA branch to trunk 2016-11-23 Martin Jambor Martin Liska gcc/ * hsa-builtins.def: New file. * Makefile.in (BUILTINS_DEF): Add hsa-builtins.def dependency. * builtins.def: Include hsa-builtins.def. (DEF_HSA_BUILTIN): New macro. * dumpfile.h (OPTGROUP_OPENMP): Define. * dumpfile.c (optgroup_options): Added OPTGROUP_OPENMP. * gimple.h (gf_mask): Added elements GF_OMP_FOR_GRID_INTRA_GROUP and GF_OMP_FOR_GRID_GROUP_ITER. (gimple_omp_for_grid_phony): Added checking assert. (gimple_omp_for_set_grid_phony): Likewise. (gimple_omp_for_grid_intra_group): New function. (gimple_omp_for_set_grid_intra_group): Likewise. (gimple_omp_for_grid_group_iter): Likewise. (gimple_omp_for_set_grid_group_iter): Likewise. * omp-low.c (check_omp_nesting_restrictions): Allow GRID loop where previously only distribute loop was permitted. (lower_lastprivate_clauses): Allow non-tcc_comparison predicates. (grid_get_kernel_launch_attributes): Support multiple HSA grid dimensions. (grid_expand_omp_for_loop): Likewise and also support standalone distribute constructs. New parameter INTRA_GROUP, updated both users. (grid_expand_target_grid_body): Support standalone distribute constructs. (pass_data_expand_omp): Changed optinfo_flags to OPTGROUP_OPENMP. (pass_data_expand_omp_ssa): Likewise. (pass_data_omp_device_lower): Likewise. (pass_data_lower_omp): Likewise. (pass_data_diagnose_omp_blocks): Likewise. (pass_data_oacc_device_lower): Likewise. (pass_data_omp_target_link): Likewise. (grid_lastprivate_predicate): New function. (lower_omp_for_lastprivate): Call grid_lastprivate_predicate for gridified loops. (lower_omp_for): Support standalone distribute constructs. (grid_prop): New type. (grid_safe_assignment_p): Check for assignments to group_sizes, new parameter GRID. (grid_seq_only_contains_local_assignments): New parameter GRID, pass it to callee. (grid_find_single_omp_among_assignments_1): Likewise, improve missed optimization info messages. (grid_find_single_omp_among_assignments): Likewise. (grid_find_ungridifiable_statement): Do not bail out for SIMDs. (grid_parallel_clauses_gridifiable): New function. (grid_inner_loop_gridifiable_p): Likewise. (grid_dist_follows_simple_pattern): Likewise. (grid_gfor_follows_tiling_pattern): Likewise. (grid_call_permissible_in_distribute_p): Likewise. (grid_handle_call_in_distribute): Likewise. (grid_dist_follows_tiling_pattern): Likewise. (grid_target_follows_gridifiable_pattern): Support standalone distribute constructs. (grid_var_segment): New enum. (grid_mark_variable_segment): New function. (grid_copy_leading_local_assignments): Call grid_mark_variable_segment if a new argument says so. (grid_process_grid_body): New function. (grid_eliminate_combined_simd_part): Likewise. (grid_mark_tiling_loops): Likewise. (grid_mark_tiling_parallels_and_loops): Likewise. (grid_process_kernel_body_copy): Support standalone distribute constructs. (grid_attempt_target_gridification): New grid variable holding overall gridification state. Support standalone distribute constructs and collapse clauses. * doc/optinfo.texi (Optimization groups): Document OPTGROUP_OPENMP. * hsa.h (hsa_bb): Add method append_phi. (hsa_insn_br): Renamed to hsa_insn_cbr, renamed all occurrences in all files too. (hsa_insn_br): New class, now the ancestor of hsa_insn_cbr. (is_a_helper ::test): New function.
(is_a_helper ::test): Adjust to only cover conditional branch instructions. (hsa_insn_signal): Make a direct descendant of hsa_insn_basic. Add memorder constructor parameter and m_memory_order and m_signalop member variables. (hsa_insn_queue): Changed constructor parameters to common form. Added m_segment and m_memory_order member variables. (hsa_summary_t): Add private member function process_gpu_implementation_attributes. (hsa_function_summary): Rename m_binded_function to m_bound_function. (hsa_insn_basic_p): Remove typedef. (hsa_op_with_type): Change hsa_insn_basic_p into plain pointers. (hsa_op_reg_p): Remove typedef. (hsa_function_representation): Change hsa_op_reg_p into plain pointers. (hsa_insn_phi): Removed new and delete operators. (hsa_insn_br): Likewise. (hsa_insn_cbr): Likewise. (hsa_insn_sbr): Likewise. (hsa_insn_cmp): Likewise. (hsa_insn_mem): Likewise. (hsa_insn_atomic): Likewise. (hsa_insn_signal): Likewise. (hsa_insn_seg): Likewise. (hsa_insn_call): Likewise. (hsa_insn_arg_block): Likewise. (hsa_insn_comment): Likewise. (hsa_insn_srctype): Likewise. (hsa_insn_packed): Likewise. (hsa_insn_cvt): Likewise. (hsa_insn_alloca): Likewise. * hsa.c (hsa_destroy_insn): Also handle instances of hsa_insn_br. (process_gpu_implementation_attributes): New function. (link_functions): Move some functionality into it. Adjust after renaming m_binded_functions to m_bound_functions. (hsa_insn_basic::op_output_p): Add BRIG_OPCODE_DEBUGTRAP to the list of instructions with no output registers. (get_in_type): Return this if it is a register of matching size. (hsa_get_declaration_name): Moved to... * hsa-gen.c (hsa_get_declaration_name): ...here. Allocate temporary string on an obstack instead from ggc. (query_hsa_grid): Renamed to query_hsa_grid_dim, reimplemented, cut down to two overloads. (hsa_allocp_operand_address): Removed. (hsa_allocp_operand_immed): Likewise. (hsa_allocp_operand_reg): Likewise. (hsa_allocp_operand_code_list): Likewise. (hsa_allocp_operand_operand_list): Likewise. (hsa_allocp_inst_basic): Likewise. (hsa_allocp_inst_phi): Likewise. (hsa_allocp_inst_mem): Likewise. (hsa_allocp_inst_atomic): Likewise. (hsa_allocp_inst_signal): Likewise. (hsa_allocp_inst_seg): Likewise. (hsa_allocp_inst_cmp): Likewise. (hsa_allocp_inst_br): Likewise. (hsa_allocp_inst_sbr): Likewise. (hsa_allocp_inst_call): Likewise. (hsa_allocp_inst_arg_block): Likewise. (hsa_allocp_inst_comment): Likewise. (hsa_allocp_inst_queue): Likewise. (hsa_allocp_inst_srctype): Likewise. (hsa_allocp_inst_packed): Likewise. (hsa_allocp_inst_cvt): Likewise. (hsa_allocp_inst_alloca): Likewise. (hsa_allocp_bb): Likewise. (hsa_obstack): New. (hsa_init_data_for_cfun): Initialize obstack. (hsa_deinit_data_for_cfun): Release memory of the obstack. (hsa_op_immed::operator new): Use obstack instead of object_allocator. (hsa_op_reg::operator new): Likewise. (hsa_op_address::operator new): Likewise. (hsa_op_code_list::operator new): Likewise. (hsa_op_operand_list::operator new): Likewise. (hsa_insn_basic::operator new): Likewise. (hsa_insn_phi::operator new): Likewise. (hsa_insn_br::operator new): Likewise. (hsa_insn_sbr::operator new): Likewise. (hsa_insn_cmp::operator new): Likewise. (hsa_insn_mem::operator new): Likewise. (hsa_insn_atomic::operator new): Likewise. (hsa_insn_signal::operator new): Likewise. (hsa_insn_seg::operator new): Likewise. (hsa_insn_call::operator new): Likewise. (hsa_insn_arg_block::operator new): Likewise. (hsa_insn_comment::operator new): Likewise. (hsa_insn_srctype::operator new): Likewise. 
(hsa_insn_packed::operator new): Likewise. (hsa_insn_cvt::operator new): Likewise. (hsa_insn_alloca::operator new): Likewise. (hsa_init_new_bb): Likewise. (hsa_bb::append_phi): New function. (gen_hsa_phi_from_gimple_phi): Use it. (get_symbol_for_decl): Fix distinguishing between global and local functions. Put local variables into a segment according to their attribute or static flag, if there is one. (hsa_insn_br::hsa_insn_br): New. (hsa_insn_br::operator new): Likewise. (hsa_insn_cbr::hsa_insn_cbr): Set width via ancestor constructor. (query_hsa_grid_nodim): New function. (multiply_grid_dim_characteristics): Likewise. (gen_get_num_threads): Likewise. (gen_get_num_teams): Reimplemented. (gen_get_team_num): Likewise. (gen_hsa_insns_for_known_library_call): Updated calls to the above helper functions. (get_memory_order_name): Removed. (get_memory_order): Likewise. (hsa_memorder_from_tree): New function. (gen_hsa_ternary_atomic_for_builtin): Renamed to gen_hsa_atomic_for_builtin, can also create signals. (gen_hsa_insns_for_call): Handle many new builtins. Adjust to use hsa_memory_order_from_tree and gen_hsa_atomic_for_builtin. (hsa_insn_atomic): Fix function comment. (hsa_insn_signal::hsa_insn_signal): Fix comment. Update call to ancestor constructor and initialization of new member variables. (hsa_insn_queue::hsa_insn_queue): Added initialization of new member variables. (hsa_get_host_function): Handle functions with no bound CPU implementation. Fix binded to bound. (get_brig_function_name): Likewise. (HSA_SORRY_ATV): Remove semicolon after macro. (HSA_SORRY_AT): Likewise. (omp_simple_builtin::generate): Add missing semicolons. (hsa_insn_phi::operator new): Removed. (hsa_insn_br::operator new): Likewise. (hsa_insn_cbr::operator new): Likewise. (hsa_insn_sbr::operator new): Likewise. (hsa_insn_cmp::operator new): Likewise. (hsa_insn_mem::operator new): Likewise. (hsa_insn_atomic::operator new): Likewise. (hsa_insn_signal::operator new): Likewise. (hsa_insn_seg::operator new): Likewise. (hsa_insn_call::operator new): Likewise. (hsa_insn_arg_block::operator new): Likewise. (hsa_insn_comment::operator new): Likewise. (hsa_insn_srctype::operator new): Likewise. (hsa_insn_packed::operator new): Likewise. (hsa_insn_cvt::operator new): Likewise. (hsa_insn_alloca::operator new): Likewise. (get_symbol_for_decl): Accept CONST_DECLs, put them to readonly segment. (gen_hsa_addr): Also process CONST_DECLs. (gen_hsa_addr_insns): Process CONST_DECLs by creating private copies. (gen_hsa_unary_operation): Make sure the function does not use bittype source type for firstbit and lastbit operations. (gen_hsa_popcount_to_dest): Make sure the function uses a bittype source type. * hsa-brig.c (emit_insn_operands): Cope with zero operands in an instruction. (emit_branch_insn): Renamed to emit_cond_branch_insn. Emit the width stored in the class. (emit_generic_branch_insn): New function. (emit_insn): Call emit_generic_branch_insn. (emit_signal_insn): Remove obsolete comment. Update member variable name, pick a type according to profile. (emit_alloca_insn): Remove obsolete comment. (emit_atomic_insn): Likewise. (emit_queue_insn): Get segment and memory order from the IR object. (hsa_brig_section): Make allocate_new_chunk, chunks and cur_chunk private, add a default NULL parameter to add method. (hsa_brig_section::add): Added a new parameter, store pointer to output data there if it is non-NULL. (emit_function_directives): Use this new parameter instead of calculating the pointer itself, fix function comment.
(hsa_brig_emit_function): Add forgotten endian conversion. (hsa_output_kernels): Remove unnecessary building of kernel_dependencies_vector_type. (emit_immediate_operand): Declare. (emit_directive_variable): Also emit initializers of CONST_DECLs. (gen_hsa_insn_for_internal_fn_call): Also handle IFN_RSQRT. (verify_function_arguments): Properly detect variadic arguments. * hsa-dump.c (hsa_width_specifier_name): New function. (dump_hsa_insn_1): Dump generic branch instructions, update signal member variable name. Special dumping for queue objects. * ipa-hsa.c (process_hsa_functions): Adjust after renaming m_binded_functions to m_bound_functions. Copy externally visible flag to the node. (ipa_hsa_write_summary): Likewise. (ipa_hsa_read_section): Likewise. gcc/fortran/ * f95-lang.c (DEF_HSA_BUILTIN): New macro. gcc/testsuite/ * c-c++-common/gomp/gridify-1.c: Update scan string. * gfortran.dg/gomp/gridify-1.f90: Likewise. * c-c++-common/gomp/gridify-2.c: New test. * c-c++-common/gomp/gridify-3.c: Likewise. libgomp/ * testsuite/libgomp.hsa.c/bits-insns.c: New test. * testsuite/libgomp.hsa.c/tiling-1.c: Likewise. * testsuite/libgomp.hsa.c/tiling-2.c: Likewise. Co-Authored-By: Martin Liska From-SVN: r242761 --- gcc/ChangeLog | 259 +++ gcc/Makefile.in | 3 +- gcc/builtins.def | 16 + gcc/doc/optinfo.texi | 3 + gcc/dumpfile.c | 1 + gcc/dumpfile.h | 3 +- gcc/fortran/ChangeLog | 4 + gcc/fortran/f95-lang.c | 11 + gcc/gimple.h | 57 + gcc/hsa-brig.c | 140 +- gcc/hsa-builtins.def | 39 + gcc/hsa-dump.c | 107 +- gcc/hsa-gen.c | 914 +++++----- gcc/hsa.c | 60 +- gcc/hsa.h | 157 +- gcc/ipa-hsa.c | 14 +- gcc/omp-low.c | 1559 +++++++++++++----- gcc/testsuite/ChangeLog | 7 + gcc/testsuite/c-c++-common/gomp/gridify-1.c | 2 +- gcc/testsuite/c-c++-common/gomp/gridify-2.c | 66 + gcc/testsuite/c-c++-common/gomp/gridify-3.c | 68 + gcc/testsuite/gfortran.dg/gomp/gridify-1.f90 | 2 +- libgomp/ChangeLog | 6 + libgomp/testsuite/libgomp.hsa.c/bits-insns.c | 73 + libgomp/testsuite/libgomp.hsa.c/tiling-1.c | 212 +++ libgomp/testsuite/libgomp.hsa.c/tiling-2.c | 258 +++ 26 files changed, 2996 insertions(+), 1045 deletions(-) create mode 100644 gcc/hsa-builtins.def create mode 100644 gcc/testsuite/c-c++-common/gomp/gridify-2.c create mode 100644 gcc/testsuite/c-c++-common/gomp/gridify-3.c create mode 100644 libgomp/testsuite/libgomp.hsa.c/bits-insns.c create mode 100644 libgomp/testsuite/libgomp.hsa.c/tiling-1.c create mode 100644 libgomp/testsuite/libgomp.hsa.c/tiling-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a5f757c4e17..f082b0aa6cb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,262 @@ +2016-11-23 Martin Jambor + Martin Liska + + * hsa-builtins.def: New file. + * Makefile.in (BUILTINS_DEF): Add hsa-builtins.def dependency. + * builtins.def: Include hsa-builtins.def. + (DEF_HSA_BUILTIN): New macro. + * dumpfile.h (OPTGROUP_OPENMP): Define. + * dumpfile.c (optgroup_options): Added OPTGROUP_OPENMP. + * gimple.h (gf_mask): Added elements GF_OMP_FOR_GRID_INTRA_GROUP and + GF_OMP_FOR_GRID_GROUP_ITER. + (gimple_omp_for_grid_phony): Added checking assert. + (gimple_omp_for_set_grid_phony): Likewise. + (gimple_omp_for_grid_intra_group): New function. + (gimple_omp_for_set_grid_intra_group): Likewise. + (gimple_omp_for_grid_group_iter): Likewise. + (gimple_omp_for_set_grid_group_iter): Likewise. + * omp-low.c (check_omp_nesting_restrictions): Allow GRID loop where + previously only distribute loop was permitted. + (lower_lastprivate_clauses): Allow non-tcc_comparison predicates.
+ (grid_get_kernel_launch_attributes): Support multiple HSA grid + dimensions. + (grid_expand_omp_for_loop): Likewise and also support standalone + distribute constructs. New parameter INTRA_GROUP, updated both users. + (grid_expand_target_grid_body): Support standalone distribute + constructs. + (pass_data_expand_omp): Changed optinfo_flags to OPTGROUP_OPENMP. + (pass_data_expand_omp_ssa): Likewise. + (pass_data_omp_device_lower): Likewise. + (pass_data_lower_omp): Likewise. + (pass_data_diagnose_omp_blocks): Likewise. + (pass_data_oacc_device_lower): Likewise. + (pass_data_omp_target_link): Likewise. + (grid_lastprivate_predicate): New function. + (lower_omp_for_lastprivate): Call grid_lastprivate_predicate for + gridified loops. + (lower_omp_for): Support standalone distribute constructs. + (grid_prop): New type. + (grid_safe_assignment_p): Check for assignments to group_sizes, new + parameter GRID. + (grid_seq_only_contains_local_assignments): New parameter GRID, pass + it to callee. + (grid_find_single_omp_among_assignments_1): Likewise, improve missed + optimization info messages. + (grid_find_single_omp_among_assignments): Likewise. + (grid_find_ungridifiable_statement): Do not bail out for SIMDs. + (grid_parallel_clauses_gridifiable): New function. + (grid_inner_loop_gridifiable_p): Likewise. + (grid_dist_follows_simple_pattern): Likewise. + (grid_gfor_follows_tiling_pattern): Likewise. + (grid_call_permissible_in_distribute_p): Likewise. + (grid_handle_call_in_distribute): Likewise. + (grid_dist_follows_tiling_pattern): Likewise. + (grid_target_follows_gridifiable_pattern): Support standalone distribute + constructs. + (grid_var_segment): New enum. + (grid_mark_variable_segment): New function. + (grid_copy_leading_local_assignments): Call grid_mark_variable_segment + if a new argument says so. + (grid_process_grid_body): New function. + (grid_eliminate_combined_simd_part): Likewise. + (grid_mark_tiling_loops): Likewise. + (grid_mark_tiling_parallels_and_loops): Likewise. + (grid_process_kernel_body_copy): Support standalone distribute + constructs. + (grid_attempt_target_gridification): New grid variable holding overall + gridification state. Support standalone distribute constructs and + collapse clauses. + * doc/optinfo.texi (Optimization groups): Document OPTGROUP_OPENMP. + * hsa.h (hsa_bb): Add method append_phi. + (hsa_insn_br): Renamed to hsa_insn_cbr, renamed all + occurrences in all files too. + (hsa_insn_br): New class, now the ancestor of hsa_insn_cbr. + (is_a_helper ::test): New function. + (is_a_helper ::test): Adjust to only cover conditional + branch instructions. + (hsa_insn_signal): Make a direct descendant of + hsa_insn_basic. Add memorder constructor parameter and + m_memory_order and m_signalop member variables. + (hsa_insn_queue): Changed constructor parameters to common form. + Added m_segment and m_memory_order member variables. + (hsa_summary_t): Add private member function + process_gpu_implementation_attributes. + (hsa_function_summary): Rename m_binded_function to + m_bound_function. + (hsa_insn_basic_p): Remove typedef. + (hsa_op_with_type): Change hsa_insn_basic_p into plain pointers. + (hsa_op_reg_p): Remove typedef. + (hsa_function_representation): Change hsa_op_reg_p into plain + pointers. + (hsa_insn_phi): Removed new and delete operators. + (hsa_insn_br): Likewise. + (hsa_insn_cbr): Likewise. + (hsa_insn_sbr): Likewise. + (hsa_insn_cmp): Likewise. + (hsa_insn_mem): Likewise. + (hsa_insn_atomic): Likewise. + (hsa_insn_signal): Likewise.
+ (hsa_insn_seg): Likewise. + (hsa_insn_call): Likewise. + (hsa_insn_arg_block): Likewise. + (hsa_insn_comment): Likewise. + (hsa_insn_srctype): Likewise. + (hsa_insn_packed): Likewise. + (hsa_insn_cvt): Likewise. + (hsa_insn_alloca): Likewise. + * hsa.c (hsa_destroy_insn): Also handle instances of hsa_insn_br. + (process_gpu_implementation_attributes): New function. + (link_functions): Move some functionality into it. Adjust after + renaming m_binded_functions to m_bound_functions. + (hsa_insn_basic::op_output_p): Add BRIG_OPCODE_DEBUGTRAP + to the list of instructions with no output registers. + (get_in_type): Return this if it is a register of + matching size. + (hsa_get_declaration_name): Moved to... + * hsa-gen.c (hsa_get_declaration_name): ...here. Allocate + temporary string on an obstack instead from ggc. + (query_hsa_grid): Renamed to query_hsa_grid_dim, reimplemented, cut + down to two overloads. + (hsa_allocp_operand_address): Removed. + (hsa_allocp_operand_immed): Likewise. + (hsa_allocp_operand_reg): Likewise. + (hsa_allocp_operand_code_list): Likewise. + (hsa_allocp_operand_operand_list): Likewise. + (hsa_allocp_inst_basic): Likewise. + (hsa_allocp_inst_phi): Likewise. + (hsa_allocp_inst_mem): Likewise. + (hsa_allocp_inst_atomic): Likewise. + (hsa_allocp_inst_signal): Likewise. + (hsa_allocp_inst_seg): Likewise. + (hsa_allocp_inst_cmp): Likewise. + (hsa_allocp_inst_br): Likewise. + (hsa_allocp_inst_sbr): Likewise. + (hsa_allocp_inst_call): Likewise. + (hsa_allocp_inst_arg_block): Likewise. + (hsa_allocp_inst_comment): Likewise. + (hsa_allocp_inst_queue): Likewise. + (hsa_allocp_inst_srctype): Likewise. + (hsa_allocp_inst_packed): Likewise. + (hsa_allocp_inst_cvt): Likewise. + (hsa_allocp_inst_alloca): Likewise. + (hsa_allocp_bb): Likewise. + (hsa_obstack): New. + (hsa_init_data_for_cfun): Initialize obstack. + (hsa_deinit_data_for_cfun): Release memory of the obstack. + (hsa_op_immed::operator new): Use obstack instead of object_allocator. + (hsa_op_reg::operator new): Likewise. + (hsa_op_address::operator new): Likewise. + (hsa_op_code_list::operator new): Likewise. + (hsa_op_operand_list::operator new): Likewise. + (hsa_insn_basic::operator new): Likewise. + (hsa_insn_phi::operator new): Likewise. + (hsa_insn_br::operator new): Likewise. + (hsa_insn_sbr::operator new): Likewise. + (hsa_insn_cmp::operator new): Likewise. + (hsa_insn_mem::operator new): Likewise. + (hsa_insn_atomic::operator new): Likewise. + (hsa_insn_signal::operator new): Likewise. + (hsa_insn_seg::operator new): Likewise. + (hsa_insn_call::operator new): Likewise. + (hsa_insn_arg_block::operator new): Likewise. + (hsa_insn_comment::operator new): Likewise. + (hsa_insn_srctype::operator new): Likewise. + (hsa_insn_packed::operator new): Likewise. + (hsa_insn_cvt::operator new): Likewise. + (hsa_insn_alloca::operator new): Likewise. + (hsa_init_new_bb): Likewise. + (hsa_bb::append_phi): New function. + (gen_hsa_phi_from_gimple_phi): Use it. + (get_symbol_for_decl): Fix distinguishing between + global and local functions. Put local variables into a segment + according to their attribute or static flag, if there is one. + (hsa_insn_br::hsa_insn_br): New. + (hsa_insn_br::operator new): Likewise. + (hsa_insn_cbr::hsa_insn_cbr): Set width via ancestor constructor. + (query_hsa_grid_nodim): New function. + (multiply_grid_dim_characteristics): Likewise. + (gen_get_num_threads): Likewise. + (gen_get_num_teams): Reimplemented. + (gen_get_team_num): Likewise.
+ (gen_hsa_insns_for_known_library_call): Updated calls to the above + helper functions. + (get_memory_order_name): Removed. + (get_memory_order): Likewise. + (hsa_memorder_from_tree): New function. + (gen_hsa_ternary_atomic_for_builtin): Renamed to + gen_hsa_atomic_for_builtin, can also create signals. + (gen_hsa_insns_for_call): Handle many new builtins. Adjust to use + hsa_memory_order_from_tree and gen_hsa_atomic_for_builtin. + (hsa_insn_atomic): Fix function comment. + (hsa_insn_signal::hsa_insn_signal): Fix comment. Update call to + ancestor constructor and initialization of new member variables. + (hsa_insn_queue::hsa_insn_queue): Added initialization of new + member variables. + (hsa_get_host_function): Handle functions with no bound CPU + implementation. Fix binded to bound. + (get_brig_function_name): Likewise. + (HSA_SORRY_ATV): Remove semicolon after macro. + (HSA_SORRY_AT): Likewise. + (omp_simple_builtin::generate): Add missing semicolons. + (hsa_insn_phi::operator new): Removed. + (hsa_insn_br::operator new): Likewise. + (hsa_insn_cbr::operator new): Likewise. + (hsa_insn_sbr::operator new): Likewise. + (hsa_insn_cmp::operator new): Likewise. + (hsa_insn_mem::operator new): Likewise. + (hsa_insn_atomic::operator new): Likewise. + (hsa_insn_signal::operator new): Likewise. + (hsa_insn_seg::operator new): Likewise. + (hsa_insn_call::operator new): Likewise. + (hsa_insn_arg_block::operator new): Likewise. + (hsa_insn_comment::operator new): Likewise. + (hsa_insn_srctype::operator new): Likewise. + (hsa_insn_packed::operator new): Likewise. + (hsa_insn_cvt::operator new): Likewise. + (hsa_insn_alloca::operator new): Likewise. + (get_symbol_for_decl): Accept CONST_DECLs, put them to + readonly segment. + (gen_hsa_addr): Also process CONST_DECLs. + (gen_hsa_addr_insns): Process CONST_DECLs by creating private + copies. + (gen_hsa_unary_operation): Make sure the function does + not use bittype source type for firstbit and lastbit operations. + (gen_hsa_popcount_to_dest): Make sure the function uses a bittype + source type. + * hsa-brig.c (emit_insn_operands): Cope with zero operands in an + instruction. + (emit_branch_insn): Renamed to emit_cond_branch_insn. + Emit the width stored in the class. + (emit_generic_branch_insn): New function. + (emit_insn): Call emit_generic_branch_insn. + (emit_signal_insn): Remove obsolete comment. Update + member variable name, pick a type according to profile. + (emit_alloca_insn): Remove obsolete comment. + (emit_atomic_insn): Likewise. + (emit_queue_insn): Get segment and memory order from the IR object. + (hsa_brig_section): Make allocate_new_chunk, chunks + and cur_chunk private, add a default NULL parameter to add method. + (hsa_brig_section::add): Added a new parameter, store pointer to + output data there if it is non-NULL. + (emit_function_directives): Use this new parameter instead of + calculating the pointer itself, fix function comment. + (hsa_brig_emit_function): Add forgotten endian conversion. + (hsa_output_kernels): Remove unnecessary building of + kernel_dependencies_vector_type. + (emit_immediate_operand): Declare. + (emit_directive_variable): Also emit initializers of CONST_DECLs. + (gen_hsa_insn_for_internal_fn_call): Also handle IFN_RSQRT. + (verify_function_arguments): Properly detect variadic + arguments. + * hsa-dump.c (hsa_width_specifier_name): New function. + (dump_hsa_insn_1): Dump generic branch instructions, update signal + member variable name. Special dumping for queue objects.
+ * ipa-hsa.c (process_hsa_functions): Adjust after renaming + m_binded_functions to m_bound_functions. Copy externally visible flag + to the node. + (ipa_hsa_write_summary): Likewise. + (ipa_hsa_read_section): Likewise. + 2016-11-23 Richard Biener PR tree-optimization/78396 diff --git a/gcc/Makefile.in b/gcc/Makefile.in index d1acededa7a..df4f64f7c28 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -911,7 +911,8 @@ RTL_H = $(RTL_BASE_H) $(FLAGS_H) genrtl.h READ_MD_H = $(OBSTACK_H) $(HASHTAB_H) read-md.h PARAMS_H = params.h params-enum.h params.def BUILTINS_DEF = builtins.def sync-builtins.def omp-builtins.def \ - gtm-builtins.def sanitizer.def cilkplus.def cilk-builtins.def + gtm-builtins.def sanitizer.def cilkplus.def cilk-builtins.def \ + hsa-builtins.def INTERNAL_FN_DEF = internal-fn.def INTERNAL_FN_H = internal-fn.h $(INTERNAL_FN_DEF) TREE_CORE_H = tree-core.h coretypes.h all-tree.def tree.def \ diff --git a/gcc/builtins.def b/gcc/builtins.def index 151836a797f..6766975fdf9 100644 --- a/gcc/builtins.def +++ b/gcc/builtins.def @@ -201,6 +201,19 @@ along with GCC; see the file COPYING3. If not see || flag_cilkplus \ || flag_offload_abi != OFFLOAD_ABI_UNSET)) +#undef DEF_HSA_BUILTIN +#ifdef ENABLE_HSA +#define DEF_HSA_BUILTIN(ENUM, NAME, TYPE, ATTRS) \ + DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE, \ + false, false, true, ATTRS, false, \ + (!flag_disable_hsa)) +#else +#define DEF_HSA_BUILTIN(ENUM, NAME, TYPE, ATTRS) \ + DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE, \ + false, false, true, ATTRS, false, \ + (false)) +#endif + /* Builtin used by implementation of Cilk Plus. Most of these are decomposed by the compiler but a few are implemented in libcilkrts. */ #undef DEF_CILK_BUILTIN_STUB @@ -968,6 +981,9 @@ DEF_GCC_BUILTIN (BUILT_IN_LINE, "LINE", BT_FN_INT, ATTR_NOTHROW_LEAF_LIST) /* Offloading and Multi Processing builtins. */ #include "omp-builtins.def" +/* Heterogeneous Systems Architecture. */ +#include "hsa-builtins.def" + /* Cilk keywords builtins. */ #include "cilk-builtins.def" diff --git a/gcc/doc/optinfo.texi b/gcc/doc/optinfo.texi index 3c8fdba3de3..20ca560ff42 100644 --- a/gcc/doc/optinfo.texi +++ b/gcc/doc/optinfo.texi @@ -59,6 +59,9 @@ Loop optimization passes. Enabled by @option{-loop}. @item OPTGROUP_INLINE Inlining passes. Enabled by @option{-inline}. +@item OPTGROUP_OPENMP +OpenMP passes. Enabled by @option{-openmp}. + @item OPTGROUP_VEC Vectorization passes. Enabled by @option{-vec}. 
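(With the "openmp" optgroup registered in dumpfile.c below, the remarks produced by the gridification code can be requested with something along the lines of `gcc -fopenmp -fopt-info-openmp-missed -c gridify-1.c', following the existing -fopt-info-<group> scheme; this invocation is an illustrative sketch, not part of the patch, and the exact messages printed depend on the pass.)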
diff --git a/gcc/dumpfile.c b/gcc/dumpfile.c index e9483bc8d4d..5b23c3f77a8 100644 --- a/gcc/dumpfile.c +++ b/gcc/dumpfile.c @@ -138,6 +138,7 @@ static const struct dump_option_value_info optgroup_options[] = {"ipa", OPTGROUP_IPA}, {"loop", OPTGROUP_LOOP}, {"inline", OPTGROUP_INLINE}, + {"openmp", OPTGROUP_OPENMP}, {"vec", OPTGROUP_VEC}, {"optall", OPTGROUP_ALL}, {NULL, 0} diff --git a/gcc/dumpfile.h b/gcc/dumpfile.h index b7d70f2804b..f366228f465 100644 --- a/gcc/dumpfile.h +++ b/gcc/dumpfile.h @@ -98,7 +98,8 @@ enum tree_dump_index #define OPTGROUP_LOOP (1 << 2) /* Loop optimization passes */ #define OPTGROUP_INLINE (1 << 3) /* Inlining passes */ #define OPTGROUP_VEC (1 << 4) /* Vectorization passes */ -#define OPTGROUP_OTHER (1 << 5) /* All other passes */ +#define OPTGROUP_OPENMP (1 << 5) /* OpenMP specific transformations */ +#define OPTGROUP_OTHER (1 << 6) /* All other passes */ #define OPTGROUP_ALL (OPTGROUP_IPA | OPTGROUP_LOOP | OPTGROUP_INLINE \ | OPTGROUP_VEC | OPTGROUP_OTHER) diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 1ea1c2cbc9f..adeb08da896 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,7 @@ +2016-11-23 Martin Jambor + + * f95-lang.c (DEF_HSA_BUILTIN): New macro. + 2016-11-22 Steven G. Kargl PR fortran/78479 diff --git a/gcc/fortran/f95-lang.c b/gcc/fortran/f95-lang.c index cea6675d53a..22d29daf08d 100644 --- a/gcc/fortran/f95-lang.c +++ b/gcc/fortran/f95-lang.c @@ -1224,6 +1224,17 @@ gfc_init_builtin_functions (void) #undef DEF_GOMP_BUILTIN } +#ifdef ENABLE_HSA + if (!flag_disable_hsa) + { +#undef DEF_HSA_BUILTIN +#define DEF_HSA_BUILTIN(code, name, type, attr) \ + gfc_define_builtin ("__builtin_" name, builtin_types[type], \ + code, name, attr); +#include "../hsa-builtins.def" + } +#endif + gfc_define_builtin ("__builtin_trap", builtin_types[BT_FN_VOID], BUILT_IN_TRAP, NULL, ATTR_NOTHROW_LEAF_LIST); TREE_THIS_VOLATILE (builtin_decl_explicit (BUILT_IN_TRAP)) = 1; diff --git a/gcc/gimple.h b/gcc/gimple.h index 0eafada61b4..0d0296e3f16 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -163,7 +163,13 @@ enum gf_mask { GF_OMP_FOR_KIND_CILKSIMD = GF_OMP_FOR_SIMD | 1, GF_OMP_FOR_COMBINED = 1 << 4, GF_OMP_FOR_COMBINED_INTO = 1 << 5, + /* The following flag must not be used on GF_OMP_FOR_KIND_GRID_LOOP loop + statements. */ GF_OMP_FOR_GRID_PHONY = 1 << 6, + /* The following two flags should only be set on GF_OMP_FOR_KIND_GRID_LOOP + loop statements. */ + GF_OMP_FOR_GRID_INTRA_GROUP = 1 << 6, + GF_OMP_FOR_GRID_GROUP_ITER = 1 << 7, GF_OMP_TARGET_KIND_MASK = (1 << 4) - 1, GF_OMP_TARGET_KIND_REGION = 0, GF_OMP_TARGET_KIND_DATA = 1, @@ -5143,6 +5149,8 @@ gimple_omp_for_set_pre_body (gimple *gs, gimple_seq pre_body) static inline bool gimple_omp_for_grid_phony (const gomp_for *omp_for) { + gcc_checking_assert (gimple_omp_for_kind (omp_for) + != GF_OMP_FOR_KIND_GRID_LOOP); return (gimple_omp_subcode (omp_for) & GF_OMP_FOR_GRID_PHONY) != 0; } @@ -5151,12 +5159,61 @@ gimple_omp_for_grid_phony (const gomp_for *omp_for) static inline void gimple_omp_for_set_grid_phony (gomp_for *omp_for, bool value) { + gcc_checking_assert (gimple_omp_for_kind (omp_for) + != GF_OMP_FOR_KIND_GRID_LOOP); if (value) omp_for->subcode |= GF_OMP_FOR_GRID_PHONY; else omp_for->subcode &= ~GF_OMP_FOR_GRID_PHONY; } +/* Return the kernel_intra_group of a GRID_LOOP OMP_FOR statement. 
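+   Note that GF_OMP_FOR_GRID_INTRA_GROUP reuses the bit value (1 << 6) of
+   GF_OMP_FOR_GRID_PHONY, which is why this accessor and the ones below
+   assert that the statement kind really is GF_OMP_FOR_KIND_GRID_LOOP.
+   A sketch of the intended use (the variable names are illustrative only,
+   not taken from this patch):
+
+     gomp_for *gfor = as_a <gomp_for *> (stmt);
+     gimple_omp_for_set_kind (gfor, GF_OMP_FOR_KIND_GRID_LOOP);
+     gimple_omp_for_set_grid_intra_group (gfor, true);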
*/ + +static inline bool +gimple_omp_for_grid_intra_group (const gomp_for *omp_for) +{ + gcc_checking_assert (gimple_omp_for_kind (omp_for) + == GF_OMP_FOR_KIND_GRID_LOOP); + return (gimple_omp_subcode (omp_for) & GF_OMP_FOR_GRID_INTRA_GROUP) != 0; +} + +/* Set kernel_intra_group flag of OMP_FOR to VALUE. */ + +static inline void +gimple_omp_for_set_grid_intra_group (gomp_for *omp_for, bool value) +{ + gcc_checking_assert (gimple_omp_for_kind (omp_for) + == GF_OMP_FOR_KIND_GRID_LOOP); + if (value) + omp_for->subcode |= GF_OMP_FOR_GRID_INTRA_GROUP; + else + omp_for->subcode &= ~GF_OMP_FOR_GRID_INTRA_GROUP; +} + +/* Return true if iterations of a grid OMP_FOR statement correspond to HSA + groups. */ + +static inline bool +gimple_omp_for_grid_group_iter (const gomp_for *omp_for) +{ + gcc_checking_assert (gimple_omp_for_kind (omp_for) + == GF_OMP_FOR_KIND_GRID_LOOP); + return (gimple_omp_subcode (omp_for) & GF_OMP_FOR_GRID_GROUP_ITER) != 0; +} + +/* Set group_iter flag of OMP_FOR to VALUE. */ + +static inline void +gimple_omp_for_set_grid_group_iter (gomp_for *omp_for, bool value) +{ + gcc_checking_assert (gimple_omp_for_kind (omp_for) + == GF_OMP_FOR_KIND_GRID_LOOP); + if (value) + omp_for->subcode |= GF_OMP_FOR_GRID_GROUP_ITER; + else + omp_for->subcode &= ~GF_OMP_FOR_GRID_GROUP_ITER; +} + /* Return the clauses associated with OMP_PARALLEL GS. */ static inline tree diff --git a/gcc/hsa-brig.c b/gcc/hsa-brig.c index 66ff8f97e07..acd91647cc6 100644 --- a/gcc/hsa-brig.c +++ b/gcc/hsa-brig.c @@ -161,19 +161,21 @@ public: /* The size of the header of the section without any padding. */ unsigned header_byte_delta; - /* Buffers of binary data, each containing BRIG_CHUNK_MAX_SIZE bytes. */ - vec chunks; - - /* More convenient access to the last chunk from the vector above. */ - struct hsa_brig_data_chunk *cur_chunk; - - void allocate_new_chunk (); void init (const char *name); void release (); void output (); - unsigned add (const void *data, unsigned len); + unsigned add (const void *data, unsigned len, void **output = NULL); void round_size_up (int factor); void *get_ptr_by_offset (unsigned int offset); + +private: + void allocate_new_chunk (); + + /* Buffers of binary data, each containing BRIG_CHUNK_MAX_SIZE bytes. */ + vec chunks; + + /* More convenient access to the last chunk from the vector above. */ + struct hsa_brig_data_chunk *cur_chunk; }; static struct hsa_brig_section brig_data, brig_code, brig_operand; @@ -271,10 +273,11 @@ hsa_brig_section::output () } /* Add to the stream LEN bytes of opaque binary DATA. Return the offset at - which it was stored. */ + which it was stored. If OUTPUT is not NULL, store into it the pointer to + the place where DATA was actually stored. */ unsigned -hsa_brig_section::add (const void *data, unsigned len) +hsa_brig_section::add (const void *data, unsigned len, void **output) { unsigned offset = total_size; @@ -282,7 +285,10 @@ hsa_brig_section::add (const void *data, unsigned len) if (cur_chunk->size > (BRIG_CHUNK_MAX_SIZE - len)) allocate_new_chunk (); - memcpy (cur_chunk->data + cur_chunk->size, data, len); + char *dst = cur_chunk->data + cur_chunk->size; + memcpy (dst, data, len); + if (output) + *output = dst; cur_chunk->size += len; total_size += len; @@ -565,6 +571,7 @@ enqueue_op (hsa_op_base *op) return ret; } +static void emit_immediate_operand (hsa_op_immed *imm); /* Emit directive describing a symbol if it has not been emitted already. Return the offset of the directive. 
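   Newly, when the symbol is backed by a CONST_DECL, its DECL_INITIAL is
   enqueued as an immediate operand and stored in the init field of the
   directive, which is what the forward declaration of emit_immediate_operand
   above is for.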
*/ @@ -603,7 +610,14 @@ emit_directive_variable (struct hsa_symbol *symbol) } dirvar.name = lendian32 (name_offset); - dirvar.init = 0; + + if (symbol->m_decl && TREE_CODE (symbol->m_decl) == CONST_DECL) + { + hsa_op_immed *tmp = new hsa_op_immed (DECL_INITIAL (symbol->m_decl)); + dirvar.init = lendian32 (enqueue_op (tmp)); + } + else + dirvar.init = 0; dirvar.type = lendian16 (symbol->m_type); dirvar.segment = symbol->m_segment; dirvar.align = symbol->m_align; @@ -626,8 +640,12 @@ return symbol->m_directive_offset; } -/* Emit directives describing either a function declaration or - definition F. */ +/* Emit directives describing either a function declaration or definition F and + return the produced BrigDirectiveExecutable structure. The function does + not take any instructions into account when calculating the nextModuleEntry + field of the produced BrigDirectiveExecutable structure, so when emitting + actual definitions, this field needs to be updated after the whole function + has been added to the code section. */ static BrigDirectiveExecutable * emit_function_directives (hsa_function_representation *f, bool is_declaration) @@ -635,7 +653,7 @@ struct BrigDirectiveExecutable fndir; unsigned name_offset, inarg_off, scoped_off, next_toplev_off; int count = 0; - BrigDirectiveExecutable *ptr_to_fndir; + void *ptr_to_fndir; hsa_symbol *sym; if (!f->m_declaration_p) @@ -693,17 +711,7 @@ *slot = int_fn; } - brig_code.add (&fndir, sizeof (fndir)); - /* terrible hack: we need to set instCount after we emit all - insns, but we need to emit directive in order, and we emit directives - during insn emitting. So we need to emit the FUNCTION directive - early, then the insns, and then we need to set instCount, so remember - a pointer to it, in some horrible way. cur_chunk.data+size points - directly to after fndir here. */ - ptr_to_fndir - = (BrigDirectiveExecutable *)(brig_code.cur_chunk->data - + brig_code.cur_chunk->size - - sizeof (fndir)); + brig_code.add (&fndir, sizeof (fndir), &ptr_to_fndir); if (f->m_output_arg) emit_directive_variable (f->m_output_arg); @@ -724,7 +732,7 @@ } } - return ptr_to_fndir; + return (BrigDirectiveExecutable *) ptr_to_fndir; } /* Emit a label directive for the given HBB. We assume it is about to start on @@ -1237,20 +1245,20 @@ emit_insn_operands (hsa_insn_basic *insn) operand_offsets; unsigned l = insn->operand_count (); - operand_offsets.safe_grow (l); - - for (unsigned i = 0; i < l; i++) - operand_offsets[i] = lendian32 (enqueue_op (insn->get_op (i))); /* We have N operands so use 4 * N for the byte_count.
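   Each operand offset is a 4-byte BrigOperandOffset32_t, so an instruction
   with e.g. three operands stores byte_count = 12 followed by three 32-bit
   offsets into the operand section, while an instruction with no operands
   stores just the zero byte_count; the latter case is what the new guard
   below handles.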
*/ uint32_t byte_count = lendian32 (4 * l); - unsigned offset = brig_data.add (&byte_count, sizeof (byte_count)); - brig_data.add (operand_offsets.address (), - l * sizeof (BrigOperandOffset32_t)); + if (l > 0) + { + operand_offsets.safe_grow (l); + for (unsigned i = 0; i < l; i++) + operand_offsets[i] = lendian32 (enqueue_op (insn->get_op (i))); + brig_data.add (operand_offsets.address (), + l * sizeof (BrigOperandOffset32_t)); + } brig_data.round_size_up (4); - return offset; } @@ -1334,10 +1342,6 @@ emit_signal_insn (hsa_insn_signal *mem) { struct BrigInstSignal repr; - /* This is necessary because of the erroneous typedef of - BrigMemoryModifier8_t which introduces padding which may then contain - random stuff (which we do not want so that we can test things don't - change). */ memset (&repr, 0, sizeof (repr)); repr.base.base.byteCount = lendian16 (sizeof (repr)); repr.base.base.kind = lendian16 (BRIG_KIND_INST_SIGNAL); repr.base.type = lendian16 (mem->m_type); repr.base.operands = lendian32 (emit_insn_operands (mem)); - repr.memoryOrder = mem->m_memoryorder; - repr.signalOperation = mem->m_atomicop; - repr.signalType = BRIG_TYPE_SIG64; + repr.memoryOrder = mem->m_memory_order; + repr.signalOperation = mem->m_signalop; + repr.signalType = hsa_machine_large_p () ? BRIG_TYPE_SIG64 : BRIG_TYPE_SIG32; brig_code.add (&repr, sizeof (repr)); brig_insn_count++; @@ -1368,10 +1372,6 @@ emit_atomic_insn (hsa_insn_atomic *mem) else addr = as_a (mem->get_op (1)); - /* This is necessary because of the erroneous typedef of - BrigMemoryModifier8_t which introduces padding which may then contain - random stuff (which we do not want so that we can test things don't - change). */ memset (&repr, 0, sizeof (repr)); repr.base.base.byteCount = lendian16 (sizeof (repr)); repr.base.base.kind = lendian16 (BRIG_KIND_INST_ATOMIC); @@ -1448,10 +1448,6 @@ emit_alloca_insn (hsa_insn_alloca *alloca) struct BrigInstMem repr; gcc_checking_assert (alloca->operand_count () == 2); - /* This is necessary because of the erroneous typedef of - BrigMemoryModifier8_t which introduces padding which may then contain - random stuff (which we do not want so that we can test things don't - change). */ memset (&repr, 0, sizeof (repr)); repr.base.base.byteCount = lendian16 (sizeof (repr)); repr.base.base.kind = lendian16 (BRIG_KIND_INST_MEM); @@ -1497,11 +1493,29 @@ emit_cmp_insn (hsa_insn_cmp *cmp) brig_insn_count++; } -/* Emit an HSA branching instruction and all necessary directives, schedule - necessary operands for writing. */ +/* Emit an HSA generic branching/synchronization instruction. */ + +static void +emit_generic_branch_insn (hsa_insn_br *br) +{ + struct BrigInstBr repr; + repr.base.base.byteCount = lendian16 (sizeof (repr)); + repr.base.base.kind = lendian16 (BRIG_KIND_INST_BR); + repr.base.opcode = lendian16 (br->m_opcode); + repr.width = br->m_width; + repr.base.type = lendian16 (br->m_type); + repr.base.operands = lendian32 (emit_insn_operands (br)); + memset (&repr.reserved, 0, sizeof (repr.reserved)); + + brig_code.add (&repr, sizeof (repr)); + brig_insn_count++; +} + +/* Emit an HSA conditional branching instruction and all necessary directives, + schedule necessary operands for writing.
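+   Unlike emit_generic_branch_insn above, the instruction type of a
+   conditional jump is always B1 and only the width is taken from the
+   instruction object.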
*/ static void -emit_branch_insn (hsa_insn_br *br) +emit_cond_branch_insn (hsa_insn_cbr *br) { struct BrigInstBr repr; @@ -1514,7 +1528,7 @@ repr.base.base.byteCount = lendian16 (sizeof (repr)); repr.base.base.kind = lendian16 (BRIG_KIND_INST_BR); repr.base.opcode = lendian16 (br->m_opcode); - repr.width = BRIG_WIDTH_1; + repr.width = br->m_width; /* For Conditional jumps the type is always B1. */ repr.base.type = lendian16 (BRIG_TYPE_B1); @@ -1730,8 +1744,8 @@ emit_queue_insn (hsa_insn_queue *insn) repr.base.base.kind = lendian16 (BRIG_KIND_INST_QUEUE); repr.base.opcode = lendian16 (insn->m_opcode); repr.base.type = lendian16 (insn->m_type); - repr.segment = BRIG_SEGMENT_GLOBAL; - repr.memoryOrder = BRIG_MEMORY_ORDER_SC_RELEASE; + repr.segment = insn->m_segment; + repr.memoryOrder = insn->m_memory_order; repr.base.operands = lendian32 (emit_insn_operands (insn)); brig_data.round_size_up (4); brig_code.add (&repr, sizeof (repr)); @@ -1886,8 +1900,8 @@ emit_insn (hsa_insn_basic *insn) emit_segment_insn (seg); else if (hsa_insn_cmp *cmp = dyn_cast (insn)) emit_cmp_insn (cmp); - else if (hsa_insn_br *br = dyn_cast (insn)) - emit_branch_insn (br); + else if (hsa_insn_cbr *br = dyn_cast (insn)) + emit_cond_branch_insn (br); else if (hsa_insn_sbr *sbr = dyn_cast (insn)) { if (switch_instructions == NULL) @@ -1896,6 +1910,8 @@ switch_instructions->safe_push (sbr); emit_switch_insn (sbr); } + else if (hsa_insn_br *br = dyn_cast (insn)) + emit_generic_branch_insn (br); else if (hsa_insn_arg_block *block = dyn_cast (insn)) emit_arg_block_insn (block); else if (hsa_insn_call *call = dyn_cast (insn)) @@ -2006,7 +2022,7 @@ hsa_brig_emit_function (void) prev_bb = bb; } perhaps_emit_branch (prev_bb, NULL); - ptr_to_fndir->nextModuleEntry = brig_code.total_size; + ptr_to_fndir->nextModuleEntry = lendian32 (brig_code.total_size); /* Fill up label references for all sbr instructions. */ if (switch_instructions) @@ -2225,11 +2241,6 @@ hsa_output_kernels (tree *host_func_table, tree *kernels) tree gridified_kernel_p_tree = build_int_cstu (boolean_type_node, gridified_kernel_p); unsigned count = 0; - - kernel_dependencies_vector_type - = build_array_type (build_pointer_type (char_type_node), - build_index_type (size_int (0))); - vec *kernel_dependencies_vec = NULL; if (hsa_decl_kernel_dependencies) { @@ -2279,6 +2290,7 @@ if (count > 0) { ASM_GENERATE_INTERNAL_LABEL (tmp_name, "__hsa_dependencies_list", i); + gcc_checking_assert (kernel_dependencies_vector_type); tree dependencies_list = build_decl (UNKNOWN_LOCATION, VAR_DECL, get_identifier (tmp_name), kernel_dependencies_vector_type); diff --git a/gcc/hsa-builtins.def b/gcc/hsa-builtins.def new file mode 100644 index 00000000000..cc0409e6b8a --- /dev/null +++ b/gcc/hsa-builtins.def @@ -0,0 +1,39 @@ +/* This file contains the definitions and documentation for the + HSA builtins used in the GNU compiler. + Copyright (C) 2005-2015 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* Before including this file, you should define a macro: + + DEF_HSA_BUILTIN (ENUM, NAME, TYPE, ATTRS) + + See builtins.def for details. */ + +/* The reason why these builtins are not in gcc/builtins.def is that the + Fortran front end does not include that file. */ + +DEF_HSA_BUILTIN (BUILT_IN_HSA_WORKGROUPID, "hsa_workgroupid", + BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST) +DEF_HSA_BUILTIN (BUILT_IN_HSA_WORKITEMID, "hsa_workitemid", + BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST) +DEF_HSA_BUILTIN (BUILT_IN_HSA_WORKITEMABSID, "hsa_workitemabsid", + BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST) +DEF_HSA_BUILTIN (BUILT_IN_HSA_GRIDSIZE, "hsa_gridsize", + BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST) +DEF_HSA_BUILTIN (BUILT_IN_HSA_CURRENTWORKGROUPSIZE, "hsa_currentworkgroupsize", + BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST) diff --git a/gcc/hsa-dump.c b/gcc/hsa-dump.c index aea95057645..813218b652d 100644 --- a/gcc/hsa-dump.c +++ b/gcc/hsa-dump.c @@ -621,6 +621,88 @@ hsa_m_atomicop_name (enum BrigAtomicOperation op) } } +/* Return textual name of a width specifier WIDTH. */ + +static const char * +hsa_width_specifier_name (BrigWidth8_t width) +{ + switch (width) + { + case BRIG_WIDTH_NONE: + return "none"; + case BRIG_WIDTH_1: + return "1"; + case BRIG_WIDTH_2: + return "2"; + case BRIG_WIDTH_4: + return "4"; + case BRIG_WIDTH_8: + return "8"; + case BRIG_WIDTH_16: + return "16"; + case BRIG_WIDTH_32: + return "32"; + case BRIG_WIDTH_64: + return "64"; + case BRIG_WIDTH_128: + return "128"; + case BRIG_WIDTH_256: + return "256"; + case BRIG_WIDTH_512: + return "512"; + case BRIG_WIDTH_1024: + return "1024"; + case BRIG_WIDTH_2048: + return "2048"; + case BRIG_WIDTH_4096: + return "4096"; + case BRIG_WIDTH_8192: + return "8192"; + case BRIG_WIDTH_16384: + return "16384"; + case BRIG_WIDTH_32768: + return "32768"; + case BRIG_WIDTH_65536: + return "65536"; + case BRIG_WIDTH_131072: + return "131072"; + case BRIG_WIDTH_262144: + return "262144"; + case BRIG_WIDTH_524288: + return "524288"; + case BRIG_WIDTH_1048576: + return "1048576"; + case BRIG_WIDTH_2097152: + return "2097152"; + case BRIG_WIDTH_4194304: + return "4194304"; + case BRIG_WIDTH_8388608: + return "8388608"; + case BRIG_WIDTH_16777216: + return "16777216"; + case BRIG_WIDTH_33554432: + return "33554432"; + case BRIG_WIDTH_67108864: + return "67108864"; + case BRIG_WIDTH_134217728: + return "134217728"; + case BRIG_WIDTH_268435456: + return "268435456"; + case BRIG_WIDTH_536870912: + return "536870912"; + case BRIG_WIDTH_1073741824: + return "1073741824"; + case BRIG_WIDTH_2147483648: + return "2147483648"; + case BRIG_WIDTH_WAVESIZE: + return "wavesize"; + case BRIG_WIDTH_ALL: + return "all"; + default: + return "UNKNOWN_WIDTH"; + } +} + /* Dump textual representation of HSA IL register REG to file F.
*/ static void @@ -793,9 +875,9 @@ dump_hsa_insn_1 (FILE *f, hsa_insn_basic *insn, int *indent) hsa_insn_signal *mem = as_a (insn); fprintf (f, "%s", hsa_opcode_name (mem->m_opcode)); - fprintf (f, "_%s", hsa_m_atomicop_name (mem->m_atomicop)); - if (mem->m_memoryorder != BRIG_MEMORY_ORDER_NONE) - fprintf (f, "_%s", hsa_memsem_name (mem->m_memoryorder)); + fprintf (f, "_%s", hsa_m_atomicop_name (mem->m_signalop)); + if (mem->m_memory_order != BRIG_MEMORY_ORDER_NONE) + fprintf (f, "_%s", hsa_memsem_name (mem->m_memory_order)); fprintf (f, "_%s ", hsa_type_name (mem->m_type)); dump_hsa_operands (f, mem); @@ -884,9 +966,9 @@ dump_hsa_insn_1 (FILE *f, hsa_insn_basic *insn, int *indent) fprintf (f, ", "); dump_hsa_operand (f, cmp->get_op (2)); } - else if (is_a (insn)) + else if (is_a (insn)) { - hsa_insn_br *br = as_a (insn); + hsa_insn_cbr *br = as_a (insn); basic_block target = NULL; edge_iterator ei; edge e; @@ -921,6 +1003,12 @@ dump_hsa_insn_1 (FILE *f, hsa_insn_basic *insn, int *indent) fprintf (f, ", "); } } + else if (is_a (insn)) + { + hsa_insn_br *br = as_a (insn); + fprintf (f, "%s_width(%s) ", hsa_opcode_name (br->m_opcode), + hsa_width_specifier_name (br->m_width)); + } else if (is_a (insn)) { hsa_insn_arg_block *arg_block = as_a (insn); @@ -1018,6 +1106,15 @@ dump_hsa_insn_1 (FILE *f, hsa_insn_basic *insn, int *indent) dump_hsa_operands (f, insn); } + else if (hsa_insn_queue *qi = dyn_cast (insn)) + { + fprintf (f, "%s_%s_%s_%s ", hsa_opcode_name (qi->m_opcode), + hsa_seg_name (qi->m_segment), + hsa_memsem_name (qi->m_memory_order), + hsa_type_name (qi->m_type)); + + dump_hsa_operands (f, qi); + } else { fprintf (f, "%s_%s ", hsa_opcode_name (insn->m_opcode), diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c index 21c35e6089f..a88294ecd4f 100644 --- a/gcc/hsa-gen.c +++ b/gcc/hsa-gen.c @@ -39,7 +39,6 @@ along with GCC; see the file COPYING3. If not see #include "dumpfile.h" #include "gimple-pretty-print.h" #include "diagnostic-core.h" -#include "alloc-pool.h" #include "gimple-ssa.h" #include "tree-phinodes.h" #include "stringpool.h" @@ -72,7 +71,7 @@ along with GCC; see the file COPYING3. If not see HSA_SORRY_MSG)) \ inform (location, message, __VA_ARGS__); \ } \ - while (false); + while (false) /* Same as previous, but highlight a location. */ @@ -84,7 +83,7 @@ along with GCC; see the file COPYING3. If not see HSA_SORRY_MSG)) \ inform (location, message); \ } \ - while (false); + while (false) /* Default number of threads used by kernel dispatch. */ @@ -127,31 +126,7 @@ struct hsa_queue uint64_t id; }; -/* Alloc pools for allocating basic hsa structures such as operands, - instructions and other basic entities. 
*/ -static object_allocator *hsa_allocp_operand_address; -static object_allocator *hsa_allocp_operand_immed; -static object_allocator *hsa_allocp_operand_reg; -static object_allocator *hsa_allocp_operand_code_list; -static object_allocator *hsa_allocp_operand_operand_list; -static object_allocator *hsa_allocp_inst_basic; -static object_allocator *hsa_allocp_inst_phi; -static object_allocator *hsa_allocp_inst_mem; -static object_allocator *hsa_allocp_inst_atomic; -static object_allocator *hsa_allocp_inst_signal; -static object_allocator *hsa_allocp_inst_seg; -static object_allocator *hsa_allocp_inst_cmp; -static object_allocator *hsa_allocp_inst_br; -static object_allocator *hsa_allocp_inst_sbr; -static object_allocator *hsa_allocp_inst_call; -static object_allocator *hsa_allocp_inst_arg_block; -static object_allocator *hsa_allocp_inst_comment; -static object_allocator *hsa_allocp_inst_queue; -static object_allocator *hsa_allocp_inst_srctype; -static object_allocator *hsa_allocp_inst_packed; -static object_allocator *hsa_allocp_inst_cvt; -static object_allocator *hsa_allocp_inst_alloca; -static object_allocator *hsa_allocp_bb; +static struct obstack hsa_obstack; /* List of pointers to all instructions that come from an object allocator. */ static vec hsa_instructions; @@ -486,52 +461,7 @@ static void hsa_init_data_for_cfun () { hsa_init_compilation_unit_data (); - hsa_allocp_operand_address - = new object_allocator ("HSA address operands"); - hsa_allocp_operand_immed - = new object_allocator ("HSA immediate operands"); - hsa_allocp_operand_reg - = new object_allocator ("HSA register operands"); - hsa_allocp_operand_code_list - = new object_allocator ("HSA code list operands"); - hsa_allocp_operand_operand_list - = new object_allocator ("HSA operand list operands"); - hsa_allocp_inst_basic - = new object_allocator ("HSA basic instructions"); - hsa_allocp_inst_phi - = new object_allocator ("HSA phi operands"); - hsa_allocp_inst_mem - = new object_allocator ("HSA memory instructions"); - hsa_allocp_inst_atomic - = new object_allocator ("HSA atomic instructions"); - hsa_allocp_inst_signal - = new object_allocator ("HSA signal instructions"); - hsa_allocp_inst_seg - = new object_allocator ("HSA segment conversion " - "instructions"); - hsa_allocp_inst_cmp - = new object_allocator ("HSA comparison instructions"); - hsa_allocp_inst_br - = new object_allocator ("HSA branching instructions"); - hsa_allocp_inst_sbr - = new object_allocator ("HSA switch branching instructions"); - hsa_allocp_inst_call - = new object_allocator ("HSA call instructions"); - hsa_allocp_inst_arg_block - = new object_allocator ("HSA arg block instructions"); - hsa_allocp_inst_comment - = new object_allocator ("HSA comment instructions"); - hsa_allocp_inst_queue - = new object_allocator ("HSA queue instructions"); - hsa_allocp_inst_srctype - = new object_allocator ("HSA source type instructions"); - hsa_allocp_inst_packed - = new object_allocator ("HSA packed instructions"); - hsa_allocp_inst_cvt - = new object_allocator ("HSA convert instructions"); - hsa_allocp_inst_alloca - = new object_allocator ("HSA alloca instructions"); - hsa_allocp_bb = new object_allocator ("HSA basic blocks"); + gcc_obstack_init (&hsa_obstack); } /* Deinitialize HSA subsystem and free all allocated memory. 
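   Because all operands and instructions are now carved out of hsa_obstack,
   the single obstack_free call below releases them all at once, which is
   what made the individual object allocators and the per-class delete
   operators unnecessary.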
*/ @@ -565,29 +495,7 @@ hsa_deinit_data_for_cfun (void) omp_simple_builtins = NULL; } - delete hsa_allocp_operand_address; - delete hsa_allocp_operand_immed; - delete hsa_allocp_operand_reg; - delete hsa_allocp_operand_code_list; - delete hsa_allocp_operand_operand_list; - delete hsa_allocp_inst_basic; - delete hsa_allocp_inst_phi; - delete hsa_allocp_inst_atomic; - delete hsa_allocp_inst_mem; - delete hsa_allocp_inst_signal; - delete hsa_allocp_inst_seg; - delete hsa_allocp_inst_cmp; - delete hsa_allocp_inst_br; - delete hsa_allocp_inst_sbr; - delete hsa_allocp_inst_call; - delete hsa_allocp_inst_arg_block; - delete hsa_allocp_inst_comment; - delete hsa_allocp_inst_queue; - delete hsa_allocp_inst_srctype; - delete hsa_allocp_inst_packed; - delete hsa_allocp_inst_cvt; - delete hsa_allocp_inst_alloca; - delete hsa_allocp_bb; + obstack_free (&hsa_obstack, NULL); delete hsa_cfun; } @@ -873,6 +781,49 @@ hsa_needs_cvt (BrigType16_t dtype, BrigType16_t stype) return false; } +/* Return declaration name if it exists or create one from UID if it does not. + If DECL is a local variable, make UID part of its name. */ + +const char * +hsa_get_declaration_name (tree decl) +{ + if (!DECL_NAME (decl)) + { + char buf[64]; + snprintf (buf, 64, "__hsa_anon_%u", DECL_UID (decl)); + size_t len = strlen (buf); + char *copy = (char *) obstack_alloc (&hsa_obstack, len + 1); + memcpy (copy, buf, len + 1); + return copy; + } + + tree name_tree; + if (TREE_CODE (decl) == FUNCTION_DECL + || (TREE_CODE (decl) == VAR_DECL && is_global_var (decl))) + name_tree = DECL_ASSEMBLER_NAME (decl); + else + name_tree = DECL_NAME (decl); + + const char *name = IDENTIFIER_POINTER (name_tree); + /* User-defined assembly names have prepended asterisk symbol. */ + if (name[0] == '*') + name++; + + if ((TREE_CODE (decl) == VAR_DECL) + && decl_function_context (decl)) + { + size_t len = strlen (name); + char *buf = (char *) alloca (len + 32); + snprintf (buf, len + 32, "%s_%u", name, DECL_UID (decl)); + len = strlen (buf); + char *copy = (char *) obstack_alloc (&hsa_obstack, len + 1); + memcpy (copy, buf, len + 1); + return copy; + } + else + return name; +} + /* Lookup or create the associated hsa_symbol structure with a given VAR_DECL or lookup the hsa_structure corresponding to a PARM_DECL. */ @@ -884,11 +835,13 @@ get_symbol_for_decl (tree decl) gcc_assert (TREE_CODE (decl) == PARM_DECL || TREE_CODE (decl) == RESULT_DECL - || VAR_P (decl)); + || TREE_CODE (decl) == VAR_DECL + || TREE_CODE (decl) == CONST_DECL); dummy.m_decl = decl; - bool is_in_global_vars = VAR_P (decl) && is_global_var (decl); + bool is_in_global_vars = ((TREE_CODE (decl) == VAR_DECL) + && !decl_function_context (decl)); if (is_in_global_vars) slot = hsa_global_variable_symbols->find_slot (&dummy, INSERT); @@ -925,11 +878,14 @@ get_symbol_for_decl (tree decl) else { hsa_symbol *sym; - gcc_assert (VAR_P (decl)); + /* PARM_DECLs and RESULT_DECL should be already in m_local_symbols. */ + gcc_assert (TREE_CODE (decl) == VAR_DECL + || TREE_CODE (decl) == CONST_DECL); BrigAlignment8_t align = hsa_object_alignment (decl); if (is_in_global_vars) { + gcc_checking_assert (TREE_CODE (decl) != CONST_DECL); sym = new hsa_symbol (BRIG_TYPE_NONE, BRIG_SEGMENT_GLOBAL, BRIG_LINKAGE_PROGRAM, true, BRIG_ALLOCATION_PROGRAM, align); @@ -951,12 +907,25 @@ get_symbol_for_decl (tree decl) if (AGGREGATE_TYPE_P (TREE_TYPE (decl))) align = MAX ((BrigAlignment8_t) BRIG_ALIGNMENT_8, align); - /* PARM_DECL and RESULT_DECL should be already in m_local_symbols. 
*/ - gcc_assert (VAR_P (decl)); + BrigAllocation allocation = BRIG_ALLOCATION_AUTOMATIC; + BrigSegment8_t segment; + if (TREE_CODE (decl) == CONST_DECL) + { + segment = BRIG_SEGMENT_READONLY; + allocation = BRIG_ALLOCATION_AGENT; + } + else if (lookup_attribute ("hsa_group_segment", + DECL_ATTRIBUTES (decl))) + segment = BRIG_SEGMENT_GROUP; + else if (TREE_STATIC (decl) + || lookup_attribute ("hsa_global_segment", + DECL_ATTRIBUTES (decl))) + segment = BRIG_SEGMENT_GLOBAL; + else + segment = BRIG_SEGMENT_PRIVATE; - sym = new hsa_symbol (BRIG_TYPE_NONE, BRIG_SEGMENT_PRIVATE, - BRIG_LINKAGE_FUNCTION); - sym->m_align = align; + sym = new hsa_symbol (BRIG_TYPE_NONE, segment, BRIG_LINKAGE_FUNCTION, + false, allocation, align); sym->fillup_for_decl (decl); hsa_cfun->m_private_variables.safe_push (sym); } @@ -978,7 +947,7 @@ hsa_get_host_function (tree decl) gcc_assert (s->m_kind != HSA_NONE); gcc_assert (s->m_gpu_implementation_p); - return s->m_binded_function->decl; + return s->m_bound_function ? s->m_bound_function->decl : NULL; } /* Return true if function DECL has a host equivalent function. */ @@ -989,8 +958,10 @@ get_brig_function_name (tree decl) tree d = decl; hsa_function_summary *s = hsa_summaries->get (cgraph_node::get_create (d)); - if (s->m_kind != HSA_NONE && s->m_gpu_implementation_p) - d = s->m_binded_function->decl; + if (s->m_kind != HSA_NONE + && s->m_gpu_implementation_p + && s->m_bound_function) + d = s->m_bound_function->decl; /* IPA split can create a function that has no host equivalent. */ if (d == NULL) @@ -1066,6 +1037,14 @@ hsa_op_with_type::get_in_type (BrigType16_t dtype, hsa_bb *hbb) dest = new hsa_op_reg (dtype); hbb->append_insn (new hsa_insn_cvt (dest, this)); } + else if (is_a (this)) + { + /* In the end, HSA registers do not really have types, only sizes, so if + the sizes match, we can use the register directly. */ + gcc_checking_assert (hsa_type_bit_size (dtype) + == hsa_type_bit_size (m_type)); + return this; + } else { dest = new hsa_op_reg (m_type); @@ -1128,12 +1107,12 @@ hsa_op_immed::hsa_op_immed () { } -/* New operator to allocate immediate operands from pool alloc. */ +/* New operator to allocate immediate operands from obstack. */ void * -hsa_op_immed::operator new (size_t) +hsa_op_immed::operator new (size_t size) { - return hsa_allocp_operand_immed->allocate_raw (); + return obstack_alloc (&hsa_obstack, size); } /* Destructor. */ @@ -1160,12 +1139,12 @@ hsa_op_reg::hsa_op_reg (BrigType16_t t) { } -/* New operator to allocate a register from pool alloc. */ +/* New operator to allocate a register from obstack. */ void * -hsa_op_reg::operator new (size_t) +hsa_op_reg::operator new (size_t size) { - return hsa_allocp_operand_reg->allocate_raw (); + return obstack_alloc (&hsa_obstack, size); } /* Verify register operand. */ @@ -1244,12 +1223,12 @@ hsa_op_address::hsa_op_address (hsa_op_reg *r, HOST_WIDE_INT offset) { } -/* New operator to allocate address operands from pool alloc. */ +/* New operator to allocate address operands from obstack. */ void * -hsa_op_address::operator new (size_t) +hsa_op_address::operator new (size_t size) { - return hsa_allocp_operand_address->allocate_raw (); + return obstack_alloc (&hsa_obstack, size); } /* Constructor of an operand referring to HSAIL code. */ @@ -1269,12 +1248,12 @@ hsa_op_code_list::hsa_op_code_list (unsigned elements) m_offsets.safe_grow_cleared (elements); } -/* New operator to allocate code list operands from pool alloc. */ +/* New operator to allocate code list operands from obstack. 
*/ void * -hsa_op_code_list::operator new (size_t) +hsa_op_code_list::operator new (size_t size) { - return hsa_allocp_operand_code_list->allocate_raw (); + return obstack_alloc (&hsa_obstack, size); } /* Constructor of an operand representing an operand list. @@ -1287,12 +1266,12 @@ hsa_op_operand_list::hsa_op_operand_list (unsigned elements) m_offsets.safe_grow (elements); } -/* New operator to allocate operand list operands from pool alloc. */ +/* New operator to allocate operand list operands from obstack. */ void * -hsa_op_operand_list::operator new (size_t) +hsa_op_operand_list::operator new (size_t size) { - return hsa_allocp_operand_operand_list->allocate_raw (); + return obstack_alloc (&hsa_obstack, size); } hsa_op_operand_list::~hsa_op_operand_list () @@ -1437,12 +1416,12 @@ hsa_insn_basic::hsa_insn_basic (unsigned nops, int opc, BrigType16_t t, hsa_instructions.safe_push (this); } -/* New operator to allocate basic instruction from pool alloc. */ +/* New operator to allocate basic instruction from obstack. */ void * -hsa_insn_basic::operator new (size_t) +hsa_insn_basic::operator new (size_t size) { - return hsa_allocp_inst_basic->allocate_raw (); + return obstack_alloc (&hsa_obstack, size); } /* Verify the instruction. */ @@ -1495,32 +1474,27 @@ hsa_insn_phi::hsa_insn_phi (unsigned nops, hsa_op_reg *dst) dst->set_definition (this); } -/* New operator to allocate PHI instruction from pool alloc. */ +/* Constructor of class representing instructions for control flow and + synchronization. */ -void * -hsa_insn_phi::operator new (size_t) +hsa_insn_br::hsa_insn_br (unsigned nops, int opc, BrigType16_t t, + BrigWidth8_t width, hsa_op_base *arg0, + hsa_op_base *arg1, hsa_op_base *arg2, + hsa_op_base *arg3) + : hsa_insn_basic (nops, opc, t, arg0, arg1, arg2, arg3), + m_width (width) { - return hsa_allocp_inst_phi->allocate_raw (); } /* Constructor of class representing instruction for conditional jump, CTRL is the control register determining whether the jump will be carried out, the new instruction is automatically added to its uses list. */ -hsa_insn_br::hsa_insn_br (hsa_op_reg *ctrl) - : hsa_insn_basic (1, BRIG_OPCODE_CBR, BRIG_TYPE_B1, ctrl), - m_width (BRIG_WIDTH_1) +hsa_insn_cbr::hsa_insn_cbr (hsa_op_reg *ctrl) + : hsa_insn_br (1, BRIG_OPCODE_CBR, BRIG_TYPE_B1, BRIG_WIDTH_1, ctrl) { } -/* New operator to allocate branch instruction from pool alloc. */ - -void * -hsa_insn_br::operator new (size_t) -{ - return hsa_allocp_inst_br->allocate_raw (); -} - /* Constructor of class representing instruction for switch jump, CTRL is the index register. */ @@ -1531,14 +1505,6 @@ hsa_insn_sbr::hsa_insn_sbr (hsa_op_reg *index, unsigned jump_count) { } -/* New operator to allocate switch branch instruction from pool alloc. */ - -void * -hsa_insn_sbr::operator new (size_t) -{ - return hsa_allocp_inst_sbr->allocate_raw (); -} - /* Replace all occurrences of OLD_BB with NEW_BB in the statements jump table. */ @@ -1565,14 +1531,6 @@ hsa_insn_cmp::hsa_insn_cmp (BrigCompareOperation8_t cmp, BrigType16_t t, { } -/* New operator to allocate compare instruction from pool alloc. */ - -void * -hsa_insn_cmp::operator new (size_t) -{ - return hsa_allocp_inst_cmp->allocate_raw (); -} - /* Constructor of classes representing memory accesses. OPC is the opcode (must be BRIG_OPCODE_ST or BRIG_OPCODE_LD) and T is the type. The instruction operands are provided as ARG0 and ARG1. 
*/ @@ -1598,18 +1556,9 @@ hsa_insn_mem::hsa_insn_mem (unsigned nops, int opc, BrigType16_t t, { } -/* New operator to allocate memory instruction from pool alloc. */ - -void * -hsa_insn_mem::operator new (size_t) -{ - return hsa_allocp_inst_mem->allocate_raw (); -} - -/* Constructor of class representing atomic instructions and signals. OPC is - the principal opcode, aop is the specific atomic operation opcode. T is the - type of the instruction. The instruction operands - are provided as ARG[0-3]. */ +/* Constructor of class representing atomic instructions. OPC is the principal + opcode, AOP is the specific atomic operation opcode. T is the type of the + instruction. The instruction operands are provided as ARG[0-3]. */ hsa_insn_atomic::hsa_insn_atomic (int nops, int opc, enum BrigAtomicOperation aop, @@ -1627,34 +1576,18 @@ hsa_insn_atomic::hsa_insn_atomic (int nops, int opc, opc == BRIG_OPCODE_SIGNALNORET); } -/* New operator to allocate signal instruction from pool alloc. */ - -void * -hsa_insn_atomic::operator new (size_t) -{ - return hsa_allocp_inst_atomic->allocate_raw (); -} - /* Constructor of class representing signal instructions. OPC is the prinicpal - opcode, sop is the specific signal operation opcode. T is the type of the + opcode, SOP is the specific signal operation opcode. T is the type of the instruction. The instruction operands are provided as ARG[0-3]. */ hsa_insn_signal::hsa_insn_signal (int nops, int opc, enum BrigAtomicOperation sop, - BrigType16_t t, hsa_op_base *arg0, - hsa_op_base *arg1, hsa_op_base *arg2, - hsa_op_base *arg3) - : hsa_insn_atomic (nops, opc, sop, t, BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE, - arg0, arg1, arg2, arg3) -{ -} - -/* New operator to allocate signal instruction from pool alloc. */ - -void * -hsa_insn_signal::operator new (size_t) + BrigType16_t t, BrigMemoryOrder memorder, + hsa_op_base *arg0, hsa_op_base *arg1, + hsa_op_base *arg2, hsa_op_base *arg3) + : hsa_insn_basic (nops, opc, t, arg0, arg1, arg2, arg3), + m_memory_order (memorder), m_signalop (sop) { - return hsa_allocp_inst_signal->allocate_raw (); } /* Constructor of class representing segment conversion instructions. OPC is @@ -1672,14 +1605,6 @@ hsa_insn_seg::hsa_insn_seg (int opc, BrigType16_t dest, BrigType16_t srct, gcc_checking_assert (opc == BRIG_OPCODE_STOF || opc == BRIG_OPCODE_FTOS); } -/* New operator to allocate address conversion instruction from pool alloc. */ - -void * -hsa_insn_seg::operator new (size_t) -{ - return hsa_allocp_inst_seg->allocate_raw (); -} - /* Constructor of class representing a call instruction. CALLEE is the tree representation of the function being called. */ @@ -1696,14 +1621,6 @@ hsa_insn_call::hsa_insn_call (hsa_internal_fn *fn) { } -/* New operator to allocate call instruction from pool alloc. */ - -void * -hsa_insn_call::operator new (size_t) -{ - return hsa_allocp_inst_call->allocate_raw (); -} - hsa_insn_call::~hsa_insn_call () { for (unsigned i = 0; i < m_input_args.length (); i++) @@ -1724,14 +1641,6 @@ hsa_insn_arg_block::hsa_insn_arg_block (BrigKind brig_kind, { } -/* New operator to allocate argument block instruction from pool alloc. */ - -void * -hsa_insn_arg_block::operator new (size_t) -{ - return hsa_allocp_inst_arg_block->allocate_raw (); -} - hsa_insn_comment::hsa_insn_comment (const char *s) : hsa_insn_basic (0, BRIG_KIND_DIRECTIVE_COMMENT) { @@ -1743,14 +1652,6 @@ hsa_insn_comment::hsa_insn_comment (const char *s) m_comment = buf; } -/* New operator to allocate comment instruction from pool alloc. 
*/ - -void * -hsa_insn_comment::operator new (size_t) -{ - return hsa_allocp_inst_comment->allocate_raw (); -} - hsa_insn_comment::~hsa_insn_comment () { gcc_checking_assert (m_comment); @@ -1759,17 +1660,14 @@ hsa_insn_comment::~hsa_insn_comment () } /* Constructor of class representing the queue instruction in HSAIL. */ -hsa_insn_queue::hsa_insn_queue (int nops, BrigOpcode opcode) - : hsa_insn_basic (nops, opcode, BRIG_TYPE_U64) -{ -} -/* New operator to allocate source type instruction from pool alloc. */ - -void * -hsa_insn_srctype::operator new (size_t) +hsa_insn_queue::hsa_insn_queue (int nops, int opcode, BrigSegment segment, + BrigMemoryOrder memory_order, + hsa_op_base *arg0, hsa_op_base *arg1, + hsa_op_base *arg2, hsa_op_base *arg3) + : hsa_insn_basic (nops, opcode, BRIG_TYPE_U64, arg0, arg1, arg2, arg3), + m_segment (segment), m_memory_order (memory_order) { - return hsa_allocp_inst_srctype->allocate_raw (); } /* Constructor of class representing the source type instruction in HSAIL. */ @@ -1782,14 +1680,6 @@ hsa_insn_srctype::hsa_insn_srctype (int nops, BrigOpcode opcode, m_source_type (srct) {} -/* New operator to allocate packed instruction from pool alloc. */ - -void * -hsa_insn_packed::operator new (size_t) -{ - return hsa_allocp_inst_packed->allocate_raw (); -} - /* Constructor of class representing the packed instruction in HSAIL. */ hsa_insn_packed::hsa_insn_packed (int nops, BrigOpcode opcode, @@ -1801,14 +1691,6 @@ hsa_insn_packed::hsa_insn_packed (int nops, BrigOpcode opcode, m_operand_list = new hsa_op_operand_list (nops - 1); } -/* New operator to allocate convert instruction from pool alloc. */ - -void * -hsa_insn_cvt::operator new (size_t) -{ - return hsa_allocp_inst_cvt->allocate_raw (); -} - /* Constructor of class representing the convert instruction in HSAIL. */ hsa_insn_cvt::hsa_insn_cvt (hsa_op_with_type *dest, hsa_op_with_type *src) @@ -1816,14 +1698,6 @@ hsa_insn_cvt::hsa_insn_cvt (hsa_op_with_type *dest, hsa_op_with_type *src) { } -/* New operator to allocate alloca from pool alloc. */ - -void * -hsa_insn_alloca::operator new (size_t) -{ - return hsa_allocp_inst_alloca->allocate_raw (); -} - /* Constructor of class representing the alloca in HSAIL. */ hsa_insn_alloca::hsa_insn_alloca (hsa_op_with_type *dest, @@ -1854,6 +1728,20 @@ hsa_bb::append_insn (hsa_insn_basic *insn) m_first_insn = insn; } +void +hsa_bb::append_phi (hsa_insn_phi *hphi) +{ + hphi->m_bb = m_bb; + + hphi->m_prev = m_last_phi; + hphi->m_next = NULL; + if (m_last_phi) + m_last_phi->m_next = hphi; + m_last_phi = hphi; + if (!m_first_phi) + m_first_phi = hphi; +} + /* Insert HSA instruction NEW_INSN immediately before an existing instruction OLD_INSN. */ @@ -2078,6 +1966,7 @@ gen_hsa_addr (tree ref, hsa_bb *hbb, HOST_WIDE_INT *output_bitsize = NULL, case PARM_DECL: case VAR_DECL: case RESULT_DECL: + case CONST_DECL: gcc_assert (!symbol); symbol = get_symbol_for_decl (ref); addrtype = hsa_get_segment_addr_type (symbol->m_segment); @@ -2295,6 +2184,34 @@ gen_hsa_addr_insns (tree val, hsa_op_reg *dest, hsa_bb *hbb) val = TREE_OPERAND (val, 0); addr = gen_hsa_addr (val, hbb); + if (TREE_CODE (val) == CONST_DECL + && is_gimple_reg_type (TREE_TYPE (val))) + { + gcc_assert (addr->m_symbol + && addr->m_symbol->m_segment == BRIG_SEGMENT_READONLY); + /* CONST_DECLs are in readonly segment which however does not have + addresses convertible to flat segments. So copy it to a private one + and take address of that. 
*/ + BrigType16_t csttype + = mem_type_for_type (hsa_type_for_scalar_tree_type (TREE_TYPE (val), + false)); + hsa_op_reg *r = new hsa_op_reg (csttype); + hbb->append_insn (new hsa_insn_mem (BRIG_OPCODE_LD, csttype, r, + new hsa_op_address (addr->m_symbol))); + hsa_symbol *copysym = hsa_cfun->create_hsa_temporary (csttype); + hbb->append_insn (new hsa_insn_mem (BRIG_OPCODE_ST, csttype, r, + new hsa_op_address (copysym))); + addr->m_symbol = copysym; + } + else if (addr->m_symbol && addr->m_symbol->m_segment == BRIG_SEGMENT_READONLY) + { + HSA_SORRY_ATV (EXPR_LOCATION (val), "support for HSA does " + "not implement taking addresses of complex " + "CONST_DECLs such as %E", val); + return; + } + + convert_addr_to_flat_segment (addr, dest, hbb); } @@ -2324,8 +2241,10 @@ hsa_reg_or_immed_for_gimple_op (tree op, hsa_bb *hbb) void hsa_build_append_simple_mov (hsa_op_reg *dest, hsa_op_base *src, hsa_bb *hbb) { - hsa_insn_basic *insn = new hsa_insn_basic (2, BRIG_OPCODE_MOV, dest->m_type, - dest, src); + /* Moves of packed data between registers need to adhere to the same type + rules as when dealing with memory. */ + BrigType16_t tp = mem_type_for_type (dest->m_type); + hsa_insn_basic *insn = new hsa_insn_basic (2, BRIG_OPCODE_MOV, tp, dest, src); if (hsa_op_reg *sreg = dyn_cast <hsa_op_reg *> (src)) gcc_assert (hsa_type_bit_size (dest->m_type) == hsa_type_bit_size (sreg->m_type)); @@ -3054,8 +2973,12 @@ gen_hsa_unary_operation (BrigOpcode opcode, hsa_op_reg *dest, if (opcode == BRIG_OPCODE_MOV && hsa_needs_cvt (dest->m_type, op1->m_type)) insn = new hsa_insn_cvt (dest, op1); else if (opcode == BRIG_OPCODE_FIRSTBIT || opcode == BRIG_OPCODE_LASTBIT) - insn = new hsa_insn_srctype (2, opcode, BRIG_TYPE_U32, op1->m_type, NULL, - op1); + { + BrigType16_t srctype = hsa_type_integer_p (op1->m_type) ? op1->m_type + : hsa_unsigned_type_for_type (op1->m_type); + insn = new hsa_insn_srctype (2, opcode, BRIG_TYPE_U32, srctype, NULL, + op1); + } else { insn = new hsa_insn_basic (2, opcode, dest->m_type, dest, op1); @@ -3169,6 +3092,23 @@ gen_hsa_insns_for_operation_assignment (gimple *assign, hsa_bb *hbb) case NEGATE_EXPR: opcode = BRIG_OPCODE_NEG; break; + case FMA_EXPR: + /* There is a native HSA instruction for scalar FMAs but not for vector + ones. 
*/ + if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE) + { + hsa_op_reg *dest + = hsa_cfun->reg_for_gimple_ssa (gimple_assign_lhs (assign)); + hsa_op_with_type *op1 = hsa_reg_or_immed_for_gimple_op (rhs1, hbb); + hsa_op_with_type *op2 = hsa_reg_or_immed_for_gimple_op (rhs2, hbb); + hsa_op_with_type *op3 = hsa_reg_or_immed_for_gimple_op (rhs3, hbb); + hsa_op_reg *tmp = new hsa_op_reg (dest->m_type); + gen_hsa_binary_operation (BRIG_OPCODE_MUL, tmp, op1, op2, hbb); + gen_hsa_binary_operation (BRIG_OPCODE_ADD, dest, tmp, op3, hbb); + return; + } + opcode = BRIG_OPCODE_MAD; + break; case MIN_EXPR: opcode = BRIG_OPCODE_MIN; break; @@ -3368,14 +3308,18 @@ gen_hsa_insns_for_operation_assignment (gimple *assign, hsa_bb *hbb) switch (rhs_class) { case GIMPLE_TERNARY_RHS: - gcc_unreachable (); + { + hsa_op_with_type *op3 = hsa_reg_or_immed_for_gimple_op (rhs3, hbb); + hsa_insn_basic *insn = new hsa_insn_basic (4, opcode, dest->m_type, dest, + op1, op2, op3); + hbb->append_insn (insn); + } return; - /* Fall through */ case GIMPLE_BINARY_RHS: gen_hsa_binary_operation (opcode, dest, op1, op2, hbb); break; - /* Fall through */ + case GIMPLE_UNARY_RHS: gen_hsa_unary_operation (opcode, dest, op1, hbb); break; @@ -3392,14 +3336,14 @@ static void gen_hsa_insns_for_cond_stmt (gimple *cond, hsa_bb *hbb) { hsa_op_reg *ctrl = new hsa_op_reg (BRIG_TYPE_B1); - hsa_insn_br *cbr; + hsa_insn_cbr *cbr; gen_hsa_cmp_insn_from_gimple (gimple_cond_code (cond), gimple_cond_lhs (cond), gimple_cond_rhs (cond), ctrl, hbb); - cbr = new hsa_insn_br (ctrl); + cbr = new hsa_insn_cbr (ctrl); hbb->append_insn (cbr); } @@ -3476,7 +3420,7 @@ gen_hsa_insns_for_switch_stmt (gswitch *s, hsa_bb *hbb) hbb->append_insn (new hsa_insn_basic (3, BRIG_OPCODE_AND, cmp_reg->m_type, cmp_reg, cmp1_reg, cmp2_reg)); - hbb->append_insn (new hsa_insn_br (cmp_reg)); + hbb->append_insn (new hsa_insn_cbr (cmp_reg)); tree default_label = gimple_switch_default_label (s); basic_block default_label_bb = label_to_block_fn (func, @@ -3537,13 +3481,14 @@ gen_hsa_insns_for_switch_stmt (gswitch *s, hsa_bb *hbb) static void verify_function_arguments (tree decl) { + tree type = TREE_TYPE (decl); if (DECL_STATIC_CHAIN (decl)) { HSA_SORRY_ATV (EXPR_LOCATION (decl), "HSA does not support nested functions: %D", decl); return; } - else if (!TYPE_ARG_TYPES (TREE_TYPE (decl))) + else if (!TYPE_ARG_TYPES (type) || stdarg_p (type)) { HSA_SORRY_ATV (EXPR_LOCATION (decl), "HSA does not support functions with variadic arguments " @@ -3839,33 +3784,58 @@ hsa_insn_basic::set_output_in_type (hsa_op_reg *dest, unsigned op_index, HBB. */ static void -query_hsa_grid (hsa_op_reg *dest, BrigType16_t opcode, int dimension, - hsa_bb *hbb) +query_hsa_grid_dim (hsa_op_reg *dest, int opcode, hsa_op_immed *dimension, + hsa_bb *hbb) { - /* We're using just one-dimensional kernels, so hard-coded - dimension X. */ - hsa_op_immed *imm - = new hsa_op_immed (dimension, (BrigKind16_t) BRIG_TYPE_U32); hsa_insn_basic *insn = new hsa_insn_basic (2, opcode, BRIG_TYPE_U32, NULL, - imm); + dimension); hbb->append_insn (insn); insn->set_output_in_type (dest, 0, hbb); } -/* Generate a special HSA-related instruction for gimple STMT. - Instructions are appended to basic block HBB. */ +/* Generate instruction OPCODE to query a property of HSA grid along the given + dimension which is an immediate in first argument of STMT. Store result + into the register corresponding to LHS of STMT and append the instruction to + HBB. 
*/ static void -query_hsa_grid (gimple *stmt, BrigOpcode16_t opcode, int dimension, - hsa_bb *hbb) +query_hsa_grid_dim (gimple *stmt, int opcode, hsa_bb *hbb) { tree lhs = gimple_call_lhs (dyn_cast <gcall *> (stmt)); if (lhs == NULL_TREE) return; + tree arg = gimple_call_arg (stmt, 0); + unsigned HOST_WIDE_INT dim = 5; + if (tree_fits_uhwi_p (arg)) + dim = tree_to_uhwi (arg); + if (dim > 2) + { + HSA_SORRY_AT (gimple_location (stmt), + "HSA grid query dimension must be immediate constant 0, 1 " + "or 2"); + return; + } + + hsa_op_immed *hdim = new hsa_op_immed (dim, (BrigKind16_t) BRIG_TYPE_U32); hsa_op_reg *dest = hsa_cfun->reg_for_gimple_ssa (lhs); + query_hsa_grid_dim (dest, opcode, hdim, hbb); +} + +/* Generate instruction OPCODE to query a property of HSA grid that is + independent of any dimension. Store result into the register corresponding + to LHS of STMT and append the instruction to HBB. */ - query_hsa_grid (dest, opcode, dimension, hbb); +static void +query_hsa_grid_nodim (gimple *stmt, BrigOpcode16_t opcode, hsa_bb *hbb) +{ + tree lhs = gimple_call_lhs (dyn_cast <gcall *> (stmt)); + if (lhs == NULL_TREE) + return; + hsa_op_reg *dest = hsa_cfun->reg_for_gimple_ssa (lhs); + BrigType16_t brig_type = hsa_unsigned_type_for_type (dest->m_type); + hsa_insn_basic *insn = new hsa_insn_basic (1, opcode, brig_type, dest); + hbb->append_insn (insn); } /* Emit instructions that set hsa_num_threads according to provided VALUE. @@ -4012,6 +3982,44 @@ gen_num_threads_for_dispatch (hsa_bb *hbb) return as_a <hsa_op_reg *> (dest); } +/* Build OPCODE query for all three hsa dimensions, multiply them and store the + result into DEST. */ + +static void +multiply_grid_dim_characteristics (hsa_op_reg *dest, int opcode, hsa_bb *hbb) +{ + hsa_op_reg *dimx = new hsa_op_reg (BRIG_TYPE_U32); + query_hsa_grid_dim (dimx, opcode, + new hsa_op_immed (0, (BrigKind16_t) BRIG_TYPE_U32), hbb); + hsa_op_reg *dimy = new hsa_op_reg (BRIG_TYPE_U32); + query_hsa_grid_dim (dimy, opcode, + new hsa_op_immed (1, (BrigKind16_t) BRIG_TYPE_U32), hbb); + hsa_op_reg *dimz = new hsa_op_reg (BRIG_TYPE_U32); + query_hsa_grid_dim (dimz, opcode, + new hsa_op_immed (2, (BrigKind16_t) BRIG_TYPE_U32), hbb); + hsa_op_reg *tmp = new hsa_op_reg (dest->m_type); + gen_hsa_binary_operation (BRIG_OPCODE_MUL, tmp, + dimx->get_in_type (dest->m_type, hbb), + dimy->get_in_type (dest->m_type, hbb), hbb); + gen_hsa_binary_operation (BRIG_OPCODE_MUL, dest, tmp, + dimz->get_in_type (dest->m_type, hbb), hbb); +} + +/* Emit instructions that assign number of threads to lhs of gimple STMT. + Instructions are appended to basic block HBB. */ + +static void +gen_get_num_threads (gimple *stmt, hsa_bb *hbb) +{ + if (gimple_call_lhs (stmt) == NULL_TREE) + return; + + hbb->append_insn (new hsa_insn_comment ("omp_get_num_threads")); + tree lhs = gimple_call_lhs (stmt); + hsa_op_reg *dest = hsa_cfun->reg_for_gimple_ssa (lhs); + multiply_grid_dim_characteristics (dest, BRIG_OPCODE_CURRENTWORKGROUPSIZE, + hbb); +} /* Emit instructions that assign number of teams to lhs of gimple STMT. Instructions are appended to basic block HBB. 
*/ @@ -4023,15 +4031,9 @@ gen_get_num_teams (gimple *stmt, hsa_bb *hbb) return; hbb->append_insn (new hsa_insn_comment ("omp_get_num_teams")); - tree lhs = gimple_call_lhs (stmt); hsa_op_reg *dest = hsa_cfun->reg_for_gimple_ssa (lhs); - hsa_op_immed *one = new hsa_op_immed (1, dest->m_type); - - hsa_insn_basic *basic - = new hsa_insn_basic (2, BRIG_OPCODE_MOV, dest->m_type, dest, one); - - hbb->append_insn (basic); + multiply_grid_dim_characteristics (dest, BRIG_OPCODE_GRIDGROUPS, hbb); } /* Emit instructions that assign a team number to lhs of gimple STMT. @@ -4044,15 +4046,42 @@ gen_get_team_num (gimple *stmt, hsa_bb *hbb) return; hbb->append_insn (new hsa_insn_comment ("omp_get_team_num")); - tree lhs = gimple_call_lhs (stmt); hsa_op_reg *dest = hsa_cfun->reg_for_gimple_ssa (lhs); - hsa_op_immed *zero = new hsa_op_immed (0, dest->m_type); - hsa_insn_basic *basic - = new hsa_insn_basic (2, BRIG_OPCODE_MOV, dest->m_type, dest, zero); - - hbb->append_insn (basic); + hsa_op_reg *gnum_x = new hsa_op_reg (BRIG_TYPE_U32); + query_hsa_grid_dim (gnum_x, BRIG_OPCODE_GRIDGROUPS, + new hsa_op_immed (0, (BrigKind16_t) BRIG_TYPE_U32), hbb); + hsa_op_reg *gnum_y = new hsa_op_reg (BRIG_TYPE_U32); + query_hsa_grid_dim (gnum_y, BRIG_OPCODE_GRIDGROUPS, + new hsa_op_immed (1, (BrigKind16_t) BRIG_TYPE_U32), hbb); + + hsa_op_reg *gno_z = new hsa_op_reg (BRIG_TYPE_U32); + query_hsa_grid_dim (gno_z, BRIG_OPCODE_WORKGROUPID, + new hsa_op_immed (2, (BrigKind16_t) BRIG_TYPE_U32), hbb); + + hsa_op_reg *tmp1 = new hsa_op_reg (dest->m_type); + gen_hsa_binary_operation (BRIG_OPCODE_MUL, tmp1, + gnum_x->get_in_type (dest->m_type, hbb), + gnum_y->get_in_type (dest->m_type, hbb), hbb); + hsa_op_reg *tmp2 = new hsa_op_reg (dest->m_type); + gen_hsa_binary_operation (BRIG_OPCODE_MUL, tmp2, tmp1, + gno_z->get_in_type (dest->m_type, hbb), hbb); + + hsa_op_reg *gno_y = new hsa_op_reg (BRIG_TYPE_U32); + query_hsa_grid_dim (gno_y, BRIG_OPCODE_WORKGROUPID, + new hsa_op_immed (1, (BrigKind16_t) BRIG_TYPE_U32), hbb); + hsa_op_reg *tmp3 = new hsa_op_reg (dest->m_type); + gen_hsa_binary_operation (BRIG_OPCODE_MUL, tmp3, + gnum_x->get_in_type (dest->m_type, hbb), + gno_y->get_in_type (dest->m_type, hbb), hbb); + hsa_op_reg *tmp4 = new hsa_op_reg (dest->m_type); + gen_hsa_binary_operation (BRIG_OPCODE_ADD, tmp4, tmp3, tmp2, hbb); + hsa_op_reg *gno_x = new hsa_op_reg (BRIG_TYPE_U32); + query_hsa_grid_dim (gno_x, BRIG_OPCODE_WORKGROUPID, + new hsa_op_immed (0, (BrigKind16_t) BRIG_TYPE_U32), hbb); + gen_hsa_binary_operation (BRIG_OPCODE_ADD, dest, tmp4, + gno_x->get_in_type (dest->m_type, hbb), hbb); } /* Emit instructions that get levels-var ICV to lhs of gimple STMT. 
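[Editor's note: the omp_get_num_threads, omp_get_num_teams and omp_get_team_num helpers rewritten above all reduce to the same grid arithmetic. A minimal scalar sketch, not part of the patch and with hypothetical helper names, of what the emitted HSAIL computes; counts[] stands for the per-dimension CURRENTWORKGROUPSIZE or GRIDGROUPS queries, gnum[]/gid[] for GRIDGROUPS and WORKGROUPID, with x the fastest-varying dimension:

  /* omp_get_num_threads / omp_get_num_teams: product over all three
     grid dimensions, as in multiply_grid_dim_characteristics.  */
  static unsigned
  model_dim_product (const unsigned counts[3])
  {
    return counts[0] * counts[1] * counts[2];
  }

  /* omp_get_team_num: flatten the 3D workgroup id into a scalar team
     number, as in gen_get_team_num.  */
  static unsigned
  model_team_num (const unsigned gnum[3], const unsigned gid[3])
  {
    return gid[0] + gnum[0] * gid[1] + gnum[0] * gnum[1] * gid[2];
  }
]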
@@ -4263,12 +4292,13 @@ gen_hsa_popcount_to_dest (hsa_op_reg *dest, hsa_op_with_type *arg, hsa_bb *hbb) if (hsa_type_bit_size (arg->m_type) < 32) arg = arg->get_in_type (BRIG_TYPE_B32, hbb); + BrigType16_t srctype = hsa_bittype_for_type (arg->m_type); if (!hsa_btype_p (arg->m_type)) - arg = arg->get_in_type (hsa_bittype_for_type (arg->m_type), hbb); + arg = arg->get_in_type (srctype, hbb); hsa_insn_srctype *popcount = new hsa_insn_srctype (2, BRIG_OPCODE_POPCOUNT, BRIG_TYPE_U32, - arg->m_type, NULL, arg); + srctype, NULL, arg); hbb->append_insn (popcount); popcount->set_output_in_type (dest, 0, hbb); } @@ -4339,11 +4369,11 @@ omp_simple_builtin::generate (gimple *stmt, hsa_bb *hbb) if (m_sorry) { if (m_warning_message) - HSA_SORRY_AT (gimple_location (stmt), m_warning_message) + HSA_SORRY_AT (gimple_location (stmt), m_warning_message); else HSA_SORRY_ATV (gimple_location (stmt), "Support for HSA does not implement calls to %s\n", - m_name) + m_name); } else if (m_warning_message != NULL) warning_at (gimple_location (stmt), OPT_Whsa, m_warning_message); @@ -4398,12 +4428,12 @@ gen_hsa_insns_for_known_library_call (gimple *stmt, hsa_bb *hbb) else if (strcmp (name, "omp_get_thread_num") == 0) { hbb->append_insn (new hsa_insn_comment (name)); - query_hsa_grid (stmt, BRIG_OPCODE_WORKITEMABSID, 0, hbb); + query_hsa_grid_nodim (stmt, BRIG_OPCODE_WORKITEMFLATABSID, hbb); } else if (strcmp (name, "omp_get_num_threads") == 0) { hbb->append_insn (new hsa_insn_comment (name)); - query_hsa_grid (stmt, BRIG_OPCODE_GRIDSIZE, 0, hbb); + gen_get_num_threads (stmt, hbb); } else if (strcmp (name, "omp_get_num_teams") == 0) gen_get_num_teams (stmt, hbb); @@ -4589,7 +4619,7 @@ expand_string_operation_builtin (gimple *stmt, hsa_bb *hbb, { edge e = split_block (hbb->m_bb, stmt); basic_block condition_bb = e->src; - hbb->append_insn (new hsa_insn_br (misaligned_flag)); + hbb->append_insn (new hsa_insn_cbr (misaligned_flag)); /* Prepare the control flow. */ edge condition_edge = EDGE_SUCC (condition_bb, 0); @@ -4718,95 +4748,86 @@ expand_memory_set (gimple *stmt, unsigned HOST_WIDE_INT n, expand_lhs_of_string_op (stmt, n, merge_bb, builtin); } -/* Return string for MEMMODEL. */ +/* Store into MEMORDER the memory order specified by tree T, which must be an + integer constant representing a C++ memory order. If it isn't, issue an HSA + sorry message using LOC and return true, otherwise return false and store + the name of the requested order to *MNAME. */ -static const char * -get_memory_order_name (unsigned memmodel) +static bool +hsa_memorder_from_tree (tree t, BrigMemoryOrder *memorder, const char **mname, + location_t loc) { - switch (memmodel & MEMMODEL_BASE_MASK) + if (!tree_fits_uhwi_p (t)) { - case MEMMODEL_RELAXED: - return "relaxed"; - case MEMMODEL_CONSUME: - return "consume"; - case MEMMODEL_ACQUIRE: - return "acquire"; - case MEMMODEL_RELEASE: - return "release"; - case MEMMODEL_ACQ_REL: - return "acq_rel"; - case MEMMODEL_SEQ_CST: - return "seq_cst"; - default: - return NULL; + HSA_SORRY_ATV (loc, "support for HSA does not implement memory model %E", + t); + return true; } -} - -/* Return memory order according to predefined __atomic memory model - constants. LOCATION is provided to locate the problematic statement. 
*/ -static BrigMemoryOrder -get_memory_order (unsigned memmodel, location_t location) -{ - switch (memmodel & MEMMODEL_BASE_MASK) + unsigned HOST_WIDE_INT mm = tree_to_uhwi (t); switch (mm & MEMMODEL_BASE_MASK) { case MEMMODEL_RELAXED: - return BRIG_MEMORY_ORDER_RELAXED; + *memorder = BRIG_MEMORY_ORDER_RELAXED; + *mname = "relaxed"; + break; case MEMMODEL_CONSUME: /* HSA does not have an equivalent, but we can use the slightly stronger ACQUIRE. */ + *memorder = BRIG_MEMORY_ORDER_SC_ACQUIRE; + *mname = "consume"; + break; case MEMMODEL_ACQUIRE: - return BRIG_MEMORY_ORDER_SC_ACQUIRE; + *memorder = BRIG_MEMORY_ORDER_SC_ACQUIRE; + *mname = "acquire"; + break; case MEMMODEL_RELEASE: - return BRIG_MEMORY_ORDER_SC_RELEASE; + *memorder = BRIG_MEMORY_ORDER_SC_RELEASE; + *mname = "release"; + break; case MEMMODEL_ACQ_REL: + *memorder = BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE; + *mname = "acq_rel"; + break; case MEMMODEL_SEQ_CST: /* Callers implementing a simple load or store need to remove the release or acquire part respectively. */ - return BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE; + *memorder = BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE; + *mname = "seq_cst"; + break; default: { - const char *mmname = get_memory_order_name (memmodel); - HSA_SORRY_ATV (location, - "support for HSA does not implement the specified " - " memory model%s %s", - mmname ? ": " : "", mmname ? mmname : ""); - return BRIG_MEMORY_ORDER_NONE; + HSA_SORRY_AT (loc, "support for HSA does not implement the specified " + "memory model"); + return true; } } + return false; } -/* Helper function to create an HSA atomic binary operation instruction out of - calls to atomic builtins. RET_ORIG is true if the built-in is the variant - that return s the value before applying operation, and false if it should - return the value after applying the operation (if it returns value at all). - ACODE is the atomic operation code, STMT is a gimple call to a builtin. HBB - is the HSA BB to which the instruction should be added. */ +/* Helper function to create an HSA atomic operation instruction out of calls + to atomic builtins. RET_ORIG is true if the built-in is the variant that + returns the value before applying the operation, and false if it should return + the value after applying the operation (if it returns value at all). ACODE + is the atomic operation code, STMT is a gimple call to a builtin. HBB is + the HSA BB to which the instruction should be added. If SIGNAL is true, the + created operation will work on HSA signals rather than atomic variables. */ static void -gen_hsa_ternary_atomic_for_builtin (bool ret_orig, - enum BrigAtomicOperation acode, - gimple *stmt, - hsa_bb *hbb) +gen_hsa_atomic_for_builtin (bool ret_orig, enum BrigAtomicOperation acode, + gimple *stmt, hsa_bb *hbb, bool signal) { tree lhs = gimple_call_lhs (stmt); tree type = TREE_TYPE (gimple_call_arg (stmt, 1)); BrigType16_t hsa_type = hsa_type_for_scalar_tree_type (type, false); BrigType16_t mtype = mem_type_for_type (hsa_type); - tree model = gimple_call_arg (stmt, 2); + BrigMemoryOrder memorder; + const char *mmname; - if (!tree_fits_uhwi_p (model)) - { - HSA_SORRY_ATV (gimple_location (stmt), - "support for HSA does not implement memory model %E", - model); - return; - } - - unsigned HOST_WIDE_INT mmodel = tree_to_uhwi (model); - - BrigMemoryOrder memorder = get_memory_order (mmodel, gimple_location (stmt)); + if (hsa_memorder_from_tree (gimple_call_arg (stmt, 2), &memorder, &mmname, + gimple_location (stmt))) + return; /* Certain atomic insns must have Bx memory types. 
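[Editor's note: a condensed model, not part of the patch, of the mapping the new hsa_memorder_from_tree helper implements; MEMMODEL_CONSUME is deliberately promoted to the slightly stronger acquire order, and non-constant or unknown models are rejected with a sorry:

  /* Sketch: C++ atomic memory model -> BRIG memory order.  */
  static BrigMemoryOrder
  model_memorder (unsigned HOST_WIDE_INT mm)
  {
    switch (mm & MEMMODEL_BASE_MASK)
      {
      case MEMMODEL_RELAXED: return BRIG_MEMORY_ORDER_RELAXED;
      case MEMMODEL_CONSUME: /* Promoted to acquire.  */
      case MEMMODEL_ACQUIRE: return BRIG_MEMORY_ORDER_SC_ACQUIRE;
      case MEMMODEL_RELEASE: return BRIG_MEMORY_ORDER_SC_RELEASE;
      case MEMMODEL_ACQ_REL:
      case MEMMODEL_SEQ_CST: return BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE;
      default: return BRIG_MEMORY_ORDER_NONE; /* Caller emits a sorry.  */
      }
  }
]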
*/ switch (acode) @@ -4831,13 +4852,13 @@ gen_hsa_ternary_atomic_for_builtin (bool ret_orig, dest = hsa_cfun->reg_for_gimple_ssa (lhs); else dest = new hsa_op_reg (hsa_type); - opcode = BRIG_OPCODE_ATOMIC; + opcode = signal ? BRIG_OPCODE_SIGNAL : BRIG_OPCODE_ATOMIC; nops = 3; } else { dest = NULL; - opcode = BRIG_OPCODE_ATOMICNORET; + opcode = signal ? BRIG_OPCODE_SIGNALNORET : BRIG_OPCODE_ATOMICNORET; nops = 2; } @@ -4852,35 +4873,44 @@ gen_hsa_ternary_atomic_for_builtin (bool ret_orig, { HSA_SORRY_ATV (gimple_location (stmt), "support for HSA does not implement memory model for " - "ATOMIC_ST: %s", get_memory_order_name (mmodel)); + "ATOMIC_ST: %s", mmname); return; } } - hsa_insn_atomic *atominsn = new hsa_insn_atomic (nops, opcode, acode, mtype, - memorder); - - hsa_op_address *addr; - addr = get_address_from_value (gimple_call_arg (stmt, 0), hbb); - if (addr->m_symbol && addr->m_symbol->m_segment == BRIG_SEGMENT_PRIVATE) + hsa_insn_basic *atominsn; + hsa_op_base *tgt; + if (signal) { - HSA_SORRY_AT (gimple_location (stmt), - "HSA does not implement atomic operations in private " - "segment"); - return; + atominsn = new hsa_insn_signal (nops, opcode, acode, mtype, memorder); + tgt = hsa_reg_or_immed_for_gimple_op (gimple_call_arg (stmt, 0), hbb); } + else + { + atominsn = new hsa_insn_atomic (nops, opcode, acode, mtype, memorder); + hsa_op_address *addr; + addr = get_address_from_value (gimple_call_arg (stmt, 0), hbb); + if (addr->m_symbol && addr->m_symbol->m_segment == BRIG_SEGMENT_PRIVATE) + { + HSA_SORRY_AT (gimple_location (stmt), + "HSA does not implement atomic operations in private " + "segment"); + return; + } + tgt = addr; + } + hsa_op_base *op = hsa_reg_or_immed_for_gimple_op (gimple_call_arg (stmt, 1), hbb); - if (lhs) { atominsn->set_op (0, dest); - atominsn->set_op (1, addr); + atominsn->set_op (1, tgt); atominsn->set_op (2, op); } else { - atominsn->set_op (0, addr); + atominsn->set_op (0, tgt); atominsn->set_op (1, op); } @@ -4950,6 +4980,10 @@ gen_hsa_insn_for_internal_fn_call (gcall *stmt, hsa_bb *hbb) gen_hsa_unaryop_for_builtin (BRIG_OPCODE_SQRT, stmt, hbb); break; + case IFN_RSQRT: + gen_hsa_unaryop_for_builtin (BRIG_OPCODE_NRSQRT, stmt, hbb); + break; + case IFN_TRUNC: gen_hsa_unaryop_for_builtin (BRIG_OPCODE_TRUNC, stmt, hbb); break; @@ -5068,6 +5102,12 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) if (!gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)) { tree function_decl = gimple_call_fndecl (stmt); + /* Prefetch pass can create type-mismatching prefetch builtin calls which + fail the gimple_call_builtin_p test above. Handle them here. 
*/ + if (DECL_BUILT_IN_CLASS (function_decl) + && DECL_FUNCTION_CODE (function_decl) == BUILT_IN_PREFETCH) + return; + if (function_decl == NULL_TREE) { HSA_SORRY_AT (gimple_location (stmt), @@ -5185,21 +5225,14 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_LOAD_16: { BrigType16_t mtype; - hsa_op_address *addr; - addr = get_address_from_value (gimple_call_arg (stmt, 0), hbb); - tree model = gimple_call_arg (stmt, 1); - if (!tree_fits_uhwi_p (model)) - { - HSA_SORRY_ATV (gimple_location (stmt), - "support for HSA does not implement " - "memory model: %E", - model); - return; - } + hsa_op_base *src; + src = get_address_from_value (gimple_call_arg (stmt, 0), hbb); - unsigned HOST_WIDE_INT mmodel = tree_to_uhwi (model); - BrigMemoryOrder memorder = get_memory_order (mmodel, - gimple_location (stmt)); + BrigMemoryOrder memorder; + const char *mmname; + if (hsa_memorder_from_tree (gimple_call_arg (stmt, 1), &memorder, + &mmname, gimple_location (stmt))) + return; if (memorder == BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE) memorder = BRIG_MEMORY_ORDER_SC_ACQUIRE; @@ -5210,8 +5243,7 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) { HSA_SORRY_ATV (gimple_location (stmt), "support for HSA does not implement " - "memory model for ATOMIC_LD: %s", - get_memory_order_name (mmodel)); + "memory model for atomic loads: %s", mmname); return; } @@ -5229,9 +5261,9 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) dest = new hsa_op_reg (mtype); } - hsa_insn_atomic *atominsn - = new hsa_insn_atomic (2, BRIG_OPCODE_ATOMIC, BRIG_ATOMIC_LD, mtype, - memorder, dest, addr); + hsa_insn_basic *atominsn; + atominsn = new hsa_insn_atomic (2, BRIG_OPCODE_ATOMIC, BRIG_ATOMIC_LD, + mtype, memorder, dest, src); hbb->append_insn (atominsn); break; @@ -5242,7 +5274,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_EXCHANGE_4: case BUILT_IN_ATOMIC_EXCHANGE_8: case BUILT_IN_ATOMIC_EXCHANGE_16: - gen_hsa_ternary_atomic_for_builtin (true, BRIG_ATOMIC_EXCH, stmt, hbb); + gen_hsa_atomic_for_builtin (true, BRIG_ATOMIC_EXCH, stmt, hbb, false); + break; break; case BUILT_IN_ATOMIC_FETCH_ADD_1: @@ -5250,7 +5283,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_FETCH_ADD_4: case BUILT_IN_ATOMIC_FETCH_ADD_8: case BUILT_IN_ATOMIC_FETCH_ADD_16: - gen_hsa_ternary_atomic_for_builtin (true, BRIG_ATOMIC_ADD, stmt, hbb); + gen_hsa_atomic_for_builtin (true, BRIG_ATOMIC_ADD, stmt, hbb, false); + break; break; case BUILT_IN_ATOMIC_FETCH_SUB_1: @@ -5258,7 +5292,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_FETCH_SUB_4: case BUILT_IN_ATOMIC_FETCH_SUB_8: case BUILT_IN_ATOMIC_FETCH_SUB_16: - gen_hsa_ternary_atomic_for_builtin (true, BRIG_ATOMIC_SUB, stmt, hbb); + gen_hsa_atomic_for_builtin (true, BRIG_ATOMIC_SUB, stmt, hbb, false); + break; break; case BUILT_IN_ATOMIC_FETCH_AND_1: @@ -5266,7 +5301,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_FETCH_AND_4: case BUILT_IN_ATOMIC_FETCH_AND_8: case BUILT_IN_ATOMIC_FETCH_AND_16: - gen_hsa_ternary_atomic_for_builtin (true, BRIG_ATOMIC_AND, stmt, hbb); + gen_hsa_atomic_for_builtin (true, BRIG_ATOMIC_AND, stmt, hbb, false); + break; break; case BUILT_IN_ATOMIC_FETCH_XOR_1: @@ -5274,7 +5310,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_FETCH_XOR_4: case BUILT_IN_ATOMIC_FETCH_XOR_8: case BUILT_IN_ATOMIC_FETCH_XOR_16: - gen_hsa_ternary_atomic_for_builtin (true, BRIG_ATOMIC_XOR, stmt, hbb); + gen_hsa_atomic_for_builtin (true, 
BRIG_ATOMIC_XOR, stmt, hbb, false); + break; break; case BUILT_IN_ATOMIC_FETCH_OR_1: @@ -5282,7 +5319,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_FETCH_OR_4: case BUILT_IN_ATOMIC_FETCH_OR_8: case BUILT_IN_ATOMIC_FETCH_OR_16: - gen_hsa_ternary_atomic_for_builtin (true, BRIG_ATOMIC_OR, stmt, hbb); + gen_hsa_atomic_for_builtin (true, BRIG_ATOMIC_OR, stmt, hbb, false); + break; break; case BUILT_IN_ATOMIC_STORE_1: @@ -5291,7 +5329,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_STORE_8: case BUILT_IN_ATOMIC_STORE_16: /* Since there cannot be any LHS, the first parameter is meaningless. */ - gen_hsa_ternary_atomic_for_builtin (true, BRIG_ATOMIC_ST, stmt, hbb); + gen_hsa_atomic_for_builtin (true, BRIG_ATOMIC_ST, stmt, hbb, false); + break; break; case BUILT_IN_ATOMIC_ADD_FETCH_1: @@ -5299,7 +5338,7 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_ADD_FETCH_4: case BUILT_IN_ATOMIC_ADD_FETCH_8: case BUILT_IN_ATOMIC_ADD_FETCH_16: - gen_hsa_ternary_atomic_for_builtin (false, BRIG_ATOMIC_ADD, stmt, hbb); + gen_hsa_atomic_for_builtin (false, BRIG_ATOMIC_ADD, stmt, hbb, false); break; case BUILT_IN_ATOMIC_SUB_FETCH_1: @@ -5307,7 +5346,7 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_SUB_FETCH_4: case BUILT_IN_ATOMIC_SUB_FETCH_8: case BUILT_IN_ATOMIC_SUB_FETCH_16: - gen_hsa_ternary_atomic_for_builtin (false, BRIG_ATOMIC_SUB, stmt, hbb); + gen_hsa_atomic_for_builtin (false, BRIG_ATOMIC_SUB, stmt, hbb, false); break; case BUILT_IN_ATOMIC_AND_FETCH_1: @@ -5315,7 +5354,7 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_AND_FETCH_4: case BUILT_IN_ATOMIC_AND_FETCH_8: case BUILT_IN_ATOMIC_AND_FETCH_16: - gen_hsa_ternary_atomic_for_builtin (false, BRIG_ATOMIC_AND, stmt, hbb); + gen_hsa_atomic_for_builtin (false, BRIG_ATOMIC_AND, stmt, hbb, false); break; case BUILT_IN_ATOMIC_XOR_FETCH_1: @@ -5323,7 +5362,7 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_XOR_FETCH_4: case BUILT_IN_ATOMIC_XOR_FETCH_8: case BUILT_IN_ATOMIC_XOR_FETCH_16: - gen_hsa_ternary_atomic_for_builtin (false, BRIG_ATOMIC_XOR, stmt, hbb); + gen_hsa_atomic_for_builtin (false, BRIG_ATOMIC_XOR, stmt, hbb, false); break; case BUILT_IN_ATOMIC_OR_FETCH_1: @@ -5331,7 +5370,7 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_ATOMIC_OR_FETCH_4: case BUILT_IN_ATOMIC_OR_FETCH_8: case BUILT_IN_ATOMIC_OR_FETCH_16: - gen_hsa_ternary_atomic_for_builtin (false, BRIG_ATOMIC_OR, stmt, hbb); + gen_hsa_atomic_for_builtin (false, BRIG_ATOMIC_OR, stmt, hbb, false); break; case BUILT_IN_SYNC_VAL_COMPARE_AND_SWAP_1: @@ -5340,27 +5379,23 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) case BUILT_IN_SYNC_VAL_COMPARE_AND_SWAP_8: case BUILT_IN_SYNC_VAL_COMPARE_AND_SWAP_16: { - /* TODO: Use the appropriate memory model for now. 
*/ tree type = TREE_TYPE (gimple_call_arg (stmt, 1)); - BrigType16_t atype = hsa_bittype_for_type (hsa_type_for_scalar_tree_type (type, false)); - - hsa_insn_atomic *atominsn - = new hsa_insn_atomic (4, BRIG_OPCODE_ATOMIC, BRIG_ATOMIC_CAS, atype, - BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE); - hsa_op_address *addr; - addr = get_address_from_value (gimple_call_arg (stmt, 0), hbb); + BrigMemoryOrder memorder = BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE; + hsa_insn_basic *atominsn; + hsa_op_base *tgt; + atominsn = new hsa_insn_atomic (4, BRIG_OPCODE_ATOMIC, + BRIG_ATOMIC_CAS, atype, memorder); + tgt = get_address_from_value (gimple_call_arg (stmt, 0), hbb); if (lhs != NULL) dest = hsa_cfun->reg_for_gimple_ssa (lhs); else dest = new hsa_op_reg (atype); - /* Should check what the memory scope is. */ - atominsn->m_memoryscope = BRIG_MEMORY_SCOPE_WORKGROUP; atominsn->set_op (0, dest); - atominsn->set_op (1, addr); + atominsn->set_op (1, tgt); hsa_op_with_type *op = hsa_reg_or_immed_for_gimple_op (gimple_call_arg (stmt, 1), hbb); @@ -5371,20 +5406,42 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) hbb->append_insn (atominsn); break; } + + case BUILT_IN_HSA_WORKGROUPID: + query_hsa_grid_dim (stmt, BRIG_OPCODE_WORKGROUPID, hbb); + break; + case BUILT_IN_HSA_WORKITEMID: + query_hsa_grid_dim (stmt, BRIG_OPCODE_WORKITEMID, hbb); + break; + case BUILT_IN_HSA_WORKITEMABSID: + query_hsa_grid_dim (stmt, BRIG_OPCODE_WORKITEMABSID, hbb); + break; + case BUILT_IN_HSA_GRIDSIZE: + query_hsa_grid_dim (stmt, BRIG_OPCODE_GRIDSIZE, hbb); + break; + case BUILT_IN_HSA_CURRENTWORKGROUPSIZE: + query_hsa_grid_dim (stmt, BRIG_OPCODE_CURRENTWORKGROUPSIZE, hbb); + break; + + case BUILT_IN_GOMP_BARRIER: + hbb->append_insn (new hsa_insn_br (0, BRIG_OPCODE_BARRIER, BRIG_TYPE_NONE, + BRIG_WIDTH_ALL)); + break; case BUILT_IN_GOMP_PARALLEL: HSA_SORRY_AT (gimple_location (stmt), "support for HSA does not implement non-gridified " "OpenMP parallel constructs."); break; + case BUILT_IN_OMP_GET_THREAD_NUM: { - query_hsa_grid (stmt, BRIG_OPCODE_WORKITEMABSID, 0, hbb); + query_hsa_grid_nodim (stmt, BRIG_OPCODE_WORKITEMFLATABSID, hbb); break; } case BUILT_IN_OMP_GET_NUM_THREADS: { - query_hsa_grid (stmt, BRIG_OPCODE_GRIDSIZE, 0, hbb); + gen_get_num_threads (stmt, hbb); break; } case BUILT_IN_GOMP_TEAMS: @@ -5469,9 +5526,19 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) gen_hsa_alloca (call, hbb); break; } + case BUILT_IN_PREFETCH: + break; default: { - gen_hsa_insns_for_direct_call (stmt, hbb); + tree name_tree = DECL_NAME (fndecl); + const char *s = IDENTIFIER_POINTER (name_tree); + size_t len = strlen (s); + if (len > 4 && (strncmp (s, "__builtin_GOMP_", 15) == 0)) + HSA_SORRY_ATV (gimple_location (stmt), + "support for HSA does not implement GOMP function %s", + s); + else + gen_hsa_insns_for_direct_call (stmt, hbb); return; } } @@ -5601,13 +5668,7 @@ gen_hsa_phi_from_gimple_phi (gimple *phi_stmt, hsa_bb *hbb) } } - hphi->m_prev = hbb->m_last_phi; - hphi->m_next = NULL; - if (hbb->m_last_phi) - hbb->m_last_phi->m_next = hphi; - hbb->m_last_phi = hphi; - if (!hbb->m_first_phi) - hbb->m_first_phi = hphi; + hbb->append_phi (hphi); } /* Constructor of class containing HSA-specific information about a basic @@ -5650,7 +5711,8 @@ hsa_bb::~hsa_bb () hsa_bb * hsa_init_new_bb (basic_block bb) { - return new (*hsa_allocp_bb) hsa_bb (bb); + void *m = obstack_alloc (&hsa_obstack, sizeof (hsa_bb)); + return new (m) hsa_bb (bb); } /* Initialize OMP in an HSA basic block PROLOGUE. 
*/ diff --git a/gcc/hsa.c b/gcc/hsa.c index 168cfe373c0..f881e781742 100644 --- a/gcc/hsa.c +++ b/gcc/hsa.c @@ -170,6 +170,7 @@ hsa_insn_basic::op_output_p (unsigned opnum) case BRIG_OPCODE_SBR: case BRIG_OPCODE_ST: case BRIG_OPCODE_SIGNALNORET: + case BRIG_OPCODE_DEBUGTRAP: /* FIXME: There are probably missing cases here, double check. */ return false; case BRIG_OPCODE_EXPAND: @@ -605,8 +606,8 @@ hsa_destroy_insn (hsa_insn_basic *insn) { if (hsa_insn_phi *phi = dyn_cast <hsa_insn_phi *> (insn)) phi->~hsa_insn_phi (); - else if (hsa_insn_br *br = dyn_cast <hsa_insn_br *> (insn)) - br->~hsa_insn_br (); + else if (hsa_insn_cbr *br = dyn_cast <hsa_insn_cbr *> (insn)) + br->~hsa_insn_cbr (); else if (hsa_insn_cmp *cmp = dyn_cast <hsa_insn_cmp *> (insn)) cmp->~hsa_insn_cmp (); else if (hsa_insn_mem *mem = dyn_cast <hsa_insn_mem *> (insn)) @@ -621,6 +622,8 @@ hsa_destroy_insn (hsa_insn_basic *insn) block->~hsa_insn_arg_block (); else if (hsa_insn_sbr *sbr = dyn_cast <hsa_insn_sbr *> (insn)) sbr->~hsa_insn_sbr (); + else if (hsa_insn_br *br = dyn_cast <hsa_insn_br *> (insn)) + br->~hsa_insn_br (); else if (hsa_insn_comment *comment = dyn_cast <hsa_insn_comment *> (insn)) comment->~hsa_insn_comment (); else @@ -783,32 +786,22 @@ hsa_brig_function_name (const char *p) return buf; } -/* Return declaration name if exists. */ +/* Add a flatten attribute and disable vectorization for gpu implementation + function decl GDECL. */ -const char * -hsa_get_declaration_name (tree decl) +void hsa_summary_t::process_gpu_implementation_attributes (tree gdecl) { - if (!DECL_NAME (decl)) - { - char buf[64]; - snprintf (buf, 64, "__hsa_anonymous_%i", DECL_UID (decl)); - const char *ggc_str = ggc_strdup (buf); - return ggc_str; - } - - tree name_tree; - if (TREE_CODE (decl) == FUNCTION_DECL - || (VAR_P (decl) && is_global_var (decl))) - name_tree = DECL_ASSEMBLER_NAME (decl); - else - name_tree = DECL_NAME (decl); - - const char *name = IDENTIFIER_POINTER (name_tree); - /* User-defined assembly names have prepended asterisk symbol. */ - if (name[0] == '*') - name++; + DECL_ATTRIBUTES (gdecl) + = tree_cons (get_identifier ("flatten"), NULL_TREE, + DECL_ATTRIBUTES (gdecl)); - return name; + tree fn_opts = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (gdecl); + if (fn_opts == NULL_TREE) + fn_opts = optimization_default_node; + fn_opts = copy_node (fn_opts); + TREE_OPTIMIZATION (fn_opts)->x_flag_tree_loop_vectorize = false; + TREE_OPTIMIZATION (fn_opts)->x_flag_tree_slp_vectorize = false; + DECL_FUNCTION_SPECIFIC_OPTIMIZATION (gdecl) = fn_opts; } void @@ -827,21 +820,10 @@ hsa_summary_t::link_functions (cgraph_node *gpu, cgraph_node *host, gpu_summary->m_gridified_kernel_p = gridified_kernel_p; host_summary->m_gridified_kernel_p = gridified_kernel_p; - gpu_summary->m_binded_function = host; - host_summary->m_binded_function = gpu; - - tree gdecl = gpu->decl; - DECL_ATTRIBUTES (gdecl) - = tree_cons (get_identifier ("flatten"), NULL_TREE, - DECL_ATTRIBUTES (gdecl)); + gpu_summary->m_bound_function = host; + host_summary->m_bound_function = gpu; - tree fn_opts = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (gdecl); - if (fn_opts == NULL_TREE) - fn_opts = optimization_default_node; - fn_opts = copy_node (fn_opts); - TREE_OPTIMIZATION (fn_opts)->x_flag_tree_loop_vectorize = false; - TREE_OPTIMIZATION (fn_opts)->x_flag_tree_slp_vectorize = false; - DECL_FUNCTION_SPECIFIC_OPTIMIZATION (gdecl) = fn_opts; + process_gpu_implementation_attributes (gpu->decl); /* Create reference between a kernel and a corresponding host implementation to guarantee LTO streaming to the same LTRANS. 
*/ diff --git a/gcc/hsa.h b/gcc/hsa.h index 1b57a3c1c99..c00ffd577fa 100644 --- a/gcc/hsa.h +++ b/gcc/hsa.h @@ -50,7 +50,6 @@ class hsa_insn_basic; class hsa_op_address; class hsa_op_reg; class hsa_bb; -typedef hsa_insn_basic *hsa_insn_basic_p; /* Class representing an input argument, output argument (result) or a variable, that will eventually end up being a symbol directive. */ @@ -72,7 +71,8 @@ struct hsa_symbol void fillup_for_decl (tree decl); /* Pointer to the original tree, which is PARM_DECL for input parameters and - RESULT_DECL for the output parameters. */ + RESULT_DECL for the output parameters. Also can be CONST_DECL for Fortran + constants which need to be put into readonly segment. */ tree m_decl; /* Name of the symbol, that will be written into output and dumps. Can be @@ -259,11 +259,9 @@ private: /* Set definition where the register is defined. */ void set_definition (hsa_insn_basic *insn); /* Uses of the value while still in SSA. */ - auto_vec <hsa_insn_basic_p> m_uses; + auto_vec <hsa_insn_basic *> m_uses; }; -typedef class hsa_op_reg *hsa_op_reg_p; - /* Report whether or not P is a register operand. */ template <> @@ -490,17 +488,12 @@ class hsa_insn_phi : public hsa_insn_basic public: hsa_insn_phi (unsigned nops, hsa_op_reg *dst); - void *operator new (size_t); - /* Destination. */ hsa_op_reg *m_dest; private: /* Make the default constructor inaccessible. */ hsa_insn_phi () : hsa_insn_basic (1, HSA_OPCODE_PHI) {} - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} }; /* Report whether or not P is a PHI node. */ @@ -513,35 +506,56 @@ is_a_helper <hsa_insn_phi *>::test (hsa_insn_basic *p) { return p->m_opcode == HSA_OPCODE_PHI; } -/* HSA instruction for branches. Currently we explicitely represent only - conditional branches. */ - +/* HSA instruction for control flow and synchronization. */ class hsa_insn_br : public hsa_insn_basic { public: - hsa_insn_br (hsa_op_reg *ctrl); - - void *operator new (size_t); + hsa_insn_br (unsigned nops, int opc, BrigType16_t t, BrigWidth8_t width, + hsa_op_base *arg0 = NULL, hsa_op_base *arg1 = NULL, + hsa_op_base *arg2 = NULL, hsa_op_base *arg3 = NULL); - /* Width as described in HSA documentation. */ + /* Number of work-items affected in the same way by the instruction. */ BrigWidth8_t m_width; + private: /* Make the default constructor inaccessible. */ - hsa_insn_br () : hsa_insn_basic (1, BRIG_OPCODE_CBR) {} - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} + hsa_insn_br () : hsa_insn_basic (0, BRIG_OPCODE_BR) {} }; -/* Report whether P is a branching instruction. */ +/* Return true if P is a branching/synchronization instruction. */ template <> template <> inline bool is_a_helper <hsa_insn_br *>::test (hsa_insn_basic *p) { - return p->m_opcode == BRIG_OPCODE_BR - || p->m_opcode == BRIG_OPCODE_CBR; + return p->m_opcode == BRIG_OPCODE_BARRIER + || p->m_opcode == BRIG_OPCODE_BR; +} + +/* HSA instruction for conditional branches. Structurally the same as + hsa_insn_br but we represent it specially because of the inherent control + flow it represents. */ + +class hsa_insn_cbr : public hsa_insn_br +{ +public: + hsa_insn_cbr (hsa_op_reg *ctrl); + +private: + /* Make the default constructor inaccessible. */ + hsa_insn_cbr () : hsa_insn_br (0, BRIG_OPCODE_CBR, BRIG_TYPE_B1, + BRIG_WIDTH_1) {} +}; + +/* Report whether P is a conditional branching instruction. 
*/ + +template <> +template <> +inline bool +is_a_helper <hsa_insn_cbr *>::test (hsa_insn_basic *p) +{ + return p->m_opcode == BRIG_OPCODE_CBR; } /* HSA instruction for switch branches. */ @@ -554,8 +568,6 @@ public: /* Default destructor. */ ~hsa_insn_sbr (); - void *operator new (size_t); - void replace_all_labels (basic_block old_bb, basic_block new_bb); /* Width as described in HSA documentation. */ @@ -570,9 +582,6 @@ public: private: /* Make the default constructor inaccessible. */ hsa_insn_sbr () : hsa_insn_basic (1, BRIG_OPCODE_SBR) {} - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} }; /* Report whether P is a switch branching instruction. */ @@ -594,8 +603,6 @@ public: hsa_op_base *arg0 = NULL, hsa_op_base *arg1 = NULL, hsa_op_base *arg2 = NULL); - void *operator new (size_t); - /* Source type should be derived from operand types. */ /* The comparison operation. */ @@ -606,9 +613,6 @@ public: private: /* Make the default constructor inaccessible. */ hsa_insn_cmp () : hsa_insn_basic (1, BRIG_OPCODE_CMP) {} - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} }; /* Report whether or not P is a comparison instruction. */ @@ -628,8 +632,6 @@ class hsa_insn_mem : public hsa_insn_basic public: hsa_insn_mem (int opc, BrigType16_t t, hsa_op_base *arg0, hsa_op_base *arg1); - void *operator new (size_t); - /* Set alignment to VALUE. */ void set_align (BrigAlignment8_t value); @@ -652,9 +654,6 @@ protected: private: /* Make the default constructor inaccessible. */ hsa_insn_mem () : hsa_insn_basic (1, BRIG_OPCODE_LD) {} - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} }; /* Report whether or not P is a memory instruction. */ @@ -677,7 +676,6 @@ public: BrigType16_t t, BrigMemoryOrder memorder, hsa_op_base *arg0 = NULL, hsa_op_base *arg1 = NULL, hsa_op_base *arg2 = NULL, hsa_op_base *arg3 = NULL); - void *operator new (size_t); /* The operation itself. */ enum BrigAtomicOperation m_atomicop; @@ -691,9 +689,6 @@ public: private: /* Make the default constructor inaccessible. */ hsa_insn_atomic () : hsa_insn_mem (1, BRIG_KIND_NONE, BRIG_TYPE_NONE) {} - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} }; /* Report whether or not P is an atomic instruction. */ @@ -709,20 +704,19 @@ is_a_helper <hsa_insn_atomic *>::test (hsa_insn_basic *p) /* HSA instruction for signal operations. */ -class hsa_insn_signal : public hsa_insn_atomic +class hsa_insn_signal : public hsa_insn_basic { public: hsa_insn_signal (int nops, int opc, enum BrigAtomicOperation sop, - BrigType16_t t, hsa_op_base *arg0 = NULL, - hsa_op_base *arg1 = NULL, + BrigType16_t t, BrigMemoryOrder memorder, + hsa_op_base *arg0 = NULL, hsa_op_base *arg1 = NULL, hsa_op_base *arg2 = NULL, hsa_op_base *arg3 = NULL); - void *operator new (size_t); + /* Things like acquire/release/aligned. */ + enum BrigMemoryOrder m_memory_order; -private: - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} + /* The operation itself. */ + enum BrigAtomicOperation m_signalop; }; /* Report whether or not P is a signal instruction. 
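[Editor's note: with hsa_insn_signal now a direct descendant of hsa_insn_basic, the memory order becomes an explicit constructor argument instead of the acquire-release order that was hard-wired by the old hsa_insn_atomic base. A hedged usage sketch mirroring gen_hsa_atomic_for_builtin later in this patch; mtype, dest, handle and value are placeholder operands:

  /* Sketch: atomic add on an HSA signal, returning the old value.  */
  hsa_insn_signal *sig
    = new hsa_insn_signal (3, BRIG_OPCODE_SIGNAL, BRIG_ATOMIC_ADD, mtype,
                           BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE,
                           dest, handle, value);
  hbb->append_insn (sig);
]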
*/ @@ -744,8 +738,6 @@ public: hsa_insn_seg (int opc, BrigType16_t destt, BrigType16_t srct, BrigSegment8_t seg, hsa_op_base *arg0, hsa_op_base *arg1); - void *operator new (size_t); - /* Source type. Depends on the source addressing/segment. */ BrigType16_t m_src_type; /* The segment we are converting from or to. */ @@ -753,9 +745,6 @@ public: private: /* Make the default constructor inaccessible. */ hsa_insn_seg () : hsa_insn_basic (1, BRIG_OPCODE_STOF) {} - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} }; /* Report whether or not P is a segment conversion instruction. */ @@ -812,8 +801,6 @@ public: /* Default destructor. */ ~hsa_insn_call (); - void *operator new (size_t); - /* Called function. */ tree m_called_function; @@ -840,9 +827,6 @@ public: private: /* Make the default constructor inaccessible. */ hsa_insn_call () : hsa_insn_basic (0, BRIG_OPCODE_CALL) {} - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} }; /* Report whether or not P is a call instruction. */ @@ -866,17 +850,11 @@ class hsa_insn_arg_block : public hsa_insn_basic public: hsa_insn_arg_block (BrigKind brig_kind, hsa_insn_call * call); - void *operator new (size_t); - /* Kind of argument block. */ BrigKind m_kind; /* Call instruction. */ hsa_insn_call *m_call_insn; -private: - /* All objects are deallocated by destroying their pool, so make delete - inaccessible too. */ - void operator delete (void *) {} }; /* Report whether or not P is a call block instruction. */ @@ -900,8 +878,6 @@ public: /* Default destructor. */ ~hsa_insn_comment (); - void *operator new (size_t); - char *m_comment; }; @@ -920,10 +896,18 @@ is_a_helper <hsa_insn_comment *>::test (hsa_insn_basic *p) class hsa_insn_queue: public hsa_insn_basic { public: - hsa_insn_queue (int nops, BrigOpcode opcode); + hsa_insn_queue (int nops, int opcode, BrigSegment segment, + BrigMemoryOrder memory_order, + hsa_op_base *arg0 = NULL, hsa_op_base *arg1 = NULL, + hsa_op_base *arg2 = NULL, hsa_op_base *arg3 = NULL); /* Destructor. */ ~hsa_insn_queue (); + + /* Segment used to refer to the queue. Must be global or flat. */ + BrigSegment m_segment; + /* Memory order used to specify synchronization. */ + BrigMemoryOrder m_memory_order; }; /* Report whether or not P is a queue instruction. */ @@ -933,7 +917,12 @@ template <> inline bool is_a_helper <hsa_insn_queue *>::test (hsa_insn_basic *p) { - return (p->m_opcode == BRIG_OPCODE_ADDQUEUEWRITEINDEX); + return (p->m_opcode == BRIG_OPCODE_ADDQUEUEWRITEINDEX + || p->m_opcode == BRIG_OPCODE_CASQUEUEWRITEINDEX + || p->m_opcode == BRIG_OPCODE_LDQUEUEREADINDEX + || p->m_opcode == BRIG_OPCODE_LDQUEUEWRITEINDEX + || p->m_opcode == BRIG_OPCODE_STQUEUEREADINDEX + || p->m_opcode == BRIG_OPCODE_STQUEUEWRITEINDEX); } /* HSA source type instruction. */ @@ -945,9 +934,6 @@ public: BrigType16_t srct, hsa_op_base *arg0, hsa_op_base *arg1, hsa_op_base *arg2); - /* Pool allocator. */ - void *operator new (size_t); - /* Source type. */ BrigType16_t m_source_type; @@ -976,9 +962,6 @@ public: BrigType16_t srct, hsa_op_base *arg0, hsa_op_base *arg1, hsa_op_base *arg2); - /* Pool allocator. */ - void *operator new (size_t); - /* Operand list for an operand of the instruction. */ hsa_op_operand_list *m_operand_list; @@ -1003,9 +986,6 @@ class hsa_insn_cvt: public hsa_insn_basic { public: hsa_insn_cvt (hsa_op_with_type *dest, hsa_op_with_type *src); - - /* Pool allocator. 
*/ - void *operator new (size_t); }; /* Report whether or not P is a convert instruction. */ @@ -1028,9 +1008,6 @@ public: /* Required alignment of the allocation. */ BrigAlignment8_t m_align; - - /* Pool allocator. */ - void *operator new (size_t); }; /* Report whether or not P is an alloca instruction. */ @@ -1055,6 +1032,9 @@ public: /* Append an instruction INSN into the basic block. */ void append_insn (hsa_insn_basic *insn); + /* Add a PHI instruction. */ + void append_phi (hsa_insn_phi *phi); + /* The real CFG BB that this HBB belongs to. */ basic_block m_bb; @@ -1217,7 +1197,7 @@ public: unsigned m_temp_symbol_count; /* SSA names mapping. */ - vec <hsa_op_reg_p> m_ssa_map; + vec <hsa_op_reg *> m_ssa_map; /* Flag whether a function needs update of dominators before RA. */ bool m_modified_cfg; @@ -1239,9 +1219,9 @@ struct hsa_function_summary hsa_function_kind m_kind; /* Pointer to a cgraph node which is a HSA implementation of the function. - In case of the function is a HSA function, the binded function points + In case the function is an HSA function, the bound function points to the host function. */ - cgraph_node *m_binded_function; + cgraph_node *m_bound_function; /* Identifies if the function is an HSA function or a host function. */ bool m_gpu_implementation_p; @@ -1252,7 +1232,7 @@ struct hsa_function_summary inline hsa_function_summary::hsa_function_summary (): m_kind (HSA_NONE), - m_binded_function (NULL), m_gpu_implementation_p (false) + m_bound_function (NULL), m_gpu_implementation_p (false) { } @@ -1270,6 +1250,9 @@ public: void link_functions (cgraph_node *gpu, cgraph_node *host, hsa_function_kind kind, bool gridified_kernel_p); + +private: + void process_gpu_implementation_attributes (tree gdecl); }; /* OMP simple builtin describes behavior that should be done for diff --git a/gcc/ipa-hsa.c b/gcc/ipa-hsa.c index 769657f330a..0fbe2e2062d 100644 --- a/gcc/ipa-hsa.c +++ b/gcc/ipa-hsa.c @@ -79,7 +79,7 @@ process_hsa_functions (void) hsa_function_summary *s = hsa_summaries->get (node); /* A linked function is skipped. 
*/ - if (s->m_binded_function != NULL) + if (s->m_bound_function != NULL) continue; if (s->m_kind != HSA_NONE) @@ -90,6 +90,7 @@ process_hsa_functions (void) = node->create_virtual_clone (vec (), NULL, NULL, "hsa"); TREE_PUBLIC (clone->decl) = TREE_PUBLIC (node->decl); + clone->externally_visible = node->externally_visible; clone->force_output = true; hsa_summaries->link_functions (clone, node, s->m_kind, false); @@ -107,6 +108,7 @@ process_hsa_functions (void) = node->create_virtual_clone (vec (), NULL, NULL, "hsa"); TREE_PUBLIC (clone->decl) = TREE_PUBLIC (node->decl); + clone->externally_visible = node->externally_visible; if (!cgraph_local_p (node)) clone->force_output = true; @@ -131,7 +133,7 @@ process_hsa_functions (void) hsa_function_summary *dst = hsa_summaries->get (e->callee); if (dst->m_kind != HSA_NONE && !dst->m_gpu_implementation_p) { - e->redirect_callee (dst->m_binded_function); + e->redirect_callee (dst->m_bound_function); if (dump_file) fprintf (dump_file, "Redirecting edge to HSA function: %s->%s\n", @@ -193,10 +195,10 @@ ipa_hsa_write_summary (void) bp = bitpack_create (ob->main_stream); bp_pack_value (&bp, s->m_kind, 2); bp_pack_value (&bp, s->m_gpu_implementation_p, 1); - bp_pack_value (&bp, s->m_binded_function != NULL, 1); + bp_pack_value (&bp, s->m_bound_function != NULL, 1); streamer_write_bitpack (&bp); - if (s->m_binded_function) - stream_write_tree (ob, s->m_binded_function->decl, true); + if (s->m_bound_function) + stream_write_tree (ob, s->m_bound_function->decl, true); } } @@ -249,7 +251,7 @@ ipa_hsa_read_section (struct lto_file_decl_data *file_data, const char *data, if (has_tree) { tree decl = stream_read_tree (&ib_main, data_in); - s->m_binded_function = cgraph_node::get_create (decl); + s->m_bound_function = cgraph_node::get_create (decl); } } lto_free_section_data (file_data, LTO_section_ipa_hsa, NULL, data, diff --git a/gcc/omp-low.c b/gcc/omp-low.c index eab0af5b58d..3eff4e7f1e8 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -3349,8 +3349,8 @@ check_omp_nesting_restrictions (gimple *stmt, omp_context *ctx) else if (gimple_code (ctx->stmt) == GIMPLE_OMP_TEAMS) { if ((gimple_code (stmt) != GIMPLE_OMP_FOR - || (gimple_omp_for_kind (stmt) - != GF_OMP_FOR_KIND_DISTRIBUTE)) + || ((gimple_omp_for_kind (stmt) != GF_OMP_FOR_KIND_DISTRIBUTE) + && (gimple_omp_for_kind (stmt) != GF_OMP_FOR_KIND_GRID_LOOP))) && gimple_code (stmt) != GIMPLE_OMP_PARALLEL) { error_at (gimple_location (stmt), @@ -5560,16 +5560,27 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, { gcond *stmt; tree label_true, arm1, arm2; + enum tree_code pred_code = TREE_CODE (predicate); label = create_artificial_label (UNKNOWN_LOCATION); label_true = create_artificial_label (UNKNOWN_LOCATION); - arm1 = TREE_OPERAND (predicate, 0); - arm2 = TREE_OPERAND (predicate, 1); - gimplify_expr (&arm1, stmt_list, NULL, is_gimple_val, fb_rvalue); - gimplify_expr (&arm2, stmt_list, NULL, is_gimple_val, fb_rvalue); + if (TREE_CODE_CLASS (pred_code) == tcc_comparison) + { + arm1 = TREE_OPERAND (predicate, 0); + arm2 = TREE_OPERAND (predicate, 1); + gimplify_expr (&arm1, stmt_list, NULL, is_gimple_val, fb_rvalue); + gimplify_expr (&arm2, stmt_list, NULL, is_gimple_val, fb_rvalue); + } + else + { + arm1 = predicate; + gimplify_expr (&arm1, stmt_list, NULL, is_gimple_val, fb_rvalue); + arm2 = boolean_false_node; + pred_code = NE_EXPR; + } if (maybe_simt) { - c = build2 (TREE_CODE (predicate), boolean_type_node, arm1, arm2); + c = build2 (pred_code, boolean_type_node, arm1, 
arm2); c = fold_convert (integer_type_node, c); simtcond = create_tmp_var (integer_type_node); gimplify_assign (simtcond, c, stmt_list); @@ -5582,8 +5593,7 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list, label_true, label); } else - stmt = gimple_build_cond (TREE_CODE (predicate), arm1, arm2, - label_true, label); + stmt = gimple_build_cond (pred_code, arm1, arm2, label_true, label); gimple_seq_add_stmt (stmt_list, stmt); gimple_seq_add_stmt (stmt_list, gimple_build_label (label_true)); } @@ -13136,7 +13146,6 @@ grid_get_kernel_launch_attributes (gimple_stmt_iterator *gsi, gomp_target *tgt_stmt) { grid_create_kernel_launch_attr_types (); - tree u32_one = build_one_cst (uint32_type_node); tree lattrs = create_tmp_var (grid_attr_trees->kernel_launch_attributes_type, "__kernel_launch_attrs"); @@ -13161,10 +13170,10 @@ grid_get_kernel_launch_attributes (gimple_stmt_iterator *gsi, tree dimref = build3 (COMPONENT_REF, uint32_type_node, lattrs, grid_attr_trees->kernel_lattrs_dimnum_decl, NULL_TREE); - /* At this moment we cannot gridify a loop with a collapse clause. */ - /* TODO: Adjust when we support bigger collapse. */ - gcc_assert (max_dim == 0); - gsi_insert_before (gsi, gimple_build_assign (dimref, u32_one), GSI_SAME_STMT); + gcc_checking_assert (max_dim <= 2); + tree dimensions = build_int_cstu (uint32_type_node, max_dim + 1); + gsi_insert_before (gsi, gimple_build_assign (dimref, dimensions), + GSI_SAME_STMT); TREE_ADDRESSABLE (lattrs) = 1; return build_fold_addr_expr (lattrs); } @@ -13810,59 +13819,79 @@ expand_omp_target (struct omp_region *region) } } -/* Expand KFOR loop as a GPGPU kernel, i.e. as a body only with iteration - variable derived from the thread number. */ +/* Expand KFOR loop as an HSA gridified kernel, i.e. as a body only with + iteration variable derived from the thread number. INTRA_GROUP means this + is an expansion of a loop iterating over work-items within a separate + iteration over groups.
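+ + As an illustration (a hypothetical sketch, not code from this patch), the + expansion conceptually prefixes the body with an assignment of the form + + V = N1 + STEP * workitemabsid (DIM); + + for each collapsed dimension DIM, with the workitemid builtin used instead + in the intra-group case and workgroupid when iterating over groups.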
*/ static void -grid_expand_omp_for_loop (struct omp_region *kfor) +grid_expand_omp_for_loop (struct omp_region *kfor, bool intra_group) { - tree t, threadid; - tree type, itype; gimple_stmt_iterator gsi; - tree n1, step; - struct omp_for_data fd; - gomp_for *for_stmt = as_a (last_stmt (kfor->entry)); gcc_checking_assert (gimple_omp_for_kind (for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP); + size_t collapse = gimple_omp_for_collapse (for_stmt); + struct omp_for_data_loop *loops + = XALLOCAVEC (struct omp_for_data_loop, + gimple_omp_for_collapse (for_stmt)); + struct omp_for_data fd; + + remove_edge (BRANCH_EDGE (kfor->entry)); basic_block body_bb = FALLTHRU_EDGE (kfor->entry)->dest; - gcc_assert (gimple_omp_for_collapse (for_stmt) == 1); gcc_assert (kfor->cont); - extract_omp_for_data (for_stmt, &fd, NULL); - - itype = type = TREE_TYPE (fd.loop.v); - if (POINTER_TYPE_P (type)) - itype = signed_type_for (type); + extract_omp_for_data (for_stmt, &fd, loops); gsi = gsi_start_bb (body_bb); - n1 = fd.loop.n1; - step = fd.loop.step; - n1 = force_gimple_operand_gsi (&gsi, fold_convert (type, n1), - true, NULL_TREE, true, GSI_SAME_STMT); - step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step), - true, NULL_TREE, true, GSI_SAME_STMT); - threadid = build_call_expr (builtin_decl_explicit - (BUILT_IN_OMP_GET_THREAD_NUM), 0); - threadid = fold_convert (itype, threadid); - threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE, - true, GSI_SAME_STMT); + for (size_t dim = 0; dim < collapse; dim++) + { + tree type, itype; + itype = type = TREE_TYPE (fd.loops[dim].v); + if (POINTER_TYPE_P (type)) + itype = signed_type_for (type); - tree startvar = fd.loop.v; - t = fold_build2 (MULT_EXPR, itype, threadid, step); - if (POINTER_TYPE_P (type)) - t = fold_build_pointer_plus (n1, t); - else - t = fold_build2 (PLUS_EXPR, type, t, n1); - t = fold_convert (type, t); - t = force_gimple_operand_gsi (&gsi, t, - DECL_P (startvar) - && TREE_ADDRESSABLE (startvar), - NULL_TREE, true, GSI_SAME_STMT); - gassign *assign_stmt = gimple_build_assign (startvar, t); - gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); + tree n1 = fd.loops[dim].n1; + tree step = fd.loops[dim].step; + n1 = force_gimple_operand_gsi (&gsi, fold_convert (type, n1), + true, NULL_TREE, true, GSI_SAME_STMT); + step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step), + true, NULL_TREE, true, GSI_SAME_STMT); + tree threadid; + if (gimple_omp_for_grid_group_iter (for_stmt)) + { + gcc_checking_assert (!intra_group); + threadid = build_call_expr (builtin_decl_explicit + (BUILT_IN_HSA_WORKGROUPID), 1, + build_int_cstu (unsigned_type_node, dim)); + } + else if (intra_group) + threadid = build_call_expr (builtin_decl_explicit + (BUILT_IN_HSA_WORKITEMID), 1, + build_int_cstu (unsigned_type_node, dim)); + else + threadid = build_call_expr (builtin_decl_explicit + (BUILT_IN_HSA_WORKITEMABSID), 1, + build_int_cstu (unsigned_type_node, dim)); + threadid = fold_convert (itype, threadid); + threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE, + true, GSI_SAME_STMT); + tree startvar = fd.loops[dim].v; + tree t = fold_build2 (MULT_EXPR, itype, threadid, step); + if (POINTER_TYPE_P (type)) + t = fold_build_pointer_plus (n1, t); + else + t = fold_build2 (PLUS_EXPR, type, t, n1); + t = fold_convert (type, t); + t = force_gimple_operand_gsi (&gsi, t, + DECL_P (startvar) + && TREE_ADDRESSABLE (startvar), + NULL_TREE, true, GSI_SAME_STMT); + gassign *assign_stmt = gimple_build_assign (startvar, t); + gsi_insert_before (&gsi, 
assign_stmt, GSI_SAME_STMT); + } /* Remove the omp for statement */ gsi = gsi_last_bb (kfor->entry); gsi_remove (&gsi, true); @@ -13873,10 +13902,12 @@ grid_expand_omp_for_loop (struct omp_region *kfor) && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_CONTINUE); gsi_remove (&gsi, true); - /* Replace the GIMPLE_OMP_RETURN with a real return. */ + /* Replace the GIMPLE_OMP_RETURN with a barrier, if necessary. */ gsi = gsi_last_bb (kfor->exit); gcc_assert (!gsi_end_p (gsi) && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN); + if (intra_group) + gsi_insert_before (&gsi, build_omp_barrier (NULL_TREE), GSI_SAME_STMT); gsi_remove (&gsi, true); /* Fixup the much simpler CFG. */ @@ -13915,7 +13946,7 @@ grid_remap_kernel_arg_accesses (tree *tp, int *walk_subtrees, void *data) static void expand_omp (struct omp_region *region); /* If TARGET region contains a kernel body for loop, remove its region from the - TARGET and expand it in GPGPU kernel fashion. */ + TARGET and expand it in HSA gridified kernel fashion. */ static void grid_expand_target_grid_body (struct omp_region *target) @@ -13957,11 +13988,29 @@ grid_expand_target_grid_body (struct omp_region *target) struct omp_region *kfor = *pp; gcc_assert (kfor); - gcc_assert (gimple_omp_for_kind (last_stmt ((kfor)->entry)) - == GF_OMP_FOR_KIND_GRID_LOOP); + gomp_for *for_stmt = as_a <gomp_for *> (last_stmt (kfor->entry)); + gcc_assert (gimple_omp_for_kind (for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP); *pp = kfor->next; if (kfor->inner) - expand_omp (kfor->inner); + { + if (gimple_omp_for_grid_group_iter (for_stmt)) + { + struct omp_region **next_pp; + for (pp = &kfor->inner; *pp; pp = next_pp) + { + next_pp = &(*pp)->next; + if ((*pp)->type != GIMPLE_OMP_FOR) + continue; + gomp_for *inner = as_a <gomp_for *> (last_stmt ((*pp)->entry)); + gcc_assert (gimple_omp_for_kind (inner) + == GF_OMP_FOR_KIND_GRID_LOOP); + grid_expand_omp_for_loop (*pp, true); + *pp = (*pp)->next; + next_pp = pp; + } + } + expand_omp (kfor->inner); + } if (gpukernel->inner) expand_omp (gpukernel->inner); @@ -13991,8 +14040,7 @@ grid_expand_target_grid_body (struct omp_region *target) struct function *kern_cfun = DECL_STRUCT_FUNCTION (kern_fndecl); kern_cfun->curr_properties = cfun->curr_properties; - remove_edge (BRANCH_EDGE (kfor->entry)); - grid_expand_omp_for_loop (kfor); + grid_expand_omp_for_loop (kfor, false); /* Remove the omp for statement */ gimple_stmt_iterator gsi = gsi_last_bb (gpukernel->entry); @@ -14351,7 +14399,7 @@ const pass_data pass_data_expand_omp = { GIMPLE_PASS, /* type */ "ompexp", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ + OPTGROUP_OPENMP, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_gimple_any, /* properties_required */ PROP_gimple_eomp, /* properties_provided */ @@ -14398,7 +14446,7 @@ const pass_data pass_data_expand_omp_ssa = { GIMPLE_PASS, /* type */ "ompexpssa", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ + OPTGROUP_OPENMP, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_cfg | PROP_ssa, /* properties_required */ PROP_gimple_eomp, /* properties_provided */ @@ -15267,6 +15315,46 @@ lower_omp_critical (gimple_stmt_iterator *gsi_p, omp_context *ctx) BLOCK_VARS (block) = gimple_bind_vars (bind); } +/* Return the lastprivate predicate for a given gridified loop described by FD. + TODO: When grid stuff is moved to a separate file, move this too. */ + +static tree +grid_lastprivate_predicate (struct omp_for_data *fd) +{ + /* When dealing with a gridified loop, we need to check up to three collapsed + iteration variables but they are not actually captured in this fd.
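+ Being the last iteration in a dimension DIM then amounts to a test + roughly like (intra-group case shown, a hypothetical sketch only): + + workitemid (DIM) + 1 == currentworkgroupsize (DIM) + + with workitemabsid and gridsize taking their place in the absolute case.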
+ Fortunately, we can easily rely on HSA builtins to get this + information. */ + + tree id, size; + if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP + && gimple_omp_for_grid_intra_group (fd->for_stmt)) + { + id = builtin_decl_explicit (BUILT_IN_HSA_WORKITEMID); + size = builtin_decl_explicit (BUILT_IN_HSA_CURRENTWORKGROUPSIZE); + } + else + { + id = builtin_decl_explicit (BUILT_IN_HSA_WORKITEMABSID); + size = builtin_decl_explicit (BUILT_IN_HSA_GRIDSIZE); + } + tree cond = NULL; + for (int dim = 0; dim < fd->collapse; dim++) + { + tree dim_tree = build_int_cstu (unsigned_type_node, dim); + tree u1 = build_int_cstu (unsigned_type_node, 1); + tree c2 + = build2 (EQ_EXPR, boolean_type_node, + build2 (PLUS_EXPR, unsigned_type_node, + build_call_expr (id, 1, dim_tree), u1), + build_call_expr (size, 1, dim_tree)); + if (cond) + cond = build2 (TRUTH_AND_EXPR, boolean_type_node, cond, c2); + else + cond = c2; + } + return cond; +} /* A subroutine of lower_omp_for. Generate code to emit the predicate for a lastprivate clause. Given a loop control predicate of (V @@ -15294,58 +15382,65 @@ lower_omp_for_lastprivate (struct omp_for_data *fd, gimple_seq *body_p, cond_code = EQ_EXPR; } - tree n2 = fd->loop.n2; - if (fd->collapse > 1 - && TREE_CODE (n2) != INTEGER_CST - && gimple_omp_for_combined_into_p (fd->for_stmt)) + if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP + || gimple_omp_for_grid_phony (fd->for_stmt)) + cond = grid_lastprivate_predicate (fd); + else { - struct omp_context *taskreg_ctx = NULL; - if (gimple_code (ctx->outer->stmt) == GIMPLE_OMP_FOR) + tree n2 = fd->loop.n2; + if (fd->collapse > 1 + && TREE_CODE (n2) != INTEGER_CST + && gimple_omp_for_combined_into_p (fd->for_stmt)) { - gomp_for *gfor = as_a (ctx->outer->stmt); - if (gimple_omp_for_kind (gfor) == GF_OMP_FOR_KIND_FOR - || gimple_omp_for_kind (gfor) == GF_OMP_FOR_KIND_DISTRIBUTE) + struct omp_context *taskreg_ctx = NULL; + if (gimple_code (ctx->outer->stmt) == GIMPLE_OMP_FOR) { - if (gimple_omp_for_combined_into_p (gfor)) - { - gcc_assert (ctx->outer->outer - && is_parallel_ctx (ctx->outer->outer)); - taskreg_ctx = ctx->outer->outer; - } - else + gomp_for *gfor = as_a (ctx->outer->stmt); + if (gimple_omp_for_kind (gfor) == GF_OMP_FOR_KIND_FOR + || gimple_omp_for_kind (gfor) == GF_OMP_FOR_KIND_DISTRIBUTE) { - struct omp_for_data outer_fd; - extract_omp_for_data (gfor, &outer_fd, NULL); - n2 = fold_convert (TREE_TYPE (n2), outer_fd.loop.n2); + if (gimple_omp_for_combined_into_p (gfor)) + { + gcc_assert (ctx->outer->outer + && is_parallel_ctx (ctx->outer->outer)); + taskreg_ctx = ctx->outer->outer; + } + else + { + struct omp_for_data outer_fd; + extract_omp_for_data (gfor, &outer_fd, NULL); + n2 = fold_convert (TREE_TYPE (n2), outer_fd.loop.n2); + } } + else if (gimple_omp_for_kind (gfor) == GF_OMP_FOR_KIND_TASKLOOP) + taskreg_ctx = ctx->outer->outer; } - else if (gimple_omp_for_kind (gfor) == GF_OMP_FOR_KIND_TASKLOOP) - taskreg_ctx = ctx->outer->outer; - } - else if (is_taskreg_ctx (ctx->outer)) - taskreg_ctx = ctx->outer; - if (taskreg_ctx) - { - int i; - tree innerc - = find_omp_clause (gimple_omp_taskreg_clauses (taskreg_ctx->stmt), - OMP_CLAUSE__LOOPTEMP_); - gcc_assert (innerc); - for (i = 0; i < fd->collapse; i++) + else if (is_taskreg_ctx (ctx->outer)) + taskreg_ctx = ctx->outer; + if (taskreg_ctx) { + int i; + tree taskreg_clauses + = gimple_omp_taskreg_clauses (taskreg_ctx->stmt); + tree innerc = find_omp_clause (taskreg_clauses, + OMP_CLAUSE__LOOPTEMP_); + gcc_assert (innerc); + 
for (i = 0; i < fd->collapse; i++) + { + innerc = find_omp_clause (OMP_CLAUSE_CHAIN (innerc), + OMP_CLAUSE__LOOPTEMP_); + gcc_assert (innerc); + } innerc = find_omp_clause (OMP_CLAUSE_CHAIN (innerc), OMP_CLAUSE__LOOPTEMP_); - gcc_assert (innerc); + if (innerc) + n2 = fold_convert (TREE_TYPE (n2), + lookup_decl (OMP_CLAUSE_DECL (innerc), + taskreg_ctx)); } - innerc = find_omp_clause (OMP_CLAUSE_CHAIN (innerc), - OMP_CLAUSE__LOOPTEMP_); - if (innerc) - n2 = fold_convert (TREE_TYPE (n2), - lookup_decl (OMP_CLAUSE_DECL (innerc), - taskreg_ctx)); } + cond = build2 (cond_code, boolean_type_node, fd->loop.v, n2); } - cond = build2 (cond_code, boolean_type_node, fd->loop.v, n2); clauses = gimple_omp_for_clauses (fd->for_stmt); stmts = NULL; @@ -15514,11 +15609,13 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) ctx); } - if (!gimple_omp_for_grid_phony (stmt)) + bool phony_loop = (gimple_omp_for_kind (stmt) != GF_OMP_FOR_KIND_GRID_LOOP + && gimple_omp_for_grid_phony (stmt)); + if (!phony_loop) gimple_seq_add_stmt (&body, stmt); gimple_seq_add_seq (&body, gimple_omp_body (stmt)); - if (!gimple_omp_for_grid_phony (stmt)) + if (!phony_loop) gimple_seq_add_stmt (&body, gimple_build_omp_continue (fd.loop.v, fd.loop.v)); @@ -15532,7 +15629,7 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) body = maybe_catch_exception (body); - if (!gimple_omp_for_grid_phony (stmt)) + if (!phony_loop) { /* Region exit marker goes at the end of the loop body. */ gimple_seq_add_stmt (&body, gimple_build_omp_return (fd.have_nowait)); @@ -17516,60 +17613,90 @@ lower_omp (gimple_seq *body, omp_context *ctx) input_location = saved_location; } -/* Return true if STMT is an assignment of a register-type into a local - VAR_DECL. */ +/* Structure describing the basic properties of the loop we are analyzing, + used both when checking whether it can be gridified and when it is + gridified. */ + +struct grid_prop +{ + /* True when we are doing tiling gridification, i.e. when there is a distinct + distribute loop over groups and a loop construct over work-items. False + when distribute and parallel for loops form a combined construct. */ + bool tiling; + /* Location of the target construct for optimization information + messages. */ + location_t target_loc; + /* The collapse clause of the involved loops. Collapse value of all of them + must be the same for gridification to take place. */ + size_t collapse; + /* Group sizes, if requested by the user or NULL if not requested. */ + tree group_sizes[3]; +}; + +#define GRID_MISSED_MSG_PREFIX "Will not turn target construct into a " \ + "gridified HSA kernel because " + +/* Return true if STMT is an assignment of a register-type into a local + VAR_DECL. If GRID is non-NULL, the assignment additionally must not be to + any of the trees specifying group sizes there. */ static bool -grid_reg_assignment_to_local_var_p (gimple *stmt) +grid_safe_assignment_p (gimple *stmt, grid_prop *grid) { gassign *assign = dyn_cast <gassign *> (stmt); if (!assign) return false; + if (gimple_clobber_p (assign)) + return true; tree lhs = gimple_assign_lhs (assign); if (!VAR_P (lhs) || !is_gimple_reg_type (TREE_TYPE (lhs)) || is_global_var (lhs)) return false; + if (grid) + for (unsigned i = 0; i < grid->collapse; i++) + if (lhs == grid->group_sizes[i]) + return false; return true; } /* Return true if all statements in SEQ are assignments to local register-type - variables. */ + variables that do not hold group size information.
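+ For example (illustrative only), a sequence such as + + tmp = n; i0 = 0; + + qualifies, while a store through a pointer, an OpenMP directive or an + assignment to one of GRID->group_sizes does not.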
*/ static bool -grid_seq_only_contains_local_assignments (gimple_seq seq) +grid_seq_only_contains_local_assignments (gimple_seq seq, grid_prop *grid) { if (!seq) return true; gimple_stmt_iterator gsi; for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi)) - if (!grid_reg_assignment_to_local_var_p (gsi_stmt (gsi))) + if (!grid_safe_assignment_p (gsi_stmt (gsi), grid)) return false; return true; } -/* Scan statements in SEQ and call itself recursively on any bind. If during - whole search only assignments to register-type local variables and one - single OMP statement is encountered, return true, otherwise return false. - RET is where we store any OMP statement encountered. TARGET_LOC and NAME - are used for dumping a note about a failure. */ +/* Scan statements in SEQ and call itself recursively on any bind. GRID + describes hitherto discovered properties of the loop that is evaluated for + possible gridification. If during whole search only assignments to + register-type local variables (that do not overwrite group size information) + and one single OMP statement is encountered, return true, otherwise return + false. RET is where we store any OMP statement encountered. */ static bool -grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc, - const char *name, gimple **ret) +grid_find_single_omp_among_assignments_1 (gimple_seq seq, grid_prop *grid, + const char *name, gimple **ret) { gimple_stmt_iterator gsi; for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi)) { gimple *stmt = gsi_stmt (gsi); - if (grid_reg_assignment_to_local_var_p (stmt)) + if (grid_safe_assignment_p (stmt, grid)) continue; if (gbind *bind = dyn_cast (stmt)) { if (!grid_find_single_omp_among_assignments_1 (gimple_bind_body (bind), - target_loc, name, ret)) + grid, name, ret)) return false; } else if (is_gimple_omp (stmt)) @@ -17577,10 +17704,18 @@ grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc, if (*ret) { if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, target_loc, - "Will not turn target construct into a simple " - "GPGPU kernel because %s construct contains " - "multiple OpenMP constructs\n", name); + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "%s construct " + "contains multiple OpenMP constructs\n", + name); + dump_printf_loc (MSG_NOTE, gimple_location (*ret), + "The first OpenMP construct within " + "a parallel\n"); + dump_printf_loc (MSG_NOTE, gimple_location (stmt), + "The second OpenMP construct within " + "a parallel\n"); + } return false; } *ret = stmt; @@ -17588,10 +17723,14 @@ grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc, else { if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, target_loc, - "Will not turn target construct into a simple " - "GPGPU kernel because %s construct contains " - "a complex statement\n", name); + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "%s construct contains " + "a complex statement\n", name); + dump_printf_loc (MSG_NOTE, gimple_location (stmt), + "This statement cannot be analyzed for " + "gridification\n"); + } return false; } } @@ -17599,33 +17738,32 @@ grid_find_single_omp_among_assignments_1 (gimple_seq seq, location_t target_loc, } /* Scan statements in SEQ and make sure that it and any binds in it contain - only assignments to local register-type variables and one OMP construct. If - so, return that construct, otherwise return NULL. 
If dumping is enabled and - function fails, use TARGET_LOC and NAME to dump a note with the reason for - failure. */ + only assignments to local register-type variables (that do not overwrite + group size information) and one OMP construct. If so, return that + construct, otherwise return NULL. GRID describes hitherto discovered + properties of the loop that is evaluated for possible gridification. If + dumping is enabled and function fails, use NAME to dump a note with the + reason for failure. */ static gimple * -grid_find_single_omp_among_assignments (gimple_seq seq, location_t target_loc, +grid_find_single_omp_among_assignments (gimple_seq seq, grid_prop *grid, const char *name) { if (!seq) { if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, target_loc, - "Will not turn target construct into a simple " - "GPGPU kernel because %s construct has empty " - "body\n", + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "%s construct has empty body\n", name); return NULL; } gimple *ret = NULL; - if (grid_find_single_omp_among_assignments_1 (seq, target_loc, name, &ret)) + if (grid_find_single_omp_among_assignments_1 (seq, grid, name, &ret)) { if (!ret && dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, target_loc, - "Will not turn target construct into a simple " - "GPGPU kernel because %s construct does not contain" + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "%s construct does not contain " "any other OpenMP construct\n", name); return ret; } @@ -17668,218 +17806,81 @@ grid_find_ungridifiable_statement (gimple_stmt_iterator *gsi, *handled_ops_p = true; wi->info = stmt; return error_mark_node; - - case GIMPLE_OMP_FOR: - if ((gimple_omp_for_kind (stmt) & GF_OMP_FOR_SIMD) - && gimple_omp_for_combined_into_p (stmt)) - { - *handled_ops_p = true; - wi->info = stmt; - return error_mark_node; - } - break; - default: break; } return NULL; } -/* If TARGET follows a pattern that can be turned into a gridified GPGPU - kernel, return true, otherwise return false. In the case of success, also - fill in GROUP_SIZE_P with the requested group size or NULL if there is - none. */ +/* Examine clauses of omp parallel statement PAR and if any prevents + gridification, issue a missed-optimization diagnostic and return false, + otherwise return true. GRID describes hitherto discovered properties of the + loop that is evaluated for possible gridification.
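+ + For instance (a hypothetical example), encountering + + #pragma omp parallel num_threads (4) + + in the target region makes this function fail with a missed-optimization + note, because the number of threads of a gridified kernel is determined by + the grid geometry instead.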
*/ static bool -grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p) +grid_parallel_clauses_gridifiable (gomp_parallel *par, location_t tloc) { - if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION) - return false; - - location_t tloc = gimple_location (target); - gimple *stmt - = grid_find_single_omp_among_assignments (gimple_omp_body (target), - tloc, "target"); - if (!stmt) - return false; - gomp_teams *teams = dyn_cast <gomp_teams *> (stmt); - tree group_size = NULL; - if (!teams) - { - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a simple " - "GPGPU kernel because it does not have a sole teams " - "construct in it.\n"); - return false; - } - - tree clauses = gimple_omp_teams_clauses (teams); + tree clauses = gimple_omp_parallel_clauses (par); while (clauses) { switch (OMP_CLAUSE_CODE (clauses)) { - case OMP_CLAUSE_NUM_TEAMS: + case OMP_CLAUSE_NUM_THREADS: if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because we cannot " - "handle num_teams clause of teams " - "construct\n "); + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "there is " + "a num_threads clause of the parallel " + "construct\n"); + dump_printf_loc (MSG_NOTE, gimple_location (par), + "Parallel construct has a num_threads clause\n"); + } return false; case OMP_CLAUSE_REDUCTION: if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because a reduction " - "clause is present\n "); - return false; - - case OMP_CLAUSE_LASTPRIVATE: - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because a lastprivate " - "clause is present\n "); + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "a reduction clause " + "is present\n"); + dump_printf_loc (MSG_NOTE, gimple_location (par), + "Parallel construct has a reduction clause\n"); + } return false; - case OMP_CLAUSE_THREAD_LIMIT: - group_size = OMP_CLAUSE_OPERAND (clauses, 0); - break; - default: break; } clauses = OMP_CLAUSE_CHAIN (clauses); } + return true; +} - stmt = grid_find_single_omp_among_assignments (gimple_omp_body (teams), tloc, - "teams"); - if (!stmt) - return false; - gomp_for *dist = dyn_cast <gomp_for *> (stmt); - if (!dist) - { - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a simple " - "GPGPU kernel because the teams construct does not have " - "a sole distribute construct in it.\n"); - return false; - } +/* Examine clauses and the body of omp loop statement GFOR and if something + prevents gridification, issue a missed-optimization diagnostic and return + false, otherwise return true. GRID describes hitherto discovered properties + of the loop that is evaluated for possible gridification.
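+ + E.g. (an illustrative sketch), a loop of the form + + #pragma omp for schedule (dynamic) reduction (+:s) + + is refused on both counts, whereas schedule (auto) or no schedule clause + at all is acceptable.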
*/ - gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE); - if (!gimple_omp_for_combined_p (dist)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified GPGPU " - "kernel because we cannot handle a standalone " - "distribute construct\n "); - return false; - } - if (dist->collapse > 1) +static bool +grid_inner_loop_gridifiable_p (gomp_for *gfor, grid_prop *grid) +{ + if (!grid_seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor), + grid)) { if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified GPGPU " - "kernel because the distribute construct contains " - "collapse clause\n"); - return false; - } - struct omp_for_data fd; - extract_omp_for_data (dist, &fd, NULL); - if (fd.chunk_size) - { - if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0)) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because the teams " - "thread limit is different from distribute " - "schedule chunk\n"); - return false; - } - group_size = fd.chunk_size; - } - stmt = grid_find_single_omp_among_assignments (gimple_omp_body (dist), tloc, - "distribute"); - gomp_parallel *par; - if (!stmt || !(par = dyn_cast <gomp_parallel *> (stmt))) - return false; - - clauses = gimple_omp_parallel_clauses (par); - while (clauses) - { - switch (OMP_CLAUSE_CODE (clauses)) - { - case OMP_CLAUSE_NUM_THREADS: - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified" - "GPGPU kernel because there is a num_threads " - "clause of the parallel construct\n"); - return false; - - case OMP_CLAUSE_REDUCTION: - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because a reduction " - "clause is present\n "); - return false; - - case OMP_CLAUSE_LASTPRIVATE: - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because a lastprivate " - "clause is present\n "); - return false; - - default: - break; + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "the inner loop " + "bounds computation contains a complex " + "statement\n"); + dump_printf_loc (MSG_NOTE, gimple_location (gfor), + "Loop construct cannot be analyzed for " + "gridification\n"); } - clauses = OMP_CLAUSE_CHAIN (clauses); - } - - stmt = grid_find_single_omp_among_assignments (gimple_omp_body (par), tloc, - "parallel"); - gomp_for *gfor; - if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt))) - return false; - - if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified GPGPU " - "kernel because the inner loop is not a simple for " - "loop\n"); - return false; - } - if (gfor->collapse > 1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified GPGPU " - "kernel because the inner loop contains collapse " - "clause\n"); - return false; - } - - if (!grid_seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified GPGPU " - "kernel because the inner loop pre_body contains" - "a complex instruction\n"); return false; } - clauses =
gimple_omp_for_clauses (gfor); + tree clauses = gimple_omp_for_clauses (gfor); while (clauses) { switch (OMP_CLAUSE_CODE (clauses)) @@ -17888,28 +17889,28 @@ grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p if (OMP_CLAUSE_SCHEDULE_KIND (clauses) != OMP_CLAUSE_SCHEDULE_AUTO) { if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because the inner " - "loop has a non-automatic scheduling clause\n"); + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "the inner loop " + "has a non-automatic schedule clause\n"); + dump_printf_loc (MSG_NOTE, gimple_location (gfor), + "Loop construct has a non-automatic " + "schedule clause\n"); + } return false; } break; case OMP_CLAUSE_REDUCTION: if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because a reduction " - "clause is present\n "); - return false; - - case OMP_CLAUSE_LASTPRIVATE: - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a " - "gridified GPGPU kernel because a lastprivate " - "clause is present\n "); + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "a reduction " + "clause is present\n"); + dump_printf_loc (MSG_NOTE, gimple_location (gfor), + "Loop construct has a reduction clause\n"); + } return false; default: @@ -17917,7 +17918,6 @@ grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p } clauses = OMP_CLAUSE_CHAIN (clauses); } - struct walk_stmt_info wi; memset (&wi, 0, sizeof (wi)); if (walk_gimple_seq (gimple_omp_body (gfor), @@ -17928,62 +17928,560 @@ grid_target_follows_gridifiable_pattern (gomp_target *target, tree *group_size_p if (dump_enabled_p ()) { if (is_gimple_call (bad)) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified " - " GPGPU kernel because the inner loop contains " - "call to a noreturn function\n"); - if (gimple_code (bad) == GIMPLE_OMP_FOR) - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified " - " GPGPU kernel because the inner loop contains " - "a simd construct\n"); + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "the inner loop contains " + "call to a noreturn function\n"); else - dump_printf_loc (MSG_NOTE, tloc, - "Will not turn target construct into a gridified " - "GPGPU kernel because the inner loop contains " + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "the inner loop contains " "statement %s which cannot be transformed\n", gimple_code_name[(int) gimple_code (bad)]); + dump_printf_loc (MSG_NOTE, gimple_location (bad), + "This statement cannot be analyzed for " + "gridification\n"); } return false; } - - *group_size_p = group_size; return true; } -/* Operand walker, used to remap pre-body declarations according to a hash map - provided in DATA. */ +/* Given a distribute omp construct represented by DIST, which in the original + source forms a compound construct with a looping construct, return true if it + can be turned into a gridified HSA kernel. Otherwise return false. GRID + describes hitherto discovered properties of the loop that is evaluated for + possible gridification.
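+ + The combined shape accepted here corresponds to source roughly like the + following (a hypothetical example): + + #pragma omp target teams + #pragma omp distribute parallel for + for (i = 0; i < n; i++) + a[i] = b[i] + 1;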
*/ -static tree -grid_remap_prebody_decls (tree *tp, int *walk_subtrees, void *data) +static bool +grid_dist_follows_simple_pattern (gomp_for *dist, grid_prop *grid) { - tree t = *tp; + location_t tloc = grid->target_loc; + gimple *stmt = grid_find_single_omp_among_assignments (gimple_omp_body (dist), + grid, "distribute"); + gomp_parallel *par; + if (!stmt + || !(par = dyn_cast <gomp_parallel *> (stmt)) + || !grid_parallel_clauses_gridifiable (par, tloc)) + return false; - if (DECL_P (t) || TYPE_P (t)) - *walk_subtrees = 0; - else - *walk_subtrees = 1; + stmt = grid_find_single_omp_among_assignments (gimple_omp_body (par), grid, + "parallel"); + gomp_for *gfor; + if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt))) + return false; - if (VAR_P (t)) + if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR) { - struct walk_stmt_info *wi = (struct walk_stmt_info *) data; - hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info; - tree *repl = declmap->get (t); - if (repl) - *tp = *repl; + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "the inner loop is not " + "a simple for loop\n"); + return false; } - return NULL_TREE; + gcc_assert (gimple_omp_for_collapse (gfor) == grid->collapse); + + if (!grid_inner_loop_gridifiable_p (gfor, grid)) + return false; + + return true; } -/* Copy leading register-type assignments to local variables in SRC to just - before DST, Creating temporaries, adjusting mapping of operands in WI and - remapping operands as necessary. Add any new temporaries to TGT_BIND. - Return the first statement that does not conform to - grid_reg_assignment_to_local_var_p or NULL. */ +/* Given an omp loop statement GFOR, return true if it can participate in + tiling gridification, i.e. in one where the distribute and parallel for + loops do not form a compound statement. GRID describes hitherto discovered + properties of the loop that is evaluated for possible gridification. */ -static gimple * +static bool +grid_gfor_follows_tiling_pattern (gomp_for *gfor, grid_prop *grid) +{ + if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "an inner loop is not " + "a simple for loop\n"); + dump_printf_loc (MSG_NOTE, gimple_location (gfor), + "This statement is not a simple for loop\n"); + } + return false; + } + + if (!grid_inner_loop_gridifiable_p (gfor, grid)) + return false; + + if (gimple_omp_for_collapse (gfor) != grid->collapse) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "an inner loop does not " + "use the same collapse clause\n"); + dump_printf_loc (MSG_NOTE, gimple_location (gfor), + "Loop construct uses a different collapse clause\n"); + } + return false; + } + + struct omp_for_data fd; + struct omp_for_data_loop *loops + = (struct omp_for_data_loop *)alloca (grid->collapse + * sizeof (struct omp_for_data_loop)); + extract_omp_for_data (gfor, &fd, loops); + for (unsigned i = 0; i < grid->collapse; i++) + { + tree itype, type = TREE_TYPE (fd.loops[i].v); + if (POINTER_TYPE_P (type)) + itype = signed_type_for (type); + else + itype = type; + + tree n1 = fold_convert (itype, fd.loops[i].n1); + tree n2 = fold_convert (itype, fd.loops[i].n2); + tree t = build_int_cst (itype, + (fd.loops[i].cond_code == LT_EXPR ?
-1 : 1)); + t = fold_build2 (PLUS_EXPR, itype, fd.loops[i].step, t); + t = fold_build2 (PLUS_EXPR, itype, t, n2); + t = fold_build2 (MINUS_EXPR, itype, t, n1); + if (TYPE_UNSIGNED (itype) && fd.loops[i].cond_code == GT_EXPR) + t = fold_build2 (TRUNC_DIV_EXPR, itype, + fold_build1 (NEGATE_EXPR, itype, t), + fold_build1 (NEGATE_EXPR, itype, fd.loops[i].step)); + else + t = fold_build2 (TRUNC_DIV_EXPR, itype, t, fd.loops[i].step); + + if (!operand_equal_p (grid->group_sizes[i], t, 0)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "the distribute and " + "an internal loop do not agree on tile size\n"); + dump_printf_loc (MSG_NOTE, gimple_location (gfor), + "Loop construct does not seem to loop over " + "a tile size\n"); + } + return false; + } + } + return true; +} + +/* Facing a call to FNDECL in the body of a distribute construct, return true + if we can handle it or false if it precludes gridification. */ + +static bool +grid_call_permissible_in_distribute_p (tree fndecl) +{ + if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl)) + return true; + + const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl)); + if (strstr (name, "omp_") != name) + return false; + + if ((strcmp (name, "omp_get_thread_num") == 0) + || (strcmp (name, "omp_get_num_threads") == 0) + || (strcmp (name, "omp_get_num_teams") == 0) + || (strcmp (name, "omp_get_team_num") == 0) + || (strcmp (name, "omp_get_level") == 0) + || (strcmp (name, "omp_get_active_level") == 0) + || (strcmp (name, "omp_in_parallel") == 0)) + return true; + + return false; +} + +/* Facing a call satisfying grid_call_permissible_in_distribute_p in the body + of a distribute construct that is pointed at by GSI, modify it as necessary + for gridification. If the statement itself got removed, return true. */ + +static bool +grid_handle_call_in_distribute (gimple_stmt_iterator *gsi) +{ + gimple *stmt = gsi_stmt (*gsi); + gcc_checking_assert (stmt); + tree fndecl = gimple_call_fndecl (stmt); + if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl)) + return false; + + const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl)); + if ((strcmp (name, "omp_get_thread_num") == 0) + || (strcmp (name, "omp_get_level") == 0) + || (strcmp (name, "omp_get_active_level") == 0) + || (strcmp (name, "omp_in_parallel") == 0)) + { + tree lhs = gimple_call_lhs (stmt); + if (lhs) + { + gassign *assign + = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs))); + gsi_insert_before (gsi, assign, GSI_SAME_STMT); + } + gsi_remove (gsi, true); + return true; + } + + /* The rest of the omp functions can stay as they are, HSA back-end will + handle them correctly. */ + gcc_checking_assert ((strcmp (name, "omp_get_num_threads") == 0) + || (strcmp (name, "omp_get_num_teams") == 0) + || (strcmp (name, "omp_get_team_num") == 0)); + return false; +} + +/* Given a sequence of statements within a distribute omp construct or a + parallel construct, which in the original source does not form a compound + construct with a looping construct, return true if it does not prevent us + from turning it into a gridified HSA kernel. Otherwise return false. GRID + describes hitherto discovered properties of the loop that is evaluated for + possible gridification. IN_PARALLEL must be true if SEQ is within a + parallel construct and false if it is only within a distribute + construct.
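+ + The tiled shape being recognized corresponds to source roughly like + (a hypothetical example): + + #pragma omp target teams + #pragma omp distribute + for (i = 0; i < n; i += 64) + #pragma omp parallel for + for (j = i; j < i + 64; j++) + a[j] = f (j); + + where each inner loop iterates exactly over one tile of the distribute + step, which then becomes the work-group size.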
*/ + +static bool +grid_dist_follows_tiling_pattern (gimple_seq seq, grid_prop *grid, + bool in_parallel) +{ + gimple_stmt_iterator gsi; + for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (grid_safe_assignment_p (stmt, grid) + || gimple_code (stmt) == GIMPLE_GOTO + || gimple_code (stmt) == GIMPLE_LABEL + || gimple_code (stmt) == GIMPLE_COND) + continue; + else if (gbind *bind = dyn_cast <gbind *> (stmt)) + { + if (!grid_dist_follows_tiling_pattern (gimple_bind_body (bind), + grid, in_parallel)) + return false; + continue; + } + else if (gtry *try_stmt = dyn_cast <gtry *> (stmt)) + { + if (gimple_try_kind (try_stmt) == GIMPLE_TRY_CATCH) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "the distribute " + "construct contains a try..catch region\n"); + dump_printf_loc (MSG_NOTE, gimple_location (try_stmt), + "This statement cannot be analyzed for " + "tiled gridification\n"); + } + return false; + } + if (!grid_dist_follows_tiling_pattern (gimple_try_eval (try_stmt), + grid, in_parallel)) + return false; + if (!grid_dist_follows_tiling_pattern (gimple_try_cleanup (try_stmt), + grid, in_parallel)) + return false; + continue; + } + else if (is_gimple_call (stmt)) + { + tree fndecl = gimple_call_fndecl (stmt); + if (fndecl && grid_call_permissible_in_distribute_p (fndecl)) + continue; + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "the distribute " + "construct contains a call\n"); + dump_printf_loc (MSG_NOTE, gimple_location (stmt), + "This statement cannot be analyzed for " + "tiled gridification\n"); + } + return false; + } + else if (gomp_parallel *par = dyn_cast <gomp_parallel *> (stmt)) + { + if (in_parallel) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "a parallel " + "construct contains another parallel " + "construct\n"); + dump_printf_loc (MSG_NOTE, gimple_location (stmt), + "This parallel construct is nested in " + "another one\n"); + } + return false; + } + if (!grid_parallel_clauses_gridifiable (par, grid->target_loc) + || !grid_dist_follows_tiling_pattern (gimple_omp_body (par), + grid, true)) + return false; + } + else if (gomp_for *gfor = dyn_cast <gomp_for *> (stmt)) + { + if (!in_parallel) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "a loop " + "construct is not nested within a parallel " + "construct\n"); + dump_printf_loc (MSG_NOTE, gimple_location (stmt), + "This loop construct is not nested in " + "a parallel construct\n"); + } + return false; + } + if (!grid_gfor_follows_tiling_pattern (gfor, grid)) + return false; + } + else + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc, + GRID_MISSED_MSG_PREFIX "the distribute " + "construct contains a complex statement\n"); + dump_printf_loc (MSG_NOTE, gimple_location (stmt), + "This statement cannot be analyzed for " + "tiled gridification\n"); + } + return false; + } + } + return true; +} + +/* If TARGET follows a pattern that can be turned into a gridified HSA kernel, + return true, otherwise return false. In the case of success, also fill in + GRID with information describing the kernel grid.
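+ + For example (hypothetical), for a combined construct with a collapse (2) + clause this sets GRID->tiling to false and GRID->collapse to 2, while for + the tiled shape it sets GRID->tiling to true and records the distribute + steps in GRID->group_sizes.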
*/ + +static bool +grid_target_follows_gridifiable_pattern (gomp_target *target, grid_prop *grid) +{ + if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION) + return false; + + location_t tloc = gimple_location (target); + grid->target_loc = tloc; + gimple *stmt + = grid_find_single_omp_among_assignments (gimple_omp_body (target), + grid, "target"); + if (!stmt) + return false; + gomp_teams *teams = dyn_cast <gomp_teams *> (stmt); + tree group_size = NULL; + if (!teams) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "it does not have a sole teams " + "construct in it.\n"); + return false; + } + + tree clauses = gimple_omp_teams_clauses (teams); + while (clauses) + { + switch (OMP_CLAUSE_CODE (clauses)) + { + case OMP_CLAUSE_NUM_TEAMS: + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "the teams construct " + "contains a num_teams clause\n"); + return false; + + case OMP_CLAUSE_REDUCTION: + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "a reduction " + "clause is present\n"); + return false; + + case OMP_CLAUSE_THREAD_LIMIT: + if (!integer_zerop (OMP_CLAUSE_OPERAND (clauses, 0))) + group_size = OMP_CLAUSE_OPERAND (clauses, 0); + break; + + default: + break; + } + clauses = OMP_CLAUSE_CHAIN (clauses); + } + + stmt = grid_find_single_omp_among_assignments (gimple_omp_body (teams), grid, + "teams"); + if (!stmt) + return false; + gomp_for *dist = dyn_cast <gomp_for *> (stmt); + if (!dist) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "the teams construct does not " + "have a single distribute construct in it.\n"); + return false; + } + + gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE); + + grid->collapse = gimple_omp_for_collapse (dist); + if (grid->collapse > 3) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "the distribute construct " + "contains collapse clause with parameter greater " + "than 3\n"); + return false; + } + + struct omp_for_data fd; + struct omp_for_data_loop *dist_loops + = (struct omp_for_data_loop *)alloca (grid->collapse + * sizeof (struct omp_for_data_loop)); + extract_omp_for_data (dist, &fd, dist_loops); + if (fd.chunk_size) + { + if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "the teams " + "thread limit is different from distribute " + "schedule chunk\n"); + return false; + } + group_size = fd.chunk_size; + } + if (group_size && grid->collapse > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "group size cannot be " + "set using thread_limit or schedule clauses " + "when also using a collapse clause greater than 1\n"); + return false; + } + + if (gimple_omp_for_combined_p (dist)) + { + grid->tiling = false; + grid->group_sizes[0] = group_size; + for (unsigned i = 1; i < grid->collapse; i++) + grid->group_sizes[i] = NULL; + return grid_dist_follows_simple_pattern (dist, grid); + } + else + { + grid->tiling = true; + if (group_size) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc, + GRID_MISSED_MSG_PREFIX "group size cannot be set " + "using thread_limit or schedule clauses when " + "distribute and loop constructs do not form " + "one combined construct\n"); + return false; + } + for (unsigned i = 0; i < grid->collapse; i++) + { +
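/* The group size in each dimension is the distribute step, negated + below for downward-counting loops so that it is always positive. */ +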
if (fd.loops[i].cond_code == GT_EXPR) + grid->group_sizes[i] = fold_build1 (NEGATE_EXPR, + TREE_TYPE (fd.loops[i].step), + fd.loops[i].step); + else + grid->group_sizes[i] = fd.loops[i].step; + } + return grid_dist_follows_tiling_pattern (gimple_omp_body (dist), grid, + false); + } +} + +/* Operand walker, used to remap pre-body declarations according to a hash map + provided in DATA. */ + +static tree +grid_remap_prebody_decls (tree *tp, int *walk_subtrees, void *data) +{ + tree t = *tp; + + if (DECL_P (t) || TYPE_P (t)) + *walk_subtrees = 0; + else + *walk_subtrees = 1; + + if (VAR_P (t)) + { + struct walk_stmt_info *wi = (struct walk_stmt_info *) data; + hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info; + tree *repl = declmap->get (t); + if (repl) + *tp = *repl; + } + return NULL_TREE; +} + +/* Identifiers of segments into which a particular variable should be placed + when gridifying. */ + +enum grid_var_segment {GRID_SEGMENT_PRIVATE, GRID_SEGMENT_GROUP, + GRID_SEGMENT_GLOBAL}; + +/* Mark VAR so that it is eventually placed into SEGMENT. */ + +static void +grid_mark_variable_segment (tree var, enum grid_var_segment segment) +{ + /* Making variables non-addressable would require that we re-gimplify all + their uses. Fortunately, we do not have to do this because if they are + not addressable, it means they are not used in atomic or parallel + statements and so relaxed GPU consistency rules mean we can just keep them + private. */ + if (!TREE_ADDRESSABLE (var)) + return; + + switch (segment) + { + case GRID_SEGMENT_GROUP: + DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_group_segment"), + NULL, DECL_ATTRIBUTES (var)); + break; + case GRID_SEGMENT_GLOBAL: + DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_global_segment"), + NULL, DECL_ATTRIBUTES (var)); + break; + default: + gcc_unreachable (); + } + + if (!TREE_STATIC (var)) + { + TREE_STATIC (var) = 1; + varpool_node::finalize_decl (var); + } +} + +/* Copy leading register-type assignments to local variables in SRC to just + before DST, creating temporaries, adjusting mapping of operands in WI and + remapping operands as necessary. Add any new temporaries to TGT_BIND. + Return the first statement that does not conform to grid_safe_assignment_p + or NULL. If VAR_SEGMENT is not GRID_SEGMENT_PRIVATE, also mark all + variables in traversed bind statements so that they are put into the + appropriate segment.
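+ + E.g. a group-segment variable receives an "hsa_group_segment" attribute + and is made static, roughly as if it had been declared with + __attribute__ ((hsa_group_segment)) (an illustrative analogy only).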
*/ + +static gimple * grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst, - gbind *tgt_bind, struct walk_stmt_info *wi) + gbind *tgt_bind, + enum grid_var_segment var_segment, + struct walk_stmt_info *wi) { hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info; gimple_stmt_iterator gsi; @@ -17993,13 +18491,17 @@ grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst, if (gbind *bind = dyn_cast <gbind *> (stmt)) { gimple *r = grid_copy_leading_local_assignments - (gimple_bind_body (bind), dst, tgt_bind, wi); + (gimple_bind_body (bind), dst, tgt_bind, var_segment, wi); + + if (var_segment != GRID_SEGMENT_PRIVATE) + for (tree var = gimple_bind_vars (bind); var; var = DECL_CHAIN (var)) + grid_mark_variable_segment (var, var_segment); if (r) return r; else continue; } - if (!grid_reg_assignment_to_local_var_p (stmt)) + if (!grid_safe_assignment_p (stmt, NULL)) return stmt; tree lhs = gimple_assign_lhs (as_a <gassign *> (stmt)); tree repl = copy_var_decl (lhs, create_tmp_var_name (NULL), @@ -18015,43 +18517,262 @@ grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst, return NULL; } +/* Statement walker function to make adjustments to statements within the + gridified kernel copy. */ + +static tree +grid_process_grid_body (gimple_stmt_iterator *gsi, bool *handled_ops_p, + struct walk_stmt_info *) +{ + *handled_ops_p = false; + gimple *stmt = gsi_stmt (*gsi); + if (gimple_code (stmt) == GIMPLE_OMP_FOR + && (gimple_omp_for_kind (stmt) & GF_OMP_FOR_SIMD)) + { + gomp_for *loop = as_a <gomp_for *> (stmt); + tree clauses = gimple_omp_for_clauses (loop); + tree cl = find_omp_clause (clauses, OMP_CLAUSE_SAFELEN); + if (cl) + OMP_CLAUSE_SAFELEN_EXPR (cl) = integer_one_node; + else + { + tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_SAFELEN); + OMP_CLAUSE_SAFELEN_EXPR (c) = integer_one_node; + OMP_CLAUSE_CHAIN (c) = clauses; + gimple_omp_for_set_clauses (loop, c); + } + } + return NULL_TREE; +} + +/* Given a PARLOOP that is a normal for looping construct but also a part of a + combined construct with a simd loop, eliminate the simd loop. */ + +static void +grid_eliminate_combined_simd_part (gomp_for *parloop) +{ + struct walk_stmt_info wi; + + memset (&wi, 0, sizeof (wi)); + wi.val_only = true; + enum gf_mask msk = GF_OMP_FOR_SIMD; + wi.info = (void *) &msk; + walk_gimple_seq (gimple_omp_body (parloop), find_combined_for, NULL, &wi); + gimple *stmt = (gimple *) wi.info; + /* We expect that the SIMD is the only statement in the parallel loop. */ + gcc_assert (stmt + && gimple_code (stmt) == GIMPLE_OMP_FOR + && (gimple_omp_for_kind (stmt) == GF_OMP_FOR_SIMD) + && gimple_omp_for_combined_into_p (stmt) + && !gimple_omp_for_combined_p (stmt)); + gomp_for *simd = as_a <gomp_for *> (stmt); + + /* Copy over the iteration properties because the body refers to the index in + the bottom-most loop.
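+ In other words (an illustrative sketch), a combined + + #pragma omp parallel for simd + for (i = 0; i < n; i++) ... + + keeps only the parallel for layer, which takes over the simd's index, + bounds and increment.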
*/ unsigned i, collapse = gimple_omp_for_collapse (parloop); + gcc_checking_assert (collapse == gimple_omp_for_collapse (simd)); + for (i = 0; i < collapse; i++) + { + gimple_omp_for_set_index (parloop, i, gimple_omp_for_index (simd, i)); + gimple_omp_for_set_initial (parloop, i, gimple_omp_for_initial (simd, i)); + gimple_omp_for_set_final (parloop, i, gimple_omp_for_final (simd, i)); + gimple_omp_for_set_incr (parloop, i, gimple_omp_for_incr (simd, i)); + } + + tree *tgt = gimple_omp_for_clauses_ptr (parloop); + while (*tgt) + tgt = &OMP_CLAUSE_CHAIN (*tgt); + + /* Copy over all clauses, except for linear clauses, which are turned into + private clauses, and all other simd-specific clauses, which are + ignored. */ + tree *pc = gimple_omp_for_clauses_ptr (simd); + while (*pc) + { + tree c = *pc; + switch (OMP_CLAUSE_CODE (c)) + { + case OMP_CLAUSE_LINEAR: + { + tree priv = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_PRIVATE); + OMP_CLAUSE_DECL (priv) = OMP_CLAUSE_DECL (c); + OMP_CLAUSE_CHAIN (priv) = NULL; + *tgt = priv; + tgt = &OMP_CLAUSE_CHAIN (priv); + pc = &OMP_CLAUSE_CHAIN (c); + break; + } + + case OMP_CLAUSE_SAFELEN: + case OMP_CLAUSE_SIMDLEN: + case OMP_CLAUSE_ALIGNED: + pc = &OMP_CLAUSE_CHAIN (c); + break; + + default: + *pc = OMP_CLAUSE_CHAIN (c); + OMP_CLAUSE_CHAIN (c) = NULL; + *tgt = c; + tgt = &OMP_CLAUSE_CHAIN (c); + break; + } + } + + /* Finally, throw away the simd and mark the parallel loop as not + combined. */ + gimple_omp_set_body (parloop, gimple_omp_body (simd)); + gimple_omp_for_set_combined_p (parloop, false); +} + +/* Statement walker function marking all loops of a tiled kernel as grid + loops that represent threads of a particular thread group, and adjusting + their bodies accordingly. */ + +static tree +grid_mark_tiling_loops (gimple_stmt_iterator *gsi, bool *handled_ops_p, + struct walk_stmt_info *wi_in) +{ + *handled_ops_p = false; + if (gomp_for *loop = dyn_cast <gomp_for *> (gsi_stmt (*gsi))) + { + *handled_ops_p = true; + gimple_omp_for_set_kind (loop, GF_OMP_FOR_KIND_GRID_LOOP); + gimple_omp_for_set_grid_intra_group (loop, true); + if (gimple_omp_for_combined_p (loop)) + grid_eliminate_combined_simd_part (loop); + + struct walk_stmt_info body_wi; + memset (&body_wi, 0, sizeof (body_wi)); + walk_gimple_seq_mod (gimple_omp_body_ptr (loop), + grid_process_grid_body, NULL, &body_wi); + + gbind *bind = (gbind *) wi_in->info; + tree c; + for (c = gimple_omp_for_clauses (loop); c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE) + { + push_gimplify_context (); + tree ov = OMP_CLAUSE_DECL (c); + tree gv = copy_var_decl (ov, create_tmp_var_name (NULL), + TREE_TYPE (ov)); + + grid_mark_variable_segment (gv, GRID_SEGMENT_GROUP); + DECL_CONTEXT (gv) = current_function_decl; + gimple_bind_append_vars (bind, gv); + tree x = lang_hooks.decls.omp_clause_assign_op (c, gv, ov); + gimplify_and_add (x, &OMP_CLAUSE_LASTPRIVATE_GIMPLE_SEQ (c)); + x = lang_hooks.decls.omp_clause_copy_ctor (c, ov, gv); + gimple_seq l = NULL; + gimplify_and_add (x, &l); + gsi_insert_seq_after (gsi, l, GSI_SAME_STMT); + pop_gimplify_context (bind); + } + } + return NULL_TREE; +} + +/* Statement walker function marking all parallels as grid_phony and loops as + grid ones representing threads of a particular thread group.
+/* Statement walker function marking all parallels of a tiled kernel as
+   grid_phony, putting variables of their bind statements into the group
+   segment and marking the contained loops with grid_mark_tiling_loops.  */
+
+static tree
+grid_mark_tiling_parallels_and_loops (gimple_stmt_iterator *gsi,
+				      bool *handled_ops_p,
+				      struct walk_stmt_info *wi_in)
+{
+  *handled_ops_p = false;
+  wi_in->removed_stmt = false;
+  gimple *stmt = gsi_stmt (*gsi);
+  if (gbind *bind = dyn_cast <gbind *> (stmt))
+    {
+      for (tree var = gimple_bind_vars (bind); var; var = DECL_CHAIN (var))
+	grid_mark_variable_segment (var, GRID_SEGMENT_GROUP);
+    }
+  else if (gomp_parallel *parallel = dyn_cast <gomp_parallel *> (stmt))
+    {
+      *handled_ops_p = true;
+      gimple_omp_parallel_set_grid_phony (parallel, true);
+
+      gbind *new_bind = gimple_build_bind (NULL, NULL, make_node (BLOCK));
+      gimple_bind_set_body (new_bind, gimple_omp_body (parallel));
+      gimple_seq s = NULL;
+      gimple_seq_add_stmt (&s, new_bind);
+      gimple_omp_set_body (parallel, s);
+
+      struct walk_stmt_info wi_par;
+      memset (&wi_par, 0, sizeof (wi_par));
+      wi_par.info = new_bind;
+      walk_gimple_seq_mod (gimple_bind_body_ptr (new_bind),
+			   grid_mark_tiling_loops, NULL, &wi_par);
+    }
+  else if (is_a <gcall *> (stmt))
+    wi_in->removed_stmt = grid_handle_call_in_distribute (gsi);
+  return NULL_TREE;
+}
+
 /* Given freshly copied top level kernel SEQ, identify the individual OMP
-   components, mark them as part of kernel and return the inner loop, and copy
-   assignment leading to them just before DST, remapping them using WI and
-   adding new temporaries to TGT_BIND. */
+   components, mark them as part of the kernel, copy assignments leading to
+   them just before DST, remapping them using WI and adding new temporaries
+   to TGT_BIND, and return the loop that will be used for kernel dispatch. */
 
 static gomp_for *
-grid_process_kernel_body_copy (gimple_seq seq, gimple_stmt_iterator *dst,
+grid_process_kernel_body_copy (grid_prop *grid, gimple_seq seq,
+			       gimple_stmt_iterator *dst,
			       gbind *tgt_bind, struct walk_stmt_info *wi)
 {
-  gimple *stmt = grid_copy_leading_local_assignments (seq, dst, tgt_bind, wi);
+  gimple *stmt = grid_copy_leading_local_assignments (seq, dst, tgt_bind,
+						      GRID_SEGMENT_GLOBAL, wi);
   gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
   gcc_assert (teams);
   gimple_omp_teams_set_grid_phony (teams, true);
   stmt = grid_copy_leading_local_assignments (gimple_omp_body (teams), dst,
-					      tgt_bind, wi);
+					      tgt_bind, GRID_SEGMENT_GLOBAL, wi);
   gcc_checking_assert (stmt);
   gomp_for *dist = dyn_cast <gomp_for *> (stmt);
   gcc_assert (dist);
   gimple_seq prebody = gimple_omp_for_pre_body (dist);
   if (prebody)
-    grid_copy_leading_local_assignments (prebody, dst, tgt_bind, wi);
-  gimple_omp_for_set_grid_phony (dist, true);
-  stmt = grid_copy_leading_local_assignments (gimple_omp_body (dist), dst,
-					      tgt_bind, wi);
-  gcc_checking_assert (stmt);
+    grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
+					 GRID_SEGMENT_GROUP, wi);
 
-  gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
-  gimple_omp_parallel_set_grid_phony (parallel, true);
-  stmt = grid_copy_leading_local_assignments (gimple_omp_body (parallel), dst,
-					      tgt_bind, wi);
-  gomp_for *inner_loop = as_a <gomp_for *> (stmt);
-  gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_GRID_LOOP);
-  prebody = gimple_omp_for_pre_body (inner_loop);
-  if (prebody)
-    grid_copy_leading_local_assignments (prebody, dst, tgt_bind, wi);
+  if (grid->tiling)
+    {
+      gimple_omp_for_set_kind (dist, GF_OMP_FOR_KIND_GRID_LOOP);
+      gimple_omp_for_set_grid_group_iter (dist, true);
 
-  return inner_loop;
+      struct walk_stmt_info wi_tiled;
+      memset (&wi_tiled, 0, sizeof (wi_tiled));
+      walk_gimple_seq_mod (gimple_omp_body_ptr (dist),
+			   grid_mark_tiling_parallels_and_loops, NULL,
+			   &wi_tiled);
+      return dist;
+    }
+  else
+    {
+      gimple_omp_for_set_grid_phony (dist, true);
+      stmt = grid_copy_leading_local_assignments (gimple_omp_body (dist), dst,
+						  tgt_bind,
+						  GRID_SEGMENT_PRIVATE, wi);
+      gcc_checking_assert (stmt);
+      gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
+      gimple_omp_parallel_set_grid_phony (parallel, true);
+      stmt = grid_copy_leading_local_assignments (gimple_omp_body (parallel),
+						  dst, tgt_bind,
+						  GRID_SEGMENT_PRIVATE, wi);
+      gomp_for *inner_loop = as_a <gomp_for *> (stmt);
+      gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_GRID_LOOP);
+      prebody = gimple_omp_for_pre_body (inner_loop);
+      if (prebody)
+	grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
+					     GRID_SEGMENT_PRIVATE, wi);
+
+      if (gimple_omp_for_combined_p (inner_loop))
+	grid_eliminate_combined_simd_part (inner_loop);
+      struct walk_stmt_info body_wi;
+      memset (&body_wi, 0, sizeof (body_wi));
+      walk_gimple_seq_mod (gimple_omp_body_ptr (inner_loop),
+			   grid_process_grid_body, NULL, &body_wi);
+
+      return inner_loop;
+    }
 }
 
 /* If TARGET points to a GOMP_TARGET which follows a gridifiable pattern,
@@ -18064,14 +18785,16 @@ grid_attempt_target_gridification (gomp_target *target,
				   gimple_stmt_iterator *gsi,
				   gbind *tgt_bind)
 {
-  tree group_size;
-  if (!target || !grid_target_follows_gridifiable_pattern (target, &group_size))
+  grid_prop grid;
+  memset (&grid, 0, sizeof (grid));
+  if (!target || !grid_target_follows_gridifiable_pattern (target, &grid))
     return;
 
   location_t loc = gimple_location (target);
   if (dump_enabled_p ())
     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
-		     "Target construct will be turned into a gridified GPGPU "
+		     "Target construct will be turned into a gridified HSA "
		     "kernel\n");
 
   /* Copy target body to a GPUKERNEL construct: */
@@ -18084,8 +18807,8 @@ grid_attempt_target_gridification (gomp_target *target,
   wi.info = declmap;
 
   /* Copy assignments in between OMP statements before target, mark OMP
-     statements within copy appropriatly. */
-  gomp_for *inner_loop = grid_process_kernel_body_copy (kernel_seq, gsi,
+     statements within copy appropriately. */
+  gomp_for *inner_loop = grid_process_kernel_body_copy (&grid, kernel_seq, gsi,
							 tgt_bind, &wi);
 
   gbind *old_bind = as_a <gbind *> (gimple_seq_first (gimple_omp_body (target)));
@@ -18100,10 +18823,10 @@ grid_attempt_target_gridification (gomp_target *target,
     (gimple_bind_body_ptr (as_a <gbind *> (gimple_omp_body (target))),
      gpukernel);
 
-  walk_tree (&group_size, grid_remap_prebody_decls, &wi, NULL);
+  for (size_t i = 0; i < grid.collapse; i++)
+    walk_tree (&grid.group_sizes[i], grid_remap_prebody_decls, &wi, NULL);
 
   push_gimplify_context ();
-  size_t collapse = gimple_omp_for_collapse (inner_loop);
-  for (size_t i = 0; i < collapse; i++)
+  for (size_t i = 0; i < grid.collapse; i++)
     {
       tree itype, type = TREE_TYPE (gimple_omp_for_index (inner_loop, i));
       if (POINTER_TYPE_P (type))
@@ -18117,12 +18840,12 @@ grid_attempt_target_gridification (gomp_target *target,
       tree n2 = unshare_expr (gimple_omp_for_final (inner_loop, i));
       walk_tree (&n2, grid_remap_prebody_decls, &wi, NULL);
       adjust_for_condition (loc, &cond_code, &n2);
-      tree step;
-      step = get_omp_for_step_from_incr (loc,
-					 gimple_omp_for_incr (inner_loop, i));
-      gimple_seq tmpseq = NULL;
       n1 = fold_convert (itype, n1);
       n2 = fold_convert (itype, n2);
+
+      tree step
+	= get_omp_for_step_from_incr (loc, gimple_omp_for_incr (inner_loop, i));
+
       tree t = build_int_cst (itype, (cond_code == LT_EXPR ?
-1 : 1)); t = fold_build2 (PLUS_EXPR, itype, step, t); t = fold_build2 (PLUS_EXPR, itype, t, n2); @@ -18133,15 +18856,23 @@ grid_attempt_target_gridification (gomp_target *target, fold_build1 (NEGATE_EXPR, itype, step)); else t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step); + if (grid.tiling) + { + if (cond_code == GT_EXPR) + step = fold_build1 (NEGATE_EXPR, itype, step); + t = fold_build2 (MULT_EXPR, itype, t, step); + } + tree gs = fold_convert (uint32_type_node, t); + gimple_seq tmpseq = NULL; gimplify_expr (&gs, &tmpseq, NULL, is_gimple_val, fb_rvalue); if (!gimple_seq_empty_p (tmpseq)) gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT); tree ws; - if (i == 0 && group_size) + if (grid.group_sizes[i]) { - ws = fold_convert (uint32_type_node, group_size); + ws = fold_convert (uint32_type_node, grid.group_sizes[i]); tmpseq = NULL; gimplify_expr (&ws, &tmpseq, NULL, is_gimple_val, fb_rvalue); if (!gimple_seq_empty_p (tmpseq)) @@ -18262,7 +18993,7 @@ const pass_data pass_data_lower_omp = { GIMPLE_PASS, /* type */ "omplower", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ + OPTGROUP_OPENMP, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_gimple_any, /* properties_required */ PROP_gimple_lomp | PROP_gimple_lomp_dev, /* properties_provided */ @@ -18733,7 +19464,7 @@ const pass_data pass_data_diagnose_omp_blocks = { GIMPLE_PASS, /* type */ "*diagnose_omp_blocks", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ + OPTGROUP_OPENMP, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_gimple_any, /* properties_required */ 0, /* properties_provided */ @@ -20164,7 +20895,7 @@ const pass_data pass_data_oacc_device_lower = { GIMPLE_PASS, /* type */ "oaccdevlow", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ + OPTGROUP_OPENMP, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_cfg, /* properties_required */ 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */ @@ -20267,7 +20998,7 @@ const pass_data pass_data_omp_device_lower = { GIMPLE_PASS, /* type */ "ompdevlow", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ + OPTGROUP_OPENMP, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_cfg, /* properties_required */ PROP_gimple_lomp_dev, /* properties_provided */ @@ -20316,7 +21047,7 @@ const pass_data pass_data_omp_target_link = { GIMPLE_PASS, /* type */ "omptargetlink", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ + OPTGROUP_OPENMP, /* optinfo_flags */ TV_NONE, /* tv_id */ PROP_ssa, /* properties_required */ 0, /* properties_provided */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 7bdb38ac142..aeb8accf74c 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2016-11-23 Martin Jambor + + * c-c++-common/gomp/gridify-1.c: Update scan string. + * gfortran.dg/gomp/gridify-1.f90: Likewise. + * c-c++-common/gomp/gridify-2.c: New test. + * c-c++-common/gomp/gridify-3.c: Likewise. 
+ 2016-11-23 Richard Biener PR tree-optimization/78396 diff --git a/gcc/testsuite/c-c++-common/gomp/gridify-1.c b/gcc/testsuite/c-c++-common/gomp/gridify-1.c index ba7a86665b5..f9b03ebd96f 100644 --- a/gcc/testsuite/c-c++-common/gomp/gridify-1.c +++ b/gcc/testsuite/c-c++-common/gomp/gridify-1.c @@ -51,4 +51,4 @@ foo4 (int j, int n, int *a) } -/* { dg-final { scan-tree-dump-times "Target construct will be turned into a gridified GPGPU kernel" 4 "omplower" } } */ +/* { dg-final { scan-tree-dump-times "Target construct will be turned into a gridified HSA kernel" 4 "omplower" } } */ diff --git a/gcc/testsuite/c-c++-common/gomp/gridify-2.c b/gcc/testsuite/c-c++-common/gomp/gridify-2.c new file mode 100644 index 00000000000..6b5cc9a8a7d --- /dev/null +++ b/gcc/testsuite/c-c++-common/gomp/gridify-2.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target offload_hsa } */ +/* { dg-options "-fopenmp -fdump-tree-omplower-details" } */ + +#define BLOCK_SIZE 16 + + +void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA, + const float*B, const int LDB, const float beta, float*C, const int LDC){ + +#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N]) +#pragma omp distribute collapse(2) + for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) + for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) + { +// Each team has a local copy of these mini matrices + float As[BLOCK_SIZE][BLOCK_SIZE]; + float Bs[BLOCK_SIZE][BLOCK_SIZE]; +#pragma omp parallel + { + int C_row, C_col; + float Cval = 0.0; + + for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) + { +#pragma omp for collapse(2) + for (int row=0 ; row < BLOCK_SIZE ; row++) + for (int col=0 ; col < BLOCK_SIZE ; col++) + { + C_row = C_row_start + row; + C_col = C_col_start + col; + if ((C_row < M) && (kblock + col < K)) + As[row][col] = A[(C_row*LDA)+ kblock + col]; + else + As[row][col] = 0; + if ((kblock + row < K) && C_col < N) + Bs[row][col] = B[((kblock+row)*LDB)+ C_col]; + else + Bs[row][col] = 0; + } + +#pragma omp for collapse(2) + for (int row=0 ; row < BLOCK_SIZE ; row++) + for (int col=0 ; col < BLOCK_SIZE ; col++) + { + for (int e = 0; e < BLOCK_SIZE; ++e) + Cval += As[row][e] * Bs[e][col]; + } + } /* End for kblock .. 
*/ + + +#pragma omp for collapse(2) + for (int row=0 ; row < BLOCK_SIZE ; row++) + for (int col=0 ; col < BLOCK_SIZE ; col++) + { + C_row = C_row_start + row; + C_col = C_col_start + col; + if ((C_row < M) && (C_col < N)) + C[(C_row*LDC)+C_col] = alpha*Cval + beta*C[(C_row*LDC)+C_col]; + + } + } /* end parallel */ + } /* end target teams distribute */ +} + +/* { dg-final { scan-tree-dump "Target construct will be turned into a gridified HSA kernel" "omplower" } } */ diff --git a/gcc/testsuite/c-c++-common/gomp/gridify-3.c b/gcc/testsuite/c-c++-common/gomp/gridify-3.c new file mode 100644 index 00000000000..8dbeaef74cd --- /dev/null +++ b/gcc/testsuite/c-c++-common/gomp/gridify-3.c @@ -0,0 +1,68 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target offload_hsa } */ +/* { dg-options "-fopenmp -fdump-tree-omplower-details" } */ + +#define BLOCK_SIZE 16 + +void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA, + const float*B, const int LDB, const float beta, float*C, const int LDC) +{ +#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N]) +#pragma omp distribute collapse(2) + for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) + for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) + { + float As[BLOCK_SIZE][BLOCK_SIZE]; + float Bs[BLOCK_SIZE][BLOCK_SIZE]; + float Cs[BLOCK_SIZE][BLOCK_SIZE]; + int C_row, C_col; + +#pragma omp parallel for collapse(2) + for (int row=0 ; row < BLOCK_SIZE ; row++) + for (int col=0 ; col < BLOCK_SIZE ; col++) + { + Cs[row][col] = 0.0; + } + + + for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) + { +#pragma omp parallel for collapse(2) + for (int row=0 ; row < BLOCK_SIZE ; row++) + for (int col=0 ; col < BLOCK_SIZE ; col++) + { + C_row = C_row_start + row; + C_col = C_col_start + col; + if ((C_row < M) && (kblock + col < K)) + As[row][col] = A[(C_row*LDA)+ kblock + col]; + else + As[row][col] = 0; + if ((kblock + row < K) && C_col < N) + Bs[row][col] = B[((kblock+row)*LDB)+ C_col]; + else + Bs[row][col] = 0; + } + +#pragma omp parallel for collapse(2) + for (int row=0 ; row < BLOCK_SIZE ; row++) + for (int col=0 ; col < BLOCK_SIZE ; col++) + { + for (int e = 0; e < BLOCK_SIZE; ++e) + Cs[row][col] += As[row][e] * Bs[e][col]; + } + } /* End for kblock .. */ + + +#pragma omp parallel for collapse(2) + for (int row=0 ; row < BLOCK_SIZE ; row++) + for (int col=0 ; col < BLOCK_SIZE ; col++) + { + C_row = C_row_start + row; + C_col = C_col_start + col; + if ((C_row < M) && (C_col < N)) + C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col]; + } + } /* End distribute */ +} + +/* { dg-final { scan-tree-dump "Target construct will be turned into a gridified HSA kernel" "omplower" } } */ diff --git a/gcc/testsuite/gfortran.dg/gomp/gridify-1.f90 b/gcc/testsuite/gfortran.dg/gomp/gridify-1.f90 index 00ff7f510a0..7def27980ba 100644 --- a/gcc/testsuite/gfortran.dg/gomp/gridify-1.f90 +++ b/gcc/testsuite/gfortran.dg/gomp/gridify-1.f90 @@ -13,4 +13,4 @@ subroutine vector_square(n, a, b) !$omp end target teams end subroutine vector_square -! { dg-final { scan-tree-dump "Target construct will be turned into a gridified GPGPU kernel" "omplower" } } +! 
{ dg-final { scan-tree-dump "Target construct will be turned into a gridified HSA kernel" "omplower" } }
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 19d80394f4b..16781f9529a 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,9 @@
+2016-11-23  Martin Jambor
+
+	* testsuite/libgomp.hsa.c/bits-insns.c: New test.
+	* testsuite/libgomp.hsa.c/tiling-1.c: Likewise.
+	* testsuite/libgomp.hsa.c/tiling-2.c: Likewise.
+
 2016-11-23  Martin Liska
	    Martin Jambor
diff --git a/libgomp/testsuite/libgomp.hsa.c/bits-insns.c b/libgomp/testsuite/libgomp.hsa.c/bits-insns.c
new file mode 100644
index 00000000000..21cac72dac4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.hsa.c/bits-insns.c
@@ -0,0 +1,73 @@
+#include
+
+#define N 12
+
+int main()
+{
+  unsigned int arguments[N] = {0u, 1u, 2u, 3u, 111u, 333u, 444u, 0x80000000u, 0x0000ffffu, 0xf0000000u, 0xff000000u, 0xffffffffu};
+  int clrsb[N] = {};
+  int clz[N] = {};
+  int ctz[N] = {};
+  int ffs[N] = {};
+  int parity[N] = {};
+  int popcount[N] = {};
+
+  int ref_clrsb[N] = {};
+  int ref_clz[N] = {};
+  int ref_ctz[N] = {};
+  int ref_ffs[N] = {};
+  int ref_parity[N] = {};
+  int ref_popcount[N] = {};
+
+  for (unsigned i = 0; i < N; i++)
+    {
+      ref_clrsb[i] = __builtin_clrsb (arguments[i]);
+      ref_clz[i] = __builtin_clz (arguments[i]);
+      ref_ctz[i] = __builtin_ctz (arguments[i]);
+      ref_ffs[i] = __builtin_ffs (arguments[i]);
+      ref_parity[i] = __builtin_parity (arguments[i]);
+      ref_popcount[i] = __builtin_popcount (arguments[i]);
+    }
+
+  #pragma omp target map(from:clz, ctz, ffs, parity, popcount)
+  {
+    for (unsigned i = 0; i < N; i++)
+      {
+	clrsb[i] = __builtin_clrsb (arguments[i]);
+	clz[i] = __builtin_clz (arguments[i]);
+	ctz[i] = __builtin_ctz (arguments[i]);
+	ffs[i] = __builtin_ffs (arguments[i]);
+	parity[i] = __builtin_parity (arguments[i]);
+	popcount[i] = __builtin_popcount (arguments[i]);
+      }
+  }
+
+  for (unsigned i = 0; i < N; i++)
+    if (ref_clrsb[i] != clrsb[i])
+      __builtin_abort ();
+
+  /* CLZ is undefined for zero, so skip the zero argument.
*/ + for (unsigned i = 1; i < N; i++) + if (ref_clz[i] != clz[i]) + __builtin_abort (); + + /* Likewise for ctz */ + for (unsigned i = 1; i < N; i++) + if (ref_ctz[i] != ctz[i]) + __builtin_abort (); + + for (unsigned i = 0; i < N; i++) + if (ref_ffs[i] != ffs[i]) + __builtin_abort (); + + for (unsigned i = 0; i < N; i++) + if (ref_parity[i] != parity[i]) + __builtin_abort (); + + for (unsigned i = 0; i < N; i++) + if (ref_popcount[i] != popcount[i]) + __builtin_abort (); + + return 0; +} + diff --git a/libgomp/testsuite/libgomp.hsa.c/tiling-1.c b/libgomp/testsuite/libgomp.hsa.c/tiling-1.c new file mode 100644 index 00000000000..9149adc04e9 --- /dev/null +++ b/libgomp/testsuite/libgomp.hsa.c/tiling-1.c @@ -0,0 +1,212 @@ +/* + + matmul.c : Matrix Multiplication with tiling for openmp4 example + +*/ + +#include +#include + +#define BLOCK_SIZE 16 +/* + #define BLOCK_SIZE 32 +*/ +#define NSECPERSEC 1000000000L + +typedef struct { + int width; + int height; + int stride; + int hpad; + float* elements; +} Matrix; + +/* Correctly extract the number of nanoseconds from the two time structures */ +long int get_nanosecs( struct timespec start_time, struct timespec end_time) { + long int nanosecs; + if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs = + ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) + + ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ; + else nanosecs = + (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) + + ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec ); + return nanosecs; +} + +void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, + const float* B,const int LDB, const float beta,float* C, const int LDC) ; +void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, + const float* B,const int LDB, const float beta,float* C, const int LDC) ; +void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA, + const float* B,const int LDB, const float beta,float* C, const int LDC) ; + +int verify(float* v_res, float* v_ref, int len) { + int passed = 1; + int i; + for (i = 0; i < len; ++i) { + if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) { + __builtin_abort (); + } + } + return passed; +} + + +int main(int argc, char* argv[]){ + + Matrix A,B,Bt,C,Cref; + int a1,a2,a3,i,j; + struct timespec start_time1, end_time1; + struct timespec start_time2, end_time2; + long int nanosecs,total_ops; + float gflopsTiled,gflopsCPU; + + a1 = 35; + a2 = 28; + a3 = 47; + + A.height = a1; + A.width = a2; + A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float)); + + B.height = a2; + B.width = a3; + B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float)); + + /* Bt is same as B but stored in column-major order */ + Bt.height = B.height; + Bt.width = B.width; + Bt.stride = B.stride; + Bt.hpad = B.hpad; + Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float)); + + C.height = a1; + C.width = a3; + C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float)); + + Cref.height = a1; + Cref.width = a3; + Cref.stride = 
(((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float)); + + for(i = 0; i < A.hpad ; i++) + for(j = 0; j < A.stride; j++) { + if (( j +#include + +#define BLOCK_SIZE 16 +/* + #define BLOCK_SIZE 32 +*/ +#define NSECPERSEC 1000000000L + +typedef struct { + int width; + int height; + int stride; + int hpad; + float* elements; +} Matrix; + +/* Correctly extract the number of nanoseconds from the two time structures */ +long int get_nanosecs( struct timespec start_time, struct timespec end_time) { + long int nanosecs; + if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs = + ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) + + ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ; + else nanosecs = + (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) + + ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec ); + return nanosecs; +} + +void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, + const float* B,const int LDB, const float beta,float* C, const int LDC) ; +void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, + const float* B,const int LDB, const float beta,float* C, const int LDC) ; +void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA, + const float* B,const int LDB, const float beta,float* C, const int LDC) ; + +int verify(float* v_res, float* v_ref, int len) { + int passed = 1; + int i; + for (i = 0; i < len; ++i) { + if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) { + __builtin_abort (); + } + } + return passed; +} + + +int main(int argc, char* argv[]){ + + Matrix A,B,Bt,C,Cref; + int a1,a2,a3,i,j; + struct timespec start_time1, end_time1; + struct timespec start_time2, end_time2; + long int nanosecs,total_ops; + float gflopsTiled,gflopsCPU; + + a1 = 35; + a2 = 28; + a3 = 47; + + A.height = a1; + A.width = a2; + A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float)); + + B.height = a2; + B.width = a3; + B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float)); + + /* Bt is same as B but stored in column-major order */ + Bt.height = B.height; + Bt.width = B.width; + Bt.stride = B.stride; + Bt.hpad = B.hpad; + Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float)); + + C.height = a1; + C.width = a3; + C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float)); + + Cref.height = a1; + Cref.width = a3; + Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; + Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float)); + + for(i = 0; i < A.hpad ; i++) + for(j = 0; j < A.stride; j++) { + if (( j