From 1c25bf4373d68777c3561fdd1a30766698437109 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 14 Nov 2018 17:13:57 -0600 Subject: [PATCH] intel/fs: Implement load/store_global with A64 untyped messages Reviewed-by: Kenneth Graunke --- src/intel/compiler/brw_disasm.c | 8 +- src/intel/compiler/brw_eu.h | 62 ++++++++++ src/intel/compiler/brw_eu_defines.h | 23 ++++ src/intel/compiler/brw_fs.cpp | 107 ++++++++++++++++++ src/intel/compiler/brw_fs_nir.cpp | 58 ++++++++++ .../compiler/brw_schedule_instructions.cpp | 4 + src/intel/compiler/brw_shader.cpp | 12 ++ 7 files changed, 273 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c index fc1256e6955..5624bedd2e7 100644 --- a/src/intel/compiler/brw_disasm.c +++ b/src/intel/compiler/brw_disasm.c @@ -437,6 +437,10 @@ static const char *const dp_dc1_msg_type_hsw[32] = { [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] = "DC 4x2 atomic counter op", [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write", + [GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ] = "DC A64 scattered read", + [GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ] = "DC A64 untyped surface read", + [GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE] = "DC A64 untyped surface write", + [GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE] = "DC A64 scattered write", [GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP] = "DC untyped atomic float op", }; @@ -1941,7 +1945,9 @@ brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo, case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ: case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE: case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ: - case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: { + case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: + case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE: + case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: { static const char *simd_modes[] = { "4x2", "16", "8" }; format(file, "SIMD%s, Mask = 0x%x", 
simd_modes[msg_ctrl >> 4], msg_ctrl & 0xf); diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index a7041ea4a34..ec3aafb8363 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -687,6 +687,68 @@ brw_dp_byte_scattered_rw_desc(const struct gen_device_info *devinfo, return brw_dp_surface_desc(devinfo, msg_type, msg_control); } +static inline uint32_t +brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo, + unsigned exec_size, /**< 0 for SIMD4x2 */ + unsigned num_channels, + bool write) +{ /* Returns the stateless data-port descriptor for an A64 untyped surface read or write. */ + assert(exec_size <= 8 || exec_size == 16); + assert(devinfo->gen >= 8); + + unsigned msg_type = + write ? GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE : + GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ; + + /* See also MDC_SM3 in the SKL PRM Vol 2d. */ + const unsigned simd_mode = exec_size == 0 ? 0 : /* SIMD4x2 */ + exec_size <= 8 ? 2 : 1; /* 2 for exec_size 1..8, 1 for exec_size 16 */ + + const unsigned msg_control = + SET_BITS(brw_mdc_cmask(num_channels), 3, 0) | + SET_BITS(simd_mode, 5, 4); + + return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control); +} + +/** + * Calculate the data size (see MDC_A64_DS in the "Structures" volume of the + * Skylake PRM). + */ +static inline uint32_t +brw_mdc_a64_ds(unsigned elems) +{ + switch (elems) { + case 1: return 0; + case 2: return 1; + case 4: return 2; + case 8: return 3; + default: /* only 1, 2, 4 and 8 elements have an encoding */ + unreachable("Unsupported element count for A64 scattered message"); + } +} + +static inline uint32_t +brw_dp_a64_byte_scattered_rw_desc(const struct gen_device_info *devinfo, + unsigned exec_size, /**< 0 for SIMD4x2 -- NOTE(review): only exec_size == 16 is encoded in this descriptor; confirm 0 is meaningful here */ + unsigned bit_size, + bool write) +{ /* Returns the stateless data-port descriptor for an A64 byte-scattered read or write; bit_size / 8 selects the data-size field. */ + assert(exec_size <= 8 || exec_size == 16); + assert(devinfo->gen >= 8); + + unsigned msg_type = + write ? 
GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE : + GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ; + + const unsigned msg_control = + SET_BITS(GEN8_A64_SCATTERED_SUBTYPE_BYTE, 1, 0) | + SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) | /* data size, from the access width in bytes */ + SET_BITS(exec_size == 16, 4, 4); /* set only for SIMD16 */ + + return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control); +} + static inline uint32_t brw_dp_typed_atomic_desc(const struct gen_device_info *devinfo, unsigned exec_size, diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index d3dfd6dc7e8..6c2a2f8ef7c 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -412,6 +412,19 @@ enum opcode { SHADER_OPCODE_UNTYPED_SURFACE_WRITE, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + /** + * Untyped A64 surface access opcodes. + * + * Source 0: 64-bit address + * Source 1: Operational source + * Source 2: [required] Opcode-specific control immediate, same as source 2 + * of the matching non-LOGICAL opcode. + */ + SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, + SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, + SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, + SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, + SHADER_OPCODE_TYPED_ATOMIC, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, SHADER_OPCODE_TYPED_SURFACE_READ, @@ -1170,12 +1183,22 @@ enum brw_message_target { #define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP 11 #define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2 12 #define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE 13 +#define GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ 0x10 +#define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ 0x11 +#define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE 0x19 +#define GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE 0x1a #define GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP 0x1b /* GEN9 */ #define GEN9_DATAPORT_RC_RENDER_TARGET_WRITE 12 #define GEN9_DATAPORT_RC_RENDER_TARGET_READ 13 +/* A64 scattered message subtype */ +#define GEN8_A64_SCATTERED_SUBTYPE_BYTE 0 +#define 
GEN8_A64_SCATTERED_SUBTYPE_DWORD 1 +#define GEN8_A64_SCATTERED_SUBTYPE_QWORD 2 +#define GEN8_A64_SCATTERED_SUBTYPE_HWORD 3 + /* Dataport special binding table indices: */ #define BRW_BTI_STATELESS 255 #define GEN7_BTI_SLM 254 diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 303b1c1b272..70f9862c137 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -789,6 +789,14 @@ fs_inst::components_read(unsigned i) const else return 1; + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + assert(src[2].file == IMM); + return 1; + + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + assert(src[2].file == IMM); + return i == 1 ? src[2].ud : 1; /* the data source (src[1]) has as many components as the immediate in src[2] says */ + case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: /* Scattered logical opcodes use the following params: * src[0] Surface coordinates @@ -5207,6 +5215,92 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) inst->resize_sources(4); } +static void +lower_a64_logical_send(const fs_builder &bld, fs_inst *inst) +{ /* Rewrites an A64 logical opcode in place into a SHADER_OPCODE_SEND whose payload is the address followed by the data components. */ + const gen_device_info *devinfo = bld.shader->devinfo; + + const fs_reg &addr = inst->src[0]; + const fs_reg &src = inst->src[1]; + const unsigned src_comps = inst->components_read(1); /* component count reported for the data source, src[1] */ + assert(inst->src[2].file == IMM); + const unsigned arg = inst->src[2].ud; /* num_channels or bit_size, depending on the opcode (see the switch below) */ + const bool has_side_effects = inst->has_side_effects(); + + /* If the surface message has side effects and we're a fragment shader, we + * have to predicate with the sample mask to avoid helper invocations. 
+ */ + if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) { + inst->flag_subreg = 2; + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = false; + + fs_reg sample_mask = bld.sample_mask_reg(); + const fs_builder ubld = bld.group(1, 0).exec_all(); + ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type), + sample_mask); + } + + /* Add two because the address is 64-bit */ + const unsigned dwords = 2 + src_comps; + const unsigned mlen = dwords * (inst->exec_size / 8); + + fs_reg sources[5]; /* slot 0 holds the address, slots 1.. hold the data components, so at most 4 components fit */ + + sources[0] = addr; + + for (unsigned i = 0; i < src_comps; i++) + sources[1 + i] = offset(src, bld, i); + + const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords); + bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0); + + uint32_t desc; /* assigned by every non-unreachable case of the switch below */ + switch (inst->opcode) { + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, + arg, /* num_channels */ + false /* write */); + break; + + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, + arg, /* num_channels */ + true /* write */); + break; + + case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: + desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, + arg, /* bit_size */ + false /* write */); + break; + + case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: + desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, + arg, /* bit_size */ + true /* write */); + break; + + default: + unreachable("Unknown A64 logical instruction"); + } + + /* Update the original instruction. 
*/ + inst->opcode = SHADER_OPCODE_SEND; + inst->mlen = mlen; + inst->header_size = 0; + inst->send_has_side_effects = has_side_effects; + inst->send_is_volatile = !has_side_effects; /* messages without side effects are marked volatile instead */ + + /* Set up SFID and descriptors */ + inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1; + inst->desc = desc; + inst->resize_sources(3); + inst->src[0] = brw_imm_ud(0); /* desc */ + inst->src[1] = brw_imm_ud(0); /* ex_desc */ + inst->src[2] = payload; +} + static void lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst) { @@ -5381,6 +5475,13 @@ fs_visitor::lower_logical_sends() lower_surface_logical_send(ibld, inst); break; + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: + lower_a64_logical_send(ibld, inst); + break; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: lower_varying_pull_constant_logical_send(ibld, inst); break; @@ -5878,6 +5979,12 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: return MIN2(16, inst->exec_size); + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: + return devinfo->gen <= 8 ? 
8 : MIN2(16, inst->exec_size); + case SHADER_OPCODE_URB_READ_SIMD8: case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: case SHADER_OPCODE_URB_WRITE_SIMD8: diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 9012fe11d44..06d45d77002 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3971,6 +3971,64 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_load_global: { + assert(devinfo->gen >= 8); + + if (nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_bit_size(instr->dest) == 32); + fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, + dest, + get_nir_src(instr->src[0]), /* Address */ + fs_reg(), /* No source data */ + brw_imm_ud(instr->num_components)); + inst->size_written = instr->num_components * + inst->dst.component_size(inst->exec_size); + } else { + const unsigned bit_size = nir_dest_bit_size(instr->dest); + assert(bit_size <= 32); + assert(nir_dest_num_components(instr->dest) == 1); + brw_reg_type data_type = + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, + tmp, + get_nir_src(instr->src[0]), /* Address */ + fs_reg(), /* No source data */ + brw_imm_ud(bit_size)); + bld.MOV(retype(dest, data_type), tmp); + } + break; + } + + case nir_intrinsic_store_global: + assert(devinfo->gen >= 8); + + if (stage == MESA_SHADER_FRAGMENT) + brw_wm_prog_data(prog_data)->has_side_effects = true; + + if (nir_intrinsic_align(instr) >= 4) { + assert(nir_src_bit_size(instr->src[0]) == 32); + bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, + fs_reg(), + get_nir_src(instr->src[1]), /* Address */ + get_nir_src(instr->src[0]), /* Data */ + brw_imm_ud(instr->num_components)); + } else { + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + assert(bit_size <= 32); + assert(nir_src_num_components(instr->src[0]) == 
1); + brw_reg_type data_type = + brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type)); + bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), + get_nir_src(instr->src[1]), /* Address */ + tmp, /* Data */ + brw_imm_ud(nir_src_bit_size(instr->src[0]))); + } + break; + case nir_intrinsic_load_ssbo: { assert(devinfo->gen >= 7); diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index be57802b227..692398da496 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -486,6 +486,10 @@ schedule_node::set_latency_gen7(bool is_haswell) case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE: case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ: case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: + case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE: + case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: + case GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE: + case GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ: /* See also SHADER_OPCODE_UNTYPED_SURFACE_READ */ latency = 300; break; diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index b4c74871a48..c2751557af8 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -294,6 +294,14 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) return "untyped_surface_write"; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: return "untyped_surface_write_logical"; + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + return "a64_untyped_read_logical"; + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + return "a64_untyped_write_logical"; + case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: + return "a64_byte_scattered_read_logical"; + case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: + return 
"a64_byte_scattered_write_logical"; case SHADER_OPCODE_TYPED_ATOMIC: return "typed_atomic"; case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: @@ -1010,6 +1018,8 @@ backend_instruction::has_side_effects() const case SHADER_OPCODE_GEN4_SCRATCH_WRITE: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_WRITE: case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_TYPED_ATOMIC: @@ -1048,6 +1058,8 @@ backend_instruction::is_volatile() const case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_READ: case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: + case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: + case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_URB_READ_SIMD8: case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: case VEC4_OPCODE_URB_READ: -- 2.30.2