[HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] =
"DC 4x2 atomic counter op",
[HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write",
+ [GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ] = "DC A64 scattered read",
+ [GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ] = "DC A64 untyped surface read",
+ [GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE] = "DC A64 untyped surface write",
+ [GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE] = "DC A64 scattered write",
[GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP] =
"DC untyped atomic float op",
};
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
- case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: {
+ case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
+ case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
+ case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: {
static const char *simd_modes[] = { "4x2", "16", "8" };
format(file, "SIMD%s, Mask = 0x%x",
simd_modes[msg_ctrl >> 4], msg_ctrl & 0xf);
return brw_dp_surface_desc(devinfo, msg_type, msg_control);
}
+/**
+ * Build a message descriptor for an A64 (stateless, 64-bit address)
+ * untyped surface read or write on data cache data port 1.
+ *
+ * \param exec_size     SIMD width of the message; 0 selects SIMD4x2
+ * \param num_channels  channel count folded into the channel mask via
+ *                      brw_mdc_cmask()
+ * \param write         true for the write message type, false for read
+ */
+static inline uint32_t
+brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo,
+                                   unsigned exec_size, /**< 0 for SIMD4x2 */
+                                   unsigned num_channels,
+                                   bool write)
+{
+   assert(exec_size <= 8 || exec_size == 16);
+   /* A64 data port messages only exist on gen8 and later */
+   assert(devinfo->gen >= 8);
+
+   unsigned msg_type =
+      write ? GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE :
+              GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ;
+
+   /* See also MDC_SM3 in the SKL PRM Vol 2d. */
+   const unsigned simd_mode = exec_size == 0 ? 0 : /* SIMD4x2 */
+                              exec_size <= 8 ? 2 : 1;
+
+   /* Message control: channel mask in bits 3:0, SIMD mode in bits 5:4 */
+   const unsigned msg_control =
+      SET_BITS(brw_mdc_cmask(num_channels), 3, 0) |
+      SET_BITS(simd_mode, 5, 4);
+
+   /* A64 messages are stateless, so they always use the stateless BTI */
+   return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
+}
+
+/**
+ * Calculate the data size (see MDC_A64_DS in the "Structures" volume of the
+ * Skylake PRM).
+ *
+ * \param elems  number of elements per slot; only 1, 2, 4, and 8 are
+ *               representable in the two-bit MDC_A64_DS encoding
+ */
+static inline uint32_t
+brw_mdc_a64_ds(unsigned elems)
+{
+   switch (elems) {
+   case 1: return 0;
+   case 2: return 1;
+   case 4: return 2;
+   case 8: return 3;
+   default:
+      unreachable("Unsupported element count for A64 scattered message");
+   }
+}
+
<![CDATA[+/**
+ * Build a message descriptor for an A64 (stateless, 64-bit address) byte
+ * scattered read or write on data cache data port 1.
+ *
+ * \param exec_size  SIMD width of the message; 0 selects SIMD4x2
+ * \param bit_size   size in bits of each element (must be a multiple of 8
+ *                   yielding 1/2/4/8 bytes, per brw_mdc_a64_ds())
+ * \param write      true for the write message type, false for read
+ */
+static inline uint32_t
+brw_dp_a64_byte_scattered_rw_desc(const struct gen_device_info *devinfo,
+                                  unsigned exec_size, /**< 0 for SIMD4x2 */
+                                  unsigned bit_size,
+                                  bool write)
+{
+   assert(exec_size <= 8 || exec_size == 16);
+   /* NOTE(review): the write uses a GEN8_* message type but the read a
+    * GEN9_* one while we assert gen >= 8 here — confirm the scattered read
+    * message is valid on gen8 or tighten this assertion.
+    */
+   assert(devinfo->gen >= 8);
+
+   unsigned msg_type =
+      write ? GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE :
+              GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ;
+
+   /* Message control: subtype in bits 1:0, data size in bits 3:2,
+    * SIMD16 flag in bit 4
+    */
+   const unsigned msg_control =
+      SET_BITS(GEN8_A64_SCATTERED_SUBTYPE_BYTE, 1, 0) |
+      SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) |
+      SET_BITS(exec_size == 16, 4, 4);
+
+   /* A64 messages are stateless, so they always use the stateless BTI */
+   return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
+}]]>
+
static inline uint32_t
brw_dp_typed_atomic_desc(const struct gen_device_info *devinfo,
unsigned exec_size,
SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
+ /**
+ * Untyped A64 surface access opcodes.
+ *
+ * Source 0: 64-bit address
+ * Source 1: Operational source
+ * Source 2: [required] Opcode-specific control immediate, same as source 2
+ * of the matching non-LOGICAL opcode.
+ */
+ SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
+ SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
+ SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
+ SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
+
SHADER_OPCODE_TYPED_ATOMIC,
SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
SHADER_OPCODE_TYPED_SURFACE_READ,
#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP 11
#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2 12
#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE 13
+#define GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ 0x10
+#define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ 0x11
+#define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE 0x19
+#define GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE 0x1a
#define GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP 0x1b
/* GEN9 */
#define GEN9_DATAPORT_RC_RENDER_TARGET_WRITE 12
#define GEN9_DATAPORT_RC_RENDER_TARGET_READ 13
+/* A64 scattered message subtype */
+#define GEN8_A64_SCATTERED_SUBTYPE_BYTE 0
+#define GEN8_A64_SCATTERED_SUBTYPE_DWORD 1
+#define GEN8_A64_SCATTERED_SUBTYPE_QWORD 2
+#define GEN8_A64_SCATTERED_SUBTYPE_HWORD 3
+
/* Dataport special binding table indices: */
#define BRW_BTI_STATELESS 255
#define GEN7_BTI_SLM 254
else
return 1;
+ case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
+ assert(src[2].file == IMM);
+ return 1;
+
+ case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
+ assert(src[2].file == IMM);
+ return i == 1 ? src[2].ud : 1;
+
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
/* Scattered logical opcodes use the following params:
* src[0] Surface coordinates
inst->resize_sources(4);
}
<![CDATA[+/**
+ * Lower an A64 (stateless, 64-bit address) logical surface opcode into a
+ * SHADER_OPCODE_SEND with an explicit payload and message descriptor.
+ *
+ * Source 0 is the 64-bit address, source 1 the operational source data, and
+ * source 2 an opcode-specific immediate (the channel count for untyped
+ * messages, the bit size for byte scattered messages).
+ */
+static void
+lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+
+   const fs_reg &addr = inst->src[0];
+   const fs_reg &src = inst->src[1];
+   const unsigned src_comps = inst->components_read(1);
+   assert(inst->src[2].file == IMM);
+   const unsigned arg = inst->src[2].ud;
+   const bool has_side_effects = inst->has_side_effects();
+
+   /* If the surface message has side effects and we're a fragment shader, we
+    * have to predicate with the sample mask to avoid helper invocations.
+    */
+   if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) {
+      inst->flag_subreg = 2;
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->predicate_inverse = false;
+
+      /* Copy the sample mask into flag f1.0 so the SEND can predicate on it */
+      fs_reg sample_mask = bld.sample_mask_reg();
+      const fs_builder ubld = bld.group(1, 0).exec_all();
+      ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
+               sample_mask);
+   }
+
+   /* Add two because the address is 64-bit */
+   const unsigned dwords = 2 + src_comps;
+   const unsigned mlen = dwords * (inst->exec_size / 8);
+
+   /* Payload layout: the 64-bit address first, then one source per data
+    * component read from source 1.
+    */
+   fs_reg sources[5];
+
+   sources[0] = addr;
+
+   for (unsigned i = 0; i < src_comps; i++)
+      sources[1 + i] = offset(src, bld, i);
+
+   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
+   bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
+
+   /* Pick the message descriptor matching the logical opcode; 'arg' is
+    * interpreted per-opcode (channel count vs. bit size).
+    */
+   uint32_t desc;
+   switch (inst->opcode) {
+   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
+      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
+                                                arg,   /* num_channels */
+                                                false  /* write */);
+      break;
+
+   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
+      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
+                                                arg,   /* num_channels */
+                                                true   /* write */);
+      break;
+
+   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
+      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
+                                               arg,   /* bit_size */
+                                               false  /* write */);
+      break;
+
+   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
+      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
+                                               arg,   /* bit_size */
+                                               true   /* write */);
+      break;
+
+   default:
+      unreachable("Unknown A64 logical instruction");
+   }
+
+   /* Update the original instruction. */
+   inst->opcode = SHADER_OPCODE_SEND;
+   inst->mlen = mlen;
+   inst->header_size = 0;
+   inst->send_has_side_effects = has_side_effects;
+   inst->send_is_volatile = !has_side_effects;
+
+   /* Set up SFID and descriptors */
+   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
+   inst->desc = desc;
+   inst->resize_sources(3);
+   inst->src[0] = brw_imm_ud(0); /* desc */
+   inst->src[1] = brw_imm_ud(0); /* ex_desc */
+   inst->src[2] = payload;
+}]]>
+
static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
{
lower_surface_logical_send(ibld, inst);
break;
+ case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
+ case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
+ case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
+ case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
+ lower_a64_logical_send(ibld, inst);
+ break;
+
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
lower_varying_pull_constant_logical_send(ibld, inst);
break;
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
return MIN2(16, inst->exec_size);
+ case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
+ case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
+ case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
+ case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
+ return devinfo->gen <= 8 ? 8 : MIN2(16, inst->exec_size);
+
case SHADER_OPCODE_URB_READ_SIMD8:
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
case SHADER_OPCODE_URB_WRITE_SIMD8:
break;
}
+ case nir_intrinsic_load_global: {
+ assert(devinfo->gen >= 8);
+
+ if (nir_intrinsic_align(instr) >= 4) {
+ assert(nir_dest_bit_size(instr->dest) == 32);
+ fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
+ dest,
+ get_nir_src(instr->src[0]), /* Address */
+ fs_reg(), /* No source data */
+ brw_imm_ud(instr->num_components));
+ inst->size_written = instr->num_components *
+ inst->dst.component_size(inst->exec_size);
+ } else {
+ const unsigned bit_size = nir_dest_bit_size(instr->dest);
+ assert(bit_size <= 32);
+ assert(nir_dest_num_components(instr->dest) == 1);
+ brw_reg_type data_type =
+ brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
+ tmp,
+ get_nir_src(instr->src[0]), /* Address */
+ fs_reg(), /* No source data */
+ brw_imm_ud(bit_size));
+ bld.MOV(retype(dest, data_type), tmp);
+ }
+ break;
+ }
+
+ case nir_intrinsic_store_global:
+ assert(devinfo->gen >= 8);
+
+ if (stage == MESA_SHADER_FRAGMENT)
+ brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+ if (nir_intrinsic_align(instr) >= 4) {
+ assert(nir_src_bit_size(instr->src[0]) == 32);
+ bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
+ fs_reg(),
+ get_nir_src(instr->src[1]), /* Address */
+ get_nir_src(instr->src[0]), /* Data */
+ brw_imm_ud(instr->num_components));
+ } else {
+ const unsigned bit_size = nir_src_bit_size(instr->src[0]);
+ assert(bit_size <= 32);
+ assert(nir_src_num_components(instr->src[0]) == 1);
+ brw_reg_type data_type =
+ brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
+ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
+ bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
+ fs_reg(),
+ get_nir_src(instr->src[1]), /* Address */
+ tmp, /* Data */
+ brw_imm_ud(nir_src_bit_size(instr->src[0])));
+ }
+ break;
+
case nir_intrinsic_load_ssbo: {
assert(devinfo->gen >= 7);
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
+ case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
+ case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
+ case GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
+ case GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
/* See also SHADER_OPCODE_UNTYPED_SURFACE_READ */
latency = 300;
break;
return "untyped_surface_write";
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
return "untyped_surface_write_logical";
+ case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
+ return "a64_untyped_read_logical";
+ case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
+ return "a64_untyped_write_logical";
+ case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
+ return "a64_byte_scattered_read_logical";
+ case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
+ return "a64_byte_scattered_write_logical";
case SHADER_OPCODE_TYPED_ATOMIC:
return "typed_atomic";
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
+ case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
case SHADER_OPCODE_TYPED_ATOMIC:
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
case SHADER_OPCODE_BYTE_SCATTERED_READ:
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
+ case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
+ case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
case SHADER_OPCODE_URB_READ_SIMD8:
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
case VEC4_OPCODE_URB_READ: