i965: Allow indirect GS input indexing in the scalar backend.
authorKenneth Graunke <kenneth@whitecape.org>
Sun, 8 Nov 2015 02:58:59 +0000 (18:58 -0800)
committerKenneth Graunke <kenneth@whitecape.org>
Wed, 18 Nov 2015 23:42:36 +0000 (15:42 -0800)
This allows arbitrary non-constant indices on GS input arrays,
both for the vertex index, and any array offsets beyond that.

All indirects are handled via the pull model.  We could potentially
handle indirect addressing of pushed data as well, but it would add
additional code complexity, and we usually have to pull inputs anyway
due to the sheer volume of input data.  Plus, marking pushed inputs
as live due to indirect addressing could exacerbate register pressure
problems pretty badly.  We'd need to be careful.

v2: Use updated MOV_INDIRECT opcode.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_fs.h
src/mesa/drivers/dri/i965/brw_fs_nir.cpp
src/mesa/drivers/dri/i965/brw_shader.cpp

index 995ab229544be0cc74b19f55f3ad8b36a29ba089..72a21587a4fdac1061e4e1ce881d629158b64bce 100644 (file)
@@ -1689,24 +1689,7 @@ fs_visitor::assign_gs_urb_setup()
    first_non_payload_grf +=
       8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
 
-   const unsigned first_icp_handle = payload.num_regs -
-      (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
-
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      /* Lower URB_READ_SIMD8 opcodes into real messages. */
-      if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
-         assert(inst->src[0].file == IMM);
-         inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
-                                            inst->src[0].ud,
-                                            0), BRW_REGISTER_TYPE_UD);
-         /* for now, assume constant - we can do per-slot offsets later */
-         assert(inst->src[1].file == IMM);
-         inst->offset = inst->src[1].ud;
-         inst->src[1] = fs_reg();
-         inst->mlen = 1;
-         inst->base_mrf = -1;
-      }
-
       /* Rewrite all ATTR file references to GRFs. */
       convert_attr_sources_to_hw_regs(inst);
    }
index cbfc07f68bce974ac4a8b4d46866d9ba4ca6c5bf..f52093ba3ce4ab85744ca597e04b57a196a46c1a 100644 (file)
@@ -303,7 +303,8 @@ public:
                        unsigned stream_id);
    void emit_gs_thread_end();
    void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
-                           unsigned offset, unsigned num_components);
+                           const fs_reg &indirect_offset, unsigned imm_offset,
+                           unsigned num_components);
    void emit_cs_terminate();
    fs_reg *emit_cs_local_invocation_id_setup();
    fs_reg *emit_cs_work_group_id_setup();
index c282f835cae8f31fd9df256f178110298d6b1718..ebdcb3a42463dccd98440a09a19cd62dc5018f4f 100644 (file)
@@ -1551,42 +1551,113 @@ fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
 void
 fs_visitor::emit_gs_input_load(const fs_reg &dst,
                                const nir_src &vertex_src,
-                               unsigned input_offset,
+                               const fs_reg &indirect_offset,
+                               unsigned imm_offset,
                                unsigned num_components)
 {
-   const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
-   const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
+   struct brw_gs_prog_data *gs_prog_data = (struct brw_gs_prog_data *) prog_data;
 
-   const unsigned array_stride = vue_prog_data->urb_read_length * 8;
+   /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
+    * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
+    * gl_PointSize is available as a GS input, however, so it must be that.
+    */
+   const bool is_point_size =
+      indirect_offset.file == BAD_FILE && imm_offset == 0;
+
+   nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
+   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
+
+   if (indirect_offset.file == BAD_FILE && vertex_const != NULL &&
+       4 * imm_offset < push_reg_count) {
+      imm_offset = 4 * imm_offset + vertex_const->u[0] * push_reg_count;
+      /* This input was pushed into registers. */
+      if (is_point_size) {
+         /* gl_PointSize comes in .w */
+         bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
+      } else {
+         for (unsigned i = 0; i < num_components; i++) {
+            bld.MOV(offset(dst, bld, i),
+                    fs_reg(ATTR, imm_offset + i, dst.type));
+         }
+      }
+   } else {
+      /* Resort to the pull model.  Ensure the VUE handles are provided. */
+      gs_prog_data->base.include_vue_handles = true;
 
-   const bool pushed = 4 * input_offset < array_stride;
+      unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
+      fs_reg icp_handle;
 
-   if (input_offset == 0) {
-      /* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
-       * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
-       * Only gl_PointSize is available as a GS input, so they must
-       * be asking for that input.
-       */
-      if (pushed) {
-         bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
+      if (vertex_const) {
+         /* The vertex index is constant; just select the proper URB handle. */
+         icp_handle =
+            retype(brw_vec8_grf(first_icp_handle + vertex_const->i[0], 0),
+                   BRW_REGISTER_TYPE_UD);
       } else {
-         fs_reg tmp = bld.vgrf(dst.type, 4);
-         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
-                                  fs_reg(vertex), fs_reg(0));
-         inst->regs_written = 4;
-         bld.MOV(dst, offset(tmp, bld, 3));
+         /* The vertex index is non-constant.  We need to use indirect
+          * addressing to fetch the proper URB handle.
+          *
+          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+          * indicating that channel <n> should read the handle from
+          * DWord <n>.  We convert that to bytes by multiplying by 4.
+          *
+          * Next, we convert the vertex index to bytes by multiplying
+          * by 32 (shifting by 5), and add the two together.  This is
+          * the final indirect byte offset.
+          */
+         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
+         fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
+         bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
+         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+         bld.SHL(channel_offsets, sequence, fs_reg(2u));
+         /* Convert vertex_index to bytes (multiply by 32) */
+         bld.SHL(vertex_offset_bytes,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(5u));
+         bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
+
+         /* Use first_icp_handle as the base offset.  There is one register
+          * of URB handles per vertex, so inform the register allocator that
+          * we might read up to nir->info.gs.vertices_in registers.
+          */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  fs_reg(brw_vec8_grf(first_icp_handle, 0)),
+                  fs_reg(icp_offset_bytes),
+                  fs_reg(nir->info.gs.vertices_in * REG_SIZE));
       }
-   } else {
-      if (pushed) {
-         int index = vertex * array_stride + 4 * input_offset;
-         for (unsigned i = 0; i < num_components; i++) {
-            bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
-         }
+
+      fs_inst *inst;
+      if (indirect_offset.file == BAD_FILE) {
+         /* Constant indexing - use global offset. */
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->mlen = 1;
+         inst->regs_written = num_components;
       } else {
-         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
-                                  fs_reg(vertex), fs_reg(input_offset));
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = { icp_handle, indirect_offset };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->mlen = 2;
          inst->regs_written = num_components;
       }
+
+      if (is_point_size) {
+         /* Read the whole VUE header (because of alignment) and read .w. */
+         fs_reg tmp = bld.vgrf(dst.type, 4);
+         inst->dst = tmp;
+         inst->regs_written = 4;
+         bld.MOV(dst, offset(tmp, bld, 3));
+      }
    }
 }
 
@@ -1626,6 +1697,7 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
 {
    assert(stage == MESA_SHADER_GEOMETRY);
+   fs_reg indirect_offset;
 
    fs_reg dest;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
@@ -1644,9 +1716,11 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
       unreachable("load_input intrinsics are invalid for the GS stage");
 
    case nir_intrinsic_load_per_vertex_input_indirect:
-      assert(!"Not allowed");
+      indirect_offset = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_D);
+      /* fallthrough */
    case nir_intrinsic_load_per_vertex_input:
-      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
+      emit_gs_input_load(dest, instr->src[0],
+                         indirect_offset, instr->const_index[0],
                          instr->num_components);
       break;
 
index c4a567f4cc90c3cf9b5cc4b3de2d7fbae7af9f96..d22164874c3e0f3ede5108604843542d45e11e2f 100644 (file)
@@ -137,6 +137,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
    }
 
+   if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
+      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
+
    return compiler;
 }