i965: Allow indirect GS input indexing in the scalar backend.

author Kenneth Graunke <kenneth@whitecape.org>

Sun, 8 Nov 2015 02:58:59 +0000 (18:58 -0800)

committer Kenneth Graunke <kenneth@whitecape.org>

Wed, 18 Nov 2015 23:42:36 +0000 (15:42 -0800)
author Kenneth Graunke <kenneth@whitecape.org>
Sun, 8 Nov 2015 02:58:59 +0000 (18:58 -0800)
committer Kenneth Graunke <kenneth@whitecape.org>
Wed, 18 Nov 2015 23:42:36 +0000 (15:42 -0800)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 995ab229544be0cc74b19f55f3ad8b36a29ba089..72a21587a4fdac1061e4e1ce881d629158b64bce 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1689,24 +1689,7 @@ fs_visitor::assign_gs_urb_setup()
     first_non_payload_grf +=
        8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
  
-   const unsigned first_icp_handle = payload.num_regs -
-      (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
-
     foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      /* Lower URB_READ_SIMD8 opcodes into real messages. */
-      if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
-         assert(inst->src[0].file == IMM);
-         inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
-                                            inst->src[0].ud,
-                                            0), BRW_REGISTER_TYPE_UD);
-         /* for now, assume constant - we can do per-slot offsets later */
-         assert(inst->src[1].file == IMM);
-         inst->offset = inst->src[1].ud;
-         inst->src[1] = fs_reg();
-         inst->mlen = 1;
-         inst->base_mrf = -1;
-      }
-
        /* Rewrite all ATTR file references to GRFs. */
        convert_attr_sources_to_hw_regs(inst);
     }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h

index cbfc07f68bce974ac4a8b4d46866d9ba4ca6c5bf..f52093ba3ce4ab85744ca597e04b57a196a46c1a 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -303,7 +303,8 @@ public:
                         unsigned stream_id);
     void emit_gs_thread_end();
     void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
-                           unsigned offset, unsigned num_components);
+                           const fs_reg &indirect_offset, unsigned imm_offset,
+                           unsigned num_components);
     void emit_cs_terminate();
     fs_reg *emit_cs_local_invocation_id_setup();
     fs_reg *emit_cs_work_group_id_setup();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp

index c282f835cae8f31fd9df256f178110298d6b1718..ebdcb3a42463dccd98440a09a19cd62dc5018f4f 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1551,42 +1551,113 @@ fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
  void
  fs_visitor::emit_gs_input_load(const fs_reg &dst,
                                 const nir_src &vertex_src,
-                               unsigned input_offset,
+                               const fs_reg &indirect_offset,
+                               unsigned imm_offset,
                                 unsigned num_components)
  {
-   const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
-   const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
+   struct brw_gs_prog_data *gs_prog_data = (struct brw_gs_prog_data *) prog_data;
  
-   const unsigned array_stride = vue_prog_data->urb_read_length * 8;
+   /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
+    * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
+    * gl_PointSize is available as a GS input, however, so it must be that.
+    */
+   const bool is_point_size =
+      indirect_offset.file == BAD_FILE && imm_offset == 0;
+
+   nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
+   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
+
+   if (indirect_offset.file == BAD_FILE && vertex_const != NULL &&
+       4 * imm_offset < push_reg_count) {
+      imm_offset = 4 * imm_offset + vertex_const->u[0] * push_reg_count;
+      /* This input was pushed into registers. */
+      if (is_point_size) {
+         /* gl_PointSize comes in .w */
+         bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
+      } else {
+         for (unsigned i = 0; i < num_components; i++) {
+            bld.MOV(offset(dst, bld, i),
+                    fs_reg(ATTR, imm_offset + i, dst.type));
+         }
+      }
+   } else {
+      /* Resort to the pull model.  Ensure the VUE handles are provided. */
+      gs_prog_data->base.include_vue_handles = true;
  
-   const bool pushed = 4 * input_offset < array_stride;
+      unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
+      fs_reg icp_handle;
  
-   if (input_offset == 0) {
-      /* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
-       * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
-       * Only gl_PointSize is available as a GS input, so they must
-       * be asking for that input.
-       */
-      if (pushed) {
-         bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
+      if (vertex_const) {
+         /* The vertex index is constant; just select the proper URB handle. */
+         icp_handle =
+            retype(brw_vec8_grf(first_icp_handle + vertex_const->i[0], 0),
+                   BRW_REGISTER_TYPE_UD);
        } else {
-         fs_reg tmp = bld.vgrf(dst.type, 4);
-         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
-                                  fs_reg(vertex), fs_reg(0));
-         inst->regs_written = 4;
-         bld.MOV(dst, offset(tmp, bld, 3));
+         /* The vertex index is non-constant.  We need to use indirect
+          * addressing to fetch the proper URB handle.
+          *
+          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+          * indicating that channel <n> should read the handle from
+          * DWord <n>.  We convert that to bytes by multiplying by 4.
+          *
+          * Next, we convert the vertex index to bytes by multiplying
+          * by 32 (shifting by 5), and add the two together.  This is
+          * the final indirect byte offset.
+          */
+         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
+         fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
+         bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
+         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+         bld.SHL(channel_offsets, sequence, fs_reg(2u));
+         /* Convert vertex_index to bytes (multiply by 32) */
+         bld.SHL(vertex_offset_bytes,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(5u));
+         bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
+
+         /* Use first_icp_handle as the base offset.  There is one register
+          * of URB handles per vertex, so inform the register allocator that
+          * we might read up to nir->info.gs.vertices_in registers.
+          */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  fs_reg(brw_vec8_grf(first_icp_handle, 0)),
+                  fs_reg(icp_offset_bytes),
+                  fs_reg(nir->info.gs.vertices_in * REG_SIZE));
        }
-   } else {
-      if (pushed) {
-         int index = vertex * array_stride + 4 * input_offset;
-         for (unsigned i = 0; i < num_components; i++) {
-            bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
-         }
+
+      fs_inst *inst;
+      if (indirect_offset.file == BAD_FILE) {
+         /* Constant indexing - use global offset. */
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->mlen = 1;
+         inst->regs_written = num_components;
        } else {
-         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
-                                  fs_reg(vertex), fs_reg(input_offset));
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = { icp_handle, indirect_offset };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
+         inst->offset = imm_offset;
+         inst->base_mrf = -1;
+         inst->mlen = 2;
           inst->regs_written = num_components;
        }
+
+      if (is_point_size) {
+         /* Read the whole VUE header (because of alignment) and read .w. */
+         fs_reg tmp = bld.vgrf(dst.type, 4);
+         inst->dst = tmp;
+         inst->regs_written = 4;
+         bld.MOV(dst, offset(tmp, bld, 3));
+      }
     }
  }
  
@@ -1626,6 +1697,7 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
                                    nir_intrinsic_instr *instr)
  {
     assert(stage == MESA_SHADER_GEOMETRY);
+   fs_reg indirect_offset;
  
     fs_reg dest;
     if (nir_intrinsic_infos[instr->intrinsic].has_dest)
@@ -1644,9 +1716,11 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
        unreachable("load_input intrinsics are invalid for the GS stage");
  
     case nir_intrinsic_load_per_vertex_input_indirect:
-      assert(!"Not allowed");
+      indirect_offset = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_D);
+      /* fallthrough */
     case nir_intrinsic_load_per_vertex_input:
-      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
+      emit_gs_input_load(dest, instr->src[0],
+                         indirect_offset, instr->const_index[0],
                           instr->num_components);
        break;
  
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp

index c4a567f4cc90c3cf9b5cc4b3de2d7fbae7af9f96..d22164874c3e0f3ede5108604843542d45e11e2f 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -137,6 +137,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
        compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
     }
  
+   if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
+      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
+
     return compiler;
  }
author	Kenneth Graunke <kenneth@whitecape.org>
	Sun, 8 Nov 2015 02:58:59 +0000 (18:58 -0800)
committer	Kenneth Graunke <kenneth@whitecape.org>
	Wed, 18 Nov 2015 23:42:36 +0000 (15:42 -0800)
src/mesa/drivers/dri/i965/brw_fs.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs.h		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_fs_nir.cpp		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_shader.cpp		patch \| blob \| history