}
}
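+/* Instructions emitted inside a force_uncompressed region must run 8-wide
+ * even in a 16-wide program (message header setup, for example), while a
+ * force_sechalf region restricts instructions to the second 8 channels.
+ * Plain counters suffice because the regions only nest.
+ */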
+void
+fs_visitor::push_force_uncompressed()
+{
+ force_uncompressed_stack++;
+}
+
+void
+fs_visitor::pop_force_uncompressed()
+{
+ force_uncompressed_stack--;
+ assert(force_uncompressed_stack >= 0);
+}
+
+void
+fs_visitor::push_force_sechalf()
+{
+ force_sechalf_stack++;
+}
+
+void
+fs_visitor::pop_force_sechalf()
+{
+ force_sechalf_stack--;
+ assert(force_sechalf_stack >= 0);
+}
+
/**
* Returns how many MRFs an FS opcode will write over.
*
{
fs_inst *inst;
+ if (c->dispatch_width == 16) {
+ fail("Can't support (non-uniform) control flow on 16-wide\n");
+ }
+
/* Don't point the annotation at the if statement, because then it plus
* the then and else blocks get printed.
*/
{
fs_reg counter = reg_undef;
+ if (c->dispatch_width == 16) {
+ fail("Can't support (non-uniform) control flow on 16-wide\n");
+ }
+
if (ir->counter) {
this->base_ir = ir->counter;
ir->counter->accept(this);
fs_inst *list_inst = new(mem_ctx) fs_inst;
*list_inst = inst;
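+ /* Tag the instruction with the current compression state so that
+ * generate_code() can set the EU compression control to match.
+ */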
+ if (force_uncompressed_stack > 0)
+ list_inst->force_uncompressed = true;
+ else if (force_sechalf_stack > 0)
+ list_inst->force_sechalf = true;
+
list_inst->annotation = this->current_annotation;
list_inst->ir = this->base_ir;
this->current_annotation = "FB write header";
GLboolean header_present = GL_TRUE;
int nr = 0;
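+ /* Number of hardware registers each logical register occupies:
+ * 1 for 8-wide dispatch, 2 for 16-wide.
+ */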
+ int reg_width = c->dispatch_width / 8;
if (intel->gen >= 6 &&
!this->kill_emitted &&
}
if (c->aa_dest_stencil_reg) {
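+ /* The AA destination stencil value only occupies a single payload
+ * register, so the move to the message header must be done 8-wide.
+ */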
+ push_force_uncompressed();
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
+ pop_force_uncompressed();
}
/* Reserve space for color. It'll be filled in per MRT below. */
int color_mrf = nr;
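+ /* Four color components, each taking reg_width MRFs. */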
- nr += 4;
+ nr += 4 * reg_width;
if (c->source_depth_to_render_target) {
+ if (intel->gen == 6 && c->dispatch_width == 16) {
+ /* For outputting oDepth on gen6, SIMD8 writes have to be
+ * used. This would require 8-wide moves of each half to
+ * message regs, kind of like pre-gen5 SIMD16 FB writes.
+ * Just bail on doing so for now.
+ */
+ fail("Missing support for simd16 depth writes on gen6\n");
+ }
+
if (c->computes_depth) {
/* Hand over gl_FragDepth. */
assert(this->frag_depth);
fs_reg depth = *(variable_storage(this->frag_depth));
- emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
+ emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
} else {
/* Pass through the payload depth. */
- emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+ emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
}
+ nr += reg_width;
}
if (c->dest_depth_reg) {
- emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+ emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
+ nr += reg_width;
}
fs_reg color = reg_undef;
target);
if (this->frag_color || this->frag_data) {
for (int i = 0; i < 4; i++) {
- emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
+ emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color);
color.reg_offset++;
}
}
brw_pop_insn_state(p);
brw_fb_WRITE(p,
- 8, /* dispatch_width */
+ c->dispatch_width,
inst->base_mrf,
implied_header,
inst->target,
void
fs_visitor::assign_curb_setup()
{
- c->prog_data.first_curbe_grf = c->nr_payload_regs;
c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
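+ /* The 8-wide and 16-wide programs are compiled separately, so record
+ * where each one's push constants begin.
+ */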
+ if (c->dispatch_width == 8) {
+ c->prog_data.first_curbe_grf = c->nr_payload_regs;
+ } else {
+ c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
+ }
/* Map the offsets in the UNIFORM file to fixed HW regs. */
foreach_iter(exec_list_iterator, iter, this->instructions) {
for (unsigned int i = 0; i < 3; i++) {
if (inst->src[i].file == UNIFORM) {
int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
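+ /* Constants start right after the payload for either dispatch
+ * width, so don't depend on which first_curbe_grf field was set.
+ */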
- struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
+ struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
constant_nr / 8,
constant_nr % 8);
void
fs_visitor::assign_urb_setup()
{
- int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
+ int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
/* Offset all the urb_setup[] index by the actual position of the
* setup regs, now that the location of the constants has been chosen.
void
fs_visitor::generate_code()
{
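+ /* A 16-wide program is appended to the instruction store that
+ * already holds the 8-wide code, so annotation tracking starts at
+ * the current end of the store rather than at zero.
+ */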
- int last_native_inst = 0;
+ int last_native_inst = p->nr_insn;
const char *last_annotation_string = NULL;
ir_instruction *last_annotation_ir = NULL;
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
- printf("Native code for fragment shader %d:\n",
- ctx->Shader.CurrentFragmentProgram->Name);
+ printf("Native code for fragment shader %d (%d-wide dispatch):\n",
+ ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
}
foreach_iter(exec_list_iterator, iter, this->instructions) {
brw_set_predicate_inverse(p, inst->predicate_inverse);
brw_set_saturate(p, inst->saturate);
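+ /* Choose EU compression control: 8-wide programs always run
+ * uncompressed, and 16-wide instructions are compressed unless an
+ * earlier pass flagged them to execute only one half.
+ */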
+ if (inst->force_uncompressed || c->dispatch_width == 8) {
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ } else if (inst->force_sechalf) {
+ brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ } else {
+ brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ }
+
switch (inst->opcode) {
case BRW_OPCODE_MOV:
brw_MOV(p, dst, src[0]);
}
}
-GLboolean
-brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+bool
+fs_visitor::run()
{
- struct intel_context *intel = &brw->intel;
- struct gl_context *ctx = &intel->ctx;
- struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
+ uint32_t prog_offset_16 = 0;
- if (!prog)
- return GL_FALSE;
+ brw_wm_payload_setup(brw, c);
- struct brw_shader *shader =
- (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
- if (!shader)
- return GL_FALSE;
+ if (c->dispatch_width == 16) {
+ if (c->prog_data.curb_read_length) {
+ /* Haven't hooked in support for uniforms through the 16-wide
+ * version yet.
+ */
+ return false;
+ }
- /* We always use 8-wide mode, at least for now. For one, flow
- * control only works in 8-wide. Also, when we're fragment shader
- * bound, we're almost always under register pressure as well, so
- * 8-wide would save us from the performance cliff of spilling
- * regs.
- */
- c->dispatch_width = 8;
+ /* Align to a 64-byte boundary, since kernel start pointers are
+ * 64-byte aligned.
+ */
+ while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
+ brw_NOP(p);
+ }
- if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
- printf("GLSL IR for native fragment shader %d:\n", prog->Name);
- _mesa_print_ir(shader->ir, NULL);
- printf("\n");
- }
+ /* Save off the start of this 16-wide program in case we succeed. */
+ prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
- /* Now the main event: Visit the shader IR and generate our FS IR for it.
- */
- fs_visitor v(c, shader);
+ brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ }
if (0) {
- v.emit_dummy_fs();
+ emit_dummy_fs();
} else {
- v.calculate_urb_setup();
+ calculate_urb_setup();
if (intel->gen < 6)
- v.emit_interpolation_setup_gen4();
+ emit_interpolation_setup_gen4();
else
- v.emit_interpolation_setup_gen6();
+ emit_interpolation_setup_gen6();
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
foreach_iter(exec_list_iterator, iter, *shader->ir) {
ir_instruction *ir = (ir_instruction *)iter.get();
- v.base_ir = ir;
- ir->accept(&v);
+ base_ir = ir;
+ ir->accept(this);
}
- v.emit_fb_writes();
+ emit_fb_writes();
- v.split_virtual_grfs();
+ split_virtual_grfs();
- v.setup_paramvalues_refs();
- v.setup_pull_constants();
+ setup_paramvalues_refs();
+ setup_pull_constants();
bool progress;
do {
progress = false;
- progress = v.remove_duplicate_mrf_writes() || progress;
+ progress = remove_duplicate_mrf_writes() || progress;
- progress = v.propagate_constants() || progress;
- progress = v.register_coalesce() || progress;
- progress = v.compute_to_mrf() || progress;
- progress = v.dead_code_eliminate() || progress;
+ progress = propagate_constants() || progress;
+ progress = register_coalesce() || progress;
+ progress = compute_to_mrf() || progress;
+ progress = dead_code_eliminate() || progress;
} while (progress);
- v.schedule_instructions();
+ schedule_instructions();
- v.assign_curb_setup();
- v.assign_urb_setup();
+ assign_curb_setup();
+ assign_urb_setup();
if (0) {
/* Debug of register spilling: Go spill everything. */
- int virtual_grf_count = v.virtual_grf_next;
+ int virtual_grf_count = virtual_grf_next;
for (int i = 1; i < virtual_grf_count; i++) {
- v.spill_reg(i);
+ spill_reg(i);
}
}
if (0)
- v.assign_regs_trivial();
+ assign_regs_trivial();
else {
- while (!v.assign_regs()) {
- if (v.failed)
+ while (!assign_regs()) {
+ if (failed)
break;
}
}
}
+ assert(force_uncompressed_stack == 0);
+ assert(force_sechalf_stack == 0);
- if (!v.failed)
- v.generate_code();
-
- assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */
+ if (!failed)
+ generate_code();
- if (v.failed)
- return GL_FALSE;
+ if (failed)
+ return false;
- c->prog_data.total_grf = v.grf_used;
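+ /* Register use is recorded per dispatch width so that state upload
+ * can size each program's GRF allocation independently.
+ */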
+ if (c->dispatch_width == 8) {
+ c->prog_data.total_grf = grf_used;
+ } else {
+ c->prog_data.total_grf_16 = grf_used;
+ c->prog_data.prog_offset_16 = prog_offset_16;
+ }
+
+ return !failed;
+}
- return GL_TRUE;
+bool
+brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+ struct intel_context *intel = &brw->intel;
+ struct gl_context *ctx = &intel->ctx;
+ struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
+
+ if (!prog)
+ return false;
+
+ struct brw_shader *shader =
+ (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
+ if (!shader)
+ return false;
+
+ if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+ printf("GLSL IR for native fragment shader %d:\n", prog->Name);
+ _mesa_print_ir(shader->ir, NULL);
+ printf("\n");
+ }
+
+ c->dispatch_width = 8;
+
+ /* Now the main event: Visit the shader IR and generate our FS IR for it.
+ */
+ fs_visitor v(c, shader);
+ if (!v.run()) {
+ /* FINISHME: Cleanly fail, test at link time, etc. */
+ assert(!"not reached");
+ return false;
+ }
+
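+ /* Try a 16-wide compile on gen6+. It's allowed to fail (flow
+ * control, uniforms, and gen6 oDepth writes all bail), in which case
+ * only the 8-wide program is used.
+ */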
+ if (intel->gen >= 6) {
+ c->dispatch_width = 16;
+ fs_visitor v2(c, shader);
+ v2.run();
+ }
+
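+ /* The 8-wide program is always available, so it's the default. */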
+ c->prog_data.dispatch_width = 8;
+
+ return true;
}