i965/fs: Add support for translating ir_triop_fma into MAD.

[mesa.git] / src / mesa / drivers / dri / i965 / brw_clip_util.c
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c

index 2148bc8244a70539e4a3a7011f5916b75881d239..24d053eaea6ad727bc9f6c20afdd54901494c3e5 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -33,7 +33,7 @@
  #include "main/glheader.h"
  #include "main/macros.h"
  #include "main/enums.h"
-#include "shader/program.h"
+#include "program/program.h"
  
  #include "intel_batchbuffer.h"
  
@@ -109,13 +109,16 @@ static void brw_clip_project_vertex( struct brw_clip_compile *c,
  {
     struct brw_compile *p = &c->func;
     struct brw_reg tmp = get_tmp(c);
+   GLuint hpos_offset = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
+   GLuint ndc_offset = brw_varying_to_offset(&c->vue_map,
+                                             BRW_VARYING_SLOT_NDC);
  
     /* Fixup position.  Extract from the original vertex and re-project
      * to screen space:
      */
-   brw_MOV(p, tmp, deref_4f(vert_addr, c->offset[VERT_RESULT_HPOS]));
+   brw_MOV(p, tmp, deref_4f(vert_addr, hpos_offset));
     brw_clip_project_position(c, tmp);
-   brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp);
+   brw_MOV(p, deref_4f(vert_addr, ndc_offset), tmp);
          
     release_tmp(c, tmp);
  }
@@ -125,18 +128,19 @@ static void brw_clip_project_vertex( struct brw_clip_compile *c,
  
  /* Interpolate between two vertices and put the result into a0.0.  
   * Increment a0.0 accordingly.
+ *
+ * Beware that dest_ptr can be equal to v0_ptr!
   */
  void brw_clip_interp_vertex( struct brw_clip_compile *c,
                              struct brw_indirect dest_ptr,
                              struct brw_indirect v0_ptr, /* from */
                              struct brw_indirect v1_ptr, /* to */
                              struct brw_reg t0,
-                            GLboolean force_edgeflag)
+                            bool force_edgeflag)
  {
     struct brw_compile *p = &c->func;
-   struct intel_context *intel = &p->brw->intel;
-   struct brw_reg tmp = get_tmp(c);
-   GLuint i;
+   struct brw_reg t_nopersp, v0_ndc_copy;
+   GLuint slot;
  
     /* Just copy the vertex header:
      */
@@ -146,75 +150,181 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
      */
     brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
        
-   /* Iterate over each attribute (could be done in pairs?)
+
+   /* First handle the 3D and NDC interpolation, in case we
+    * need noperspective interpolation. Doing it early has no
+    * performance impact in any case.
      */
-   for (i = 0; i < c->nr_attrs; i++) {
-      GLuint delta = i*16 + 32;
  
-      if (intel->gen == 5)
-          delta = i * 16 + 32 * 3;
+   /* Take a copy of the v0 NDC coordinates, in case dest == v0. */
+   if (c->has_noperspective_shading) {
+      GLuint offset = brw_varying_to_offset(&c->vue_map,
+                                                 BRW_VARYING_SLOT_NDC);
+      v0_ndc_copy = get_tmp(c);
+      brw_MOV(p, v0_ndc_copy, deref_4f(v0_ptr, offset));
+   }
+
+   /* Compute the new 3D position
+    *
+    * dest_hpos = v0_hpos * (1 - t0) + v1_hpos * t0
+    */
+   {
+      GLuint delta = brw_varying_to_offset(&c->vue_map, VARYING_SLOT_POS);
+      struct brw_reg tmp = get_tmp(c);
+      brw_MUL(p, vec4(brw_null_reg()), deref_4f(v1_ptr, delta), t0);
+      brw_MAC(p, tmp, negate(deref_4f(v0_ptr, delta)), t0);
+      brw_ADD(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta), tmp);
+      release_tmp(c, tmp);
+   }
+
+   /* Recreate the projected (NDC) coordinate in the new vertex header */
+   brw_clip_project_vertex(c, dest_ptr);
+
+   /* If we have noperspective attributes,
+    * we need to compute the screen-space t
+    */
+   if (c->has_noperspective_shading) {
+      GLuint delta = brw_varying_to_offset(&c->vue_map,
+                                                BRW_VARYING_SLOT_NDC);
+      struct brw_reg tmp = get_tmp(c);
+      t_nopersp = get_tmp(c);
+
+      /* t_nopersp = vec4(v1.xy, dest.xy) */
+      brw_MOV(p, t_nopersp, deref_4f(v1_ptr, delta));
+      brw_MOV(p, tmp, deref_4f(dest_ptr, delta));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_MOV(p,
+              brw_writemask(t_nopersp, WRITEMASK_ZW),
+              brw_swizzle(tmp, 0, 1, 0, 1));
+
+      /* t_nopersp = vec4(v1.xy, dest.xy) - v0.xyxy */
+      brw_ADD(p, t_nopersp, t_nopersp,
+              negate(brw_swizzle(v0_ndc_copy, 0, 1, 0, 1)));
+
+      /* Add the absolute values of the X and Y deltas so that if
+       * the points aren't in the same place on the screen we get
+       * nonzero values to divide.
+       *
+       * After that, we have |vert1 - vert0| in t_nopersp.x and
+       * |vertnew - vert0| in t_nopersp.y (sums of absolute X/Y deltas)
+       *
+       * t_nopersp = vec2(|v1.x  -v0.x| + |v1.y  -v0.y|,
+       *                  |dest.x-v0.x| + |dest.y-v0.y|)
+       */
+      brw_ADD(p,
+              brw_writemask(t_nopersp, WRITEMASK_XY),
+              brw_abs(brw_swizzle(t_nopersp, 0, 2, 0, 0)),
+              brw_abs(brw_swizzle(t_nopersp, 1, 3, 0, 0)));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+
+      /* If the points are in the same place, just substitute a
+       * value to avoid divide-by-zero
+       */
+      brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ,
+              vec1(t_nopersp),
+              brw_imm_f(0));
+      brw_IF(p, BRW_EXECUTE_1);
+      brw_MOV(p, t_nopersp, brw_imm_vf4(VF_ONE, VF_ZERO, VF_ZERO, VF_ZERO));
+      brw_ENDIF(p);
+
+      /* Now compute t_nopersp = t_nopersp.y/t_nopersp.x and broadcast it. */
+      brw_math_invert(p, get_element(t_nopersp, 0), get_element(t_nopersp, 0));
+      brw_MUL(p, vec1(t_nopersp), vec1(t_nopersp),
+            vec1(suboffset(t_nopersp, 1)));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, 0, 0, 0, 0));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+
+      release_tmp(c, tmp);
+      release_tmp(c, v0_ndc_copy);
+   }
+
+   /* Now we can iterate over each attribute
+    * (could be done in pairs?)
+    */
+   for (slot = 0; slot < c->vue_map.num_slots; slot++) {
+      int varying = c->vue_map.slot_to_varying[slot];
+      GLuint delta = brw_vue_slot_to_offset(slot);
  
-      if (delta == c->offset[VERT_RESULT_EDGE]) {
+      /* HPOS, NDC already handled above */
+      if (varying == VARYING_SLOT_POS || varying == BRW_VARYING_SLOT_NDC)
+         continue;
+
+
+      if (varying == VARYING_SLOT_EDGE) {
          if (force_edgeflag) 
             brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
          else
             brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta));
-      }
-      else {
-        /* Interpolate: 
+      } else if (varying == VARYING_SLOT_PSIZ) {
+         /* PSIZ doesn't need interpolation because it isn't used by the
+          * fragment shader.
+          */
+      } else if (varying < VARYING_SLOT_MAX) {
+        /* This is a true vertex result (and not a special value for the VUE
+         * header), so interpolate:
           *
           *        New = attr0 + t*attr1 - t*attr0
+          *
+          * Unless the attribute is flat shaded -- in which case just copy
+          * from either source (already copied from the provoking vertex)
           */
-        brw_MUL(p, 
-                vec4(brw_null_reg()),
-                deref_4f(v1_ptr, delta),
-                t0);
-
-        brw_MAC(p, 
-                tmp,         
-                negate(deref_4f(v0_ptr, delta)),
-                t0); 
-             
-        brw_ADD(p,
-                deref_4f(dest_ptr, delta), 
-                deref_4f(v0_ptr, delta),
-                tmp);
+         GLuint interp = c->key.interpolation_mode.mode[slot];
+
+         if (interp != INTERP_QUALIFIER_FLAT) {
+            struct brw_reg tmp = get_tmp(c);
+            struct brw_reg t =
+               interp == INTERP_QUALIFIER_NOPERSPECTIVE ? t_nopersp : t0;
+
+            brw_MUL(p,
+                  vec4(brw_null_reg()),
+                  deref_4f(v1_ptr, delta),
+                  t);
+
+            brw_MAC(p,
+                  tmp,
+                  negate(deref_4f(v0_ptr, delta)),
+                  t);
+
+            brw_ADD(p,
+                  deref_4f(dest_ptr, delta),
+                  deref_4f(v0_ptr, delta),
+                  tmp);
+
+            release_tmp(c, tmp);
+         }
+         else {
+            brw_MOV(p,
+                  deref_4f(dest_ptr, delta),
+                  deref_4f(v0_ptr, delta));
+         }
        }
     }
  
-   if (i & 1) {
-      GLuint delta = i*16 + 32;
-
-      if (intel->gen == 5)
-          delta = i * 16 + 32 * 3;
+   if (c->vue_map.num_slots % 2) {
+      GLuint delta = brw_vue_slot_to_offset(c->vue_map.num_slots);
  
        brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
     }
  
-   release_tmp(c, tmp);
-
-   /* Recreate the projected (NDC) coordinate in the new vertex
-    * header:
-    */
-   brw_clip_project_vertex(c, dest_ptr );
+   if (c->has_noperspective_shading)
+      release_tmp(c, t_nopersp);
  }
  
-
-
-
-#define MAX_MRF 16
-
  void brw_clip_emit_vue(struct brw_clip_compile *c, 
                        struct brw_indirect vert,
-                      GLboolean allocate,
-                      GLboolean eot,
+                       enum brw_urb_write_flags flags,
                        GLuint header)
  {
     struct brw_compile *p = &c->func;
+   bool allocate = flags & BRW_URB_WRITE_ALLOCATE;
  
     brw_clip_ff_sync(c);
  
-   assert(!(allocate && eot));
+   /* Any URB entry that is allocated must subsequently be used or discarded,
+    * so it doesn't make sense to mark EOT and ALLOCATE at the same time.
+    */
+   assert(!(allocate && (flags & BRW_URB_WRITE_EOT)));
  
     /* Copy the vertex from vertn into m1..mN+1:
      */
@@ -236,12 +346,9 @@ void brw_clip_emit_vue(struct brw_clip_compile *c,
                  allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                  0,
                  c->reg.R0,
-                allocate,
-                1,             /* used */
+                 flags,
                  c->nr_regs + 1, /* msg length */
                  allocate ? 1 : 0, /* response_length */ 
-                eot,           /* eot */
-                1,             /* writes_complete */
                  0,             /* urb offset */
                  BRW_URB_SWIZZLE_NONE);
  }
@@ -260,12 +367,9 @@ void brw_clip_kill_thread(struct brw_clip_compile *c)
                  retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                  0,
                  c->reg.R0,
-                0,             /* allocate */
-                0,             /* used */
+                 BRW_URB_WRITE_UNUSED | BRW_URB_WRITE_EOT_COMPLETE,
                  1,             /* msg len */
                  0,             /* response len */
-                1,             /* eot */
-                1,             /* writes complete */
                  0,
                  BRW_URB_SWIZZLE_NONE);
  }
@@ -290,33 +394,21 @@ struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
  }
  
  
-/* If flatshading, distribute color from provoking vertex prior to
+/* Distribute flatshaded attributes from provoking vertex prior to
   * clipping.
   */
-void brw_clip_copy_colors( struct brw_clip_compile *c,
+void brw_clip_copy_flatshaded_attributes( struct brw_clip_compile *c,
                            GLuint to, GLuint from )
  {
     struct brw_compile *p = &c->func;
  
-   if (c->offset[VERT_RESULT_COL0])
-      brw_MOV(p, 
-             byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL0]),
-             byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL0]));
-
-   if (c->offset[VERT_RESULT_COL1])
-      brw_MOV(p, 
-             byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL1]),
-             byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL1]));
-
-   if (c->offset[VERT_RESULT_BFC0])
-      brw_MOV(p, 
-             byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC0]),
-             byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC0]));
-
-   if (c->offset[VERT_RESULT_BFC1])
-      brw_MOV(p, 
-             byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC1]),
-             byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC1]));
+   for (int i = 0; i < c->vue_map.num_slots; i++) {
+      if (c->key.interpolation_mode.mode[i] == INTERP_QUALIFIER_FLAT) {
+         brw_MOV(p,
+                 byte_offset(c->reg.vertex[to], brw_vue_slot_to_offset(i)),
+                 byte_offset(c->reg.vertex[from], brw_vue_slot_to_offset(i)));
+      }
+   }
  }
  
  
@@ -325,6 +417,7 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
  {
     struct brw_compile *p = &c->func;
     struct brw_reg incoming = get_element_ud(c->reg.R0, 2);
+   struct brw_context *brw = p->brw;
     
     /* Shift so that lowest outcode bit is rightmost: 
      */
@@ -336,7 +429,11 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
        /* Rearrange userclip outcodes so that they come directly after
         * the fixed plane bits.
         */
-      brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
+      if (brw->gen == 5 || brw->is_g4x)
+         brw_AND(p, tmp, incoming, brw_imm_ud(0xff<<14));
+      else
+         brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
+
        brw_SHR(p, tmp, tmp, brw_imm_ud(8));
        brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);
        
@@ -346,15 +443,13 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
  
  void brw_clip_ff_sync(struct brw_clip_compile *c)
  {
-    struct intel_context *intel = &c->func.brw->intel;
-
-    if (intel->needs_ff_sync) {
-        struct brw_compile *p = &c->func;
-        struct brw_instruction *need_ff_sync;
+    struct brw_compile *p = &c->func;
+    struct brw_context *brw = p->brw;
  
+    if (brw->gen == 5) {
          brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
          brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
-        need_ff_sync = brw_IF(p, BRW_EXECUTE_1);
+        brw_IF(p, BRW_EXECUTE_1);
          {
              brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
              brw_ff_sync(p,
@@ -365,16 +460,16 @@ void brw_clip_ff_sync(struct brw_clip_compile *c)
                         1, /* response length */
                         0 /* eot */);
          }
-        brw_ENDIF(p, need_ff_sync);
+        brw_ENDIF(p);
          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }
  }
  
  void brw_clip_init_ff_sync(struct brw_clip_compile *c)
  {
-    struct intel_context *intel = &c->func.brw->intel;
+    struct brw_context *brw = c->func.brw;
  
-    if (intel->needs_ff_sync) {
+    if (brw->gen == 5) {
         struct brw_compile *p = &c->func;
          
          brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));