freedreno/a4xx: remove fd4_shader_stateobj
[mesa.git] / src / gallium / drivers / llvmpipe / lp_setup_tri.c
index 358da442ea7d8fbfceb5eb1932c454be93bbf8b4..39755d6b581f7a9205edd99ba9092c47897f7d6e 100644 (file)
 #include "util/u_pwr8.h"
 #endif
 
+#if !defined(PIPE_ARCH_SSE)
+
 /*
  * Convert a float coordinate to fixed point: scale it by FIXED_ONE and
  * round the result to an integer with util_iround().
  * Compiled only when PIPE_ARCH_SSE is not defined.
  */
 static inline int
 subpixel_snap(float a)
 {
    return util_iround(FIXED_ONE * a);
 }
 
-static inline float
-fixed_to_float(int a)
-{
-   return a * (1.0f / FIXED_ONE);
-}
-
+#endif
 
 /*
  * Position and area in fixed point coordinates.
  *
  * Field order matters: the PIPE_ARCH_SSE path of calc_fixed_position()
  * fills x[], y[] and the four deltas (starting at dx01) with one 128-bit
  * store each, so dx01, dy01, dx20, dy20 must stay contiguous and in this
  * order.  With area placed last, dx01 sits 32 bytes from the start of the
  * struct, i.e. at a multiple of 16.
  * NOTE(review): those aligned stores presumably also require each
  * struct instance to be 16-byte aligned - confirm at the declaration sites.
  */
 struct fixed_position {
    int32_t x[4];
    int32_t y[4];
-   int64_t area;
    int32_t dx01;
    int32_t dy01;
    int32_t dx20;
    int32_t dy20;
+   int64_t area;
 };
 
 
@@ -94,6 +91,8 @@ lp_setup_alloc_triangle(struct lp_scene *scene,
    unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
    struct lp_rast_triangle *tri;
 
+   STATIC_ASSERT(sizeof(struct lp_rast_plane) % 8 == 0);
+
    *tri_size = (sizeof(struct lp_rast_triangle) +
                 3 * input_array_sz +
                 plane_sz);
@@ -274,7 +273,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
    const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    struct lp_rast_triangle *tri;
    struct lp_rast_plane *plane;
-   struct u_rect bbox;
+   const struct u_rect *scissor;
+   struct u_rect bbox, bboxpos;
+   boolean s_planes[4];
    unsigned tri_bytes;
    int nr_planes = 3;
    unsigned viewport_index = 0;
@@ -302,13 +303,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
       layer = MIN2(layer, scene->fb_max_layer);
    }
 
-   if (setup->scissor_test) {
-      nr_planes = 7;
-   }
-   else {
-      nr_planes = 3;
-   }
-
    /* Bounding rectangle (in pixels) */
    {
       /* Yes this is necessary to accurately calculate bounding boxes
@@ -340,12 +334,26 @@ do_triangle_ccw(struct lp_setup_context *setup,
       return TRUE;
    }
 
+   bboxpos = bbox;
+
    /* Can safely discard negative regions, but need to keep hold of
     * information about when the triangle extends past screen
     * boundaries.  See trimmed_box in lp_setup_bin_triangle().
     */
-   bbox.x0 = MAX2(bbox.x0, 0);
-   bbox.y0 = MAX2(bbox.y0, 0);
+   bboxpos.x0 = MAX2(bboxpos.x0, 0);
+   bboxpos.y0 = MAX2(bboxpos.y0, 0);
+
+   nr_planes = 3;
+   /*
+    * Determine how many scissor planes we need, that is drop scissor
+    * edges if the bounding box of the tri is fully inside that edge.
+    */
+   if (setup->scissor_test) {
+      /* why not just use draw_regions */
+      scissor = &setup->scissors[viewport_index];
+      scissor_planes_needed(s_planes, &bboxpos, scissor);
+      nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
+   }
 
    tri = lp_setup_alloc_triangle(scene,
                                  key->num_inputs,
@@ -354,7 +362,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
    if (!tri)
       return FALSE;
 
-#if 0
+#ifdef DEBUG
    tri->v[0][0] = v0[0][0];
    tri->v[1][0] = v1[0][0];
    tri->v[2][0] = v2[0][0];
@@ -367,13 +375,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
    /* Setup parameter interpolants:
     */
-   setup->setup.variant->jit_function( v0,
-                                      v1,
-                                      v2,
-                                      frontfacing,
-                                      GET_A0(&tri->inputs),
-                                      GET_DADX(&tri->inputs),
-                                      GET_DADY(&tri->inputs) );
+   setup->setup.variant->jit_function(v0, v1, v2,
+                                      frontfacing,
+                                      GET_A0(&tri->inputs),
+                                      GET_DADX(&tri->inputs),
+                                      GET_DADY(&tri->inputs));
 
    tri->inputs.frontfacing = frontfacing;
    tri->inputs.disable = FALSE;
@@ -383,9 +389,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
    if (0)
       lp_dump_setup_coef(&setup->setup.variant->key,
-                        (const float (*)[4])GET_A0(&tri->inputs),
-                        (const float (*)[4])GET_DADX(&tri->inputs),
-                        (const float (*)[4])GET_DADY(&tri->inputs));
+                         (const float (*)[4])GET_A0(&tri->inputs),
+                         (const float (*)[4])GET_DADX(&tri->inputs),
+                         (const float (*)[4])GET_DADY(&tri->inputs));
 
    plane = GET_PLANES(tri);
 
@@ -556,7 +562,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
       /* Calculate trivial reject values:
        */
-      eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
+      eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy),
                          vec_and(dcdx_neg_mask, dcdx));
 
       /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
@@ -672,32 +678,46 @@ do_triangle_ccw(struct lp_setup_context *setup,
     * Note that otherwise, the scissor planes only vary in 'C' value,
     * and even then only on state-changes.  Could alternatively store
     * these planes elsewhere.
+    * (Or only store the c value together with a bit indicating which
+    * scissor edge this is, so rasterization would treat them differently
+    * (easier to evaluate) to ordinary planes.)
     */
-   if (nr_planes == 7) {
-      const struct u_rect *scissor = &setup->scissors[viewport_index];
-
-      plane[3].dcdx = -1;
-      plane[3].dcdy = 0;
-      plane[3].c = 1-scissor->x0;
-      plane[3].eo = 1;
-
-      plane[4].dcdx = 1;
-      plane[4].dcdy = 0;
-      plane[4].c = scissor->x1+1;
-      plane[4].eo = 0;
-
-      plane[5].dcdx = 0;
-      plane[5].dcdy = 1;
-      plane[5].c = 1-scissor->y0;
-      plane[5].eo = 1;
-
-      plane[6].dcdx = 0;
-      plane[6].dcdy = -1;
-      plane[6].c = scissor->y1+1;
-      plane[6].eo = 0;
+   if (nr_planes > 3) {
+      /* why not just use draw_regions */
+      struct lp_rast_plane *plane_s = &plane[3];
+
+      /*
+       * Append one plane for each scissor edge flagged in s_planes[]
+       * (0: x0, 1: x1, 2: y0, 3: y1), directly after the three triangle
+       * edge planes.  nr_planes was increased by the number of flagged
+       * edges above, which the final assert cross-checks.
+       *
+       * Values are scaled by (1 << 8) - presumably the fixed point
+       * scale used by the other planes, confirm against FIXED_ORDER.
+       * The scaling is written as a negation / multiplication instead
+       * of "negative << 8", because left-shifting a negative signed
+       * value is undefined behaviour in C.
+       */
+      if (s_planes[0]) {
+         plane_s->dcdx = -(1 << 8);
+         plane_s->dcdy = 0;
+         plane_s->c = (1-scissor->x0) * (1 << 8);
+         plane_s->eo = 1 << 8;
+         plane_s++;
+      }
+      if (s_planes[1]) {
+         plane_s->dcdx = 1 << 8;
+         plane_s->dcdy = 0;
+         plane_s->c = (scissor->x1+1) * (1 << 8);
+         plane_s->eo = 0;
+         plane_s++;
+      }
+      if (s_planes[2]) {
+         plane_s->dcdx = 0;
+         plane_s->dcdy = 1 << 8;
+         plane_s->c = (1-scissor->y0) * (1 << 8);
+         plane_s->eo = 1 << 8;
+         plane_s++;
+      }
+      if (s_planes[3]) {
+         plane_s->dcdx = 0;
+         plane_s->dcdy = -(1 << 8);
+         plane_s->c = (scissor->y1+1) * (1 << 8);
+         plane_s->eo = 0;
+         plane_s++;
+      }
+      assert(plane_s == &plane[nr_planes]);
    }
 
-   return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);
+   return lp_setup_bin_triangle(setup, tri, &bbox, &bboxpos, nr_planes, viewport_index);
 }
 
 /*
@@ -728,11 +748,12 @@ floor_pot(uint32_t n)
 
 
 boolean
-lp_setup_bin_triangle( struct lp_setup_context *setup,
-                       struct lp_rast_triangle *tri,
-                       const struct u_rect *bbox,
-                       int nr_planes,
-                       unsigned viewport_index )
+lp_setup_bin_triangle(struct lp_setup_context *setup,
+                      struct lp_rast_triangle *tri,
+                      const struct u_rect *bboxorig,
+                      const struct u_rect *bbox,
+                      int nr_planes,
+                      unsigned viewport_index)
 {
    struct lp_scene *scene = setup->scene;
    struct u_rect trimmed_box = *bbox;   
@@ -748,7 +769,16 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) |
                  (bbox->y1 - (bbox->y0 & ~3)));
    int sz = floor_pot(max_sz);
-   boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32;
+
+   /*
+    * NOTE: It is important to use the original bounding box here,
+    * which may contain negative values, because whether the plane
+    * math can overflow in the 32bit rasterization functions depends
+    * on the original extent of the triangle.
+    */
+   int max_szorig = ((bboxorig->x1 - (bboxorig->x0 & ~3)) |
+                     (bboxorig->y1 - (bboxorig->y0 & ~3)));
+   boolean use_32bits = max_szorig <= MAX_FIXED_LENGTH32;
 
    /* Now apply scissor, etc to the bounding box.  Could do this
     * earlier, but it confuses the logic for tri-16 and would force
@@ -966,29 +996,70 @@ static void retry_triangle_ccw( struct lp_setup_context *setup,
 
 /**
  * Calculate fixed position data for a triangle
+ * Unfortunately this has to be done here (the area must be
+ * calculated in fixed point), even though it duplicates much of
+ * what the jit setup prog already does.
  */
 static inline void
-calc_fixed_position( struct lp_setup_context *setup,
-                     struct fixed_position* position,
-                     const float (*v0)[4],
-                     const float (*v1)[4],
-                     const float (*v2)[4])
+calc_fixed_position(struct lp_setup_context *setup,
+                    struct fixed_position* position,
+                    const float (*v0)[4],
+                    const float (*v1)[4],
+                    const float (*v2)[4])
 {
+   /*
+    * The rounding may not be quite the same with PIPE_ARCH_SSE
+    * (util_iround right now only does nearest/even on x87,
+    * otherwise nearest/away-from-zero).
+    * Both should be acceptable, I think.
+    */
+#if defined(PIPE_ARCH_SSE)
+   __m128 v0r, v1r;
+   __m128 vxy0xy2, vxy1xy0;
+   __m128i vxy0xy2i, vxy1xy0i;
+   __m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
+   __m128 pix_offset = _mm_set1_ps(setup->pixel_offset);
+   __m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
+   v0r = _mm_castpd_ps(_mm_load_sd((double *)v0[0]));
+   vxy0xy2 = _mm_loadh_pi(v0r, (__m64 *)v2[0]);
+   v1r = _mm_castpd_ps(_mm_load_sd((double *)v1[0]));
+   vxy1xy0 = _mm_movelh_ps(v1r, vxy0xy2);
+   vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
+   vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
+   vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);
+   vxy1xy0 = _mm_mul_ps(vxy1xy0, fixed_one);
+   vxy0xy2i = _mm_cvtps_epi32(vxy0xy2);
+   vxy1xy0i = _mm_cvtps_epi32(vxy1xy0);
+   dxdy0120 = _mm_sub_epi32(vxy0xy2i, vxy1xy0i);
+   _mm_store_si128((__m128i *)&position->dx01, dxdy0120);
+   /*
+    * For the mul, would need some more shuffles, plus emulation
+    * for the signed mul (without sse41), so don't bother.
+    */
+   x0x2y0y2 = _mm_shuffle_epi32(vxy0xy2i, _MM_SHUFFLE(3,1,2,0));
+   x1x0y1y0 = _mm_shuffle_epi32(vxy1xy0i, _MM_SHUFFLE(3,1,2,0));
+   x0120 = _mm_unpacklo_epi32(x0x2y0y2, x1x0y1y0);
+   y0120 = _mm_unpackhi_epi32(x0x2y0y2, x1x0y1y0);
+   _mm_store_si128((__m128i *)&position->x[0], x0120);
+   _mm_store_si128((__m128i *)&position->y[0], y0120);
+
+#else
    position->x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
    position->x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
    position->x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
-   position->x[3] = 0;
+   position->x[3] = 0; // should be unused
 
    position->y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
    position->y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
    position->y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
-   position->y[3] = 0;
+   position->y[3] = 0; // should be unused
 
    position->dx01 = position->x[0] - position->x[1];
    position->dy01 = position->y[0] - position->y[1];
 
    position->dx20 = position->x[2] - position->x[0];
    position->dy20 = position->y[2] - position->y[0];
+#endif
 
    position->area = IMUL64(position->dx01, position->dy20) -
          IMUL64(position->dx20, position->dy01);