X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fllvmpipe%2Flp_setup_tri.c;h=53ab1f1f0c0bfabd09b065e2e9ea19b029a883d7;hb=0510ec67e2c5b5ddb4755564314ccfe057555984;hp=7e432503c126028d33c8034c59d3fbdf06dab686;hpb=afe125e0a18ac3886c45c7e6b02b122fb2d327b5;p=mesa.git diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 7e432503c12..53ab1f1f0c0 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -31,67 +31,23 @@ #include "util/u_math.h" #include "util/u_memory.h" +#include "util/u_rect.h" +#include "util/u_sse.h" #include "lp_perf.h" #include "lp_setup_context.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" +#include "lp_context.h" -#define NUM_CHANNELS 4 - -struct tri_info { - - float pixel_offset; - - /* fixed point vertex coordinates */ - int x[3]; - int y[3]; - - /* float x,y deltas - all from the original coordinates - */ - float dy01, dy20; - float dx01, dx20; - float oneoverarea; - - const float (*v0)[4]; - const float (*v1)[4]; - const float (*v2)[4]; - - boolean frontfacing; -}; - - - -static const int step_scissor_minx[16] = { - 0, 1, 0, 1, - 2, 3, 2, 3, - 0, 1, 0, 1, - 2, 3, 2, 3 -}; - -static const int step_scissor_maxx[16] = { - 0, -1, 0, -1, - -2, -3, -2, -3, - 0, -1, 0, -1, - -2, -3, -2, -3 -}; - -static const int step_scissor_miny[16] = { - 0, 0, 1, 1, - 0, 0, 1, 1, - 2, 2, 3, 3, - 2, 2, 3, 3 -}; - -static const int step_scissor_maxy[16] = { - 0, 0, -1, -1, - 0, 0, -1, -1, - -2, -2, -3, -3, - -2, -2, -3, -3 -}; +#include +#define NUM_CHANNELS 4 +#if defined(PIPE_ARCH_SSE) +#include +#endif - static INLINE int subpixel_snap(float a) { @@ -101,383 +57,245 @@ subpixel_snap(float a) static INLINE float fixed_to_float(int a) { - return a * (1.0 / FIXED_ONE); + return a * (1.0f / FIXED_ONE); } +/* Position and area in fixed point coordinates */ +struct fixed_position { + int32_t x[4]; + int32_t y[4]; + int64_t area; + int32_t dx01; + int32_t dy01; + int32_t dx20; + int32_t dy20; +}; + /** - * Compute a0 for a constant-valued coefficient (GL_FLAT shading). + * Alloc space for a new triangle plus the input.a0/dadx/dady arrays + * immediately after it. + * The memory is allocated from the per-scene pool, not per-tile. + * \param tri_size returns number of bytes allocated + * \param num_inputs number of fragment shader inputs + * \return pointer to triangle space */ -static void constant_coef( struct lp_rast_triangle *tri, - unsigned slot, - const float value, - unsigned i ) +struct lp_rast_triangle * +lp_setup_alloc_triangle(struct lp_scene *scene, + unsigned nr_inputs, + unsigned nr_planes, + unsigned *tri_size) { - tri->inputs.a0[slot][i] = value; - tri->inputs.dadx[slot][i] = 0.0f; - tri->inputs.dady[slot][i] = 0.0f; -} + unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); + unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane); + struct lp_rast_triangle *tri; + *tri_size = (sizeof(struct lp_rast_triangle) + + 3 * input_array_sz + + plane_sz); + tri = lp_scene_alloc_aligned( scene, *tri_size, 16 ); + if (tri == NULL) + return NULL; -static void linear_coef( struct lp_rast_triangle *tri, - const struct tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - float a0 = info->v0[vert_attr][i]; - float a1 = info->v1[vert_attr][i]; - float a2 = info->v2[vert_attr][i]; + tri->inputs.stride = input_array_sz; - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; - float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea; - - tri->inputs.dadx[slot][i] = dadx; - tri->inputs.dady[slot][i] = dady; + { + char *a = (char *)tri; + char *b = (char *)&GET_PLANES(tri)[nr_planes]; + assert(b - a == *tri_size); + } - /* calculate a0 as the value which would be sampled for the - * fragment at (0,0), taking into account that we want to sample at - * pixel centers, in other words (0.5, 0.5). - * - * this is neat but unfortunately not a good way to do things for - * triangles with very large values of dadx or dady as it will - * result in the subtraction and re-addition from a0 of a very - * large number, which means we'll end up loosing a lot of the - * fractional bits and precision from a0. the way to fix this is - * to define a0 as the sample at a pixel center somewhere near vmin - * instead - i'll switch to this later. - */ - tri->inputs.a0[slot][i] = (a0 - - (dadx * (info->v0[0][0] - info->pixel_offset) + - dady * (info->v0[0][1] - info->pixel_offset))); + return tri; } - -/** - * Compute a0, dadx and dady for a perspective-corrected interpolant, - * for a triangle. - * We basically multiply the vertex value by 1/w before computing - * the plane coefficients (a0, dadx, dady). - * Later, when we compute the value at a particular fragment position we'll - * divide the interpolated value by the interpolated W at that fragment. - */ -static void perspective_coef( struct lp_rast_triangle *tri, - const struct tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) +void +lp_setup_print_vertex(struct lp_setup_context *setup, + const char *name, + const float (*v)[4]) { - /* premultiply by 1/w (v[0][3] is always 1/w): - */ - float a0 = info->v0[vert_attr][i] * info->v0[0][3]; - float a1 = info->v1[vert_attr][i] * info->v1[0][3]; - float a2 = info->v2[vert_attr][i] * info->v2[0][3]; - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; - float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea; - - tri->inputs.dadx[slot][i] = dadx; - tri->inputs.dady[slot][i] = dady; - tri->inputs.a0[slot][i] = (a0 - - (dadx * (info->v0[0][0] - info->pixel_offset) + - dady * (info->v0[0][1] - info->pixel_offset))); -} + const struct lp_setup_variant_key *key = &setup->setup.variant->key; + int i, j; + debug_printf(" wpos (%s[0]) xyzw %f %f %f %f\n", + name, + v[0][0], v[0][1], v[0][2], v[0][3]); -/** - * Special coefficient setup for gl_FragCoord. - * X and Y are trivial - * Z and W are copied from position_coef which should have already been computed. - * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. - */ -static void -setup_fragcoord_coef(struct lp_rast_triangle *tri, - const struct tri_info *info, - unsigned slot, - unsigned usage_mask) -{ - /*X*/ - if (usage_mask & TGSI_WRITEMASK_X) { - tri->inputs.a0[slot][0] = 0.0; - tri->inputs.dadx[slot][0] = 1.0; - tri->inputs.dady[slot][0] = 0.0; - } + for (i = 0; i < key->num_inputs; i++) { + const float *in = v[key->inputs[i].src_index]; - /*Y*/ - if (usage_mask & TGSI_WRITEMASK_Y) { - tri->inputs.a0[slot][1] = 0.0; - tri->inputs.dadx[slot][1] = 0.0; - tri->inputs.dady[slot][1] = 1.0; - } + debug_printf(" in[%d] (%s[%d]) %s%s%s%s ", + i, + name, key->inputs[i].src_index, + (key->inputs[i].usage_mask & 0x1) ? "x" : " ", + (key->inputs[i].usage_mask & 0x2) ? "y" : " ", + (key->inputs[i].usage_mask & 0x4) ? "z" : " ", + (key->inputs[i].usage_mask & 0x8) ? "w" : " "); - /*Z*/ - if (usage_mask & TGSI_WRITEMASK_Z) { - linear_coef(tri, info, slot, 0, 2); - } + for (j = 0; j < 4; j++) + if (key->inputs[i].usage_mask & (1<coef[] array dadx, dady, a0 values. + * Print triangle vertex attribs (for debug). */ -static void setup_tri_coefficients( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, - const struct tri_info *info) +void +lp_setup_print_triangle(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) { - unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; - unsigned slot; - - /* setup interpolation for all the remaining attributes: - */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; - unsigned i; - - switch (setup->fs.input[slot].interp) { - case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(tri, slot+1, info->v0[vert_attr][i], i); - } - else { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(tri, slot+1, info->v2[vert_attr][i], i); - } - break; - - case LP_INTERP_LINEAR: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - linear_coef(tri, info, slot+1, vert_attr, i); - break; - - case LP_INTERP_PERSPECTIVE: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - perspective_coef(tri, info, slot+1, vert_attr, i); - fragcoord_usage_mask |= TGSI_WRITEMASK_W; - break; - - case LP_INTERP_POSITION: - /* - * The generated pixel interpolators will pick up the coeffs from - * slot 0, so all need to ensure that the usage mask is covers all - * usages. - */ - fragcoord_usage_mask |= usage_mask; - break; + debug_printf("triangle\n"); - case LP_INTERP_FACING: - setup_facing_coef(tri, slot+1, info->frontfacing, usage_mask); - break; - - default: - assert(0); - } + { + const float ex = v0[0][0] - v2[0][0]; + const float ey = v0[0][1] - v2[0][1]; + const float fx = v1[0][0] - v2[0][0]; + const float fy = v1[0][1] - v2[0][1]; + + /* det = cross(e,f).z */ + const float det = ex * fy - ey * fx; + if (det < 0.0f) + debug_printf(" - ccw\n"); + else if (det > 0.0f) + debug_printf(" - cw\n"); + else + debug_printf(" - zero area\n"); } - /* The internal position input is in slot zero: - */ - setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask); + lp_setup_print_vertex(setup, "v0", v0); + lp_setup_print_vertex(setup, "v1", v1); + lp_setup_print_vertex(setup, "v2", v2); } +#define MAX_PLANES 8 +static unsigned +lp_rast_tri_tab[MAX_PLANES+1] = { + 0, /* should be impossible */ + LP_RAST_OP_TRIANGLE_1, + LP_RAST_OP_TRIANGLE_2, + LP_RAST_OP_TRIANGLE_3, + LP_RAST_OP_TRIANGLE_4, + LP_RAST_OP_TRIANGLE_5, + LP_RAST_OP_TRIANGLE_6, + LP_RAST_OP_TRIANGLE_7, + LP_RAST_OP_TRIANGLE_8 +}; +static unsigned +lp_rast_32_tri_tab[MAX_PLANES+1] = { + 0, /* should be impossible */ + LP_RAST_OP_TRIANGLE_32_1, + LP_RAST_OP_TRIANGLE_32_2, + LP_RAST_OP_TRIANGLE_32_3, + LP_RAST_OP_TRIANGLE_32_4, + LP_RAST_OP_TRIANGLE_32_5, + LP_RAST_OP_TRIANGLE_32_6, + LP_RAST_OP_TRIANGLE_32_7, + LP_RAST_OP_TRIANGLE_32_8 +}; /** - * Alloc space for a new triangle plus the input.a0/dadx/dady arrays - * immediately after it. - * The memory is allocated from the per-scene pool, not per-tile. - * \param tri_size returns number of bytes allocated - * \param nr_inputs number of fragment shader inputs - * \return pointer to triangle space - */ -static INLINE struct lp_rast_triangle * -alloc_triangle(struct lp_scene *scene, - unsigned nr_inputs, - unsigned nr_planes, - unsigned *tri_size) -{ - unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); - struct lp_rast_triangle *tri; - unsigned tri_bytes, bytes; - char *inputs; - - tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16); - bytes = tri_bytes + (3 * input_array_sz); - - tri = lp_scene_alloc_aligned( scene, bytes, 16 ); - - if (tri) { - inputs = ((char *)tri) + tri_bytes; - tri->inputs.a0 = (float (*)[4]) inputs; - tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz); - tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz); - - *tri_size = bytes; - } - - return tri; -} - - -/** - * Print triangle vertex attribs (for debug). + * The primitive covers the whole tile- shade whole tile. + * + * \param tx, ty the tile position in tiles, not pixels */ -static void -print_triangle(struct lp_setup_context *setup, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4]) +static boolean +lp_setup_whole_tile(struct lp_setup_context *setup, + const struct lp_rast_shader_inputs *inputs, + int tx, int ty) { - uint i; + struct lp_scene *scene = setup->scene; + + LP_COUNT(nr_fully_covered_64); + + /* if variant is opaque and scissor doesn't effect the tile */ + if (inputs->opaque) { + /* Several things prevent this optimization from working: + * - For layered rendering we can't determine if this covers the same layer + * as previous rendering (or in case of clears those actually always cover + * all layers so optimization is impossible). Need to use fb_max_layer and + * not setup->layer_slot to determine this since even if there's currently + * no slot assigned previous rendering could have used one. + * - If there were any Begin/End query commands in the scene then those + * would get removed which would be very wrong. Furthermore, if queries + * were just active we also can't do the optimization since to get + * accurate query results we unfortunately need to execute the rendering + * commands. + */ + if (!scene->fb.zsbuf && scene->fb_max_layer == 0 && !scene->had_queries) { + /* + * All previous rendering will be overwritten so reset the bin. + */ + lp_scene_bin_reset( scene, tx, ty ); + } - debug_printf("llvmpipe triangle\n"); - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { - debug_printf(" v1[%d]: %f %f %f %f\n", i, - v1[i][0], v1[i][1], v1[i][2], v1[i][3]); - } - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { - debug_printf(" v2[%d]: %f %f %f %f\n", i, - v2[i][0], v2[i][1], v2[i][2], v2[i][3]); - } - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { - debug_printf(" v3[%d]: %f %f %f %f\n", i, - v3[i][0], v3[i][1], v3[i][2], v3[i][3]); + LP_COUNT(nr_shade_opaque_64); + return lp_scene_bin_cmd_with_state( scene, tx, ty, + setup->fs.stored, + LP_RAST_OP_SHADE_TILE_OPAQUE, + lp_rast_arg_inputs(inputs) ); + } else { + LP_COUNT(nr_shade_64); + return lp_scene_bin_cmd_with_state( scene, tx, ty, + setup->fs.stored, + LP_RAST_OP_SHADE_TILE, + lp_rast_arg_inputs(inputs) ); } } -lp_rast_cmd lp_rast_tri_tab[8] = { - NULL, /* should be impossible */ - lp_rast_triangle_1, - lp_rast_triangle_2, - lp_rast_triangle_3, - lp_rast_triangle_4, - lp_rast_triangle_5, - lp_rast_triangle_6, - lp_rast_triangle_7 -}; - /** * Do basic setup for triangle rasterization and determine which * framebuffer tiles are touched. Put the triangle in the scene's * bins for the tiles which we overlap. */ -static void +static boolean do_triangle_ccw(struct lp_setup_context *setup, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4], - boolean frontfacing ) + struct fixed_position* position, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean frontfacing ) { - - struct lp_scene *scene = lp_setup_get_current_scene(setup); - struct lp_fragment_shader_variant *variant = setup->fs.current.variant; + struct lp_scene *scene = setup->scene; + const struct lp_setup_variant_key *key = &setup->setup.variant->key; struct lp_rast_triangle *tri; - struct tri_info info; - int area; - int minx, maxx, miny, maxy; - int ix0, ix1, iy0, iy1; + struct lp_rast_plane *plane; + struct u_rect bbox; unsigned tri_bytes; - int i; int nr_planes = 3; - + unsigned scissor_index = 0; + unsigned layer = 0; + + /* Area should always be positive here */ + assert(position->area > 0); + if (0) - print_triangle(setup, v1, v2, v3); + lp_setup_print_triangle(setup, v0, v1, v2); if (setup->scissor_test) { nr_planes = 7; + if (setup->viewport_index_slot > 0) { + unsigned *udata = (unsigned*)v0[setup->viewport_index_slot]; + scissor_index = lp_clamp_scissor_idx(*udata); + } } else { nr_planes = 3; } - - - tri = alloc_triangle(scene, - setup->fs.nr_inputs, - nr_planes, - &tri_bytes); - if (!tri) - return; - -#ifdef DEBUG - tri->v[0][0] = v1[0][0]; - tri->v[1][0] = v2[0][0]; - tri->v[2][0] = v3[0][0]; - tri->v[0][1] = v1[0][1]; - tri->v[1][1] = v2[0][1]; - tri->v[2][1] = v3[0][1]; -#endif - - /* x/y positions in fixed point */ - info.x[0] = subpixel_snap(v1[0][0] - setup->pixel_offset); - info.x[1] = subpixel_snap(v2[0][0] - setup->pixel_offset); - info.x[2] = subpixel_snap(v3[0][0] - setup->pixel_offset); - info.y[0] = subpixel_snap(v1[0][1] - setup->pixel_offset); - info.y[1] = subpixel_snap(v2[0][1] - setup->pixel_offset); - info.y[2] = subpixel_snap(v3[0][1] - setup->pixel_offset); - - tri->plane[0].dcdy = info.x[0] - info.x[1]; - tri->plane[1].dcdy = info.x[1] - info.x[2]; - tri->plane[2].dcdy = info.x[2] - info.x[0]; - - tri->plane[0].dcdx = info.y[0] - info.y[1]; - tri->plane[1].dcdx = info.y[1] - info.y[2]; - tri->plane[2].dcdx = info.y[2] - info.y[0]; - - area = (tri->plane[0].dcdy * tri->plane[2].dcdx - - tri->plane[2].dcdy * tri->plane[0].dcdx); - - LP_COUNT(nr_tris); - - /* Cull non-ccw and zero-sized triangles. - * - * XXX: subject to overflow?? - */ - if (area <= 0) { - lp_scene_putback_data( scene, tri_bytes ); - LP_COUNT(nr_culled_tris); - return; + if (setup->layer_slot > 0) { + layer = *(unsigned*)v1[setup->layer_slot]; + layer = MIN2(layer, scene->fb_max_layer); } /* Bounding rectangle (in pixels) */ @@ -489,136 +307,224 @@ do_triangle_ccw(struct lp_setup_context *setup, */ int adj = (setup->pixel_offset != 0) ? 1 : 0; - minx = (MIN3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; - maxx = (MAX3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; - miny = (MIN3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; - maxy = (MAX3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; - } + /* Inclusive x0, exclusive x1 */ + bbox.x0 = MIN3(position->x[0], position->x[1], position->x[2]) >> FIXED_ORDER; + bbox.x1 = (MAX3(position->x[0], position->x[1], position->x[2]) - 1) >> FIXED_ORDER; - if (setup->scissor_test) { - minx = MAX2(minx, setup->scissor.current.minx); - maxx = MIN2(maxx, setup->scissor.current.maxx); - miny = MAX2(miny, setup->scissor.current.miny); - maxy = MIN2(maxy, setup->scissor.current.maxy); - } - else { - minx = MAX2(minx, 0); - miny = MAX2(miny, 0); - maxx = MIN2(maxx, scene->fb.width); - maxy = MIN2(maxy, scene->fb.height); + /* Inclusive / exclusive depending upon adj (bottom-left or top-right) */ + bbox.y0 = (MIN3(position->y[0], position->y[1], position->y[2]) + adj) >> FIXED_ORDER; + bbox.y1 = (MAX3(position->y[0], position->y[1], position->y[2]) - 1 + adj) >> FIXED_ORDER; } + if (bbox.x1 < bbox.x0 || + bbox.y1 < bbox.y0) { + if (0) debug_printf("empty bounding box\n"); + LP_COUNT(nr_culled_tris); + return TRUE; + } - if (miny >= maxy || minx >= maxx) { - lp_scene_putback_data( scene, tri_bytes ); + if (!u_rect_test_intersection(&setup->draw_regions[scissor_index], &bbox)) { + if (0) debug_printf("offscreen\n"); LP_COUNT(nr_culled_tris); - return; + return TRUE; } - /* + /* Can safely discard negative regions, but need to keep hold of + * information about when the triangle extends past screen + * boundaries. See trimmed_box in lp_setup_bin_triangle(). */ - info.pixel_offset = setup->pixel_offset; - info.v0 = v1; - info.v1 = v2; - info.v2 = v3; - info.dx01 = info.v0[0][0] - info.v1[0][0]; - info.dx20 = info.v2[0][0] - info.v0[0][0]; - info.dy01 = info.v0[0][1] - info.v1[0][1]; - info.dy20 = info.v2[0][1] - info.v0[0][1]; - info.oneoverarea = 1.0 / (info.dx01 * info.dy20 - info.dx20 * info.dy01); - info.frontfacing = frontfacing; + bbox.x0 = MAX2(bbox.x0, 0); + bbox.y0 = MAX2(bbox.y0, 0); + + tri = lp_setup_alloc_triangle(scene, + key->num_inputs, + nr_planes, + &tri_bytes); + if (!tri) + return FALSE; + +#if 0 + tri->v[0][0] = v0[0][0]; + tri->v[1][0] = v1[0][0]; + tri->v[2][0] = v2[0][0]; + tri->v[0][1] = v0[0][1]; + tri->v[1][1] = v1[0][1]; + tri->v[2][1] = v2[0][1]; +#endif + + LP_COUNT(nr_tris); /* Setup parameter interpolants: */ - setup_tri_coefficients( setup, tri, &info ); + setup->setup.variant->jit_function( v0, + v1, + v2, + frontfacing, + GET_A0(&tri->inputs), + GET_DADX(&tri->inputs), + GET_DADY(&tri->inputs) ); + + tri->inputs.frontfacing = frontfacing; + tri->inputs.disable = FALSE; + tri->inputs.opaque = setup->fs.current.variant->opaque; + tri->inputs.layer = layer; - tri->inputs.facing = frontfacing ? 1.0F : -1.0F; - tri->inputs.state = setup->fs.stored; + if (0) + lp_dump_setup_coef(&setup->setup.variant->key, + (const float (*)[4])GET_A0(&tri->inputs), + (const float (*)[4])GET_DADX(&tri->inputs), + (const float (*)[4])GET_DADY(&tri->inputs)); + + plane = GET_PLANES(tri); + +#if defined(PIPE_ARCH_SSE) + if (setup->fb.width <= MAX_FIXED_LENGTH32 && + setup->fb.height <= MAX_FIXED_LENGTH32 && + (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 && + (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) { + __m128i vertx, verty; + __m128i shufx, shufy; + __m128i dcdx, dcdy, c; + __m128i unused; + __m128i dcdx_neg_mask; + __m128i dcdy_neg_mask; + __m128i dcdx_zero_mask; + __m128i top_left_flag; + __m128i c_inc_mask, c_inc; + __m128i eo, p0, p1, p2; + __m128i zero = _mm_setzero_si128(); + PIPE_ALIGN_VAR(16) int32_t temp_vec[4]; + + vertx = _mm_loadu_si128((__m128i *)position->x); /* vertex x coords */ + verty = _mm_loadu_si128((__m128i *)position->y); /* vertex y coords */ + + shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1)); + shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1)); + + dcdx = _mm_sub_epi32(verty, shufy); + dcdy = _mm_sub_epi32(vertx, shufx); + + dcdx_neg_mask = _mm_srai_epi32(dcdx, 31); + dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero); + dcdy_neg_mask = _mm_srai_epi32(dcdy, 31); + + top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0); + + c_inc_mask = _mm_or_si128(dcdx_neg_mask, + _mm_and_si128(dcdx_zero_mask, + _mm_xor_si128(dcdy_neg_mask, + top_left_flag))); + + c_inc = _mm_srli_epi32(c_inc_mask, 31); + + c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx), + mm_mullo_epi32(dcdy, verty)); + + c = _mm_add_epi32(c, c_inc); + + /* Scale up to match c: + */ + dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER); + dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER); + /* Calculate trivial reject values: + */ + eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy), + _mm_and_si128(dcdx_neg_mask, dcdx)); - - for (i = 0; i < 3; i++) { - struct lp_rast_plane *plane = &tri->plane[i]; + /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ - /* half-edge constants, will be interated over the whole render - * target. + /* Pointless transpose which gets undone immediately in + * rasterization: */ - plane->c = plane->dcdx * info.x[i] - plane->dcdy * info.y[i]; - - /* correct for top-left vs. bottom-left fill convention. - * - * note that we're overloading gl_rasterization_rules to mean - * both (0.5,0.5) pixel centers *and* bottom-left filling - * convention. - * - * GL actually has a top-left filling convention, but GL's - * notion of "top" differs from gallium's... - * - * Also, sometimes (in FBO cases) GL will render upside down - * to its usual method, in which case it will probably want - * to use the opposite, top-left convention. - */ - if (plane->dcdx < 0) { - /* both fill conventions want this - adjust for left edges */ - plane->c++; - } - else if (plane->dcdx == 0) { - if (setup->pixel_offset == 0) { - /* correct for top-left fill convention: - */ - if (plane->dcdy > 0) plane->c++; + transpose4_epi32(&c, &dcdx, &dcdy, &eo, + &p0, &p1, &p2, &unused); + +#define STORE_PLANE(plane, vec) do { \ + _mm_store_si128((__m128i *)&temp_vec, vec); \ + plane.c = (int64_t)temp_vec[0]; \ + plane.dcdx = temp_vec[1]; \ + plane.dcdy = temp_vec[2]; \ + plane.eo = temp_vec[3]; \ + } while(0) + + STORE_PLANE(plane[0], p0); + STORE_PLANE(plane[1], p1); + STORE_PLANE(plane[2], p2); +#undef STORE_PLANE + } else +#endif + { + int i; + plane[0].dcdy = position->dx01; + plane[1].dcdy = position->x[1] - position->x[2]; + plane[2].dcdy = position->dx20; + plane[0].dcdx = position->dy01; + plane[1].dcdx = position->y[1] - position->y[2]; + plane[2].dcdx = position->dy20; + + for (i = 0; i < 3; i++) { + /* half-edge constants, will be interated over the whole render + * target. + */ + plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) - + IMUL64(plane[i].dcdy, position->y[i]); + + /* correct for top-left vs. bottom-left fill convention. + */ + if (plane[i].dcdx < 0) { + /* both fill conventions want this - adjust for left edges */ + plane[i].c++; } - else { - /* correct for bottom-left fill convention: - */ - if (plane->dcdy < 0) plane->c++; + else if (plane[i].dcdx == 0) { + if (setup->bottom_edge_rule == 0){ + /* correct for top-left fill convention: + */ + if (plane[i].dcdy > 0) plane[i].c++; + } + else { + /* correct for bottom-left fill convention: + */ + if (plane[i].dcdy < 0) plane[i].c++; + } } - } - plane->dcdx *= FIXED_ONE; - plane->dcdy *= FIXED_ONE; - - /* find trivial reject offsets for each edge for a single-pixel - * sized block. These will be scaled up at each recursive level to - * match the active blocksize. Scaling in this way works best if - * the blocks are square. - */ - plane->eo = 0; - if (plane->dcdx < 0) plane->eo -= plane->dcdx; - if (plane->dcdy > 0) plane->eo += plane->dcdy; - - /* Calculate trivial accept offsets from the above. - */ - plane->ei = plane->dcdy - plane->dcdx - plane->eo; - - plane->step = tri->step[i]; + /* Scale up to match c: + */ + assert((plane[i].dcdx << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdx); + assert((plane[i].dcdy << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdy); + plane[i].dcdx <<= FIXED_ORDER; + plane[i].dcdy <<= FIXED_ORDER; + + /* find trivial reject offsets for each edge for a single-pixel + * sized block. These will be scaled up at each recursive level to + * match the active blocksize. Scaling in this way works best if + * the blocks are square. + */ + plane[i].eo = 0; + if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx; + if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy; + } + } - /* Fill in the inputs.step[][] arrays. - * We've manually unrolled some loops here. - */ -#define SETUP_STEP(j, x, y) \ - tri->step[i][j] = y * plane->dcdy - x * plane->dcdx + if (0) { + debug_printf("p0: %"PRIx64"/%08x/%08x/%"PRIx64"\n", + plane[0].c, + plane[0].dcdx, + plane[0].dcdy, + plane[0].eo); + + debug_printf("p1: %"PRIx64"/%08x/%08x/%"PRIx64"\n", + plane[1].c, + plane[1].dcdx, + plane[1].dcdy, + plane[1].eo); - SETUP_STEP(0, 0, 0); - SETUP_STEP(1, 1, 0); - SETUP_STEP(2, 0, 1); - SETUP_STEP(3, 1, 1); - - SETUP_STEP(4, 2, 0); - SETUP_STEP(5, 3, 0); - SETUP_STEP(6, 2, 1); - SETUP_STEP(7, 3, 1); - - SETUP_STEP(8, 0, 2); - SETUP_STEP(9, 1, 2); - SETUP_STEP(10, 0, 3); - SETUP_STEP(11, 1, 3); - - SETUP_STEP(12, 2, 2); - SETUP_STEP(13, 3, 2); - SETUP_STEP(14, 2, 3); - SETUP_STEP(15, 3, 3); -#undef STEP + debug_printf("p2: %"PRIx64"/%08x/%08x/%"PRIx64"\n", + plane[2].c, + plane[2].dcdx, + plane[2].dcdy, + plane[2].eo); } @@ -641,84 +547,193 @@ do_triangle_ccw(struct lp_setup_context *setup, * these planes elsewhere. */ if (nr_planes == 7) { - tri->plane[3].step = step_scissor_minx; - tri->plane[3].dcdx = -1; - tri->plane[3].dcdy = 0; - tri->plane[3].c = 1-minx; - tri->plane[3].ei = 0; - tri->plane[3].eo = 1; - - tri->plane[4].step = step_scissor_maxx; - tri->plane[4].dcdx = 1; - tri->plane[4].dcdy = 0; - tri->plane[4].c = maxx; - tri->plane[4].ei = -1; - tri->plane[4].eo = 0; - - tri->plane[5].step = step_scissor_miny; - tri->plane[5].dcdx = 0; - tri->plane[5].dcdy = 1; - tri->plane[5].c = 1-miny; - tri->plane[5].ei = 0; - tri->plane[5].eo = 1; - - tri->plane[6].step = step_scissor_maxy; - tri->plane[6].dcdx = 0; - tri->plane[6].dcdy = -1; - tri->plane[6].c = maxy; - tri->plane[6].ei = -1; - tri->plane[6].eo = 0; + const struct u_rect *scissor = &setup->scissors[scissor_index]; + + plane[3].dcdx = -1; + plane[3].dcdy = 0; + plane[3].c = 1-scissor->x0; + plane[3].eo = 1; + + plane[4].dcdx = 1; + plane[4].dcdy = 0; + plane[4].c = scissor->x1+1; + plane[4].eo = 0; + + plane[5].dcdx = 0; + plane[5].dcdy = 1; + plane[5].c = 1-scissor->y0; + plane[5].eo = 1; + + plane[6].dcdx = 0; + plane[6].dcdy = -1; + plane[6].c = scissor->y1+1; + plane[6].eo = 0; } + return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, scissor_index); +} - /* - * All fields of 'tri' are now set. The remaining code here is - * concerned with binning. - */ +/* + * Round to nearest less or equal power of two of the input. + * + * Undefined if no bit set exists, so code should check against 0 first. + */ +static INLINE uint32_t +floor_pot(uint32_t n) +{ +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) + if (n == 0) + return 0; + + __asm__("bsr %1,%0" + : "=r" (n) + : "rm" (n)); + return 1 << n; +#else + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return n - (n >> 1); +#endif +} - /* Convert to tile coordinates, and inclusive ranges: + +boolean +lp_setup_bin_triangle( struct lp_setup_context *setup, + struct lp_rast_triangle *tri, + const struct u_rect *bbox, + int nr_planes, + unsigned scissor_index ) +{ + struct lp_scene *scene = setup->scene; + struct u_rect trimmed_box = *bbox; + int i; + /* What is the largest power-of-two boundary this triangle crosses: */ - ix0 = minx / TILE_SIZE; - iy0 = miny / TILE_SIZE; - ix1 = (maxx-1) / TILE_SIZE; - iy1 = (maxy-1) / TILE_SIZE; + int dx = floor_pot((bbox->x0 ^ bbox->x1) | + (bbox->y0 ^ bbox->y1)); - /* - * Clamp to framebuffer size + /* The largest dimension of the rasterized area of the triangle + * (aligned to a 4x4 grid), rounded down to the nearest power of two: + */ + int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) | + (bbox->y1 - (bbox->y0 & ~3))); + int sz = floor_pot(max_sz); + boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32; + + /* Now apply scissor, etc to the bounding box. Could do this + * earlier, but it confuses the logic for tri-16 and would force + * the rasterizer to also respect scissor, etc, just for the rare + * cases where a small triangle extends beyond the scissor. */ - assert(ix0 == MAX2(ix0, 0)); - assert(iy0 == MAX2(iy0, 0)); - assert(ix1 == MIN2(ix1, scene->tiles_x - 1)); - assert(iy1 == MIN2(iy1, scene->tiles_y - 1)); + u_rect_find_intersection(&setup->draw_regions[scissor_index], + &trimmed_box); /* Determine which tile(s) intersect the triangle's bounding box */ - if (iy0 == iy1 && ix0 == ix1) + if (dx < TILE_SIZE) { + int ix0 = bbox->x0 / TILE_SIZE; + int iy0 = bbox->y0 / TILE_SIZE; + unsigned px = bbox->x0 & 63 & ~3; + unsigned py = bbox->y0 & 63 & ~3; + + assert(iy0 == bbox->y1 / TILE_SIZE && + ix0 == bbox->x1 / TILE_SIZE); + + if (nr_planes == 3) { + if (sz < 4) + { + /* Triangle is contained in a single 4x4 stamp: + */ + assert(px + 4 <= TILE_SIZE); + assert(py + 4 <= TILE_SIZE); + return lp_scene_bin_cmd_with_state( scene, ix0, iy0, + setup->fs.stored, + use_32bits ? + LP_RAST_OP_TRIANGLE_32_3_4 : + LP_RAST_OP_TRIANGLE_3_4, + lp_rast_arg_triangle_contained(tri, px, py) ); + } + + if (sz < 16) + { + /* Triangle is contained in a single 16x16 block: + */ + + /* + * The 16x16 block is only 4x4 aligned, and can exceed the tile + * dimensions if the triangle is 16 pixels in one dimension but 4 + * in the other. So budge the 16x16 back inside the tile. + */ + px = MIN2(px, TILE_SIZE - 16); + py = MIN2(py, TILE_SIZE - 16); + + assert(px + 16 <= TILE_SIZE); + assert(py + 16 <= TILE_SIZE); + + return lp_scene_bin_cmd_with_state( scene, ix0, iy0, + setup->fs.stored, + use_32bits ? + LP_RAST_OP_TRIANGLE_32_3_16 : + LP_RAST_OP_TRIANGLE_3_16, + lp_rast_arg_triangle_contained(tri, px, py) ); + } + } + else if (nr_planes == 4 && sz < 16) + { + px = MIN2(px, TILE_SIZE - 16); + py = MIN2(py, TILE_SIZE - 16); + + assert(px + 16 <= TILE_SIZE); + assert(py + 16 <= TILE_SIZE); + + return lp_scene_bin_cmd_with_state(scene, ix0, iy0, + setup->fs.stored, + use_32bits ? + LP_RAST_OP_TRIANGLE_32_4_16 : + LP_RAST_OP_TRIANGLE_4_16, + lp_rast_arg_triangle_contained(tri, px, py)); + } + + /* Triangle is contained in a single tile: */ - lp_scene_bin_command( scene, ix0, iy0, - lp_rast_tri_tab[nr_planes], - lp_rast_arg_triangle(tri, (1<fs.stored, + use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes], + lp_rast_arg_triangle(tri, (1<plane[i].c + - tri->plane[i].dcdy * iy0 * TILE_SIZE - - tri->plane[i].dcdx * ix0 * TILE_SIZE); - - ei[i] = tri->plane[i].ei << TILE_ORDER; - eo[i] = tri->plane[i].eo << TILE_ORDER; - xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER); - ystep[i] = tri->plane[i].dcdy << TILE_ORDER; + c[i] = (plane[i].c + + IMUL64(plane[i].dcdy, iy0) * TILE_SIZE - + IMUL64(plane[i].dcdx, ix0) * TILE_SIZE); + + ei[i] = (plane[i].dcdy - + plane[i].dcdx - + plane[i].eo) << TILE_ORDER; + + eo[i] = plane[i].eo << TILE_ORDER; + xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER); + ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER; } @@ -730,22 +745,22 @@ do_triangle_ccw(struct lp_setup_context *setup, */ for (y = iy0; y <= iy1; y++) { - boolean in = FALSE; /* are we inside the triangle? */ - int cx[7]; + boolean in = FALSE; /* are we inside the triangle? */ + int64_t cx[MAX_PLANES]; for (i = 0; i < nr_planes; i++) cx[i] = c[i]; - for (x = ix0; x <= ix1; x++) - { + for (x = ix0; x <= ix1; x++) + { int out = 0; int partial = 0; for (i = 0; i < nr_planes; i++) { - int planeout = cx[i] + eo[i]; - int planepartial = cx[i] + ei[i] - 1; - out |= (planeout >> 31); - partial |= (planepartial >> 31) & (1<> 63); + partial |= (planepartial >> 63) & (1<fs.stored, + use_32bits ? + lp_rast_32_tri_tab[count] : + lp_rast_tri_tab[count], + lp_rast_arg_triangle(tri, partial) )) + goto fail; LP_COUNT(nr_partially_covered_64); } @@ -770,54 +790,342 @@ do_triangle_ccw(struct lp_setup_context *setup, /* triangle covers the whole tile- shade whole tile */ LP_COUNT(nr_fully_covered_64); in = TRUE; - if (variant->opaque && - !setup->fb.zsbuf) { - lp_scene_bin_reset( scene, x, y ); - } - lp_scene_bin_command( scene, x, y, - lp_rast_shade_tile, - lp_rast_arg_inputs(&tri->inputs) ); + if (!lp_setup_whole_tile(setup, &tri->inputs, x, y)) + goto fail; } - /* Iterate cx values across the region: - */ + /* Iterate cx values across the region: */ for (i = 0; i < nr_planes; i++) cx[i] += xstep[i]; - } - - /* Iterate c values down the region: - */ + } + + /* Iterate c values down the region: */ for (i = 0; i < nr_planes; i++) c[i] += ystep[i]; } } + + return TRUE; + +fail: + /* Need to disable any partially binned triangle. This is easier + * than trying to locate all the triangle, shade-tile, etc, + * commands which may have been binned. + */ + tri->inputs.disable = TRUE; + return FALSE; } /** - * Draw triangle if it's CW, cull otherwise. + * Try to draw the triangle, restart the scene on failure. */ -static void triangle_cw( struct lp_setup_context *setup, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4] ) +static void retry_triangle_ccw( struct lp_setup_context *setup, + struct fixed_position* position, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front) +{ + if (!do_triangle_ccw( setup, position, v0, v1, v2, front )) + { + if (!lp_setup_flush_and_restart(setup)) + return; + + if (!do_triangle_ccw( setup, position, v0, v1, v2, front )) + return; + } +} + +/** + * Calculate fixed position data for a triangle + */ +static INLINE void +calc_fixed_position( struct lp_setup_context *setup, + struct fixed_position* position, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) { - do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ); + position->x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset); + position->x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset); + position->x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset); + position->x[3] = 0; + + position->y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset); + position->y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset); + position->y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset); + position->y[3] = 0; + + position->dx01 = position->x[0] - position->x[1]; + position->dy01 = position->y[0] - position->y[1]; + + position->dx20 = position->x[2] - position->x[0]; + position->dy20 = position->y[2] - position->y[0]; + + position->area = IMUL64(position->dx01, position->dy20) - + IMUL64(position->dx20, position->dy01); } /** - * Draw triangle if it's CCW, cull otherwise. + * Rotate a triangle, flipping its clockwise direction, + * Swaps values for xy[0] and xy[1] */ -static void triangle_ccw( struct lp_setup_context *setup, +static INLINE void +rotate_fixed_position_01( struct fixed_position* position ) +{ + int x, y; + + x = position->x[1]; + y = position->y[1]; + position->x[1] = position->x[0]; + position->y[1] = position->y[0]; + position->x[0] = x; + position->y[0] = y; + + position->dx01 = -position->dx01; + position->dy01 = -position->dy01; + position->dx20 = position->x[2] - position->x[0]; + position->dy20 = position->y[2] - position->y[0]; + + position->area = -position->area; +} + + +/** + * Rotate a triangle, flipping its clockwise direction, + * Swaps values for xy[1] and xy[2] + */ +static INLINE void +rotate_fixed_position_12( struct fixed_position* position ) +{ + int x, y; + + x = position->x[2]; + y = position->y[2]; + position->x[2] = position->x[1]; + position->y[2] = position->y[1]; + position->x[1] = x; + position->y[1] = y; + + x = position->dx01; + y = position->dy01; + position->dx01 = -position->dx20; + position->dy01 = -position->dy20; + position->dx20 = -x; + position->dy20 = -y; + + position->area = -position->area; +} + + +typedef void (*triangle_func_t)(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]); + + +/** + * Subdivide this triangle by bisecting edge (v0, v1). + * \param pv the provoking vertex (must = v0 or v1 or v2) + * TODO: should probably think about non-overflowing arithmetic elsewhere. + * This will definitely screw with pipeline counters for instance. + */ +static void +subdiv_tri(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + const float (*pv)[4], + triangle_func_t tri) +{ + unsigned n = setup->fs.current.variant->shader->info.base.num_inputs + 1; + const struct lp_shader_input *inputs = + setup->fs.current.variant->shader->inputs; + PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) float vmid[PIPE_MAX_ATTRIBS][4]; + const float (*vm)[4] = (const float (*)[4]) vmid; + unsigned i; + float w0, w1, wm; + boolean flatshade = setup->fs.current.variant->key.flatshade; + + /* find position midpoint (attrib[0] = position) */ + vmid[0][0] = 0.5f * (v1[0][0] + v0[0][0]); + vmid[0][1] = 0.5f * (v1[0][1] + v0[0][1]); + vmid[0][2] = 0.5f * (v1[0][2] + v0[0][2]); + vmid[0][3] = 0.5f * (v1[0][3] + v0[0][3]); + + w0 = v0[0][3]; + w1 = v1[0][3]; + wm = vmid[0][3]; + + /* interpolate other attributes */ + for (i = 1; i < n; i++) { + if ((inputs[i - 1].interp == LP_INTERP_COLOR && flatshade) || + inputs[i - 1].interp == LP_INTERP_CONSTANT) { + /* copy the provoking vertex's attribute */ + vmid[i][0] = pv[i][0]; + vmid[i][1] = pv[i][1]; + vmid[i][2] = pv[i][2]; + vmid[i][3] = pv[i][3]; + } + else { + /* interpolate with perspective correction (for linear too) */ + vmid[i][0] = 0.5f * (v1[i][0] * w1 + v0[i][0] * w0) / wm; + vmid[i][1] = 0.5f * (v1[i][1] * w1 + v0[i][1] * w0) / wm; + vmid[i][2] = 0.5f * (v1[i][2] * w1 + v0[i][2] * w0) / wm; + vmid[i][3] = 0.5f * (v1[i][3] * w1 + v0[i][3] * w0) / wm; + } + } + + /* handling flat shading and first vs. last provoking vertex is a + * little tricky... + */ + if (pv == v0) { + if (setup->flatshade_first) { + /* first vertex must be v0 or vm */ + tri(setup, v0, vm, v2); + tri(setup, vm, v1, v2); + } + else { + /* last vertex must be v0 or vm */ + tri(setup, vm, v2, v0); + tri(setup, v1, v2, vm); + } + } + else if (pv == v1) { + if (setup->flatshade_first) { + tri(setup, vm, v2, v0); + tri(setup, v1, v2, vm); + } + else { + tri(setup, v2, v0, vm); + tri(setup, v2, vm, v1); + } + } + else { + if (setup->flatshade_first) { + tri(setup, v2, v0, vm); + tri(setup, v2, vm, v1); + } + else { + tri(setup, v0, vm, v2); + tri(setup, vm, v1, v2); + } + } +} + + +/** + * Check the lengths of the edges of the triangle. If any edge is too + * long, subdivide the longest edge and draw two sub-triangles. + * Note: this may be called recursively. + * \return TRUE if triangle was subdivided, FALSE otherwise + */ +static boolean +check_subdivide_triangle(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + triangle_func_t tri) +{ + const float maxLen = (float) MAX_FIXED_LENGTH; /* longest permissible edge, in pixels */ + float dx10, dy10, len10; + float dx21, dy21, len21; + float dx02, dy02, len02; + const float (*pv)[4] = setup->flatshade_first ? v0 : v2; + + /* compute lengths of triangle edges, squared */ + dx10 = v1[0][0] - v0[0][0]; + dy10 = v1[0][1] - v0[0][1]; + len10 = dx10 * dx10 + dy10 * dy10; + + dx21 = v2[0][0] - v1[0][0]; + dy21 = v2[0][1] - v1[0][1]; + len21 = dx21 * dx21 + dy21 * dy21; + + dx02 = v0[0][0] - v2[0][0]; + dy02 = v0[0][1] - v2[0][1]; + len02 = dx02 * dx02 + dy02 * dy02; + + /* Look for longest the edge that's longer than maxLen. If we find + * such an edge, split the triangle using the midpoint of that edge. + * Note: it's important to split the longest edge, not just any edge + * that's longer than maxLen. Otherwise, we can get into a degenerate + * situation and recurse indefinitely. + */ + if (len10 > maxLen * maxLen && + len10 >= len21 && + len10 >= len02) { + /* subdivide v0, v1 edge */ + subdiv_tri(setup, v0, v1, v2, pv, tri); + return TRUE; + } + + if (len21 > maxLen * maxLen && + len21 >= len10 && + len21 >= len02) { + /* subdivide v1, v2 edge */ + subdiv_tri(setup, v1, v2, v0, pv, tri); + return TRUE; + } + + if (len02 > maxLen * maxLen && + len02 >= len21 && + len02 >= len10) { + /* subdivide v2, v0 edge */ + subdiv_tri(setup, v2, v0, v1, pv, tri); + return TRUE; + } + + return FALSE; +} + + +/** + * Draw triangle if it's CW, cull otherwise. + */ +static void triangle_cw( struct lp_setup_context *setup, const float (*v0)[4], const float (*v1)[4], const float (*v2)[4] ) { - do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ); + struct fixed_position position; + + if (setup->subdivide_large_triangles && + check_subdivide_triangle(setup, v0, v1, v2, triangle_cw)) + return; + + calc_fixed_position(setup, &position, v0, v1, v2); + + if (position.area < 0) { + if (setup->flatshade_first) { + rotate_fixed_position_12(&position); + retry_triangle_ccw(setup, &position, v0, v2, v1, !setup->ccw_is_frontface); + } else { + rotate_fixed_position_01(&position); + retry_triangle_ccw(setup, &position, v1, v0, v2, !setup->ccw_is_frontface); + } + } } +static void triangle_ccw( struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) +{ + struct fixed_position position; + + if (setup->subdivide_large_triangles && + check_subdivide_triangle(setup, v0, v1, v2, triangle_ccw)) + return; + + calc_fixed_position(setup, &position, v0, v1, v2); + + if (position.area > 0) + retry_triangle_ccw(setup, &position, v0, v1, v2, setup->ccw_is_frontface); +} /** * Draw triangle whether it's CW or CCW. @@ -827,17 +1135,40 @@ static void triangle_both( struct lp_setup_context *setup, const float (*v1)[4], const float (*v2)[4] ) { - /* edge vectors e = v0 - v2, f = v1 - v2 */ - const float ex = v0[0][0] - v2[0][0]; - const float ey = v0[0][1] - v2[0][1]; - const float fx = v1[0][0] - v2[0][0]; - const float fy = v1[0][1] - v2[0][1]; - - /* det = cross(e,f).z */ - if (ex * fy - ey * fx < 0.0f) - triangle_ccw( setup, v0, v1, v2 ); - else - triangle_cw( setup, v0, v1, v2 ); + struct fixed_position position; + struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe; + + if (setup->subdivide_large_triangles && + check_subdivide_triangle(setup, v0, v1, v2, triangle_both)) + return; + + if (lp_context->active_statistics_queries && + !llvmpipe_rasterization_disabled(lp_context)) { + lp_context->pipeline_statistics.c_primitives++; + } + + calc_fixed_position(setup, &position, v0, v1, v2); + + if (0) { + assert(!util_is_inf_or_nan(v0[0][0])); + assert(!util_is_inf_or_nan(v0[0][1])); + assert(!util_is_inf_or_nan(v1[0][0])); + assert(!util_is_inf_or_nan(v1[0][1])); + assert(!util_is_inf_or_nan(v2[0][0])); + assert(!util_is_inf_or_nan(v2[0][1])); + } + + if (position.area > 0) + retry_triangle_ccw( setup, &position, v0, v1, v2, setup->ccw_is_frontface ); + else if (position.area < 0) { + if (setup->flatshade_first) { + rotate_fixed_position_12( &position ); + retry_triangle_ccw( setup, &position, v0, v2, v1, !setup->ccw_is_frontface ); + } else { + rotate_fixed_position_01( &position ); + retry_triangle_ccw( setup, &position, v1, v0, v2, !setup->ccw_is_frontface ); + } + } }